summary refs log tree commit diff homepage
path: root/include/unicode.h
diff options
context:
space:
mode:
Diffstat (limited to 'include/unicode.h')
-rw-r--r-- include/unicode.h 40
1 files changed, 21 insertions, 19 deletions
diff --git a/include/unicode.h b/include/unicode.h
index a55eac3..f539e6b 100644
--- a/include/unicode.h
+++ b/include/unicode.h
@@ -1,5 +1,4 @@
// libunicode
-// Copyright (C) 2021 Roland Reichwein
#pragma once
@@ -7,11 +6,20 @@
#include <stdexcept>
#include <string>
-#ifdef __has_cpp_attribute
-#if __has_cpp_attribute(__cpp_char8_t)
+#ifdef __cpp_char8_t
// char8_t available
#endif
-#endif
+
+namespace unicode {
+
+ // Returns true if value is at most 0x10FFFF and lies outside 0xD800..0xDFFF (the UTF-16 surrogate range); T is usually char32_t, uint32_t etc.
+ template<typename T>
+ static inline bool is_valid_unicode(const T& value)
+ {
+ return value <= 0x10FFFF && (value <= 0xD7FF || value >= 0xE000); // NOTE(review): a negative value of a signed T satisfies both <= tests and is reported as valid - confirm callers only pass unsigned types
+ }
+
+}
namespace {
@@ -50,6 +58,8 @@ namespace {
template<typename T1>
void calculate_value()
{
+ static_assert(sizeof(T1) == 4);
+
size_t remaining{remaining_code_units()};
if (!remaining)
@@ -57,7 +67,7 @@ namespace {
value = get_code_unit<0>();
- if (value > 0x10FFFF || (value > 0xD7FF && value < 0xE000))
+ if (!unicode::is_valid_unicode(value))
throw std::invalid_argument("Invalid Unicode character: "s + std::to_string(static_cast<uint32_t>(value)));
sequence_length = 1;
@@ -88,7 +98,7 @@ namespace {
template<typename... Targs>
inline static char32_t continuation_value(T b, Targs... Fargs)
{
- return continuation_value(b) << 6 | continuation_value(Fargs...);
+ return continuation_value(b) << (6 * sizeof...(Targs)) | continuation_value(Fargs...); // shift the value for b left by 6 bits per argument remaining in Fargs, then OR in the value computed from those remaining arguments
}
template<size_t n>
@@ -159,7 +169,7 @@ namespace {
if ((unit0 & 0xFC00) != 0xD800 || (unit1 & 0xFC00) != 0xDC00)
throw std::invalid_argument("Bad input: 2 malformed UTF-16 surrogates");
- value = static_cast<char32_t>(unit0 & 0x03FF) << 10 | (unit1 & 0x03FF);
+ value = (static_cast<char32_t>(unit0 & 0x03FF) << 10 | (unit1 & 0x03FF)) + 0x10000;
sequence_length = 2;
}
}
@@ -185,7 +195,7 @@ namespace {
typename string_type::const_iterator iterator;
typename string_type::const_iterator end_iterator;
- value_type value{};
+ char32_t value{}; // always save complete unicode code point at this point
size_t sequence_length{};
};
@@ -276,8 +286,9 @@ namespace {
if (value <= 0xFFFF) { // expect value to be already valid Unicode values
s.push_back(value);
} else {
- s.push_back((value >> 10) + 0xD800);
- s.push_back((value & 0x3FF) + 0xDC00);
+ char32_t value_reduced{value - 0x10000};
+ s.push_back((value_reduced >> 10) + 0xD800);
+ s.push_back((value_reduced & 0x3FF) + 0xDC00);
}
return *this;
}
@@ -317,14 +328,5 @@ std::basic_string<To> utf_to_utf(const std::basic_string<From>& s)
return result;
}
-//std::u8string utf16_to_utf8(const std::u16string& s)
-//{
-// std::u8string result;
-//
-// std::transform(utf16_begin(s), utf16_end(s), std::back_inserter(result));
-//
-// return result;
-//}
-
} // namespace unicode