summary refs log tree commit diff homepage
path: root/include
diff options
context:
space:
mode:
author Roland Reichwein <mail@reichwein.it> 2021-02-12 18:12:51 +0100
committer Roland Reichwein <mail@reichwein.it> 2021-02-12 18:12:51 +0100
commit 24ec1d5ba85503599fd301aa8cd56ee65651ab0b (patch)
tree ccabf3b89338825720e926a73602862df03ae801 /include
parent b47110d30db3a416775c5de88e1d946dfdbda734 (diff)
Added support for char and wchar_t
Diffstat (limited to 'include')
-rw-r--r-- include/unicode.h 115
1 files changed, 64 insertions, 51 deletions
diff --git a/include/unicode.h b/include/unicode.h
index 2424fb1..d6f8e51 100644
--- a/include/unicode.h
+++ b/include/unicode.h
@@ -39,6 +39,8 @@ namespace unicode::detail {
template<typename T>
struct utf_iterator
{
+ static_assert(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4);
+
typedef T input_type;
typedef char32_t value_type;
typedef char32_t& reference;
@@ -50,7 +52,7 @@ namespace unicode::detail {
// Construct from a pair of string const_iterators: cbegin becomes the current
// position (`iterator`), cend the end position (`end_iterator`). The
// constructor then calls calculate_value() for the start position.
utf_iterator(const typename string_type::const_iterator& cbegin, const typename string_type::const_iterator& cend):
iterator(cbegin), end_iterator(cend)
{
- calculate_value<T>();
+ calculate_value();
}
utf_iterator<T>(const utf_iterator<T>& other) = default;
@@ -67,27 +69,6 @@ namespace unicode::detail {
return *(iterator + index);
}
- // set value member
- // default: char32_t for UTF-32
- // specializations for UTF-8 and UTF-16 below
- template<typename T1>
- void calculate_value()
- {
- static_assert(sizeof(T1) == 4);
-
- size_t remaining{remaining_code_units()};
-
- if (!remaining)
- return;
-
- value = get_code_unit<0>();
-
- if (!unicode::is_valid_unicode(value))
- throw std::invalid_argument("Invalid Unicode character: "s + std::to_string(static_cast<uint32_t>(value)));
-
- sequence_length = 1;
- }
-
inline static bool is_continuation_byte(T b)
{
return (b & 0b11000000) == 0b10000000;
@@ -122,10 +103,7 @@ namespace unicode::detail {
return static_cast<char32_t>(b & (0b1111111 >> n)) << ((n - 1) * 6);
}
- // GCC Bug: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85282
- // specialization for UTF-8
- template<>
- void calculate_value<utf8_t>()
+ void calculate_value_utf8()
{
size_t remaining{remaining_code_units()};
@@ -168,9 +146,7 @@ namespace unicode::detail {
}
}
- // specialization for UTF-16
- template<>
- void calculate_value<char16_t>()
+ void calculate_value_utf16()
{
size_t remaining{remaining_code_units()};
@@ -195,11 +171,40 @@ namespace unicode::detail {
}
}
+ // UTF-32 decoding: the code unit at the current position is taken as the
+ // code point itself, so a sequence is always one code unit long.
+ // Does nothing when no code units remain.
+ // Throws std::invalid_argument if unicode::is_valid_unicode() rejects the value.
+ void calculate_value_utf32()
+ {
+   if (remaining_code_units() == 0)
+     return;
+
+   // value is assigned before validation; sequence_length only on success
+   value = get_code_unit<0>();
+
+   if (unicode::is_valid_unicode(value)) {
+     sequence_length = 1;
+     return;
+   }
+
+   throw std::invalid_argument("Invalid Unicode character: "s + std::to_string(static_cast<uint32_t>(value)));
+ }
+
+ // set value member
+ // Dispatches on the code unit width of T: 1 byte -> calculate_value_utf8(),
+ // 2 bytes -> calculate_value_utf16(), 4 bytes -> calculate_value_utf32().
+ // Any other width is rejected at compile time.
+ void calculate_value()
+ {
+   if constexpr (sizeof(T) == 1) {
+     calculate_value_utf8();
+   } else if constexpr (sizeof(T) == 2) {
+     calculate_value_utf16();
+   } else {
+     // The width of T is a compile-time property: diagnose an unsupported
+     // width here instead of keeping an unreachable runtime throw.
+     static_assert(sizeof(T) == 4, "Invalid character size");
+     calculate_value_utf32();
+   }
+ }
+
// pre-increment
// Moves `iterator` forward by `sequence_length` code units, then calls
// calculate_value() for the new position. Returns *this.
utf_iterator<T>& operator++()
{
iterator += sequence_length;
- calculate_value<T>();
+ calculate_value();
return *this;
}
@@ -224,6 +229,8 @@ namespace unicode::detail {
template<typename T>
struct utf_back_insert_iterator
{
+ static_assert(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4);
+
typedef T value_type;
typedef std::basic_string<T> string_type;
typedef utf_back_insert_iterator& reference;
@@ -253,16 +260,6 @@ namespace unicode::detail {
return *this;
}
- // default: utf-32 code unit for UTF-32
- // specializations for UTF-8 and UTF-16 below
- template<typename T1=T>
- reference operator=(const char32_t& value)
- {
- // expect value to be already valid Unicode values
- s.push_back(value);
- return *this;
- }
-
// n is number of UTF-8 bytes in sequence
template<size_t n>
inline static T byte0_of(char32_t value)
@@ -288,10 +285,7 @@ namespace unicode::detail {
return trailing_byte<m - n - 1>(value);
}
- // specialization for UTF-8
- // append utf-8 byte sequence
- template<>
- reference operator=<utf8_t>(const char32_t& value)
+ void append_utf8(const char32_t& value)
{
if (value < 0x80) { // 1 byte
s.push_back(static_cast<value_type>(value));
@@ -309,13 +303,9 @@ namespace unicode::detail {
s.push_back(byte_n_of_m<3,4>(value));
} else
throw std::runtime_error("Invalid internal Unicode value: "s + std::to_string(static_cast<uint32_t>(value)));
- return *this;
}
- // specialization for UTF-16
- // append utf-16 word sequence
- template<>
- reference operator=<char16_t>(const char32_t& value)
+ void append_utf16(const char32_t& value)
{
if (value <= 0xFFFF) { // expect value to be already valid Unicode values
s.push_back(static_cast<value_type>(value));
@@ -324,6 +314,25 @@ namespace unicode::detail {
s.push_back((value_reduced >> 10) + 0xD800);
s.push_back((value_reduced & 0x3FF) + 0xDC00);
}
+ }
+
+ // Append value as a single code unit of the target string.
+ void append_utf32(const char32_t& value)
+ {
+   // expect value to be already valid Unicode values
+   // Explicit cast to the code unit type, matching append_utf8/append_utf16:
+   // T may be a 4-byte type other than char32_t (e.g. wchar_t).
+   s.push_back(static_cast<value_type>(value));
+ }
+
+ // Assigning a code point appends it to the target string. The append
+ // function is chosen by the code unit width of T: 1 byte -> append_utf8(),
+ // 2 bytes -> append_utf16(), 4 bytes -> append_utf32(). Any other width is
+ // rejected at compile time. Returns *this.
+ reference operator=(const char32_t& value)
+ {
+   if constexpr (sizeof(T) == 1) {
+     append_utf8(value);
+   } else if constexpr (sizeof(T) == 2) {
+     append_utf16(value);
+   } else {
+     // The width of T is a compile-time property: diagnose an unsupported
+     // width here instead of keeping an unreachable runtime throw.
+     static_assert(sizeof(T) == 4, "Invalid type size");
+     append_utf32(value);
+   }
return *this;
}
@@ -555,9 +564,11 @@ namespace unicode {
// Convert s from code units of type From to code units of type To.
// The range UTF_Trait::begin(s)..UTF_Trait::end(s) is copied with std::copy
// into UTF_Trait::back_inserter(result), where UTF_Trait pairs
// utf_iterator<From> with utf_back_insert_iterator<To>. Returns the new string.
// NOTE(review): decoding presumably throws std::invalid_argument on invalid
// input (is_valid_utf catches that type) — confirm for all encodings.
template<typename From, typename To>
std::basic_string<To> convert(const std::basic_string<From>& s)
{
+ typedef UTF<utf_iterator<From>, utf_back_insert_iterator<To>> UTF_Trait;
+
std::basic_string<To> result;
- std::copy(Encoding<From>::Facet::begin(s), Encoding<From>::Facet::end(s), Encoding<To>::Facet::back_inserter(result));
+ std::copy(UTF_Trait::begin(s), UTF_Trait::end(s), UTF_Trait::back_inserter(result));
return result;
}
@@ -566,8 +577,10 @@ namespace unicode {
template<typename T>
bool is_valid_utf(const std::basic_string<T>& s)
{
+ typedef UTF<utf_iterator<T>, utf_back_insert_iterator<T>> UTF_Trait;
+
try {
- std::for_each(Encoding<T>::Facet::begin(s), Encoding<T>::Facet::end(s), [](const char32_t& c){});
+ std::for_each(UTF_Trait::begin(s), UTF_Trait::end(s), [](const char32_t& c){});
} catch (const std::invalid_argument&) {
return false;
}