From 24ec1d5ba85503599fd301aa8cd56ee65651ab0b Mon Sep 17 00:00:00 2001 From: Roland Reichwein Date: Fri, 12 Feb 2021 18:12:51 +0100 Subject: Added support for char and wchar_t --- .gitignore | 7 ++++ debian/changelog | 1 + include/unicode.h | 115 ++++++++++++++++++++++++++++----------------------- src/test-unicode.cpp | 17 ++++++++ 4 files changed, 89 insertions(+), 51 deletions(-) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..01c9c5b --- /dev/null +++ b/.gitignore @@ -0,0 +1,7 @@ +*.a +*.o +*.d +*.pem +*.so +*.swp +default.profraw diff --git a/debian/changelog b/debian/changelog index 231944b..490318e 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,6 +1,7 @@ unicode (1.1) unstable; urgency=medium * Fixed copyright + * Support Unicode conversion for basic types like char and wchar_t -- Roland Reichwein Fri, 05 Feb 2021 21:53:32 +0100 diff --git a/include/unicode.h b/include/unicode.h index 2424fb1..d6f8e51 100644 --- a/include/unicode.h +++ b/include/unicode.h @@ -39,6 +39,8 @@ namespace unicode::detail { template struct utf_iterator { + static_assert(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4); + typedef T input_type; typedef char32_t value_type; typedef char32_t& reference; @@ -50,7 +52,7 @@ namespace unicode::detail { utf_iterator(const typename string_type::const_iterator& cbegin, const typename string_type::const_iterator& cend): iterator(cbegin), end_iterator(cend) { - calculate_value(); + calculate_value(); } utf_iterator(const utf_iterator& other) = default; @@ -67,27 +69,6 @@ namespace unicode::detail { return *(iterator + index); } - // set value member - // default: char32_t for UTF-32 - // specializations for UTF-8 and UTF-16 below - template - void calculate_value() - { - static_assert(sizeof(T1) == 4); - - size_t remaining{remaining_code_units()}; - - if (!remaining) - return; - - value = get_code_unit<0>(); - - if (!unicode::is_valid_unicode(value)) - throw std::invalid_argument("Invalid Unicode character: "s + std::to_string(static_cast(value))); - - sequence_length = 1; - } - inline static bool is_continuation_byte(T b) { return (b & 0b11000000) == 0b10000000; @@ -122,10 +103,7 @@ namespace unicode::detail { return static_cast(b & (0b1111111 >> n)) << ((n - 1) * 6); } - // GCC Bug: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85282 - // specialization for UTF-8 - template<> - void calculate_value() + void calculate_value_utf8() { size_t remaining{remaining_code_units()}; @@ -168,9 +146,7 @@ namespace unicode::detail { } } - // specialization for UTF-16 - template<> - void calculate_value() + void calculate_value_utf16() { size_t remaining{remaining_code_units()}; @@ -195,11 +171,40 @@ namespace unicode::detail { } } + void calculate_value_utf32() + { + size_t remaining{remaining_code_units()}; + + if (!remaining) + return; + + value = get_code_unit<0>(); + + if (!unicode::is_valid_unicode(value)) + throw std::invalid_argument("Invalid Unicode character: "s + std::to_string(static_cast(value))); + + sequence_length = 1; + } + + // set value member + void calculate_value() + { + if constexpr(sizeof(T) == 1) { + calculate_value_utf8(); + } else if constexpr (sizeof(T) == 2) { + calculate_value_utf16(); + } else if constexpr (sizeof(T) == 4) { + calculate_value_utf32(); + } else { + throw std::runtime_error("Invalid character size: "s + std::to_string(sizeof(T))); + } + } + // pre-increment utf_iterator& operator++() { iterator += sequence_length; - calculate_value(); + calculate_value(); return *this; } @@ -224,6 +229,8 @@ namespace unicode::detail { template struct utf_back_insert_iterator { + static_assert(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4); + typedef T value_type; typedef std::basic_string string_type; typedef utf_back_insert_iterator& reference; @@ -253,16 +260,6 @@ namespace unicode::detail { return *this; } - // default: utf-32 code unit for UTF-32 - // specializations for UTF-8 and UTF-16 below - template - reference operator=(const char32_t& value) - { - // expect value to be already valid Unicode values - s.push_back(value); - return *this; - } - // n is number of UTF-8 bytes in sequence template inline static T byte0_of(char32_t value) @@ -288,10 +285,7 @@ namespace unicode::detail { return trailing_byte(value); } - // specialization for UTF-8 - // append utf-8 byte sequence - template<> - reference operator=(const char32_t& value) + void append_utf8(const char32_t& value) { if (value < 0x80) { // 1 byte s.push_back(static_cast(value)); @@ -309,13 +303,9 @@ namespace unicode::detail { s.push_back(byte_n_of_m<3,4>(value)); } else throw std::runtime_error("Invalid internal Unicode value: "s + std::to_string(static_cast(value))); - return *this; } - // specialization for UTF-16 - // append utf-16 word sequence - template<> - reference operator=(const char32_t& value) + void append_utf16(const char32_t& value) { if (value <= 0xFFFF) { // expect value to be already valid Unicode values s.push_back(static_cast(value)); @@ -324,6 +314,25 @@ namespace unicode::detail { s.push_back((value_reduced >> 10) + 0xD800); s.push_back((value_reduced & 0x3FF) + 0xDC00); } + } + + void append_utf32(const char32_t& value) + { + // expect value to be already valid Unicode values + s.push_back(value); + } + + reference operator=(const char32_t& value) + { + if constexpr(sizeof(T) == 1) { + append_utf8(value); + } else if constexpr(sizeof(T) == 2) { + append_utf16(value); + } else if constexpr(sizeof(T) == 4) { + append_utf32(value); + } else { + throw std::runtime_error("Invalid type size: "s + std::to_string(sizeof(T))); + } return *this; } @@ -555,9 +564,11 @@ namespace unicode { template std::basic_string convert(const std::basic_string& s) { + typedef UTF, utf_back_insert_iterator> UTF_Trait; + std::basic_string result; - std::copy(Encoding::Facet::begin(s), Encoding::Facet::end(s), Encoding::Facet::back_inserter(result)); + std::copy(UTF_Trait::begin(s), UTF_Trait::end(s), UTF_Trait::back_inserter(result)); return result; } @@ -566,8 +577,10 @@ namespace unicode { template bool is_valid_utf(const std::basic_string& s) { + typedef UTF, utf_back_insert_iterator> UTF_Trait; + try { - std::for_each(Encoding::Facet::begin(s), Encoding::Facet::end(s), [](const char32_t& c){}); + std::for_each(UTF_Trait::begin(s), UTF_Trait::end(s), [](const char32_t& c){}); } catch (const std::invalid_argument&) { return false; } diff --git a/src/test-unicode.cpp b/src/test-unicode.cpp index 692dfac..99a8f99 100644 --- a/src/test-unicode.cpp +++ b/src/test-unicode.cpp @@ -334,6 +334,23 @@ BOOST_AUTO_TEST_CASE(convert) BOOST_CHECK((unicode::convert("abc")) == std::u16string{u"abc"}); BOOST_CHECK((unicode::convert(U"abc")) == std::u16string{u"abc"}); + + BOOST_CHECK((unicode::convert(u8"äöü")) == std::u32string{U"äöü"}); + +#ifdef _WIN32 + BOOST_CHECK(sizeof(wchar_t) == 2); +#else // Unix like + BOOST_CHECK(sizeof(wchar_t) == 4); +#endif + + // For the following checks, wchar_t size and encoding is system dependent: + // Windows: UTF-16 + // Linux: UTF-32 + BOOST_CHECK((unicode::convert(u8"äöü")) == std::wstring{L"äöü"}); + BOOST_CHECK((unicode::convert(u8"\u732b")) == std::wstring{L"\u732b"}); + BOOST_CHECK((unicode::convert(u8"\U0001F63A")) == std::wstring{L"\U0001F63A"}); + BOOST_CHECK((unicode::convert(L"\U0001F63A")) == std::u32string{U"\U0001F63A"}); + BOOST_CHECK((unicode::convert(L"\U0001F63A")) == std::string{u8"\U0001F63A"}); } BOOST_AUTO_TEST_CASE(is_valid_utf) -- cgit v1.2.3