From fad8b697dff7c7b47f034124ea6eef25e74bd7af Mon Sep 17 00:00:00 2001 From: Roland Reichwein Date: Tue, 26 Jan 2021 22:05:08 +0100 Subject: Implement conversion and first tests --- include/unicode.h | 257 ++++++++++++++++++++++++++++++++++++++++++--------- src/test-unicode.cpp | 47 +++++++++- 2 files changed, 257 insertions(+), 47 deletions(-) diff --git a/include/unicode.h b/include/unicode.h index 512891a..a55eac3 100644 --- a/include/unicode.h +++ b/include/unicode.h @@ -15,66 +15,164 @@ namespace { - struct utf8_iterator + using namespace std::string_literals; + + template + struct utf_iterator { typedef char32_t value_type; typedef char32_t& reference; + typedef std::basic_string string_type; - utf8_iterator(const std::u8string::const_iterator& cbegin, const std::u8string::const_iterator& cend): + utf_iterator(const typename string_type::const_iterator& cbegin, const typename string_type::const_iterator& cend): iterator(cbegin), end_iterator(cend) { - calculate_value(); + calculate_value(); + } + + utf_iterator(const utf_iterator& other) = default; + utf_iterator& operator=(const utf_iterator& other) = default; + + size_t remaining_code_units() + { + return end_iterator - iterator; } - utf8_iterator(const utf8_iterator& other) = default; - utf8_iterator& operator=(const utf8_iterator& other) = default; + template + T get_code_unit() + { + return *(iterator + index); + } // set value member + // default: char32_t for UTF-32 + // specializations for UTF-8 and UTF-16 below + template void calculate_value() { - if (iterator == end_iterator) + size_t remaining{remaining_code_units()}; + + if (!remaining) + return; + + value = get_code_unit<0>(); + + if (value > 0x10FFFF || (value > 0xD7FF && value < 0xE000)) + throw std::invalid_argument("Invalid Unicode character: "s + std::to_string(static_cast(value))); + + sequence_length = 1; + } + + inline static bool is_continuation_byte(T b) + { + return (b & 0b11000000) == 0b10000000; + } + + template + inline static bool is_continuation_byte(T b, Targs... Fargs) + { + return is_continuation_byte(b) && is_continuation_byte(Fargs...); + } + + template + inline static bool is_byte0_of(T b) + { + return (b & static_cast(0xFF << (7 - n))) == static_cast(0xFF << (8 - n)); + } + + inline static char32_t continuation_value(T b) + { + return static_cast(b & 0b00111111); + } + + template + inline static char32_t continuation_value(T b, Targs... Fargs) + { + return continuation_value(b) << 6 | continuation_value(Fargs...); + } + + template + inline static char32_t value_byte0_of(T b) + { + return static_cast(b & (0b1111111 >> n)) << ((n - 1) * 6); + } + + // specialization for UTF-8 + template<> + void calculate_value() + { + size_t remaining{remaining_code_units()}; + + if (!remaining) return; - char8_t first_byte {*iterator}; - if (first_byte & 0x80) { // 2-4 bytes - if (iterator + 1 != end_iterator) { - char8_t second_byte {*(iterator + 1)}; - if ((first_byte & 0b11100000) == 0b11000000 && (second_byte & 0b11000000) == 0b10000000) { // 2 bytes - value = char32_t(first_byte & 0b11111) << 6 | (second_byte & 0b111111); + char8_t byte0 {get_code_unit<0>()}; + if (byte0 & 0x80) { // 2-4 bytes + if (remaining >= 2) { + char8_t byte1 {get_code_unit<1>()}; + if (is_byte0_of<2>(byte0) && is_continuation_byte(byte1)) { // 2 bytes + value = value_byte0_of<2>(byte0) | continuation_value(byte1); sequence_length = 2; - } else if (iterator + 2 != end_iterator) { - char8_t third_byte {*(iterator + 2)}; - if ((first_byte & 0b11110000) == 0b11100000 && (second_byte & 0b11000000) == 0b10000000 && (third_byte & 0b11000000) == 0b10000000) { // 3 bytes - value = char32_t(first_byte & 0b1111) << 12 | char32_t(second_byte & 0b111111) << 6 | (third_byte & 0b111111); + } else if (remaining >= 3) { + char8_t byte2 {get_code_unit<2>()}; + if (is_byte0_of<3>(byte0) && is_continuation_byte(byte1, byte2)) { // 3 bytes + value = value_byte0_of<3>(byte0) | continuation_value(byte1, byte2); sequence_length = 3; - } else if (iterator + 3 != end_iterator) { - char8_t fourth_byte {*(iterator + 3)}; - if ((first_byte & 0b11111000) == 0b11110000 && (second_byte & 0b11000000) == 0b10000000 && (third_byte & 0b11000000) == 0b10000000 && (fourth_byte & 0b11000000) == 0b10000000) { // 4 bytes - value = char32_t(first_byte & 0b111) << 18 | char32_t(second_byte & 0b111111) << 12 | char32_t(third_byte & 0b111111) << 6 | (fourth_byte & 0b111111); + } else if (remaining >= 4) { + char8_t byte3 {get_code_unit<3>()}; + if (is_byte0_of<4>(byte0) && is_continuation_byte(byte1, byte2, byte3)) { // 4 bytes + value = value_byte0_of<4>(byte0) | continuation_value(byte1, byte2, byte3); sequence_length = 4; } else - throw std::invalid_argument("bad input: invalid 4 byte sequence"); + throw std::invalid_argument("Bad input: Invalid 4 byte sequence"); } else - throw std::invalid_argument("bad input: invalid 3 byte sequence"); + throw std::invalid_argument("Bad input: Invalid 3 byte sequence"); } else - throw std::invalid_argument("bad input: invalid 2 byte sequence"); + throw std::invalid_argument("Bad input: Invalid 2 byte sequence"); } else - throw std::invalid_argument("bad input: byte 2 expected, none found"); + throw std::invalid_argument("Bad input: 2nd byte expected, none found"); } else { // 1 byte: 7 bit ASCII - value = first_byte; + value = byte0; sequence_length = 1; } } + // specialization for UTF-16 + template<> + void calculate_value() + { + size_t remaining{remaining_code_units()}; + + if (!remaining) + return; + + char16_t unit0 {get_code_unit<0>()}; + + if (unit0 <= 0xD7FF || unit0 >= 0xE000) { // 1 unit (BMP Basic Multilingual Plane) + value = unit0; + sequence_length = 1; + } else { + if (remaining < 2) + throw std::invalid_argument("Bad input: Continuation of first UTF-16 unit missing"); + + char16_t unit1 {get_code_unit<1>()}; + if ((unit0 & 0xFC00) != 0xD800 || (unit1 & 0xFC00) != 0xDC00) + throw std::invalid_argument("Bad input: 2 malformed UTF-16 surrogates"); + + value = static_cast(unit0 & 0x03FF) << 10 | (unit1 & 0x03FF); + sequence_length = 2; + } + } + // pre-increment - utf8_iterator& operator++() + utf_iterator& operator++() { iterator += sequence_length; - calculate_value(); + calculate_value(); return *this; } - bool operator!=(const utf8_iterator& other) const + bool operator!=(const utf_iterator& other) const { return iterator != other.iterator; } @@ -84,21 +182,23 @@ namespace { return value; } - std::u8string::const_iterator iterator; - std::u8string::const_iterator end_iterator; + typename string_type::const_iterator iterator; + typename string_type::const_iterator end_iterator; value_type value{}; size_t sequence_length{}; }; - struct utf16_back_insert_iterator + template + struct utf_back_insert_iterator { - typedef utf16_back_insert_iterator& reference; + typedef std::basic_string string_type; + typedef utf_back_insert_iterator& reference; - utf16_back_insert_iterator(std::u16string& s): s(s) {} + utf_back_insert_iterator(string_type& s): s(s) {} // no-op - utf16_back_insert_iterator& operator++() + utf_back_insert_iterator& operator++() { return *this; } @@ -109,10 +209,71 @@ namespace { return *this; } - // append utf-16 word sequence + // default: utf-32 code unit for UTF-32 + // specializations for UTF-8 and UTF-16 below + template reference operator=(const char32_t& value) { - if (value <= 0xFFFF) { // expect value to be already valid Unicode values, TODO: validate char32_t! + // expect value to be already valid Unicode values + s.push_back(value); + return *this; + } + + // n is number of UTF-8 bytes in sequence + template + inline static T byte0_of(char32_t value) + { + return (value >> 6 * (n - 1)) | (0xFF << (8 - n)); + } + + // n is index of 6-bit groups, counting from bit 0 + template + inline static T trailing_byte(char32_t value) + { + return ((value >> n * 6) & 0b111111) | 0b10000000; + } + + // calculate UTF-8 sequence byte for m >= 2 bytes sequences (i.e. non-ASCII) + // assume value to be valid Unicode value for given byte position + template + inline static T byte_n_of_m(char32_t value) + { + if constexpr (n == 0) + return byte0_of(value); + else + return trailing_byte(value); + } + + // specialization for UTF-8 + // append utf-8 byte sequence + template<> + reference operator=(const char32_t& value) + { + if (value < 0x80) { // 1 byte + s.push_back(value); + } else if (value < 0x800) { // 2 bytes + s.push_back(byte_n_of_m<0,2>(value)); + s.push_back(byte_n_of_m<1,2>(value)); + } else if (value < 0x10000) { // 3 bytes + s.push_back(byte_n_of_m<0,3>(value)); + s.push_back(byte_n_of_m<1,3>(value)); + s.push_back(byte_n_of_m<2,3>(value)); + } else if (value < 0x110000) { // 4 bytes + s.push_back(byte_n_of_m<0,4>(value)); + s.push_back(byte_n_of_m<1,4>(value)); + s.push_back(byte_n_of_m<2,4>(value)); + s.push_back(byte_n_of_m<3,4>(value)); + } else + throw std::runtime_error("Invalid internal Unicode value: "s + std::to_string(static_cast(value))); + return *this; + } + + // specialization for UTF-16 + // append utf-16 word sequence + template<> + reference operator=(const char32_t& value) + { + if (value <= 0xFFFF) { // expect value to be already valid Unicode values s.push_back(value); } else { s.push_back((value >> 10) + 0xD800); @@ -121,33 +282,37 @@ namespace { return *this; } - std::u16string& s; + typename utf_back_insert_iterator::string_type& s; }; - utf16_back_insert_iterator utf16_back_inserter(std::u16string& s) + template + utf_back_insert_iterator utf_back_inserter(std::basic_string& s) { - return utf16_back_insert_iterator(s); + return utf_back_insert_iterator(s); } - utf8_iterator utf8_begin(const std::u8string& s) + template + utf_iterator utf_begin(const std::basic_string& s) { - return utf8_iterator{s.cbegin(), s.cend()}; + return utf_iterator{s.cbegin(), s.cend()}; } - utf8_iterator utf8_end(const std::u8string& s) + template + utf_iterator utf_end(const std::basic_string& s) { - return utf8_iterator{s.cend(), s.cend()}; + return utf_iterator{s.cend(), s.cend()}; } } // namespace namespace unicode { -std::u16string utf8_to_utf16(const std::u8string& s) +template +std::basic_string utf_to_utf(const std::basic_string& s) { - std::u16string result; + std::basic_string result; - std::copy(utf8_begin(s), utf8_end(s), utf16_back_inserter(result)); + std::copy(utf_begin(s), utf_end(s), utf_back_inserter(result)); return result; } diff --git a/src/test-unicode.cpp b/src/test-unicode.cpp index 41fcd20..0560c1b 100644 --- a/src/test-unicode.cpp +++ b/src/test-unicode.cpp @@ -3,19 +3,60 @@ #include #include +#include +#include #include +std::tuple, std::basic_string, std::basic_string> t { + u8"Täst", u"Täst", U"Täst" +}; + +template +void test_utf_to_utf(std::tuple& t) +{ + typedef typename std::tuple_element::type>::type From; + typedef typename std::tuple_element::type>::type To; + + // test + To result { unicode::utf_to_utf(std::get(t)) }; + + BOOST_CHECK(std::get(t) == result); + + //std::cout << std::to_string(std::tuple_size::type>::value) << "," << std::to_string(i) << "," << std::to_string(j) << std::endl; + + // iterate over other combinations + if constexpr (i + 1 < std::tuple_size::type>::value) + test_utf_to_utf(t); + else if constexpr (j + 1 < std::tuple_size::type>::value) + test_utf_to_utf<0, j + 1>(t); +} + +BOOST_AUTO_TEST_CASE(utf_to_utf) +{ + test_utf_to_utf(t); +} + BOOST_AUTO_TEST_CASE(utf8_to_utf16) { std::u8string u8{u8"ascii string1"}; - std::u16string u16{unicode::utf8_to_utf16(u8)}; + std::u16string u16{unicode::utf_to_utf(u8)}; BOOST_CHECK(u16 == u"ascii string1"); } +BOOST_AUTO_TEST_CASE(utf16_to_utf8) +{ + std::u16string u16{u"ascii string1"}; + + std::u8string u8{unicode::utf_to_utf(u16)}; + + BOOST_CHECK(u8 == u8"ascii string1"); +} + // TODO: +// UTF-8 // invalid bytes // an unexpected continuation byte // a non-continuation byte before the end of the character @@ -24,3 +65,7 @@ BOOST_AUTO_TEST_CASE(utf8_to_utf16) // a sequence that decodes to an invalid code point // // high and low surrogate halves used by UTF-16 (U+D800 through U+DFFF) and code points not encodable by UTF-16 (those after U+10FFFF) +// +// char8_t, char16_t, char32_t, char, wchar_t (UTF-16 on Windows, UTF-32 on Linux) +// string, vector? +// uint8_t, uint16_t, uint32_t? -- cgit v1.2.3