From 268b7845af166c68b1c226f0be9ba5cf983ae91c Mon Sep 17 00:00:00 2001 From: Roland Reichwein Date: Sun, 14 Feb 2021 17:50:51 +0100 Subject: Support different std containers, support different basic types --- debian/README.Debian | 21 +++++++++ include/unicode.h | 117 +++++++++++++++++++++++++++++++-------------------- src/test-unicode.cpp | 83 ++++++++++++++++++++++++++++++------ 3 files changed, 164 insertions(+), 57 deletions(-) diff --git a/debian/README.Debian b/debian/README.Debian index 382d20d..0a47d0a 100644 --- a/debian/README.Debian +++ b/debian/README.Debian @@ -57,6 +57,11 @@ The following encodings are implicitly deducted from types: * char16_t: UTF-16 * char32_t: UTF-32 +You can specify different container types directly: + + std::deque utf8_value {...}; + std::list utf16_value{unicode::convert, std::list>(utf8_value)}; + Explicit encoding specification is also possible: std::string value {"äöü"}; @@ -70,6 +75,22 @@ Supported encodings are: * unicode::ISO_8859_1 * unicode::ISO_8859_15 +Supported basic types: + * char + * char8_t (C++20) + * wchar_t (UTF-16 on Windows, UTF-32 on Linux) + * char16_t + * char32_t + * uint8_t, int8_t + * uint16_t, int16_t + * uint32_t, int32_t + * basically, all basic 8-bit, 16-bit and 32-bit that can encode + UTF-8, UTF-16 and UTF-32, respectively. + +Supported container types: + * All std container types that can be iterated (vector, list, deque, array) + * Source and target containers can be different container types + Validation can be done like this: bool valid{unicode::is_valid_utf(utf16_value)}; diff --git a/include/unicode.h b/include/unicode.h index 171496e..6d7ef16 100644 --- a/include/unicode.h +++ b/include/unicode.h @@ -37,7 +37,7 @@ namespace unicode::detail { using namespace std::string_literals; - template + template> struct utf_iterator { static_assert(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4); @@ -48,7 +48,7 @@ namespace unicode::detail { typedef char32_t* pointer; typedef size_t difference_type; typedef std::input_iterator_tag iterator_category; - typedef std::basic_string string_type; + typedef Container string_type; utf_iterator(const typename string_type::const_iterator& cbegin, const typename string_type::const_iterator& cend): iterator(cbegin), end_iterator(cend) @@ -56,18 +56,25 @@ namespace unicode::detail { calculate_value(); } - utf_iterator(const utf_iterator& other) = default; - utf_iterator& operator=(const utf_iterator& other) = default; + utf_iterator(const utf_iterator& other) = default; + utf_iterator& operator=(const utf_iterator& other) = default; - size_t remaining_code_units() + size_t remaining_code_units() const { - return end_iterator - iterator; + return std::distance(iterator, end_iterator); } template - T get_code_unit() + T get_code_unit() const { - return *(iterator + index); + if constexpr (std::is_same>::value) { + // std::list doesn't support it + n + auto it{iterator}; + std::advance(it, index); + return *it; + } else { + return *(iterator + index); + } } inline static bool is_continuation_byte(T b) @@ -111,20 +118,20 @@ namespace unicode::detail { if (!remaining) return; - utf8_t byte0 {get_code_unit<0>()}; + utf8_t byte0 {static_cast(get_code_unit<0>())}; if (byte0 & 0x80) { // 2-4 bytes if (remaining >= 2) { - utf8_t byte1 {get_code_unit<1>()}; + utf8_t byte1 {static_cast(get_code_unit<1>())}; if (is_byte0_of<2>(byte0) && is_continuation_byte(byte1)) { // 2 bytes value = value_byte0_of<2>(byte0) | continuation_value(byte1); sequence_length = 2; } else if (remaining >= 3) { - utf8_t byte2 {get_code_unit<2>()}; + utf8_t byte2 {static_cast(get_code_unit<2>())}; if (is_byte0_of<3>(byte0) && is_continuation_byte(byte1, byte2)) { // 3 bytes value = value_byte0_of<3>(byte0) | continuation_value(byte1, byte2); sequence_length = 3; } else if (remaining >= 4) { - utf8_t byte3 {get_code_unit<3>()}; + utf8_t byte3 {static_cast(get_code_unit<3>())}; if (is_byte0_of<4>(byte0) && is_continuation_byte(byte1, byte2, byte3)) { // 4 bytes value = value_byte0_of<4>(byte0) | continuation_value(byte1, byte2, byte3); sequence_length = 4; @@ -154,7 +161,7 @@ namespace unicode::detail { if (!remaining) return; - char16_t unit0 {get_code_unit<0>()}; + char16_t unit0 {static_cast(get_code_unit<0>())}; if (unit0 <= 0xD7FF || unit0 >= 0xE000) { // 1 unit (BMP Basic Multilingual Plane) value = unit0; @@ -163,7 +170,7 @@ namespace unicode::detail { if (remaining < 2) throw std::invalid_argument("Bad input: Continuation of first UTF-16 unit missing"); - char16_t unit1 {get_code_unit<1>()}; + char16_t unit1 {static_cast(get_code_unit<1>())}; if ((unit0 & 0xFC00) != 0xD800 || (unit1 & 0xFC00) != 0xDC00) throw std::invalid_argument("Bad input: 2 malformed UTF-16 surrogates"); @@ -179,7 +186,7 @@ namespace unicode::detail { if (!remaining) return; - value = get_code_unit<0>(); + value = static_cast(get_code_unit<0>()); if (!unicode::is_valid_unicode(value)) throw std::invalid_argument("Invalid Unicode character: "s + std::to_string(static_cast(value))); @@ -202,16 +209,16 @@ namespace unicode::detail { } // pre-increment - utf_iterator& operator++() + utf_iterator& operator++() { - iterator += sequence_length; + std::advance(iterator, sequence_length); calculate_value(); return *this; } - bool operator!=(const utf_iterator& other) const + bool operator!=(const utf_iterator& other) const { - return iterator != other.iterator; + return std::distance(iterator, end_iterator) != std::distance(other.iterator, other.end_iterator); } reference operator*() @@ -227,13 +234,13 @@ namespace unicode::detail { size_t sequence_length{}; }; - template + template> struct utf_back_insert_iterator { static_assert(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4); typedef T value_type; - typedef std::basic_string string_type; + typedef Container string_type; typedef utf_back_insert_iterator& reference; typedef utf_back_insert_iterator* pointer; typedef size_t difference_type; @@ -378,7 +385,7 @@ namespace unicode { using namespace detail; - template + template> struct iso_iterator { typedef utf8_t input_type; typedef char32_t value_type; @@ -386,7 +393,8 @@ namespace unicode { typedef char32_t* pointer; typedef size_t difference_type; typedef std::input_iterator_tag iterator_category; - typedef std::basic_string::const_iterator iterator; + typedef typename Container::const_iterator iterator; + typedef Container string_type; iso_iterator(const iterator& it): m_it(it) {} @@ -420,14 +428,14 @@ namespace unicode { iterator m_it; }; - template + template> struct iso_back_insert_iterator { typedef iso_back_insert_iterator& reference; typedef iso_back_insert_iterator* pointer; typedef size_t difference_type; typedef utf8_t value_type; typedef std::output_iterator_tag iterator_category; - typedef std::basic_string string_type; + typedef Container string_type; iso_back_insert_iterator(string_type& s): s(s) {} @@ -478,18 +486,19 @@ namespace unicode { struct ISO_8859 { typedef utf8_t value_type; + typedef typename InputIt::string_type string_type; - static InputIt begin(const std::basic_string& s) + static InputIt begin(const typename InputIt::string_type& s) { return InputIt(s.cbegin()); } - static InputIt end(const std::basic_string& s) + static InputIt end(const typename InputIt::string_type& s) { return InputIt(s.cend()); } - static OutputIt back_inserter(std::basic_string& s) + static OutputIt back_inserter(typename OutputIt::string_type& s) { return OutputIt(s); } @@ -499,20 +508,20 @@ namespace unicode { template struct UTF { - typedef typename InputIt::input_type input_type; typedef typename OutputIt::value_type value_type; + typedef typename InputIt::string_type string_type; - static InputIt begin(const std::basic_string& s) + static InputIt begin(const typename InputIt::string_type& s) { return InputIt{s.cbegin(), s.cend()}; } - static InputIt end(const std::basic_string& s) + static InputIt end(const typename InputIt::string_type& s) { return InputIt{s.cend(), s.cend()}; } - static OutputIt back_inserter(std::basic_string& s) + static OutputIt back_inserter(typename OutputIt::string_type& s) { return OutputIt(s); } @@ -527,10 +536,10 @@ namespace unicode { typedef UTF, utf_back_insert_iterator> UTF_32; // From and To are facets - template::value && std::is_empty::value, bool> = true> - std::basic_string convert(const std::basic_string& s) + template::value, bool> = true> + typename To::string_type convert(const typename From::string_type& s) { - std::basic_string result; + typename To::string_type result; std::copy(From::begin(s), From::end(s), To::back_inserter(result)); @@ -561,27 +570,29 @@ namespace unicode { typedef UTF_32 Facet; }; - // From and To are from: utf8_t, char16_t and char32_t + // From and To are from: utf8_t (i.e. char or char8_t (C++20)), char16_t and char32_t, char, wchar_t, uint8_t, uint16_t, uint32_t template::value && std::is_trivial::value, bool> = true - > - std::basic_string convert(const std::basic_string& s) + typename FromContainer=std::basic_string, + typename ToContainer=std::basic_string, + std::enable_if_t::value && std::is_scalar::value && !std::is_empty::value, bool> = true> + ToContainer convert(const FromContainer& s) { typedef UTF, utf_back_insert_iterator> UTF_Trait; - std::basic_string result; + ToContainer result; std::copy(UTF_Trait::begin(s), UTF_Trait::end(s), UTF_Trait::back_inserter(result)); return result; } + // From and To are containers template::value && !std::is_empty::value, bool> = true > ToContainer convert(const FromContainer& s) { - typedef UTF, utf_back_insert_iterator> UTF_Trait; + typedef UTF, utf_back_insert_iterator> UTF_Trait; ToContainer result; @@ -590,9 +601,25 @@ namespace unicode { return result; } + // Container version + template::value, bool> = true> + bool is_valid_utf(const Container& s) + { + typedef UTF, utf_back_insert_iterator> UTF_Trait; + + try { + std::for_each(UTF_Trait::begin(s), UTF_Trait::end(s), [](const char32_t& c){}); + } catch (const std::invalid_argument&) { + return false; + } + return true; + } + // basic type version - template - bool is_valid_utf(const std::basic_string& s) + template, + std::enable_if_t::value && !std::is_empty::value, bool> = true> + bool is_valid_utf(const Container& s) { typedef UTF, utf_back_insert_iterator> UTF_Trait; @@ -605,8 +632,8 @@ namespace unicode { } // Facet version - template - bool is_valid_utf(const std::basic_string& s) + template::value, bool> = true> + bool is_valid_utf(const typename Facet::string_type& s) { try { std::for_each(Facet::begin(s), Facet::end(s), [](const char32_t& c){}); diff --git a/src/test-unicode.cpp b/src/test-unicode.cpp index 5f5ebbf..fbd4749 100644 --- a/src/test-unicode.cpp +++ b/src/test-unicode.cpp @@ -5,9 +5,12 @@ #include #include +#include #include +#include #include #include +#include #include #include #include @@ -98,14 +101,14 @@ void test_utf_to_utf(std::tuple& t) // test base type interface To result { unicode::convert(std::get(t)) }; + BOOST_CHECK_MESSAGE(std::get(t) == result, "Base: From " << typeid(typename From::value_type).name() << "(" << i << ", " << std::get(t) << ") to " << typeid(typename To::value_type).name() << "(" << j << ", " << std::get(t) << "), got " << result); - BOOST_CHECK_MESSAGE(std::get(t) == result, "Base: From " << typeid(From).name() << "(" << i << ", " << std::get(t) << ") to " << typeid(To).name() << "(" << j << ", " << std::get(t) << "), got " << result); + // test container interface + result = unicode::convert(std::get(t)); + BOOST_CHECK_MESSAGE(std::get(t) == result, "Container: From " << typeid(From).name() << "(" << i << ", " << std::get(t) << ") to " << typeid(To).name() << "(" << j << ", " << std::get(t) << "), got " << result); - //std::cout << std::to_string(std::tuple_size::type>::value) << "," << std::to_string(i) << "," << std::to_string(j) << std::endl; - // test facet interface result = unicode::convert::Facet, typename unicode::Encoding::Facet>(std::get(t)); - BOOST_CHECK_MESSAGE(std::get(t) == result, "Facet: From " << typeid(From).name() << "(" << i << ", " << std::get(t) << ") to " << typeid(To).name() << "(" << j << ", " << std::get(t) << "), got " << result); // iterate over other combinations @@ -132,6 +135,10 @@ void test_is_valid_utf(std::tuple& t) // test via basic type bool result { unicode::is_valid_utf(std::get(t)) }; + BOOST_CHECK_MESSAGE(result == true, "is_valid_utf w/ " << typeid(typename T::value_type).name() << "(" << i << ", " << std::get(t) << "), got " << result); + + // test via container type + result = unicode::is_valid_utf(std::get(t)); BOOST_CHECK_MESSAGE(result == true, "is_valid_utf w/ " << typeid(T).name() << "(" << i << ", " << std::get(t) << "), got " << result); // test via Facet @@ -158,7 +165,17 @@ void test_utf_to_utf_failure(std::basic_string& s) // via base type try { (void) unicode::convert(s); - BOOST_ERROR("Base: Expected exception at index: " << index << ", " << typeid(From).name() << " -> " << typeid(To).name()); + BOOST_ERROR("Base type: Expected exception at index: " << index << ", " << typeid(From).name() << " -> " << typeid(To).name()); + } catch (const std::invalid_argument&) { + // OK: this is an expected exception for convert() on bad input + } catch (const std::exception& ex) { + BOOST_ERROR("Unexpected error on convert(): " << ex.what()); + }; + + // via container + try { + (void) unicode::convert::Facet::string_type, typename unicode::Encoding::Facet::string_type>(s); + BOOST_ERROR("Container type: Expected exception at index: " << index << ", " << typeid(From).name() << " -> " << typeid(To).name()); } catch (const std::invalid_argument&) { // OK: this is an expected exception for convert() on bad input } catch (const std::exception& ex) { @@ -198,6 +215,8 @@ void test_is_valid_utf_failure(std::basic_string& s) { BOOST_CHECK_MESSAGE(unicode::is_valid_utf(s) == false, "Expected bad UTF at index: " << index << ", " << typeid(T).name()); + BOOST_CHECK_MESSAGE(unicode::is_valid_utf>(s) == false, "Expected bad UTF at index: " << index << ", " << typeid(T).name()); + BOOST_CHECK_MESSAGE(unicode::is_valid_utf::Facet>(s) == false, "Expected bad UTF at index: " << index << ", " << typeid(typename unicode::Encoding::Facet).name()); // iterate over remaining types @@ -275,6 +294,21 @@ void test_random(random_context& rc, size_t length) BOOST_ERROR("Unexpected error on convert(): " << ex.what()); } + // container type interface + try { + To result{unicode::convert(r)}; + + if (r.empty()) { + BOOST_CHECK(result.empty()); + } else { + BOOST_CHECK(!result.empty()); + } + } catch (const std::invalid_argument&) { + // OK: this is an expected exception for convert() on bad input + } catch (const std::exception& ex) { + BOOST_ERROR("Unexpected error on convert(): " << ex.what()); + } + // facet interface try { To result{unicode::convert::Facet,typename unicode::Encoding::Facet>(r)}; @@ -331,7 +365,7 @@ BOOST_AUTO_TEST_CASE(convert) BOOST_CHECK((unicode::convert("abc")) == std::u16string{u"abc"}); BOOST_CHECK((unicode::convert(U"abc")) == std::u16string{u"abc"}); - + BOOST_CHECK((unicode::convert("abc")) == std::u16string{u"abc"}); BOOST_CHECK((unicode::convert(U"abc")) == std::u16string{u"abc"}); @@ -354,7 +388,37 @@ BOOST_AUTO_TEST_CASE(convert) BOOST_CHECK((unicode::convert(std::string{"äöü"})) == std::wstring{L"äöü"}); - //BOOST_CHECK((unicode::convert, std::vector>(std::vector{})) == std::vector{}); + BOOST_CHECK((unicode::convert, std::vector>(std::vector{})) == std::vector{}); + BOOST_CHECK((unicode::convert, std::vector>(std::vector{'\xc3', '\xa4', '\xc3', '\xb6', '\xc3', '\xbc'})) == (std::vector{L'ä', L'ö', L'ü'})); + + // deque + BOOST_CHECK((unicode::convert, std::deque>(std::deque{})) == std::deque{}); + BOOST_CHECK((unicode::convert, std::deque>(std::deque{'\xc3', '\xa4', '\xc3', '\xb6', '\xc3', '\xbc'})) == (std::deque{L'ä', L'ö', L'ü'})); + + // deque with uint8_t, uint16_t + BOOST_CHECK((unicode::convert, std::deque>(std::deque{})) == std::deque{}); + BOOST_CHECK((unicode::convert, std::deque>(std::deque{0xc3, 0xa4, 0xc3, 0xb6, 0xc3, 0xbc})) == (std::deque{L'ä', L'ö', L'ü'})); + + // deque with int8_t, int16_t + BOOST_CHECK((unicode::convert, std::deque>(std::deque{ + static_cast(0xc3), + static_cast(0xa4), + static_cast(0xc3), + static_cast(0xb6), + static_cast(0xc3), + static_cast(0xbc)})) == (std::deque{L'ä', L'ö', L'ü'})); + + // list + BOOST_CHECK((unicode::convert, std::list>(std::list{})) == std::list{}); + BOOST_CHECK((unicode::convert, std::list>(std::list{0xc3, 0xa4, 0xc3, 0xb6, 0xc3, 0xbc})) == (std::list{L'ä', L'ö', L'ü'})); + + // list -> deque + BOOST_CHECK((unicode::convert, std::deque>(std::list{})) == std::deque{}); + BOOST_CHECK((unicode::convert, std::deque>(std::list{0xc3, 0xa4, 0xc3, 0xb6, 0xc3, 0xbc})) == (std::deque{L'ä', L'ö', L'ü'})); + + // array + BOOST_CHECK((unicode::convert, std::list>(std::array{})) == std::list{}); + BOOST_CHECK((unicode::convert, std::list>(std::array{0xc3, 0xa4, 0xc3, 0xb6, 0xc3, 0xbc})) == (std::list{L'ä', L'ö', L'ü'})); } BOOST_AUTO_TEST_CASE(is_valid_utf) @@ -376,8 +440,3 @@ BOOST_AUTO_TEST_CASE(string_u8string) BOOST_CHECK(a == std::string{"\xc3\xa4"}); } - -// TODO: -// -// string, vector? -// uint8_t, uint16_t, uint32_t? -- cgit v1.2.3