// Boost.Test module "unicode_test": unit tests for unicode::convert,
// unicode::is_valid_utf and unicode::is_valid_unicode.
//
// Layout of this file:
//  - success_sets: the same text as UTF-8, UTF-16 and UTF-32 literals (built
//    with SUCCESS_TUPLE); iterated by the *_success test cases.
//  - failure_strings_char8_t / _char16_t / _char32_t: malformed input per code
//    unit type; iterated by the *_failure test cases.
//  - "check environment" test cases: show how boost::locale::conv::utf_to_utf
//    and std_convert react to the same malformed UTF-8 input.
//  - remaining test cases: ISO 8859 conversions, container variants, wchar_t
//    (platform dependent) and the exact exception messages of convert().
//
// NOTE(review): in this copy the #include targets and the template argument
// lists (everything in angle brackets) appear to be missing and line breaks
// are collapsed -- confirm against the canonical source before building.
#define BOOST_TEST_MODULE unicode_test #include #include #include #include #include #include #if BOOST_VERSION > 106700 // CPU Timer in Debian 10 boost is broken, so leave it to std::chrono wall clock #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "test-helper.h" using namespace std::chrono_literals; using namespace std::string_literals; typedef std::tuple, std::basic_string, std::basic_string> types_collection_type; // create tuple of the same string, in UTF-8, UTF-16 and UTF-32 #define SUCCESS_TUPLE(x) {u8 ## x, u ## x, U ## x} // Interesting Unicode ranges for testing corner cases: // 0x0 - smallest Unicode value // 0x7F - highest 1-byte UTF-8 value // 0x7FF - highest 2-byte UTF-8 value // 0xD800 - smallest UTF-16 high surrogate (invalid range for unicode code points) // 0xDBFF - highest UTF-16 high surrogate (invalid range for unicode code points) // 0xDC00 - smallest UTF-16 low surrogate (invalid range for unicode code points) // 0xDFFF - highest UTF-16 low surrogate (invalid range for unicode code points) // 0xFFFF - highest 3-byte UTF-8 value, highest 2-byte UTF-16 value // 0x10FFFF - highest Unicode value // = highest value that can be encoded in UTF-16 // 0x1FFFFF - highest value that can be encoded in UTF-8 (with 4-byte limit) // 0xFFFFFFFF - highest value that can be encoded in UTF-32 // Success cases: convert string to all other types, respectively std::vector success_sets { SUCCESS_TUPLE(""), SUCCESS_TUPLE("\0"), SUCCESS_TUPLE("0"), // various strings SUCCESS_TUPLE("ASCII string1"), SUCCESS_TUPLE("Täst just looks like German"), SUCCESS_TUPLE("\u732b is chinese for cat"), SUCCESS_TUPLE("\U0001F63A"), SUCCESS_TUPLE("\U0001F63A is a smiling cat"), // separators SUCCESS_TUPLE("abc\r\ndef"), SUCCESS_TUPLE("äöü\0\u20ac"), SUCCESS_TUPLE("äöü0\u20ac"), // UTF-8 specific: 2 bytes encodings SUCCESS_TUPLE("\u0080"), SUCCESS_TUPLE("\u0101"), 
SUCCESS_TUPLE("text1\u0101text2"), SUCCESS_TUPLE("\u0101text2"), SUCCESS_TUPLE("text1\u0101"), SUCCESS_TUPLE("\u0701"), SUCCESS_TUPLE("\u07FF"), // UTF-8 specific: 3 bytes encodings SUCCESS_TUPLE("\u0800"), SUCCESS_TUPLE("context1\u0800context2"), SUCCESS_TUPLE("\u0800context2"), SUCCESS_TUPLE("context1\u0800"), SUCCESS_TUPLE("context1\u0800\u0901"), SUCCESS_TUPLE("context1\u0800\u0901context"), SUCCESS_TUPLE("\u1234"), SUCCESS_TUPLE("\u4321"), SUCCESS_TUPLE("\uFFFF"), // UTF-8 specific: 4 bytes encodings SUCCESS_TUPLE("\U00010000"), SUCCESS_TUPLE("\U00043210"), SUCCESS_TUPLE("context1\U00043210context2"), SUCCESS_TUPLE("\U00043210context2"), SUCCESS_TUPLE("context1\U00043210"), SUCCESS_TUPLE("context1\U00043210\U00012345"), SUCCESS_TUPLE("context1\U00043210\U00012345context2"), SUCCESS_TUPLE("\U0010FFFF"), // UTF-8 specific: mixed encodings SUCCESS_TUPLE("abc\u0123\u4321\U00010000\u1234\u0321xyz"), // UTF-16 specific: corner cases of surrogates SUCCESS_TUPLE("\uD7FFcontext\uD7FF"), SUCCESS_TUPLE("\uD7FFcontext\uE000"), // optimization relevant strings SUCCESS_TUPLE("01234567\u20ac01234567"), SUCCESS_TUPLE("0123456\u20ac01234567"), SUCCESS_TUPLE("012345\u20ac01234567"), SUCCESS_TUPLE("01234\u20ac01234567"), SUCCESS_TUPLE("0123\u20ac01234567"), SUCCESS_TUPLE("012\u20ac01234567"), SUCCESS_TUPLE("01\u20ac01234567"), SUCCESS_TUPLE("0\u20ac01234567"), SUCCESS_TUPLE("\u20ac01234567"), SUCCESS_TUPLE("0123456701234567\u20ac0123456701234567"), SUCCESS_TUPLE("012345670123456\u20ac0123456701234567"), SUCCESS_TUPLE("01234567012345\u20ac0123456701234567"), SUCCESS_TUPLE("0123456701234\u20ac0123456701234567"), SUCCESS_TUPLE("012345670123\u20ac0123456701234567"), SUCCESS_TUPLE("01234567012\u20ac0123456701234567"), SUCCESS_TUPLE("0123456701\u20ac0123456701234567"), SUCCESS_TUPLE("012345670\u20ac0123456701234567"), SUCCESS_TUPLE("01234567\u20ac0123456701234567"), }; // Error cases: throwing upon convert to all other types std::vector> failure_strings_char8_t { // using u8"" here 
doesn't work on MSVC (utf8_t*)"\x80", // utf-8 continuation byte (utf8_t*)"text1\x81text2", (utf8_t*)"\x82text2", (utf8_t*)"text1\x83", (utf8_t*)"\xc3\xc3\xa4", // initial byte of utf-8 "ä", followed by valid utf-8 "ä" (utf8_t*)"text1\xc3text2\xc3\xa4text3", (utf8_t*)"\xc3text2\xc3\xa4text3", (utf8_t*)"text1\xc3\xc3\xa4text3", (utf8_t*)"text1\xc3text2\xc3\xa4", (utf8_t*)"\xF8\x80\x80\x80\x80", // overlong encoding of valid code point (utf8_t*)"text1\xF8\x80\x80\x80\x80text2", (utf8_t*)"\xF8\x80\x80\x80\x80text2", (utf8_t*)"text1\xF8\x80\x80\x80\x80", (utf8_t*)"\xF7\xBF\xBF\xBF", // valid encoding of invalid code point }; std::vector> failure_strings_char16_t { u"\xD801", // single high surrogate u"text1\xD801text2", u"\xD801text2", u"text1\xD801", u"\xD800\xD800", // double high surrogate u"\xD801\xD802", u"\xDBFF\xDBFF", u"\xDBFE\xDBFD", u"\xDFFF", // single low surrogate u"text1\xDFFFtext2", u"\xDFFFtext2", u"text1\xDFFF", u"\xDFFF\xDFFF", // double low surrogate u"\xDC00\xDC00", u"\xDC01\xDFFE", u"\xDFFE\xDC01", u"\xDFFF\xD801", // bad surrogate pair order u"text1\xDFFF\xD801text2", u"\xDFFF\xD801text2", u"text1\xDFFF\xD801", u"\xDC00\xDBFF", u"\xDC00\xDBFE", u"\xDC01\xDBFE", u"\xDC01\xDBFF", }; std::vector> failure_strings_char32_t { U"\xD800 and more text", // invalid unicode (surrogate half) U"blabla \xD801", // invalid unicode (surrogate half) U"moreblabla \xDFFF", // invalid unicode (surrogate half) U"\x10000000", // invalid unicode (number too big) U"\x1111111", U"\x110000", U"\x110001\x110002\x110003", U"\x7FFFFFFF", U"\xFFFFFFF", }; // check assumptions about environment BOOST_AUTO_TEST_CASE(string_u8string) { std::string a{"\xc3\xa4"}; std::basic_string b{a.begin(), a.end()}; BOOST_CHECK(b == std::basic_string{u8"ä"}); a = std::string{b.begin(), b.end()}; BOOST_CHECK(a == std::string{"\xc3\xa4"}); BOOST_CHECK(sizeof(size_t) == 4 || sizeof(size_t) == 8); std::cout << "Detected CPU Accu size: " << (sizeof(size_t) * 8) << std::endl; } // check environment: 
demonstrate how boost convert u8->u8 throws exception on invalid input BOOST_AUTO_TEST_CASE(utf_to_utf_failure_boost_u8_u8) { for (auto& s: failure_strings_char8_t) { try { auto result1{boost::locale::conv::utf_to_utf(s, boost::locale::conv::stop)}; BOOST_FAIL("Expected boost convert to fail"); } catch(...) { // expected } } } // check environment: demonstrate how boost convert u8->u16 throws exception on invalid input BOOST_AUTO_TEST_CASE(utf_to_utf_failure_boost_u8_u16) { for (auto& s: failure_strings_char8_t) { try { auto result{boost::locale::conv::utf_to_utf(s, boost::locale::conv::stop)}; BOOST_FAIL("Expected boost convert to fail"); } catch(...) { // expected } } } // check environment: demonstrate how std u8->u8 throws exception on invalid input BOOST_AUTO_TEST_CASE(utf_to_utf_failure_std_u8_u8) { for (auto& s: failure_strings_char8_t) { try { auto result{std_convert(s)}; #ifdef _WIN32 std::cout << "Conversion error from MSVC STDC++ for: "s + std::string{ s.begin(), s.end() } + ", result size: " + std::to_string(result.size()) << std::endl; std::cout << "Note: MSVC's implementation is known to be broken, ignoring." << std::endl; #else BOOST_FAIL(("Expected std_convert to fail for: "s + std::string{ s.begin(), s.end() } + ", result size: " + std::to_string(result.size())).c_str()); #endif } catch(...) { // expected } } } // check environment: demonstrate how std u8->u16 throws exception on invalid input BOOST_AUTO_TEST_CASE(utf_to_utf_failure_std_u8_u16) { for (auto& s: failure_strings_char8_t) { try { auto result{std_convert(s)}; #ifdef _WIN32 std::cout << "Conversion error from MSVC STDC++ for: "s + std::string{ s.begin(), s.end() } + ", result size: " + std::to_string(result.size()) << std::endl; std::cout << "Note: MSVC's implementation is known to be broken, ignoring." 
<< std::endl; #else BOOST_FAIL(("Expected std_convert to fail for: "s + std::string{ s.begin(), s.end() } + ", result size: " + std::to_string(result.size())).c_str()); #endif } catch(...) { // expected } } } // Checks unicode::convert() from tuple element i to tuple element j through the base type, container, encoding and std::vector interfaces, compares the result with boost::locale::conv::utf_to_utf, then recurses over the remaining (i, j) combinations template void test_utf_to_utf(std::tuple& t) { typedef typename std::tuple_element::type>::type From; typedef typename std::tuple_element::type>::type To; // test base type interface To result { unicode::convert(std::get(t)) }; BOOST_CHECK_MESSAGE(std::get(t) == result, "Base: From " << typeid(typename From::value_type).name() << "(" << i << ", " << std::get(t) << ") to " << typeid(typename To::value_type).name() << "(" << j << ", " << std::get(t) << "), got " << result); // test container interface result = unicode::convert(std::get(t)); BOOST_CHECK_MESSAGE(std::get(t) == result, "Container: From " << typeid(From).name() << "(" << i << ", " << std::get(t) << ") to " << typeid(To).name() << "(" << j << ", " << std::get(t) << "), got " << result); // test encoding interface result = unicode::convert, typename unicode::Encoding_t>(std::get(t)); BOOST_CHECK_MESSAGE(std::get(t) == result, "Encoding: From " << typeid(From).name() << "(" << i << ", " << std::get(t) << ") to " << typeid(To).name() << "(" << j << ", " << std::get(t) << "), got " << result); // test container interface with std::vector instead of std::string auto string_value{std::get(t)}; auto string_reference{std::get(t)}; std::vector vector_value{string_value.begin(), string_value.end()}; std::vector vector_reference{string_reference.begin(), string_reference.end()}; std::vector vector_result { unicode::convert, std::vector>(vector_value)}; BOOST_CHECK_MESSAGE(vector_reference == vector_result, "Vector Container: From " << typeid(From).name() << "(" << i << ", " << std::get(t) << ") to " << typeid(To).name() << "(" << j << ", " << std::get(t) << ")"); // test actual results by comparing with boost::locale::conv results BOOST_CHECK_EQUAL(result, (boost::locale::conv::utf_to_utf(std::get(t)))); // 
iterate over other combinations if constexpr (i + 1 < std::tuple_size::type>::value) test_utf_to_utf(t); else if constexpr (j + 1 < std::tuple_size::type>::value) test_utf_to_utf<0, j + 1>(t); } // We don't use BOOST_DATA_TEST_CASE here because boost::test tries to assign // a new variable to each tuple element which we don't want // https://lists.boost.org/boost-bugs/2016/05/45214.php BOOST_AUTO_TEST_CASE(utf_to_utf_success) { for (auto& t: success_sets) test_utf_to_utf(t); } // Checks that unicode::is_valid_utf() accepts tuple element i through the basic type, container, Encoding and std::vector interfaces, then recurses over the remaining tuple elements template void test_is_valid_utf(std::tuple& t) { typedef typename std::tuple_element::type>::type T; // test via basic type bool result { unicode::is_valid_utf(std::get(t)) }; BOOST_CHECK_MESSAGE(result == true, "is_valid_utf w/ " << typeid(typename T::value_type).name() << "(" << i << ", " << std::get(t) << "), got " << result); // test via container type result = unicode::is_valid_utf(std::get(t)); BOOST_CHECK_MESSAGE(result == true, "is_valid_utf w/ " << typeid(T).name() << "(" << i << ", " << std::get(t) << "), got " << result); // test via Encoding result = unicode::is_valid_utf>(std::get(t)); BOOST_CHECK_MESSAGE(result == true, "is_valid_utf w/ " << typeid(typename unicode::Encoding_t).name() << "(" << i << ", " << std::get(t) << "), got " << result); // test via other container type auto string_value{std::get(t)}; std::vector vector_value{string_value.begin(), string_value.end()}; result = unicode::is_valid_utf>(vector_value); BOOST_CHECK_MESSAGE(result == true, "is_valid_utf w/ " << typeid(T).name() << "(" << i << ", " << std::get(t) << "), got " << result); // iterate over other combinations if constexpr (i + 1 < std::tuple_size::type>::value) test_is_valid_utf(t); } BOOST_AUTO_TEST_CASE(is_valid_utf_success) { for (auto& t: success_sets) test_is_valid_utf(t); } // iterate over std::tuple T types template void test_utf_to_utf_failure(std::basic_string& s) { typedef typename std::tuple_element::type::value_type To; // via base type try { (void) unicode::convert(s); BOOST_ERROR("Base 
type: Expected exception at index: " << index << ", " << typeid(From).name() << " -> " << typeid(To).name()); } catch (const std::invalid_argument&) { // OK: this is an expected exception for convert() on bad input } catch (const std::exception& ex) { BOOST_ERROR("Unexpected error on convert(): " << ex.what()); }; // via container try { (void) unicode::convert::string_type, typename unicode::Encoding_t::string_type>(s); BOOST_ERROR("Container type: Expected exception at index: " << index << ", " << typeid(From).name() << " -> " << typeid(To).name()); } catch (const std::invalid_argument&) { // OK: this is an expected exception for convert() on bad input } catch (const std::exception& ex) { BOOST_ERROR("Unexpected error on convert(): " << ex.what()); }; // via encoding try { (void) unicode::convert,typename unicode::Encoding_t>(s); BOOST_ERROR("Encoding: Expected exception at index: " << index << ", " << typeid(From).name() << " -> " << typeid(To).name()); } catch (const std::invalid_argument&) { // OK: this is an expected exception for convert() on bad input } catch (const std::exception& ex) { BOOST_ERROR("Unexpected error on convert(): " << ex.what()); }; // via other container type try { std::vector vector_value{s.begin(), s.end()}; (void) unicode::convert, std::vector>(vector_value); BOOST_ERROR("Vector container type: Expected exception at index: " << index << ", " << typeid(From).name() << " -> " << typeid(To).name()); } catch (const std::invalid_argument&) { // OK: this is an expected exception for convert() on bad input } catch (const std::exception& ex) { BOOST_ERROR("Unexpected error on convert(): " << ex.what()); }; // iterate over remaining types if constexpr (index + 1 < std::tuple_size::value) test_utf_to_utf_failure(s); } BOOST_AUTO_TEST_CASE(utf_to_utf_failure) { for (auto& s: failure_strings_char8_t) test_utf_to_utf_failure::type::value_type, types_collection_type>(s); for (auto& s: failure_strings_char16_t) 
test_utf_to_utf_failure::type::value_type, types_collection_type>(s); for (auto& s: failure_strings_char32_t) test_utf_to_utf_failure::type::value_type, types_collection_type>(s); } // iterate over std::tuple T types template void test_is_valid_utf_failure(std::basic_string& s) { BOOST_CHECK_MESSAGE(unicode::is_valid_utf(s) == false, "Expected bad UTF at index: " << index << ", " << typeid(T).name()); BOOST_CHECK_MESSAGE(unicode::is_valid_utf>(s) == false, "Expected bad UTF at index: " << index << ", " << typeid(T).name()); BOOST_CHECK_MESSAGE(unicode::is_valid_utf>(s) == false, "Expected bad UTF at index: " << index << ", " << typeid(typename unicode::Encoding_t).name()); std::vector vector_value{s.begin(), s.end()}; BOOST_CHECK_MESSAGE(unicode::is_valid_utf>(vector_value) == false, "Expected bad UTF at index: " << index << ", " << typeid(T).name()); // iterate over remaining types if constexpr (index + 1 < std::tuple_size::value) test_is_valid_utf_failure(s); } BOOST_AUTO_TEST_CASE(is_valid_utf_failure) { for (auto& s: failure_strings_char8_t) test_is_valid_utf_failure::type::value_type, types_collection_type>(s); for (auto& s: failure_strings_char16_t) test_is_valid_utf_failure::type::value_type, types_collection_type>(s); for (auto& s: failure_strings_char32_t) test_is_valid_utf_failure::type::value_type, types_collection_type>(s); } BOOST_AUTO_TEST_CASE(is_valid_unicode) { BOOST_CHECK(unicode::is_valid_unicode('\0')); BOOST_CHECK(unicode::is_valid_unicode(U'a')); BOOST_CHECK(unicode::is_valid_unicode(U'ä')); BOOST_CHECK(unicode::is_valid_unicode(U'\u732b')); // cat chinese BOOST_CHECK(unicode::is_valid_unicode(U'\U0001F63A')); // cat smiley BOOST_CHECK(unicode::is_valid_unicode(0x0001F63A)); // cat smiley BOOST_CHECK(!unicode::is_valid_unicode(0x00110000)); BOOST_CHECK(!unicode::is_valid_unicode(0xFFFFFFFF)); // U"\UFFFFFFFF" is invalid C++ BOOST_CHECK(!unicode::is_valid_unicode(0x01234567)); BOOST_CHECK(!unicode::is_valid_unicode(0x12345678)); 
BOOST_CHECK(!unicode::is_valid_unicode(0xD800)); BOOST_CHECK(!unicode::is_valid_unicode(0xD987)); BOOST_CHECK(!unicode::is_valid_unicode(0xDFFF)); } // Test ISO encodings BOOST_AUTO_TEST_CASE(convert_iso) { BOOST_CHECK((std::string{unicode::convert({})}) == std::string{}); BOOST_CHECK((std::string{unicode::convert("abc")}) == std::string{"abc"}); BOOST_CHECK((std::string{unicode::convert("\xe4\xf6\xfc")}) == std::string{"\xe4\xf6\xfc"}); // Latin-1 äöü BOOST_CHECK((std::string{unicode::convert("\xa4")}) == std::string{"\xa4"}); // € BOOST_CHECK((std::string{unicode::convert({})}) == std::string{}); BOOST_CHECK((std::string{unicode::convert("abc")}) == std::string{"abc"}); BOOST_CHECK((std::string{unicode::convert("\xe4\xf6\xfc")}) == std::string{"\xe4\xf6\xfc"}); // Latin-1 äöü BOOST_CHECK((std::string{unicode::convert("\xa4")}) == std::string{"\xa4"}); // € BOOST_CHECK_THROW(((void)std::string{unicode::convert("\xa4")}), std::invalid_argument); BOOST_CHECK_THROW(((void)std::string{unicode::convert("\xa4")}), std::invalid_argument); // € not available in ISO-8859-1 } // Test conversion between ISO and UTF encodings BOOST_AUTO_TEST_CASE(convert_iso_utf) { BOOST_CHECK_THROW(((void)std::string{unicode::convert(u8"\u20ac")}), std::invalid_argument); BOOST_CHECK_THROW(((void)std::string{unicode::convert(u"\u20ac")}), std::invalid_argument); BOOST_CHECK_THROW(((void)std::string{unicode::convert(U"\u20ac")}), std::invalid_argument); BOOST_CHECK((std::string{unicode::convert(u8"\u20ac")}) == std::string{"\xa4"}); // € BOOST_CHECK((std::string{unicode::convert(u"\u20ac")}) == std::string{"\xa4"}); // € BOOST_CHECK((std::string{unicode::convert(U"\u20ac")}) == std::string{"\xa4"}); // € BOOST_CHECK_THROW(((void)std::string{unicode::convert(u8"\u00A4")}), std::invalid_argument); // currency sign: Latin-1, but not Latin-15 BOOST_CHECK((std::string{unicode::convert(u8"\u00A4")}) == std::string{"\xa4"}); BOOST_CHECK_THROW(((void)std::string{unicode::convert(u8"\u732b")}), 
std::invalid_argument); BOOST_CHECK_THROW(((void)std::string{unicode::convert(u"\u732b")}), std::invalid_argument); BOOST_CHECK_THROW(((void)std::string{unicode::convert(U"\u732b")}), std::invalid_argument); } // Test UTF encodings BOOST_AUTO_TEST_CASE(convert_utf) { BOOST_CHECK_THROW((unicode::convert(std::u32string{(char32_t*)"\x00\xD8\x00\x00\x00\x00\x00\x00"})) , std::invalid_argument); BOOST_CHECK((unicode::convert(u8"abc")) == std::u16string{u"abc"}); BOOST_CHECK((unicode::convert(U"abc")) == std::u16string{u"abc"}); BOOST_CHECK((unicode::convert(u8"a\0bc")) == std::u16string{u"a\0bc"}); BOOST_CHECK((unicode::convert(u8"abc")) == std::u16string{u"abc"}); BOOST_CHECK((unicode::convert(U"abc")) == std::u16string{u"abc"}); BOOST_CHECK((unicode::convert("äöü")) == std::u32string{U"äöü"}); BOOST_CHECK((unicode::convert(u"\xD800\xDC00")) == std::u32string{U"\U00010000"}); BOOST_CHECK((unicode::convert(u"\xD800\xDC01")) == std::u32string{U"\U00010001"}); BOOST_CHECK((unicode::convert(u"\xD810\xDC01")) == std::u32string{U"\U00014001"}); // vector BOOST_CHECK((unicode::convert, std::vector>(std::vector{})) == std::vector{}); BOOST_CHECK((unicode::convert, std::vector>(std::vector{'\xc3', '\xa4', '\xc3', '\xb6', '\xc3', '\xbc'})) == (std::vector{u'ä', u'ö', u'ü'})); // deque BOOST_CHECK((unicode::convert, std::deque>(std::deque{})) == std::deque{}); BOOST_CHECK((unicode::convert, std::deque>(std::deque{'\xc3', '\xa4', '\xc3', '\xb6', '\xc3', '\xbc'})) == (std::deque{L'ä', L'ö', L'ü'})); // yet unsupported: //BOOST_CHECK((unicode::convert(std::deque{u8'\xc3', u8'\xa4', u8'\xc3', u8'\xb6', u8'\xc3', u8'\xbc'})) == (std::deque{u'ä', u'ö', u'ü'})); //BOOST_CHECK((unicode::convert(std::deque{u8'\xc3', u8'\xa4', u8'\xc3', u8'\xb6', u8'\xc3', u8'\xbc'})) == (std::deque{u'ä', u'ö', u'ü'})); // deque with uint8_t, uint16_t BOOST_CHECK((unicode::convert, std::deque>(std::deque{})) == std::deque{}); BOOST_CHECK((unicode::convert, std::deque>(std::deque{0xc3, 0xa4, 0xc3, 0xb6, 
0xc3, 0xbc})) == (std::deque{L'ä', L'ö', L'ü'})); // deque with int8_t, int16_t BOOST_CHECK((unicode::convert, std::deque>(std::deque{ static_cast(0xc3), static_cast(0xa4), static_cast(0xc3), static_cast(0xb6), static_cast(0xc3), static_cast(0xbc)})) == (std::deque{L'ä', L'ö', L'ü'})); // list BOOST_CHECK((unicode::convert, std::list>(std::list{})) == std::list{}); BOOST_CHECK((unicode::convert, std::list>(std::list{0xc3, 0xa4, 0xc3, 0xb6, 0xc3, 0xbc})) == (std::list{L'ä', L'ö', L'ü'})); // list -> deque BOOST_CHECK((unicode::convert, std::deque>(std::list{})) == std::deque{}); BOOST_CHECK((unicode::convert, std::deque>(std::list{0xc3, 0xa4, 0xc3, 0xb6, 0xc3, 0xbc})) == (std::deque{L'ä', L'ö', L'ü'})); // array BOOST_CHECK((unicode::convert, std::list>(std::array{})) == std::list{}); BOOST_CHECK((unicode::convert, std::list>(std::array{0xc3, 0xa4, 0xc3, 0xb6, 0xc3, 0xbc})) == (std::list{L'ä', L'ö', L'ü'})); } // wchar_t specific tests: system dependent BOOST_AUTO_TEST_CASE(convert_wstring) { #ifdef _WIN32 BOOST_CHECK(sizeof(wchar_t) == 2); #else // Unix like BOOST_CHECK(sizeof(wchar_t) == 4); #endif // For the following checks, wchar_t size and encoding is system dependent: // Windows: UTF-16 // Linux: UTF-32 BOOST_CHECK((unicode::convert("äöü")) == std::wstring{L"äöü"}); BOOST_CHECK((unicode::convert("\u732b")) == std::wstring{L"\u732b"}); BOOST_CHECK((unicode::convert("\U0001F63A")) == std::wstring{L"\U0001F63A"}); BOOST_CHECK((unicode::convert(L"\U0001F63A")) == std::u32string{U"\U0001F63A"}); BOOST_CHECK((unicode::convert(L"\U0001F63A")) == std::basic_string{(utf8_t*)"\U0001F63A"}); BOOST_CHECK((unicode::convert(std::string{"äöü"})) == std::wstring{L"äöü"}); BOOST_CHECK((unicode::convert, std::vector>(std::vector{})) == std::vector{}); BOOST_CHECK((unicode::convert, std::vector>(std::vector{'\xc3', '\xa4', '\xc3', '\xb6', '\xc3', '\xbc'})) == (std::vector{L'ä', L'ö', L'ü'})); std::u16string u16_value{u"\U0001F63A"}; std::u32string u32_value{U"\U0001F63A"}; 
std::wstring w_value{L"\U0001F63A"}; std::u16string result_u16_value{unicode::convert(w_value)}; std::u32string result_u32_value{unicode::convert(w_value)}; std::wstring result_w_value_1{unicode::convert(u16_value)}; std::wstring result_w_value_2{unicode::convert(u32_value)}; BOOST_CHECK_EQUAL(u16_value.size(), 2); BOOST_CHECK_EQUAL(u32_value.size(), 1); BOOST_CHECK_EQUAL(result_u16_value.size(), 2); BOOST_CHECK_EQUAL(result_u32_value.size(), 1); BOOST_CHECK_EQUAL(u16_value, result_u16_value); BOOST_CHECK_EQUAL(u32_value, result_u32_value); BOOST_CHECK(w_value == result_w_value_1); BOOST_CHECK(w_value == result_w_value_2); #ifdef _WIN32 BOOST_CHECK_EQUAL(w_value.size(), 2); BOOST_CHECK_EQUAL(result_w_value_1.size(), 2); BOOST_CHECK_EQUAL(result_w_value_2.size(), 2); #else // Unix like BOOST_CHECK_EQUAL(w_value.size(), 1); BOOST_CHECK_EQUAL(result_w_value_1.size(), 1); BOOST_CHECK_EQUAL(result_w_value_2.size(), 1); #endif } BOOST_AUTO_TEST_CASE(is_valid_utf) { BOOST_CHECK(unicode::is_valid_utf(u"äöü")); BOOST_CHECK(unicode::is_valid_utf(u8"äöü")); } BOOST_AUTO_TEST_CASE(exceptions) { { // UTF-8: Incomplete string std::vector x{(utf8_t)'\xC0'}; try { auto result{unicode::convert,std::vector>(x)}; BOOST_FAIL("Expected boost convert to fail"); } catch (const std::invalid_argument& ex) { BOOST_CHECK_EQUAL("Bad input: Not enough bytes left for decoding UTF-8 sequence"s, ex.what()); } catch (...) { BOOST_ERROR("Unexpected error on convert"); } } { // UTF-8: Encoded value too high std::vector x{(utf8_t)'\xF7', (utf8_t)'\xBF', (utf8_t)'\xBF', (utf8_t)'\xBF'}; try { auto result{unicode::convert,std::vector>(x)}; BOOST_FAIL("Expected boost convert to fail"); } catch (const std::invalid_argument& ex) { BOOST_CHECK_EQUAL("Invalid Unicode character: 2097151"s, ex.what()); } catch (...) 
{ BOOST_ERROR("Unexpected error on convert"); } } { // UTF-8: Overlong encoding std::vector x{(utf8_t)'\xF8', (utf8_t)'\x80', (utf8_t)'\x80', (utf8_t)'\x80', (utf8_t)'\x80'}; try { auto result{unicode::convert,std::vector>(x)}; BOOST_FAIL("Expected boost convert to fail"); } catch (const std::invalid_argument& ex) { BOOST_CHECK_EQUAL("Bad UTF-8 input: Invalid 4 byte sequence"s, ex.what()); } catch (...) { BOOST_ERROR("Unexpected error on convert"); } } { // UTF-16: Incomplete surrogate encoding std::vector x{(char16_t)u'\xD800'}; try { auto result{unicode::convert,std::vector>(x)}; BOOST_FAIL("Expected boost convert to fail"); } catch (const std::invalid_argument& ex) { BOOST_CHECK_EQUAL("Bad input: Continuation of first UTF-16 unit missing"s, ex.what()); } catch (...) { BOOST_ERROR("Unexpected error on convert"); } } { // UTF-16: Invalid surrogates encoding std::vector x{(char16_t)u'\xD800', (char16_t)u'\xD800'}; try { auto result{unicode::convert,std::vector>(x)}; BOOST_FAIL("Expected boost convert to fail"); } catch (const std::invalid_argument& ex) { BOOST_CHECK_EQUAL("Bad input: 2 malformed UTF-16 surrogates"s, ex.what()); } catch (...) { BOOST_ERROR("Unexpected error on convert"); } } { // UTF-32: Invalid value std::vector x{(char32_t)U'\xFFFFFFFF'}; try { auto result{unicode::convert,std::vector>(x)}; BOOST_FAIL("Expected boost convert to fail"); } catch (const std::invalid_argument& ex) { BOOST_CHECK_EQUAL("Invalid Unicode character: 4294967295"s, ex.what()); } catch (...) { BOOST_ERROR("Unexpected error on convert"); } } { // ISO: Invalid value (from Unicode) std::u32string x{U"\U00000123"}; try { auto result{unicode::convert(x)}; BOOST_FAIL("Expected boost convert to fail"); } catch (const std::invalid_argument& ex) { BOOST_CHECK_EQUAL("Bad ISO 8859 value above 255: 291"s, ex.what()); } catch (...) 
{ BOOST_ERROR("Unexpected error on convert"); } } { // ISO: Invalid 8-bit value that can't be mapped (from Unicode) std::u32string x{U"\U000000BC"}; try { auto result{unicode::convert(x)}; BOOST_FAIL("Expected boost convert to fail"); } catch (const std::invalid_argument& ex) { BOOST_CHECK_EQUAL("Bad Unicode value to map to ISO 8859-15: 188"s, ex.what()); } catch (...) { BOOST_ERROR("Unexpected error on convert"); } } { // ISO: Invalid 8-bit value that can't be mapped between ISO-8859-1 and ISO-8859-15 std::string x{"\xBC"}; try { auto result{unicode::convert(x)}; BOOST_FAIL("Expected boost convert to fail"); } catch (const std::invalid_argument& ex) { BOOST_CHECK_EQUAL("Bad Unicode value to map to ISO 8859-15: 188"s, ex.what()); } catch (...) { BOOST_ERROR("Unexpected error on convert"); } } { // ISO: Invalid 8-bit value that can't be mapped between ISO-8859-1 and ISO-8859-15 (338 is U+0152; presumably the ISO-8859-15 to ISO-8859-1 direction, TODO confirm) std::string x{"\xBC"}; try { auto result{unicode::convert(x)}; BOOST_FAIL("Expected boost convert to fail"); } catch (const std::invalid_argument& ex) { BOOST_CHECK_EQUAL("Bad ISO 8859 value above 255: 338"s, ex.what()); } catch (...) { BOOST_ERROR("Unexpected error on convert"); } } { // Conversion from UTF-x to UTF-x: Fast mode just validating (resulting in error) std::u16string x{u"\xD800"}; try { auto result{unicode::convert(x)}; BOOST_FAIL("Expected boost convert to fail"); } catch (const std::invalid_argument& ex) { BOOST_CHECK_EQUAL("Invalid UTF input"s, ex.what()); } catch (...) { BOOST_ERROR("Unexpected error on convert"); } } #if defined(_WIN32) || defined(__linux__) { // Optimization: UTF-8 decoding invalid Unicode value in 3 byte sequence std::basic_string x{(utf8_t*)"\xED\xA0\x80 aaa"}; try { auto result{unicode::convert(x)}; BOOST_FAIL("Expected boost convert to fail"); } catch (const std::invalid_argument& ex) { BOOST_CHECK_EQUAL("Invalid Unicode character in 3 byte UTF-8 sequence"s, ex.what()); } catch (...) 
{ BOOST_ERROR("Unexpected error on convert"); } } { // Optimization: UTF-8 decoding invalid Unicode value in 4 byte sequence std::basic_string x{(utf8_t*)"\xF7\xBF\xBF\xBF aaa"}; try { auto result{unicode::convert(x)}; BOOST_FAIL("Expected boost convert to fail"); } catch (const std::invalid_argument& ex) { BOOST_CHECK_EQUAL("Invalid Unicode character in 4 byte UTF-8 sequence"s, ex.what()); } catch (...) { BOOST_ERROR("Unexpected error on convert"); } } { // Optimization: UTF-8 decoding invalid byte sequence std::basic_string x{(utf8_t*)"\xC0 aabbbb"}; try { auto result{unicode::convert(x)}; BOOST_FAIL("Expected boost convert to fail"); } catch (const std::invalid_argument& ex) { BOOST_CHECK_EQUAL("Invalid UTF-8 byte sequence"s, ex.what()); } catch (...) { BOOST_ERROR("Unexpected error on convert"); } } #endif }