// Boost.Test module "unicode_test": unit tests for unicode::convert,
// unicode::is_valid_utf and unicode::is_valid_unicode. Results are cross-checked
// against boost::locale::conv::utf_to_utf and against std_convert (not defined in
// this file; presumably provided by test-helper.h - confirm).
//
// NOTE(review): in this copy of the file the text inside angle brackets appears to
// be missing (the #include targets and every template parameter/argument list),
// and many statements share one physical line. The code is left untouched here;
// restore it from the original source before trying to compile.
//
// Test data, part 1: SUCCESS_TUPLE builds one sample as a {u8"...", u"...", U"..."}
// triple and success_sets collects those triples. The "optimization relevant"
// samples place a single U+20AC after 8 down to 0 leading ASCII digits (followed by
// 8 digits) and after 16 down to 8 leading digits (followed by 16 digits);
// presumably this targets block boundaries of an ASCII fast path - confirm against
// the implementation.
#define BOOST_TEST_MODULE unicode_test #include #include #include #include #include #include #if BOOST_VERSION > 106700 // CPU Timer in Debian 10 boost is broken, so leave it to std::chrono wall clock #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "test-helper.h" using namespace std::chrono_literals; using namespace std::string_literals; typedef std::tuple, std::basic_string, std::basic_string> types_collection_type; // create tuple of the same string, in UTF-8, UTF-16 and UTF-32 #define SUCCESS_TUPLE(x) {u8 ## x, u ## x, U ## x} // Success cases: convert string to all other types, respectively std::vector success_sets { SUCCESS_TUPLE(""), // various string SUCCESS_TUPLE("ASCII string1"), SUCCESS_TUPLE("Täst just looks like German"), SUCCESS_TUPLE("\u732b is chinese for cat"), SUCCESS_TUPLE("\U0001F63A"), SUCCESS_TUPLE("\U0001F63A is a smiling cat"), // separators SUCCESS_TUPLE("abc\r\ndef"), SUCCESS_TUPLE("äöü\0\u20ac"), SUCCESS_TUPLE("äöü0\u20ac"), // optimization relevant strings SUCCESS_TUPLE("01234567\u20ac01234567"), SUCCESS_TUPLE("0123456\u20ac01234567"), SUCCESS_TUPLE("012345\u20ac01234567"), SUCCESS_TUPLE("01234\u20ac01234567"), SUCCESS_TUPLE("0123\u20ac01234567"), SUCCESS_TUPLE("012\u20ac01234567"), SUCCESS_TUPLE("01\u20ac01234567"), SUCCESS_TUPLE("0\u20ac01234567"), SUCCESS_TUPLE("\u20ac01234567"), SUCCESS_TUPLE("0123456701234567\u20ac0123456701234567"), SUCCESS_TUPLE("012345670123456\u20ac0123456701234567"), SUCCESS_TUPLE("01234567012345\u20ac0123456701234567"), SUCCESS_TUPLE("0123456701234\u20ac0123456701234567"), SUCCESS_TUPLE("012345670123\u20ac0123456701234567"), SUCCESS_TUPLE("01234567012\u20ac0123456701234567"), SUCCESS_TUPLE("0123456701\u20ac0123456701234567"), SUCCESS_TUPLE("012345670\u20ac0123456701234567"), SUCCESS_TUPLE("01234567\u20ac0123456701234567"), }; // Error cases: throwing upon convert to all other types std::vector> 
// Test data, part 2: failure_strings_char8_t / _char16_t / _char32_t hold inputs that
// the tests below expect to be rejected.
// Environment checks that follow:
//  - string_u8string: the bytes C3 A4 copy unchanged between std::string and the
//    UTF-8 string type, and size_t is 4 or 8 bytes (its bit width is printed).
//  - utf_to_utf_failure_boost_u8_u8 / _u8_u16: boost::locale::conv::utf_to_utf with
//    conv::stop must throw for every entry of failure_strings_char8_t.
// NOTE(review): BOOST_FAIL sits inside the try block whose catch(...) is the
// "expected" path - verify that the catch-all does not also swallow the exception
// Boost.Test itself uses to abort the test case.
failure_strings_char8_t { // using u8"" here doesn't work on MSVC (utf8_t*)"\x80", // utf-8 continuation byte (utf8_t*)"\x81", // utf-8 continuation byte (utf8_t*)"\xc3\xc3\xa4", // initial byte of utf-8 "ä", followed by valid utf-8 "ä" (utf8_t*)"\xF8\x80\x80\x80\x80", // overlong encoding (utf8_t*)"\xF7\xBF\xBF\xBF", // valid encoding of invalid code point }; std::vector> failure_strings_char16_t { u"\xD801", // single high surrogate u"\xDFFF", // single low surrogate u"\xDFFF\xD801", // bad surrogate pair order }; std::vector> failure_strings_char32_t { U"\xD800 and more text", // invalid unicode (surrogate half) U"blabla \xD801", // invalid unicode (surrogate half) U"moreblabla \xDFFF", // invalid unicode (surrogate half) U"\x10000000", // invalid unicode (number too big) }; // check assumptions about environment BOOST_AUTO_TEST_CASE(string_u8string) { std::string a{"\xc3\xa4"}; std::basic_string b{a.begin(), a.end()}; BOOST_CHECK(b == std::basic_string{u8"ä"}); a = std::string{b.begin(), b.end()}; BOOST_CHECK(a == std::string{"\xc3\xa4"}); BOOST_CHECK(sizeof(size_t) == 4 || sizeof(size_t) == 8); std::cout << "Detected CPU Accu size: " << (sizeof(size_t) * 8) << std::endl; } // check environment: demonstrate how boost convert u8->u8 throws exception on invalid input BOOST_AUTO_TEST_CASE(utf_to_utf_failure_boost_u8_u8) { for (auto& s: failure_strings_char8_t) { try { auto result1{boost::locale::conv::utf_to_utf(s, boost::locale::conv::stop)}; BOOST_FAIL("Expected boost convert to fail"); } catch(...) { // expected } } } // check environment: demonstrate how boost convert u8->u16 throws exception on invalid input BOOST_AUTO_TEST_CASE(utf_to_utf_failure_boost_u8_u16) { for (auto& s: failure_strings_char8_t) { try { auto result{boost::locale::conv::utf_to_utf(s, boost::locale::conv::stop)}; BOOST_FAIL("Expected boost convert to fail"); } catch(...) 
// utf_to_utf_failure_std_u8_u8 / _u8_u16: the same invalid UTF-8 inputs are passed to
// std_convert. When the call returns without throwing, the _WIN32 build only prints
// a notice to std::cout, while other builds report it through BOOST_FAIL.
{ // expected } } } // check environment: demonstrate how std u8->u8 throws exception on invalid input BOOST_AUTO_TEST_CASE(utf_to_utf_failure_std_u8_u8) { for (auto& s: failure_strings_char8_t) { try { auto result{std_convert(s)}; #ifdef _WIN32 std::cout << "Conversion error from MSVC STDC++ for: "s + std::string{ s.begin(), s.end() } + ", result size: " + std::to_string(result.size()) << std::endl; std::cout << "Note: MSVC's implementation is known to be broken, ignoring." << std::endl; #else BOOST_FAIL(("Expected std_convert to fail for: "s + std::string{ s.begin(), s.end() } + ", result size: " + std::to_string(result.size())).c_str()); #endif } catch(...) { // expected } } } // check environment: demonstrate how std u8->u16 throws exception on invalid input BOOST_AUTO_TEST_CASE(utf_to_utf_failure_std_u8_u16) { for (auto& s: failure_strings_char8_t) { try { auto result{std_convert(s)}; #ifdef _WIN32 std::cout << "Conversion error from MSVC STDC++ for: "s + std::string{ s.begin(), s.end() } + ", result size: " + std::to_string(result.size()) << std::endl; std::cout << "Note: MSVC's implementation is known to be broken, ignoring." << std::endl; #else BOOST_FAIL(("Expected std_convert to fail for: "s + std::string{ s.begin(), s.end() } + ", result size: " + std::to_string(result.size())).c_str()); #endif } catch(...) 
// test_utf_to_utf: for one success tuple, calls unicode::convert in three forms (the
// failure messages label them "Base", "Container" and "Encoding"), checks each
// result against the tuple with BOOST_CHECK_MESSAGE, and compares the last result
// with boost::locale::conv::utf_to_utf. It then recurses through `if constexpr`
// over the index pair (i, j); judging by the messages, i is the source index and j
// the target index - confirm once the template arguments are restored.
// utf_to_utf_success runs the helper for every entry of success_sets.
// test_is_valid_utf begins on this line and continues on the next one.
{ // expected } } } template void test_utf_to_utf(std::tuple& t) { typedef typename std::tuple_element::type>::type From; typedef typename std::tuple_element::type>::type To; // test base type interface To result { unicode::convert(std::get(t)) }; BOOST_CHECK_MESSAGE(std::get(t) == result, "Base: From " << typeid(typename From::value_type).name() << "(" << i << ", " << std::get(t) << ") to " << typeid(typename To::value_type).name() << "(" << j << ", " << std::get(t) << "), got " << result); // test container interface result = unicode::convert(std::get(t)); BOOST_CHECK_MESSAGE(std::get(t) == result, "Container: From " << typeid(From).name() << "(" << i << ", " << std::get(t) << ") to " << typeid(To).name() << "(" << j << ", " << std::get(t) << "), got " << result); // test encoding interface result = unicode::convert, typename unicode::Encoding_t>(std::get(t)); BOOST_CHECK_MESSAGE(std::get(t) == result, "Encoding: From " << typeid(From).name() << "(" << i << ", " << std::get(t) << ") to " << typeid(To).name() << "(" << j << ", " << std::get(t) << "), got " << result); // test actual results by comparing with boost::locale::conv results BOOST_CHECK_EQUAL(result, (boost::locale::conv::utf_to_utf(std::get(t)))); // iterate over other combinations if constexpr (i + 1 < std::tuple_size::type>::value) test_utf_to_utf(t); else if constexpr (j + 1 < std::tuple_size::type>::value) test_utf_to_utf<0, j + 1>(t); } // We don't use BOOST_DATA_TEST_CASE here because boost::test tries to assign // a new variable to each tuple element which we don't want // https://lists.boost.org/boost-bugs/2016/05/45214.php BOOST_AUTO_TEST_CASE(utf_to_utf_success) { for (auto& t: success_sets) test_utf_to_utf(t); } template void test_is_valid_utf(std::tuple& t) { typedef typename std::tuple_element::type>::type T; // test via basic type bool result { unicode::is_valid_utf(std::get(t)) }; BOOST_CHECK_MESSAGE(result == true, "is_valid_utf w/ " << typeid(typename T::value_type).name() << "(" << i 
// test_is_valid_utf (continued): unicode::is_valid_utf must return true for the tuple
// element in each of the three call forms; the helper recurses over index i.
// is_valid_utf_success runs it for every entry of success_sets.
// test_utf_to_utf_failure: each of the three unicode::convert call forms must throw
// std::invalid_argument for the bad input s. A call that returns normally, or that
// throws another std::exception, is reported with BOOST_ERROR. The helper recurses
// over `index` to cover the remaining target types.
<< ", " << std::get(t) << "), got " << result); // test via container type result = unicode::is_valid_utf(std::get(t)); BOOST_CHECK_MESSAGE(result == true, "is_valid_utf w/ " << typeid(T).name() << "(" << i << ", " << std::get(t) << "), got " << result); // test via Encoding result = unicode::is_valid_utf>(std::get(t)); BOOST_CHECK_MESSAGE(result == true, "is_valid_utf w/ " << typeid(typename unicode::Encoding_t).name() << "(" << i << ", " << std::get(t) << "), got " << result); // iterate over other combinations if constexpr (i + 1 < std::tuple_size::type>::value) test_is_valid_utf(t); } BOOST_AUTO_TEST_CASE(is_valid_utf_success) { for (auto& t: success_sets) test_is_valid_utf(t); } // iterate over std::tuple T types template void test_utf_to_utf_failure(std::basic_string& s) { typedef typename std::tuple_element::type::value_type To; // via base type try { (void) unicode::convert(s); BOOST_ERROR("Base type: Expected exception at index: " << index << ", " << typeid(From).name() << " -> " << typeid(To).name()); } catch (const std::invalid_argument&) { // OK: this is an expected exception for convert() on bad input } catch (const std::exception& ex) { BOOST_ERROR("Unexpected error on convert(): " << ex.what()); }; // via container try { (void) unicode::convert::string_type, typename unicode::Encoding_t::string_type>(s); BOOST_ERROR("Container type: Expected exception at index: " << index << ", " << typeid(From).name() << " -> " << typeid(To).name()); } catch (const std::invalid_argument&) { // OK: this is an expected exception for convert() on bad input } catch (const std::exception& ex) { BOOST_ERROR("Unexpected error on convert(): " << ex.what()); }; // via encoding try { (void) unicode::convert,typename unicode::Encoding_t>(s); BOOST_ERROR("Encoding: Expected exception at index: " << index << ", " << typeid(From).name() << " -> " << typeid(To).name()); } catch (const std::invalid_argument&) { // OK: this is an expected exception for convert() on bad input } catch 
// utf_to_utf_failure: feeds all three failure_strings_* lists to
// test_utf_to_utf_failure.
// test_is_valid_utf_failure / is_valid_utf_failure: unicode::is_valid_utf must return
// false for the same bad inputs, again in three call forms.
// is_valid_unicode: single code point checks; the accepted values are on this line,
// the rejected ones on the next.
(const std::exception& ex) { BOOST_ERROR("Unexpected error on convert(): " << ex.what()); }; // iterate over remaining types if constexpr (index + 1 < std::tuple_size::value) test_utf_to_utf_failure(s); } BOOST_AUTO_TEST_CASE(utf_to_utf_failure) { for (auto& s: failure_strings_char8_t) test_utf_to_utf_failure::type::value_type, types_collection_type>(s); for (auto& s: failure_strings_char16_t) test_utf_to_utf_failure::type::value_type, types_collection_type>(s); for (auto& s: failure_strings_char32_t) test_utf_to_utf_failure::type::value_type, types_collection_type>(s); } // iterate over std::tuple T types template void test_is_valid_utf_failure(std::basic_string& s) { BOOST_CHECK_MESSAGE(unicode::is_valid_utf(s) == false, "Expected bad UTF at index: " << index << ", " << typeid(T).name()); BOOST_CHECK_MESSAGE(unicode::is_valid_utf>(s) == false, "Expected bad UTF at index: " << index << ", " << typeid(T).name()); BOOST_CHECK_MESSAGE(unicode::is_valid_utf>(s) == false, "Expected bad UTF at index: " << index << ", " << typeid(typename unicode::Encoding_t).name()); // iterate over remaining types if constexpr (index + 1 < std::tuple_size::value) test_is_valid_utf_failure(s); } BOOST_AUTO_TEST_CASE(is_valid_utf_failure) { for (auto& s: failure_strings_char8_t) test_is_valid_utf_failure::type::value_type, types_collection_type>(s); for (auto& s: failure_strings_char16_t) test_is_valid_utf_failure::type::value_type, types_collection_type>(s); for (auto& s: failure_strings_char32_t) test_is_valid_utf_failure::type::value_type, types_collection_type>(s); } BOOST_AUTO_TEST_CASE(is_valid_unicode) { BOOST_CHECK(unicode::is_valid_unicode('\0')); BOOST_CHECK(unicode::is_valid_unicode(U'a')); BOOST_CHECK(unicode::is_valid_unicode(U'ä')); BOOST_CHECK(unicode::is_valid_unicode(U'\u732b')); // cat chinese BOOST_CHECK(unicode::is_valid_unicode(U'\U0001F63A')); // cat smiley BOOST_CHECK(unicode::is_valid_unicode(0x0001F63A)); // cat smiley 
// is_valid_unicode (continued): 0x00110000, 0xFFFFFFFF, 0x01234567, 0x12345678 and the
// surrogate values 0xD800, 0xD987, 0xDFFF must all be rejected.
// convert_iso: conversions between 8-bit ISO encodings. Per the inline comments,
// E4 F6 FC is Latin-1 for the three umlauts, A4 stands for the euro sign, and the
// euro sign is not available in ISO-8859-1, hence the expected std::invalid_argument.
// Which encodings each call names is not visible here (template arguments missing).
// convert_iso_utf: conversions between ISO and UTF encodings; starts on this line.
BOOST_CHECK(!unicode::is_valid_unicode(0x00110000)); BOOST_CHECK(!unicode::is_valid_unicode(0xFFFFFFFF)); // U"\UFFFFFFFF" is invalid C++ BOOST_CHECK(!unicode::is_valid_unicode(0x01234567)); BOOST_CHECK(!unicode::is_valid_unicode(0x12345678)); BOOST_CHECK(!unicode::is_valid_unicode(0xD800)); BOOST_CHECK(!unicode::is_valid_unicode(0xD987)); BOOST_CHECK(!unicode::is_valid_unicode(0xDFFF)); } // Test ISO encodings BOOST_AUTO_TEST_CASE(convert_iso) { BOOST_CHECK((std::string{unicode::convert({})}) == std::string{}); BOOST_CHECK((std::string{unicode::convert("abc")}) == std::string{"abc"}); BOOST_CHECK((std::string{unicode::convert("\xe4\xf6\xfc")}) == std::string{"\xe4\xf6\xfc"}); // Latin-1 äöü BOOST_CHECK((std::string{unicode::convert("\xa4")}) == std::string{"\xa4"}); // € BOOST_CHECK((std::string{unicode::convert({})}) == std::string{}); BOOST_CHECK((std::string{unicode::convert("abc")}) == std::string{"abc"}); BOOST_CHECK((std::string{unicode::convert("\xe4\xf6\xfc")}) == std::string{"\xe4\xf6\xfc"}); // Latin-1 äöü BOOST_CHECK((std::string{unicode::convert("\xa4")}) == std::string{"\xa4"}); // € BOOST_CHECK_THROW(((void)std::string{unicode::convert("\xa4")}), std::invalid_argument); BOOST_CHECK_THROW(((void)std::string{unicode::convert("\xa4")}), std::invalid_argument); // € not available in ISO-8859-1 } // Test conversion between ISO and UTF encodings BOOST_AUTO_TEST_CASE(convert_iso_utf) { BOOST_CHECK_THROW(((void)std::string{unicode::convert(u8"\u20ac")}), std::invalid_argument); BOOST_CHECK_THROW(((void)std::string{unicode::convert(u"\u20ac")}), std::invalid_argument); BOOST_CHECK_THROW(((void)std::string{unicode::convert(U"\u20ac")}), std::invalid_argument); BOOST_CHECK((std::string{unicode::convert(u8"\u20ac")}) == std::string{"\xa4"}); // € BOOST_CHECK((std::string{unicode::convert(u"\u20ac")}) == std::string{"\xa4"}); // € BOOST_CHECK((std::string{unicode::convert(U"\u20ac")}) == std::string{"\xa4"}); // € 
// convert_iso_utf (continued): U+00A4 (currency sign) and U+732B cases; unrepresentable
// targets must raise std::invalid_argument.
// convert_utf: UTF-to-UTF conversions over string literals (including one with an
// embedded NUL) and over std::vector / std::deque containers. The first check
// expects std::invalid_argument for a UTF-32 buffer built from raw bytes; the bytes
// 00 D8 00 00 would read as 0xD800 on a little-endian host - confirm the intent.
BOOST_CHECK_THROW(((void)std::string{unicode::convert(u8"\u00A4")}), std::invalid_argument); // currency sign: Latin-1, but not Latin-15 BOOST_CHECK((std::string{unicode::convert(u8"\u00A4")}) == std::string{"\xa4"}); BOOST_CHECK_THROW(((void)std::string{unicode::convert(u8"\u732b")}), std::invalid_argument); BOOST_CHECK_THROW(((void)std::string{unicode::convert(u"\u732b")}), std::invalid_argument); BOOST_CHECK_THROW(((void)std::string{unicode::convert(U"\u732b")}), std::invalid_argument); } // Test UTF encodings BOOST_AUTO_TEST_CASE(convert_utf) { BOOST_CHECK_THROW((unicode::convert(std::u32string{(char32_t*)"\x00\xD8\x00\x00\x00\x00\x00\x00"})) , std::invalid_argument); BOOST_CHECK((unicode::convert(u8"abc")) == std::u16string{u"abc"}); BOOST_CHECK((unicode::convert(U"abc")) == std::u16string{u"abc"}); BOOST_CHECK((unicode::convert(u8"a\0bc")) == std::u16string{u"a\0bc"}); BOOST_CHECK((unicode::convert(u8"abc")) == std::u16string{u"abc"}); BOOST_CHECK((unicode::convert(U"abc")) == std::u16string{u"abc"}); BOOST_CHECK((unicode::convert("äöü")) == std::u32string{U"äöü"}); // vector BOOST_CHECK((unicode::convert, std::vector>(std::vector{})) == std::vector{}); BOOST_CHECK((unicode::convert, std::vector>(std::vector{'\xc3', '\xa4', '\xc3', '\xb6', '\xc3', '\xbc'})) == (std::vector{u'ä', u'ö', u'ü'})); // deque BOOST_CHECK((unicode::convert, std::deque>(std::deque{})) == std::deque{}); BOOST_CHECK((unicode::convert, std::deque>(std::deque{'\xc3', '\xa4', '\xc3', '\xb6', '\xc3', '\xbc'})) == (std::deque{L'ä', L'ö', L'ü'})); // yet unsupported: //BOOST_CHECK((unicode::convert(std::deque{u8'\xc3', u8'\xa4', u8'\xc3', u8'\xb6', u8'\xc3', u8'\xbc'})) == (std::deque{u'ä', u'ö', u'ü'})); //BOOST_CHECK((unicode::convert(std::deque{u8'\xc3', u8'\xa4', u8'\xc3', u8'\xb6', u8'\xc3', u8'\xbc'})) == (std::deque{u'ä', u'ö', u'ü'})); // deque with uint8_t, uint16_t BOOST_CHECK((unicode::convert, std::deque>(std::deque{})) == std::deque{}); BOOST_CHECK((unicode::convert, 
// convert_utf (continued): the same UTF-8 byte sequence C3 A4 C3 B6 C3 BC is converted
// through std::deque, std::list and std::array inputs, including a list-to-deque
// and an array-to-list conversion.
// convert_wstring: wchar_t tests. sizeof(wchar_t) must be 2 on _WIN32 and 4 elsewhere.
std::deque>(std::deque{0xc3, 0xa4, 0xc3, 0xb6, 0xc3, 0xbc})) == (std::deque{L'ä', L'ö', L'ü'})); // deque with int8_t, int16_t BOOST_CHECK((unicode::convert, std::deque>(std::deque{ static_cast(0xc3), static_cast(0xa4), static_cast(0xc3), static_cast(0xb6), static_cast(0xc3), static_cast(0xbc)})) == (std::deque{L'ä', L'ö', L'ü'})); // list BOOST_CHECK((unicode::convert, std::list>(std::list{})) == std::list{}); BOOST_CHECK((unicode::convert, std::list>(std::list{0xc3, 0xa4, 0xc3, 0xb6, 0xc3, 0xbc})) == (std::list{L'ä', L'ö', L'ü'})); // list -> deque BOOST_CHECK((unicode::convert, std::deque>(std::list{})) == std::deque{}); BOOST_CHECK((unicode::convert, std::deque>(std::list{0xc3, 0xa4, 0xc3, 0xb6, 0xc3, 0xbc})) == (std::deque{L'ä', L'ö', L'ü'})); // array BOOST_CHECK((unicode::convert, std::list>(std::array{})) == std::list{}); BOOST_CHECK((unicode::convert, std::list>(std::array{0xc3, 0xa4, 0xc3, 0xb6, 0xc3, 0xbc})) == (std::list{L'ä', L'ö', L'ü'})); } // wchar_t specific tests: system dependent BOOST_AUTO_TEST_CASE(convert_wstring) { #ifdef _WIN32 BOOST_CHECK(sizeof(wchar_t) == 2); #else // Unix like BOOST_CHECK(sizeof(wchar_t) == 4); #endif // For the following checks, wchar_t size and encoding is system dependent: // Windows: UTF-16 // Linux: UTF-32 BOOST_CHECK((unicode::convert("äöü")) == std::wstring{L"äöü"}); BOOST_CHECK((unicode::convert("\u732b")) == std::wstring{L"\u732b"}); BOOST_CHECK((unicode::convert("\U0001F63A")) == std::wstring{L"\U0001F63A"}); BOOST_CHECK((unicode::convert(L"\U0001F63A")) == std::u32string{U"\U0001F63A"}); BOOST_CHECK((unicode::convert(L"\U0001F63A")) == std::basic_string{(utf8_t*)"\U0001F63A"}); BOOST_CHECK((unicode::convert(std::string{"äöü"})) == std::wstring{L"äöü"}); BOOST_CHECK((unicode::convert, std::vector>(std::vector{})) == std::vector{}); BOOST_CHECK((unicode::convert, std::vector>(std::vector{'\xc3', '\xa4', '\xc3', '\xb6', '\xc3', '\xbc'})) == (std::vector{L'ä', L'ö', L'ü'})); std::u16string 
// convert_wstring (continued): U+1F63A is converted between std::wstring and the UTF-16 /
// UTF-32 string types. It must occupy 2 code units in UTF-16 and 1 in UTF-32; the
// wstring length must be 2 on _WIN32 and 1 elsewhere.
// is_valid_utf: smoke test of unicode::is_valid_utf on u"..." and u8"..." literals.
u16_value{u"\U0001F63A"}; std::u32string u32_value{U"\U0001F63A"}; std::wstring w_value{L"\U0001F63A"}; std::u16string result_u16_value{unicode::convert(w_value)}; std::u32string result_u32_value{unicode::convert(w_value)}; std::wstring result_w_value_1{unicode::convert(u16_value)}; std::wstring result_w_value_2{unicode::convert(u32_value)}; BOOST_CHECK_EQUAL(u16_value.size(), 2); BOOST_CHECK_EQUAL(u32_value.size(), 1); BOOST_CHECK_EQUAL(result_u16_value.size(), 2); BOOST_CHECK_EQUAL(result_u32_value.size(), 1); BOOST_CHECK_EQUAL(u16_value, result_u16_value); BOOST_CHECK_EQUAL(u32_value, result_u32_value); BOOST_CHECK(w_value == result_w_value_1); BOOST_CHECK(w_value == result_w_value_2); #ifdef _WIN32 BOOST_CHECK_EQUAL(w_value.size(), 2); BOOST_CHECK_EQUAL(result_w_value_1.size(), 2); BOOST_CHECK_EQUAL(result_w_value_2.size(), 2); #else // Unix like BOOST_CHECK_EQUAL(w_value.size(), 1); BOOST_CHECK_EQUAL(result_w_value_1.size(), 1); BOOST_CHECK_EQUAL(result_w_value_2.size(), 1); #endif } BOOST_AUTO_TEST_CASE(is_valid_utf) { BOOST_CHECK(unicode::is_valid_utf(u"äöü")); BOOST_CHECK(unicode::is_valid_utf(u8"äöü")); }