From 8126c217931363a05121f56f5790758ced9270d1 Mon Sep 17 00:00:00 2001 From: Roland Reichwein Date: Wed, 5 Jan 2022 20:47:07 +0100 Subject: Add tests --- src/test-unicode.cpp | 305 ++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 303 insertions(+), 2 deletions(-) diff --git a/src/test-unicode.cpp b/src/test-unicode.cpp index 1ea704b..aebc644 100644 --- a/src/test-unicode.cpp +++ b/src/test-unicode.cpp @@ -37,9 +37,25 @@ typedef std::tuple, std::basic_string, std:: // create tuple of the same string, in UTF-8, UTF-16 and UTF-32 #define SUCCESS_TUPLE(x) {u8 ## x, u ## x, U ## x} +// Interesting Unicode ranges for testing corner cases: +// 0x0 - smallest Unicode value +// 0x7F - highest 1-byte UTF-8 value +// 0x7FF - highest 2-byte UTF-8 value +// 0xD800 - smallest UTF-16 high surrogate (invalid range for unicode code points) +// 0xDBFF - highest UTF-16 high surrogate (invalid range for unicode code points) +// 0xDC00 - smallest UTF-16 low surrogate (invalid range for unicode code points) +// 0xDFFF - highest UTF-16 low surrogate (invalid range for unicode code points) +// 0xFFFF - highest 3-byte UTF-8 value, highest 2-byte UTF-16 value +// 0x10FFFF - highest Unicode value +// = highest value that can be encoded in UTF-16 +// 0x1FFFFF - highest value that can be encoded in UTF-8 (with 4-byte limit) +// 0xFFFFFFFF - highest value that can be encoded in UTF-32 + // Success cases: convert string to all other types, respectively std::vector success_sets { SUCCESS_TUPLE(""), + SUCCESS_TUPLE("\0"), + SUCCESS_TUPLE("0"), // various string SUCCESS_TUPLE("ASCII string1"), @@ -53,6 +69,43 @@ std::vector success_sets { SUCCESS_TUPLE("äöü\0\u20ac"), SUCCESS_TUPLE("äöü0\u20ac"), + // UTF-8 specific: 2 bytes encodings + SUCCESS_TUPLE("\u0080"), + SUCCESS_TUPLE("\u0101"), + SUCCESS_TUPLE("text1\u0101text2"), + SUCCESS_TUPLE("\u0101text2"), + SUCCESS_TUPLE("text1\u0101"), + SUCCESS_TUPLE("\u0701"), + SUCCESS_TUPLE("\u07FF"), + + // UTF-8 specific: 3 bytes encodings 
+ SUCCESS_TUPLE("\u0800"), + SUCCESS_TUPLE("context1\u0800context2"), + SUCCESS_TUPLE("\u0800context2"), + SUCCESS_TUPLE("context1\u0800"), + SUCCESS_TUPLE("context1\u0800\u0901"), + SUCCESS_TUPLE("context1\u0800\u0901context"), + SUCCESS_TUPLE("\u1234"), + SUCCESS_TUPLE("\u4321"), + SUCCESS_TUPLE("\uFFFF"), + + // UTF-8 specific: 4 bytes encodings + SUCCESS_TUPLE("\U00010000"), + SUCCESS_TUPLE("\U00043210"), + SUCCESS_TUPLE("context1\U00043210context2"), + SUCCESS_TUPLE("\U00043210context2"), + SUCCESS_TUPLE("context1\U00043210"), + SUCCESS_TUPLE("context1\U00043210\U00012345"), + SUCCESS_TUPLE("context1\U00043210\U00012345context2"), + SUCCESS_TUPLE("\U0010FFFF"), + + // UTF-8 specific: mixed encodings + SUCCESS_TUPLE("abc\u0123\u4321\U00010000\u1234\u0321xyz"), + + // UTF-16 specific: corner cases of surrogates + SUCCESS_TUPLE("\uD7FFcontext\uD7FF"), + SUCCESS_TUPLE("\uD7FFcontext\uE000"), + // optimization relevant strings SUCCESS_TUPLE("01234567\u20ac01234567"), SUCCESS_TUPLE("0123456\u20ac01234567"), @@ -78,16 +131,53 @@ std::vector success_sets { std::vector> failure_strings_char8_t { // using u8"" here doesn't work on MSVC (utf8_t*)"\x80", // utf-8 continuation byte - (utf8_t*)"\x81", // utf-8 continuation byte + (utf8_t*)"text1\x81text2", + (utf8_t*)"\x82text2", + (utf8_t*)"text1\x83", + (utf8_t*)"\xc3\xc3\xa4", // initial byte of utf-8 "ä", followed by valid utf-8 "ä" - (utf8_t*)"\xF8\x80\x80\x80\x80", // overlong encoding + (utf8_t*)"text1\xc3text2\xc3\xa4text3", + (utf8_t*)"\xc3text2\xc3\xa4text3", + (utf8_t*)"text1\xc3\xc3\xa4text3", + (utf8_t*)"text1\xc3text2\xc3\xa4", + + (utf8_t*)"\xF8\x80\x80\x80\x80", // overlong encoding of valid code point + (utf8_t*)"text1\xF8\x80\x80\x80\x80text2", + (utf8_t*)"\xF8\x80\x80\x80\x80text2", + (utf8_t*)"text1\xF8\x80\x80\x80\x80", + (utf8_t*)"\xF7\xBF\xBF\xBF", // valid encoding of invalid code point }; std::vector> failure_strings_char16_t { u"\xD801", // single high surrogate + u"text1\xD801text2", + 
u"\xD801text2", + u"text1\xD801", + + u"\xD800\xD800", // double high surrogate + u"\xD801\xD802", + u"\xDBFF\xDBFF", + u"\xDBFE\xDBFD", + u"\xDFFF", // single low surrogate + u"text1\xDFFFtext2", + u"\xDFFFtext2", + u"text1\xDFFF", + + u"\xDFFF\xDFFF", // double low surrogate + u"\xDC00\xDC00", + u"\xDC01\xDFFE", + u"\xDFFE\xDC01", + u"\xDFFF\xD801", // bad surrogate pair order + u"text1\xDFFF\xD801text2", + u"\xDFFF\xD801text2", + u"text1\xDFFF\xD801", + u"\xDC00\xDBFF", + u"\xDC00\xDBFE", + u"\xDC01\xDBFE", + u"\xDC01\xDBFF", }; std::vector> failure_strings_char32_t { @@ -95,6 +185,11 @@ std::vector> failure_strings_char32_t { U"blabla \xD801", // invalid unicode (surrogate half) U"moreblabla \xDFFF", // invalid unicode (surrogate half) U"\x10000000", // invalid unicode (number too big) + U"\x1111111", + U"\x110000", + U"\x110001\x110002\x110003", + U"\x7FFFFFFF", + U"\xFFFFFFF", }; // check assumptions about environment @@ -196,6 +291,14 @@ void test_utf_to_utf(std::tuple& t) result = unicode::convert, typename unicode::Encoding_t>(std::get(t)); BOOST_CHECK_MESSAGE(std::get(t) == result, "Encoding: From " << typeid(From).name() << "(" << i << ", " << std::get(t) << ") to " << typeid(To).name() << "(" << j << ", " << std::get(t) << "), got " << result); + // test container interface with std::vector instead of std::string + auto string_value{std::get(t)}; + auto string_reference{std::get(t)}; + std::vector vector_value{string_value.begin(), string_value.end()}; + std::vector vector_reference{string_reference.begin(), string_reference.end()}; + std::vector vector_result { unicode::convert, std::vector>(vector_value)}; + BOOST_CHECK_MESSAGE(vector_reference == vector_result, "Vector Container: From " << typeid(From).name() << "(" << i << ", " << std::get(t) << ") to " << typeid(To).name() << "(" << j << ", " << std::get(t) << ")"); + // test actual results by comparing with boost::locale::conv results BOOST_CHECK_EQUAL(result, 
(boost::locale::conv::utf_to_utf(std::get(t)))); @@ -232,6 +335,12 @@ void test_is_valid_utf(std::tuple& t) // test via Encoding result = unicode::is_valid_utf>(std::get(t)); BOOST_CHECK_MESSAGE(result == true, "is_valid_utf w/ " << typeid(typename unicode::Encoding_t).name() << "(" << i << ", " << std::get(t) << "), got " << result); + + // test via other container type + auto string_value{std::get(t)}; + std::vector vector_value{string_value.begin(), string_value.end()}; + result = unicode::is_valid_utf>(vector_value); + BOOST_CHECK_MESSAGE(result == true, "is_valid_utf w/ " << typeid(T).name() << "(" << i << ", " << std::get(t) << "), got " << result); // iterate over other combinations if constexpr (i + 1 < std::tuple_size::type>::value) @@ -280,6 +389,17 @@ void test_utf_to_utf_failure(std::basic_string& s) BOOST_ERROR("Unexpected error on convert(): " << ex.what()); }; + // via other container type + try { + std::vector vector_value{s.begin(), s.end()}; + (void) unicode::convert, std::vector>(vector_value); + BOOST_ERROR("Vector container type: Expected exception at index: " << index << ", " << typeid(From).name() << " -> " << typeid(To).name()); + } catch (const std::invalid_argument&) { + // OK: this is an expected exception for convert() on bad input + } catch (const std::exception& ex) { + BOOST_ERROR("Unexpected error on convert(): " << ex.what()); + }; + // iterate over remaining types if constexpr (index + 1 < std::tuple_size::value) test_utf_to_utf_failure(s); @@ -307,6 +427,9 @@ void test_is_valid_utf_failure(std::basic_string& s) BOOST_CHECK_MESSAGE(unicode::is_valid_utf>(s) == false, "Expected bad UTF at index: " << index << ", " << typeid(typename unicode::Encoding_t).name()); + std::vector vector_value{s.begin(), s.end()}; + BOOST_CHECK_MESSAGE(unicode::is_valid_utf>(vector_value) == false, "Expected bad UTF at index: " << index << ", " << typeid(T).name()); + // iterate over remaining types if constexpr (index + 1 < std::tuple_size::value) 
test_is_valid_utf_failure(s); @@ -393,6 +516,10 @@ BOOST_AUTO_TEST_CASE(convert_utf) BOOST_CHECK((unicode::convert("äöü")) == std::u32string{U"äöü"}); + BOOST_CHECK((unicode::convert(u"\xD800\xDC00")) == std::u32string{U"\U00010000"}); + BOOST_CHECK((unicode::convert(u"\xD800\xDC01")) == std::u32string{U"\U00010001"}); + BOOST_CHECK((unicode::convert(u"\xD810\xDC01")) == std::u32string{U"\U00014001"}); + // vector BOOST_CHECK((unicode::convert, std::vector>(std::vector{})) == std::vector{}); BOOST_CHECK((unicode::convert, std::vector>(std::vector{'\xc3', '\xa4', '\xc3', '\xb6', '\xc3', '\xbc'})) == (std::vector{u'ä', u'ö', u'ü'})); @@ -489,3 +616,177 @@ BOOST_AUTO_TEST_CASE(is_valid_utf) BOOST_CHECK(unicode::is_valid_utf(u8"äöü")); } +BOOST_AUTO_TEST_CASE(exceptions) +{ + { // UTF-8: Incomplete string + std::vector x{(utf8_t)'\xC0'}; + try { + auto result{unicode::convert,std::vector>(x)}; + BOOST_FAIL("Expected boost convert to fail"); + } catch (const std::invalid_argument& ex) { + BOOST_CHECK_EQUAL("Bad input: Not enough bytes left for decoding UTF-8 sequence"s, ex.what()); + } catch (...) { + BOOST_ERROR("Unexpected error on convert"); + } + } + + { // UTF-8: Encoded value too high + std::vector x{(utf8_t)'\xF7', (utf8_t)'\xBF', (utf8_t)'\xBF', (utf8_t)'\xBF'}; + try { + auto result{unicode::convert,std::vector>(x)}; + BOOST_FAIL("Expected boost convert to fail"); + } catch (const std::invalid_argument& ex) { + BOOST_CHECK_EQUAL("Invalid Unicode character: 2097151"s, ex.what()); + } catch (...) { + BOOST_ERROR("Unexpected error on convert"); + } + } + + { // UTF-8: Overlong encoding + std::vector x{(utf8_t)'\xF8', (utf8_t)'\x80', (utf8_t)'\x80', (utf8_t)'\x80', (utf8_t)'\x80'}; + try { + auto result{unicode::convert,std::vector>(x)}; + BOOST_FAIL("Expected boost convert to fail"); + } catch (const std::invalid_argument& ex) { + BOOST_CHECK_EQUAL("Bad UTF-8 input: Invalid 4 byte sequence"s, ex.what()); + } catch (...) 
{ + BOOST_ERROR("Unexpected error on convert"); + } + } + + { // UTF-16: Incomplete surrogate encoding + std::vector x{(char16_t)u'\xD800'}; + try { + auto result{unicode::convert,std::vector>(x)}; + BOOST_FAIL("Expected boost convert to fail"); + } catch (const std::invalid_argument& ex) { + BOOST_CHECK_EQUAL("Bad input: Continuation of first UTF-16 unit missing"s, ex.what()); + } catch (...) { + BOOST_ERROR("Unexpected error on convert"); + } + } + + { // UTF-16: Invalid surrogates encoding + std::vector x{(char16_t)u'\xD800', (char16_t)u'\xD800'}; + try { + auto result{unicode::convert,std::vector>(x)}; + BOOST_FAIL("Expected boost convert to fail"); + } catch (const std::invalid_argument& ex) { + BOOST_CHECK_EQUAL("Bad input: 2 malformed UTF-16 surrogates"s, ex.what()); + } catch (...) { + BOOST_ERROR("Unexpected error on convert"); + } + } + + { // UTF-32: Invalid value + std::vector x{(char32_t)U'\xFFFFFFFF'}; + try { + auto result{unicode::convert,std::vector>(x)}; + BOOST_FAIL("Expected boost convert to fail"); + } catch (const std::invalid_argument& ex) { + BOOST_CHECK_EQUAL("Invalid Unicode character: 4294967295"s, ex.what()); + } catch (...) { + BOOST_ERROR("Unexpected error on convert"); + } + } + + { // ISO: Invalid value (from Unicode) + std::u32string x{U"\U00000123"}; + try { + auto result{unicode::convert(x)}; + BOOST_FAIL("Expected boost convert to fail"); + } catch (const std::invalid_argument& ex) { + BOOST_CHECK_EQUAL("Bad ISO 8859 value above 255: 291"s, ex.what()); + } catch (...) { + BOOST_ERROR("Unexpected error on convert"); + } + } + + { // ISO: Invalid 8-bit value that can't be mapped (from Unicode) + std::u32string x{U"\U000000BC"}; + try { + auto result{unicode::convert(x)}; + BOOST_FAIL("Expected boost convert to fail"); + } catch (const std::invalid_argument& ex) { + BOOST_CHECK_EQUAL("Bad Unicode value to map to ISO 8859-15: 188"s, ex.what()); + } catch (...) 
{ + BOOST_ERROR("Unexpected error on convert"); + } + } + + { // ISO: Invalid 8-bit value that can't be mapped between ISO-8859-1 and ISO-8859-15 + std::string x{"\xBC"}; + try { + auto result{unicode::convert(x)}; + BOOST_FAIL("Expected boost convert to fail"); + } catch (const std::invalid_argument& ex) { + BOOST_CHECK_EQUAL("Bad Unicode value to map to ISO 8859-15: 188"s, ex.what()); + } catch (...) { + BOOST_ERROR("Unexpected error on convert"); + } + } + + { // ISO: Invalid 8-bit value that can't be mapped between ISO-8859-1 and ISO-8859-15 + std::string x{"\xBC"}; + try { + auto result{unicode::convert(x)}; + BOOST_FAIL("Expected boost convert to fail"); + } catch (const std::invalid_argument& ex) { + BOOST_CHECK_EQUAL("Bad ISO 8859 value above 255: 338"s, ex.what()); + } catch (...) { + BOOST_ERROR("Unexpected error on convert"); + } + } + + { // Conversion from UTF-x to UTF-x: Fast mode just validating (resulting in error) + std::u16string x{u"\xD800"}; + try { + auto result{unicode::convert(x)}; + BOOST_FAIL("Expected boost convert to fail"); + } catch (const std::invalid_argument& ex) { + BOOST_CHECK_EQUAL("Invalid UTF input"s, ex.what()); + } catch (...) { + BOOST_ERROR("Unexpected error on convert"); + } + } + +#if defined(_WIN32) || defined(__linux__) + { // Optimization: UTF-8 decoding invalid Unicode value in 3 byte sequence + std::basic_string x{(utf8_t*)"\xED\xA0\x80 aaa"}; + try { + auto result{unicode::convert(x)}; + BOOST_FAIL("Expected boost convert to fail"); + } catch (const std::invalid_argument& ex) { + BOOST_CHECK_EQUAL("Invalid Unicode character in 3 byte UTF-8 sequence"s, ex.what()); + } catch (...) 
{ + BOOST_ERROR("Unexpected error on convert"); + } + } + + { // Optimization: UTF-8 decoding invalid Unicode value in 4 byte sequence + std::basic_string x{(utf8_t*)"\xF7\xBF\xBF\xBF aaa"}; + try { + auto result{unicode::convert(x)}; + BOOST_FAIL("Expected boost convert to fail"); + } catch (const std::invalid_argument& ex) { + BOOST_CHECK_EQUAL("Invalid Unicode character in 4 byte UTF-8 sequence"s, ex.what()); + } catch (...) { + BOOST_ERROR("Unexpected error on convert"); + } + } + + { // Optimization: UTF-8 decoding invalid byte sequence + std::basic_string x{(utf8_t*)"\xC0 aabbbb"}; + try { + auto result{unicode::convert(x)}; + BOOST_FAIL("Expected boost convert to fail"); + } catch (const std::invalid_argument& ex) { + BOOST_CHECK_EQUAL("Invalid UTF-8 byte sequence"s, ex.what()); + } catch (...) { + BOOST_ERROR("Unexpected error on convert"); + } + } +#endif + +} + -- cgit v1.2.3