From 8126c217931363a05121f56f5790758ced9270d1 Mon Sep 17 00:00:00 2001
From: Roland Reichwein <mail@reichwein.it>
Date: Wed, 5 Jan 2022 20:47:07 +0100
Subject: Add tests

---
 src/test-unicode.cpp | 305 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 303 insertions(+), 2 deletions(-)

diff --git a/src/test-unicode.cpp b/src/test-unicode.cpp
index 1ea704b..aebc644 100644
--- a/src/test-unicode.cpp
+++ b/src/test-unicode.cpp
@@ -37,9 +37,25 @@ typedef std::tuple<std::basic_string<utf8_t>, std::basic_string<char16_t>, std::
 // create tuple of the same string, in UTF-8, UTF-16 and UTF-32
 #define SUCCESS_TUPLE(x) {u8 ## x, u ## x, U ## x}
 
+// Interesting Unicode ranges for testing corner cases:
+// 0x0        - smallest Unicode value
+// 0x7F       - highest 1-byte UTF-8 value
+// 0x7FF      - highest 2-byte UTF-8 value
+// 0xD800     - smallest UTF-16 high surrogate (invalid range for unicode code points)
+// 0xDBFF     - highest UTF-16 high surrogate (invalid range for unicode code points)
+// 0xDC00     - smallest UTF-16 low surrogate (invalid range for unicode code points)
+// 0xDFFF     - highest UTF-16 low surrogate (invalid range for unicode code points)
+// 0xFFFF     - highest 3-byte UTF-8 value, highest 2-byte UTF-16 value
+// 0x10FFFF   - highest Unicode value
+//            = highest value that can be encoded in UTF-16
+// 0x1FFFFF   - highest value that can be encoded in UTF-8 (with 4-byte limit)
+// 0xFFFFFFFF - highest value that can be encoded in UTF-32
+
 // Success cases: convert string to all other types, respectively
 std::vector<types_collection_type> success_sets {
  SUCCESS_TUPLE(""),
+ SUCCESS_TUPLE("\0"),
+ SUCCESS_TUPLE("0"),
  
  // various string
  SUCCESS_TUPLE("ASCII string1"),
@@ -53,6 +69,43 @@ std::vector<types_collection_type> success_sets {
  SUCCESS_TUPLE("äöü\0\u20ac"),
  SUCCESS_TUPLE("äöü0\u20ac"),
 
+ // UTF-8 specific: 2 bytes encodings
+ SUCCESS_TUPLE("\u0080"),
+ SUCCESS_TUPLE("\u0101"),
+ SUCCESS_TUPLE("text1\u0101text2"),
+ SUCCESS_TUPLE("\u0101text2"),
+ SUCCESS_TUPLE("text1\u0101"),
+ SUCCESS_TUPLE("\u0701"),
+ SUCCESS_TUPLE("\u07FF"),
+ 
+ // UTF-8 specific: 3 bytes encodings
+ SUCCESS_TUPLE("\u0800"),
+ SUCCESS_TUPLE("context1\u0800context2"),
+ SUCCESS_TUPLE("\u0800context2"),
+ SUCCESS_TUPLE("context1\u0800"),
+ SUCCESS_TUPLE("context1\u0800\u0901"),
+ SUCCESS_TUPLE("context1\u0800\u0901context"),
+ SUCCESS_TUPLE("\u1234"),
+ SUCCESS_TUPLE("\u4321"),
+ SUCCESS_TUPLE("\uFFFF"),
+
+ // UTF-8 specific: 4 bytes encodings
+ SUCCESS_TUPLE("\U00010000"),
+ SUCCESS_TUPLE("\U00043210"),
+ SUCCESS_TUPLE("context1\U00043210context2"),
+ SUCCESS_TUPLE("\U00043210context2"),
+ SUCCESS_TUPLE("context1\U00043210"),
+ SUCCESS_TUPLE("context1\U00043210\U00012345"),
+ SUCCESS_TUPLE("context1\U00043210\U00012345context2"),
+ SUCCESS_TUPLE("\U0010FFFF"),
+ 
+ // UTF-8 specific: mixed encodings
+ SUCCESS_TUPLE("abc\u0123\u4321\U00010000\u1234\u0321xyz"),
+ 
+ // UTF-16 specific: corner cases of surrogates
+ SUCCESS_TUPLE("\uD7FFcontext\uD7FF"),
+ SUCCESS_TUPLE("\uD7FFcontext\uE000"),
+ 
  // optimization relevant strings
  SUCCESS_TUPLE("01234567\u20ac01234567"),
  SUCCESS_TUPLE("0123456\u20ac01234567"),
@@ -78,16 +131,53 @@ std::vector<types_collection_type> success_sets {
 std::vector<std::basic_string<utf8_t>> failure_strings_char8_t {
  // using u8"" here doesn't work on MSVC
  (utf8_t*)"\x80", // utf-8 continuation byte
- (utf8_t*)"\x81", // utf-8 continuation byte
+ (utf8_t*)"text1\x81text2",
+ (utf8_t*)"\x82text2",
+ (utf8_t*)"text1\x83",
+
  (utf8_t*)"\xc3\xc3\xa4", // initial byte of utf-8 "ä", followed by valid utf-8 "ä"
- (utf8_t*)"\xF8\x80\x80\x80\x80", // overlong encoding
+ (utf8_t*)"text1\xc3text2\xc3\xa4text3",
+ (utf8_t*)"\xc3text2\xc3\xa4text3",
+ (utf8_t*)"text1\xc3\xc3\xa4text3",
+ (utf8_t*)"text1\xc3text2\xc3\xa4",
+
+ (utf8_t*)"\xF8\x80\x80\x80\x80", // overlong encoding of valid code point
+ (utf8_t*)"text1\xF8\x80\x80\x80\x80text2",
+ (utf8_t*)"\xF8\x80\x80\x80\x80text2",
+ (utf8_t*)"text1\xF8\x80\x80\x80\x80",
+
  (utf8_t*)"\xF7\xBF\xBF\xBF", // valid encoding of invalid code point
 };
 
 std::vector<std::basic_string<char16_t>> failure_strings_char16_t {
  u"\xD801", // single high surrogate
+ u"text1\xD801text2",
+ u"\xD801text2",
+ u"text1\xD801",
+ 
+ u"\xD800\xD800", // double high surrogate
+ u"\xD801\xD802",
+ u"\xDBFF\xDBFF",
+ u"\xDBFE\xDBFD",
+
  u"\xDFFF", // single low surrogate
+ u"text1\xDFFFtext2",
+ u"\xDFFFtext2",
+ u"text1\xDFFF",
+ 
+ u"\xDFFF\xDFFF", // double low surrogate
+ u"\xDC00\xDC00",
+ u"\xDC01\xDFFE",
+ u"\xDFFE\xDC01",
+
  u"\xDFFF\xD801", // bad surrogate pair order
+ u"text1\xDFFF\xD801text2",
+ u"\xDFFF\xD801text2",
+ u"text1\xDFFF\xD801",
+ u"\xDC00\xDBFF",
+ u"\xDC00\xDBFE",
+ u"\xDC01\xDBFE",
+ u"\xDC01\xDBFF",
 };
 
 std::vector<std::basic_string<char32_t>> failure_strings_char32_t {
@@ -95,6 +185,11 @@ std::vector<std::basic_string<char32_t>> failure_strings_char32_t {
  U"blabla \xD801", // invalid unicode (surrogate half)
  U"moreblabla \xDFFF", // invalid unicode (surrogate half)
  U"\x10000000", // invalid unicode (number too big)
+ U"\x1111111",
+ U"\x110000",
+ U"\x110001\x110002\x110003",
+ U"\x7FFFFFFF",
+ U"\xFFFFFFF",
 };
 
 // check assumptions about environment
@@ -196,6 +291,14 @@ void test_utf_to_utf(std::tuple<Ts...>& t)
  result = unicode::convert<typename unicode::Encoding_t<typename From::value_type>, typename unicode::Encoding_t<typename To::value_type>>(std::get<i>(t));
  BOOST_CHECK_MESSAGE(std::get<j>(t) == result, "Encoding: From " << typeid(From).name() << "(" << i << ", " << std::get<i>(t) << ") to " << typeid(To).name() << "(" << j << ", " << std::get<j>(t) << "), got " << result);
 
+ // test container interface with std::vector instead of std::string
+ auto string_value{std::get<i>(t)};
+ auto string_reference{std::get<j>(t)};
+ std::vector<typename From::value_type> vector_value{string_value.begin(), string_value.end()};
+ std::vector<typename To::value_type> vector_reference{string_reference.begin(), string_reference.end()};
+ std::vector<typename To::value_type> vector_result { unicode::convert<std::vector<typename From::value_type>, std::vector<typename To::value_type>>(vector_value)};
+ BOOST_CHECK_MESSAGE(vector_reference == vector_result, "Vector Container: From " << typeid(From).name() << "(" << i << ", " << std::get<i>(t) << ") to " << typeid(To).name() << "(" << j << ", " << std::get<j>(t) << ")");
+
  // test actual results by comparing with boost::locale::conv results
  BOOST_CHECK_EQUAL(result, (boost::locale::conv::utf_to_utf<typename To::value_type, typename From::value_type>(std::get<i>(t))));
  
@@ -232,6 +335,12 @@ void test_is_valid_utf(std::tuple<Ts...>& t)
  // test via Encoding
  result = unicode::is_valid_utf<typename unicode::Encoding_t<typename T::value_type>>(std::get<i>(t));
  BOOST_CHECK_MESSAGE(result == true, "is_valid_utf w/ " << typeid(typename unicode::Encoding_t<typename T::value_type>).name() << "(" << i << ", " << std::get<i>(t) << "), got " << result);
+ 
+ // test via other container type
+ auto string_value{std::get<i>(t)};
+ std::vector<typename T::value_type> vector_value{string_value.begin(), string_value.end()};
+ result = unicode::is_valid_utf<std::vector<typename T::value_type>>(vector_value);
+ BOOST_CHECK_MESSAGE(result == true, "is_valid_utf w/ " << typeid(T).name() << "(" << i << ", " << std::get<i>(t) << "), got " << result);
 
  // iterate over other combinations
  if constexpr (i + 1 < std::tuple_size<typename std::remove_reference<decltype(t)>::type>::value)
@@ -280,6 +389,17 @@ void test_utf_to_utf_failure(std::basic_string<From>& s)
   BOOST_ERROR("Unexpected error on convert(): " << ex.what());
  };
 
+ // via other container type
+ try {
+  std::vector<From> vector_value{s.begin(), s.end()};
+  (void) unicode::convert<std::vector<From>, std::vector<To>>(vector_value);
+  BOOST_ERROR("Vector container type: Expected exception at index: " << index << ", " << typeid(From).name() << " -> " << typeid(To).name());
+ } catch (const std::invalid_argument&) {
+  // OK: this is an expected exception for convert() on bad input
+ } catch (const std::exception& ex) {
+  BOOST_ERROR("Unexpected error on convert(): " << ex.what());
+ };
+
  // iterate over remaining types 
  if constexpr (index + 1 < std::tuple_size<Collection>::value)
   test_utf_to_utf_failure<From, Collection, index + 1>(s);
@@ -307,6 +427,9 @@ void test_is_valid_utf_failure(std::basic_string<T>& s)
  
  BOOST_CHECK_MESSAGE(unicode::is_valid_utf<typename unicode::Encoding_t<T>>(s) == false, "Expected bad UTF at index: " << index << ", " << typeid(typename unicode::Encoding_t<T>).name());
 
+ std::vector<T> vector_value{s.begin(), s.end()};
+ BOOST_CHECK_MESSAGE(unicode::is_valid_utf<typename std::vector<T>>(vector_value) == false, "Expected bad UTF at index: " << index << ", " << typeid(T).name());
+
  // iterate over remaining types 
  if constexpr (index + 1 < std::tuple_size<Collection>::value)
   test_is_valid_utf_failure<T, Collection, index + 1>(s);
@@ -393,6 +516,10 @@ BOOST_AUTO_TEST_CASE(convert_utf)
 
  BOOST_CHECK((unicode::convert<char, char32_t>("äöü")) == std::u32string{U"äöü"});
 
+ BOOST_CHECK((unicode::convert<unicode::UTF_16,unicode::UTF_32>(u"\xD800\xDC00")) == std::u32string{U"\U00010000"});
+ BOOST_CHECK((unicode::convert<unicode::UTF_16,unicode::UTF_32>(u"\xD800\xDC01")) == std::u32string{U"\U00010001"});
+ BOOST_CHECK((unicode::convert<unicode::UTF_16,unicode::UTF_32>(u"\xD810\xDC01")) == std::u32string{U"\U00014001"});
+
  // vector
  BOOST_CHECK((unicode::convert<std::vector<char>, std::vector<char16_t>>(std::vector<char>{})) == std::vector<char16_t>{});
  BOOST_CHECK((unicode::convert<std::vector<char>, std::vector<char16_t>>(std::vector<char>{'\xc3', '\xa4', '\xc3', '\xb6', '\xc3', '\xbc'})) == (std::vector<char16_t>{u'ä', u'ö', u'ü'}));
@@ -489,3 +616,177 @@ BOOST_AUTO_TEST_CASE(is_valid_utf)
  BOOST_CHECK(unicode::is_valid_utf<unicode::UTF_8>(u8"äöü"));
 }
 
+BOOST_AUTO_TEST_CASE(exceptions)
+{
+ { // UTF-8: Incomplete string
+  std::vector<utf8_t> x{(utf8_t)'\xC0'};
+  try {
+   auto result{unicode::convert<std::vector<utf8_t>,std::vector<char16_t>>(x)};
+   BOOST_FAIL("Expected boost convert to fail");
+  } catch (const std::invalid_argument& ex) {
+   BOOST_CHECK_EQUAL("Bad input: Not enough bytes left for decoding UTF-8 sequence"s, ex.what());
+  } catch (...) {
+   BOOST_ERROR("Unexpected error on convert");
+  }
+ }
+
+ { // UTF-8: Encoded value too high 
+  std::vector<utf8_t> x{(utf8_t)'\xF7', (utf8_t)'\xBF', (utf8_t)'\xBF', (utf8_t)'\xBF'};
+  try {
+   auto result{unicode::convert<std::vector<utf8_t>,std::vector<char16_t>>(x)};
+   BOOST_FAIL("Expected boost convert to fail");
+  } catch (const std::invalid_argument& ex) {
+   BOOST_CHECK_EQUAL("Invalid Unicode character: 2097151"s, ex.what());
+  } catch (...) {
+   BOOST_ERROR("Unexpected error on convert");
+  }
+ }
+ 
+ { // UTF-8: Overlong encoding
+  std::vector<utf8_t> x{(utf8_t)'\xF8', (utf8_t)'\x80', (utf8_t)'\x80', (utf8_t)'\x80', (utf8_t)'\x80'};
+  try {
+   auto result{unicode::convert<std::vector<utf8_t>,std::vector<char16_t>>(x)};
+   BOOST_FAIL("Expected boost convert to fail");
+  } catch (const std::invalid_argument& ex) {
+   BOOST_CHECK_EQUAL("Bad UTF-8 input: Invalid 4 byte sequence"s, ex.what());
+  } catch (...) {
+   BOOST_ERROR("Unexpected error on convert");
+  }
+ }
+ 
+ { // UTF-16: Incomplete surrogate encoding (single high surrogate without its continuation)
+  std::vector<char16_t> x{(char16_t)u'\xD800'};
+  try {
+   auto result{unicode::convert<std::vector<char16_t>,std::vector<char32_t>>(x)};
+   BOOST_FAIL("Expected boost convert to fail");
+  } catch (const std::invalid_argument& ex) {
+   BOOST_CHECK_EQUAL("Bad input: Continuation of first UTF-16 unit missing"s, ex.what());
+  } catch (...) {
+   BOOST_ERROR("Unexpected error on convert");
+  }
+ }
+ 
+ { // UTF-16: Invalid surrogates encoding
+  std::vector<char16_t> x{(char16_t)u'\xD800', (char16_t)u'\xD800'};
+  try {
+   auto result{unicode::convert<std::vector<char16_t>,std::vector<char32_t>>(x)};
+   BOOST_FAIL("Expected boost convert to fail");
+  } catch (const std::invalid_argument& ex) {
+   BOOST_CHECK_EQUAL("Bad input: 2 malformed UTF-16 surrogates"s, ex.what());
+  } catch (...) {
+   BOOST_ERROR("Unexpected error on convert");
+  }
+ }
+
+ { // UTF-32: Invalid value
+  std::vector<char32_t> x{(char32_t)U'\xFFFFFFFF'};
+  try {
+   auto result{unicode::convert<std::vector<char32_t>,std::vector<char8_t>>(x)};
+   BOOST_FAIL("Expected boost convert to fail");
+  } catch (const std::invalid_argument& ex) {
+   BOOST_CHECK_EQUAL("Invalid Unicode character: 4294967295"s, ex.what());
+  } catch (...) {
+   BOOST_ERROR("Unexpected error on convert");
+  }
+ }
+
+ { // ISO: Invalid value (from Unicode)
+  std::u32string x{U"\U00000123"};
+  try {
+   auto result{unicode::convert<unicode::UTF_32,unicode::ISO_8859_1>(x)};
+   BOOST_FAIL("Expected boost convert to fail");
+  } catch (const std::invalid_argument& ex) {
+   BOOST_CHECK_EQUAL("Bad ISO 8859 value above 255: 291"s, ex.what());
+  } catch (...) {
+   BOOST_ERROR("Unexpected error on convert");
+  }
+ }
+
+ { // ISO: Invalid 8-bit value that can't be mapped (from Unicode)
+  std::u32string x{U"\U000000BC"};
+  try {
+   auto result{unicode::convert<unicode::UTF_32,unicode::ISO_8859_15>(x)};
+   BOOST_FAIL("Expected boost convert to fail");
+  } catch (const std::invalid_argument& ex) {
+   BOOST_CHECK_EQUAL("Bad Unicode value to map to ISO 8859-15: 188"s, ex.what());
+  } catch (...) {
+   BOOST_ERROR("Unexpected error on convert");
+  }
+ }
+
+ { // ISO: Invalid 8-bit value that can't be mapped between ISO-8859-1 and ISO-8859-15
+  std::string x{"\xBC"};
+  try {
+   auto result{unicode::convert<unicode::ISO_8859_1,unicode::ISO_8859_15>(x)};
+   BOOST_FAIL("Expected boost convert to fail");
+  } catch (const std::invalid_argument& ex) {
+   BOOST_CHECK_EQUAL("Bad Unicode value to map to ISO 8859-15: 188"s, ex.what());
+  } catch (...) {
+   BOOST_ERROR("Unexpected error on convert");
+  }
+ }
+
+ { // ISO: 8-bit ISO-8859-15 value that can't be converted to ISO-8859-1 (expected error reports value 338, above 255)
+  std::string x{"\xBC"};
+  try {
+   auto result{unicode::convert<unicode::ISO_8859_15,unicode::ISO_8859_1>(x)};
+   BOOST_FAIL("Expected boost convert to fail");
+  } catch (const std::invalid_argument& ex) {
+   BOOST_CHECK_EQUAL("Bad ISO 8859 value above 255: 338"s, ex.what());
+  } catch (...) {
+   BOOST_ERROR("Unexpected error on convert");
+  }
+ }
+
+ { // Conversion from UTF-x to UTF-x: Fast mode just validating (resulting in error)
+  std::u16string x{u"\xD800"};
+  try {
+   auto result{unicode::convert<unicode::UTF_16,unicode::UTF_16>(x)};
+   BOOST_FAIL("Expected boost convert to fail");
+  } catch (const std::invalid_argument& ex) {
+   BOOST_CHECK_EQUAL("Invalid UTF input"s, ex.what());
+  } catch (...) {
+   BOOST_ERROR("Unexpected error on convert");
+  }
+ }
+
+#if defined(_WIN32) || defined(__linux__)
+ { // Optimization: UTF-8 decoding invalid Unicode value in 3 byte sequence
+  std::basic_string<utf8_t> x{(utf8_t*)"\xED\xA0\x80  aaa"};
+  try {
+   auto result{unicode::convert<unicode::UTF_8,unicode::UTF_16>(x)};
+   BOOST_FAIL("Expected boost convert to fail");
+  } catch (const std::invalid_argument& ex) {
+   BOOST_CHECK_EQUAL("Invalid Unicode character in 3 byte UTF-8 sequence"s, ex.what());
+  } catch (...) {
+   BOOST_ERROR("Unexpected error on convert");
+  }
+ }
+
+ { // Optimization: UTF-8 decoding invalid Unicode value in 4 byte sequence
+  std::basic_string<utf8_t> x{(utf8_t*)"\xF7\xBF\xBF\xBF aaa"};
+  try {
+   auto result{unicode::convert<unicode::UTF_8,unicode::UTF_16>(x)};
+   BOOST_FAIL("Expected boost convert to fail");
+  } catch (const std::invalid_argument& ex) {
+   BOOST_CHECK_EQUAL("Invalid Unicode character in 4 byte UTF-8 sequence"s, ex.what());
+  } catch (...) {
+   BOOST_ERROR("Unexpected error on convert");
+  }
+ }
+
+ { // Optimization: UTF-8 decoding invalid byte sequence
+  std::basic_string<utf8_t> x{(utf8_t*)"\xC0 aabbbb"};
+  try {
+   auto result{unicode::convert<unicode::UTF_8,unicode::UTF_16>(x)};
+   BOOST_FAIL("Expected boost convert to fail");
+  } catch (const std::invalid_argument& ex) {
+   BOOST_CHECK_EQUAL("Invalid UTF-8 byte sequence"s, ex.what());
+  } catch (...) {
+   BOOST_ERROR("Unexpected error on convert");
+  }
+ }
+#endif
+
+}
+
-- 
cgit v1.2.3