From ae0ccdf4569d6d4f49c60392a9e849aaa58c3fa6 Mon Sep 17 00:00:00 2001
From: Roland Reichwein <mail@reichwein.it>
Date: Thu, 28 Jan 2021 21:18:39 +0100
Subject: Bugfix, test

---
 include/unicode.h    |  5 +++
 src/test-unicode.cpp | 87 +++++++++++++++++++++++++++++++++++++++++++---------
 2 files changed, 78 insertions(+), 14 deletions(-)
diff --git a/include/unicode.h b/include/unicode.h
index f539e6b..908c75f 100644
--- a/include/unicode.h
+++ b/include/unicode.h
@@ -141,6 +141,11 @@ namespace {
       throw std::invalid_argument("Bad input: Invalid 2 byte sequence");
     } else
      throw std::invalid_argument("Bad input: 2nd byte expected, none found");
+  
+    // check only for sequences >= 2 bytes (ASCII is always compliant)
+    if (!unicode::is_valid_unicode(value))
+     throw std::invalid_argument("Invalid Unicode character: "s + std::to_string(static_cast<uint32_t>(value)));
+
    } else { // 1 byte: 7 bit ASCII
     value = byte0;
     sequence_length = 1;
diff --git a/src/test-unicode.cpp b/src/test-unicode.cpp
index 2cc8393..2dfabef 100644
--- a/src/test-unicode.cpp
+++ b/src/test-unicode.cpp
@@ -5,7 +5,10 @@
 #include <boost/test/data/monomorphic.hpp>
 #include <boost/test/data/test_case.hpp>
 
+#include <chrono>
 #include <exception>
+#include <limits>
+#include <random>
 #include <string>
 #include <tuple>
 #include <type_traits>
@@ -13,6 +16,8 @@
 
 #include <unicode.h>
 
+using namespace std::chrono_literals;
+
 typedef std::tuple<std::basic_string<char8_t>, std::basic_string<char16_t>, std::basic_string<char32_t>> types_collection_type;
 
 // create tuple of the same string, in UTF-8, UTF-16 and UTF-32
@@ -30,17 +35,22 @@ std::vector<types_collection_type> success_sets {
 
 // Error cases: throwing upon convert to all other types
 std::vector<std::basic_string<char8_t>> failure_strings_char8_t {
- u8"\x80",
- u8"\x81"
+ u8"\x80", // lone utf-8 continuation byte (no lead byte before it)
+ u8"\x81", // lone utf-8 continuation byte (no lead byte before it)
+ u8"\xc3ä", // lead byte 0xC3 of utf-8 "ä" without its continuation byte, followed by valid utf-8 "ä"
+ u8"\xF8\x80\x80\x80\x80", // overlong encoding: 5-byte form of U+0000 (5-byte sequences are not valid UTF-8)
+ u8"\xF7\xBF\xBF\xBF", // structurally valid 4-byte sequence decoding to 0x1FFFFF, an invalid code point (> U+10FFFF)
 };
 
 std::vector<std::basic_string<char16_t>> failure_strings_char16_t {
- u"\xD801",
+ u"\xD801", // single high surrogate (no low surrogate follows)
+ u"\xDFFF", // single low surrogate (no high surrogate precedes)
+ u"\xDFFF\xD801", // bad surrogate pair order: low surrogate before high surrogate
 };
 
 std::vector<std::basic_string<char32_t>> failure_strings_char32_t {
- U"\xD801",
- U"\x10000000",
+ U"blabla \xD801", // invalid unicode (surrogate half) after valid ASCII text
+ U"\x10000000", // invalid unicode (number too big: above U+10FFFF)
 };
 
 // output operators must be in same namespace as the type itself
@@ -156,16 +166,65 @@ BOOST_AUTO_TEST_CASE(is_valid_unicode)
  BOOST_CHECK(!unicode::is_valid_unicode(0xDFFF));
 }
 
+struct random_context { // shared random state for the random tests: seed source, engine and sequence length distribution
+ std::random_device rd;  // OS random number engine to seed RNG (below)
+ std::mt19937 gen{rd()}; // seeded once from rd at construction; the seed is not recorded, so a run cannot be replayed
+ std::uniform_int_distribution<> sequence_length{0, 100000}; // length of sequence: 0 ... 100000 code units
+};
+
+template<typename T> // T: string type (e.g. std::u32string); returns `length` uniformly random code units, not necessarily valid UTF
+T generate_random(random_context& rc, size_t length)
+{
+ // unsigned long covers the full char32_t range; the default IntType (int) cannot hold 0xFFFFFFFF, which broke the a <= b precondition
+ std::uniform_int_distribution<unsigned long> code_unit(0, std::numeric_limits<typename T::value_type>::max()); // code unit value
+ T result;
+ std::generate_n(std::back_inserter(result), length, [&](){return static_cast<typename T::value_type>(code_unit(rc.gen));});
+ return result;
+}
+
+template<typename From, typename ToTypesCollectionType, size_t i = 0>
+void test_random(random_context& rc, size_t length)
+{
+ // Converts a random From sequence of `length` code units to the i-th type of ToTypesCollectionType, then recurses for the remaining types.
+ // Passes if utf_to_utf returns or throws one of the two exception types caught below; the converted content is not checked.
+ //std::cerr << "LENGTH: " << length << std::endl;
+ typedef typename std::tuple_element<i,ToTypesCollectionType>::type To;
+ // a fresh random sequence is generated for every target type (this line runs once per recursion step)
+ From r {generate_random<From>(rc, length)};
+ try {
+  To result{unicode::utf_to_utf<typename From::value_type,typename To::value_type>(r)};
+ } catch (const std::runtime_error&) {
+  // OK: this is an expected exception for utf_to_utf on bad input
+ } catch (const std::invalid_argument&) {
+  // OK: this is an expected exception for utf_to_utf on bad input
+ }
+ // any other exception type is not caught here and propagates to the caller
+ //std::cerr << "DEBUG: " << typeid(From).name() << std::endl;
+ //std::cerr << " DEBUG2: " << typeid(To).name() << std::endl;
+ // iterate over remaining To types
+ if constexpr (i + 1 < std::tuple_size<ToTypesCollectionType>::value)
+  test_random<From, ToTypesCollectionType, i + 1>(rc, length);
+}
+
+BOOST_AUTO_TEST_CASE_TEMPLATE(random_sequences, T, types_collection_type)
+{
+ // Time-boxed random test: converts random sequences of type T to every type in types_collection_type until the deadline passes.
+ random_context rc;
+ // run for 1s (debug) 10s (release)
+ // NOTE(review): _DEBUG is not defined by every toolchain; where it is missing the 10s budget applies -- confirm the build sets it.
+#ifdef _DEBUG
+ const auto timeout{1.0s};
+#else
+ const auto timeout{10.0s};
+#endif
+ // the budget is divided by the number of types in types_collection_type, so all instantiations together take about `timeout`
+ auto timeout_stamp { std::chrono::steady_clock::now() + (timeout / std::tuple_size<types_collection_type>::value)};
+ while (!(std::chrono::steady_clock::now() > timeout_stamp)) {
+  test_random<T,types_collection_type>(rc, rc.sequence_length(rc.gen));
+ }
+}
+
 // TODO:
-// UTF-8
-//  invalid bytes
-//  an unexpected continuation byte
-//  a non-continuation byte before the end of the character
-//  the string ending before the end of the character (which can happen in simple string truncation)
-//  an overlong encoding
-//  a sequence that decodes to an invalid code point
-//
-//  high and low surrogate halves used by UTF-16 (U+D800 through U+DFFF) and code points not encodable by UTF-16 (those after U+10FFFF)
 //
 // char8_t, char16_t, char32_t, char, wchar_t (UTF-16 on Windows, UTF-32 on Linux)
 // string, vector?
-- 
cgit v1.2.3