From ae0ccdf4569d6d4f49c60392a9e849aaa58c3fa6 Mon Sep 17 00:00:00 2001 From: Roland Reichwein Date: Thu, 28 Jan 2021 21:18:39 +0100 Subject: Bugfix, test --- include/unicode.h | 5 +++ src/test-unicode.cpp | 87 +++++++++++++++++++++++++++++++++++++++++++--------- 2 files changed, 78 insertions(+), 14 deletions(-) diff --git a/include/unicode.h b/include/unicode.h index f539e6b..908c75f 100644 --- a/include/unicode.h +++ b/include/unicode.h @@ -141,6 +141,11 @@ namespace { throw std::invalid_argument("Bad input: Invalid 2 byte sequence"); } else throw std::invalid_argument("Bad input: 2nd byte expected, none found"); + + // check only for sequences >= 2 bytes (ASCII is always compliant) + if (!unicode::is_valid_unicode(value)) + throw std::invalid_argument("Invalid Unicode character: "s + std::to_string(static_cast(value))); + } else { // 1 byte: 7 bit ASCII value = byte0; sequence_length = 1; diff --git a/src/test-unicode.cpp b/src/test-unicode.cpp index 2cc8393..2dfabef 100644 --- a/src/test-unicode.cpp +++ b/src/test-unicode.cpp @@ -5,7 +5,10 @@ #include #include +#include #include +#include +#include #include #include #include @@ -13,6 +16,8 @@ #include +using namespace std::chrono_literals; + typedef std::tuple, std::basic_string, std::basic_string> types_collection_type; // create tuple of the same string, in UTF-8, UTF-16 and UTF-32 @@ -30,17 +35,22 @@ std::vector success_sets { // Error cases: throwing upon convert to all other types std::vector> failure_strings_char8_t { - u8"\x80", - u8"\x81" + u8"\x80", // utf-8 continuation byte + u8"\x81", // utf-8 continuation byte + u8"\xc3ä", // initial byte of utf-8 "ä", followed by valid utf-8 "ä" + u8"\xF8\x80\x80\x80\x80", // overlong encoding + u8"\xF7\xBF\xBF\xBF", // valid encoding of invalid code point }; std::vector> failure_strings_char16_t { - u"\xD801", + u"\xD801", // single high surrogate + u"\xDFFF", // single low surrogate + u"\xDFFF\xD801", // bad surrogate pair order }; std::vector> failure_strings_char32_t { - U"\xD801", - U"\x10000000", + U"blabla \xD801", // invalid unicode (surrogate half) + U"\x10000000", // invalid unicode (number too big) }; // output operators must be in same namespace as the type itself @@ -156,16 +166,65 @@ BOOST_AUTO_TEST_CASE(is_valid_unicode) BOOST_CHECK(!unicode::is_valid_unicode(0xDFFF)); } +struct random_context { + std::random_device rd; // OS random number engine to seed RNG (below) + std::mt19937 gen{rd()}; + std::uniform_int_distribution<> sequence_length{0, 100000}; // length of sequence: 0 ... 100000 code units +}; + +template +T generate_random(random_context& rc, size_t length) +{ + std::uniform_int_distribution<> code_unit(0, std::numeric_limits::max()); // code unit value + T result; + std::generate_n(std::back_inserter(result), length, [&](){return code_unit(rc.gen);}); + + return result; +} + +template +void test_random(random_context& rc, size_t length) +{ + //std::cerr << "LENGTH: " << length << std::endl; + typedef typename std::tuple_element::type To; + + From r {generate_random(rc, length)}; + + try { + To result{unicode::utf_to_utf(r)}; + } catch (const std::runtime_error&) { + // OK: this is an expected exception for utf_to_utf on bad input + } catch (const std::invalid_argument&) { + // OK: this is an expected exception for utf_to_utf on bad input + } + + //std::cerr << "DEBUG: " << typeid(From).name() << std::endl; + //std::cerr << " DEBUG2: " << typeid(To).name() << std::endl; + + // iterate over remaining To types + if constexpr (i + 1 < std::tuple_size::value) + test_random(rc, length); +} + +BOOST_AUTO_TEST_CASE_TEMPLATE(random_sequences, T, types_collection_type) +{ + random_context rc; + + // run for 1s (debug) 10s (release) +#ifdef _DEBUG + const auto timeout{1.0s}; +#else + const auto timeout{10.0s}; +#endif + + auto timeout_stamp { std::chrono::steady_clock::now() + (timeout / std::tuple_size::value)}; + + while (!(std::chrono::steady_clock::now() > timeout_stamp)) { + test_random(rc, rc.sequence_length(rc.gen)); + } +} + // TODO: -// UTF-8 -// invalid bytes -// an unexpected continuation byte -// a non-continuation byte before the end of the character -// the string ending before the end of the character (which can happen in simple string truncation) -// an overlong encoding -// a sequence that decodes to an invalid code point -// -// high and low surrogate halves used by UTF-16 (U+D800 through U+DFFF) and code points not encodable by UTF-16 (those after U+10FFFF) // // char8_t, char16_t, char32_t, char, wchar_t (UTF-16 on Windows, UTF-32 on Linux) // string, vector? -- cgit v1.2.3