summaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
author Roland Reichwein <mail@reichwein.it> 2021-01-28 21:18:39 +0100
committer Roland Reichwein <mail@reichwein.it> 2021-01-28 21:18:39 +0100
commit ae0ccdf4569d6d4f49c60392a9e849aaa58c3fa6 (patch)
tree 42f8f4a3a62edff2d2d851f688d6a6cae26c7ae2
parent cd4fad54c0be9fb7fca57e8e03228b8b649b5b51 (diff)
Bugfix, test
-rw-r--r-- include/unicode.h 5
-rw-r--r-- src/test-unicode.cpp 87
2 files changed, 78 insertions, 14 deletions
diff --git a/include/unicode.h b/include/unicode.h
index f539e6b..908c75f 100644
--- a/include/unicode.h
+++ b/include/unicode.h
@@ -141,6 +141,11 @@ namespace {
throw std::invalid_argument("Bad input: Invalid 2 byte sequence");
} else
throw std::invalid_argument("Bad input: 2nd byte expected, none found");
+
+ // check only for sequences >= 2 bytes (ASCII is always compliant)
+ if (!unicode::is_valid_unicode(value))
+ throw std::invalid_argument("Invalid Unicode character: "s + std::to_string(static_cast<uint32_t>(value)));
+
} else { // 1 byte: 7 bit ASCII
value = byte0;
sequence_length = 1;
diff --git a/src/test-unicode.cpp b/src/test-unicode.cpp
index 2cc8393..2dfabef 100644
--- a/src/test-unicode.cpp
+++ b/src/test-unicode.cpp
@@ -5,7 +5,10 @@
#include <boost/test/data/monomorphic.hpp>
#include <boost/test/data/test_case.hpp>
+#include <chrono>
#include <exception>
+#include <limits>
+#include <random>
#include <string>
#include <tuple>
#include <type_traits>
@@ -13,6 +16,8 @@
#include <unicode.h>
+using namespace std::chrono_literals;
+
typedef std::tuple<std::basic_string<char8_t>, std::basic_string<char16_t>, std::basic_string<char32_t>> types_collection_type;
// create tuple of the same string, in UTF-8, UTF-16 and UTF-32
@@ -30,17 +35,22 @@ std::vector<types_collection_type> success_sets {
// Error cases: throwing upon convert to all other types
std::vector<std::basic_string<char8_t>> failure_strings_char8_t { // malformed UTF-8 inputs (error cases, see comment above)
- u8"\x80",
- u8"\x81"
+ u8"\x80", // lone UTF-8 continuation byte (no lead byte before it)
+ u8"\x81", // lone UTF-8 continuation byte (no lead byte before it)
+ u8"\xc3ä", // lead byte of UTF-8 "ä" without its continuation byte, followed by a complete UTF-8 "ä"
+ u8"\xF8\x80\x80\x80\x80", // 5-byte form (lead byte 0xF8 is not valid UTF-8); overlong encoding of U+0000
+ u8"\xF7\xBF\xBF\xBF", // 4-byte bit pattern that decodes to 0x1FFFFF, beyond the last code point U+10FFFF
};
std::vector<std::basic_string<char16_t>> failure_strings_char16_t { // malformed UTF-16 inputs (error cases)
- u"\xD801",
+ u"\xD801", // unpaired high surrogate
+ u"\xDFFF", // unpaired low surrogate
+ u"\xDFFF\xD801", // low surrogate before high surrogate (reversed pair order)
};
std::vector<std::basic_string<char32_t>> failure_strings_char32_t { // invalid UTF-32 inputs (error cases)
- U"\xD801",
- U"\x10000000",
+ U"blabla \xD801", // valid text followed by invalid unicode (surrogate half U+D801)
+ U"\x10000000", // invalid unicode (0x10000000 is above the last code point U+10FFFF)
};
// output operators must be in same namespace as the type itself
@@ -156,16 +166,65 @@ BOOST_AUTO_TEST_CASE(is_valid_unicode)
BOOST_CHECK(!unicode::is_valid_unicode(0xDFFF));
}
+struct random_context { // random state handed to the random-sequence test helpers; members initialize in declaration order
+ std::random_device rd; // OS random number engine to seed RNG (below)
+ std::mt19937 gen{rd()}; // pseudo-random engine, seeded once from rd at construction
+ std::uniform_int_distribution<> sequence_length{0, 100000}; // length of sequence: 0 ... 100000 code units (both bounds inclusive)
+};
+
+// Build a sequence of `length` random code units of string type T.
+//
+// Each code unit is drawn uniformly from 0 ... numeric_limits<T::value_type>::max()
+// using rc.gen. No attempt is made to produce well-formed UTF: the result is
+// meant as arbitrary input for the conversion functions.
+//
+// rc:     random state; rc.gen is advanced by `length` draws
+// length: number of code units to generate (0 yields an empty T)
+// return: the generated sequence
+template<typename T>
+T generate_random(random_context& rc, size_t length)
+{
+  // The default result type of uniform_int_distribution is int, which cannot
+  // hold the maximum of char32_t: the upper bound would be converted to a
+  // negative value and break the distribution's a <= b precondition.
+  // unsigned long is at least 32 bits wide and therefore fits every code unit.
+  std::uniform_int_distribution<unsigned long> code_unit(0, std::numeric_limits<typename T::value_type>::max()); // code unit value
+  T result;
+  std::generate_n(std::back_inserter(result), length, [&](){return static_cast<typename T::value_type>(code_unit(rc.gen));});
+
+  return result;
+}
+
+// Random-input helper: generate one random From sequence of `length` code
+// units and convert it to the i-th string type of ToTypesCollectionType, then
+// continue with the remaining types of the collection.
+//
+// std::invalid_argument and std::runtime_error thrown by the conversion are
+// accepted as the expected reaction to bad input; anything else propagates.
+template<typename From, typename ToTypesCollectionType, size_t i = 0>
+void test_random(random_context& rc, size_t length)
+{
+  using To = typename std::tuple_element<i,ToTypesCollectionType>::type;
+  using from_unit = typename From::value_type;
+  using to_unit = typename To::value_type;
+
+  From input {generate_random<From>(rc, length)};
+
+  try {
+    To converted{unicode::utf_to_utf<from_unit,to_unit>(input)};
+  } catch (const std::invalid_argument&) {
+    // OK: this is an expected exception for utf_to_utf on bad input
+  } catch (const std::runtime_error&) {
+    // OK: this is an expected exception for utf_to_utf on bad input
+  }
+
+  // move on to the next target type, if the collection has one left
+  constexpr size_t next = i + 1;
+  if constexpr (next < std::tuple_size<ToTypesCollectionType>::value)
+    test_random<From, ToTypesCollectionType, next>(rc, length);
+}
+
+// Feed utf_to_utf with random sequences of string type T until the time
+// budget of this instantiation is used up.
+BOOST_AUTO_TEST_CASE_TEMPLATE(random_sequences, T, types_collection_type)
+{
+  random_context rc;
+
+  // overall budget: 1s (debug) 10s (release), split evenly over all types in
+  // types_collection_type
+#ifdef _DEBUG
+  const auto timeout = 1.0s;
+#else
+  const auto timeout = 10.0s;
+#endif
+  constexpr auto type_count = std::tuple_size<types_collection_type>::value;
+
+  const auto deadline = std::chrono::steady_clock::now() + (timeout / type_count);
+
+  while (std::chrono::steady_clock::now() <= deadline) {
+    const auto length = rc.sequence_length(rc.gen); // random sequence length for this round
+    test_random<T,types_collection_type>(rc, length);
+  }
+}
+
// TODO:
-// UTF-8
-// invalid bytes
-// an unexpected continuation byte
-// a non-continuation byte before the end of the character
-// the string ending before the end of the character (which can happen in simple string truncation)
-// an overlong encoding
-// a sequence that decodes to an invalid code point
-//
-// high and low surrogate halves used by UTF-16 (U+D800 through U+DFFF) and code points not encodable by UTF-16 (those after U+10FFFF)
//
// char8_t, char16_t, char32_t, char, wchar_t (UTF-16 on Windows, UTF-32 on Linux)
// string, vector?