From cd4fad54c0be9fb7fca57e8e03228b8b649b5b51 Mon Sep 17 00:00:00 2001
From: Roland Reichwein <mail@reichwein.it>
Date: Wed, 27 Jan 2021 22:21:04 +0100
Subject: Bugfixes, tests

---
 include/unicode.h    |  40 ++++++++--------
 src/test-unicode.cpp | 129 +++++++++++++++++++++++++++++++++++++++++++++------
 2 files changed, 136 insertions(+), 33 deletions(-)
diff --git a/include/unicode.h b/include/unicode.h
index a55eac3..f539e6b 100644
--- a/include/unicode.h
+++ b/include/unicode.h
@@ -1,5 +1,4 @@
 // libunicode
-// Copyright (C) 2021 Roland Reichwein
 
 #pragma once
 
@@ -7,11 +6,20 @@
 #include <stdexcept>
 #include <string>
 
-#ifdef __has_cpp_attribute
-#if __has_cpp_attribute(__cpp_char8_t)
+#ifdef __cpp_char8_t
 // char8_t available
 #endif
-#endif
+
+namespace unicode {
+
+ // T is usually char32_t, uint32_t etc.; true if value <= 0x10FFFF and outside the surrogate range 0xD800..0xDFFF (NOTE: a negative value of a signed T also passes)
+ template<typename T>
+ static inline bool is_valid_unicode(const T& value)
+ {
+   return value <= 0x10FFFF && (value <= 0xD7FF || value >= 0xE000);
+ }
+
+}
 
 namespace {
 
@@ -50,6 +58,8 @@ namespace {
   template<typename T1>
   void calculate_value()
   {
+   static_assert(sizeof(T1) == 4);
+
    size_t remaining{remaining_code_units()};
 
    if (!remaining)
@@ -57,7 +67,7 @@ namespace {
 
    value = get_code_unit<0>();
    
-   if (value > 0x10FFFF || (value > 0xD7FF && value < 0xE000))
+   if (!unicode::is_valid_unicode(value))
     throw std::invalid_argument("Invalid Unicode character: "s + std::to_string(static_cast<uint32_t>(value)));
 
    sequence_length = 1;
@@ -88,7 +98,7 @@ namespace {
   template<typename... Targs>
   inline static char32_t continuation_value(T b, Targs... Fargs)
   {
-   return continuation_value(b) << 6 | continuation_value(Fargs...);
+   return continuation_value(b) << (6 * sizeof...(Targs)) | continuation_value(Fargs...);
   }
 
   template<size_t n>
@@ -159,7 +169,7 @@ namespace {
     if ((unit0 & 0xFC00) != 0xD800 || (unit1 & 0xFC00) != 0xDC00)
      throw std::invalid_argument("Bad input: 2 malformed UTF-16 surrogates");
 
-    value = static_cast<char32_t>(unit0 & 0x03FF) << 10 | (unit1 & 0x03FF);
+    value = (static_cast<char32_t>(unit0 & 0x03FF) << 10 | (unit1 & 0x03FF)) + 0x10000;
     sequence_length = 2;
    }
   }
@@ -185,7 +195,7 @@ namespace {
   typename string_type::const_iterator iterator;
   typename string_type::const_iterator end_iterator;
 
-  value_type value{};
+  char32_t value{}; // always save complete unicode code point at this point
   size_t sequence_length{};
  };
 
@@ -276,8 +286,9 @@ namespace {
    if (value <= 0xFFFF) { // expect value to be already valid Unicode values
     s.push_back(value);
    } else {
-    s.push_back((value >> 10) + 0xD800);
-    s.push_back((value & 0x3FF) + 0xDC00);
+    char32_t value_reduced{value - 0x10000};
+    s.push_back((value_reduced >> 10) + 0xD800);
+    s.push_back((value_reduced & 0x3FF) + 0xDC00);
    }
    return *this;
   }
@@ -317,14 +328,5 @@ std::basic_string<To> utf_to_utf(const std::basic_string<From>& s)
  return result;
 }
 
-//std::u8string utf16_to_utf8(const std::u16string& s)
-//{
-// std::u8string result;
-//
-// std::transform(utf16_begin(s), utf16_end(s), std::back_inserter(result));
-//
-// return result;
-//}
-
 } // namespace unicode
 
diff --git a/src/test-unicode.cpp b/src/test-unicode.cpp
index 0560c1b..2cc8393 100644
--- a/src/test-unicode.cpp
+++ b/src/test-unicode.cpp
@@ -1,17 +1,83 @@
 #define BOOST_TEST_MODULE unicode_test
 
 #include <boost/test/included/unit_test.hpp>
+#include <boost/test/data/dataset.hpp>
+#include <boost/test/data/monomorphic.hpp>
+#include <boost/test/data/test_case.hpp>
 
+#include <exception>
 #include <string>
 #include <tuple>
 #include <type_traits>
+#include <vector>
 
 #include <unicode.h>
 
-std::tuple<std::basic_string<char8_t>, std::basic_string<char16_t>, std::basic_string<char32_t>> t {
- u8"Täst", u"Täst", U"Täst"
+typedef std::tuple<std::basic_string<char8_t>, std::basic_string<char16_t>, std::basic_string<char32_t>> types_collection_type;
+
+// create tuple of the same string, in UTF-8, UTF-16 and UTF-32
+#define SUCCESS_TUPLE(x) {u8 ## x, u ## x, U ## x}
+
+// Success cases: convert string to all other types, respectively
+std::vector<types_collection_type> success_sets {
+ SUCCESS_TUPLE(""),
+ SUCCESS_TUPLE("ASCII string1"),
+ SUCCESS_TUPLE("Täst just looks like German"),
+ SUCCESS_TUPLE("\u732b is chinese for cat"),
+ SUCCESS_TUPLE("\U0001F63A"),
+ SUCCESS_TUPLE("\U0001F63A is a smiling cat"),
+};
+
+// Error cases: converting these strings to any of the UTF-8, UTF-16 or UTF-32 string types must throw
+std::vector<std::basic_string<char8_t>> failure_strings_char8_t {
+ u8"\x80",
+ u8"\x81"
+};
+
+std::vector<std::basic_string<char16_t>> failure_strings_char16_t {
+ u"\xD801",
+};
+
+std::vector<std::basic_string<char32_t>> failure_strings_char32_t {
+ U"\xD801",
+ U"\x10000000",
 };
 
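+// Output operators must be in the same namespace as the type itself so that argument-dependent lookup finds them (NOTE: adding overloads to namespace std is formally undefined behavior)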
+namespace std {
+
+std::ostream& operator<<(std::ostream& os, std::basic_string<char8_t> const& s)
+{
+ os << "[";
+ for (auto& c: s)
+  os << " " << std::to_string(static_cast<uint8_t>(c));
+ os << "]";
+
+ return os;
+}
+
+std::ostream& operator<<(std::ostream& os, std::basic_string<char16_t> const& s)
+{
+ os << "[";
+ for (auto& c: s)
+  os << " " << std::to_string(static_cast<uint16_t>(c));
+ os << "]";
+
+ return os;
+}
+
+std::ostream& operator<<(std::ostream& os, std::basic_string<char32_t> const& s)
+{
+ os << "[";
+ for (auto& c: s)
+  os << " " << std::to_string(static_cast<uint32_t>(c));
+ os << "]";
+
+ return os;
+}
+
+}
+
 template<size_t i = 0, size_t j = 0, typename... Ts>
 void test_utf_to_utf(std::tuple<Ts...>& t)
 {
@@ -21,7 +87,7 @@ void test_utf_to_utf(std::tuple<Ts...>& t)
  // test
  To result { unicode::utf_to_utf<typename From::value_type, typename To::value_type>(std::get<i>(t)) };
 
- BOOST_CHECK(std::get<j>(t) == result);
+ BOOST_CHECK_MESSAGE(std::get<j>(t) == result, "From " << typeid(From).name() << "(" << i << ", " << std::get<i>(t) << ") to " << typeid(To).name() << "(" << j << ", " << std::get<j>(t) << "), got " << result);
 
  //std::cout << std::to_string(std::tuple_size<typename std::remove_reference<decltype(t)>::type>::value) << "," << std::to_string(i) << "," << std::to_string(j) << std::endl;
 
@@ -32,27 +98,62 @@ void test_utf_to_utf(std::tuple<Ts...>& t)
   test_utf_to_utf<0, j + 1>(t);
 }
 
-BOOST_AUTO_TEST_CASE(utf_to_utf)
+// We don't use BOOST_DATA_TEST_CASE here because boost::test tries to assign
+// a new variable to each tuple element which we don't want
+// https://lists.boost.org/boost-bugs/2016/05/45214.php
+
+BOOST_AUTO_TEST_CASE(utf_to_utf_success)
 {
- test_utf_to_utf(t);
+ for (auto& t: success_sets)
+  test_utf_to_utf(t);
 }
 
-BOOST_AUTO_TEST_CASE(utf8_to_utf16)
+// try the conversion to each element type of the std::tuple Collection, starting at index
+template<typename From, typename Collection, size_t index = 0>
+void test_utf_to_utf_failure(std::basic_string<From>& s)
 {
- std::u8string u8{u8"ascii string1"};
- 
- std::u16string u16{unicode::utf_to_utf<char8_t, char16_t>(u8)};
+ typedef typename std::tuple_element<index, Collection>::type::value_type To;
 
- BOOST_CHECK(u16 == u"ascii string1");
+ try {
+  unicode::utf_to_utf<From,To>(s);
+  BOOST_FAIL("Expected exception at index: " << index << ", " << typeid(From).name() << " -> " << typeid(To).name());
+ } catch (...) {
+  // OK
+ };
+
+ // iterate over remaining types 
+ if constexpr (index + 1 < std::tuple_size<Collection>::value)
+  test_utf_to_utf_failure<From, Collection, index + 1>(s);
 }
 
-BOOST_AUTO_TEST_CASE(utf16_to_utf8)
+BOOST_AUTO_TEST_CASE(utf_to_utf_failure)
 {
- std::u16string u16{u"ascii string1"};
+ for (auto& s: failure_strings_char8_t)
+  test_utf_to_utf_failure<typename std::remove_reference<decltype(s)>::type::value_type, types_collection_type>(s);
  
- std::u8string u8{unicode::utf_to_utf<char16_t, char8_t>(u16)};
+ for (auto& s: failure_strings_char16_t)
+  test_utf_to_utf_failure<typename std::remove_reference<decltype(s)>::type::value_type, types_collection_type>(s);
+
+ for (auto& s: failure_strings_char32_t)
+  test_utf_to_utf_failure<typename std::remove_reference<decltype(s)>::type::value_type, types_collection_type>(s);
+}
+
+BOOST_AUTO_TEST_CASE(is_valid_unicode)
+{
+ BOOST_CHECK(unicode::is_valid_unicode('\0'));
+ BOOST_CHECK(unicode::is_valid_unicode(U'a'));
+ BOOST_CHECK(unicode::is_valid_unicode(U'ä'));
+ BOOST_CHECK(unicode::is_valid_unicode(U'\u732b')); // cat chinese
+ BOOST_CHECK(unicode::is_valid_unicode(U'\U0001F63A')); // cat smiley
+ BOOST_CHECK(unicode::is_valid_unicode(0x0001F63A)); // cat smiley
 
- BOOST_CHECK(u8 == u8"ascii string1");
+ BOOST_CHECK(!unicode::is_valid_unicode(0x00110000));
+ BOOST_CHECK(!unicode::is_valid_unicode(0xFFFFFFFF)); // U"\UFFFFFFFF" is invalid C++
+ BOOST_CHECK(!unicode::is_valid_unicode(0x01234567));
+ BOOST_CHECK(!unicode::is_valid_unicode(0x12345678));
+ BOOST_CHECK(!unicode::is_valid_unicode(0xD800));
+ BOOST_CHECK(!unicode::is_valid_unicode(0xD987));
+ BOOST_CHECK(!unicode::is_valid_unicode(0xDFFF));
 }
 
 // TODO:
-- 
cgit v1.2.3