From fad8b697dff7c7b47f034124ea6eef25e74bd7af Mon Sep 17 00:00:00 2001
From: Roland Reichwein <mail@reichwein.it>
Date: Tue, 26 Jan 2021 22:05:08 +0100
Subject: Implement conversion and first tests

---
 include/unicode.h    | 257 ++++++++++++++++++++++++++++++++++++++++++---------
 src/test-unicode.cpp |  47 +++++++++-
 2 files changed, 257 insertions(+), 47 deletions(-)
diff --git a/include/unicode.h b/include/unicode.h
index 512891a..a55eac3 100644
--- a/include/unicode.h
+++ b/include/unicode.h
@@ -15,66 +15,164 @@
 
 namespace {
 
- struct utf8_iterator
+ using namespace std::string_literals;
+
+ template<typename T>
+ struct utf_iterator
  {
   typedef char32_t value_type;
   typedef char32_t& reference;
+  typedef std::basic_string<T> string_type;
 
-  utf8_iterator(const std::u8string::const_iterator& cbegin, const std::u8string::const_iterator& cend):
+  utf_iterator(const typename string_type::const_iterator& cbegin, const typename string_type::const_iterator& cend):
    iterator(cbegin), end_iterator(cend)
   {
-   calculate_value();
+   calculate_value<T>();
+  }
+
+  utf_iterator(const utf_iterator& other) = default; // injected-class-name: C++20 rejects a template-id as constructor name
+  utf_iterator& operator=(const utf_iterator& other) = default;
+
+  size_t remaining_code_units()
+  {
+   return end_iterator - iterator;
   }
 
-  utf8_iterator(const utf8_iterator& other) = default;
-  utf8_iterator& operator=(const utf8_iterator& other) = default;
+  template<size_t index>
+  T get_code_unit() // code unit located `index` positions after the current one; no bounds check is performed here
+  {
+   return iterator[index];
+  }
 
   // set value member
+  // default: char32_t for UTF-32
+  // specializations for UTF-8 and UTF-16 below
+  template<typename T1>
   void calculate_value()
   {
-   if (iterator == end_iterator)
+   size_t remaining{remaining_code_units()};
+
+   if (!remaining)
+    return;
+
+   value = get_code_unit<0>();
+   
+   if (value > 0x10FFFF || (value > 0xD7FF && value < 0xE000))
+    throw std::invalid_argument("Invalid Unicode character: "s + std::to_string(static_cast<uint32_t>(value)));
+
+   sequence_length = 1;
+  }
+
+  inline static bool is_continuation_byte(T b) // true if b has the bit pattern 10xxxxxx
+  {
+   return (b & 0xC0) == 0x80;
+  }
+
+  template<typename... Targs>
+  inline static bool is_continuation_byte(T b, Targs... Fargs) // true if every given byte is a continuation byte
+  {
+   return is_continuation_byte(b) && (is_continuation_byte(Fargs) && ...);
+  }
+
+  template<size_t n>
+  inline static bool is_byte0_of(T b)
+  {
+   return (b & static_cast<T>(0xFF << (7 - n))) == static_cast<T>(0xFF << (8 - n));
+  }
+
+  inline static char32_t continuation_value(T b) // payload of one continuation byte: its low 6 bits
+  {
+   return static_cast<char32_t>(b & 0b00111111);
+  }
+
+  template<typename... Targs>
+  inline static char32_t continuation_value(T b, Targs... Fargs) // combined payload, first argument most significant
+  {
+   return continuation_value(b) << (6 * sizeof...(Fargs)) | continuation_value(Fargs...); // b sits above one 6-bit group per remaining byte
+  }
+
+  template<size_t n>
+  inline static char32_t value_byte0_of(T b)
+  {
+   return static_cast<char32_t>(b & (0b1111111 >> n)) << ((n - 1) * 6);
+  }
+
+  // specialization for UTF-8
+  template<>
+  void calculate_value<char8_t>()
+  {
+   size_t remaining{remaining_code_units()};
+   
+   if (!remaining)
     return;
 
-   char8_t first_byte {*iterator};
-   if (first_byte & 0x80) { // 2-4 bytes
-    if (iterator + 1 != end_iterator) {
-     char8_t second_byte {*(iterator + 1)};
-     if ((first_byte & 0b11100000) == 0b11000000 && (second_byte & 0b11000000) == 0b10000000) { // 2 bytes
-      value = char32_t(first_byte & 0b11111) << 6 | (second_byte & 0b111111);
+   char8_t byte0 {get_code_unit<0>()};
+   if (byte0 & 0x80) { // 2-4 bytes
+    if (remaining >= 2) {
+     char8_t byte1 {get_code_unit<1>()};
+     if (is_byte0_of<2>(byte0) && is_continuation_byte(byte1)) { // 2 bytes
+      value = value_byte0_of<2>(byte0) | continuation_value(byte1);
       sequence_length = 2;
-     } else if (iterator + 2 != end_iterator) {
-      char8_t third_byte {*(iterator + 2)};
-      if ((first_byte & 0b11110000) == 0b11100000 && (second_byte & 0b11000000) == 0b10000000 && (third_byte & 0b11000000) == 0b10000000) { // 3 bytes
-       value = char32_t(first_byte & 0b1111) << 12 | char32_t(second_byte & 0b111111) << 6 | (third_byte & 0b111111);
+     } else if (remaining >= 3) {
+      char8_t byte2 {get_code_unit<2>()};
+      if (is_byte0_of<3>(byte0) && is_continuation_byte(byte1, byte2)) { // 3 bytes
+       value = value_byte0_of<3>(byte0) | continuation_value(byte1, byte2);
        sequence_length = 3;
-      } else if (iterator + 3 != end_iterator) {
-       char8_t fourth_byte {*(iterator + 3)};
-       if ((first_byte & 0b11111000) == 0b11110000 && (second_byte & 0b11000000) == 0b10000000 && (third_byte & 0b11000000) == 0b10000000 && (fourth_byte & 0b11000000) == 0b10000000) { // 4 bytes
-        value = char32_t(first_byte & 0b111) << 18 | char32_t(second_byte & 0b111111) << 12 | char32_t(third_byte & 0b111111) << 6 | (fourth_byte & 0b111111);
+      } else if (remaining >= 4) {
+       char8_t byte3 {get_code_unit<3>()};
+       if (is_byte0_of<4>(byte0) && is_continuation_byte(byte1, byte2, byte3)) { // 4 bytes
+        value = value_byte0_of<4>(byte0) | continuation_value(byte1, byte2, byte3);
         sequence_length = 4;
        } else
-        throw std::invalid_argument("bad input: invalid 4 byte sequence");
+        throw std::invalid_argument("Bad input: Invalid 4 byte sequence");
       } else
-       throw std::invalid_argument("bad input: invalid 3 byte sequence");
+       throw std::invalid_argument("Bad input: Invalid 3 byte sequence");
      } else
-      throw std::invalid_argument("bad input: invalid 2 byte sequence");
+      throw std::invalid_argument("Bad input: Invalid 2 byte sequence");
     } else
-     throw std::invalid_argument("bad input: byte 2 expected, none found");
+     throw std::invalid_argument("Bad input: 2nd byte expected, none found");
    } else { // 1 byte: 7 bit ASCII
-    value = first_byte;
+    value = byte0;
     sequence_length = 1;
    }
   }
 
+  // specialization for UTF-16: sets value and sequence_length from 1 unit (BMP) or a surrogate pair; throws std::invalid_argument on malformed input
+  template<>
+  void calculate_value<char16_t>()
+  {
+   size_t remaining{remaining_code_units()};
+   
+   if (!remaining)
+    return;
+
+   char16_t unit0 {get_code_unit<0>()};
+
+   if (unit0 <= 0xD7FF || unit0 >= 0xE000) { // 1 unit (BMP Basic Multilingual Plane)
+    value = unit0;
+    sequence_length = 1;
+   } else {
+    if (remaining < 2)
+     throw std::invalid_argument("Bad input: Continuation of first UTF-16 unit missing");
+
+    char16_t unit1 {get_code_unit<1>()};
+    if ((unit0 & 0xFC00) != 0xD800 || (unit1 & 0xFC00) != 0xDC00)
+     throw std::invalid_argument("Bad input: 2 malformed UTF-16 surrogates");
+
+    value = (static_cast<char32_t>(unit0 & 0x03FF) << 10 | (unit1 & 0x03FF)) + 0x10000; // a surrogate pair stores (code point - 0x10000)
+    sequence_length = 2;
+   }
+  }
+
   // pre-increment
-  utf8_iterator& operator++()
+  utf_iterator<T>& operator++()
   {
    iterator += sequence_length;
-   calculate_value();
+   calculate_value<T>();
    return *this;
   }
 
-  bool operator!=(const utf8_iterator& other) const
+  bool operator!=(const utf_iterator<T>& other) const
   {
    return iterator != other.iterator;
   }
@@ -84,21 +182,23 @@ namespace {
    return value;
   }
 
-  std::u8string::const_iterator iterator;
-  std::u8string::const_iterator end_iterator;
+  typename string_type::const_iterator iterator;
+  typename string_type::const_iterator end_iterator;
 
   value_type value{};
   size_t sequence_length{};
  };
 
- struct utf16_back_insert_iterator
+ template<typename T>
+ struct utf_back_insert_iterator
  {
-  typedef utf16_back_insert_iterator& reference;
+  typedef std::basic_string<T> string_type;
+  typedef utf_back_insert_iterator& reference;
 
-  utf16_back_insert_iterator(std::u16string& s): s(s) {}
+  utf_back_insert_iterator(string_type& s): s(s) {}
 
   // no-op
-  utf16_back_insert_iterator& operator++()
+  utf_back_insert_iterator& operator++()
   {
    return *this;
   }
@@ -109,10 +209,71 @@ namespace {
    return *this;
   }
 
-  // append utf-16 word sequence
+  // default: utf-32 code unit for UTF-32
+  // specializations for UTF-8 and UTF-16 below
+  template<typename T1=T>
   reference operator=(const char32_t& value)
   {
-   if (value <= 0xFFFF) { // expect value to be already valid Unicode values, TODO: validate char32_t!
+   // expect value to be already valid Unicode values
+   s.push_back(value);
+   return *this;
+  }
+
+  // n is number of UTF-8 bytes in sequence
+  template<size_t n>
+  inline static T byte0_of(char32_t value)
+  {
+   return (value >> 6 * (n - 1)) | (0xFF << (8 - n));
+  }
+
+  // n is index of 6-bit groups, counting from bit 0
+  template<size_t n>
+  inline static T trailing_byte(char32_t value)
+  {
+   return ((value >> n * 6) & 0b111111) | 0b10000000;
+  }
+
+  // calculate UTF-8 sequence byte for m >= 2 bytes sequences (i.e. non-ASCII)
+  // assume value to be valid Unicode value for given byte position
+  template<size_t n, size_t m>
+  inline static T byte_n_of_m(char32_t value)
+  {
+   if constexpr (n == 0)
+    return byte0_of<m>(value);
+   else
+    return trailing_byte<m - n - 1>(value);
+  }
+
+  // specialization for UTF-8
+  // append utf-8 byte sequence
+  template<>
+  reference operator=<char8_t>(const char32_t& value)
+  {
+   if (value < 0x80) { // 1 byte
+    s.push_back(value);
+   } else if (value < 0x800) { // 2 bytes
+    s.push_back(byte_n_of_m<0,2>(value));
+    s.push_back(byte_n_of_m<1,2>(value));
+   } else if (value < 0x10000) { // 3 bytes
+    s.push_back(byte_n_of_m<0,3>(value));
+    s.push_back(byte_n_of_m<1,3>(value));
+    s.push_back(byte_n_of_m<2,3>(value));
+   } else if (value < 0x110000) { // 4 bytes
+    s.push_back(byte_n_of_m<0,4>(value));
+    s.push_back(byte_n_of_m<1,4>(value));
+    s.push_back(byte_n_of_m<2,4>(value));
+    s.push_back(byte_n_of_m<3,4>(value));
+   } else
+    throw std::runtime_error("Invalid internal Unicode value: "s + std::to_string(static_cast<uint32_t>(value)));
+   return *this;
+  }
+
+  // specialization for UTF-16
+  // append utf-16 word sequence
+  template<>
+  reference operator=<char16_t>(const char32_t& value)
+  {
+   if (value <= 0xFFFF) { // expect value to be already valid Unicode values
     s.push_back(value);
    } else {
     s.push_back((value >> 10) + 0xD800);
@@ -121,33 +282,37 @@ namespace {
    return *this;
   }
 
-  std::u16string& s;
+  typename utf_back_insert_iterator::string_type& s;
  };
 
- utf16_back_insert_iterator utf16_back_inserter(std::u16string& s)
+ template<typename T>
+ utf_back_insert_iterator<T> utf_back_inserter(std::basic_string<T>& s)
  {
-  return utf16_back_insert_iterator(s);
+  return utf_back_insert_iterator<T>(s);
  }
 
- utf8_iterator utf8_begin(const std::u8string& s)
+ template<typename T>
+ utf_iterator<T> utf_begin(const std::basic_string<T>& s)
  {
-  return utf8_iterator{s.cbegin(), s.cend()};
+  return utf_iterator<T>{s.cbegin(), s.cend()};
  }
 
- utf8_iterator utf8_end(const std::u8string& s)
+ template<typename T>
+ utf_iterator<T> utf_end(const std::basic_string<T>& s)
  {
-  return utf8_iterator{s.cend(), s.cend()};
+  return utf_iterator<T>{s.cend(), s.cend()};
  }
 
 } // namespace
 
 namespace unicode {
 
-std::u16string utf8_to_utf16(const std::u8string& s)
+template<typename From, typename To>
+std::basic_string<To> utf_to_utf(const std::basic_string<From>& s)
 {
- std::u16string result;
+ std::basic_string<To> result;
 
- std::copy(utf8_begin(s), utf8_end(s), utf16_back_inserter(result));
+ std::copy(utf_begin<From>(s), utf_end<From>(s), utf_back_inserter<To>(result));
 
  return result;
 }
diff --git a/src/test-unicode.cpp b/src/test-unicode.cpp
index 41fcd20..0560c1b 100644
--- a/src/test-unicode.cpp
+++ b/src/test-unicode.cpp
@@ -3,19 +3,60 @@
 #include <boost/test/included/unit_test.hpp>
 
 #include <string>
+#include <tuple>
+#include <type_traits>
 
 #include <unicode.h>
 
+std::tuple<std::basic_string<char8_t>, std::basic_string<char16_t>, std::basic_string<char32_t>> t {
+ u8"Täst", u"Täst", U"Täst"
+};
+
+template<size_t i = 0, size_t j = 0, typename... Ts>
+void test_utf_to_utf(std::tuple<Ts...>& t)
+{
+ // From: string type at tuple index i (input), To: string type at tuple index j (expected output)
+ using From = std::tuple_element_t<i, std::tuple<Ts...>>;
+ using To = std::tuple_element_t<j, std::tuple<Ts...>>;
+ constexpr size_t count{sizeof...(Ts)};
+
+ // convert element i into the code unit type of element j and compare with element j
+ const To converted{unicode::utf_to_utf<typename From::value_type, typename To::value_type>(std::get<i>(t))};
+ BOOST_CHECK(converted == std::get<j>(t));
+
+ // visit all (i, j) pairs: advance i first, then restart i at 0 with the next j
+ if constexpr (i + 1 < count)
+  test_utf_to_utf<i + 1, j>(t);
+ else if constexpr (j + 1 < count)
+  test_utf_to_utf<0, j + 1>(t);
+ // recursion stops after the last pair (count - 1, count - 1)
+}
+
+BOOST_AUTO_TEST_CASE(utf_to_utf)
+{
+ test_utf_to_utf(t);
+}
+
 BOOST_AUTO_TEST_CASE(utf8_to_utf16)
 {
  std::u8string u8{u8"ascii string1"};
  
- std::u16string u16{unicode::utf8_to_utf16(u8)};
+ std::u16string u16{unicode::utf_to_utf<char8_t, char16_t>(u8)};
 
  BOOST_CHECK(u16 == u"ascii string1");
 }
 
+BOOST_AUTO_TEST_CASE(utf16_to_utf8)
+{
+ std::u16string u16{u"ascii string1"};
+ 
+ std::u8string u8{unicode::utf_to_utf<char16_t, char8_t>(u16)};
+
+ BOOST_CHECK(u8 == u8"ascii string1");
+}
+
 // TODO:
+// UTF-8
 //  invalid bytes
 //  an unexpected continuation byte
 //  a non-continuation byte before the end of the character
@@ -24,3 +65,7 @@ BOOST_AUTO_TEST_CASE(utf8_to_utf16)
 //  a sequence that decodes to an invalid code point
 //
 //  high and low surrogate halves used by UTF-16 (U+D800 through U+DFFF) and code points not encodable by UTF-16 (those after U+10FFFF)
+//
+// char8_t, char16_t, char32_t, char, wchar_t (UTF-16 on Windows, UTF-32 on Linux)
+// string, vector?
+// uint8_t, uint16_t, uint32_t?
-- 
cgit v1.2.3