From 2b27deb54fec75ed529776f30be8eeb4ea239257 Mon Sep 17 00:00:00 2001
From: Roland Reichwein <mail@reichwein.it>
Date: Tue, 28 Dec 2021 16:10:33 +0100
Subject: Refactoring UTF-8 decoding, bugfixing

---
 include/unicode.h | 51 ++++++++++++++++++++++++++++++++-------------------
 1 file changed, 32 insertions(+), 19 deletions(-)
diff --git a/include/unicode.h b/include/unicode.h
index 4e4c7eb..2d7bf71 100644
--- a/include/unicode.h
+++ b/include/unicode.h
@@ -1,6 +1,6 @@
 // libunicode
 //
-// Author: Roland Reichwein
+// Author: Roland Reichwein <mail@reichwein.it>
 //
 // Available under the conditions of CC0 1.0 Universal
 // https://creativecommons.org/publicdomain/zero/1.0/
@@ -104,11 +104,11 @@ namespace unicode::detail {
  template<typename value_type, typename... Twords>
  inline bool is_utf16_sequence(value_type word0, Twords... words) noexcept
  {
-  constexpr auto n{sizeof...(Twords) + 1};
+  constexpr auto sequence_length{sizeof...(Twords) + 1};
 
-  static_assert(n <= 2, "UTF-16 sequences of only 1 or 2 code units are supported");
+  static_assert(sequence_length <= 2, "UTF-16 sequences of only 1 or 2 code units are supported");
 
-  if constexpr(n == 1) {
+  if constexpr(sequence_length == 1) {
    return is_valid_unicode(word0);
   } else {
    char16_t unit0 {static_cast<char16_t>(word0)};
@@ -143,22 +143,35 @@ namespace unicode::detail {
   return true;
  }
 
+ template<size_t sequence_length, typename value_type> // sequence_length: number of code units in the sequence that b starts
+ inline char32_t decode_utf8_leading_byte(value_type b) noexcept // returns the payload bits of leading byte b, shifted to their position in the code point
+ {
+  return static_cast<char32_t>(b & (0b1111111 >> sequence_length)) << ((sequence_length - 1) * 6); // mask keeps the low (7 - sequence_length) bits; shift leaves 6 bits of room per following byte
+ }
+
  template<typename value_type>
- inline char32_t continuation_value(value_type b) noexcept
+ inline char32_t decode_utf8_followup_byte(value_type b) noexcept // returns the low 6 bits of b as char32_t
  {
   return static_cast<char32_t>(b & 0b00111111);
  }
 
  template<typename value_type, typename... Targs>
- inline char32_t continuation_value(value_type b, Targs... Fargs) noexcept
+ inline char32_t decode_utf8_followup_byte(value_type b, Targs... bytes) noexcept // combines 6 bits from b and from each of bytes..., with b most significant
  {
-  return continuation_value(b) << (6 * sizeof...(Targs)) | continuation_value(Fargs...);
+  return decode_utf8_followup_byte(b) << (6 * sizeof...(Targs)) | decode_utf8_followup_byte(bytes...); // shift b's 6 bits above the sizeof...(Targs) bytes that follow, then recurse on the rest
  }
 
- template<size_t n, typename value_type>
- inline char32_t value_byte0_of(value_type b) noexcept
+ template<typename value_type, typename... Targs>
+ inline char32_t decode_utf8_sequence(value_type b, Targs... bytes) noexcept // decodes 1 to 4 code units (b first) into one code point; performs no validation itself
  {
-  return static_cast<char32_t>(b & (0b1111111 >> n)) << ((n - 1) * 6);
+  size_t constexpr sequence_length{sizeof...(Targs) + 1};
+
+  static_assert(sequence_length <= 4, "UTF-8 sequences of only 1 to 4 code units are supported");
+
+  if constexpr (sequence_length == 1)
+   return static_cast<char32_t>(b & 0b01111111); // explicit 7 bit mask: no sign extension if value_type is a signed char type
+  else
+   return decode_utf8_leading_byte<sequence_length>(b) | decode_utf8_followup_byte(bytes...);
  }
 
  template<typename T, typename Container=std::basic_string<T>>
@@ -206,32 +219,32 @@ namespace unicode::detail {
    utf8_t byte0 {static_cast<utf8_t>(get_code_unit<0>())};
    if (is_utf8_sequence(byte0)) { // 1 byte: 7 bit ASCII
     std::advance(iterator, 1);
-    return byte0;
+    return decode_utf8_sequence(byte0);
    } else {
     internal_type value{};
     if (size_t remaining{remaining_code_units()}; remaining >= 2) {
      utf8_t byte1 {static_cast<utf8_t>(get_code_unit<1>())};
      if (is_utf8_sequence(byte0, byte1)) { // 2 bytes
-      value = value_byte0_of<2>(byte0) | continuation_value(byte1);
+      value = decode_utf8_sequence(byte0, byte1);
       std::advance(iterator, 2);
      } else if (remaining >= 3) {
       utf8_t byte2 {static_cast<utf8_t>(get_code_unit<2>())};
       if (is_utf8_sequence(byte0, byte1, byte2)) { // 3 bytes
-       value = value_byte0_of<3>(byte0) | continuation_value(byte1, byte2);
+       value = decode_utf8_sequence(byte0, byte1, byte2);
        std::advance(iterator, 3);
       } else if (remaining >= 4) {
        utf8_t byte3 {static_cast<utf8_t>(get_code_unit<3>())};
        if (is_utf8_sequence(byte0, byte1, byte2, byte3)) { // 4 bytes
-        value = value_byte0_of<4>(byte0) | continuation_value(byte1, byte2, byte3);
+        value = decode_utf8_sequence(byte0, byte1, byte2, byte3);
         std::advance(iterator, 4);
        } else
         throw std::invalid_argument("Bad input: Invalid 4 byte sequence");
       } else
-       throw std::invalid_argument("Bad input: Invalid 3 byte sequence");
+       throw std::invalid_argument("Bad UTF-8 input: 4th byte expected, none found (end of sequence)");
      } else
-      throw std::invalid_argument("Bad input: Invalid 2 byte sequence");
+      throw std::invalid_argument("Bad UTF-8 input: 3rd byte expected, none found (end of sequence)");
     } else
-     throw std::invalid_argument("Bad input: 2nd byte expected, none found");
+     throw std::invalid_argument("Bad UTF-8 input: 2nd byte expected, none found (end of sequence)");
   
     // check only for sequences >= 2 bytes (ASCII is always compliant)
     if (!unicode::is_valid_unicode(value))
@@ -782,13 +795,13 @@ namespace unicode {
  typename To::string_type convert(const typename From::string_type& s)
  {
   // if input type == output type, only validate and return input, is appropriate
-  if constexpr(sizeof(typename From::value_type) == sizeof(typename To::value_type) == 1 &&
+  if constexpr(sizeof(typename From::value_type) == sizeof(typename To::value_type) &&
                std::is_same_v<From, UTF<utf_iterator<typename From::value_type>, utf_back_insert_iterator<typename From::value_type>>> &&
                std::is_same_v<To, UTF<utf_iterator<typename To::value_type>, utf_back_insert_iterator<typename To::value_type>>>) {
    if (validate_utf<typename From::value_type>(s)) {
     return s;
    } else {
-    throw std::invalid_argument("Invalid UTF-8");
+    throw std::invalid_argument("Invalid UTF input");
    }
   } if constexpr(accu_size == 4 || accu_size == 8) {
    return convert_optimized<From, To>(s);
-- 
cgit v1.2.3