Optimize UTF-8 validation

author: Roland Reichwein <mail@reichwein.it> 2021-12-27 21:51:05 +0100
committer: Roland Reichwein <mail@reichwein.it> 2021-12-27 21:51:05 +0100
commit: 970ba4111160fbf78351b21a024c46c0978e0440 (patch)
tree: 300e5a70adde02999845aa05e1727b3510fd62aa
parent: d992304dc12f078f12eb971c6283e0b54054e6b1 (diff)
1 files changed, 132 insertions, 78 deletions
diff --git a/include/unicode.h b/include/unicode.h
index 395f172..4064233 100644
--- a/include/unicode.h
+++ b/include/unicode.h
@@ -47,6 +47,80 @@ namespace unicode::detail {
 
  using namespace std::string_literals;
 
+ // Tells whether b is a UTF-8 follow-up (continuation) byte, i.e. whether its
+ // two most significant bits are 1 and 0.
+ template<typename value_type>
+ inline bool is_utf8_followup_byte(value_type b) noexcept
+ {
+  constexpr int top_two_bits{0b11000000};
+  constexpr int followup_tag{0b10000000};
+
+  return (b & top_two_bits) == followup_tag;
+ }
+
+ // Tells whether byte opens a UTF-8 sequence of exactly sequence_length code
+ // units (1 to 4).
+ //
+ // A 1-unit sequence has the top bit clear. For longer sequences the top
+ // sequence_length bits are set and the bit below them is clear, so the top
+ // (sequence_length + 1) bits are compared against that pattern.
+ template<size_t sequence_length, typename value_type>
+ inline bool is_utf8_leading_byte(value_type byte) noexcept
+ {
+  // Reject length 0 at compile time: without the lower bound it would compile
+  // and silently behave like the 1-unit (ASCII) test.
+  static_assert(sequence_length >= 1 && sequence_length <= 4);
+
+  if constexpr(sequence_length == 1) {
+   return !(byte & 0x80);
+  } else {
+   // Both constants are narrowed to value_type so that they are promoted the
+   // same way as byte, which keeps the comparison valid for signed char types.
+   return (byte & static_cast<value_type>(0xFF << (7 - sequence_length))) == static_cast<value_type>(0xFF << (8 - sequence_length));
+  }
+ }
+
+ // Tells whether the given code units have the shape of one UTF-8 sequence:
+ // byte0 must be the leading byte for a sequence of exactly this many units
+ // and every further argument must be a follow-up byte. At most 4 units are
+ // accepted (checked at compile time).
+ template<typename value_type, typename... Tbytes>
+ inline bool is_utf8_sequence(value_type byte0, Tbytes... bytes) noexcept
+ {
+  constexpr auto unit_count{1 + sizeof...(Tbytes)};
+  static_assert(unit_count <= 4);
+
+  if (!is_utf8_leading_byte<unit_count>(byte0))
+   return false;
+
+  // An empty pack (single-unit sequence) folds to true.
+  return (... && is_utf8_followup_byte(bytes));
+ }
+
+ // Checks whether s consists only of structurally valid UTF-8 sequences.
+ //
+ // T must be a 1-byte code unit type. Returns true for the empty string and
+ // false as soon as a code unit cannot start or complete a sequence of 1 to 4
+ // units, a 3-unit sequence encodes a UTF-16 surrogate (0xD800..0xDFFF), or a
+ // 4-unit sequence encodes a value above 0x10FFFF.
+ //
+ // NOTE(review): overlong encodings (e.g. 0xC0 0x80) are not rejected here -
+ // confirm whether that is intended.
+ template<typename T>
+ inline bool validate_utf8(const std::basic_string<T>& s)
+ {
+  static_assert(sizeof(T) == 1);
+
+  const auto size{s.size()};
+  size_t i{};
+  while (i < size) {
+   // Number of code units left, always >= 1 here. Comparing against this
+   // instead of "size - k" avoids unsigned wrap-around, which previously let
+   // a truncated trailing sequence read beyond the end of the string.
+   const auto remaining{size - i};
+
+   if (is_utf8_sequence(s[i])) {
+    i++;
+   } else if ((remaining >= 2) && is_utf8_sequence(s[i], s[i + 1])) {
+    i += 2;
+   } else if ((remaining >= 3) && is_utf8_sequence(s[i], s[i + 1], s[i + 2])) {
+    // Leading byte 0xED followed by 0xA0..0xBF
+    if (((s[i] & 0xF) == 0xD) && ((s[i + 1] & 0x20) == 0x20))
+     return false; // Reserved for UTF-16 surrogates: 0xD800..0xDFFF
+    i += 3;
+   } else if ((remaining >= 4) && is_utf8_sequence(s[i], s[i + 1], s[i + 2], s[i + 3])) {
+    // Bits 16..20 of the code point: 3 from byte 0, 2 from byte 1
+    if ((((s[i] & 7) << 2) | ((s[i + 1] >> 4) & 3)) >= 0x11)
+     return false; // Unicode too big above 0x10FFFF
+    i += 4;
+   } else {
+    return false;
+   }
+  }
+  return true;
+ }
+
+ // Extracts the 6 payload bits (the low 6 bits) of a single follow-up byte.
+ template<typename value_type>
+ inline char32_t continuation_value(value_type b) noexcept
+ {
+  constexpr int payload_bits{0b00111111};
+
+  return static_cast<char32_t>(b & payload_bits);
+ }
+
+ // Combines the payload bits of several follow-up bytes into one value.
+ // b contributes the most significant 6 bits: it is shifted left by 6 bits
+ // for every byte that follows it, and the remaining bytes are combined the
+ // same way.
+ template<typename value_type, typename... Targs>
+ inline char32_t continuation_value(value_type b, Targs... Fargs) noexcept
+ {
+  const auto head = continuation_value(b) << (6 * sizeof...(Targs));
+  const auto tail = continuation_value(Fargs...);
+
+  return head | tail;
+ }
+
+ // Extracts the payload bits of the leading byte b of an n-unit sequence and
+ // moves them to their position in the code point: the low (7 - n) bits of b
+ // are kept and shifted left by 6 bits for each of the (n - 1) follow-up bytes.
+ template<size_t n, typename value_type>
+ inline char32_t value_byte0_of(value_type b) noexcept
+ {
+  constexpr auto payload_mask{0b1111111 >> n};
+  constexpr auto shift{(n - 1) * 6};
+
+  return static_cast<char32_t>(b & payload_mask) << shift;
+ }
+
  template<typename T, typename Container=std::basic_string<T>>
  struct utf_iterator
  {
@@ -86,61 +160,28 @@ namespace unicode::detail {
    }
   }
 
-  inline static bool is_continuation_byte(value_type b) noexcept
-  {
-   return (b & 0b11000000) == 0b10000000;
-  }
-
-  template<typename... Targs>
-  inline static bool is_continuation_byte(value_type b, Targs... Fargs) noexcept
-  {
-   return is_continuation_byte(b) && is_continuation_byte(Fargs...);
-  }
-
-  template<size_t n>
-  inline static bool is_byte0_of(value_type b) noexcept
-  {
-   static_assert(n >= 2 && n <= 4);
-
-   return (b & static_cast<value_type>(0xFF << (7 - n))) == static_cast<value_type>(0xFF << (8 - n));
-  }
-
-  inline static internal_type continuation_value(value_type b) noexcept
-  {
-   return static_cast<internal_type>(b & 0b00111111);
-  }
-
-  template<typename... Targs>
-  inline static internal_type continuation_value(value_type b, Targs... Fargs) noexcept
-  {
-   return continuation_value(b) << (6 * sizeof...(Targs)) | continuation_value(Fargs...);
-  }
-
-  template<size_t n>
-  inline static internal_type value_byte0_of(value_type b) noexcept
-  {
-   return static_cast<internal_type>(b & (0b1111111 >> n)) << ((n - 1) * 6);
-  }
-
   template<class X = value_type, typename std::enable_if<(sizeof(X) == 1), bool>::type = true>
   inline internal_type calculate_value()
   {
    utf8_t byte0 {static_cast<utf8_t>(get_code_unit<0>())};
-   if (byte0 & 0x80) { // 2-4 bytes
+   if (is_utf8_sequence(byte0)) { // 1 byte: 7 bit ASCII
+    std::advance(iterator, 1);
+    return byte0;
+   } else {
     internal_type value{};
     if (size_t remaining{remaining_code_units()}; remaining >= 2) {
      utf8_t byte1 {static_cast<utf8_t>(get_code_unit<1>())};
-     if (is_byte0_of<2>(byte0) && is_continuation_byte(byte1)) { // 2 bytes
+     if (is_utf8_sequence(byte0, byte1)) { // 2 bytes
       value = value_byte0_of<2>(byte0) | continuation_value(byte1);
       std::advance(iterator, 2);
      } else if (remaining >= 3) {
       utf8_t byte2 {static_cast<utf8_t>(get_code_unit<2>())};
-      if (is_byte0_of<3>(byte0) && is_continuation_byte(byte1, byte2)) { // 3 bytes
+      if (is_utf8_sequence(byte0, byte1, byte2)) { // 3 bytes
        value = value_byte0_of<3>(byte0) | continuation_value(byte1, byte2);
        std::advance(iterator, 3);
       } else if (remaining >= 4) {
        utf8_t byte3 {static_cast<utf8_t>(get_code_unit<3>())};
-       if (is_byte0_of<4>(byte0) && is_continuation_byte(byte1, byte2, byte3)) { // 4 bytes
+       if (is_utf8_sequence(byte0, byte1, byte2, byte3)) { // 4 bytes
         value = value_byte0_of<4>(byte0) | continuation_value(byte1, byte2, byte3);
         std::advance(iterator, 4);
        } else
@@ -157,9 +198,6 @@ namespace unicode::detail {
      throw std::invalid_argument("Invalid Unicode character: "s + std::to_string(static_cast<uint32_t>(value)));
 
     return value;
-   } else { // 1 byte: 7 bit ASCII
-    std::advance(iterator, 1);
-    return byte0;
    }
   }
 
@@ -655,56 +693,72 @@ namespace unicode {
 
  // From and To are facets
  template<typename From, typename To, std::enable_if_t<std::is_empty<From>::value, bool> = true>
- typename To::string_type convert(const typename From::string_type& s)
+ typename To::string_type convert_optimized(const typename From::string_type& s)
  {
   typename To::string_type result;
-
-  if constexpr(accu_size == 4 || accu_size == 8) {
-   typedef ConvertInputOptimizer<sizeof(typename From::value_type)> input_optimizer;
-   typedef ArchitectureOptimizer<accu_size, input_optimizer> arch_optimizer;
-
-   auto begin{From::begin(s)};
-   auto end{From::end(s)};
-   auto back_inserter{To::back_inserter(result)};
-   auto addr{reinterpret_cast<const typename arch_optimizer::accu_type*>(&s.data()[s.size() - input_distance(begin, end)])};
-   while (input_distance_bytes(begin, end) >= accu_size) {
-    if (((uintptr_t)(void*)addr & arch_optimizer::addr_mask) == 0) {
-     while (input_distance_bytes(begin, end) >= accu_size) {
-      typename arch_optimizer::accu_type data{*addr};
-      if ((data & arch_optimizer::ascii_mask) == arch_optimizer::ascii_value) {
-       arch_optimizer::template append(reinterpret_cast<const typename From::value_type*>(addr), result);
-       begin += accu_size / sizeof(typename From::value_type);
-       ++addr;
-      } else {
-       // just advance one code unit for now and break to trigger unoptimized
-       // version until next accu boundary
-       back_inserter = *begin;
-       ++begin;
-       break;
-      }
+  typedef ConvertInputOptimizer<sizeof(typename From::value_type)> input_optimizer;
+  typedef ArchitectureOptimizer<accu_size, input_optimizer> arch_optimizer;
+
+  auto begin{From::begin(s)};
+  auto end{From::end(s)};
+  auto back_inserter{To::back_inserter(result)};
+  auto addr{reinterpret_cast<const typename arch_optimizer::accu_type*>(&s.data()[s.size() - input_distance(begin, end)])};
+  while (input_distance_bytes(begin, end) >= accu_size) {
+   if (((uintptr_t)(void*)addr & arch_optimizer::addr_mask) == 0) {
+    while (input_distance_bytes(begin, end) >= accu_size) {
+     typename arch_optimizer::accu_type data{*addr};
+     if ((data & arch_optimizer::ascii_mask) == arch_optimizer::ascii_value) {
+      arch_optimizer::template append(reinterpret_cast<const typename From::value_type*>(addr), result);
+      begin += accu_size / sizeof(typename From::value_type);
+      ++addr;
+     } else {
+      // just advance one code unit for now and break to trigger unoptimized
+      // version until next accu boundary
+      back_inserter = *begin;
+      ++begin;
+      break;
      }
     }
-
-    // keep up after unaligned Non-ASCII code points
-    while (begin != end && (uintptr_t)(void*)(addr = reinterpret_cast<const typename arch_optimizer::accu_type*>(&s.data()[s.size() - input_distance(begin, end)])) & arch_optimizer::addr_mask) {
-     back_inserter = *begin;
-     ++begin;
-    }
    }
 
-   // remainder < 8 bytes   
-   while (begin != end) {
+   // keep up after unaligned Non-ASCII code points
+   while (begin != end && (uintptr_t)(void*)(addr = reinterpret_cast<const typename arch_optimizer::accu_type*>(&s.data()[s.size() - input_distance(begin, end)])) & arch_optimizer::addr_mask) {
     back_inserter = *begin;
     ++begin;
    }
+  }
 
-  } else {
-   std::copy(From::begin(s), From::end(s), To::back_inserter(result));
+  // remainder < 8 bytes   
+  while (begin != end) {
+   back_inserter = *begin;
+   ++begin;
   }
 
   return result;
  }
 
+ // Converts s from encoding From to encoding To; From and To are facets.
+ //
+ // UTF-8 to UTF-8 is special-cased: the input is only validated and then
+ // returned as a copy. Throws std::invalid_argument("Invalid UTF-8") if that
+ // validation fails. Every other combination goes through convert_optimized()
+ // when accu_size is 4 or 8, and through a plain iterator copy otherwise.
+ template<typename From, typename To, std::enable_if_t<std::is_empty<From>::value, bool> = true>
+ typename To::string_type convert(const typename From::string_type& s)
+ {
+  if constexpr(sizeof(typename From::value_type) == 1 && sizeof(typename To::value_type) == 1 && std::is_same_v<From, UTF_8> && std::is_same_v<To, UTF_8>) {
+   if (!validate_utf8<typename From::value_type>(s))
+    throw std::invalid_argument("Invalid UTF-8");
+
+   if constexpr (std::is_same_v<typename From::value_type, typename To::value_type>)
+    return s;
+   else
+    return typename To::string_type{s.begin(), s.end()};
+  } else if constexpr(accu_size == 4 || accu_size == 8) {
+   // The "else" keeps this chain a discarded statement in the UTF-8 to UTF-8
+   // case; without it the generic path is instantiated there as dead code.
+   return convert_optimized<From, To>(s);
+  } else {
+   typename To::string_type result;
+   std::copy(From::begin(s), From::end(s), To::back_inserter(result));
+   return result;
+  }
+ }
+
  // Helper to get correct Facet from char type, e.g. Encoding<typename decltype(s)::value_type>::Facet
  template<typename T>
  struct Encoding
author	Roland Reichwein <mail@reichwein.it>	2021-12-27 21:51:05 +0100
committer	Roland Reichwein <mail@reichwein.it>	2021-12-27 21:51:05 +0100
commit	970ba4111160fbf78351b21a024c46c0978e0440 (patch)
tree	300e5a70adde02999845aa05e1727b3510fd62aa
parent	d992304dc12f078f12eb971c6283e0b54054e6b1 (diff)