Separated out header files; optimizations; type traits; better naming

author: Roland Reichwein <mail@reichwein.it> 2022-01-01 20:25:34 +0100
committer: Roland Reichwein <mail@reichwein.it> 2022-01-01 20:25:34 +0100
commit: 52d4375b10d920a59f1309c272a2e525feb1c25d (patch)
tree: 9d5417a9d214f4b0ba68b75e8908e28da46dd5c8
parent: ae7b430afd1239947b8f8b2d9dc0ca72dbce91ac (diff)
8 files changed, 997 insertions, 648 deletions
diff --git a/Makefile b/Makefile
index 37c769a..36c503d 100644
--- a/Makefile
+++ b/Makefile
@@ -96,8 +96,8 @@ src/recode: src/recode.o src/file.o dep
 src/validate: src/validate.o src/file.o dep
 	$(CXX) $(LDFLAGS) src/validate.o src/file.o $(LDLIBS) $(LIBS) -o $@
 
-src/test-unicode: src/test-unicode.o src/file.o dep
-	$(CXX) $(LDFLAGS) $< $(LDLIBS) $(LIBS) -o $@
+src/test-unicode: src/test-unicode.o dep
+	$(CXX) $(LDFLAGS) src/test-unicode.o $(LDLIBS) $(LIBS) -o $@
 
 dep: $(SRC:.cpp=.d)
 
@@ -139,6 +139,7 @@ DISTFILES= \
 	   src/file.h \
 	   Makefile \
 	   include/unicode.h \
+	   include/unicode/type_traits.h \
            debian/control \
            debian/compat \
            debian/copyright \
diff --git a/include/unicode.h b/include/unicode.h
index 1190292..a50f525 100644
--- a/include/unicode.h
+++ b/include/unicode.h
@@ -7,625 +7,25 @@
 
 #pragma once
 
+#include "unicode/endian.h"
+#include "unicode/iso.h"
+#include "unicode/predicate.h"
+#include "unicode/types.h"
+#include "unicode/type_traits.h"
+#include "unicode/utf.h"
+
 #include <algorithm>
+#include <array>
 #include <cstdint>
 #include <iterator>
-#include <list>
 #include <memory>
 #include <stdexcept>
 #include <string>
 #include <type_traits>
-#include <unordered_map>
-
-#ifdef __cpp_char8_t
-// char8_t available
- typedef char8_t utf8_t;
-#else
- typedef char utf8_t;
-#endif
-typedef char iso_t;
+#include <utility>
 
 namespace unicode {
 
- // bits_to_compare: limit bits to consider even further than defined by T
- // T: usually, char32_t, uint32_t etc.
- template<size_t bits_to_compare = 32, typename T>
- static inline bool is_valid_unicode(const T& value) noexcept
- {
-  if constexpr(sizeof(T) == 1 || bits_to_compare <= 15)
-   return true;
-  else if constexpr(sizeof(T) == 2 || bits_to_compare <= 20)
-   //return value <= 0xD7FF || value >= 0xE000;
-   return (value & 0xF800) != 0xD800;
-  else
-   //return (value & 0xFFFFF800) != 0x0000D800 && (value >> 16) <= 0x10;
-   return value <= 0xD7FF || (value >= 0xE000 && value <= 0x10FFFF);
- }
-
-}
-
-namespace unicode::detail {
-
- using namespace std::string_literals;
-
- template<size_t sequence_length, typename value_type>
- inline bool is_utf8_leading_byte(value_type byte) noexcept
- {
-  static_assert(sequence_length <= 4);
-
-  if constexpr(sequence_length == 1) {
-   return !(byte & 0x80);
-  } else {
-   return (byte & static_cast<value_type>(0xFF << (7 - sequence_length))) == static_cast<value_type>(0xFF << (8 - sequence_length));
-  }
- }
-
- template<typename value_type>
- inline bool is_utf8_followup_byte(value_type b) noexcept
- {
-  return (b & 0b11000000) == 0b10000000;
- }
-
- template<typename value_type, typename... Tbytes>
- inline bool is_utf8_sequence(value_type byte0, Tbytes... bytes) noexcept
- {
-  constexpr auto sequence_length{sizeof...(Tbytes) + 1};
-
-  static_assert(sequence_length <= 4, "UTF-8 sequences of 1 through 4 code units are supported");
-
-  return is_utf8_leading_byte<sequence_length>(byte0) &&
-         (... && is_utf8_followup_byte(bytes)); // left fold for linear evaluation from left to right
- }
-
- template<typename T, typename std::enable_if_t<(sizeof(T) == 1), bool> = true>
- inline bool validate_utf(const std::basic_string<T>& s)
- {
-  int i{};
-  auto size{s.size()};
-  while (i < size) {
-   if (is_utf8_sequence(s[i])) {
-    i++;
-   } else if ((i < size - 1) && is_utf8_sequence(s[i], s[i + 1])) {
-    i += 2;
-   } else if ((i < size - 2) && is_utf8_sequence(s[i], s[i + 1], s[i + 2])) {
-    if (((s[i] & 0xF) == 0xD) && ((s[i + 1] & 0x20) == 0x20))
-     return false; // Reserved for UTF-16 surrogates: 0xD800..0xDFFF
-    i += 3;
-   } else if ((i < size - 3) && is_utf8_sequence(s[i], s[i + 1], s[i + 2], s[i + 3])) {
-    if ((((s[i] & 7) << 2) | ((s[i + 1] >> 4) & 3)) >= 0x11)
-     return false; // Unicode too big above 0x10FFFF
-    i += 4;
-   } else {
-    return false;
-   }
-  }
-  return true;
- }
-
- template<typename value_type, typename... Twords>
- inline bool is_utf16_sequence(value_type word0, Twords... words) noexcept
- {
-  constexpr auto sequence_length{sizeof...(Twords) + 1};
-
-  static_assert(sequence_length <= 2, "UTF-16 sequences of only 1 or 2 code units are supported");
-
-  if constexpr(sequence_length == 1) {
-   return is_valid_unicode(word0);
-  } else {
-   char16_t unit0 {static_cast<char16_t>(word0)};
-   char16_t unit1 {static_cast<char16_t>((words, ...))};
-   return (unit0 & 0xFC00) == 0xD800 && (unit1 & 0xFC00) == 0xDC00;
-  }
- }
-
- template<typename T, typename std::enable_if_t<(sizeof(T) == 2), bool> = true>
- inline bool validate_utf(const std::basic_string<T>& s)
- {
-  int i{};
-  auto size{s.size()};
-  while (i < size) {
-   if (is_utf16_sequence(s[i])) {
-    i++;
-   } else if ((i < size - 1) && is_utf16_sequence(s[i], s[i + 1])) {
-    i += 2;
-   } else {
-    return false;
-   }
-  }
-  return true;
- }
-
- template<typename T, typename std::enable_if_t<(sizeof(T) == 4), bool> = true>
- inline bool validate_utf(const std::basic_string<T>& s)
- {
-  for (auto i: s)
-   if (!is_valid_unicode(i))
-    return false;
-  return true;
- }
-
- template<size_t sequence_length, typename value_type>
- inline char32_t decode_utf8_leading_byte(value_type b) noexcept
- {
-  return static_cast<char32_t>(b & (0b1111111 >> sequence_length)) << ((sequence_length - 1) * 6);
- }
-
- template<typename value_type>
- inline char32_t decode_utf8_followup_byte(value_type b) noexcept
- {
-  return static_cast<char32_t>(b & 0b00111111);
- }
-
- template<typename value_type, typename... Targs>
- inline char32_t decode_utf8_followup_byte(value_type b, Targs... bytes) noexcept
- {
-  return decode_utf8_followup_byte(b) << (6 * sizeof...(Targs)) | decode_utf8_followup_byte(bytes...);
- }
-
- template<typename value_type, typename... Targs>
- inline char32_t decode_utf8_sequence(value_type b, Targs... bytes) noexcept
- {
-  size_t constexpr sequence_length{sizeof...(Targs) + 1};
-
-  static_assert(sequence_length <= 4);
-
-  if constexpr (sequence_length == 1)
-   return b;
-  else
-   return decode_utf8_leading_byte<sequence_length>(b) | decode_utf8_followup_byte(bytes...);
- }
-
- template<typename T, typename Container=std::basic_string<T>>
- struct utf_iterator
- {
-  static_assert(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4);
-
-  typedef T value_type;
-  typedef char32_t internal_type;
-  typedef char32_t& reference;
-  typedef char32_t* pointer;
-  typedef size_t difference_type;
-  typedef std::input_iterator_tag iterator_category;
-  typedef Container string_type;
-
-  utf_iterator(const typename string_type::const_iterator& cbegin, const typename string_type::const_iterator& cend):
-   iterator(cbegin), end_iterator(cend)
-  {
-  }
-
-  utf_iterator(const utf_iterator& other) = default;
-  utf_iterator& operator=(const utf_iterator& other) = default;
-
-  inline size_t remaining_code_units() const noexcept
-  {
-   return std::distance(iterator, end_iterator);
-  }
-
-  template<size_t index>
-  inline value_type get_code_unit() const noexcept
-  {
-   if constexpr (std::is_same_v<Container, typename std::list<value_type>>) {
-    // std::list doesn't support it + n
-    auto it{iterator};
-    std::advance(it, index);
-    return *it;
-   } else {
-    return *(iterator + index);
-   }
-  }
-
-  template<typename... Tbytes>
-  inline internal_type calculate_utf8_value(Tbytes... bytes)
-  {
-   size_t constexpr sequence_length{sizeof...(Tbytes)};
-   static_assert(sequence_length >= 1 && sequence_length <= 4);
-
-   if constexpr(sequence_length > 1) {
-    if (remaining_code_units() < sequence_length)
-     throw std::invalid_argument("Bad input: Not enough bytes left for decoding UTF-8 sequence");
-   }
-
-   if (is_utf8_sequence(bytes...)) {
-    std::advance(iterator, sequence_length);
-    internal_type result{decode_utf8_sequence(bytes...)};
-    if (!unicode::is_valid_unicode<sequence_length * 6>(result))
-     throw std::invalid_argument("Invalid Unicode character: "s + std::to_string(static_cast<uint32_t>(result)));
-    return result;
-   } else {
-    if constexpr(sequence_length <= 3) // template recursion break condition: UTF-8 has 1..4 code units
-     return calculate_utf8_value(bytes..., static_cast<utf8_t>(get_code_unit<sequence_length>()));
-    else
-     throw std::invalid_argument("Bad UTF-8 input: Invalid 4 byte sequence");
-   }
-  }
-
-  template<class X = value_type, typename std::enable_if_t<(sizeof(X) == 1), bool> = true>
-  inline internal_type calculate_value()
-  {
-   return calculate_utf8_value(static_cast<utf8_t>(get_code_unit<0>()));
-  }
-
-  template<class X = value_type, typename std::enable_if_t<(sizeof(X) == 2), bool> = true>
-  inline internal_type calculate_value()
-  {
-   char16_t unit0 {static_cast<char16_t>(get_code_unit<0>())};
-
-   if (is_valid_unicode(unit0)) { // 1 unit (BMP Basic Multilingual Plane)
-    std::advance(iterator, 1);
-    return unit0;
-   } else {
-    if (remaining_code_units() < 2)
-     throw std::invalid_argument("Bad input: Continuation of first UTF-16 unit missing");
-
-    char16_t unit1 {static_cast<char16_t>(get_code_unit<1>())};
-    if ((unit0 & 0xFC00) != 0xD800 || (unit1 & 0xFC00) != 0xDC00)
-     throw std::invalid_argument("Bad input: 2 malformed UTF-16 surrogates");
-
-    std::advance(iterator, 2);
-    return (static_cast<internal_type>(unit0 & 0x03FF) << 10 | (unit1 & 0x03FF)) + 0x10000;
-   }
-  }
-
-  template<class X = value_type, typename std::enable_if_t<(sizeof(X) == 4), bool> = true>
-  inline internal_type calculate_value()
-  {
-   internal_type result {static_cast<internal_type>(get_code_unit<0>())};
-
-   if (!unicode::is_valid_unicode(result))
-    throw std::invalid_argument("Invalid Unicode character: "s + std::to_string(static_cast<uint32_t>(result)));
-
-   std::advance(iterator, 1);
-
-   return result;
-  }
-
-  // pre-increment
-  utf_iterator& operator++()
-  {
-   return *this;
-  }
-
-  bool operator!=(const utf_iterator& other) const
-  {
-   return std::distance(iterator, end_iterator) != std::distance(other.iterator, other.end_iterator);
-  }
-
-  internal_type operator*()
-  {
-   return calculate_value();
-  }
-
-  utf_iterator& operator+=(size_t distance)
-  {
-   std::advance(iterator, distance);
-   return *this;
-  }
-
-  size_t operator-(const utf_iterator& other) const
-  {
-   return iterator - other.iterator;
-  }
-
- private:
-  typename string_type::const_iterator iterator;
-  typename string_type::const_iterator end_iterator;
- };
-
- template<typename T, typename Container=std::basic_string<T>>
- struct utf_back_insert_iterator
- {
-  static_assert(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4);
-
-  typedef T value_type;
-  typedef char32_t internal_type;
-  typedef Container string_type;
-  typedef utf_back_insert_iterator& reference;
-  typedef utf_back_insert_iterator* pointer;
-  typedef size_t difference_type;
-  typedef std::output_iterator_tag iterator_category;
-
-  utf_back_insert_iterator(string_type& s): s(s) {}
-
-  utf_back_insert_iterator& operator=(const utf_back_insert_iterator& other)
-  {
-   if (std::addressof(other.s) != std::addressof(s))
-    throw std::runtime_error("utf_back_insert_iterator assignment operator actually called! Iterator should not be assigned to.");
-
-   return *this;
-  }
-
-  // no-op
-  reference operator++()
-  {
-   return *this;
-  }
-
-  // support *x = value, together with operator=()
-  reference operator*()
-  {
-   return *this;
-  }
-
-  // n is number of UTF-8 bytes in sequence
-  template<size_t n>
-  inline static value_type byte0_of(internal_type value)
-  {
-   return (value >> 6 * (n - 1)) | (0xFF << (8 - n));
-  }
-
-  // n is index of 6-bit groups, counting from bit 0
-  template<size_t n>
-  inline static value_type trailing_byte(internal_type value)
-  {
-   return ((value >> n * 6) & 0b111111) | 0b10000000;
-  }
-
-  // calculate UTF-8 sequence byte for m >= 2 bytes sequences (i.e. non-ASCII)
-  // assume value to be valid Unicode value for given byte position
-  template<size_t n, size_t m>
-  inline static value_type byte_n_of_m(internal_type value)
-  {
-   if constexpr (n == 0)
-    return byte0_of<m>(value);
-   else
-    return trailing_byte<m - n - 1>(value);
-  }
-
-  template<typename... Args>
-  inline void append(Args&&... args)
-  {
-   if constexpr (std::is_same_v<Container, typename std::basic_string<value_type>>) {
-    s.append({args...});
-   } else {
-    (s.emplace_back(args), ...);
-   }
-  }
-
-  template<class X = value_type, typename std::enable_if_t<(sizeof(X) == 1), bool> = true>
-  inline void append_utf(const internal_type& value)
-  {
-   if (value < 0x80) { // 1 byte
-    append(static_cast<value_type>(value));
-   } else if (value < 0x800) { // 2 bytes
-    append(byte_n_of_m<0,2>(value), byte_n_of_m<1,2>(value));
-   } else if (value < 0x10000) { // 3 bytes
-    append(byte_n_of_m<0,3>(value), byte_n_of_m<1,3>(value), byte_n_of_m<2,3>(value));
-   } else { // 4 bytes
-    // expect value to be already valid Unicode values (checked in input iterator)
-    append(byte_n_of_m<0,4>(value), byte_n_of_m<1,4>(value), byte_n_of_m<2,4>(value), byte_n_of_m<3,4>(value));
-   }
-  }
-
-  template<class X = value_type, typename std::enable_if_t<(sizeof(X) == 2), bool> = true>
-  inline void append_utf(const internal_type& value)
-  {
-   if (value <= 0xFFFF) { // expect value to be already valid Unicode values (checked in input iterator)
-    append(static_cast<value_type>(value));
-   } else {
-    internal_type value_reduced{value - 0x10000};
-    append(static_cast<value_type>((value_reduced >> 10) + 0xD800), static_cast<value_type>((value_reduced & 0x3FF) + 0xDC00));
-   }
-  }
-
-  template<class X = value_type, typename std::enable_if_t<(sizeof(X) == 4), bool> = true>
-  inline void append_utf(const internal_type& value)
-  {
-   // expect value to be already valid Unicode values (checked in input iterator)
-   append(static_cast<value_type>(value));
-  }
-
-  reference operator=(const internal_type& value)
-  {
-   append_utf(value);
-   return *this;
-  }
-
- private:
-  typename utf_back_insert_iterator::string_type& s;
- };
-
- typedef std::unordered_map<iso_t, char32_t> iso_map_type;
- typedef std::unordered_map<char32_t, iso_t> iso_map_type_reverse;
-
- // ISO-8859-1 is lower 8-bit of Unicode, so no exceptions necessary
- static inline iso_map_type iso_8859_1_map;
-
- // ISO-8859-15 is lower 8-bit of Unicode, except for:
- static inline iso_map_type iso_8859_15_map {
-  { '\xA4', U'\u20AC' }, // €
-  { '\xA6', U'\u0160' }, // Š
-  { '\xA8', U'\u0161' }, // š
-  { '\xB4', U'\u017D' }, // Ž
-  { '\xB8', U'\u017E' }, // ž
-  { '\xBC', U'\u0152' }, // Œ
-  { '\xBD', U'\u0153' }, // œ
-  { '\xBE', U'\u0178' }, // Ÿ
- };
-
- inline iso_map_type_reverse reverse_iso_map(const iso_map_type& map) {
-  iso_map_type_reverse result;
-  std::for_each(map.cbegin(), map.cend(),
-                [&](const iso_map_type::value_type& pair)
-                 {
-                  result.emplace(pair.second, pair.first);
-                  result.emplace(static_cast<char32_t>(static_cast<uint8_t>(pair.first)), 0); // map invalid characters to a known non-mapped value as marker
-                 });
-  return result;
- }
-
- static inline iso_map_type_reverse iso_8859_15_map_reverse { reverse_iso_map(iso_8859_15_map) };
- static inline iso_map_type_reverse iso_8859_1_map_reverse { reverse_iso_map(iso_8859_1_map) };
-
-} // namespace unicode::detail
-
-namespace unicode {
-
- using namespace detail;
-
- template<unicode::detail::iso_map_type& Map=iso_8859_1_map, typename Container=std::basic_string<iso_t>>
- struct iso_iterator {
-  typedef iso_t value_type;
-  typedef char32_t internal_type;
-  typedef char32_t& reference;
-  typedef char32_t* pointer;
-  typedef size_t difference_type;
-  typedef std::input_iterator_tag iterator_category;
-  typedef typename Container::const_iterator iterator;
-  typedef Container string_type;
-
-  iso_iterator(const iterator& it): m_it(it) {}
-
-  // pre-increment
-  iso_iterator& operator++()
-  {
-   ++m_it;
-   return *this;
-  }
-
-  bool operator!=(const iso_iterator& other) const
-  {
-   return m_it != other.m_it;
-  }
-
-  // return reference?
-  internal_type operator*() const
-  {
-   value_type value{*m_it};
-
-   if constexpr(std::addressof(Map) != std::addressof(iso_8859_1_map)) // mapping of 128 <= x <= 255 if needed
-   {
-    auto it{Map.find(value)};
-    if (it != Map.end())
-     return it->second;
-   }
-   return static_cast<internal_type>(static_cast<uint8_t>(value));
-  }
-
-  iso_iterator& operator+=(size_t distance)
-  {
-   std::advance(m_it, distance);
-   return *this;
-  }
-
-  difference_type operator-(const iso_iterator& other) const
-  {
-   return m_it - other.m_it;
-  }
-
- private:
-  iterator m_it;
- };
-
- template<unicode::detail::iso_map_type_reverse& Map=iso_8859_1_map_reverse, typename Container=std::basic_string<iso_t>>
- struct iso_back_insert_iterator {
-  typedef iso_back_insert_iterator& reference;
-  typedef iso_back_insert_iterator* pointer;
-  typedef size_t difference_type;
-  typedef iso_t value_type;
-  typedef char32_t internal_type;
-  typedef std::output_iterator_tag iterator_category;
-  typedef Container string_type;
-  
-  iso_back_insert_iterator(string_type& s): s(s) {}
-
-  iso_back_insert_iterator& operator=(const iso_back_insert_iterator& other)
-  {
-   if (std::addressof(other.s) != std::addressof(s))
-    throw std::runtime_error("iso_back_insert_iterator assignment operator actually called! Iterator should not be assigned to.");
-
-   return *this;
-  }
-
-  // no-op
-  reference operator++()
-  {
-   return *this;
-  }
-
-  // support *x = value, together with operator=()
-  reference operator*()
-  {
-   return *this;
-  }
-
-  reference operator=(const internal_type& value)
-  {
-   if constexpr(std::addressof(Map) != std::addressof(iso_8859_1_map_reverse)) // mapping back to 128 <= x <= 255 if needed
-   {
-    auto it{Map.find(value)};
-    if (it != Map.end()) {
-     if (it->second == 0) // marker for non-mappable character found
-      throw std::invalid_argument("Bad Unicode value to map to ISO 8859-15: "s + std::to_string(static_cast<uint32_t>(value)));
-     s.push_back(it->second);
-     return *this;
-    }
-   }
-
-   if (value > 255)
-    throw std::invalid_argument("Bad ISO 8859 value above 255: "s + std::to_string(static_cast<uint32_t>(value)));
-
-   s.push_back(static_cast<typename iso_back_insert_iterator::value_type>(value));
-   return *this;
-  }
-
- private:
-  typename iso_back_insert_iterator::string_type& s;
- };
-
- // Encoding for convert() and ISO-8859-*
- template<typename InputIt, typename OutputIt>
- struct ISO_8859
- {
-  typedef iso_t value_type;
-  typedef typename InputIt::string_type string_type;
-
-  static InputIt begin(const typename InputIt::string_type& s)
-  {
-   return InputIt(s.cbegin());
-  }
-
-  static InputIt end(const typename InputIt::string_type& s)
-  {
-   return InputIt(s.cend());
-  }
-
-  static OutputIt back_inserter(typename OutputIt::string_type& s)
-  {
-   return OutputIt(s);
-  }
- };
-
- // Encoding for convert() and UTF-*
- template<typename InputIt, typename OutputIt>
- struct UTF
- {
-  typedef typename OutputIt::value_type value_type;
-  typedef typename InputIt::string_type string_type;
-
-  static InputIt begin(const typename InputIt::string_type& s)
-  {
-   return InputIt{s.cbegin(), s.cend()};
-  }
-
-  static InputIt end(const typename InputIt::string_type& s)
-  {
-   return InputIt{s.cend(), s.cend()};
-  }
-
-  static OutputIt back_inserter(typename OutputIt::string_type& s)
-  {
-   return OutputIt(s);
-  }
- };
-
- // Encoding for convert()
- typedef ISO_8859<iso_iterator<>, iso_back_insert_iterator<>> ISO_8859_1;
- typedef ISO_8859<iso_iterator<iso_8859_15_map>, iso_back_insert_iterator<iso_8859_15_map_reverse>> ISO_8859_15;
- 
- typedef UTF<utf_iterator<utf8_t>, utf_back_insert_iterator<utf8_t>> UTF_8;
- typedef UTF<utf_iterator<char16_t>, utf_back_insert_iterator<char16_t>> UTF_16;
- typedef UTF<utf_iterator<char32_t>, utf_back_insert_iterator<char32_t>> UTF_32;
-
  // std::distance doesn't work here: it is based on "output" distance of iterators
  template<class Iterator>
  inline size_t input_distance(const Iterator& it1, const Iterator& it2)
@@ -729,7 +129,7 @@ namespace unicode {
 
  // Optimize for the case of all ASCII (7-bit) data in a accu size row
  // From and To are Encodings
- template<typename From, typename To, std::enable_if_t<std::is_empty<From>::value, bool> = true>
+ template<typename From, typename To, std::enable_if_t<is_encoding_v<From> && is_encoding_v<To>, bool> = true>
  typename To::string_type convert_optimized(const typename From::string_type& s)
  {
   typename To::string_type result;
@@ -774,20 +174,224 @@ namespace unicode {
   return result;
  }
 
+ template<size_t bits_to_compare = 32, typename To, typename std::enable_if_t<(sizeof(To) == 1), bool> = true> // overload selected for 1-byte code units (UTF-8 output)
+ inline void append_utf(std::basic_string<To>& result, const char32_t& value) // appends the UTF-8 encoding of value to result
+ {
+  using From = char32_t;
+  if (bits_to_compare <= 7 || value < 0x80) { // 1 byte; a small bits_to_compare (bit width promised by the caller) picks the branch without comparing value
+   result.push_back(static_cast<To>(value));
+  } else if (bits_to_compare <= 11 || value < 0x800) { // 2 bytes
+   result.append({utf8_byte_n_of_m<0,2,From,To>(value), utf8_byte_n_of_m<1,2,From,To>(value)});
+  } else if (bits_to_compare <= 16 || value < 0x10000) { // 3 bytes
+   result.append({utf8_byte_n_of_m<0,3,From,To>(value), utf8_byte_n_of_m<1,3,From,To>(value), utf8_byte_n_of_m<2,3,From,To>(value)});
+  } else { // 4 bytes
+   // no range check is done here: value is expected to be an already validated Unicode value
+   result.append({utf8_byte_n_of_m<0,4,From,To>(value), utf8_byte_n_of_m<1,4,From,To>(value), utf8_byte_n_of_m<2,4,From,To>(value), utf8_byte_n_of_m<3,4,From,To>(value)});
+  }
+ }
+
+ template<size_t bits_to_compare = 32, typename To, typename std::enable_if_t<(sizeof(To) == 2), bool> = true> // overload selected for 2-byte code units (UTF-16 output)
+ inline void append_utf(std::basic_string<To>& result, const char32_t& value) // appends value to result as one or two 16-bit code units
+ {
+  if (bits_to_compare <= 16 || value <= 0xFFFF) { // single code unit; value is expected to be an already valid Unicode value (not checked here)
+   result.push_back(static_cast<To>(value));
+  } else {
+   char32_t value_reduced{value - 0x10000}; // offset into the range above 0xFFFF
+   result.append({static_cast<To>((value_reduced >> 10) + 0xD800), static_cast<To>((value_reduced & 0x3FF) + 0xDC00)}); // surrogate pair: 0xD800 + upper bits, then 0xDC00 + lower 10 bits
+  }
+ }
+
+ template<size_t bits_to_compare = 32, typename To, typename std::enable_if_t<(sizeof(To) == 4), bool> = true> // overload selected for 4-byte code units (UTF-32 output)
+ inline void append_utf(std::basic_string<To>& result, const char32_t& value) // appends value unchanged as a single code unit
+ {
+  // no validation is done here: value is expected to be an already valid Unicode value (checked by the caller); bits_to_compare is unused in this overload
+  result.push_back(static_cast<To>(value));
+ }
+
+ // Little Endian optimized version for UTF-8: decodes from the low-order end of accu, appends to result, and throws std::invalid_argument on bad input
+ // In block_mode, at least 4 bytes are in accu. On first call, even 8.
+ // otherwise, at least one code unit is in accu; consumed bytes are shifted out of accu and subtracted from bytes_in_accu
+ template<typename From, typename To, bool block_mode = true, typename std::enable_if_t<(sizeof(From) == 1), bool> = true>
+ inline static void append_accu(std::basic_string<To>& result, uint64_t& accu, int& bytes_in_accu)
+ {
+#if 1
+  if (block_mode && bytes_in_accu == 8 && (accu & 0x8080808080808080) == 0) { // fast path: all 8 bytes have the top bit clear (7-bit ASCII), copy them at once
+   result.append({
+                 static_cast<To>(accu & 0x7F),
+                 static_cast<To>((accu >> 8) & 0x7F),
+                 static_cast<To>((accu >> 16) & 0x7F),
+                 static_cast<To>((accu >> 24) & 0x7F),
+                 static_cast<To>((accu >> 32) & 0x7F),
+                 static_cast<To>((accu >> 40) & 0x7F),
+                 static_cast<To>((accu >> 48) & 0x7F),
+                 static_cast<To>((accu >> 56) & 0x7F),
+                 });
+   accu = 0;
+   bytes_in_accu = 0;
+  } else
+#endif
+  if ((accu & 0x80) == 0) { // 1 byte sequence
+   append_utf<7>(result, static_cast<char32_t>(accu & 0x7F));
+   accu >>= 8;
+   bytes_in_accu -= 1;
+  } else if ((block_mode || bytes_in_accu >= 2) && (accu & 0xC0E0) == 0x80C0) { // 2 byte sequence: lead byte 110xxxxx in bits 0..7, followup 10xxxxxx in bits 8..15
+   char32_t value {static_cast<char32_t>(((accu & 0x1F) << 6) | ((accu >> 8) & 0x3f))};
+   accu >>= 16;
+   bytes_in_accu -= 2;
+   if (is_valid_unicode<11>(value))
+    append_utf<11>(result, value);
+   else
+    throw std::invalid_argument("Invalid Unicode character in 2 byte UTF-8 sequence");
+  } else if ((block_mode || bytes_in_accu >= 3) && (accu & 0xC0C0F0) == 0x8080E0) { // 3 byte sequence: lead byte 1110xxxx followed by two 10xxxxxx bytes
+   char32_t value {static_cast<char32_t>(((accu & 0x0F) << 12) | ((accu >> 2) & 0x0FC0) | ((accu >> 16) & 0x3f))};
+   accu >>= 24;
+   bytes_in_accu -= 3;
+   if (is_valid_unicode<16>(value))
+    append_utf<16>(result, value);
+   else
+    throw std::invalid_argument("Invalid Unicode character in 3 byte UTF-8 sequence");
+  } else if ((block_mode || bytes_in_accu >= 4) && (accu & 0xC0C0C0F8) == 0x808080F0) { // 4 byte sequence: lead byte 11110xxx followed by three 10xxxxxx bytes
+   char32_t value {static_cast<char32_t>(((accu & 0x07) << 18) | ((accu << 4) & 0x3f000) | ((accu >> 10) & 0xFC0) | ((accu >> 24) & 0x3f))};
+   accu >>= 32;
+   bytes_in_accu -= 4;
+   if (is_valid_unicode<21>(value))
+    append_utf(result, value);
+   else
+    throw std::invalid_argument("Invalid Unicode character in 4 byte UTF-8 sequence");
+  } else
+   throw std::invalid_argument("Invalid UTF-8 byte sequence");
+ }
+
+ // Little Endian optimized version for UTF-16: decodes from the low-order end of accu, appends to result, and throws std::invalid_argument on bad input
+ // In block_mode, at least 4 bytes are in accu. On first call, even 8.
+ // otherwise, at least one code unit is in accu; consumed bytes are shifted out of accu and subtracted from bytes_in_accu
+ template<typename From, typename To, bool block_mode = true, typename std::enable_if_t<(sizeof(From) == 2), bool> = true>
+ inline static void append_accu(std::basic_string<To>& result, uint64_t& accu, int& bytes_in_accu)
+ {
+#if 1
+  if ((accu & 0xFF80FF80FF80FF80) == 0) { // fast path: every 16-bit unit in accu has bits 7..15 clear (7-bit ASCII)
+   auto number_of_values{bytes_in_accu / sizeof(From)};
+   result.resize(result.size() + number_of_values);
+   for (int i = 0; i < number_of_values; i++) {
+    result[result.size() - number_of_values + i] = static_cast<To>(accu & 0x7F);
+    accu >>= 16;
+   }
+   bytes_in_accu = 0;
+  } else
+#endif
+  if ((accu & 0xFC00FC00FC00FC00) == 0xDC00D800DC00D800) {
+   // found 4 code units forming 2 code points in UTF-16 (two surrogate pairs);
+   // by definition of UTF-16, we have valid unicode values at this point
+   if constexpr(sizeof(To) == 4) {
+    //result.resize(result.size() + 2);
+    //*reinterpret_cast<uint64_t*>(&result[result.size() - 2]) = (((accu & 0x03FF000003FF) << 10) | ((accu >> 16) & 0x03FF000003FF)) + 0x0001000000010000;
+    result.append({
+                  static_cast<To>(((accu & 0x03FF) << 10 | ((accu >> 16) & 0x03FF)) + 0x10000),
+                  static_cast<To>(((accu & 0x03FF00000000) >> 22 | ((accu >> 48) & 0x03FF)) + 0x10000)
+                  });
+   } else {
+    append_utf(result, ((accu & 0x03FF) << 10 | ((accu >> 16) & 0x03FF)) + 0x10000);
+    append_utf(result, ((accu & 0x03FF00000000) >> 22 | ((accu >> 48) & 0x03FF)) + 0x10000);
+   }
+   accu = 0;
+   bytes_in_accu = 0;
+  } else
+  if (From unit0 {static_cast<From>(accu & 0xFFFF)}; is_valid_unicode<16>(unit0)) { // single code unit; presumably a non-surrogate value (is_valid_unicode is defined outside this view)
+   append_utf<16>(result, unit0);
+   accu >>= 16;
+   bytes_in_accu -= 2;
+  } else
+  if ((accu & 0xFC00FC00) == 0xDC00D800) {
+   // found 2 code units forming 1 code point in UTF-16;
+   // by definition of UTF-16, we have a valid unicode value at this point
+   append_utf(result, ((accu & 0x03FF) << 10 | ((accu >> 16) & 0x03FF)) + 0x10000);
+   accu >>= 32;
+   bytes_in_accu -= 4;
+  } else
+   throw std::invalid_argument("Bad input: Continuation of first UTF-16 unit missing");
+ }
+
+ // Little Endian optimized conversion of a whole string from encoding From to encoding To; throws std::invalid_argument on invalid input
+ template<typename From, typename To, std::enable_if_t<is_encoding_v<From> && is_encoding_v<To>, bool> = true>
+ typename To::string_type convert_optimized_utf(const typename From::string_type& s)
+ {
+  typename To::string_type result;
+  if constexpr(sizeof(typename From::value_type) == 4) { // UTF-32 input: one code unit is one code point, validate and re-encode each
+   for (const auto value: s) {
+    if (is_valid_unicode(value))
+     append_utf(result, value);
+    else
+     throw std::invalid_argument("Invalid Unicode character in UTF-32");
+   }
+#if 0
+  } else if constexpr(sizeof(typename From::value_type) == 2) {
+   for (int i = 0; i < s.size(); i++) {
+    typename From::value_type unit0{s[i]};
+    if (is_valid_unicode(unit0)) {
+     append_utf(result, unit0);
+    } else {
+     i++;
+     if (i < s.size()) {
+      typename From::value_type unit1 {s[i]};
+      if ((unit0 & 0xFC00) != 0xD800 || (unit1 & 0xFC00) != 0xDC00)
+       throw std::invalid_argument("Bad input: 2 malformed UTF-16 surrogates");
+
+      append_utf(result, (static_cast<char32_t>(unit0 & 0x03FF) << 10 | (unit1 & 0x03FF)) + 0x10000);
+     } else
+      throw std::invalid_argument("Invalid code unit at end of UTF-16 string");
+    }
+   }
+#endif
+  } else {
+   uint64_t accu{}; // input bytes not yet decoded, oldest byte in the lowest bits
+   int bytes_in_accu{}; // number of valid bytes currently held in accu
+
+   size_t s_index{};
+   size_t s_size{s.size()};
+   while (s_index + 8 / sizeof(typename From::value_type) <= s_size) { // block phase: at least 8 input bytes remain readable at s_index
+    // read 8 input bytes at once and keep those that fit above the bytes already in accu
+    // assume: bytes_in_accu < 8. NOTE(review): this is an unaligned, type-punned load; std::memcpy would avoid the undefined behavior
+    accu |= (*reinterpret_cast<const uint64_t*>(&(s.data()[s_index]))) << (bytes_in_accu * 8);
+    s_index += (8 - bytes_in_accu) / sizeof(typename From::value_type);
+    bytes_in_accu = 8;
+
+    while (bytes_in_accu >= 4) {
+     append_accu<typename From::value_type, typename To::value_type, true>(result, accu, bytes_in_accu);
+    }
+   }
+
+   // 0..3 bytes left in accu
+   // 0..7 bytes left in s
+
+   while (s_index < s_size || bytes_in_accu > 0) { // tail phase: feed the remaining code units one by one
+    while (s_index < s_size && bytes_in_accu < 8) {
+     accu |= static_cast<uint64_t>(static_cast<std::make_unsigned_t<typename From::value_type>>(s.data()[s_index])) << (bytes_in_accu * 8); // zero-extend: a signed plain char >= 0x80 must not sign-extend into the upper bytes of accu
+     ++s_index;
+     bytes_in_accu += sizeof(typename From::value_type);
+    }
+
+    append_accu<typename From::value_type, typename To::value_type, false>(result, accu, bytes_in_accu);
+   }
+  }
+  return result;
+ }
+
  // From and To are Encodings
- template<typename From, typename To, std::enable_if_t<std::is_empty<From>::value, bool> = true>
+ template<typename From, typename To, std::enable_if_t<is_encoding_v<From> && is_encoding_v<To>, bool> = true>
  typename To::string_type convert(const typename From::string_type& s)
  {
   // if input type == output type, only validate and return input, if appropriate
   if constexpr(sizeof(typename From::value_type) == sizeof(typename To::value_type) &&
-               std::is_same_v<From, UTF<utf_iterator<typename From::value_type>, utf_back_insert_iterator<typename From::value_type>>> &&
-               std::is_same_v<To, UTF<utf_iterator<typename To::value_type>, utf_back_insert_iterator<typename To::value_type>>>) {
+               is_utf_encoding_v<From> && is_utf_encoding_v<To>) {
    if (validate_utf<typename From::value_type>(s)) {
     return s;
    } else {
     throw std::invalid_argument("Invalid UTF input");
    }
-  } if constexpr(accu_size == 4 || accu_size == 8) {
+  } else if constexpr(accu_size == 8 && is_little_endian() && sizeof(typename From::value_type) == 1 &&
+                      is_utf_encoding_v<From> && is_utf_encoding_v<To>) { // endian specific optimization
+   return convert_optimized_utf<From, To>(s);
+  } else if constexpr(accu_size == 4 || accu_size == 8) { // accu size specific optimization with speedup for 7bit input
    return convert_optimized<From, To>(s);
   } else {
    typename To::string_type result;
@@ -796,38 +400,11 @@ namespace unicode {
   }
  }
 
- // Helper to get correct Encoding from char type, e.g. Encoding<typename decltype(s)::value_type>::type or Encoding_t<typename decltype(s)::value_type>
- template<typename T>
- struct Encoding
- {
- };
-
- template<>
- struct Encoding<utf8_t>
- {
-  typedef UTF_8 type;
- };
-
- template<>
- struct Encoding<char16_t>
- {
-  typedef UTF_16 type;
- };
-
- template<>
- struct Encoding<char32_t>
- {
-  typedef UTF_32 type;
- };
-
- template<typename T>
- using Encoding_t = typename Encoding<T>::type;
-
  // From and To are from: utf8_t (i.e. char or char8_t (C++20)), char16_t and char32_t, char, wchar_t, uint8_t, uint16_t, uint32_t
  template<typename From, typename To,
   typename FromContainer=std::basic_string<From>,
   typename ToContainer=std::basic_string<To>,
-  std::enable_if_t<std::is_trivial<From>::value && std::is_scalar<From>::value && !std::is_empty<From>::value, bool> = true>
+  std::enable_if_t<is_char_v<From> && is_char_v<To>, bool> = true>
  ToContainer convert(const FromContainer& s)
  {
   typedef UTF<utf_iterator<From>, utf_back_insert_iterator<To>> UTF_Trait;
@@ -841,7 +418,7 @@ namespace unicode {
 
  // From and To are containers
  template<typename FromContainer, typename ToContainer,
-  std::enable_if_t<!std::is_empty<FromContainer>::value && !std::is_empty<ToContainer>::value, bool> = true
+  std::enable_if_t<is_container_v<FromContainer> && is_container_v<ToContainer>, bool> = true
  >
  ToContainer convert(const FromContainer& s)
  {
@@ -855,7 +432,7 @@ namespace unicode {
  }
 
  // Container version
- template<typename Container, std::enable_if_t<!std::is_empty<Container>::value, bool> = true>
+ template<typename Container, std::enable_if_t<is_container_v<Container>, bool> = true>
  bool is_valid_utf(const Container& s)
  {
   typedef UTF<utf_iterator<typename Container::value_type, Container>, utf_back_insert_iterator<typename Container::value_type, Container>> UTF_Trait;
@@ -871,7 +448,7 @@ namespace unicode {
  // basic type version
  template<typename T,
   typename Container=std::basic_string<T>,
-  std::enable_if_t<std::is_trivial<T>::value && !std::is_empty<T>::value, bool> = true>
+  std::enable_if_t<is_char_v<T>, bool> = true>
  bool is_valid_utf(const Container& s)
  {
   typedef UTF<utf_iterator<T>, utf_back_insert_iterator<T>> UTF_Trait;
@@ -885,7 +462,7 @@ namespace unicode {
  }
 
  // Encoding version
- template<typename Encoding, std::enable_if_t<std::is_empty<Encoding>::value, bool> = true>
+ template<typename Encoding, std::enable_if_t<is_encoding_v<Encoding>, bool> = true>
  bool is_valid_utf(const typename Encoding::string_type& s)
  {
   return validate_utf<typename Encoding::value_type>(s);
diff --git a/include/unicode/endian.h b/include/unicode/endian.h
new file mode 100644
index 0000000..38bc1b7
--- /dev/null
+++ b/include/unicode/endian.h
@@ -0,0 +1,26 @@
+#pragma once
+
+#if __cplusplus >= 202002L
+#include <bit>
+#endif
+
+namespace unicode::detail {
+
+ // Compile time test whether the target stores the least significant byte first.
+ //
+ // C++20 and newer: answered by std::endian from <bit>.
+ // Older standards: answered by the byte order macros predefined by GCC and clang.
+ // Looking at the first byte of a multi-byte constant through a reference cast is
+ // no option here: such a cast is not allowed in a constant expression, so
+ // conforming compilers refuse it as initializer of a constexpr variable.
+#if __cplusplus >= 202002L
+ consteval bool is_little_endian() { return std::endian::native == std::endian::little; }
+#elif defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__)
+ constexpr bool is_little_endian() { return __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__; }
+#elif defined(_WIN32)
+ constexpr bool is_little_endian() { return true; } // Windows only runs on little endian targets
+#else
+#error "unicode/endian.h: byte order of the target is unknown at compile time"
+#endif
+
+} // namespace unicode::detail
diff --git a/include/unicode/iso.h b/include/unicode/iso.h
new file mode 100644
index 0000000..9b20afd
--- /dev/null
+++ b/include/unicode/iso.h
@@ -0,0 +1,189 @@
+#pragma once
+
+#include "types.h"
+
+#include <stdexcept>
+#include <string>
+#include <unordered_map>
+
+namespace unicode::detail {
+
+ using namespace std::string_literals;
+
+ typedef std::unordered_map<iso_t, char32_t> iso_map_type;
+ typedef std::unordered_map<char32_t, iso_t> iso_map_type_reverse;
+
+ // ISO-8859-1 is lower 8-bit of Unicode, so no exceptions necessary
+ static inline iso_map_type iso_8859_1_map;
+
+ // ISO-8859-15 is lower 8-bit of Unicode, except for:
+ static inline iso_map_type iso_8859_15_map {
+  { '\xA4', U'\u20AC' }, // €
+  { '\xA6', U'\u0160' }, // Š
+  { '\xA8', U'\u0161' }, // š
+  { '\xB4', U'\u017D' }, // Ž
+  { '\xB8', U'\u017E' }, // ž
+  { '\xBC', U'\u0152' }, // Œ
+  { '\xBD', U'\u0153' }, // œ
+  { '\xBE', U'\u0178' }, // Ÿ
+ };
+
+ // Inverts an ISO -> Unicode exception map. Per exception, the Unicode value maps back to the ISO
+ // character, and the code point numerically equal to that ISO character maps to the marker 0.
+ inline iso_map_type_reverse reverse_iso_map(const iso_map_type& map) {
+  iso_map_type_reverse result;
+  for (const auto& [iso_char, unicode_value]: map) { // plain loop: this header does not include <algorithm>
+   result.emplace(unicode_value, iso_char);
+   result.emplace(static_cast<char32_t>(static_cast<uint8_t>(iso_char)), 0); // marker: this code point has no ISO counterpart
+  }
+  return result;
+ }
+
+ static inline iso_map_type_reverse iso_8859_15_map_reverse { reverse_iso_map(iso_8859_15_map) };
+ static inline iso_map_type_reverse iso_8859_1_map_reverse { reverse_iso_map(iso_8859_1_map) };
+
+ template<unicode::detail::iso_map_type& Map=iso_8859_1_map, typename Container=std::basic_string<iso_t>> // Map: ISO characters whose Unicode value differs from their own code
+ struct iso_iterator { // input iterator adaptor: walks a Container of ISO 8859 characters, yields Unicode values
+  typedef iso_t value_type; // type of the characters read
+  typedef char32_t internal_type; // type of the Unicode values returned by operator*()
+  typedef char32_t& reference;
+  typedef char32_t* pointer;
+  typedef size_t difference_type; // NOTE(review): unsigned, unlike the difference types of standard iterators
+  typedef std::input_iterator_tag iterator_category;
+  typedef typename Container::const_iterator iterator; // iterator type of the wrapped container
+  typedef Container string_type;
+
+  iso_iterator(const iterator& it): m_it(it) {} // wraps the given position of the container
+
+  // pre-increment: steps to the next ISO character
+  iso_iterator& operator++()
+  {
+   ++m_it;
+   return *this;
+  }
+
+  bool operator!=(const iso_iterator& other) const // compares the wrapped container positions
+  {
+   return m_it != other.m_it;
+  }
+
+  // returns the Unicode value of the current character by value (not by reference); does not advance
+  internal_type operator*() const
+  {
+   value_type value{*m_it};
+
+   if constexpr(std::addressof(Map) != std::addressof(iso_8859_1_map)) // mapping of 128 <= x <= 255 if needed; compiled out for the ISO-8859-1 map
+   {
+    auto it{Map.find(value)};
+    if (it != Map.end())
+     return it->second;
+   }
+   return static_cast<internal_type>(static_cast<uint8_t>(value)); // not in Map: Unicode value equals the character code; uint8_t first, so that no sign extension takes place
+  }
+
+  iso_iterator& operator+=(size_t distance) // skips distance characters
+  {
+   std::advance(m_it, distance);
+   return *this;
+  }
+
+  difference_type operator-(const iso_iterator& other) const // distance from other to this; needs a container iterator providing operator-
+  {
+   return m_it - other.m_it;
+  }
+
+ private:
+  iterator m_it; // current position in the container
+ };
+
+ template<unicode::detail::iso_map_type_reverse& Map=iso_8859_1_map_reverse, typename Container=std::basic_string<iso_t>> // Map: Unicode values needing a special ISO character, or 0 if not representable
+ struct iso_back_insert_iterator { // output iterator adaptor: takes Unicode values, appends the matching ISO 8859 characters to a Container
+  typedef iso_back_insert_iterator& reference;
+  typedef iso_back_insert_iterator* pointer;
+  typedef size_t difference_type;
+  typedef iso_t value_type; // type of the characters written
+  typedef char32_t internal_type; // type of the Unicode values accepted
+  typedef std::output_iterator_tag iterator_category;
+  typedef Container string_type;
+  
+  iso_back_insert_iterator(string_type& s): s(s) {} // s: container to append to; only a reference is kept
+
+  iso_back_insert_iterator& operator=(const iso_back_insert_iterator& other) // the target is a reference and can't be rebound: only "assignment" of the same target is accepted
+  {
+   if (std::addressof(other.s) != std::addressof(s))
+    throw std::runtime_error("iso_back_insert_iterator assignment operator actually called! Iterator should not be assigned to.");
+
+   return *this;
+  }
+
+  // no-op: appending needs no position to be advanced
+  reference operator++()
+  {
+   return *this;
+  }
+
+  // support *x = value, together with operator=()
+  reference operator*()
+  {
+   return *this;
+  }
+
+  reference operator=(const internal_type& value) // appends value as ISO character; throws std::invalid_argument if it is not representable
+  {
+   if constexpr(std::addressof(Map) != std::addressof(iso_8859_1_map_reverse)) // mapping back to 128 <= x <= 255 if needed; compiled out for the ISO-8859-1 map
+   {
+    auto it{Map.find(value)};
+    if (it != Map.end()) {
+     if (it->second == 0) // marker for non-mappable character found
+      throw std::invalid_argument("Bad Unicode value to map to ISO 8859-15: "s + std::to_string(static_cast<uint32_t>(value)));
+     s.push_back(it->second);
+     return *this;
+    }
+   }
+
+   if (value > 255) // no special mapping: only values fitting into 8 bits have an ISO character
+    throw std::invalid_argument("Bad ISO 8859 value above 255: "s + std::to_string(static_cast<uint32_t>(value)));
+
+   s.push_back(static_cast<typename iso_back_insert_iterator::value_type>(value));
+   return *this;
+  }
+
+ private:
+  typename iso_back_insert_iterator::string_type& s; // container receiving the output
+ };
+
+} // namespace unicode::detail
+
+namespace unicode {
+
+ using namespace detail;
+
+ // Encoding for convert() and ISO-8859-*: bundles the iterator types reading and writing such strings
+ template<typename InputIt, typename OutputIt>
+ struct ISO_8859
+ {
+  using value_type = iso_t;
+  using string_type = typename InputIt::string_type;
+
+  static InputIt begin(const string_type& s) // reading iterator at the first character of s
+  {
+   return InputIt(s.cbegin());
+  }
+
+  static InputIt end(const string_type& s) // matching past-the-end iterator
+  {
+   return InputIt(s.cend());
+  }
+
+  static OutputIt back_inserter(typename OutputIt::string_type& s) // writing iterator appending to s
+  {
+   return OutputIt(s);
+  }
+ };
+
+ // Encodings for convert(): ISO-8859-1 (default maps) and ISO-8859-15 (with its exception maps)
+ using ISO_8859_1 = ISO_8859<iso_iterator<>, iso_back_insert_iterator<>>;
+ using ISO_8859_15 = ISO_8859<iso_iterator<iso_8859_15_map>, iso_back_insert_iterator<iso_8859_15_map_reverse>>;
+
+} // namespace unicode
+
diff --git a/include/unicode/predicate.h b/include/unicode/predicate.h
new file mode 100644
index 0000000..5f8c6a4
--- /dev/null
+++ b/include/unicode/predicate.h
@@ -0,0 +1,21 @@
+#pragma once
+
+namespace unicode {
+
+ // Tells whether value is a valid Unicode value: no UTF-16 surrogate and not above 0x10FFFF.
+ // bits_to_compare: upper bound of the significant bits of value, possibly lower than
+ //                  the width of T; selects the cheapest sufficient check at compile time
+ // T: usually, char32_t, uint32_t etc.
+ template<size_t bits_to_compare = 32, typename T>
+ static inline bool is_valid_unicode(const T& value) noexcept
+ {
+  if constexpr(sizeof(T) == 1 || bits_to_compare <= 15)
+   return true; // below 0x8000: neither surrogates nor too big values possible
+  else if constexpr(sizeof(T) == 2 || bits_to_compare <= 16)
+   return (value & 0xF800) != 0xD800; // at most 16 bits: only surrogates are invalid
+  else // wider values: the 16 bit mask test would misjudge e.g. 0x1D800 as surrogate
+   return value <= 0xD7FF || (value >= 0xE000 && value <= 0x10FFFF);
+ }
+
+} // namespace unicode
+
diff --git a/include/unicode/type_traits.h b/include/unicode/type_traits.h
new file mode 100644
index 0000000..3ee1d82
--- /dev/null
+++ b/include/unicode/type_traits.h
@@ -0,0 +1,77 @@
+#pragma once
+
+#include "utf.h"
+
+#include <string>
+#include <type_traits>
+
+namespace unicode {
+
+ using namespace detail;
+
+ // helper traits
+ 
+ // An Encoding (e.g. UTF_8) carries no data members, so every empty class type counts as one.
+ template<typename T>
+ struct is_encoding: std::bool_constant<std::is_empty_v<T>>
+ {
+ };
+
+ template<typename T>
+ inline constexpr bool is_encoding_v = is_encoding<T>::value;
+
+ // Counterpart of is_encoding: everything that is not an empty class type is taken as container.
+ template<typename T>
+ struct is_container: std::bool_constant<!std::is_empty_v<T>>
+ {
+ };
+
+ template<typename T>
+ inline constexpr bool is_container_v = is_container<T>::value;
+
+ // Character types are recognized structurally: trivial, scalar and not an empty class.
+ template<typename T>
+ struct is_char: std::bool_constant<std::is_trivial_v<T> && std::is_scalar_v<T> && !std::is_empty_v<T>>
+ {
+ };
+
+ template<typename T>
+ inline constexpr bool is_char_v = is_char<T>::value;
+
+ // True if T is exactly the UTF encoding built from the default iterators for T::value_type, e.g. UTF_8.
+ template<typename T>
+ struct is_utf_encoding: std::bool_constant<std::is_same_v<T, UTF<utf_iterator<typename T::value_type>, utf_back_insert_iterator<typename T::value_type>>>>
+ {
+ };
+
+ template<typename T>
+ inline constexpr bool is_utf_encoding_v = is_utf_encoding<T>::value;
+
+ // Code unit type usable for UTF-8: any trivial type of 1 byte.
+ template<typename T>
+ struct is_utf_8: std::bool_constant<std::is_trivial_v<T> && sizeof(T) == 1>
+ {
+ };
+
+ template<typename T>
+ inline constexpr bool is_utf_8_v = is_utf_8<T>::value;
+
+ // Code unit type usable for UTF-16: any trivial type of 2 bytes.
+ template<typename T>
+ struct is_utf_16: std::bool_constant<std::is_trivial_v<T> && sizeof(T) == 2>
+ {
+ };
+
+ template<typename T>
+ inline constexpr bool is_utf_16_v = is_utf_16<T>::value;
+
+ // Code unit type usable for UTF-32: any trivial type of 4 bytes.
+ template<typename T>
+ struct is_utf_32: std::bool_constant<std::is_trivial_v<T> && sizeof(T) == 4>
+ {
+ };
+
+ template<typename T>
+ inline constexpr bool is_utf_32_v = is_utf_32<T>::value;
+
+} // namespace unicode
diff --git a/include/unicode/types.h b/include/unicode/types.h
new file mode 100644
index 0000000..a4461d7
--- /dev/null
+++ b/include/unicode/types.h
@@ -0,0 +1,10 @@
+#pragma once
+
+#if defined(__cpp_char8_t)
+ using utf8_t = char8_t; // UTF-8 code unit: the dedicated character type, where the compiler offers it
+#else
+ using utf8_t = char; // UTF-8 code unit: plain char as fallback
+#endif
+// character of the ISO 8859 charsets
+using iso_t = char;
+
diff --git a/include/unicode/utf.h b/include/unicode/utf.h
new file mode 100644
index 0000000..dd504a7
--- /dev/null
+++ b/include/unicode/utf.h
@@ -0,0 +1,448 @@
+#pragma once
+
+#include <list>
+#include <string>
+#include <stdexcept>
+
+namespace unicode::detail {
+
+ using namespace std::string_literals;
+
+ // True if byte can start a UTF-8 sequence of sequence_length code units: 0xxxxxxx for
+ // length 1, otherwise sequence_length one bits followed by a zero bit.
+ template<size_t sequence_length, typename value_type>
+ inline bool is_utf8_leading_byte(value_type byte) noexcept
+ {
+  static_assert(sequence_length <= 4);
+  if constexpr(sequence_length == 1)
+   return (byte & 0x80) == 0;
+  else // both sides are narrowed to value_type, so the comparison also holds for signed char
+   return (byte & static_cast<value_type>(0xFF << (7 - sequence_length))) == static_cast<value_type>(0xFF << (8 - sequence_length));
+ }
+
+ template<typename value_type> // true for UTF-8 continuation bytes, i.e. bit pattern 10xxxxxx
+ inline bool is_utf8_followup_byte(value_type code_unit) noexcept
+ {
+  return (code_unit & 0xC0) == 0x80;
+ }
+
+ // True if the given code units have the shape of exactly one UTF-8 sequence:
+ // a leading byte announcing their number, followed by continuation bytes only.
+ template<typename value_type, typename... Tbytes>
+ inline bool is_utf8_sequence(value_type byte0, Tbytes... bytes) noexcept
+ {
+  static_assert(sizeof...(Tbytes) + 1 <= 4, "UTF-8 sequences of 1 through 4 code units are supported");
+  if (!is_utf8_leading_byte<sizeof...(Tbytes) + 1>(byte0))
+   return false;
+  return (... && is_utf8_followup_byte(bytes)); // left fold: evaluated from left to right, stops at the first mismatch
+ }
+
+ // Well-formedness check for UTF-8. Truncated trailing sequences are rejected without reading past the end of s.
+ template<typename T, typename std::enable_if_t<(sizeof(T) == 1), bool> = true>
+ inline bool validate_utf(const std::basic_string<T>& s)
+ {
+  for (size_t i{}, size{s.size()}; i < size; ) {
+   const size_t left{size - i}; // code units from s[i] on; a longer sequence is only looked at if it fits (no unsigned underflow)
+   if (is_utf8_sequence(s[i])) {
+    i++;
+   } else if (left >= 2 && is_utf8_sequence(s[i], s[i + 1])) {
+    i += 2; // NOTE(review): overlong forms such as 0xC0 0x80 are still accepted here
+   } else if (left >= 3 && is_utf8_sequence(s[i], s[i + 1], s[i + 2])) {
+    if (((s[i] & 0xF) == 0xD) && ((s[i + 1] & 0x20) == 0x20))
+     return false; // Reserved for UTF-16 surrogates: 0xD800..0xDFFF
+    i += 3;
+   } else if (left >= 4 && is_utf8_sequence(s[i], s[i + 1], s[i + 2], s[i + 3])) {
+    if ((((s[i] & 7) << 2) | ((s[i + 1] >> 4) & 3)) >= 0x11)
+     return false; // Unicode too big above 0x10FFFF
+    i += 4;
+   } else {
+    return false;
+   }
+  }
+  return true;
+ }
+
+ // One code unit: true if is_valid_unicode() accepts it as a character on its own.
+ // Two code units: true if they form a surrogate pair, i.e. a high surrogate
+ // 0xD800..0xDBFF followed by a low surrogate 0xDC00..0xDFFF.
+ template<typename value_type, typename... Twords>
+ inline bool is_utf16_sequence(value_type word0, Twords... words) noexcept
+ {
+  static_assert(sizeof...(Twords) <= 1, "UTF-16 sequences of only 1 or 2 code units are supported");
+  if constexpr(sizeof...(Twords) == 0) {
+   return is_valid_unicode(word0);
+  } else {
+   const auto high{static_cast<char16_t>(word0)};
+   const auto low{static_cast<char16_t>((words, ...))}; // comma fold: yields the last (here: the only) further unit
+   return (high >> 10) == (0xD800 >> 10) && (low >> 10) == (0xDC00 >> 10);
+  }
+ }
+
+ // Well-formedness check for UTF-16: s may only consist of single code units and pairs of
+ // code units accepted by is_utf16_sequence(); a pair is only looked at if it fits into s.
+ template<typename T, typename std::enable_if_t<(sizeof(T) == 2), bool> = true>
+ inline bool validate_utf(const std::basic_string<T>& s)
+ {
+  for (size_t i{}, size{s.size()}; i < size; ) { // size_t instead of int: no signed/unsigned comparison, no overflow on huge input
+   if (is_utf16_sequence(s[i])) {
+    i++;
+   } else if (size - i >= 2 && is_utf16_sequence(s[i], s[i + 1])) {
+    i += 2;
+   } else {
+    return false;
+   }
+  }
+  return true;
+ }
+
+ template<typename T, typename std::enable_if_t<(sizeof(T) == 4), bool> = true> // UTF-32: every code unit has to be a valid Unicode value
+ inline bool validate_utf(const std::basic_string<T>& s)
+ {
+  auto it{s.cbegin()};
+  while (it != s.cend() && is_valid_unicode(*it)) // stops at the first invalid code unit
+   ++it;
+  return it == s.cend();
+ }
+
+ template<size_t sequence_length, typename value_type> // payload bits of a leading byte, shifted to their position in the code point
+ inline char32_t decode_utf8_leading_byte(value_type b) noexcept
+ {
+  return static_cast<char32_t>(b & (0x7F >> sequence_length)) << (6 * (sequence_length - 1));
+ }
+
+ template<typename value_type> // payload (low 6 bits) of a single continuation byte
+ inline char32_t decode_utf8_followup_byte(value_type b) noexcept
+ {
+  return static_cast<char32_t>(b & 0x3F);
+ }
+
+ template<typename value_type, typename... Targs> // combined payload of several continuation bytes, first byte most significant
+ inline char32_t decode_utf8_followup_byte(value_type b, Targs... bytes) noexcept
+ {
+  return (decode_utf8_followup_byte(b) << (sizeof...(Targs) * 6)) | decode_utf8_followup_byte(bytes...);
+ }
+
+ // Code point encoded by one UTF-8 sequence, given as leading byte b plus its continuation bytes.
+ // Only the payload bits are assembled; no validation takes place here.
+ template<typename value_type, typename... Targs>
+ inline char32_t decode_utf8_sequence(value_type b, Targs... bytes) noexcept
+ {
+  static_assert(sizeof...(Targs) < 4);
+  if constexpr (sizeof...(Targs) == 0) {
+   return b; // single code unit: returned unchanged
+  } else {
+   return decode_utf8_leading_byte<sizeof...(Targs) + 1>(b) | decode_utf8_followup_byte(bytes...);
+  }
+ }
+
+ // Input iterator adaptor decoding UTF-8, UTF-16 or UTF-32 code units (selected by sizeof(T)) into Unicode values.
+ // Unusual protocol: operator*() decodes AND advances, operator++() does nothing; malformed input throws std::invalid_argument.
+ template<typename T, typename Container=std::basic_string<T>>
+ struct utf_iterator
+ {
+  static_assert(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4);
+
+  typedef T value_type;
+  typedef char32_t internal_type; // decoded Unicode value, as returned by operator*()
+  typedef char32_t& reference;
+  typedef char32_t* pointer;
+  typedef size_t difference_type;
+  typedef std::input_iterator_tag iterator_category;
+  typedef Container string_type;
+
+  utf_iterator(const typename string_type::const_iterator& cbegin, const typename string_type::const_iterator& cend):
+   iterator(cbegin), end_iterator(cend) // the end is kept to detect truncated sequences
+  {
+  }
+
+  utf_iterator(const utf_iterator& other) = default;
+  utf_iterator& operator=(const utf_iterator& other) = default;
+
+  inline size_t remaining_code_units() const noexcept // code units between read position and end of input
+  {
+   return std::distance(iterator, end_iterator);
+  }
+
+  template<size_t index> // code unit at read position + index; the caller has to make sure that it exists
+  inline value_type get_code_unit() const noexcept
+  {
+   if constexpr (std::is_same_v<Container, typename std::list<value_type>>) {
+    // std::list doesn't support it + n
+    auto it{iterator};
+    std::advance(it, index);
+    return *it;
+   } else {
+    return *(iterator + index);
+   }
+  }
+
+  template<typename... Tbytes> // bytes: the code units read so far, starting at the read position
+  inline internal_type calculate_utf8_value(Tbytes... bytes)
+  {
+   size_t constexpr sequence_length{sizeof...(Tbytes)};
+   static_assert(sequence_length >= 1 && sequence_length <= 4);
+
+   if (is_utf8_sequence(bytes...)) {
+    std::advance(iterator, sequence_length);
+    internal_type result{decode_utf8_sequence(bytes...)};
+    if (!unicode::is_valid_unicode<sequence_length * 6>(result))
+     throw std::invalid_argument("Invalid Unicode character: "s + std::to_string(static_cast<uint32_t>(result)));
+    return result;
+   } else {
+    if constexpr(sequence_length <= 3) { // template recursion break condition: UTF-8 has 1..4 code units
+     if (remaining_code_units() <= sequence_length) // checked before get_code_unit() reads: the end position is never dereferenced
+      throw std::invalid_argument("Bad input: Not enough bytes left for decoding UTF-8 sequence");
+     return calculate_utf8_value(bytes..., static_cast<utf8_t>(get_code_unit<sequence_length>())); // retry with one more code unit
+    } else {
+     throw std::invalid_argument("Bad UTF-8 input: Invalid 4 byte sequence");
+    }
+   }
+  }
+
+  template<class X = value_type, typename std::enable_if_t<(sizeof(X) == 1), bool> = true> // UTF-8
+  inline internal_type calculate_value()
+  {
+   return calculate_utf8_value(static_cast<utf8_t>(get_code_unit<0>()));
+  }
+
+  template<class X = value_type, typename std::enable_if_t<(sizeof(X) == 2), bool> = true> // UTF-16
+  inline internal_type calculate_value()
+  {
+   char16_t unit0 {static_cast<char16_t>(get_code_unit<0>())};
+
+   if (is_valid_unicode(unit0)) { // 1 unit (BMP Basic Multilingual Plane)
+    std::advance(iterator, 1);
+    return unit0;
+   } else {
+    if (remaining_code_units() < 2)
+     throw std::invalid_argument("Bad input: Continuation of first UTF-16 unit missing");
+
+    char16_t unit1 {static_cast<char16_t>(get_code_unit<1>())};
+    if ((unit0 & 0xFC00) != 0xD800 || (unit1 & 0xFC00) != 0xDC00)
+     throw std::invalid_argument("Bad input: 2 malformed UTF-16 surrogates");
+
+    std::advance(iterator, 2);
+    return (static_cast<internal_type>(unit0 & 0x03FF) << 10 | (unit1 & 0x03FF)) + 0x10000;
+   }
+  }
+
+  template<class X = value_type, typename std::enable_if_t<(sizeof(X) == 4), bool> = true> // UTF-32
+  inline internal_type calculate_value()
+  {
+   internal_type result {static_cast<internal_type>(get_code_unit<0>())};
+
+   if (!unicode::is_valid_unicode(result))
+    throw std::invalid_argument("Invalid Unicode character: "s + std::to_string(static_cast<uint32_t>(result)));
+
+   std::advance(iterator, 1);
+
+   return result;
+  }
+
+  // pre-increment: no-op, since operator*() already advances behind the decoded character
+  utf_iterator& operator++()
+  {
+   return *this;
+  }
+
+  bool operator!=(const utf_iterator& other) const // unequal if the numbers of remaining code units differ
+  {
+   return std::distance(iterator, end_iterator) != std::distance(other.iterator, other.end_iterator);
+  }
+
+  internal_type operator*() // decodes the character at the read position and advances behind it
+  {
+   return calculate_value();
+  }
+
+  utf_iterator& operator+=(size_t distance) // skips distance code units (not characters)
+  {
+   std::advance(iterator, distance);
+   return *this;
+  }
+
+  size_t operator-(const utf_iterator& other) const // code units from other up to this
+  {
+   return std::distance(other.iterator, iterator); // unlike operator-, also available for std::list iterators
+  }
+
+ private:
+  typename string_type::const_iterator iterator; // read position
+  typename string_type::const_iterator end_iterator; // end of input
+ };
+
+ // leading byte of an n byte UTF-8 sequence: n one bits, a zero bit, then the topmost payload bits of value
+ template<size_t n, typename From, typename To>
+ inline To utf8_byte0_of(const From& value)
+ {
+  return (0xFF << (8 - n)) | (value >> (6 * (n - 1)));
+ }
+
+ // continuation byte 10xxxxxx carrying the n-th group of 6 bits of value, groups counted from bit 0
+ template<size_t n, typename From, typename To>
+ inline To utf8_trailing_byte(const From& value)
+ {
+  return 0x80 | ((value >> (6 * n)) & 0x3F);
+ }
+
+ // Byte n (counting from 0) of the m byte UTF-8 sequence encoding value, for m >= 2 (i.e. non-ASCII).
+ // value is assumed to be a valid Unicode value for the given byte position; it is not checked.
+ template<size_t n, size_t m, typename From, typename To>
+ inline To utf8_byte_n_of_m(const From& value)
+ {
+  if constexpr (n != 0)
+   return utf8_trailing_byte<m - n - 1, From, To>(value); // later bytes carry the less significant 6 bit groups
+  else
+   return utf8_byte0_of<m, From, To>(value);
+ }
+
+ template<typename T, typename Container=std::basic_string<T>> // T: code unit type of the output, its size selects UTF-8, UTF-16 or UTF-32
+ struct utf_back_insert_iterator // output iterator adaptor: takes Unicode values, appends their UTF code units to a Container
+ {
+  static_assert(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4);
+
+  typedef T value_type;
+  typedef char32_t internal_type; // type of the Unicode values accepted
+  typedef Container string_type;
+  typedef utf_back_insert_iterator& reference;
+  typedef utf_back_insert_iterator* pointer;
+  typedef size_t difference_type;
+  typedef std::output_iterator_tag iterator_category;
+
+  utf_back_insert_iterator(string_type& s): s(s) {} // s: container to append to; only a reference is kept
+
+  utf_back_insert_iterator& operator=(const utf_back_insert_iterator& other) // the target is a reference and can't be rebound: only "assignment" of the same target is accepted
+  {
+   if (std::addressof(other.s) != std::addressof(s))
+    throw std::runtime_error("utf_back_insert_iterator assignment operator actually called! Iterator should not be assigned to.");
+
+   return *this;
+  }
+
+  // no-op: appending needs no position to be advanced
+  reference operator++()
+  {
+   return *this;
+  }
+
+  // support *x = value, together with operator=()
+  reference operator*()
+  {
+   return *this;
+  }
+
+  template<typename... Args> // args: the code units of one character, in output order
+  inline void append(Args&&... args)
+  {
+   if constexpr (std::is_same_v<Container, typename std::basic_string<value_type>>) {
+    s.append({args...}); // basic_string: all code units in a single call via initializer list
+   } else {
+    (s.emplace_back(args), ...); // other containers: one emplace_back() per code unit, from left to right
+   }
+  }
+
+  template<class X = value_type, typename std::enable_if_t<(sizeof(X) == 1), bool> = true> // UTF-8 output
+  inline void append_utf(const internal_type& value)
+  {
+   using Y = internal_type;
+   if (value < 0x80) { // 1 byte
+    append(static_cast<value_type>(value));
+   } else if (value < 0x800) { // 2 bytes
+    append(utf8_byte_n_of_m<0,2,Y,X>(value), utf8_byte_n_of_m<1,2,Y,X>(value));
+   } else if (value < 0x10000) { // 3 bytes
+    append(utf8_byte_n_of_m<0,3,Y,X>(value), utf8_byte_n_of_m<1,3,Y,X>(value), utf8_byte_n_of_m<2,3,Y,X>(value));
+   } else { // 4 bytes
+    // expect value to be already valid Unicode values (checked in input iterator)
+    append(utf8_byte_n_of_m<0,4,Y,X>(value), utf8_byte_n_of_m<1,4,Y,X>(value), utf8_byte_n_of_m<2,4,Y,X>(value), utf8_byte_n_of_m<3,4,Y,X>(value));
+   }
+  }
+
+  template<class X = value_type, typename std::enable_if_t<(sizeof(X) == 2), bool> = true> // UTF-16 output
+  inline void append_utf(const internal_type& value)
+  {
+   if (value <= 0xFFFF) { // expect value to be already valid Unicode values (checked in input iterator)
+    append(static_cast<value_type>(value));
+   } else {
+    internal_type value_reduced{value - 0x10000}; // the 20 bits to be spread over the surrogate pair
+    append(static_cast<value_type>((value_reduced >> 10) + 0xD800), static_cast<value_type>((value_reduced & 0x3FF) + 0xDC00));
+   }
+  }
+
+  template<class X = value_type, typename std::enable_if_t<(sizeof(X) == 4), bool> = true> // UTF-32 output
+  inline void append_utf(const internal_type& value)
+  {
+   // expect value to be already valid Unicode values (checked in input iterator)
+   append(static_cast<value_type>(value));
+  }
+
+  reference operator=(const internal_type& value) // encodes value and appends its code units to the container
+  {
+   append_utf(value);
+   return *this;
+  }
+
+ private:
+  typename utf_back_insert_iterator::string_type& s; // container receiving the output
+ };
+
+} // namespace unicode::detail
+
+namespace unicode {
+ 
+ // Encoding for convert() and UTF-*: bundles the iterator types reading and writing such strings
+ template<typename InputIt, typename OutputIt>
+ struct UTF
+ {
+  using value_type = typename OutputIt::value_type;
+  using string_type = typename InputIt::string_type;
+
+  static InputIt begin(const string_type& s) // reading iterator at the start of s; it is also told the end of s
+  {
+   return InputIt{s.cbegin(), s.cend()};
+  }
+
+  static InputIt end(const string_type& s) // matching past-the-end iterator
+  {
+   return InputIt{s.cend(), s.cend()};
+  }
+
+  static OutputIt back_inserter(typename OutputIt::string_type& s) // writing iterator appending to s
+  {
+   return OutputIt(s);
+  }
+ };
+
+ // Encodings for convert(): UTF-8, UTF-16 and UTF-32 on std::basic_string of the matching code unit type
+ using UTF_8 = UTF<utf_iterator<utf8_t>, utf_back_insert_iterator<utf8_t>>;
+ using UTF_16 = UTF<utf_iterator<char16_t>, utf_back_insert_iterator<char16_t>>;
+ using UTF_32 = UTF<utf_iterator<char32_t>, utf_back_insert_iterator<char32_t>>;
+
+ // Maps a char type to its Encoding, e.g. Encoding<typename decltype(s)::value_type>::type or Encoding_t<typename decltype(s)::value_type>
+ template<typename T>
+ struct Encoding // primary template: no member type, i.e. no Encoding known for T
+ {
+ };
+
+ template<>
+ struct Encoding<utf8_t>
+ {
+  using type = UTF_8;
+ };
+
+ template<>
+ struct Encoding<char16_t>
+ {
+  using type = UTF_16;
+ };
+
+ template<>
+ struct Encoding<char32_t>
+ {
+  using type = UTF_32;
+ };
+
+ template<typename T> // shortcut for typename Encoding<T>::type
+ using Encoding_t = typename Encoding<T>::type;
+
+} // namespace unicode
+
author	Roland Reichwein <mail@reichwein.it>	2022-01-01 20:25:34 +0100
committer	Roland Reichwein <mail@reichwein.it>	2022-01-01 20:25:34 +0100
commit	52d4375b10d920a59f1309c272a2e525feb1c25d (patch)
tree	9d5417a9d214f4b0ba68b75e8908e28da46dd5c8
parent	ae7b430afd1239947b8f8b2d9dc0ca72dbce91ac (diff)