// libunicode
//
// Author: Roland Reichwein
//
// Available under the conditions of CC0 1.0 Universal
// https://creativecommons.org/publicdomain/zero/1.0/

#pragma once

#include <algorithm>
#include <array>
#include <cstddef>
#include <cstdint>
#include <iterator>
#include <list>
#include <memory>
#include <stdexcept>
#include <string>
#include <type_traits>
#include <unordered_map>

#ifdef __cpp_char8_t
// char8_t available
typedef char8_t utf8_t;
typedef char iso_t;
#else
typedef char utf8_t;
typedef char iso_t;
#endif

namespace unicode {

// Returns true if value is a Unicode scalar value, i.e. at most U+10FFFF and
// outside the surrogate range U+D800..U+DFFF.
// T is usually char32_t, uint32_t etc.
template<typename T>
static inline bool is_valid_unicode(const T& value)
{
    return value <= 0x10FFFF && (value <= 0xD7FF || value >= 0xE000);
}

} // namespace unicode

namespace unicode::detail {

using namespace std::string_literals;

// Input iterator decoding a sequence of UTF-8/16/32 code units into code points.
//
// The encoding is selected by sizeof(T): 1 = UTF-8, 2 = UTF-16, 4 = UTF-32.
// Decoding happens eagerly: in the constructor and in operator++(). Both throw
// std::invalid_argument on malformed input.
template<typename T, typename Container = std::basic_string<T>>
struct utf_iterator
{
    static_assert(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4);

    using input_type = T;
    using value_type = char32_t;
    using reference = char32_t&;
    using pointer = char32_t*;
    using difference_type = std::size_t;
    using iterator_category = std::input_iterator_tag;
    using string_type = Container;

    // Decodes the first code point of [cbegin, cend), if any.
    utf_iterator(const typename string_type::const_iterator& cbegin,
                 const typename string_type::const_iterator& cend)
        : iterator(cbegin), end_iterator(cend)
    {
        calculate_value();
    }

    utf_iterator(const utf_iterator& other) = default;
    utf_iterator& operator=(const utf_iterator& other) = default;

    // Number of code units between the current position and the end.
    std::size_t remaining_code_units() const
    {
        return std::distance(iterator, end_iterator);
    }

    // Code unit at offset index from the current position (no bounds check).
    template<std::size_t index>
    T get_code_unit() const
    {
        if constexpr (std::is_same<Container, typename std::list<T>>::value) {
            // std::list doesn't support it + n
            auto it{iterator};
            std::advance(it, index);
            return *it;
        } else {
            return *(iterator + index);
        }
    }

    // True if b has the form 10xxxxxx.
    static bool is_continuation_byte(T b)
    {
        return (b & 0b11000000) == 0b10000000;
    }

    // True if all given bytes have the form 10xxxxxx.
    template<typename... Targs>
    static bool is_continuation_byte(T b, Targs... Fargs)
    {
        return is_continuation_byte(b) && is_continuation_byte(Fargs...);
    }

    // True if b is the lead byte of an n byte UTF-8 sequence (n = 2..4).
    template<std::size_t n>
    static bool is_byte0_of(T b)
    {
        return (b & static_cast<T>(0xFF << (7 - n))) == static_cast<T>(0xFF << (8 - n));
    }

    // Payload (lower 6 bits) of a continuation byte.
    static char32_t continuation_value(T b)
    {
        return static_cast<char32_t>(b & 0b00111111);
    }

    // Combined payload of several continuation bytes, first byte most significant.
    template<typename... Targs>
    static char32_t continuation_value(T b, Targs... Fargs)
    {
        return continuation_value(b) << (6 * sizeof...(Targs)) | continuation_value(Fargs...);
    }

    // Payload of the lead byte of an n byte sequence, shifted to its final position.
    template<std::size_t n>
    static char32_t value_byte0_of(T b)
    {
        return static_cast<char32_t>(b & (0b1111111 >> n)) << ((n - 1) * 6);
    }

    // Decodes one UTF-8 sequence at the current position into value/sequence_length.
    // Does nothing at the end of input. Throws std::invalid_argument on truncated,
    // malformed or overlong sequences and on results that are no Unicode scalar value.
    void calculate_value_utf8()
    {
        const std::size_t remaining{remaining_code_units()};
        if (!remaining)
            return;

        const utf8_t byte0{static_cast<utf8_t>(get_code_unit<0>())};
        if (!(byte0 & 0x80)) {
            // 1 byte: 7 bit ASCII, always compliant
            value = byte0;
            sequence_length = 1;
            return;
        }

        // 2-4 bytes. Decode into locals first so that members are only
        // modified once the sequence has been validated.
        char32_t decoded{};
        char32_t minimum{}; // smallest code point that needs this sequence length
        std::size_t length{};

        if (remaining < 2)
            throw std::invalid_argument("Bad input: 2nd byte expected, none found");
        const utf8_t byte1{static_cast<utf8_t>(get_code_unit<1>())};

        if (is_byte0_of<2>(byte0) && is_continuation_byte(byte1)) {
            decoded = value_byte0_of<2>(byte0) | continuation_value(byte1);
            minimum = 0x80;
            length = 2;
        } else {
            if (remaining < 3)
                throw std::invalid_argument("Bad input: Invalid 2 byte sequence");
            const utf8_t byte2{static_cast<utf8_t>(get_code_unit<2>())};

            if (is_byte0_of<3>(byte0) && is_continuation_byte(byte1, byte2)) {
                decoded = value_byte0_of<3>(byte0) | continuation_value(byte1, byte2);
                minimum = 0x800;
                length = 3;
            } else {
                if (remaining < 4)
                    throw std::invalid_argument("Bad input: Invalid 3 byte sequence");
                const utf8_t byte3{static_cast<utf8_t>(get_code_unit<3>())};

                if (!is_byte0_of<4>(byte0) || !is_continuation_byte(byte1, byte2, byte3))
                    throw std::invalid_argument("Bad input: Invalid 4 byte sequence");

                decoded = value_byte0_of<4>(byte0) | continuation_value(byte1, byte2, byte3);
                minimum = 0x10000;
                length = 4;
            }
        }

        // A code point must use its shortest encoding; longer ("overlong")
        // forms such as C0 80 for U+0000 are ill-formed UTF-8.
        if (decoded < minimum)
            throw std::invalid_argument("Bad input: Overlong UTF-8 sequence");

        if (!unicode::is_valid_unicode(decoded))
            throw std::invalid_argument("Invalid Unicode character: "s + std::to_string(static_cast<std::uint32_t>(decoded)));

        value = decoded;
        sequence_length = length;
    }

    // Decodes one UTF-16 unit or surrogate pair at the current position.
    // Does nothing at the end of input. Throws std::invalid_argument on a
    // missing or malformed surrogate.
    void calculate_value_utf16()
    {
        const std::size_t remaining{remaining_code_units()};
        if (!remaining)
            return;

        const char16_t unit0{static_cast<char16_t>(get_code_unit<0>())};
        if (unit0 <= 0xD7FF || unit0 >= 0xE000) {
            // 1 unit (BMP Basic Multilingual Plane)
            value = unit0;
            sequence_length = 1;
            return;
        }

        if (remaining < 2)
            throw std::invalid_argument("Bad input: Continuation of first UTF-16 unit missing");

        const char16_t unit1{static_cast<char16_t>(get_code_unit<1>())};
        if ((unit0 & 0xFC00) != 0xD800 || (unit1 & 0xFC00) != 0xDC00)
            throw std::invalid_argument("Bad input: 2 malformed UTF-16 surrogates");

        value = (static_cast<char32_t>(unit0 & 0x03FF) << 10 | (unit1 & 0x03FF)) + 0x10000;
        sequence_length = 2;
    }

    // Reads one UTF-32 unit at the current position.
    // Does nothing at the end of input. Throws std::invalid_argument if the
    // unit is no Unicode scalar value.
    void calculate_value_utf32()
    {
        const std::size_t remaining{remaining_code_units()};
        if (!remaining)
            return;

        value = static_cast<char32_t>(get_code_unit<0>());

        if (!unicode::is_valid_unicode(value))
            throw std::invalid_argument("Invalid Unicode character: "s + std::to_string(static_cast<std::uint32_t>(value)));

        sequence_length = 1;
    }

    // set value member
    void calculate_value()
    {
        if constexpr (sizeof(T) == 1) {
            calculate_value_utf8();
        } else if constexpr (sizeof(T) == 2) {
            calculate_value_utf16();
        } else if constexpr (sizeof(T) == 4) {
            calculate_value_utf32();
        } else {
            throw std::runtime_error("Invalid character size: "s + std::to_string(sizeof(T)));
        }
    }

    // pre-increment: skips the current sequence and decodes the next one
    utf_iterator& operator++()
    {
        std::advance(iterator, sequence_length);
        calculate_value();
        return *this;
    }

    // Iterators compare by their distance to their respective end.
    bool operator!=(const utf_iterator& other) const
    {
        return std::distance(iterator, end_iterator) != std::distance(other.iterator, other.end_iterator);
    }

    reference operator*()
    {
        return value;
    }

private:
    typename string_type::const_iterator iterator;
    typename string_type::const_iterator end_iterator;

    char32_t value{}; // always save complete unicode code point at this point
    std::size_t sequence_length{};
};

// Output iterator encoding assigned code points as UTF-8/16/32 and appending
// the resulting code units to a container via push_back().
//
// The encoding is selected by sizeof(T): 1 = UTF-8, 2 = UTF-16, 4 = UTF-32.
template<typename T, typename Container = std::basic_string<T>>
struct utf_back_insert_iterator
{
    static_assert(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4);

    using value_type = T;
    using string_type = Container;
    using reference = utf_back_insert_iterator&;
    using pointer = utf_back_insert_iterator*;
    using difference_type = std::size_t;
    using iterator_category = std::output_iterator_tag;

    utf_back_insert_iterator(string_type& s): s(s) {}

    // The referenced container can't be rebound, so assignment is only
    // accepted (as a no-op) between iterators writing to the same container.
    utf_back_insert_iterator& operator=(const utf_back_insert_iterator& other)
    {
        if (std::addressof(other.s) != std::addressof(s))
            throw std::runtime_error("utf_back_insert_iterator assignment operator actually called! Iterator should not be assigned to.");
        return *this;
    }

    // no-op
    reference operator++()
    {
        return *this;
    }

    // support *x = value, together with operator=()
    reference operator*()
    {
        return *this;
    }

    // n is number of UTF-8 bytes in sequence
    template<std::size_t n>
    static T byte0_of(char32_t value)
    {
        // bits above the lowest 8 are cut off by the conversion to T
        return static_cast<T>((value >> 6 * (n - 1)) | (0xFF << (8 - n)));
    }

    // n is index of 6-bit groups, counting from bit 0
    template<std::size_t n>
    static T trailing_byte(char32_t value)
    {
        return static_cast<T>(((value >> n * 6) & 0b111111) | 0b10000000);
    }

    // calculate UTF-8 sequence byte for m >= 2 bytes sequences (i.e. non-ASCII)
    // assume value to be valid Unicode value for given byte position
    template<std::size_t n, std::size_t m>
    static T byte_n_of_m(char32_t value)
    {
        if constexpr (n == 0)
            return byte0_of<m>(value);
        else
            return trailing_byte<m - n - 1>(value);
    }

    // Appends value as 1-4 UTF-8 bytes.
    // Throws std::runtime_error for values above U+10FFFF.
    void append_utf8(const char32_t& value)
    {
        if (value < 0x80) { // 1 byte
            s.push_back(static_cast<T>(value));
        } else if (value < 0x800) { // 2 bytes
            s.push_back(byte_n_of_m<0,2>(value));
            s.push_back(byte_n_of_m<1,2>(value));
        } else if (value < 0x10000) { // 3 bytes
            s.push_back(byte_n_of_m<0,3>(value));
            s.push_back(byte_n_of_m<1,3>(value));
            s.push_back(byte_n_of_m<2,3>(value));
        } else if (value < 0x110000) { // 4 bytes
            s.push_back(byte_n_of_m<0,4>(value));
            s.push_back(byte_n_of_m<1,4>(value));
            s.push_back(byte_n_of_m<2,4>(value));
            s.push_back(byte_n_of_m<3,4>(value));
        } else {
            throw std::runtime_error("Invalid internal Unicode value: "s + std::to_string(static_cast<std::uint32_t>(value)));
        }
    }

    // Appends value as one UTF-16 unit or as a surrogate pair.
    void append_utf16(const char32_t& value)
    {
        if (value <= 0xFFFF) { // expect value to be already valid Unicode values
            s.push_back(static_cast<T>(value));
        } else {
            const char32_t value_reduced{value - 0x10000};
            s.push_back(static_cast<T>((value_reduced >> 10) + 0xD800));
            s.push_back(static_cast<T>((value_reduced & 0x3FF) + 0xDC00));
        }
    }

    // Appends value as one UTF-32 unit.
    void append_utf32(const char32_t& value)
    {
        // expect value to be already valid Unicode values
        s.push_back(static_cast<T>(value));
    }

    // Encodes value and appends it to the container.
    reference operator=(const char32_t& value)
    {
        if constexpr (sizeof(T) == 1) {
            append_utf8(value);
        } else if constexpr (sizeof(T) == 2) {
            append_utf16(value);
        } else if constexpr (sizeof(T) == 4) {
            append_utf32(value);
        } else {
            throw std::runtime_error("Invalid type size: "s + std::to_string(sizeof(T)));
        }
        return *this;
    }

private:
    typename utf_back_insert_iterator::string_type& s;
};

using iso_map_type = std::unordered_map<iso_t, char32_t>;
using iso_map_type_reverse = std::unordered_map<char32_t, iso_t>;

// ISO-8859-1 is lower 8-bit of Unicode, so no exceptions necessary
static inline iso_map_type iso_8859_1_map;

// ISO-8859-15 is lower 8-bit of Unicode, except for:
static inline iso_map_type iso_8859_15_map {
    { '\xA4', U'\u20AC' }, // €
    { '\xA6', U'\u0160' }, // Š
    { '\xA8', U'\u0161' }, // š
    { '\xB4', U'\u017D' }, // Ž
    { '\xB8', U'\u017E' }, // ž
    { '\xBC', U'\u0152' }, // Œ
    { '\xBD', U'\u0153' }, // œ
    { '\xBE', U'\u0178' }, // Ÿ
};

// Builds the code point -> byte map from a byte -> code point map.
inline iso_map_type_reverse reverse_iso_map(const iso_map_type& map)
{
    iso_map_type_reverse result;
    for (const auto& entry : map)
        result.emplace(entry.second, entry.first);
    return result;
}

static inline iso_map_type_reverse iso_8859_15_map_reverse { reverse_iso_map(iso_8859_15_map) };
static inline iso_map_type_reverse iso_8859_1_map_reverse { reverse_iso_map(iso_8859_1_map) };

// Returns true if s can be iterated completely via Facet without the input
// iterator throwing std::invalid_argument.
// The iterators are created inside the try block on purpose: constructing
// the begin iterator already decodes the first code point and may throw.
template<typename Facet, typename String>
bool is_valid_sequence(const String& s)
{
    try {
        std::for_each(Facet::begin(s), Facet::end(s), [](const char32_t&) {});
    } catch (const std::invalid_argument&) {
        return false;
    }
    return true;
}

} // namespace unicode::detail

namespace unicode {

using namespace detail;

// Input iterator decoding ISO-8859-* bytes into code points.
// Map lists the bytes that differ from ISO-8859-1 (= lower 8 bit of Unicode).
template<unicode::detail::iso_map_type& Map = iso_8859_1_map, typename Container = std::basic_string<iso_t>>
struct iso_iterator
{
    using input_type = iso_t;
    using value_type = char32_t;
    using reference = char32_t&;
    using pointer = char32_t*;
    using difference_type = std::size_t;
    using iterator_category = std::input_iterator_tag;
    using iterator = typename Container::const_iterator;
    using string_type = Container;

    iso_iterator(const iterator& it): m_it(it) {}

    // pre-increment
    iso_iterator& operator++()
    {
        ++m_it;
        return *this;
    }

    bool operator!=(const iso_iterator& other) const
    {
        return m_it != other.m_it;
    }

    // Returns the code point by value.
    value_type operator*()
    {
        const input_type value{*m_it};

        if constexpr (std::addressof(Map) != std::addressof(iso_8859_1_map)) // mapping of 128 <= x <= 255 needed
        {
            const auto it{Map.find(value)};
            if (it != Map.end())
                return it->second;
        }

        // via unsigned type, to prevent sign extension of bytes >= 0x80
        return static_cast<value_type>(static_cast<std::uint8_t>(value));
    }

private:
    iterator m_it;
};

// Output iterator encoding assigned code points as ISO-8859-* bytes and
// appending them to a container via push_back().
// Map lists the code points that differ from ISO-8859-1.
template<unicode::detail::iso_map_type_reverse& Map = iso_8859_1_map_reverse, typename Container = std::basic_string<iso_t>>
struct iso_back_insert_iterator
{
    using reference = iso_back_insert_iterator&;
    using pointer = iso_back_insert_iterator*;
    using difference_type = std::size_t;
    using value_type = iso_t;
    using iterator_category = std::output_iterator_tag;
    using string_type = Container;

    iso_back_insert_iterator(string_type& s): s(s) {}

    // The referenced container can't be rebound, so assignment is only
    // accepted (as a no-op) between iterators writing to the same container.
    iso_back_insert_iterator& operator=(const iso_back_insert_iterator& other)
    {
        if (std::addressof(other.s) != std::addressof(s))
            throw std::runtime_error("iso_back_insert_iterator assignment operator actually called! Iterator should not be assigned to.");
        return *this;
    }

    // no-op
    reference operator++()
    {
        return *this;
    }

    // support *x = value, together with operator=()
    reference operator*()
    {
        return *this;
    }

    // Encodes value and appends it to the container.
    // Throws std::invalid_argument if value is not representable in the
    // target encoding.
    reference operator=(const char32_t& value)
    {
        if constexpr (std::addressof(Map) != std::addressof(iso_8859_1_map_reverse)) // mapping of 128 <= x <= 255 needed
        {
            const auto it{Map.find(value)};
            if (it != Map.end()) {
                s.push_back(it->second);
                return *this;
            }

            // A byte that is assigned to a mapped character (e.g. 0xA4 to the
            // Euro sign in ISO-8859-15) no longer stands for its Latin-1 code
            // point. Writing it anyway would silently change the character.
            if (value <= 255) {
                const iso_t latin1_byte{static_cast<iso_t>(value)};
                const bool reassigned{std::any_of(Map.cbegin(), Map.cend(),
                    [latin1_byte](const auto& entry) { return entry.second == latin1_byte; })};
                if (reassigned)
                    throw std::invalid_argument("Bad Unicode value not representable in target encoding: "s + std::to_string(static_cast<std::uint32_t>(value)));
            }
        }

        if (value > 255)
            throw std::invalid_argument("Bad Unicode value above 255: "s + std::to_string(static_cast<std::uint32_t>(value)));

        s.push_back(static_cast<iso_t>(value));
        return *this;
    }

private:
    typename iso_back_insert_iterator::string_type& s;
};

// Facet for convert() and ISO-8859-*
template<typename InputIt, typename OutputIt>
struct ISO_8859
{
    using value_type = iso_t;
    using string_type = typename InputIt::string_type;

    static InputIt begin(const typename InputIt::string_type& s)
    {
        return InputIt(s.cbegin());
    }

    static InputIt end(const typename InputIt::string_type& s)
    {
        return InputIt(s.cend());
    }

    static OutputIt back_inserter(typename OutputIt::string_type& s)
    {
        return OutputIt(s);
    }
};

// Facet for convert() and UTF-*
template<typename InputIt, typename OutputIt>
struct UTF
{
    using value_type = typename OutputIt::value_type;
    using string_type = typename InputIt::string_type;

    static InputIt begin(const typename InputIt::string_type& s)
    {
        return InputIt{s.cbegin(), s.cend()};
    }

    static InputIt end(const typename InputIt::string_type& s)
    {
        return InputIt{s.cend(), s.cend()};
    }

    static OutputIt back_inserter(typename OutputIt::string_type& s)
    {
        return OutputIt(s);
    }
};

// Facets for convert()
using ISO_8859_1 = ISO_8859<iso_iterator<>, iso_back_insert_iterator<>>;
using ISO_8859_15 = ISO_8859<iso_iterator<iso_8859_15_map>, iso_back_insert_iterator<iso_8859_15_map_reverse>>;

using UTF_8 = UTF<utf_iterator<utf8_t>, utf_back_insert_iterator<utf8_t>>;
using UTF_16 = UTF<utf_iterator<char16_t>, utf_back_insert_iterator<char16_t>>;
using UTF_32 = UTF<utf_iterator<char32_t>, utf_back_insert_iterator<char32_t>>;

// From and To are facets
// Throws std::invalid_argument on input that is malformed or not
// representable in the target encoding.
template<typename From, typename To, std::enable_if_t<std::is_empty<From>::value, bool> = true>
typename To::string_type convert(const typename From::string_type& s)
{
    typename To::string_type result;

    std::copy(From::begin(s), From::end(s), To::back_inserter(result));

    return result;
}

// Helper to get correct Facet from char type, e.g. Encoding<typename decltype(s)::value_type>::Facet
template<typename T>
struct Encoding
{
};

template<>
struct Encoding<utf8_t>
{
    using Facet = UTF_8;
};

template<>
struct Encoding<char16_t>
{
    using Facet = UTF_16;
};

template<>
struct Encoding<char32_t>
{
    using Facet = UTF_32;
};

// From and To are from: utf8_t (i.e. char or char8_t (C++20)), char16_t and char32_t, char, wchar_t, uint8_t, uint16_t, uint32_t
// Throws std::invalid_argument on malformed input.
template<typename From, typename To,
         typename FromContainer = std::basic_string<From>,
         typename ToContainer = std::basic_string<To>,
         std::enable_if_t<std::is_scalar<From>::value && std::is_scalar<To>::value && !std::is_empty<From>::value, bool> = true>
ToContainer convert(const FromContainer& s)
{
    using UTF_Trait = UTF<utf_iterator<From, FromContainer>, utf_back_insert_iterator<To, ToContainer>>;

    ToContainer result;

    std::copy(UTF_Trait::begin(s), UTF_Trait::end(s), UTF_Trait::back_inserter(result));

    return result;
}

// From and To are containers
// Throws std::invalid_argument on malformed input.
template<typename FromContainer, typename ToContainer,
         std::enable_if_t<!std::is_scalar<FromContainer>::value && !std::is_empty<FromContainer>::value, bool> = true>
ToContainer convert(const FromContainer& s)
{
    using UTF_Trait = UTF<utf_iterator<typename FromContainer::value_type, FromContainer>,
                          utf_back_insert_iterator<typename ToContainer::value_type, ToContainer>>;

    ToContainer result;

    std::copy(UTF_Trait::begin(s), UTF_Trait::end(s), UTF_Trait::back_inserter(result));

    return result;
}

// Container version
template<typename Container,
         std::enable_if_t<!std::is_scalar<Container>::value && !std::is_empty<Container>::value, bool> = true>
bool is_valid_utf(const Container& s)
{
    using UTF_Trait = UTF<utf_iterator<typename Container::value_type, Container>,
                          utf_back_insert_iterator<typename Container::value_type, Container>>;

    return detail::is_valid_sequence<UTF_Trait>(s);
}

// basic type version
template<typename T, typename Container = std::basic_string<T>,
         std::enable_if_t<std::is_scalar<T>::value && !std::is_empty<T>::value, bool> = true>
bool is_valid_utf(const Container& s)
{
    using UTF_Trait = UTF<utf_iterator<T, Container>, utf_back_insert_iterator<T, Container>>;

    return detail::is_valid_sequence<UTF_Trait>(s);
}

// Facet version
template<typename Facet, std::enable_if_t<std::is_empty<Facet>::value, bool> = true>
bool is_valid_utf(const typename Facet::string_type& s)
{
    return detail::is_valid_sequence<Facet>(s);
}

} // namespace unicode