From c969cddf87a2c6d2eb74353f3115a70d166136e5 Mon Sep 17 00:00:00 2001 From: Roland Reichwein Date: Sat, 1 Jan 2022 21:02:15 +0100 Subject: Use own type traits --- Makefile | 6 ++++++ include/unicode.h | 30 +++++++++++++++--------------- include/unicode/type_traits.h | 2 +- include/unicode/utf.h | 29 ++++++++++++++++------------- include/unicode/utf_fwd.h | 23 +++++++++++++++++++++++ 5 files changed, 61 insertions(+), 29 deletions(-) create mode 100644 include/unicode/utf_fwd.h diff --git a/Makefile b/Makefile index 36c503d..02498b3 100644 --- a/Makefile +++ b/Makefile @@ -139,7 +139,13 @@ DISTFILES= \ src/file.h \ Makefile \ include/unicode.h \ + include/unicode/endian.h \ + include/unicode/iso.h \ + include/unicode/predicate.h \ + include/unicode/types.h \ include/unicode/type_traits.h \ + include/unicode/utf.h \ + include/unicode/utf_fwd.h \ debian/control \ debian/compat \ debian/copyright \ diff --git a/include/unicode.h b/include/unicode.h index a50f525..eb872ec 100644 --- a/include/unicode.h +++ b/include/unicode.h @@ -77,15 +77,15 @@ namespace unicode { { if constexpr(sizeof(input_value_type) == sizeof(typename output_string_type::value_type)) { s.append(reinterpret_cast(addr), accu_size / sizeof(input_value_type)); - } else if constexpr(sizeof(input_value_type) == 1) { + } else if constexpr(is_utf_8_v) { s.append({static_cast(addr[0]), static_cast(addr[1]), static_cast(addr[2]), static_cast(addr[3])}); - } else if constexpr(sizeof(input_value_type) == 2) { + } else if constexpr(is_utf_16_v) { s.append({static_cast(addr[0]), static_cast(addr[1])}); - } else if constexpr(sizeof(input_value_type) == 4) { + } else if constexpr(is_utf_32_v) { s.append({static_cast(addr[0])}); } } @@ -105,7 +105,7 @@ namespace unicode { { if constexpr(sizeof(input_value_type) == sizeof(typename output_string_type::value_type)) { s.append(reinterpret_cast(addr), accu_size / sizeof(input_value_type)); - } else if constexpr(sizeof(input_value_type) == 1) { + } else if constexpr(is_utf_8_v) { s.append({static_cast(addr[0]), static_cast(addr[1]), static_cast(addr[2]), @@ -114,12 +114,12 @@ namespace unicode { static_cast(addr[5]), static_cast(addr[6]), static_cast(addr[7])}); - } else if constexpr(sizeof(input_value_type) == 2) { + } else if constexpr(is_utf_16_v) { s.append({static_cast(addr[0]), static_cast(addr[1]), static_cast(addr[2]), static_cast(addr[3])}); - } else if constexpr(sizeof(input_value_type) == 4) { + } else if constexpr(is_utf_32_v) { s.append({static_cast(addr[0]), static_cast(addr[1])}); } @@ -174,7 +174,7 @@ namespace unicode { return result; } - template = true> + template, bool> = true> inline void append_utf(std::basic_string& result, const char32_t& value) { using From = char32_t; @@ -190,7 +190,7 @@ namespace unicode { } } - template = true> + template, bool> = true> inline void append_utf(std::basic_string& result, const char32_t& value) { if (bits_to_compare <= 16 || value <= 0xFFFF) { // expect value to be already valid Unicode values @@ -201,7 +201,7 @@ namespace unicode { } } - template = true> + template, bool> = true> inline void append_utf(std::basic_string& result, const char32_t& value) { // expect value to be already valid Unicode values (checked in input iterator) @@ -211,7 +211,7 @@ namespace unicode { // Little Endian optimized version for UTF-8 // In block_mode, at least 4 bytes are in accu. On first call, even 8. // otherwise, at least one code unit is in accu - template = true> + template, bool> = true> inline static void append_accu(std::basic_string& result, uint64_t& accu, int& bytes_in_accu) { #if 1 @@ -265,7 +265,7 @@ namespace unicode { // Little Endian optimized version for UTF-16 // In block_mode, at least 4 bytes are in accu. On first call, even 8. // otherwise, at least one code unit is in accu - template = true> + template, bool> = true> inline static void append_accu(std::basic_string& result, uint64_t& accu, int& bytes_in_accu) { #if 1 @@ -282,7 +282,7 @@ namespace unicode { if ((accu & 0xFC00FC00FC00FC00) == 0xDC00D800DC00D800) { // found 4 code units forming 3 code points in UTF-16; // by definition of UTF-16, we have valid unicode values at this point - if constexpr(sizeof(To) == 4) { + if constexpr(is_utf_32_v) { //result.resize(result.size() + 2); //*reinterpret_cast(&result[result.size() - 2]) = (((accu & 0x03FF000003FF) << 10) | ((accu >> 16) & 0x03FF000003FF)) + 0x0001000000010000; result.append({ @@ -316,7 +316,7 @@ namespace unicode { typename To::string_type convert_optimized_utf(const typename From::string_type& s) { typename To::string_type result; - if constexpr(sizeof(typename From::value_type) == 4) { + if constexpr(is_utf_32_v) { for (const auto value: s) { if (is_valid_unicode(value)) append_utf(result, value); @@ -324,7 +324,7 @@ namespace unicode { throw std::invalid_argument("Invalid Unicode character in UTF-32"); } #if 0 - } else if constexpr(sizeof(typename From::value_type) == 2) { + } else if constexpr(is_utf_16_v) { for (int i = 0; i < s.size(); i++) { typename From::value_type unit0{s[i]}; if (is_valid_unicode(unit0)) { @@ -388,7 +388,7 @@ namespace unicode { } else { throw std::invalid_argument("Invalid UTF input"); } - } else if constexpr(accu_size == 8 && is_little_endian() && sizeof(typename From::value_type) == 1 && + } else if constexpr(accu_size == 8 && is_little_endian() && is_utf_8_v && is_utf_encoding_v && is_utf_encoding_v) { // endian specific optimization return convert_optimized_utf(s); } else if constexpr(accu_size == 4 || accu_size == 8) { // accu size specific optimization with speedup for 7bit input diff --git a/include/unicode/type_traits.h b/include/unicode/type_traits.h index 3ee1d82..c3507e7 100644 --- a/include/unicode/type_traits.h +++ b/include/unicode/type_traits.h @@ -1,6 +1,6 @@ #pragma once -#include "utf.h" +#include "utf_fwd.h" #include #include diff --git a/include/unicode/utf.h b/include/unicode/utf.h index dd504a7..81e8f2b 100644 --- a/include/unicode/utf.h +++ b/include/unicode/utf.h @@ -1,5 +1,8 @@ #pragma once +#include "utf_fwd.h" +#include "type_traits.h" + #include #include #include @@ -37,7 +40,7 @@ namespace unicode::detail { (... && is_utf8_followup_byte(bytes)); // left fold for linear evaluation from left to right } - template = true> + template, bool> = true> inline bool validate_utf(const std::basic_string& s) { int i{}; @@ -78,7 +81,7 @@ namespace unicode::detail { } } - template = true> + template, bool> = true> inline bool validate_utf(const std::basic_string& s) { int i{}; @@ -95,7 +98,7 @@ namespace unicode::detail { return true; } - template = true> + template, bool> = true> inline bool validate_utf(const std::basic_string& s) { for (auto i: s) @@ -135,10 +138,10 @@ namespace unicode::detail { return decode_utf8_leading_byte(b) | decode_utf8_followup_byte(bytes...); } - template> + template struct utf_iterator { - static_assert(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4); + static_assert(is_utf_8_v || is_utf_16_v || is_utf_32_v); typedef T value_type; typedef char32_t internal_type; @@ -199,13 +202,13 @@ namespace unicode::detail { } } - template = true> + template, bool> = true> inline internal_type calculate_value() { return calculate_utf8_value(static_cast(get_code_unit<0>())); } - template = true> + template, bool> = true> inline internal_type calculate_value() { char16_t unit0 {static_cast(get_code_unit<0>())}; @@ -226,7 +229,7 @@ namespace unicode::detail { } } - template = true> + template, bool> = true> inline internal_type calculate_value() { internal_type result {static_cast(get_code_unit<0>())}; @@ -296,10 +299,10 @@ namespace unicode::detail { return utf8_trailing_byte(value); } - template> + template struct utf_back_insert_iterator { - static_assert(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4); + static_assert(is_utf_8_v || is_utf_16_v || is_utf_32_v); typedef T value_type; typedef char32_t internal_type; @@ -341,7 +344,7 @@ namespace unicode::detail { } } - template = true> + template, bool> = true> inline void append_utf(const internal_type& value) { using Y = internal_type; @@ -357,7 +360,7 @@ namespace unicode::detail { } } - template = true> + template, bool> = true> inline void append_utf(const internal_type& value) { if (value <= 0xFFFF) { // expect value to be already valid Unicode values (checked in input iterator) @@ -368,7 +371,7 @@ namespace unicode::detail { } } - template = true> + template, bool> = true> inline void append_utf(const internal_type& value) { // expect value to be already valid Unicode values (checked in input iterator) diff --git a/include/unicode/utf_fwd.h b/include/unicode/utf_fwd.h new file mode 100644 index 0000000..f3f6c52 --- /dev/null +++ b/include/unicode/utf_fwd.h @@ -0,0 +1,23 @@ +#pragma once + +// Forward declarations + +#include + +namespace unicode::detail { + + template> + struct utf_iterator; + + template> + struct utf_back_insert_iterator; + +} // namespace unicode::detail + +namespace unicode { + + template + struct UTF; + +} // namespace unicode + -- cgit v1.2.3