From ac045216d6e7fcb0ec4d2169ac2b6dffbe21707a Mon Sep 17 00:00:00 2001 From: Roland Reichwein Date: Sun, 2 Jan 2022 15:02:59 +0100 Subject: Remove dead code from optimizations --- include/unicode.h | 129 ++++++++---------------------------------- include/unicode/type_traits.h | 6 +- include/unicode/utf.h | 5 -- include/unicode/utf_fwd.h | 7 +++ 4 files changed, 35 insertions(+), 112 deletions(-) diff --git a/include/unicode.h b/include/unicode.h index eb872ec..2bf17f4 100644 --- a/include/unicode.h +++ b/include/unicode.h @@ -214,7 +214,6 @@ namespace unicode { template, bool> = true> inline static void append_accu(std::basic_string& result, uint64_t& accu, int& bytes_in_accu) { -#if 1 if (block_mode && bytes_in_accu == 8 && (accu & 0x8080808080808080) == 0) { result.append({ static_cast(accu & 0x7F), @@ -229,7 +228,6 @@ namespace unicode { accu = 0; bytes_in_accu = 0; } else -#endif if ((accu & 0x80) == 0) { // 1 byte sequence append_utf<7>(result, static_cast(accu & 0x7F)); accu >>= 8; @@ -262,116 +260,39 @@ namespace unicode { throw std::invalid_argument("Invalid UTF-8 byte sequence"); } - // Little Endian optimized version for UTF-16 - // In block_mode, at least 4 bytes are in accu. On first call, even 8. - // otherwise, at least one code unit is in accu - template, bool> = true> - inline static void append_accu(std::basic_string& result, uint64_t& accu, int& bytes_in_accu) - { -#if 1 - if ((accu & 0xFF80FF80FF80FF80) == 0) { - auto number_of_values{bytes_in_accu / sizeof(From)}; - result.resize(result.size() + number_of_values); - for (int i = 0; i < number_of_values; i++) { - result[result.size() - number_of_values + i] = static_cast(accu & 0x7F); - accu >>= 16; - } - bytes_in_accu = 0; - } else -#endif - if ((accu & 0xFC00FC00FC00FC00) == 0xDC00D800DC00D800) { - // found 4 code units forming 3 code points in UTF-16; - // by definition of UTF-16, we have valid unicode values at this point - if constexpr(is_utf_32_v) { - //result.resize(result.size() + 2); - //*reinterpret_cast(&result[result.size() - 2]) = (((accu & 0x03FF000003FF) << 10) | ((accu >> 16) & 0x03FF000003FF)) + 0x0001000000010000; - result.append({ - static_cast(((accu & 0x03FF) << 10 | ((accu >> 16) & 0x03FF)) + 0x10000), - static_cast(((accu & 0x03FF00000000) >> 22 | ((accu >> 48) & 0x03FF)) + 0x10000) - }); - } else { - append_utf(result, ((accu & 0x03FF) << 10 | ((accu >> 16) & 0x03FF)) + 0x10000); - append_utf(result, ((accu & 0x03FF00000000) >> 22 | ((accu >> 48) & 0x03FF)) + 0x10000); - } - accu = 0; - bytes_in_accu = 0; - } else - if (From unit0 {static_cast(accu & 0xFFFF)}; is_valid_unicode<16>(unit0)) { - append_utf<16>(result, unit0); - accu >>= 16; - bytes_in_accu -= 2; - } else - if ((accu & 0xFC00FC00) == 0xDC00D800) { - // found 2 code units forming 1 code point in UTF-16; - // by definition of UTF-16, we have a valid unicode value at this point - append_utf(result, ((accu & 0x03FF) << 10 | ((accu >> 16) & 0x03FF)) + 0x10000); - accu >>= 32; - bytes_in_accu -= 4; - } else - throw std::invalid_argument("Bad input: Continuation of first UTF-16 unit missing"); - } - // Little Endian optimized version template && is_encoding_v, bool> = true> typename To::string_type convert_optimized_utf(const typename From::string_type& s) { typename To::string_type result; - if constexpr(is_utf_32_v) { - for (const auto value: s) { - if (is_valid_unicode(value)) - append_utf(result, value); - else - throw std::invalid_argument("Invalid Unicode character in UTF-32"); + uint64_t accu{}; + int bytes_in_accu{}; + + size_t s_index{}; + size_t s_size{s.size()}; + while (s_index + 8 / sizeof(typename From::value_type) <= s_size) { + // read input + // assume: bytes_in_accu < 8 + accu |= (*reinterpret_cast(&(s.data()[s_index]))) << (bytes_in_accu * 8); + s_index += (8 - bytes_in_accu) / sizeof(typename From::value_type); + bytes_in_accu = 8; + + while (bytes_in_accu >= 4) { + append_accu(result, accu, bytes_in_accu); } -#if 0 - } else if constexpr(is_utf_16_v) { - for (int i = 0; i < s.size(); i++) { - typename From::value_type unit0{s[i]}; - if (is_valid_unicode(unit0)) { - append_utf(result, unit0); - } else { - i++; - if (i < s.size()) { - typename From::value_type unit1 {s[i]}; - if ((unit0 & 0xFC00) != 0xD800 || (unit1 & 0xFC00) != 0xDC00) - throw std::invalid_argument("Bad input: 2 malformed UTF-16 surrogates"); - - append_utf(result, (static_cast(unit0 & 0x03FF) << 10 | (unit1 & 0x03FF)) + 0x10000); - } else - throw std::invalid_argument("Invalid code unit at end of UTF-16 string"); - } - } -#endif - } else { - uint64_t accu{}; - int bytes_in_accu{}; - - size_t s_index{}; - size_t s_size{s.size()}; - while (s_index + 8 / sizeof(typename From::value_type) <= s_size) { - // read input - // assume: bytes_in_accu < 8 - accu |= (*reinterpret_cast(&(s.data()[s_index]))) << (bytes_in_accu * 8); - s_index += (8 - bytes_in_accu) / sizeof(typename From::value_type); - bytes_in_accu = 8; - - while (bytes_in_accu >= 4) { - append_accu(result, accu, bytes_in_accu); - } - } - - // 0..3 bytes left in accu - // 0..7 bytes left in s + } - while (s_index < s_size || bytes_in_accu > 0) { - while (s_index < s_size && bytes_in_accu < 8) { - accu |= static_cast(*reinterpret_cast(&(s.data()[s_index]))) << (bytes_in_accu * 8); - ++s_index; - bytes_in_accu += sizeof(typename From::value_type); - } + // 0..3 bytes left in accu + // 0..7 bytes left in s - append_accu(result, accu, bytes_in_accu); + while (s_index < s_size || bytes_in_accu > 0) { + while (s_index < s_size && bytes_in_accu < 8) { + accu |= static_cast(*reinterpret_cast(&(s.data()[s_index]))) << (bytes_in_accu * 8); + ++s_index; + bytes_in_accu += sizeof(typename From::value_type); } + + append_accu(result, accu, bytes_in_accu); } return result; } @@ -408,7 +329,7 @@ namespace unicode { ToContainer convert(const FromContainer& s) { typedef UTF, utf_back_insert_iterator> UTF_Trait; - + ToContainer result; std::copy(UTF_Trait::begin(s), UTF_Trait::end(s), UTF_Trait::back_inserter(result)); diff --git a/include/unicode/type_traits.h b/include/unicode/type_traits.h index c3507e7..63c7d69 100644 --- a/include/unicode/type_traits.h +++ b/include/unicode/type_traits.h @@ -50,7 +50,7 @@ namespace unicode { template struct is_utf_8 { - static const bool value{std::is_trivial_v && sizeof(T) == 1}; + static const bool value{std::is_same_v || (std::is_trivial_v && sizeof(T) == 1)}; }; template @@ -59,7 +59,7 @@ namespace unicode { template struct is_utf_16 { - static const bool value{std::is_trivial_v && sizeof(T) == 2}; + static const bool value{std::is_same_v || (std::is_trivial_v && sizeof(T) == 2)}; }; template @@ -68,7 +68,7 @@ namespace unicode { template struct is_utf_32 { - static const bool value{std::is_trivial_v && sizeof(T) == 4}; + static const bool value{std::is_same_v || (std::is_trivial_v && sizeof(T) == 4)}; }; template diff --git a/include/unicode/utf.h b/include/unicode/utf.h index 81e8f2b..046d9c6 100644 --- a/include/unicode/utf.h +++ b/include/unicode/utf.h @@ -415,11 +415,6 @@ namespace unicode { } }; - // Encoding for convert() - typedef UTF, utf_back_insert_iterator> UTF_8; - typedef UTF, utf_back_insert_iterator> UTF_16; - typedef UTF, utf_back_insert_iterator> UTF_32; - // Helper to get correct Encoding from char type, e.g. Encoding::type or Encoding_t template struct Encoding diff --git a/include/unicode/utf_fwd.h b/include/unicode/utf_fwd.h index f3f6c52..c42dea1 100644 --- a/include/unicode/utf_fwd.h +++ b/include/unicode/utf_fwd.h @@ -2,6 +2,8 @@ // Forward declarations +#include "types.h" + #include namespace unicode::detail { @@ -19,5 +21,10 @@ namespace unicode { template struct UTF; + // Encoding for convert() + typedef UTF, utf_back_insert_iterator> UTF_8; + typedef UTF, utf_back_insert_iterator> UTF_16; + typedef UTF, utf_back_insert_iterator> UTF_32; + } // namespace unicode -- cgit v1.2.3