From e0e5623b46fdaa0988faa76af506d5bc1035ee42 Mon Sep 17 00:00:00 2001 From: Roland Reichwein Date: Sun, 26 Dec 2021 16:58:04 +0100 Subject: Optimization fixes --- include/unicode.h | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/include/unicode.h b/include/unicode.h index 3d6477c..395f172 100644 --- a/include/unicode.h +++ b/include/unicode.h @@ -31,6 +31,13 @@ namespace unicode { template static inline bool is_valid_unicode(const T& value) noexcept { + if constexpr(sizeof(T) == 1) + return true; + else if constexpr(sizeof(T) == 2) + //return value <= 0xD7FF || value >= 0xE000; + return (value & 0xF800) != 0xD800; + else + //return (value & 0xFFFFF800) != 0x0000D800 && (value >> 16) <= 0x10; return value <= 0xD7FF || (value >= 0xE000 && value <= 0x10FFFF); } @@ -161,7 +168,7 @@ namespace unicode::detail { { char16_t unit0 {static_cast(get_code_unit<0>())}; - if (unit0 <= 0xD7FF || unit0 >= 0xE000) { // 1 unit (BMP Basic Multilingual Plane) + if (is_valid_unicode(unit0)) { // 1 unit (BMP Basic Multilingual Plane) std::advance(iterator, 1); return unit0; } else { @@ -547,10 +554,16 @@ namespace unicode { // std::distance doesn't work here: it is based on "output" distance of iterators template - size_t input_distance(const Iterator& it1, const Iterator& it2) + inline size_t input_distance(const Iterator& it1, const Iterator& it2) { return it2 - it1; } + + template + inline size_t input_distance_bytes(const Iterator& it1, const Iterator& it2) + { + return input_distance(it1, it2) * sizeof(typename Iterator::value_type); + } // Optimizations following: static const size_t accu_size {sizeof(size_t)}; @@ -637,7 +650,8 @@ namespace unicode { static_cast(addr[1])}); } } - }; + + }; // class ArchitectureOptimizer // From and To are facets template::value, bool> = true> @@ -653,16 +667,17 @@ namespace unicode { auto end{From::end(s)}; auto back_inserter{To::back_inserter(result)}; auto addr{reinterpret_cast(&s.data()[s.size() - input_distance(begin, end)])}; - while (input_distance(begin, end) >= accu_size) { + while (input_distance_bytes(begin, end) >= accu_size) { if (((uintptr_t)(void*)addr & arch_optimizer::addr_mask) == 0) { - while (input_distance(begin, end) >= accu_size) { + while (input_distance_bytes(begin, end) >= accu_size) { typename arch_optimizer::accu_type data{*addr}; if ((data & arch_optimizer::ascii_mask) == arch_optimizer::ascii_value) { - arch_optimizer::template append(reinterpret_cast(addr), result); + arch_optimizer::template append(reinterpret_cast(addr), result); begin += accu_size / sizeof(typename From::value_type); ++addr; } else { - // just advance one code unit for now + // just advance one code unit for now and break to trigger unoptimized + // version until next accu boundary back_inserter = *begin; ++begin; break; -- cgit v1.2.3