From 65c8f3fa5856b216816a25c2d3dc9d87bd3896ee Mon Sep 17 00:00:00 2001 From: Roland Reichwein Date: Tue, 21 Dec 2021 20:11:06 +0100 Subject: Optimize on UTF input and output --- include/unicode.h | 69 ++++++++++++++++++++++++++----------------------------- 1 file changed, 33 insertions(+), 36 deletions(-) diff --git a/include/unicode.h b/include/unicode.h index 7965a6e..43dc44e 100644 --- a/include/unicode.h +++ b/include/unicode.h @@ -115,16 +115,10 @@ namespace unicode::detail { template::type = true> inline value_type calculate_value() { - size_t remaining{remaining_code_units()}; - - if (!remaining) - return {}; - - value_type value{}; - utf8_t byte0 {static_cast(get_code_unit<0>())}; if (byte0 & 0x80) { // 2-4 bytes - if (remaining >= 2) { + value_type value{}; + if (size_t remaining{remaining_code_units()}; remaining >= 2) { utf8_t byte1 {static_cast(get_code_unit<1>())}; if (is_byte0_of<2>(byte0) && is_continuation_byte(byte1)) { // 2 bytes value = value_byte0_of<2>(byte0) | continuation_value(byte1); @@ -152,29 +146,23 @@ namespace unicode::detail { if (!unicode::is_valid_unicode(value)) throw std::invalid_argument("Invalid Unicode character: "s + std::to_string(static_cast(value))); + return value; } else { // 1 byte: 7 bit ASCII - value = byte0; std::advance(iterator, 1); + return byte0; } - - return value; } template::type = true> inline value_type calculate_value() { - size_t remaining{remaining_code_units()}; - - if (!remaining) - return {}; - char16_t unit0 {static_cast(get_code_unit<0>())}; if (unit0 <= 0xD7FF || unit0 >= 0xE000) { // 1 unit (BMP Basic Multilingual Plane) std::advance(iterator, 1); return unit0; } else { - if (remaining < 2) + if (remaining_code_units() < 2) throw std::invalid_argument("Bad input: Continuation of first UTF-16 unit missing"); char16_t unit1 {static_cast(get_code_unit<1>())}; @@ -189,11 +177,6 @@ namespace unicode::detail { template::type = true> inline value_type calculate_value() { - size_t remaining{remaining_code_units()}; - - if (!remaining) - return {}; - value_type result {static_cast(get_code_unit<0>())}; if (!unicode::is_valid_unicode(result)) @@ -284,23 +267,38 @@ namespace unicode::detail { return trailing_byte(value); } + template + inline void append(Arg&& arg) + { + if constexpr (std::is_same>::value) { + s.append({arg}); + } else { + s.emplace_back(arg); + } + } + + template + inline void append(Arg&& arg, Args&&... args) + { + if constexpr (std::is_same>::value) { + s.append({arg, args...}); + } else { + s.emplace_back(arg); + append(args...); + } + } + template::type = true> inline void append_utf(const char32_t& value) { if (value < 0x80) { // 1 byte - s.push_back(static_cast(value)); + append(static_cast(value)); } else if (value < 0x800) { // 2 bytes - s.push_back(byte_n_of_m<0,2>(value)); - s.push_back(byte_n_of_m<1,2>(value)); + append(byte_n_of_m<0,2>(value), byte_n_of_m<1,2>(value)); } else if (value < 0x10000) { // 3 bytes - s.push_back(byte_n_of_m<0,3>(value)); - s.push_back(byte_n_of_m<1,3>(value)); - s.push_back(byte_n_of_m<2,3>(value)); + append(byte_n_of_m<0,3>(value), byte_n_of_m<1,3>(value), byte_n_of_m<2,3>(value)); } else if (value < 0x110000) { // 4 bytes - s.push_back(byte_n_of_m<0,4>(value)); - s.push_back(byte_n_of_m<1,4>(value)); - s.push_back(byte_n_of_m<2,4>(value)); - s.push_back(byte_n_of_m<3,4>(value)); + append(byte_n_of_m<0,4>(value), byte_n_of_m<1,4>(value), byte_n_of_m<2,4>(value), byte_n_of_m<3,4>(value)); } else throw std::runtime_error("Invalid internal Unicode value: "s + std::to_string(static_cast(value))); } @@ -309,11 +307,10 @@ namespace unicode::detail { inline void append_utf(const char32_t& value) { if (value <= 0xFFFF) { // expect value to be already valid Unicode values (checked in input iterator) - s.push_back(static_cast(value)); + append(static_cast(value)); } else { char32_t value_reduced{value - 0x10000}; - s.push_back((value_reduced >> 10) + 0xD800); - s.push_back((value_reduced & 0x3FF) + 0xDC00); + append(static_cast((value_reduced >> 10) + 0xD800), static_cast((value_reduced & 0x3FF) + 0xDC00)); } } @@ -321,7 +318,7 @@ namespace unicode::detail { inline void append_utf(const char32_t& value) { // expect value to be already valid Unicode values (checked in input iterator) - s.push_back(value); + append(static_cast(value)); } reference operator=(const char32_t& value) -- cgit v1.2.3