summary | refs | log | tree | commit | diff | homepage
diff options
context:
space:
mode:
author Roland Reichwein <mail@reichwein.it> 2021-12-21 20:11:06 +0100
committer Roland Reichwein <mail@reichwein.it> 2021-12-21 20:11:06 +0100
commit 65c8f3fa5856b216816a25c2d3dc9d87bd3896ee (patch)
tree 9d6b1c66021301ab1e83c9866196e47b2ad2405c
parent 3ca9f389084a2defe1fff2046dd3450e0b242e58 (diff)
Optimize on UTF input and output
-rw-r--r-- include/unicode.h 69
1 file changed, 33 insertions, 36 deletions
diff --git a/include/unicode.h b/include/unicode.h
index 7965a6e..43dc44e 100644
--- a/include/unicode.h
+++ b/include/unicode.h
@@ -115,16 +115,10 @@ namespace unicode::detail {
template<class X = T, typename std::enable_if<(sizeof(X) == 1), bool>::type = true>
inline value_type calculate_value()
{
- size_t remaining{remaining_code_units()};
-
- if (!remaining)
- return {};
-
- value_type value{};
-
utf8_t byte0 {static_cast<utf8_t>(get_code_unit<0>())};
if (byte0 & 0x80) { // 2-4 bytes
- if (remaining >= 2) {
+ value_type value{};
+ if (size_t remaining{remaining_code_units()}; remaining >= 2) {
utf8_t byte1 {static_cast<utf8_t>(get_code_unit<1>())};
if (is_byte0_of<2>(byte0) && is_continuation_byte(byte1)) { // 2 bytes
value = value_byte0_of<2>(byte0) | continuation_value(byte1);
@@ -152,29 +146,23 @@ namespace unicode::detail {
if (!unicode::is_valid_unicode(value))
throw std::invalid_argument("Invalid Unicode character: "s + std::to_string(static_cast<uint32_t>(value)));
+ return value;
} else { // 1 byte: 7 bit ASCII
- value = byte0;
std::advance(iterator, 1);
+ return byte0;
}
-
- return value;
}
template<class X = T, typename std::enable_if<(sizeof(X) == 2), bool>::type = true>
inline value_type calculate_value()
{
- size_t remaining{remaining_code_units()};
-
- if (!remaining)
- return {};
-
char16_t unit0 {static_cast<char16_t>(get_code_unit<0>())};
if (unit0 <= 0xD7FF || unit0 >= 0xE000) { // 1 unit (BMP Basic Multilingual Plane)
std::advance(iterator, 1);
return unit0;
} else {
- if (remaining < 2)
+ if (remaining_code_units() < 2)
throw std::invalid_argument("Bad input: Continuation of first UTF-16 unit missing");
char16_t unit1 {static_cast<char16_t>(get_code_unit<1>())};
@@ -189,11 +177,6 @@ namespace unicode::detail {
template<class X = T, typename std::enable_if<(sizeof(X) == 4), bool>::type = true>
inline value_type calculate_value()
{
- size_t remaining{remaining_code_units()};
-
- if (!remaining)
- return {};
-
value_type result {static_cast<char32_t>(get_code_unit<0>())};
if (!unicode::is_valid_unicode(result))
@@ -284,23 +267,38 @@ namespace unicode::detail {
return trailing_byte<m - n - 1>(value);
}
+ template<typename Arg> // Appends a single code unit `arg` to the output container `s`.
+ inline void append(Arg&& arg)
+ {
+ if constexpr (std::is_same<Container, typename std::basic_string<T>>::value) { // compile-time branch: Container is exactly std::basic_string<T>
+ s.append({arg}); // the unit is passed as a one-element braced list
+ } else { // any other Container type
+ s.emplace_back(arg); // the unit is constructed as a new last element
+ }
+ }
+
+ template<typename Arg, typename... Args> // Appends `arg` followed by `args...`, in argument order, to `s`.
+ inline void append(Arg&& arg, Args&&... args)
+ {
+ if constexpr (std::is_same<Container, typename std::basic_string<T>>::value) { // compile-time branch: Container is exactly std::basic_string<T>
+ s.append({arg, args...}); // all units go into one braced list, so s.append is called once
+ } else { // any other Container type
+ s.emplace_back(arg); // first unit becomes the new last element
+ append(args...); // remaining units are handled by a further append() call
+ }
+ }
+
template<class X = T, typename std::enable_if<(sizeof(X) == 1), bool>::type = true>
inline void append_utf(const char32_t& value)
{
if (value < 0x80) { // 1 byte
- s.push_back(static_cast<value_type>(value));
+ append(static_cast<value_type>(value)); // value itself is the only code unit
} else if (value < 0x800) { // 2 bytes
- s.push_back(byte_n_of_m<0,2>(value));
- s.push_back(byte_n_of_m<1,2>(value));
+ append(byte_n_of_m<0,2>(value), byte_n_of_m<1,2>(value)); // both units handed to one append() call, leading unit first
} else if (value < 0x10000) { // 3 bytes
- s.push_back(byte_n_of_m<0,3>(value));
- s.push_back(byte_n_of_m<1,3>(value));
- s.push_back(byte_n_of_m<2,3>(value));
+ append(byte_n_of_m<0,3>(value), byte_n_of_m<1,3>(value), byte_n_of_m<2,3>(value)); // three units, index 0 first
} else if (value < 0x110000) { // 4 bytes
- s.push_back(byte_n_of_m<0,4>(value));
- s.push_back(byte_n_of_m<1,4>(value));
- s.push_back(byte_n_of_m<2,4>(value));
- s.push_back(byte_n_of_m<3,4>(value));
+ append(byte_n_of_m<0,4>(value), byte_n_of_m<1,4>(value), byte_n_of_m<2,4>(value), byte_n_of_m<3,4>(value)); // four units, index 0 first
} else
throw std::runtime_error("Invalid internal Unicode value: "s + std::to_string(static_cast<uint32_t>(value)));
}
@@ -309,11 +307,10 @@ namespace unicode::detail {
inline void append_utf(const char32_t& value)
{
if (value <= 0xFFFF) { // expect value to be already valid Unicode values (checked in input iterator)
- s.push_back(static_cast<value_type>(value));
+ append(static_cast<value_type>(value));
} else {
char32_t value_reduced{value - 0x10000};
- s.push_back((value_reduced >> 10) + 0xD800);
- s.push_back((value_reduced & 0x3FF) + 0xDC00);
+ append(static_cast<T>((value_reduced >> 10) + 0xD800), static_cast<T>((value_reduced & 0x3FF) + 0xDC00));
}
}
@@ -321,7 +318,7 @@ namespace unicode::detail {
inline void append_utf(const char32_t& value)
{
// expect value to be already valid Unicode values (checked in input iterator)
- s.push_back(value);
+ append(static_cast<value_type>(value));
}
reference operator=(const char32_t& value)