summaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authorRoland Reichwein <mail@reichwein.it>2021-12-27 21:51:05 +0100
committerRoland Reichwein <mail@reichwein.it>2021-12-27 21:51:05 +0100
commit970ba4111160fbf78351b21a024c46c0978e0440 (patch)
tree300e5a70adde02999845aa05e1727b3510fd62aa
parentd992304dc12f078f12eb971c6283e0b54054e6b1 (diff)
Optimize UTF-8 validation
-rw-r--r--include/unicode.h210
1 files changed, 132 insertions, 78 deletions
diff --git a/include/unicode.h b/include/unicode.h
index 395f172..4064233 100644
--- a/include/unicode.h
+++ b/include/unicode.h
@@ -47,6 +47,80 @@ namespace unicode::detail {
using namespace std::string_literals;
+ // Tells whether b is a UTF-8 follow-up (continuation) byte, i.e. has the
+ // bit pattern 10xxxxxx.
+ template<typename value_type>
+ inline bool is_utf8_followup_byte(value_type b) noexcept
+ {
+   constexpr int marker_mask{0b11000000};
+   constexpr int marker_bits{0b10000000};
+
+   return (b & marker_mask) == marker_bits;
+ }
+
+ // Tells whether byte is the first byte of a UTF-8 sequence of exactly
+ // sequence_length bytes: 0xxxxxxx for length 1, otherwise sequence_length
+ // one-bits followed by a zero-bit (110xxxxx, 1110xxxx, 11110xxx).
+ template<size_t sequence_length, typename value_type>
+ inline bool is_utf8_leading_byte(value_type byte) noexcept
+ {
+   static_assert(sequence_length <= 4);
+
+   if constexpr(sequence_length == 1) {
+     return (byte & 0x80) == 0;
+   } else {
+     // Both constants are converted to value_type so that they undergo the
+     // same integral promotion as byte in the comparison below
+     constexpr auto prefix_mask{static_cast<value_type>(0xFF << (7 - sequence_length))};
+     constexpr auto prefix_bits{static_cast<value_type>(0xFF << (8 - sequence_length))};
+
+     return (byte & prefix_mask) == prefix_bits;
+   }
+ }
+
+ // Tells whether the given bytes have the shape of one UTF-8 sequence:
+ // byte0 must be a leading byte announcing exactly as many bytes as were
+ // passed, and every further byte must be a follow-up byte. Only the marker
+ // bits are inspected, not the encoded value.
+ template<typename value_type, typename... Tbytes>
+ inline bool is_utf8_sequence(value_type byte0, Tbytes... bytes) noexcept
+ {
+   constexpr auto sequence_length{1 + sizeof...(Tbytes)};
+   static_assert(sequence_length <= 4);
+
+   if (!is_utf8_leading_byte<sequence_length>(byte0))
+     return false;
+
+   // An empty pack (single byte sequence) folds to true
+   return (is_utf8_followup_byte(bytes) && ...);
+ }
+
+ // Checks whether s consists only of correctly shaped UTF-8 sequences of
+ // 1 to 4 bytes that neither encode a UTF-16 surrogate (0xD800..0xDFFF) nor
+ // a value above 0x10FFFF. Returns true for the empty string.
+ // NOTE: overlong encodings (e.g. 0xC0 0x80) are not rejected by this check.
+ template<typename T>
+ inline bool validate_utf8(const std::basic_string<T>& s)
+ {
+   static_assert(sizeof(T) == 1);
+
+   const auto size{s.size()};
+   decltype(s.size()) i{};
+   while (i < size) {
+     // Guard with the number of bytes left instead of "i < size - k": size is
+     // unsigned, so "size - k" wraps around for strings shorter than k and
+     // would let the checks below read past the end of s
+     const auto remaining{size - i};
+
+     if (is_utf8_sequence(s[i])) {
+       i++;
+     } else if ((remaining >= 2) && is_utf8_sequence(s[i], s[i + 1])) {
+       i += 2;
+     } else if ((remaining >= 3) && is_utf8_sequence(s[i], s[i + 1], s[i + 2])) {
+       if (((s[i] & 0xF) == 0xD) && ((s[i + 1] & 0x20) == 0x20))
+         return false; // Reserved for UTF-16 surrogates: 0xD800..0xDFFF
+       i += 3;
+     } else if ((remaining >= 4) && is_utf8_sequence(s[i], s[i + 1], s[i + 2], s[i + 3])) {
+       // Bits 20..16 of the code point: 3 from byte 0, 2 from byte 1
+       if ((((s[i] & 7) << 2) | ((s[i + 1] >> 4) & 3)) >= 0x11)
+         return false; // Unicode too big above 0x10FFFF
+       i += 4;
+     } else {
+       return false;
+     }
+   }
+   return true;
+ }
+
+ // Payload of a single follow-up byte: its lower 6 bits
+ template<typename value_type>
+ inline char32_t continuation_value(value_type b) noexcept
+ {
+   constexpr int payload_mask{0b00111111};
+
+   return static_cast<char32_t>(b & payload_mask);
+ }
+
+ // Combined payload of several follow-up bytes given in sequence order:
+ // b contributes the most significant 6 bits, the last byte the least
+ // significant 6 bits
+ template<typename value_type, typename... Targs>
+ inline char32_t continuation_value(value_type b, Targs... Fargs) noexcept
+ {
+   constexpr auto shift{6 * sizeof...(Targs)};
+   const char32_t upper = continuation_value(b) << shift;
+
+   return upper | continuation_value(Fargs...);
+ }
+
+ // Payload of the leading byte b of an n byte sequence (its lower 7 - n
+ // bits), shifted above the 6 payload bits of each of the n - 1 follow-up
+ // bytes
+ template<size_t n, typename value_type>
+ inline char32_t value_byte0_of(value_type b) noexcept
+ {
+   constexpr int payload_mask{0b1111111 >> n};
+   constexpr auto shift{(n - 1) * 6};
+
+   return static_cast<char32_t>(b & payload_mask) << shift;
+ }
+
template<typename T, typename Container=std::basic_string<T>>
struct utf_iterator
{
@@ -86,61 +160,28 @@ namespace unicode::detail {
}
}
- inline static bool is_continuation_byte(value_type b) noexcept
- {
- return (b & 0b11000000) == 0b10000000;
- }
-
- template<typename... Targs>
- inline static bool is_continuation_byte(value_type b, Targs... Fargs) noexcept
- {
- return is_continuation_byte(b) && is_continuation_byte(Fargs...);
- }
-
- template<size_t n>
- inline static bool is_byte0_of(value_type b) noexcept
- {
- static_assert(n >= 2 && n <= 4);
-
- return (b & static_cast<value_type>(0xFF << (7 - n))) == static_cast<value_type>(0xFF << (8 - n));
- }
-
- inline static internal_type continuation_value(value_type b) noexcept
- {
- return static_cast<internal_type>(b & 0b00111111);
- }
-
- template<typename... Targs>
- inline static internal_type continuation_value(value_type b, Targs... Fargs) noexcept
- {
- return continuation_value(b) << (6 * sizeof...(Targs)) | continuation_value(Fargs...);
- }
-
- template<size_t n>
- inline static internal_type value_byte0_of(value_type b) noexcept
- {
- return static_cast<internal_type>(b & (0b1111111 >> n)) << ((n - 1) * 6);
- }
-
template<class X = value_type, typename std::enable_if<(sizeof(X) == 1), bool>::type = true>
inline internal_type calculate_value()
{
utf8_t byte0 {static_cast<utf8_t>(get_code_unit<0>())};
- if (byte0 & 0x80) { // 2-4 bytes
+ if (is_utf8_sequence(byte0)) { // 1 byte: 7 bit ASCII
+ std::advance(iterator, 1);
+ return byte0;
+ } else {
internal_type value{};
if (size_t remaining{remaining_code_units()}; remaining >= 2) {
utf8_t byte1 {static_cast<utf8_t>(get_code_unit<1>())};
- if (is_byte0_of<2>(byte0) && is_continuation_byte(byte1)) { // 2 bytes
+ if (is_utf8_sequence(byte0, byte1)) { // 2 bytes
value = value_byte0_of<2>(byte0) | continuation_value(byte1);
std::advance(iterator, 2);
} else if (remaining >= 3) {
utf8_t byte2 {static_cast<utf8_t>(get_code_unit<2>())};
- if (is_byte0_of<3>(byte0) && is_continuation_byte(byte1, byte2)) { // 3 bytes
+ if (is_utf8_sequence(byte0, byte1, byte2)) { // 3 bytes
value = value_byte0_of<3>(byte0) | continuation_value(byte1, byte2);
std::advance(iterator, 3);
} else if (remaining >= 4) {
utf8_t byte3 {static_cast<utf8_t>(get_code_unit<3>())};
- if (is_byte0_of<4>(byte0) && is_continuation_byte(byte1, byte2, byte3)) { // 4 bytes
+ if (is_utf8_sequence(byte0, byte1, byte2, byte3)) { // 4 bytes
value = value_byte0_of<4>(byte0) | continuation_value(byte1, byte2, byte3);
std::advance(iterator, 4);
} else
@@ -157,9 +198,6 @@ namespace unicode::detail {
throw std::invalid_argument("Invalid Unicode character: "s + std::to_string(static_cast<uint32_t>(value)));
return value;
- } else { // 1 byte: 7 bit ASCII
- std::advance(iterator, 1);
- return byte0;
}
}
@@ -655,56 +693,72 @@ namespace unicode {
// From and To are facets
+ // Converts s from the encoding of facet From to the encoding of facet To and
+ // returns the result.
+ // While at least accu_size input bytes are left and the read address is
+ // aligned (per arch_optimizer::addr_mask), the input is read in chunks of
+ // accu_size bytes; a chunk passing the ascii_mask/ascii_value test is appended
+ // as a whole via arch_optimizer::append. Everything else is transferred
+ // element by element through the From iterator and the To back inserter.
+ // NOTE(review): ArchitectureOptimizer, ConvertInputOptimizer and accu_size are
+ // declared outside this hunk - presumably the mask test detects chunks made
+ // up of ASCII only; confirm there.
template<typename From, typename To, std::enable_if_t<std::is_empty<From>::value, bool> = true>
- typename To::string_type convert(const typename From::string_type& s)
+ typename To::string_type convert_optimized(const typename From::string_type& s)
{
typename To::string_type result;
-
- if constexpr(accu_size == 4 || accu_size == 8) {
- typedef ConvertInputOptimizer<sizeof(typename From::value_type)> input_optimizer;
- typedef ArchitectureOptimizer<accu_size, input_optimizer> arch_optimizer;
-
- auto begin{From::begin(s)};
- auto end{From::end(s)};
- auto back_inserter{To::back_inserter(result)};
- auto addr{reinterpret_cast<const typename arch_optimizer::accu_type*>(&s.data()[s.size() - input_distance(begin, end)])};
- while (input_distance_bytes(begin, end) >= accu_size) {
- if (((uintptr_t)(void*)addr & arch_optimizer::addr_mask) == 0) {
- while (input_distance_bytes(begin, end) >= accu_size) {
- typename arch_optimizer::accu_type data{*addr};
- if ((data & arch_optimizer::ascii_mask) == arch_optimizer::ascii_value) {
- arch_optimizer::template append(reinterpret_cast<const typename From::value_type*>(addr), result);
- begin += accu_size / sizeof(typename From::value_type);
- ++addr;
- } else {
- // just advance one code unit for now and break to trigger unoptimized
- // version until next accu boundary
- back_inserter = *begin;
- ++begin;
- break;
- }
+ typedef ConvertInputOptimizer<sizeof(typename From::value_type)> input_optimizer;
+ typedef ArchitectureOptimizer<accu_size, input_optimizer> arch_optimizer;
+
+ auto begin{From::begin(s)};
+ auto end{From::end(s)};
+ auto back_inserter{To::back_inserter(result)};
+ // addr points at the input position that corresponds to begin
+ auto addr{reinterpret_cast<const typename arch_optimizer::accu_type*>(&s.data()[s.size() - input_distance(begin, end)])};
+ // chunked processing as long as a whole accu_size chunk is left
+ while (input_distance_bytes(begin, end) >= accu_size) {
+ if (((uintptr_t)(void*)addr & arch_optimizer::addr_mask) == 0) {
+ while (input_distance_bytes(begin, end) >= accu_size) {
+ typename arch_optimizer::accu_type data{*addr};
+ if ((data & arch_optimizer::ascii_mask) == arch_optimizer::ascii_value) {
+ arch_optimizer::template append(reinterpret_cast<const typename From::value_type*>(addr), result);
+ begin += accu_size / sizeof(typename From::value_type);
+ ++addr;
+ } else {
+ // just advance one code unit for now and break to trigger unoptimized
+ // version until next accu boundary
+ back_inserter = *begin;
+ ++begin;
+ break;
}
}
-
- // keep up after unaligned Non-ASCII code points
- while (begin != end && (uintptr_t)(void*)(addr = reinterpret_cast<const typename arch_optimizer::accu_type*>(&s.data()[s.size() - input_distance(begin, end)])) & arch_optimizer::addr_mask) {
- back_inserter = *begin;
- ++begin;
- }
}
- // remainder < 8 bytes
- while (begin != end) {
+ // keep up after unaligned Non-ASCII code points
+ // (addr is recomputed from begin in the loop condition until it is aligned again)
+ while (begin != end && (uintptr_t)(void*)(addr = reinterpret_cast<const typename arch_optimizer::accu_type*>(&s.data()[s.size() - input_distance(begin, end)])) & arch_optimizer::addr_mask) {
back_inserter = *begin;
++begin;
}
+ }
- } else {
- std::copy(From::begin(s), From::end(s), To::back_inserter(result));
+ // remainder < 8 bytes
+ while (begin != end) {
+ back_inserter = *begin;
+ ++begin;
}
return result;
}
+ // Converts s from the encoding of facet From to the encoding of facet To.
+ // From and To are facets.
+ //
+ // UTF-8 to UTF-8 input is only validated via validate_utf8() and returned as
+ // a copy; std::invalid_argument("Invalid UTF-8") is thrown if validation
+ // fails. All other combinations go through convert_optimized() when accu_size
+ // is 4 or 8, and through an element-wise copy via the facet iterators
+ // otherwise.
+ template<typename From, typename To, std::enable_if_t<std::is_empty<From>::value, bool> = true>
+ typename To::string_type convert(const typename From::string_type& s)
+ {
+   if constexpr(sizeof(typename From::value_type) == 1 && sizeof(typename To::value_type) == 1 && std::is_same_v<From, UTF_8> && std::is_same_v<To, UTF_8>) {
+     if (validate_utf8<typename From::value_type>(s)) {
+       if constexpr (std::is_same_v<typename From::value_type, typename To::value_type>)
+         return s;
+       else
+         return typename To::string_type{s.begin(), s.end()};
+     } else {
+       throw std::invalid_argument("Invalid UTF-8");
+     }
+   } else if constexpr(accu_size == 4 || accu_size == 8) {
+     // "else" keeps the branches mutually exclusive, so the generic paths are
+     // not instantiated as unreachable code for the UTF-8 to UTF-8 case
+     return convert_optimized<From, To>(s);
+   } else {
+     typename To::string_type result;
+     std::copy(From::begin(s), From::end(s), To::back_inserter(result));
+     return result;
+   }
+ }
+
// Helper to get correct Facet from char type, e.g. Encoding<typename decltype(s)::value_type>::Facet
template<typename T>
struct Encoding