summary | refs | log | tree | commit | diff | homepage
path: root/include
diff options
context:
space:
mode:
author Roland Reichwein <mail@reichwein.it> 2021-12-28 12:46:30 +0100
committer Roland Reichwein <mail@reichwein.it> 2021-12-28 12:46:30 +0100
commit 403c885d67f79c637ebcb303722adfd6a4b8195e (patch)
tree d8f40c674a5c65176e028a1c7bb9122baa2e7756 /include
parent 970ba4111160fbf78351b21a024c46c0978e0440 (diff)
Optimize UTF validation
Diffstat (limited to 'include')
-rw-r--r-- include/unicode.h 95
1 file changed, 65 insertions, 30 deletions
diff --git a/include/unicode.h b/include/unicode.h
index 4064233..be91d77 100644
--- a/include/unicode.h
+++ b/include/unicode.h
@@ -47,12 +47,6 @@ namespace unicode::detail {
using namespace std::string_literals;
- template<typename value_type>
- inline bool is_utf8_followup_byte(value_type b) noexcept
- {
- return (b & 0b11000000) == 0b10000000;
- }
-
template<size_t sequence_length, typename value_type>
inline bool is_utf8_leading_byte(value_type byte) noexcept
{
@@ -65,22 +59,26 @@ namespace unicode::detail {
}
}
+ template<typename value_type>
+ inline bool is_utf8_followup_byte(value_type b) noexcept // true if bits 7..6 of b are 10, the UTF-8 followup (continuation) byte marker
+ {
+ return (b & 0b11000000) == 0b10000000; // the mask keeps only bits 7 and 6; the lower six payload bits are ignored
+ }
+
template<typename value_type, typename... Tbytes>
inline bool is_utf8_sequence(value_type byte0, Tbytes... bytes) noexcept
{
constexpr auto n{sizeof...(Tbytes) + 1};
- static_assert(n <= 4);
+ static_assert(n <= 4, "UTF-8 sequences of 1 through 4 code units are supported"); // n counts byte0 plus the pack, so at most 3 followup bytes
return is_utf8_leading_byte<n>(byte0) &&
- (is_utf8_followup_byte(bytes) && ...);
+ (... && is_utf8_followup_byte(bytes)); // unary left fold over &&: followup bytes are tested left to right, testing stops at the first failure, and an empty pack (n == 1) folds to true
}
- template<typename T>
- inline bool validate_utf8(const std::basic_string<T>& s)
+ template<typename T, typename std::enable_if_t<(sizeof(T) == 1), bool> = true>
+ inline bool validate_utf(const std::basic_string<T>& s)
{
- static_assert(sizeof(T) == 1);
-
int i{};
auto size{s.size()};
while (i < size) {
@@ -103,6 +101,48 @@ namespace unicode::detail {
return true;
}
+ template<typename value_type, typename... Twords>
+ inline bool is_utf16_sequence(value_type word0, Twords... words) noexcept // true if the 1 or 2 given code units form one accepted UTF-16 sequence
+ {
+ constexpr auto n{sizeof...(Twords) + 1}; // number of code units passed: word0 plus the pack
+
+ static_assert(n <= 2, "UTF-16 sequences of only 1 or 2 code units are supported");
+
+ if constexpr(n == 1) {
+ return is_valid_unicode(word0); // single unit: decision is delegated to is_valid_unicode (defined outside this hunk)
+ } else {
+ char16_t unit0 {static_cast<char16_t>(word0)};
+ char16_t unit1 {static_cast<char16_t>((words, ...))}; // comma fold yields the last pack element; with n == 2 that is the only one
+ return (unit0 & 0xFC00) == 0xD800 && (unit1 & 0xFC00) == 0xDC00; // unit0 must lie in 0xD800..0xDBFF and unit1 in 0xDC00..0xDFFF (surrogate pair)
+ }
+ }
+
+ template<typename T, typename std::enable_if_t<(sizeof(T) == 2), bool> = true> // overload selected for 2-byte code units
+ inline bool validate_utf(const std::basic_string<T>& s) // true if s splits completely into sequences accepted by is_utf16_sequence
+ {
+ size_t i{}; // unsigned index: avoids the signed/unsigned comparison with size and int overflow on very long strings
+ const auto size{s.size()};
+ while (i < size) {
+ if (is_utf16_sequence(s[i])) { // the unit at i is accepted on its own
+ ++i;
+ } else if ((i + 1 < size) && is_utf16_sequence(s[i], s[i + 1])) { // a pair needs a second unit; i + 1 cannot wrap because i < size
+ i += 2;
+ } else {
+ return false; // neither a valid single unit nor a valid pair starts at i
+ }
+ }
+ return true; // also reached for an empty string
+ }
+
+ template<typename T, typename std::enable_if_t<(sizeof(T) == 4), bool> = true> // overload selected for 4-byte code units
+ inline bool validate_utf(const std::basic_string<T>& s) // true if is_valid_unicode accepts every element of s
+ {
+ for (auto i: s) // i is a copy of each code unit, not an index
+ if (!is_valid_unicode(i))
+ return false; // stop at the first rejected code unit
+ return true; // also reached for an empty string
+ }
+
template<typename value_type>
inline char32_t continuation_value(value_type b) noexcept
{
@@ -160,7 +200,7 @@ namespace unicode::detail {
}
}
- template<class X = value_type, typename std::enable_if<(sizeof(X) == 1), bool>::type = true>
+ template<class X = value_type, typename std::enable_if_t<(sizeof(X) == 1), bool> = true>
inline internal_type calculate_value()
{
utf8_t byte0 {static_cast<utf8_t>(get_code_unit<0>())};
@@ -201,7 +241,7 @@ namespace unicode::detail {
}
}
- template<class X = value_type, typename std::enable_if<(sizeof(X) == 2), bool>::type = true>
+ template<class X = value_type, typename std::enable_if_t<(sizeof(X) == 2), bool> = true>
inline internal_type calculate_value()
{
char16_t unit0 {static_cast<char16_t>(get_code_unit<0>())};
@@ -222,7 +262,7 @@ namespace unicode::detail {
}
}
- template<class X = value_type, typename std::enable_if<(sizeof(X) == 4), bool>::type = true>
+ template<class X = value_type, typename std::enable_if_t<(sizeof(X) == 4), bool> = true>
inline internal_type calculate_value()
{
internal_type result {static_cast<internal_type>(get_code_unit<0>())};
@@ -348,7 +388,7 @@ namespace unicode::detail {
}
}
- template<class X = value_type, typename std::enable_if<(sizeof(X) == 1), bool>::type = true>
+ template<class X = value_type, typename std::enable_if_t<(sizeof(X) == 1), bool> = true>
inline void append_utf(const internal_type& value)
{
if (value < 0x80) { // 1 byte
@@ -363,7 +403,7 @@ namespace unicode::detail {
throw std::runtime_error("Invalid internal Unicode value: "s + std::to_string(static_cast<uint32_t>(value)));
}
- template<class X = value_type, typename std::enable_if<(sizeof(X) == 2), bool>::type = true>
+ template<class X = value_type, typename std::enable_if_t<(sizeof(X) == 2), bool> = true>
inline void append_utf(const internal_type& value)
{
if (value <= 0xFFFF) { // expect value to be already valid Unicode values (checked in input iterator)
@@ -374,7 +414,7 @@ namespace unicode::detail {
}
}
- template<class X = value_type, typename std::enable_if<(sizeof(X) == 4), bool>::type = true>
+ template<class X = value_type, typename std::enable_if_t<(sizeof(X) == 4), bool> = true>
inline void append_utf(const internal_type& value)
{
// expect value to be already valid Unicode values (checked in input iterator)
@@ -741,12 +781,12 @@ namespace unicode {
template<typename From, typename To, std::enable_if_t<std::is_empty<From>::value, bool> = true>
typename To::string_type convert(const typename From::string_type& s)
{
- if constexpr(sizeof(typename From::value_type) == 1 && sizeof(typename To::value_type) == 1 && std::is_same_v<From, UTF_8> && std::is_same_v<To, UTF_8>) {
- if (validate_utf8<typename From::value_type>(s)) {
- if constexpr (std::is_same_v<typename From::value_type, typename To::value_type>)
- return s;
- else
- return typename To::string_type{s.begin(), s.end()};
+ // if input type == output type, only validate and return input, is appropriate
+ if constexpr(sizeof(typename From::value_type) == sizeof(typename To::value_type) == 1 &&
+ std::is_same_v<From, UTF<utf_iterator<typename From::value_type>, utf_back_insert_iterator<typename From::value_type>>> &&
+ std::is_same_v<To, UTF<utf_iterator<typename To::value_type>, utf_back_insert_iterator<typename To::value_type>>>) {
+ if (validate_utf<typename From::value_type>(s)) {
+ return s;
} else {
throw std::invalid_argument("Invalid UTF-8");
}
@@ -848,12 +888,7 @@ namespace unicode {
template<typename Facet, std::enable_if_t<std::is_empty<Facet>::value, bool> = true>
bool is_valid_utf(const typename Facet::string_type& s)
{
- try {
- std::for_each(Facet::begin(s), Facet::end(s), [](const char32_t& c){});
- } catch (const std::invalid_argument&) {
- return false;
- }
- return true;
+ return validate_utf<typename Facet::value_type>(s); // delegates to validate_utf, replacing the removed for_each pass that caught std::invalid_argument
}
} // namespace unicode