summary refs log tree commit diff homepage
diff options
context:
space:
mode:
author Roland Reichwein <mail@reichwein.it> 2021-12-28 12:46:30 +0100
committer Roland Reichwein <mail@reichwein.it> 2021-12-28 12:46:30 +0100
commit 403c885d67f79c637ebcb303722adfd6a4b8195e (patch)
tree d8f40c674a5c65176e028a1c7bb9122baa2e7756
parent 970ba4111160fbf78351b21a024c46c0978e0440 (diff)
Optimize UTF validation
-rw-r--r-- debian/control 5
-rw-r--r-- include/unicode.h 95
-rw-r--r-- src/test-unicode.cpp 77
3 files changed, 126 insertions, 51 deletions
diff --git a/debian/control b/debian/control
index a06886a..fcc0185 100644
--- a/debian/control
+++ b/debian/control
@@ -15,10 +15,11 @@ Description: Unicode conversion library
UTF-8, UTF-16 and UTF-32.
.
Features:
+ - Tested on Debian 10+11, Ubuntu 2004 to 2204
+ - C++17 and C++20 compatible
- Additional support for ISO-8859-1 encoding (Latin-1) as subset of Unicode
- Additional support for ISO-8859-15
- - Tested on Debian 10+11, Ubuntu 2004 to 2110
- - C++17 and C++20 compatible
+ - Header only
Package: unicode-tools
Architecture: any
diff --git a/include/unicode.h b/include/unicode.h
index 4064233..be91d77 100644
--- a/include/unicode.h
+++ b/include/unicode.h
@@ -47,12 +47,6 @@ namespace unicode::detail {
using namespace std::string_literals;
- template<typename value_type>
- inline bool is_utf8_followup_byte(value_type b) noexcept
- {
- return (b & 0b11000000) == 0b10000000;
- }
-
template<size_t sequence_length, typename value_type>
inline bool is_utf8_leading_byte(value_type byte) noexcept
{
@@ -65,22 +59,26 @@ namespace unicode::detail {
}
}
+ template<typename value_type>
+ inline bool is_utf8_followup_byte(value_type b) noexcept
+ {
+ return (b & 0b11000000) == 0b10000000;
+ }
+
template<typename value_type, typename... Tbytes>
inline bool is_utf8_sequence(value_type byte0, Tbytes... bytes) noexcept
{
constexpr auto n{sizeof...(Tbytes) + 1};
- static_assert(n <= 4);
+ static_assert(n <= 4, "UTF-8 sequences of 1 through 4 code units are supported");
return is_utf8_leading_byte<n>(byte0) &&
- (is_utf8_followup_byte(bytes) && ...);
+ (... && is_utf8_followup_byte(bytes)); // left fold for linear evaluation from left to right
}
- template<typename T>
- inline bool validate_utf8(const std::basic_string<T>& s)
+ template<typename T, typename std::enable_if_t<(sizeof(T) == 1), bool> = true>
+ inline bool validate_utf(const std::basic_string<T>& s)
{
- static_assert(sizeof(T) == 1);
-
int i{};
auto size{s.size()};
while (i < size) {
@@ -103,6 +101,48 @@ namespace unicode::detail {
return true;
}
+ template<typename value_type, typename... Twords>
+ inline bool is_utf16_sequence(value_type word0, Twords... words) noexcept
+ {
+ constexpr auto n{sizeof...(Twords) + 1};
+
+ static_assert(n <= 2, "UTF-16 sequences of only 1 or 2 code units are supported");
+
+ if constexpr(n == 1) {
+ return is_valid_unicode(word0);
+ } else {
+ char16_t unit0 {static_cast<char16_t>(word0)};
+ char16_t unit1 {static_cast<char16_t>((words, ...))};
+ return (unit0 & 0xFC00) == 0xD800 && (unit1 & 0xFC00) == 0xDC00;
+ }
+ }
+
+ template<typename T, typename std::enable_if_t<(sizeof(T) == 2), bool> = true>
+ inline bool validate_utf(const std::basic_string<T>& s)
+ {
+ int i{};
+ auto size{s.size()};
+ while (i < size) {
+ if (is_utf16_sequence(s[i])) {
+ i++;
+ } else if ((i < size - 1) && is_utf16_sequence(s[i], s[i + 1])) {
+ i += 2;
+ } else {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ template<typename T, typename std::enable_if_t<(sizeof(T) == 4), bool> = true>
+ inline bool validate_utf(const std::basic_string<T>& s)
+ {
+ for (auto i: s)
+ if (!is_valid_unicode(i))
+ return false;
+ return true;
+ }
+
template<typename value_type>
inline char32_t continuation_value(value_type b) noexcept
{
@@ -160,7 +200,7 @@ namespace unicode::detail {
}
}
- template<class X = value_type, typename std::enable_if<(sizeof(X) == 1), bool>::type = true>
+ template<class X = value_type, typename std::enable_if_t<(sizeof(X) == 1), bool> = true>
inline internal_type calculate_value()
{
utf8_t byte0 {static_cast<utf8_t>(get_code_unit<0>())};
@@ -201,7 +241,7 @@ namespace unicode::detail {
}
}
- template<class X = value_type, typename std::enable_if<(sizeof(X) == 2), bool>::type = true>
+ template<class X = value_type, typename std::enable_if_t<(sizeof(X) == 2), bool> = true>
inline internal_type calculate_value()
{
char16_t unit0 {static_cast<char16_t>(get_code_unit<0>())};
@@ -222,7 +262,7 @@ namespace unicode::detail {
}
}
- template<class X = value_type, typename std::enable_if<(sizeof(X) == 4), bool>::type = true>
+ template<class X = value_type, typename std::enable_if_t<(sizeof(X) == 4), bool> = true>
inline internal_type calculate_value()
{
internal_type result {static_cast<internal_type>(get_code_unit<0>())};
@@ -348,7 +388,7 @@ namespace unicode::detail {
}
}
- template<class X = value_type, typename std::enable_if<(sizeof(X) == 1), bool>::type = true>
+ template<class X = value_type, typename std::enable_if_t<(sizeof(X) == 1), bool> = true>
inline void append_utf(const internal_type& value)
{
if (value < 0x80) { // 1 byte
@@ -363,7 +403,7 @@ namespace unicode::detail {
throw std::runtime_error("Invalid internal Unicode value: "s + std::to_string(static_cast<uint32_t>(value)));
}
- template<class X = value_type, typename std::enable_if<(sizeof(X) == 2), bool>::type = true>
+ template<class X = value_type, typename std::enable_if_t<(sizeof(X) == 2), bool> = true>
inline void append_utf(const internal_type& value)
{
if (value <= 0xFFFF) { // expect value to be already valid Unicode values (checked in input iterator)
@@ -374,7 +414,7 @@ namespace unicode::detail {
}
}
- template<class X = value_type, typename std::enable_if<(sizeof(X) == 4), bool>::type = true>
+ template<class X = value_type, typename std::enable_if_t<(sizeof(X) == 4), bool> = true>
inline void append_utf(const internal_type& value)
{
// expect value to be already valid Unicode values (checked in input iterator)
@@ -741,12 +781,12 @@ namespace unicode {
template<typename From, typename To, std::enable_if_t<std::is_empty<From>::value, bool> = true>
typename To::string_type convert(const typename From::string_type& s)
{
- if constexpr(sizeof(typename From::value_type) == 1 && sizeof(typename To::value_type) == 1 && std::is_same_v<From, UTF_8> && std::is_same_v<To, UTF_8>) {
- if (validate_utf8<typename From::value_type>(s)) {
- if constexpr (std::is_same_v<typename From::value_type, typename To::value_type>)
- return s;
- else
- return typename To::string_type{s.begin(), s.end()};
+ // if input type == output type, only validate and return input, if appropriate
+ if constexpr(sizeof(typename From::value_type) == sizeof(typename To::value_type) == 1 &&
+ std::is_same_v<From, UTF<utf_iterator<typename From::value_type>, utf_back_insert_iterator<typename From::value_type>>> &&
+ std::is_same_v<To, UTF<utf_iterator<typename To::value_type>, utf_back_insert_iterator<typename To::value_type>>>) {
+ if (validate_utf<typename From::value_type>(s)) {
+ return s;
} else {
throw std::invalid_argument("Invalid UTF-8");
}
@@ -848,12 +888,7 @@ namespace unicode {
template<typename Facet, std::enable_if_t<std::is_empty<Facet>::value, bool> = true>
bool is_valid_utf(const typename Facet::string_type& s)
{
- try {
- std::for_each(Facet::begin(s), Facet::end(s), [](const char32_t& c){});
- } catch (const std::invalid_argument&) {
- return false;
- }
- return true;
+ return validate_utf<typename Facet::value_type>(s);
}
} // namespace unicode
diff --git a/src/test-unicode.cpp b/src/test-unicode.cpp
index 2675989..99e164b 100644
--- a/src/test-unicode.cpp
+++ b/src/test-unicode.cpp
@@ -283,6 +283,9 @@ void test_utf_to_utf(std::tuple<Ts...>& t)
// test facet interface
result = unicode::convert<typename unicode::Encoding<typename From::value_type>::Facet, typename unicode::Encoding<typename To::value_type>::Facet>(std::get<i>(t));
BOOST_CHECK_MESSAGE(std::get<j>(t) == result, "Facet: From " << typeid(From).name() << "(" << i << ", " << std::get<i>(t) << ") to " << typeid(To).name() << "(" << j << ", " << std::get<j>(t) << "), got " << result);
+
+ // test actual results by comparing with boost::locale::conv results
+ BOOST_CHECK_EQUAL(result, (boost::locale::conv::utf_to_utf<typename To::value_type, typename From::value_type>(std::get<i>(t))));
// iterate over other combinations
if constexpr (i + 1 < std::tuple_size<typename std::remove_reference<decltype(t)>::type>::value)
@@ -650,26 +653,10 @@ BOOST_AUTO_TEST_CASE(convert)
BOOST_CHECK((unicode::convert<char, char32_t>("äöü")) == std::u32string{U"äöü"});
-#ifdef _WIN32
- BOOST_CHECK(sizeof(wchar_t) == 2);
-#else // Unix like
- BOOST_CHECK(sizeof(wchar_t) == 4);
-#endif
-
- // For the following checks, wchar_t size and encoding is system dependent:
- // Windows: UTF-16
- // Linux: UTF-32
- BOOST_CHECK((unicode::convert<char, wchar_t>("äöü")) == std::wstring{L"äöü"});
- BOOST_CHECK((unicode::convert<char, wchar_t>("\u732b")) == std::wstring{L"\u732b"});
- BOOST_CHECK((unicode::convert<char, wchar_t>("\U0001F63A")) == std::wstring{L"\U0001F63A"});
- BOOST_CHECK((unicode::convert<wchar_t, char32_t>(L"\U0001F63A")) == std::u32string{U"\U0001F63A"});
- BOOST_CHECK((unicode::convert<wchar_t, utf8_t>(L"\U0001F63A")) == std::basic_string<utf8_t>{(utf8_t*)"\U0001F63A"});
+ // vector
+ BOOST_CHECK((unicode::convert<std::vector<char>, std::vector<char16_t>>(std::vector<char>{})) == std::vector<char16_t>{});
+ BOOST_CHECK((unicode::convert<std::vector<char>, std::vector<char16_t>>(std::vector<char>{'\xc3', '\xa4', '\xc3', '\xb6', '\xc3', '\xbc'})) == (std::vector<char16_t>{u'ä', u'ö', u'ü'}));
- BOOST_CHECK((unicode::convert<std::string, std::wstring>(std::string{"äöü"})) == std::wstring{L"äöü"});
-
- BOOST_CHECK((unicode::convert<std::vector<char>, std::vector<wchar_t>>(std::vector<char>{})) == std::vector<wchar_t>{});
- BOOST_CHECK((unicode::convert<std::vector<char>, std::vector<wchar_t>>(std::vector<char>{'\xc3', '\xa4', '\xc3', '\xb6', '\xc3', '\xbc'})) == (std::vector<wchar_t>{L'ä', L'ö', L'ü'}));
-
// deque
BOOST_CHECK((unicode::convert<std::deque<char>, std::deque<wchar_t>>(std::deque<char>{})) == std::deque<wchar_t>{});
BOOST_CHECK((unicode::convert<std::deque<char>, std::deque<wchar_t>>(std::deque<char>{'\xc3', '\xa4', '\xc3', '\xb6', '\xc3', '\xbc'})) == (std::deque<wchar_t>{L'ä', L'ö', L'ü'}));
@@ -703,6 +690,58 @@ BOOST_AUTO_TEST_CASE(convert)
BOOST_CHECK((unicode::convert<std::array<uint8_t, 6>, std::list<uint16_t>>(std::array<uint8_t, 6>{0xc3, 0xa4, 0xc3, 0xb6, 0xc3, 0xbc})) == (std::list<uint16_t>{L'ä', L'ö', L'ü'}));
}
+// wchar_t specific tests: system dependent
+BOOST_AUTO_TEST_CASE(convert_wstring)
+{
+#ifdef _WIN32
+ BOOST_CHECK(sizeof(wchar_t) == 2);
+#else // Unix like
+ BOOST_CHECK(sizeof(wchar_t) == 4);
+#endif
+
+ // For the following checks, wchar_t size and encoding is system dependent:
+ // Windows: UTF-16
+ // Linux: UTF-32
+ BOOST_CHECK((unicode::convert<char, wchar_t>("äöü")) == std::wstring{L"äöü"});
+ BOOST_CHECK((unicode::convert<char, wchar_t>("\u732b")) == std::wstring{L"\u732b"});
+ BOOST_CHECK((unicode::convert<char, wchar_t>("\U0001F63A")) == std::wstring{L"\U0001F63A"});
+ BOOST_CHECK((unicode::convert<wchar_t, char32_t>(L"\U0001F63A")) == std::u32string{U"\U0001F63A"});
+ BOOST_CHECK((unicode::convert<wchar_t, utf8_t>(L"\U0001F63A")) == std::basic_string<utf8_t>{(utf8_t*)"\U0001F63A"});
+
+ BOOST_CHECK((unicode::convert<std::string, std::wstring>(std::string{"äöü"})) == std::wstring{L"äöü"});
+
+ BOOST_CHECK((unicode::convert<std::vector<char>, std::vector<wchar_t>>(std::vector<char>{})) == std::vector<wchar_t>{});
+ BOOST_CHECK((unicode::convert<std::vector<char>, std::vector<wchar_t>>(std::vector<char>{'\xc3', '\xa4', '\xc3', '\xb6', '\xc3', '\xbc'})) == (std::vector<wchar_t>{L'ä', L'ö', L'ü'}));
+
+ std::u16string u16_value{u"\U0001F63A"};
+ std::u32string u32_value{U"\U0001F63A"};
+ std::wstring w_value{L"\U0001F63A"};
+
+ std::u16string result_u16_value{unicode::convert<std::wstring, std::u16string>(w_value)};
+ std::u32string result_u32_value{unicode::convert<std::wstring, std::u32string>(w_value)};
+ std::wstring result_w_value_1{unicode::convert<std::u16string, std::wstring>(u16_value)};
+ std::wstring result_w_value_2{unicode::convert<std::u32string, std::wstring>(u32_value)};
+
+ BOOST_CHECK_EQUAL(u16_value.size(), 2);
+ BOOST_CHECK_EQUAL(u32_value.size(), 1);
+ BOOST_CHECK_EQUAL(result_u16_value.size(), 2);
+ BOOST_CHECK_EQUAL(result_u32_value.size(), 1);
+ BOOST_CHECK_EQUAL(u16_value, result_u16_value);
+ BOOST_CHECK_EQUAL(u32_value, result_u32_value);
+ BOOST_CHECK(w_value == result_w_value_1);
+ BOOST_CHECK(w_value == result_w_value_2);
+#ifdef _WIN32
+ BOOST_CHECK_EQUAL(w_value.size(), 2);
+ BOOST_CHECK_EQUAL(result_w_value_1.size(), 2);
+ BOOST_CHECK_EQUAL(result_w_value_2.size(), 2);
+#else // Unix like
+ BOOST_CHECK_EQUAL(w_value.size(), 1);
+ BOOST_CHECK_EQUAL(result_w_value_1.size(), 1);
+ BOOST_CHECK_EQUAL(result_w_value_2.size(), 1);
+#endif
+
+}
+
BOOST_AUTO_TEST_CASE(is_valid_utf)
{
BOOST_CHECK(unicode::is_valid_utf<char16_t>(u"äöü"));