summary | refs | log | tree | commit | diff | homepage
diff options
context:
space:
mode:
author Roland Reichwein <mail@reichwein.it> 2021-12-28 16:10:33 +0100
committer Roland Reichwein <mail@reichwein.it> 2021-12-28 16:10:33 +0100
commit 2b27deb54fec75ed529776f30be8eeb4ea239257 (patch)
tree 4415fee92e9def0fadb6679098ec6a07f125c1ba
parent 563557be9c97496b7435bef4e64730a379e55037 (diff)
Refactoring UTF-8 decoding, bugfixing
-rw-r--r-- include/unicode.h 51
1 files changed, 32 insertions, 19 deletions
diff --git a/include/unicode.h b/include/unicode.h
index 4e4c7eb..2d7bf71 100644
--- a/include/unicode.h
+++ b/include/unicode.h
@@ -1,6 +1,6 @@
// libunicode
//
-// Author: Roland Reichwein
+// Author: Roland Reichwein <mail@reichwein.it>
//
// Available under the conditions of CC0 1.0 Universal
// https://creativecommons.org/publicdomain/zero/1.0/
@@ -104,11 +104,11 @@ namespace unicode::detail {
template<typename value_type, typename... Twords>
inline bool is_utf16_sequence(value_type word0, Twords... words) noexcept
{
- constexpr auto n{sizeof...(Twords) + 1};
+ constexpr auto sequence_length{sizeof...(Twords) + 1};
- static_assert(n <= 2, "UTF-16 sequences of only 1 or 2 code units are supported");
+ static_assert(sequence_length <= 2, "UTF-16 sequences of only 1 or 2 code units are supported");
- if constexpr(n == 1) {
+ if constexpr(sequence_length == 1) {
return is_valid_unicode(word0);
} else {
char16_t unit0 {static_cast<char16_t>(word0)};
@@ -143,22 +143,35 @@ namespace unicode::detail {
return true;
}
+ template<size_t sequence_length, typename value_type>
+ inline char32_t decode_utf8_leading_byte(value_type b) noexcept
+ {
+ return static_cast<char32_t>(b & (0b1111111 >> sequence_length)) << ((sequence_length - 1) * 6);
+ }
+
template<typename value_type>
- inline char32_t continuation_value(value_type b) noexcept
+ inline char32_t decode_utf8_followup_byte(value_type b) noexcept
{
return static_cast<char32_t>(b & 0b00111111);
}
template<typename value_type, typename... Targs>
- inline char32_t continuation_value(value_type b, Targs... Fargs) noexcept
+ inline char32_t decode_utf8_followup_byte(value_type b, Targs... bytes) noexcept
{
- return continuation_value(b) << (6 * sizeof...(Targs)) | continuation_value(Fargs...);
+ return decode_utf8_followup_byte(b) << (6 * sizeof...(Targs)) | decode_utf8_followup_byte(bytes...);
}
- template<size_t n, typename value_type>
- inline char32_t value_byte0_of(value_type b) noexcept
+ template<typename value_type, typename... Targs>
+ inline char32_t decode_utf8_sequence(value_type b, Targs... bytes) noexcept
{
- return static_cast<char32_t>(b & (0b1111111 >> n)) << ((n - 1) * 6);
+ size_t constexpr sequence_length{sizeof...(Targs) + 1};
+
+ static_assert(sequence_length <= 4);
+
+ if constexpr (sequence_length == 1)
+ return b;
+ else
+ return decode_utf8_leading_byte<sequence_length>(b) | decode_utf8_followup_byte(bytes...);
}
template<typename T, typename Container=std::basic_string<T>>
@@ -206,32 +219,32 @@ namespace unicode::detail {
utf8_t byte0 {static_cast<utf8_t>(get_code_unit<0>())};
if (is_utf8_sequence(byte0)) { // 1 byte: 7 bit ASCII
std::advance(iterator, 1);
- return byte0;
+ return decode_utf8_sequence(byte0);
} else {
internal_type value{};
if (size_t remaining{remaining_code_units()}; remaining >= 2) {
utf8_t byte1 {static_cast<utf8_t>(get_code_unit<1>())};
if (is_utf8_sequence(byte0, byte1)) { // 2 bytes
- value = value_byte0_of<2>(byte0) | continuation_value(byte1);
+ value = decode_utf8_sequence(byte0, byte1);
std::advance(iterator, 2);
} else if (remaining >= 3) {
utf8_t byte2 {static_cast<utf8_t>(get_code_unit<2>())};
if (is_utf8_sequence(byte0, byte1, byte2)) { // 3 bytes
- value = value_byte0_of<3>(byte0) | continuation_value(byte1, byte2);
+ value = decode_utf8_sequence(byte0, byte1, byte2);
std::advance(iterator, 3);
} else if (remaining >= 4) {
utf8_t byte3 {static_cast<utf8_t>(get_code_unit<3>())};
if (is_utf8_sequence(byte0, byte1, byte2, byte3)) { // 4 bytes
- value = value_byte0_of<4>(byte0) | continuation_value(byte1, byte2, byte3);
+ value = decode_utf8_sequence(byte0, byte1, byte2, byte3);
std::advance(iterator, 4);
} else
throw std::invalid_argument("Bad input: Invalid 4 byte sequence");
} else
- throw std::invalid_argument("Bad input: Invalid 3 byte sequence");
+ throw std::invalid_argument("Bad UTF-8 input: 4th byte expected, none found (end of sequence)");
} else
- throw std::invalid_argument("Bad input: Invalid 2 byte sequence");
+ throw std::invalid_argument("Bad UTF-8 input: 3rd byte expected, none found (end of sequence)");
} else
- throw std::invalid_argument("Bad input: 2nd byte expected, none found");
+ throw std::invalid_argument("Bad UTF-8 input: 2nd byte expected, none found (end of sequence)");
// check only for sequences >= 2 bytes (ASCII is always compliant)
if (!unicode::is_valid_unicode(value))
@@ -782,13 +795,13 @@ namespace unicode {
typename To::string_type convert(const typename From::string_type& s)
{
// if input type == output type, only validate and return input, is appropriate
- if constexpr(sizeof(typename From::value_type) == sizeof(typename To::value_type) == 1 &&
+ if constexpr(sizeof(typename From::value_type) == sizeof(typename To::value_type) &&
std::is_same_v<From, UTF<utf_iterator<typename From::value_type>, utf_back_insert_iterator<typename From::value_type>>> &&
std::is_same_v<To, UTF<utf_iterator<typename To::value_type>, utf_back_insert_iterator<typename To::value_type>>>) {
if (validate_utf<typename From::value_type>(s)) {
return s;
} else {
- throw std::invalid_argument("Invalid UTF-8");
+ throw std::invalid_argument("Invalid UTF input");
}
} if constexpr(accu_size == 4 || accu_size == 8) {
return convert_optimized<From, To>(s);