summary | refs | log | tree | commit | diff | homepage
path: root/include
diff options
context:
space:
mode:
author: Roland Reichwein <mail@reichwein.it> 2021-12-19 19:51:38 +0100
committer: Roland Reichwein <mail@reichwein.it> 2021-12-19 19:51:38 +0100
commit: 9dc97269201603dd479e15a736a64479a5095556 (patch)
tree: b5f215bf9cfbbf2bee092505f4fdfbf3e4501b7b /include
parent: e24a0d5d371d0916dbfb375d3ea404f7e6237c74 (diff)
Simplify utf_iterator for input, build on Debian 10+11, Ubuntu 2004-2204
Diffstat (limited to 'include')
-rw-r--r-- include/unicode.h 85
1 files changed, 37 insertions, 48 deletions
diff --git a/include/unicode.h b/include/unicode.h
index 6b6f21a..6d8aac5 100644
--- a/include/unicode.h
+++ b/include/unicode.h
@@ -28,7 +28,7 @@ namespace unicode {
// usually, char32_t, uint32_t etc.
template<typename T>
- static inline bool is_valid_unicode(const T& value)
+ static inline bool is_valid_unicode(const T& value) noexcept
{
return value <= 0xD7FF || (value >= 0xE000 && value <= 0x10FFFF);
}
@@ -55,19 +55,18 @@ namespace unicode::detail {
utf_iterator(const typename string_type::const_iterator& cbegin, const typename string_type::const_iterator& cend):
iterator(cbegin), end_iterator(cend)
{
- calculate_value();
}
utf_iterator(const utf_iterator& other) = default;
utf_iterator& operator=(const utf_iterator& other) = default;
- size_t remaining_code_units() const
+ size_t remaining_code_units() const noexcept
{
return std::distance(iterator, end_iterator);
}
template<size_t index>
- T get_code_unit() const
+ T get_code_unit() const noexcept
{
if constexpr (std::is_same<Container, typename std::list<T>>::value) {
// std::list doesn't support it + n
@@ -79,46 +78,49 @@ namespace unicode::detail {
}
}
- inline static bool is_continuation_byte(T b)
+ inline static bool is_continuation_byte(T b) noexcept
{
return (b & 0b11000000) == 0b10000000;
}
template<typename... Targs>
- inline static bool is_continuation_byte(T b, Targs... Fargs)
+ inline static bool is_continuation_byte(T b, Targs... Fargs) noexcept
{
return is_continuation_byte(b) && is_continuation_byte(Fargs...);
}
template<size_t n>
- inline static bool is_byte0_of(T b)
+ inline static bool is_byte0_of(T b) noexcept
{
return (b & static_cast<T>(0xFF << (7 - n))) == static_cast<T>(0xFF << (8 - n));
}
- inline static char32_t continuation_value(T b)
+ inline static char32_t continuation_value(T b) noexcept
{
return static_cast<char32_t>(b & 0b00111111);
}
template<typename... Targs>
- inline static char32_t continuation_value(T b, Targs... Fargs)
+ inline static char32_t continuation_value(T b, Targs... Fargs) noexcept
{
return continuation_value(b) << (6 * sizeof...(Targs)) | continuation_value(Fargs...);
}
template<size_t n>
- inline static char32_t value_byte0_of(T b)
+ inline static char32_t value_byte0_of(T b) noexcept
{
return static_cast<char32_t>(b & (0b1111111 >> n)) << ((n - 1) * 6);
}
- void calculate_value_utf8()
+ template<class X = T, typename std::enable_if<(sizeof(X) == 1), bool>::type = true>
+ inline value_type calculate_value()
{
size_t remaining{remaining_code_units()};
if (!remaining)
- return;
+ return {};
+
+ value_type value{};
utf8_t byte0 {static_cast<utf8_t>(get_code_unit<0>())};
if (byte0 & 0x80) { // 2-4 bytes
@@ -126,17 +128,17 @@ namespace unicode::detail {
utf8_t byte1 {static_cast<utf8_t>(get_code_unit<1>())};
if (is_byte0_of<2>(byte0) && is_continuation_byte(byte1)) { // 2 bytes
value = value_byte0_of<2>(byte0) | continuation_value(byte1);
- sequence_length = 2;
+ std::advance(iterator, 2);
} else if (remaining >= 3) {
utf8_t byte2 {static_cast<utf8_t>(get_code_unit<2>())};
if (is_byte0_of<3>(byte0) && is_continuation_byte(byte1, byte2)) { // 3 bytes
value = value_byte0_of<3>(byte0) | continuation_value(byte1, byte2);
- sequence_length = 3;
+ std::advance(iterator, 3);
} else if (remaining >= 4) {
utf8_t byte3 {static_cast<utf8_t>(get_code_unit<3>())};
if (is_byte0_of<4>(byte0) && is_continuation_byte(byte1, byte2, byte3)) { // 4 bytes
value = value_byte0_of<4>(byte0) | continuation_value(byte1, byte2, byte3);
- sequence_length = 4;
+ std::advance(iterator, 4);
} else
throw std::invalid_argument("Bad input: Invalid 4 byte sequence");
} else
@@ -152,22 +154,25 @@ namespace unicode::detail {
} else { // 1 byte: 7 bit ASCII
value = byte0;
- sequence_length = 1;
+ std::advance(iterator, 1);
}
+
+ return value;
}
- void calculate_value_utf16()
+ template<class X = T, typename std::enable_if<(sizeof(X) == 2), bool>::type = true>
+ inline value_type calculate_value()
{
size_t remaining{remaining_code_units()};
if (!remaining)
- return;
+ return {};
char16_t unit0 {static_cast<char16_t>(get_code_unit<0>())};
if (unit0 <= 0xD7FF || unit0 >= 0xE000) { // 1 unit (BMP Basic Multilingual Plane)
- value = unit0;
- sequence_length = 1;
+ std::advance(iterator, 1);
+ return unit0;
} else {
if (remaining < 2)
throw std::invalid_argument("Bad input: Continuation of first UTF-16 unit missing");
@@ -176,45 +181,32 @@ namespace unicode::detail {
if ((unit0 & 0xFC00) != 0xD800 || (unit1 & 0xFC00) != 0xDC00)
throw std::invalid_argument("Bad input: 2 malformed UTF-16 surrogates");
- value = (static_cast<char32_t>(unit0 & 0x03FF) << 10 | (unit1 & 0x03FF)) + 0x10000;
- sequence_length = 2;
+ std::advance(iterator, 2);
+ return (static_cast<char32_t>(unit0 & 0x03FF) << 10 | (unit1 & 0x03FF)) + 0x10000;
}
}
- void calculate_value_utf32()
+ template<class X = T, typename std::enable_if<(sizeof(X) == 4), bool>::type = true>
+ inline value_type calculate_value()
{
size_t remaining{remaining_code_units()};
if (!remaining)
- return;
+ return {};
- value = static_cast<char32_t>(get_code_unit<0>());
-
- if (!unicode::is_valid_unicode(value))
- throw std::invalid_argument("Invalid Unicode character: "s + std::to_string(static_cast<uint32_t>(value)));
+ value_type result {static_cast<char32_t>(get_code_unit<0>())};
- sequence_length = 1;
- }
+ if (!unicode::is_valid_unicode(result))
+ throw std::invalid_argument("Invalid Unicode character: "s + std::to_string(static_cast<uint32_t>(result)));
- // set value member
- void calculate_value()
- {
- static_assert(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4);
+ std::advance(iterator, 1);
- if constexpr(sizeof(T) == 1) {
- calculate_value_utf8();
- } else if constexpr (sizeof(T) == 2) {
- calculate_value_utf16();
- } else if constexpr (sizeof(T) == 4) {
- calculate_value_utf32();
- }
+ return result;
}
// pre-increment
utf_iterator& operator++()
{
- std::advance(iterator, sequence_length);
- calculate_value();
return *this;
}
@@ -223,17 +215,14 @@ namespace unicode::detail {
return std::distance(iterator, end_iterator) != std::distance(other.iterator, other.end_iterator);
}
- reference operator*()
+ value_type operator*()
{
- return value;
+ return calculate_value();
}
private:
typename string_type::const_iterator iterator;
typename string_type::const_iterator end_iterator;
-
- char32_t value{}; // always save complete unicode code point at this point
- size_t sequence_length{};
};
template<typename T, typename Container=std::basic_string<T>>