summaryrefslogtreecommitdiffhomepage
path: root/include/unicode.h
diff options
context:
space:
mode:
Diffstat (limited to 'include/unicode.h')
-rw-r--r--include/unicode.h73
1 files changed, 57 insertions, 16 deletions
diff --git a/include/unicode.h b/include/unicode.h
index b90ed15..512891a 100644
--- a/include/unicode.h
+++ b/include/unicode.h
@@ -4,8 +4,15 @@
#pragma once
#include <algorithm>
+#include <stdexcept>
#include <string>
+// __cpp_char8_t is a language feature-test macro, not an attribute, so it is
+// tested directly; __has_cpp_attribute only answers questions about [[attributes]].
+#ifdef __cpp_char8_t
+// char8_t available
+#endif
+
namespace {
struct utf8_iterator
@@ -13,22 +20,57 @@ namespace {
typedef char32_t value_type;
typedef char32_t& reference;
- void get_value()
+ utf8_iterator(const std::u8string::const_iterator& cbegin, const std::u8string::const_iterator& cend): // builds an iterator over the bytes between cbegin and cend
+ iterator(cbegin), end_iterator(cend)
{
- // TODO: set value to current data in *iterator ...
- value = 'X';
+ calculate_value(); // decode the first code point up front; returns immediately when cbegin == cend, may throw std::invalid_argument on bad input
}
- size_t get_number_of_utf8_bytes()
+ utf8_iterator(const utf8_iterator& other) = default;
+ utf8_iterator& operator=(const utf8_iterator& other) = default;
+
+ // Decodes the UTF-8 sequence starting at `iterator` into the `value` member
+ // and records its byte count in `sequence_length`.
+ // At `end_iterator` nothing is decoded and `sequence_length` is set to 0, so
+ // a following operator++ leaves the position unchanged instead of running
+ // past the end with a stale length.
+ // Throws std::invalid_argument on truncated or malformed sequences, overlong
+ // encodings, UTF-16 surrogate code points and values above U+10FFFF.
+ void calculate_value()
{
- // TODO: how many bytes
- return 1;
+ if (iterator == end_iterator) {
+     sequence_length = 0; // nothing left to step over
+     return;
+ }
+
+ const char8_t first_byte {*iterator};
+ if (first_byte & 0x80) { // 2-4 bytes
+     if (iterator + 1 != end_iterator) {
+         const char8_t second_byte {*(iterator + 1)};
+         if ((first_byte & 0b11100000) == 0b11000000 && (second_byte & 0b11000000) == 0b10000000) { // 2 bytes
+             const char32_t code_point = char32_t(first_byte & 0b11111) << 6 | (second_byte & 0b111111);
+             if (code_point < 0x80) // would have fit into a single byte
+                 throw std::invalid_argument("bad input: overlong 2 byte sequence");
+             value = code_point;
+             sequence_length = 2;
+         } else if (iterator + 2 != end_iterator) {
+             const char8_t third_byte {*(iterator + 2)};
+             if ((first_byte & 0b11110000) == 0b11100000 && (second_byte & 0b11000000) == 0b10000000 && (third_byte & 0b11000000) == 0b10000000) { // 3 bytes
+                 const char32_t code_point = char32_t(first_byte & 0b1111) << 12 | char32_t(second_byte & 0b111111) << 6 | (third_byte & 0b111111);
+                 if (code_point < 0x800) // would have fit into two bytes
+                     throw std::invalid_argument("bad input: overlong 3 byte sequence");
+                 if (code_point >= 0xD800 && code_point <= 0xDFFF) // reserved for UTF-16, never valid in UTF-8
+                     throw std::invalid_argument("bad input: surrogate code point in 3 byte sequence");
+                 value = code_point;
+                 sequence_length = 3;
+             } else if (iterator + 3 != end_iterator) {
+                 const char8_t fourth_byte {*(iterator + 3)};
+                 if ((first_byte & 0b11111000) == 0b11110000 && (second_byte & 0b11000000) == 0b10000000 && (third_byte & 0b11000000) == 0b10000000 && (fourth_byte & 0b11000000) == 0b10000000) { // 4 bytes
+                     const char32_t code_point = char32_t(first_byte & 0b111) << 18 | char32_t(second_byte & 0b111111) << 12 | char32_t(third_byte & 0b111111) << 6 | (fourth_byte & 0b111111);
+                     if (code_point < 0x10000) // would have fit into three bytes
+                         throw std::invalid_argument("bad input: overlong 4 byte sequence");
+                     if (code_point > 0x10FFFF) // beyond the last Unicode code point
+                         throw std::invalid_argument("bad input: code point beyond U+10FFFF");
+                     value = code_point;
+                     sequence_length = 4;
+                 } else
+                     throw std::invalid_argument("bad input: invalid 4 byte sequence");
+             } else
+                 throw std::invalid_argument("bad input: invalid 3 byte sequence");
+         } else
+             throw std::invalid_argument("bad input: invalid 2 byte sequence");
+     } else
+         throw std::invalid_argument("bad input: byte 2 expected, none found");
+ } else { // 1 byte: 7 bit ASCII
+     value = first_byte;
+     sequence_length = 1;
+ }
}
// pre-increment: step over the current code point and decode the next one
utf8_iterator& operator++()
{
- iterator += get_number_of_utf8_bytes();
+ iterator += sequence_length; // byte count of the code point decoded last
+ calculate_value(); // refreshes value and sequence_length; may throw std::invalid_argument
return *this;
}
@@ -39,14 +81,14 @@ namespace {
reference operator*() // yields the code point held in the value member
{
- get_value();
return value; // already decoded by calculate_value(); nothing is decoded here
}
std::u8string::const_iterator iterator;
-
std::u8string::const_iterator end_iterator;
+
value_type value{};
+ size_t sequence_length{};
};
struct utf16_back_insert_iterator
@@ -70,7 +112,12 @@ namespace {
// append utf-16 word sequence
+ // Code points up to U+FFFF are appended to `s` as a single code unit;
+ // anything above is appended as a high/low surrogate pair. Returns *this.
reference operator=(const char32_t& value)
{
- s.push_back(0); // TODO
+ if (value <= 0xFFFF) { // expect value to be already valid Unicode values, TODO: validate char32_t!
+     s.push_back(value);
+ } else {
+     // Surrogates carry the 20-bit offset from U+10000, not the raw code
+     // point: U+10000 must become D800 DC00, not D840 DC00.
+     const char32_t offset = value - 0x10000;
+     s.push_back((offset >> 10) + 0xD800);
+     s.push_back((offset & 0x3FF) + 0xDC00);
+ }
return *this;
}
@@ -96,12 +143,6 @@ namespace {
namespace unicode {
-// returns number of bytes in UTF-8 byte sequence of first found code point,
-// if found. 0 if none found or sequence empty.
-//size_t utf8_start()
-//{
-//}
-
std::u16string utf8_to_utf16(const std::u8string& s)
{
std::u16string result;