summaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
author Roland Reichwein <mail@reichwein.it> 2021-01-25 18:54:25 +0100
committer Roland Reichwein <mail@reichwein.it> 2021-01-25 18:54:25 +0100
commit 918d015302a004755ce0cf4968793cdf6a61bca8 (patch)
tree a5c962b33a46249dfebf2a8506d9acedefac523c
parent d8bddb9dc248bb3cc04116c97259ea6f5c13e6d0 (diff)
Add first working conversion UTF-8 -> UTF-16
-rw-r--r--include/unicode.h73
-rw-r--r--src/test-unicode.cpp15
2 files changed, 69 insertions, 19 deletions
diff --git a/include/unicode.h b/include/unicode.h
index b90ed15..512891a 100644
--- a/include/unicode.h
+++ b/include/unicode.h
@@ -4,8 +4,15 @@
#pragma once
#include <algorithm>
+#include <stdexcept>
#include <string>
+// Detect char8_t support. __cpp_char8_t is a language feature-test macro,
+// not an attribute, so it has to be tested with #ifdef; asking
+// __has_cpp_attribute() about it never reports the feature.
+#ifdef __cpp_char8_t
+// char8_t available
+#endif
+
namespace {
struct utf8_iterator
@@ -13,22 +20,57 @@ namespace {
typedef char32_t value_type;
typedef char32_t& reference;
- void get_value()
+ utf8_iterator(const std::u8string::const_iterator& cbegin, const std::u8string::const_iterator& cend): // iterate over the UTF-8 bytes in [cbegin, cend)
+ iterator(cbegin), end_iterator(cend)
{
- // TODO: set value to current data in *iterator ...
- value = 'X';
+ calculate_value(); // decode the first code point right away so operator* can simply return the cached value; throws std::invalid_argument if the leading sequence is malformed
}
- size_t get_number_of_utf8_bytes()
+ utf8_iterator(const utf8_iterator& other) = default; // member-wise copy: both byte iterators plus the cached value and sequence_length
+ utf8_iterator& operator=(const utf8_iterator& other) = default; // member-wise assignment of the same state
+
+ // Decode the UTF-8 sequence starting at `iterator` into `value` and store its byte count in `sequence_length`; does nothing at end of input; throws std::invalid_argument on malformed or truncated sequences. NOTE: overlong encodings, surrogates (U+D800..U+DFFF) and values above U+10FFFF are not rejected here.
+ void calculate_value()
{
- // TODO: how many bytes
- return 1;
+ if (iterator == end_iterator)
+ return; // nothing left to decode; value and sequence_length keep their previous contents
+
+ char8_t first_byte {*iterator};
+ if (first_byte & 0x80) { // high bit set: not ASCII, must be a 2-4 byte sequence
+ if (iterator + 1 != end_iterator) {
+ char8_t second_byte {*(iterator + 1)};
+ if ((first_byte & 0b11100000) == 0b11000000 && (second_byte & 0b11000000) == 0b10000000) { // 2 bytes
+ value = char32_t(first_byte & 0b11111) << 6 | (second_byte & 0b111111); // 5 payload bits from the lead byte, 6 from the continuation byte
+ sequence_length = 2;
+ } else if (iterator + 2 != end_iterator) {
+ char8_t third_byte {*(iterator + 2)};
+ if ((first_byte & 0b11110000) == 0b11100000 && (second_byte & 0b11000000) == 0b10000000 && (third_byte & 0b11000000) == 0b10000000) { // 3 bytes
+ value = char32_t(first_byte & 0b1111) << 12 | char32_t(second_byte & 0b111111) << 6 | (third_byte & 0b111111); // 4 + 6 + 6 payload bits
+ sequence_length = 3;
+ } else if (iterator + 3 != end_iterator) {
+ char8_t fourth_byte {*(iterator + 3)};
+ if ((first_byte & 0b11111000) == 0b11110000 && (second_byte & 0b11000000) == 0b10000000 && (third_byte & 0b11000000) == 0b10000000 && (fourth_byte & 0b11000000) == 0b10000000) { // 4 bytes
+ value = char32_t(first_byte & 0b111) << 18 | char32_t(second_byte & 0b111111) << 12 | char32_t(third_byte & 0b111111) << 6 | (fourth_byte & 0b111111); // 3 + 6 + 6 + 6 payload bits
+ sequence_length = 4;
+ } else
+ throw std::invalid_argument("bad input: invalid 4 byte sequence"); // at least 4 bytes remain but none of the 2-, 3- or 4-byte forms matched
+ } else
+ throw std::invalid_argument("bad input: invalid 3 byte sequence"); // exactly 3 bytes remain and neither the 2- nor the 3-byte form matched
+ } else
+ throw std::invalid_argument("bad input: invalid 2 byte sequence"); // exactly 2 bytes remain and they are not a valid 2-byte sequence
+ } else
+ throw std::invalid_argument("bad input: byte 2 expected, none found"); // non-ASCII byte is the last byte of the input
+ } else { // 1 byte: 7 bit ASCII
+ value = first_byte;
+ sequence_length = 1;
+ }
}
// pre-increment: step over the current code point and decode the next one
utf8_iterator& operator++()
{
- iterator += get_number_of_utf8_bytes();
+ iterator += sequence_length; // skip every byte of the sequence decoded by the previous calculate_value()
+ calculate_value(); // refresh value/sequence_length; may throw std::invalid_argument on malformed input
return *this;
}
@@ -39,14 +81,14 @@ namespace {
reference operator*()
{
- get_value();
return value; // already decoded by calculate_value() in the constructor / operator++; handed out by reference (char32_t&)
}
std::u8string::const_iterator iterator; // first byte of the current UTF-8 sequence
-
std::u8string::const_iterator end_iterator; // end of the input; calculate_value() never reads at or beyond it
+
value_type value{}; // code point decoded by the last successful calculate_value()
+ size_t sequence_length{}; // number of bytes that encode `value`; operator++ advances by this amount
};
struct utf16_back_insert_iterator
@@ -70,7 +112,12 @@ namespace {
// append utf-16 word sequence
reference operator=(const char32_t& value)
{
- s.push_back(0); // TODO
+ // Appends `value` to `s` as UTF-16: one code unit for U+0000..U+FFFF,
+ // otherwise a high/low surrogate pair. Returns *this.
+ if (value <= 0xFFFF) { // expect value to be already valid Unicode values, TODO: validate char32_t!
+ s.push_back(value);
+ } else {
+ // Surrogate pairs encode the 20-bit offset (value - 0x10000), not the
+ // raw code point: the top 10 bits go into the high surrogate. The low
+ // 10 bits are unaffected by the subtraction.
+ s.push_back(((value - 0x10000) >> 10) + 0xD800);
+ s.push_back((value & 0x3FF) + 0xDC00);
+ }
return *this;
}
@@ -96,12 +143,6 @@ namespace {
namespace unicode {
-// returns number of bytes in UTF-8 byte sequence of first found code point,
-// if found. 0 if none found or sequence empty.
-//size_t utf8_start()
-//{
-//}
-
std::u16string utf8_to_utf16(const std::u8string& s)
{
std::u16string result;
diff --git a/src/test-unicode.cpp b/src/test-unicode.cpp
index 4576d06..41fcd20 100644
--- a/src/test-unicode.cpp
+++ b/src/test-unicode.cpp
@@ -4,14 +4,23 @@
#include <string>
-//#include <unicode.h>
+#include <unicode.h>
BOOST_AUTO_TEST_CASE(utf8_to_utf16) // ASCII-only input must convert to the same text in UTF-16
{
std::u8string u8{u8"ascii string1"};
- //std::u16string u16{unicode::utf8_to_utf16(u8)};
+ std::u16string u16{unicode::utf8_to_utf16(u8)};
- //BOOST_CHECK_EQUAL(u16, u"ascii string1");
+ BOOST_CHECK(u16 == u"ascii string1"); // NOTE(review): presumably BOOST_CHECK instead of BOOST_CHECK_EQUAL because std::u16string cannot be streamed for the failure message - confirm
}
+// TODO:
+// invalid bytes
+// an unexpected continuation byte
+// a non-continuation byte before the end of the character
+// the string ending before the end of the character (which can happen in simple string truncation)
+// an overlong encoding
+// a sequence that decodes to an invalid code point
+//
+// high and low surrogate halves used by UTF-16 (U+D800 through U+DFFF) and code points not encodable by UTF-16 (those after U+10FFFF)