Add first working conversion UTF-8 -> UTF-16

author: Roland Reichwein <mail@reichwein.it> 2021-01-25 18:54:25 +0100
committer: Roland Reichwein <mail@reichwein.it> 2021-01-25 18:54:25 +0100
commit: 918d015302a004755ce0cf4968793cdf6a61bca8 (patch)
tree: a5c962b33a46249dfebf2a8506d9acedefac523c
parent: d8bddb9dc248bb3cc04116c97259ea6f5c13e6d0 (diff)
2 files changed, 69 insertions, 19 deletions
diff --git a/include/unicode.h b/include/unicode.h
index b90ed15..512891a 100644
--- a/include/unicode.h
+++ b/include/unicode.h
@@ -4,8 +4,15 @@
 #pragma once
 
 #include <algorithm>
+#include <stdexcept>
 #include <string>
 
+// __cpp_char8_t is a feature-test macro, not an attribute, so it is
+// tested with defined() rather than with __has_cpp_attribute().
+#if defined(__cpp_char8_t)
+// char8_t available
+#endif
+
 namespace {
 
  struct utf8_iterator
@@ -13,22 +20,57 @@ namespace {
   typedef char32_t value_type;
   typedef char32_t& reference;
 
-  void get_value()
+  utf8_iterator(const std::u8string::const_iterator& cbegin, const std::u8string::const_iterator& cend): // iterate over the UTF-8 bytes in [cbegin, cend)
+   iterator(cbegin), end_iterator(cend)
   {
-   // TODO: set value to current data in *iterator ...
-   value = 'X';
+   calculate_value(); // decode the first code point right away; may throw std::invalid_argument on malformed input
   }
 
-  size_t get_number_of_utf8_bytes()
+  utf8_iterator(const utf8_iterator& other) = default;
+  utf8_iterator& operator=(const utf8_iterator& other) = default;
+
+  // Decode the UTF-8 sequence at `iterator` into `value` and record its byte count in `sequence_length`; throws std::invalid_argument on malformed input
+  void calculate_value()
   {
-   // TODO: how many bytes
-   return 1;
+   if (iterator == end_iterator) // nothing left to decode: value and sequence_length keep their previous contents
+    return;
+
+   char8_t first_byte {*iterator};
+   if (first_byte & 0x80) { // high bit set: not ASCII, try the 2, 3 and 4 byte patterns in turn
+    if (iterator + 1 != end_iterator) {
+     char8_t second_byte {*(iterator + 1)};
+     if ((first_byte & 0b11100000) == 0b11000000 && (second_byte & 0b11000000) == 0b10000000) { // 2 bytes: 110xxxxx 10xxxxxx
+      value = char32_t(first_byte & 0b11111) << 6 | (second_byte & 0b111111);
+      sequence_length = 2;
+     } else if (iterator + 2 != end_iterator) {
+      char8_t third_byte {*(iterator + 2)};
+      if ((first_byte & 0b11110000) == 0b11100000 && (second_byte & 0b11000000) == 0b10000000 && (third_byte & 0b11000000) == 0b10000000) { // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
+       value = char32_t(first_byte & 0b1111) << 12 | char32_t(second_byte & 0b111111) << 6 | (third_byte & 0b111111);
+       sequence_length = 3;
+      } else if (iterator + 3 != end_iterator) {
+       char8_t fourth_byte {*(iterator + 3)};
+       if ((first_byte & 0b11111000) == 0b11110000 && (second_byte & 0b11000000) == 0b10000000 && (third_byte & 0b11000000) == 0b10000000 && (fourth_byte & 0b11000000) == 0b10000000) { // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+        value = char32_t(first_byte & 0b111) << 18 | char32_t(second_byte & 0b111111) << 12 | char32_t(third_byte & 0b111111) << 6 | (fourth_byte & 0b111111); // NOTE: overlong forms, surrogates and values above U+10FFFF are not rejected by any branch
+        sequence_length = 4;
+       } else
+        throw std::invalid_argument("bad input: invalid 4 byte sequence"); // none of the 2/3/4 byte patterns matched although at least 4 bytes were available
+      } else
+       throw std::invalid_argument("bad input: invalid 3 byte sequence"); // 2 and 3 byte patterns failed and exactly 3 bytes remain
+     } else
+      throw std::invalid_argument("bad input: invalid 2 byte sequence"); // 2 byte pattern failed and exactly 2 bytes remain (includes truncated longer sequences)
+    } else
+     throw std::invalid_argument("bad input: byte 2 expected, none found"); // non-ASCII byte is the last byte of the input
+   } else { // 1 byte: 7 bit ASCII
+    value = first_byte;
+    sequence_length = 1;
+   }
   }
 
   // pre-increment
   utf8_iterator& operator++()
   {
-   iterator += get_number_of_utf8_bytes();
+   iterator += sequence_length; // skip the bytes of the code point decoded by the last calculate_value() call
+   calculate_value(); // decode the next code point (returns without changes once end_iterator is reached)
    return *this;
   }
 
@@ -39,14 +81,14 @@ namespace {
 
   reference operator*()
   {
-   get_value();
    return value;
   }
 
   std::u8string::const_iterator iterator;
-
   std::u8string::const_iterator end_iterator;
+
   value_type value{};
+  size_t sequence_length{};
  };
 
  struct utf16_back_insert_iterator
@@ -70,7 +112,12 @@ namespace {
   // append utf-16 word sequence
   reference operator=(const char32_t& value)
   {
-   s.push_back(0); // TODO
+   if (value <= 0xFFFF) { // expect value to be already valid Unicode values, TODO: validate char32_t!
+    s.push_back(value); // BMP code point: stored as a single UTF-16 code unit
+   } else {
+    s.push_back(((value - 0x10000) >> 10) + 0xD800); // high surrogate: upper 10 bits of the 20-bit offset from U+10000
+    s.push_back((value & 0x3FF) + 0xDC00); // low surrogate: lower 10 bits (unaffected by subtracting 0x10000)
+   }
    return *this;
   }
 
@@ -96,12 +143,6 @@ namespace {
 
 namespace unicode {
 
-// returns number of bytes in UTF-8 byte sequence of first found code point,
-// if found. 0 if none found or sequence empty.
-//size_t utf8_start()
-//{
-//}
-
 std::u16string utf8_to_utf16(const std::u8string& s)
 {
  std::u16string result;
diff --git a/src/test-unicode.cpp b/src/test-unicode.cpp
index 4576d06..41fcd20 100644
--- a/src/test-unicode.cpp
+++ b/src/test-unicode.cpp
@@ -4,14 +4,23 @@
 
 #include <string>
 
-//#include <unicode.h>
+#include <unicode.h>
 
 BOOST_AUTO_TEST_CASE(utf8_to_utf16)
 {
  std::u8string u8{u8"ascii string1"};
  
- //std::u16string u16{unicode::utf8_to_utf16(u8)};
+ std::u16string u16{unicode::utf8_to_utf16(u8)};
 
- //BOOST_CHECK_EQUAL(u16, u"ascii string1");
+ BOOST_CHECK(u16 == u"ascii string1"); // ASCII-only input: multi-byte sequences and surrogate pairs are not exercised yet
 }
 
+// TODO:
+//  invalid bytes
+//  an unexpected continuation byte
+//  a non-continuation byte before the end of the character
+//  the string ending before the end of the character (which can happen in simple string truncation)
+//  an overlong encoding
+//  a sequence that decodes to an invalid code point
+//
+//  high and low surrogate halves used by UTF-16 (U+D800 through U+DFFF) and code points not encodable by UTF-16 (those after U+10FFFF)
author	Roland Reichwein <mail@reichwein.it>	2021-01-25 18:54:25 +0100
committer	Roland Reichwein <mail@reichwein.it>	2021-01-25 18:54:25 +0100
commit	918d015302a004755ce0cf4968793cdf6a61bca8 (patch)
tree	a5c962b33a46249dfebf2a8506d9acedefac523c
parent	d8bddb9dc248bb3cc04116c97259ea6f5c13e6d0 (diff)