// include/unicode.h

// libunicode
// Copyright (C) 2021 Roland Reichwein

#pragma once

#include <algorithm>
#include <cstddef>
#include <iterator>
#include <stdexcept>
#include <string>

// Feature detection for char8_t. __cpp_char8_t is a feature-test macro, not
// an attribute name, so it is tested with #ifdef instead of being passed to
// __has_cpp_attribute() (whose operand is macro-expanded and must name an
// attribute).
#ifdef __cpp_char8_t
// char8_t available
#endif

namespace {

 struct utf8_iterator
 {
  typedef char32_t value_type;
  typedef char32_t& reference;

  utf8_iterator(const std::u8string::const_iterator& cbegin, const std::u8string::const_iterator& cend):
   iterator(cbegin), end_iterator(cend)
  {
   calculate_value();
  }

  utf8_iterator(const utf8_iterator& other) = default;
  utf8_iterator& operator=(const utf8_iterator& other) = default;

  // set value member
  void calculate_value()
  {
   if (iterator == end_iterator)
    return;

   char8_t first_byte {*iterator};
   if (first_byte & 0x80) { // 2-4 bytes
    if (iterator + 1 != end_iterator) {
     char8_t second_byte {*(iterator + 1)};
     if ((first_byte & 0b11100000) == 0b11000000 && (second_byte & 0b11000000) == 0b10000000) { // 2 bytes
      value = char32_t(first_byte & 0b11111) << 6 | (second_byte & 0b111111);
      sequence_length = 2;
     } else if (iterator + 2 != end_iterator) {
      char8_t third_byte {*(iterator + 2)};
      if ((first_byte & 0b11110000) == 0b11100000 && (second_byte & 0b11000000) == 0b10000000 && (third_byte & 0b11000000) == 0b10000000) { // 3 bytes
       value = char32_t(first_byte & 0b1111) << 12 | char32_t(second_byte & 0b111111) << 6 | (third_byte & 0b111111);
       sequence_length = 3;
      } else if (iterator + 3 != end_iterator) {
       char8_t fourth_byte {*(iterator + 3)};
       if ((first_byte & 0b11111000) == 0b11110000 && (second_byte & 0b11000000) == 0b10000000 && (third_byte & 0b11000000) == 0b10000000 && (fourth_byte & 0b11000000) == 0b10000000) { // 4 bytes
        value = char32_t(first_byte & 0b111) << 18 | char32_t(second_byte & 0b111111) << 12 | char32_t(third_byte & 0b111111) << 6 | (fourth_byte & 0b111111);
        sequence_length = 4;
       } else
        throw std::invalid_argument("bad input: invalid 4 byte sequence");
      } else
       throw std::invalid_argument("bad input: invalid 3 byte sequence");
     } else
      throw std::invalid_argument("bad input: invalid 2 byte sequence");
    } else
     throw std::invalid_argument("bad input: byte 2 expected, none found");
   } else { // 1 byte: 7 bit ASCII
    value = first_byte;
    sequence_length = 1;
   }
  }

  // pre-increment
  utf8_iterator& operator++()
  {
   iterator += sequence_length;
   calculate_value();
   return *this;
  }

  bool operator!=(const utf8_iterator& other) const
  {
   return iterator != other.iterator;
  }

  reference operator*()
  {
   return value;
  }

  std::u8string::const_iterator iterator;
  std::u8string::const_iterator end_iterator;

  value_type value{};
  size_t sequence_length{};
 };

 // Output iterator that appends Unicode code points to a std::u16string as
 // UTF-16 code units. It is used like std::back_insert_iterator:
 // `*it = code_point` appends, incrementing does nothing.
 struct utf16_back_insert_iterator
 {
  typedef utf16_back_insert_iterator& reference;
  // Remaining member types looked up by std::iterator_traits, so that the
  // iterator can be handed to standard algorithms such as std::copy.
  typedef void value_type;
  typedef void pointer;
  typedef std::ptrdiff_t difference_type;
  typedef std::output_iterator_tag iterator_category;

  // s: destination string. The iterator stores a reference to it, so the
  // string has to outlive the iterator.
  utf16_back_insert_iterator(std::u16string& s): s(s) {}

  // no-op
  utf16_back_insert_iterator& operator++()
  {
   return *this;
  }

  // no-op (post-increment), so that `*it++ = value` works as well
  utf16_back_insert_iterator operator++(int)
  {
   return *this;
  }

  // support *x = value, together with operator=()
  reference operator*()
  {
   return *this;
  }

  // Append `value` as one UTF-16 code unit (U+0000..U+FFFF) or as a
  // surrogate pair (U+10000..U+10FFFF).
  // Throws std::invalid_argument, without modifying the string, if `value`
  // is a surrogate (U+D800..U+DFFF) or lies above U+10FFFF.
  reference operator=(const char32_t& value)
  {
   if (value > 0x10FFFF)
    throw std::invalid_argument("bad input: code point out of range");
   if (value >= 0xD800 && value <= 0xDFFF)
    throw std::invalid_argument("bad input: surrogate code point");

   if (value <= 0xFFFF) {
    s.push_back(static_cast<char16_t>(value));
   } else {
    // Surrogate pairs carry the 20 bit offset from U+10000, not the code
    // point itself: high 10 bits in the lead surrogate, low 10 bits in the
    // trail surrogate.
    const char32_t offset = value - 0x10000;
    s.push_back(static_cast<char16_t>(0xD800 + (offset >> 10)));
    s.push_back(static_cast<char16_t>(0xDC00 + (offset & 0x3FF)));
   }
   return *this;
  }

  std::u16string& s; // destination of the appended code units
 };

 // Convenience factory: build a utf16_back_insert_iterator that appends to s.
 utf16_back_insert_iterator utf16_back_inserter(std::u16string& s)
 {
  utf16_back_insert_iterator inserter(s);
  return inserter;
 }

 // Decoding iterator positioned at the first code unit of s.
 // Constructing it already decodes the first sequence of s.
 utf8_iterator utf8_begin(const std::u8string& s)
 {
  const auto first = s.cbegin();
  const auto last = s.cend();
  return utf8_iterator(first, last);
 }

 // Decoding iterator positioned at the end of s; serves as the past-the-end
 // marker for a range started with utf8_begin(s).
 utf8_iterator utf8_end(const std::u8string& s)
 {
  const auto last = s.cend();
  return utf8_iterator(last, last);
 }

} // namespace

namespace unicode {

std::u16string utf8_to_utf16(const std::u8string& s)
{
 std::u16string result;

 std::copy(utf8_begin(s), utf8_end(s), utf16_back_inserter(result));

 return result;
}

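// TODO: utf16_to_utf8() is not implemented yet. The sketch below is disabled:
// it relies on utf16_begin()/utf16_end(), which are not defined in the visible
// part of this file, and std::transform() would additionally need a unary
// operation (utf8_to_utf16() above uses std::copy() instead).
//
//std::u8string utf16_to_utf8(const std::u16string& s)
//{
// std::u8string result;
//
// std::transform(utf16_begin(s), utf16_end(s), std::back_inserter(result));
//
// return result;
//}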

} // namespace unicode