summaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authorRoland Reichwein <mail@reichwein.it>2021-02-14 17:50:51 +0100
committerRoland Reichwein <mail@reichwein.it>2021-02-14 17:50:51 +0100
commit268b7845af166c68b1c226f0be9ba5cf983ae91c (patch)
treeec17636d4411ae9f4fe70857a3cfddbe035e39e1
parent07c77b45ba9f74cfe1bed547bea1eeb705f0582b (diff)
Support different std containers, support different basic types
-rw-r--r--debian/README.Debian21
-rw-r--r--include/unicode.h117
-rw-r--r--src/test-unicode.cpp83
3 files changed, 164 insertions, 57 deletions
diff --git a/debian/README.Debian b/debian/README.Debian
index 382d20d..0a47d0a 100644
--- a/debian/README.Debian
+++ b/debian/README.Debian
@@ -57,6 +57,11 @@ The following encodings are implicitly deducted from types:
* char16_t: UTF-16
* char32_t: UTF-32
+You can specify different container types directly:
+
+ std::deque<char> utf8_value {...};
+ std::list<wchar_t> utf16_value{unicode::convert<std::deque<char>, std::list<wchar_t>>(utf8_value)};
+
Explicit encoding specification is also possible:
std::string value {"äöü"};
@@ -70,6 +75,22 @@ Supported encodings are:
* unicode::ISO_8859_1
* unicode::ISO_8859_15
+Supported basic types:
+ * char
+ * char8_t (C++20)
+ * wchar_t (UTF-16 on Windows, UTF-32 on Linux)
+ * char16_t
+ * char32_t
+ * uint8_t, int8_t
+ * uint16_t, int16_t
+ * uint32_t, int32_t
+ * basically, all basic 8-bit, 16-bit and 32-bit types that can encode
+ UTF-8, UTF-16 and UTF-32, respectively.
+
+Supported container types:
+ * All std container types that can be iterated (vector, list, deque, array)
+ * Source and target containers can be different container types
+
Validation can be done like this:
bool valid{unicode::is_valid_utf<char16_t>(utf16_value)};
diff --git a/include/unicode.h b/include/unicode.h
index 171496e..6d7ef16 100644
--- a/include/unicode.h
+++ b/include/unicode.h
@@ -37,7 +37,7 @@ namespace unicode::detail {
using namespace std::string_literals;
- template<typename T>
+ template<typename T, typename Container=std::basic_string<T>>
struct utf_iterator
{
static_assert(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4);
@@ -48,7 +48,7 @@ namespace unicode::detail {
typedef char32_t* pointer;
typedef size_t difference_type;
typedef std::input_iterator_tag iterator_category;
- typedef std::basic_string<T> string_type;
+ typedef Container string_type;
utf_iterator(const typename string_type::const_iterator& cbegin, const typename string_type::const_iterator& cend):
iterator(cbegin), end_iterator(cend)
@@ -56,18 +56,25 @@ namespace unicode::detail {
calculate_value();
}
- utf_iterator<T>(const utf_iterator<T>& other) = default;
- utf_iterator<T>& operator=(const utf_iterator<T>& other) = default;
+ utf_iterator(const utf_iterator& other) = default;
+ utf_iterator& operator=(const utf_iterator& other) = default;
- size_t remaining_code_units()
+ size_t remaining_code_units() const
{
- return end_iterator - iterator;
+ return std::distance(iterator, end_iterator);
}
template<size_t index>
- T get_code_unit()
+ T get_code_unit() const
{
- return *(iterator + index);
+ if constexpr (std::is_same<Container, typename std::list<T>>::value) {
+ // std::list iterators are not random-access, so `iterator + n` is not available
+ auto it{iterator};
+ std::advance(it, index);
+ return *it;
+ } else {
+ return *(iterator + index);
+ }
}
inline static bool is_continuation_byte(T b)
@@ -111,20 +118,20 @@ namespace unicode::detail {
if (!remaining)
return;
- utf8_t byte0 {get_code_unit<0>()};
+ utf8_t byte0 {static_cast<utf8_t>(get_code_unit<0>())};
if (byte0 & 0x80) { // 2-4 bytes
if (remaining >= 2) {
- utf8_t byte1 {get_code_unit<1>()};
+ utf8_t byte1 {static_cast<utf8_t>(get_code_unit<1>())};
if (is_byte0_of<2>(byte0) && is_continuation_byte(byte1)) { // 2 bytes
value = value_byte0_of<2>(byte0) | continuation_value(byte1);
sequence_length = 2;
} else if (remaining >= 3) {
- utf8_t byte2 {get_code_unit<2>()};
+ utf8_t byte2 {static_cast<utf8_t>(get_code_unit<2>())};
if (is_byte0_of<3>(byte0) && is_continuation_byte(byte1, byte2)) { // 3 bytes
value = value_byte0_of<3>(byte0) | continuation_value(byte1, byte2);
sequence_length = 3;
} else if (remaining >= 4) {
- utf8_t byte3 {get_code_unit<3>()};
+ utf8_t byte3 {static_cast<utf8_t>(get_code_unit<3>())};
if (is_byte0_of<4>(byte0) && is_continuation_byte(byte1, byte2, byte3)) { // 4 bytes
value = value_byte0_of<4>(byte0) | continuation_value(byte1, byte2, byte3);
sequence_length = 4;
@@ -154,7 +161,7 @@ namespace unicode::detail {
if (!remaining)
return;
- char16_t unit0 {get_code_unit<0>()};
+ char16_t unit0 {static_cast<char16_t>(get_code_unit<0>())};
if (unit0 <= 0xD7FF || unit0 >= 0xE000) { // 1 unit (BMP Basic Multilingual Plane)
value = unit0;
@@ -163,7 +170,7 @@ namespace unicode::detail {
if (remaining < 2)
throw std::invalid_argument("Bad input: Continuation of first UTF-16 unit missing");
- char16_t unit1 {get_code_unit<1>()};
+ char16_t unit1 {static_cast<char16_t>(get_code_unit<1>())};
if ((unit0 & 0xFC00) != 0xD800 || (unit1 & 0xFC00) != 0xDC00)
throw std::invalid_argument("Bad input: 2 malformed UTF-16 surrogates");
@@ -179,7 +186,7 @@ namespace unicode::detail {
if (!remaining)
return;
- value = get_code_unit<0>();
+ value = static_cast<char32_t>(get_code_unit<0>());
if (!unicode::is_valid_unicode(value))
throw std::invalid_argument("Invalid Unicode character: "s + std::to_string(static_cast<uint32_t>(value)));
@@ -202,16 +209,16 @@ namespace unicode::detail {
}
// pre-increment
- utf_iterator<T>& operator++()
+ utf_iterator& operator++()
{
- iterator += sequence_length;
+ std::advance(iterator, sequence_length);
calculate_value();
return *this;
}
- bool operator!=(const utf_iterator<T>& other) const
+ bool operator!=(const utf_iterator& other) const
{
- return iterator != other.iterator;
+ return std::distance(iterator, end_iterator) != std::distance(other.iterator, other.end_iterator);
}
reference operator*()
@@ -227,13 +234,13 @@ namespace unicode::detail {
size_t sequence_length{};
};
- template<typename T>
+ template<typename T, typename Container=std::basic_string<T>>
struct utf_back_insert_iterator
{
static_assert(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4);
typedef T value_type;
- typedef std::basic_string<T> string_type;
+ typedef Container string_type;
typedef utf_back_insert_iterator& reference;
typedef utf_back_insert_iterator* pointer;
typedef size_t difference_type;
@@ -378,7 +385,7 @@ namespace unicode {
using namespace detail;
- template<unicode::detail::iso_map_type& Map=iso_8859_1_map>
+ template<unicode::detail::iso_map_type& Map=iso_8859_1_map, typename Container=std::basic_string<utf8_t>>
struct iso_iterator {
typedef utf8_t input_type;
typedef char32_t value_type;
@@ -386,7 +393,8 @@ namespace unicode {
typedef char32_t* pointer;
typedef size_t difference_type;
typedef std::input_iterator_tag iterator_category;
- typedef std::basic_string<utf8_t>::const_iterator iterator;
+ typedef typename Container::const_iterator iterator;
+ typedef Container string_type;
iso_iterator(const iterator& it): m_it(it) {}
@@ -420,14 +428,14 @@ namespace unicode {
iterator m_it;
};
- template<unicode::detail::iso_map_type_reverse& Map=iso_8859_1_map_reverse>
+ template<unicode::detail::iso_map_type_reverse& Map=iso_8859_1_map_reverse, typename Container=std::basic_string<utf8_t>>
struct iso_back_insert_iterator {
typedef iso_back_insert_iterator& reference;
typedef iso_back_insert_iterator* pointer;
typedef size_t difference_type;
typedef utf8_t value_type;
typedef std::output_iterator_tag iterator_category;
- typedef std::basic_string<utf8_t> string_type;
+ typedef Container string_type;
iso_back_insert_iterator(string_type& s): s(s) {}
@@ -478,18 +486,19 @@ namespace unicode {
struct ISO_8859
{
typedef utf8_t value_type;
+ typedef typename InputIt::string_type string_type;
- static InputIt begin(const std::basic_string<value_type>& s)
+ static InputIt begin(const typename InputIt::string_type& s)
{
return InputIt(s.cbegin());
}
- static InputIt end(const std::basic_string<value_type>& s)
+ static InputIt end(const typename InputIt::string_type& s)
{
return InputIt(s.cend());
}
- static OutputIt back_inserter(std::basic_string<value_type>& s)
+ static OutputIt back_inserter(typename OutputIt::string_type& s)
{
return OutputIt(s);
}
@@ -499,20 +508,20 @@ namespace unicode {
template<typename InputIt, typename OutputIt>
struct UTF
{
- typedef typename InputIt::input_type input_type;
typedef typename OutputIt::value_type value_type;
+ typedef typename InputIt::string_type string_type;
- static InputIt begin(const std::basic_string<input_type>& s)
+ static InputIt begin(const typename InputIt::string_type& s)
{
return InputIt{s.cbegin(), s.cend()};
}
- static InputIt end(const std::basic_string<input_type>& s)
+ static InputIt end(const typename InputIt::string_type& s)
{
return InputIt{s.cend(), s.cend()};
}
- static OutputIt back_inserter(std::basic_string<value_type>& s)
+ static OutputIt back_inserter(typename OutputIt::string_type& s)
{
return OutputIt(s);
}
@@ -527,10 +536,10 @@ namespace unicode {
typedef UTF<utf_iterator<char32_t>, utf_back_insert_iterator<char32_t>> UTF_32;
// From and To are facets
- template<typename From, typename To, std::enable_if_t<std::is_empty<From>::value && std::is_empty<To>::value, bool> = true>
- std::basic_string<typename To::value_type> convert(const std::basic_string<typename From::value_type>& s)
+ template<typename From, typename To, std::enable_if_t<std::is_empty<From>::value, bool> = true>
+ typename To::string_type convert(const typename From::string_type& s)
{
- std::basic_string<typename To::value_type> result;
+ typename To::string_type result;
std::copy(From::begin(s), From::end(s), To::back_inserter(result));
@@ -561,27 +570,29 @@ namespace unicode {
typedef UTF_32 Facet;
};
- // From and To are from: utf8_t, char16_t and char32_t
+ // From and To are basic code unit types: utf8_t (i.e. char or char8_t (C++20)), char16_t, char32_t, char, wchar_t, uint8_t, uint16_t, uint32_t
template<typename From, typename To,
- std::enable_if_t<std::is_trivial<From>::value && std::is_trivial<To>::value, bool> = true
- >
- std::basic_string<To> convert(const std::basic_string<From>& s)
+ typename FromContainer=std::basic_string<From>,
+ typename ToContainer=std::basic_string<To>,
+ std::enable_if_t<std::is_trivial<From>::value && std::is_scalar<From>::value && !std::is_empty<From>::value, bool> = true>
+ ToContainer convert(const FromContainer& s)
{
typedef UTF<utf_iterator<From>, utf_back_insert_iterator<To>> UTF_Trait;
- std::basic_string<To> result;
+ ToContainer result;
std::copy(UTF_Trait::begin(s), UTF_Trait::end(s), UTF_Trait::back_inserter(result));
return result;
}
+ // From and To are containers
template<typename FromContainer, typename ToContainer,
std::enable_if_t<!std::is_empty<FromContainer>::value && !std::is_empty<ToContainer>::value, bool> = true
>
ToContainer convert(const FromContainer& s)
{
- typedef UTF<utf_iterator<typename FromContainer::value_type>, utf_back_insert_iterator<typename ToContainer::value_type>> UTF_Trait;
+ typedef UTF<utf_iterator<typename FromContainer::value_type, FromContainer>, utf_back_insert_iterator<typename ToContainer::value_type, ToContainer>> UTF_Trait;
ToContainer result;
@@ -590,9 +601,25 @@ namespace unicode {
return result;
}
+ // Container version
+ template<typename Container, std::enable_if_t<!std::is_empty<Container>::value, bool> = true>
+ bool is_valid_utf(const Container& s)
+ {
+ typedef UTF<utf_iterator<typename Container::value_type, Container>, utf_back_insert_iterator<typename Container::value_type, Container>> UTF_Trait;
+
+ try {
+ std::for_each(UTF_Trait::begin(s), UTF_Trait::end(s), [](const char32_t& c){});
+ } catch (const std::invalid_argument&) {
+ return false;
+ }
+ return true;
+ }
+
// basic type version
- template<typename T>
- bool is_valid_utf(const std::basic_string<T>& s)
+ template<typename T,
+ typename Container=std::basic_string<T>,
+ std::enable_if_t<std::is_trivial<T>::value && !std::is_empty<T>::value, bool> = true>
+ bool is_valid_utf(const Container& s)
{
typedef UTF<utf_iterator<T>, utf_back_insert_iterator<T>> UTF_Trait;
@@ -605,8 +632,8 @@ namespace unicode {
}
// Facet version
- template<typename Facet>
- bool is_valid_utf(const std::basic_string<typename Facet::value_type>& s)
+ template<typename Facet, std::enable_if_t<std::is_empty<Facet>::value, bool> = true>
+ bool is_valid_utf(const typename Facet::string_type& s)
{
try {
std::for_each(Facet::begin(s), Facet::end(s), [](const char32_t& c){});
diff --git a/src/test-unicode.cpp b/src/test-unicode.cpp
index 5f5ebbf..fbd4749 100644
--- a/src/test-unicode.cpp
+++ b/src/test-unicode.cpp
@@ -5,9 +5,12 @@
#include <boost/test/data/monomorphic.hpp>
#include <boost/test/data/test_case.hpp>
+#include <array>
#include <chrono>
+#include <deque>
#include <exception>
#include <limits>
+#include <list>
#include <random>
#include <string>
#include <tuple>
@@ -98,14 +101,14 @@ void test_utf_to_utf(std::tuple<Ts...>& t)
// test base type interface
To result { unicode::convert<typename From::value_type, typename To::value_type>(std::get<i>(t)) };
+ BOOST_CHECK_MESSAGE(std::get<j>(t) == result, "Base: From " << typeid(typename From::value_type).name() << "(" << i << ", " << std::get<i>(t) << ") to " << typeid(typename To::value_type).name() << "(" << j << ", " << std::get<j>(t) << "), got " << result);
- BOOST_CHECK_MESSAGE(std::get<j>(t) == result, "Base: From " << typeid(From).name() << "(" << i << ", " << std::get<i>(t) << ") to " << typeid(To).name() << "(" << j << ", " << std::get<j>(t) << "), got " << result);
+ // test container interface
+ result = unicode::convert<From, To>(std::get<i>(t));
+ BOOST_CHECK_MESSAGE(std::get<j>(t) == result, "Container: From " << typeid(From).name() << "(" << i << ", " << std::get<i>(t) << ") to " << typeid(To).name() << "(" << j << ", " << std::get<j>(t) << "), got " << result);
- //std::cout << std::to_string(std::tuple_size<typename std::remove_reference<decltype(t)>::type>::value) << "," << std::to_string(i) << "," << std::to_string(j) << std::endl;
-
// test facet interface
result = unicode::convert<typename unicode::Encoding<typename From::value_type>::Facet, typename unicode::Encoding<typename To::value_type>::Facet>(std::get<i>(t));
-
BOOST_CHECK_MESSAGE(std::get<j>(t) == result, "Facet: From " << typeid(From).name() << "(" << i << ", " << std::get<i>(t) << ") to " << typeid(To).name() << "(" << j << ", " << std::get<j>(t) << "), got " << result);
// iterate over other combinations
@@ -132,6 +135,10 @@ void test_is_valid_utf(std::tuple<Ts...>& t)
// test via basic type
bool result { unicode::is_valid_utf<typename T::value_type>(std::get<i>(t)) };
+ BOOST_CHECK_MESSAGE(result == true, "is_valid_utf w/ " << typeid(typename T::value_type).name() << "(" << i << ", " << std::get<i>(t) << "), got " << result);
+
+ // test via container type
+ result = unicode::is_valid_utf<T>(std::get<i>(t));
BOOST_CHECK_MESSAGE(result == true, "is_valid_utf w/ " << typeid(T).name() << "(" << i << ", " << std::get<i>(t) << "), got " << result);
// test via Facet
@@ -158,7 +165,17 @@ void test_utf_to_utf_failure(std::basic_string<From>& s)
// via base type
try {
(void) unicode::convert<From,To>(s);
- BOOST_ERROR("Base: Expected exception at index: " << index << ", " << typeid(From).name() << " -> " << typeid(To).name());
+ BOOST_ERROR("Base type: Expected exception at index: " << index << ", " << typeid(From).name() << " -> " << typeid(To).name());
+ } catch (const std::invalid_argument&) {
+ // OK: this is an expected exception for convert() on bad input
+ } catch (const std::exception& ex) {
+ BOOST_ERROR("Unexpected error on convert(): " << ex.what());
+ };
+
+ // via container
+ try {
+ (void) unicode::convert<typename unicode::Encoding<From>::Facet::string_type, typename unicode::Encoding<To>::Facet::string_type>(s);
+ BOOST_ERROR("Container type: Expected exception at index: " << index << ", " << typeid(From).name() << " -> " << typeid(To).name());
} catch (const std::invalid_argument&) {
// OK: this is an expected exception for convert() on bad input
} catch (const std::exception& ex) {
@@ -198,6 +215,8 @@ void test_is_valid_utf_failure(std::basic_string<T>& s)
{
BOOST_CHECK_MESSAGE(unicode::is_valid_utf<T>(s) == false, "Expected bad UTF at index: " << index << ", " << typeid(T).name());
+ BOOST_CHECK_MESSAGE(unicode::is_valid_utf<typename std::basic_string<T>>(s) == false, "Expected bad UTF at index: " << index << ", " << typeid(T).name());
+
BOOST_CHECK_MESSAGE(unicode::is_valid_utf<typename unicode::Encoding<T>::Facet>(s) == false, "Expected bad UTF at index: " << index << ", " << typeid(typename unicode::Encoding<T>::Facet).name());
// iterate over remaining types
@@ -275,6 +294,21 @@ void test_random(random_context& rc, size_t length)
BOOST_ERROR("Unexpected error on convert(): " << ex.what());
}
+ // container type interface
+ try {
+ To result{unicode::convert<From, To>(r)};
+
+ if (r.empty()) {
+ BOOST_CHECK(result.empty());
+ } else {
+ BOOST_CHECK(!result.empty());
+ }
+ } catch (const std::invalid_argument&) {
+ // OK: this is an expected exception for convert() on bad input
+ } catch (const std::exception& ex) {
+ BOOST_ERROR("Unexpected error on convert(): " << ex.what());
+ }
+
// facet interface
try {
To result{unicode::convert<typename unicode::Encoding<typename From::value_type>::Facet,typename unicode::Encoding<typename To::value_type>::Facet>(r)};
@@ -331,7 +365,7 @@ BOOST_AUTO_TEST_CASE(convert)
BOOST_CHECK((unicode::convert<unicode::UTF_8,unicode::UTF_16>("abc")) == std::u16string{u"abc"});
BOOST_CHECK((unicode::convert<unicode::UTF_32,unicode::UTF_16>(U"abc")) == std::u16string{u"abc"});
-
+
BOOST_CHECK((unicode::convert<utf8_t,char16_t>("abc")) == std::u16string{u"abc"});
BOOST_CHECK((unicode::convert<char32_t,char16_t>(U"abc")) == std::u16string{u"abc"});
@@ -354,7 +388,37 @@ BOOST_AUTO_TEST_CASE(convert)
BOOST_CHECK((unicode::convert<std::string, std::wstring>(std::string{"äöü"})) == std::wstring{L"äöü"});
- //BOOST_CHECK((unicode::convert<std::vector<char>, std::vector<wchar_t>>(std::vector<char>{})) == std::vector<wchar_t>{});
+ BOOST_CHECK((unicode::convert<std::vector<char>, std::vector<wchar_t>>(std::vector<char>{})) == std::vector<wchar_t>{});
+ BOOST_CHECK((unicode::convert<std::vector<char>, std::vector<wchar_t>>(std::vector<char>{'\xc3', '\xa4', '\xc3', '\xb6', '\xc3', '\xbc'})) == (std::vector<wchar_t>{L'ä', L'ö', L'ü'}));
+
+ // deque
+ BOOST_CHECK((unicode::convert<std::deque<char>, std::deque<wchar_t>>(std::deque<char>{})) == std::deque<wchar_t>{});
+ BOOST_CHECK((unicode::convert<std::deque<char>, std::deque<wchar_t>>(std::deque<char>{'\xc3', '\xa4', '\xc3', '\xb6', '\xc3', '\xbc'})) == (std::deque<wchar_t>{L'ä', L'ö', L'ü'}));
+
+ // deque with uint8_t, uint16_t
+ BOOST_CHECK((unicode::convert<std::deque<uint8_t>, std::deque<uint16_t>>(std::deque<uint8_t>{})) == std::deque<uint16_t>{});
+ BOOST_CHECK((unicode::convert<std::deque<uint8_t>, std::deque<uint16_t>>(std::deque<uint8_t>{0xc3, 0xa4, 0xc3, 0xb6, 0xc3, 0xbc})) == (std::deque<uint16_t>{L'ä', L'ö', L'ü'}));
+
+ // deque with int8_t, int16_t
+ BOOST_CHECK((unicode::convert<std::deque<int8_t>, std::deque<int16_t>>(std::deque<int8_t>{
+ static_cast<int8_t>(0xc3),
+ static_cast<int8_t>(0xa4),
+ static_cast<int8_t>(0xc3),
+ static_cast<int8_t>(0xb6),
+ static_cast<int8_t>(0xc3),
+ static_cast<int8_t>(0xbc)})) == (std::deque<int16_t>{L'ä', L'ö', L'ü'}));
+
+ // list
+ BOOST_CHECK((unicode::convert<std::list<uint8_t>, std::list<uint16_t>>(std::list<uint8_t>{})) == std::list<uint16_t>{});
+ BOOST_CHECK((unicode::convert<std::list<uint8_t>, std::list<uint16_t>>(std::list<uint8_t>{0xc3, 0xa4, 0xc3, 0xb6, 0xc3, 0xbc})) == (std::list<uint16_t>{L'ä', L'ö', L'ü'}));
+
+ // list -> deque
+ BOOST_CHECK((unicode::convert<std::list<uint8_t>, std::deque<uint16_t>>(std::list<uint8_t>{})) == std::deque<uint16_t>{});
+ BOOST_CHECK((unicode::convert<std::list<uint8_t>, std::deque<uint16_t>>(std::list<uint8_t>{0xc3, 0xa4, 0xc3, 0xb6, 0xc3, 0xbc})) == (std::deque<uint16_t>{L'ä', L'ö', L'ü'}));
+
+ // array
+ BOOST_CHECK((unicode::convert<std::array<uint8_t, 0>, std::list<uint16_t>>(std::array<uint8_t, 0>{})) == std::list<uint16_t>{});
+ BOOST_CHECK((unicode::convert<std::array<uint8_t, 6>, std::list<uint16_t>>(std::array<uint8_t, 6>{0xc3, 0xa4, 0xc3, 0xb6, 0xc3, 0xbc})) == (std::list<uint16_t>{L'ä', L'ö', L'ü'}));
}
BOOST_AUTO_TEST_CASE(is_valid_utf)
@@ -376,8 +440,3 @@ BOOST_AUTO_TEST_CASE(string_u8string)
BOOST_CHECK(a == std::string{"\xc3\xa4"});
}
-
-// TODO:
-//
-// string, vector?
-// uint8_t, uint16_t, uint32_t?