summary refs log tree commit diff homepage
path: root/include/unicode.h
diff options
context:
space:
mode:
Diffstat (limited to 'include/unicode.h')
-rw-r--r-- include/unicode.h 117
1 files changed, 72 insertions, 45 deletions
diff --git a/include/unicode.h b/include/unicode.h
index 171496e..6d7ef16 100644
--- a/include/unicode.h
+++ b/include/unicode.h
@@ -37,7 +37,7 @@ namespace unicode::detail {
using namespace std::string_literals;
- template<typename T>
+ template<typename T, typename Container=std::basic_string<T>>
struct utf_iterator
{
static_assert(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4);
@@ -48,7 +48,7 @@ namespace unicode::detail {
typedef char32_t* pointer;
typedef size_t difference_type;
typedef std::input_iterator_tag iterator_category;
- typedef std::basic_string<T> string_type;
+ typedef Container string_type;
utf_iterator(const typename string_type::const_iterator& cbegin, const typename string_type::const_iterator& cend):
iterator(cbegin), end_iterator(cend)
@@ -56,18 +56,25 @@ namespace unicode::detail {
calculate_value();
}
- utf_iterator<T>(const utf_iterator<T>& other) = default;
- utf_iterator<T>& operator=(const utf_iterator<T>& other) = default;
+ utf_iterator(const utf_iterator& other) = default;
+ utf_iterator& operator=(const utf_iterator& other) = default;
- size_t remaining_code_units()
+ size_t remaining_code_units() const // number of code units from the current position up to end_iterator
{
- return end_iterator - iterator;
+ return std::distance(iterator, end_iterator); // std::distance instead of operator-, which only random-access iterators provide
}
template<size_t index>
- T get_code_unit()
+ T get_code_unit() const // returns the code unit located `index` positions after the current iterator position
{
- return *(iterator + index);
+ if constexpr (std::is_same<Container, typename std::list<T>>::value) {
+ // std::list iterators are bidirectional, so `iterator + index` does not compile; advance a copy instead
+ auto it{iterator};
+ std::advance(it, index);
+ return *it;
+ } else {
+ return *(iterator + index); // NOTE(review): assumes every other Container offers random-access iterators - confirm
+ }
}
inline static bool is_continuation_byte(T b)
@@ -111,20 +118,20 @@ namespace unicode::detail {
if (!remaining)
return;
- utf8_t byte0 {get_code_unit<0>()};
+ utf8_t byte0 {static_cast<utf8_t>(get_code_unit<0>())};
if (byte0 & 0x80) { // 2-4 bytes
if (remaining >= 2) {
- utf8_t byte1 {get_code_unit<1>()};
+ utf8_t byte1 {static_cast<utf8_t>(get_code_unit<1>())};
if (is_byte0_of<2>(byte0) && is_continuation_byte(byte1)) { // 2 bytes
value = value_byte0_of<2>(byte0) | continuation_value(byte1);
sequence_length = 2;
} else if (remaining >= 3) {
- utf8_t byte2 {get_code_unit<2>()};
+ utf8_t byte2 {static_cast<utf8_t>(get_code_unit<2>())};
if (is_byte0_of<3>(byte0) && is_continuation_byte(byte1, byte2)) { // 3 bytes
value = value_byte0_of<3>(byte0) | continuation_value(byte1, byte2);
sequence_length = 3;
} else if (remaining >= 4) {
- utf8_t byte3 {get_code_unit<3>()};
+ utf8_t byte3 {static_cast<utf8_t>(get_code_unit<3>())};
if (is_byte0_of<4>(byte0) && is_continuation_byte(byte1, byte2, byte3)) { // 4 bytes
value = value_byte0_of<4>(byte0) | continuation_value(byte1, byte2, byte3);
sequence_length = 4;
@@ -154,7 +161,7 @@ namespace unicode::detail {
if (!remaining)
return;
- char16_t unit0 {get_code_unit<0>()};
+ char16_t unit0 {static_cast<char16_t>(get_code_unit<0>())};
if (unit0 <= 0xD7FF || unit0 >= 0xE000) { // 1 unit (BMP Basic Multilingual Plane)
value = unit0;
@@ -163,7 +170,7 @@ namespace unicode::detail {
if (remaining < 2)
throw std::invalid_argument("Bad input: Continuation of first UTF-16 unit missing");
- char16_t unit1 {get_code_unit<1>()};
+ char16_t unit1 {static_cast<char16_t>(get_code_unit<1>())};
if ((unit0 & 0xFC00) != 0xD800 || (unit1 & 0xFC00) != 0xDC00)
throw std::invalid_argument("Bad input: 2 malformed UTF-16 surrogates");
@@ -179,7 +186,7 @@ namespace unicode::detail {
if (!remaining)
return;
- value = get_code_unit<0>();
+ value = static_cast<char32_t>(get_code_unit<0>());
if (!unicode::is_valid_unicode(value))
throw std::invalid_argument("Invalid Unicode character: "s + std::to_string(static_cast<uint32_t>(value)));
@@ -202,16 +209,16 @@ namespace unicode::detail {
}
// pre-increment
- utf_iterator<T>& operator++()
+ utf_iterator& operator++() // moves forward by sequence_length code units, then calls calculate_value() again
{
- iterator += sequence_length;
+ std::advance(iterator, sequence_length); // std::advance also works for iterators that lack operator+=
calculate_value();
return *this;
}
- bool operator!=(const utf_iterator<T>& other) const
+ bool operator!=(const utf_iterator& other) const // unequal when the two iterators have a different number of code units left before their own end_iterator
{
- return iterator != other.iterator;
+ return std::distance(iterator, end_iterator) != std::distance(other.iterator, other.end_iterator); // NOTE(review): std::distance is linear for non-random-access iterators (e.g. std::list) - confirm this cost per comparison is acceptable
}
reference operator*()
@@ -227,13 +234,13 @@ namespace unicode::detail {
size_t sequence_length{};
};
- template<typename T>
+ template<typename T, typename Container=std::basic_string<T>>
struct utf_back_insert_iterator
{
static_assert(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4);
typedef T value_type;
- typedef std::basic_string<T> string_type;
+ typedef Container string_type;
typedef utf_back_insert_iterator& reference;
typedef utf_back_insert_iterator* pointer;
typedef size_t difference_type;
@@ -378,7 +385,7 @@ namespace unicode {
using namespace detail;
- template<unicode::detail::iso_map_type& Map=iso_8859_1_map>
+ template<unicode::detail::iso_map_type& Map=iso_8859_1_map, typename Container=std::basic_string<utf8_t>>
struct iso_iterator {
typedef utf8_t input_type;
typedef char32_t value_type;
@@ -386,7 +393,8 @@ namespace unicode {
typedef char32_t* pointer;
typedef size_t difference_type;
typedef std::input_iterator_tag iterator_category;
- typedef std::basic_string<utf8_t>::const_iterator iterator;
+ typedef typename Container::const_iterator iterator;
+ typedef Container string_type;
iso_iterator(const iterator& it): m_it(it) {}
@@ -420,14 +428,14 @@ namespace unicode {
iterator m_it;
};
- template<unicode::detail::iso_map_type_reverse& Map=iso_8859_1_map_reverse>
+ template<unicode::detail::iso_map_type_reverse& Map=iso_8859_1_map_reverse, typename Container=std::basic_string<utf8_t>>
struct iso_back_insert_iterator {
typedef iso_back_insert_iterator& reference;
typedef iso_back_insert_iterator* pointer;
typedef size_t difference_type;
typedef utf8_t value_type;
typedef std::output_iterator_tag iterator_category;
- typedef std::basic_string<utf8_t> string_type;
+ typedef Container string_type;
iso_back_insert_iterator(string_type& s): s(s) {}
@@ -478,18 +486,19 @@ namespace unicode {
struct ISO_8859
{
typedef utf8_t value_type;
+ typedef typename InputIt::string_type string_type;
- static InputIt begin(const std::basic_string<value_type>& s)
+ static InputIt begin(const typename InputIt::string_type& s) // InputIt built from s.cbegin(); the accepted container type now comes from InputIt
{
return InputIt(s.cbegin());
}
- static InputIt end(const std::basic_string<value_type>& s)
+ static InputIt end(const typename InputIt::string_type& s) // InputIt built from s.cend(); the accepted container type now comes from InputIt
{
return InputIt(s.cend());
}
- static OutputIt back_inserter(std::basic_string<value_type>& s)
+ static OutputIt back_inserter(typename OutputIt::string_type& s) // OutputIt constructed from s; the accepted container type now comes from OutputIt
{
return OutputIt(s);
}
@@ -499,20 +508,20 @@ namespace unicode {
template<typename InputIt, typename OutputIt>
struct UTF
{
- typedef typename InputIt::input_type input_type;
typedef typename OutputIt::value_type value_type;
+ typedef typename InputIt::string_type string_type;
- static InputIt begin(const std::basic_string<input_type>& s)
+ static InputIt begin(const typename InputIt::string_type& s) // InputIt spanning s.cbegin() to s.cend(); the accepted container type now comes from InputIt
{
return InputIt{s.cbegin(), s.cend()};
}
- static InputIt end(const std::basic_string<input_type>& s)
+ static InputIt end(const typename InputIt::string_type& s) // InputIt whose position and end are both s.cend(); the accepted container type now comes from InputIt
{
return InputIt{s.cend(), s.cend()};
}
- static OutputIt back_inserter(std::basic_string<value_type>& s)
+ static OutputIt back_inserter(typename OutputIt::string_type& s) // OutputIt constructed from s; the accepted container type now comes from OutputIt
{
return OutputIt(s);
}
@@ -527,10 +536,10 @@ namespace unicode {
typedef UTF<utf_iterator<char32_t>, utf_back_insert_iterator<char32_t>> UTF_32;
// From and To are facets
- template<typename From, typename To, std::enable_if_t<std::is_empty<From>::value && std::is_empty<To>::value, bool> = true>
- std::basic_string<typename To::value_type> convert(const std::basic_string<typename From::value_type>& s)
+ template<typename From, typename To, std::enable_if_t<std::is_empty<From>::value, bool> = true>
+ typename To::string_type convert(const typename From::string_type& s)
{
- std::basic_string<typename To::value_type> result;
+ typename To::string_type result;
std::copy(From::begin(s), From::end(s), To::back_inserter(result));
@@ -561,27 +570,29 @@ namespace unicode {
typedef UTF_32 Facet;
};
- // From and To are from: utf8_t, char16_t and char32_t
+ // From and To are code unit types: utf8_t (i.e. char, or char8_t in C++20), char16_t, char32_t, char, wchar_t, uint8_t, uint16_t or uint32_t
template<typename From, typename To,
- std::enable_if_t<std::is_trivial<From>::value && std::is_trivial<To>::value, bool> = true
- >
- std::basic_string<To> convert(const std::basic_string<From>& s)
+ typename FromContainer=std::basic_string<From>, // container type of the input; defaults to a string of From
+ typename ToContainer=std::basic_string<To>, // container type of the result; defaults to a string of To
+ std::enable_if_t<std::is_trivial<From>::value && std::is_scalar<From>::value && !std::is_empty<From>::value, bool> = true> // NOTE(review): only From is constrained here, To is no longer checked - confirm that is intended
+ ToContainer convert(const FromContainer& s) // copies s through UTF_Trait's iterators into a new ToContainer and returns it
{
typedef UTF<utf_iterator<From>, utf_back_insert_iterator<To>> UTF_Trait;
- std::basic_string<To> result;
+ ToContainer result; // NOTE(review): UTF_Trait above names the default containers, so a non-default FromContainer/ToContainer may not match its begin/end/back_inserter parameters - confirm
std::copy(UTF_Trait::begin(s), UTF_Trait::end(s), UTF_Trait::back_inserter(result));
return result;
}
+ // From and To are containers
template<typename FromContainer, typename ToContainer,
std::enable_if_t<!std::is_empty<FromContainer>::value && !std::is_empty<ToContainer>::value, bool> = true
>
ToContainer convert(const FromContainer& s)
{
- typedef UTF<utf_iterator<typename FromContainer::value_type>, utf_back_insert_iterator<typename ToContainer::value_type>> UTF_Trait;
+ typedef UTF<utf_iterator<typename FromContainer::value_type, FromContainer>, utf_back_insert_iterator<typename ToContainer::value_type, ToContainer>> UTF_Trait;
ToContainer result;
@@ -590,9 +601,25 @@ namespace unicode {
return result;
}
+ // Container version: returns true if iterating over s completes, false if std::invalid_argument is thrown on the way
+ template<typename Container, std::enable_if_t<!std::is_empty<Container>::value, bool> = true>
+ bool is_valid_utf(const Container& s)
+ {
+ typedef UTF<utf_iterator<typename Container::value_type, Container>, utf_back_insert_iterator<typename Container::value_type, Container>> UTF_Trait;
+
+ try {
+ std::for_each(UTF_Trait::begin(s), UTF_Trait::end(s), [](const char32_t& c){}); // walk the whole input; the produced values are discarded
+ } catch (const std::invalid_argument&) {
+ return false; // an std::invalid_argument was thrown while iterating
+ }
+ return true;
+ }
+
// basic type version
- template<typename T>
- bool is_valid_utf(const std::basic_string<T>& s)
+ template<typename T,
+ typename Container=std::basic_string<T>,
+ std::enable_if_t<std::is_trivial<T>::value && !std::is_empty<T>::value, bool> = true>
+ bool is_valid_utf(const Container& s)
{
typedef UTF<utf_iterator<T>, utf_back_insert_iterator<T>> UTF_Trait;
@@ -605,8 +632,8 @@ namespace unicode {
}
// Facet version
- template<typename Facet>
- bool is_valid_utf(const std::basic_string<typename Facet::value_type>& s)
+ template<typename Facet, std::enable_if_t<std::is_empty<Facet>::value, bool> = true>
+ bool is_valid_utf(const typename Facet::string_type& s)
{
try {
std::for_each(Facet::begin(s), Facet::end(s), [](const char32_t& c){});