summaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
author Roland Reichwein <mail@reichwein.it> 2021-12-23 13:27:34 +0100
committer Roland Reichwein <mail@reichwein.it> 2021-12-23 13:27:34 +0100
commit 721064dc293d8915fbb33d83bd983a40dcca180f (patch)
tree 3259680f1b4d46ef98030aae5b0b3cc89ae49396
parent cd7e832e2f47fa35d36794808582118cb34eca3f (diff)
Speed optimization
-rw-r--r-- include/unicode.h 73
-rw-r--r-- src/test-unicode.cpp 3
2 files changed, 74 insertions, 2 deletions
diff --git a/include/unicode.h b/include/unicode.h
index 43dc44e..5774db7 100644
--- a/include/unicode.h
+++ b/include/unicode.h
@@ -8,6 +8,7 @@
#pragma once
#include <algorithm>
+#include <cstdint>
#include <iterator>
#include <list>
#include <memory>
@@ -203,6 +204,17 @@ namespace unicode::detail {
return calculate_value();
}
+ utf_iterator& operator+=(size_t distance) // Advances the wrapped string_type::const_iterator by `distance` positions and returns *this.
+ { // NOTE(review): presumably the caller must make sure the target is a valid decoding position - confirm
+ std::advance(iterator, distance); // moves `iterator` only; `end_iterator` is neither read nor checked here
+ return *this;
+ }
+
+ size_t operator-(const utf_iterator& other) const // Distance between the two wrapped string iterators (this minus other), returned as size_t.
+ { // NOTE(review): a negative difference would wrap in the size_t conversion; callers presumably pass other <= *this - confirm
+ return iterator - other.iterator; // difference of the `iterator` members, converted to the size_t return type
+ }
+
private:
typename string_type::const_iterator iterator;
typename string_type::const_iterator end_iterator;
@@ -394,7 +406,7 @@ namespace unicode {
}
// return reference?
- value_type operator*()
+ value_type operator*() const
{
input_type value{*m_it};
@@ -407,6 +419,17 @@ namespace unicode {
return static_cast<value_type>(static_cast<uint8_t>(value));
}
+ iso_iterator& operator+=(size_t distance) // Advances the wrapped iterator `m_it` by `distance` positions and returns *this.
+ {
+ std::advance(m_it, distance); // no bounds check is performed here
+ return *this;
+ }
+
+ difference_type operator-(const iso_iterator& other) const // Distance between the wrapped iterators (this minus other), as difference_type.
+ {
+ return m_it - other.m_it; // difference of the `m_it` members
+ }
+
private:
iterator m_it;
};
@@ -518,13 +541,59 @@ namespace unicode {
typedef UTF<utf_iterator<char16_t>, utf_back_insert_iterator<char16_t>> UTF_16;
typedef UTF<utf_iterator<char32_t>, utf_back_insert_iterator<char32_t>> UTF_32;
+ // std::distance doesn't work here: it is based on the "output" distance of the iterators.
+ template<class Iterator> // Iterator must provide a binary operator- whose result converts to size_t
+ size_t input_distance(const Iterator& it1, const Iterator& it2) // Returns it2 - it1, i.e. the distance from it1 to it2.
+ {
+ return it2 - it1; // uses the iterator's own operator- rather than std::distance
+ }
+
// From and To are facets. Converts s into a To::string_type; 1-byte-to-1-byte conversions copy runs of 8 bytes < 0x80 as whole words.
template<typename From, typename To, std::enable_if_t<std::is_empty<From>::value, bool> = true>
typename To::string_type convert(const typename From::string_type& s)
{
typename To::string_type result;
- std::copy(From::begin(s), From::end(s), To::back_inserter(result));
+ if constexpr(sizeof(typename From::string_type::value_type) == 1 && // fast path: source elements are 1 byte,
+ sizeof(typename To::value_type) == 1 && // target elements are 1 byte,
+ sizeof(size_t) >= 8) { // and size_t is at least 8 bytes wide
+ auto begin{From::begin(s)};
+ auto end{From::end(s)};
+ auto back_inserter{To::back_inserter(result)};
+ auto addr{reinterpret_cast<const uint64_t*>(&s.data()[s.size() - input_distance(begin, end)])}; // address of the element at offset (s.size() - remaining input distance), viewed as a 64-bit word pointer
+ while (input_distance(begin, end) >= 8) { // loop while at least 8 input elements remain
+ if (((uintptr_t)(void*)addr & 7) == 0) { // word-wise loop is entered only when addr is 8-byte aligned
+ while (input_distance(begin, end) >= 8) {
+ uint64_t data{*addr}; // NOTE(review): reads 1-byte element storage through a uint64_t lvalue - possible strict-aliasing issue, std::memcpy would avoid it; confirm
+ if ((data & 0x8080808080808080ULL) == 0ULL) { // true when none of the 8 bytes has its high bit set
+ result.append(reinterpret_cast<const typename To::value_type*>(addr), 8); // append the 8 bytes to result unchanged
+ begin += 8; // skip the 8 elements just copied
+ ++addr; // next 64-bit word
+ } else {
+ // just advance one code unit for now
+ back_inserter = *begin; // hand one decoded value to the inserter - presumably this appends it to result; confirm
+ ++begin;
+ break; // leave the word-wise loop; addr is re-derived from begin in the loop below
+ }
+ }
+ }
+
+ // keep up after unaligned Non-ASCII code points
+ while (begin!= end && (uintptr_t)(void*)(addr = reinterpret_cast<const uint64_t*>(&s.data()[s.size() - input_distance(begin, end)])) & 7) { // recomputes addr from begin on every check; runs until addr is 8-byte aligned or input is exhausted
+ back_inserter = *begin;
+ ++begin;
+ }
+ }
+
+ // remainder < 8 bytes
+ while (begin != end) { // convert the remaining elements one value at a time
+ back_inserter = *begin;
+ ++begin;
+ }
+
+ } else { // generic path: element-wise copy through the facets' iterators, as before this change
+ std::copy(From::begin(s), From::end(s), To::back_inserter(result));
+ }
return result;
}
diff --git a/src/test-unicode.cpp b/src/test-unicode.cpp
index 59d55b9..d638cbb 100644
--- a/src/test-unicode.cpp
+++ b/src/test-unicode.cpp
@@ -566,6 +566,9 @@ BOOST_AUTO_TEST_CASE(convert)
// deque
BOOST_CHECK((unicode::convert<std::deque<char>, std::deque<wchar_t>>(std::deque<char>{})) == std::deque<wchar_t>{});
BOOST_CHECK((unicode::convert<std::deque<char>, std::deque<wchar_t>>(std::deque<char>{'\xc3', '\xa4', '\xc3', '\xb6', '\xc3', '\xbc'})) == (std::deque<wchar_t>{L'ä', L'ö', L'ü'}));
+ // yet unsupported:
+ //BOOST_CHECK((unicode::convert<utf8_t, char16_t>(std::deque<utf8_t>{u8'\xc3', u8'\xa4', u8'\xc3', u8'\xb6', u8'\xc3', u8'\xbc'})) == (std::deque<char16_t>{u'ä', u'ö', u'ü'}));
+ //BOOST_CHECK((unicode::convert<unicode::UTF_8, unicode::UTF_16>(std::deque<utf8_t>{u8'\xc3', u8'\xa4', u8'\xc3', u8'\xb6', u8'\xc3', u8'\xbc'})) == (std::deque<char16_t>{u'ä', u'ö', u'ü'}));
// deque with uint8_t, uint16_t
BOOST_CHECK((unicode::convert<std::deque<uint8_t>, std::deque<uint16_t>>(std::deque<uint8_t>{})) == std::deque<uint16_t>{});