summary | refs | log | tree | commit | diff | homepage
diff options
context:
space:
mode:
author Roland Reichwein <mail@reichwein.it> 2021-12-25 14:38:46 +0100
committer Roland Reichwein <mail@reichwein.it> 2021-12-25 14:38:46 +0100
commit 79dc9edc72c5b9fefb129fe36029d4781b1e969c (patch)
tree 9e5ff95ef84ab089c652935ae8f94758318b6dbc
parent 98f9132997353bb3e750e8e2db99ebd474a8dbb6 (diff)
Generalized type usage and optimizations
-rw-r--r-- Makefile 4
-rw-r--r-- include/unicode.h 155
-rw-r--r-- src/test-unicode.cpp 51
3 files changed, 138 insertions, 72 deletions
diff --git a/Makefile b/Makefile
index 6ed0e68..e037bc5 100644
--- a/Makefile
+++ b/Makefile
@@ -7,7 +7,7 @@ ONDEBIAN=no
endif
# On Ubuntu 2104 and 2110, dh_strip / debugedit is broken, therefore different Non-Debian options in the following
-DISTROS=base debian10 debian11 ubuntu2004 ubuntu2010 ubuntu2104 ubuntu2110 ubuntu2204
+DISTROS=base base-i386 debian10 debian11 ubuntu2004 ubuntu2010 ubuntu2104 ubuntu2110 ubuntu2204
ifeq ($(wildcard $(shell which clang++-13)),)
ifeq ($(wildcard $(shell which clang++-12)),)
@@ -124,7 +124,7 @@ deb-src:
$(DISTROS): deb-src
sudo pbuilder build --basetgz /var/cache/pbuilder/$@.tgz --buildresult result/$@ ../$(PROJECTNAME)_$(VERSION).dsc
- debsign result/$@/$(PROJECTNAME)_$(VERSION)_amd64.changes
+ -debsign result/$@/$(PROJECTNAME)_$(VERSION)_amd64.changes
debs: $(DISTROS)
diff --git a/include/unicode.h b/include/unicode.h
index 8dedb19..c2d727a 100644
--- a/include/unicode.h
+++ b/include/unicode.h
@@ -45,8 +45,8 @@ namespace unicode::detail {
{
static_assert(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4);
- typedef T input_type;
- typedef char32_t value_type;
+ typedef T value_type;
+ typedef char32_t internal_type;
typedef char32_t& reference;
typedef char32_t* pointer;
typedef size_t difference_type;
@@ -67,9 +67,9 @@ namespace unicode::detail {
}
template<size_t index>
- T get_code_unit() const noexcept
+ value_type get_code_unit() const noexcept
{
- if constexpr (std::is_same<Container, typename std::list<T>>::value) {
+ if constexpr (std::is_same<Container, typename std::list<value_type>>::value) {
// std::list doesn't support it + n
auto it{iterator};
std::advance(it, index);
@@ -79,46 +79,46 @@ namespace unicode::detail {
}
}
- inline static bool is_continuation_byte(T b) noexcept
+ inline static bool is_continuation_byte(value_type b) noexcept
{
return (b & 0b11000000) == 0b10000000;
}
template<typename... Targs>
- inline static bool is_continuation_byte(T b, Targs... Fargs) noexcept
+ inline static bool is_continuation_byte(value_type b, Targs... Fargs) noexcept
{
return is_continuation_byte(b) && is_continuation_byte(Fargs...);
}
template<size_t n>
- inline static bool is_byte0_of(T b) noexcept
+ inline static bool is_byte0_of(value_type b) noexcept
{
- return (b & static_cast<T>(0xFF << (7 - n))) == static_cast<T>(0xFF << (8 - n));
+ return (b & static_cast<value_type>(0xFF << (7 - n))) == static_cast<value_type>(0xFF << (8 - n));
}
- inline static char32_t continuation_value(T b) noexcept
+ inline static internal_type continuation_value(value_type b) noexcept
{
- return static_cast<char32_t>(b & 0b00111111);
+ return static_cast<internal_type>(b & 0b00111111);
}
template<typename... Targs>
- inline static char32_t continuation_value(T b, Targs... Fargs) noexcept
+ inline static internal_type continuation_value(value_type b, Targs... Fargs) noexcept
{
return continuation_value(b) << (6 * sizeof...(Targs)) | continuation_value(Fargs...);
}
template<size_t n>
- inline static char32_t value_byte0_of(T b) noexcept
+ inline static internal_type value_byte0_of(value_type b) noexcept
{
return static_cast<char32_t>(b & (0b1111111 >> n)) << ((n - 1) * 6);
}
- template<class X = T, typename std::enable_if<(sizeof(X) == 1), bool>::type = true>
- inline value_type calculate_value()
+ template<class X = value_type, typename std::enable_if<(sizeof(X) == 1), bool>::type = true>
+ inline internal_type calculate_value()
{
utf8_t byte0 {static_cast<utf8_t>(get_code_unit<0>())};
if (byte0 & 0x80) { // 2-4 bytes
- value_type value{};
+ internal_type value{};
if (size_t remaining{remaining_code_units()}; remaining >= 2) {
utf8_t byte1 {static_cast<utf8_t>(get_code_unit<1>())};
if (is_byte0_of<2>(byte0) && is_continuation_byte(byte1)) { // 2 bytes
@@ -154,8 +154,8 @@ namespace unicode::detail {
}
}
- template<class X = T, typename std::enable_if<(sizeof(X) == 2), bool>::type = true>
- inline value_type calculate_value()
+ template<class X = value_type, typename std::enable_if<(sizeof(X) == 2), bool>::type = true>
+ inline internal_type calculate_value()
{
char16_t unit0 {static_cast<char16_t>(get_code_unit<0>())};
@@ -175,10 +175,10 @@ namespace unicode::detail {
}
}
- template<class X = T, typename std::enable_if<(sizeof(X) == 4), bool>::type = true>
- inline value_type calculate_value()
+ template<class X = value_type, typename std::enable_if<(sizeof(X) == 4), bool>::type = true>
+ inline internal_type calculate_value()
{
- value_type result {static_cast<char32_t>(get_code_unit<0>())};
+ internal_type result {static_cast<internal_type>(get_code_unit<0>())};
if (!unicode::is_valid_unicode(result))
throw std::invalid_argument("Invalid Unicode character: "s + std::to_string(static_cast<uint32_t>(result)));
@@ -199,7 +199,7 @@ namespace unicode::detail {
return std::distance(iterator, end_iterator) != std::distance(other.iterator, other.end_iterator);
}
- value_type operator*()
+ internal_type operator*()
{
return calculate_value();
}
@@ -256,14 +256,14 @@ namespace unicode::detail {
// n is number of UTF-8 bytes in sequence
template<size_t n>
- inline static T byte0_of(char32_t value)
+ inline static value_type byte0_of(char32_t value)
{
return (value >> 6 * (n - 1)) | (0xFF << (8 - n));
}
// n is index of 6-bit groups, counting from bit 0
template<size_t n>
- inline static T trailing_byte(char32_t value)
+ inline static value_type trailing_byte(char32_t value)
{
return ((value >> n * 6) & 0b111111) | 0b10000000;
}
@@ -271,7 +271,7 @@ namespace unicode::detail {
// calculate UTF-8 sequence byte for m >= 2 bytes sequences (i.e. non-ASCII)
// assume value to be valid Unicode value for given byte position
template<size_t n, size_t m>
- inline static T byte_n_of_m(char32_t value)
+ inline static value_type byte_n_of_m(char32_t value)
{
if constexpr (n == 0)
return byte0_of<m>(value);
@@ -282,7 +282,7 @@ namespace unicode::detail {
template<typename Arg>
inline void append(Arg&& arg)
{
- if constexpr (std::is_same<Container, typename std::basic_string<T>>::value) {
+ if constexpr (std::is_same<Container, typename std::basic_string<value_type>>::value) {
s.append({arg});
} else {
s.emplace_back(arg);
@@ -292,7 +292,7 @@ namespace unicode::detail {
template<typename Arg, typename... Args>
inline void append(Arg&& arg, Args&&... args)
{
- if constexpr (std::is_same<Container, typename std::basic_string<T>>::value) {
+ if constexpr (std::is_same<Container, typename std::basic_string<value_type>>::value) {
s.append({arg, args...});
} else {
s.emplace_back(arg);
@@ -300,7 +300,7 @@ namespace unicode::detail {
}
}
- template<class X = T, typename std::enable_if<(sizeof(X) == 1), bool>::type = true>
+ template<class X = value_type, typename std::enable_if<(sizeof(X) == 1), bool>::type = true>
inline void append_utf(const char32_t& value)
{
if (value < 0x80) { // 1 byte
@@ -315,18 +315,18 @@ namespace unicode::detail {
throw std::runtime_error("Invalid internal Unicode value: "s + std::to_string(static_cast<uint32_t>(value)));
}
- template<class X = T, typename std::enable_if<(sizeof(X) == 2), bool>::type = true>
+ template<class X = value_type, typename std::enable_if<(sizeof(X) == 2), bool>::type = true>
inline void append_utf(const char32_t& value)
{
if (value <= 0xFFFF) { // expect value to be already valid Unicode values (checked in input iterator)
append(static_cast<value_type>(value));
} else {
char32_t value_reduced{value - 0x10000};
- append(static_cast<T>((value_reduced >> 10) + 0xD800), static_cast<T>((value_reduced & 0x3FF) + 0xDC00));
+ append(static_cast<value_type>((value_reduced >> 10) + 0xD800), static_cast<value_type>((value_reduced & 0x3FF) + 0xDC00));
}
}
- template<class X = T, typename std::enable_if<(sizeof(X) == 4), bool>::type = true>
+ template<class X = value_type, typename std::enable_if<(sizeof(X) == 4), bool>::type = true>
inline void append_utf(const char32_t& value)
{
// expect value to be already valid Unicode values (checked in input iterator)
@@ -382,8 +382,8 @@ namespace unicode {
template<unicode::detail::iso_map_type& Map=iso_8859_1_map, typename Container=std::basic_string<iso_t>>
struct iso_iterator {
- typedef iso_t input_type;
- typedef char32_t value_type;
+ typedef iso_t value_type;
+ typedef char32_t internal_type;
typedef char32_t& reference;
typedef char32_t* pointer;
typedef size_t difference_type;
@@ -406,9 +406,9 @@ namespace unicode {
}
// return reference?
- value_type operator*() const
+ internal_type operator*() const
{
- input_type value{*m_it};
+ value_type value{*m_it};
if constexpr(std::addressof(Map) != std::addressof(iso_8859_1_map)) // mapping of 128 <= x <= 255 needed
{
@@ -416,7 +416,7 @@ namespace unicode {
if (it != Map.end())
return it->second;
}
- return static_cast<value_type>(static_cast<uint8_t>(value));
+ return static_cast<internal_type>(static_cast<uint8_t>(value));
}
iso_iterator& operator+=(size_t distance)
@@ -554,28 +554,61 @@ namespace unicode {
template<> struct ConvertInputOptimizer<1>
{
static const uint32_t ascii_mask { 0x80808080 };
+ // 00112233
+ // 00112222
+ // 00111122
+ // 00111111
+ // 00001122
+ // 00001111
+ // 00000011
};
- template<int value_size>
- struct ConvertOutputOptimizer {};
+ template<> struct ConvertInputOptimizer<2>
+ {
+ static const uint32_t ascii_mask { 0xFF80FF80 };
+ };
+
+ template<> struct ConvertInputOptimizer<4>
+ {
+ static const uint32_t ascii_mask { 0xFFFFFF80 };
+ };
+
+ template<int AccuSize, class ConvertInputOptimizer>
+ struct ArchitectureOptimizer {};
- template<> struct ConvertOutputOptimizer<1>
+ template<class ConvertInputOptimizer>
+ struct ArchitectureOptimizer<4, ConvertInputOptimizer>
{
- template<typename input_value_type, class output_string_type, int code_units>
+ typedef ConvertInputOptimizer input_optimizer;
+ typedef uint32_t accu_type;
+ static const size_t accu_size {4};
+ static const accu_type addr_mask {accu_size - 1};
+ static const accu_type ascii_mask { (accu_type)input_optimizer::ascii_mask };
+ static const accu_type ascii_value { 0ULL };
+
+ template<typename input_value_type, class output_string_type>
inline static void append(const input_value_type* addr, output_string_type& s)
{
- s.append(reinterpret_cast<const typename output_string_type::value_type*>(addr), code_units);
+ if constexpr(sizeof(input_value_type) == sizeof(typename output_string_type::value_type)) {
+ s.append(reinterpret_cast<const typename output_string_type::value_type*>(addr), accu_size / sizeof(input_value_type));
+ } else if constexpr(sizeof(input_value_type) == 1) {
+ s.append({static_cast<typename output_string_type::value_type>(addr[0]),
+ static_cast<typename output_string_type::value_type>(addr[1]),
+ static_cast<typename output_string_type::value_type>(addr[2]),
+ static_cast<typename output_string_type::value_type>(addr[3])});
+ } else if constexpr(sizeof(input_value_type) == 2) {
+ s.append({static_cast<typename output_string_type::value_type>(addr[0]),
+ static_cast<typename output_string_type::value_type>(addr[1])});
+ } else if constexpr(sizeof(input_value_type) == 4) {
+ s.append({static_cast<typename output_string_type::value_type>(addr[0])});
+ }
}
};
-
- template<int AccuSize, class ConvertInputOptimizer, class ConvertOutputOptimizer>
- struct ArchitectureOptimizer {};
- template<class ConvertInputOptimizer, class ConvertOutputOptimizer>
- struct ArchitectureOptimizer<8, ConvertInputOptimizer, ConvertOutputOptimizer>
+ template<class ConvertInputOptimizer>
+ struct ArchitectureOptimizer<8, ConvertInputOptimizer>
{
typedef ConvertInputOptimizer input_optimizer;
- typedef ConvertOutputOptimizer output_optimizer;
typedef uint64_t accu_type;
static const size_t accu_size {8};
static const accu_type addr_mask {accu_size - 1};
@@ -585,7 +618,26 @@ namespace unicode {
template<typename input_value_type, class output_string_type>
inline static void append(const input_value_type* addr, output_string_type& s)
{
- output_optimizer::template append<input_value_type, output_string_type, accu_size>(addr, s);
+ if constexpr(sizeof(input_value_type) == sizeof(typename output_string_type::value_type)) {
+ s.append(reinterpret_cast<const typename output_string_type::value_type*>(addr), accu_size / sizeof(input_value_type));
+ } else if constexpr(sizeof(input_value_type) == 1) {
+ s.append({static_cast<typename output_string_type::value_type>(addr[0]),
+ static_cast<typename output_string_type::value_type>(addr[1]),
+ static_cast<typename output_string_type::value_type>(addr[2]),
+ static_cast<typename output_string_type::value_type>(addr[3]),
+ static_cast<typename output_string_type::value_type>(addr[4]),
+ static_cast<typename output_string_type::value_type>(addr[5]),
+ static_cast<typename output_string_type::value_type>(addr[6]),
+ static_cast<typename output_string_type::value_type>(addr[7])});
+ } else if constexpr(sizeof(input_value_type) == 2) {
+ s.append({static_cast<typename output_string_type::value_type>(addr[0]),
+ static_cast<typename output_string_type::value_type>(addr[1]),
+ static_cast<typename output_string_type::value_type>(addr[2]),
+ static_cast<typename output_string_type::value_type>(addr[3])});
+ } else if constexpr(sizeof(input_value_type) == 4) {
+ s.append({static_cast<typename output_string_type::value_type>(addr[0]),
+ static_cast<typename output_string_type::value_type>(addr[1])});
+ }
}
};
@@ -595,12 +647,9 @@ namespace unicode {
{
typename To::string_type result;
- if constexpr(sizeof(typename From::string_type::value_type) == 1 &&
- sizeof(typename To::value_type) == 1 &&
- sizeof(size_t) >= 8) {
+ if constexpr(sizeof(size_t) == 4 || sizeof(size_t) == 8) {
typedef ConvertInputOptimizer<sizeof(typename From::string_type::value_type)> input_optimizer;
- typedef ConvertOutputOptimizer<sizeof(typename To::value_type)> output_optimizer;
- typedef ArchitectureOptimizer<sizeof(size_t), input_optimizer, output_optimizer> arch_optimizer;
+ typedef ArchitectureOptimizer<sizeof(size_t), input_optimizer> arch_optimizer;
auto begin{From::begin(s)};
auto end{From::end(s)};
@@ -612,7 +661,7 @@ namespace unicode {
typename arch_optimizer::accu_type data{*addr};
if ((data & arch_optimizer::ascii_mask) == arch_optimizer::ascii_value) {
arch_optimizer::template append<typename From::string_type::value_type, typename To::string_type>(reinterpret_cast<const typename From::string_type::value_type*>(addr), result);
- begin += arch_optimizer::accu_size;
+ begin += arch_optimizer::accu_size / sizeof(typename From::string_type::value_type);
++addr;
} else {
// just advance one code unit for now
diff --git a/src/test-unicode.cpp b/src/test-unicode.cpp
index d638cbb..a30be70 100644
--- a/src/test-unicode.cpp
+++ b/src/test-unicode.cpp
@@ -1,11 +1,11 @@
#define BOOST_TEST_MODULE unicode_test
+#include <boost/locale.hpp>
#include <boost/test/included/unit_test.hpp>
#include <boost/test/data/dataset.hpp>
#include <boost/test/data/monomorphic.hpp>
#include <boost/test/data/test_case.hpp>
-
-#include <boost/locale.hpp>
+#include <boost/timer/timer.hpp>
#include <array>
#include <chrono>
@@ -24,6 +24,7 @@
#include <unicode.h>
using namespace std::chrono_literals;
+using namespace std::string_literals;
typedef std::tuple<std::basic_string<utf8_t>, std::basic_string<char16_t>, std::basic_string<char32_t>> types_collection_type;
@@ -442,6 +443,27 @@ namespace {
}
}
+class CPUTimer
+{
+public:
+ CPUTimer(const std::string& name = "Timer"): mName(name), mWallTime0(std::chrono::steady_clock::now())
+ {
+ }
+
+ ~CPUTimer()
+ {
+ auto elapsed_cpu{mCPUTimer.elapsed()};
+ std::cout << mName << ": " << std::chrono::duration<double>(std::chrono::steady_clock::now() - mWallTime0).count() <<
+ "s (" << (double(elapsed_cpu.user + elapsed_cpu.system) / 1000000000) <<
+ "s CPU)" << std::endl;
+ }
+
+private:
+ std::string mName;
+ std::chrono::time_point<std::chrono::steady_clock> mWallTime0;
+ boost::timer::cpu_timer mCPUTimer;
+};
+
template<typename From, typename ToTypesCollectionType, size_t index = 0>
void test_random_valid(random_context& rc, size_t length, const std::string& description)
{
@@ -466,33 +488,24 @@ void test_random_valid(random_context& rc, size_t length, const std::string& des
}
{
- auto t0{std::chrono::steady_clock::now()};
+ CPUTimer timer("Performance test for converting "s + std::to_string(list.size()) +
+ " "s + description +
+ " from UTF-"s + std::to_string(sizeof(typename From::value_type) * 8) +
+ " to UTF-"s + std::to_string(sizeof(typename To::value_type) * 8));
for (const auto& i: list)
To result{unicode::convert<typename unicode::Encoding<typename From::value_type>::Facet,typename unicode::Encoding<typename To::value_type>::Facet>(i)};
- std::cout << "Performance test for converting " << list.size() <<
- " " << description <<
- " from UTF-" << (sizeof(typename From::value_type) * 8) <<
- " to UTF-" << (sizeof(typename To::value_type) * 8) << ": " <<
- std::chrono::duration<double>(std::chrono::steady_clock::now() - t0).count() << "s" <<
- std::endl;
}
{
- auto t0{std::chrono::steady_clock::now()};
+ CPUTimer timer(" -> Compare to boost::locale::conv::utf_to_utf");
for (const auto& i: list)
To result{boost::locale::conv::utf_to_utf<typename To::value_type, typename From::value_type>(i)};
- std::cout << " -> Compare to boost::locale::conv::utf_to_utf: " <<
- std::chrono::duration<double>(std::chrono::steady_clock::now() - t0).count() << "s" <<
- std::endl;
}
{
- auto t0{std::chrono::steady_clock::now()};
+ CPUTimer timer(" -> Compare to std::wstring_convert");
for (const auto& i: list)
To result{std_convert<typename From::value_type, typename To::value_type>(i)};
- std::cout << " -> Compare to std::wstring_convert: " <<
- std::chrono::duration<double>(std::chrono::steady_clock::now() - t0).count() << "s" <<
- std::endl;
}
// iterate over remaining To types
@@ -615,6 +628,10 @@ BOOST_AUTO_TEST_CASE(string_u8string)
a = std::string{b.begin(), b.end()};
BOOST_CHECK(a == std::string{"\xc3\xa4"});
+
+ BOOST_CHECK(sizeof(size_t) == 4 || sizeof(size_t) == 8);
+
+ std::cout << "Detected CPU Accu size: " << (sizeof(size_t) * 8) << std::endl;
}
// check environment: demonstrate how boost convert u8->u8 throws exception on invalid input