summary | refs | log | tree | commit | diff | homepage
path: root/src/test-unicode.cpp
diff options
context:
space:
mode:
author Roland Reichwein <mail@reichwein.it> 2021-12-21 15:36:48 +0100
committer Roland Reichwein <mail@reichwein.it> 2021-12-21 15:36:48 +0100
commit 3ca9f389084a2defe1fff2046dd3450e0b242e58 (patch)
tree c6e8ad716db3d1cbadf33c421425803a2e89cd1b /src/test-unicode.cpp
parent f3025691d12727bbab138c13680cc21a451626b6 (diff)
Added comparison tests with boost::locale::conv and std::wstring_convert
Diffstat (limited to 'src/test-unicode.cpp')
-rw-r--r-- src/test-unicode.cpp 205
1 files changed, 115 insertions, 90 deletions
diff --git a/src/test-unicode.cpp b/src/test-unicode.cpp
index d00a33d..c325f6c 100644
--- a/src/test-unicode.cpp
+++ b/src/test-unicode.cpp
@@ -5,12 +5,16 @@
#include <boost/test/data/monomorphic.hpp>
#include <boost/test/data/test_case.hpp>
+#include <boost/locale.hpp>
+
#include <array>
#include <chrono>
+#include <codecvt>
#include <deque>
#include <exception>
#include <limits>
#include <list>
+#include <locale>
#include <random>
#include <string>
#include <tuple>
@@ -258,10 +262,11 @@ BOOST_AUTO_TEST_CASE(is_valid_unicode)
}
+// Bundles the random number engine and the distributions shared by the
+// randomized conversion tests.
+//
+// max_value is the inclusive upper bound of code_point_distribution (the
+// lower bound is always 0). NOTE(review): the default 0x10FFFF - 0x800
+// presumably leaves room for skipping the 0x800 surrogate code points when
+// the drawn value is mapped to a code point - confirm in
+// generate_random_string(). Negative values are not checked.
struct random_context {
+ // explicit: an integer must not silently convert into a whole random_context
+ explicit random_context(int max_value = 0x10FFFF - 0x800): code_point_distribution(0, max_value) {}
std::random_device rd; // OS random number engine to seed RNG (below)
std::mt19937 gen{rd()};
std::uniform_int_distribution<size_t> sequence_length{0, 100000}; // length of sequence: 0 ... 100000 code units
- std::uniform_int_distribution<unsigned long> code_point_distribution{0, 0x10FFFF - 0x800};
+ std::uniform_int_distribution<unsigned long> code_point_distribution; // range 0 ... max_value, set by the constructor
};
// generates valid and invalid strings of different type
@@ -293,7 +298,7 @@ std::u32string generate_random_string(random_context& rc, size_t length)
}
template<typename From, typename ToTypesCollectionType, size_t i = 0>
-void test_random(random_context& rc, size_t length)
+void test_random_invalid(random_context& rc, size_t length)
{
//std::cerr << "LENGTH: " << length << std::endl;
typedef typename std::tuple_element<i,ToTypesCollectionType>::type To;
@@ -347,7 +352,7 @@ void test_random(random_context& rc, size_t length)
// iterate over remaining To types
if constexpr (i + 1 < std::tuple_size<ToTypesCollectionType>::value)
- test_random<From, ToTypesCollectionType, i + 1>(rc, length);
+ test_random_invalid<From, ToTypesCollectionType, i + 1>(rc, length);
}
BOOST_AUTO_TEST_CASE_TEMPLATE(random_sequences_invalid, T, types_collection_type)
@@ -355,134 +360,154 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(random_sequences_invalid, T, types_collection_type
random_context rc;
for (int i = 0; i < 10; i++) {
- test_random<T,types_collection_type>(rc, rc.sequence_length(rc.gen));
+ test_random_invalid<T,types_collection_type>(rc, rc.sequence_length(rc.gen));
}
}
-BOOST_AUTO_TEST_CASE(random_sequences_valid)
+// Adapter that makes a locale-bound facet usable with
+// std::wstring_convert / std::wbuffer_convert: it derives from the facet,
+// forwards every constructor argument to it and exposes a public destructor.
+template<typename Facet>
+struct deletable_facet : Facet
{
- random_context rc;
+ template<typename... CtorArgs>
+ deletable_facet(CtorArgs&&... ctor_args) : Facet(std::forward<CtorArgs>(ctor_args)...) {}
+ ~deletable_facet() = default;
+};
- // Fill UTF-32 data list
- std::vector<std::u32string> u32list;
- std::generate_n(std::back_inserter(u32list), 1000, [&](){return generate_random_string(rc, rc.sequence_length(rc.gen));});
+namespace {
+ // Baseline converters built on std::wstring_convert; they are shared by
+ // all std_convert specializations in this namespace.
+ // char8_t instead of char doesn't work w/ clang++-13 + C++20 (yet?)
+ std::wstring_convert<deletable_facet<std::codecvt<char16_t, char, std::mbstate_t>>, char16_t> conv16;
+ std::wstring_convert<deletable_facet<std::codecvt<char32_t, char, std::mbstate_t>>, char32_t> conv32;
+
+ // Converts s from code unit type From to code unit type To using the
+ // converters above. Only the explicit specializations are defined.
+ template<typename From, typename To>
+ std::basic_string<To> std_convert(const std::basic_string<From>& s);
- // Fill UTF-16 data list
- std::vector<std::u16string> u16list;
- std::transform(u32list.begin(), u32list.end(), std::back_inserter(u16list), [](const std::u32string& s){return unicode::convert<unicode::UTF_32, unicode::UTF_16>(s);});
-
- // Fill UTF-8 data list
- std::vector<std::basic_string<utf8_t>> u8list;
- std::transform(u32list.begin(), u32list.end(), std::back_inserter(u8list), [](const std::u32string& s){return unicode::convert<unicode::UTF_32, unicode::UTF_8>(s);});
-
- for (const auto& i : u32list) {
- std::u32string s32{unicode::convert<unicode::UTF_32, unicode::UTF_32>(i)};
- BOOST_CHECK(s32.size() == i.size());
- std::u16string s16{unicode::convert<unicode::UTF_32, unicode::UTF_16>(i)};
- BOOST_CHECK(s16.size() >= i.size());
- std::basic_string<utf8_t> s8{unicode::convert<unicode::UTF_32, unicode::UTF_8>(i)};
- BOOST_CHECK(s8.size() >= i.size());
+ // utf8_t -> utf8_t: nothing to convert, hand back a copy
+ template<>
+ std::basic_string<utf8_t> std_convert<utf8_t, utf8_t>(const std::basic_string<utf8_t>& s)
+ {
+ return s;
}
- for (const auto& i : u16list) {
- std::u32string s32{unicode::convert<unicode::UTF_16, unicode::UTF_32>(i)};
- BOOST_CHECK(s32.size() > 0 || i.size() == 0);
- std::u16string s16{unicode::convert<unicode::UTF_16, unicode::UTF_16>(i)};
- BOOST_CHECK(s16.size() == i.size());
- std::basic_string<utf8_t> s8{unicode::convert<unicode::UTF_16, unicode::UTF_8>(i)};
- BOOST_CHECK(s8.size() >= i.size());
+ // utf8_t -> char16_t: repack the code units as std::string, then decode via conv16
+ template<>
+ std::basic_string<char16_t> std_convert<utf8_t, char16_t>(const std::basic_string<utf8_t>& s)
+ {
+ const std::string bytes(s.begin(), s.end());
+ return conv16.from_bytes(bytes);
}
- for (const auto& i : u8list) {
- std::u32string s32{unicode::convert<unicode::UTF_8, unicode::UTF_32>(i)};
- BOOST_CHECK(s32.size() > 0 || i.size() == 0);
- std::u16string s16{unicode::convert<unicode::UTF_8, unicode::UTF_16>(i)};
- BOOST_CHECK(s16.size() > 0 || i.size() == 0);
- std::basic_string<utf8_t> s8{unicode::convert<unicode::UTF_8, unicode::UTF_8>(i)};
- BOOST_CHECK(s8.size() == i.size());
+ // utf8_t -> char32_t: repack the code units as std::string, then decode via conv32
+ template<>
+ std::basic_string<char32_t> std_convert<utf8_t, char32_t>(const std::basic_string<utf8_t>& s)
+ {
+ const std::string bytes(s.begin(), s.end());
+ return conv32.from_bytes(bytes);
}
+ // char16_t -> utf8_t: encode via conv16, then repack the bytes as utf8_t
+ template<>
+ std::basic_string<utf8_t> std_convert<char16_t, utf8_t>(const std::basic_string<char16_t>& s)
{
- // Performance test UTF-32 -> UTF-32
- auto t0{std::chrono::steady_clock::now()};
- for (const auto& i : u32list) {
- std::u32string s{unicode::convert<unicode::UTF_32, unicode::UTF_32>(i)};
- }
- std::cout << "Performance test for converting 1M strings from UTF-32 to UTF-32: " << std::chrono::duration<double>(std::chrono::steady_clock::now() - t0).count() << std::endl;
+ const std::string bytes = conv16.to_bytes(s);
+ return std::basic_string<utf8_t>(bytes.begin(), bytes.end());
}
+ // char16_t -> char16_t: nothing to convert, hand back a copy
+ template<>
+ std::basic_string<char16_t> std_convert<char16_t, char16_t>(const std::basic_string<char16_t>& s)
{
- // Performance test UTF-32 -> UTF-16
- auto t0{std::chrono::steady_clock::now()};
- for (const auto& i : u32list) {
- std::u16string s{unicode::convert<unicode::UTF_32, unicode::UTF_16>(i)};
- }
- std::cout << "Performance test for converting 1M strings from UTF-32 to UTF-16: " << std::chrono::duration<double>(std::chrono::steady_clock::now() - t0).count() << std::endl;
+ return s;
}
+ // char16_t -> char32_t: two steps through an intermediate byte string
+ template<>
+ std::basic_string<char32_t> std_convert<char16_t, char32_t>(const std::basic_string<char16_t>& s)
{
- // Performance test UTF-32 -> UTF-8
- auto t0{std::chrono::steady_clock::now()};
- for (const auto& i : u32list) {
- std::basic_string<utf8_t> s{unicode::convert<unicode::UTF_32, unicode::UTF_8>(i)};
- }
- std::cout << "Performance test for converting 1M strings from UTF-32 to UTF-8: " << std::chrono::duration<double>(std::chrono::steady_clock::now() - t0).count() << std::endl;
+ const std::string bytes = conv16.to_bytes(s);
+ return conv32.from_bytes(bytes);
}
+ // char32_t -> utf8_t: encode via conv32, then repack the bytes as utf8_t
+ template<>
+ std::basic_string<utf8_t> std_convert<char32_t, utf8_t>(const std::basic_string<char32_t>& s)
{
- // Performance test UTF-16 -> UTF-32
- auto t0{std::chrono::steady_clock::now()};
- for (const auto& i : u16list) {
- std::u32string s{unicode::convert<unicode::UTF_16, unicode::UTF_32>(i)};
- }
- std::cout << "Performance test for converting 1M strings from UTF-16 to UTF-32: " << std::chrono::duration<double>(std::chrono::steady_clock::now() - t0).count() << std::endl;
+ const std::string bytes = conv32.to_bytes(s);
+ return std::basic_string<utf8_t>(bytes.begin(), bytes.end());
}
+ // char32_t -> char16_t: two steps through an intermediate byte string
+ template<>
+ std::basic_string<char16_t> std_convert<char32_t, char16_t>(const std::basic_string<char32_t>& s)
{
- // Performance test UTF-16 -> UTF-16
- auto t0{std::chrono::steady_clock::now()};
- for (const auto& i : u16list) {
- std::u16string s{unicode::convert<unicode::UTF_16, unicode::UTF_16>(i)};
- }
- std::cout << "Performance test for converting 1M strings from UTF-16 to UTF-16: " << std::chrono::duration<double>(std::chrono::steady_clock::now() - t0).count() << std::endl;
+ const std::string bytes = conv32.to_bytes(s);
+ return conv16.from_bytes(bytes);
}
+ // char32_t -> char32_t: nothing to convert, hand back a copy
+ template<>
+ std::basic_string<char32_t> std_convert<char32_t, char32_t>(const std::basic_string<char32_t>& s)
{
- // Performance test UTF-16 -> UTF-8
- auto t0{std::chrono::steady_clock::now()};
- for (const auto& i : u16list) {
- std::basic_string<utf8_t> s{unicode::convert<unicode::UTF_16, unicode::UTF_8>(i)};
- }
- std::cout << "Performance test for converting 1M strings from UTF-16 to UTF-8: " << std::chrono::duration<double>(std::chrono::steady_clock::now() - t0).count() << std::endl;
+ return s;
}
+}
+// Conversion check and timing comparison on randomly generated input.
+//
+// Builds 1000 random UTF-32 strings with generate_random_string(), converts
+// them to the From string type and, for the To type at position `index` of
+// ToTypesCollectionType, checks that neither the From string nor the
+// converted To string is shorter (in code units) than its UTF-32 source.
+// Afterwards the From -> To conversion of the whole list is timed three
+// times (unicode::convert, boost::locale::conv::utf_to_utf, std_convert) and
+// the durations are written to std::cout. Finally the function recurses for
+// the remaining To types; each recursion step generates a fresh data list.
+//
+// rc:          random source used for string generation
+// length:      NOTE: currently not used here - the length of every string is
+//              drawn from rc.sequence_length instead
+// description: text placed into the timing output line
+template<typename From, typename ToTypesCollectionType, size_t index = 0>
+void test_random_valid(random_context& rc, size_t length, const std::string& description)
+{
+ typedef typename std::tuple_element<index,ToTypesCollectionType>::type To;
+
+ // Fill UTF-32 data list: source for tests
+ std::vector<std::u32string> u32list;
+ std::generate_n(std::back_inserter(u32list), 1000, [&](){return generate_random_string(rc, rc.sequence_length(rc.gen));});
+
+ // Fill From data list
+ std::vector<From> list;
+ std::transform(u32list.begin(), u32list.end(), std::back_inserter(list), [](const std::u32string& s){
+ return unicode::convert<unicode::UTF_32, typename unicode::Encoding<typename From::value_type>::Facet>(s);
+ });
+
+ // size_t index: list.size() is unsigned, avoid a signed/unsigned comparison
+ for (size_t i = 0; i < list.size(); i++) {
+ BOOST_CHECK(list[i].size() >= u32list[i].size());
+ To result{unicode::convert<typename unicode::Encoding<typename From::value_type>::Facet,typename unicode::Encoding<typename To::value_type>::Facet>(list[i])};
+ BOOST_CHECK(result.size() >= u32list[i].size());
+ }
+
+ // Timing 1: the conversion under test
{
- // Performance test UTF-8 -> UTF-32
auto t0{std::chrono::steady_clock::now()};
- for (const auto& i : u8list) {
- std::u32string s{unicode::convert<unicode::UTF_8, unicode::UTF_32>(i)};
- }
- std::cout << "Performance test for converting 1M strings from UTF-8 to UTF-32: " << std::chrono::duration<double>(std::chrono::steady_clock::now() - t0).count() << std::endl;
+ for (const auto& i: list)
+ To result{unicode::convert<typename unicode::Encoding<typename From::value_type>::Facet,typename unicode::Encoding<typename To::value_type>::Facet>(i)};
+ // the UTF-xx labels are derived from the code unit width in bits
+ std::cout << "Performance test for converting " << list.size() <<
+ " " << description <<
+ " from UTF-" << (sizeof(typename From::value_type) * 8) <<
+ " to UTF-" << (sizeof(typename To::value_type) * 8) << ": " <<
+ std::chrono::duration<double>(std::chrono::steady_clock::now() - t0).count() << "s" <<
+ std::endl;
}
-
+
+ // Timing 2: same input through boost::locale
{
- // Performance test UTF-8 -> UTF-16
auto t0{std::chrono::steady_clock::now()};
- for (const auto& i : u8list) {
- std::u16string s{unicode::convert<unicode::UTF_8, unicode::UTF_16>(i)};
- }
- std::cout << "Performance test for converting 1M strings from UTF-8 to UTF-16: " << std::chrono::duration<double>(std::chrono::steady_clock::now() - t0).count() << std::endl;
+ for (const auto& i: list)
+ To result{boost::locale::conv::utf_to_utf<typename To::value_type, typename From::value_type>(i)};
+ std::cout << " -> Compare to boost::locale::conv::utf_to_utf: " <<
+ std::chrono::duration<double>(std::chrono::steady_clock::now() - t0).count() << "s" <<
+ std::endl;
}
+ // Timing 3: same input through the std::wstring_convert based std_convert
{
- // Performance test UTF-8 -> UTF-8
auto t0{std::chrono::steady_clock::now()};
- for (const auto& i : u8list) {
- std::basic_string<utf8_t> s{unicode::convert<unicode::UTF_8, unicode::UTF_8>(i)};
- }
- std::cout << "Performance test for converting 1M strings from UTF-8 to UTF-8: " << std::chrono::duration<double>(std::chrono::steady_clock::now() - t0).count() << std::endl;
+ for (const auto& i: list)
+ To result{std_convert<typename From::value_type, typename To::value_type>(i)};
+ std::cout << " -> Compare to std::wstring_convert: " <<
+ std::chrono::duration<double>(std::chrono::steady_clock::now() - t0).count() << "s" <<
+ std::endl;
}
+ // iterate over remaining To types
+ if constexpr (index + 1 < std::tuple_size<ToTypesCollectionType>::value)
+ test_random_valid<From, ToTypesCollectionType, index + 1>(rc, length, description);
+}
+
+// Valid-input conversion test and timing, run once per type T with the
+// default code point range of random_context.
+BOOST_AUTO_TEST_CASE_TEMPLATE(random_sequences_valid_all_unicode, T, types_collection_type)
+{
+ random_context context;
+ const size_t length{context.sequence_length(context.gen)};
+ const std::string description{"All Unicode strings"};
+
+ test_random_valid<T,types_collection_type>(context, length, description);
+}
+
+// Valid-input conversion test and timing, run once per type T with the
+// code point distribution limited to 0 ... 127.
+BOOST_AUTO_TEST_CASE_TEMPLATE(random_sequences_valid_ascii, T, types_collection_type)
+{
+ random_context context{127};
+ const size_t length{context.sequence_length(context.gen)};
+ const std::string description{"ASCII only strings"};
+
+ test_random_valid<T,types_collection_type>(context, length, description);
}
// Test ISO and UTF encodings