From 9dc97269201603dd479e15a736a64479a5095556 Mon Sep 17 00:00:00 2001 From: Roland Reichwein Date: Sun, 19 Dec 2021 19:51:38 +0100 Subject: Simplify utf_iterator for input, build on Debian 10+11, Ubuntu 2004-2204 --- Makefile | 30 +++++++++++++++---- debian/control | 4 +-- include/unicode.h | 85 +++++++++++++++++++++++----------------------------- src/test-unicode.cpp | 14 ++++----- 4 files changed, 70 insertions(+), 63 deletions(-) diff --git a/Makefile b/Makefile index e8a8f29..346f8a0 100644 --- a/Makefile +++ b/Makefile @@ -1,38 +1,56 @@ PROJECTNAME=unicode VERSION=$(shell dpkg-parsechangelog --show-field Version) +ifeq ($(shell lsb_release -si),Debian) +ONDEBIAN=yes +else +ONDEBIAN=no +endif -DISTROS=base debian11 ubuntu2110 +# On Ubuntu 2104 and 2110, dh_strip / debugedit is broken, therefore different Non-Debian options in the following +DISTROS=base debian10 debian11 ubuntu2004 ubuntu2010 ubuntu2104 ubuntu2110 ubuntu2204 ifeq ($(wildcard $(shell which clang++-13)),) +ifeq ($(wildcard $(shell which clang++-12)),) +ifeq ($(wildcard $(shell which clang++-11)),) ifeq ($(wildcard $(shell which clang++)),) CXX=g++-11 else CXX=clang++ endif else +CXX=clang++-11 +endif +else +CXX=clang++-12 +endif +else CXX=clang++-13 endif -# boost is buggy for C++20: error: static_assert failed due to requirement 'detail::is_endian_reversible_inplace -#STANDARD=c++17 -STANDARD=c++20 +STANDARD=c++17 ifeq ($(CXXFLAGS),) #CXXFLAGS=-O0 -g -D_DEBUG CXXFLAGS=-O2 -DNDEBUG endif -CXXFLAGS+=-Wall -Iinclude -std=$(STANDARD) - ifeq ($(CXX),clang++-13) +ifeq ($(ONDEBIAN),yes) COMPILER_SUITE=clang LIBS+=-fuse-ld=lld-13 +# boost is buggy for C++20: error: static_assert failed due to requirement 'detail::is_endian_reversible_inplace +STANDARD=c++20 +endif endif ifeq ($(CXX),clang++) +ifeq ($(ONDEBIAN),yes) COMPILER_SUITE=clang LIBS+=-fuse-ld=lld endif +endif + +CXXFLAGS+=-Wall -Iinclude -std=$(STANDARD) LDLIBS+=\ -lboost_context \ diff --git a/debian/control b/debian/control index 933d5f8..0c236a3 100644 --- a/debian/control +++ b/debian/control @@ -2,7 +2,7 @@ Source: unicode Section: devel Priority: optional Maintainer: Roland Reichwein -Build-Depends: debhelper (>= 12), libboost-all-dev | libboost1.74-all-dev, libc++-dev | libc++-13-dev | libc++-11-dev, libc++abi-dev | libc++abi-13-dev | libc++abi-11-dev, lld | lld-13 | lld-11, clang | clang-13 | clang-11 +Build-Depends: debhelper (>= 12), libboost1.74-all-dev | libboost-all-dev, libc++-13-dev | libc++-12-dev | libc++-11-dev | libc++-dev, libc++abi-13-dev | libc++abi-12-dev | libc++abi-11-dev | libc++abi-dev, lld-13 | lld-12 | lld-11 | lld, clang-13 | clang-12 | clang-11 | clang, libunwind-13-dev | libunwind-12-dev | libunwind-dev, llvm-13-linker-tools | llvm-12-linker-tools | llvm-11-linker-tools | clang Standards-Version: 4.5.0 Homepage: http://www.reichwein.it/unicode/ @@ -17,7 +17,7 @@ Description: Unicode conversion library Features: - Additional support for ISO-8859-1 encoding (Latin-1) as subset of Unicode - Additional support for ISO-8859-15 - - Tested on Debian 11, Debian 10, Ubuntu 2004, Ubuntu 2010 + - Tested on Debian 10+11, Ubuntu 2004 to 2110 - C++17 and C++20 compatible Package: unicode-tools diff --git a/include/unicode.h b/include/unicode.h index 6b6f21a..6d8aac5 100644 --- a/include/unicode.h +++ b/include/unicode.h @@ -28,7 +28,7 @@ namespace unicode { // usually, char32_t, uint32_t etc. template - static inline bool is_valid_unicode(const T& value) + static inline bool is_valid_unicode(const T& value) noexcept { return value <= 0xD7FF || (value >= 0xE000 && value <= 0x10FFFF); } @@ -55,19 +55,18 @@ namespace unicode::detail { utf_iterator(const typename string_type::const_iterator& cbegin, const typename string_type::const_iterator& cend): iterator(cbegin), end_iterator(cend) { - calculate_value(); } utf_iterator(const utf_iterator& other) = default; utf_iterator& operator=(const utf_iterator& other) = default; - size_t remaining_code_units() const + size_t remaining_code_units() const noexcept { return std::distance(iterator, end_iterator); } template - T get_code_unit() const + T get_code_unit() const noexcept { if constexpr (std::is_same>::value) { // std::list doesn't support it + n @@ -79,46 +78,49 @@ namespace unicode::detail { } } - inline static bool is_continuation_byte(T b) + inline static bool is_continuation_byte(T b) noexcept { return (b & 0b11000000) == 0b10000000; } template - inline static bool is_continuation_byte(T b, Targs... Fargs) + inline static bool is_continuation_byte(T b, Targs... Fargs) noexcept { return is_continuation_byte(b) && is_continuation_byte(Fargs...); } template - inline static bool is_byte0_of(T b) + inline static bool is_byte0_of(T b) noexcept { return (b & static_cast(0xFF << (7 - n))) == static_cast(0xFF << (8 - n)); } - inline static char32_t continuation_value(T b) + inline static char32_t continuation_value(T b) noexcept { return static_cast(b & 0b00111111); } template - inline static char32_t continuation_value(T b, Targs... Fargs) + inline static char32_t continuation_value(T b, Targs... Fargs) noexcept { return continuation_value(b) << (6 * sizeof...(Targs)) | continuation_value(Fargs...); } template - inline static char32_t value_byte0_of(T b) + inline static char32_t value_byte0_of(T b) noexcept { return static_cast(b & (0b1111111 >> n)) << ((n - 1) * 6); } - void calculate_value_utf8() + template::type = true> + inline value_type calculate_value() { size_t remaining{remaining_code_units()}; if (!remaining) - return; + return {}; + + value_type value{}; utf8_t byte0 {static_cast(get_code_unit<0>())}; if (byte0 & 0x80) { // 2-4 bytes @@ -126,17 +128,17 @@ namespace unicode::detail { utf8_t byte1 {static_cast(get_code_unit<1>())}; if (is_byte0_of<2>(byte0) && is_continuation_byte(byte1)) { // 2 bytes value = value_byte0_of<2>(byte0) | continuation_value(byte1); - sequence_length = 2; + std::advance(iterator, 2); } else if (remaining >= 3) { utf8_t byte2 {static_cast(get_code_unit<2>())}; if (is_byte0_of<3>(byte0) && is_continuation_byte(byte1, byte2)) { // 3 bytes value = value_byte0_of<3>(byte0) | continuation_value(byte1, byte2); - sequence_length = 3; + std::advance(iterator, 3); } else if (remaining >= 4) { utf8_t byte3 {static_cast(get_code_unit<3>())}; if (is_byte0_of<4>(byte0) && is_continuation_byte(byte1, byte2, byte3)) { // 4 bytes value = value_byte0_of<4>(byte0) | continuation_value(byte1, byte2, byte3); - sequence_length = 4; + std::advance(iterator, 4); } else throw std::invalid_argument("Bad input: Invalid 4 byte sequence"); } else @@ -152,22 +154,25 @@ namespace unicode::detail { } else { // 1 byte: 7 bit ASCII value = byte0; - sequence_length = 1; + std::advance(iterator, 1); } + + return value; } - void calculate_value_utf16() + template::type = true> + inline value_type calculate_value() { size_t remaining{remaining_code_units()}; if (!remaining) - return; + return {}; char16_t unit0 {static_cast(get_code_unit<0>())}; if (unit0 <= 0xD7FF || unit0 >= 0xE000) { // 1 unit (BMP Basic Multilingual Plane) - value = unit0; - sequence_length = 1; + std::advance(iterator, 1); + return unit0; } else { if (remaining < 2) throw std::invalid_argument("Bad input: Continuation of first UTF-16 unit missing"); @@ -176,45 +181,32 @@ namespace unicode::detail { if ((unit0 & 0xFC00) != 0xD800 || (unit1 & 0xFC00) != 0xDC00) throw std::invalid_argument("Bad input: 2 malformed UTF-16 surrogates"); - value = (static_cast(unit0 & 0x03FF) << 10 | (unit1 & 0x03FF)) + 0x10000; - sequence_length = 2; + std::advance(iterator, 2); + return (static_cast(unit0 & 0x03FF) << 10 | (unit1 & 0x03FF)) + 0x10000; } } - void calculate_value_utf32() + template::type = true> + inline value_type calculate_value() { size_t remaining{remaining_code_units()}; if (!remaining) - return; + return {}; - value = static_cast(get_code_unit<0>()); - - if (!unicode::is_valid_unicode(value)) - throw std::invalid_argument("Invalid Unicode character: "s + std::to_string(static_cast(value))); + value_type result {static_cast(get_code_unit<0>())}; - sequence_length = 1; - } + if (!unicode::is_valid_unicode(result)) + throw std::invalid_argument("Invalid Unicode character: "s + std::to_string(static_cast(result))); - // set value member - void calculate_value() - { - static_assert(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4); + std::advance(iterator, 1); - if constexpr(sizeof(T) == 1) { - calculate_value_utf8(); - } else if constexpr (sizeof(T) == 2) { - calculate_value_utf16(); - } else if constexpr (sizeof(T) == 4) { - calculate_value_utf32(); - } + return result; } // pre-increment utf_iterator& operator++() { - std::advance(iterator, sequence_length); - calculate_value(); return *this; } @@ -223,17 +215,14 @@ namespace unicode::detail { return std::distance(iterator, end_iterator) != std::distance(other.iterator, other.end_iterator); } - reference operator*() + value_type operator*() { - return value; + return calculate_value(); } private: typename string_type::const_iterator iterator; typename string_type::const_iterator end_iterator; - - char32_t value{}; // always save complete unicode code point at this point - size_t sequence_length{}; }; template> diff --git a/src/test-unicode.cpp b/src/test-unicode.cpp index 29e5c2e..d00a33d 100644 --- a/src/test-unicode.cpp +++ b/src/test-unicode.cpp @@ -372,7 +372,7 @@ BOOST_AUTO_TEST_CASE(random_sequences_valid) std::transform(u32list.begin(), u32list.end(), std::back_inserter(u16list), [](const std::u32string& s){return unicode::convert(s);}); // Fill UTF-8 data list - std::vector u8list; + std::vector> u8list; std::transform(u32list.begin(), u32list.end(), std::back_inserter(u8list), [](const std::u32string& s){return unicode::convert(s);}); for (const auto& i : u32list) { @@ -380,7 +380,7 @@ BOOST_AUTO_TEST_CASE(random_sequences_valid) BOOST_CHECK(s32.size() == i.size()); std::u16string s16{unicode::convert(i)}; BOOST_CHECK(s16.size() >= i.size()); - std::u8string s8{unicode::convert(i)}; + std::basic_string s8{unicode::convert(i)}; BOOST_CHECK(s8.size() >= i.size()); } @@ -389,7 +389,7 @@ BOOST_AUTO_TEST_CASE(random_sequences_valid) BOOST_CHECK(s32.size() > 0 || i.size() == 0); std::u16string s16{unicode::convert(i)}; BOOST_CHECK(s16.size() == i.size()); - std::u8string s8{unicode::convert(i)}; + std::basic_string s8{unicode::convert(i)}; BOOST_CHECK(s8.size() >= i.size()); } @@ -398,7 +398,7 @@ BOOST_AUTO_TEST_CASE(random_sequences_valid) BOOST_CHECK(s32.size() > 0 || i.size() == 0); std::u16string s16{unicode::convert(i)}; BOOST_CHECK(s16.size() > 0 || i.size() == 0); - std::u8string s8{unicode::convert(i)}; + std::basic_string s8{unicode::convert(i)}; BOOST_CHECK(s8.size() == i.size()); } @@ -424,7 +424,7 @@ BOOST_AUTO_TEST_CASE(random_sequences_valid) // Performance test UTF-32 -> UTF-8 auto t0{std::chrono::steady_clock::now()}; for (const auto& i : u32list) { - std::u8string s{unicode::convert(i)}; + std::basic_string s{unicode::convert(i)}; } std::cout << "Performance test for converting 1M strings from UTF-32 to UTF-8: " << std::chrono::duration(std::chrono::steady_clock::now() - t0).count() << std::endl; } @@ -451,7 +451,7 @@ BOOST_AUTO_TEST_CASE(random_sequences_valid) // Performance test UTF-16 -> UTF-8 auto t0{std::chrono::steady_clock::now()}; for (const auto& i : u16list) { - std::u8string s{unicode::convert(i)}; + std::basic_string s{unicode::convert(i)}; } std::cout << "Performance test for converting 1M strings from UTF-16 to UTF-8: " << std::chrono::duration(std::chrono::steady_clock::now() - t0).count() << std::endl; } @@ -478,7 +478,7 @@ BOOST_AUTO_TEST_CASE(random_sequences_valid) // Performance test UTF-8 -> UTF-8 auto t0{std::chrono::steady_clock::now()}; for (const auto& i : u8list) { - std::u8string s{unicode::convert(i)}; + std::basic_string s{unicode::convert(i)}; } std::cout << "Performance test for converting 1M strings from UTF-8 to UTF-8: " << std::chrono::duration(std::chrono::steady_clock::now() - t0).count() << std::endl; } -- cgit v1.2.3