summary refs log tree commit diff homepage
diff options
context:
space:
mode:
author Roland Reichwein <mail@reichwein.it> 2021-12-19 19:51:38 +0100
committer Roland Reichwein <mail@reichwein.it> 2021-12-19 19:51:38 +0100
commit 9dc97269201603dd479e15a736a64479a5095556 (patch)
tree b5f215bf9cfbbf2bee092505f4fdfbf3e4501b7b
parent e24a0d5d371d0916dbfb375d3ea404f7e6237c74 (diff)
Simplify utf_iterator for input, build on Debian 10+11, Ubuntu 2004-2204
-rw-r--r-- Makefile 30
-rw-r--r-- debian/control 4
-rw-r--r-- include/unicode.h 85
-rw-r--r-- src/test-unicode.cpp 14
4 files changed, 70 insertions, 63 deletions
diff --git a/Makefile b/Makefile
index e8a8f29..346f8a0 100644
--- a/Makefile
+++ b/Makefile
@@ -1,38 +1,56 @@
PROJECTNAME=unicode
VERSION=$(shell dpkg-parsechangelog --show-field Version)
+ifeq ($(shell lsb_release -si),Debian)
+ONDEBIAN=yes
+else
+ONDEBIAN=no
+endif
-DISTROS=base debian11 ubuntu2110
+# On Ubuntu 2104 and 2110, dh_strip / debugedit is broken, therefore different Non-Debian options in the following
+DISTROS=base debian10 debian11 ubuntu2004 ubuntu2010 ubuntu2104 ubuntu2110 ubuntu2204
ifeq ($(wildcard $(shell which clang++-13)),)
+ifeq ($(wildcard $(shell which clang++-12)),)
+ifeq ($(wildcard $(shell which clang++-11)),)
ifeq ($(wildcard $(shell which clang++)),)
CXX=g++-11
else
CXX=clang++
endif
else
+CXX=clang++-11
+endif
+else
+CXX=clang++-12
+endif
+else
CXX=clang++-13
endif
-# boost is buggy for C++20: error: static_assert failed due to requirement 'detail::is_endian_reversible_inplace<char8_t>
-#STANDARD=c++17
-STANDARD=c++20
+STANDARD=c++17
ifeq ($(CXXFLAGS),)
#CXXFLAGS=-O0 -g -D_DEBUG
CXXFLAGS=-O2 -DNDEBUG
endif
-CXXFLAGS+=-Wall -Iinclude -std=$(STANDARD)
-
ifeq ($(CXX),clang++-13)
+ifeq ($(ONDEBIAN),yes)
COMPILER_SUITE=clang
LIBS+=-fuse-ld=lld-13
+# boost is buggy for C++20: error: static_assert failed due to requirement 'detail::is_endian_reversible_inplace<char8_t>
+STANDARD=c++20
+endif
endif
ifeq ($(CXX),clang++)
+ifeq ($(ONDEBIAN),yes)
COMPILER_SUITE=clang
LIBS+=-fuse-ld=lld
endif
+endif
+
+CXXFLAGS+=-Wall -Iinclude -std=$(STANDARD)
LDLIBS+=\
-lboost_context \
diff --git a/debian/control b/debian/control
index 933d5f8..0c236a3 100644
--- a/debian/control
+++ b/debian/control
@@ -2,7 +2,7 @@ Source: unicode
Section: devel
Priority: optional
Maintainer: Roland Reichwein <mail@reichwein.it>
-Build-Depends: debhelper (>= 12), libboost-all-dev | libboost1.74-all-dev, libc++-dev | libc++-13-dev | libc++-11-dev, libc++abi-dev | libc++abi-13-dev | libc++abi-11-dev, lld | lld-13 | lld-11, clang | clang-13 | clang-11
+Build-Depends: debhelper (>= 12), libboost1.74-all-dev | libboost-all-dev, libc++-13-dev | libc++-12-dev | libc++-11-dev | libc++-dev, libc++abi-13-dev | libc++abi-12-dev | libc++abi-11-dev | libc++abi-dev, lld-13 | lld-12 | lld-11 | lld, clang-13 | clang-12 | clang-11 | clang, libunwind-13-dev | libunwind-12-dev | libunwind-dev, llvm-13-linker-tools | llvm-12-linker-tools | llvm-11-linker-tools | clang
Standards-Version: 4.5.0
Homepage: http://www.reichwein.it/unicode/
@@ -17,7 +17,7 @@ Description: Unicode conversion library
Features:
- Additional support for ISO-8859-1 encoding (Latin-1) as subset of Unicode
- Additional support for ISO-8859-15
- - Tested on Debian 11, Debian 10, Ubuntu 2004, Ubuntu 2010
+ - Tested on Debian 10+11, Ubuntu 2004 to 2110
- C++17 and C++20 compatible
Package: unicode-tools
diff --git a/include/unicode.h b/include/unicode.h
index 6b6f21a..6d8aac5 100644
--- a/include/unicode.h
+++ b/include/unicode.h
@@ -28,7 +28,7 @@ namespace unicode {
// usually, char32_t, uint32_t etc.
template<typename T>
- static inline bool is_valid_unicode(const T& value)
+ static inline bool is_valid_unicode(const T& value) noexcept
{
return value <= 0xD7FF || (value >= 0xE000 && value <= 0x10FFFF);
}
@@ -55,19 +55,18 @@ namespace unicode::detail {
utf_iterator(const typename string_type::const_iterator& cbegin, const typename string_type::const_iterator& cend):
iterator(cbegin), end_iterator(cend)
{
- calculate_value();
}
utf_iterator(const utf_iterator& other) = default;
utf_iterator& operator=(const utf_iterator& other) = default;
- size_t remaining_code_units() const
+ size_t remaining_code_units() const noexcept
{
return std::distance(iterator, end_iterator);
}
template<size_t index>
- T get_code_unit() const
+ T get_code_unit() const noexcept
{
if constexpr (std::is_same<Container, typename std::list<T>>::value) {
// std::list doesn't support it + n
@@ -79,46 +78,49 @@ namespace unicode::detail {
}
}
- inline static bool is_continuation_byte(T b)
+ inline static bool is_continuation_byte(T b) noexcept
{
return (b & 0b11000000) == 0b10000000;
}
template<typename... Targs>
- inline static bool is_continuation_byte(T b, Targs... Fargs)
+ inline static bool is_continuation_byte(T b, Targs... Fargs) noexcept
{
return is_continuation_byte(b) && is_continuation_byte(Fargs...);
}
template<size_t n>
- inline static bool is_byte0_of(T b)
+ inline static bool is_byte0_of(T b) noexcept
{
return (b & static_cast<T>(0xFF << (7 - n))) == static_cast<T>(0xFF << (8 - n));
}
- inline static char32_t continuation_value(T b)
+ inline static char32_t continuation_value(T b) noexcept
{
return static_cast<char32_t>(b & 0b00111111);
}
template<typename... Targs>
- inline static char32_t continuation_value(T b, Targs... Fargs)
+ inline static char32_t continuation_value(T b, Targs... Fargs) noexcept
{
return continuation_value(b) << (6 * sizeof...(Targs)) | continuation_value(Fargs...);
}
template<size_t n>
- inline static char32_t value_byte0_of(T b)
+ inline static char32_t value_byte0_of(T b) noexcept
{
return static_cast<char32_t>(b & (0b1111111 >> n)) << ((n - 1) * 6);
}
- void calculate_value_utf8()
+ template<class X = T, typename std::enable_if<(sizeof(X) == 1), bool>::type = true>
+ inline value_type calculate_value()
{
size_t remaining{remaining_code_units()};
if (!remaining)
- return;
+ return {};
+
+ value_type value{};
utf8_t byte0 {static_cast<utf8_t>(get_code_unit<0>())};
if (byte0 & 0x80) { // 2-4 bytes
@@ -126,17 +128,17 @@ namespace unicode::detail {
utf8_t byte1 {static_cast<utf8_t>(get_code_unit<1>())};
if (is_byte0_of<2>(byte0) && is_continuation_byte(byte1)) { // 2 bytes
value = value_byte0_of<2>(byte0) | continuation_value(byte1);
- sequence_length = 2;
+ std::advance(iterator, 2);
} else if (remaining >= 3) {
utf8_t byte2 {static_cast<utf8_t>(get_code_unit<2>())};
if (is_byte0_of<3>(byte0) && is_continuation_byte(byte1, byte2)) { // 3 bytes
value = value_byte0_of<3>(byte0) | continuation_value(byte1, byte2);
- sequence_length = 3;
+ std::advance(iterator, 3);
} else if (remaining >= 4) {
utf8_t byte3 {static_cast<utf8_t>(get_code_unit<3>())};
if (is_byte0_of<4>(byte0) && is_continuation_byte(byte1, byte2, byte3)) { // 4 bytes
value = value_byte0_of<4>(byte0) | continuation_value(byte1, byte2, byte3);
- sequence_length = 4;
+ std::advance(iterator, 4);
} else
throw std::invalid_argument("Bad input: Invalid 4 byte sequence");
} else
@@ -152,22 +154,25 @@ namespace unicode::detail {
} else { // 1 byte: 7 bit ASCII
value = byte0;
- sequence_length = 1;
+ std::advance(iterator, 1);
}
+
+ return value;
}
- void calculate_value_utf16()
+ template<class X = T, typename std::enable_if<(sizeof(X) == 2), bool>::type = true>
+ inline value_type calculate_value()
{
size_t remaining{remaining_code_units()};
if (!remaining)
- return;
+ return {};
char16_t unit0 {static_cast<char16_t>(get_code_unit<0>())};
if (unit0 <= 0xD7FF || unit0 >= 0xE000) { // 1 unit (BMP Basic Multilingual Plane)
- value = unit0;
- sequence_length = 1;
+ std::advance(iterator, 1);
+ return unit0;
} else {
if (remaining < 2)
throw std::invalid_argument("Bad input: Continuation of first UTF-16 unit missing");
@@ -176,45 +181,32 @@ namespace unicode::detail {
if ((unit0 & 0xFC00) != 0xD800 || (unit1 & 0xFC00) != 0xDC00)
throw std::invalid_argument("Bad input: 2 malformed UTF-16 surrogates");
- value = (static_cast<char32_t>(unit0 & 0x03FF) << 10 | (unit1 & 0x03FF)) + 0x10000;
- sequence_length = 2;
+ std::advance(iterator, 2);
+ return (static_cast<char32_t>(unit0 & 0x03FF) << 10 | (unit1 & 0x03FF)) + 0x10000;
}
}
- void calculate_value_utf32()
+ template<class X = T, typename std::enable_if<(sizeof(X) == 4), bool>::type = true>
+ inline value_type calculate_value()
{
size_t remaining{remaining_code_units()};
if (!remaining)
- return;
+ return {};
- value = static_cast<char32_t>(get_code_unit<0>());
-
- if (!unicode::is_valid_unicode(value))
- throw std::invalid_argument("Invalid Unicode character: "s + std::to_string(static_cast<uint32_t>(value)));
+ value_type result {static_cast<char32_t>(get_code_unit<0>())};
- sequence_length = 1;
- }
+ if (!unicode::is_valid_unicode(result))
+ throw std::invalid_argument("Invalid Unicode character: "s + std::to_string(static_cast<uint32_t>(result)));
- // set value member
- void calculate_value()
- {
- static_assert(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4);
+ std::advance(iterator, 1);
- if constexpr(sizeof(T) == 1) {
- calculate_value_utf8();
- } else if constexpr (sizeof(T) == 2) {
- calculate_value_utf16();
- } else if constexpr (sizeof(T) == 4) {
- calculate_value_utf32();
- }
+ return result;
}
// pre-increment
utf_iterator& operator++()
{
- std::advance(iterator, sequence_length);
- calculate_value();
return *this;
}
@@ -223,17 +215,14 @@ namespace unicode::detail {
return std::distance(iterator, end_iterator) != std::distance(other.iterator, other.end_iterator);
}
- reference operator*()
+ value_type operator*()
{
- return value;
+ return calculate_value();
}
private:
typename string_type::const_iterator iterator;
typename string_type::const_iterator end_iterator;
-
- char32_t value{}; // always save complete unicode code point at this point
- size_t sequence_length{};
};
template<typename T, typename Container=std::basic_string<T>>
diff --git a/src/test-unicode.cpp b/src/test-unicode.cpp
index 29e5c2e..d00a33d 100644
--- a/src/test-unicode.cpp
+++ b/src/test-unicode.cpp
@@ -372,7 +372,7 @@ BOOST_AUTO_TEST_CASE(random_sequences_valid)
std::transform(u32list.begin(), u32list.end(), std::back_inserter(u16list), [](const std::u32string& s){return unicode::convert<unicode::UTF_32, unicode::UTF_16>(s);});
// Fill UTF-8 data list
- std::vector<std::u8string> u8list;
+ std::vector<std::basic_string<utf8_t>> u8list;
std::transform(u32list.begin(), u32list.end(), std::back_inserter(u8list), [](const std::u32string& s){return unicode::convert<unicode::UTF_32, unicode::UTF_8>(s);});
for (const auto& i : u32list) {
@@ -380,7 +380,7 @@ BOOST_AUTO_TEST_CASE(random_sequences_valid)
BOOST_CHECK(s32.size() == i.size());
std::u16string s16{unicode::convert<unicode::UTF_32, unicode::UTF_16>(i)};
BOOST_CHECK(s16.size() >= i.size());
- std::u8string s8{unicode::convert<unicode::UTF_32, unicode::UTF_8>(i)};
+ std::basic_string<utf8_t> s8{unicode::convert<unicode::UTF_32, unicode::UTF_8>(i)};
BOOST_CHECK(s8.size() >= i.size());
}
@@ -389,7 +389,7 @@ BOOST_AUTO_TEST_CASE(random_sequences_valid)
BOOST_CHECK(s32.size() > 0 || i.size() == 0);
std::u16string s16{unicode::convert<unicode::UTF_16, unicode::UTF_16>(i)};
BOOST_CHECK(s16.size() == i.size());
- std::u8string s8{unicode::convert<unicode::UTF_16, unicode::UTF_8>(i)};
+ std::basic_string<utf8_t> s8{unicode::convert<unicode::UTF_16, unicode::UTF_8>(i)};
BOOST_CHECK(s8.size() >= i.size());
}
@@ -398,7 +398,7 @@ BOOST_AUTO_TEST_CASE(random_sequences_valid)
BOOST_CHECK(s32.size() > 0 || i.size() == 0);
std::u16string s16{unicode::convert<unicode::UTF_8, unicode::UTF_16>(i)};
BOOST_CHECK(s16.size() > 0 || i.size() == 0);
- std::u8string s8{unicode::convert<unicode::UTF_8, unicode::UTF_8>(i)};
+ std::basic_string<utf8_t> s8{unicode::convert<unicode::UTF_8, unicode::UTF_8>(i)};
BOOST_CHECK(s8.size() == i.size());
}
@@ -424,7 +424,7 @@ BOOST_AUTO_TEST_CASE(random_sequences_valid)
// Performance test UTF-32 -> UTF-8
auto t0{std::chrono::steady_clock::now()};
for (const auto& i : u32list) {
- std::u8string s{unicode::convert<unicode::UTF_32, unicode::UTF_8>(i)};
+ std::basic_string<utf8_t> s{unicode::convert<unicode::UTF_32, unicode::UTF_8>(i)};
}
std::cout << "Performance test for converting 1M strings from UTF-32 to UTF-8: " << std::chrono::duration<double>(std::chrono::steady_clock::now() - t0).count() << std::endl;
}
@@ -451,7 +451,7 @@ BOOST_AUTO_TEST_CASE(random_sequences_valid)
// Performance test UTF-16 -> UTF-8
auto t0{std::chrono::steady_clock::now()};
for (const auto& i : u16list) {
- std::u8string s{unicode::convert<unicode::UTF_16, unicode::UTF_8>(i)};
+ std::basic_string<utf8_t> s{unicode::convert<unicode::UTF_16, unicode::UTF_8>(i)};
}
std::cout << "Performance test for converting 1M strings from UTF-16 to UTF-8: " << std::chrono::duration<double>(std::chrono::steady_clock::now() - t0).count() << std::endl;
}
@@ -478,7 +478,7 @@ BOOST_AUTO_TEST_CASE(random_sequences_valid)
// Performance test UTF-8 -> UTF-8
auto t0{std::chrono::steady_clock::now()};
for (const auto& i : u8list) {
- std::u8string s{unicode::convert<unicode::UTF_8, unicode::UTF_8>(i)};
+ std::basic_string<utf8_t> s{unicode::convert<unicode::UTF_8, unicode::UTF_8>(i)};
}
std::cout << "Performance test for converting 1M strings from UTF-8 to UTF-8: " << std::chrono::duration<double>(std::chrono::steady_clock::now() - t0).count() << std::endl;
}