summaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
author: Roland Reichwein <mail@reichwein.it> 2021-01-31 19:00:34 +0100
committer: Roland Reichwein <mail@reichwein.it> 2021-01-31 19:00:34 +0100
commit: 611601ec36a5603bc9c94cdac9a307c4bb07c929 (patch)
tree: 0b1c27d5958a2a3bdfe3c421a27f6ab528fbc3e1
parent: 2ef9f51df48b14556e236d14213233e1bd7f829a (diff)
Add facet based interface
-rw-r--r-- Makefile 8
-rw-r--r-- debian/control 10
-rw-r--r-- include/unicode.h 221
-rw-r--r-- src/test-unicode.cpp 72
-rw-r--r-- src/validate.cpp 4
5 files changed, 288 insertions, 27 deletions
diff --git a/Makefile b/Makefile
index b66c17e..5d64631 100644
--- a/Makefile
+++ b/Makefile
@@ -60,9 +60,10 @@ endif
SRC=\
src/recode.cpp \
+ src/validate.cpp \
src/test-unicode.cpp
-all: src/recode src/test-unicode
+all: src/recode src/test-unicode src/validate
test: src/test-unicode
src/test-unicode
@@ -70,6 +71,9 @@ test: src/test-unicode
src/recode: src/recode.o dep
$(CXX) $(LDFLAGS) $< $(LDLIBS) $(LIBS) -o $@
+src/validate: src/validate.o dep
+ $(CXX) $(LDFLAGS) $< $(LDLIBS) $(LIBS) -o $@
+
src/test-unicode: src/test-unicode.o dep
$(CXX) $(LDFLAGS) $< $(LDLIBS) $(LIBS) -o $@
@@ -82,7 +86,7 @@ dep: $(SRC:.cpp=.d)
$(CXX) $(CXXFLAGS) -c $< -o $@
clean:
- -rm -f src/recode src/test-unicode
+ -rm -f src/recode src/test-unicode src/validate
-rm -rf result
-find . -name '*.o' -o -name '*.d' -o -name '*.gcno' -o -name '*.gcda' | xargs rm -f
diff --git a/debian/control b/debian/control
index 1572512..9d31022 100644
--- a/debian/control
+++ b/debian/control
@@ -19,3 +19,13 @@ Description: Unicode conversion library
- Additional support for ISO-8859-15
- Tested on Debian 10, Ubuntu 2004, Ubuntu 2010
- C++17 and C++20 compatible
+
+Package: unicode-tools
+Architecture: any
+Depends: ${shlibs:Depends}, ${misc:Depends}
+Homepage: http://www.reichwein.it/unicode/
+Description: Unicode conversion tools
+ unicode-tools is a collection of tools for Unicode file conversion:
+ .
+ - unicode-recode: Recode Unicode or ISO-8859 file
+ - unicode-validate: Check file for Unicode compliance
diff --git a/include/unicode.h b/include/unicode.h
index f31cbac..4b676bf 100644
--- a/include/unicode.h
+++ b/include/unicode.h
@@ -3,8 +3,10 @@
#pragma once
#include <algorithm>
+#include <memory>
#include <stdexcept>
#include <string>
+#include <unordered_map>
#ifdef __cpp_char8_t
// char8_t available
@@ -31,7 +33,7 @@ namespace unicode::detail {
template<typename T>
struct utf_iterator
{
- typedef char32_t value_type;
+ typedef T value_type;
typedef char32_t& reference;
typedef std::basic_string<T> string_type;
@@ -201,6 +203,7 @@ namespace unicode::detail {
return value;
}
+ private:
typename string_type::const_iterator iterator;
typename string_type::const_iterator end_iterator;
@@ -211,13 +214,14 @@ namespace unicode::detail {
template<typename T>
struct utf_back_insert_iterator
{
+ typedef T value_type;
typedef std::basic_string<T> string_type;
typedef utf_back_insert_iterator& reference;
utf_back_insert_iterator(string_type& s): s(s) {}
// no-op
- utf_back_insert_iterator& operator++()
+ reference operator++()
{
return *this;
}
@@ -302,39 +306,220 @@ namespace unicode::detail {
return *this;
}
+ private:
typename utf_back_insert_iterator::string_type& s;
};
- template<typename T>
- utf_back_insert_iterator<T> utf_back_inserter(std::basic_string<T>& s)
- {
- return utf_back_insert_iterator<T>(s);
+ // Lookup tables for ISO-8859-* charsets: byte -> code point, and the reverse direction.
+ // Only positions that differ from ISO-8859-1 are listed; every other byte equals its code point.
+ typedef std::unordered_map<utf8_t, char32_t> iso_map_type;
+ typedef std::unordered_map<char32_t, utf8_t> iso_map_type_reverse;
+
+ // The tables and the helper below are `inline` because this header defines them:
+ // without it, each translation unit including the header would emit its own
+ // definition and linking two of them together would fail.
+
+ // ISO-8859-1 is lower 8-bit of Unicode, so no exceptions necessary
+ inline iso_map_type iso_8859_1_map;
+
+ // ISO-8859-15 is lower 8-bit of Unicode, except for:
+ inline iso_map_type iso_8859_15_map {
+ { '\xA4', U'\u20AC' }, // €
+ { '\xA6', U'\u0160' }, // Š
+ { '\xA8', U'\u0161' }, // š
+ { '\xB4', U'\u017D' }, // Ž
+ { '\xB8', U'\u017E' }, // ž
+ { '\xBC', U'\u0152' }, // Œ
+ { '\xBD', U'\u0153' }, // œ
+ { '\xBE', U'\u0178' }, // Ÿ
+ };
+
+ // Build the code point -> byte table by swapping key and value of a byte -> code point table.
+ inline iso_map_type_reverse reverse_iso_map(const iso_map_type& map) {
+ iso_map_type_reverse result;
+ std::for_each(map.cbegin(), map.cend(),
+ [&](const iso_map_type::value_type& pair)
+ {
+ result.emplace(pair.second, pair.first);
+ });
+ return result;
}
- template<typename T>
- utf_iterator<T> utf_begin(const std::basic_string<T>& s)
+ // Reverse tables; defined after the forward tables they are built from.
+ inline iso_map_type_reverse iso_8859_15_map_reverse { reverse_iso_map(iso_8859_15_map) };
+ inline iso_map_type_reverse iso_8859_1_map_reverse { reverse_iso_map(iso_8859_1_map) };
+
+} // namespace unicode::detail
+
+namespace unicode {
+
+ using namespace detail;
+
+ template<unicode::detail::iso_map_type& Map=iso_8859_1_map> // Map: bytes whose code point differs from ISO-8859-1
+ struct iso_iterator { // reads a string of ISO-8859-* bytes and yields one char32_t code point per byte
+ typedef char32_t value_type;
+ typedef char32_t& reference; // NOTE(review): only value_type/reference are declared here; confirm std::iterator_traits requirements of std::copy are met
+ typedef std::basic_string<utf8_t>::const_iterator iterator;
+
+ iso_iterator(const iterator& it): m_it(it) {} // wraps a position in the source string; no end position is stored
+
+ // pre-increment: moves to the next byte of the underlying string
+ iso_iterator& operator++()
+ {
+ ++m_it;
+ return *this;
+ }
+
+ bool operator!=(const iso_iterator& other) const // compares the underlying string positions only
+ {
+ return m_it != other.m_it;
+ }
+
+ // decodes the current byte; the code point is returned by value, not as `reference`
+ value_type operator*()
+ {
+ utf8_t value{*m_it};
+
+ if constexpr(std::addressof(Map) != std::addressof(iso_8859_1_map)) // mapping of 128 <= x <= 255 needed
+ {
+ auto it{Map.find(value)};
+ if (it != Map.end())
+ return it->second; // byte is listed in Map: use the mapped code point
+ }
+ return static_cast<value_type>(static_cast<uint8_t>(value)); // go through uint8_t so the result is always 0..255
+ }
+
+ private:
+ iterator m_it;
+ };
+
+ // Output iterator that encodes Unicode code points as single ISO-8859-* bytes and
+ // appends them to a string. Map lists the code points whose byte differs from
+ // ISO-8859-1 (code point -> byte).
+ template<unicode::detail::iso_map_type_reverse& Map=iso_8859_1_map_reverse>
+ struct iso_back_insert_iterator {
+ typedef iso_back_insert_iterator& reference;
+ typedef std::basic_string<utf8_t> string_type;
+
+ iso_back_insert_iterator(string_type& s): s(s) {}
+
+ // no-op: appending in operator=() is what advances the output
+ reference operator++()
+ {
+ return *this;
+ }
+
+ // support *x = value, together with operator=()
+ reference operator*()
+ {
+ return *this;
+ }
+
+ // Encode one code point and append the resulting byte to the target string.
+ // Throws std::invalid_argument if value cannot be represented in the target charset.
+ reference operator=(const char32_t& value)
+ {
+ if constexpr(std::addressof(Map) != std::addressof(iso_8859_1_map_reverse)) // mapping of 128 <= x <= 255 needed
+ {
+ auto it{Map.find(value)};
+ if (it != Map.end()) {
+ s.push_back(it->second);
+ return *this;
+ }
+ }
+
+ if (value > 255)
+ throw std::invalid_argument("Bad Unicode value above 255: "s + std::to_string(static_cast<uint32_t>(value)));
+
+ if constexpr(std::addressof(Map) != std::addressof(iso_8859_1_map_reverse))
+ {
+ // A byte that Map assigns to some other code point no longer stands for the
+ // code point of the same number. Writing value through unchanged would
+ // silently encode a different character (e.g. U+00A4 would come out as the
+ // ISO-8859-15 euro sign), so reject it instead.
+ const utf8_t encoded{static_cast<utf8_t>(value)};
+ if (std::any_of(Map.cbegin(), Map.cend(), [encoded](const auto& entry){ return entry.second == encoded; }))
+ throw std::invalid_argument("Unicode value not available in target encoding: "s + std::to_string(static_cast<uint32_t>(value)));
+ }
+
+ s.push_back(static_cast<utf8_t>(value));
+ return *this;
+ }
+
+ private:
+ typename iso_back_insert_iterator::string_type& s;
+ };
+
+ // Facet for convert() and ISO-8859-*: pairs an input and an output iterator type with static factory functions
+ template<typename InputIt, typename OutputIt>
+ struct ISO_8859
+ {
+ typedef utf8_t value_type; // character type of strings handled by this facet
+
+ static InputIt begin(const std::basic_string<value_type>& s) // input iterator at the start of s
+ {
+ return InputIt(s.cbegin());
+ }
+
+ static InputIt end(const std::basic_string<value_type>& s) // input iterator at the end of s
+ {
+ return InputIt(s.cend());
+ }
+
+ static OutputIt back_inserter(std::basic_string<value_type>& s) // output iterator constructed on s
+ {
+ return OutputIt(s);
+ }
+ };
+
+ // Facet for convert() and UTF-*: pairs an input and an output iterator type with static factory functions
+ template<typename InputIt, typename OutputIt>
+ struct UTF
{
- return utf_iterator<T>{s.cbegin(), s.cend()};
+ typedef typename InputIt::value_type value_type; // OutputIt::value_type is the same
+
+ static InputIt begin(const std::basic_string<value_type>& s) // input iterator over [cbegin, cend) of s
+ {
+ return InputIt{s.cbegin(), s.cend()};
+ }
+
+ static InputIt end(const std::basic_string<value_type>& s) // input iterator whose position and end are both s.cend()
+ {
+ return InputIt{s.cend(), s.cend()};
+ }
+
+ static OutputIt back_inserter(std::basic_string<value_type>& s) // output iterator constructed on s
+ {
+ return OutputIt(s);
+ }
+ };
+
+ // Facets for convert(): pass one of these as its From or To template argument
+ typedef ISO_8859<iso_iterator<>, iso_back_insert_iterator<>> ISO_8859_1; // default template arguments select the ISO-8859-1 maps
+ typedef ISO_8859<iso_iterator<iso_8859_15_map>, iso_back_insert_iterator<iso_8859_15_map_reverse>> ISO_8859_15;
+
+ typedef UTF<utf_iterator<utf8_t>, utf_back_insert_iterator<utf8_t>> UTF_8;
+ typedef UTF<utf_iterator<char16_t>, utf_back_insert_iterator<char16_t>> UTF_16;
+ typedef UTF<utf_iterator<char32_t>, utf_back_insert_iterator<char32_t>> UTF_32;
+
+ // From and To are facets (e.g. UTF_8, ISO_8859_15): s is read through From's input iterators and the result is written through To's output iterator
+ template<typename From, typename To>
+ std::basic_string<typename To::value_type> convert(const std::basic_string<typename From::value_type>& s)
+ {
+ std::basic_string<typename To::value_type> result;
+
+ std::copy(From::begin(s), From::end(s), To::back_inserter(result)); // nothing is caught here: exceptions thrown by the iterators reach the caller
+
+ return result;
}
+ // Helper to get correct Facet from char type, e.g. Encoding<typename decltype(s)::value_type>::Facet
template<typename T>
- utf_iterator<T> utf_end(const std::basic_string<T>& s)
+ struct Encoding // primary template has no Facet member; only the specializations below provide one
{
- return utf_iterator<T>{s.cend(), s.cend()};
- }
+ };
-} // namespace
+ template<>
+ struct Encoding<utf8_t>
+ {
+ typedef UTF_8 Facet; // utf8_t strings use the UTF_8 facet
+ };
-namespace unicode {
+ template<>
+ struct Encoding<char16_t>
+ {
+ typedef UTF_16 Facet; // char16_t strings use the UTF_16 facet
+ };
- using namespace detail;
+ template<>
+ struct Encoding<char32_t>
+ {
+ typedef UTF_32 Facet; // char32_t strings use the UTF_32 facet
+ };
+ // From and To are character types: utf8_t, char16_t or char32_t; the matching facet is looked up via Encoding<>
template<typename From, typename To>
- std::basic_string<To> utf_to_utf(const std::basic_string<From>& s)
+ std::basic_string<To> convert(const std::basic_string<From>& s)
{
std::basic_string<To> result;
- std::copy(utf_begin<From>(s), utf_end<From>(s), utf_back_inserter<To>(result));
+ std::copy(Encoding<From>::Facet::begin(s), Encoding<From>::Facet::end(s), Encoding<To>::Facet::back_inserter(result)); // reads through From's facet, writes through To's facet
return result;
}
@@ -343,7 +528,7 @@ namespace unicode {
bool is_valid_utf(const std::basic_string<T>& s)
{
try {
- std::for_each(utf_begin<T>(s), utf_end<T>(s), [](const T& c){});
+ std::for_each(Encoding<T>::Facet::begin(s), Encoding<T>::Facet::end(s), [](const T& c){});
} catch(...) {
return false;
}
diff --git a/src/test-unicode.cpp b/src/test-unicode.cpp
index 3d67124..e1aa23d 100644
--- a/src/test-unicode.cpp
+++ b/src/test-unicode.cpp
@@ -96,13 +96,18 @@ void test_utf_to_utf(std::tuple<Ts...>& t)
typedef typename std::tuple_element<i,typename std::remove_reference<decltype(t)>::type>::type From;
typedef typename std::tuple_element<j,typename std::remove_reference<decltype(t)>::type>::type To;
- // test
- To result { unicode::utf_to_utf<typename From::value_type, typename To::value_type>(std::get<i>(t)) };
+ // test base type interface
+ To result { unicode::convert<typename From::value_type, typename To::value_type>(std::get<i>(t)) };
- BOOST_CHECK_MESSAGE(std::get<j>(t) == result, "From " << typeid(From).name() << "(" << i << ", " << std::get<i>(t) << ") to " << typeid(To).name() << "(" << j << ", " << std::get<j>(t) << "), got " << result);
+ BOOST_CHECK_MESSAGE(std::get<j>(t) == result, "Base: From " << typeid(From).name() << "(" << i << ", " << std::get<i>(t) << ") to " << typeid(To).name() << "(" << j << ", " << std::get<j>(t) << "), got " << result);
//std::cout << std::to_string(std::tuple_size<typename std::remove_reference<decltype(t)>::type>::value) << "," << std::to_string(i) << "," << std::to_string(j) << std::endl;
+
+ // test facet interface
+ result = unicode::convert<typename unicode::Encoding<typename From::value_type>::Facet, typename unicode::Encoding<typename To::value_type>::Facet>(std::get<i>(t));
+ BOOST_CHECK_MESSAGE(std::get<j>(t) == result, "Facet: From " << typeid(From).name() << "(" << i << ", " << std::get<i>(t) << ") to " << typeid(To).name() << "(" << j << ", " << std::get<j>(t) << "), got " << result);
+
// iterate over other combinations
if constexpr (i + 1 < std::tuple_size<typename std::remove_reference<decltype(t)>::type>::value)
test_utf_to_utf<i + 1, j>(t);
@@ -147,9 +152,18 @@ void test_utf_to_utf_failure(std::basic_string<From>& s)
{
typedef typename std::tuple_element<index, Collection>::type::value_type To;
+ // via base type
+ try {
+ (void) unicode::convert<From,To>(s);
+ BOOST_ERROR("Base: Expected exception at index: " << index << ", " << typeid(From).name() << " -> " << typeid(To).name());
+ } catch (...) {
+ // OK
+ };
+
+ // via facet
try {
- unicode::utf_to_utf<From,To>(s);
- BOOST_ERROR("Expected exception at index: " << index << ", " << typeid(From).name() << " -> " << typeid(To).name());
+ (void) unicode::convert<typename unicode::Encoding<From>::Facet,typename unicode::Encoding<To>::Facet>(s);
+ BOOST_ERROR("Facet: Expected exception at index: " << index << ", " << typeid(From).name() << " -> " << typeid(To).name());
} catch (...) {
// OK
};
@@ -236,14 +250,35 @@ void test_random(random_context& rc, size_t length)
From r {generate_random<From>(rc, length)};
+ // base type interface
try {
- To result{unicode::utf_to_utf<typename From::value_type,typename To::value_type>(r)};
+ To result{unicode::convert<typename From::value_type,typename To::value_type>(r)};
+
+ if (r.empty()) {
+ BOOST_CHECK(result.empty());
+ } else {
+ BOOST_CHECK(!result.empty());
+ }
} catch (const std::runtime_error&) {
// OK: this is an expected exception for utf_to_utf on bad input
} catch (const std::invalid_argument&) {
// OK: this is an expected exception for utf_to_utf on bad input
}
+ // facet interface
+ try {
+ To result{unicode::convert<typename unicode::Encoding<typename From::value_type>::Facet,typename unicode::Encoding<typename To::value_type>::Facet>(r)};
+
+ if (r.empty()) {
+ BOOST_CHECK(result.empty());
+ } else {
+ BOOST_CHECK(!result.empty());
+ }
+ } catch (const std::runtime_error&) {
+ // OK: this is an expected exception for convert on bad input
+ } catch (const std::invalid_argument&) {
+ // OK: this is an expected exception for convert on bad input
+ }
//std::cerr << "DEBUG: " << typeid(From).name() << std::endl;
//std::cerr << " DEBUG2: " << typeid(To).name() << std::endl;
@@ -255,8 +290,9 @@ void test_random(random_context& rc, size_t length)
BOOST_AUTO_TEST_CASE_TEMPLATE(random_sequences, T, types_collection_type)
{
random_context rc;
+ int i{};
- // run for 1s (debug) 10s (release)
+ // run for 1s (debug) 10s (release) = total time for all random_sequences types!
#ifdef _DEBUG
const auto timeout{1.0s};
#else
@@ -267,7 +303,29 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(random_sequences, T, types_collection_type)
while (!(std::chrono::steady_clock::now() > timeout_stamp)) {
test_random<T,types_collection_type>(rc, rc.sequence_length(rc.gen));
+ i++;
}
+
+ BOOST_CHECK_MESSAGE(i > 1, "Not enough iterations done!");
+}
+
+// Test ISO and UTF encodings through both convert() interfaces (facet arguments and character-type arguments)
+BOOST_AUTO_TEST_CASE(convert)
+{
+ BOOST_CHECK((std::string{unicode::convert<unicode::ISO_8859_1,unicode::ISO_8859_1>({})}) == std::string{}); // empty input gives empty output
+ BOOST_CHECK((std::string{unicode::convert<unicode::ISO_8859_1,unicode::ISO_8859_1>("abc")}) == std::string{"abc"});
+ BOOST_CHECK((std::string{unicode::convert<unicode::ISO_8859_1,unicode::ISO_8859_1>("äöü")}) == std::string{"äöü"}); // same-charset conversion must return the bytes unchanged
+ BOOST_CHECK((std::string{unicode::convert<unicode::ISO_8859_1,unicode::ISO_8859_1>("\xa4")}) == std::string{"\xa4"}); // byte 0xA4 is ¤ in ISO-8859-1
+
+ BOOST_CHECK((std::string{unicode::convert<unicode::ISO_8859_15,unicode::ISO_8859_15>("\xa4")}) == std::string{"\xa4"}); // byte 0xA4 is € in ISO-8859-15
+
+ BOOST_CHECK_THROW(((void)std::string{unicode::convert<unicode::ISO_8859_15,unicode::ISO_8859_1>("\xa4")}), std::invalid_argument); // € not available in ISO-8859-1
+
+ BOOST_CHECK((unicode::convert<unicode::UTF_8,unicode::UTF_16>("abc")) == std::u16string{u"abc"}); // facet interface
+ BOOST_CHECK((unicode::convert<unicode::UTF_32,unicode::UTF_16>(U"abc")) == std::u16string{u"abc"});
+
+ BOOST_CHECK((unicode::convert<utf8_t,char16_t>("abc")) == std::u16string{u"abc"}); // character-type interface
+ BOOST_CHECK((unicode::convert<char32_t,char16_t>(U"abc")) == std::u16string{u"abc"});
}
// TODO:
diff --git a/src/validate.cpp b/src/validate.cpp
new file mode 100644
index 0000000..8927fe4
--- /dev/null
+++ b/src/validate.cpp
@@ -0,0 +1,4 @@
+int main(int argc, char* argv[]) // placeholder entry point: argc and argv are not used
+{
+ return 0; // always exits with status 0; no validation is performed
+}