summaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authorRoland Reichwein <mail@reichwein.it>2021-02-05 14:10:53 +0100
committerRoland Reichwein <mail@reichwein.it>2021-02-05 14:10:53 +0100
commit3d7a431811748c5aa1f49c35436696fc3f05de5d (patch)
tree252cb1896ebd994ee6c4f7b09b0927bd7fa709f3
parent6a12dddc641be34b323835a495399715790811e0 (diff)
Documentation, support validation via Traits
-rw-r--r--debian/README.Debian75
-rw-r--r--include/unicode.h13
-rw-r--r--src/test-unicode.cpp16
3 files changed, 102 insertions, 2 deletions
diff --git a/debian/README.Debian b/debian/README.Debian
index 162e3f0..382d20d 100644
--- a/debian/README.Debian
+++ b/debian/README.Debian
@@ -4,6 +4,81 @@ unicode for Debian
This package is the Debian version of unicode, a C++ library for Unicode encoding.
+CLI interface (package unicode-tools)
+-------------------------------------
+
+* unicode-recode
+
+ Usage: recode <from-format> <from-file> <to-format> <to-file>
+ Format:
+ UTF-8 UTF-8
+ UTF-16 UTF-16, native endian
+ UTF-16LE UTF-16, little endian
+ UTF-16BE UTF-16, big endian
+ UTF-32 UTF-32, native endian
+ UTF-32LE UTF-32, little endian
+ UTF-32BE UTF-32, big endian
+ ISO-8859-1 ISO-8859-1 (Latin-1)
+ ISO-8859-15 ISO-8859-15 (Latin-9)
+ Exit code: 0 if valid, 1 otherwise.
+
+* unicode-validate
+
+ Usage: validate <format> <file>
+ Format:
+ UTF-8 UTF-8
+ UTF-16 UTF-16, big or little endian
+ UTF-16LE UTF-16, little endian
+ UTF-16BE UTF-16, big endian
+ UTF-32 UTF-32, big or little endian
+ UTF-32LE UTF-32, little endian
+ UTF-32BE UTF-32, big endian
+ Exit code: 0 if valid, 1 otherwise.
+
+
+C++ interface (package libunicode-dev)
+--------------------------------------
+
+Example:
+
+#include <unicode.h>
+...
+
+ std::string utf8_value {u8"äöü"};
+ std::u16string utf16_value{unicode::convert<char, char16_t>(utf8_value)};
+
+And for C++20:
+
+ std::u8string utf8_value {u8"äöü"};
+ std::u16string utf16_value{unicode::convert<char8_t, char16_t>(utf8_value)};
+
+The following encodings are implicitly deduced from types:
+ * char or char8_t (C++20): UTF-8
+ * char16_t: UTF-16
+ * char32_t: UTF-32
+
+Explicit encoding specification is also possible:
+
+ std::string value {"äöü"};
+ std::u32string utf32_value{unicode::convert<unicode::ISO_8859_1, unicode::UTF_32>(value)};
+
+Supported encodings are:
+
+ * unicode::UTF_8
+ * unicode::UTF_16
+ * unicode::UTF_32
+ * unicode::ISO_8859_1
+ * unicode::ISO_8859_15
+
+Validation can be done like this:
+
+ bool valid{unicode::is_valid_utf<char16_t>(utf16_value)};
+
+Or via explicit encoding specification:
+
+ bool valid{unicode::is_valid_utf<unicode::UTF_8>(utf8_value)};
+
+
Contact
-------
diff --git a/include/unicode.h b/include/unicode.h
index df61ac3..2424fb1 100644
--- a/include/unicode.h
+++ b/include/unicode.h
@@ -562,6 +562,7 @@ namespace unicode {
return result;
}
+ // basic type version
template<typename T>
bool is_valid_utf(const std::basic_string<T>& s)
{
@@ -573,5 +574,17 @@ namespace unicode {
return true;
}
+ // Facet version: element type comes from Facet::value_type; returns true unless walking Facet::begin(s)..Facet::end(s) throws std::invalid_argument
+ template<typename Facet>
+ bool is_valid_utf(const std::basic_string<typename Facet::value_type>& s)
+ {
+ try {
+ std::for_each(Facet::begin(s), Facet::end(s), [](const char32_t& c){}); // visit every element as char32_t and ignore it; only a thrown exception matters
+ } catch (const std::invalid_argument&) { // std::invalid_argument during the walk is reported as "not valid"
+ return false;
+ }
+ return true; // the whole range was traversed without std::invalid_argument
+ }
+
} // namespace unicode
diff --git a/src/test-unicode.cpp b/src/test-unicode.cpp
index 5529d2c..692dfac 100644
--- a/src/test-unicode.cpp
+++ b/src/test-unicode.cpp
@@ -130,11 +130,14 @@ void test_is_valid_utf(std::tuple<Ts...>& t)
{
typedef typename std::tuple_element<i,typename std::remove_reference<decltype(t)>::type>::type T;
- // test
+ // test via basic type
bool result { unicode::is_valid_utf<typename T::value_type>(std::get<i>(t)) };
-
BOOST_CHECK_MESSAGE(result == true, "is_valid_utf w/ " << typeid(T).name() << "(" << i << ", " << std::get<i>(t) << "), got " << result);
+ // test via Facet
+ result = unicode::is_valid_utf<typename unicode::Encoding<typename T::value_type>::Facet>(std::get<i>(t));
+ BOOST_CHECK_MESSAGE(result == true, "is_valid_utf w/ " << typeid(typename unicode::Encoding<typename T::value_type>::Facet).name() << "(" << i << ", " << std::get<i>(t) << "), got " << result);
+
// iterate over other combinations
if constexpr (i + 1 < std::tuple_size<typename std::remove_reference<decltype(t)>::type>::value)
test_is_valid_utf<i + 1>(t);
@@ -194,6 +197,8 @@ template<typename T, typename Collection, size_t index = 0>
void test_is_valid_utf_failure(std::basic_string<T>& s)
{
BOOST_CHECK_MESSAGE(unicode::is_valid_utf<T>(s) == false, "Expected bad UTF at index: " << index << ", " << typeid(T).name());
+
+ BOOST_CHECK_MESSAGE(unicode::is_valid_utf<typename unicode::Encoding<T>::Facet>(s) == false, "Expected bad UTF at index: " << index << ", " << typeid(typename unicode::Encoding<T>::Facet).name());
// iterate over remaining types
if constexpr (index + 1 < std::tuple_size<Collection>::value)
@@ -331,6 +336,13 @@ BOOST_AUTO_TEST_CASE(convert)
BOOST_CHECK((unicode::convert<char32_t,char16_t>(U"abc")) == std::u16string{u"abc"});
}
+BOOST_AUTO_TEST_CASE(is_valid_utf) // checks both call forms of unicode::is_valid_utf on a string literal
+{
+ BOOST_CHECK(unicode::is_valid_utf<char16_t>(u"äöü")); // call form parameterized by the character type
+
+ BOOST_CHECK(unicode::is_valid_utf<unicode::UTF_8>(u8"äöü")); // call form parameterized by an explicit encoding (unicode::UTF_8)
+}
+
BOOST_AUTO_TEST_CASE(string_u8string)
{
std::string a{"\xc3\xa4"};