Implemented recode and validate tools

author: Roland Reichwein <mail@reichwein.it> 2021-02-01 16:45:18 +0100
committer: Roland Reichwein <mail@reichwein.it> 2021-02-01 16:45:18 +0100
commit: f34a0aa3a2d46d349a41c0b28939176791c2efbe (patch)
tree: 663e5d5fd02cbb9b8f44cc502083f85b5b4d5c17
parent: 611601ec36a5603bc9c94cdac9a307c4bb07c929 (diff)
7 files changed, 385 insertions, 5 deletions
diff --git a/Makefile b/Makefile
index 5d64631..191e8b0 100644
--- a/Makefile
+++ b/Makefile
@@ -59,6 +59,7 @@ LIBS+= \
 endif
 
 SRC=\
+    src/file.cpp \
     src/recode.cpp \
     src/validate.cpp \
     src/test-unicode.cpp
@@ -68,13 +69,13 @@ all: src/recode src/test-unicode src/validate
 test: src/test-unicode
 	src/test-unicode
 
-src/recode: src/recode.o dep
-	$(CXX) $(LDFLAGS) $< $(LDLIBS) $(LIBS) -o $@
+src/recode: src/recode.o src/file.o dep
+	$(CXX) $(LDFLAGS) src/recode.o src/file.o $(LDLIBS) $(LIBS) -o $@
 
-src/validate: src/validate.o dep
-	$(CXX) $(LDFLAGS) $< $(LDLIBS) $(LIBS) -o $@
+src/validate: src/validate.o src/file.o dep
+	$(CXX) $(LDFLAGS) src/validate.o src/file.o $(LDLIBS) $(LIBS) -o $@
 
-src/test-unicode: src/test-unicode.o dep
+src/test-unicode: src/test-unicode.o src/file.o dep
 	$(CXX) $(LDFLAGS) $< $(LDLIBS) $(LIBS) -o $@
 
 dep: $(SRC:.cpp=.d)
@@ -93,6 +94,9 @@ clean:
 install:
 	mkdir -p $(DESTDIR)/usr/include
 	cp include/unicode.h $(DESTDIR)/usr/include
+	mkdir -p $(DESTDIR)/usr/bin
+	cp src/recode $(DESTDIR)/usr/bin/unicode-recode
+	cp src/validate $(DESTDIR)/usr/bin/unicode-validate
 
 deb:
 	# build binary deb package
diff --git a/debian/libunicode-dev.install b/debian/libunicode-dev.install
new file mode 100644
index 0000000..92c3336
--- /dev/null
+++ b/debian/libunicode-dev.install
@@ -0,0 +1 @@
+usr/include/unicode.h
diff --git a/debian/unicode-tools.install b/debian/unicode-tools.install
new file mode 100644
index 0000000..11d89bb
--- /dev/null
+++ b/debian/unicode-tools.install
@@ -0,0 +1,2 @@
+usr/bin/unicode-recode
+usr/bin/unicode-validate
diff --git a/src/file.cpp b/src/file.cpp
new file mode 100644
index 0000000..571a9f8
--- /dev/null
+++ b/src/file.cpp
@@ -0,0 +1,36 @@
+#include "file.h"
+
+#include <fstream>
+
+namespace fs = std::filesystem;
+
+using namespace std::string_literals;
+
+std::string unicode::File::getFile(const fs::path& filename) // reads the whole file as raw bytes; throws std::runtime_error on open, size or read failure
+{
+ std::ifstream file(filename.string(), std::ios::in | std::ios::binary | std::ios::ate); // ate: positioned at the end, so tellg() yields the size
+
+ if (file.is_open()) {
+  std::ifstream::pos_type fileSize { file.tellg() };
+  if (fileSize == std::ifstream::pos_type(-1)) throw std::runtime_error("Getting size of "s + filename.string()); // tellg() signals failure with -1
+
+  std::string bytes(fileSize, '\0');
+  file.seekg(0, std::ios::beg).read(bytes.data(), fileSize);
+  if (!file) throw std::runtime_error("Reading "s + filename.string()); // a short or failed read must not return NUL padding
+  return bytes;
+
+ } else {
+  throw std::runtime_error("Opening "s + filename.string() + " for reading");
+ }
+}
+
+void unicode::File::setFile(const fs::path& filename, const std::string& s) // writes s as the file's complete content; throws std::runtime_error on failure
+{
+ std::ofstream file(filename.string(), std::ios::out | std::ios::binary);
+ if (!file.is_open())
+  throw std::runtime_error("Opening "s + filename.string() + " for writing");
+
+ if (!file.write(s.data(), s.size()).flush()) // flush here so a failed write (e.g. disk full) is reported instead of being lost in the destructor
+  throw std::runtime_error("Writing "s + filename.string());
+}
+
diff --git a/src/file.h b/src/file.h
new file mode 100644
index 0000000..d2e396b
--- /dev/null
+++ b/src/file.h
@@ -0,0 +1,13 @@
+#pragma once
+
+#include <cstdint>
+#include <filesystem>
+#include <string>
+#include <vector>
+
+namespace unicode::File { // whole-file helpers used by the recode and validate tools
+
+ std::string getFile(const std::filesystem::path& filename); // returns the file's bytes; throws std::runtime_error if it cannot be opened
+ void setFile(const std::filesystem::path& filename, const std::string& s); // writes s to the file; throws std::runtime_error if it cannot be opened
+
+}
diff --git a/src/recode.cpp b/src/recode.cpp
index 8927fe4..b8ada69 100644
--- a/src/recode.cpp
+++ b/src/recode.cpp
@@ -1,4 +1,177 @@
+#include "file.h"
+
+#include "unicode.h"
+
+#include <boost/algorithm/string/predicate.hpp>
+#include <boost/endian.hpp>
+
+#include <filesystem>
+#include <functional>
+#include <iostream>
+#include <string>
+#include <tuple>
+#include <unordered_map>
+
+namespace fs = std::filesystem;
+
+using namespace std::string_literals;
+
+namespace {
+
+void usage() // prints the command-line help for the recode tool to stdout
+{
+ std::cout << "Usage: recode <from-format> <from-file> <to-format> <to-file>" << std::endl;
+ std::cout << "Format:" << std::endl;
+ std::cout << "    UTF-8       UTF-8" << std::endl;
+ std::cout << "    UTF-16      UTF-16, native endian" << std::endl;
+ std::cout << "    UTF-16LE    UTF-16, little endian" << std::endl;
+ std::cout << "    UTF-16BE    UTF-16, big endian" << std::endl;
+ std::cout << "    UTF-32      UTF-32, native endian" << std::endl;
+ std::cout << "    UTF-32LE    UTF-32, little endian" << std::endl;
+ std::cout << "    UTF-32BE    UTF-32, big endian" << std::endl;
+ std::cout << "    ISO-8859-1  ISO-8859-1 (Latin-1)" << std::endl;
+ std::cout << "    ISO-8859-15 ISO-8859-15 (Latin-9)" << std::endl;
+ std::cout << "Exit code: 0 on success, 1 on error." << std::endl; // recode converts rather than validates, so "valid" would be misleading
+}
+
+std::unordered_map<std::string, std::string> typeid_name_map // command-line format name -> typeid name of the matching unicode encoding type
+{
+ { "UTF-8", typeid(unicode::UTF_8).name() },
+ { "UTF-16", typeid(unicode::UTF_16).name() },
+ { "UTF-16LE", typeid(unicode::UTF_16).name() }, // LE/BE names share the UTF_16 type; main() derives the byte swapping from the name suffix
+ { "UTF-16BE", typeid(unicode::UTF_16).name() },
+ { "UTF-32", typeid(unicode::UTF_32).name() },
+ { "UTF-32LE", typeid(unicode::UTF_32).name() }, // likewise for UTF_32
+ { "UTF-32BE", typeid(unicode::UTF_32).name() },
+ { "ISO-8859-1", typeid(unicode::ISO_8859_1).name() },
+ { "ISO-8859-15", typeid(unicode::ISO_8859_15).name() },
+};
+
+std::string get_id(const std::string& from, const std::string& to) // builds the convert_map key "<from>,<to>" from two typeid names
+{
+ return from + "," + to;
+}
+
+template<typename From, typename To>
+std::string get_id() // convert_map key for the encoding pair From -> To, built from their typeid names
+{
+ return get_id(std::string{typeid(From).name()}, typeid(To).name());
+}
+
+template<typename T>
+void reverse_endian(std::basic_string<T>& s) // reverses the byte order of every element of s in place
+{
+ std::for_each(s.begin(), s.end(), [](T& c){boost::endian::endian_reverse_inplace(c);});
+}
+
+std::unordered_map<std::string, std::function<std::string(const std::string&, bool, bool)>> convert_map {}; // key: get_id() of an encoding pair; value: converter (input bytes, swap_from_endian, swap_to_endian) -> output bytes
+
+template<typename From, typename To>
+void register_convert() // stores the byte-level converter From -> To in convert_map under get_id<From, To>()
+{
+ std::string id{ get_id<From, To>() };
+
+ std::function<std::string(const std::string&, bool, bool)> f([](const std::string& s, bool swap_from_endian, bool swap_to_endian) -> std::string
+  {
+   if (s.size() % sizeof(typename From::value_type) != 0) // input must consist of whole code units
+    throw std::invalid_argument("Bad number of input bytes. Need multiple of "s + std::to_string(sizeof(typename From::value_type)) + ", got " + std::to_string(s.size()));
+
+   std::basic_string<typename From::value_type> from_data(s.size() / sizeof(typename From::value_type), static_cast<typename From::value_type>(0)); // length in code units, not bytes: sizing by bytes appended NUL code units for UTF-16/UTF-32
+
+   std::memcpy(from_data.data(), s.data(), s.size());
+
+   if (swap_from_endian) { // bring the input into native byte order before converting
+    reverse_endian(from_data);
+   }
+
+   std::basic_string<typename To::value_type> to_data {unicode::convert<From, To>(from_data)};
+
+   if (swap_to_endian) { // bring the output from native into the requested byte order
+    reverse_endian(to_data);
+   }
+
+   std::string result(to_data.size() * sizeof(typename To::value_type), '\0');
+
+   std::memcpy(result.data(), to_data.data(), to_data.size() * sizeof(typename To::value_type));
+
+   return result;
+  });
+
+ convert_map[id] = f;
+}
+
+template<int N, typename... Ts> using NthTypeOf = // the N-th (0-based) type of the parameter pack Ts
+ typename std::tuple_element<N, std::tuple<Ts...>>::type;
+
+template<size_t i, size_t j, typename ...Ts>
+void iterate_over() // registers the converter for index pair (i, j) of Ts, then recurses over the remaining pairs
+{
+ register_convert<NthTypeOf<i,Ts...>, NthTypeOf<j,Ts...>>(); // From = i-th type, To = j-th type
+
+ if constexpr (i + 1 < sizeof...(Ts)) {
+  iterate_over<i + 1, j, Ts...>(); // next source type, same target type
+ } else if constexpr (j + 1 < sizeof...(Ts)) {
+  iterate_over<0, j + 1, Ts...>(); // sources exhausted: restart at source 0 with the next target type
+ }
+}
+
+template<typename...Ts>
+void build_map() // fills convert_map with a converter for every ordered pair of types in Ts, identical pairs included
+{
+ iterate_over<0, 0, Ts...>();
+}
+
+}
+
 int main(int argc, char* argv[])
 {
+ if (argc != 5) { // expects exactly: <from-format> <from-file> <to-format> <to-file>
+  usage();
+  return 1;
+ }
+
+ try {
+  build_map<unicode::UTF_8, unicode::UTF_16, unicode::UTF_32, unicode::ISO_8859_1, unicode::ISO_8859_15>();
+
+  std::string from_format {argv[1]};
+  fs::path from_path {argv[2]};
+  std::string to_format {argv[3]};
+  fs::path to_path {argv[4]};
+
+  std::string data{unicode::File::getFile(from_path)};
+
+  auto it_from{typeid_name_map.find(from_format)};
+  if (it_from == typeid_name_map.end())
+   throw std::invalid_argument("Bad input format: "s + from_format);
+
+  auto it_to{typeid_name_map.find(to_format)};
+  if (it_to == typeid_name_map.end())
+   throw std::invalid_argument("Bad output format: "s + to_format);
+
+  std::string id{get_id(it_from->second, it_to->second)};
+
+  // No debug print of id here: stdout stays clean, and the implementation-defined
+  // typeid names it contains are only shown in the error message below.
+  auto it { convert_map.find(id) };
+  if (it == convert_map.end()) {
+   std::cerr << "Error: Conversion ID " << id << " not supported." << std::endl;
+   return 1;
+  }
+
+  // Swap bytes only when an explicit LE/BE format differs from the host byte order; formats without suffix are native.
+  bool swap_from_endian{(boost::algorithm::ends_with(from_format, "LE") && boost::endian::order::native != boost::endian::order::little) ||
+                        (boost::algorithm::ends_with(from_format, "BE") && boost::endian::order::native != boost::endian::order::big)};
+  bool swap_to_endian{(boost::algorithm::ends_with(to_format, "LE") && boost::endian::order::native != boost::endian::order::little) ||
+                      (boost::algorithm::ends_with(to_format, "BE") && boost::endian::order::native != boost::endian::order::big)};
+
+  // actual conversion
+  std::string to_data{it->second(data, swap_from_endian, swap_to_endian)};
+
+  unicode::File::setFile(to_path, to_data);
+
+ } catch (const std::exception& ex) { // file errors, unknown formats and bad input sizes all end up here
+  std::cerr << "Error: " << ex.what() << std::endl;
+  return 1;
+ }
  return 0;
 }
+
diff --git a/src/validate.cpp b/src/validate.cpp
index 8927fe4..78e6175 100644
--- a/src/validate.cpp
+++ b/src/validate.cpp
@@ -1,4 +1,155 @@
+#include "file.h"
+
+#include "unicode.h"
+
+#include <boost/endian.hpp>
+
+#include <filesystem>
+#include <functional>
+#include <iostream>
+#include <string>
+#include <unordered_map>
+
+namespace fs = std::filesystem;
+
+namespace {
+
+void usage() // prints the command-line help for the validate tool to stdout
+{
+ std::cout << "Usage: validate <format> <file>" << std::endl;
+ std::cout << "Format:" << std::endl;
+ std::cout << "    UTF-8     UTF-8" << std::endl;
+ std::cout << "    UTF-16    UTF-16, big or little endian" << std::endl;
+ std::cout << "    UTF-16LE  UTF-16, little endian" << std::endl;
+ std::cout << "    UTF-16BE  UTF-16, big endian" << std::endl;
+ std::cout << "    UTF-32    UTF-32, big or little endian" << std::endl;
+ std::cout << "    UTF-32LE  UTF-32, little endian" << std::endl;
+ std::cout << "    UTF-32BE  UTF-32, big endian" << std::endl;
+ std::cout << "Exit code: 0 if valid, 1 otherwise." << std::endl;
+}
+
+std::unordered_map<std::string, std::function<bool(const std::string&)>> validate_map // format name -> validator: takes the raw file bytes, returns true if they are valid in that format
+{
+ { "UTF-8", [](const std::string& s) -> bool { return unicode::is_valid_utf(s); }},
+ { "UTF-16", [](const std::string& s) -> bool // accepts the data if it is valid in either byte order
+  {
+   if (s.size() & 1) // need even number of bytes
+    return false;
+
+   std::u16string data(s.size() / 2, u'\0');
+   std::memcpy(data.data(), s.data(), s.size()); // take the bytes as code units in host byte order
+
+   if (unicode::is_valid_utf(data))
+    return true;
+
+   // maybe reverse endianness
+   std::for_each(data.begin(), data.end(), [](char16_t& c){boost::endian::endian_reverse_inplace(c);});
+
+   return unicode::is_valid_utf(data);
+  }
+ },
+ { "UTF-16LE", [](const std::string& s) -> bool
+  {
+   if (s.size() & 1) // need even number of bytes
+    return false;
+
+   std::u16string data(s.size() / 2, u'\0');
+   std::memcpy(data.data(), s.data(), s.size());
+
+   if (boost::endian::order::native != boost::endian::order::little) // non-little-endian host: convert the little-endian input to host order
+    std::for_each(data.begin(), data.end(), [](char16_t& c){boost::endian::native_to_little_inplace(c);});
+
+   return unicode::is_valid_utf(data);
+  }
+ },
+ { "UTF-16BE", [](const std::string& s) -> bool
+  {
+   if (s.size() & 1) // need even number of bytes
+    return false;
+
+   std::u16string data(s.size() / 2, u'\0');
+   std::memcpy(data.data(), s.data(), s.size());
+
+   if (boost::endian::order::native != boost::endian::order::big) // non-big-endian host: convert the big-endian input to host order
+    std::for_each(data.begin(), data.end(), [](char16_t& c){boost::endian::native_to_big_inplace(c);});
+
+   return unicode::is_valid_utf(data);
+  }
+ },
+ { "UTF-32", [](const std::string& s) -> bool // accepts the data if it is valid in either byte order
+  {
+   if (s.size() & 3) // need multiple of 4 bytes
+    return false;
+
+   std::u32string data(s.size() / 4, U'\0');
+   std::memcpy(data.data(), s.data(), s.size()); // take the bytes as code units in host byte order
+
+   if (unicode::is_valid_utf(data))
+    return true;
+
+   // maybe reverse endianness
+   std::for_each(data.begin(), data.end(), [](char32_t& c){boost::endian::endian_reverse_inplace(c);});
+
+   return unicode::is_valid_utf(data);
+  }
+ },
+ { "UTF-32LE", [](const std::string& s) -> bool
+  {
+   if (s.size() & 3) // need multiple of 4 bytes
+    return false;
+
+   std::u32string data(s.size() / 4, U'\0');
+   std::memcpy(data.data(), s.data(), s.size());
+
+   if (boost::endian::order::native != boost::endian::order::little) // non-little-endian host: convert the little-endian input to host order
+    std::for_each(data.begin(), data.end(), [](char32_t& c){boost::endian::native_to_little_inplace(c);});
+
+   return unicode::is_valid_utf(data);
+  }
+ },
+ { "UTF-32BE", [](const std::string& s) -> bool
+  {
+   if (s.size() & 3) // need multiple of 4 bytes
+    return false;
+
+   std::u32string data(s.size() / 4, U'\0');
+   std::memcpy(data.data(), s.data(), s.size());
+
+   if (boost::endian::order::native != boost::endian::order::big) // non-big-endian host: convert the big-endian input to host order
+    std::for_each(data.begin(), data.end(), [](char32_t& c){boost::endian::native_to_big_inplace(c);});
+
+   return unicode::is_valid_utf(data);
+  }
+ },
+};
+
+}
+
 int main(int argc, char* argv[])
 {
+ if (argc != 3) { // expects exactly: <format> <file>
+  usage();
+  return 1;
+ }
+
+ try {
+  std::string format {argv[1]};
+  fs::path path {argv[2]};
+
+  std::string data{unicode::File::getFile(path)}; // whole file as raw bytes
+
+  auto it { validate_map.find(format) };
+  if (it == validate_map.end()) {
+   std::cerr << "Error: Encoding " << format << " not supported." << std::endl;
+   return 1;
+  }
+
+  return it->second(data) ? 0 : 1; // exit code 0 = valid, 1 = invalid
+
+ } catch (const std::exception& ex) { // e.g. the input file could not be opened
+  std::cerr << "Error: " << ex.what() << std::endl;
+  return 1;
+ }
  return 0;
 }
+
author	Roland Reichwein <mail@reichwein.it>	2021-02-01 16:45:18 +0100
committer	Roland Reichwein <mail@reichwein.it>	2021-02-01 16:45:18 +0100
commit	f34a0aa3a2d46d349a41c0b28939176791c2efbe (patch)
tree	663e5d5fd02cbb9b8f44cc502083f85b5b4d5c17
parent	611601ec36a5603bc9c94cdac9a307c4bb07c929 (diff)