summaryrefslogtreecommitdiffhomepage
path: root/src/validate.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/validate.cpp')
-rw-r--r--src/validate.cpp151
1 files changed, 151 insertions, 0 deletions
diff --git a/src/validate.cpp b/src/validate.cpp
index 8927fe4..78e6175 100644
--- a/src/validate.cpp
+++ b/src/validate.cpp
@@ -1,4 +1,155 @@
+#include "file.h"
+
+#include "unicode.h"
+
+#include <boost/endian.hpp>
+
+#include <algorithm>
+#include <cstring>
+#include <filesystem>
+#include <functional>
+#include <iostream>
+#include <string>
+#include <unordered_map>
+
+namespace fs = std::filesystem;
+
+namespace {
+
+// Prints the command-line synopsis, the list of supported format names and
+// the meaning of the exit code to standard output.
+void usage()
+{
+    // The help text is written as one concatenated literal and flushed once
+    // by the trailing std::endl, rather than flushing after every line.
+    std::cout << "Usage: validate <format> <file>\n"
+                 "Format:\n"
+                 " UTF-8 UTF-8\n"
+                 " UTF-16 UTF-16, big or little endian\n"
+                 " UTF-16LE UTF-16, little endian\n"
+                 " UTF-16BE UTF-16, big endian\n"
+                 " UTF-32 UTF-32, big or little endian\n"
+                 " UTF-32LE UTF-32, little endian\n"
+                 " UTF-32BE UTF-32, big endian\n"
+                 "Exit code: 0 if valid, 1 otherwise." << std::endl;
+}
+
+// Copies the raw bytes of `bytes` verbatim (no byte swapping) into `units`,
+// interpreting them as code units of type String::value_type.
+// Returns false and leaves `units` unchanged if the number of bytes is not a
+// multiple of the code unit size (2 for std::u16string, 4 for std::u32string).
+template <typename String>
+bool load_code_units(const std::string& bytes, String& units)
+{
+    using Unit = typename String::value_type;
+
+    if (bytes.size() % sizeof(Unit) != 0)
+        return false;
+
+    units.assign(bytes.size() / sizeof(Unit), Unit{});
+    std::memcpy(units.data(), bytes.data(), bytes.size());
+    return true;
+}
+
+// Reverses the byte order of every code unit in `units`, in place.
+template <typename String>
+void swap_byte_order(String& units)
+{
+    for (auto& unit : units)
+        boost::endian::endian_reverse_inplace(unit);
+}
+
+// Validates `bytes` as a sequence of String code units whose byte order is
+// not known: the data is accepted if it validates either as loaded (host
+// byte order) or after reversing the byte order of every code unit.
+template <typename String>
+bool is_valid_any_endian(const std::string& bytes)
+{
+    String units;
+    if (!load_code_units(bytes, units))
+        return false;
+
+    if (unicode::is_valid_utf(units))
+        return true;
+
+    // maybe reverse endianness
+    swap_byte_order(units);
+    return unicode::is_valid_utf(units);
+}
+
+// Validates `bytes` as a sequence of String code units stored in the byte
+// order `FileOrder`.
+template <typename String, boost::endian::order FileOrder>
+bool is_valid_fixed_endian(const std::string& bytes)
+{
+    String units;
+    if (!load_code_units(bytes, units))
+        return false;
+
+    // The code units were copied in file order; convert them to the host's
+    // order only when the two differ.
+    if constexpr (FileOrder != boost::endian::order::native)
+        swap_byte_order(units);
+
+    return unicode::is_valid_utf(units);
+}
+
+// Maps each format name accepted on the command line to a predicate that
+// takes the complete file contents as raw bytes and returns whether they are
+// valid in that encoding.
+const std::unordered_map<std::string, std::function<bool(const std::string&)>> validate_map
+{
+    { "UTF-8",    [](const std::string& s) -> bool { return unicode::is_valid_utf(s); } },
+    { "UTF-16",   &is_valid_any_endian<std::u16string> },
+    { "UTF-16LE", &is_valid_fixed_endian<std::u16string, boost::endian::order::little> },
+    { "UTF-16BE", &is_valid_fixed_endian<std::u16string, boost::endian::order::big> },
+    { "UTF-32",   &is_valid_any_endian<std::u32string> },
+    { "UTF-32LE", &is_valid_fixed_endian<std::u32string, boost::endian::order::little> },
+    { "UTF-32BE", &is_valid_fixed_endian<std::u32string, boost::endian::order::big> },
+};
+
+}
+
+// Entry point: validate <format> <file>.
+// Returns 0 if the file contents are valid in the requested encoding and 1
+// otherwise (wrong argument count, unsupported format, exception while
+// reading the file, or invalid contents).
int main(int argc, char* argv[])
{
+    // Exactly two arguments are expected: <format> and <file>.
+    if (argc != 3) {
+        usage();
+        return 1;
+    }
+
+    try {
+        const std::string format {argv[1]};
+
+        // Resolve the format before touching the file, so an unsupported
+        // encoding is reported without first reading the whole input.
+        const auto it { validate_map.find(format) };
+        if (it == validate_map.end()) {
+            std::cerr << "Error: Encoding " << format << " not supported." << std::endl;
+            return 1;
+        }
+
+        fs::path path {argv[2]};
+        std::string data {unicode::File::getFile(path)};
+
+        return it->second(data) ? 0 : 1;
+
+    } catch (const std::exception& ex) {
+        std::cerr << "Error: " << ex.what() << std::endl;
+        return 1;
+    }
+    // Not reached: every path through the try/catch above returns.
return 0;
}
+