diff options
| -rw-r--r-- | Makefile | 2 | ||||
| -rw-r--r-- | coff.cpp | 693 | ||||
| -rw-r--r-- | coff.h | 8 | ||||
| -rw-r--r-- | intel.cpp | 503 | 
4 files changed, 1206 insertions, 0 deletions
| @@ -48,10 +48,12 @@ PROGSRC=\      bnf.cpp \      cpp.cpp \      cppbnf.cpp \ +    coff.cpp \      debug.cpp \      elf.cpp \      file.cpp \      grammer.cpp \ +    intel.cpp \      lexer.cpp \      minicc.cpp \ diff --git a/coff.cpp b/coff.cpp new file mode 100644 index 0000000..f2a5aa8 --- /dev/null +++ b/coff.cpp @@ -0,0 +1,693 @@ +#include "coff.h" + +#include <boost/algorithm/string/predicate.hpp> +#include <boost/endian/conversion.hpp> + +#include <algorithm> +#include <cstddef> +#include <cstdint> +#include <cstring> +#include <filesystem> +#include <fstream> +#include <iostream> +#include <sstream> +#include <stdexcept> +#include <string> +#include <vector> + +namespace fs = std::filesystem; + +using namespace std::string_literals; + +namespace { + +#pragma pack(push) +#pragma pack(1) +struct MSDOSStub +{ + uint8_t padding[0x3c]; + uint32_t PESignatureOffset; +}; + +struct PESignature +{ + uint8_t bytes[4]{}; // "PE\0\0" +}; + +struct COFFHeader +{ + uint16_t Machine{}; + uint16_t NumberOfSections{}; + uint32_t TimeDateStamp{}; + uint32_t PointerToSymbolTable{}; + uint32_t NumberOfSymbols{}; + uint16_t SizeOfOptionalHeader{}; + uint16_t Characteristics{}; +}; + +// COFFHeader.Machine: +const uint16_t IMAGE_FILE_MACHINE_UNKNOWN = 0; +const uint16_t IMAGE_FILE_MACHINE_AMD64   = 0x8664; + +// COFFHeader.Characteristics: +const uint16_t IMAGE_FILE_EXECUTABLE_IMAGE    = 0x002; +const uint16_t IMAGE_FILE_LARGE_ADDRESS_AWARE = 0x020; + +// COFFOptionalHeader_Windows.SubSystem +const uint16_t IMAGE_SUBSYSTEM_WINDOWS_CUI = 3; + +struct COFFOptionalHeader +{ + uint16_t Magic{}; + uint8_t MajorLinkerVersion{}; + uint8_t MinorLinkerVersion{}; + uint32_t SizeOfCode{}; + uint32_t SizeOfInitializedData{}; + uint32_t SizeOfUninitializedData{}; + uint32_t AddressOfEntryPoint{}; + uint32_t BaseOfCode{}; + uint32_t BaseOfData{}; +}; + +// COFFOptionalHeader.Magic +const uint16_t MAGIC_PE32 = 0x010b; +const uint16_t MAGIC_PE32p = 0x020b; + +// SectionHeader.Characteristics +const uint32_t IMAGE_SCN_CNT_CODE             = 0x00000020; +const uint32_t IMAGE_SCN_CNT_INITIALIZED_DATA = 0x00000040; +const uint32_t IMAGE_SCN_MEM_EXECUTE          = 0x20000000; +const uint32_t IMAGE_SCN_MEM_READ             = 0x40000000; +const uint32_t IMAGE_SCN_MEM_WRITE            = 0x80000000; + +struct COFFOptionalHeader_PE32p +{ + uint16_t Magic{}; + uint8_t MajorLinkerVersion{}; + uint8_t MinorLinkerVersion{}; + uint32_t SizeOfCode{}; + uint32_t SizeOfInitializedData{}; + uint32_t SizeOfUninitializedData{}; + uint32_t AddressOfEntryPoint{}; + uint32_t BaseOfCode{}; +}; + +struct COFFOptionalHeader_Windows +{ + uint32_t ImageBase{}; + uint32_t SectionAlignment{}; + uint32_t FileAlignment{}; + uint16_t MajorOperatingSystemVersion{}; + uint16_t MinorOperatingSystemVersion{}; + uint16_t MajorImageVersion{}; + uint16_t MinorImageVersion{}; + uint16_t MajorSubsystemVersion{}; + uint16_t MinorSubsystemVersion{}; + uint32_t Win32VersionValue{}; // reserved, =0 + uint32_t SizeOfImage{}; + uint32_t SizeOfHeaders{}; + uint32_t CheckSum{}; + uint16_t Subsystem{}; + uint16_t DllCharacteristics{}; + uint32_t SizeOfStackReserve{}; + uint32_t SizeOfStackCommit{}; + uint32_t SizeOfHeapReserve{}; + uint32_t SizeOfHeapCommit{}; + uint32_t LoaderFlags{}; + uint32_t NumberOfRvaAndSizes{}; +}; + +struct COFFOptionalHeader_Windows_PE32p +{ + uint64_t ImageBase{}; + uint32_t SectionAlignment{}; + uint32_t FileAlignment{}; + uint16_t MajorOperatingSystemVersion{}; + uint16_t MinorOperatingSystemVersion{}; + uint16_t MajorImageVersion{}; + uint16_t MinorImageVersion{}; + uint16_t MajorSubsystemVersion{}; + uint16_t MinorSubsystemVersion{}; + uint32_t Win32VersionValue{}; // reserved, =0 + uint32_t SizeOfImage{}; + uint32_t SizeOfHeaders{}; + uint32_t CheckSum{}; + uint16_t Subsystem{}; + uint16_t DllCharacteristics{}; + uint64_t SizeOfStackReserve{}; + uint64_t SizeOfStackCommit{}; + uint64_t SizeOfHeapReserve{}; + uint64_t SizeOfHeapCommit{}; + uint32_t LoaderFlags{}; + uint32_t NumberOfRvaAndSizes{}; +}; + +// For each section: +struct SectionHeader +{ + uint8_t Name[8]{}; + uint32_t VirtualSize{}; + uint32_t VirtualAddress{}; + uint32_t SizeOfRawData{}; + uint32_t PointerToRawData{}; + uint32_t PointerToRelocations{}; + uint32_t PointerToLinenumbers{}; + uint16_t NumberOfRelocations{}; + uint16_t NumberOfLinenumbers{}; + uint32_t Characteristics{}; +}; + +struct COFFRelocation +{ + uint32_t VirtualAddress{}; + uint32_t SymbolTableIndex{}; + uint16_t Type{}; +}; + +struct COFFSymbolTableRecord +{ + uint64_t Name{}; // up-to-8-Byte String or COFFSymbolTableRecordName (if longer) + uint32_t Value{}; + uint16_t SectionNumber{}; + uint16_t Type{}; + uint8_t StorageClass{}; + uint8_t NumberOfAuxSymbols{}; +}; + +struct COFFSymbolTableRecordName +{ + uint32_t Zeroes{}; + uint32_t Offset{}; +}; + +struct LibSignature +{ + uint8_t bytes[8]{}; // "!<arch>\n" +}; + +struct LibHeader +{ + uint8_t Name[16]{}; + uint8_t Date[12]{}; + uint8_t UserID[6]{}; + uint8_t GroupID[6]{}; + uint8_t Mode[8]{}; + uint8_t Size[10]{}; // ASCII-decimal size of Member Body (size without LibHeader size) + uint8_t End[2]{}; // "~\n" + + std::string GetName() const + { +  std::string s{ (char*)&Name, sizeof(Name) }; + +  size_t pos = s.find("/"); +  if (pos == s.npos) +   throw std::runtime_error("LibHeader Name doesn't contain '/'"); + +  if (pos == 0) { +   if (s[1] == '/') { // "//" +    return "//"; // longnames header +   } else if (s[1] == ' ') { // "/ " +    return "/"; // linker members (#0 and #1) +   } else { +    pos = s.find(" "); // string is zero-padded. We return string without trailing zeros. +    return s.substr(0, pos); // "/<number-as-offset-into-longnames-member>" +   } +  } else { +   return s.substr(0, pos); // "name/" +  } + } + + size_t BodySize() const + { +  std::string s{(char*)&Size, sizeof(Size)}; +  size_t pos{s.find(" ")}; // remove trailing space-padding +  s = s.substr(0, pos); + +  try { +   return std::stoll(s); +  } +  catch (const std::exception &) { +   throw std::runtime_error("Bad size for LibHeader"); +  } + } +}; + +struct FirstLinkerMember { + uint32_t NumberOfSymbols; // big endian + // NumberOfSymbols x uint32_t Offsets; + // String Table +}; + +struct SecondLinkerMember { + uint32_t NumberOfMembers; + // NumberOfMembers x uint32_t Offsets; + // uint32_t NumberOfSymbols; + // NumberOfSymbols x uint16_t Indices; +}; + +// TODO: export table +// TODO: import table +// TODO: relocations table +// TODO: TLS table (thread local storage) +#pragma pack(pop) + + std::vector<uint8_t> getFile(const fs::path& filename) + { +  std::ifstream file(filename.string(), std::ios::in | std::ios::binary | std::ios::ate); + +  if (file.is_open()) { +   std::ifstream::pos_type fileSize = file.tellg(); +   file.seekg(0, std::ios::beg); + +   std::vector<uint8_t> bytes(fileSize, 0); +   file.read(reinterpret_cast<char*>(bytes.data()), fileSize); + +   return bytes; + +  } else { +   throw std::runtime_error("Opening "s + filename.string() + " for reading"); +  } + } + + uint32_t PE_addr(const std::vector<uint8_t>& data) + { +  if (data.size() >= 0x40) { +   size_t offset = *(reinterpret_cast<const uint32_t*>(data.data() + 0x3c)); +   if (data.size() >= offset + 4) { +    std::vector<uint8_t> ref{ 'P', 'E', '\0', '\0' }; +    auto [data_it, ref_it] { std::mismatch(data.begin() + offset, data.end(), ref.begin(), ref.end()) }; +    if (ref_it == ref.end()) +     return uint8_t(offset) + 4; +   } +  } +  return 0; + } + + bool isPE(const std::vector<uint8_t>& data) + { +  return PE_addr(data); + } +  + char to_hex_digit(uint8_t value) + { +  if (value < 10) +   return '0' + value; +  else +   return 'a' + value - 10; + } +  + template< typename T > + std::string to_hex(T i) + { +  std::stringstream stream; +  if (sizeof(T) == 1) +   stream << to_hex_digit(i >> 4) << to_hex_digit(i & 0xF); +  else +   stream +    << std::setfill('0') << std::setw(sizeof(T) * 2) +    << std::hex << i; +  return stream.str(); + } +  + template< typename T > + std::string to_0xhex(T i) + { +  std::stringstream stream; +  if (sizeof(T) == 1) +   stream << to_hex_digit(i >> 4) << to_hex_digit(i & 0xF); +  else +   stream << "0x" +    << std::setfill('0') << std::setw(sizeof(T) * 2) +    << std::hex << i; +  return stream.str(); + } +  + std::string to_string(const uint8_t(&name)[8]) { +  if (name[0] == '/') { +   // rest contains a decimal number ASCII coded as offset into string table +   throw std::runtime_error("Unimplemented /-based name, TODO!"); +  } + +  return std::string(reinterpret_cast<const char*>(name), 8); + } + + void DumpSection(const std::vector<uint8_t>& data, uint32_t Offset, uint32_t Size, uint32_t VirtualSize) + { +  if (data.size() < Offset + Size) +   throw std::runtime_error("Not enough raw data to dump, got "s + std::to_string(data.size()) + ", expected "s + std::to_string(Offset + Size)); +  // Size < VirtualSize: the rest is implicitly padded +  std::string printable; + +  for (uint32_t i = 0; i < VirtualSize; i++) { +   if (i % 16 == 0) { +    std::cout << "  " << printable << "\n    " << to_0xhex(i) << " "; +    printable = ""; +   } else if (i % 16 == 8) +    std::cout << " "; +   std::string value = (i < Size) ? to_hex(uint8_t(data[Offset + i])) : "oo"; +   int c = (i < Size) ? data[Offset + i] : 0; +   std::cout << " " << value; +   if (std::isprint(c)) +    printable.append(size_t(1), char(c)); +   else +    printable.append("."); +  } +  std::cout << (VirtualSize % 16 > 0 ? std::string(size_t(3 * (16 - VirtualSize % 16)), ' ') + (VirtualSize % 16 <= 8 ? " " : "") : "") << "  " << printable; + +  std::cout << "\n"; + } + + // PE = + //   MSDOSStub + //   PESignature + //   COFFHeader + //    + COFFOptionalHeader or COFFOptionalHeader_P32p + //    + COFFOptionalHeader_Windows or COFFOptionalHeader_Windows_PE32p + //    + N x DataDirectory + //   SectionHeader(s) +  + void DumpExe(const std::vector<uint8_t>& data) + { +  size_t offset{ PE_addr(data) }; + +  if (data.size() >= offset + sizeof(COFFHeader)) { +   std::cout << "COFF Image (EXE) found.\n" << std::endl; +   const COFFHeader& coffHeader{ *(reinterpret_cast<const COFFHeader*>(data.data() + offset)) }; + +   std::cout << "Machine: " << to_0xhex(coffHeader.Machine) << "\n"; +   if (coffHeader.Machine != IMAGE_FILE_MACHINE_AMD64) +    std::cout << "  Warning: Unsupported.\n"; +   std::cout << "NumberOfSections: " << coffHeader.NumberOfSections << "\n"; + +   if (coffHeader.SizeOfOptionalHeader == 0) +    std::cout << "Warning: SizeOfOptionalHeader is " << coffHeader.SizeOfOptionalHeader << ". Expected " << sizeof(COFFOptionalHeader) << ".\n"; + +   for (int i = 1; i <= coffHeader.NumberOfSections; i++) { +    if (data.size() < offset + sizeof(COFFHeader) + coffHeader.SizeOfOptionalHeader + i * sizeof(SectionHeader)) +     throw std::runtime_error("Data size too small to read next Section Header"); +    const SectionHeader& sectionHeader{ *(reinterpret_cast<const SectionHeader*>(data.data() + offset + coffHeader.SizeOfOptionalHeader + sizeof(COFFHeader) + (i - 1) * sizeof(SectionHeader))) }; +    std::cout << "\nSection #" << i << ":\n"; +    std::cout << "  Name: " << to_string(sectionHeader.Name) << "\n"; +    std::cout << "  Size: " << sectionHeader.VirtualSize << " bytes\n"; +    std::cout << "  Raw Data:\n"; +    DumpSection(data, sectionHeader.PointerToRawData, sectionHeader.SizeOfRawData, sectionHeader.VirtualSize); +   } +  } else +   throw std::runtime_error("Data size too small to read COFF Header."); + } +  + // COFF OBJ = + //   COFFHeader + //   SectionHeader(s) + void DumpObj(const std::vector<uint8_t>& data) + { +  if (data.size() >= sizeof(COFFHeader)) { +   std::cout << "COFF OBJ found.\n" << std::endl; +   const COFFHeader& coffHeader{ *(reinterpret_cast<const COFFHeader*>(data.data())) }; +  +   std::cout << "Machine: " << to_0xhex(coffHeader.Machine) << "\n"; +   if (coffHeader.Machine != IMAGE_FILE_MACHINE_AMD64) { +    std::cout << "  Warning: Unsupported.\n"; +    return; +   } +   std::cout << "NumberOfSections: " << coffHeader.NumberOfSections << "\n"; +  +   if (coffHeader.SizeOfOptionalHeader != 0) +    std::cout << "Warning: SizeOfOptionalHeader is " << coffHeader.SizeOfOptionalHeader << ". Expected 0.\n"; +  +   for (int i = 1; i <= coffHeader.NumberOfSections; i++) { +    if (data.size() < sizeof(COFFHeader) + i * sizeof(SectionHeader)) +     throw std::runtime_error("Data size too small to read next Section Header"); +    const SectionHeader& sectionHeader{ *(reinterpret_cast<const SectionHeader*>(data.data() + sizeof(COFFHeader) + (i - 1) * sizeof(SectionHeader))) }; +    std::cout << "\nSection #" << i << ":\n"; +    std::cout << "  Name: " << to_string(sectionHeader.Name) << "\n"; +    std::cout << "  Size: " << sectionHeader.SizeOfRawData << " bytes\n"; +    std::cout << "  Raw Data:\n"; +    DumpSection(data, sectionHeader.PointerToRawData, sectionHeader.SizeOfRawData, sectionHeader.SizeOfRawData); // sectionHeader.VirtualSize is 0 for obj +   } +  } else +   throw std::runtime_error("Data size too small to read COFF Header."); + } +  + void DumpMember(size_t n, const std::vector<uint8_t>& data, size_t byteoffset) + { +  const LibHeader& libHeader{ *(reinterpret_cast<const LibHeader*>(data.data() + byteoffset)) }; + +  if (libHeader.End[0] != 0x60 || libHeader.End[1] != 0x0A) +   throw std::runtime_error("Bad EndOFHeader signature for header #"s + std::to_string(n + 1) + " at byte offset "s + std::to_string(byteoffset)); + +  if (data.size() < byteoffset + sizeof(LibHeader) + libHeader.BodySize()) +   throw std::runtime_error("Too few bytes for linker member #"s + std::to_string(n + 1)); + +  if (n == 0) { // 1st Linker Member +   if (libHeader.GetName() != "/") +    throw std::runtime_error("Bad Name for 1st Linker Member: "s + libHeader.GetName()); + +   if (data.size() < byteoffset + sizeof(LibHeader) + sizeof(FirstLinkerMember)) +    throw std::runtime_error("Too few bytes for first linker member."); + +   const FirstLinkerMember& firstLinkerMember{ *(reinterpret_cast<const FirstLinkerMember*>(data.data() + byteoffset + sizeof(LibHeader))) }; +   std::cout << "First Linker Member with " << boost::endian::big_to_native(firstLinkerMember.NumberOfSymbols) << " Symbol(s): Ignored (obsolete).\n" << std::endl; +  } else if (n == 1) { // 2nd Linker Member +   if (libHeader.GetName() != "/") +    throw std::runtime_error("Bad Name for 2nd Linker Member: "s + libHeader.GetName()); + +   if (data.size() < byteoffset + sizeof(LibHeader) + sizeof(SecondLinkerMember)) +    throw std::runtime_error("Too few bytes for second linker member."); + +   const SecondLinkerMember& secondLinkerMember{ *(reinterpret_cast<const SecondLinkerMember*>(data.data() + byteoffset + sizeof(LibHeader))) }; +   std::cout << "Second Linker Member: " << secondLinkerMember.NumberOfMembers << " Archive Member(s)\n" << std::endl; +  } else if (n == 2 && libHeader.GetName() == "//") { // Longnames Member. +   // undocumented: Longnames Member not always present +   if (libHeader.GetName() != "//") +    throw std::runtime_error("Bad Name for Longnames Member: "s + libHeader.GetName()); + +   std::cout << "Longnames Member\n" << std::endl; +  } else { // n >= 3: OBJ members +   std::cout << "OBJ Member #" << (n - 2) << "\n" << std::endl; +   std::vector<uint8_t> obj{ data.begin() + byteoffset + sizeof(LibHeader), data.begin() + byteoffset + sizeof(LibHeader) + libHeader.BodySize() }; +   DumpObj(obj); +  } + } + + // LIB = + //   LibSignature + //   LibHeaders (+ Body each): + //     Linker Member 1 (directory, obsolete) + //     Linker Member 2 (directory) + //     Longnames Member (names of archive members) + //     OBJ1 + //     [OBJ2] + //     [...] + + void DumpLib(const std::vector<uint8_t>& data) + { +#if 0 +  size_t p1{ 0 }; +  std::vector<uint8_t> x{ {'\\', '\\'} }; +  auto it = std::search(data.begin(), data.end(), x.begin(), x.end()); + +  if (it != data.end()) +   std::cout << "DEBUG: " << (it - data.begin()) << std::endl; +  else +   std::cout << "DEBUG: " << "not found." << std::endl; +#endif +  size_t n{ 0 }; +  size_t byteoffset{ sizeof(LibSignature) }; +  while (byteoffset < data.size()) { +   const LibHeader& libHeader{ *(reinterpret_cast<const LibHeader*>(data.data() + byteoffset)) }; + +   if (data.size() < byteoffset + sizeof(LibHeader)) +    throw std::runtime_error("Too few bytes in lib header for member #"s + std::to_string(n + 1) + ": "s + std::to_string(data.size())); + +   DumpMember(n, data, byteoffset); + +   n++; +   byteoffset += sizeof(LibHeader) + libHeader.BodySize(); + +   while (byteoffset % 2 != 0) // align to 2-byte ??? (undocumented) +    byteoffset++; +  } + } + +} // namespace + + +void COFF::Dump(fs::path path) +{ + auto data{getFile(path)}; + + if (data.size() >= 8 && boost::starts_with(data, "!<arch>\n"s)) { +  DumpLib(data); + } else if (data.size() >= 2 && data[0] == 0x64 && data[1] == 0x86) { +  DumpObj(data); + } else if (isPE(data)) { +  DumpExe(data); + } else +  throw std::runtime_error("Bad file type."); +} + +namespace { + + void setFile(const fs::path& filename, const char* data, size_t size) + { +  std::ofstream file(filename.string(), std::ios::out | std::ios::binary | std::ios::trunc); +  if (file.is_open()) { +   file.write(data, size); +  } +  else { +   throw std::runtime_error("Opening "s + filename.string() + " for writing"); +  } + } + + void setFile(const fs::path& filename, const std::string& s) + { +  setFile(filename, s.data(), s.size()); + } + + void setFile(const fs::path& filename, const std::vector<uint8_t>& s) + { +  setFile(filename, reinterpret_cast<const char*>(s.data()), s.size()); + } + + void PutDOSStub(std::vector<uint8_t>& data) + { +  std::vector<uint8_t> x{ 'M', 'Z' }; +  x.resize(0x3c); +  data.insert(data.end(), x.begin(), x.end()); +  std::vector<uint8_t> address{0x40, 0, 0, 0}; // 32-bit address points to end of thus DOSStub +  data.insert(data.end(), address.begin(), address.end()); + } + + void PutPESignature(std::vector<uint8_t>& data) + { +  std::vector<uint8_t> sig{ 'P', 'E', '\0', '\0' }; +  data.insert(data.end(), sig.begin(), sig.end()); + } + + void PutCOFFHeader(std::vector<uint8_t>& data) + { +  { +   std::vector<uint8_t> header_v(sizeof(COFFHeader), uint8_t{}); +   COFFHeader& header{ *reinterpret_cast<COFFHeader*>(header_v.data()) }; +   header.Machine = 0x8664; // AMD64 +   header.NumberOfSections = 2; +   header.SizeOfOptionalHeader = sizeof(COFFOptionalHeader_PE32p) + sizeof(COFFOptionalHeader_Windows_PE32p) + 8 * 16; // 0xf0 +   header.Characteristics = IMAGE_FILE_EXECUTABLE_IMAGE | IMAGE_FILE_LARGE_ADDRESS_AWARE; +   data.insert(data.end(), header_v.begin(), header_v.end()); +  } + +  { +   std::vector<uint8_t> optional_header_v(sizeof(COFFOptionalHeader_PE32p), uint8_t{}); +   COFFOptionalHeader_PE32p& optional_header{ *reinterpret_cast<COFFOptionalHeader_PE32p*>(optional_header_v.data()) }; +   optional_header.Magic = 0x20B; // PE32+ +   optional_header.SizeOfCode = 512; +   optional_header.SizeOfInitializedData = 512; +   optional_header.SizeOfUninitializedData = 0; +   optional_header.AddressOfEntryPoint = 0x1000; +   optional_header.BaseOfCode = 0x1000; +   data.insert(data.end(), optional_header_v.begin(), optional_header_v.end()); +  } + +  { +   std::vector<uint8_t> optional_windows_v(sizeof(COFFOptionalHeader_Windows_PE32p), uint8_t{}); +   COFFOptionalHeader_Windows_PE32p& optional_windows{ *reinterpret_cast<COFFOptionalHeader_Windows_PE32p*>(optional_windows_v.data()) }; +   optional_windows.ImageBase = 0x140000000; +   optional_windows.SectionAlignment = 0x1000; +   optional_windows.FileAlignment = 512; +#if 1 +   optional_windows.MajorImageVersion = 6; +   optional_windows.MajorOperatingSystemVersion = 6; +   optional_windows.MajorSubsystemVersion = 6; +#endif +   optional_windows.SizeOfImage = 0x3000; +   optional_windows.SizeOfHeaders = 512; +   optional_windows.CheckSum = 0; +   optional_windows.Subsystem = IMAGE_SUBSYSTEM_WINDOWS_CUI; +#if 0 +   optional_windows.DllCharacteristics = 0x8160; +#endif +   optional_windows.SizeOfStackReserve = 0x100000; +   optional_windows.SizeOfStackCommit  = 0x1000; +   optional_windows.SizeOfHeapReserve  = 0x100000; +   optional_windows.SizeOfHeapCommit   = 0x1000; +   optional_windows.NumberOfRvaAndSizes = 0x10; +   data.insert(data.end(), optional_windows_v.begin(), optional_windows_v.end()); +  } + +  { +   std::vector<uint8_t> data_directories(8 * 16, uint8_t{}); +   data.insert(data.end(), data_directories.begin(), data_directories.end()); +  } + } + + void PutCOFFSectionCodeHeader(std::vector<uint8_t>& data) + { +  std::vector<uint8_t> section_header_v(sizeof(SectionHeader), uint8_t{}); +  SectionHeader& section_header{ *reinterpret_cast<SectionHeader*>(section_header_v.data()) }; +  uint8_t Name[8]{ '.', 't', 'e', 'x', 't', 0, 0, 0 }; +  memcpy(section_header.Name, Name, 8); +  section_header.VirtualSize = 3; // TODO +  section_header.VirtualAddress = 0x1000; +  section_header.SizeOfRawData = 512; // multiple of optional_windows.FileAlignment +  section_header.PointerToRawData = 512; +  section_header.Characteristics = IMAGE_SCN_CNT_CODE | IMAGE_SCN_MEM_EXECUTE | IMAGE_SCN_MEM_READ; +  data.insert(data.end(), section_header_v.begin(), section_header_v.end()); + } + + void PutCOFFSectionCode(std::vector<uint8_t>& data) + { +  { // pad before code +   std::vector<uint8_t> pad(512 - data.size(), uint8_t{}); +   data.insert(data.end(), pad.begin(), pad.end()); +  } + +  { // test code: return 0 +   std::vector<uint8_t> code{0x33, 0xC0, 0xC3}; +   data.insert(data.end(), code.begin(), code.end()); +  } + +  { // pad after code +   std::vector<uint8_t> pad(1024 - data.size(), uint8_t{}); +   data.insert(data.end(), pad.begin(), pad.end()); +  } + } + + void PutCOFFSectionDataHeader(std::vector<uint8_t>& data) + { +  std::vector<uint8_t> section_header_v(sizeof(SectionHeader), uint8_t{}); +  SectionHeader& section_header{ *reinterpret_cast<SectionHeader*>(section_header_v.data()) }; +  uint8_t Name[8]{ '.', 'd', 'a', 't', 'a', 0, 0, 0 }; +  memcpy(section_header.Name, Name, 8); +  section_header.VirtualSize = 3; // TODO +  section_header.VirtualAddress = 0x2000; +  section_header.SizeOfRawData = 512; // multiple of optional_windows.FileAlignment +  section_header.PointerToRawData = 1024; +  section_header.Characteristics = IMAGE_SCN_CNT_INITIALIZED_DATA | IMAGE_SCN_MEM_READ; +  data.insert(data.end(), section_header_v.begin(), section_header_v.end()); + } +  + void PutCOFFSectionData(std::vector<uint8_t>& data) + { +  { // test data +   std::vector<uint8_t> x(1536 - data.size(), uint8_t{}); +   data.insert(data.end(), x.begin(), x.end()); +  } + + } +} // namespace + +void COFF::Create(std::filesystem::path path) +{ + std::vector<uint8_t> data; + + PutDOSStub(data); + PutPESignature(data); + PutCOFFHeader(data); + PutCOFFSectionCodeHeader(data); + PutCOFFSectionDataHeader(data); + PutCOFFSectionCode(data); + PutCOFFSectionData(data); + + setFile(path, data); +} @@ -0,0 +1,8 @@ +#pragma once + +#include <filesystem> + +namespace COFF { + void Dump(std::filesystem::path path); + void Create(std::filesystem::path path); +}
\ No newline at end of file diff --git a/intel.cpp b/intel.cpp new file mode 100644 index 0000000..dfcaa75 --- /dev/null +++ b/intel.cpp @@ -0,0 +1,503 @@ +// Intel assembly language + + +// segments: code, stack + +#include "minicc.h" + +#include <algorithm> +#include <array> +#include <deque> +#include <functional> +#include <stdexcept> +#include <functional> +#include <stdexcept> +#include <string> +#include <unordered_map> +#include <vector> + +using namespace std::string_literals; +using namespace std::placeholders; + +namespace { + + // binary code operators + std::vector<uint8_t> operator+(std::vector<uint8_t> a, const std::vector<uint8_t>& b) { +  a.insert(a.end(), b.begin(), b.end()); +  return a; + } + + std::vector<uint8_t> operator+(std::vector<uint8_t> a, const uint8_t& b) { +  a.push_back(b); +  return a; + } + + // REX prefix: 0b0100WRXB + std::vector<uint8_t> REX(std::string s) { +  uint8_t result{0b01000000}; +  if (s == "W") +   result |= 0b00001000; +  if (s == "R") +   result |= 0b00000100; +  if (s == "X") +   result |= 0b00000010; +  if (s == "B") +   result |= 0b00000001; + +  return { result };  + } + + std::vector<uint8_t> imm8(std::string s) { +  long value{ std::stol(s) }; +  uint8_t* bin = reinterpret_cast<uint8_t*>(&value); +  return { uint8_t(*bin & 0xFF) }; + } + + std::vector<uint8_t> imm32(std::string s) { +  long value{ std::stol(s) }; +  uint32_t* bin = reinterpret_cast<uint32_t*>(&value); +  return {uint8_t(*bin & 0xFF), uint8_t(*bin >> 8 & 0xFF), uint8_t(*bin >> 16 & 0xFF), uint8_t(*bin >> 24 & 0xFF) }; + } + + std::unordered_map<std::string, size_t> IndexOfRegister{ +  {"al", 0}, {"ah", 4}, +  {"bl", 3}, {"bh", 7}, +  {"cl", 1}, {"ch", 5}, +  {"dl", 2}, {"dh", 6}, + +  {"ax", 0}, {"sp", 4}, +  {"bx", 3}, {"bp", 7}, +  {"cx", 1}, {"si", 5}, +  {"dx", 2}, {"di", 6}, + +  {"eax", 0}, {"esp", 4}, +  {"ebx", 3}, {"ebp", 7}, +  {"ecx", 1}, {"esi", 5}, +  {"edx", 2}, {"edi", 6}, + }; + + // Manual, page 530 + // Reg + Reg/Memory + uint8_t ModRM(std::string reg, std::string rm) { +  // TODO: extend +  uint8_t result{0b11000000}; + +  auto index1{ IndexOfRegister.find(reg) }; +  if (index1 == IndexOfRegister.end()) +   throw std::runtime_error("Unknown register for arg1: "s + reg); + +  result |= (index1->second << 3); + +  auto index2{ IndexOfRegister.find(rm) }; +  if (index2 == IndexOfRegister.end()) +   throw std::runtime_error("Unknown register for arg2: "s + rm); + +  result |= index2->second; + +  return result; + } + + enum class AddressType { +  Relative8, +  Relative16, +  Relative32, +  Absolute8, +  Absolute16, +  Absolute32, + }; + + struct Address + { +  AddressType type; +  size_t position; // relative to respective machine code, e.g. byte 1 in jump +  std::string label; // where to jump to, as label + }; + + struct InstructionCode + { +  std::vector<uint8_t> machine_code; +  std::vector<Address> addresses; + }; + + // List of alternative codes + typedef std::deque<InstructionCode> InstructionCodeList; + + bool O1{ true }; // Optimization + + using OP_T = std::vector<uint8_t>; + + InstructionCodeList op_jmp(const std::vector<Token>& sl, std::vector<uint8_t> op_bytes_8, std::vector<uint8_t> op_bytes_32) + { +  if (sl.size() == 2) { // JMP rel8 / rel32 +   const std::string& label{ sl[1].value }; +   InstructionCodeList result; +   if (op_bytes_32.size() > 0) { +    op_bytes_32.resize(op_bytes_32.size() + 4, 0x00); +    result.push_back({ op_bytes_32, { {AddressType::Relative32, op_bytes_32.size() - 4, label} } } ); +   } +   if (op_bytes_8.size() > 0 && (O1 || op_bytes_32.size() == 0)) { +    op_bytes_8.push_back(0x00); +    result.push_back({ op_bytes_8, { {AddressType::Relative8, op_bytes_8.size() - 1, label} } }); +   } +   return result; +  } + +  // ... TODO +  throw std::runtime_error("Unknown command: "s + sl[0].value); + } + + std::unordered_map<std::string, std::function<InstructionCodeList(const std::vector<Token>&)>> ops{ + +  // Integer Addition +  {"add", [](const std::vector<Token>& sl) -> InstructionCodeList { +   if (sl.size() == 3) { +    if (sl[1].value == "eax") { // ADD EAX, imm32 +     return { { std::vector<uint8_t>{ 0x05 } +imm32(sl[2].value), {} } }; +    } else if (sl[1].value == "rax") { // ADD RAX, imm32 +     return  { { REX("W") + std::vector<uint8_t>{ 0x05 } +imm32(sl[2].value), {} } }; +    } +   } + +   // ... TODO +   throw std::runtime_error("Unknown command: "s + sl[0].value); +  }}, + +  // Call Procedure +  {"call",  std::bind(op_jmp, _1, OP_T{}, OP_T{ 0xE8 })}, + +  // Interrupt +  {"int", [](const std::vector<Token>& sl) -> InstructionCodeList { +   if (sl.size() == 2) { +    if (sl[1].value == "0") { // INT 0 +     return { { std::vector<uint8_t>{ 0xCE }} }; +    } else if (sl[1].value == "1") { // INT 1 +     return { { std::vector<uint8_t>{ 0xF1 }} }; +    } else if (sl[1].value == "3") { // INT 3 +     return { { std::vector<uint8_t>{ 0xCC }} }; +    } else { // INT <...> +     return  { { std::vector<uint8_t>{ 0xCD } +imm8(sl[2].value) } }; +    } +   } + +   // ... TODO +   throw std::runtime_error("Unknown command: "s + sl[0].value); +  }}, + +  // Unconditional Jump +  {"jmp", std::bind(op_jmp, _1, OP_T{ 0xEB }, OP_T{ 0xE9 })}, + +  // Conditional Jumps +  {"ja",     std::bind(op_jmp, _1, OP_T{ 0x77 }, OP_T{ 0x0F, 0x87 })}, +  {"jae",    std::bind(op_jmp, _1, OP_T{ 0x73 }, OP_T{ 0x0F, 0x83 })}, +  {"jb",     std::bind(op_jmp, _1, OP_T{ 0x72 }, OP_T{ 0x0F, 0x82 })}, +  {"jbe",    std::bind(op_jmp, _1, OP_T{ 0x76 }, OP_T{ 0x0F, 0x86 })}, +  {"jc",     std::bind(op_jmp, _1, OP_T{ 0x72 }, OP_T{ 0x0F, 0x82 })}, +  {"jecxz",  std::bind(op_jmp, _1, OP_T{ 0xE3 }, OP_T{})}, +  {"jrcxz",  std::bind(op_jmp, _1, OP_T{ 0xE3 }, OP_T{})}, +  {"je",     std::bind(op_jmp, _1, OP_T{ 0x74 }, OP_T{ 0x0F, 0x84 })}, +  {"jg",     std::bind(op_jmp, _1, OP_T{ 0x7F }, OP_T{ 0x0F, 0x8F })}, +  {"jge",    std::bind(op_jmp, _1, OP_T{ 0x7D }, OP_T{ 0x0F, 0x8D })}, +  {"jl",     std::bind(op_jmp, _1, OP_T{ 0x7C }, OP_T{ 0x0F, 0x8C })}, +  {"jle",    std::bind(op_jmp, _1, OP_T{ 0x7E }, OP_T{ 0x0F, 0x8E })}, +  {"jna",    std::bind(op_jmp, _1, OP_T{ 0x76 }, OP_T{ 0x0F, 0x86 })}, +  {"jnae",   std::bind(op_jmp, _1, OP_T{ 0x72 }, OP_T{ 0x0F, 0x82 })}, +  {"jnb",    std::bind(op_jmp, _1, OP_T{ 0x73 }, OP_T{ 0x0F, 0x83 })}, +  {"jnbe",   std::bind(op_jmp, _1, OP_T{ 0x77 }, OP_T{ 0x0F, 0x87 })}, +  {"jnc",    std::bind(op_jmp, _1, OP_T{ 0x73 }, OP_T{ 0x0F, 0x83 })}, +  {"jne",    std::bind(op_jmp, _1, OP_T{ 0x75 }, OP_T{ 0x0F, 0x85 })}, +  {"jng",    std::bind(op_jmp, _1, OP_T{ 0x7E }, OP_T{ 0x0F, 0x8E })}, +  {"jnge",   std::bind(op_jmp, _1, OP_T{ 0x7C }, OP_T{ 0x0F, 0x8C })}, +  {"jnl",    std::bind(op_jmp, _1, OP_T{ 0x7D }, OP_T{ 0x0F, 0x8D })}, +  {"jnle",   std::bind(op_jmp, _1, OP_T{ 0x7F }, OP_T{ 0x0F, 0x8F })}, +  {"jno",    std::bind(op_jmp, _1, OP_T{ 0x71 }, OP_T{ 0x0F, 0x81 })}, +  {"jnp",    std::bind(op_jmp, _1, OP_T{ 0x7B }, OP_T{ 0x0F, 0x8B })}, +  {"jns",    std::bind(op_jmp, _1, OP_T{ 0x79 }, OP_T{ 0x0F, 0x89 })}, +  {"jnz",    std::bind(op_jmp, _1, OP_T{ 0x75 }, OP_T{ 0x0F, 0x85 })}, +  {"jo",     std::bind(op_jmp, _1, OP_T{ 0x70 }, OP_T{ 0x0F, 0x80 })}, +  {"jp",     std::bind(op_jmp, _1, OP_T{ 0x7A }, OP_T{ 0x0F, 0x8A })}, +  {"jpe",    std::bind(op_jmp, _1, OP_T{ 0x7A }, OP_T{ 0x0F, 0x8A })}, +  {"jpo",    std::bind(op_jmp, _1, OP_T{ 0x7B }, OP_T{ 0x0F, 0x8B })}, +  {"js",     std::bind(op_jmp, _1, OP_T{ 0x78 }, OP_T{ 0x0F, 0x88 })}, +  {"jz",     std::bind(op_jmp, _1, OP_T{ 0x74 }, OP_T{ 0x0F, 0x84 })}, + +  // Memory Move +  { "mov", [](const std::vector<Token>& sl) -> InstructionCodeList { +   if (sl.size() == 3) { +    return { { std::vector<uint8_t>{ 0x88 } + ModRM(sl[2].value, sl[1].value), {} } };  // r/m8, r8: ModRM:r/m (w), ModRM:reg (r) +   } + +   // ... TODO +   throw std::runtime_error("Unknown command: "s + sl[0].value); +  }}, + +  // No Operation +  { "nop", [](const std::vector<Token>& sl) -> InstructionCodeList { +   return {{ std::vector<uint8_t>{ 0x90 }, {}}}; +  }}, + +  // Return from procedure +  { "ret", [](const std::vector<Token>& sl) -> InstructionCodeList { +   return {{ std::vector<uint8_t>{ 0xC3 }, {}}}; // near return; TODO: far return is 0xCB +  }}, + +  { "xor", [](const std::vector<Token>& sl) -> InstructionCodeList { +   if (sl.size() == 3) { +    return { { std::vector<uint8_t>{ 0x33 } + ModRM(sl[1].value, sl[2].value) } };  // r8, r/m8: ModRM:reg (w), ModRM:r/m (r) +   } + +   // ... TODO +   throw std::runtime_error("Unknown command: "s + sl[0].value); +  }}, + + }; + +#if 0 + prefixes{ +  "lock", 0xf0, + +  // branch hint +  0x2e, "branch not taken" +  0x3e, "branch taken" + +  0x66, "operand size override" // switch between 16 and 32 bit operands +  0x67, "address size override" // switch between 16 and 32 bit addresses + }; + }; +#endif + +#ifdef ASM_PARSER + BNF GetBNF() { +  // TODO: +  return { +   { "assembler-unit", { +    {} +   }}, +   { "immediate-32", { +    {} +   }}, +   { "mnemonic", { +    {} +   }}, +   { "register", { +    {} +   }}, +   { "register-8", { +    {} +   }}, +   { "register-16", { +    {} +   }}, +   { "register-32", { +    {} +   }}, +   { "register-64", { +    {} +   }}, +  +  }; + }; +#endif +  + // Checks a 32 bit relative address if it's valid as 8 bit address + bool IsSmallAddress(const InstructionCode& insn) { +  if (insn.addresses.size() != 1) +   throw std::runtime_error("Bad number of addresses in insn"); + +  size_t i{insn.addresses[0].position}; + +  if (i > insn.machine_code.size() - 3) +   throw std::runtime_error("Bad Address index "s + std::to_string(i) + " in insn with "s + std::to_string(insn.machine_code.size()) + " bytes"s); + +  if (std::count(insn.machine_code.begin() + i, insn.machine_code.begin() + i + 3, 0x00) == 3 || +   std::count(insn.machine_code.begin() + i, insn.machine_code.begin() + i + 3, 0xFF) == 3) +   return true; + +  return false; + } + + +} // namespace + +class Assembler { + + std::unordered_map<std::string, size_t> labels; ///< labels with their positions in instruction list + + /// 1st Level: Instructions + /// 2nd Level: Alternatives + /// 3rd Level: Bytes of single instruction + std::vector<InstructionCodeList> insn_list; + + uint64_t addressFromInstructionIndex(size_t index) + { +  // TODO: cache this to prevent repetitive summing + +  if (index > insn_list.size()) +   throw std::runtime_error("Index "s + std::to_string(index) + " out of range ("s + std::to_string(insn_list.size()) + ")"s); + +  uint64_t sum{}; + +  for (size_t i = 0; i < index; i++) { +   if (insn_list[i].size() < 1) { +    throw std::runtime_error("Insufficient alternatives at index "s + std::to_string(i)); +   } + +   sum += static_cast<uint64_t>(insn_list[i][0].machine_code.size()); +  } + +  return sum; + } + + uint64_t addressFromLabel(std::string label) + { +  auto it{ labels.find(label) }; +  if (it == labels.end()) +   throw std::runtime_error("Label not found: "s + label); + +  return addressFromInstructionIndex(it->second); + } + + std::unordered_map<AddressType, std::function<void(std::vector<uint8_t>&, const Address&, uint64_t)>> addressInserters{ +  {AddressType::Relative8,  [&](std::vector<uint8_t>& machine_code, const Address& target_address, uint64_t insn_address) +   { +    int64_t difference = static_cast<int64_t>(addressFromLabel(target_address.label)) - insn_address; +    if (difference < -128 || difference > 127) +     throw std::runtime_error("Distance too big"); + +    int8_t diff8 = static_cast<int8_t>(difference); +    uint8_t diff_u8 = *reinterpret_cast<uint8_t*>(&diff8); + +    machine_code[target_address.position] = diff_u8; +   } +  }, +  {AddressType::Relative16, [&](std::vector<uint8_t>& machine_code, const Address& target_address, uint64_t insn_address) { throw std::runtime_error("Relative16 Address not yet supported."); }}, +  {AddressType::Relative32, [&](std::vector<uint8_t>& machine_code, const Address& target_address, uint64_t insn_address) +   { +    int64_t difference = static_cast<int64_t>(addressFromLabel(target_address.label)) - insn_address; +    if (difference < -4294967296 || difference > 4294967295) +     throw std::runtime_error("Distance too big"); + +    int32_t diff32 = static_cast<int32_t>(difference); +    uint32_t diff_u32 = *reinterpret_cast<uint32_t*>(&diff32); + +    machine_code[target_address.position] = diff_u32 & 0xFF; // little endian +    machine_code[target_address.position + 1] = diff_u32 >> 8 & 0xFF; +    machine_code[target_address.position + 2] = diff_u32 >> 16 & 0xFF; +    machine_code[target_address.position + 3] = diff_u32 >> 24 & 0xFF; +   } +  }, +  {AddressType::Absolute8,  [&](std::vector<uint8_t>& machine_code, const Address& target_address, uint64_t insn_address) {throw std::runtime_error("Absolute8 Address not yet supported."); }}, +  {AddressType::Absolute16, [&](std::vector<uint8_t>& machine_code, const Address& target_address, uint64_t insn_address) {throw std::runtime_error("Absolute16 Address not yet supported."); }}, +  {AddressType::Absolute32, [&](std::vector<uint8_t>& machine_code, const Address& target_address, uint64_t insn_address) {throw std::runtime_error("Absolute32 Address not yet supported."); }}, + }; + + void produce_machine_code(std::vector<std::vector<Token>>& tl) + { +  for (const auto& t : tl) { +   // label: +   // label: mnemonic arg1, arg2, arg3 +   //        mnemonic arg1, arg2, arg3 + +   if (t.size() == 2 && t[0].type == "label" && t[1].type == ":") { // label +    if (labels.find(t[0].value) != labels.end()) +     throw std::runtime_error("Label already defined: "s + t[0].value); + +    labels[t[0].value] = insn_list.size(); +   } else if (t.size() >= 1 && t[0].type == "instruction") { // instruction +    std::string instruction{ t[0].value }; +    auto it = ops.find(instruction); +    if (it == ops.end()) +     throw std::runtime_error("Unknown instruction: "s + instruction); + +    InstructionCodeList codes = it->second(t); + +    if (codes.size() == 0) +     throw std::runtime_error("No instruction generated"); + +    insn_list.push_back(codes); + +   } else +    throw std::runtime_error("Syntax error"s); +  } + } + + void insert_addresses() + { +  for (size_t i = 0; i < insn_list.size(); i++) { +   InstructionCodeList& list{ insn_list[i] }; +   if (list.size() == 0) +    throw std::runtime_error("No instruction at index "s + std::to_string(i)); + +   InstructionCode& code{ list[0] }; + +   for (const auto& address : code.addresses) { +    addressInserters[address.type](code.machine_code, address, addressFromInstructionIndex(i)); +   } +  } + } + + void optimize() + { +  // reduce Jump sizes via alternatives if possible +  bool changed{}; +  do { +   changed = false; + +   for (size_t i = 0; i < insn_list.size(); i++) { +    InstructionCodeList& list{ insn_list[i] }; // Alternatives + +    // apply specific heuristics to optimization case +    if (list.size() == 2) { +     if (list[0].addresses.size() == 1 && list[1].addresses.size() == 1) { +      if (list[0].addresses[0].type == AddressType::Relative32 && list[1].addresses[0].type == AddressType::Relative8) { +       if (IsSmallAddress(list[0])) { +        list.pop_front(); +        break; // start over from start of program +       } +      } +     } +    } +   } + +   if (changed) +    insert_addresses(); // update + +  } while (changed); + } + + std::vector<uint8_t> collect_code() + { +  std::vector<uint8_t> result; + +  // collect generated machine instructions for result +  // Alternatives already resolved, if configured. Consider only 1st entry (no matter if optimized or not). +  for (size_t i = 0; i < insn_list.size(); i++) { +   InstructionCodeList& list{ insn_list[i] }; +   if (list.size() == 0) +    throw std::runtime_error("No instruction at index "s + std::to_string(i)); + +   InstructionCode& code{ list[0] }; + +   result.insert(result.end(), code.machine_code.begin(), code.machine_code.end()); +  } + +  return result; + } + +public: + Assembler() {} + + std::vector<uint8_t> assemble(std::vector<std::vector<Token>> tl) + { +  labels.clear(); +  insn_list.clear(); + +  produce_machine_code(tl); // 1st pass +  insert_addresses(); // 2nd pass +  if (O1) { +   optimize(); // 3rd pass +  } + +  return collect_code(); // 4th pass + } + +}; // class Assembler | 
