diff options
| author | Roland Reichwein <mail@reichwein.it> | 2020-11-19 22:31:33 +0100 | 
|---|---|---|
| committer | Roland Reichwein <mail@reichwein.it> | 2020-11-19 22:31:33 +0100 | 
| commit | 5c0611b998e039c8547cfa3841da3567e13446a8 (patch) | |
| tree | 3f1dc0a8371996426f99d395ad3f0fa9be503ea5 /asm | |
| parent | 1937e301b6cd185c8ce907b9184142e82e76fda4 (diff) | |
Add assembler parser (WIP)
Diffstat (limited to 'asm')
| -rw-r--r-- | asm/intel64/encode.cpp | 12 | ||||
| -rw-r--r-- | asm/parse.cpp | 251 | ||||
| -rw-r--r-- | asm/parse.h | 3 | 
3 files changed, 257 insertions, 9 deletions
| diff --git a/asm/intel64/encode.cpp b/asm/intel64/encode.cpp index 21b6629..51ca7a0 100644 --- a/asm/intel64/encode.cpp +++ b/asm/intel64/encode.cpp @@ -133,18 +133,18 @@ void Asm::toMachineCode(const FlowGraph::Graph& graph, Segment& segment)      if (op.type() == FlowGraph::UnaryOperationType::BitwiseNot) {       segment.push_back(makeLoadValue(operands[1], graph)); -     segment.push_back(parseAsm("not eax")); +     segment.append(parseAsm("not eax"));       segment.push_back(makeStoreValue(operands[0], graph));      } else if (op.type() == FlowGraph::UnaryOperationType::LogicalNot) {       segment.push_back(makeLoadValue(operands[1], graph)); -     segment.push_back(parseAsm("bsr eax")); // ZF=1 iff eax=0 -     segment.push_back(parseAsm("lahf")); // ZF in AH bit 6 -     segment.push_back(parseAsm("shr eax, 14")); // ZF in eax bit 0 -     segment.push_back(parseAsm("and eax, 1")); // now, 0 or 1 is in eax, negated because of zero flag +     segment.append(parseAsm("bsr eax")); // ZF=1 iff eax=0 +     segment.append(parseAsm("lahf")); // ZF in AH bit 6 +     segment.append(parseAsm("shr eax, 14")); // ZF in eax bit 0 +     segment.append(parseAsm("and eax, 1")); // now, 0 or 1 is in eax, negated because of zero flag       segment.push_back(makeStoreValue(operands[0], graph));      } else if (op.type() == FlowGraph::UnaryOperationType::Minus) {       segment.push_back(makeLoadValue(operands[1], graph)); -     segment.push_back(parseAsm("neg eax")); +     segment.append(parseAsm("neg eax"));       segment.push_back(makeStoreValue(operands[0], graph));      } else       throw std::runtime_error("ICE: Asm: Unsupported unary operation type: "s + std::to_string(static_cast<int>(op.type()))); diff --git a/asm/parse.cpp b/asm/parse.cpp index 350d86e..3b6e6be 100644 --- a/asm/parse.cpp +++ b/asm/parse.cpp @@ -2,7 +2,254 @@  #include "asm/assembler.h" -std::shared_ptr<Chunk> parseAsm(const std::string& line) +#include <boost/algorithm/string.hpp> + +#include <exception> +#include <regex> +#include <unordered_set> + +using namespace std::string_literals; + +namespace { +  + std::unordered_set<std::string> reg8 { +  "al", "ah", +  "bl", "bh", +  "cl", "ch", +  "dl", "dh", + }; + + std::unordered_set<std::string> reg32 { +  "eax", "esp", +  "ebx", "ebp", +  "ecx", "esi", +  "edx", "edi", + }; + + std::unordered_set<std::string> reg64 { +  "rax", "rsp", +  "rbx", "rbp", +  "rcx", "rsi", +  "rdx", "rdi", + }; + + // skip optional whitespace + void parseWhitespace(const std::string& asm_code, size_t& pos) { +  std::regex re_whitespace("( \\t)+", std::regex_constants::ECMAScript); + +  std::smatch match; +  if (std::regex_search(asm_code.cbegin() + pos, asm_code.cend(), match, re_whitespace, std::regex_constants::match_continuous)) { +   pos += match[0].length(); +  } + } + + // parse optional label + bool parseLabel(const std::string& asm_code, size_t& pos, std::string& result) { +  parseWhitespace(asm_code, pos); +   +  std::regex re_label("([[:alpha:]]([[:alnum:]])+):", std::regex_constants::ECMAScript); + +  std::smatch match; +  if (std::regex_search(asm_code.cbegin() + pos, asm_code.cend(), match, re_label, std::regex_constants::match_continuous)) { +   pos += match[0].length(); +   result = match[1]; +   return true; +  } + +  return false; + } + + // parse optional mnemonic + // return true iff mnemonic found + bool parseMnemonic(const std::string& asm_code, size_t& pos, std::string& result) { +  parseWhitespace(asm_code, pos); +   +  std::regex re_mnemonic("[[:alpha:]]([[:alnum:]])+", std::regex_constants::ECMAScript); + +  std::smatch match; +  if (std::regex_search(asm_code.cbegin() + pos, asm_code.cend(), match, re_mnemonic, std::regex_constants::match_continuous)) { +   std::string name {boost::algorithm::to_lower_copy(match[0].str())}; +   pos += name.size(); +   result = name; +   return true; +  } + +  return false; + } +  + bool parseRegister8(const std::string& asm_code, size_t& pos, std::any& result) { +  parseWhitespace(asm_code, pos); +   +  std::regex re_name("[[:alpha:]]+", std::regex_constants::ECMAScript); + +  std::smatch match; +  if (std::regex_search(asm_code.cbegin() + pos, asm_code.cend(), match, re_name, std::regex_constants::match_continuous)) { +   std::string name {boost::algorithm::to_lower_copy(match[0].str())}; +   if (reg8.contains(name)) { +    pos += name.size(); +    result = Asm::Args::Register8(name); +    return true; +   } +  } + +  return false; + } + + bool parseRegister32(const std::string& asm_code, size_t& pos, std::any& result) { +  parseWhitespace(asm_code, pos); +   +  std::regex re_name("[[:alpha:]]+", std::regex_constants::ECMAScript); + +  std::smatch match; +  if (std::regex_search(asm_code.cbegin() + pos, asm_code.cend(), match, re_name, std::regex_constants::match_continuous)) { +   std::string name {boost::algorithm::to_lower_copy(match[0].str())}; +   if (reg32.contains(name)) { +    pos += name.size(); +    result = Asm::Args::Register32(name); +    return true; +   } +  } + +  return false; + } + + bool parseRegister64(const std::string& asm_code, size_t& pos, std::any& result) { +  parseWhitespace(asm_code, pos); +   +  std::regex re_name("[[:alpha:]]+", std::regex_constants::ECMAScript); + +  std::smatch match; +  if (std::regex_search(asm_code.cbegin() + pos, asm_code.cend(), match, re_name, std::regex_constants::match_continuous)) { +   std::string name {boost::algorithm::to_lower_copy(match[0].str())}; +   if (reg64.contains(name)) { +    pos += name.size(); +    result = Asm::Args::Register64(name); +    return true; +   } +  } + +  return false; + } + + bool parseImmediate32(const std::string& asm_code, size_t& pos, std::any& result) { +  parseWhitespace(asm_code, pos); +   +  std::regex re_name("[[:digit:]]+|0x[[:xdigit:]]+", std::regex_constants::ECMAScript); + +  std::smatch match; +  if (std::regex_search(asm_code.cbegin() + pos, asm_code.cend(), match, re_name, std::regex_constants::match_continuous)) { +   int32_t value{}; +   try { +    value = stoll(match[0]); +   } catch (...) { +    throw std::runtime_error("Assembler parse error: Bad Immediate: "s + match[0].str()); +   } +   pos += match[0].length(); +   result = Asm::Args::Immediate32(static_cast<uint32_t>(value)); +   return true; +  } + +  return false; + } + + // parse optional single operand + bool parseOperand(const std::string& asm_code, size_t& pos, std::any& result) { +  parseWhitespace(asm_code, pos); + +  if (parseRegister8(asm_code, pos, result)) +   return true; +  if (parseRegister32(asm_code, pos, result)) +   return true; +  if (parseRegister64(asm_code, pos, result)) +   return true; + +  if (parseImmediate32(asm_code, pos, result)) +   return true; + +  return false; + } + + // parse optional multiple operands, separated by commas + void parseOperands(const std::string& asm_code, size_t& pos, Asm::Args& result) { +  std::any operand; +  if (parseOperand(asm_code, pos, operand)) { +   result.push_back(operand); +   parseWhitespace(asm_code, pos); +   while (pos < asm_code.size() && asm_code[pos] == ',') { +    pos++; +    if (parseOperand(asm_code, pos, operand)) { +     result.push_back(operand); +    } else { +     throw std::runtime_error("Assembler error: expected operand after comma"); +    } +    parseWhitespace(asm_code, pos); +   } +  } + } + + // parse optional comment + void parseComment(const std::string& asm_code, size_t& pos) { +  parseWhitespace(asm_code, pos); +   +  std::regex re_comment("(#|//).*", std::regex_constants::ECMAScript); + +  std::smatch match; +  if (std::regex_search(asm_code.cbegin() + pos, asm_code.cend(), match, re_comment, std::regex_constants::match_continuous)) { +   pos += match[0].length(); +  } + } + + // parse end of line (or whole code) + bool parseEol(const std::string& asm_code, size_t& pos) { +  parseWhitespace(asm_code, pos); + +  if (pos < asm_code.size() && asm_code[pos] != 0x0a && asm_code[pos] != 0x0d) +   return false; // this is the only case where parseEol() doesn't work + +  while (pos < asm_code.size()) { +   char c { asm_code[pos] }; +   if (c == 0x0a || c == 0x0d) { +    pos++; +   } else { +    break; +   } +  } + +  return true; + } + + // parse single line + void parseLine(const std::string& asm_code, size_t& pos, std::vector<std::shared_ptr<Chunk>>& result) { +  // all optional: +  // label: mnemonic operands... ;comment <eol> + +  std::string result_string; +  if (parseLabel(asm_code, pos, result_string)) +   result.emplace_back(std::make_shared<Label>(result_string)); + +  if (parseMnemonic(asm_code, pos, result_string)) { +   Asm::Args args; +   parseOperands(asm_code, pos, args); +   result.emplace_back(makeOp(result_string, args)); +  } + +  parseComment(asm_code, pos); + +  if (!parseEol(asm_code, pos)) +   throw std::runtime_error("Assembler error at pos "s + std::to_string(pos)); + } + +} // namespace + +std::vector<std::shared_ptr<Chunk>> parseAsm(const std::string& asm_code)  { - return makeOp("lahf"); // TODO + std::vector<std::shared_ptr<Chunk>> result; + size_t pos{0}; + + while (pos != asm_code.size()) { +  parseLine(asm_code, pos, result); + } + + return result;  } + diff --git a/asm/parse.h b/asm/parse.h index 1e6a202..1b55f7f 100644 --- a/asm/parse.h +++ b/asm/parse.h @@ -5,4 +5,5 @@  #include <memory>  #include <string> -std::shared_ptr<Chunk> parseAsm(const std::string& line); +// asm_code: multiline asm code +std::vector<std::shared_ptr<Chunk>> parseAsm(const std::string& asm_code); | 
