diff options
| author | Roland Reichwein <mail@reichwein.it> | 2020-03-15 18:19:49 +0100 | 
|---|---|---|
| committer | Roland Reichwein <mail@reichwein.it> | 2020-03-15 18:19:49 +0100 | 
| commit | 9f69b006dde3c3fbe19ed3e0275d3b7348f2aa87 (patch) | |
| tree | 6ac42793568339463f913cf39474794c8613d0b6 | |
| parent | 3a7006fcf5f8ecffd852fbba6d8ee03ce8a35dce (diff) | |
New lexer
| -rw-r--r-- | bnf.cpp | 5 | ||||
| -rw-r--r-- | bnf.h | 2 | ||||
| -rw-r--r-- | cpp.cpp | 2 | ||||
| -rwxr-xr-x | cppbnf.cpp | 11 | ||||
| -rw-r--r-- | lexer.cpp | 166 | ||||
| -rw-r--r-- | lexer.h | 32 | ||||
| -rw-r--r-- | minicc.cpp | 17 | ||||
| -rw-r--r-- | minicc.h | 8 | ||||
| -rw-r--r-- | test-lexer.cpp | 17 | 
9 files changed, 241 insertions, 19 deletions
| @@ -68,3 +68,8 @@ BNF SubBNF(const BNF& bnf, const std::string& top)   return result;  } +bool isTerminal(const BNF& bnf, const std::string& symbol) +{ + return bnf.find(symbol) == bnf.end(); +} + @@ -15,3 +15,5 @@ std::unordered_map<std::string, std::set<std::string>> Reverse(BNF bnf); // unus  std::unordered_map<std::string, std::set<std::string>> reverseFirst(BNF bnf);  BNF SubBNF(const BNF& bnf, const std::string& top); + +bool isTerminal(const BNF& bnf, const std::string& symbol); @@ -247,12 +247,14 @@ TEST_F(CppTest, preprocessing_tokenize) {  }  #endif +#if 0  TEST_F(CppTest, preprocessing_tokenize2) {   CPP cpp;   auto ppTree = cpp.preprocessing_tokenize("in ma");   cpp.tokens_from_pptokens(ppTree);  } +#endif  #if 0  TEST(Cpp, translate) { @@ -39,11 +39,6 @@ namespace {    return result;   } - bool isTerminal(const std::string& symbol, const BNF& bnf) - { -  return bnf.find(symbol) == bnf.end(); - } -   size_t numberOfStartSymbols(const BNF& bnf)   {    // exactly 1 start symbol @@ -75,7 +70,7 @@ namespace {     for (const auto& list : lists) {      for (const auto& i : list) { -     if (!isTerminal(i, bnf)) { +     if (!isTerminal(bnf, i)) {        // every non-terminal symbol must be longer that 1 char        if (i.size() == 1) {         std::cerr << "Warning: Non-Terminal symbol " << i << " in " << symbol << " is too short." << std::endl; @@ -107,7 +102,7 @@ namespace {    for (const auto& [symbol, lists] : bnf) {     for (const auto& list : lists) {      for (const auto& i : list) { -     if (i.size() != 1 && isTerminal(i, bnf)) { +     if (i.size() != 1 && isTerminal(bnf, i)) {        std::cerr << "Warning: Terminal symbol in " << symbol << " is too long: "s << i << std::endl;        return false;       } @@ -197,7 +192,7 @@ namespace {    for (auto& [symbol, lists] : bnf) {     for (auto& list : lists) {      for (int i = 0; i < list.size(); i++) { -     if (list[i].size() > 1 && isTerminal(list[i], bnf)) { +     if (list[i].size() > 1 && isTerminal(bnf, list[i])) {        auto newList = vectorize(list[i]);        list.erase(list.begin() + i);        list.insert(list.begin() + i, newList.begin(), newList.end()); @@ -2,14 +2,178 @@  using namespace Lex; -Lexer::Lexer(const BNF& bnf, const std::string& Top) +size_t Lexer::newState(std::string state_type)  { + m_state_types.push_back(state_type); + return states++; +} + +void Lexer::addTransition(size_t state0, size_t state1, char c) +{ + auto it{transitions.find(state0)}; + + if (it == transitions.end()) { // new entry is a list with 1 entry +  transitions[state0] = {{state1, c}}; + } else { // extend list entry +  it->second.push_back({state1, c}); + } +} + +std::vector<std::pair<size_t, char>> Lexer::getSuccessors(size_t state) +{ + std::vector<std::pair<size_t, char>> result; + + auto it{transitions.find(state)}; + + if (it != transitions.end()) { // new list entry +  for (auto &i: it->second) { +   if (i.first == m_endState || i.second != '\0') { // add transition to end state or transition via actual char +    result.push_back(i); +   } else { // follow empty transition +    auto successors { getSuccessors(i.first) }; +    result.insert(result.end(), successors.begin(), successors.end()); +   } +  } + } + + return result; +} + +// Helper function +void Lexer::addPathOrTransition(size_t state0, size_t state1, std::string symbol, std::string type) +{ + if (isTerminal(m_bnf, symbol)) { // end recursion with new transition +  if (symbol.size() != 1) +   throw std::runtime_error("Bad sized terminal symbol: "s + symbol); +  addTransition(state0, state1, symbol[0]); + } else { // recurse via non-terminal symbol +  addPath(state0, state1, symbol, type); + } +} + +// Helper function: add one rule +void Lexer::addRule(const std::vector<std::string>& list, size_t list_index_from, size_t list_index_to, size_t state0, size_t state1, const std::string& rule_symbol, std::string type) +{ + size_t previousState{state0}; +  + // add intermediate states with transitions + for (size_t i = list_index_from; i < list_index_to - 1; i++) { +  std::string symbol{list[i]}; +  if (symbol == rule_symbol) +   throw std::runtime_error("Recursion found but not allowed"); + +  size_t state{newState(type)}; +  addPathOrTransition(previousState, state, symbol, type); +  previousState = state; + } + if (list.back() == rule_symbol) +  throw std::runtime_error("Recursion found but not allowed"); + + // last transition + addPathOrTransition(previousState, state1, list.back(), type); +} + +// Add paths between state0 and state1, including new states and transitions +void Lexer::addPath(size_t state0, size_t state1, std::string s, std::string type) +{ + if (type == "" && s != "" && s != m_top) +  type = s; + + // state0 -> [paths] -> state01 -> state1 + //                      ^     v + //                 [recursion paths] + size_t state01{newState(type)}; + auto it {m_bnf.find(s)}; + + if (it == m_bnf.end()) +  throw std::runtime_error("Path ("s + std::to_string(state0) + ", "s + std::to_string(state1) + ") not possible."s); + + for (auto& list: it->second) { // for every path between state0 and state1 +  size_t list_size{list.size()}; +  if (list_size < 1) +   throw std::runtime_error("List too small in rule "s + s); + +  if (list[0] == s) { // recursion rule +   addRule(list, 1, list_size, state01, state01, s, type); +  } else { // non-recursion rule +   addRule(list, 0, list_size, state0, state01, s, type); +  } + } + addTransition(state01, state1, 0); // empty transition to end +} + +Lexer::Lexer(const BNF& bnf, const std::string& top): m_bnf(bnf), m_top(top) +                                                      , m_startState(newState()), m_endState(newState()) +{ + addPath(m_startState, m_endState, m_top, ""); +} + +Token Lexer::getToken(const std::string& s, Location& location) +{ + Location oldLocation{location}; // start of token + + std::vector<size_t> states{m_startState}; // can be in multiple states at once + std::vector<size_t> newStates; + + Location found; + std::string state_type; + + // match as much as possible + while (location.pos < s.size() && states.size() > 0) { +  char currentChar{s[location.pos]}; +  std::cout << "DEBUG: Char: " << currentChar << std::endl; + +  for (const auto& state: states) { +   std::vector<std::pair<size_t, char>> successors{getSuccessors(state)}; +   for (const auto& [nextState, c]: successors) { +    if (c == currentChar) { +     newStates.push_back(nextState); +     if (nextState == m_endState) { // save intermediate result upon match +      found = location; +      found.advance(); +      state_type = m_state_types[state]; +     } +    } else if (nextState == m_endState) { // save intermediate result w/o match of c +     found = location; +     state_type = m_state_types[state]; +    } +   } +  } +  states = newStates; +  newStates.clear(); +  location.advance(currentChar == '\n'); + } + + std::string value {s.substr(oldLocation.pos, found.pos - oldLocation.pos)}; + + if (found.pos > 0) +  std::cout << "DEBUG: Matched " << found.pos - oldLocation.pos << " chars: " << value << "|" << state_type << std::endl; + else +  throw std::runtime_error("Tokenize error at "s + oldLocation.toString()); + + location = found; // reset to end of match + + return {state_type, value, oldLocation}; +} + +void Lexer::skipWhitespace(const std::string& s, Location& location) +{ + while (location.pos < s.size() && std::isspace(s[location.pos])) { +  location.advance(s[location.pos] == '\n'); + }  }  std::vector<Token> Lexer::Lex(const std::string& s)  {   std::vector<Token> result; + Location location; + skipWhitespace(s, location); + while (location.pos < s.size()) { +  result.emplace_back(getToken(s, location)); +  skipWhitespace(s, location); + } +   return result;  } @@ -3,16 +3,42 @@  #include "minicc.h"  #include "bnf.h" +#include <string> +#include <unordered_map> +#include <vector> +  namespace Lex {  class Lexer  { + // constructor input + const BNF& m_bnf; + const std::string& m_top; + + // Graph +  + size_t states{}; // start, ... + std::unordered_map<size_t, std::vector<std::pair<size_t, char>>> transitions; //transitions: state -> {state,character}, ...; empty transition is marked by \0 + std::vector<std::string> m_state_types; + size_t m_startState; + size_t m_endState; + + // Graph manipulation +  + size_t newState(std::string state_type = ""); + void addTransition(size_t state0, size_t state1, char c); + std::vector<std::pair<size_t, char>> getSuccessors(size_t state); + + // Build up automaton, recursively + void addPath(size_t state0, size_t state1, std::string s, std::string type); + void addPathOrTransition(size_t state0, size_t state1, std::string symbol, std::string type); + void addRule(const std::vector<std::string>& list, size_t list_index_from, size_t list_index_to, size_t state0, size_t state1, const std::string& rule_symbol, std::string type); - //states; // start, ... - //transitions; // state, state, character + Token getToken(const std::string& s, Location& location); + void skipWhitespace(const std::string& s, Location& location);  public: - Lexer(const BNF& bnf, const std::string& Top); + Lexer(const BNF& bnf, const std::string& top);   std::vector<Token> Lex(const std::string& s);  }; @@ -14,6 +14,8 @@  #include <utility>  #include <vector> +using namespace std::string_literals; +  std::vector<std::string> split(std::string s)  {   std::vector<std::string> result; @@ -37,3 +39,18 @@ std::ostream& operator<<(std::ostream& os, const Token& token) {   return os << token.type << ": " << token.value << "(" << token.location.line << ":" << token.location.column << ")";  } +void Location::advance(bool newline) +{ + pos++; + if (newline) { +  line++; +  column = 1; + } else { +  column++; + } +} + +std::string Location::toString() +{ + return std::to_string(line) + ":"s + std::to_string(column); +} @@ -10,8 +10,12 @@ using index_t = size_t;  std::vector<std::string> split(std::string s);  struct Location { - size_t line; - size_t column; + size_t line{1}; + size_t column{1}; + size_t pos{0}; + + void advance(bool newline = false); + std::string toString();  };  bool operator==(const Location &a, const Location &b); diff --git a/test-lexer.cpp b/test-lexer.cpp index a94e550..9f1cb77 100644 --- a/test-lexer.cpp +++ b/test-lexer.cpp @@ -1,5 +1,6 @@  #include "bnf.h"  #include "cpp.h" +#include "cppbnf.h"  #include "lexer.h"  #include "grammer.h"  #include "minicc.h" @@ -19,14 +20,20 @@  #include <utility>  #include <vector> -class Test: public ::testing::Test { +class LexerTest: public ::testing::Test {  protected: - Test(){ + LexerTest(){    debug = false;   } - ~Test() override {} + ~LexerTest() override {}  }; -TEST_F(Test, BNF) { -} +TEST_F(LexerTest, Lex) { + auto bnf{SubBNF(GetCppBNFLex(), "preprocessing-token")}; + + Lex::Lexer lexer(bnf, "preprocessing-token"); + std::vector<Token> tokens{lexer.Lex("int main() { return 1; }")}; + + ASSERT_EQ(tokens.size(), 9); +} | 
