diff options
| author | Roland Reichwein <mail@reichwein.it> | 2020-03-16 21:27:38 +0100 | 
|---|---|---|
| committer | Roland Reichwein <mail@reichwein.it> | 2020-03-16 21:27:38 +0100 | 
| commit | 74350b52fee9f576a1cc71d99cfd4ebdf5a73e0f (patch) | |
| tree | fbcc823f28f31b3671af3a1e01a109abae0dbb92 | |
| parent | 9502989aec80e8f75cf14e7dd7d1d85333090866 (diff) | |
Fixed lexer
| -rwxr-xr-x | cppbnf.cpp | 2 | ||||
| -rw-r--r-- | lexer.cpp | 62 | ||||
| -rw-r--r-- | lexer.h | 14 | 
3 files changed, 47 insertions, 31 deletions
| @@ -220,7 +220,7 @@ BNF GetCppBNFLex()    {"preprocessing-token", {     {"header-name"}, -   {"import-keyword"}, +   //{"import-keyword"}, // TODO     {"identifier"},     {"pp-number"},     {"character-literal"}, @@ -4,9 +4,8 @@  using namespace Lex; -size_t Lexer::newState(std::string state_type) +size_t Lexer::newState()  { - m_state_types.push_back(state_type);   return states++;  } @@ -66,19 +65,19 @@ std::vector<std::pair<size_t, char>> Lexer::getSuccessors(size_t state)  }  // Helper function -void Lexer::addPathOrTransition(size_t state0, size_t state1, std::string symbol, std::string type) +void Lexer::addPathOrTransition(size_t state0, size_t state1, std::string symbol)  {   if (isTerminal(m_bnf, symbol)) { // end recursion with new transition    if (symbol.size() != 1)     throw std::runtime_error("Bad sized terminal symbol: "s + symbol);    addTransition(state0, state1, symbol[0]);   } else { // recurse via non-terminal symbol -  addPath(state0, state1, symbol, type); +  addPath(state0, state1, symbol);   }  }  // Helper function: add one rule -void Lexer::addRule(const std::vector<std::string>& list, size_t list_index_from, size_t list_index_to, size_t state0, size_t state1, const std::string& rule_symbol, std::string type) +void Lexer::addRule(const std::vector<std::string>& list, size_t list_index_from, size_t list_index_to, size_t state0, size_t state1, const std::string& rule_symbol)  {   size_t previousState{state0}; @@ -88,27 +87,24 @@ void Lexer::addRule(const std::vector<std::string>& list, size_t list_index_from    if (symbol == rule_symbol)     throw std::runtime_error("Recursion found but not allowed"); -  size_t state{newState(type)}; -  addPathOrTransition(previousState, state, symbol, type); +  size_t state{newState()}; +  addPathOrTransition(previousState, state, symbol);    previousState = state;   }   if (list.back() == rule_symbol)    throw std::runtime_error("Recursion found but not allowed");   // last transition - addPathOrTransition(previousState, state1, list.back(), type); + addPathOrTransition(previousState, state1, list.back());  }  // Add paths between state0 and state1, including new states and transitions -void Lexer::addPath(size_t state0, size_t state1, std::string s, std::string type) +void Lexer::addPath(size_t state0, size_t state1, std::string s)  { - if (type == "" && s != "" && s != m_top) -  type = s; -   // state0 -> [paths] -> state01 -> state1   //                      ^     v   //                 [recursion paths] - size_t state01{newState(type)}; + size_t state01{newState()};   auto it {m_bnf.find(s)};   if (it == m_bnf.end()) @@ -120,9 +116,9 @@ void Lexer::addPath(size_t state0, size_t state1, std::string s, std::string typ     throw std::runtime_error("List too small in rule "s + s);    if (list[0] == s) { // recursion rule -   addRule(list, 1, list_size, state01, state01, s, type); +   addRule(list, 1, list_size, state01, state01, s);    } else { // non-recursion rule -   addRule(list, 0, list_size, state0, state01, s, type); +   addRule(list, 0, list_size, state0, state01, s);    }   }   addTransition(state01, state1, 0); // empty transition to end @@ -154,14 +150,35 @@ void Lexer::removeEmptyTransitions()   }  } -Lexer::Lexer(const BNF& bnf, const std::string& top): m_bnf(bnf), m_top(top), m_startState(newState()), m_endState(newState()) +Lexer::Lexer(const BNF& bnf, const std::string& top): m_bnf(bnf), m_top(top), m_startState(newState())  { - addPath(m_startState, m_endState, m_top, ""); + // types are the second level symbols, directly under top + auto it {m_bnf.find(m_top)}; + if (it == m_bnf.end()) +  throw std::runtime_error("Start symbol "s + m_top + " not found."s); + + auto& list {it->second}; + + for (auto& element : list) { +  if (element.size() != 1) +   throw std::runtime_error("Bad type rule in "s + m_top + ": size = "s + std::to_string(element.size())); + +  auto endState{newState()}; +  std::string type{element[0]}; +  m_state_types[endState] = type; +  +  addPath(m_startState, endState, type); + }   replaceEmptyTransitions();   removeEmptyTransitions();  } +bool Lexer::isEndState(size_t state) +{ + return m_state_types.find(state) != m_state_types.end(); +} +  Token Lexer::getToken(const std::string& s, Location& location)  {   Location oldLocation{location}; // start of token @@ -175,16 +192,16 @@ Token Lexer::getToken(const std::string& s, Location& location)   // match as much as possible   while (location.pos < s.size() && states.size() > 0) {    char currentChar{s[location.pos]}; -  std::cout << "DEBUG: Char: " << currentChar << std::endl; +  //std::cout << "DEBUG: Char: " << currentChar << std::endl;    for (const auto& state: states) {     std::vector<std::pair<size_t, char>> successors{getSuccessors(state)};     for (const auto& [nextState, c]: successors) {      if (c == currentChar) { -     if (nextState == m_endState) { // save intermediate result upon match +     if (isEndState(nextState)) { // save intermediate result upon match        found = location;        found.advance(); -      state_type = m_state_types[state]; +      state_type = m_state_types[nextState];       } else {        newStates.push_back(nextState);       } @@ -198,10 +215,9 @@ Token Lexer::getToken(const std::string& s, Location& location)   std::string value {s.substr(oldLocation.pos, found.pos - oldLocation.pos)}; - if (found.pos > 0) -  std::cout << "DEBUG: Matched " << found.pos - oldLocation.pos << " chars: " << value << "|" << state_type << std::endl; - else + if (found.pos == 0)    throw std::runtime_error("Bad Token at "s + oldLocation.toString()); + //else std::cout << "DEBUG: Matched " << found.pos - oldLocation.pos << " chars: " << value << "|" << state_type << std::endl;   location = found; // reset to end of match @@ -18,23 +18,23 @@ class Lexer   // Graph   size_t states{}; // start, ... - std::unordered_map<size_t, std::vector<std::pair<size_t, char>>> transitions; //transitions: state -> {state,character}, ...; empty transition is marked by \0 - std::vector<std::string> m_state_types; + std::unordered_map<size_t, std::vector<std::pair<size_t, char>>> transitions; // transitions: state -> {state,character}, ...; empty transition is marked by \0 + std::unordered_map<size_t, std::string> m_state_types; // only necessary for 2nd level symbol names   size_t m_startState; - size_t m_endState;   // Graph manipulation - size_t newState(std::string state_type = ""); + size_t newState(); + bool isEndState(size_t state);   void addTransition(size_t state0, size_t state1, char c);   void removeTransition(size_t state0, size_t state1, char c);   std::vector<std::pair<size_t, char>> getSuccessorsViaEmpty(size_t state);   std::vector<std::pair<size_t, char>> getSuccessors(size_t state);   // Build up automaton, recursively - void addPath(size_t state0, size_t state1, std::string s, std::string type); - void addPathOrTransition(size_t state0, size_t state1, std::string symbol, std::string type); - void addRule(const std::vector<std::string>& list, size_t list_index_from, size_t list_index_to, size_t state0, size_t state1, const std::string& rule_symbol, std::string type); + void addPath(size_t state0, size_t state1, std::string s); + void addPathOrTransition(size_t state0, size_t state1, std::string symbol); + void addRule(const std::vector<std::string>& list, size_t list_index_from, size_t list_index_to, size_t state0, size_t state1, const std::string& rule_symbol);   Token getToken(const std::string& s, Location& location);   void skipWhitespace(const std::string& s, Location& location); | 
