diff options
| -rw-r--r-- | bnf.cpp | 53 | ||||
| -rw-r--r-- | bnf.h | 1 | ||||
| -rw-r--r-- | grammer.cpp | 58 | ||||
| -rw-r--r-- | grammer.h | 10 | ||||
| -rw-r--r-- | lexer.cpp | 4 | ||||
| -rw-r--r-- | test-cpp.cpp | 2 | 
6 files changed, 96 insertions, 32 deletions
| @@ -68,6 +68,59 @@ BNF SubBNF(const BNF& bnf, const std::string& top)   return result;  } +namespace { + + bool isHeadRecursive(const std::vector<std::string>& list, const std::string& symbol) { +  if (list.size() > 0 && list[0] == symbol) +   return true; +  return false; + } + + bool isHeadRecursive(const std::vector<std::vector<std::string>>& lists, const std::string& symbol) { +  for (const auto& list: lists) { +   if (isHeadRecursive(list, symbol)) +    return true; +  } +  return false; + } + +} // anonymous namespace + +// Change head recursion to tail recursion +BNF removeHeadRecursion(const BNF& bnf) +{ + std::unordered_map<std::string, std::vector<std::vector<std::string>>> result; + + for (const auto& [from, to]: bnf) { +  if (isHeadRecursive(to, from)) { // modify rule by adding additional one +   std::string from_ext = from + "-ext"; +   if (bnf.find(from_ext) != bnf.end()) +    throw std::runtime_error("ICE: Symbol "s + from_ext + " already exists in original BNF"); + +   std::vector<std::vector<std::string>> to_new; +   std::vector<std::vector<std::string>> to_ext{{}}; // starts with empty list as first element +   for (const auto& list: to) { +    if (isHeadRecursive(list, from)) { +     std::vector<std::string> list_new{list.begin() + 1, list.end()}; +     list_new.push_back(from_ext); +     to_ext.push_back(list_new); +    } else { +     std::vector<std::string> list_new{list.begin(), list.end()}; +     list_new.push_back(from_ext); +     to_new.push_back(list_new); +    } +   } + +   result[from] = to_new; +   result[from_ext] = to_ext; +  } else { // just add +   result[from] = to; +  } + } + + return result; +} +  bool isTerminal(const BNF& bnf, const std::string& symbol)  {   return bnf.find(symbol) == bnf.end(); @@ -15,6 +15,7 @@ std::unordered_map<std::string, std::unordered_set<std::string>> Reverse(const B  std::unordered_map<std::string, std::unordered_set<std::string>> reverseFirst(const BNF& bnf);  BNF SubBNF(const BNF& bnf, const std::string& top); +BNF removeHeadRecursion(const BNF& bnf);  bool isTerminal(const BNF& bnf, const std::string& symbol);  std::unordered_set<std::string> getTerminals(const BNF& bnf); diff --git a/grammer.cpp b/grammer.cpp index 50617ce..d7afaef 100644 --- a/grammer.cpp +++ b/grammer.cpp @@ -92,7 +92,7 @@ void Compiler::DumpTree()  index_t Compiler::AddNode(const std::string& type, index_t variant, NodePosition pos)  { - auto& list { bnf[type][variant]}; + auto& list { m_bnf[type][variant]};   index_t node_id{nodes.size()};   if (nodes.size() > 0)    nodes[pos.node_id].child_ids[pos.child_pos] = node_id; @@ -149,7 +149,7 @@ void Compiler::IncNodePosition(NodePosition& pos)  size_t Compiler::minimumSymbolsNeeded(const std::string& symbol)  { - if (isTerminal(bnf, symbol)) { + if (isTerminal(m_bnf, symbol)) {    return 1;   } else {    auto it_min{m_min.find(symbol)}; @@ -157,8 +157,8 @@ size_t Compiler::minimumSymbolsNeeded(const std::string& symbol)     return it_min->second;    m_min[symbol] = std::numeric_limits<size_t>::max(); -  auto it{bnf.find(symbol)}; -  if (it != bnf.end()) { +  auto it{m_bnf.find(symbol)}; +  if (it != m_bnf.end()) {     size_t minimum{std::numeric_limits<size_t>::max()};     for (const auto& list: it->second) { @@ -196,7 +196,7 @@ bool Compiler::match(std::vector<std::string> symbol_list, size_t begin, size_t    symbol_list.erase(symbol_list.begin());   } - // matching of empty lists + // matching of empty list in non-terminals   if (symbol_list.size() == 0) {    if (begin == end) { // only match real empty list     // this is the point of the final match @@ -206,6 +206,19 @@ bool Compiler::match(std::vector<std::string> symbol_list, size_t begin, size_t    return false;   } + // matching of empty list in terminals + if (begin == end) { +  const auto& symbol{symbol_list.front()}; +  auto it{m_empty_lut.find(symbol)}; +  if (it == m_empty_lut.end()) // can't match empty tokens list with this nt-symbol +   return false; + +  AddNodeGuard guard(*this, it->second); +  std::vector<std::string> list {m_bnf[symbol][it->second]}; +  list.insert(list.end(), symbol_list.begin() + 1, symbol_list.end()); +  return match(list, begin, end); + } +   // now, symbol_list has size > 0 and contains non-terminal symbols at start and end   // resolve first symbol @@ -213,12 +226,11 @@ bool Compiler::match(std::vector<std::string> symbol_list, size_t begin, size_t   if (it != m_match_lut.end()) {    for (size_t i: it->second) {     AddNodeGuard guard(*this, i); -   std::vector<std::string> list {bnf[symbol_list.front()][i]}; +   std::vector<std::string> list {m_bnf[symbol_list.front()][i]};     list.insert(list.end(), symbol_list.begin() + 1, symbol_list.end());     if (minimumSymbolsNeeded(list) > end - begin) // stop recursion      continue; -   // TODO: recurse last?     if (match(list, begin, end))      return true;    } @@ -236,7 +248,7 @@ bool Compiler::match(std::string symbol, size_t begin, size_t end)  void Compiler::fillStartCache()  { - std::unordered_set<std::string> terminals {getTerminals(bnf)}; + std::unordered_set<std::string> terminals {getTerminals(m_bnf)};   { // follow terminal symbols    for (const std::string& terminal: terminals) { @@ -273,12 +285,13 @@ void Compiler::fillStartCache()   }   { // follow empty non-terminal symbols, and combine all found non-terminals with all terminals -  std::unordered_set<std::pair<std::string, size_t>, PairHash> current_set {getEmptyPositions(bnf)}; +  std::unordered_set<std::pair<std::string, size_t>, PairHash> current_set {getEmptyPositions(m_bnf)};    std::unordered_set<std::pair<std::string, size_t>, PairHash> done_set;    std::unordered_set<std::pair<std::string, size_t>, PairHash> next_set;    do {     for (const auto& current_pos: current_set) { +    m_empty_lut[current_pos.first] = current_pos.second;      for (const std::string& terminal: terminals) {       auto it_lut {m_match_lut.find({current_pos.first, terminal})};       if (it_lut == m_match_lut.end()) { // add new list @@ -286,7 +299,6 @@ void Compiler::fillStartCache()       } else { // extend list        it_lut->second.emplace_back(current_pos.second);       } -      }      auto it {reversedPosFirst.find(current_pos.first)}; @@ -317,24 +329,24 @@ void Compiler::constructTree()  {   symbol_variants_it = symbol_variants.begin(); - m_symbol_list = {m_top}; - m_symbol_list_pos = 0; + std::vector<std::string> symbol_list{m_top}; + index_t symbol_list_pos{0};   NodePosition tree_pos; - while (m_symbol_list_pos < m_symbol_list.size()) { -  std::string symbol{m_symbol_list[m_symbol_list_pos]}; + while (symbol_list_pos < symbol_list.size()) { +  std::string symbol{symbol_list[symbol_list_pos]}; -  if (isTerminal(bnf, symbol)) { +  if (isTerminal(m_bnf, symbol)) {     // Advance terminal symbol -   nodes[tree_pos.node_id].child_ids[tree_pos.child_pos] = ChildIdFromTokenId(m_symbol_list_pos); +   nodes[tree_pos.node_id].child_ids[tree_pos.child_pos] = ChildIdFromTokenId(symbol_list_pos);     IncNodePosition(tree_pos); -   m_symbol_list_pos++; +   symbol_list_pos++;    } else {     // Replace non-terminal symbol -   m_symbol_list.erase(m_symbol_list.begin() + m_symbol_list_pos); -   std::vector<std::string> list {bnf[symbol][*symbol_variants_it]}; -   m_symbol_list.insert(m_symbol_list.begin() + m_symbol_list_pos, list.begin(), list.end()); +   symbol_list.erase(symbol_list.begin() + symbol_list_pos); +   std::vector<std::string> list {m_bnf[symbol][*symbol_variants_it]}; +   symbol_list.insert(symbol_list.begin() + symbol_list_pos, list.begin(), list.end());     index_t node_id {AddNode(symbol, *symbol_variants_it, tree_pos)}; @@ -350,11 +362,9 @@ void Compiler::constructTree()  }  Compiler::Compiler(BNF& bnf, const std::string& top) - : bnf(bnf) + : m_bnf {removeHeadRecursion(bnf)}   , m_top(top) - //, ReverseBNF{Reverse(bnf)} - //, reversedFirst{reverseFirst(bnf)} - , reversedPosFirst{reversePosFirst(bnf)} + , reversedPosFirst{reversePosFirst(m_bnf)}  {   //   // prepare helper cache (TODO: remove this ugly workaround for remaining bad marker elements) @@ -39,11 +39,9 @@ private:   // Input   std::vector<Token> tokens; - BNF &bnf; // not const for access via operator[] + BNF m_bnf; // not const for access via operator[]   const std::string m_top; - //std::unordered_map<std::string, std::unordered_set<std::string>> ReverseBNF; // possible parent types of a given type; unused now: remove? - //std::unordered_map<std::string, std::unordered_set<std::string>> reversedFirst; // possible parent types of first childs of a given type   std::unordered_map<std::string, std::unordered_set<std::pair<std::string, index_t>, PairHash>> reversedPosFirst; // possible parent types of first childs of a given type   std::deque<index_t> symbol_variants; @@ -81,12 +79,14 @@ private:     return h0 ^ (h1 << 1) ^ (h2 << 2);    }   }; + // map combination of non-terminal+terminal symbol combination to list of + // possible BNF positions for the specified non-terminal symbol   std::unordered_map<std::pair<std::string, std::string>, std::vector<size_t>, PairHashSS> m_match_lut; + std::unordered_map<std::string, size_t> m_empty_lut; // list of non-terminal symbols that can lead to empty token list (maps to variant) +   void fillStartCache();   void constructTree(); - std::vector<std::string> m_symbol_list; - index_t m_symbol_list_pos{};  public:   Compiler(BNF& bnf, const std::string& top); @@ -85,14 +85,14 @@ void Lexer::addRule(const std::vector<std::string>& list, size_t list_index_from   for (size_t i = list_index_from; i < list_index_to - 1; i++) {    std::string symbol{list[i]};    if (symbol == rule_symbol) -   throw std::runtime_error("Recursion found but not allowed"); +   throw std::runtime_error("Recursion found but not allowed. Only head recursion allowed for lexer.");    size_t state{newState()};    addPathOrTransition(previousState, state, symbol);    previousState = state;   }   if (list.back() == rule_symbol) -  throw std::runtime_error("Recursion found but not allowed"); +  throw std::runtime_error("Tail recursion found but not allowed. Only head recursion allowed for lexer.");   // last transition   addPathOrTransition(previousState, state1, list.back()); diff --git a/test-cpp.cpp b/test-cpp.cpp index 8a9b7cb..e5b2a1a 100644 --- a/test-cpp.cpp +++ b/test-cpp.cpp @@ -42,7 +42,7 @@ TEST_F(CppTest, preprocessing_tokenize) {   auto nodes = cpp.analysis(tokens); - ASSERT_EQ(nodes.size(), 44); + ASSERT_EQ(nodes.size(), 60/*44*/);  }  TEST_F(CppTest, preprocessing_tokenize_compile_error) { | 
