diff options
| author | Roland Reichwein <mail@reichwein.it> | 2020-03-27 17:39:58 +0100 | 
|---|---|---|
| committer | Roland Reichwein <mail@reichwein.it> | 2020-03-27 17:39:58 +0100 | 
| commit | 3057729f132d516dd9ed58c6964a495aa1c11c3d (patch) | |
| tree | 7eca9a03fd08200868a5ab0c27e33170b9dc917a | |
| parent | 5467147d9470ee294ddab938098c3ef172222066 (diff) | |
Top-down algo recognizes first C*+ program
| -rw-r--r-- | cpp.cpp | 2 | ||||
| -rw-r--r-- | cpp.h | 2 | ||||
| -rw-r--r-- | grammer.cpp | 117 | ||||
| -rw-r--r-- | grammer.h | 21 | ||||
| -rw-r--r-- | test-cpp.cpp | 4 | 
5 files changed, 114 insertions, 32 deletions
| @@ -213,7 +213,7 @@ std::vector<Token> CPP::tokens_from_pptokens(std::vector<Token> pp_tokens)  }  // Phase 7.b: Grammar Analysis -std::pair<index_t, std::vector<Gram::TreeNode>> CPP::analysis(std::vector<Token> tokens) +std::vector<Gram::TreeNode> CPP::analysis(std::vector<Token> tokens)  {   auto bnf = SubBNF(CPPBNF::GetCppBNFGram(), "translation-unit"); @@ -22,7 +22,7 @@ void preprocess(); // phase 4  void execution_charset_map(); // phase 5  void concatenate_strings(); // phase 6  std::vector<Token> tokens_from_pptokens(std::vector<Token> pp_tokens); // phase 7.a -std::pair<index_t, std::vector<Gram::TreeNode>> analysis(std::vector<Token>); // phase 7.b +std::vector<Gram::TreeNode> analysis(std::vector<Token>); // phase 7.b  void translate(); // phase 7.c  void instantiate(); // phase 8  void link(); // phase 9 diff --git a/grammer.cpp b/grammer.cpp index 3cc88fa..cd5b50f 100644 --- a/grammer.cpp +++ b/grammer.cpp @@ -15,8 +15,8 @@ void Debug(std::string s)  void Compiler::clear()  { + symbol_variants.clear();   nodes.clear(); - begin_pos = NodePosition{};  }  std::string Compiler::GetTypeOfNode(index_t node_id) const @@ -26,11 +26,21 @@ std::string Compiler::GetTypeOfNode(index_t node_id) const   return nodes[node_id].type;  } +bool Gram::ChildIdIsEmpty(int32_t child_id) +{ + return child_id == 0; +} +  bool Gram::ChildIdIsToken(int32_t child_id)  {   return child_id < 0;  } +bool Gram::ChildIdIsNode(int32_t child_id) +{ + return child_id > 0; +} +  index_t Gram::TokenIdFromChildId(int32_t child_id)  {   return index_t(-child_id) - 1; @@ -94,33 +104,53 @@ index_t Compiler::AddNode(const std::string& type, index_t variant, NodePosition    nodes[pos.node_id].child_ids[pos.child_pos] = node_id;   nodes.emplace_back(TreeNode{pos, node_id, type, variant, std::vector<int32_t>(size_t(list.size()), 0)}); - Debug("AddNode(): "s + nodes[pos.node_id].type + "->"s + type + ": "s + std::to_string(node_id)); - DumpTree(); -   return node_id;  } -void Compiler::RemoveNode() +Compiler::AddNodeGuard::AddNodeGuard(Compiler& compiler, index_t variant): m_compiler(compiler)  { - auto& pos{nodes.back().pos}; - - nodes[pos.node_id].child_ids[pos.child_pos] = 0; - nodes.pop_back(); -} - -Compiler::AddNodeGuard::AddNodeGuard(Compiler& compiler, const std::string& type, index_t variant, NodePosition pos): m_compiler(compiler) -{ - m_compiler.AddNode(type, variant, pos); + m_compiler.symbol_variants.push_back(variant);  }  Compiler::AddNodeGuard::~AddNodeGuard()  { - m_compiler.RemoveNode(); + m_compiler.symbol_variants.pop_back();  }  void Compiler::IncNodePosition(NodePosition& pos)  { - // TODO + if (nodes.size() == 0) // special case: empty tree  +  return; + if (pos.node_id >= nodes.size()) +  throw std::runtime_error("ICE: NodePosition with node_id "s + std::to_string(pos.node_id) + " doesn't exist."s); + if (pos.child_pos >= nodes[pos.node_id].child_ids.size()) +  throw std::runtime_error("ICE: NodePosition with child_pos "s + std::to_string(pos.child_pos) + " in node_id "s + std::to_string(pos.node_id) + " doesn't exist."s); + + int32_t child_id{nodes[pos.node_id].child_ids[pos.child_pos]}; + + if (ChildIdIsEmpty(child_id)) +  throw std::runtime_error("ICE: NodePosition is empty"); + + // Actually, advance + if (ChildIdIsToken(child_id)) { +  pos.child_pos++; + } else { +  pos.node_id = child_id; +  pos.child_pos = 0; + } +  + // Go to parent if child ids completely traversed + while (pos.node_id > 0 && pos.child_pos >= nodes[pos.node_id].child_ids.size()) { +  pos = nodes[pos.node_id].pos; +  pos.child_pos++; + } + + // Advancing at root node for last child is allowed: Finished + if (pos.child_pos >= nodes[pos.node_id].child_ids.size()) +  return; + + if (ChildIdIsNode(nodes[pos.node_id].child_ids[pos.child_pos])) +  throw std::runtime_error("ICE: No node expected at "s + std::to_string(pos.node_id) + "/"s + std::to_string(pos.child_pos));  }  size_t Compiler::minimumSymbolsNeeded(std::string symbol) @@ -170,7 +200,6 @@ bool Compiler::match(std::vector<std::string> symbol_list, size_t begin, size_t   while (begin < end && symbol_list.size() > 0 && symbol_list.front() == tokens[begin].type) {    begin++;    symbol_list.erase(symbol_list.begin()); -  IncNodePosition(begin_pos); // TODO: guard?   }   // match terminal symbols at end @@ -181,8 +210,11 @@ bool Compiler::match(std::vector<std::string> symbol_list, size_t begin, size_t   // matching of empty lists   if (symbol_list.size() == 0) { -  if (begin == end) // only match real empty list +  if (begin == end) { // only match real empty list +   // this is the point of the final match +   constructTree();     return true; +  }    return false;   } @@ -192,7 +224,7 @@ bool Compiler::match(std::vector<std::string> symbol_list, size_t begin, size_t   auto it{bnf.find(symbol_list.front())};   if (it != bnf.end()) {    for (size_t i = 0; i < it->second.size(); i++) { // iterate over alternatives -   //AddNodeGuard guard(*this, symbol_list.front(), i, begin_pos); +   AddNodeGuard guard(*this, i);     std::vector<std::string> list {it->second[i]};     list.insert(list.end(), symbol_list.begin() + 1, symbol_list.end());     if (minimumSymbolsNeeded(list) > end - begin) // stop recursion @@ -214,6 +246,42 @@ bool Compiler::match(std::string symbol, size_t begin, size_t end)   return match(symbol_list, begin, end);  } +void Compiler::constructTree() +{ + symbol_variants_it = symbol_variants.begin(); +  + m_symbol_list = {m_top}; + m_symbol_list_pos = 0; + + NodePosition tree_pos; + + while (m_symbol_list_pos < m_symbol_list.size()) { +  std::string symbol{m_symbol_list[m_symbol_list_pos]}; + +  if (isTerminal(bnf, symbol)) { +   // Advance terminal symbol +   nodes[tree_pos.node_id].child_ids[tree_pos.child_pos] = ChildIdFromTokenId(m_symbol_list_pos); +   IncNodePosition(tree_pos); +   m_symbol_list_pos++; +  } else { +   // Replace non-terminal symbol +   m_symbol_list.erase(m_symbol_list.begin() + m_symbol_list_pos); +   std::vector<std::string> list {bnf[symbol][*symbol_variants_it]}; +   m_symbol_list.insert(m_symbol_list.begin() + m_symbol_list_pos, list.begin(), list.end()); + +   index_t node_id {AddNode(symbol, *symbol_variants_it, tree_pos)}; +    +   if (node_id > 0) { +    nodes[tree_pos.node_id].child_ids[tree_pos.child_pos] = node_id; +    IncNodePosition(tree_pos); +   } +    +   symbol_variants_it++; +  } + } + +} +  Compiler::Compiler(BNF& bnf, const std::string& top): bnf(bnf), m_top(top), ReverseBNF{Reverse(bnf)}, reversedFirst{reverseFirst(bnf)}  {   //std::cout << "DEBUG: " << m_top << std::endl; @@ -234,7 +302,7 @@ Compiler::Compiler(BNF& bnf, const std::string& top): bnf(bnf), m_top(top), Reve   minimumSymbolsNeeded("translation-unit");  } -std::pair<index_t, std::vector<TreeNode>> Compiler::compile(std::vector<Token> p_tokens) +std::vector<TreeNode> Compiler::compile(std::vector<Token> p_tokens)  {   clear();   tokens = p_tokens; @@ -243,11 +311,16 @@ std::pair<index_t, std::vector<TreeNode>> Compiler::compile(std::vector<Token> p    throw std::runtime_error("No tokens!");   // - // top-down algorithm + // top-down algorithm: + // + // 1. Match linear tokens list to bnf, building up list of used variants (symbol_variants) + // 2. Construct Node Tree from symbol_variants   //   if (!match(m_top, 0, tokens.size()))    throw std::runtime_error("Compile error."); - return std::pair<index_t, std::vector<TreeNode>>{0, nodes}; + //DumpTree(); + + return nodes;  } @@ -4,6 +4,7 @@  #include "minicc.h"  #include <cstdint> +#include <deque>  #include <unordered_set>  #include <utility> @@ -16,7 +17,7 @@ struct NodePosition {   index_t child_pos{}; // 0-based  }; -// TreeNodes are only intermediate. Terminal symbols don't get of TreeNodes +// TreeNodes are only intermediate. Terminal symbols don't get TreeNodes  // token_id: index into token list  // node_id: index into tree node list  struct TreeNode { @@ -43,24 +44,24 @@ private:   std::unordered_map<std::string, std::set<std::string>> ReverseBNF; // possible parent types of a given type; unused now: remove?   std::unordered_map<std::string, std::set<std::string>> reversedFirst; // possible parent types of first childs of a given type + std::deque<index_t> symbol_variants; + decltype(symbol_variants)::iterator symbol_variants_it; +   // Tree specific   void clear();   // Node specific   std::string GetTypeOfNode(index_t node_id) const;   index_t AddNode(const std::string& type, index_t variant, NodePosition pos = {}); - void RemoveNode(); - // Adds Node and Removes it on scope exit (RAII) + // Adds actually used Non-Terminal Symbol Removes it on scope exit (RAII)   class AddNodeGuard {    Compiler& m_compiler;   public: -  AddNodeGuard(Compiler& compiler, const std::string& type, index_t variant, NodePosition pos); +  AddNodeGuard(Compiler& compiler, index_t variant);    ~AddNodeGuard();   };   void IncNodePosition(NodePosition& pos); - NodePosition begin_pos; -   // top-down algorithm   std::unordered_map<std::string, size_t> m_min; // cache   size_t minimumSymbolsNeeded(std::string symbol); @@ -68,15 +69,21 @@ private:   bool match(std::string symbol, size_t begin, size_t end);   bool match(std::vector<std::string> symbol_list, size_t begin, size_t end); + void constructTree(); + std::vector<std::string> m_symbol_list; + index_t m_symbol_list_pos{}; +  public:   Compiler(BNF& bnf, const std::string& top); - std::pair<index_t, std::vector<TreeNode>> compile(std::vector<Token> p_tokens); + std::vector<TreeNode> compile(std::vector<Token> p_tokens);   void DumpTree();   friend class ::GrammerTest;  }; +bool ChildIdIsEmpty(int32_t child_id);  bool ChildIdIsToken(int32_t child_id); +bool ChildIdIsNode(int32_t child_id);  index_t TokenIdFromChildId(int32_t child_id);  int32_t ChildIdFromTokenId(index_t token_id); diff --git a/test-cpp.cpp b/test-cpp.cpp index 2a67b38..47a57f5 100644 --- a/test-cpp.cpp +++ b/test-cpp.cpp @@ -46,7 +46,9 @@ TEST_F(CppTest, preprocessing_tokenize) {   }  #endif - auto result = cpp.analysis(tokens); + auto nodes = cpp.analysis(tokens); +  + ASSERT_EQ(nodes.size(), 44);  }  #endif | 
