diff options
| author | Roland Reichwein <mail@reichwein.it> | 2020-01-29 21:52:35 +0100 | 
|---|---|---|
| committer | Roland Reichwein <mail@reichwein.it> | 2020-01-29 21:52:35 +0100 | 
| commit | 1f3f7c2693686d847bf1a9bb3e47a023fe9d7992 (patch) | |
| tree | 032ee62d5c19b54f64b6fcb3a1aa941d2dc31c62 | |
| parent | b97f6b86b85553acd3863ee18a67b8868e0ea7b4 (diff) | |
New algo
| -rw-r--r-- | TODO | 6 | ||||
| -rw-r--r-- | grammer.cpp | 468 | ||||
| -rw-r--r-- | grammer.h | 65 | ||||
| -rw-r--r-- | test-lexer.cpp | 2 | 
4 files changed, 328 insertions, 213 deletions
| @@ -0,0 +1,6 @@ + + Token() = default; + Token(const std::string& s) { type = s; } // Assign type via "=" from string + +start symbol implicitly from bnf +validate bnf: no empty types or values, or empty target lists diff --git a/grammer.cpp b/grammer.cpp index 25fe837..8f38e3e 100644 --- a/grammer.cpp +++ b/grammer.cpp @@ -1,241 +1,344 @@  #include "grammer.h" +#include <algorithm> +  using namespace Gram; -void Tree::clear() { +void Compiler::clear() {   nodes.clear(); - root = 0; - last = 0; - node_num = 0; + root_node_id = 0; + + tokens_used = 0;  } -bool Tree::Valid(const std::string& Top) const { +std::string Compiler::GetTypeOfNode(index_t node_id) const +{ + return nodes[node_id].type; +} + +bool Compiler::IsRootNode(index_t node_id) const +{ + auto node& {nodes[node_id]}; + return node.parent_node_id == node.node_id; +} + +void Compiler::Validate() const {   // A program is non empty - if (node_num == 0) -  return false; + if (nodes.size() == 0) +  throw std::runtime_error(""); - // Start symbol on top - auto rootNode{nodes.find(root)}; - if (rootNode == nodes.end()) -  throw std::runtime_error("Node not found: "s + std::to_string(root)); + // Consistency check for nodes + if (root_node_id >= nodes.size()) +  throw std::runtime_error("Bad root node: "s + std::to_string(root_node_id) + " vs. "s + std::to_string(nodes.size())); - if (rootNode->second.token.type != Top) -  return false; + // Start symbol on top + if (GetTypeOfNode(root_node_id) != Top) +  throw std::runtime_error("Root node not start symbol!"); + + // All nodes filled + for (const auto& node: nodes) { +  if (node.child_ids.size() != bnf[node.type][node.variant].size()) +   throw std::runtime_error("Node not filled: "s + node.type + "["s + std::to_string(node.variant) + "]"s); + } +} - // All nodes filled (implies all leaves are terminal) - for (const auto& [index, node]: nodes) { -  if (node.childs.size() < node.child_types.size()) -   return false; // node not filled +void Compiler::DumpTree() +{ + std::cout << "=== Tree =====================================" << std::endl; + for (const auto& i : nodes) { +  std::cout << i.type << std::endl;   } +} - return true; +bool RootIsStartSymbol() +{ + return GetTypeOfNode(root_node_id) == Top;  } -bool Tree::AddFirstNode(const Token& token, const BNF& bnf, const std::map<std::string, std::set<std::string>>& reverseBNF) { - node_num ++; - root = node_num; - last = node_num; +bool ChildIdIsToken(int32_t child_id) +{ + return child_i < 0; +} - auto reverseRule{reverseBNF.find(token.type)}; - if (reverseRule == reverseBNF.end()) -  throw std::runtime_error("Reverse rule not found for "s + token.type); +index_t TokenIdFromChildId(int32_t child_id) +{ + return index_t(-child_id) - 1; +} - auto rule{bnf.find(token.type)}; - if (rule != bnf.end()) { // multiple variants! -  throw std::runtime_error("BNF rule for terminal symbol "s + token.type + " found."s); +int32_t ChildIdFromTokenId(index_t token_id) +{ + return -1 - int32_t(token_id); +} + +bool AllTokensUsed() +{ + return tokens_used == tokens.size(); +} + +bool Compiler::treeIsComplete() +{ + return RootIsStartSymbol() && AllTokensUsed(); +} + +std::vector<std::string>& Compiler::getNodeExpectedChilds(node_id) +{ + std::string& type = nodes[node_id].type; + index_t& variant = nodes[node_id].variant; + return bnf[type][variant]; +} + +// returns true if all childs are filled, recursively. Else false with to_fill as hint to node to fill +bool Compiler::subTreeIsComplete(index_t node_id, index_t& to_fill) +{ + for (const auto& i : nodes[node_id].child_ids) { +  if (!ChildIdIsToken(i)) { // consider subtrees +   if (!subTreeIsComplete(i, to_fill)) +    return false; // found incomplete subtree -> return it! +  }   } - nodes.emplace(root, TreeNode{0, std::vector<index_t>{}, std::vector<std::string>{}, token}); + + if (nodes[node_id].child_ids.size() < getNodeExpectedChilds(node_id).size()) { +  to_fill = node_id; +  return false; + } +   return true;  } -std::vector<TreeNode> Tree::getParentTreeNode(const BNF& bnf, const std::map<std::string, std::set<std::string>>& reverseBNF) { - std::vector<TreeNode> result; // default: empty +bool Compiler::StartsWith(const std::vector<Token>& tokens, const std::vector<std::string>& types) +{ + if (tokens.size() < types.size()) +  return false; - auto& root_name {nodes[root].token.type}; - auto bnfParents {reverseBNF.find(root_name)}; - if (bnfParents == reverseBNF.end()) -  return result; + auto [tokens_it, types_it] = std::mismatch(tokens.begin(), tokens.end(), types.begin(), types.end(), [](const Token& token, const std::string& s){ return token.type == s; }); + return types_it == types.end(); // no mismatch: tokens (types) start wit specified types list +} - for (const auto& parent_node_name : bnfParents->second) { -  auto lists {bnf.at(parent_node_name)}; -  for (const auto& list : lists) { -   if (list.size() > 0 && list[0] == root_name) { -    TreeNode node{0, std::vector<index_t>{root}, list, Token{parent_node_name}}; -    result.push_back(node); +void Compiler::AddFirstNode() +{ + root_node_id = 0; + + const std::string& child_type = tokens[0].type; + auto it = ReverseBNF.find(child_type); + if (it == ReverseBNF.end()) +  throw std::runtime_error("Type not found: "s + child_type + " ("s + tokens[0].value + ")"s); + + std::set<std::string>& alternatives_set {it->second}; + + std::string node_type; + index_t node_variant; + std::deque<std::string> alternatives; // only for valid elements from alternatives_set + std::vector<index_t> child_ids; + + for (const auto& type : alternatives_set) { +  const auto& variants{bnf[type]}; +  for (int i = 0; i < variants.size(); i++) { +   const std::vector<std::string> & variant{variants[i]}; +   if (StartsWith(tokens, variant)) { // match +    if (node_type == "") { +     node_type = type; +     node_variant = i; +     for (int token_id = 0; token_id < variant.size()) +      child_ids.push_back(ChildIdFromTokenId(token_id)); +    } else +     alternatives.push_back(type); // duplicates possible: variants of same type!     }    }   } - return result; + if (node_type == "") // no matching type found +  throw std::runtime_error("No matching first node found."); + + nodes.emplace_back({0, 0, node_type, node_variant, alternatives, child_ids}); + + tokens_used = child_ids.size();  } -index_t Tree::GetLast() { - index_t result {root}; +bool Compiler::AddRootNode() +{ + if (nodes.size() == 0) { +  AddFirstNode(); + } else { +  const std::string& child_type = nodes[root_node_id].type; // starting at old root +  auto it = ReverseBNF.find(child_type); +  if (it == ReverseBNF.end()) // this one doesn't have a parent, maybe a start symbol to discard? +   return false; + +  index_t old_root_node_id {root_node_id}; +  index_t new_root_node_id {nodes.size()}; +  nodes[root_node_id].parent_node_id = new_root_node_id; + +  std::set<std::string>& alternatives_set {it->second}; + +  std::string node_type; +  index_t node_variant; +  std::deque<std::string> alternatives; // only for valid elements from alternatives_set +  std::vector<index_t> child_ids{1, old_root_node_id}; + +  for (const auto& type : alternatives_set) { +   const auto& variants{bnf[type]}; +   for (int i = 0; i < variants.size(); i++) { +    const std::vector<std::string> & variant{variants[i]}; +    if (child_type == variant[0]) { +     if (node_type == "") { +      node_type = type; +      node_variant = i; +     } else +      alternatives.push_back(type); // duplicates possible: variants of same type +    } +   } +  } - while(result != 0 && nodes[result].childs.size() >= 2) { -  result = nodes[result].childs[nodes[result].childs.size() - 1]; +  if (node_type == "") // no matching type found +   return false; + +  // now add!  +  root_node_id = new_root_node_id; +  nodes.emplace_back({root_node_id, root_node_id, node_type, node_variant, alternatives, child_ids}); +  // keep tokens_used as is   } - return result; + return true;  } -void Tree::AddRootNode(const TreeNode& newRootNode) { - node_num++; - nodes[node_num] = newRootNode; - root = node_num; - last = node_num; +void RemoveLastNode() +{ + TreeNode& node {nodes.back()}; + index_t node_id = node.node_id; +  + if (node_id == root_node_id) { // No parent -> remove root +  if (node.child_ids().empty()) { // No children -> now empty +   clear(); +  } else if (node.child_ids().size() == 1) { // One child: removing possible +   root_node_id = node.child_ids()[0]; +   nodes.pop_back(); +  } else +   throw std::runtime_error("Backtrack not possible: Root not empty"); // ICE + } else if (node.child_ids().empty()) { // No children -> remove leaf +  // We have a parent, otherwise we would have taken previous branch +  TreeNode& parent {nodes[node.parent_node_id]}; +  if (parent.child_ids().empty() || parent.child_ids().last() != node_id) +   throw std::runtime_error("Backtrack: Bad child nodes"); // ICE +  parent.childs_ids().pop_back(); +  nodes.pop_back(); + } else { // In the middle +  throw std::runtime_error("Backtrack in the middle of the tree."); // ICE + }  } -void Tree::RemoveRootNode() { - root = nodes[root].childs[0]; - nodes.erase(node_num); - node_num--; - last = GetLast(); +// Change type of last node according to alternatives +void ChangeNodeType() +{ + TreeNode& node {nodes.back()}; + index_t node_id = node.node_id; + + if (node.alternative_types.empty()) +  throw std::runtime_error("No alternatives found during Backtrack"); // ICE + + if (node.alternative_types.front() == node.type) { // Keep type, change variant +  if (root_node_id == node_id) { // Root node +   // TODO ... +  } else if (node.child_ids().empty()) { // leaf node +  } else +   throw std::runtime_error("Backtrack: Can't set alternative in the middle of the tree."); + } else { // Different type +  if (root_node_id == node_id) { // Root node +  } else if (node.child_ids().empty()) { // leaf node +  } else +   throw std::runtime_error("Backtrack: Can't set alternative in the middle of the tree."); + } +} + +// throws if no further track back is possible: compile error +void Compiler::TrackBack() +{ + // Search backwards for alternatives: last node with alternatives (variant or alt. token) + while (!nodes.empty() && nodes.last().alternative_types.empty()) { +  RemoveLastNode(); + } + + if (nodes.empty()) { +  throw std::runtime_error("Compile error: Invalid program."); + } + + ChangeNodeType();  } -// Path from leaf to root -std::vector<std::string> Tree::GetPath(std::string a, std::string b, const BNF& bnf, const std::map<std::string, std::set<std::string>>& reverseBNF) { +// returns list from lower (including) to upper (excluding) +// returns empty list on fail +std::vector<std::string> Compiler::GetPath(std::string upper, std::string lower) {   std::vector<std::string> result; - while (a != b) { -  auto parents {reverseBNF.find(a)}; -  if (parents == reverseBNF.end()) + while (lower != upper) { +  auto parents {ReverseBNF.find(lower)}; +  if (parents == ReverseBNF.end())     return {}; +  std::string new_lower;    bool hit{false};    for (const auto& parent : parents->second) {     for (const auto& list : bnf.at(parent)) { -    if (list.size() > 0 && list[0] == a) { +    if (list.size() > 0 && list[0] == lower) {       if (!hit) { -      result.push_back(a); -      a = parent; +      result.push_back(lower); +      new_lower = parent;        hit = true;       } else -      throw std::runtime_error("Double match for "s + parent + "/"s + a); +      throw std::runtime_error("Double match for "s + lower + ": "s + parent + ", "s + new_lower); // TODO: also get alternatives at each step      }     }    } - } - if (a == b) { -  result.push_back(a); +  lower = new_lower;   }   return result;  } -index_t Tree::AddNode(const std::string& name, const std::string& child_name, index_t parent_index, const BNF& bnf, const std::map<std::string, std::set<std::string>>& reverseBNF) +index_t Compiler::AddNode(const std::string& name, const std::string& child_type, index_t parent_index)  {   TreeNode& parent {nodes[parent_index]}; - node_num++; - index_t index = node_num; - parent.childs.push_back(index); - std::vector<std::string> child_types; - auto rule {bnf.find(name)}; - if (rule != bnf.end()) { -  for (auto& list : rule->second) { -   if (list.size() > 0 && list[0] == child_name) -    child_types = list; -  } - } - nodes.emplace(index, TreeNode{parent_index, {}, child_types, Token{name}}); - //root stays - last = GetLast(); + index_t index = nodes.size(); + parent.child_ids.push_back(index); - return index; -} + index_t variant; + std::deque<std::string> alternatives; -void Tree::AddPath(const std::vector<std::string>& path, index_t current_index, const BNF& bnf, const std::map<std::string, std::set<std::string>>& reverseBNF) { - for (int i = path.size() - 1; i >= 0; i--) { -  std::string child_name; -  if (i > 0) -   child_name = path[i - 1]; -  current_index = AddNode(path[i], child_name, current_index, bnf, reverseBNF); + const auto& lists { bnf[parent.type] }; + for (int i = 0; i < lists.size(); i++) { // variants +  if (lists[i].size() > 0 && lists[i][0] == child_type) +   variant = i;   } -} -// try to add Node to tree -bool Tree::Add(const Token& token, const BNF& bnf, const std::map<std::string, std::set<std::string>>& reverseBNF) { - if (nodes.empty()) { // first node -  return AddFirstNode(token, bnf, reverseBNF); - } else { // at least one character is already present -  // Traverse tree until partially filled node found -  // or new node can be added -  index_t current_index{last}; - -  while (current_index != 0) { -   TreeNode& node {nodes[current_index]}; -   if (node.childs.size() < node.child_types.size()) { // partially filled node -    std::vector<std::string> list = GetPath(token.type, node.child_types[node.childs.size()], bnf, reverseBNF); -    if (list.size() > 0) { -     AddPath(list, current_index, bnf, reverseBNF); -     return true; -    } else { -     return false; // The path a->b is not available via bnf -    } -   } -   current_index = node.parent; -  } - -  // Add node at root - -  std::vector<TreeNode> parent_nodes = getParentTreeNode(bnf, reverseBNF); -  if (parent_nodes.size() == 0) -   throw std::runtime_error("Couldn't add new parent node for "s + nodes[root].token.type + "("s + std::to_string(root) + ")"s); + nodes.emplace_back({parent_index, index, child_type, variant, alternatives, {}}); + //root stays, tokens_used stays -  for (const auto &i : parent_nodes) { -   AddRootNode(i); -   if (Add(token, bnf, reverseBNF)) -    return true; -   RemoveRootNode(); -  } - - } - return false; + return index;  } -// add path to start symbol -void Tree::Resolve(const BNF& bnf, const std::map<std::string, std::set<std::string>>& reverseBNF) { - if (nodes.empty()) // only handle non-empty trees -  return; - - while (true) { -  std::string& old_root_name { nodes[root].token.type }; // current root node type - -  auto parents {reverseBNF.find(old_root_name)}; -  if (parents != reverseBNF.end()) { // parents in bnf available -   bool hit{false}; -   for (auto& parent : parents->second) { -    for (const auto& list : bnf.at(parent)) { -     if (list.size() == 1 && list[0] == old_root_name) { -      if (!hit) { -       // Add new TreeNode in the direction to root: -       // New root with 1 child (old root) -       nodes.emplace(++node_num, -                    TreeNode{0, // parent -                             std::vector<index_t>{root}, // child indices -                             std::vector<std::string>{old_root_name}, // child types -                             Token{parent} -                    }); -       nodes[root].parent = node_num; -       root = node_num; -       // this->last stays -       hit = true; -      } else -       throw std::runtime_error("Error: Multiple resolve nodes for "s + old_root_name); -     } -    } -   } -   if (!hit) -    break; -  } else -   break; +void Compiler::AddPath(const std::vector<std::string>& path, index_t current_index) { + for (int i = path.size() - 1; i > 0; i--) { +  std::string child_name = path[i - 1]; +  current_index = AddNode(path[i], child_name, current_index);   } + + nodes.back().child_ids.emplace_back(ChildIdFromTokenId(tokens_used)); + tokens_used++;  } -void Tree::Dump() +bool Compiler::FillTree()  { - std::cout << "=== Tree =====================================" << std::endl; - for (const auto& i : nodes) { -  std::cout << i.second.token.type << " (" << i.second.token.value << ")" << std::endl; + if (nodes.size() == 0) // ignore empty tree, successfully +  return true; + + index_t to_fill; + + while (!subTreeIsComplete(root_node_id, to_fill)) { +  auto list = GetPath(nodes[to_fill].type, tokens[tokens_used]); +  if (list.size() > 0) { +   AddPath(list, to_fill); +   return true; +  } else { +   return false; +  }   }  } @@ -245,22 +348,23 @@ Compiler::Compiler(const BNF& bnf, const std::string& Top): bnf(bnf), Top(Top),  Tree Compiler::compile(std::vector<Token> Tokens)  { - if (Tokens.size()){ - } else -  throw std::runtime_error("No tokens!"); + clear(); + tokens = Tokens; - Tree tree; + if (tokens.size() == 0) +  throw std::runtime_error("No tokens!"); - for (const Token& i : Tokens) { -  if (!tree.Add(i, bnf, ReverseBNF)) { -   //tree.Dump(); -   throw std::runtime_error("Compile error: Invalid token "s + i.type); -  } + while (!treeIsComplete()) { +  if (!FillTree()) +   TrackBack(); +  else if (!AddRootNode()) +   TrackBack(); +  else if (!FillTree()) +   TrackBack();   } - tree.Resolve(bnf, ReverseBNF); - if (!tree.Valid(Top)) -  throw std::runtime_error("Compile error: Program incomplete"); + Validate();   return tree;  } + @@ -3,51 +3,58 @@  #include "bnf.h"  #include "minicc.h" +#include <cstdint> +  namespace Gram { +// TreeNodes are only intermediate. Terminal symbols don't get of TreeNodes +// token_id: index into token list +// node_id: index into tree node list  struct TreeNode { - index_t parent{}; - std::vector<index_t> childs; // fill Token by Token - std::vector<std::string> child_types; // fill always - Token token; -}; - -class Tree { -private: - std::map<index_t, TreeNode> nodes; // index 0 = non existing; index starting at 1 - index_t node_num{}; - index_t root{}; - index_t last{}; + index_t parent_node_id{}; // root if parent_node_id == node_id + index_t node_id{}; -public: - void clear(); - bool Valid(const std::string& Top) const; - bool AddFirstNode(const Token& token, const BNF& bnf, const std::map<std::string, std::set<std::string>>& reverseBNF); - std::vector<TreeNode> getParentTreeNode(const BNF& bnf, const std::map<std::string, std::set<std::string>>& reverseBNF); - index_t GetLast(); - void AddRootNode(const TreeNode& newRootNode); - void RemoveRootNode(); - std::vector<std::string> GetPath(std::string a, std::string b, const BNF& bnf, const std::map<std::string, std::set<std::string>>& reverseBNF); - index_t AddNode(const std::string& name, const std::string& child_name, index_t parent_index, const BNF& bnf, const std::map<std::string, std::set<std::string>>& reverseBNF); - void AddPath(const std::vector<std::string>& path, index_t current_index, const BNF& bnf, const std::map<std::string, std::set<std::string>>& reverseBNF); - bool Add(const Token& token, const BNF& bnf, const std::map<std::string, std::set<std::string>>& reverseBNF); - void Resolve(const BNF& bnf, const std::map<std::string, std::set<std::string>>& reverseBNF); - - void Dump(); + std::string type; + index_t variant; // bnf[type][variant] + std::deque<std::string> alternative_types; // alternatives that we can consider if type fails. Discard after parsing! + std::vector<int32_t> child_ids; // < 0: terminal: token_id; >= 0: non-terminal: node_id; fill token by token  };  class Compiler  {  private: + // The result + index_t root_node_id{}; + std::vector<TreeNode> nodes; +  + // Input + std::vector<std::string> tokens; + + // Helper data + index_t tokens_used{0}; // number of tokens used, this is also the next token index to use +   const BNF &bnf;   const std::string& Top; - std::map<std::string, std::set<std::string>> ReverseBNF; + std::map<std::string, std::set<std::string>> ReverseBNF; // possible parent types of a given type  public: + // Tree specific + void clear(); +  + // Node specific + std::string GetTypeOfNode(index_t node_id) const; + + index_t TrackBack(); +  + void Validate(const std::string& Top, const BNF& bnf) const; + + void Dump(); +   Compiler(const BNF& bnf, const std::string& Top); - Tree compile(std::vector<Token> Tokens); + + std::pair<index_t, std::vector<TreeNode>> compile(std::vector<Token> Tokens);  };  } // namespace Gram diff --git a/test-lexer.cpp b/test-lexer.cpp index e85fc5c..3a4ed2a 100644 --- a/test-lexer.cpp +++ b/test-lexer.cpp @@ -95,8 +95,6 @@ TEST_F(Test, BNF) {   Gram::Compiler compiler(bnf, Top);   auto Tree = compiler.compile(tokens); - ASSERT_TRUE(Tree.Valid(Top)); -   Tree.Dump();  } | 
