diff options
| -rw-r--r-- | Makefile | 1 | ||||
| -rw-r--r-- | cpp.cpp | 2 | ||||
| -rw-r--r-- | grammer.cpp | 136 | ||||
| -rw-r--r-- | grammer.h | 19 | ||||
| -rw-r--r-- | test-cpp.cpp | 4 | ||||
| -rw-r--r-- | test-grammer.cpp | 65 | 
6 files changed, 215 insertions, 12 deletions
| @@ -49,6 +49,7 @@ SRC=\      cppbnf.cpp \      test-cppbnf.cpp \      grammer.cpp \ +    test-grammer.cpp \      lexer.cpp \      test-lexer.cpp \      minicc.cpp \ @@ -198,7 +198,7 @@ std::vector<Token> CPP::tokens_from_pptokens(std::vector<Token> pp_tokens)    if (pp_types.find(token.type) != pp_types.end()) {     if (token.type == "identifier") {      if (keywords.find(token.value) != keywords.end()) -     result.emplace_back(token.value, token.value); +     result.emplace_back(Token{token.value, token.value});      else       result.emplace_back(Token{"identifier"s, token.value});     } diff --git a/grammer.cpp b/grammer.cpp index 5557b12..cc285b9 100644 --- a/grammer.cpp +++ b/grammer.cpp @@ -1,6 +1,7 @@  #include "grammer.h"  #include <algorithm> +#include <limits>  using namespace Gram; @@ -44,7 +45,7 @@ void Compiler::Validate() const    throw std::runtime_error("Bad root node: "s + std::to_string(root_node_id) + " vs. "s + std::to_string(nodes.size()));   // Start symbol on top - if (GetTypeOfNode(root_node_id) != Top) + if (GetTypeOfNode(root_node_id) != m_top)    throw std::runtime_error("Root node not start symbol!");   // All nodes filled @@ -56,7 +57,7 @@ void Compiler::Validate() const  bool Compiler::rootIsStartSymbol() const  { - return GetTypeOfNode(root_node_id) == Top; + return GetTypeOfNode(root_node_id) == m_top;  }  bool Gram::ChildIdIsToken(int32_t child_id) @@ -493,18 +494,136 @@ bool Compiler::FillTree()   return true;  } -Compiler::Compiler(BNF& bnf, const std::string& Top): bnf(bnf), Top(Top), ReverseBNF{Reverse(bnf)}, reversedFirst{reverseFirst(bnf)} +size_t Compiler::minimumSymbolsNeeded(std::string symbol)  { + if (isTerminal(bnf, symbol)) { +  return 1; + } else { +  auto it_min{m_min.find(symbol)}; +  if (it_min != m_min.end()) +   return it_min->second; +  m_min[symbol] = std::numeric_limits<size_t>::max(); + +  auto it{bnf.find(symbol)}; +  if (it != bnf.end()) { +   size_t minimum{std::numeric_limits<size_t>::max()}; +    +   for (const auto& list: it->second) { +    minimum = std::min(minimum, minimumSymbolsNeeded(list)); +   } + +   m_min[symbol] = minimum; + +   return minimum; +  } else +   throw std::runtime_error("ICE: Symbol "s + symbol + " expected in BNF"s); + } +} + +size_t Compiler::minimumSymbolsNeeded(std::vector<std::string> symbol_list) +{ + size_t result{0}; + + for (const auto& symbol: symbol_list) { +  size_t min { minimumSymbolsNeeded(symbol) }; +  if (min == std::numeric_limits<size_t>::max()) +   return min; +  result += min; + } + + return result; +} + +/// begin, end: indexes in tokens list +bool Compiler::match(std::vector<std::string> symbol_list, size_t begin, size_t end) +{ + // TODO: isTerminal() necessary here? +  + std::cout << "DEBUG: Matching:"; + for (auto symbol: symbol_list) { +  std::cout << " |" << symbol << "|"; + } + std::cout << std::endl; +  + // match terminal symbols at start + while (begin < end && isTerminal(bnf, tokens[begin].type) && symbol_list.size() > 0 && symbol_list.front() == tokens[begin].type) { +  begin++; +  symbol_list.erase(symbol_list.begin()); + } + + // match terminal symbols at end + while (begin < end && isTerminal(bnf, tokens[end - 1].type) && symbol_list.size() > 0 && symbol_list.back() == tokens[end - 1].type) { +  end--; +  symbol_list.erase(symbol_list.end() - 1); + } + + // matching of empty lists + if (symbol_list.size() == 0) { +  if (begin == end) // only match real empty list +   return true; +  return false; + } + + // now, symbol_list[begin .. end - 1] has size > 0 and contains non-terminal symbols at start and end +  + // resolve first symbol + auto it{bnf.find(symbol_list.front())}; + if (it != bnf.end()) { +  for (std::vector<std::string> list: it->second) { // iterate over alternatives +   std::cout << "ALTERNATIVE for " << symbol_list.front() << " with " << list.size()  << std::endl; +   list.insert(list.end(), symbol_list.begin() + 1, symbol_list.end()); +   std::cout << "Min: " << minimumSymbolsNeeded(list) << ", end-begin: " << (end-begin)  << std::endl; +   if (minimumSymbolsNeeded(list) > end - begin) // stop recursion  +    continue; + +   // TODO: recurse last? +   if (match(list, begin, end)) +    return true; +  } + } else +  return false; // terminal symbol not found in bnf, non-matching +  + return false; // no match found +} + +bool Compiler::match(std::string symbol, size_t begin, size_t end) +{ + std::vector<std::string> symbol_list{symbol}; + return match(symbol_list, begin, end); +} + +Compiler::Compiler(BNF& bnf, const std::string& top): bnf(bnf), m_top(top), ReverseBNF{Reverse(bnf)}, reversedFirst{reverseFirst(bnf)} +{ + //std::cout << "DEBUG: " << m_top << std::endl; +  + // + // prepare helper cache (TODO: remove this ugly workaround for remaining bad marker elements) + // + minimumSymbolsNeeded("translation-unit"); + // remove bad marker elements + auto it{m_min.begin()}; + while (it != m_min.end()) { +  if (it->second == std::numeric_limits<size_t>::max()) { +   it = m_min.erase(it); +  } else { +   ++it; +  } + } + minimumSymbolsNeeded("translation-unit");  } -std::pair<index_t, std::vector<TreeNode>> Compiler::compile(std::vector<Token> Tokens) +std::pair<index_t, std::vector<TreeNode>> Compiler::compile(std::vector<Token> p_tokens)  {   clear(); - tokens = Tokens; + tokens = p_tokens;   if (tokens.size() == 0)    throw std::runtime_error("No tokens!"); +#if 0 + // + // bottom-up algorithm + //   while (!treeIsComplete()) {    if (!FillTree()) {     TrackBack(); @@ -514,6 +633,13 @@ std::pair<index_t, std::vector<TreeNode>> Compiler::compile(std::vector<Token> T     TrackBack();    }   } +#else + // + // top-down algorithm + // + if (!match(m_top, 0, tokens.size())) +  throw std::runtime_error("Compile error."); +#endif   Validate(); @@ -4,8 +4,11 @@  #include "minicc.h"  #include <cstdint> +#include <unordered_set>  #include <utility> +class GrammerTest; +  namespace Gram {  // TreeNodes are only intermediate. Terminal symbols don't get of TreeNodes @@ -23,7 +26,6 @@ struct TreeNode {  class Compiler  { -  private:   // The result   index_t root_node_id{}; @@ -36,7 +38,7 @@ private:   index_t tokens_used{0}; // number of tokens used, this is also the next token index to use   BNF &bnf; // not const for access via operator[] - const std::string& Top; + const std::string m_top;   std::unordered_map<std::string, std::set<std::string>> ReverseBNF; // possible parent types of a given type; unused now: remove?   std::unordered_map<std::string, std::set<std::string>> reversedFirst; // possible parent types of first childs of a given type @@ -67,10 +69,19 @@ private:   void AddPath(const std::vector<std::string>& path, index_t current_index);   bool FillTree(); + // top-down algorithm + std::unordered_map<std::string, size_t> m_min; // cache + size_t minimumSymbolsNeeded(std::string symbol); + size_t minimumSymbolsNeeded(std::vector<std::string> symbol_list); + bool match(std::string symbol, size_t begin, size_t end); + bool match(std::vector<std::string> symbol_list, size_t begin, size_t end); +  public: - Compiler(BNF& bnf, const std::string& Top); - std::pair<index_t, std::vector<TreeNode>> compile(std::vector<Token> Tokens); + Compiler(BNF& bnf, const std::string& top); + std::pair<index_t, std::vector<TreeNode>> compile(std::vector<Token> p_tokens);   void DumpTree(); + + friend class ::GrammerTest;  };  bool ChildIdIsToken(int32_t child_id); diff --git a/test-cpp.cpp b/test-cpp.cpp index 2a67b38..6727966 100644 --- a/test-cpp.cpp +++ b/test-cpp.cpp @@ -30,7 +30,7 @@ protected:   }  }; -#if 1 +#if 0  TEST_F(CppTest, preprocessing_tokenize) {   CPP cpp;   auto pp_tokens = cpp.preprocessing_tokenize("int main() { return 1; }"); @@ -40,7 +40,7 @@ TEST_F(CppTest, preprocessing_tokenize) {   auto tokens = cpp.tokens_from_pptokens(pp_tokens);   ASSERT_EQ(tokens.size(), 9); -#if 0 +#if 1   for (auto &i: tokens) {    std::cout << i.type << ": " << i.value << std::endl;   } diff --git a/test-grammer.cpp b/test-grammer.cpp new file mode 100644 index 0000000..1734da2 --- /dev/null +++ b/test-grammer.cpp @@ -0,0 +1,65 @@ +#include "bnf.h" +#include "cpp.h" +#include "cppbnf.h" +#include "lexer.h" +#include "grammer.h" +#include "minicc.h" +#include "debug.h" + +#include <boost/algorithm/string.hpp> + +#include "gmock/gmock.h" +#include "gtest/gtest.h" + +#include <algorithm> +#include <cctype> +#include <deque> +#include <map> +#include <memory> +#include <string> +#include <utility> +#include <vector> + +using namespace std::string_literals; + +class GrammerTest: public ::testing::Test +{ +protected: + GrammerTest() { +  //debug = true; + } + ~GrammerTest() { + } + + // Accessors for friend + size_t minimumSymbolsNeeded(Gram::Compiler& compiler, std::vector<std::string> list) { +  return compiler.minimumSymbolsNeeded(list); + } + size_t minimumSymbolsNeeded(Gram::Compiler& compiler, std::string s) { +  return compiler.minimumSymbolsNeeded(s); + } +}; + +TEST_F(GrammerTest, minimumSymbolsNeeded) { + auto bnf = SubBNF(CPPBNF::GetCppBNFGram(), "translation-unit"); + + Gram::Compiler compiler(bnf, "translation-unit"); + + EXPECT_EQ(minimumSymbolsNeeded(compiler, std::vector<std::string>{}), 0); + EXPECT_EQ(minimumSymbolsNeeded(compiler, "translation-unit"), 0); + EXPECT_EQ(minimumSymbolsNeeded(compiler, "logical-or-expression"), 1); + EXPECT_EQ(minimumSymbolsNeeded(compiler, "assignment-expression"), 1); + EXPECT_EQ(minimumSymbolsNeeded(compiler, "declaration"), 1); + EXPECT_EQ(minimumSymbolsNeeded(compiler, "block-declaration"), 3); + EXPECT_EQ(minimumSymbolsNeeded(compiler, "simple-declaration"), 2); + EXPECT_EQ(minimumSymbolsNeeded(compiler, "asm-declaration"), 5); + EXPECT_EQ(minimumSymbolsNeeded(compiler, "namespace-alias-definition"), 5); + EXPECT_EQ(minimumSymbolsNeeded(compiler, "using-declaration"), 4); + EXPECT_EQ(minimumSymbolsNeeded(compiler, "using-enum-declaration"), 4); + EXPECT_EQ(minimumSymbolsNeeded(compiler, "using-directive"), 4); + EXPECT_EQ(minimumSymbolsNeeded(compiler, "static_assert-declaration"), 5); + EXPECT_EQ(minimumSymbolsNeeded(compiler, "alias-declaration"), 7); + EXPECT_EQ(minimumSymbolsNeeded(compiler, "opaque-enum-declaration"), 3); + EXPECT_EQ(minimumSymbolsNeeded(compiler, "function-definition"), 4); +} + | 
