Grammar applied to lex (WIP)

author: Roland Reichwein <mail@reichwein.it> 2020-02-20 23:26:57 +0100
committer: Roland Reichwein <mail@reichwein.it> 2020-02-20 23:26:57 +0100
commit: 10c2b7f9b6676dafd62d0eeda507b5ee5c6db216 (patch)
tree: 705885f41d9224e8678578e99db9f80af8136e94
parent: ba8520d3435c75c2568c05f1333966a4c1a4d69b (diff)
5 files changed, 148 insertions, 30 deletions
diff --git a/cpp.cpp b/cpp.cpp
index 8526cff..f02b047 100644
--- a/cpp.cpp
+++ b/cpp.cpp
@@ -9,6 +9,14 @@
 #include <gtest/gtest.h>
 #include <gmock/gmock.h>
 
+#include <unordered_set>
+
+using namespace Gram;
+
+CPP::CPP(){}
+
+CPP::~CPP(){}
+
 // Phase 1: Map physical character set to basic source character set
 void CPP::source_charset_map()
 {
@@ -45,9 +53,11 @@ std::vector<Token> sourceToCharTokens(const std::string& code)
 }
 
 // Phase 3: Parse preprocessing tokens
-void CPP::preprocessing_tokenize(const std::string& s)
+std::pair<index_t, std::vector<TreeNode>> CPP::preprocessing_tokenize(const std::string& s)
 {
- auto charTokens {sourceToCharTokens(s)};
+ m_code = s;
+
+ m_charTokens = sourceToCharTokens(s);
 
  auto bnf{SubBNF(GetCppBNFLex(), "preprocessing-token")};
 
@@ -72,8 +82,12 @@ void CPP::preprocessing_tokenize(const std::string& s)
   {" "}, {"\t"}, {"\n"}, {"\r"}
  };
  Gram::Compiler compiler(bnf, "file");
+ std::pair<index_t, std::vector<TreeNode>> Tree = compiler.compile(m_charTokens);
+
  debug = true;
- auto Tree = compiler.compile(charTokens);
+ compiler.DumpTree();
+
+ return Tree;
 }
 
 // Phase 4: Preprocessing
@@ -94,12 +108,84 @@ void CPP::concatenate_strings()
  // TODO
 }
 
-// Phase 7: Create tokens from preprocessing tokens
-void CPP::tokens_from_pptokens()
+std::string CPP::valueOfNode(index_t node_index, const std::vector<TreeNode>& Tree)
 {
- // TODO
+ std::string result;
+
+ std::vector<int32_t> todo(1, int32_t(node_index));
+
+ while (!todo.empty()) {
+  int32_t current_index = todo.back();
+  todo.pop_back();
+
+  // visit node if token
+  if (ChildIdIsToken(current_index)) {
+   result += m_code[TokenIdFromChildId(current_index)];
+  } else {
+
+   const TreeNode &node{Tree[current_index]};
+
+   // push children in reverse order so the stack pops them first-to-last (depth-first, left to right)
+   std::for_each(node.child_ids.rbegin(), node.child_ids.rend(), [&](int32_t child){
+    todo.push_back(child);
+   });
+  }
+ }
+
+ return result;
+};
+
+namespace {
+ std::unordered_set<std::string> pp_types{
+  "identifier",
+  "pp-number",
+  "character-literal",
+  "user-defined-character-literal",
+  "string-literal",
+  "user-defined-string-literal",
+  "preprocessing-op-or-punc"
+ };
 }
 
+// Phase 7.a: Create tokens from preprocessing tokens
+std::vector<Token> CPP::tokens_from_pptokens(std::pair<index_t, std::vector<TreeNode>> Tree)
+{
+ std::vector<Token> result;
+
+ // "identifier" + value -> "identifier" + value, except identifiers from table 5.11, p.14 -> keyword as value, value
+ // "pp-number" + value -> "literal" + value
+ // "character-literal" -> "literal" + value
+ // "user-defined-character-literal" -> "literal" + value
+ // "string-literal" -> "literal" + value
+ // "user-defined-string-literal" -> "literal" + value
+ // "preprocessing-op-or-punc" -> value+value (operator,punctuator)
+ 
+ // TODO: create the Token list while traversing Tree (matched nodes are currently only printed)
+ std::vector<index_t> todo(1, index_t(Tree.first));
+
+ while (!todo.empty()) {
+  index_t current_index = todo.back();
+  todo.pop_back();
+
+  TreeNode &node{Tree.second[current_index]};
+
+  // visit node
+  if (pp_types.find(node.type) != pp_types.end()) { // TODO
+   std::cout << node.type << ": " << valueOfNode(current_index, Tree.second) << std::endl;
+  } else { // only traverse further if not handled
+
+   // push non-token children in reverse order so the stack pops them first-to-last (depth-first, left to right)
+   std::for_each(node.child_ids.rbegin(), node.child_ids.rend(), [&](int32_t child){
+    if (!ChildIdIsToken(child))
+     todo.push_back(child);
+   });
+  }
+ }
+ 
+ return result;
+}
+
+// TODO: remove in favor of tokens_from_pptokens()
 void CPP::PreprocessorTokensToTokens(std::vector<Token>& tokens)
 {
  for (auto& i : tokens) {
@@ -108,6 +194,18 @@ void CPP::PreprocessorTokensToTokens(std::vector<Token>& tokens)
  }
 }
 
+// Phase 7.b: Grammar Analysis
+std::pair<index_t, std::vector<Gram::TreeNode>> analysis(std::vector<Token>)
+{
+ return {0 , {}};
+}
+
+// Phase 7.c: Translate
+void CPP::translate()
+{
+ // TODO
+}
+
 // Phase 8: Instantiate objects
 void CPP::instantiate()
 {
@@ -123,6 +221,7 @@ void CPP::link()
 // phases of translation, according to standard
 void CPP::translate(const std::string& code)
 {
+#if 0 // fix signatures!
  source_charset_map();
  backslash_escape();
  preprocessing_tokenize(code);
@@ -130,22 +229,28 @@ void CPP::translate(const std::string& code)
  execution_charset_map();
  concatenate_strings();
  tokens_from_pptokens();
+  analysis();
+  translate();
  instantiate();
  link();
+#endif
 }
 
 class CppTest: public ::testing::Test
 {
 protected:
  CppTest() {
-  debug = false;
+  //debug = true;
  }
  ~CppTest() {
  }
 };
 
 TEST_F(CppTest, preprocessing_tokenize) {
- CPP::preprocessing_tokenize("int main() { return 1; }");
+ CPP cpp;
+ auto ppTree = cpp.preprocessing_tokenize("int main() { return 1; }");
+
+ cpp.tokens_from_pptokens(ppTree);
 }
 
 #if 0
diff --git a/cpp.h b/cpp.h
index 7388e94..5ec43fb 100644
--- a/cpp.h
+++ b/cpp.h
@@ -1,25 +1,37 @@
 #pragma once
 
+#include "grammer.h"
 #include "minicc.h"
 
 #include <vector>
 
-namespace CPP {
+class CPP {
 
-void PreprocessorTokensToTokens(std::vector<Token>& tokens);
+public:
+
+CPP();
+~CPP();
+
+std::string valueOfNode(index_t node_index, const std::vector<Gram::TreeNode>& Tree);
+static void PreprocessorTokensToTokens(std::vector<Token>& tokens); // obsolete
 
 // phases of translation, according to standard
 void source_charset_map(); // phase 1
 void backslash_escape(); // phase 2
-void preprocessing_tokenize(const std::string& s); // phase 3
+std::pair<index_t, std::vector<Gram::TreeNode>> preprocessing_tokenize(const std::string& s); // phase 3
 void preprocess(); // phase 4
 void execution_charset_map(); // phase 5
 void concatenate_strings(); // phase 6
-void tokens_from_pptokens(); // phase 7
+std::vector<Token> tokens_from_pptokens(std::pair<index_t, std::vector<Gram::TreeNode>> Tree); // phase 7.a
+std::pair<index_t, std::vector<Gram::TreeNode>> analysis(std::vector<Token>); // phase 7.b
+void translate(); // phase 7.c
 void instantiate(); // phase 8
 void link(); // phase 9
 
 // all phases of translation
 void translate(const std::string& code);
 
-}
+private:
+ std::string m_code;
+ std::vector<Token> m_charTokens;
+};
diff --git a/grammer.cpp b/grammer.cpp
index a54b65d..8243fa8 100644
--- a/grammer.cpp
+++ b/grammer.cpp
@@ -59,24 +59,20 @@ bool Compiler::rootIsStartSymbol() const
  return GetTypeOfNode(root_node_id) == Top;
 }
 
-namespace {
-
- bool ChildIdIsToken(int32_t child_id)
- {
-  return child_id < 0;
- }
-
- index_t TokenIdFromChildId(int32_t child_id)
- {
-  return index_t(-child_id) - 1;
- }
+bool Gram::ChildIdIsToken(int32_t child_id)
+{
+ return child_id < 0;
+}
 
- int32_t ChildIdFromTokenId(index_t token_id)
- {
-  return -1 - int32_t(token_id);
- }
+index_t Gram::TokenIdFromChildId(int32_t child_id)
+{
+ return index_t(-child_id) - 1;
+}
 
-} // namespace
+int32_t Gram::ChildIdFromTokenId(index_t token_id)
+{
+ return -1 - int32_t(token_id);
+}
 
 void Compiler::DumpTree()
 {
diff --git a/grammer.h b/grammer.h
index f6f9d95..bdb2718 100644
--- a/grammer.h
+++ b/grammer.h
@@ -73,4 +73,9 @@ public:
  void DumpTree();
 };
 
+bool ChildIdIsToken(int32_t child_id);
+index_t TokenIdFromChildId(int32_t child_id);
+int32_t ChildIdFromTokenId(index_t token_id);
+
 } // namespace Gram
+
diff --git a/test-lexer.cpp b/test-lexer.cpp
index b0706df..79b9930 100644
--- a/test-lexer.cpp
+++ b/test-lexer.cpp
@@ -88,7 +88,7 @@ TEST_F(Test, BNF) {
  auto tokens = lexer.Lex(Code);
 
  ASSERT_EQ(tokens, tokens_reference);
-#if 1
+#if 0 // TODO: use Debug() interface
  std::cout << "=== Tokens =================================" << std::endl;
  for (const auto& i: tokens) {
   std::cout << i.type << ": " << i.value << std::endl;
author	Roland Reichwein <mail@reichwein.it>	2020-02-20 23:26:57 +0100
committer	Roland Reichwein <mail@reichwein.it>	2020-02-20 23:26:57 +0100
commit	10c2b7f9b6676dafd62d0eeda507b5ee5c6db216 (patch)
tree	705885f41d9224e8678578e99db9f80af8136e94
parent	ba8520d3435c75c2568c05f1333966a4c1a4d69b (diff)