| | | |
|---|---|---|
| author | Roland Reichwein <mail@reichwein.it> | 2020-03-21 15:38:05 +0100 |
| committer | Roland Reichwein <mail@reichwein.it> | 2020-03-21 15:38:05 +0100 |
| commit | 3cc139bce0283018473d4906ee2ea5f40f771255 (patch) | |
| tree | 658956ab52b8892d419a2ba905f4c41f21f6d926 | |
| parent | 74350b52fee9f576a1cc71d99cfd4ebdf5a73e0f (diff) | |
Add lexer to CPP
| | | |
|---|---|---|
| -rw-r--r-- | cpp.cpp | 120 |
| -rw-r--r-- | cpp.h | 4 |
2 files changed, 39 insertions, 85 deletions
```diff
--- a/cpp.cpp
+++ b/cpp.cpp
@@ -2,9 +2,10 @@
 #include "bnf.h"
 #include "cppbnf.h"
+#include "debug.h"
+#include "lexer.h"
 #include "grammer.h"
 #include "minicc.h"
-#include "debug.h"
 
 #include <gtest/gtest.h>
 #include <gmock/gmock.h>
@@ -29,66 +30,14 @@ void CPP::backslash_escape()
  // TODO
 }
 
-namespace {
-
-std::vector<Token> sourceToCharTokens(const std::string& code)
-{
- std::vector<Token> result;
-
- Location location{1, 1};
-
- for (char c: code) {
-  if (c == '\n') {
-   location.column = 1;
-   location.line++;
-  } else if (std::isprint(c)) {
-   location.column++;
-  }
-
-  result.emplace_back(Token{std::string(1, c), std::string(1, c), location});
- }
- return result;
-}
-
-}
-
 // Phase 3: Parse preprocessing tokens
-std::pair<index_t, std::vector<TreeNode>> CPP::preprocessing_tokenize(const std::string& s)
+std::vector<Token> CPP::preprocessing_tokenize(const std::string& s)
 {
- m_code = s;
-
- m_charTokens = sourceToCharTokens(s);
-
  auto bnf{SubBNF(GetCppBNFLex(), "preprocessing-token")};
+
+ Lex::Lexer lexer(bnf, "preprocessing-token");
 
- // add to bnf to match whole file
- bnf["file"] = {
-  {"preprocessing-token-list"},
-  {"whitespace-list", "preprocessing-token-list"}
- };
- bnf["preprocessing-token-list"] = {
-  {"preprocessing-token-padded"},
-  {"preprocessing-token-list", "preprocessing-token-padded"}
- };
- bnf["preprocessing-token-padded"] = {
-  {"preprocessing-token"},
-  {"preprocessing-token", "whitespace-list"}
- };
- bnf["whitespace-list"] = {
-  {"whitespace-char"},
-  {"whitespace-list", "whitespace-char" }
- };
- bnf["whitespace-char"] = {
-  {" "}, {"\t"}, {"\n"}, {"\r"}
- };
- Gram::Compiler compiler(bnf, "file");
- debug = true;
- std::pair<index_t, std::vector<TreeNode>> Tree = compiler.compile(m_charTokens);
-
- debug = true;
- compiler.DumpTree();
-
- return Tree;
+ return lexer.Lex(s);
 }
 
 // Phase 4: Preprocessing
@@ -137,7 +86,7 @@ std::string CPP::valueOfNode(index_t node_index, const std::vector<TreeNode>& Tr
 };
 
 namespace {
- std::unordered_set<std::string> pp_types{
+ std::unordered_set<std::string> pp_types {
   "identifier",
   "pp-number",
   "character-literal",
@@ -146,10 +95,16 @@ namespace {
   "user-defined-string-literal",
   "preprocessing-op-or-punc"
  };
+
+ std::unordered_set<std::string> keywords {
+  "alignas",
+  "alignof",
+  // ... Keywords table, p.15
+ };
 }
 
 // Phase 7.a: Create tokens from preprocessing tokens
-std::vector<Token> CPP::tokens_from_pptokens(std::pair<index_t, std::vector<TreeNode>> Tree)
+std::vector<Token> CPP::tokens_from_pptokens(std::vector<Token> pp_tokens)
 {
 
  std::vector<Token> result;
@@ -161,28 +116,23 @@ std::vector<Token> CPP::tokens_from_pptokens(std::pair<index_t, std::vector<Tree
  // "user-defined-string-literal" -> "literal" + value
  // "preprocessing-op-or-punc" -> value+value (operator,punctuator)
 
- // TODO: traverse Tree, creating Token list
- std::vector<index_t> todo(1, index_t(Tree.first));
-
- while (!todo.empty()) {
-  index_t current_index = todo.back();
-  todo.pop_back();
-
-  TreeNode &node{Tree.second[current_index]};
-
-  // visit node
-  if (pp_types.find(node.type) != pp_types.end()) { // TODO
-   std::cout << node.type << ": " << valueOfNode(current_index, Tree.second) << std::endl;
-  } else { // only traverse further if not handled
-
-   // iterate backwards in childs, to get depth-first search in tree, from the beginning
-   std::for_each(node.child_ids.rbegin(), node.child_ids.rend(), [&](int32_t child){
-    if (!ChildIdIsToken(child))
-     todo.push_back(child);
-   });
-  }
+ for (auto& token: pp_tokens) {
+  if (pp_types.find(token.type) != pp_types.end()) {
+   if (token.type == "identifier") {
+#if 0
+    if (keywords.find(token.value) != keywords.end())
+     result.emplace_back("keyword", token.value);
+    else
+#endif
+    result.emplace_back(Token{"identifier"s, token.value});
+   }
+   else if (token.type == "preprocessing-op-or-punc")
+    result.emplace_back(Token{token.value, token.value});
+   else
+    result.emplace_back(Token{"literal", token.value});
+  } else
+   throw std::runtime_error("Unhandled preprocessing token: "s + token.value + " ("s + token.type + ")"s);
  }
-
  return result;
 }
 
@@ -238,12 +188,16 @@ protected:
  }
 };
 
-#if 0
+#if 1
 TEST_F(CppTest, preprocessing_tokenize) {
  CPP cpp;
- auto ppTree = cpp.preprocessing_tokenize("int main() { return 1; }");
+ auto pp_tokens = cpp.preprocessing_tokenize("int main() { return 1; }");
 
- cpp.tokens_from_pptokens(ppTree);
+ ASSERT_EQ(pp_tokens.size(), 9);
+
+ auto tokens = cpp.tokens_from_pptokens(pp_tokens);
+
+ ASSERT_EQ(tokens.size(), 9);
 }
 #endif
 
--- a/cpp.h
+++ b/cpp.h
@@ -17,11 +17,11 @@ std::string valueOfNode(index_t node_index, const std::vector<Gram::TreeNode>& T
 // phases of translation, according to standard
 void source_charset_map(); // phase 1
 void backslash_escape(); // phase 2
-std::pair<index_t, std::vector<Gram::TreeNode>> preprocessing_tokenize(const std::string& s); // phase 3
+std::vector<Token> preprocessing_tokenize(const std::string& s); // phase 3
 void preprocess(); // phase 4
 void execution_charset_map(); // phase 5
 void concatenate_strings(); // phase 6
-std::vector<Token> tokens_from_pptokens(std::pair<index_t, std::vector<Gram::TreeNode>> Tree); // phase 7.a
+std::vector<Token> tokens_from_pptokens(std::vector<Token> pp_tokens); // phase 7.a
 std::pair<index_t, std::vector<Gram::TreeNode>> analysis(std::vector<Token>); // phase 7.b
 void translate(); // phase 7.c
 void instantiate(); // phase 8
```
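With this change, phase 3 no longer parses the whole file into a tree over per-character tokens; it returns a flat `std::vector<Token>` that feeds straight into phase 7.a, as the re-enabled test shows. A minimal usage sketch of the new pipeline follows; the `main` wrapper and the printing loop are illustrative only, and `Token`'s `type`/`value` members are assumed from the code in the diff:

```cpp
#include "cpp.h"

#include <iostream>
#include <vector>

int main()
{
 CPP cpp;

 // Phase 3: lex raw source into preprocessing tokens.
 // For "int main() { return 1; }" the test expects 9 tokens:
 // int, main, (, ), {, return, 1, ;, }
 std::vector<Token> pp_tokens = cpp.preprocessing_tokenize("int main() { return 1; }");

 // Phase 7.a: turn preprocessing tokens into tokens. Identifiers keep
 // type "identifier", operators/punctuators use their own spelling as
 // type, and everything else becomes a "literal".
 std::vector<Token> tokens = cpp.tokens_from_pptokens(pp_tokens);

 for (const auto& token: tokens)
  std::cout << token.type << ": " << token.value << '\n';
}
```

Note that the keyword lookup in `tokens_from_pptokens` is still compiled out with `#if 0`, so `int` and `return` come through as plain identifiers until the keywords table from the standard is filled in.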
