diff options
| author | Roland Reichwein <mail@reichwein.it> | 2020-02-20 23:26:57 +0100 | 
|---|---|---|
| committer | Roland Reichwein <mail@reichwein.it> | 2020-02-20 23:26:57 +0100 | 
| commit | 10c2b7f9b6676dafd62d0eeda507b5ee5c6db216 (patch) | |
| tree | 705885f41d9224e8678578e99db9f80af8136e94 | |
| parent | ba8520d3435c75c2568c05f1333966a4c1a4d69b (diff) | |
Grammar applied to lex (WIP)
| -rw-r--r-- | cpp.cpp | 121 | ||||
| -rw-r--r-- | cpp.h | 22 | ||||
| -rw-r--r-- | grammer.cpp | 28 | ||||
| -rw-r--r-- | grammer.h | 5 | ||||
| -rw-r--r-- | test-lexer.cpp | 2 | 
5 files changed, 148 insertions, 30 deletions
| @@ -9,6 +9,14 @@  #include <gtest/gtest.h>  #include <gmock/gmock.h> +#include <unordered_set> + +using namespace Gram; + +CPP::CPP(){} + +CPP::~CPP(){} +  // Phase 1: Map physical character set to basic source character set  void CPP::source_charset_map()  { @@ -45,9 +53,11 @@ std::vector<Token> sourceToCharTokens(const std::string& code)  }  // Phase 3: Parse preprocessing tokens -void CPP::preprocessing_tokenize(const std::string& s) +std::pair<index_t, std::vector<TreeNode>> CPP::preprocessing_tokenize(const std::string& s)  { - auto charTokens {sourceToCharTokens(s)}; + m_code = s; + + m_charTokens = sourceToCharTokens(s);   auto bnf{SubBNF(GetCppBNFLex(), "preprocessing-token")}; @@ -72,8 +82,12 @@ void CPP::preprocessing_tokenize(const std::string& s)    {" "}, {"\t"}, {"\n"}, {"\r"}   };   Gram::Compiler compiler(bnf, "file"); + std::pair<index_t, std::vector<TreeNode>> Tree = compiler.compile(m_charTokens); +   debug = true; - auto Tree = compiler.compile(charTokens); + compiler.DumpTree(); + + return Tree;  }  // Phase 4: Preprocessing @@ -94,12 +108,84 @@ void CPP::concatenate_strings()   // TODO  } -// Phase 7: Create tokens from preprocessing tokens -void CPP::tokens_from_pptokens() +std::string CPP::valueOfNode(index_t node_index, const std::vector<TreeNode>& Tree)  { - // TODO + std::string result; + + std::vector<int32_t> todo(1, int32_t(node_index)); + + while (!todo.empty()) { +  int32_t current_index = todo.back(); +  todo.pop_back(); + +  // visit node if token +  if (ChildIdIsToken(current_index)) { +   result += m_code[TokenIdFromChildId(current_index)]; +  } else { + +   const TreeNode &node{Tree[current_index]}; + +   // iterate backwards in childs, to get depth-first search in tree, from the beginning +   std::for_each(node.child_ids.rbegin(), node.child_ids.rend(), [&](int32_t child){ +    todo.push_back(child); +   }); +  } + } + + return result; +}; + +namespace { + std::unordered_set<std::string> pp_types{ +  "identifier", +  "pp-number", +  "character-literal", +  "user-defined-character-literal", +  "string-literal", +  "user-defined-string-literal", +  "preprocessing-op-or-punc" + };  } +// Phase 7.a: Create tokens from preprocessing tokens +std::vector<Token> CPP::tokens_from_pptokens(std::pair<index_t, std::vector<TreeNode>> Tree) +{ + std::vector<Token> result; + + // "identifier" + value -> "identifier" + value, except identifiers from table 5.11, p.14 -> keyword as value, value + // "pp-number" + value -> "literal" + value + // "character-literal" -> "literal" + value + // "user-defined-character-literal" -> "literal" + value + // "string-literal" -> "literal" + value + // "user-defined-string-literal" -> "literal" + value + // "preprocessing-op-or-punc" -> value+value (operator,punctuator) +  + // TODO: traverse Tree, creating Token list + std::vector<index_t> todo(1, index_t(Tree.first)); + + while (!todo.empty()) { +  index_t current_index = todo.back(); +  todo.pop_back(); + +  TreeNode &node{Tree.second[current_index]}; + +  // visit node +  if (pp_types.find(node.type) != pp_types.end()) { // TODO +   std::cout << node.type << ": " << valueOfNode(current_index, Tree.second) << std::endl; +  } else { // only traverse further if not handled + +   // iterate backwards in childs, to get depth-first search in tree, from the beginning +   std::for_each(node.child_ids.rbegin(), node.child_ids.rend(), [&](int32_t child){ +    if (!ChildIdIsToken(child)) +     todo.push_back(child); +   }); +  } + } +  + return result; +} + +// TODO: remove in favor of tokens_from_pptokens()  void CPP::PreprocessorTokensToTokens(std::vector<Token>& tokens)  {   for (auto& i : tokens) { @@ -108,6 +194,18 @@ void CPP::PreprocessorTokensToTokens(std::vector<Token>& tokens)   }  } +// Phase 7.b: Grammar Analysis +std::pair<index_t, std::vector<Gram::TreeNode>> analysis(std::vector<Token>) +{ + return {0 , {}}; +} + +// Phase 7.c: Translate +void CPP::translate() +{ + // TODO +} +  // Phase 8: Instantiate objects  void CPP::instantiate()  { @@ -123,6 +221,7 @@ void CPP::link()  // phases of translation, according to standard  void CPP::translate(const std::string& code)  { +#if 0 // fix signatures!   source_charset_map();   backslash_escape();   preprocessing_tokenize(code); @@ -130,22 +229,28 @@ void CPP::translate(const std::string& code)   execution_charset_map();   concatenate_strings();   tokens_from_pptokens(); +  analysis(); +  translate();   instantiate();   link(); +#endif  }  class CppTest: public ::testing::Test  {  protected:   CppTest() { -  debug = false; +  //debug = true;   }   ~CppTest() {   }  };  TEST_F(CppTest, preprocessing_tokenize) { - CPP::preprocessing_tokenize("int main() { return 1; }"); + CPP cpp; + auto ppTree = cpp.preprocessing_tokenize("int main() { return 1; }"); + + cpp.tokens_from_pptokens(ppTree);  }  #if 0 @@ -1,25 +1,37 @@  #pragma once +#include "grammer.h"  #include "minicc.h"  #include <vector> -namespace CPP { +class CPP { -void PreprocessorTokensToTokens(std::vector<Token>& tokens); +public: + +CPP(); +~CPP(); + +std::string valueOfNode(index_t node_index, const std::vector<Gram::TreeNode>& Tree); +static void PreprocessorTokensToTokens(std::vector<Token>& tokens); // obsolete  // phases of translation, according to standard  void source_charset_map(); // phase 1  void backslash_escape(); // phase 2 -void preprocessing_tokenize(const std::string& s); // phase 3 +std::pair<index_t, std::vector<Gram::TreeNode>> preprocessing_tokenize(const std::string& s); // phase 3  void preprocess(); // phase 4  void execution_charset_map(); // phase 5  void concatenate_strings(); // phase 6 -void tokens_from_pptokens(); // phase 7 +std::vector<Token> tokens_from_pptokens(std::pair<index_t, std::vector<Gram::TreeNode>> Tree); // phase 7.a +std::pair<index_t, std::vector<Gram::TreeNode>> analysis(std::vector<Token>); // phase 7.b +void translate(); // phase 7.c  void instantiate(); // phase 8  void link(); // phase 9  // all phases of translation  void translate(const std::string& code); -} +private: + std::string m_code; + std::vector<Token> m_charTokens; +}; diff --git a/grammer.cpp b/grammer.cpp index a54b65d..8243fa8 100644 --- a/grammer.cpp +++ b/grammer.cpp @@ -59,24 +59,20 @@ bool Compiler::rootIsStartSymbol() const   return GetTypeOfNode(root_node_id) == Top;  } -namespace { - - bool ChildIdIsToken(int32_t child_id) - { -  return child_id < 0; - } - - index_t TokenIdFromChildId(int32_t child_id) - { -  return index_t(-child_id) - 1; - } +bool Gram::ChildIdIsToken(int32_t child_id) +{ + return child_id < 0; +} - int32_t ChildIdFromTokenId(index_t token_id) - { -  return -1 - int32_t(token_id); - } +index_t Gram::TokenIdFromChildId(int32_t child_id) +{ + return index_t(-child_id) - 1; +} -} // namespace +int32_t Gram::ChildIdFromTokenId(index_t token_id) +{ + return -1 - int32_t(token_id); +}  void Compiler::DumpTree()  { @@ -73,4 +73,9 @@ public:   void DumpTree();  }; +bool ChildIdIsToken(int32_t child_id); +index_t TokenIdFromChildId(int32_t child_id); +int32_t ChildIdFromTokenId(index_t token_id); +  } // namespace Gram + diff --git a/test-lexer.cpp b/test-lexer.cpp index b0706df..79b9930 100644 --- a/test-lexer.cpp +++ b/test-lexer.cpp @@ -88,7 +88,7 @@ TEST_F(Test, BNF) {   auto tokens = lexer.Lex(Code);   ASSERT_EQ(tokens, tokens_reference); -#if 1 +#if 0 // TODO: use Debug() interface   std::cout << "=== Tokens =================================" << std::endl;   for (const auto& i: tokens) {    std::cout << i.type << ": " << i.value << std::endl; | 
