From 2fa4c782b37855f2e26c2f425dd149d2922adf5e Mon Sep 17 00:00:00 2001 From: Kubat <mael.martin31@gmail.com> Date: Thu, 10 Feb 2022 21:31:56 +0100 Subject: [PATCH] FIX: Fix lexer => correct simple and qname lexing Some qnames were cut and parsed as simple tokens because of the chop thing; now we parse every simple token as a qname, then we try to promote it to a simple token if it entirely matches the said simple token. --- src/Lib/Script/FrontEnd/Lexer.cc | 24 +++++++++++++++++++++--- src/Lib/Script/FrontEnd/Tokens.hh | 30 ++++++++++++++++++++++++++++-- 2 files changed, 49 insertions(+), 5 deletions(-) diff --git a/src/Lib/Script/FrontEnd/Lexer.cc b/src/Lib/Script/FrontEnd/Lexer.cc index 3ee1cfd1..e3b5f2b0 100644 --- a/src/Lib/Script/FrontEnd/Lexer.cc +++ b/src/Lib/Script/FrontEnd/Lexer.cc @@ -43,6 +43,22 @@ getRideOfComments(std::string *retString) } while (begin != std::string::npos); } +static void +promoteQNameToSimpleTokens(TokenList *tokens) noexcept +{ + for (size_t i = 0; i < tokens->size(); i += 1) { + Token tok = tokens->at(i); + if (!tok.isQName()) + continue; + StrV qnameToPromote = tok.asQName(); + + for (const auto &simpleToken : Vivy::Script::SIMPLE_TOKENS) { + if (StrV::equal(qnameToPromote, simpleToken)) + tokens->at(i) = Token::fromSimple(tok.location(), qnameToPromote); + } + } +} + void tokenizeFile(const char *file, TokenList *tokens, std::string *storage) { @@ -69,10 +85,10 @@ tokenizeFile(const char *file, TokenList *tokens, std::string *storage) for (;;) { global_continue: - /* First simple tokens */ + /* First simple tokens that are not alpha-numeric */ fileContent = StrV::trimL(fileContent, &trimmedAmount); loc = Location::shift(loc, trimmedAmount); - for (const auto &simpleToken : Vivy::Script::SIMPLE_TOKENS) { + for (const auto &simpleToken : Vivy::Script::SIMPLE_TOKENS_NON_ALPHANUM) { if (StrV::startsWith(fileContent, simpleToken)) { /* If the charatecr is an UTF8 character, the Location can skip * multiple chars because the glyph is 
multiple characters long! */ @@ -132,10 +148,12 @@ tokenizeFile(const char *file, TokenList *tokens, std::string *storage) continue; } - /* EOF? */ + /* EOF? => May promote QNAME to SIMPLE */ fileContent = StrV::trimL(fileContent, &trimmedAmount); loc = Location::shift(loc, trimmedAmount); if (fileContent.len() == 0) { + /* Parse simple tokens that are alpha numeric */ + promoteQNameToSimpleTokens(tokens); return; } diff --git a/src/Lib/Script/FrontEnd/Tokens.hh b/src/Lib/Script/FrontEnd/Tokens.hh index c6df5e0c..5d532142 100644 --- a/src/Lib/Script/FrontEnd/Tokens.hh +++ b/src/Lib/Script/FrontEnd/Tokens.hh @@ -64,12 +64,38 @@ namespace Vivy::Script } #define TOKEN_RULE_INT "(+|-)?[0-9]+" -#define TOKEN_RULE_ID "[a-zA-Z\u0391-\u03C9\u220F_\u221A_][a-zA-Z\u0391-\u03C9_0-9]*" +#define TOKEN_RULE_ID "[a-zA-Z\u0391-\u03C9\u220F_\u221A_][a-zA-Z\u0391-\u03C9_0-9\\-\\+]*" #define TOKEN_RULE_REAL TOKEN_RULE_INT ".(" TOKEN_RULE_INT "((e|E)(+|-)" TOKEN_RULE_INT ")?)?" #define TOKEN_RULE_QNAME TOKEN_RULE_ID "(." TOKEN_RULE_ID ")*" #define TOKEN_RULE_UPPER_QNAME "[A-Z\u0391-\u03C9\u220F_\u221A_][A-Z\u0391-\u03C9_0-9]*" -/* The list of all possible simple tokens */ +[[maybe_unused]] static constexpr auto SIMPLE_TOKENS_NON_ALPHANUM = { TOKEN_PARENT_LEFT, + TOKEN_PARENT_RIGHT, + TOKEN_BRACKET_LEFT, + TOKEN_BRACKET_RIGHT, + TOKEN_HAT, + TOKEN_HASHTAG, + TOKEN_CURLY_BRACKET_LEFT, + TOKEN_CURLY_BRACKET_RIGHT, + TOKEN_SEMICOL, + TOKEN_COL, + TOKEN_PIPE, + TOKEN_INTEROGATION, + TOKEN_COMMA, + TOKEN_DOT, + TOKEN_ARROW, + TOKEN_PLUS, + TOKEN_MINUS, + TOKEN_DIV, + TOKEN_TIMES, + TOKEN_EQ, + TOKEN_NEQ, + TOKEN_LT, + TOKEN_LE, + TOKEN_GT, + TOKEN_GE, + TOKEN_ASSIGN }; + [[maybe_unused]] static constexpr auto SIMPLE_TOKENS = { TOKEN_PARENT_LEFT, TOKEN_PARENT_RIGHT, TOKEN_BRACKET_LEFT, -- GitLab