| //===- Parser.cpp - Matcher expression parser -----------------------------===// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| // |
| // Recursive parser implementation for the matcher expression grammar. |
| // |
| //===----------------------------------------------------------------------===// |
| |
| #include "Parser.h" |
| |
| #include <vector> |
| |
| namespace mlir::query::matcher::internal { |
| |
| // Simple structure to hold information for one token from the parser. |
| struct Parser::TokenInfo { |
| TokenInfo() = default; |
| |
| // Method to set the kind and text of the token |
| void set(TokenKind newKind, llvm::StringRef newText) { |
| kind = newKind; |
| text = newText; |
| } |
| |
| // Known identifiers. |
| static const char *const ID_Extract; |
| |
| llvm::StringRef text; |
| TokenKind kind = TokenKind::Eof; |
| SourceRange range; |
| VariantValue value; |
| }; |
| |
| const char *const Parser::TokenInfo::ID_Extract = "extract"; |
| |
| class Parser::CodeTokenizer { |
| public: |
| // Constructor with matcherCode and error |
| explicit CodeTokenizer(llvm::StringRef matcherCode, Diagnostics *error) |
| : code(matcherCode), startOfLine(matcherCode), error(error) { |
| nextToken = getNextToken(); |
| } |
| |
| // Constructor with matcherCode, error, and codeCompletionOffset |
| CodeTokenizer(llvm::StringRef matcherCode, Diagnostics *error, |
| unsigned codeCompletionOffset) |
| : code(matcherCode), startOfLine(matcherCode), error(error), |
| codeCompletionLocation(matcherCode.data() + codeCompletionOffset) { |
| nextToken = getNextToken(); |
| } |
| |
| // Peek at next token without consuming it |
| const TokenInfo &peekNextToken() const { return nextToken; } |
| |
| // Consume and return the next token |
| TokenInfo consumeNextToken() { |
| TokenInfo thisToken = nextToken; |
| nextToken = getNextToken(); |
| return thisToken; |
| } |
| |
| // Skip any newline tokens |
| TokenInfo skipNewlines() { |
| while (nextToken.kind == TokenKind::NewLine) |
| nextToken = getNextToken(); |
| return nextToken; |
| } |
| |
| // Consume and return next token, ignoring newlines |
| TokenInfo consumeNextTokenIgnoreNewlines() { |
| skipNewlines(); |
| return nextToken.kind == TokenKind::Eof ? nextToken : consumeNextToken(); |
| } |
| |
| // Return kind of next token |
| TokenKind nextTokenKind() const { return nextToken.kind; } |
| |
| private: |
| // Helper function to get the first character as a new StringRef and drop it |
| // from the original string |
| llvm::StringRef firstCharacterAndDrop(llvm::StringRef &str) { |
| assert(!str.empty()); |
| llvm::StringRef firstChar = str.substr(0, 1); |
| str = str.drop_front(); |
| return firstChar; |
| } |
| |
| // Get next token, consuming whitespaces and handling different token types |
| TokenInfo getNextToken() { |
| consumeWhitespace(); |
| TokenInfo result; |
| result.range.start = currentLocation(); |
| |
| // Code completion case |
| if (codeCompletionLocation && codeCompletionLocation <= code.data()) { |
| result.set(TokenKind::CodeCompletion, |
| llvm::StringRef(codeCompletionLocation, 0)); |
| codeCompletionLocation = nullptr; |
| return result; |
| } |
| |
| // End of file case |
| if (code.empty()) { |
| result.set(TokenKind::Eof, ""); |
| return result; |
| } |
| |
| // Switch to handle specific characters |
| switch (code[0]) { |
| case '#': |
| code = code.drop_until([](char c) { return c == '\n'; }); |
| return getNextToken(); |
| case ',': |
| result.set(TokenKind::Comma, firstCharacterAndDrop(code)); |
| break; |
| case '.': |
| result.set(TokenKind::Period, firstCharacterAndDrop(code)); |
| break; |
| case '\n': |
| ++line; |
| startOfLine = code.drop_front(); |
| result.set(TokenKind::NewLine, firstCharacterAndDrop(code)); |
| break; |
| case '(': |
| result.set(TokenKind::OpenParen, firstCharacterAndDrop(code)); |
| break; |
| case ')': |
| result.set(TokenKind::CloseParen, firstCharacterAndDrop(code)); |
| break; |
| case '"': |
| case '\'': |
| consumeStringLiteral(&result); |
| break; |
| case '0': |
| case '1': |
| case '2': |
| case '3': |
| case '4': |
| case '5': |
| case '6': |
| case '7': |
| case '8': |
| case '9': |
| consumeNumberLiteral(&result); |
| break; |
| default: |
| parseIdentifierOrInvalid(&result); |
| break; |
| } |
| |
| result.range.end = currentLocation(); |
| return result; |
| } |
| |
| void consumeNumberLiteral(TokenInfo *result) { |
| StringRef original = code; |
| unsigned value = 0; |
| if (!code.consumeInteger(0, value)) { |
| size_t numConsumed = original.size() - code.size(); |
| result->text = original.take_front(numConsumed); |
| result->kind = TokenKind::Literal; |
| result->value = static_cast<int64_t>(value); |
| return; |
| } |
| } |
| |
| // Consume a string literal, handle escape sequences and missing closing |
| // quote. |
| void consumeStringLiteral(TokenInfo *result) { |
| bool inEscape = false; |
| const char marker = code[0]; |
| for (size_t length = 1; length < code.size(); ++length) { |
| if (inEscape) { |
| inEscape = false; |
| continue; |
| } |
| if (code[length] == '\\') { |
| inEscape = true; |
| continue; |
| } |
| if (code[length] == marker) { |
| result->kind = TokenKind::Literal; |
| result->text = code.substr(0, length + 1); |
| result->value = code.substr(1, length - 1); |
| code = code.drop_front(length + 1); |
| return; |
| } |
| } |
| llvm::StringRef errorText = code; |
| code = code.drop_front(code.size()); |
| SourceRange range; |
| range.start = result->range.start; |
| range.end = currentLocation(); |
| error->addError(range, ErrorType::ParserStringError) << errorText; |
| result->kind = TokenKind::Error; |
| } |
| |
| void parseIdentifierOrInvalid(TokenInfo *result) { |
| if (isalnum(code[0])) { |
| // Parse an identifier |
| size_t tokenLength = 1; |
| |
| while (true) { |
| // A code completion location in/immediately after an identifier will |
| // cause the portion of the identifier before the code completion |
| // location to become a code completion token. |
| if (codeCompletionLocation == code.data() + tokenLength) { |
| codeCompletionLocation = nullptr; |
| result->kind = TokenKind::CodeCompletion; |
| result->text = code.substr(0, tokenLength); |
| code = code.drop_front(tokenLength); |
| return; |
| } |
| if (tokenLength == code.size() || !(isalnum(code[tokenLength]))) |
| break; |
| ++tokenLength; |
| } |
| llvm::StringRef token = code.substr(0, tokenLength); |
| code = code.drop_front(tokenLength); |
| // Check if the identifier is a boolean literal |
| if (token == "true") { |
| result->text = "false"; |
| result->kind = TokenKind::Literal; |
| result->value = true; |
| } else if (token == "false") { |
| result->text = "false"; |
| result->kind = TokenKind::Literal; |
| result->value = false; |
| } else { |
| // Otherwise it is treated as a normal identifier |
| result->kind = TokenKind::Ident; |
| result->text = token; |
| } |
| } else { |
| result->kind = TokenKind::InvalidChar; |
| result->text = code.substr(0, 1); |
| code = code.drop_front(1); |
| } |
| } |
| |
| // Consume all leading whitespace from code, except newlines |
| void consumeWhitespace() { code = code.ltrim(" \t\v\f\r"); } |
| |
| // Returns the current location in the source code |
| SourceLocation currentLocation() { |
| SourceLocation location; |
| location.line = line; |
| location.column = code.data() - startOfLine.data() + 1; |
| return location; |
| } |
| |
| llvm::StringRef code; |
| llvm::StringRef startOfLine; |
| unsigned line = 1; |
| Diagnostics *error; |
| TokenInfo nextToken; |
| const char *codeCompletionLocation = nullptr; |
| }; |
| |
| Parser::Sema::~Sema() = default; |
| |
| std::vector<ArgKind> Parser::Sema::getAcceptedCompletionTypes( |
| llvm::ArrayRef<std::pair<MatcherCtor, unsigned>> context) { |
| return {}; |
| } |
| |
| std::vector<MatcherCompletion> |
| Parser::Sema::getMatcherCompletions(llvm::ArrayRef<ArgKind> acceptedTypes) { |
| return {}; |
| } |
| |
| // Entry for the scope of a parser |
| struct Parser::ScopedContextEntry { |
| Parser *parser; |
| |
| ScopedContextEntry(Parser *parser, MatcherCtor c) : parser(parser) { |
| parser->contextStack.emplace_back(c, 0u); |
| } |
| |
| ~ScopedContextEntry() { parser->contextStack.pop_back(); } |
| |
| void nextArg() { ++parser->contextStack.back().second; } |
| }; |
| |
| // Parse and validate expressions starting with an identifier. |
| // This function can parse named values and matchers. In case of failure, it |
| // will try to determine the user's intent to give an appropriate error message. |
| bool Parser::parseIdentifierPrefixImpl(VariantValue *value) { |
| const TokenInfo nameToken = tokenizer->consumeNextToken(); |
| |
| if (tokenizer->nextTokenKind() != TokenKind::OpenParen) { |
| // Parse as a named value. |
| if (auto namedValue = namedValues ? namedValues->lookup(nameToken.text) |
| : VariantValue()) { |
| |
| if (tokenizer->nextTokenKind() != TokenKind::Period) { |
| *value = namedValue; |
| return true; |
| } |
| |
| if (!namedValue.isMatcher()) { |
| error->addError(tokenizer->peekNextToken().range, |
| ErrorType::ParserNotAMatcher); |
| return false; |
| } |
| } |
| |
| if (tokenizer->nextTokenKind() == TokenKind::NewLine) { |
| error->addError(tokenizer->peekNextToken().range, |
| ErrorType::ParserNoOpenParen) |
| << "NewLine"; |
| return false; |
| } |
| |
| // If the syntax is correct and the name is not a matcher either, report |
| // an unknown named value. |
| if ((tokenizer->nextTokenKind() == TokenKind::Comma || |
| tokenizer->nextTokenKind() == TokenKind::CloseParen || |
| tokenizer->nextTokenKind() == TokenKind::NewLine || |
| tokenizer->nextTokenKind() == TokenKind::Eof) && |
| !sema->lookupMatcherCtor(nameToken.text)) { |
| error->addError(nameToken.range, ErrorType::RegistryValueNotFound) |
| << nameToken.text; |
| return false; |
| } |
| // Otherwise, fallback to the matcher parser. |
| } |
| |
| tokenizer->skipNewlines(); |
| |
| assert(nameToken.kind == TokenKind::Ident); |
| TokenInfo openToken = tokenizer->consumeNextToken(); |
| if (openToken.kind != TokenKind::OpenParen) { |
| error->addError(openToken.range, ErrorType::ParserNoOpenParen) |
| << openToken.text; |
| return false; |
| } |
| |
| std::optional<MatcherCtor> ctor = sema->lookupMatcherCtor(nameToken.text); |
| |
| // Parse as a matcher expression. |
| return parseMatcherExpressionImpl(nameToken, openToken, ctor, value); |
| } |
| |
| bool Parser::parseChainedExpression(std::string &argument) { |
| // Parse the parenthesized argument to .extract("foo") |
| // Note: EOF is handled inside the consume functions and would fail below when |
| // checking token kind. |
| const TokenInfo openToken = tokenizer->consumeNextToken(); |
| const TokenInfo argumentToken = tokenizer->consumeNextTokenIgnoreNewlines(); |
| const TokenInfo closeToken = tokenizer->consumeNextTokenIgnoreNewlines(); |
| |
| if (openToken.kind != TokenKind::OpenParen) { |
| error->addError(openToken.range, ErrorType::ParserChainedExprNoOpenParen); |
| return false; |
| } |
| |
| if (argumentToken.kind != TokenKind::Literal || |
| !argumentToken.value.isString()) { |
| error->addError(argumentToken.range, |
| ErrorType::ParserChainedExprInvalidArg); |
| return false; |
| } |
| |
| if (closeToken.kind != TokenKind::CloseParen) { |
| error->addError(closeToken.range, ErrorType::ParserChainedExprNoCloseParen); |
| return false; |
| } |
| |
| // If all checks passed, extract the argument and return true. |
| argument = argumentToken.value.getString(); |
| return true; |
| } |
| |
| // Parse the arguments of a matcher |
| bool Parser::parseMatcherArgs(std::vector<ParserValue> &args, MatcherCtor ctor, |
| const TokenInfo &nameToken, TokenInfo &endToken) { |
| ScopedContextEntry sce(this, ctor); |
| |
| while (tokenizer->nextTokenKind() != TokenKind::Eof) { |
| if (tokenizer->nextTokenKind() == TokenKind::CloseParen) { |
| // end of args. |
| endToken = tokenizer->consumeNextToken(); |
| break; |
| } |
| |
| if (!args.empty()) { |
| // We must find a , token to continue. |
| TokenInfo commaToken = tokenizer->consumeNextToken(); |
| if (commaToken.kind != TokenKind::Comma) { |
| error->addError(commaToken.range, ErrorType::ParserNoComma) |
| << commaToken.text; |
| return false; |
| } |
| } |
| |
| ParserValue argValue; |
| tokenizer->skipNewlines(); |
| |
| argValue.text = tokenizer->peekNextToken().text; |
| argValue.range = tokenizer->peekNextToken().range; |
| if (!parseExpressionImpl(&argValue.value)) { |
| return false; |
| } |
| |
| tokenizer->skipNewlines(); |
| args.push_back(argValue); |
| sce.nextArg(); |
| } |
| |
| return true; |
| } |
| |
| // Parse and validate a matcher expression. |
| bool Parser::parseMatcherExpressionImpl(const TokenInfo &nameToken, |
| const TokenInfo &openToken, |
| std::optional<MatcherCtor> ctor, |
| VariantValue *value) { |
| if (!ctor) { |
| error->addError(nameToken.range, ErrorType::RegistryMatcherNotFound) |
| << nameToken.text; |
| // Do not return here. We need to continue to give completion suggestions. |
| } |
| |
| std::vector<ParserValue> args; |
| TokenInfo endToken; |
| |
| tokenizer->skipNewlines(); |
| |
| if (!parseMatcherArgs(args, ctor.value_or(nullptr), nameToken, endToken)) { |
| return false; |
| } |
| |
| // Check for the missing closing parenthesis |
| if (endToken.kind != TokenKind::CloseParen) { |
| error->addError(openToken.range, ErrorType::ParserNoCloseParen) |
| << nameToken.text; |
| return false; |
| } |
| |
| std::string functionName; |
| if (tokenizer->peekNextToken().kind == TokenKind::Period) { |
| tokenizer->consumeNextToken(); |
| TokenInfo chainCallToken = tokenizer->consumeNextToken(); |
| if (chainCallToken.kind == TokenKind::CodeCompletion) { |
| addCompletion(chainCallToken, MatcherCompletion("extract(\"", "extract")); |
| return false; |
| } |
| |
| if (chainCallToken.kind != TokenKind::Ident || |
| chainCallToken.text != TokenInfo::ID_Extract) { |
| error->addError(chainCallToken.range, |
| ErrorType::ParserMalformedChainedExpr); |
| return false; |
| } |
| |
| if (chainCallToken.text == TokenInfo::ID_Extract && |
| !parseChainedExpression(functionName)) |
| return false; |
| } |
| |
| if (!ctor) |
| return false; |
| // Merge the start and end infos. |
| SourceRange matcherRange = nameToken.range; |
| matcherRange.end = endToken.range.end; |
| VariantMatcher result = sema->actOnMatcherExpression( |
| *ctor, matcherRange, functionName, args, error); |
| if (result.isNull()) |
| return false; |
| *value = result; |
| return true; |
| } |
| |
| // If the prefix of this completion matches the completion token, add it to |
| // completions minus the prefix. |
| void Parser::addCompletion(const TokenInfo &compToken, |
| const MatcherCompletion &completion) { |
| if (llvm::StringRef(completion.typedText).starts_with(compToken.text)) { |
| completions.emplace_back(completion.typedText.substr(compToken.text.size()), |
| completion.matcherDecl); |
| } |
| } |
| |
| std::vector<MatcherCompletion> |
| Parser::getNamedValueCompletions(llvm::ArrayRef<ArgKind> acceptedTypes) { |
| if (!namedValues) |
| return {}; |
| |
| std::vector<MatcherCompletion> result; |
| for (const auto &entry : *namedValues) { |
| std::string decl = |
| (entry.getValue().getTypeAsString() + " " + entry.getKey()).str(); |
| result.emplace_back(entry.getKey(), decl); |
| } |
| return result; |
| } |
| |
| void Parser::addExpressionCompletions() { |
| const TokenInfo compToken = tokenizer->consumeNextTokenIgnoreNewlines(); |
| assert(compToken.kind == TokenKind::CodeCompletion); |
| |
| // We cannot complete code if there is an invalid element on the context |
| // stack. |
| for (const auto &entry : contextStack) { |
| if (!entry.first) |
| return; |
| } |
| |
| auto acceptedTypes = sema->getAcceptedCompletionTypes(contextStack); |
| for (const auto &completion : sema->getMatcherCompletions(acceptedTypes)) { |
| addCompletion(compToken, completion); |
| } |
| |
| for (const auto &completion : getNamedValueCompletions(acceptedTypes)) { |
| addCompletion(compToken, completion); |
| } |
| } |
| |
| // Parse an <Expresssion> |
| bool Parser::parseExpressionImpl(VariantValue *value) { |
| switch (tokenizer->nextTokenKind()) { |
| case TokenKind::Literal: |
| *value = tokenizer->consumeNextToken().value; |
| return true; |
| case TokenKind::Ident: |
| return parseIdentifierPrefixImpl(value); |
| case TokenKind::CodeCompletion: |
| addExpressionCompletions(); |
| return false; |
| case TokenKind::Eof: |
| error->addError(tokenizer->consumeNextToken().range, |
| ErrorType::ParserNoCode); |
| return false; |
| |
| case TokenKind::Error: |
| // This error was already reported by the tokenizer. |
| return false; |
| case TokenKind::NewLine: |
| case TokenKind::OpenParen: |
| case TokenKind::CloseParen: |
| case TokenKind::Comma: |
| case TokenKind::Period: |
| case TokenKind::InvalidChar: |
| const TokenInfo token = tokenizer->consumeNextToken(); |
| error->addError(token.range, ErrorType::ParserInvalidToken) |
| << (token.kind == TokenKind::NewLine ? "NewLine" : token.text); |
| return false; |
| } |
| |
| llvm_unreachable("Unknown token kind."); |
| } |
| |
| Parser::Parser(CodeTokenizer *tokenizer, const Registry &matcherRegistry, |
| const NamedValueMap *namedValues, Diagnostics *error) |
| : tokenizer(tokenizer), |
| sema(std::make_unique<RegistrySema>(matcherRegistry)), |
| namedValues(namedValues), error(error) {} |
| |
| Parser::RegistrySema::~RegistrySema() = default; |
| |
| std::optional<MatcherCtor> |
| Parser::RegistrySema::lookupMatcherCtor(llvm::StringRef matcherName) { |
| return RegistryManager::lookupMatcherCtor(matcherName, matcherRegistry); |
| } |
| |
| VariantMatcher Parser::RegistrySema::actOnMatcherExpression( |
| MatcherCtor ctor, SourceRange nameRange, llvm::StringRef functionName, |
| llvm::ArrayRef<ParserValue> args, Diagnostics *error) { |
| return RegistryManager::constructMatcher(ctor, nameRange, functionName, args, |
| error); |
| } |
| |
| std::vector<ArgKind> Parser::RegistrySema::getAcceptedCompletionTypes( |
| llvm::ArrayRef<std::pair<MatcherCtor, unsigned>> context) { |
| return RegistryManager::getAcceptedCompletionTypes(context); |
| } |
| |
| std::vector<MatcherCompletion> Parser::RegistrySema::getMatcherCompletions( |
| llvm::ArrayRef<ArgKind> acceptedTypes) { |
| return RegistryManager::getMatcherCompletions(acceptedTypes, matcherRegistry); |
| } |
| |
| bool Parser::parseExpression(llvm::StringRef &code, |
| const Registry &matcherRegistry, |
| const NamedValueMap *namedValues, |
| VariantValue *value, Diagnostics *error) { |
| CodeTokenizer tokenizer(code, error); |
| Parser parser(&tokenizer, matcherRegistry, namedValues, error); |
| if (!parser.parseExpressionImpl(value)) |
| return false; |
| auto nextToken = tokenizer.peekNextToken(); |
| if (nextToken.kind != TokenKind::Eof && |
| nextToken.kind != TokenKind::NewLine) { |
| error->addError(tokenizer.peekNextToken().range, |
| ErrorType::ParserTrailingCode); |
| return false; |
| } |
| return true; |
| } |
| |
| std::vector<MatcherCompletion> |
| Parser::completeExpression(llvm::StringRef &code, unsigned completionOffset, |
| const Registry &matcherRegistry, |
| const NamedValueMap *namedValues) { |
| Diagnostics error; |
| CodeTokenizer tokenizer(code, &error, completionOffset); |
| Parser parser(&tokenizer, matcherRegistry, namedValues, &error); |
| VariantValue dummy; |
| parser.parseExpressionImpl(&dummy); |
| |
| return parser.completions; |
| } |
| |
| std::optional<DynMatcher> Parser::parseMatcherExpression( |
| llvm::StringRef &code, const Registry &matcherRegistry, |
| const NamedValueMap *namedValues, Diagnostics *error) { |
| VariantValue value; |
| if (!parseExpression(code, matcherRegistry, namedValues, &value, error)) |
| return std::nullopt; |
| if (!value.isMatcher()) { |
| error->addError(SourceRange(), ErrorType::ParserNotAMatcher); |
| return std::nullopt; |
| } |
| std::optional<DynMatcher> result = value.getMatcher().getDynMatcher(); |
| if (!result) { |
| error->addError(SourceRange(), ErrorType::ParserOverloadedType) |
| << value.getTypeAsString(); |
| } |
| return result; |
| } |
| |
| } // namespace mlir::query::matcher::internal |