| //===-- ResourceScriptToken.cpp ---------------------------------*- C++-*-===// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===---------------------------------------------------------------------===// |
| // |
| // This file implements an interface defined in ResourceScriptToken.h. |
| // In particular, it defines an .rc script tokenizer. |
| // |
| //===---------------------------------------------------------------------===// |
| |
| #include "ResourceScriptToken.h" |
| #include "llvm/ADT/StringExtras.h" |
| #include "llvm/Support/raw_ostream.h" |
| |
| #include <algorithm> |
| #include <cassert> |
| #include <cctype> |
| #include <cstdlib> |
| #include <utility> |
| |
| using namespace llvm; |
| |
| using Kind = RCToken::Kind; |
| |
| // Checks if Representation is a correct description of an RC integer. |
| // It should be a 32-bit unsigned integer, either decimal, octal (0[0-7]+), |
| // or hexadecimal (0x[0-9a-f]+). It might be followed by a single 'L' |
| // character (that is the difference between our representation and |
| // StringRef's one). If Representation is correct, 'true' is returned and |
| // the return value is put back in Num. |
| static bool rcGetAsInteger(StringRef Representation, uint32_t &Num) { |
| size_t Length = Representation.size(); |
| if (Length == 0) |
| return false; |
| // Strip the last 'L' if unnecessary. |
| if (std::toupper(Representation.back()) == 'L') |
| Representation = Representation.drop_back(1); |
| |
| return !Representation.getAsInteger<uint32_t>(0, Num); |
| } |
| |
| RCToken::RCToken(RCToken::Kind RCTokenKind, StringRef Value) |
| : TokenKind(RCTokenKind), TokenValue(Value) {} |
| |
| uint32_t RCToken::intValue() const { |
| assert(TokenKind == Kind::Int); |
| // We assume that the token already is a correct integer (checked by |
| // rcGetAsInteger). |
| uint32_t Result; |
| bool IsSuccess = rcGetAsInteger(TokenValue, Result); |
| assert(IsSuccess); |
| (void)IsSuccess; // Silence the compiler warning when -DNDEBUG flag is on. |
| return Result; |
| } |
| |
| bool RCToken::isLongInt() const { |
| return TokenKind == Kind::Int && std::toupper(TokenValue.back()) == 'L'; |
| } |
| |
| StringRef RCToken::value() const { return TokenValue; } |
| |
| Kind RCToken::kind() const { return TokenKind; } |
| |
| bool RCToken::isBinaryOp() const { |
| switch (TokenKind) { |
| case Kind::Plus: |
| case Kind::Minus: |
| case Kind::Pipe: |
| case Kind::Amp: |
| return true; |
| default: |
| return false; |
| } |
| } |
| |
| static Error getStringError(const Twine &message) { |
| return make_error<StringError>("Error parsing file: " + message, |
| inconvertibleErrorCode()); |
| } |
| |
| namespace { |
| |
| class Tokenizer { |
| public: |
| Tokenizer(StringRef Input) : Data(Input), DataLength(Input.size()), Pos(0) {} |
| |
| Expected<std::vector<RCToken>> run(); |
| |
| private: |
| // All 'advancing' methods return boolean values; if they're equal to false, |
| // the stream has ended or failed. |
| bool advance(size_t Amount = 1); |
| bool skipWhitespaces(); |
| |
| // Consumes a token. If any problem occurred, a non-empty Error is returned. |
| Error consumeToken(const Kind TokenKind); |
| |
| // Check if tokenizer is about to read FollowingChars. |
| bool willNowRead(StringRef FollowingChars) const; |
| |
| // Check if tokenizer can start reading an identifier at current position. |
| // The original tool did non specify the rules to determine what is a correct |
| // identifier. We assume they should follow the C convention: |
| // [a-zA-Z_][a-zA-Z0-9_]*. |
| bool canStartIdentifier() const; |
| // Check if tokenizer can continue reading an identifier. |
| bool canContinueIdentifier() const; |
| |
| // Check if tokenizer can start reading an integer. |
| // A correct integer always starts with a 0-9 digit, |
| // can contain characters 0-9A-Fa-f (digits), |
| // Ll (marking the integer is 32-bit), Xx (marking the representation |
| // is hexadecimal). As some kind of separator should come after the |
| // integer, we can consume the integer until a non-alphanumeric |
| // character. |
| bool canStartInt() const; |
| bool canContinueInt() const; |
| |
| bool canStartString() const; |
| |
| // Check if tokenizer can start reading a single line comment (e.g. a comment |
| // that begins with '//') |
| bool canStartLineComment() const; |
| |
| // Check if tokenizer can start or finish reading a block comment (e.g. a |
| // comment that begins with '/*' and ends with '*/') |
| bool canStartBlockComment() const; |
| |
| // Throw away all remaining characters on the current line. |
| void skipCurrentLine(); |
| |
| bool streamEof() const; |
| |
| // Classify the token that is about to be read from the current position. |
| Kind classifyCurrentToken() const; |
| |
| // Process the Kind::Identifier token - check if it is |
| // an identifier describing a block start or end. |
| void processIdentifier(RCToken &token) const; |
| |
| StringRef Data; |
| size_t DataLength, Pos; |
| }; |
| |
| void Tokenizer::skipCurrentLine() { |
| Pos = Data.find_first_of("\r\n", Pos); |
| Pos = Data.find_first_not_of("\r\n", Pos); |
| |
| if (Pos == StringRef::npos) |
| Pos = DataLength; |
| } |
| |
| Expected<std::vector<RCToken>> Tokenizer::run() { |
| Pos = 0; |
| std::vector<RCToken> Result; |
| |
| // Consume an optional UTF-8 Byte Order Mark. |
| if (willNowRead("\xef\xbb\xbf")) |
| advance(3); |
| |
| while (!streamEof()) { |
| if (!skipWhitespaces()) |
| break; |
| |
| Kind TokenKind = classifyCurrentToken(); |
| if (TokenKind == Kind::Invalid) |
| return getStringError("Invalid token found at position " + Twine(Pos)); |
| |
| const size_t TokenStart = Pos; |
| if (Error TokenError = consumeToken(TokenKind)) |
| return std::move(TokenError); |
| |
| // Comments are just deleted, don't bother saving them. |
| if (TokenKind == Kind::LineComment || TokenKind == Kind::StartComment) |
| continue; |
| |
| RCToken Token(TokenKind, Data.take_front(Pos).drop_front(TokenStart)); |
| if (TokenKind == Kind::Identifier) { |
| processIdentifier(Token); |
| } else if (TokenKind == Kind::Int) { |
| uint32_t TokenInt; |
| if (!rcGetAsInteger(Token.value(), TokenInt)) { |
| // The integer has incorrect format or cannot be represented in |
| // a 32-bit integer. |
| return getStringError("Integer invalid or too large: " + |
| Token.value().str()); |
| } |
| } |
| |
| Result.push_back(Token); |
| } |
| |
| return Result; |
| } |
| |
| bool Tokenizer::advance(size_t Amount) { |
| Pos += Amount; |
| return !streamEof(); |
| } |
| |
| bool Tokenizer::skipWhitespaces() { |
| while (!streamEof() && isSpace(Data[Pos])) |
| advance(); |
| return !streamEof(); |
| } |
| |
| Error Tokenizer::consumeToken(const Kind TokenKind) { |
| switch (TokenKind) { |
| // One-character token consumption. |
| #define TOKEN(Name) |
| #define SHORT_TOKEN(Name, Ch) case Kind::Name: |
| #include "ResourceScriptTokenList.def" |
| advance(); |
| return Error::success(); |
| |
| case Kind::LineComment: |
| advance(2); |
| skipCurrentLine(); |
| return Error::success(); |
| |
| case Kind::StartComment: { |
| advance(2); |
| auto EndPos = Data.find("*/", Pos); |
| if (EndPos == StringRef::npos) |
| return getStringError( |
| "Unclosed multi-line comment beginning at position " + Twine(Pos)); |
| advance(EndPos - Pos); |
| advance(2); |
| return Error::success(); |
| } |
| case Kind::Identifier: |
| while (!streamEof() && canContinueIdentifier()) |
| advance(); |
| return Error::success(); |
| |
| case Kind::Int: |
| while (!streamEof() && canContinueInt()) |
| advance(); |
| return Error::success(); |
| |
| case Kind::String: |
| // Consume the preceding 'L', if there is any. |
| if (std::toupper(Data[Pos]) == 'L') |
| advance(); |
| // Consume the double-quote. |
| advance(); |
| |
| // Consume the characters until the end of the file, line or string. |
| while (true) { |
| if (streamEof()) { |
| return getStringError("Unterminated string literal."); |
| } else if (Data[Pos] == '"') { |
| // Consume the ending double-quote. |
| advance(); |
| // However, if another '"' follows this double-quote, the string didn't |
| // end and we just included '"' into the string. |
| if (!willNowRead("\"")) |
| return Error::success(); |
| } else if (Data[Pos] == '\n') { |
| return getStringError("String literal not terminated in the line."); |
| } |
| |
| advance(); |
| } |
| |
| case Kind::Invalid: |
| assert(false && "Cannot consume an invalid token."); |
| } |
| |
| llvm_unreachable("Unknown RCToken::Kind"); |
| } |
| |
| bool Tokenizer::willNowRead(StringRef FollowingChars) const { |
| return Data.drop_front(Pos).startswith(FollowingChars); |
| } |
| |
| bool Tokenizer::canStartIdentifier() const { |
| assert(!streamEof()); |
| |
| const char CurChar = Data[Pos]; |
| return std::isalpha(CurChar) || CurChar == '_' || CurChar == '.'; |
| } |
| |
| bool Tokenizer::canContinueIdentifier() const { |
| assert(!streamEof()); |
| const char CurChar = Data[Pos]; |
| return std::isalnum(CurChar) || CurChar == '_' || CurChar == '.' || |
| CurChar == '/' || CurChar == '\\'; |
| } |
| |
| bool Tokenizer::canStartInt() const { |
| assert(!streamEof()); |
| return std::isdigit(Data[Pos]); |
| } |
| |
| bool Tokenizer::canStartBlockComment() const { |
| assert(!streamEof()); |
| return Data.drop_front(Pos).startswith("/*"); |
| } |
| |
| bool Tokenizer::canStartLineComment() const { |
| assert(!streamEof()); |
| return Data.drop_front(Pos).startswith("//"); |
| } |
| |
| bool Tokenizer::canContinueInt() const { |
| assert(!streamEof()); |
| return std::isalnum(Data[Pos]); |
| } |
| |
| bool Tokenizer::canStartString() const { |
| return willNowRead("\"") || willNowRead("L\"") || willNowRead("l\""); |
| } |
| |
| bool Tokenizer::streamEof() const { return Pos == DataLength; } |
| |
| Kind Tokenizer::classifyCurrentToken() const { |
| if (canStartBlockComment()) |
| return Kind::StartComment; |
| if (canStartLineComment()) |
| return Kind::LineComment; |
| |
| if (canStartInt()) |
| return Kind::Int; |
| if (canStartString()) |
| return Kind::String; |
| // BEGIN and END are at this point of lexing recognized as identifiers. |
| if (canStartIdentifier()) |
| return Kind::Identifier; |
| |
| const char CurChar = Data[Pos]; |
| |
| switch (CurChar) { |
| // One-character token classification. |
| #define TOKEN(Name) |
| #define SHORT_TOKEN(Name, Ch) \ |
| case Ch: \ |
| return Kind::Name; |
| #include "ResourceScriptTokenList.def" |
| |
| default: |
| return Kind::Invalid; |
| } |
| } |
| |
| void Tokenizer::processIdentifier(RCToken &Token) const { |
| assert(Token.kind() == Kind::Identifier); |
| StringRef Name = Token.value(); |
| |
| if (Name.equals_lower("begin")) |
| Token = RCToken(Kind::BlockBegin, Name); |
| else if (Name.equals_lower("end")) |
| Token = RCToken(Kind::BlockEnd, Name); |
| } |
| |
| } // anonymous namespace |
| |
| namespace llvm { |
| |
| Expected<std::vector<RCToken>> tokenizeRC(StringRef Input) { |
| return Tokenizer(Input).run(); |
| } |
| |
| } // namespace llvm |