| //===- AsmLexer.cpp - Lexer for Assembly Files ----------------------------===// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| // |
| // This class implements the lexer for assembly files. |
| // |
| //===----------------------------------------------------------------------===// |
| |
| #include "llvm/MC/MCParser/AsmLexer.h" |
| #include "llvm/ADT/APInt.h" |
| #include "llvm/ADT/ArrayRef.h" |
| #include "llvm/ADT/StringExtras.h" |
| #include "llvm/ADT/StringRef.h" |
| #include "llvm/ADT/StringSwitch.h" |
| #include "llvm/MC/MCAsmInfo.h" |
| #include "llvm/MC/MCParser/MCAsmLexer.h" |
| #include "llvm/Support/Compiler.h" |
| #include "llvm/Support/SMLoc.h" |
| #include "llvm/Support/SaveAndRestore.h" |
| #include <cassert> |
| #include <cctype> |
| #include <cstdio> |
| #include <cstring> |
| #include <string> |
| #include <tuple> |
| #include <utility> |
| |
| using namespace llvm; |
| |
| AsmLexer::AsmLexer(const MCAsmInfo &MAI) : MAI(MAI) { |
| AllowAtInIdentifier = !StringRef(MAI.getCommentString()).startswith("@"); |
| LexMotorolaIntegers = MAI.shouldUseMotorolaIntegers(); |
| } |
| |
| AsmLexer::~AsmLexer() = default; |
| |
| void AsmLexer::setBuffer(StringRef Buf, const char *ptr, |
| bool EndStatementAtEOF) { |
| CurBuf = Buf; |
| |
| if (ptr) |
| CurPtr = ptr; |
| else |
| CurPtr = CurBuf.begin(); |
| |
| TokStart = nullptr; |
| this->EndStatementAtEOF = EndStatementAtEOF; |
| } |
| |
| /// ReturnError - Set the error to the specified string at the specified |
| /// location. This is defined to always return AsmToken::Error. |
| AsmToken AsmLexer::ReturnError(const char *Loc, const std::string &Msg) { |
| SetError(SMLoc::getFromPointer(Loc), Msg); |
| |
| return AsmToken(AsmToken::Error, StringRef(Loc, CurPtr - Loc)); |
| } |
| |
| int AsmLexer::getNextChar() { |
| if (CurPtr == CurBuf.end()) |
| return EOF; |
| return (unsigned char)*CurPtr++; |
| } |
| |
| int AsmLexer::peekNextChar() { |
| if (CurPtr == CurBuf.end()) |
| return EOF; |
| return (unsigned char)*CurPtr; |
| } |
| |
| /// The leading integral digit sequence and dot should have already been |
| /// consumed, some or all of the fractional digit sequence *can* have been |
| /// consumed. |
| AsmToken AsmLexer::LexFloatLiteral() { |
| // Skip the fractional digit sequence. |
| while (isDigit(*CurPtr)) |
| ++CurPtr; |
| |
| if (*CurPtr == '-' || *CurPtr == '+') |
| return ReturnError(CurPtr, "Invalid sign in float literal"); |
| |
| // Check for exponent |
| if ((*CurPtr == 'e' || *CurPtr == 'E')) { |
| ++CurPtr; |
| |
| if (*CurPtr == '-' || *CurPtr == '+') |
| ++CurPtr; |
| |
| while (isDigit(*CurPtr)) |
| ++CurPtr; |
| } |
| |
| return AsmToken(AsmToken::Real, |
| StringRef(TokStart, CurPtr - TokStart)); |
| } |
| |
| /// LexHexFloatLiteral matches essentially (.[0-9a-fA-F]*)?[pP][+-]?[0-9a-fA-F]+ |
| /// while making sure there are enough actual digits around for the constant to |
| /// be valid. |
| /// |
| /// The leading "0x[0-9a-fA-F]*" (i.e. integer part) has already been consumed |
| /// before we get here. |
| AsmToken AsmLexer::LexHexFloatLiteral(bool NoIntDigits) { |
| assert((*CurPtr == 'p' || *CurPtr == 'P' || *CurPtr == '.') && |
| "unexpected parse state in floating hex"); |
| bool NoFracDigits = true; |
| |
| // Skip the fractional part if there is one |
| if (*CurPtr == '.') { |
| ++CurPtr; |
| |
| const char *FracStart = CurPtr; |
| while (isHexDigit(*CurPtr)) |
| ++CurPtr; |
| |
| NoFracDigits = CurPtr == FracStart; |
| } |
| |
| if (NoIntDigits && NoFracDigits) |
| return ReturnError(TokStart, "invalid hexadecimal floating-point constant: " |
| "expected at least one significand digit"); |
| |
| // Make sure we do have some kind of proper exponent part |
| if (*CurPtr != 'p' && *CurPtr != 'P') |
| return ReturnError(TokStart, "invalid hexadecimal floating-point constant: " |
| "expected exponent part 'p'"); |
| ++CurPtr; |
| |
| if (*CurPtr == '+' || *CurPtr == '-') |
| ++CurPtr; |
| |
| // N.b. exponent digits are *not* hex |
| const char *ExpStart = CurPtr; |
| while (isDigit(*CurPtr)) |
| ++CurPtr; |
| |
| if (CurPtr == ExpStart) |
| return ReturnError(TokStart, "invalid hexadecimal floating-point constant: " |
| "expected at least one exponent digit"); |
| |
| return AsmToken(AsmToken::Real, StringRef(TokStart, CurPtr - TokStart)); |
| } |
| |
| /// LexIdentifier: [a-zA-Z_.][a-zA-Z0-9_$.@#?]* |
| static bool isIdentifierChar(char C, bool AllowAt, bool AllowHash) { |
| return isAlnum(C) || C == '_' || C == '$' || C == '.' || C == '?' || |
| (AllowAt && C == '@') || (AllowHash && C == '#'); |
| } |
| |
| AsmToken AsmLexer::LexIdentifier() { |
| // Check for floating point literals. |
| if (CurPtr[-1] == '.' && isDigit(*CurPtr)) { |
| // Disambiguate a .1243foo identifier from a floating literal. |
| while (isDigit(*CurPtr)) |
| ++CurPtr; |
| |
| if (!isIdentifierChar(*CurPtr, AllowAtInIdentifier, |
| AllowHashInIdentifier) || |
| *CurPtr == 'e' || *CurPtr == 'E') |
| return LexFloatLiteral(); |
| } |
| |
| while (isIdentifierChar(*CurPtr, AllowAtInIdentifier, AllowHashInIdentifier)) |
| ++CurPtr; |
| |
| // Handle . as a special case. |
| if (CurPtr == TokStart+1 && TokStart[0] == '.') |
| return AsmToken(AsmToken::Dot, StringRef(TokStart, 1)); |
| |
| return AsmToken(AsmToken::Identifier, StringRef(TokStart, CurPtr - TokStart)); |
| } |
| |
| /// LexSlash: Slash: / |
| /// C-Style Comment: /* ... */ |
| /// C-style Comment: // ... |
| AsmToken AsmLexer::LexSlash() { |
| if (!MAI.shouldAllowAdditionalComments()) { |
| IsAtStartOfStatement = false; |
| return AsmToken(AsmToken::Slash, StringRef(TokStart, 1)); |
| } |
| |
| switch (*CurPtr) { |
| case '*': |
| IsAtStartOfStatement = false; |
| break; // C style comment. |
| case '/': |
| ++CurPtr; |
| return LexLineComment(); |
| default: |
| IsAtStartOfStatement = false; |
| return AsmToken(AsmToken::Slash, StringRef(TokStart, 1)); |
| } |
| |
| // C Style comment. |
| ++CurPtr; // skip the star. |
| const char *CommentTextStart = CurPtr; |
| while (CurPtr != CurBuf.end()) { |
| switch (*CurPtr++) { |
| case '*': |
| // End of the comment? |
| if (*CurPtr != '/') |
| break; |
| // If we have a CommentConsumer, notify it about the comment. |
| if (CommentConsumer) { |
| CommentConsumer->HandleComment( |
| SMLoc::getFromPointer(CommentTextStart), |
| StringRef(CommentTextStart, CurPtr - 1 - CommentTextStart)); |
| } |
| ++CurPtr; // End the */. |
| return AsmToken(AsmToken::Comment, |
| StringRef(TokStart, CurPtr - TokStart)); |
| } |
| } |
| return ReturnError(TokStart, "unterminated comment"); |
| } |
| |
| /// LexLineComment: Comment: #[^\n]* |
| /// : //[^\n]* |
| AsmToken AsmLexer::LexLineComment() { |
| // Mark This as an end of statement with a body of the |
| // comment. While it would be nicer to leave this two tokens, |
| // backwards compatability with TargetParsers makes keeping this in this form |
| // better. |
| const char *CommentTextStart = CurPtr; |
| int CurChar = getNextChar(); |
| while (CurChar != '\n' && CurChar != '\r' && CurChar != EOF) |
| CurChar = getNextChar(); |
| if (CurChar == '\r' && CurPtr != CurBuf.end() && *CurPtr == '\n') |
| ++CurPtr; |
| |
| // If we have a CommentConsumer, notify it about the comment. |
| if (CommentConsumer) { |
| CommentConsumer->HandleComment( |
| SMLoc::getFromPointer(CommentTextStart), |
| StringRef(CommentTextStart, CurPtr - 1 - CommentTextStart)); |
| } |
| |
| IsAtStartOfLine = true; |
| // This is a whole line comment. leave newline |
| if (IsAtStartOfStatement) |
| return AsmToken(AsmToken::EndOfStatement, |
| StringRef(TokStart, CurPtr - TokStart)); |
| IsAtStartOfStatement = true; |
| |
| return AsmToken(AsmToken::EndOfStatement, |
| StringRef(TokStart, CurPtr - 1 - TokStart)); |
| } |
| |
/// Advance \p CurPtr past an ignored integer type suffix: an optional 'U'
/// followed by up to two 'L's (i.e. U, L, LL, UL or ULL). Only upper-case
/// letters are recognised.
static void SkipIgnoredIntegerSuffix(const char *&CurPtr) {
  const char *Cursor = CurPtr;

  if (*Cursor == 'U')
    ++Cursor;

  // At most two 'L's may follow.
  for (int Remaining = 2; Remaining > 0 && *Cursor == 'L'; --Remaining)
    ++Cursor;

  CurPtr = Cursor;
}
| |
| // Look ahead to search for first non-hex digit, if it's [hH], then we treat the |
| // integer as a hexadecimal, possibly with leading zeroes. |
| static unsigned doHexLookAhead(const char *&CurPtr, unsigned DefaultRadix, |
| bool LexHex) { |
| const char *FirstNonDec = nullptr; |
| const char *LookAhead = CurPtr; |
| while (true) { |
| if (isDigit(*LookAhead)) { |
| ++LookAhead; |
| } else { |
| if (!FirstNonDec) |
| FirstNonDec = LookAhead; |
| |
| // Keep going if we are looking for a 'h' suffix. |
| if (LexHex && isHexDigit(*LookAhead)) |
| ++LookAhead; |
| else |
| break; |
| } |
| } |
| bool isHex = LexHex && (*LookAhead == 'h' || *LookAhead == 'H'); |
| CurPtr = isHex || !FirstNonDec ? LookAhead : FirstNonDec; |
| if (isHex) |
| return 16; |
| return DefaultRadix; |
| } |
| |
| static const char *findLastDigit(const char *CurPtr, unsigned DefaultRadix) { |
| while (hexDigitValue(*CurPtr) < DefaultRadix) { |
| ++CurPtr; |
| } |
| return CurPtr; |
| } |
| |
| static AsmToken intToken(StringRef Ref, APInt &Value) { |
| if (Value.isIntN(64)) |
| return AsmToken(AsmToken::Integer, Ref, Value); |
| return AsmToken(AsmToken::BigNum, Ref, Value); |
| } |
| |
/// Return a human-readable name for \p Radix for use in diagnostics:
/// "binary", "octal", "decimal" or "hexadecimal" for the common radices, and
/// "base-N" for anything else.
static std::string radixName(unsigned Radix) {
  if (Radix == 2)
    return "binary";
  if (Radix == 8)
    return "octal";
  if (Radix == 10)
    return "decimal";
  if (Radix == 16)
    return "hexadecimal";

  std::string Name = "base-";
  Name += std::to_string(Radix);
  return Name;
}
| |
| /// LexDigit: First character is [0-9]. |
| /// Local Label: [0-9][:] |
| /// Forward/Backward Label: [0-9][fb] |
| /// Binary integer: 0b[01]+ |
| /// Octal integer: 0[0-7]+ |
| /// Hex integer: 0x[0-9a-fA-F]+ or [0x]?[0-9][0-9a-fA-F]*[hH] |
| /// Decimal integer: [1-9][0-9]* |
| AsmToken AsmLexer::LexDigit() { |
| // MASM-flavor binary integer: [01]+[yY] (if DefaultRadix < 16, [bByY]) |
| // MASM-flavor octal integer: [0-7]+[oOqQ] |
| // MASM-flavor decimal integer: [0-9]+[tT] (if DefaultRadix < 16, [dDtT]) |
| // MASM-flavor hexadecimal integer: [0-9][0-9a-fA-F]*[hH] |
| if (LexMasmIntegers && isdigit(CurPtr[-1])) { |
| const char *FirstNonBinary = |
| (CurPtr[-1] != '0' && CurPtr[-1] != '1') ? CurPtr - 1 : nullptr; |
| const char *FirstNonDecimal = |
| (CurPtr[-1] < '0' || CurPtr[-1] > '9') ? CurPtr - 1 : nullptr; |
| const char *OldCurPtr = CurPtr; |
| while (isHexDigit(*CurPtr)) { |
| switch (*CurPtr) { |
| default: |
| if (!FirstNonDecimal) { |
| FirstNonDecimal = CurPtr; |
| } |
| LLVM_FALLTHROUGH; |
| case '9': |
| case '8': |
| case '7': |
| case '6': |
| case '5': |
| case '4': |
| case '3': |
| case '2': |
| if (!FirstNonBinary) { |
| FirstNonBinary = CurPtr; |
| } |
| break; |
| case '1': |
| case '0': |
| break; |
| } |
| ++CurPtr; |
| } |
| if (*CurPtr == '.') { |
| // MASM float literals (other than hex floats) always contain a ".", and |
| // are always written in decimal. |
| ++CurPtr; |
| return LexFloatLiteral(); |
| } |
| |
| if (LexMasmHexFloats && (*CurPtr == 'r' || *CurPtr == 'R')) { |
| ++CurPtr; |
| return AsmToken(AsmToken::Real, StringRef(TokStart, CurPtr - TokStart)); |
| } |
| |
| unsigned Radix = 0; |
| if (*CurPtr == 'h' || *CurPtr == 'H') { |
| // hexadecimal number |
| ++CurPtr; |
| Radix = 16; |
| } else if (*CurPtr == 't' || *CurPtr == 'T') { |
| // decimal number |
| ++CurPtr; |
| Radix = 10; |
| } else if (*CurPtr == 'o' || *CurPtr == 'O' || *CurPtr == 'q' || |
| *CurPtr == 'Q') { |
| // octal number |
| ++CurPtr; |
| Radix = 8; |
| } else if (*CurPtr == 'y' || *CurPtr == 'Y') { |
| // binary number |
| ++CurPtr; |
| Radix = 2; |
| } else if (FirstNonDecimal && FirstNonDecimal + 1 == CurPtr && |
| DefaultRadix < 14 && |
| (*FirstNonDecimal == 'd' || *FirstNonDecimal == 'D')) { |
| Radix = 10; |
| } else if (FirstNonBinary && FirstNonBinary + 1 == CurPtr && |
| DefaultRadix < 12 && |
| (*FirstNonBinary == 'b' || *FirstNonBinary == 'B')) { |
| Radix = 2; |
| } |
| |
| if (Radix) { |
| StringRef Result(TokStart, CurPtr - TokStart); |
| APInt Value(128, 0, true); |
| |
| if (Result.drop_back().getAsInteger(Radix, Value)) |
| return ReturnError(TokStart, "invalid " + radixName(Radix) + " number"); |
| |
| // MSVC accepts and ignores type suffices on integer literals. |
| SkipIgnoredIntegerSuffix(CurPtr); |
| |
| return intToken(Result, Value); |
| } |
| |
| // default-radix integers, or floating point numbers, fall through |
| CurPtr = OldCurPtr; |
| } |
| |
| // MASM default-radix integers: [0-9a-fA-F]+ |
| // (All other integer literals have a radix specifier.) |
| if (LexMasmIntegers && UseMasmDefaultRadix) { |
| CurPtr = findLastDigit(CurPtr, 16); |
| StringRef Result(TokStart, CurPtr - TokStart); |
| |
| APInt Value(128, 0, true); |
| if (Result.getAsInteger(DefaultRadix, Value)) { |
| return ReturnError(TokStart, |
| "invalid " + radixName(DefaultRadix) + " number"); |
| } |
| |
| return intToken(Result, Value); |
| } |
| |
| // Motorola hex integers: $[0-9a-fA-F]+ |
| if (LexMotorolaIntegers && CurPtr[-1] == '$') { |
| const char *NumStart = CurPtr; |
| while (isHexDigit(CurPtr[0])) |
| ++CurPtr; |
| |
| APInt Result(128, 0); |
| if (StringRef(NumStart, CurPtr - NumStart).getAsInteger(16, Result)) |
| return ReturnError(TokStart, "invalid hexadecimal number"); |
| |
| return intToken(StringRef(TokStart, CurPtr - TokStart), Result); |
| } |
| |
| // Motorola binary integers: %[01]+ |
| if (LexMotorolaIntegers && CurPtr[-1] == '%') { |
| const char *NumStart = CurPtr; |
| while (*CurPtr == '0' || *CurPtr == '1') |
| ++CurPtr; |
| |
| APInt Result(128, 0); |
| if (StringRef(NumStart, CurPtr - NumStart).getAsInteger(2, Result)) |
| return ReturnError(TokStart, "invalid binary number"); |
| |
| return intToken(StringRef(TokStart, CurPtr - TokStart), Result); |
| } |
| |
| // Decimal integer: [1-9][0-9]* |
| // HLASM-flavour decimal integer: [0-9][0-9]* |
| // FIXME: Later on, support for fb for HLASM has to be added in |
| // as they probably would be needed for asm goto |
| if (LexHLASMIntegers || CurPtr[-1] != '0' || CurPtr[0] == '.') { |
| unsigned Radix = doHexLookAhead(CurPtr, 10, LexMasmIntegers); |
| |
| if (!LexHLASMIntegers) { |
| bool IsHex = Radix == 16; |
| // Check for floating point literals. |
| if (!IsHex && (*CurPtr == '.' || *CurPtr == 'e' || *CurPtr == 'E')) { |
| if (*CurPtr == '.') |
| ++CurPtr; |
| return LexFloatLiteral(); |
| } |
| } |
| |
| StringRef Result(TokStart, CurPtr - TokStart); |
| |
| APInt Value(128, 0, true); |
| if (Result.getAsInteger(Radix, Value)) |
| return ReturnError(TokStart, "invalid " + radixName(Radix) + " number"); |
| |
| if (!LexHLASMIntegers) |
| // The darwin/x86 (and x86-64) assembler accepts and ignores type |
| // suffices on integer literals. |
| SkipIgnoredIntegerSuffix(CurPtr); |
| |
| return intToken(Result, Value); |
| } |
| |
| if (!LexMasmIntegers && ((*CurPtr == 'b') || (*CurPtr == 'B'))) { |
| ++CurPtr; |
| // See if we actually have "0b" as part of something like "jmp 0b\n" |
| if (!isDigit(CurPtr[0])) { |
| --CurPtr; |
| StringRef Result(TokStart, CurPtr - TokStart); |
| return AsmToken(AsmToken::Integer, Result, 0); |
| } |
| const char *NumStart = CurPtr; |
| while (CurPtr[0] == '0' || CurPtr[0] == '1') |
| ++CurPtr; |
| |
| // Requires at least one binary digit. |
| if (CurPtr == NumStart) |
| return ReturnError(TokStart, "invalid binary number"); |
| |
| StringRef Result(TokStart, CurPtr - TokStart); |
| |
| APInt Value(128, 0, true); |
| if (Result.substr(2).getAsInteger(2, Value)) |
| return ReturnError(TokStart, "invalid binary number"); |
| |
| // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL |
| // suffixes on integer literals. |
| SkipIgnoredIntegerSuffix(CurPtr); |
| |
| return intToken(Result, Value); |
| } |
| |
| if ((*CurPtr == 'x') || (*CurPtr == 'X')) { |
| ++CurPtr; |
| const char *NumStart = CurPtr; |
| while (isHexDigit(CurPtr[0])) |
| ++CurPtr; |
| |
| // "0x.0p0" is valid, and "0x0p0" (but not "0xp0" for example, which will be |
| // diagnosed by LexHexFloatLiteral). |
| if (CurPtr[0] == '.' || CurPtr[0] == 'p' || CurPtr[0] == 'P') |
| return LexHexFloatLiteral(NumStart == CurPtr); |
| |
| // Otherwise requires at least one hex digit. |
| if (CurPtr == NumStart) |
| return ReturnError(CurPtr-2, "invalid hexadecimal number"); |
| |
| APInt Result(128, 0); |
| if (StringRef(TokStart, CurPtr - TokStart).getAsInteger(0, Result)) |
| return ReturnError(TokStart, "invalid hexadecimal number"); |
| |
| // Consume the optional [hH]. |
| if (LexMasmIntegers && (*CurPtr == 'h' || *CurPtr == 'H')) |
| ++CurPtr; |
| |
| // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL |
| // suffixes on integer literals. |
| SkipIgnoredIntegerSuffix(CurPtr); |
| |
| return intToken(StringRef(TokStart, CurPtr - TokStart), Result); |
| } |
| |
| // Either octal or hexadecimal. |
| APInt Value(128, 0, true); |
| unsigned Radix = doHexLookAhead(CurPtr, 8, LexMasmIntegers); |
| StringRef Result(TokStart, CurPtr - TokStart); |
| if (Result.getAsInteger(Radix, Value)) |
| return ReturnError(TokStart, "invalid " + radixName(Radix) + " number"); |
| |
| // Consume the [hH]. |
| if (Radix == 16) |
| ++CurPtr; |
| |
| // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL |
| // suffixes on integer literals. |
| SkipIgnoredIntegerSuffix(CurPtr); |
| |
| return intToken(Result, Value); |
| } |
| |
| /// LexSingleQuote: Integer: 'b' |
| AsmToken AsmLexer::LexSingleQuote() { |
| int CurChar = getNextChar(); |
| |
| if (LexMasmStrings) { |
| while (CurChar != EOF) { |
| if (CurChar != '\'') { |
| CurChar = getNextChar(); |
| } else if (peekNextChar() == '\'') { |
| // In MASM single-quote strings, doubled single-quotes mean an escaped |
| // single quote, so should be lexed in. |
| getNextChar(); |
| CurChar = getNextChar(); |
| } else { |
| break; |
| } |
| } |
| if (CurChar == EOF) |
| return ReturnError(TokStart, "unterminated string constant"); |
| return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart)); |
| } |
| |
| if (CurChar == '\\') |
| CurChar = getNextChar(); |
| |
| if (CurChar == EOF) |
| return ReturnError(TokStart, "unterminated single quote"); |
| |
| CurChar = getNextChar(); |
| |
| if (CurChar != '\'') |
| return ReturnError(TokStart, "single quote way too long"); |
| |
| // The idea here being that 'c' is basically just an integral |
| // constant. |
| StringRef Res = StringRef(TokStart,CurPtr - TokStart); |
| long long Value; |
| |
| if (Res.startswith("\'\\")) { |
| char theChar = Res[2]; |
| switch (theChar) { |
| default: Value = theChar; break; |
| case '\'': Value = '\''; break; |
| case 't': Value = '\t'; break; |
| case 'n': Value = '\n'; break; |
| case 'b': Value = '\b'; break; |
| case 'f': Value = '\f'; break; |
| case 'r': Value = '\r'; break; |
| } |
| } else |
| Value = TokStart[1]; |
| |
| return AsmToken(AsmToken::Integer, Res, Value); |
| } |
| |
| /// LexQuote: String: "..." |
| AsmToken AsmLexer::LexQuote() { |
| int CurChar = getNextChar(); |
| if (LexMasmStrings) { |
| while (CurChar != EOF) { |
| if (CurChar != '"') { |
| CurChar = getNextChar(); |
| } else if (peekNextChar() == '"') { |
| // In MASM double-quoted strings, doubled double-quotes mean an escaped |
| // double quote, so should be lexed in. |
| getNextChar(); |
| CurChar = getNextChar(); |
| } else { |
| break; |
| } |
| } |
| if (CurChar == EOF) |
| return ReturnError(TokStart, "unterminated string constant"); |
| return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart)); |
| } |
| |
| // TODO: does gas allow multiline string constants? |
| while (CurChar != '"') { |
| if (CurChar == '\\') { |
| // Allow \", etc. |
| CurChar = getNextChar(); |
| } |
| |
| if (CurChar == EOF) |
| return ReturnError(TokStart, "unterminated string constant"); |
| |
| CurChar = getNextChar(); |
| } |
| |
| return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart)); |
| } |
| |
| StringRef AsmLexer::LexUntilEndOfStatement() { |
| TokStart = CurPtr; |
| |
| while (!isAtStartOfComment(CurPtr) && // Start of line comment. |
| !isAtStatementSeparator(CurPtr) && // End of statement marker. |
| *CurPtr != '\n' && *CurPtr != '\r' && CurPtr != CurBuf.end()) { |
| ++CurPtr; |
| } |
| return StringRef(TokStart, CurPtr-TokStart); |
| } |
| |
| StringRef AsmLexer::LexUntilEndOfLine() { |
| TokStart = CurPtr; |
| |
| while (*CurPtr != '\n' && *CurPtr != '\r' && CurPtr != CurBuf.end()) { |
| ++CurPtr; |
| } |
| return StringRef(TokStart, CurPtr-TokStart); |
| } |
| |
| size_t AsmLexer::peekTokens(MutableArrayRef<AsmToken> Buf, |
| bool ShouldSkipSpace) { |
| SaveAndRestore<const char *> SavedTokenStart(TokStart); |
| SaveAndRestore<const char *> SavedCurPtr(CurPtr); |
| SaveAndRestore<bool> SavedAtStartOfLine(IsAtStartOfLine); |
| SaveAndRestore<bool> SavedAtStartOfStatement(IsAtStartOfStatement); |
| SaveAndRestore<bool> SavedSkipSpace(SkipSpace, ShouldSkipSpace); |
| SaveAndRestore<bool> SavedIsPeeking(IsPeeking, true); |
| std::string SavedErr = getErr(); |
| SMLoc SavedErrLoc = getErrLoc(); |
| |
| size_t ReadCount; |
| for (ReadCount = 0; ReadCount < Buf.size(); ++ReadCount) { |
| AsmToken Token = LexToken(); |
| |
| Buf[ReadCount] = Token; |
| |
| if (Token.is(AsmToken::Eof)) |
| break; |
| } |
| |
| SetError(SavedErrLoc, SavedErr); |
| return ReadCount; |
| } |
| |
| bool AsmLexer::isAtStartOfComment(const char *Ptr) { |
| if (MAI.getRestrictCommentStringToStartOfStatement() && !IsAtStartOfStatement) |
| return false; |
| |
| StringRef CommentString = MAI.getCommentString(); |
| |
| if (CommentString.size() == 1) |
| return CommentString[0] == Ptr[0]; |
| |
| // Allow # preprocessor commments also be counted as comments for "##" cases |
| if (CommentString[1] == '#') |
| return CommentString[0] == Ptr[0]; |
| |
| return strncmp(Ptr, CommentString.data(), CommentString.size()) == 0; |
| } |
| |
| bool AsmLexer::isAtStatementSeparator(const char *Ptr) { |
| return strncmp(Ptr, MAI.getSeparatorString(), |
| strlen(MAI.getSeparatorString())) == 0; |
| } |
| |
| AsmToken AsmLexer::LexToken() { |
| TokStart = CurPtr; |
| // This always consumes at least one character. |
| int CurChar = getNextChar(); |
| |
| if (!IsPeeking && CurChar == '#' && IsAtStartOfStatement) { |
| // If this starts with a '#', this may be a cpp |
| // hash directive and otherwise a line comment. |
| AsmToken TokenBuf[2]; |
| MutableArrayRef<AsmToken> Buf(TokenBuf, 2); |
| size_t num = peekTokens(Buf, true); |
| // There cannot be a space preceding this |
| if (IsAtStartOfLine && num == 2 && TokenBuf[0].is(AsmToken::Integer) && |
| TokenBuf[1].is(AsmToken::String)) { |
| CurPtr = TokStart; // reset curPtr; |
| StringRef s = LexUntilEndOfLine(); |
| UnLex(TokenBuf[1]); |
| UnLex(TokenBuf[0]); |
| return AsmToken(AsmToken::HashDirective, s); |
| } |
| |
| if (MAI.shouldAllowAdditionalComments()) |
| return LexLineComment(); |
| } |
| |
| if (isAtStartOfComment(TokStart)) |
| return LexLineComment(); |
| |
| if (isAtStatementSeparator(TokStart)) { |
| CurPtr += strlen(MAI.getSeparatorString()) - 1; |
| IsAtStartOfLine = true; |
| IsAtStartOfStatement = true; |
| return AsmToken(AsmToken::EndOfStatement, |
| StringRef(TokStart, strlen(MAI.getSeparatorString()))); |
| } |
| |
| // If we're missing a newline at EOF, make sure we still get an |
| // EndOfStatement token before the Eof token. |
| if (CurChar == EOF && !IsAtStartOfStatement && EndStatementAtEOF) { |
| IsAtStartOfLine = true; |
| IsAtStartOfStatement = true; |
| return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 0)); |
| } |
| IsAtStartOfLine = false; |
| bool OldIsAtStartOfStatement = IsAtStartOfStatement; |
| IsAtStartOfStatement = false; |
| switch (CurChar) { |
| default: |
| if (MAI.doesAllowSymbolAtNameStart()) { |
| // Handle Microsoft-style identifier: [a-zA-Z_$.@?][a-zA-Z0-9_$.@#?]* |
| if (!isDigit(CurChar) && |
| isIdentifierChar(CurChar, MAI.doesAllowAtInName(), |
| AllowHashInIdentifier)) |
| return LexIdentifier(); |
| } else { |
| // Handle identifier: [a-zA-Z_.][a-zA-Z0-9_$.@]* |
| if (isalpha(CurChar) || CurChar == '_' || CurChar == '.') |
| return LexIdentifier(); |
| } |
| |
| // Unknown character, emit an error. |
| return ReturnError(TokStart, "invalid character in input"); |
| case EOF: |
| if (EndStatementAtEOF) { |
| IsAtStartOfLine = true; |
| IsAtStartOfStatement = true; |
| } |
| return AsmToken(AsmToken::Eof, StringRef(TokStart, 0)); |
| case 0: |
| case ' ': |
| case '\t': |
| IsAtStartOfStatement = OldIsAtStartOfStatement; |
| while (*CurPtr == ' ' || *CurPtr == '\t') |
| CurPtr++; |
| if (SkipSpace) |
| return LexToken(); // Ignore whitespace. |
| else |
| return AsmToken(AsmToken::Space, StringRef(TokStart, CurPtr - TokStart)); |
| case '\r': { |
| IsAtStartOfLine = true; |
| IsAtStartOfStatement = true; |
| // If this is a CR followed by LF, treat that as one token. |
| if (CurPtr != CurBuf.end() && *CurPtr == '\n') |
| ++CurPtr; |
| return AsmToken(AsmToken::EndOfStatement, |
| StringRef(TokStart, CurPtr - TokStart)); |
| } |
| case '\n': |
| IsAtStartOfLine = true; |
| IsAtStartOfStatement = true; |
| return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 1)); |
| case ':': return AsmToken(AsmToken::Colon, StringRef(TokStart, 1)); |
| case '+': return AsmToken(AsmToken::Plus, StringRef(TokStart, 1)); |
| case '~': return AsmToken(AsmToken::Tilde, StringRef(TokStart, 1)); |
| case '(': return AsmToken(AsmToken::LParen, StringRef(TokStart, 1)); |
| case ')': return AsmToken(AsmToken::RParen, StringRef(TokStart, 1)); |
| case '[': return AsmToken(AsmToken::LBrac, StringRef(TokStart, 1)); |
| case ']': return AsmToken(AsmToken::RBrac, StringRef(TokStart, 1)); |
| case '{': return AsmToken(AsmToken::LCurly, StringRef(TokStart, 1)); |
| case '}': return AsmToken(AsmToken::RCurly, StringRef(TokStart, 1)); |
| case '*': return AsmToken(AsmToken::Star, StringRef(TokStart, 1)); |
| case ',': return AsmToken(AsmToken::Comma, StringRef(TokStart, 1)); |
| case '$': |
| if (LexMotorolaIntegers && isHexDigit(*CurPtr)) { |
| return LexDigit(); |
| } |
| |
| return AsmToken(AsmToken::Dollar, StringRef(TokStart, 1)); |
| case '@': return AsmToken(AsmToken::At, StringRef(TokStart, 1)); |
| case '\\': return AsmToken(AsmToken::BackSlash, StringRef(TokStart, 1)); |
| case '=': |
| if (*CurPtr == '=') { |
| ++CurPtr; |
| return AsmToken(AsmToken::EqualEqual, StringRef(TokStart, 2)); |
| } |
| return AsmToken(AsmToken::Equal, StringRef(TokStart, 1)); |
| case '-': |
| if (*CurPtr == '>') { |
| ++CurPtr; |
| return AsmToken(AsmToken::MinusGreater, StringRef(TokStart, 2)); |
| } |
| return AsmToken(AsmToken::Minus, StringRef(TokStart, 1)); |
| case '|': |
| if (*CurPtr == '|') { |
| ++CurPtr; |
| return AsmToken(AsmToken::PipePipe, StringRef(TokStart, 2)); |
| } |
| return AsmToken(AsmToken::Pipe, StringRef(TokStart, 1)); |
| case '^': return AsmToken(AsmToken::Caret, StringRef(TokStart, 1)); |
| case '&': |
| if (*CurPtr == '&') { |
| ++CurPtr; |
| return AsmToken(AsmToken::AmpAmp, StringRef(TokStart, 2)); |
| } |
| return AsmToken(AsmToken::Amp, StringRef(TokStart, 1)); |
| case '!': |
| if (*CurPtr == '=') { |
| ++CurPtr; |
| return AsmToken(AsmToken::ExclaimEqual, StringRef(TokStart, 2)); |
| } |
| return AsmToken(AsmToken::Exclaim, StringRef(TokStart, 1)); |
| case '%': |
| if (LexMotorolaIntegers && (*CurPtr == '0' || *CurPtr == '1')) { |
| return LexDigit(); |
| } |
| |
| if (MAI.hasMipsExpressions()) { |
| AsmToken::TokenKind Operator; |
| unsigned OperatorLength; |
| |
| std::tie(Operator, OperatorLength) = |
| StringSwitch<std::pair<AsmToken::TokenKind, unsigned>>( |
| StringRef(CurPtr)) |
| .StartsWith("call16", {AsmToken::PercentCall16, 7}) |
| .StartsWith("call_hi", {AsmToken::PercentCall_Hi, 8}) |
| .StartsWith("call_lo", {AsmToken::PercentCall_Lo, 8}) |
| .StartsWith("dtprel_hi", {AsmToken::PercentDtprel_Hi, 10}) |
| .StartsWith("dtprel_lo", {AsmToken::PercentDtprel_Lo, 10}) |
| .StartsWith("got_disp", {AsmToken::PercentGot_Disp, 9}) |
| .StartsWith("got_hi", {AsmToken::PercentGot_Hi, 7}) |
| .StartsWith("got_lo", {AsmToken::PercentGot_Lo, 7}) |
| .StartsWith("got_ofst", {AsmToken::PercentGot_Ofst, 9}) |
| .StartsWith("got_page", {AsmToken::PercentGot_Page, 9}) |
| .StartsWith("gottprel", {AsmToken::PercentGottprel, 9}) |
| .StartsWith("got", {AsmToken::PercentGot, 4}) |
| .StartsWith("gp_rel", {AsmToken::PercentGp_Rel, 7}) |
| .StartsWith("higher", {AsmToken::PercentHigher, 7}) |
| .StartsWith("highest", {AsmToken::PercentHighest, 8}) |
| .StartsWith("hi", {AsmToken::PercentHi, 3}) |
| .StartsWith("lo", {AsmToken::PercentLo, 3}) |
| .StartsWith("neg", {AsmToken::PercentNeg, 4}) |
| .StartsWith("pcrel_hi", {AsmToken::PercentPcrel_Hi, 9}) |
| .StartsWith("pcrel_lo", {AsmToken::PercentPcrel_Lo, 9}) |
| .StartsWith("tlsgd", {AsmToken::PercentTlsgd, 6}) |
| .StartsWith("tlsldm", {AsmToken::PercentTlsldm, 7}) |
| .StartsWith("tprel_hi", {AsmToken::PercentTprel_Hi, 9}) |
| .StartsWith("tprel_lo", {AsmToken::PercentTprel_Lo, 9}) |
| .Default({AsmToken::Percent, 1}); |
| |
| if (Operator != AsmToken::Percent) { |
| CurPtr += OperatorLength - 1; |
| return AsmToken(Operator, StringRef(TokStart, OperatorLength)); |
| } |
| } |
| return AsmToken(AsmToken::Percent, StringRef(TokStart, 1)); |
| case '/': |
| IsAtStartOfStatement = OldIsAtStartOfStatement; |
| return LexSlash(); |
| case '#': return AsmToken(AsmToken::Hash, StringRef(TokStart, 1)); |
| case '\'': return LexSingleQuote(); |
| case '"': return LexQuote(); |
| case '0': case '1': case '2': case '3': case '4': |
| case '5': case '6': case '7': case '8': case '9': |
| return LexDigit(); |
| case '<': |
| switch (*CurPtr) { |
| case '<': |
| ++CurPtr; |
| return AsmToken(AsmToken::LessLess, StringRef(TokStart, 2)); |
| case '=': |
| ++CurPtr; |
| return AsmToken(AsmToken::LessEqual, StringRef(TokStart, 2)); |
| case '>': |
| ++CurPtr; |
| return AsmToken(AsmToken::LessGreater, StringRef(TokStart, 2)); |
| default: |
| return AsmToken(AsmToken::Less, StringRef(TokStart, 1)); |
| } |
| case '>': |
| switch (*CurPtr) { |
| case '>': |
| ++CurPtr; |
| return AsmToken(AsmToken::GreaterGreater, StringRef(TokStart, 2)); |
| case '=': |
| ++CurPtr; |
| return AsmToken(AsmToken::GreaterEqual, StringRef(TokStart, 2)); |
| default: |
| return AsmToken(AsmToken::Greater, StringRef(TokStart, 1)); |
| } |
| |
| // TODO: Quoted identifiers (objc methods etc) |
| // local labels: [0-9][:] |
| // Forward/backward labels: [0-9][fb] |
| // Integers, fp constants, character constants. |
| } |
| } |