| //===--- Lex.cpp - extract token stream from source code ---------*- C++-*-===// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| |
| #include "clang-pseudo/Token.h" |
| #include "clang/Basic/IdentifierTable.h" |
| #include "clang/Basic/SourceLocation.h" |
| #include "clang/Basic/TokenKinds.h" |
| #include "clang/Lex/Lexer.h" |
| #include "clang/Lex/LiteralSupport.h" |
| |
| namespace clang { |
| namespace pseudo { |
| |
| TokenStream lex(const std::string &Code, const clang::LangOptions &LangOpts) { |
| clang::SourceLocation Start; |
| // Tokenize using clang's lexer in raw mode. |
| // std::string guarantees null-termination, which the lexer needs. |
| clang::Lexer Lexer(Start, LangOpts, Code.data(), Code.data(), |
| Code.data() + Code.size()); |
| Lexer.SetCommentRetentionState(true); |
| |
| TokenStream Result; |
| clang::Token CT; |
| // Index into the token stream of original source code. |
| Token::Index TokenIndex = 0; |
| unsigned LastOffset = 0; |
| unsigned Line = 0; |
| unsigned Indent = 0; |
| for (Lexer.LexFromRawLexer(CT); CT.getKind() != clang::tok::eof; |
| Lexer.LexFromRawLexer(CT)) { |
| unsigned Offset = |
| CT.getLocation().getRawEncoding() - Start.getRawEncoding(); |
| |
| Token Tok; |
| Tok.Data = &Code[Offset]; |
| Tok.Length = CT.getLength(); |
| Tok.Kind = CT.getKind(); |
| |
| // Update current line number and indentation from raw source code. |
| unsigned NewLineStart = 0; |
| for (unsigned I = LastOffset; I < Offset; ++I) { |
| if (Code[I] == '\n') { |
| NewLineStart = I + 1; |
| ++Line; |
| } |
| } |
| if (NewLineStart || !LastOffset) { |
| Indent = 0; |
| for (char C : StringRef(Code).slice(NewLineStart, Offset)) { |
| if (C == ' ') |
| ++Indent; |
| else if (C == '\t') |
| Indent += 8; |
| else |
| break; |
| } |
| } |
| Tok.Indent = Indent; |
| Tok.Line = Line; |
| |
| if (CT.isAtStartOfLine()) |
| Tok.setFlag(LexFlags::StartsPPLine); |
| if (CT.needsCleaning() || CT.hasUCN()) |
| Tok.setFlag(LexFlags::NeedsCleaning); |
| |
| Tok.OriginalIndex = TokenIndex++; |
| Result.push(Tok); |
| LastOffset = Offset; |
| } |
| Result.finalize(); |
| return Result; |
| } |
| |
| TokenStream cook(const TokenStream &Code, const LangOptions &LangOpts) { |
| auto CleanedStorage = std::make_shared<llvm::BumpPtrAllocator>(); |
| clang::IdentifierTable Identifiers(LangOpts); |
| TokenStream Result(CleanedStorage); |
| Result.addPayload(Code.getPayload()); |
| for (auto Tok : Code.tokens()) { |
| if (Tok.flag(LexFlags::NeedsCleaning)) { |
| // Remove escaped newlines and trigraphs. |
| llvm::SmallString<64> CleanBuffer; |
| const char *Pos = Tok.text().begin(); |
| while (Pos < Tok.text().end()) { |
| unsigned CharSize = 0; |
| CleanBuffer.push_back( |
| clang::Lexer::getCharAndSizeNoWarn(Pos, CharSize, LangOpts)); |
| assert(CharSize != 0 && "no progress!"); |
| Pos += CharSize; |
| } |
| llvm::StringRef Text = CleanBuffer; |
| llvm::SmallString<64> UCNBuffer; |
| // A surface reading of the standard suggests UCNs might appear anywhere. |
| // But we need only decode them in raw_identifiers. |
| // - they cannot appear in punctuation/keyword tokens, because UCNs |
| // cannot encode basic characters outside of literals [lex.charset] |
| // - they can appear in literals, but we need not unescape them now. |
| // We treat them as escape sequences when evaluating the literal. |
| // - comments are handled similarly to literals |
| // This is good fortune, because expandUCNs requires its input to be a |
| // reasonably valid identifier (e.g. without stray backslashes). |
| if (Tok.Kind == tok::raw_identifier) { |
| clang::expandUCNs(UCNBuffer, CleanBuffer); |
| Text = UCNBuffer; |
| } |
| |
| Tok.Data = Text.copy(*CleanedStorage).data(); |
| Tok.Length = Text.size(); |
| Tok.Flags &= ~static_cast<decltype(Tok.Flags)>(LexFlags::NeedsCleaning); |
| } |
| |
| if (Tok.Kind == tok::raw_identifier) { |
| // Cook raw_identifiers into identifier, keyword, etc. |
| Tok.Kind = Identifiers.get(Tok.text()).getTokenID(); |
| } else if (Tok.Kind == tok::greatergreater) { |
| // Split the greatergreater token. |
| // FIXME: split lessless token to support Cuda triple angle brackets <<<. |
| assert(Tok.text() == ">>"); |
| Tok.Kind = tok::greater; |
| Tok.Length = 1; |
| Result.push(Tok); |
| // Line is wrong if the first greater is followed by an escaped newline! |
| Tok.Data = Tok.text().data() + 1; |
| } |
| |
| Result.push(std::move(Tok)); |
| } |
| |
| Result.finalize(); |
| return Result; |
| } |
| |
| } // namespace pseudo |
| } // namespace clang |