| //===--- Token.h - Tokens and token streams in the pseudoparser --*- C++-*-===// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| // |
| // Tokens are the first level of abstraction above bytes used in pseudoparsing. |
| // We use clang's lexer to scan the bytes (in raw mode, with no preprocessor). |
// The tokens are wrapped into pseudo::Token, along with line/indent info.
| // |
| // Unlike clang, we make multiple passes over the whole file, out-of-order. |
| // Therefore we retain the whole token sequence in memory. (This is feasible as |
| // we process one file at a time). pseudo::TokenStream holds such a stream. |
| // The initial stream holds the raw tokens read from the file, later passes |
| // operate on derived TokenStreams (e.g. with directives stripped). |
| // |
| // Similar facilities from clang that are *not* used: |
| // - SourceManager: designed around multiple files and precise macro expansion. |
| // - clang::Token: coupled to SourceManager, doesn't retain layout info. |
| // (pseudo::Token is similar, but without SourceLocations). |
| // - syntax::TokenBuffer: coupled to SourceManager, has #includes and macros. |
| // (pseudo::TokenStream is similar, but a flat token list). |
| // |
| //===----------------------------------------------------------------------===// |
| |
| #ifndef CLANG_PSEUDO_TOKEN_H |
| #define CLANG_PSEUDO_TOKEN_H |
| |
#include "clang/Basic/LLVM.h"
#include "clang/Basic/LangStandard.h"
#include "clang/Basic/TokenKinds.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLForwardCompat.h"
#include "llvm/Support/raw_ostream.h"
#include <cassert>
#include <cstdint>
#include <limits>
#include <memory>
#include <string>
#include <utility>
#include <vector>
| |
| namespace clang { |
| class LangOptions; |
| namespace pseudo { |
| |
| /// A single C++ or preprocessor token. |
| /// |
| /// Unlike clang::Token and syntax::Token, these tokens are not connected to a |
| /// SourceManager - we are not dealing with multiple files. |
| struct Token { |
| /// An Index identifies a token within a stream. |
| using Index = uint32_t; |
| /// A sentinel Index indicating no token. |
| constexpr static Index Invalid = std::numeric_limits<Index>::max(); |
| struct Range; |
| |
| /// The token text. |
| /// |
| /// Typically from the original source file, but may have been synthesized. |
| StringRef text() const { return StringRef(Data, Length); } |
| const char *Data = nullptr; |
| uint32_t Length = 0; |
| |
| /// Zero-based line number for the start of the token. |
| /// This refers to the original source file as written. |
| uint32_t Line = 0; |
| /// Width of whitespace before the first token on this line. |
| uint8_t Indent = 0; |
| /// Flags have some meaning defined by the function that produced this stream. |
| uint8_t Flags = 0; |
| /// Index into the original token stream (as raw-lexed from the source code). |
| Index OriginalIndex = Invalid; |
| // Helpers to get/set Flags based on `enum class`. |
| template <class T> bool flag(T Mask) const { |
| return Flags & uint8_t{llvm::to_underlying(Mask)}; |
| } |
| template <class T> void setFlag(T Mask) { |
| Flags |= uint8_t{llvm::to_underlying(Mask)}; |
| } |
| |
| /// Returns the next token in the stream. this may not be a sentinel. |
| const Token &next() const { |
| assert(Kind != tok::eof); |
| return *(this + 1); |
| } |
| /// Returns the next token in the stream, skipping over comments. |
| const Token &nextNC() const { |
| const Token *T = this; |
| do |
| T = &T->next(); |
| while (T->Kind == tok::comment); |
| return *T; |
| } |
| /// Returns the previous token in the stream. this may not be a sentinel. |
| const Token &prev() const { |
| assert(Kind != tok::eof); |
| return *(this - 1); |
| } |
| /// Returns the bracket paired with this one, if any. |
| const Token *pair() const { return Pair == 0 ? nullptr : this + Pair; } |
| |
| /// The type of token as determined by clang's lexer. |
| clang::tok::TokenKind Kind = clang::tok::unknown; |
| /// If this token is a paired bracket, the offset of the pair in the stream. |
| int32_t Pair = 0; |
| }; |
| static_assert(sizeof(Token) <= sizeof(char *) + 24, "Careful with layout!"); |
| llvm::raw_ostream &operator<<(llvm::raw_ostream &, const Token &); |
| |
| /// A half-open range of tokens within a stream. |
| struct Token::Range { |
| Index Begin = 0; |
| Index End = 0; |
| |
| uint32_t size() const { return End - Begin; } |
| static Range emptyAt(Index Index) { return Range{Index, Index}; } |
| }; |
| llvm::raw_ostream &operator<<(llvm::raw_ostream &, const Token::Range &); |
| |
| /// A complete sequence of Tokens representing a source file. |
| /// |
| /// This may match a raw file from disk, or be derived from a previous stream. |
| /// For example, stripping comments from a TokenStream results in a new stream. |
| /// |
| /// A stream has sentinel 'eof' tokens at each end, e.g `int main();` becomes: |
| /// int main ( ) ; |
| /// eof kw_int ident l_paren r_paren semi eof |
| /// front() back() |
| /// 0 1 2 3 4 5 |
| class TokenStream { |
| public: |
| /// Create an empty stream. |
| /// |
| /// Initially, the stream is appendable and not finalized. |
| /// The token sequence may only be accessed after finalize() is called. |
| /// |
| /// Payload is an opaque object which will be owned by the stream. |
| /// e.g. an allocator to hold backing storage for synthesized token text. |
| explicit TokenStream(std::shared_ptr<void> Payload = nullptr); |
| |
| /// Append a token to the stream, which must not be finalized. |
| void push(Token T) { |
| assert(!isFinalized()); |
| Storage.push_back(std::move(T)); |
| } |
| |
| /// Finalize the token stream, allowing tokens to be accessed. |
| /// Tokens may no longer be appended. |
| void finalize(); |
| bool isFinalized() const; |
| |
| /// Returns the index of T within the stream. |
| /// |
| /// T must be within the stream or the end sentinel (not the start sentinel). |
| Token::Index index(const Token &T) const { |
| assert(isFinalized()); |
| assert(&T >= Storage.data() && &T < Storage.data() + Storage.size()); |
| assert(&T != Storage.data() && "start sentinel"); |
| return &T - Tokens.data(); |
| } |
| |
| ArrayRef<Token> tokens() const { |
| assert(isFinalized()); |
| return Tokens; |
| } |
| ArrayRef<Token> tokens(Token::Range R) const { |
| return tokens().slice(R.Begin, R.End - R.Begin); |
| } |
| |
| MutableArrayRef<Token> tokens() { |
| assert(isFinalized()); |
| return Tokens; |
| } |
| |
| /// May return the end sentinel if the stream is empty. |
| const Token &front() const { |
| assert(isFinalized()); |
| return Storage[1]; |
| } |
| |
| /// Returns the shared payload. |
| std::shared_ptr<void> getPayload() const { return Payload; } |
| /// Adds the given payload to the stream. |
| void addPayload(std::shared_ptr<void> P) { |
| if (!Payload) |
| Payload = std::move(P); |
| else |
| Payload = std::make_shared< |
| std::pair<std::shared_ptr<void>, std::shared_ptr<void>>>( |
| std::move(P), std::move(Payload)); |
| } |
| |
| /// Print the tokens in this stream to the output stream. |
| /// |
| /// The presence of newlines/spaces is preserved, but not the quantity. |
| void print(llvm::raw_ostream &) const; |
| |
| private: |
| std::shared_ptr<void> Payload; |
| |
| MutableArrayRef<Token> Tokens; |
| std::vector<Token> Storage; // eof + Tokens + eof |
| }; |
| llvm::raw_ostream &operator<<(llvm::raw_ostream &, const TokenStream &); |
| |
| /// Extracts a raw token stream from the source code. |
| /// |
| /// All tokens will reference the data of the provided string. |
| /// "word-like" tokens such as identifiers and keywords will be raw_identifier. |
| TokenStream lex(const std::string &, const clang::LangOptions &); |
/// Bit flags; each enumerator occupies its own bit of a uint8_t.
enum class LexFlags : uint8_t {
  /// Set on the token that begins a logical preprocessor line, i.e. a
  /// position where a directive might start.
  ///
  /// In this example the first # is StartsPPLine but the second is not,
  /// because both are on the same logical line:
  ///   #define X(error) \
  ///   #error // not a directive!
  ///
  /// Note that a directive need not begin exactly at the StartsPPLine token:
  ///   /*comment*/ #include <foo.h>
  StartsPPLine = 1 << 0,
  /// Set on tokens that contain trigraphs, escaped newlines, UCNs etc.
  /// The text() of such a token still contains the raw trigraph.
  NeedsCleaning = 1 << 1,
};
| /// A generic lang options suitable for lexing/parsing a langage. |
| clang::LangOptions genericLangOpts( |
| clang::Language = clang::Language::CXX, |
| clang::LangStandard::Kind = clang::LangStandard::lang_unspecified); |
| |
| /// Decoding raw tokens written in the source code, returning a derived stream. |
| /// |
| /// - escaped newlines within tokens are removed |
| /// - trigraphs are replaced with the characters they encode |
| /// - UCNs within raw_identifiers are replaced by the characters they encode |
| /// (UCNs within strings, comments etc are not translated) |
| /// - raw_identifier tokens are assigned their correct keyword type |
| /// - the >> token is split into separate > > tokens |
| /// (we use a modified grammar where >> is a nonterminal, not a token) |
| /// |
| /// The StartsPPLine flag is preserved. |
| /// |
| /// Formally the identifier correctly happens before preprocessing, while we |
| /// should only cook raw_identifiers that survive preprocessing. |
| /// However, ignoring the Token::Kind of tokens in directives achieves the same. |
| /// (And having cooked token kinds in PP-disabled sections is useful for us). |
| TokenStream cook(const TokenStream &, const clang::LangOptions &); |
| |
| /// Drops comment tokens. |
| TokenStream stripComments(const TokenStream &); |
| |
| } // namespace pseudo |
| } // namespace clang |
| |
| #endif // CLANG_PSEUDO_TOKEN_H |