clang-tools-extra/pseudo/lib/Lex.cpp - llvm-project - Git at Google

 //===--- Lex.cpp - extract token stream from source code ---------*- C++-*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//

 #include "clang-pseudo/Token.h"
 #include "clang/Basic/IdentifierTable.h"
 #include "clang/Basic/SourceLocation.h"
 #include "clang/Basic/TokenKinds.h"
 #include "clang/Lex/Lexer.h"
 #include "clang/Lex/LiteralSupport.h"

 namespace clang {
 namespace pseudo {

 // Splits Code into a stream of raw tokens using clang's lexer in raw mode,
 // with comments retained as tokens.
 //
 // Every produced token records:
 //  - Data/Length/Kind: its spelling (pointing directly into Code's buffer),
 //    and the kind reported by the raw lexer;
 //  - Line: the number of '\n' characters that precede the token's start;
 //  - Indent: the leading whitespace of the line the token sits on, where a
 //    space counts as 1 and a tab counts as 8;
 //  - LexFlags::StartsPPLine if the lexer reports it at the start of a line;
 //  - LexFlags::NeedsCleaning if the lexer reports that it needs cleaning or
 //    contains a UCN;
 //  - OriginalIndex: its position in the produced stream.
 //
 // NOTE(review): tokens point into Code rather than copying it, so Code
 // presumably has to outlive the returned stream - confirm against callers.
 TokenStream lex(const std::string &Code, const clang::LangOptions &LangOpts) {
   // Token offsets are measured relative to this default-constructed location.
   clang::SourceLocation Start;
   // The lexer needs a null-terminated buffer, which std::string guarantees.
   const char *BufferBegin = Code.data();
   const char *BufferEnd = BufferBegin + Code.size();
   clang::Lexer Lexer(Start, LangOpts, BufferBegin, BufferBegin, BufferEnd);
   Lexer.SetCommentRetentionState(true);

   TokenStream Result;
   // Position the next token will have within the original token stream.
   Token::Index NextIndex = 0;
   unsigned PrevOffset = 0;
   unsigned CurLine = 0;
   unsigned CurIndent = 0;
   clang::Token Raw;
   while (true) {
     Lexer.LexFromRawLexer(Raw);
     if (Raw.getKind() == clang::tok::eof)
       break;
     const unsigned Offset =
         Raw.getLocation().getRawEncoding() - Start.getRawEncoding();

     // Step the line counter over every newline lying between the start of
     // the previous token and the start of this one, and remember where the
     // most recent line begins. Zero means "no newline was crossed".
     unsigned LineStart = 0;
     for (unsigned I = PrevOffset; I < Offset; ++I) {
       if (Code[I] != '\n')
         continue;
       ++CurLine;
       LineStart = I + 1;
     }

     // Indentation is only re-measured when a new line was entered, or while
     // no token at a non-zero offset has been seen yet. Otherwise the token
     // inherits the indentation already computed for its line.
     const bool MeasureIndent = LineStart != 0 || PrevOffset == 0;
     if (MeasureIndent) {
       CurIndent = 0;
       for (unsigned I = LineStart; I < Offset; ++I) {
         const char C = Code[I];
         if (C == '\t')
           CurIndent += 8;
         else if (C == ' ')
           CurIndent += 1;
         else
           break;
       }
     }

     Token Tok;
     Tok.Data = &Code[Offset];
     Tok.Length = Raw.getLength();
     Tok.Kind = Raw.getKind();
     Tok.Line = CurLine;
     Tok.Indent = CurIndent;
     if (Raw.isAtStartOfLine())
       Tok.setFlag(LexFlags::StartsPPLine);
     if (Raw.needsCleaning() || Raw.hasUCN())
       Tok.setFlag(LexFlags::NeedsCleaning);
     Tok.OriginalIndex = NextIndex++;

     Result.push(Tok);
     PrevOffset = Offset;
   }
   Result.finalize();
   return Result;
 }

 // Produces a "cooked" copy of the token stream Code:
 //  - tokens flagged NeedsCleaning get their spelling rewritten with escaped
 //    newlines and trigraphs removed (and, for raw identifiers, UCNs
 //    expanded); the cleaned text is stored in an allocator owned by the
 //    result, and the flag is cleared;
 //  - raw_identifier tokens are resolved via an IdentifierTable built from
 //    LangOpts into identifier, keyword, etc.;
 //  - each ">>" token is split into two ">" tokens.
 // The payload of Code is attached to the result as well.
 TokenStream cook(const TokenStream &Code, const LangOptions &LangOpts) {
   auto CleanedStorage = std::make_shared<llvm::BumpPtrAllocator>();
   clang::IdentifierTable Identifiers(LangOpts);
   TokenStream Result(CleanedStorage);
   Result.addPayload(Code.getPayload());

   for (const auto &Uncooked : Code.tokens()) {
     // Work on a private copy; the input stream is left untouched.
     auto Tok = Uncooked;

     if (Tok.flag(LexFlags::NeedsCleaning)) {
       // Strip escaped newlines and trigraphs, one logical character at a
       // time.
       llvm::SmallString<64> CleanBuffer;
       const auto Spelling = Tok.text();
       for (const char *Cur = Spelling.begin(), *End = Spelling.end();
            Cur < End;) {
         unsigned Step = 0;
         const char Decoded =
             clang::Lexer::getCharAndSizeNoWarn(Cur, Step, LangOpts);
         CleanBuffer.push_back(Decoded);
         assert(Step != 0 && "no progress!");
         Cur += Step;
       }

       // On the surface the standard seems to allow UCNs anywhere, yet only
       // raw_identifiers need them decoded here:
       //  - punctuation/keyword tokens cannot contain them, since UCNs
       //    cannot encode basic characters outside of literals [lex.charset]
       //  - literals may contain them, but they are left escaped for now and
       //    treated as escape sequences when the literal is evaluated
       //  - comments are handled similarly to literals
       // That is fortunate: expandUCNs requires a reasonably valid
       // identifier as input (e.g. one without stray backslashes).
       llvm::SmallString<64> UCNBuffer;
       llvm::StringRef Text = CleanBuffer;
       if (Tok.Kind == tok::raw_identifier) {
         clang::expandUCNs(UCNBuffer, CleanBuffer);
         Text = UCNBuffer;
       }

       // Persist the cleaned spelling in storage shared with the result.
       Tok.Data = Text.copy(*CleanedStorage).data();
       Tok.Length = Text.size();
       const auto CleaningBit =
           static_cast<decltype(Tok.Flags)>(LexFlags::NeedsCleaning);
       Tok.Flags &= ~CleaningBit;
     }

     switch (Tok.Kind) {
     case tok::raw_identifier:
       // Cook raw_identifiers into identifier, keyword, etc.
       Tok.Kind = Identifiers.get(Tok.text()).getTokenID();
       break;
     case tok::greatergreater:
       // Split the greatergreater token: emit the first '>' here, and let the
       // common push below emit the second one.
       // FIXME: split lessless token to support Cuda triple angle brackets <<<.
       assert(Tok.text() == ">>");
       Tok.Kind = tok::greater;
       Tok.Length = 1;
       Result.push(Tok);
       // Line is wrong if the first greater is followed by an escaped newline!
       Tok.Data = Tok.text().data() + 1;
       break;
     default:
       break;
     }

     Result.push(std::move(Tok));
   }

   Result.finalize();
   return Result;
 }

 } // namespace pseudo
 } // namespace clang
	//===--- Lex.cpp - extract token stream from source code ---------*- C++-*-===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//

	#include "clang-pseudo/Token.h"
	#include "clang/Basic/IdentifierTable.h"
	#include "clang/Basic/SourceLocation.h"
	#include "clang/Basic/TokenKinds.h"
	#include "clang/Lex/Lexer.h"
	#include "clang/Lex/LiteralSupport.h"

	namespace clang {
	namespace pseudo {

	// Splits Code into a stream of raw tokens using clang's lexer in raw mode,
	// with comments retained as tokens.
	//
	// Every produced token records its spelling (pointing directly into Code's
	// buffer), its kind, the number of '\n' characters preceding it (Line), the
	// leading whitespace of its line (Indent: a space counts 1, a tab counts 8),
	// its position in the stream (OriginalIndex), and the flags
	// LexFlags::StartsPPLine / LexFlags::NeedsCleaning as reported by the lexer.
	TokenStream lex(const std::string &Code, const clang::LangOptions &LangOpts) {
	  clang::SourceLocation Start;
	  // Tokenize using clang's lexer in raw mode.
	  // std::string guarantees null-termination, which the lexer needs.
	  clang::Lexer Lexer(Start, LangOpts, Code.data(), Code.data(),
	                     Code.data() + Code.size());
	  Lexer.SetCommentRetentionState(true);

	  TokenStream Result;
	  clang::Token CT;
	  // Index into the token stream of original source code.
	  Token::Index TokenIndex = 0;
	  unsigned LastOffset = 0;
	  unsigned Line = 0;
	  unsigned Indent = 0;
	  for (Lexer.LexFromRawLexer(CT); CT.getKind() != clang::tok::eof;
	       Lexer.LexFromRawLexer(CT)) {
	    // Offset of this token from the beginning of Code.
	    unsigned Offset =
	        CT.getLocation().getRawEncoding() - Start.getRawEncoding();

	    Token Tok;
	    Tok.Data = &Code[Offset];
	    Tok.Length = CT.getLength();
	    Tok.Kind = CT.getKind();

	    // Update current line number and indentation from raw source code.
	    // NewLineStart stays zero unless a newline was crossed since the
	    // previous token's start.
	    unsigned NewLineStart = 0;
	    for (unsigned I = LastOffset; I < Offset; ++I) {
	      if (Code[I] == '\n') {
	        NewLineStart = I + 1;
	        ++Line;
	      }
	    }
	    // Re-measure indentation when a new line was entered, or while no token
	    // at a non-zero offset has been seen yet; otherwise keep the value
	    // computed for the current line.
	    if (NewLineStart || !LastOffset) {
	      Indent = 0;
	      for (char C : StringRef(Code).slice(NewLineStart, Offset)) {
	        if (C == ' ')
	          ++Indent;
	        else if (C == '\t')
	          Indent += 8;
	        else
	          break;
	      }
	    }
	    Tok.Indent = Indent;
	    Tok.Line = Line;

	    if (CT.isAtStartOfLine())
	      Tok.setFlag(LexFlags::StartsPPLine);
	    if (CT.needsCleaning() || CT.hasUCN())
	      Tok.setFlag(LexFlags::NeedsCleaning);

	    Tok.OriginalIndex = TokenIndex++;
	    Result.push(Tok);
	    LastOffset = Offset;
	  }
	  Result.finalize();
	  return Result;
	}

	// Produces a "cooked" copy of the token stream Code:
	//  - tokens flagged NeedsCleaning get their spelling rewritten with escaped
	//    newlines and trigraphs removed (and, for raw identifiers, UCNs
	//    expanded); the cleaned text is stored in an allocator owned by the
	//    result, and the flag is cleared;
	//  - raw_identifier tokens are resolved via an IdentifierTable built from
	//    LangOpts into identifier, keyword, etc.;
	//  - each ">>" token is split into two ">" tokens.
	// The payload of Code is attached to the result as well.
	TokenStream cook(const TokenStream &Code, const LangOptions &LangOpts) {
	  auto CleanedStorage = std::make_shared<llvm::BumpPtrAllocator>();
	  clang::IdentifierTable Identifiers(LangOpts);
	  TokenStream Result(CleanedStorage);
	  Result.addPayload(Code.getPayload());

	  for (const auto &Uncooked : Code.tokens()) {
	    // Work on a private copy; the input stream is left untouched.
	    auto Tok = Uncooked;

	    if (Tok.flag(LexFlags::NeedsCleaning)) {
	      // Strip escaped newlines and trigraphs, one logical character at a
	      // time.
	      llvm::SmallString<64> CleanBuffer;
	      const auto Spelling = Tok.text();
	      for (const char *Cur = Spelling.begin(), *End = Spelling.end();
	           Cur < End;) {
	        unsigned Step = 0;
	        const char Decoded =
	            clang::Lexer::getCharAndSizeNoWarn(Cur, Step, LangOpts);
	        CleanBuffer.push_back(Decoded);
	        assert(Step != 0 && "no progress!");
	        Cur += Step;
	      }

	      // On the surface the standard seems to allow UCNs anywhere, yet only
	      // raw_identifiers need them decoded here:
	      //  - punctuation/keyword tokens cannot contain them, since UCNs
	      //    cannot encode basic characters outside of literals [lex.charset]
	      //  - literals may contain them, but they are left escaped for now and
	      //    treated as escape sequences when the literal is evaluated
	      //  - comments are handled similarly to literals
	      // That is fortunate: expandUCNs requires a reasonably valid
	      // identifier as input (e.g. one without stray backslashes).
	      llvm::SmallString<64> UCNBuffer;
	      llvm::StringRef Text = CleanBuffer;
	      if (Tok.Kind == tok::raw_identifier) {
	        clang::expandUCNs(UCNBuffer, CleanBuffer);
	        Text = UCNBuffer;
	      }

	      // Persist the cleaned spelling in storage shared with the result.
	      Tok.Data = Text.copy(*CleanedStorage).data();
	      Tok.Length = Text.size();
	      const auto CleaningBit =
	          static_cast<decltype(Tok.Flags)>(LexFlags::NeedsCleaning);
	      Tok.Flags &= ~CleaningBit;
	    }

	    switch (Tok.Kind) {
	    case tok::raw_identifier:
	      // Cook raw_identifiers into identifier, keyword, etc.
	      Tok.Kind = Identifiers.get(Tok.text()).getTokenID();
	      break;
	    case tok::greatergreater:
	      // Split the greatergreater token: emit the first '>' here, and let the
	      // common push below emit the second one.
	      // FIXME: split lessless token to support Cuda triple angle brackets <<<.
	      assert(Tok.text() == ">>");
	      Tok.Kind = tok::greater;
	      Tok.Length = 1;
	      Result.push(Tok);
	      // Line is wrong if the first greater is followed by an escaped newline!
	      Tok.Data = Tok.text().data() + 1;
	      break;
	    default:
	      break;
	    }

	    Result.push(std::move(Tok));
	  }

	  Result.finalize();
	  return Result;
	}

	} // namespace pseudo
	} // namespace clang