lib/Format/FormatTokenLexer.h - clang - Git at Google

 //===--- FormatTokenLexer.h - Format C++ code ----------------*- C++ ----*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 ///
 /// \file
 /// This file contains FormatTokenLexer, which tokenizes a source file
 /// into a token stream suitable for ClangFormat.
 ///
 //===----------------------------------------------------------------------===//

 #ifndef LLVM_CLANG_LIB_FORMAT_FORMATTOKENLEXER_H
 #define LLVM_CLANG_LIB_FORMAT_FORMATTOKENLEXER_H

 #include "Encoding.h"
 #include "FormatToken.h"
 #include "clang/Basic/SourceLocation.h"
 #include "clang/Basic/SourceManager.h"
 #include "clang/Format/Format.h"
 #include "llvm/ADT/MapVector.h"
 #include "llvm/Support/Regex.h"

 #include <stack>

 namespace clang {
 namespace format {

 enum LexerState {
   NORMAL,
   TEMPLATE_STRING,
   TOKEN_STASHED,
 };

 class FormatTokenLexer {
 public:
   FormatTokenLexer(const SourceManager &SourceMgr, FileID ID, unsigned Column,
                    const FormatStyle &Style, encoding::Encoding Encoding);

   ArrayRef<FormatToken *> lex();

   const AdditionalKeywords &getKeywords() { return Keywords; }

 private:
   void tryMergePreviousTokens();

   bool tryMergeLessLess();
   bool tryMergeNSStringLiteral();
   bool tryMergeJSPrivateIdentifier();
   bool tryMergeCSharpVerbatimStringLiteral();
   bool tryMergeCSharpKeywordVariables();
   bool tryMergeCSharpNullConditionals();
   bool tryMergeCSharpDoubleQuestion();
   bool tryTransformCSharpForEach();

   bool tryMergeTokens(ArrayRef<tok::TokenKind> Kinds, TokenType NewType);

   // Returns \c true if \p Tok can only be followed by an operand in JavaScript.
   bool precedesOperand(FormatToken *Tok);

   bool canPrecedeRegexLiteral(FormatToken *Prev);

   // Tries to parse a JavaScript Regex literal starting at the current token,
   // if that begins with a slash and is in a location where JavaScript allows
   // regex literals. Changes the current token to a regex literal and updates
   // its text if successful.
   void tryParseJSRegexLiteral();

   // Handles JavaScript template strings.
   //
   // JavaScript template strings use backticks ('`') as delimiters, and allow
   // embedding expressions nested in ${expr-here}. Template strings can be
   // nested recursively, i.e. expressions can contain template strings in turn.
   //
   // The code below parses starting from a backtick, up to a closing backtick or
   // an opening ${. It also maintains a stack of lexing contexts to handle
   // nested template parts by balancing curly braces.
   void handleTemplateStrings();

   void tryParsePythonComment();

   bool tryMerge_TMacro();

   bool tryMergeConflictMarkers();

   FormatToken *getStashedToken();

   FormatToken *getNextToken();

   FormatToken *FormatTok;
   bool IsFirstToken;
   std::stack<LexerState> StateStack;
   unsigned Column;
   unsigned TrailingWhitespace;
   std::unique_ptr<Lexer> Lex;
   const SourceManager &SourceMgr;
   FileID ID;
   const FormatStyle &Style;
   IdentifierTable IdentTable;
   AdditionalKeywords Keywords;
   encoding::Encoding Encoding;
   llvm::SpecificBumpPtrAllocator<FormatToken> Allocator;
   // Index (in 'Tokens') of the last token that starts a new line.
   unsigned FirstInLineIndex;
   SmallVector<FormatToken *, 16> Tokens;

   llvm::SmallMapVector<IdentifierInfo *, TokenType, 8> Macros;

   bool FormattingDisabled;

   llvm::Regex MacroBlockBeginRegex;
   llvm::Regex MacroBlockEndRegex;

   void readRawToken(FormatToken &Tok);

   void resetLexer(unsigned Offset);
 };

 } // namespace format
 } // namespace clang

 #endif
	//===--- FormatTokenLexer.h - Format C++ code ----------------- C++ -----===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	///
	/// \file
	/// This file contains FormatTokenLexer, which tokenizes a source file
	/// into a token stream suitable for ClangFormat.
	///
	//===----------------------------------------------------------------------===//

	#ifndef LLVM_CLANG_LIB_FORMAT_FORMATTOKENLEXER_H
	#define LLVM_CLANG_LIB_FORMAT_FORMATTOKENLEXER_H

	#include "Encoding.h"
	#include "FormatToken.h"
	#include "clang/Basic/SourceLocation.h"
	#include "clang/Basic/SourceManager.h"
	#include "clang/Format/Format.h"
	#include "llvm/ADT/MapVector.h"
	#include "llvm/Support/Regex.h"

	#include <stack>

	namespace clang {
	namespace format {

	enum LexerState {
	NORMAL,
	TEMPLATE_STRING,
	TOKEN_STASHED,
	};

	class FormatTokenLexer {
	public:
	FormatTokenLexer(const SourceManager &SourceMgr, FileID ID, unsigned Column,
	const FormatStyle &Style, encoding::Encoding Encoding);

	ArrayRef<FormatToken *> lex();

	const AdditionalKeywords &getKeywords() { return Keywords; }

	private:
	void tryMergePreviousTokens();

	bool tryMergeLessLess();
	bool tryMergeNSStringLiteral();
	bool tryMergeJSPrivateIdentifier();
	bool tryMergeCSharpVerbatimStringLiteral();
	bool tryMergeCSharpKeywordVariables();
	bool tryMergeCSharpNullConditionals();
	bool tryMergeCSharpDoubleQuestion();
	bool tryTransformCSharpForEach();

	bool tryMergeTokens(ArrayRef<tok::TokenKind> Kinds, TokenType NewType);

	// Returns \c true if \p Tok can only be followed by an operand in JavaScript.
	bool precedesOperand(FormatToken *Tok);

	bool canPrecedeRegexLiteral(FormatToken *Prev);

	// Tries to parse a JavaScript Regex literal starting at the current token,
	// if that begins with a slash and is in a location where JavaScript allows
	// regex literals. Changes the current token to a regex literal and updates
	// its text if successful.
	void tryParseJSRegexLiteral();

	// Handles JavaScript template strings.
	//
	// JavaScript template strings use backticks ('`') as delimiters, and allow
	// embedding expressions nested in ${expr-here}. Template strings can be
	// nested recursively, i.e. expressions can contain template strings in turn.
	//
	// The code below parses starting from a backtick, up to a closing backtick or
	// an opening ${. It also maintains a stack of lexing contexts to handle
	// nested template parts by balancing curly braces.
	void handleTemplateStrings();

	void tryParsePythonComment();

	bool tryMerge_TMacro();

	bool tryMergeConflictMarkers();

	FormatToken *getStashedToken();

	FormatToken *getNextToken();

	FormatToken *FormatTok;
	bool IsFirstToken;
	std::stack<LexerState> StateStack;
	unsigned Column;
	unsigned TrailingWhitespace;
	std::unique_ptr<Lexer> Lex;
	const SourceManager &SourceMgr;
	FileID ID;
	const FormatStyle &Style;
	IdentifierTable IdentTable;
	AdditionalKeywords Keywords;
	encoding::Encoding Encoding;
	llvm::SpecificBumpPtrAllocator<FormatToken> Allocator;
	// Index (in 'Tokens') of the last token that starts a new line.
	unsigned FirstInLineIndex;
	SmallVector<FormatToken *, 16> Tokens;

	llvm::SmallMapVector<IdentifierInfo *, TokenType, 8> Macros;

	bool FormattingDisabled;

	llvm::Regex MacroBlockBeginRegex;
	llvm::Regex MacroBlockEndRegex;

	void readRawToken(FormatToken &Tok);

	void resetLexer(unsigned Offset);
	};

	} // namespace format
	} // namespace clang

	#endif