FormatTokenLexer.h source code [llvm_projects/clang/lib/Format/FormatTokenLexer.h]

1	//===--- FormatTokenLexer.h - Format C++ code ----------------- C++ -----===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	///
9	/// \file
10	/// This file contains FormatTokenLexer, which tokenizes a source file
11	/// into a token stream suitable for ClangFormat.
12	///
13	//===----------------------------------------------------------------------===//
14
15	#ifndef LLVM_CLANG_LIB_FORMAT_FORMATTOKENLEXER_H
16	#define LLVM_CLANG_LIB_FORMAT_FORMATTOKENLEXER_H
17
18	#include "Encoding.h"
19	#include "FormatToken.h"
20	#include "llvm/ADT/MapVector.h"
21	#include "llvm/ADT/SmallPtrSet.h"
22	#include "llvm/ADT/StringSet.h"
23
24	#include <stack>
25
26	namespace clang {
27	namespace format {
28
/// Lexing contexts. FormatTokenLexer keeps a std::stack of these values
/// (its StateStack member).
///
/// The enumerators keep their implicit values (0, 1, 2) and their order.
enum LexerState {
  // Default context.
  NORMAL,
  // NOTE(review): presumably pushed while inside a JavaScript template string
  // (see FormatTokenLexer::handleTemplateStrings) -- confirm in the .cpp.
  TEMPLATE_STRING,
  // NOTE(review): presumably signals that a token was set aside and should be
  // returned by FormatTokenLexer::getStashedToken -- confirm in the .cpp.
  TOKEN_STASHED,
};
34
35	class FormatTokenLexer {
36	public:
37	FormatTokenLexer(const SourceManager &SourceMgr, FileID ID, unsigned Column,
38	const FormatStyle &Style, encoding::Encoding Encoding,
39	llvm::SpecificBumpPtrAllocator<FormatToken> &Allocator,
40	IdentifierTable &IdentTable);
41
42	ArrayRef<FormatToken *> lex();
43
44	const AdditionalKeywords &getKeywords() { return Keywords; }
45
46	private:
47	void tryMergePreviousTokens();
48
49	bool tryMergeLessLess();
50	bool tryMergeGreaterGreater();
51	bool tryMergeUserDefinedLiteral();
52	bool tryMergeNSStringLiteral();
53	bool tryMergeJSPrivateIdentifier();
54	bool tryMergeCSharpStringLiteral();
55	bool tryMergeCSharpKeywordVariables();
56	bool tryMergeNullishCoalescingEqual();
57	bool tryTransformCSharpForEach();
58	bool tryMergeForEach();
59
60	// Merge the most recently lexed tokens into a single token if their kinds are
61	// correct.
62	bool tryMergeTokens(ArrayRef<tok::TokenKind> Kinds, TokenType NewType);
63	// Merge without checking their kinds.
64	bool tryMergeTokens(size_t Count, TokenType NewType);
65	// Merge if their kinds match any one of Kinds.
66	bool tryMergeTokensAny(ArrayRef<ArrayRef<tok::TokenKind>> Kinds,
67	TokenType NewType);
68
69	// Returns \c true if \p Tok can only be followed by an operand in JavaScript.
70	bool precedesOperand(FormatToken *Tok);
71
72	bool canPrecedeRegexLiteral(FormatToken *Prev);
73
74	void tryParseJavaTextBlock();
75
76	// Tries to parse a JavaScript Regex literal starting at the current token,
77	// if that begins with a slash and is in a location where JavaScript allows
78	// regex literals. Changes the current token to a regex literal and updates
79	// its text if successful.
80	void tryParseJSRegexLiteral();
81
82	// Handles JavaScript template strings.
83	//
84	// JavaScript template strings use backticks ('`') as delimiters, and allow
85	// embedding expressions nested in ${expr-here}. Template strings can be
86	// nested recursively, i.e. expressions can contain template strings in turn.
87	//
88	// The code below parses starting from a backtick, up to a closing backtick or
89	// an opening ${. It also maintains a stack of lexing contexts to handle
90	// nested template parts by balancing curly braces.
91	void handleTemplateStrings();
92
93	void handleCSharpVerbatimAndInterpolatedStrings();
94
95	// Handles TableGen multiline strings. It has the form [{ ... }].
96	void handleTableGenMultilineString();
97	// Handles TableGen numeric like identifiers.
98	// They have a forms of [0-9][_a-zA-Z]([_a-zA-Z0-9]). But limited to the
99	// case it is not lexed as an integer.
100	void handleTableGenNumericLikeIdentifier();
101
102	void tryParsePythonComment();
103
104	bool tryMerge_TMacro();
105
106	bool tryMergeConflictMarkers();
107
108	void truncateToken(size_t NewLen);
109
110	FormatToken *getStashedToken();
111
112	FormatToken *getNextToken();
113
114	FormatToken *FormatTok;
115	bool IsFirstToken;
116	std::stack<LexerState> StateStack;
117	unsigned Column;
118	unsigned TrailingWhitespace;
119	std::unique_ptr<Lexer> Lex;
120	LangOptions LangOpts;
121	const SourceManager &SourceMgr;
122	FileID ID;
123	const FormatStyle &Style;
124	IdentifierTable &IdentTable;
125	AdditionalKeywords Keywords;
126	encoding::Encoding Encoding;
127	llvm::SpecificBumpPtrAllocator<FormatToken> &Allocator;
128	// Index (in 'Tokens') of the last token that starts a new line.
129	unsigned FirstInLineIndex;
130	SmallVector<FormatToken *, `16`> Tokens;
131
132	llvm::SmallMapVector<IdentifierInfo *, TokenType, `8`> Macros;
133
134	llvm::SmallPtrSet<IdentifierInfo *, `8`> MacrosSkippedByRemoveParentheses,
135	TemplateNames, TypeNames, VariableTemplates;
136
137	bool FormattingDisabled;
138	llvm::Regex FormatOffRegex; // For one line.
139
140	llvm::Regex MacroBlockBeginRegex;
141	llvm::Regex MacroBlockEndRegex;
142
143	// Targets that may appear inside a C# attribute.
144	static const llvm::StringSet<> CSharpAttributeTargets;
145
146	/// Handle Verilog-specific tokens.
147	bool readRawTokenVerilogSpecific(Token &Tok);
148
149	void readRawToken(FormatToken &Tok);
150
151	void resetLexer(unsigned Offset);
152	};
153
154	} // namespace format
155	} // namespace clang
156
157	#endif
158

Browse the source code of llvm_projects/clang/lib/Format/FormatTokenLexer.h