// TGLexer.h source code [llvm_projects/llvm/lib/TableGen/TGLexer.h]

//===- TGLexer.h - Lexer for TableGen Files ---------------------*- C++ -*-===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	// This class represents the Lexer for tablegen files.
10	//
11	//===----------------------------------------------------------------------===//
12
13	#ifndef LLVM_LIB_TABLEGEN_TGLEXER_H
14	#define LLVM_LIB_TABLEGEN_TGLEXER_H
15
16	#include "llvm/ADT/SmallVector.h"
17	#include "llvm/ADT/StringRef.h"
18	#include "llvm/ADT/StringSet.h"
19	#include "llvm/Support/DataTypes.h"
20	#include "llvm/Support/SMLoc.h"
21	#include <cassert>
22	#include <memory>
23	#include <set>
24	#include <string>
25
26	namespace llvm {
27	template <typename T> class ArrayRef;
28	class SourceMgr;
29	class Twine;
30
namespace tgtok {
/// Kinds of token the TableGen lexer distinguishes. The *_FIRST / *_LAST
/// enumerators bracket contiguous groups so that group membership can be
/// decided with a pair of comparisons (see the predicates after the enum).
enum TokKind {
  // Marker kinds.
  Eof,
  Error,

  // Punctuation; these carry no additional information.
  minus,     // '-'
  plus,      // '+'
  l_square,  // '['
  r_square,  // ']'
  l_brace,   // '{'
  r_brace,   // '}'
  l_paren,   // '('
  r_paren,   // ')'
  less,      // '<'
  greater,   // '>'
  colon,     // ':'
  semi,      // ';'
  comma,     // ','
  dot,       // '.'
  equal,     // '='
  question,  // '?'
  paste,     // '#'
  dotdotdot, // '...'

  // The two boolean literals.
  TrueVal,
  FalseVal,

  // An integer value.
  IntVal,

  // A binary constant; its size follows from the number of bits that were
  // written.
  BinaryIntVal,

  // Preprocessing tokens. The lexer uses them internally only; Lex() never
  // hands them out as its result.
  Ifdef,
  Ifndef,
  Else,
  Endif,
  Define,

  // Reserved keywords. 'ElseKW' carries the suffix so that it cannot be
  // confused with 'Else' above, which stands for the preprocessor's #else.
  Bit,
  Bits,
  Code,
  Dag,
  ElseKW,
  Field,
  In,
  Include,
  Int,
  List,
  String,
  Then,

  // Tokens that may begin an object / statement.
  OBJECT_START_FIRST,
  Assert = OBJECT_START_FIRST,
  Class,
  Def,
  Defm,
  Defset,
  Deftype,
  Defvar,
  Dump,
  Foreach,
  If,
  Let,
  MultiClass,
  OBJECT_START_LAST = MultiClass,

  // The bang operators.
  BANG_OPERATOR_FIRST,
  XConcat = BANG_OPERATOR_FIRST,
  XADD,
  XSUB,
  XMUL,
  XDIV,
  XNOT,
  XLOG2,
  XAND,
  XOR,
  XXOR,
  XSRA,
  XSRL,
  XSHL,
  XListConcat,
  XListFlatten,
  XListSplat,
  XStrConcat,
  XInterleave,
  XSubstr,
  XFind,
  XMatch,
  XCast,
  XSubst,
  XForEach,
  XFilter,
  XFoldl,
  XHead,
  XTail,
  XSize,
  XEmpty,
  XInitialized,
  XInstances,
  XIf,
  XCond,
  XEq,
  XIsA,
  XDag,
  XNe,
  XLe,
  XLt,
  XGe,
  XGt,
  XSetDagOp,
  XGetDagOp,
  XExists,
  XListRemove,
  XToLower,
  XToUpper,
  XRange,
  XGetDagArg,
  XGetDagName,
  XSetDagArg,
  XSetDagName,
  XRepr,
  BANG_OPERATOR_LAST = XRepr,

  // Tokens that carry a string value.
  STRING_VALUE_FIRST,
  Id = STRING_VALUE_FIRST,
  StrVal,
  VarName,
  CodeFragment,
  STRING_VALUE_LAST = CodeFragment,
};

/// isBangOperator - True when \p Kind falls inside the inclusive range
/// [BANG_OPERATOR_FIRST, BANG_OPERATOR_LAST].
static inline bool isBangOperator(tgtok::TokKind Kind) {
  return !(Kind < BANG_OPERATOR_FIRST || Kind > BANG_OPERATOR_LAST);
}

/// isObjectStart - True when \p Kind is a valid first token for a statement,
/// i.e. it falls inside [OBJECT_START_FIRST, OBJECT_START_LAST].
static inline bool isObjectStart(tgtok::TokKind Kind) {
  return !(Kind < OBJECT_START_FIRST || Kind > OBJECT_START_LAST);
}

/// isStringValue - True when \p Kind is one of the string valued tokens,
/// i.e. it falls inside [STRING_VALUE_FIRST, STRING_VALUE_LAST].
static inline bool isStringValue(tgtok::TokKind Kind) {
  return !(Kind < STRING_VALUE_FIRST || Kind > STRING_VALUE_LAST);
}
} // namespace tgtok
189
190	/// TGLexer - TableGen Lexer class.
191	class TGLexer {
192	SourceMgr &SrcMgr;
193
194	const char CurPtr = nullptr*;
195	StringRef CurBuf;
196
197	// Information about the current token.
198	const char TokStart = nullptr*;
199	tgtok::TokKind CurCode = tgtok::TokKind::Eof;
200	std::string CurStrVal; // This is valid for Id, StrVal, VarName, CodeFragment
201	int64_t CurIntVal = `0`; // This is valid for IntVal.
202
203	/// CurBuffer - This is the current buffer index we're lexing from as managed
204	/// by the SourceMgr object.
205	unsigned CurBuffer = `0`;
206
207	public:
208	typedef std::set<std::string> DependenciesSetTy;
209
210	private:
211	/// Dependencies - This is the list of all included files.
212	DependenciesSetTy Dependencies;
213
214	public:
215	TGLexer(SourceMgr &SrcMgr, ArrayRef<std::string> Macros);
216
217	tgtok::TokKind Lex() {
218	return CurCode = LexToken(FileOrLineStart: CurPtr == CurBuf.begin());
219	}
220
221	const DependenciesSetTy &getDependencies() const {
222	return Dependencies;
223	}
224
225	tgtok::TokKind getCode() const { return CurCode; }
226
227	const std::string &getCurStrVal() const {
228	assert(tgtok::isStringValue(CurCode) &&
229	"This token doesn't have a string value");
230	return CurStrVal;
231	}
232	int64_t getCurIntVal() const {
233	assert(CurCode == tgtok::IntVal && "This token isn't an integer");
234	return CurIntVal;
235	}
236	std::pair<int64_t, unsigned> getCurBinaryIntVal() const {
237	assert(CurCode == tgtok::BinaryIntVal &&
238	"This token isn't a binary integer");
239	return {CurIntVal, (CurPtr - TokStart) - `2`};
240	}
241
242	SMLoc getLoc() const;
243	SMRange getLocRange() const;
244
245	private:
246	/// LexToken - Read the next token and return its code.
247	tgtok::TokKind LexToken(bool FileOrLineStart = false);
248
249	tgtok::TokKind ReturnError(SMLoc Loc, const Twine &Msg);
250	tgtok::TokKind ReturnError(const char Loc, const* Twine &Msg);
251
252	int getNextChar();
253	int peekNextChar(int Index) const;
254	void SkipBCPLComment();
255	bool SkipCComment();
256	tgtok::TokKind LexIdentifier();
257	bool LexInclude();
258	tgtok::TokKind LexString();
259	tgtok::TokKind LexVarName();
260	tgtok::TokKind LexNumber();
261	tgtok::TokKind LexBracket();
262	tgtok::TokKind LexExclaim();
263
264	// Process EOF encountered in LexToken().
265	// If EOF is met in an include file, then the method will update
266	// CurPtr, CurBuf and preprocessing include stack, and return true.
267	// If EOF is met in the top-level file, then the method will
268	// update and check the preprocessing include stack, and return false.
269	bool processEOF();
270
271	// * Structures and methods for preprocessing support *
272
273	// A set of macro names that are defined either via command line or
274	// by using:
275	// #define NAME
276	StringSet<> DefinedMacros;
277
278	// Each of #ifdef and #else directives has a descriptor associated
279	// with it.
280	//
281	// An ordered list of preprocessing controls defined by #ifdef/#else
282	// directives that are in effect currently is called preprocessing
283	// control stack. It is represented as a vector of PreprocessorControlDesc's.
284	//
285	// The control stack is updated according to the following rules:
286	//
287	// For each #ifdef we add an element to the control stack.
288	// For each #else we replace the top element with a descriptor
289	// with an inverted IsDefined value.
290	// For each #endif we pop the top element from the control stack.
291	//
292	// When CurPtr reaches the current buffer's end, the control stack
293	// must be empty, i.e. #ifdef and the corresponding #endif
294	// must be located in the same file.
295	struct PreprocessorControlDesc {
296	// Either tgtok::Ifdef or tgtok::Else.
297	tgtok::TokKind Kind;
298
299	// True, if the condition for this directive is true, false - otherwise.
300	// Examples:
301	// #ifdef NAME : true, if NAME is defined, false - otherwise.
302	// ...
303	// #else : false, if NAME is defined, true - otherwise.
304	bool IsDefined;
305
306	// Pointer into CurBuf to the beginning of the preprocessing directive
307	// word, e.g.:
308	// #ifdef NAME
309	// ^ - SrcPos
310	SMLoc SrcPos;
311	};
312
313	// We want to disallow code like this:
314	// file1.td:
315	// #define NAME
316	// #ifdef NAME
317	// include "file2.td"
318	// EOF
319	// file2.td:
320	// #endif
321	// EOF
322	//
323	// To do this, we clear the preprocessing control stack on entry
324	// to each of the included file. PrepIncludeStack is used to store
325	// preprocessing control stacks for the current file and all its
326	// parent files. The back() element is the preprocessing control
327	// stack for the current file.
328	SmallVector<SmallVector<PreprocessorControlDesc>> PrepIncludeStack;
329
330	// Validate that the current preprocessing control stack is empty,
331	// since we are about to exit a file, and pop the include stack.
332	//
333	// If IncludeStackMustBeEmpty is true, the include stack must be empty
334	// after the popping, otherwise, the include stack must not be empty
335	// after the popping. Basically, the include stack must be empty
336	// only if we exit the "top-level" file (i.e. finish lexing).
337	//
338	// The method returns false, if the current preprocessing control stack
339	// is not empty (e.g. there is an unterminated #ifdef/#else),
340	// true - otherwise.
341	bool prepExitInclude(bool IncludeStackMustBeEmpty);
342
343	// Look ahead for a preprocessing directive starting from CurPtr. The caller
344	// must only call this method, if (CurPtr - 1) is '#'. If the method matches*
345	// a preprocessing directive word followed by a whitespace, then it returns
346	// one of the internal token kinds, i.e. Ifdef, Else, Endif, Define.
347	//
348	// CurPtr is not adjusted by this method.
349	tgtok::TokKind prepIsDirective() const;
350
351	// Given a preprocessing token kind, adjusts CurPtr to the end
352	// of the preprocessing directive word.
353	//
354	// We use look-ahead prepIsDirective() and prepEatPreprocessorDirective()
355	// to avoid adjusting CurPtr before we are sure that '#' is followed
356	// by a preprocessing directive. If it is not, then we fall back to
357	// tgtok::paste interpretation of '#'.
358	void prepEatPreprocessorDirective(tgtok::TokKind Kind);
359
360	// The main "exit" point from the token parsing to preprocessor.
361	//
362	// The method is called for CurPtr, when prepIsDirective() returns
363	// true. The first parameter matches the result of prepIsDirective(),
364	// denoting the actual preprocessor directive to be processed.
365	//
366	// If the preprocessing directive disables the tokens processing, e.g.:
367	// #ifdef NAME // NAME is undefined
368	// then lexPreprocessor() enters the lines-skipping mode.
369	// In this mode, it does not parse any tokens, because the code under
370	// the #ifdef may not even be a correct tablegen code. The preprocessor
371	// looks for lines containing other preprocessing directives, which
372	// may be prepended with whitespaces and C-style comments. If the line
373	// does not contain a preprocessing directive, it is skipped completely.
374	// Otherwise, the preprocessing directive is processed by recursively
375	// calling lexPreprocessor(). The processing of the encountered
376	// preprocessing directives includes updating preprocessing control stack
377	// and adding new macros into DefinedMacros set.
378	//
379	// The second parameter controls whether lexPreprocessor() is called from
380	// LexToken() (true) or recursively from lexPreprocessor() (false).
381	//
382	// If ReturnNextLiveToken is true, the method returns the next
383	// LEX token following the current directive or following the end
384	// of the disabled preprocessing region corresponding to this directive.
385	// If ReturnNextLiveToken is false, the method returns the first parameter,
386	// unless there were errors encountered in the disabled preprocessing
387	// region - in this case, it returns tgtok::Error.
388	tgtok::TokKind lexPreprocessor(tgtok::TokKind Kind,
389	bool ReturnNextLiveToken = true);
390
391	// Worker method for lexPreprocessor() to skip lines after some
392	// preprocessing directive up to the buffer end or to the directive
393	// that re-enables token processing. The method returns true
394	// upon processing the next directive that re-enables tokens
395	// processing. False is returned if an error was encountered.
396	//
397	// Note that prepSkipRegion() calls lexPreprocessor() to process
398	// encountered preprocessing directives. In this case, the second
399	// parameter to lexPreprocessor() is set to false. Being passed
400	// false ReturnNextLiveToken, lexPreprocessor() must never call
401	// prepSkipRegion(). We assert this by passing ReturnNextLiveToken
402	// to prepSkipRegion() and checking that it is never set to false.
403	bool prepSkipRegion(bool MustNeverBeFalse);
404
405	// Lex name of the macro after either #ifdef or #define. We could have used
406	// LexIdentifier(), but it has special handling of "include" word, which
407	// could result in awkward diagnostic errors. Consider:
408	// ----
409	// #ifdef include
410	// class ...
411	// ----
412	// LexIdentifier() will engage LexInclude(), which will complain about
413	// missing file with name "class". Instead, prepLexMacroName() will treat
414	// "include" as a normal macro name.
415	//
416	// On entry, CurPtr points to the end of a preprocessing directive word.
417	// The method allows for whitespaces between the preprocessing directive
418	// and the macro name. The allowed whitespaces are ' ' and '\t'.
419	//
420	// If the first non-whitespace symbol after the preprocessing directive
421	// is a valid start symbol for an identifier (i.e. [a-zA-Z_]), then
422	// the method updates TokStart to the position of the first non-whitespace
423	// symbol, sets CurPtr to the position of the macro name's last symbol,
424	// and returns a string reference to the macro name. Otherwise,
425	// TokStart is set to the first non-whitespace symbol after the preprocessing
426	// directive, and the method returns an empty string reference.
427	//
428	// In all cases, TokStart may be used to point to the word following
429	// the preprocessing directive.
430	StringRef prepLexMacroName();
431
432	// Skip any whitespaces starting from CurPtr. The method is used
433	// only in the lines-skipping mode to find the first non-whitespace
434	// symbol after or at CurPtr. Allowed whitespaces are ' ', '\t', '\n'
435	// and '\r'. The method skips C-style comments as well, because
436	// it is used to find the beginning of the preprocessing directive.
437	// If we do not handle C-style comments the following code would
438	// result in incorrect detection of a preprocessing directive:
439	// /*
440	// #ifdef NAME
441	// /*
442	// As long as we skip C-style comments, the following code is correctly
443	// recognized as a preprocessing directive:
444	// / first line comment*
445	// second line comment / #ifdef NAME*
446	//
447	// The method returns true upon reaching the first non-whitespace symbol
448	// or EOF, CurPtr is set to point to this symbol. The method returns false,
449	// if an error occurred during skipping of a C-style comment.
450	bool prepSkipLineBegin();
451
452	// Skip any whitespaces or comments after a preprocessing directive.
453	// The method returns true upon reaching either end of the line
454	// or end of the file. If there is a multiline C-style comment
455	// after the preprocessing directive, the method skips
456	// the comment, so the final CurPtr may point to one of the next lines.
457	// The method returns false, if an error occurred during skipping
458	// C- or C++-style comment, or a non-whitespace symbol appears
459	// after the preprocessing directive.
460	//
461	// The method maybe called both during lines-skipping and tokens
462	// processing. It actually verifies that only whitespaces or/and
463	// comments follow a preprocessing directive.
464	//
465	// After the execution of this mehod, CurPtr points either to new line
466	// symbol, buffer end or non-whitespace symbol following the preprocesing
467	// directive.
468	bool prepSkipDirectiveEnd();
469
470	// Return true, if the current preprocessor control stack is such that
471	// we should allow lexer to process the next token, false - otherwise.
472	//
473	// In particular, the method returns true, if all the #ifdef/#else
474	// controls on the stack have their IsDefined member set to true.
475	bool prepIsProcessingEnabled();
476
477	// Report an error, if we reach EOF with non-empty preprocessing control
478	// stack. This means there is no matching #endif for the previous
479	// #ifdef/#else.
480	void prepReportPreprocessorStackError();
481	};
482
483	} // end namespace llvm
484
485	#endif
486

// Browse the source code of llvm_projects/llvm/lib/TableGen/TGLexer.h