1//===- TGLexer.h - Lexer for TableGen Files ---------------------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This class represents the Lexer for tablegen files.
10//
11//===----------------------------------------------------------------------===//
12
13#ifndef LLVM_LIB_TABLEGEN_TGLEXER_H
14#define LLVM_LIB_TABLEGEN_TGLEXER_H
15
16#include "llvm/ADT/SmallVector.h"
17#include "llvm/ADT/StringRef.h"
18#include "llvm/ADT/StringSet.h"
19#include "llvm/Support/DataTypes.h"
20#include "llvm/Support/SMLoc.h"
21#include <cassert>
22#include <set>
23#include <string>
24
25namespace llvm {
26template <typename T> class ArrayRef;
27class SourceMgr;
28class Twine;
29
namespace tgtok {
/// TokKind - Every kind of token the TableGen lexer distinguishes.
///
/// Enumerator order is significant. Each *_FIRST / *_LAST marker aliases the
/// first / last enumerator of a contiguous group, and the is*() predicates
/// below are plain range checks against those markers. A new enumerator must
/// therefore be added inside the group it belongs to, and the matching *_LAST
/// alias updated when the new enumerator becomes the last one of its group.
enum TokKind {
  // Markers
  Eof,
  Error,

  // Tokens with no info.
  minus,     // -
  plus,      // +
  l_square,  // [
  r_square,  // ]
  l_brace,   // {
  r_brace,   // }
  l_paren,   // (
  r_paren,   // )
  less,      // <
  greater,   // >
  colon,     // :
  semi,      // ;
  comma,     // ,
  dot,       // .
  equal,     // =
  question,  // ?
  paste,     // #
  dotdotdot, // ...

  // Boolean literals.
  TrueVal,
  FalseVal,

  // Integer value.
  IntVal,

  // Binary constant.  Note that these are sized according to the number of
  // bits given.
  BinaryIntVal,

  // Preprocessing tokens for internal usage by the lexer.
  // They are never returned as a result of Lex().
  Ifdef,
  Ifndef,
  Else,
  Endif,
  Define,

  // Reserved keywords. ('ElseKW' is named to distinguish it from the
  // existing 'Else' that means the preprocessor #else.)
  Bit,
  Bits,
  Code,
  Dag,
  ElseKW,
  Field,
  In,
  Include,
  Int,
  List,
  String,
  Then,

  // Object start tokens.
  OBJECT_START_FIRST,
  Assert = OBJECT_START_FIRST,
  Class,
  Def,
  Defm,
  Defset,
  Deftype,
  Defvar,
  Dump,
  Foreach,
  If,
  Let,
  MultiClass,
  OBJECT_START_LAST = MultiClass,

  // Bang operators.
  BANG_OPERATOR_FIRST,
  XConcat = BANG_OPERATOR_FIRST,
  XADD,
  XSUB,
  XMUL,
  XDIV,
  XNOT,
  XLOG2,
  XAND,
  XOR,
  XXOR,
  XSRA,
  XSRL,
  XSHL,
  XListConcat,
  XListFlatten,
  XListSplat,
  XStrConcat,
  XInterleave,
  XSubstr,
  XFind,
  XMatch,
  XCast,
  XSubst,
  XForEach,
  XFilter,
  XFoldl,
  XHead,
  XTail,
  XSize,
  XEmpty,
  XInitialized,
  XInstances,
  XIf,
  XCond,
  XEq,
  XIsA,
  XDag,
  XNe,
  XLe,
  XLt,
  XGe,
  XGt,
  XSetDagOp,
  XGetDagOp,
  XSetDagOpName,
  XGetDagOpName,
  XExists,
  XListRemove,
  XToLower,
  XToUpper,
  XRange,
  XGetDagArg,
  XGetDagName,
  XSetDagArg,
  XSetDagName,
  XRepr,
  BANG_OPERATOR_LAST = XRepr,

  // String valued tokens.
  STRING_VALUE_FIRST,
  Id = STRING_VALUE_FIRST,
  StrVal,
  VarName,
  CodeFragment,
  STRING_VALUE_LAST = CodeFragment,
};

/// isBangOperator - Return true if this is a bang operator, i.e. \p Kind lies
/// in the inclusive range [BANG_OPERATOR_FIRST, BANG_OPERATOR_LAST].
static inline bool isBangOperator(tgtok::TokKind Kind) {
  return tgtok::BANG_OPERATOR_FIRST <= Kind &&
         Kind <= tgtok::BANG_OPERATOR_LAST;
}

/// isObjectStart - Return true if this is a valid first token for a statement,
/// i.e. \p Kind lies in the inclusive range
/// [OBJECT_START_FIRST, OBJECT_START_LAST].
static inline bool isObjectStart(tgtok::TokKind Kind) {
  return tgtok::OBJECT_START_FIRST <= Kind &&
         Kind <= tgtok::OBJECT_START_LAST;
}

/// isStringValue - Return true if this is a string value, i.e. \p Kind lies in
/// the inclusive range [STRING_VALUE_FIRST, STRING_VALUE_LAST].
static inline bool isStringValue(tgtok::TokKind Kind) {
  return tgtok::STRING_VALUE_FIRST <= Kind &&
         Kind <= tgtok::STRING_VALUE_LAST;
}
} // namespace tgtok
190
191/// TGLexer - TableGen Lexer class.
192class TGLexer {
193 SourceMgr &SrcMgr;
194
195 const char *CurPtr = nullptr;
196 StringRef CurBuf;
197
198 // Information about the current token.
199 const char *TokStart = nullptr;
200 tgtok::TokKind CurCode = tgtok::TokKind::Eof;
201 std::string CurStrVal; // This is valid for Id, StrVal, VarName, CodeFragment
202 int64_t CurIntVal = 0; // This is valid for IntVal.
203
204 /// CurBuffer - This is the current buffer index we're lexing from as managed
205 /// by the SourceMgr object.
206 unsigned CurBuffer = 0;
207
208public:
209 typedef std::set<std::string> DependenciesSetTy;
210
211private:
212 /// Dependencies - This is the list of all included files.
213 DependenciesSetTy Dependencies;
214
215public:
216 TGLexer(SourceMgr &SrcMgr, ArrayRef<std::string> Macros);
217
218 tgtok::TokKind Lex() { return CurCode = LexToken(FileOrLineStart: CurPtr == CurBuf.begin()); }
219
220 const DependenciesSetTy &getDependencies() const { return Dependencies; }
221
222 tgtok::TokKind getCode() const { return CurCode; }
223
224 const std::string &getCurStrVal() const {
225 assert(tgtok::isStringValue(CurCode) &&
226 "This token doesn't have a string value");
227 return CurStrVal;
228 }
229 int64_t getCurIntVal() const {
230 assert(CurCode == tgtok::IntVal && "This token isn't an integer");
231 return CurIntVal;
232 }
233 std::pair<int64_t, unsigned> getCurBinaryIntVal() const {
234 assert(CurCode == tgtok::BinaryIntVal &&
235 "This token isn't a binary integer");
236 return {CurIntVal, (CurPtr - TokStart) - 2};
237 }
238
239 SMLoc getLoc() const;
240 SMRange getLocRange() const;
241
242private:
243 /// LexToken - Read the next token and return its code.
244 tgtok::TokKind LexToken(bool FileOrLineStart = false);
245
246 tgtok::TokKind ReturnError(SMLoc Loc, const Twine &Msg);
247 tgtok::TokKind ReturnError(const char *Loc, const Twine &Msg);
248
249 int getNextChar();
250 int peekNextChar(int Index) const;
251 void SkipBCPLComment();
252 bool SkipCComment();
253 tgtok::TokKind LexIdentifier();
254 bool LexInclude();
255 tgtok::TokKind LexString();
256 tgtok::TokKind LexVarName();
257 tgtok::TokKind LexNumber();
258 tgtok::TokKind LexBracket();
259 tgtok::TokKind LexExclaim();
260
261 // Process EOF encountered in LexToken().
262 // If EOF is met in an include file, then the method will update
263 // CurPtr, CurBuf and preprocessing include stack, and return true.
264 // If EOF is met in the top-level file, then the method will
265 // update and check the preprocessing include stack, and return false.
266 bool processEOF();
267
268 // *** Structures and methods for preprocessing support ***
269
270 // A set of macro names that are defined either via command line or
271 // by using:
272 // #define NAME
273 StringSet<> DefinedMacros;
274
275 // Each of #ifdef and #else directives has a descriptor associated
276 // with it.
277 //
278 // An ordered list of preprocessing controls defined by #ifdef/#else
279 // directives that are in effect currently is called preprocessing
280 // control stack. It is represented as a vector of PreprocessorControlDesc's.
281 //
282 // The control stack is updated according to the following rules:
283 //
284 // For each #ifdef we add an element to the control stack.
285 // For each #else we replace the top element with a descriptor
286 // with an inverted IsDefined value.
287 // For each #endif we pop the top element from the control stack.
288 //
289 // When CurPtr reaches the current buffer's end, the control stack
290 // must be empty, i.e. #ifdef and the corresponding #endif
291 // must be located in the same file.
292 struct PreprocessorControlDesc {
293 // Either tgtok::Ifdef or tgtok::Else.
294 tgtok::TokKind Kind;
295
296 // True, if the condition for this directive is true, false - otherwise.
297 // Examples:
298 // #ifdef NAME : true, if NAME is defined, false - otherwise.
299 // ...
300 // #else : false, if NAME is defined, true - otherwise.
301 bool IsDefined;
302
303 // Pointer into CurBuf to the beginning of the preprocessing directive
304 // word, e.g.:
305 // #ifdef NAME
306 // ^ - SrcPos
307 SMLoc SrcPos;
308 };
309
310 // We want to disallow code like this:
311 // file1.td:
312 // #define NAME
313 // #ifdef NAME
314 // include "file2.td"
315 // EOF
316 // file2.td:
317 // #endif
318 // EOF
319 //
320 // To do this, we clear the preprocessing control stack on entry
321 // to each of the included file. PrepIncludeStack is used to store
322 // preprocessing control stacks for the current file and all its
323 // parent files. The back() element is the preprocessing control
324 // stack for the current file.
325 SmallVector<SmallVector<PreprocessorControlDesc>> PrepIncludeStack;
326
327 // Validate that the current preprocessing control stack is empty,
328 // since we are about to exit a file, and pop the include stack.
329 //
330 // If IncludeStackMustBeEmpty is true, the include stack must be empty
331 // after the popping, otherwise, the include stack must not be empty
332 // after the popping. Basically, the include stack must be empty
333 // only if we exit the "top-level" file (i.e. finish lexing).
334 //
335 // The method returns false, if the current preprocessing control stack
336 // is not empty (e.g. there is an unterminated #ifdef/#else),
337 // true - otherwise.
338 bool prepExitInclude(bool IncludeStackMustBeEmpty);
339
340 // Look ahead for a preprocessing directive starting from CurPtr. The caller
341 // must only call this method, if *(CurPtr - 1) is '#'. If the method matches
342 // a preprocessing directive word followed by a whitespace, then it returns
343 // one of the internal token kinds, i.e. Ifdef, Else, Endif, Define.
344 //
345 // CurPtr is not adjusted by this method.
346 tgtok::TokKind prepIsDirective() const;
347
348 // Given a preprocessing token kind, adjusts CurPtr to the end
349 // of the preprocessing directive word.
350 //
351 // We use look-ahead prepIsDirective() and prepEatPreprocessorDirective()
352 // to avoid adjusting CurPtr before we are sure that '#' is followed
353 // by a preprocessing directive. If it is not, then we fall back to
354 // tgtok::paste interpretation of '#'.
355 void prepEatPreprocessorDirective(tgtok::TokKind Kind);
356
357 // The main "exit" point from the token parsing to preprocessor.
358 //
359 // The method is called for CurPtr, when prepIsDirective() returns
360 // true. The first parameter matches the result of prepIsDirective(),
361 // denoting the actual preprocessor directive to be processed.
362 //
363 // If the preprocessing directive disables the tokens processing, e.g.:
364 // #ifdef NAME // NAME is undefined
365 // then lexPreprocessor() enters the lines-skipping mode.
366 // In this mode, it does not parse any tokens, because the code under
367 // the #ifdef may not even be a correct tablegen code. The preprocessor
368 // looks for lines containing other preprocessing directives, which
369 // may be prepended with whitespaces and C-style comments. If the line
370 // does not contain a preprocessing directive, it is skipped completely.
371 // Otherwise, the preprocessing directive is processed by recursively
372 // calling lexPreprocessor(). The processing of the encountered
373 // preprocessing directives includes updating preprocessing control stack
374 // and adding new macros into DefinedMacros set.
375 //
376 // The second parameter controls whether lexPreprocessor() is called from
377 // LexToken() (true) or recursively from lexPreprocessor() (false).
378 //
379 // If ReturnNextLiveToken is true, the method returns the next
380 // LEX token following the current directive or following the end
381 // of the disabled preprocessing region corresponding to this directive.
382 // If ReturnNextLiveToken is false, the method returns the first parameter,
383 // unless there were errors encountered in the disabled preprocessing
384 // region - in this case, it returns tgtok::Error.
385 tgtok::TokKind lexPreprocessor(tgtok::TokKind Kind,
386 bool ReturnNextLiveToken = true);
387
388 // Worker method for lexPreprocessor() to skip lines after some
389 // preprocessing directive up to the buffer end or to the directive
390 // that re-enables token processing. The method returns true
391 // upon processing the next directive that re-enables tokens
392 // processing. False is returned if an error was encountered.
393 //
394 // Note that prepSkipRegion() calls lexPreprocessor() to process
395 // encountered preprocessing directives. In this case, the second
396 // parameter to lexPreprocessor() is set to false. Being passed
397 // false ReturnNextLiveToken, lexPreprocessor() must never call
398 // prepSkipRegion(). We assert this by passing ReturnNextLiveToken
399 // to prepSkipRegion() and checking that it is never set to false.
400 bool prepSkipRegion(bool MustNeverBeFalse);
401
402 // Lex name of the macro after either #ifdef or #define. We could have used
403 // LexIdentifier(), but it has special handling of "include" word, which
404 // could result in awkward diagnostic errors. Consider:
405 // ----
406 // #ifdef include
407 // class ...
408 // ----
409 // LexIdentifier() will engage LexInclude(), which will complain about
410 // missing file with name "class". Instead, prepLexMacroName() will treat
411 // "include" as a normal macro name.
412 //
413 // On entry, CurPtr points to the end of a preprocessing directive word.
414 // The method allows for whitespaces between the preprocessing directive
415 // and the macro name. The allowed whitespaces are ' ' and '\t'.
416 //
417 // If the first non-whitespace symbol after the preprocessing directive
418 // is a valid start symbol for an identifier (i.e. [a-zA-Z_]), then
419 // the method updates TokStart to the position of the first non-whitespace
420 // symbol, sets CurPtr to the position of the macro name's last symbol,
421 // and returns a string reference to the macro name. Otherwise,
422 // TokStart is set to the first non-whitespace symbol after the preprocessing
423 // directive, and the method returns an empty string reference.
424 //
425 // In all cases, TokStart may be used to point to the word following
426 // the preprocessing directive.
427 StringRef prepLexMacroName();
428
429 // Skip any whitespaces starting from CurPtr. The method is used
430 // only in the lines-skipping mode to find the first non-whitespace
431 // symbol after or at CurPtr. Allowed whitespaces are ' ', '\t', '\n'
432 // and '\r'. The method skips C-style comments as well, because
433 // it is used to find the beginning of the preprocessing directive.
434 // If we do not handle C-style comments the following code would
435 // result in incorrect detection of a preprocessing directive:
436 // /*
437 // #ifdef NAME
438 // */
439 // As long as we skip C-style comments, the following code is correctly
440 // recognized as a preprocessing directive:
441 // /* first line comment
442 // second line comment */ #ifdef NAME
443 //
444 // The method returns true upon reaching the first non-whitespace symbol
445 // or EOF, CurPtr is set to point to this symbol. The method returns false,
446 // if an error occurred during skipping of a C-style comment.
447 bool prepSkipLineBegin();
448
449 // Skip any whitespaces or comments after a preprocessing directive.
450 // The method returns true upon reaching either end of the line
451 // or end of the file. If there is a multiline C-style comment
452 // after the preprocessing directive, the method skips
453 // the comment, so the final CurPtr may point to one of the next lines.
454 // The method returns false, if an error occurred during skipping
455 // C- or C++-style comment, or a non-whitespace symbol appears
456 // after the preprocessing directive.
457 //
458 // The method maybe called both during lines-skipping and tokens
459 // processing. It actually verifies that only whitespaces or/and
460 // comments follow a preprocessing directive.
461 //
462 // After the execution of this mehod, CurPtr points either to new line
463 // symbol, buffer end or non-whitespace symbol following the preprocesing
464 // directive.
465 bool prepSkipDirectiveEnd();
466
467 // Return true, if the current preprocessor control stack is such that
468 // we should allow lexer to process the next token, false - otherwise.
469 //
470 // In particular, the method returns true, if all the #ifdef/#else
471 // controls on the stack have their IsDefined member set to true.
472 bool prepIsProcessingEnabled();
473
474 // Report an error, if we reach EOF with non-empty preprocessing control
475 // stack. This means there is no matching #endif for the previous
476 // #ifdef/#else.
477 void prepReportPreprocessorStackError();
478};
479
480} // end namespace llvm
481
482#endif
483