1//===- TGLexer.h - Lexer for TableGen Files ---------------------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This class represents the Lexer for tablegen files.
10//
11//===----------------------------------------------------------------------===//
12
13#ifndef LLVM_LIB_TABLEGEN_TGLEXER_H
14#define LLVM_LIB_TABLEGEN_TGLEXER_H
15
16#include "llvm/ADT/SmallVector.h"
17#include "llvm/ADT/StringRef.h"
18#include "llvm/ADT/StringSet.h"
19#include "llvm/Support/DataTypes.h"
20#include "llvm/Support/SMLoc.h"
21#include <cassert>
22#include <set>
23#include <string>
24
25namespace llvm {
26template <typename T> class ArrayRef;
27class SourceMgr;
28class Twine;
29
30namespace tgtok {
/// TokKind - The kinds of tokens known to the TableGen lexer.
///
/// The enumerators are grouped by category. Three of the groups are bracketed
/// by *_FIRST / *_LAST sentinels that alias the first and last member of the
/// group, so that membership can be tested with a simple range comparison
/// (see isObjectStart(), isBangOperator() and isStringValue()). New
/// enumerators must therefore be added inside the appropriate group, and the
/// corresponding *_LAST alias updated when appending at the end of a group.
enum TokKind {
  // Markers
  Eof,
  Error,

  // Tokens with no info.
  minus,     // -
  plus,      // +
  l_square,  // [
  r_square,  // ]
  l_brace,   // {
  r_brace,   // }
  l_paren,   // (
  r_paren,   // )
  less,      // <
  greater,   // >
  colon,     // :
  semi,      // ;
  comma,     // ,
  dot,       // .
  equal,     // =
  question,  // ?
  paste,     // #
  dotdotdot, // ...

  // Boolean literals.
  TrueVal,
  FalseVal,

  // Integer value.
  IntVal,

  // Binary constant.  Note that these are sized according to the number of
  // bits given.
  BinaryIntVal,

  // Preprocessing tokens for internal usage by the lexer.
  // They are never returned as a result of Lex().
  Ifdef,
  Ifndef,
  Else,
  Endif,
  Define,

  // Reserved keywords. ('ElseKW' is named to distinguish it from the
  // existing 'Else' that means the preprocessor #else.)
  Bit,
  Bits,
  Code,
  Dag,
  ElseKW,
  Field,
  In,
  Include,
  Int,
  List,
  String,
  Then,

  // Object start tokens.
  OBJECT_START_FIRST,
  Assert = OBJECT_START_FIRST,
  Class,
  Def,
  Defm,
  Defset,
  Deftype,
  Defvar,
  Dump,
  Foreach,
  If,
  Let,
  MultiClass,
  OBJECT_START_LAST = MultiClass,

  // Bang operators.
  BANG_OPERATOR_FIRST,
  XConcat = BANG_OPERATOR_FIRST,
  XADD,
  XSUB,
  XMUL,
  XDIV,
  XNOT,
  XLOG2,
  XAND,
  XOR,
  XXOR,
  XSRA,
  XSRL,
  XSHL,
  XListConcat,
  XListFlatten,
  XListSplat,
  XStrConcat,
  XInterleave,
  XSubstr,
  XFind,
  XMatch,
  XCast,
  XSubst,
  XForEach,
  XFilter,
  XFoldl,
  XHead,
  XTail,
  XSize,
  XEmpty,
  XInitialized,
  XInstances,
  XIf,
  XCond,
  XSwitch,
  XEq,
  XIsA,
  XDag,
  XNe,
  XLe,
  XLt,
  XGe,
  XGt,
  XSetDagOp,
  XGetDagOp,
  XSetDagOpName,
  XGetDagOpName,
  XExists,
  XListRemove,
  XToLower,
  XToUpper,
  XRange,
  XSort,
  XGetDagArg,
  XGetDagName,
  XSetDagArg,
  XSetDagName,
  XRepr,
  BANG_OPERATOR_LAST = XRepr,

  // String valued tokens.
  STRING_VALUE_FIRST,
  Id = STRING_VALUE_FIRST,
  StrVal,
  VarName,
  CodeFragment,
  STRING_VALUE_LAST = CodeFragment,
};
176
177/// isBangOperator - Return true if this is a bang operator.
178static inline bool isBangOperator(tgtok::TokKind Kind) {
179 return tgtok::BANG_OPERATOR_FIRST <= Kind && Kind <= BANG_OPERATOR_LAST;
180}
181
182/// isObjectStart - Return true if this is a valid first token for a statement.
183static inline bool isObjectStart(tgtok::TokKind Kind) {
184 return tgtok::OBJECT_START_FIRST <= Kind && Kind <= OBJECT_START_LAST;
185}
186
187/// isStringValue - Return true if this is a string value.
188static inline bool isStringValue(tgtok::TokKind Kind) {
189 return tgtok::STRING_VALUE_FIRST <= Kind && Kind <= STRING_VALUE_LAST;
190}
191} // namespace tgtok
192
193/// TGLexer - TableGen Lexer class.
194class TGLexer {
195 SourceMgr &SrcMgr;
196
197 const char *CurPtr = nullptr;
198 StringRef CurBuf;
199
200 // Information about the current token.
201 const char *TokStart = nullptr;
202 tgtok::TokKind CurCode = tgtok::TokKind::Eof;
203 std::string CurStrVal; // This is valid for Id, StrVal, VarName, CodeFragment
204 int64_t CurIntVal = 0; // This is valid for IntVal.
205
206 /// CurBuffer - This is the current buffer index we're lexing from as managed
207 /// by the SourceMgr object.
208 unsigned CurBuffer = 0;
209
210public:
211 typedef std::set<std::string> DependenciesSetTy;
212
213private:
214 /// Dependencies - This is the list of all included files.
215 DependenciesSetTy Dependencies;
216
217public:
218 TGLexer(SourceMgr &SrcMgr, ArrayRef<std::string> Macros);
219
220 tgtok::TokKind Lex() { return CurCode = LexToken(FileOrLineStart: CurPtr == CurBuf.begin()); }
221
222 const DependenciesSetTy &getDependencies() const { return Dependencies; }
223
224 tgtok::TokKind getCode() const { return CurCode; }
225
226 const std::string &getCurStrVal() const {
227 assert(tgtok::isStringValue(CurCode) &&
228 "This token doesn't have a string value");
229 return CurStrVal;
230 }
231 int64_t getCurIntVal() const {
232 assert(CurCode == tgtok::IntVal && "This token isn't an integer");
233 return CurIntVal;
234 }
235 std::pair<int64_t, unsigned> getCurBinaryIntVal() const {
236 assert(CurCode == tgtok::BinaryIntVal &&
237 "This token isn't a binary integer");
238 return {CurIntVal, (CurPtr - TokStart) - 2};
239 }
240
241 SMLoc getLoc() const;
242 SMRange getLocRange() const;
243
244private:
245 /// LexToken - Read the next token and return its code.
246 tgtok::TokKind LexToken(bool FileOrLineStart = false);
247
248 tgtok::TokKind ReturnError(SMLoc Loc, const Twine &Msg);
249 tgtok::TokKind ReturnError(const char *Loc, const Twine &Msg);
250
251 int getNextChar();
252 int peekNextChar(int Index) const;
253 void SkipBCPLComment();
254 bool SkipCComment();
255 tgtok::TokKind LexIdentifier();
256 bool LexInclude();
257 tgtok::TokKind LexString();
258 tgtok::TokKind LexVarName();
259 tgtok::TokKind LexNumber();
260 tgtok::TokKind LexBracket();
261 tgtok::TokKind LexExclaim();
262
263 // Process EOF encountered in LexToken().
264 // If EOF is met in an include file, then the method will update
265 // CurPtr, CurBuf and preprocessing include stack, and return true.
266 // If EOF is met in the top-level file, then the method will
267 // update and check the preprocessing include stack, and return false.
268 bool processEOF();
269
270 // *** Structures and methods for preprocessing support ***
271
272 // A set of macro names that are defined either via command line or
273 // by using:
274 // #define NAME
275 StringSet<> DefinedMacros;
276
277 // Each of #ifdef and #else directives has a descriptor associated
278 // with it.
279 //
280 // An ordered list of preprocessing controls defined by #ifdef/#else
281 // directives that are in effect currently is called preprocessing
282 // control stack. It is represented as a vector of PreprocessorControlDesc's.
283 //
284 // The control stack is updated according to the following rules:
285 //
286 // For each #ifdef we add an element to the control stack.
287 // For each #else we replace the top element with a descriptor
288 // with an inverted IsDefined value.
289 // For each #endif we pop the top element from the control stack.
290 //
291 // When CurPtr reaches the current buffer's end, the control stack
292 // must be empty, i.e. #ifdef and the corresponding #endif
293 // must be located in the same file.
294 struct PreprocessorControlDesc {
295 // Either tgtok::Ifdef or tgtok::Else.
296 tgtok::TokKind Kind;
297
298 // True, if the condition for this directive is true, false - otherwise.
299 // Examples:
300 // #ifdef NAME : true, if NAME is defined, false - otherwise.
301 // ...
302 // #else : false, if NAME is defined, true - otherwise.
303 bool IsDefined;
304
305 // Pointer into CurBuf to the beginning of the preprocessing directive
306 // word, e.g.:
307 // #ifdef NAME
308 // ^ - SrcPos
309 SMLoc SrcPos;
310 };
311
312 // We want to disallow code like this:
313 // file1.td:
314 // #define NAME
315 // #ifdef NAME
316 // include "file2.td"
317 // EOF
318 // file2.td:
319 // #endif
320 // EOF
321 //
322 // To do this, we clear the preprocessing control stack on entry
323 // to each of the included file. PrepIncludeStack is used to store
324 // preprocessing control stacks for the current file and all its
325 // parent files. The back() element is the preprocessing control
326 // stack for the current file.
327 SmallVector<SmallVector<PreprocessorControlDesc>> PrepIncludeStack;
328
329 // Validate that the current preprocessing control stack is empty,
330 // since we are about to exit a file, and pop the include stack.
331 //
332 // If IncludeStackMustBeEmpty is true, the include stack must be empty
333 // after the popping, otherwise, the include stack must not be empty
334 // after the popping. Basically, the include stack must be empty
335 // only if we exit the "top-level" file (i.e. finish lexing).
336 //
337 // The method returns false, if the current preprocessing control stack
338 // is not empty (e.g. there is an unterminated #ifdef/#else),
339 // true - otherwise.
340 bool prepExitInclude(bool IncludeStackMustBeEmpty);
341
342 // Look ahead for a preprocessing directive starting from CurPtr. The caller
343 // must only call this method, if *(CurPtr - 1) is '#'. If the method matches
344 // a preprocessing directive word followed by a whitespace, then it returns
345 // one of the internal token kinds, i.e. Ifdef, Else, Endif, Define.
346 //
347 // CurPtr is not adjusted by this method.
348 tgtok::TokKind prepIsDirective() const;
349
350 // Given a preprocessing token kind, adjusts CurPtr to the end
351 // of the preprocessing directive word.
352 //
353 // We use look-ahead prepIsDirective() and prepEatPreprocessorDirective()
354 // to avoid adjusting CurPtr before we are sure that '#' is followed
355 // by a preprocessing directive. If it is not, then we fall back to
356 // tgtok::paste interpretation of '#'.
357 void prepEatPreprocessorDirective(tgtok::TokKind Kind);
358
359 // The main "exit" point from the token parsing to preprocessor.
360 //
361 // The method is called for CurPtr, when prepIsDirective() returns
362 // true. The first parameter matches the result of prepIsDirective(),
363 // denoting the actual preprocessor directive to be processed.
364 //
365 // If the preprocessing directive disables the tokens processing, e.g.:
366 // #ifdef NAME // NAME is undefined
367 // then lexPreprocessor() enters the lines-skipping mode.
368 // In this mode, it does not parse any tokens, because the code under
369 // the #ifdef may not even be a correct tablegen code. The preprocessor
370 // looks for lines containing other preprocessing directives, which
371 // may be prepended with whitespaces and C-style comments. If the line
372 // does not contain a preprocessing directive, it is skipped completely.
373 // Otherwise, the preprocessing directive is processed by recursively
374 // calling lexPreprocessor(). The processing of the encountered
375 // preprocessing directives includes updating preprocessing control stack
376 // and adding new macros into DefinedMacros set.
377 //
378 // The second parameter controls whether lexPreprocessor() is called from
379 // LexToken() (true) or recursively from lexPreprocessor() (false).
380 //
381 // If ReturnNextLiveToken is true, the method returns the next
382 // LEX token following the current directive or following the end
383 // of the disabled preprocessing region corresponding to this directive.
384 // If ReturnNextLiveToken is false, the method returns the first parameter,
385 // unless there were errors encountered in the disabled preprocessing
386 // region - in this case, it returns tgtok::Error.
387 tgtok::TokKind lexPreprocessor(tgtok::TokKind Kind,
388 bool ReturnNextLiveToken = true);
389
390 // Worker method for lexPreprocessor() to skip lines after some
391 // preprocessing directive up to the buffer end or to the directive
392 // that re-enables token processing. The method returns true
393 // upon processing the next directive that re-enables tokens
394 // processing. False is returned if an error was encountered.
395 //
396 // Note that prepSkipRegion() calls lexPreprocessor() to process
397 // encountered preprocessing directives. In this case, the second
398 // parameter to lexPreprocessor() is set to false. Being passed
399 // false ReturnNextLiveToken, lexPreprocessor() must never call
400 // prepSkipRegion(). We assert this by passing ReturnNextLiveToken
401 // to prepSkipRegion() and checking that it is never set to false.
402 bool prepSkipRegion(bool MustNeverBeFalse);
403
404 // Lex name of the macro after either #ifdef or #define. We could have used
405 // LexIdentifier(), but it has special handling of "include" word, which
406 // could result in awkward diagnostic errors. Consider:
407 // ----
408 // #ifdef include
409 // class ...
410 // ----
411 // LexIdentifier() will engage LexInclude(), which will complain about
412 // missing file with name "class". Instead, prepLexMacroName() will treat
413 // "include" as a normal macro name.
414 //
415 // On entry, CurPtr points to the end of a preprocessing directive word.
416 // The method allows for whitespaces between the preprocessing directive
417 // and the macro name. The allowed whitespaces are ' ' and '\t'.
418 //
419 // If the first non-whitespace symbol after the preprocessing directive
420 // is a valid start symbol for an identifier (i.e. [a-zA-Z_]), then
421 // the method updates TokStart to the position of the first non-whitespace
422 // symbol, sets CurPtr to the position of the macro name's last symbol,
423 // and returns a string reference to the macro name. Otherwise,
424 // TokStart is set to the first non-whitespace symbol after the preprocessing
425 // directive, and the method returns an empty string reference.
426 //
427 // In all cases, TokStart may be used to point to the word following
428 // the preprocessing directive.
429 StringRef prepLexMacroName();
430
431 // Skip any whitespaces starting from CurPtr. The method is used
432 // only in the lines-skipping mode to find the first non-whitespace
433 // symbol after or at CurPtr. Allowed whitespaces are ' ', '\t', '\n'
434 // and '\r'. The method skips C-style comments as well, because
435 // it is used to find the beginning of the preprocessing directive.
436 // If we do not handle C-style comments the following code would
437 // result in incorrect detection of a preprocessing directive:
438 // /*
439 // #ifdef NAME
440 // */
441 // As long as we skip C-style comments, the following code is correctly
442 // recognized as a preprocessing directive:
443 // /* first line comment
444 // second line comment */ #ifdef NAME
445 //
446 // The method returns true upon reaching the first non-whitespace symbol
447 // or EOF, CurPtr is set to point to this symbol. The method returns false,
448 // if an error occurred during skipping of a C-style comment.
449 bool prepSkipLineBegin();
450
451 // Skip any whitespaces or comments after a preprocessing directive.
452 // The method returns true upon reaching either end of the line
453 // or end of the file. If there is a multiline C-style comment
454 // after the preprocessing directive, the method skips
455 // the comment, so the final CurPtr may point to one of the next lines.
456 // The method returns false, if an error occurred during skipping
457 // C- or C++-style comment, or a non-whitespace symbol appears
458 // after the preprocessing directive.
459 //
460 // The method maybe called both during lines-skipping and tokens
461 // processing. It actually verifies that only whitespaces or/and
462 // comments follow a preprocessing directive.
463 //
464 // After the execution of this mehod, CurPtr points either to new line
465 // symbol, buffer end or non-whitespace symbol following the preprocesing
466 // directive.
467 bool prepSkipDirectiveEnd();
468
469 // Return true, if the current preprocessor control stack is such that
470 // we should allow lexer to process the next token, false - otherwise.
471 //
472 // In particular, the method returns true, if all the #ifdef/#else
473 // controls on the stack have their IsDefined member set to true.
474 bool prepIsProcessingEnabled();
475
476 // Report an error, if we reach EOF with non-empty preprocessing control
477 // stack. This means there is no matching #endif for the previous
478 // #ifdef/#else.
479 void prepReportPreprocessorStackError();
480};
481
482} // end namespace llvm
483
484#endif
485