1 | //===- TGLexer.h - Lexer for TableGen Files ---------------------*- C++ -*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This class represents the Lexer for tablegen files. |
10 | // |
11 | //===----------------------------------------------------------------------===// |
12 | |
13 | #ifndef LLVM_LIB_TABLEGEN_TGLEXER_H |
14 | #define LLVM_LIB_TABLEGEN_TGLEXER_H |
15 | |
16 | #include "llvm/ADT/StringRef.h" |
17 | #include "llvm/ADT/StringSet.h" |
18 | #include "llvm/Support/DataTypes.h" |
19 | #include "llvm/Support/SMLoc.h" |
20 | #include <cassert> |
21 | #include <memory> |
22 | #include <set> |
23 | #include <string> |
24 | #include <vector> |
25 | |
26 | namespace llvm { |
27 | template <typename T> class ArrayRef; |
28 | class SourceMgr; |
29 | class Twine; |
30 | |
namespace tgtok {
/// Kinds of token the TableGen lexer works with. The enumerators are grouped
/// into contiguous ranges; the *_FIRST / *_LAST aliases delimit the ranges
/// that the predicate helpers below test against, so the declaration order
/// here is significant.
enum TokKind {
  // Marker tokens.
  Eof,
  Error,

  // Punctuation tokens; they carry no additional information.
  minus,     // -
  plus,      // +
  l_square,  // [
  r_square,  // ]
  l_brace,   // {
  r_brace,   // }
  l_paren,   // (
  r_paren,   // )
  less,      // <
  greater,   // >
  colon,     // :
  semi,      // ;
  comma,     // ,
  dot,       // .
  equal,     // =
  question,  // ?
  paste,     // #
  dotdotdot, // ...

  // Boolean literal tokens.
  TrueVal,
  FalseVal,

  // Integer literal token.
  IntVal,

  // Binary constant token. Such constants are sized according to the number
  // of bits that were written.
  BinaryIntVal,

  // Preprocessing tokens. These are used internally by the lexer only and
  // are never handed back as the result of Lex().
  Ifdef,
  Ifndef,
  Else,
  Endif,
  Define,

  // Reserved keywords. Note that 'ElseKW' carries the KW suffix so that it
  // does not clash with 'Else' above, which stands for the preprocessor
  // #else.
  Bit,
  Bits,
  Code,
  Dag,
  ElseKW,
  FalseKW,
  Field,
  In,
  Include,
  Int,
  List,
  String,
  Then,
  TrueKW,

  // Tokens that may start an object: [OBJECT_START_FIRST, OBJECT_START_LAST].
  OBJECT_START_FIRST,
  Assert = OBJECT_START_FIRST,
  Class,
  Def,
  Defm,
  Defset,
  Deftype,
  Defvar,
  Dump,
  Foreach,
  If,
  Let,
  MultiClass,
  OBJECT_START_LAST = MultiClass,

  // Bang operators: [BANG_OPERATOR_FIRST, BANG_OPERATOR_LAST].
  BANG_OPERATOR_FIRST,
  XConcat = BANG_OPERATOR_FIRST,
  XADD,
  XSUB,
  XMUL,
  XDIV,
  XNOT,
  XLOG2,
  XAND,
  XOR,
  XXOR,
  XSRA,
  XSRL,
  XSHL,
  XListConcat,
  XListSplat,
  XStrConcat,
  XInterleave,
  XSubstr,
  XFind,
  XCast,
  XSubst,
  XForEach,
  XFilter,
  XFoldl,
  XHead,
  XTail,
  XSize,
  XEmpty,
  XIf,
  XCond,
  XEq,
  XIsA,
  XDag,
  XNe,
  XLe,
  XLt,
  XGe,
  XGt,
  XSetDagOp,
  XGetDagOp,
  XExists,
  XListRemove,
  XToLower,
  XToUpper,
  XRange,
  XGetDagArg,
  XGetDagName,
  XSetDagArg,
  XSetDagName,
  XRepr,
  BANG_OPERATOR_LAST = XRepr,

  // Tokens that carry a string value:
  // [STRING_VALUE_FIRST, STRING_VALUE_LAST].
  STRING_VALUE_FIRST,
  Id = STRING_VALUE_FIRST,
  StrVal,
  VarName,
  CodeFragment,
  STRING_VALUE_LAST = CodeFragment,
};

/// isBangOperator - Return true if \p Kind lies within the bang operator
/// range of TokKind.
static inline bool isBangOperator(TokKind Kind) {
  return Kind >= BANG_OPERATOR_FIRST && Kind <= BANG_OPERATOR_LAST;
}

/// isObjectStart - Return true if \p Kind is a valid first token for a
/// statement, i.e. lies within the object start range of TokKind.
static inline bool isObjectStart(TokKind Kind) {
  return Kind >= OBJECT_START_FIRST && Kind <= OBJECT_START_LAST;
}

/// isStringValue - Return true if \p Kind is a token that has a string value,
/// i.e. lies within the string value range of TokKind.
static inline bool isStringValue(TokKind Kind) {
  return Kind >= STRING_VALUE_FIRST && Kind <= STRING_VALUE_LAST;
}
} // namespace tgtok
187 | |
188 | /// TGLexer - TableGen Lexer class. |
189 | class TGLexer { |
190 | SourceMgr &SrcMgr; |
191 | |
192 | const char *CurPtr = nullptr; |
193 | StringRef CurBuf; |
194 | |
195 | // Information about the current token. |
196 | const char *TokStart = nullptr; |
197 | tgtok::TokKind CurCode = tgtok::TokKind::Eof; |
198 | std::string CurStrVal; // This is valid for Id, StrVal, VarName, CodeFragment |
199 | int64_t CurIntVal = 0; // This is valid for IntVal. |
200 | |
201 | /// CurBuffer - This is the current buffer index we're lexing from as managed |
202 | /// by the SourceMgr object. |
203 | unsigned CurBuffer = 0; |
204 | |
205 | public: |
206 | typedef std::set<std::string> DependenciesSetTy; |
207 | |
208 | private: |
209 | /// Dependencies - This is the list of all included files. |
210 | DependenciesSetTy Dependencies; |
211 | |
212 | public: |
213 | TGLexer(SourceMgr &SrcMgr, ArrayRef<std::string> Macros); |
214 | |
215 | tgtok::TokKind Lex() { |
216 | return CurCode = LexToken(FileOrLineStart: CurPtr == CurBuf.begin()); |
217 | } |
218 | |
219 | const DependenciesSetTy &getDependencies() const { |
220 | return Dependencies; |
221 | } |
222 | |
223 | tgtok::TokKind getCode() const { return CurCode; } |
224 | |
225 | const std::string &getCurStrVal() const { |
226 | assert(tgtok::isStringValue(CurCode) && |
227 | "This token doesn't have a string value" ); |
228 | return CurStrVal; |
229 | } |
230 | int64_t getCurIntVal() const { |
231 | assert(CurCode == tgtok::IntVal && "This token isn't an integer" ); |
232 | return CurIntVal; |
233 | } |
234 | std::pair<int64_t, unsigned> getCurBinaryIntVal() const { |
235 | assert(CurCode == tgtok::BinaryIntVal && |
236 | "This token isn't a binary integer" ); |
237 | return std::make_pair(x: CurIntVal, y: (CurPtr - TokStart)-2); |
238 | } |
239 | |
240 | SMLoc getLoc() const; |
241 | SMRange getLocRange() const; |
242 | |
243 | private: |
244 | /// LexToken - Read the next token and return its code. |
245 | tgtok::TokKind LexToken(bool FileOrLineStart = false); |
246 | |
247 | tgtok::TokKind ReturnError(SMLoc Loc, const Twine &Msg); |
248 | tgtok::TokKind ReturnError(const char *Loc, const Twine &Msg); |
249 | |
250 | int getNextChar(); |
251 | int peekNextChar(int Index) const; |
252 | void (); |
253 | bool (); |
254 | tgtok::TokKind LexIdentifier(); |
255 | bool LexInclude(); |
256 | tgtok::TokKind LexString(); |
257 | tgtok::TokKind LexVarName(); |
258 | tgtok::TokKind LexNumber(); |
259 | tgtok::TokKind LexBracket(); |
260 | tgtok::TokKind LexExclaim(); |
261 | |
262 | // Process EOF encountered in LexToken(). |
263 | // If EOF is met in an include file, then the method will update |
264 | // CurPtr, CurBuf and preprocessing include stack, and return true. |
265 | // If EOF is met in the top-level file, then the method will |
266 | // update and check the preprocessing include stack, and return false. |
267 | bool processEOF(); |
268 | |
269 | // *** Structures and methods for preprocessing support *** |
270 | |
271 | // A set of macro names that are defined either via command line or |
272 | // by using: |
273 | // #define NAME |
274 | StringSet<> DefinedMacros; |
275 | |
276 | // Each of #ifdef and #else directives has a descriptor associated |
277 | // with it. |
278 | // |
279 | // An ordered list of preprocessing controls defined by #ifdef/#else |
280 | // directives that are in effect currently is called preprocessing |
281 | // control stack. It is represented as a vector of PreprocessorControlDesc's. |
282 | // |
283 | // The control stack is updated according to the following rules: |
284 | // |
285 | // For each #ifdef we add an element to the control stack. |
286 | // For each #else we replace the top element with a descriptor |
287 | // with an inverted IsDefined value. |
288 | // For each #endif we pop the top element from the control stack. |
289 | // |
290 | // When CurPtr reaches the current buffer's end, the control stack |
291 | // must be empty, i.e. #ifdef and the corresponding #endif |
292 | // must be located in the same file. |
293 | struct PreprocessorControlDesc { |
294 | // Either tgtok::Ifdef or tgtok::Else. |
295 | tgtok::TokKind Kind; |
296 | |
297 | // True, if the condition for this directive is true, false - otherwise. |
298 | // Examples: |
299 | // #ifdef NAME : true, if NAME is defined, false - otherwise. |
300 | // ... |
301 | // #else : false, if NAME is defined, true - otherwise. |
302 | bool IsDefined; |
303 | |
304 | // Pointer into CurBuf to the beginning of the preprocessing directive |
305 | // word, e.g.: |
306 | // #ifdef NAME |
307 | // ^ - SrcPos |
308 | SMLoc SrcPos; |
309 | }; |
310 | |
311 | // We want to disallow code like this: |
312 | // file1.td: |
313 | // #define NAME |
314 | // #ifdef NAME |
315 | // include "file2.td" |
316 | // EOF |
317 | // file2.td: |
318 | // #endif |
319 | // EOF |
320 | // |
321 | // To do this, we clear the preprocessing control stack on entry |
322 | // to each of the included file. PrepIncludeStack is used to store |
323 | // preprocessing control stacks for the current file and all its |
324 | // parent files. The back() element is the preprocessing control |
325 | // stack for the current file. |
326 | std::vector<std::unique_ptr<std::vector<PreprocessorControlDesc>>> |
327 | PrepIncludeStack; |
328 | |
329 | // Validate that the current preprocessing control stack is empty, |
330 | // since we are about to exit a file, and pop the include stack. |
331 | // |
332 | // If IncludeStackMustBeEmpty is true, the include stack must be empty |
333 | // after the popping, otherwise, the include stack must not be empty |
334 | // after the popping. Basically, the include stack must be empty |
335 | // only if we exit the "top-level" file (i.e. finish lexing). |
336 | // |
337 | // The method returns false, if the current preprocessing control stack |
338 | // is not empty (e.g. there is an unterminated #ifdef/#else), |
339 | // true - otherwise. |
340 | bool prepExitInclude(bool IncludeStackMustBeEmpty); |
341 | |
342 | // Look ahead for a preprocessing directive starting from CurPtr. The caller |
343 | // must only call this method, if *(CurPtr - 1) is '#'. If the method matches |
344 | // a preprocessing directive word followed by a whitespace, then it returns |
345 | // one of the internal token kinds, i.e. Ifdef, Else, Endif, Define. |
346 | // |
347 | // CurPtr is not adjusted by this method. |
348 | tgtok::TokKind prepIsDirective() const; |
349 | |
350 | // Given a preprocessing token kind, adjusts CurPtr to the end |
351 | // of the preprocessing directive word. Returns true, unless |
352 | // an unsupported token kind is passed in. |
353 | // |
354 | // We use look-ahead prepIsDirective() and prepEatPreprocessorDirective() |
355 | // to avoid adjusting CurPtr before we are sure that '#' is followed |
356 | // by a preprocessing directive. If it is not, then we fall back to |
357 | // tgtok::paste interpretation of '#'. |
358 | bool prepEatPreprocessorDirective(tgtok::TokKind Kind); |
359 | |
360 | // The main "exit" point from the token parsing to preprocessor. |
361 | // |
362 | // The method is called for CurPtr, when prepIsDirective() returns |
363 | // true. The first parameter matches the result of prepIsDirective(), |
364 | // denoting the actual preprocessor directive to be processed. |
365 | // |
366 | // If the preprocessing directive disables the tokens processing, e.g.: |
367 | // #ifdef NAME // NAME is undefined |
368 | // then lexPreprocessor() enters the lines-skipping mode. |
369 | // In this mode, it does not parse any tokens, because the code under |
370 | // the #ifdef may not even be a correct tablegen code. The preprocessor |
371 | // looks for lines containing other preprocessing directives, which |
372 | // may be prepended with whitespaces and C-style comments. If the line |
373 | // does not contain a preprocessing directive, it is skipped completely. |
374 | // Otherwise, the preprocessing directive is processed by recursively |
375 | // calling lexPreprocessor(). The processing of the encountered |
376 | // preprocessing directives includes updating preprocessing control stack |
377 | // and adding new macros into DefinedMacros set. |
378 | // |
379 | // The second parameter controls whether lexPreprocessor() is called from |
380 | // LexToken() (true) or recursively from lexPreprocessor() (false). |
381 | // |
382 | // If ReturnNextLiveToken is true, the method returns the next |
383 | // LEX token following the current directive or following the end |
384 | // of the disabled preprocessing region corresponding to this directive. |
385 | // If ReturnNextLiveToken is false, the method returns the first parameter, |
386 | // unless there were errors encountered in the disabled preprocessing |
387 | // region - in this case, it returns tgtok::Error. |
388 | tgtok::TokKind lexPreprocessor(tgtok::TokKind Kind, |
389 | bool ReturnNextLiveToken = true); |
390 | |
391 | // Worker method for lexPreprocessor() to skip lines after some |
392 | // preprocessing directive up to the buffer end or to the directive |
393 | // that re-enables token processing. The method returns true |
394 | // upon processing the next directive that re-enables tokens |
395 | // processing. False is returned if an error was encountered. |
396 | // |
397 | // Note that prepSkipRegion() calls lexPreprocessor() to process |
398 | // encountered preprocessing directives. In this case, the second |
399 | // parameter to lexPreprocessor() is set to false. Being passed |
400 | // false ReturnNextLiveToken, lexPreprocessor() must never call |
401 | // prepSkipRegion(). We assert this by passing ReturnNextLiveToken |
402 | // to prepSkipRegion() and checking that it is never set to false. |
403 | bool prepSkipRegion(bool MustNeverBeFalse); |
404 | |
405 | // Lex name of the macro after either #ifdef or #define. We could have used |
406 | // LexIdentifier(), but it has special handling of "include" word, which |
407 | // could result in awkward diagnostic errors. Consider: |
408 | // ---- |
409 | // #ifdef include |
410 | // class ... |
411 | // ---- |
412 | // LexIdentifier() will engage LexInclude(), which will complain about |
413 | // missing file with name "class". Instead, prepLexMacroName() will treat |
414 | // "include" as a normal macro name. |
415 | // |
416 | // On entry, CurPtr points to the end of a preprocessing directive word. |
417 | // The method allows for whitespaces between the preprocessing directive |
418 | // and the macro name. The allowed whitespaces are ' ' and '\t'. |
419 | // |
420 | // If the first non-whitespace symbol after the preprocessing directive |
421 | // is a valid start symbol for an identifier (i.e. [a-zA-Z_]), then |
422 | // the method updates TokStart to the position of the first non-whitespace |
423 | // symbol, sets CurPtr to the position of the macro name's last symbol, |
424 | // and returns a string reference to the macro name. Otherwise, |
425 | // TokStart is set to the first non-whitespace symbol after the preprocessing |
426 | // directive, and the method returns an empty string reference. |
427 | // |
428 | // In all cases, TokStart may be used to point to the word following |
429 | // the preprocessing directive. |
430 | StringRef prepLexMacroName(); |
431 | |
432 | // Skip any whitespaces starting from CurPtr. The method is used |
433 | // only in the lines-skipping mode to find the first non-whitespace |
434 | // symbol after or at CurPtr. Allowed whitespaces are ' ', '\t', '\n' |
435 | // and '\r'. The method skips C-style comments as well, because |
436 | // it is used to find the beginning of the preprocessing directive. |
437 | // If we do not handle C-style comments the following code would |
438 | // result in incorrect detection of a preprocessing directive: |
439 | // /* |
440 | // #ifdef NAME |
441 | // */ |
442 | // As long as we skip C-style comments, the following code is correctly |
443 | // recognized as a preprocessing directive: |
444 | // /* first line comment |
445 | // second line comment */ #ifdef NAME |
446 | // |
447 | // The method returns true upon reaching the first non-whitespace symbol |
448 | // or EOF, CurPtr is set to point to this symbol. The method returns false, |
449 | // if an error occurred during skipping of a C-style comment. |
450 | bool prepSkipLineBegin(); |
451 | |
452 | // Skip any whitespaces or comments after a preprocessing directive. |
453 | // The method returns true upon reaching either end of the line |
454 | // or end of the file. If there is a multiline C-style comment |
455 | // after the preprocessing directive, the method skips |
456 | // the comment, so the final CurPtr may point to one of the next lines. |
457 | // The method returns false, if an error occurred during skipping |
458 | // C- or C++-style comment, or a non-whitespace symbol appears |
459 | // after the preprocessing directive. |
460 | // |
461 | // The method maybe called both during lines-skipping and tokens |
462 | // processing. It actually verifies that only whitespaces or/and |
463 | // comments follow a preprocessing directive. |
464 | // |
465 | // After the execution of this mehod, CurPtr points either to new line |
466 | // symbol, buffer end or non-whitespace symbol following the preprocesing |
467 | // directive. |
468 | bool prepSkipDirectiveEnd(); |
469 | |
470 | // Return true, if the current preprocessor control stack is such that |
471 | // we should allow lexer to process the next token, false - otherwise. |
472 | // |
473 | // In particular, the method returns true, if all the #ifdef/#else |
474 | // controls on the stack have their IsDefined member set to true. |
475 | bool prepIsProcessingEnabled(); |
476 | |
477 | // Report an error, if we reach EOF with non-empty preprocessing control |
478 | // stack. This means there is no matching #endif for the previous |
479 | // #ifdef/#else. |
480 | void prepReportPreprocessorStackError(); |
481 | }; |
482 | |
483 | } // end namespace llvm |
484 | |
485 | #endif |
486 | |