1 | //===- TGLexer.cpp - Lexer for TableGen -----------------------------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // Implement the Lexer for TableGen. |
10 | // |
11 | //===----------------------------------------------------------------------===// |
12 | |
13 | #include "TGLexer.h" |
14 | #include "llvm/ADT/ArrayRef.h" |
15 | #include "llvm/ADT/StringExtras.h" |
16 | #include "llvm/ADT/StringSwitch.h" |
17 | #include "llvm/ADT/Twine.h" |
18 | #include "llvm/Config/config.h" // for strtoull()/strtoll() define |
19 | #include "llvm/Support/Compiler.h" |
20 | #include "llvm/Support/MemoryBuffer.h" |
21 | #include "llvm/Support/SourceMgr.h" |
22 | #include "llvm/TableGen/Error.h" |
23 | #include <cerrno> |
24 | #include <cstdio> |
25 | #include <cstdlib> |
26 | #include <cstring> |
27 | |
28 | using namespace llvm; |
29 | |
30 | namespace { |
31 | // One supported preprocessing directive: `Kind` is its internal token |
32 | // kind and `Word` is the directive name as it is matched after the '#'. |
33 | struct PreprocessorDir { |
34 | tgtok::TokKind Kind; |
35 | StringRef Word; |
36 | }; |
37 | } // end anonymous namespace |
38 | |
39 | /// Returns true if `C` is a valid character in an identifier. If `First` is |
40 | /// true, returns true if `C` is a valid first character of an identifier, |
41 | /// else returns true if `C` is a valid non-first character of an identifier. |
42 | /// Identifiers match the following regular expression: |
43 | /// [a-zA-Z_][0-9a-zA-Z_]* |
44 | static bool isValidIDChar(char C, bool First) { |
45 | if (C == '_' || isAlpha(C)) |
46 | return true; |
47 | return !First && isDigit(C); |
48 | } |
49 | |
50 | constexpr PreprocessorDir PreprocessorDirs[] = {{.Kind: tgtok::Ifdef, .Word: "ifdef" }, |
51 | {.Kind: tgtok::Ifndef, .Word: "ifndef" }, |
52 | {.Kind: tgtok::Else, .Word: "else" }, |
53 | {.Kind: tgtok::Endif, .Word: "endif" }, |
54 | {.Kind: tgtok::Define, .Word: "define" }}; |
55 | |
56 | // Returns a pointer past the end of a valid macro name at the start of `Str`. |
57 | // Valid macro names match the regular expression [a-zA-Z_][0-9a-zA-Z_]*. |
58 | static const char *lexMacroName(StringRef Str) { |
59 | assert(!Str.empty()); |
60 | |
61 | // Macro names start with [a-zA-Z_]. |
62 | const char *Next = Str.begin(); |
63 | if (!isValidIDChar(C: *Next, /*First=*/true)) |
64 | return Next; |
65 | // Eat the first character of the name. |
66 | ++Next; |
67 | |
68 | // Match the rest of the identifier regex: [0-9a-zA-Z_]* |
69 | const char *End = Str.end(); |
70 | while (Next != End && isValidIDChar(C: *Next, /*First=*/false)) |
71 | ++Next; |
72 | return Next; |
73 | } |
74 | |
75 | TGLexer::TGLexer(SourceMgr &SM, ArrayRef<std::string> Macros) : SrcMgr(SM) { |
76 | CurBuffer = SrcMgr.getMainFileID(); |
77 | CurBuf = SrcMgr.getMemoryBuffer(i: CurBuffer)->getBuffer(); |
78 | CurPtr = CurBuf.begin(); |
79 | TokStart = nullptr; |
80 | |
81 | // Pretend that we enter the "top-level" include file. |
82 | PrepIncludeStack.emplace_back(); |
83 | |
84 | // Add all macros defined on the command line to the DefinedMacros set. |
85 | // Check invalid macro names and print fatal error if we find one. |
86 | for (StringRef MacroName : Macros) { |
87 | const char *End = lexMacroName(Str: MacroName); |
88 | if (End != MacroName.end()) |
89 | PrintFatalError(Msg: "invalid macro name `" + MacroName + |
90 | "` specified on command line" ); |
91 | |
92 | DefinedMacros.insert(key: MacroName); |
93 | } |
94 | } |
95 | |
96 | SMLoc TGLexer::getLoc() const { |
97 | return SMLoc::getFromPointer(Ptr: TokStart); |
98 | } |
99 | |
100 | SMRange TGLexer::getLocRange() const { |
101 | return {getLoc(), SMLoc::getFromPointer(Ptr: CurPtr)}; |
102 | } |
103 | |
104 | /// ReturnError - Set the error to the specified string at the specified |
105 | /// location. This is defined to always return tgtok::Error. |
106 | tgtok::TokKind TGLexer::ReturnError(SMLoc Loc, const Twine &Msg) { |
107 | PrintError(ErrorLoc: Loc, Msg); |
108 | return tgtok::Error; |
109 | } |
110 | |
111 | tgtok::TokKind TGLexer::ReturnError(const char *Loc, const Twine &Msg) { |
112 | return ReturnError(Loc: SMLoc::getFromPointer(Ptr: Loc), Msg); |
113 | } |
114 | |
115 | bool TGLexer::processEOF() { |
116 | SMLoc ParentIncludeLoc = SrcMgr.getParentIncludeLoc(i: CurBuffer); |
117 | if (ParentIncludeLoc != SMLoc()) { |
118 | // If prepExitInclude() detects a problem with the preprocessing |
119 | // control stack, it will return false. Pretend that we reached |
120 | // the final EOF and stop lexing more tokens by returning false |
121 | // to LexToken(). |
122 | if (!prepExitInclude(IncludeStackMustBeEmpty: false)) |
123 | return false; |
124 | |
125 | CurBuffer = SrcMgr.FindBufferContainingLoc(Loc: ParentIncludeLoc); |
126 | CurBuf = SrcMgr.getMemoryBuffer(i: CurBuffer)->getBuffer(); |
127 | CurPtr = ParentIncludeLoc.getPointer(); |
128 | // Make sure TokStart points into the parent file's buffer. |
129 | // LexToken() assigns to it before calling getNextChar(), |
130 | // so it is pointing into the included file now. |
131 | TokStart = CurPtr; |
132 | return true; |
133 | } |
134 | |
135 | // Pretend that we exit the "top-level" include file. |
136 | // Note that in case of an error (e.g. control stack imbalance) |
137 | // the routine will issue a fatal error. |
138 | prepExitInclude(IncludeStackMustBeEmpty: true); |
139 | return false; |
140 | } |
141 | |
142 | int TGLexer::getNextChar() { |
143 | char CurChar = *CurPtr++; |
144 | switch (CurChar) { |
145 | default: |
146 | return (unsigned char)CurChar; |
147 | |
148 | case 0: { |
149 | // A NUL character in the stream is either the end of the current buffer or |
150 | // a spurious NUL in the file. Disambiguate that here. |
151 | if (CurPtr - 1 == CurBuf.end()) { |
152 | --CurPtr; // Arrange for another call to return EOF again. |
153 | return EOF; |
154 | } |
155 | PrintError(ErrorLoc: getLoc(), |
156 | Msg: "NUL character is invalid in source; treated as space" ); |
157 | return ' '; |
158 | } |
159 | |
160 | case '\n': |
161 | case '\r': |
162 | // Handle the newline character by ignoring it and incrementing the line |
163 | // count. However, be careful about 'dos style' files with \n\r in them. |
164 | // Only treat a \n\r or \r\n as a single line. |
165 | if ((*CurPtr == '\n' || (*CurPtr == '\r')) && |
166 | *CurPtr != CurChar) |
167 | ++CurPtr; // Eat the two char newline sequence. |
168 | return '\n'; |
169 | } |
170 | } |
171 | |
172 | int TGLexer::peekNextChar(int Index) const { |
173 | return *(CurPtr + Index); |
174 | } |
175 | |
176 | tgtok::TokKind TGLexer::LexToken(bool FileOrLineStart) { |
177 | TokStart = CurPtr; |
178 | // This always consumes at least one character. |
179 | int CurChar = getNextChar(); |
180 | |
181 | switch (CurChar) { |
182 | default: |
183 | // Handle letters: [a-zA-Z_] |
184 | if (isValidIDChar(C: CurChar, /*First=*/true)) |
185 | return LexIdentifier(); |
186 | |
187 | // Unknown character, emit an error. |
188 | return ReturnError(Loc: TokStart, Msg: "unexpected character" ); |
189 | case EOF: |
190 | // Lex next token, if we just left an include file. |
191 | // Note that leaving an include file means that the next |
192 | // symbol is located at the end of the 'include "..."' |
193 | // construct, so LexToken() is called with default |
194 | // false parameter. |
195 | if (processEOF()) |
196 | return LexToken(); |
197 | |
198 | // Return EOF denoting the end of lexing. |
199 | return tgtok::Eof; |
200 | |
201 | case ':': return tgtok::colon; |
202 | case ';': return tgtok::semi; |
203 | case ',': return tgtok::comma; |
204 | case '<': return tgtok::less; |
205 | case '>': return tgtok::greater; |
206 | case ']': return tgtok::r_square; |
207 | case '{': return tgtok::l_brace; |
208 | case '}': return tgtok::r_brace; |
209 | case '(': return tgtok::l_paren; |
210 | case ')': return tgtok::r_paren; |
211 | case '=': return tgtok::equal; |
212 | case '?': return tgtok::question; |
213 | case '#': |
214 | if (FileOrLineStart) { |
215 | tgtok::TokKind Kind = prepIsDirective(); |
216 | if (Kind != tgtok::Error) |
217 | return lexPreprocessor(Kind); |
218 | } |
219 | |
220 | return tgtok::paste; |
221 | |
222 | // The period is a separate case so we can recognize the "..." |
223 | // range punctuator. |
224 | case '.': |
225 | if (peekNextChar(Index: 0) == '.') { |
226 | ++CurPtr; // Eat second dot. |
227 | if (peekNextChar(Index: 0) == '.') { |
228 | ++CurPtr; // Eat third dot. |
229 | return tgtok::dotdotdot; |
230 | } |
231 | return ReturnError(Loc: TokStart, Msg: "invalid '..' punctuation" ); |
232 | } |
233 | return tgtok::dot; |
234 | |
235 | case '\r': |
236 | llvm_unreachable("getNextChar() must never return '\r'" ); |
237 | |
238 | case ' ': |
239 | case '\t': |
240 | // Ignore whitespace. |
241 | return LexToken(FileOrLineStart); |
242 | case '\n': |
243 | // Ignore whitespace, and identify the new line. |
244 | return LexToken(FileOrLineStart: true); |
245 | case '/': |
246 | // If this is the start of a // comment, skip until the end of the line or |
247 | // the end of the buffer. |
248 | if (*CurPtr == '/') |
249 | SkipBCPLComment(); |
250 | else if (*CurPtr == '*') { |
251 | if (SkipCComment()) |
252 | return tgtok::Error; |
253 | } else // Otherwise, this is an error. |
254 | return ReturnError(Loc: TokStart, Msg: "unexpected character" ); |
255 | return LexToken(FileOrLineStart); |
256 | case '-': case '+': |
257 | case '0': case '1': case '2': case '3': case '4': case '5': case '6': |
258 | case '7': case '8': case '9': { |
259 | int NextChar = 0; |
260 | if (isDigit(C: CurChar)) { |
261 | // Allow identifiers to start with a number if it is followed by |
262 | // an identifier. This can happen with paste operations like |
263 | // foo#8i. |
264 | int i = 0; |
265 | do { |
266 | NextChar = peekNextChar(Index: i++); |
267 | } while (isDigit(C: NextChar)); |
268 | |
269 | if (NextChar == 'x' || NextChar == 'b') { |
270 | // If this is [0-9]b[01] or [0-9]x[0-9A-fa-f] this is most |
271 | // likely a number. |
272 | int NextNextChar = peekNextChar(Index: i); |
273 | switch (NextNextChar) { |
274 | default: |
275 | break; |
276 | case '0': case '1': |
277 | if (NextChar == 'b') |
278 | return LexNumber(); |
279 | [[fallthrough]]; |
280 | case '2': case '3': case '4': case '5': |
281 | case '6': case '7': case '8': case '9': |
282 | case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': |
283 | case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': |
284 | if (NextChar == 'x') |
285 | return LexNumber(); |
286 | break; |
287 | } |
288 | } |
289 | } |
290 | |
291 | if (isValidIDChar(C: NextChar, /*First=*/true)) |
292 | return LexIdentifier(); |
293 | |
294 | return LexNumber(); |
295 | } |
296 | case '"': return LexString(); |
297 | case '$': return LexVarName(); |
298 | case '[': return LexBracket(); |
299 | case '!': return LexExclaim(); |
300 | } |
301 | } |
302 | |
303 | /// LexString - Lex "[^"]*" |
304 | tgtok::TokKind TGLexer::LexString() { |
305 | const char *StrStart = CurPtr; |
306 | |
307 | CurStrVal = "" ; |
308 | |
309 | while (*CurPtr != '"') { |
310 | // If we hit the end of the buffer, report an error. |
311 | if (*CurPtr == 0 && CurPtr == CurBuf.end()) |
312 | return ReturnError(Loc: StrStart, Msg: "end of file in string literal" ); |
313 | |
314 | if (*CurPtr == '\n' || *CurPtr == '\r') |
315 | return ReturnError(Loc: StrStart, Msg: "end of line in string literal" ); |
316 | |
317 | if (*CurPtr != '\\') { |
318 | CurStrVal += *CurPtr++; |
319 | continue; |
320 | } |
321 | |
322 | ++CurPtr; |
323 | |
324 | switch (*CurPtr) { |
325 | case '\\': case '\'': case '"': |
326 | // These turn into their literal character. |
327 | CurStrVal += *CurPtr++; |
328 | break; |
329 | case 't': |
330 | CurStrVal += '\t'; |
331 | ++CurPtr; |
332 | break; |
333 | case 'n': |
334 | CurStrVal += '\n'; |
335 | ++CurPtr; |
336 | break; |
337 | |
338 | case '\n': |
339 | case '\r': |
340 | return ReturnError(Loc: CurPtr, Msg: "escaped newlines not supported in tblgen" ); |
341 | |
342 | // If we hit the end of the buffer, report an error. |
343 | case '\0': |
344 | if (CurPtr == CurBuf.end()) |
345 | return ReturnError(Loc: StrStart, Msg: "end of file in string literal" ); |
346 | [[fallthrough]]; |
347 | default: |
348 | return ReturnError(Loc: CurPtr, Msg: "invalid escape in string literal" ); |
349 | } |
350 | } |
351 | |
352 | ++CurPtr; |
353 | return tgtok::StrVal; |
354 | } |
355 | |
356 | tgtok::TokKind TGLexer::LexVarName() { |
357 | if (!isValidIDChar(C: CurPtr[0], /*First=*/true)) |
358 | return ReturnError(Loc: TokStart, Msg: "invalid variable name" ); |
359 | |
360 | // Otherwise, we're ok, consume the rest of the characters. |
361 | const char *VarNameStart = CurPtr++; |
362 | |
363 | while (isValidIDChar(C: *CurPtr, /*First=*/false)) |
364 | ++CurPtr; |
365 | |
366 | CurStrVal.assign(first: VarNameStart, last: CurPtr); |
367 | return tgtok::VarName; |
368 | } |
369 | |
370 | tgtok::TokKind TGLexer::LexIdentifier() { |
371 | // The first letter is [a-zA-Z_]. |
372 | const char *IdentStart = TokStart; |
373 | |
374 | // Match the rest of the identifier regex: [0-9a-zA-Z_]* |
375 | while (isValidIDChar(C: *CurPtr, /*First=*/false)) |
376 | ++CurPtr; |
377 | |
378 | // Check to see if this identifier is a reserved keyword. |
379 | StringRef Str(IdentStart, CurPtr-IdentStart); |
380 | |
381 | tgtok::TokKind Kind = StringSwitch<tgtok::TokKind>(Str) |
382 | .Case(S: "int" , Value: tgtok::Int) |
383 | .Case(S: "bit" , Value: tgtok::Bit) |
384 | .Case(S: "bits" , Value: tgtok::Bits) |
385 | .Case(S: "string" , Value: tgtok::String) |
386 | .Case(S: "list" , Value: tgtok::List) |
387 | .Case(S: "code" , Value: tgtok::Code) |
388 | .Case(S: "dag" , Value: tgtok::Dag) |
389 | .Case(S: "class" , Value: tgtok::Class) |
390 | .Case(S: "def" , Value: tgtok::Def) |
391 | .Case(S: "true" , Value: tgtok::TrueVal) |
392 | .Case(S: "false" , Value: tgtok::FalseVal) |
393 | .Case(S: "foreach" , Value: tgtok::Foreach) |
394 | .Case(S: "defm" , Value: tgtok::Defm) |
395 | .Case(S: "defset" , Value: tgtok::Defset) |
396 | .Case(S: "deftype" , Value: tgtok::Deftype) |
397 | .Case(S: "multiclass" , Value: tgtok::MultiClass) |
398 | .Case(S: "field" , Value: tgtok::Field) |
399 | .Case(S: "let" , Value: tgtok::Let) |
400 | .Case(S: "in" , Value: tgtok::In) |
401 | .Case(S: "defvar" , Value: tgtok::Defvar) |
402 | .Case(S: "include" , Value: tgtok::Include) |
403 | .Case(S: "if" , Value: tgtok::If) |
404 | .Case(S: "then" , Value: tgtok::Then) |
405 | .Case(S: "else" , Value: tgtok::ElseKW) |
406 | .Case(S: "assert" , Value: tgtok::Assert) |
407 | .Case(S: "dump" , Value: tgtok::Dump) |
408 | .Default(Value: tgtok::Id); |
409 | |
410 | // A couple of tokens require special processing. |
411 | switch (Kind) { |
412 | case tgtok::Include: |
413 | if (LexInclude()) return tgtok::Error; |
414 | return Lex(); |
415 | case tgtok::Id: |
416 | CurStrVal.assign(first: Str.begin(), last: Str.end()); |
417 | break; |
418 | default: |
419 | break; |
420 | } |
421 | |
422 | return Kind; |
423 | } |
424 | |
425 | /// LexInclude - We just read the "include" token. Get the string token that |
426 | /// comes next and enter the include. |
427 | bool TGLexer::LexInclude() { |
428 | // The token after the include must be a string. |
429 | tgtok::TokKind Tok = LexToken(); |
430 | if (Tok == tgtok::Error) return true; |
431 | if (Tok != tgtok::StrVal) { |
432 | PrintError(ErrorLoc: getLoc(), Msg: "expected filename after include" ); |
433 | return true; |
434 | } |
435 | |
436 | // Get the string. |
437 | std::string Filename = CurStrVal; |
438 | std::string IncludedFile; |
439 | |
440 | CurBuffer = SrcMgr.AddIncludeFile(Filename, IncludeLoc: SMLoc::getFromPointer(Ptr: CurPtr), |
441 | IncludedFile); |
442 | if (!CurBuffer) { |
443 | PrintError(ErrorLoc: getLoc(), Msg: "could not find include file '" + Filename + "'" ); |
444 | return true; |
445 | } |
446 | |
447 | Dependencies.insert(x: IncludedFile); |
448 | // Save the line number and lex buffer of the includer. |
449 | CurBuf = SrcMgr.getMemoryBuffer(i: CurBuffer)->getBuffer(); |
450 | CurPtr = CurBuf.begin(); |
451 | |
452 | PrepIncludeStack.emplace_back(); |
453 | return false; |
454 | } |
455 | |
456 | /// SkipBCPLComment - Skip over the comment by finding the next CR or LF. |
457 | /// Or we may end up at the end of the buffer. |
458 | void TGLexer::() { |
459 | ++CurPtr; // skip the second slash. |
460 | auto EOLPos = CurBuf.find_first_of(Chars: "\r\n" , From: CurPtr - CurBuf.data()); |
461 | CurPtr = (EOLPos == StringRef::npos) ? CurBuf.end() : CurBuf.data() + EOLPos; |
462 | } |
463 | |
464 | /// SkipCComment - This skips C-style /**/ comments. The only difference from C |
465 | /// is that we allow nesting. |
466 | bool TGLexer::() { |
467 | ++CurPtr; // skip the star. |
468 | unsigned = 1; |
469 | |
470 | while (true) { |
471 | int CurChar = getNextChar(); |
472 | switch (CurChar) { |
473 | case EOF: |
474 | PrintError(Loc: TokStart, Msg: "unterminated comment" ); |
475 | return true; |
476 | case '*': |
477 | // End of the comment? |
478 | if (CurPtr[0] != '/') break; |
479 | |
480 | ++CurPtr; // End the */. |
481 | if (--CommentDepth == 0) |
482 | return false; |
483 | break; |
484 | case '/': |
485 | // Start of a nested comment? |
486 | if (CurPtr[0] != '*') break; |
487 | ++CurPtr; |
488 | ++CommentDepth; |
489 | break; |
490 | } |
491 | } |
492 | } |
493 | |
494 | /// LexNumber - Lex: |
495 | /// [-+]?[0-9]+ |
496 | /// 0x[0-9a-fA-F]+ |
497 | /// 0b[01]+ |
498 | tgtok::TokKind TGLexer::LexNumber() { |
499 | unsigned Base = 0; |
500 | const char *NumStart; |
501 | |
502 | // Check if it's a hex or a binary value. |
503 | if (CurPtr[-1] == '0') { |
504 | NumStart = CurPtr + 1; |
505 | if (CurPtr[0] == 'x') { |
506 | Base = 16; |
507 | do |
508 | ++CurPtr; |
509 | while (isHexDigit(C: CurPtr[0])); |
510 | } else if (CurPtr[0] == 'b') { |
511 | Base = 2; |
512 | do |
513 | ++CurPtr; |
514 | while (CurPtr[0] == '0' || CurPtr[0] == '1'); |
515 | } |
516 | } |
517 | |
518 | // For a hex or binary value, we always convert it to an unsigned value. |
519 | bool IsMinus = false; |
520 | |
521 | // Check if it's a decimal value. |
522 | if (Base == 0) { |
523 | // Check for a sign without a digit. |
524 | if (!isDigit(C: CurPtr[0])) { |
525 | if (CurPtr[-1] == '-') |
526 | return tgtok::minus; |
527 | else if (CurPtr[-1] == '+') |
528 | return tgtok::plus; |
529 | } |
530 | |
531 | Base = 10; |
532 | NumStart = TokStart; |
533 | IsMinus = CurPtr[-1] == '-'; |
534 | |
535 | while (isDigit(C: CurPtr[0])) |
536 | ++CurPtr; |
537 | } |
538 | |
539 | // Requires at least one digit. |
540 | if (CurPtr == NumStart) |
541 | return ReturnError(Loc: TokStart, Msg: "invalid number" ); |
542 | |
543 | errno = 0; |
544 | if (IsMinus) |
545 | CurIntVal = strtoll(nptr: NumStart, endptr: nullptr, base: Base); |
546 | else |
547 | CurIntVal = strtoull(nptr: NumStart, endptr: nullptr, base: Base); |
548 | |
549 | if (errno == EINVAL) |
550 | return ReturnError(Loc: TokStart, Msg: "invalid number" ); |
551 | if (errno == ERANGE) |
552 | return ReturnError(Loc: TokStart, Msg: "number out of range" ); |
553 | |
554 | return Base == 2 ? tgtok::BinaryIntVal : tgtok::IntVal; |
555 | } |
556 | |
557 | /// LexBracket - We just read '['. If this is a code block, return it, |
558 | /// otherwise return the bracket. Match: '[' and '[{ ( [^}]+ | }[^]] )* }]' |
559 | tgtok::TokKind TGLexer::LexBracket() { |
560 | if (CurPtr[0] != '{') |
561 | return tgtok::l_square; |
562 | ++CurPtr; |
563 | const char *CodeStart = CurPtr; |
564 | while (true) { |
565 | int Char = getNextChar(); |
566 | if (Char == EOF) break; |
567 | |
568 | if (Char != '}') continue; |
569 | |
570 | Char = getNextChar(); |
571 | if (Char == EOF) break; |
572 | if (Char == ']') { |
573 | CurStrVal.assign(first: CodeStart, last: CurPtr-2); |
574 | return tgtok::CodeFragment; |
575 | } |
576 | } |
577 | |
578 | return ReturnError(Loc: CodeStart - 2, Msg: "unterminated code block" ); |
579 | } |
580 | |
581 | /// LexExclaim - Lex '!' and '![a-zA-Z]+'. |
582 | tgtok::TokKind TGLexer::LexExclaim() { |
583 | if (!isAlpha(C: *CurPtr)) |
584 | return ReturnError(Loc: CurPtr - 1, Msg: "invalid \"!operator\"" ); |
585 | |
586 | const char *Start = CurPtr++; |
587 | while (isAlpha(C: *CurPtr)) |
588 | ++CurPtr; |
589 | |
590 | // Check to see which operator this is. |
591 | tgtok::TokKind Kind = |
592 | StringSwitch<tgtok::TokKind>(StringRef(Start, CurPtr - Start)) |
593 | .Case(S: "eq" , Value: tgtok::XEq) |
594 | .Case(S: "ne" , Value: tgtok::XNe) |
595 | .Case(S: "le" , Value: tgtok::XLe) |
596 | .Case(S: "lt" , Value: tgtok::XLt) |
597 | .Case(S: "ge" , Value: tgtok::XGe) |
598 | .Case(S: "gt" , Value: tgtok::XGt) |
599 | .Case(S: "if" , Value: tgtok::XIf) |
600 | .Case(S: "cond" , Value: tgtok::XCond) |
601 | .Case(S: "isa" , Value: tgtok::XIsA) |
602 | .Case(S: "head" , Value: tgtok::XHead) |
603 | .Case(S: "tail" , Value: tgtok::XTail) |
604 | .Case(S: "size" , Value: tgtok::XSize) |
605 | .Case(S: "con" , Value: tgtok::XConcat) |
606 | .Case(S: "dag" , Value: tgtok::XDag) |
607 | .Case(S: "add" , Value: tgtok::XADD) |
608 | .Case(S: "sub" , Value: tgtok::XSUB) |
609 | .Case(S: "mul" , Value: tgtok::XMUL) |
610 | .Case(S: "div" , Value: tgtok::XDIV) |
611 | .Case(S: "not" , Value: tgtok::XNOT) |
612 | .Case(S: "logtwo" , Value: tgtok::XLOG2) |
613 | .Case(S: "and" , Value: tgtok::XAND) |
614 | .Case(S: "or" , Value: tgtok::XOR) |
615 | .Case(S: "xor" , Value: tgtok::XXOR) |
616 | .Case(S: "shl" , Value: tgtok::XSHL) |
617 | .Case(S: "sra" , Value: tgtok::XSRA) |
618 | .Case(S: "srl" , Value: tgtok::XSRL) |
619 | .Case(S: "cast" , Value: tgtok::XCast) |
620 | .Case(S: "empty" , Value: tgtok::XEmpty) |
621 | .Case(S: "subst" , Value: tgtok::XSubst) |
622 | .Case(S: "foldl" , Value: tgtok::XFoldl) |
623 | .Case(S: "foreach" , Value: tgtok::XForEach) |
624 | .Case(S: "filter" , Value: tgtok::XFilter) |
625 | .Case(S: "listconcat" , Value: tgtok::XListConcat) |
626 | .Case(S: "listflatten" , Value: tgtok::XListFlatten) |
627 | .Case(S: "listsplat" , Value: tgtok::XListSplat) |
628 | .Case(S: "listremove" , Value: tgtok::XListRemove) |
629 | .Case(S: "range" , Value: tgtok::XRange) |
630 | .Case(S: "strconcat" , Value: tgtok::XStrConcat) |
631 | .Case(S: "initialized" , Value: tgtok::XInitialized) |
632 | .Case(S: "interleave" , Value: tgtok::XInterleave) |
633 | .Case(S: "instances" , Value: tgtok::XInstances) |
634 | .Case(S: "substr" , Value: tgtok::XSubstr) |
635 | .Case(S: "find" , Value: tgtok::XFind) |
636 | .Cases(S0: "setdagop" , S1: "setop" , Value: tgtok::XSetDagOp) // !setop is deprecated. |
637 | .Cases(S0: "getdagop" , S1: "getop" , Value: tgtok::XGetDagOp) // !getop is deprecated. |
638 | .Case(S: "getdagarg" , Value: tgtok::XGetDagArg) |
639 | .Case(S: "getdagname" , Value: tgtok::XGetDagName) |
640 | .Case(S: "setdagarg" , Value: tgtok::XSetDagArg) |
641 | .Case(S: "setdagname" , Value: tgtok::XSetDagName) |
642 | .Case(S: "exists" , Value: tgtok::XExists) |
643 | .Case(S: "tolower" , Value: tgtok::XToLower) |
644 | .Case(S: "toupper" , Value: tgtok::XToUpper) |
645 | .Case(S: "repr" , Value: tgtok::XRepr) |
646 | .Case(S: "match" , Value: tgtok::XMatch) |
647 | .Default(Value: tgtok::Error); |
648 | |
649 | return Kind != tgtok::Error ? Kind |
650 | : ReturnError(Loc: Start - 1, Msg: "unknown operator" ); |
651 | } |
652 | |
653 | bool TGLexer::prepExitInclude(bool IncludeStackMustBeEmpty) { |
654 | // Report an error, if preprocessor control stack for the current |
655 | // file is not empty. |
656 | if (!PrepIncludeStack.back().empty()) { |
657 | prepReportPreprocessorStackError(); |
658 | |
659 | return false; |
660 | } |
661 | |
662 | // Pop the preprocessing controls from the include stack. |
663 | PrepIncludeStack.pop_back(); |
664 | |
665 | if (IncludeStackMustBeEmpty) { |
666 | assert(PrepIncludeStack.empty() && |
667 | "preprocessor include stack is not empty" ); |
668 | } else { |
669 | assert(!PrepIncludeStack.empty() && "preprocessor include stack is empty" ); |
670 | } |
671 | |
672 | return true; |
673 | } |
674 | |
675 | tgtok::TokKind TGLexer::prepIsDirective() const { |
676 | for (const auto [Kind, Word] : PreprocessorDirs) { |
677 | if (StringRef(CurPtr, Word.size()) != Word) |
678 | continue; |
679 | int NextChar = peekNextChar(Index: Word.size()); |
680 | |
681 | // Check for whitespace after the directive. If there is no whitespace, |
682 | // then we do not recognize it as a preprocessing directive. |
683 | |
684 | // New line and EOF may follow only #else/#endif. It will be reported |
685 | // as an error for #ifdef/#define after the call to prepLexMacroName(). |
686 | if (NextChar == ' ' || NextChar == '\t' || NextChar == EOF || |
687 | NextChar == '\n' || |
688 | // It looks like TableGen does not support '\r' as the actual |
689 | // carriage return, e.g. getNextChar() treats a single '\r' |
690 | // as '\n'. So we do the same here. |
691 | NextChar == '\r') |
692 | return Kind; |
693 | |
694 | // Allow comments after some directives, e.g.: |
695 | // #else// OR #else/**/ |
696 | // #endif// OR #endif/**/ |
697 | // |
698 | // Note that we do allow comments after #ifdef/#define here, e.g. |
699 | // #ifdef/**/ AND #ifdef// |
700 | // #define/**/ AND #define// |
701 | // |
702 | // These cases will be reported as incorrect after calling |
703 | // prepLexMacroName(). We could have supported C-style comments |
704 | // after #ifdef/#define, but this would complicate the code |
705 | // for little benefit. |
706 | if (NextChar == '/') { |
707 | NextChar = peekNextChar(Index: Word.size() + 1); |
708 | |
709 | if (NextChar == '*' || NextChar == '/') |
710 | return Kind; |
711 | |
712 | // Pretend that we do not recognize the directive. |
713 | } |
714 | } |
715 | |
716 | return tgtok::Error; |
717 | } |
718 | |
719 | void TGLexer::prepEatPreprocessorDirective(tgtok::TokKind Kind) { |
720 | TokStart = CurPtr; |
721 | |
722 | for (const auto [PKind, PWord] : PreprocessorDirs) { |
723 | if (PKind == Kind) { |
724 | // Advance CurPtr to the end of the preprocessing word. |
725 | CurPtr += PWord.size(); |
726 | return; |
727 | } |
728 | } |
729 | |
730 | llvm_unreachable( |
731 | "unsupported preprocessing token in prepEatPreprocessorDirective()" ); |
732 | } |
733 | |
734 | tgtok::TokKind TGLexer::lexPreprocessor(tgtok::TokKind Kind, |
735 | bool ReturnNextLiveToken) { |
736 | // We must be looking at a preprocessing directive. Eat it! |
737 | prepEatPreprocessorDirective(Kind); |
738 | |
739 | if (Kind == tgtok::Ifdef || Kind == tgtok::Ifndef) { |
740 | StringRef MacroName = prepLexMacroName(); |
741 | StringRef IfTokName = Kind == tgtok::Ifdef ? "#ifdef" : "#ifndef" ; |
742 | if (MacroName.empty()) |
743 | return ReturnError(Loc: TokStart, Msg: "expected macro name after " + IfTokName); |
744 | |
745 | bool MacroIsDefined = DefinedMacros.count(Key: MacroName) != 0; |
746 | |
747 | // Canonicalize ifndef's MacroIsDefined to its ifdef equivalent. |
748 | if (Kind == tgtok::Ifndef) |
749 | MacroIsDefined = !MacroIsDefined; |
750 | |
751 | // Regardless of whether we are processing tokens or not, |
752 | // we put the #ifdef control on stack. |
753 | // Note that MacroIsDefined has been canonicalized against ifdef. |
754 | PrepIncludeStack.back().push_back( |
755 | Elt: {.Kind: tgtok::Ifdef, .IsDefined: MacroIsDefined, .SrcPos: SMLoc::getFromPointer(Ptr: TokStart)}); |
756 | |
757 | if (!prepSkipDirectiveEnd()) |
758 | return ReturnError(Loc: CurPtr, Msg: "only comments are supported after " + |
759 | IfTokName + " NAME" ); |
760 | |
761 | // If we were not processing tokens before this #ifdef, |
762 | // then just return back to the lines skipping code. |
763 | if (!ReturnNextLiveToken) |
764 | return Kind; |
765 | |
766 | // If we were processing tokens before this #ifdef, |
767 | // and the macro is defined, then just return the next token. |
768 | if (MacroIsDefined) |
769 | return LexToken(); |
770 | |
771 | // We were processing tokens before this #ifdef, and the macro |
772 | // is not defined, so we have to start skipping the lines. |
773 | // If the skipping is successful, it will return the token following |
774 | // either #else or #endif corresponding to this #ifdef. |
775 | if (prepSkipRegion(MustNeverBeFalse: ReturnNextLiveToken)) |
776 | return LexToken(); |
777 | |
778 | return tgtok::Error; |
779 | } else if (Kind == tgtok::Else) { |
780 | // Check if this #else is correct before calling prepSkipDirectiveEnd(), |
781 | // which will move CurPtr away from the beginning of #else. |
782 | if (PrepIncludeStack.back().empty()) |
783 | return ReturnError(Loc: TokStart, Msg: "#else without #ifdef or #ifndef" ); |
784 | |
785 | PreprocessorControlDesc IfdefEntry = PrepIncludeStack.back().back(); |
786 | |
787 | if (IfdefEntry.Kind != tgtok::Ifdef) { |
788 | PrintError(Loc: TokStart, Msg: "double #else" ); |
789 | return ReturnError(Loc: IfdefEntry.SrcPos, Msg: "previous #else is here" ); |
790 | } |
791 | |
792 | // Replace the corresponding #ifdef's control with its negation |
793 | // on the control stack. |
794 | PrepIncludeStack.back().back() = {.Kind: Kind, .IsDefined: !IfdefEntry.IsDefined, |
795 | .SrcPos: SMLoc::getFromPointer(Ptr: TokStart)}; |
796 | |
797 | if (!prepSkipDirectiveEnd()) |
798 | return ReturnError(Loc: CurPtr, Msg: "only comments are supported after #else" ); |
799 | |
800 | // If we were processing tokens before this #else, |
801 | // we have to start skipping lines until the matching #endif. |
802 | if (ReturnNextLiveToken) { |
803 | if (prepSkipRegion(MustNeverBeFalse: ReturnNextLiveToken)) |
804 | return LexToken(); |
805 | |
806 | return tgtok::Error; |
807 | } |
808 | |
809 | // Return to the lines skipping code. |
810 | return Kind; |
811 | } else if (Kind == tgtok::Endif) { |
812 | // Check if this #endif is correct before calling prepSkipDirectiveEnd(), |
813 | // which will move CurPtr away from the beginning of #endif. |
814 | if (PrepIncludeStack.back().empty()) |
815 | return ReturnError(Loc: TokStart, Msg: "#endif without #ifdef" ); |
816 | |
817 | [[maybe_unused]] auto &IfdefOrElseEntry = PrepIncludeStack.back().back(); |
818 | |
819 | assert((IfdefOrElseEntry.Kind == tgtok::Ifdef || |
820 | IfdefOrElseEntry.Kind == tgtok::Else) && |
821 | "invalid preprocessor control on the stack" ); |
822 | |
823 | if (!prepSkipDirectiveEnd()) |
824 | return ReturnError(Loc: CurPtr, Msg: "only comments are supported after #endif" ); |
825 | |
826 | PrepIncludeStack.back().pop_back(); |
827 | |
828 | // If we were processing tokens before this #endif, then |
829 | // we should continue it. |
830 | if (ReturnNextLiveToken) { |
831 | return LexToken(); |
832 | } |
833 | |
834 | // Return to the lines skipping code. |
835 | return Kind; |
836 | } else if (Kind == tgtok::Define) { |
837 | StringRef MacroName = prepLexMacroName(); |
838 | if (MacroName.empty()) |
839 | return ReturnError(Loc: TokStart, Msg: "expected macro name after #define" ); |
840 | |
841 | if (!DefinedMacros.insert(key: MacroName).second) |
842 | PrintWarning(WarningLoc: getLoc(), |
843 | Msg: "duplicate definition of macro: " + Twine(MacroName)); |
844 | |
845 | if (!prepSkipDirectiveEnd()) |
846 | return ReturnError(Loc: CurPtr, |
847 | Msg: "only comments are supported after #define NAME" ); |
848 | |
849 | assert(ReturnNextLiveToken && |
850 | "#define must be ignored during the lines skipping" ); |
851 | |
852 | return LexToken(); |
853 | } |
854 | |
855 | llvm_unreachable("preprocessing directive is not supported" ); |
856 | } |
857 | |
858 | bool TGLexer::prepSkipRegion(bool MustNeverBeFalse) { |
859 | assert(MustNeverBeFalse && "invalid recursion." ); |
860 | |
861 | do { |
862 | // Skip all symbols to the line end. |
863 | while (*CurPtr != '\n') |
864 | ++CurPtr; |
865 | |
866 | // Find the first non-whitespace symbol in the next line(s). |
867 | if (!prepSkipLineBegin()) |
868 | return false; |
869 | |
870 | // If the first non-blank/comment symbol on the line is '#', |
871 | // it may be a start of preprocessing directive. |
872 | // |
873 | // If it is not '#' just go to the next line. |
874 | if (*CurPtr == '#') |
875 | ++CurPtr; |
876 | else |
877 | continue; |
878 | |
879 | tgtok::TokKind Kind = prepIsDirective(); |
880 | |
881 | // If we did not find a preprocessing directive or it is #define, |
882 | // then just skip to the next line. We do not have to do anything |
883 | // for #define in the line-skipping mode. |
884 | if (Kind == tgtok::Error || Kind == tgtok::Define) |
885 | continue; |
886 | |
887 | tgtok::TokKind ProcessedKind = lexPreprocessor(Kind, ReturnNextLiveToken: false); |
888 | |
889 | // If lexPreprocessor() encountered an error during lexing this |
890 | // preprocessor idiom, then return false to the calling lexPreprocessor(). |
891 | // This will force tgtok::Error to be returned to the tokens processing. |
892 | if (ProcessedKind == tgtok::Error) |
893 | return false; |
894 | |
895 | assert(Kind == ProcessedKind && "prepIsDirective() and lexPreprocessor() " |
896 | "returned different token kinds" ); |
897 | |
898 | // If this preprocessing directive enables tokens processing, |
899 | // then return to the lexPreprocessor() and get to the next token. |
900 | // We can move from line-skipping mode to processing tokens only |
901 | // due to #else or #endif. |
902 | if (prepIsProcessingEnabled()) { |
903 | assert((Kind == tgtok::Else || Kind == tgtok::Endif) && |
904 | "tokens processing was enabled by an unexpected preprocessing " |
905 | "directive" ); |
906 | |
907 | return true; |
908 | } |
909 | } while (CurPtr != CurBuf.end()); |
910 | |
911 | // We have reached the end of the file, but never left the lines-skipping |
912 | // mode. This means there is no matching #endif. |
913 | prepReportPreprocessorStackError(); |
914 | return false; |
915 | } |
916 | |
917 | StringRef TGLexer::prepLexMacroName() { |
918 | // Skip whitespaces between the preprocessing directive and the macro name. |
919 | while (*CurPtr == ' ' || *CurPtr == '\t') |
920 | ++CurPtr; |
921 | |
922 | TokStart = CurPtr; |
923 | CurPtr = lexMacroName(Str: StringRef(CurPtr, CurBuf.end() - CurPtr)); |
924 | return StringRef(TokStart, CurPtr - TokStart); |
925 | } |
926 | |
927 | bool TGLexer::prepSkipLineBegin() { |
928 | while (CurPtr != CurBuf.end()) { |
929 | switch (*CurPtr) { |
930 | case ' ': |
931 | case '\t': |
932 | case '\n': |
933 | case '\r': |
934 | break; |
935 | |
936 | case '/': { |
937 | int NextChar = peekNextChar(Index: 1); |
938 | if (NextChar == '*') { |
939 | // Skip C-style comment. |
940 | // Note that we do not care about skipping the C++-style comments. |
941 | // If the line contains "//", it may not contain any processable |
942 | // preprocessing directive. Just return CurPtr pointing to |
943 | // the first '/' in this case. We also do not care about |
944 | // incorrect symbols after the first '/' - we are in lines-skipping |
945 | // mode, so incorrect code is allowed to some extent. |
946 | |
947 | // Set TokStart to the beginning of the comment to enable proper |
948 | // diagnostic printing in case of error in SkipCComment(). |
949 | TokStart = CurPtr; |
950 | |
951 | // CurPtr must point to '*' before call to SkipCComment(). |
952 | ++CurPtr; |
953 | if (SkipCComment()) |
954 | return false; |
955 | } else { |
956 | // CurPtr points to the non-whitespace '/'. |
957 | return true; |
958 | } |
959 | |
960 | // We must not increment CurPtr after the comment was lexed. |
961 | continue; |
962 | } |
963 | |
964 | default: |
965 | return true; |
966 | } |
967 | |
968 | ++CurPtr; |
969 | } |
970 | |
971 | // We have reached the end of the file. Return to the lines skipping |
972 | // code, and allow it to handle the EOF as needed. |
973 | return true; |
974 | } |
975 | |
976 | bool TGLexer::prepSkipDirectiveEnd() { |
977 | while (CurPtr != CurBuf.end()) { |
978 | switch (*CurPtr) { |
979 | case ' ': |
980 | case '\t': |
981 | break; |
982 | |
983 | case '\n': |
984 | case '\r': |
985 | return true; |
986 | |
987 | case '/': { |
988 | int NextChar = peekNextChar(Index: 1); |
989 | if (NextChar == '/') { |
990 | // Skip C++-style comment. |
991 | // We may just return true now, but let's skip to the line/buffer end |
992 | // to simplify the method specification. |
993 | ++CurPtr; |
994 | SkipBCPLComment(); |
995 | } else if (NextChar == '*') { |
996 | // When we are skipping C-style comment at the end of a preprocessing |
997 | // directive, we can skip several lines. If any meaningful TD token |
998 | // follows the end of the C-style comment on the same line, it will |
999 | // be considered as an invalid usage of TD token. |
1000 | // For example, we want to forbid usages like this one: |
1001 | // #define MACRO class Class {} |
1002 | // But with C-style comments we also disallow the following: |
1003 | // #define MACRO /* This macro is used |
1004 | // to ... */ class Class {} |
1005 | // One can argue that this should be allowed, but it does not seem |
1006 | // to be worth of the complication. Moreover, this matches |
1007 | // the C preprocessor behavior. |
1008 | |
1009 | // Set TokStart to the beginning of the comment to enable proper |
1010 | // diagnostic printer in case of error in SkipCComment(). |
1011 | TokStart = CurPtr; |
1012 | ++CurPtr; |
1013 | if (SkipCComment()) |
1014 | return false; |
1015 | } else { |
1016 | TokStart = CurPtr; |
1017 | PrintError(Loc: CurPtr, Msg: "unexpected character" ); |
1018 | return false; |
1019 | } |
1020 | |
1021 | // We must not increment CurPtr after the comment was lexed. |
1022 | continue; |
1023 | } |
1024 | |
1025 | default: |
1026 | // Do not allow any non-whitespaces after the directive. |
1027 | TokStart = CurPtr; |
1028 | return false; |
1029 | } |
1030 | |
1031 | ++CurPtr; |
1032 | } |
1033 | |
1034 | return true; |
1035 | } |
1036 | |
1037 | bool TGLexer::prepIsProcessingEnabled() { |
1038 | return all_of(Range&: PrepIncludeStack.back(), |
1039 | P: [](const PreprocessorControlDesc &I) { return I.IsDefined; }); |
1040 | } |
1041 | |
1042 | void TGLexer::prepReportPreprocessorStackError() { |
1043 | auto &PrepControl = PrepIncludeStack.back().back(); |
1044 | PrintError(Loc: CurBuf.end(), Msg: "reached EOF without matching #endif" ); |
1045 | PrintError(ErrorLoc: PrepControl.SrcPos, Msg: "the latest preprocessor control is here" ); |
1046 | |
1047 | TokStart = CurPtr; |
1048 | } |
1049 | |