1//===- TGLexer.cpp - Lexer for TableGen -----------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// Implement the Lexer for TableGen.
10//
11//===----------------------------------------------------------------------===//
12
13#include "TGLexer.h"
14#include "llvm/ADT/ArrayRef.h"
15#include "llvm/ADT/StringExtras.h"
16#include "llvm/ADT/StringSwitch.h"
17#include "llvm/ADT/Twine.h"
18#include "llvm/Config/config.h" // for strtoull()/strtoll() define
19#include "llvm/Support/Compiler.h"
20#include "llvm/Support/MemoryBuffer.h"
21#include "llvm/Support/SourceMgr.h"
22#include "llvm/TableGen/Error.h"
23#include <cerrno>
24#include <cstdio>
25#include <cstdlib>
26#include <cstring>
27
28using namespace llvm;
29
namespace {
// A list of supported preprocessing directives with their
// internal token kinds and names.
struct PreprocessorDir {
  tgtok::TokKind Kind; // Token kind lexed for this directive (e.g. tgtok::Ifdef).
  StringRef Word;      // Spelling of the directive, without the leading '#'.
};
} // end anonymous namespace
38
39/// Returns true if `C` is a valid character in an identifier. If `First` is
40/// true, returns true if `C` is a valid first character of an identifier,
41/// else returns true if `C` is a valid non-first character of an identifier.
42/// Identifiers match the following regular expression:
43/// [a-zA-Z_][0-9a-zA-Z_]*
44static bool isValidIDChar(char C, bool First) {
45 if (C == '_' || isAlpha(C))
46 return true;
47 return !First && isDigit(C);
48}
49
50constexpr PreprocessorDir PreprocessorDirs[] = {{.Kind: tgtok::Ifdef, .Word: "ifdef"},
51 {.Kind: tgtok::Ifndef, .Word: "ifndef"},
52 {.Kind: tgtok::Else, .Word: "else"},
53 {.Kind: tgtok::Endif, .Word: "endif"},
54 {.Kind: tgtok::Define, .Word: "define"}};
55
56// Returns a pointer past the end of a valid macro name at the start of `Str`.
57// Valid macro names match the regular expression [a-zA-Z_][0-9a-zA-Z_]*.
58static const char *lexMacroName(StringRef Str) {
59 assert(!Str.empty());
60
61 // Macro names start with [a-zA-Z_].
62 const char *Next = Str.begin();
63 if (!isValidIDChar(C: *Next, /*First=*/true))
64 return Next;
65 // Eat the first character of the name.
66 ++Next;
67
68 // Match the rest of the identifier regex: [0-9a-zA-Z_]*
69 const char *End = Str.end();
70 while (Next != End && isValidIDChar(C: *Next, /*First=*/false))
71 ++Next;
72 return Next;
73}
74
// Construct a lexer over the SourceMgr's main buffer. `Macros` holds macro
// names predefined on the command line; each is validated and recorded in
// DefinedMacros before any token is lexed.
TGLexer::TGLexer(SourceMgr &SM, ArrayRef<std::string> Macros) : SrcMgr(SM) {
  // Begin lexing at the start of the main (top-level) buffer.
  CurBuffer = SrcMgr.getMainFileID();
  CurBuf = SrcMgr.getMemoryBuffer(i: CurBuffer)->getBuffer();
  CurPtr = CurBuf.begin();
  TokStart = nullptr;

  // Pretend that we enter the "top-level" include file.
  PrepIncludeStack.emplace_back();

  // Add all macros defined on the command line to the DefinedMacros set.
  // Check invalid macro names and print fatal error if we find one.
  for (StringRef MacroName : Macros) {
    // lexMacroName() stops at the first invalid character; anything left
    // over means the whole name is malformed.
    const char *End = lexMacroName(Str: MacroName);
    if (End != MacroName.end())
      PrintFatalError(Msg: "invalid macro name `" + MacroName +
                      "` specified on command line");

    DefinedMacros.insert(key: MacroName);
  }
}
95
96SMLoc TGLexer::getLoc() const {
97 return SMLoc::getFromPointer(Ptr: TokStart);
98}
99
100SMRange TGLexer::getLocRange() const {
101 return {getLoc(), SMLoc::getFromPointer(Ptr: CurPtr)};
102}
103
104/// ReturnError - Set the error to the specified string at the specified
105/// location. This is defined to always return tgtok::Error.
106tgtok::TokKind TGLexer::ReturnError(SMLoc Loc, const Twine &Msg) {
107 PrintError(ErrorLoc: Loc, Msg);
108 return tgtok::Error;
109}
110
111tgtok::TokKind TGLexer::ReturnError(const char *Loc, const Twine &Msg) {
112 return ReturnError(Loc: SMLoc::getFromPointer(Ptr: Loc), Msg);
113}
114
// Handle end-of-buffer reported by getNextChar(). If the current buffer was
// entered via an 'include', resume lexing in the parent buffer and return
// true; return false once the top-level EOF is reached (or on a
// preprocessor control-stack error).
bool TGLexer::processEOF() {
  SMLoc ParentIncludeLoc = SrcMgr.getParentIncludeLoc(i: CurBuffer);
  if (ParentIncludeLoc != SMLoc()) {
    // If prepExitInclude() detects a problem with the preprocessing
    // control stack, it will return false. Pretend that we reached
    // the final EOF and stop lexing more tokens by returning false
    // to LexToken().
    if (!prepExitInclude(IncludeStackMustBeEmpty: false))
      return false;

    // Switch back to the parent buffer, continuing right after the
    // 'include "..."' construct that brought us here.
    CurBuffer = SrcMgr.FindBufferContainingLoc(Loc: ParentIncludeLoc);
    CurBuf = SrcMgr.getMemoryBuffer(i: CurBuffer)->getBuffer();
    CurPtr = ParentIncludeLoc.getPointer();
    // Make sure TokStart points into the parent file's buffer.
    // LexToken() assigns to it before calling getNextChar(),
    // so it is pointing into the included file now.
    TokStart = CurPtr;
    return true;
  }

  // Pretend that we exit the "top-level" include file.
  // Note that in case of an error (e.g. control stack imbalance)
  // the routine will issue a fatal error.
  prepExitInclude(IncludeStackMustBeEmpty: true);
  return false;
}
141
/// Read and consume one character from the buffer. Newline sequences
/// (\n, \r, \r\n, \n\r) are normalized to a single '\n', and the NUL that
/// terminates the buffer is reported as EOF. Returns an int so that EOF can
/// be distinguished from any real character value.
int TGLexer::getNextChar() {
  char CurChar = *CurPtr++;
  switch (CurChar) {
  default:
    // Cast through unsigned char so high-bit characters do not
    // sign-extend to negative values (which could collide with EOF).
    return (unsigned char)CurChar;

  case 0: {
    // A NUL character in the stream is either the end of the current buffer or
    // a spurious NUL in the file. Disambiguate that here.
    if (CurPtr - 1 == CurBuf.end()) {
      --CurPtr; // Arrange for another call to return EOF again.
      return EOF;
    }
    PrintError(ErrorLoc: getLoc(),
               Msg: "NUL character is invalid in source; treated as space");
    return ' ';
  }

  case '\n':
  case '\r':
    // Handle the newline character by ignoring it and incrementing the line
    // count. However, be careful about 'dos style' files with \n\r in them.
    // Only treat a \n\r or \r\n as a single line.
    if ((*CurPtr == '\n' || (*CurPtr == '\r')) &&
        *CurPtr != CurChar)
      ++CurPtr; // Eat the two char newline sequence.
    return '\n';
  }
}
171
172int TGLexer::peekNextChar(int Index) const {
173 return *(CurPtr + Index);
174}
175
/// Lex a single token and return its kind. `FileOrLineStart` is true only at
/// the beginning of a file or line; that is the sole position where '#' is
/// checked for a preprocessing directive — everywhere else '#' is the paste
/// operator. Whitespace and comments are consumed by tail-recursing into
/// LexToken().
tgtok::TokKind TGLexer::LexToken(bool FileOrLineStart) {
  TokStart = CurPtr;
  // This always consumes at least one character.
  int CurChar = getNextChar();

  switch (CurChar) {
  default:
    // Handle letters: [a-zA-Z_]
    if (isValidIDChar(C: CurChar, /*First=*/true))
      return LexIdentifier();

    // Unknown character, emit an error.
    return ReturnError(Loc: TokStart, Msg: "unexpected character");
  case EOF:
    // Lex next token, if we just left an include file.
    // Note that leaving an include file means that the next
    // symbol is located at the end of the 'include "..."'
    // construct, so LexToken() is called with default
    // false parameter.
    if (processEOF())
      return LexToken();

    // Return EOF denoting the end of lexing.
    return tgtok::Eof;

  case ':': return tgtok::colon;
  case ';': return tgtok::semi;
  case ',': return tgtok::comma;
  case '<': return tgtok::less;
  case '>': return tgtok::greater;
  case ']': return tgtok::r_square;
  case '{': return tgtok::l_brace;
  case '}': return tgtok::r_brace;
  case '(': return tgtok::l_paren;
  case ')': return tgtok::r_paren;
  case '=': return tgtok::equal;
  case '?': return tgtok::question;
  case '#':
    // '#' at file/line start may open a preprocessing directive;
    // otherwise it is the paste operator.
    if (FileOrLineStart) {
      tgtok::TokKind Kind = prepIsDirective();
      if (Kind != tgtok::Error)
        return lexPreprocessor(Kind);
    }

    return tgtok::paste;

  // The period is a separate case so we can recognize the "..."
  // range punctuator.
  case '.':
    if (peekNextChar(Index: 0) == '.') {
      ++CurPtr; // Eat second dot.
      if (peekNextChar(Index: 0) == '.') {
        ++CurPtr; // Eat third dot.
        return tgtok::dotdotdot;
      }
      return ReturnError(Loc: TokStart, Msg: "invalid '..' punctuation");
    }
    return tgtok::dot;

  case '\r':
    // getNextChar() normalizes all newline sequences to '\n'.
    llvm_unreachable("getNextChar() must never return '\r'");

  case ' ':
  case '\t':
    // Ignore whitespace.
    return LexToken(FileOrLineStart);
  case '\n':
    // Ignore whitespace, and identify the new line.
    return LexToken(FileOrLineStart: true);
  case '/':
    // If this is the start of a // comment, skip until the end of the line or
    // the end of the buffer.
    if (*CurPtr == '/')
      SkipBCPLComment();
    else if (*CurPtr == '*') {
      if (SkipCComment())
        return tgtok::Error;
    } else // Otherwise, this is an error.
      return ReturnError(Loc: TokStart, Msg: "unexpected character");
    return LexToken(FileOrLineStart);
  case '-': case '+':
  case '0': case '1': case '2': case '3': case '4': case '5': case '6':
  case '7': case '8': case '9': {
    int NextChar = 0;
    if (isDigit(C: CurChar)) {
      // Allow identifiers to start with a number if it is followed by
      // an identifier. This can happen with paste operations like
      // foo#8i.
      int i = 0;
      do {
        NextChar = peekNextChar(Index: i++);
      } while (isDigit(C: NextChar));

      if (NextChar == 'x' || NextChar == 'b') {
        // If this is [0-9]b[01] or [0-9]x[0-9A-fa-f] this is most
        // likely a number.
        int NextNextChar = peekNextChar(Index: i);
        switch (NextNextChar) {
        default:
          break;
        case '0': case '1':
          if (NextChar == 'b')
            return LexNumber();
          [[fallthrough]];
        case '2': case '3': case '4': case '5':
        case '6': case '7': case '8': case '9':
        case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
        case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
          if (NextChar == 'x')
            return LexNumber();
          break;
        }
      }
    }

    // The digits ran into an identifier character (e.g. "8i"), so lex the
    // whole thing as an identifier instead of a number.
    if (isValidIDChar(C: NextChar, /*First=*/true))
      return LexIdentifier();

    return LexNumber();
  }
  case '"': return LexString();
  case '$': return LexVarName();
  case '[': return LexBracket();
  case '!': return LexExclaim();
  }
}
302
/// LexString - Lex "[^"]*". Called with CurPtr just past the opening quote;
/// accumulates the (escape-processed) contents into CurStrVal. Supported
/// escapes are \\, \', \", \t and \n; strings may not span lines.
tgtok::TokKind TGLexer::LexString() {
  const char *StrStart = CurPtr;

  CurStrVal = "";

  while (*CurPtr != '"') {
    // If we hit the end of the buffer, report an error.
    if (*CurPtr == 0 && CurPtr == CurBuf.end())
      return ReturnError(Loc: StrStart, Msg: "end of file in string literal");

    if (*CurPtr == '\n' || *CurPtr == '\r')
      return ReturnError(Loc: StrStart, Msg: "end of line in string literal");

    if (*CurPtr != '\\') {
      // Ordinary character: copy it through verbatim.
      CurStrVal += *CurPtr++;
      continue;
    }

    // Eat the backslash and decode the escape that follows.
    ++CurPtr;

    switch (*CurPtr) {
    case '\\': case '\'': case '"':
      // These turn into their literal character.
      CurStrVal += *CurPtr++;
      break;
    case 't':
      CurStrVal += '\t';
      ++CurPtr;
      break;
    case 'n':
      CurStrVal += '\n';
      ++CurPtr;
      break;

    case '\n':
    case '\r':
      return ReturnError(Loc: CurPtr, Msg: "escaped newlines not supported in tblgen");

    // If we hit the end of the buffer, report an error.
    case '\0':
      if (CurPtr == CurBuf.end())
        return ReturnError(Loc: StrStart, Msg: "end of file in string literal");
      [[fallthrough]];
    default:
      return ReturnError(Loc: CurPtr, Msg: "invalid escape in string literal");
    }
  }

  // Eat the closing quote.
  ++CurPtr;
  return tgtok::StrVal;
}
355
356tgtok::TokKind TGLexer::LexVarName() {
357 if (!isValidIDChar(C: CurPtr[0], /*First=*/true))
358 return ReturnError(Loc: TokStart, Msg: "invalid variable name");
359
360 // Otherwise, we're ok, consume the rest of the characters.
361 const char *VarNameStart = CurPtr++;
362
363 while (isValidIDChar(C: *CurPtr, /*First=*/false))
364 ++CurPtr;
365
366 CurStrVal.assign(first: VarNameStart, last: CurPtr);
367 return tgtok::VarName;
368}
369
/// LexIdentifier - Lex [a-zA-Z_][0-9a-zA-Z_]* starting at TokStart (the first
/// character was already consumed by LexToken()). Keywords are mapped to
/// their dedicated token kinds; 'include' additionally triggers entering the
/// included file; anything else becomes tgtok::Id with the text in CurStrVal.
tgtok::TokKind TGLexer::LexIdentifier() {
  // The first letter is [a-zA-Z_].
  const char *IdentStart = TokStart;

  // Match the rest of the identifier regex: [0-9a-zA-Z_]*
  while (isValidIDChar(C: *CurPtr, /*First=*/false))
    ++CurPtr;

  // Check to see if this identifier is a reserved keyword.
  StringRef Str(IdentStart, CurPtr-IdentStart);

  tgtok::TokKind Kind = StringSwitch<tgtok::TokKind>(Str)
                            .Case(S: "int", Value: tgtok::Int)
                            .Case(S: "bit", Value: tgtok::Bit)
                            .Case(S: "bits", Value: tgtok::Bits)
                            .Case(S: "string", Value: tgtok::String)
                            .Case(S: "list", Value: tgtok::List)
                            .Case(S: "code", Value: tgtok::Code)
                            .Case(S: "dag", Value: tgtok::Dag)
                            .Case(S: "class", Value: tgtok::Class)
                            .Case(S: "def", Value: tgtok::Def)
                            .Case(S: "true", Value: tgtok::TrueVal)
                            .Case(S: "false", Value: tgtok::FalseVal)
                            .Case(S: "foreach", Value: tgtok::Foreach)
                            .Case(S: "defm", Value: tgtok::Defm)
                            .Case(S: "defset", Value: tgtok::Defset)
                            .Case(S: "deftype", Value: tgtok::Deftype)
                            .Case(S: "multiclass", Value: tgtok::MultiClass)
                            .Case(S: "field", Value: tgtok::Field)
                            .Case(S: "let", Value: tgtok::Let)
                            .Case(S: "in", Value: tgtok::In)
                            .Case(S: "defvar", Value: tgtok::Defvar)
                            .Case(S: "include", Value: tgtok::Include)
                            .Case(S: "if", Value: tgtok::If)
                            .Case(S: "then", Value: tgtok::Then)
                            .Case(S: "else", Value: tgtok::ElseKW)
                            .Case(S: "assert", Value: tgtok::Assert)
                            .Case(S: "dump", Value: tgtok::Dump)
                            .Default(Value: tgtok::Id);

  // A couple of tokens require special processing.
  switch (Kind) {
  case tgtok::Include:
    // Enter the included file and return its first token instead.
    if (LexInclude()) return tgtok::Error;
    return Lex();
  case tgtok::Id:
    // Not a keyword: hand the spelling to the parser via CurStrVal.
    CurStrVal.assign(first: Str.begin(), last: Str.end());
    break;
  default:
    break;
  }

  return Kind;
}
424
/// LexInclude - We just read the "include" token. Get the string token that
/// comes next and enter the include. Returns true on error (diagnostic
/// already printed), false on success with CurBuf/CurPtr switched to the
/// included file's buffer.
bool TGLexer::LexInclude() {
  // The token after the include must be a string.
  tgtok::TokKind Tok = LexToken();
  if (Tok == tgtok::Error) return true;
  if (Tok != tgtok::StrVal) {
    PrintError(ErrorLoc: getLoc(), Msg: "expected filename after include");
    return true;
  }

  // Get the string.
  std::string Filename = CurStrVal;
  std::string IncludedFile;

  // SourceMgr resolves the filename against its include directories and
  // records CurPtr as the location to resume from when the include ends.
  CurBuffer = SrcMgr.AddIncludeFile(Filename, IncludeLoc: SMLoc::getFromPointer(Ptr: CurPtr),
                                    IncludedFile);
  if (!CurBuffer) {
    PrintError(ErrorLoc: getLoc(), Msg: "could not find include file '" + Filename + "'");
    return true;
  }

  // Record the resolved path so build systems can track dependencies.
  Dependencies.insert(x: IncludedFile);
  // Save the line number and lex buffer of the includer.
  CurBuf = SrcMgr.getMemoryBuffer(i: CurBuffer)->getBuffer();
  CurPtr = CurBuf.begin();

  // The included file gets its own (initially empty) preprocessor
  // control stack.
  PrepIncludeStack.emplace_back();
  return false;
}
455
456/// SkipBCPLComment - Skip over the comment by finding the next CR or LF.
457/// Or we may end up at the end of the buffer.
458void TGLexer::SkipBCPLComment() {
459 ++CurPtr; // skip the second slash.
460 auto EOLPos = CurBuf.find_first_of(Chars: "\r\n", From: CurPtr - CurBuf.data());
461 CurPtr = (EOLPos == StringRef::npos) ? CurBuf.end() : CurBuf.data() + EOLPos;
462}
463
/// SkipCComment - This skips C-style /**/ comments. The only difference from C
/// is that we allow nesting. Called with CurPtr on the '*'; returns true on
/// an unterminated comment (error already printed), false on success.
bool TGLexer::SkipCComment() {
  ++CurPtr; // skip the star.
  unsigned CommentDepth = 1;

  while (true) {
    int CurChar = getNextChar();
    switch (CurChar) {
    case EOF:
      PrintError(Loc: TokStart, Msg: "unterminated comment");
      return true;
    case '*':
      // End of the comment?
      if (CurPtr[0] != '/') break;

      ++CurPtr; // End the */.
      if (--CommentDepth == 0)
        return false;
      break;
    case '/':
      // Start of a nested comment?
      if (CurPtr[0] != '*') break;
      ++CurPtr;
      ++CommentDepth;
      break;
      // Any other character is comment text and is simply consumed.
    }
  }
}
493
/// LexNumber - Lex:
///    [-+]?[0-9]+
///    0x[0-9a-fA-F]+
///    0b[01]+
/// Called with CurPtr just past the first character (a digit or sign). A
/// lone '-' or '+' is returned as the corresponding operator token. The
/// value is stored in CurIntVal; binary literals get their own token kind
/// so the parser can preserve their width.
tgtok::TokKind TGLexer::LexNumber() {
  unsigned Base = 0;
  const char *NumStart;

  // Check if it's a hex or a binary value.
  if (CurPtr[-1] == '0') {
    // NumStart points past the radix prefix ("0x"/"0b") so the digit loop
    // below can detect an empty digit sequence (CurPtr == NumStart).
    NumStart = CurPtr + 1;
    if (CurPtr[0] == 'x') {
      Base = 16;
      do
        ++CurPtr;
      while (isHexDigit(C: CurPtr[0]));
    } else if (CurPtr[0] == 'b') {
      Base = 2;
      do
        ++CurPtr;
      while (CurPtr[0] == '0' || CurPtr[0] == '1');
    }
  }

  // For a hex or binary value, we always convert it to an unsigned value.
  bool IsMinus = false;

  // Check if it's a decimal value.
  if (Base == 0) {
    // Check for a sign without a digit.
    if (!isDigit(C: CurPtr[0])) {
      if (CurPtr[-1] == '-')
        return tgtok::minus;
      else if (CurPtr[-1] == '+')
        return tgtok::plus;
    }

    Base = 10;
    NumStart = TokStart;
    IsMinus = CurPtr[-1] == '-';

    while (isDigit(C: CurPtr[0]))
      ++CurPtr;
  }

  // Requires at least one digit.
  if (CurPtr == NumStart)
    return ReturnError(Loc: TokStart, Msg: "invalid number");

  // Reset errno so we can distinguish a clean conversion from
  // EINVAL/ERANGE reported by strtoll/strtoull.
  errno = 0;
  if (IsMinus)
    CurIntVal = strtoll(nptr: NumStart, endptr: nullptr, base: Base);
  else
    CurIntVal = strtoull(nptr: NumStart, endptr: nullptr, base: Base);

  if (errno == EINVAL)
    return ReturnError(Loc: TokStart, Msg: "invalid number");
  if (errno == ERANGE)
    return ReturnError(Loc: TokStart, Msg: "number out of range");

  return Base == 2 ? tgtok::BinaryIntVal : tgtok::IntVal;
}
556
/// LexBracket - We just read '['. If this is a code block, return it,
/// otherwise return the bracket. Match: '[' and '[{ ( [^}]+ | }[^]] )* }]'
/// For a code fragment, the raw text between '[{' and '}]' is stored in
/// CurStrVal.
tgtok::TokKind TGLexer::LexBracket() {
  if (CurPtr[0] != '{')
    return tgtok::l_square;
  ++CurPtr;
  const char *CodeStart = CurPtr;
  while (true) {
    int Char = getNextChar();
    if (Char == EOF) break;

    if (Char != '}') continue;

    // Saw '}': it only terminates the fragment if followed by ']'.
    Char = getNextChar();
    if (Char == EOF) break;
    if (Char == ']') {
      // Exclude the trailing "}]" from the stored text.
      CurStrVal.assign(first: CodeStart, last: CurPtr-2);
      return tgtok::CodeFragment;
    }
  }

  return ReturnError(Loc: CodeStart - 2, Msg: "unterminated code block");
}
580
/// LexExclaim - Lex '!' and '![a-zA-Z]+'. Called with CurPtr just past the
/// '!'; maps the operator name to its token kind, or reports an error for
/// an unknown operator.
tgtok::TokKind TGLexer::LexExclaim() {
  if (!isAlpha(C: *CurPtr))
    return ReturnError(Loc: CurPtr - 1, Msg: "invalid \"!operator\"");

  // Consume the alphabetic operator name.
  const char *Start = CurPtr++;
  while (isAlpha(C: *CurPtr))
    ++CurPtr;

  // Check to see which operator this is.
  tgtok::TokKind Kind =
      StringSwitch<tgtok::TokKind>(StringRef(Start, CurPtr - Start))
          .Case(S: "eq", Value: tgtok::XEq)
          .Case(S: "ne", Value: tgtok::XNe)
          .Case(S: "le", Value: tgtok::XLe)
          .Case(S: "lt", Value: tgtok::XLt)
          .Case(S: "ge", Value: tgtok::XGe)
          .Case(S: "gt", Value: tgtok::XGt)
          .Case(S: "if", Value: tgtok::XIf)
          .Case(S: "cond", Value: tgtok::XCond)
          .Case(S: "isa", Value: tgtok::XIsA)
          .Case(S: "head", Value: tgtok::XHead)
          .Case(S: "tail", Value: tgtok::XTail)
          .Case(S: "size", Value: tgtok::XSize)
          .Case(S: "con", Value: tgtok::XConcat)
          .Case(S: "dag", Value: tgtok::XDag)
          .Case(S: "add", Value: tgtok::XADD)
          .Case(S: "sub", Value: tgtok::XSUB)
          .Case(S: "mul", Value: tgtok::XMUL)
          .Case(S: "div", Value: tgtok::XDIV)
          .Case(S: "not", Value: tgtok::XNOT)
          .Case(S: "logtwo", Value: tgtok::XLOG2)
          .Case(S: "and", Value: tgtok::XAND)
          .Case(S: "or", Value: tgtok::XOR)
          .Case(S: "xor", Value: tgtok::XXOR)
          .Case(S: "shl", Value: tgtok::XSHL)
          .Case(S: "sra", Value: tgtok::XSRA)
          .Case(S: "srl", Value: tgtok::XSRL)
          .Case(S: "cast", Value: tgtok::XCast)
          .Case(S: "empty", Value: tgtok::XEmpty)
          .Case(S: "subst", Value: tgtok::XSubst)
          .Case(S: "foldl", Value: tgtok::XFoldl)
          .Case(S: "foreach", Value: tgtok::XForEach)
          .Case(S: "filter", Value: tgtok::XFilter)
          .Case(S: "listconcat", Value: tgtok::XListConcat)
          .Case(S: "listflatten", Value: tgtok::XListFlatten)
          .Case(S: "listsplat", Value: tgtok::XListSplat)
          .Case(S: "listremove", Value: tgtok::XListRemove)
          .Case(S: "range", Value: tgtok::XRange)
          .Case(S: "strconcat", Value: tgtok::XStrConcat)
          .Case(S: "initialized", Value: tgtok::XInitialized)
          .Case(S: "interleave", Value: tgtok::XInterleave)
          .Case(S: "instances", Value: tgtok::XInstances)
          .Case(S: "substr", Value: tgtok::XSubstr)
          .Case(S: "find", Value: tgtok::XFind)
          .Cases(S0: "setdagop", S1: "setop", Value: tgtok::XSetDagOp) // !setop is deprecated.
          .Cases(S0: "getdagop", S1: "getop", Value: tgtok::XGetDagOp) // !getop is deprecated.
          .Case(S: "getdagarg", Value: tgtok::XGetDagArg)
          .Case(S: "getdagname", Value: tgtok::XGetDagName)
          .Case(S: "setdagarg", Value: tgtok::XSetDagArg)
          .Case(S: "setdagname", Value: tgtok::XSetDagName)
          .Case(S: "exists", Value: tgtok::XExists)
          .Case(S: "tolower", Value: tgtok::XToLower)
          .Case(S: "toupper", Value: tgtok::XToUpper)
          .Case(S: "repr", Value: tgtok::XRepr)
          .Case(S: "match", Value: tgtok::XMatch)
          .Default(Value: tgtok::Error);

  return Kind != tgtok::Error ? Kind
                              : ReturnError(Loc: Start - 1, Msg: "unknown operator");
}
652
653bool TGLexer::prepExitInclude(bool IncludeStackMustBeEmpty) {
654 // Report an error, if preprocessor control stack for the current
655 // file is not empty.
656 if (!PrepIncludeStack.back().empty()) {
657 prepReportPreprocessorStackError();
658
659 return false;
660 }
661
662 // Pop the preprocessing controls from the include stack.
663 PrepIncludeStack.pop_back();
664
665 if (IncludeStackMustBeEmpty) {
666 assert(PrepIncludeStack.empty() &&
667 "preprocessor include stack is not empty");
668 } else {
669 assert(!PrepIncludeStack.empty() && "preprocessor include stack is empty");
670 }
671
672 return true;
673}
674
// Check whether CurPtr (positioned just after a '#') starts a preprocessing
// directive. Returns the directive's token kind, or tgtok::Error if the text
// is not recognized as a directive. Does not consume any input.
tgtok::TokKind TGLexer::prepIsDirective() const {
  for (const auto [Kind, Word] : PreprocessorDirs) {
    if (StringRef(CurPtr, Word.size()) != Word)
      continue;
    int NextChar = peekNextChar(Index: Word.size());

    // Check for whitespace after the directive. If there is no whitespace,
    // then we do not recognize it as a preprocessing directive.

    // New line and EOF may follow only #else/#endif. It will be reported
    // as an error for #ifdef/#define after the call to prepLexMacroName().
    if (NextChar == ' ' || NextChar == '\t' || NextChar == EOF ||
        NextChar == '\n' ||
        // It looks like TableGen does not support '\r' as the actual
        // carriage return, e.g. getNextChar() treats a single '\r'
        // as '\n'. So we do the same here.
        NextChar == '\r')
      return Kind;

    // Allow comments after some directives, e.g.:
    //     #else// OR #else/**/
    //     #endif// OR #endif/**/
    //
    // Note that we do allow comments after #ifdef/#define here, e.g.
    //     #ifdef/**/ AND #ifdef//
    //     #define/**/ AND #define//
    //
    // These cases will be reported as incorrect after calling
    // prepLexMacroName(). We could have supported C-style comments
    // after #ifdef/#define, but this would complicate the code
    // for little benefit.
    if (NextChar == '/') {
      NextChar = peekNextChar(Index: Word.size() + 1);

      if (NextChar == '*' || NextChar == '/')
        return Kind;

      // Pretend that we do not recognize the directive.
    }
  }

  return tgtok::Error;
}
718
719void TGLexer::prepEatPreprocessorDirective(tgtok::TokKind Kind) {
720 TokStart = CurPtr;
721
722 for (const auto [PKind, PWord] : PreprocessorDirs) {
723 if (PKind == Kind) {
724 // Advance CurPtr to the end of the preprocessing word.
725 CurPtr += PWord.size();
726 return;
727 }
728 }
729
730 llvm_unreachable(
731 "unsupported preprocessing token in prepEatPreprocessorDirective()");
732}
733
// Process one preprocessing directive of kind `Kind` starting at CurPtr
// (just past the '#'). `ReturnNextLiveToken` is true when we are lexing
// live tokens (so the next live token should be returned), and false when
// called from the lines-skipping mode in prepSkipRegion() (so the directive
// kind itself is returned). Maintains the per-file preprocessor control
// stack in PrepIncludeStack.
tgtok::TokKind TGLexer::lexPreprocessor(tgtok::TokKind Kind,
                                        bool ReturnNextLiveToken) {
  // We must be looking at a preprocessing directive. Eat it!
  prepEatPreprocessorDirective(Kind);

  if (Kind == tgtok::Ifdef || Kind == tgtok::Ifndef) {
    StringRef MacroName = prepLexMacroName();
    StringRef IfTokName = Kind == tgtok::Ifdef ? "#ifdef" : "#ifndef";
    if (MacroName.empty())
      return ReturnError(Loc: TokStart, Msg: "expected macro name after " + IfTokName);

    bool MacroIsDefined = DefinedMacros.count(Key: MacroName) != 0;

    // Canonicalize ifndef's MacroIsDefined to its ifdef equivalent.
    if (Kind == tgtok::Ifndef)
      MacroIsDefined = !MacroIsDefined;

    // Regardless of whether we are processing tokens or not,
    // we put the #ifdef control on stack.
    // Note that MacroIsDefined has been canonicalized against ifdef.
    PrepIncludeStack.back().push_back(
        Elt: {.Kind: tgtok::Ifdef, .IsDefined: MacroIsDefined, .SrcPos: SMLoc::getFromPointer(Ptr: TokStart)});

    if (!prepSkipDirectiveEnd())
      return ReturnError(Loc: CurPtr, Msg: "only comments are supported after " +
                                    IfTokName + " NAME");

    // If we were not processing tokens before this #ifdef,
    // then just return back to the lines skipping code.
    if (!ReturnNextLiveToken)
      return Kind;

    // If we were processing tokens before this #ifdef,
    // and the macro is defined, then just return the next token.
    if (MacroIsDefined)
      return LexToken();

    // We were processing tokens before this #ifdef, and the macro
    // is not defined, so we have to start skipping the lines.
    // If the skipping is successful, it will return the token following
    // either #else or #endif corresponding to this #ifdef.
    if (prepSkipRegion(MustNeverBeFalse: ReturnNextLiveToken))
      return LexToken();

    return tgtok::Error;
  } else if (Kind == tgtok::Else) {
    // Check if this #else is correct before calling prepSkipDirectiveEnd(),
    // which will move CurPtr away from the beginning of #else.
    if (PrepIncludeStack.back().empty())
      return ReturnError(Loc: TokStart, Msg: "#else without #ifdef or #ifndef");

    PreprocessorControlDesc IfdefEntry = PrepIncludeStack.back().back();

    // A tgtok::Else entry on the stack means a #else was already seen
    // for the innermost #ifdef.
    if (IfdefEntry.Kind != tgtok::Ifdef) {
      PrintError(Loc: TokStart, Msg: "double #else");
      return ReturnError(Loc: IfdefEntry.SrcPos, Msg: "previous #else is here");
    }

    // Replace the corresponding #ifdef's control with its negation
    // on the control stack.
    PrepIncludeStack.back().back() = {.Kind: Kind, .IsDefined: !IfdefEntry.IsDefined,
                                      .SrcPos: SMLoc::getFromPointer(Ptr: TokStart)};

    if (!prepSkipDirectiveEnd())
      return ReturnError(Loc: CurPtr, Msg: "only comments are supported after #else");

    // If we were processing tokens before this #else,
    // we have to start skipping lines until the matching #endif.
    if (ReturnNextLiveToken) {
      if (prepSkipRegion(MustNeverBeFalse: ReturnNextLiveToken))
        return LexToken();

      return tgtok::Error;
    }

    // Return to the lines skipping code.
    return Kind;
  } else if (Kind == tgtok::Endif) {
    // Check if this #endif is correct before calling prepSkipDirectiveEnd(),
    // which will move CurPtr away from the beginning of #endif.
    if (PrepIncludeStack.back().empty())
      return ReturnError(Loc: TokStart, Msg: "#endif without #ifdef");

    [[maybe_unused]] auto &IfdefOrElseEntry = PrepIncludeStack.back().back();

    assert((IfdefOrElseEntry.Kind == tgtok::Ifdef ||
            IfdefOrElseEntry.Kind == tgtok::Else) &&
           "invalid preprocessor control on the stack");

    if (!prepSkipDirectiveEnd())
      return ReturnError(Loc: CurPtr, Msg: "only comments are supported after #endif");

    // The innermost control region is closed; pop it.
    PrepIncludeStack.back().pop_back();

    // If we were processing tokens before this #endif, then
    // we should continue it.
    if (ReturnNextLiveToken) {
      return LexToken();
    }

    // Return to the lines skipping code.
    return Kind;
  } else if (Kind == tgtok::Define) {
    StringRef MacroName = prepLexMacroName();
    if (MacroName.empty())
      return ReturnError(Loc: TokStart, Msg: "expected macro name after #define");

    // Redefinition is harmless (macros have no values), so only warn.
    if (!DefinedMacros.insert(key: MacroName).second)
      PrintWarning(WarningLoc: getLoc(),
                   Msg: "duplicate definition of macro: " + Twine(MacroName));

    if (!prepSkipDirectiveEnd())
      return ReturnError(Loc: CurPtr,
                         Msg: "only comments are supported after #define NAME");

    assert(ReturnNextLiveToken &&
           "#define must be ignored during the lines skipping");

    return LexToken();
  }

  llvm_unreachable("preprocessing directive is not supported");
}
857
// Skip lines of a disabled preprocessor region until a #else/#endif
// re-enables token processing. Returns true when processing is re-enabled
// (CurPtr then points past the enabling directive), false on error (a
// diagnostic has been issued). `MustNeverBeFalse` documents that this
// routine is only ever entered from live-token mode.
bool TGLexer::prepSkipRegion(bool MustNeverBeFalse) {
  assert(MustNeverBeFalse && "invalid recursion.");

  do {
    // Skip all symbols to the line end.
    // NOTE(review): this scan stops only at '\n'; it appears to rely on the
    // buffer ending with a newline or NUL-adjacent handling elsewhere —
    // confirm the buffer guarantees before touching this loop.
    while (*CurPtr != '\n')
      ++CurPtr;

    // Find the first non-whitespace symbol in the next line(s).
    if (!prepSkipLineBegin())
      return false;

    // If the first non-blank/comment symbol on the line is '#',
    // it may be a start of preprocessing directive.
    //
    // If it is not '#' just go to the next line.
    if (*CurPtr == '#')
      ++CurPtr;
    else
      continue;

    tgtok::TokKind Kind = prepIsDirective();

    // If we did not find a preprocessing directive or it is #define,
    // then just skip to the next line. We do not have to do anything
    // for #define in the line-skipping mode.
    if (Kind == tgtok::Error || Kind == tgtok::Define)
      continue;

    // Process the directive in line-skipping mode (ReturnNextLiveToken
    // is false, so it returns the directive kind on success).
    tgtok::TokKind ProcessedKind = lexPreprocessor(Kind, ReturnNextLiveToken: false);

    // If lexPreprocessor() encountered an error during lexing this
    // preprocessor idiom, then return false to the calling lexPreprocessor().
    // This will force tgtok::Error to be returned to the tokens processing.
    if (ProcessedKind == tgtok::Error)
      return false;

    assert(Kind == ProcessedKind && "prepIsDirective() and lexPreprocessor() "
                                    "returned different token kinds");

    // If this preprocessing directive enables tokens processing,
    // then return to the lexPreprocessor() and get to the next token.
    // We can move from line-skipping mode to processing tokens only
    // due to #else or #endif.
    if (prepIsProcessingEnabled()) {
      assert((Kind == tgtok::Else || Kind == tgtok::Endif) &&
             "tokens processing was enabled by an unexpected preprocessing "
             "directive");

      return true;
    }
  } while (CurPtr != CurBuf.end());

  // We have reached the end of the file, but never left the lines-skipping
  // mode. This means there is no matching #endif.
  prepReportPreprocessorStackError();
  return false;
}
916
917StringRef TGLexer::prepLexMacroName() {
918 // Skip whitespaces between the preprocessing directive and the macro name.
919 while (*CurPtr == ' ' || *CurPtr == '\t')
920 ++CurPtr;
921
922 TokStart = CurPtr;
923 CurPtr = lexMacroName(Str: StringRef(CurPtr, CurBuf.end() - CurPtr));
924 return StringRef(TokStart, CurPtr - TokStart);
925}
926
// In lines-skipping mode, advance CurPtr to the first symbol of the next
// line that is not whitespace or inside a C-style comment. Returns false
// only if an unterminated C-style comment is found; true otherwise
// (including plain EOF, which the caller handles).
bool TGLexer::prepSkipLineBegin() {
  while (CurPtr != CurBuf.end()) {
    switch (*CurPtr) {
    case ' ':
    case '\t':
    case '\n':
    case '\r':
      // Whitespace: fall through to the increment below.
      break;

    case '/': {
      int NextChar = peekNextChar(Index: 1);
      if (NextChar == '*') {
        // Skip C-style comment.
        // Note that we do not care about skipping the C++-style comments.
        // If the line contains "//", it may not contain any processable
        // preprocessing directive. Just return CurPtr pointing to
        // the first '/' in this case. We also do not care about
        // incorrect symbols after the first '/' - we are in lines-skipping
        // mode, so incorrect code is allowed to some extent.

        // Set TokStart to the beginning of the comment to enable proper
        // diagnostic printing in case of error in SkipCComment().
        TokStart = CurPtr;

        // CurPtr must point to '*' before call to SkipCComment().
        ++CurPtr;
        if (SkipCComment())
          return false;
      } else {
        // CurPtr points to the non-whitespace '/'.
        return true;
      }

      // We must not increment CurPtr after the comment was lexed.
      continue;
    }

    default:
      // First meaningful character of the line found.
      return true;
    }

    ++CurPtr;
  }

  // We have reached the end of the file. Return to the lines skipping
  // code, and allow it to handle the EOF as needed.
  return true;
}
975
// After a preprocessing directive (and its macro name, if any), verify that
// only whitespace and comments remain on the line. Advances CurPtr to the
// line end (or buffer end) and returns true; returns false — leaving
// TokStart at the offending character — if any other token is found.
bool TGLexer::prepSkipDirectiveEnd() {
  while (CurPtr != CurBuf.end()) {
    switch (*CurPtr) {
    case ' ':
    case '\t':
      // Whitespace: fall through to the increment below.
      break;

    case '\n':
    case '\r':
      // End of the directive's line; CurPtr is left on the newline.
      return true;

    case '/': {
      int NextChar = peekNextChar(Index: 1);
      if (NextChar == '/') {
        // Skip C++-style comment.
        // We may just return true now, but let's skip to the line/buffer end
        // to simplify the method specification.
        ++CurPtr;
        SkipBCPLComment();
      } else if (NextChar == '*') {
        // When we are skipping C-style comment at the end of a preprocessing
        // directive, we can skip several lines. If any meaningful TD token
        // follows the end of the C-style comment on the same line, it will
        // be considered as an invalid usage of TD token.
        // For example, we want to forbid usages like this one:
        //     #define MACRO class Class {}
        // But with C-style comments we also disallow the following:
        //     #define MACRO /* This macro is used
        //                      to ... */ class Class {}
        // One can argue that this should be allowed, but it does not seem
        // to be worth of the complication. Moreover, this matches
        // the C preprocessor behavior.

        // Set TokStart to the beginning of the comment to enable proper
        // diagnostic printer in case of error in SkipCComment().
        TokStart = CurPtr;
        ++CurPtr;
        if (SkipCComment())
          return false;
      } else {
        // A lone '/' is not a comment and therefore not allowed here.
        TokStart = CurPtr;
        PrintError(Loc: CurPtr, Msg: "unexpected character");
        return false;
      }

      // We must not increment CurPtr after the comment was lexed.
      continue;
    }

    default:
      // Do not allow any non-whitespaces after the directive.
      TokStart = CurPtr;
      return false;
    }

    ++CurPtr;
  }

  return true;
}
1036
1037bool TGLexer::prepIsProcessingEnabled() {
1038 return all_of(Range&: PrepIncludeStack.back(),
1039 P: [](const PreprocessorControlDesc &I) { return I.IsDefined; });
1040}
1041
1042void TGLexer::prepReportPreprocessorStackError() {
1043 auto &PrepControl = PrepIncludeStack.back().back();
1044 PrintError(Loc: CurBuf.end(), Msg: "reached EOF without matching #endif");
1045 PrintError(ErrorLoc: PrepControl.SrcPos, Msg: "the latest preprocessor control is here");
1046
1047 TokStart = CurPtr;
1048}
1049