TGLexer.cpp source code [llvm_projects/llvm/lib/TableGen/TGLexer.cpp]

1	//===- TGLexer.cpp - Lexer for TableGen -----------------------------------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	// Implement the Lexer for TableGen.
10	//
11	//===----------------------------------------------------------------------===//
12
13	#include "TGLexer.h"
14	#include "llvm/ADT/ArrayRef.h"
15	#include "llvm/ADT/StringExtras.h"
16	#include "llvm/ADT/StringSwitch.h"
17	#include "llvm/ADT/Twine.h"
18	#include "llvm/Config/config.h" // for strtoull()/strtoll() define
19	#include "llvm/Support/Compiler.h"
20	#include "llvm/Support/MemoryBuffer.h"
21	#include "llvm/Support/SourceMgr.h"
22	#include "llvm/TableGen/Error.h"
23	#include <cerrno>
24	#include <cstdio>
25	#include <cstdlib>
26	#include <cstring>
27
28	using namespace llvm;
29
30	namespace {
31	// A list of supported preprocessing directives with their
32	// internal token kinds and names.
33	struct PreprocessorDir {
34	tgtok::TokKind Kind;
35	StringRef Word;
36	};
37	} // end anonymous namespace
38
39	/// Returns true if `C` is a valid character in an identifier. If `First` is
40	/// true, returns true if `C` is a valid first character of an identifier,
41	/// else returns true if `C` is a valid non-first character of an identifier.
42	/// Identifiers match the following regular expression:
43	/// [a-zA-Z_][0-9a-zA-Z_]*
44	static bool isValidIDChar(char C, bool First) {
45	if (C == `'_'` \|\| isAlpha(C))
46	return true;
47	return !First && isDigit(C);
48	}
49
50	constexpr PreprocessorDir PreprocessorDirs[] = {{.Kind: tgtok::Ifdef, .Word: "ifdef"},
51	{.Kind: tgtok::Ifndef, .Word: "ifndef"},
52	{.Kind: tgtok::Else, .Word: "else"},
53	{.Kind: tgtok::Endif, .Word: "endif"},
54	{.Kind: tgtok::Define, .Word: "define"}};
55
56	// Returns a pointer past the end of a valid macro name at the start of `Str`.
57	// Valid macro names match the regular expression [a-zA-Z_][0-9a-zA-Z_].*
58	static const char *lexMacroName(StringRef Str) {
59	assert(!Str.empty());
60
61	// Macro names start with [a-zA-Z_].
62	const char *Next = Str.begin();
63	if (!isValidIDChar(C: Next, /First=/*true))
64	return Next;
65	// Eat the first character of the name.
66	++Next;
67
68	// Match the rest of the identifier regex: [0-9a-zA-Z_]*
69	const char *End = Str.end();
70	while (Next != End && isValidIDChar(C: Next, /First=/*false))
71	++Next;
72	return Next;
73	}
74
75	TGLexer::TGLexer(SourceMgr &SM, ArrayRef<std::string> Macros) : SrcMgr(SM) {
76	CurBuffer = SrcMgr.getMainFileID();
77	CurBuf = SrcMgr.getMemoryBuffer(i: CurBuffer)->getBuffer();
78	CurPtr = CurBuf.begin();
79	TokStart = nullptr;
80
81	// Pretend that we enter the "top-level" include file.
82	PrepIncludeStack.emplace_back();
83
84	// Add all macros defined on the command line to the DefinedMacros set.
85	// Check invalid macro names and print fatal error if we find one.
86	for (StringRef MacroName : Macros) {
87	const char *End = lexMacroName(Str: MacroName);
88	if (End != MacroName.end())
89	PrintFatalError(Msg: "invalid macro name `" + MacroName +
90	"` specified on command line");
91
92	DefinedMacros.insert(key: MacroName);
93	}
94	}
95
96	SMLoc TGLexer::getLoc() const {
97	return SMLoc::getFromPointer(Ptr: TokStart);
98	}
99
100	SMRange TGLexer::getLocRange() const {
101	return {getLoc(), SMLoc::getFromPointer(Ptr: CurPtr)};
102	}
103
104	/// ReturnError - Set the error to the specified string at the specified
105	/// location. This is defined to always return tgtok::Error.
106	tgtok::TokKind TGLexer::ReturnError(SMLoc Loc, const Twine &Msg) {
107	PrintError(ErrorLoc: Loc, Msg);
108	return tgtok::Error;
109	}
110
111	tgtok::TokKind TGLexer::ReturnError(const char Loc, const* Twine &Msg) {
112	return ReturnError(Loc: SMLoc::getFromPointer(Ptr: Loc), Msg);
113	}
114
115	bool TGLexer::processEOF() {
116	SMLoc ParentIncludeLoc = SrcMgr.getParentIncludeLoc(i: CurBuffer);
117	if (ParentIncludeLoc != SMLoc ()) {
118	// If prepExitInclude() detects a problem with the preprocessing
119	// control stack, it will return false. Pretend that we reached
120	// the final EOF and stop lexing more tokens by returning false
121	// to LexToken().
122	if (!prepExitInclude(IncludeStackMustBeEmpty: false))
123	return false;
124
125	CurBuffer = SrcMgr.FindBufferContainingLoc(Loc: ParentIncludeLoc);
126	CurBuf = SrcMgr.getMemoryBuffer(i: CurBuffer)->getBuffer();
127	CurPtr = ParentIncludeLoc.getPointer();
128	// Make sure TokStart points into the parent file's buffer.
129	// LexToken() assigns to it before calling getNextChar(),
130	// so it is pointing into the included file now.
131	TokStart = CurPtr;
132	return true;
133	}
134
135	// Pretend that we exit the "top-level" include file.
136	// Note that in case of an error (e.g. control stack imbalance)
137	// the routine will issue a fatal error.
138	prepExitInclude(IncludeStackMustBeEmpty: true);
139	return false;
140	}
141
142	int TGLexer::getNextChar() {
143	char CurChar = *CurPtr++;
144	switch (CurChar) {
145	default:
146	return (unsigned char)CurChar;
147
148	case `0`: {
149	// A NUL character in the stream is either the end of the current buffer or
150	// a spurious NUL in the file. Disambiguate that here.
151	if (CurPtr - `1` == CurBuf.end()) {
152	--CurPtr; // Arrange for another call to return EOF again.
153	return EOF;
154	}
155	PrintError(ErrorLoc: getLoc(),
156	Msg: "NUL character is invalid in source; treated as space");
157	return `' '`;
158	}
159
160	case `'\n'`:
161	case `'\r'`:
162	// Handle the newline character by ignoring it and incrementing the line
163	// count. However, be careful about 'dos style' files with \n\r in them.
164	// Only treat a \n\r or \r\n as a single line.
165	if ((CurPtr == `'\n'` \|\| (CurPtr == `'\r'`)) &&
166	*CurPtr != CurChar)
167	++CurPtr; // Eat the two char newline sequence.
168	return `'\n'`;
169	}
170	}
171
172	int TGLexer::peekNextChar(int Index) const {
173	return *(CurPtr + Index);
174	}
175
176	tgtok::TokKind TGLexer::LexToken(bool FileOrLineStart) {
177	TokStart = CurPtr;
178	// This always consumes at least one character.
179	int CurChar = getNextChar();
180
181	switch (CurChar) {
182	default:
183	// Handle letters: [a-zA-Z_]
184	if (isValidIDChar(C: CurChar, /First=/true))
185	return LexIdentifier();
186
187	// Unknown character, emit an error.
188	return ReturnError(Loc: TokStart, Msg: "unexpected character");
189	case EOF:
190	// Lex next token, if we just left an include file.
191	// Note that leaving an include file means that the next
192	// symbol is located at the end of the 'include "..."'
193	// construct, so LexToken() is called with default
194	// false parameter.
195	if (processEOF())
196	return LexToken();
197
198	// Return EOF denoting the end of lexing.
199	return tgtok::Eof;
200
201	case `':'`: return tgtok::colon;
202	case `';'`: return tgtok::semi;
203	case `','`: return tgtok::comma;
204	case `'<'`: return tgtok::less;
205	case `'>'`: return tgtok::greater;
206	case `']'`: return tgtok::r_square;
207	case `'{'`: return tgtok::l_brace;
208	case `'}'`: return tgtok::r_brace;
209	case `'('`: return tgtok::l_paren;
210	case `')'`: return tgtok::r_paren;
211	case `'='`: return tgtok::equal;
212	case `'?'`: return tgtok::question;
213	case `'#'`:
214	if (FileOrLineStart) {
215	tgtok::TokKind Kind = prepIsDirective();
216	if (Kind != tgtok::Error)
217	return lexPreprocessor(Kind);
218	}
219
220	return tgtok::paste;
221
222	// The period is a separate case so we can recognize the "..."
223	// range punctuator.
224	case `'.'`:
225	if (peekNextChar(Index: `0`) == `'.'`) {
226	++CurPtr; // Eat second dot.
227	if (peekNextChar(Index: `0`) == `'.'`) {
228	++CurPtr; // Eat third dot.
229	return tgtok::dotdotdot;
230	}
231	return ReturnError(Loc: TokStart, Msg: "invalid '..' punctuation");
232	}
233	return tgtok::dot;
234
235	case `'\r'`:
236	llvm_unreachable("getNextChar() must never return '\r'");
237
238	case `' '`:
239	case `'\t'`:
240	// Ignore whitespace.
241	return LexToken(FileOrLineStart);
242	case `'\n'`:
243	// Ignore whitespace, and identify the new line.
244	return LexToken(FileOrLineStart: true);
245	case `'/'`:
246	// If this is the start of a // comment, skip until the end of the line or
247	// the end of the buffer.
248	if (*CurPtr == `'/'`)
249	SkipBCPLComment();
250	else if (CurPtr == `''`) {
251	if (SkipCComment())
252	return tgtok::Error;
253	} else // Otherwise, this is an error.
254	return ReturnError(Loc: TokStart, Msg: "unexpected character");
255	return LexToken(FileOrLineStart);
256	case `'-'`: case `'+'`:
257	case `'0'`: case `'1'`: case `'2'`: case `'3'`: case `'4'`: case `'5'`: case `'6'`:
258	case `'7'`: case `'8'`: case `'9'`: {
259	int NextChar = `0`;
260	if (isDigit(C: CurChar)) {
261	// Allow identifiers to start with a number if it is followed by
262	// an identifier. This can happen with paste operations like
263	// foo#8i.
264	int i = `0`;
265	do {
266	NextChar = peekNextChar(Index: i++);
267	} while (isDigit(C: NextChar));
268
269	if (NextChar == `'x'` \|\| NextChar == `'b'`) {
270	// If this is [0-9]b[01] or [0-9]x[0-9A-fa-f] this is most
271	// likely a number.
272	int NextNextChar = peekNextChar(Index: i);
273	switch (NextNextChar) {
274	default:
275	break;
276	case `'0'`: case `'1'`:
277	if (NextChar == `'b'`)
278	return LexNumber();
279	[[fallthrough]];
280	case `'2'`: case `'3'`: case `'4'`: case `'5'`:
281	case `'6'`: case `'7'`: case `'8'`: case `'9'`:
282	case `'a'`: case `'b'`: case `'c'`: case `'d'`: case `'e'`: case `'f'`:
283	case `'A'`: case `'B'`: case `'C'`: case `'D'`: case `'E'`: case `'F'`:
284	if (NextChar == `'x'`)
285	return LexNumber();
286	break;
287	}
288	}
289	}
290
291	if (isValidIDChar(C: NextChar, /First=/true))
292	return LexIdentifier();
293
294	return LexNumber();
295	}
296	case `'"'`: return LexString();
297	case `'$'`: return LexVarName();
298	case `'['`: return LexBracket();
299	case `'!'`: return LexExclaim();
300	}
301	}
302
303	/// LexString - Lex "[^"]"*
304	tgtok::TokKind TGLexer::LexString() {
305	const char *StrStart = CurPtr;
306
307	CurStrVal = "";
308
309	while (*CurPtr != `'"'`) {
310	// If we hit the end of the buffer, report an error.
311	if (*CurPtr == `0` && CurPtr == CurBuf.end())
312	return ReturnError(Loc: StrStart, Msg: "end of file in string literal");
313
314	if (CurPtr == `'\n'` \|\| CurPtr == `'\r'`)
315	return ReturnError(Loc: StrStart, Msg: "end of line in string literal");
316
317	if (*CurPtr != `'\\'`) {
318	CurStrVal += *CurPtr++;
319	continue;
320	}
321
322	++CurPtr;
323
324	switch (*CurPtr) {
325	case `'\\'`: case `'\''`: case `'"'`:
326	// These turn into their literal character.
327	CurStrVal += *CurPtr++;
328	break;
329	case `'t'`:
330	CurStrVal += `'\t'`;
331	++CurPtr;
332	break;
333	case `'n'`:
334	CurStrVal += `'\n'`;
335	++CurPtr;
336	break;
337
338	case `'\n'`:
339	case `'\r'`:
340	return ReturnError(Loc: CurPtr, Msg: "escaped newlines not supported in tblgen");
341
342	// If we hit the end of the buffer, report an error.
343	case `'\0'`:
344	if (CurPtr == CurBuf.end())
345	return ReturnError(Loc: StrStart, Msg: "end of file in string literal");
346	[[fallthrough]];
347	default:
348	return ReturnError(Loc: CurPtr, Msg: "invalid escape in string literal");
349	}
350	}
351
352	++CurPtr;
353	return tgtok::StrVal;
354	}
355
356	tgtok::TokKind TGLexer::LexVarName() {
357	if (!isValidIDChar(C: CurPtr[`0`], /First=/true))
358	return ReturnError(Loc: TokStart, Msg: "invalid variable name");
359
360	// Otherwise, we're ok, consume the rest of the characters.
361	const char *VarNameStart = CurPtr++;
362
363	while (isValidIDChar(C: CurPtr, /First=/*false))
364	++CurPtr;
365
366	CurStrVal.assign(first: VarNameStart, last: CurPtr);
367	return tgtok::VarName;
368	}
369
370	tgtok::TokKind TGLexer::LexIdentifier() {
371	// The first letter is [a-zA-Z_].
372	const char *IdentStart = TokStart;
373
374	// Match the rest of the identifier regex: [0-9a-zA-Z_]*
375	while (isValidIDChar(C: CurPtr, /First=/*false))
376	++CurPtr;
377
378	// Check to see if this identifier is a reserved keyword.
379	StringRef Str(IdentStart, CurPtr-IdentStart);
380
381	tgtok::TokKind Kind = StringSwitch<tgtok::TokKind>(Str)
382	.Case(S: "int", Value: tgtok::Int)
383	.Case(S: "bit", Value: tgtok::Bit)
384	.Case(S: "bits", Value: tgtok::Bits)
385	.Case(S: "string", Value: tgtok::String)
386	.Case(S: "list", Value: tgtok::List)
387	.Case(S: "code", Value: tgtok::Code)
388	.Case(S: "dag", Value: tgtok::Dag)
389	.Case(S: "class", Value: tgtok::Class)
390	.Case(S: "def", Value: tgtok::Def)
391	.Case(S: "true", Value: tgtok::TrueVal)
392	.Case(S: "false", Value: tgtok::FalseVal)
393	.Case(S: "foreach", Value: tgtok::Foreach)
394	.Case(S: "defm", Value: tgtok::Defm)
395	.Case(S: "defset", Value: tgtok::Defset)
396	.Case(S: "deftype", Value: tgtok::Deftype)
397	.Case(S: "multiclass", Value: tgtok::MultiClass)
398	.Case(S: "field", Value: tgtok::Field)
399	.Case(S: "let", Value: tgtok::Let)
400	.Case(S: "in", Value: tgtok::In)
401	.Case(S: "defvar", Value: tgtok::Defvar)
402	.Case(S: "include", Value: tgtok::Include)
403	.Case(S: "if", Value: tgtok::If)
404	.Case(S: "then", Value: tgtok::Then)
405	.Case(S: "else", Value: tgtok::ElseKW)
406	.Case(S: "assert", Value: tgtok::Assert)
407	.Case(S: "dump", Value: tgtok::Dump)
408	.Default(Value: tgtok::Id);
409
410	// A couple of tokens require special processing.
411	switch (Kind) {
412	case tgtok::Include:
413	if (LexInclude()) return tgtok::Error;
414	return Lex();
415	case tgtok::Id:
416	CurStrVal.assign(first: Str.begin(), last: Str.end());
417	break;
418	default:
419	break;
420	}
421
422	return Kind;
423	}
424
425	/// LexInclude - We just read the "include" token. Get the string token that
426	/// comes next and enter the include.
427	bool TGLexer::LexInclude() {
428	// The token after the include must be a string.
429	tgtok::TokKind Tok = LexToken();
430	if (Tok == tgtok::Error) return true;
431	if (Tok != tgtok::StrVal) {
432	PrintError(ErrorLoc: getLoc(), Msg: "expected filename after include");
433	return true;
434	}
435
436	// Get the string.
437	std::string Filename = CurStrVal;
438	std::string IncludedFile;
439
440	CurBuffer = SrcMgr.AddIncludeFile(Filename, IncludeLoc: SMLoc::getFromPointer(Ptr: CurPtr),
441	IncludedFile);
442	if (!CurBuffer) {
443	PrintError(ErrorLoc: getLoc(), Msg: "could not find include file '" + Filename + "'");
444	return true;
445	}
446
447	Dependencies.insert(x: IncludedFile);
448	// Save the line number and lex buffer of the includer.
449	CurBuf = SrcMgr.getMemoryBuffer(i: CurBuffer)->getBuffer();
450	CurPtr = CurBuf.begin();
451
452	PrepIncludeStack.emplace_back();
453	return false;
454	}
455
456	/// SkipBCPLComment - Skip over the comment by finding the next CR or LF.
457	/// Or we may end up at the end of the buffer.
458	void TGLexer::SkipBCPLComment() {
459	++CurPtr; // skip the second slash.
460	auto EOLPos = CurBuf.find_first_of(Chars: "\r\n", From: CurPtr - CurBuf.data());
461	CurPtr = (EOLPos == StringRef::npos) ? CurBuf.end() : CurBuf.data() + EOLPos;
462	}
463
464	/// SkipCComment - This skips C-style // comments. The only difference from C
465	/// is that we allow nesting.
466	bool TGLexer::SkipCComment() {
467	++CurPtr; // skip the star.
468	unsigned CommentDepth = `1`;
469
470	while (true) {
471	int CurChar = getNextChar();
472	switch (CurChar) {
473	case EOF:
474	PrintError(Loc: TokStart, Msg: "unterminated comment");
475	return true;
476	case `'*'`:
477	// End of the comment?
478	if (CurPtr[`0`] != `'/'`) break;
479
480	++CurPtr; // End the /.*
481	if (--CommentDepth == `0`)
482	return false;
483	break;
484	case `'/'`:
485	// Start of a nested comment?
486	if (CurPtr[`0`] != `''`) break*;
487	++CurPtr;
488	++CommentDepth;
489	break;
490	}
491	}
492	}
493
494	/// LexNumber - Lex:
495	/// [-+]?[0-9]+
496	/// 0x[0-9a-fA-F]+
497	/// 0b[01]+
498	tgtok::TokKind TGLexer::LexNumber() {
499	unsigned Base = `0`;
500	const char *NumStart;
501
502	// Check if it's a hex or a binary value.
503	if (CurPtr[-`1`] == `'0'`) {
504	NumStart = CurPtr + `1`;
505	if (CurPtr[`0`] == `'x'`) {
506	Base = `16`;
507	do
508	++CurPtr;
509	while (isHexDigit(C: CurPtr[`0`]));
510	} else if (CurPtr[`0`] == `'b'`) {
511	Base = `2`;
512	do
513	++CurPtr;
514	while (CurPtr[`0`] == `'0'` \|\| CurPtr[`0`] == `'1'`);
515	}
516	}
517
518	// For a hex or binary value, we always convert it to an unsigned value.
519	bool IsMinus = false;
520
521	// Check if it's a decimal value.
522	if (Base == `0`) {
523	// Check for a sign without a digit.
524	if (!isDigit(C: CurPtr[`0`])) {
525	if (CurPtr[-`1`] == `'-'`)
526	return tgtok::minus;
527	else if (CurPtr[-`1`] == `'+'`)
528	return tgtok::plus;
529	}
530
531	Base = `10`;
532	NumStart = TokStart;
533	IsMinus = CurPtr[-`1`] == `'-'`;
534
535	while (isDigit(C: CurPtr[`0`]))
536	++CurPtr;
537	}
538
539	// Requires at least one digit.
540	if (CurPtr == NumStart)
541	return ReturnError(Loc: TokStart, Msg: "invalid number");
542
543	errno = `0`;
544	if (IsMinus)
545	CurIntVal = strtoll(nptr: NumStart, endptr: nullptr, base: Base);
546	else
547	CurIntVal = strtoull(nptr: NumStart, endptr: nullptr, base: Base);
548
549	if (errno == EINVAL)
550	return ReturnError(Loc: TokStart, Msg: "invalid number");
551	if (errno == ERANGE)
552	return ReturnError(Loc: TokStart, Msg: "number out of range");
553
554	return Base == `2` ? tgtok::BinaryIntVal : tgtok::IntVal;
555	}
556
557	/// LexBracket - We just read '['. If this is a code block, return it,
558	/// otherwise return the bracket. Match: '[' and '[{ ( [^}]+ \| }[^]] ) }]'*
559	tgtok::TokKind TGLexer::LexBracket() {
560	if (CurPtr[`0`] != `'{'`)
561	return tgtok::l_square;
562	++CurPtr;
563	const char *CodeStart = CurPtr;
564	while (true) {
565	int Char = getNextChar();
566	if (Char == EOF) break;
567
568	if (Char != `'}'`) continue;
569
570	Char = getNextChar();
571	if (Char == EOF) break;
572	if (Char == `']'`) {
573	CurStrVal.assign(first: CodeStart, last: CurPtr-`2`);
574	return tgtok::CodeFragment;
575	}
576	}
577
578	return ReturnError(Loc: CodeStart - `2`, Msg: "unterminated code block");
579	}
580
581	/// LexExclaim - Lex '!' and '![a-zA-Z]+'.
582	tgtok::TokKind TGLexer::LexExclaim() {
583	if (!isAlpha(C: *CurPtr))
584	return ReturnError(Loc: CurPtr - `1`, Msg: "invalid \"!operator\"");
585
586	const char *Start = CurPtr++;
587	while (isAlpha(C: *CurPtr))
588	++CurPtr;
589
590	// Check to see which operator this is.
591	tgtok::TokKind Kind =
592	StringSwitch<tgtok::TokKind>(StringRef (Start, CurPtr - Start))
593	.Case(S: "eq", Value: tgtok::XEq)
594	.Case(S: "ne", Value: tgtok::XNe)
595	.Case(S: "le", Value: tgtok::XLe)
596	.Case(S: "lt", Value: tgtok::XLt)
597	.Case(S: "ge", Value: tgtok::XGe)
598	.Case(S: "gt", Value: tgtok::XGt)
599	.Case(S: "if", Value: tgtok::XIf)
600	.Case(S: "cond", Value: tgtok::XCond)
601	.Case(S: "isa", Value: tgtok::XIsA)
602	.Case(S: "head", Value: tgtok::XHead)
603	.Case(S: "tail", Value: tgtok::XTail)
604	.Case(S: "size", Value: tgtok::XSize)
605	.Case(S: "con", Value: tgtok::XConcat)
606	.Case(S: "dag", Value: tgtok::XDag)
607	.Case(S: "add", Value: tgtok::XADD)
608	.Case(S: "sub", Value: tgtok::XSUB)
609	.Case(S: "mul", Value: tgtok::XMUL)
610	.Case(S: "div", Value: tgtok::XDIV)
611	.Case(S: "not", Value: tgtok::XNOT)
612	.Case(S: "logtwo", Value: tgtok::XLOG2)
613	.Case(S: "and", Value: tgtok::XAND)
614	.Case(S: "or", Value: tgtok::XOR)
615	.Case(S: "xor", Value: tgtok::XXOR)
616	.Case(S: "shl", Value: tgtok::XSHL)
617	.Case(S: "sra", Value: tgtok::XSRA)
618	.Case(S: "srl", Value: tgtok::XSRL)
619	.Case(S: "cast", Value: tgtok::XCast)
620	.Case(S: "empty", Value: tgtok::XEmpty)
621	.Case(S: "subst", Value: tgtok::XSubst)
622	.Case(S: "foldl", Value: tgtok::XFoldl)
623	.Case(S: "foreach", Value: tgtok::XForEach)
624	.Case(S: "filter", Value: tgtok::XFilter)
625	.Case(S: "listconcat", Value: tgtok::XListConcat)
626	.Case(S: "listflatten", Value: tgtok::XListFlatten)
627	.Case(S: "listsplat", Value: tgtok::XListSplat)
628	.Case(S: "listremove", Value: tgtok::XListRemove)
629	.Case(S: "range", Value: tgtok::XRange)
630	.Case(S: "strconcat", Value: tgtok::XStrConcat)
631	.Case(S: "initialized", Value: tgtok::XInitialized)
632	.Case(S: "interleave", Value: tgtok::XInterleave)
633	.Case(S: "instances", Value: tgtok::XInstances)
634	.Case(S: "substr", Value: tgtok::XSubstr)
635	.Case(S: "find", Value: tgtok::XFind)
636	.Cases(S0: "setdagop", S1: "setop", Value: tgtok::XSetDagOp) // !setop is deprecated.
637	.Cases(S0: "getdagop", S1: "getop", Value: tgtok::XGetDagOp) // !getop is deprecated.
638	.Case(S: "getdagarg", Value: tgtok::XGetDagArg)
639	.Case(S: "getdagname", Value: tgtok::XGetDagName)
640	.Case(S: "setdagarg", Value: tgtok::XSetDagArg)
641	.Case(S: "setdagname", Value: tgtok::XSetDagName)
642	.Case(S: "exists", Value: tgtok::XExists)
643	.Case(S: "tolower", Value: tgtok::XToLower)
644	.Case(S: "toupper", Value: tgtok::XToUpper)
645	.Case(S: "repr", Value: tgtok::XRepr)
646	.Case(S: "match", Value: tgtok::XMatch)
647	.Default(Value: tgtok::Error);
648
649	return Kind != tgtok::Error ? Kind
650	: ReturnError(Loc: Start - `1`, Msg: "unknown operator");
651	}
652
653	bool TGLexer::prepExitInclude(bool IncludeStackMustBeEmpty) {
654	// Report an error, if preprocessor control stack for the current
655	// file is not empty.
656	if (!PrepIncludeStack.back().empty()) {
657	prepReportPreprocessorStackError();
658
659	return false;
660	}
661
662	// Pop the preprocessing controls from the include stack.
663	PrepIncludeStack.pop_back();
664
665	if (IncludeStackMustBeEmpty) {
666	assert(PrepIncludeStack.empty() &&
667	"preprocessor include stack is not empty");
668	} else {
669	assert(!PrepIncludeStack.empty() && "preprocessor include stack is empty");
670	}
671
672	return true;
673	}
674
675	tgtok::TokKind TGLexer::prepIsDirective() const {
676	for (const auto [Kind, Word] : PreprocessorDirs) {
677	if (StringRef (CurPtr, Word.size()) != Word)
678	continue;
679	int NextChar = peekNextChar(Index: Word.size());
680
681	// Check for whitespace after the directive. If there is no whitespace,
682	// then we do not recognize it as a preprocessing directive.
683
684	// New line and EOF may follow only #else/#endif. It will be reported
685	// as an error for #ifdef/#define after the call to prepLexMacroName().
686	if (NextChar == `' '` \|\| NextChar == `'\t'` \|\| NextChar == EOF \|\|
687	NextChar == `'\n'` \|\|
688	// It looks like TableGen does not support '\r' as the actual
689	// carriage return, e.g. getNextChar() treats a single '\r'
690	// as '\n'. So we do the same here.
691	NextChar == `'\r'`)
692	return Kind;
693
694	// Allow comments after some directives, e.g.:
695	// #else// OR #else//
696	// #endif// OR #endif//
697	//
698	// Note that we do allow comments after #ifdef/#define here, e.g.
699	// #ifdef// AND #ifdef//
700	// #define// AND #define//
701	//
702	// These cases will be reported as incorrect after calling
703	// prepLexMacroName(). We could have supported C-style comments
704	// after #ifdef/#define, but this would complicate the code
705	// for little benefit.
706	if (NextChar == `'/'`) {
707	NextChar = peekNextChar(Index: Word.size() + `1`);
708
709	if (NextChar == `'*'` \|\| NextChar == `'/'`)
710	return Kind;
711
712	// Pretend that we do not recognize the directive.
713	}
714	}
715
716	return tgtok::Error;
717	}
718
719	void TGLexer::prepEatPreprocessorDirective(tgtok::TokKind Kind) {
720	TokStart = CurPtr;
721
722	for (const auto [PKind, PWord] : PreprocessorDirs) {
723	if (PKind == Kind) {
724	// Advance CurPtr to the end of the preprocessing word.
725	CurPtr += PWord.size();
726	return;
727	}
728	}
729
730	llvm_unreachable(
731	"unsupported preprocessing token in prepEatPreprocessorDirective()");
732	}
733
734	tgtok::TokKind TGLexer::lexPreprocessor(tgtok::TokKind Kind,
735	bool ReturnNextLiveToken) {
736	// We must be looking at a preprocessing directive. Eat it!
737	prepEatPreprocessorDirective(Kind);
738
739	if (Kind == tgtok::Ifdef \|\| Kind == tgtok::Ifndef) {
740	StringRef MacroName = prepLexMacroName();
741	StringRef IfTokName = Kind == tgtok::Ifdef ? "#ifdef" : "#ifndef";
742	if (MacroName.empty())
743	return ReturnError(Loc: TokStart, Msg: "expected macro name after " + IfTokName);
744
745	bool MacroIsDefined = DefinedMacros.count(Key: MacroName) != `0`;
746
747	// Canonicalize ifndef's MacroIsDefined to its ifdef equivalent.
748	if (Kind == tgtok::Ifndef)
749	MacroIsDefined = !MacroIsDefined;
750
751	// Regardless of whether we are processing tokens or not,
752	// we put the #ifdef control on stack.
753	// Note that MacroIsDefined has been canonicalized against ifdef.
754	PrepIncludeStack.back().push_back(
755	Elt: {.Kind: tgtok::Ifdef, .IsDefined: MacroIsDefined, .SrcPos: SMLoc::getFromPointer(Ptr: TokStart)});
756
757	if (!prepSkipDirectiveEnd())
758	return ReturnError(Loc: CurPtr, Msg: "only comments are supported after " +
759	IfTokName + " NAME");
760
761	// If we were not processing tokens before this #ifdef,
762	// then just return back to the lines skipping code.
763	if (!ReturnNextLiveToken)
764	return Kind;
765
766	// If we were processing tokens before this #ifdef,
767	// and the macro is defined, then just return the next token.
768	if (MacroIsDefined)
769	return LexToken();
770
771	// We were processing tokens before this #ifdef, and the macro
772	// is not defined, so we have to start skipping the lines.
773	// If the skipping is successful, it will return the token following
774	// either #else or #endif corresponding to this #ifdef.
775	if (prepSkipRegion(MustNeverBeFalse: ReturnNextLiveToken))
776	return LexToken();
777
778	return tgtok::Error;
779	} else if (Kind == tgtok::Else) {
780	// Check if this #else is correct before calling prepSkipDirectiveEnd(),
781	// which will move CurPtr away from the beginning of #else.
782	if (PrepIncludeStack.back().empty())
783	return ReturnError(Loc: TokStart, Msg: "#else without #ifdef or #ifndef");
784
785	PreprocessorControlDesc IfdefEntry = PrepIncludeStack.back().back();
786
787	if (IfdefEntry.Kind != tgtok::Ifdef) {
788	PrintError(Loc: TokStart, Msg: "double #else");
789	return ReturnError(Loc: IfdefEntry.SrcPos, Msg: "previous #else is here");
790	}
791
792	// Replace the corresponding #ifdef's control with its negation
793	// on the control stack.
794	PrepIncludeStack.back().back() = {.Kind: Kind, .IsDefined: !IfdefEntry.IsDefined,
795	.SrcPos: SMLoc::getFromPointer(Ptr: TokStart)};
796
797	if (!prepSkipDirectiveEnd())
798	return ReturnError(Loc: CurPtr, Msg: "only comments are supported after #else");
799
800	// If we were processing tokens before this #else,
801	// we have to start skipping lines until the matching #endif.
802	if (ReturnNextLiveToken) {
803	if (prepSkipRegion(MustNeverBeFalse: ReturnNextLiveToken))
804	return LexToken();
805
806	return tgtok::Error;
807	}
808
809	// Return to the lines skipping code.
810	return Kind;
811	} else if (Kind == tgtok::Endif) {
812	// Check if this #endif is correct before calling prepSkipDirectiveEnd(),
813	// which will move CurPtr away from the beginning of #endif.
814	if (PrepIncludeStack.back().empty())
815	return ReturnError(Loc: TokStart, Msg: "#endif without #ifdef");
816
817	[[maybe_unused]] auto &IfdefOrElseEntry = PrepIncludeStack.back().back();
818
819	assert((IfdefOrElseEntry.Kind == tgtok::Ifdef \|\|
820	IfdefOrElseEntry.Kind == tgtok::Else) &&
821	"invalid preprocessor control on the stack");
822
823	if (!prepSkipDirectiveEnd())
824	return ReturnError(Loc: CurPtr, Msg: "only comments are supported after #endif");
825
826	PrepIncludeStack.back().pop_back();
827
828	// If we were processing tokens before this #endif, then
829	// we should continue it.
830	if (ReturnNextLiveToken) {
831	return LexToken();
832	}
833
834	// Return to the lines skipping code.
835	return Kind;
836	} else if (Kind == tgtok::Define) {
837	StringRef MacroName = prepLexMacroName();
838	if (MacroName.empty())
839	return ReturnError(Loc: TokStart, Msg: "expected macro name after #define");
840
841	if (!DefinedMacros.insert(key: MacroName).second)
842	PrintWarning(WarningLoc: getLoc(),
843	Msg: "duplicate definition of macro: " + Twine (MacroName));
844
845	if (!prepSkipDirectiveEnd())
846	return ReturnError(Loc: CurPtr,
847	Msg: "only comments are supported after #define NAME");
848
849	assert(ReturnNextLiveToken &&
850	"#define must be ignored during the lines skipping");
851
852	return LexToken();
853	}
854
855	llvm_unreachable("preprocessing directive is not supported");
856	}
857
858	bool TGLexer::prepSkipRegion(bool MustNeverBeFalse) {
859	assert(MustNeverBeFalse && "invalid recursion.");
860
861	do {
862	// Skip all symbols to the line end.
863	while (*CurPtr != `'\n'`)
864	++CurPtr;
865
866	// Find the first non-whitespace symbol in the next line(s).
867	if (!prepSkipLineBegin())
868	return false;
869
870	// If the first non-blank/comment symbol on the line is '#',
871	// it may be a start of preprocessing directive.
872	//
873	// If it is not '#' just go to the next line.
874	if (*CurPtr == `'#'`)
875	++CurPtr;
876	else
877	continue;
878
879	tgtok::TokKind Kind = prepIsDirective();
880
881	// If we did not find a preprocessing directive or it is #define,
882	// then just skip to the next line. We do not have to do anything
883	// for #define in the line-skipping mode.
884	if (Kind == tgtok::Error \|\| Kind == tgtok::Define)
885	continue;
886
887	tgtok::TokKind ProcessedKind = lexPreprocessor(Kind, ReturnNextLiveToken: false);
888
889	// If lexPreprocessor() encountered an error during lexing this
890	// preprocessor idiom, then return false to the calling lexPreprocessor().
891	// This will force tgtok::Error to be returned to the tokens processing.
892	if (ProcessedKind == tgtok::Error)
893	return false;
894
895	assert(Kind == ProcessedKind && "prepIsDirective() and lexPreprocessor() "
896	"returned different token kinds");
897
898	// If this preprocessing directive enables tokens processing,
899	// then return to the lexPreprocessor() and get to the next token.
900	// We can move from line-skipping mode to processing tokens only
901	// due to #else or #endif.
902	if (prepIsProcessingEnabled()) {
903	assert((Kind == tgtok::Else \|\| Kind == tgtok::Endif) &&
904	"tokens processing was enabled by an unexpected preprocessing "
905	"directive");
906
907	return true;
908	}
909	} while (CurPtr != CurBuf.end());
910
911	// We have reached the end of the file, but never left the lines-skipping
912	// mode. This means there is no matching #endif.
913	prepReportPreprocessorStackError();
914	return false;
915	}
916
917	StringRef TGLexer::prepLexMacroName() {
918	// Skip whitespaces between the preprocessing directive and the macro name.
919	while (CurPtr == `' '` \|\| CurPtr == `'\t'`)
920	++CurPtr;
921
922	TokStart = CurPtr;
923	CurPtr = lexMacroName(Str: StringRef (CurPtr, CurBuf.end() - CurPtr));
924	return StringRef (TokStart, CurPtr - TokStart);
925	}
926
927	bool TGLexer::prepSkipLineBegin() {
928	while (CurPtr != CurBuf.end()) {
929	switch (*CurPtr) {
930	case `' '`:
931	case `'\t'`:
932	case `'\n'`:
933	case `'\r'`:
934	break;
935
936	case `'/'`: {
937	int NextChar = peekNextChar(Index: `1`);
938	if (NextChar == `'*'`) {
939	// Skip C-style comment.
940	// Note that we do not care about skipping the C++-style comments.
941	// If the line contains "//", it may not contain any processable
942	// preprocessing directive. Just return CurPtr pointing to
943	// the first '/' in this case. We also do not care about
944	// incorrect symbols after the first '/' - we are in lines-skipping
945	// mode, so incorrect code is allowed to some extent.
946
947	// Set TokStart to the beginning of the comment to enable proper
948	// diagnostic printing in case of error in SkipCComment().
949	TokStart = CurPtr;
950
951	// CurPtr must point to '' before call to SkipCComment().*
952	++CurPtr;
953	if (SkipCComment())
954	return false;
955	} else {
956	// CurPtr points to the non-whitespace '/'.
957	return true;
958	}
959
960	// We must not increment CurPtr after the comment was lexed.
961	continue;
962	}
963
964	default:
965	return true;
966	}
967
968	++CurPtr;
969	}
970
971	// We have reached the end of the file. Return to the lines skipping
972	// code, and allow it to handle the EOF as needed.
973	return true;
974	}
975
976	bool TGLexer::prepSkipDirectiveEnd() {
977	while (CurPtr != CurBuf.end()) {
978	switch (*CurPtr) {
979	case `' '`:
980	case `'\t'`:
981	break;
982
983	case `'\n'`:
984	case `'\r'`:
985	return true;
986
987	case `'/'`: {
988	int NextChar = peekNextChar(Index: `1`);
989	if (NextChar == `'/'`) {
990	// Skip C++-style comment.
991	// We may just return true now, but let's skip to the line/buffer end
992	// to simplify the method specification.
993	++CurPtr;
994	SkipBCPLComment();
995	} else if (NextChar == `'*'`) {
996	// When we are skipping C-style comment at the end of a preprocessing
997	// directive, we can skip several lines. If any meaningful TD token
998	// follows the end of the C-style comment on the same line, it will
999	// be considered as an invalid usage of TD token.
1000	// For example, we want to forbid usages like this one:
1001	// #define MACRO class Class {}
1002	// But with C-style comments we also disallow the following:
1003	// #define MACRO / This macro is used*
1004	// to ... / class Class {}*
1005	// One can argue that this should be allowed, but it does not seem
1006	// to be worth of the complication. Moreover, this matches
1007	// the C preprocessor behavior.
1008
1009	// Set TokStart to the beginning of the comment to enable proper
1010	// diagnostic printer in case of error in SkipCComment().
1011	TokStart = CurPtr;
1012	++CurPtr;
1013	if (SkipCComment())
1014	return false;
1015	} else {
1016	TokStart = CurPtr;
1017	PrintError(Loc: CurPtr, Msg: "unexpected character");
1018	return false;
1019	}
1020
1021	// We must not increment CurPtr after the comment was lexed.
1022	continue;
1023	}
1024
1025	default:
1026	// Do not allow any non-whitespaces after the directive.
1027	TokStart = CurPtr;
1028	return false;
1029	}
1030
1031	++CurPtr;
1032	}
1033
1034	return true;
1035	}
1036
1037	bool TGLexer::prepIsProcessingEnabled() {
1038	return all_of(Range&: PrepIncludeStack.back(),
1039	P: [](const PreprocessorControlDesc &I) { return I.IsDefined; });
1040	}
1041
1042	void TGLexer::prepReportPreprocessorStackError() {
1043	auto &PrepControl = PrepIncludeStack.back().back();
1044	PrintError(Loc: CurBuf.end(), Msg: "reached EOF without matching #endif");
1045	PrintError(ErrorLoc: PrepControl.SrcPos, Msg: "the latest preprocessor control is here");
1046
1047	TokStart = CurPtr;
1048	}
1049

Browse the source code of llvm_projects/llvm/lib/TableGen/TGLexer.cpp