DependencyDirectivesScanner.cpp source code [llvm_projects/clang/lib/Lex/DependencyDirectivesScanner.cpp]

1	//===- DependencyDirectivesScanner.cpp ------------------------------------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	///
9	/// \file
10	/// This is the interface for scanning header and source files to get the
11	/// minimum necessary preprocessor directives for evaluating includes. It
12	/// reduces the source down to #define, #include, #import, @import, and any
13	/// conditional preprocessor logic that contains one of those.
14	///
15	//===----------------------------------------------------------------------===//
16
17	#include "clang/Lex/DependencyDirectivesScanner.h"
18	#include "clang/Basic/CharInfo.h"
19	#include "clang/Basic/Diagnostic.h"
20	#include "clang/Lex/LexDiagnostic.h"
21	#include "clang/Lex/Lexer.h"
22	#include "clang/Lex/Pragma.h"
23	#include "llvm/ADT/ScopeExit.h"
24	#include "llvm/ADT/SmallString.h"
25	#include "llvm/ADT/StringMap.h"
26	#include "llvm/ADT/StringSwitch.h"
27	#include <optional>
28
29	using namespace clang;
30	using namespace clang::dependency_directives_scan;
31	using namespace llvm;
32
33	namespace {
34
35	struct DirectiveWithTokens {
36	DirectiveKind Kind;
37	unsigned NumTokens;
38
39	DirectiveWithTokens(DirectiveKind Kind, unsigned NumTokens)
40	: Kind(Kind), NumTokens(NumTokens) {}
41	};
42
43	/// Does an efficient "scan" of the sources to detect the presence of
44	/// preprocessor (or module import) directives and collects the raw lexed tokens
45	/// for those directives so that the \p Lexer can "replay" them when the file is
46	/// included.
47	///
48	/// Note that the behavior of the raw lexer is affected by the language mode,
49	/// while at this point we want to do a scan and collect tokens once,
50	/// irrespective of the language mode that the file will get included in. To
51	/// compensate for that the \p Lexer, while "replaying", will adjust a token
52	/// where appropriate, when it could affect the preprocessor's state.
53	/// For example in a directive like
54	///
55	/// \code
56	/// #if __has_cpp_attribute(clang::fallthrough)
57	/// \endcode
58	///
59	/// The preprocessor needs to see '::' as 'tok::coloncolon' instead of 2
60	/// 'tok::colon'. The \p Lexer will adjust if it sees consecutive 'tok::colon'
61	/// while in C++ mode.
62	struct Scanner {
63	Scanner(StringRef Input,
64	SmallVectorImpl<dependency_directives_scan::Token> &Tokens,
65	DiagnosticsEngine *Diags, SourceLocation InputSourceLoc)
66	: Input (Input), Tokens(Tokens), Diags(Diags),
67	InputSourceLoc (InputSourceLoc), LangOpts(getLangOptsForDepScanning()),
68	TheLexer (InputSourceLoc, LangOpts, Input.begin(), Input.begin(),
69	Input.end()) {}
70
71	static LangOptions getLangOptsForDepScanning() {
72	LangOptions LangOpts;
73	// Set the lexer to use 'tok::at' for '@', instead of 'tok::unknown'.
74	LangOpts.ObjC = true;
75	LangOpts.LineComment = true;
76	LangOpts.RawStringLiterals = true;
77	LangOpts.AllowLiteralDigitSeparator = true;
78	// FIXME: we do not enable C11 or C++11, so we are missing u/u8/U"".
79	return LangOpts;
80	}
81
82	/// Lex the provided source and emit the directive tokens.
83	///
84	/// \returns True on error.
85	bool scan(SmallVectorImpl<Directive> &Directives);
86
87	friend bool clang::scanInputForCXX20ModulesUsage(StringRef Source);
88	friend bool clang::isPreprocessedModuleFile(StringRef Source);
89
90	private:
91	/// Lexes next token and advances \p First and the \p Lexer.
92	[[nodiscard]] dependency_directives_scan::Token &
93	lexToken(const char &First, const* char *const End);
94
95	[[nodiscard]] dependency_directives_scan::Token &
96	lexIncludeFilename(const char &First, const* char *const End);
97
98	void skipLine(const char &First, const* char *const End);
99	void skipDirective(StringRef Name, const char &First, const* char *const End);
100
101	/// Returns the spelling of a string literal or identifier after performing
102	/// any processing needed to handle \c clang::Token::NeedsCleaning.
103	StringRef cleanStringIfNeeded(const dependency_directives_scan::Token &Tok);
104
105	/// Lexes next token and if it is identifier returns its string, otherwise
106	/// it skips the current line and returns \p std::nullopt.
107	///
108	/// In any case (whatever the token kind) \p First and the \p Lexer will
109	/// advance beyond the token.
110	[[nodiscard]] std::optional<StringRef>
111	tryLexIdentifierOrSkipLine(const char &First, const* char *const End);
112
113	/// Used when it is certain that next token is an identifier.
114	[[nodiscard]] StringRef lexIdentifier(const char *&First,
115	const char *const End);
116
117	/// Lexes next token and returns true iff it is an identifier that matches \p
118	/// Id, otherwise it skips the current line and returns false.
119	///
120	/// In any case (whatever the token kind) \p First and the \p Lexer will
121	/// advance beyond the token.
122	[[nodiscard]] bool isNextIdentifierOrSkipLine(StringRef Id,
123	const char *&First,
124	const char *const End);
125
126	/// Lexes next token and returns true iff it matches the kind \p K.
127	/// Otherwise it skips the current line and returns false.
128	///
129	/// In any case (whatever the token kind) \p First and the \p Lexer will
130	/// advance beyond the token.
131	[[nodiscard]] bool isNextTokenOrSkipLine(tok::TokenKind K, const char *&First,
132	const char *const End);
133
134	/// Lexes next token and if it is string literal, returns its string.
135	/// Otherwise, it skips the current line and returns \p std::nullopt.
136	///
137	/// In any case (whatever the token kind) \p First and the \p Lexer will
138	/// advance beyond the token.
139	[[nodiscard]] std::optional<StringRef>
140	tryLexStringLiteralOrSkipLine(const char &First, const* char *const End);
141
142	[[nodiscard]] bool scanImpl(const char First, const* char *const End);
143	[[nodiscard]] bool lexPPLine(const char &First, const* char *const End);
144	[[nodiscard]] bool lexAt(const char &First, const* char *const End);
145	[[nodiscard]] bool lexModule(const char &First, const* char *const End);
146	[[nodiscard]] bool lexDefine(const char HashLoc, const* char *&First,
147	const char *const End);
148	[[nodiscard]] bool lexPragma(const char &First, const* char *const End);
149	[[nodiscard]] bool lex_Pragma(const char &First, const* char *const End);
150	[[nodiscard]] bool lexEndif(const char &First, const* char *const End);
151	[[nodiscard]] bool lexDefault(DirectiveKind Kind, const char *&First,
152	const char *const End);
153	[[nodiscard]] bool lexModuleDirectiveBody(DirectiveKind Kind,
154	const char *&First,
155	const char *const End);
156	void lexPPDirectiveBody(const char &First, const* char *const End);
157
158	DirectiveWithTokens &pushDirective(DirectiveKind Kind) {
159	Tokens.append(RHS: CurDirToks);
160	DirsWithToks.emplace_back(Args&: Kind, Args: CurDirToks.size());
161	CurDirToks.clear();
162	return DirsWithToks.back();
163	}
164	void popDirective() {
165	Tokens.pop_back_n(NumItems: DirsWithToks.pop_back_val().NumTokens);
166	}
167	DirectiveKind topDirective() const {
168	return DirsWithToks.empty() ? pp_none : DirsWithToks.back().Kind;
169	}
170
171	unsigned getOffsetAt(const char CurPtr) const* {
172	return CurPtr - Input.data();
173	}
174
175	/// Reports a diagnostic if the diagnostic engine is provided. Always returns
176	/// true at the end.
177	bool reportError(const char CurPtr, unsigned* Err);
178
179	bool ScanningPreprocessedModuleFile = false;
180	StringMap<char> SplitIds;
181	StringRef Input;
182	SmallVectorImpl<dependency_directives_scan::Token> &Tokens;
183	DiagnosticsEngine *Diags;
184	SourceLocation InputSourceLoc;
185
186	const char LastTokenPtr = nullptr*;
187	/// Keeps track of the tokens for the currently lexed directive. Once a
188	/// directive is fully lexed and "committed" then the tokens get appended to
189	/// \p Tokens and \p CurDirToks is cleared for the next directive.
190	SmallVector<dependency_directives_scan::Token, `32`> CurDirToks;
191	/// The directives that were lexed along with the number of tokens that each
192	/// directive contains. The tokens of all the directives are kept in \p Tokens
193	/// vector, in the same order as the directives order in \p DirsWithToks.
194	SmallVector<DirectiveWithTokens, `64`> DirsWithToks;
195	LangOptions LangOpts;
196	Lexer TheLexer;
197	};
198
199	} // end anonymous namespace
200
201	bool Scanner::reportError(const char CurPtr, unsigned* Err) {
202	if (!Diags)
203	return true;
204	assert(CurPtr >= Input.data() && "invalid buffer ptr");
205	Diags->Report(Loc: InputSourceLoc.getLocWithOffset(Offset: getOffsetAt(CurPtr)), DiagID: Err);
206	return true;
207	}
208
209	static void skipOverSpaces(const char &First, const* char *const End) {
210	while (First != End && isHorizontalWhitespace(c: *First))
211	++First;
212	}
213
214	// Move back by one character, skipping escaped newlines (backslash + \n)
215	static char previousChar(const char First, const* char *&Current) {
216	assert(Current > First);
217	--Current;
218	while (Current > First && isVerticalWhitespace(c: *Current)) {
219	// Check if the previous character is a backslash
220	if (Current > First && *(Current - `1`) == `'\\'`) {
221	// Use Lexer's getEscapedNewLineSize to get the size of the escaped
222	// newline
223	unsigned EscapeSize = Lexer::getEscapedNewLineSize(P: Current);
224	if (EscapeSize > `0`) {
225	// Skip back over the entire escaped newline sequence (backslash +
226	// newline)
227	Current -= (`1` + EscapeSize);
228	} else {
229	break;
230	}
231	} else {
232	break;
233	}
234	}
235	return *Current;
236	}
237
238	[[nodiscard]] static bool isRawStringLiteral(const char *First,
239	const char *Current) {
240	assert(First <= Current);
241
242	// Check if we can even back up.
243	if (*Current != `'"'` \|\| First == Current)
244	return false;
245
246	// Check for an "R".
247	if (previousChar(First, Current) != `'R'`)
248	return false;
249	if (First == Current \|\|
250	!isAsciiIdentifierContinue(c: previousChar(First, Current)))
251	return true;
252
253	// Check for a prefix of "u", "U", or "L".
254	if (Current == `'u'` \|\| Current == `'U'` \|\| *Current == `'L'`)
255	return First == Current \|\|
256	!isAsciiIdentifierContinue(c: previousChar(First, Current));
257
258	// Check for a prefix of "u8".
259	if (*Current != `'8'` \|\| First == Current \|\|
260	previousChar(First, Current) != `'u'`)
261	return false;
262	return First == Current \|\|
263	!isAsciiIdentifierContinue(c: previousChar(First, Current));
264	}
265
266	static void skipRawString(const char &First, const* char *const End) {
267	assert(First[`0`] == `'"'`);
268
269	const char *Last = ++First;
270	while (Last != End && *Last != `'('`)
271	++Last;
272	if (Last == End) {
273	First = Last; // Hit the end... just give up.
274	return;
275	}
276
277	StringRef Terminator(First, Last - First);
278	for (;;) {
279	// Move First to just past the next ")".
280	First = Last;
281	while (First != End && *First != `')'`)
282	++First;
283	if (First == End)
284	return;
285	++First;
286
287	// Look ahead for the terminator sequence.
288	Last = First;
289	while (Last != End && size_t(Last - First) < Terminator.size() &&
290	Terminator [Last - First] == *Last)
291	++Last;
292
293	// Check if we hit it (or the end of the file).
294	if (Last == End) {
295	First = Last;
296	return;
297	}
298	if (size_t(Last - First) < Terminator.size())
299	continue;
300	if (*Last != `'"'`)
301	continue;
302	First = Last + `1`;
303	return;
304	}
305	}
306
307	// Returns the length of EOL, either 0 (no end-of-line), 1 (\n) or 2 (\r\n)
308	static unsigned isEOL(const char First, const* char *const End) {
309	if (First == End)
310	return `0`;
311	if (End - First > `1` && isVerticalWhitespace(c: First[`0`]) &&
312	isVerticalWhitespace(c: First[`1`]) && First[`0`] != First[`1`])
313	return `2`;
314	return !!isVerticalWhitespace(c: First[`0`]);
315	}
316
317	static void skipString(const char &First, const* char *const End) {
318	assert(First == `'\''` \|\| First == `'"'` \|\| *First == `'<'`);
319	const char Terminator = First == `'<'` ? `'>'` : First;
320	for (++First; First != End && *First != Terminator; ++First) {
321	// String and character literals don't extend past the end of the line.
322	if (isVerticalWhitespace(c: *First))
323	return;
324	if (*First != `'\\'`)
325	continue;
326	// Skip past backslash to the next character. This ensures that the
327	// character right after it is skipped as well, which matters if it's
328	// the terminator.
329	if (++First == End)
330	return;
331	if (!isWhitespace(c: *First))
332	continue;
333	// Whitespace after the backslash might indicate a line continuation.
334	const char *FirstAfterBackslashPastSpace = First;
335	skipOverSpaces(First&: FirstAfterBackslashPastSpace, End);
336	if (unsigned NLSize = isEOL(First: FirstAfterBackslashPastSpace, End)) {
337	// Advance the character pointer to the next line for the next
338	// iteration.
339	First = FirstAfterBackslashPastSpace + NLSize - `1`;
340	}
341	}
342	if (First != End)
343	++First; // Finish off the string.
344	}
345
346	// Returns the length of the skipped newline
347	static unsigned skipNewline(const char &First, const* char *End) {
348	if (First == End)
349	return `0`;
350	assert(isVerticalWhitespace(*First));
351	unsigned Len = isEOL(First, End);
352	assert(Len && "expected newline");
353	First += Len;
354	return Len;
355	}
356
357	static void skipToNewlineRaw(const char &First, const* char *const End) {
358	for (;;) {
359	if (First == End)
360	return;
361
362	unsigned Len = isEOL(First, End);
363	if (Len)
364	return;
365
366	char LastNonWhitespace = `' '`;
367	do {
368	if (!isHorizontalWhitespace(c: *First))
369	LastNonWhitespace = *First;
370	if (++First == End)
371	return;
372	Len = isEOL(First, End);
373	} while (!Len);
374
375	if (LastNonWhitespace != `'\\'`)
376	return;
377
378	First += Len;
379	// Keep skipping lines...
380	}
381	}
382
383	static void skipLineComment(const char &First, const* char *const End) {
384	assert(First[`0`] == `'/'` && First[`1`] == `'/'`);
385	First += `2`;
386	skipToNewlineRaw(First, End);
387	}
388
/// Skips a block comment whose opening "/*" is at \p First.
///
/// On return \p First is just past the closing "*/", or equal to \p End when
/// the comment is unterminated (including when fewer than four characters
/// remain, which cannot hold a complete comment).
static void skipBlockComment(const char *&First, const char *const End) {
  assert(First[0] == '/' && First[1] == '*');
  if (End - First < 4) {
    First = End;
    return;
  }
  // Start at the fourth character so that the '*' of the opener can also
  // serve as the '*' of the closer only in the "/**/" form, never in "/*/".
  for (First += 3; First != End; ++First)
    if (First[-1] == '*' && First[0] == '/') {
      ++First;
      return;
    }
}
401
402	/// \returns True if the current single quotation mark character is a C++14
403	/// digit separator.
404	static bool isQuoteCppDigitSeparator(const char *const Start,
405	const char *const Cur,
406	const char *const End) {
407	assert(*Cur == `'\''` && "expected quotation character");
408	// skipLine called in places where we don't expect a valid number
409	// body before `start` on the same line, so always return false at the start.
410	if (Start == Cur)
411	return false;
412	// The previous character must be a valid PP number character.
413	// Make sure that the L, u, U, u8 prefixes don't get marked as a
414	// separator though.
415	char Prev = *(Cur - `1`);
416	if (Prev == `'L'` \|\| Prev == `'U'` \|\| Prev == `'u'`)
417	return false;
418	if (Prev == `'8'` && (Cur - `1` != Start) && *(Cur - `2`) == `'u'`)
419	return false;
420	if (!isPreprocessingNumberBody(c: Prev))
421	return false;
422	// The next character should be a valid identifier body character.
423	return (Cur + `1`) < End && isAsciiIdentifierContinue(c: *(Cur + `1`));
424	}
425
426	void Scanner::skipLine(const char &First, const* char *const End) {
427	for (;;) {
428	assert(First <= End);
429	if (First == End)
430	return;
431
432	if (isVerticalWhitespace(c: *First)) {
433	skipNewline(First, End);
434	return;
435	}
436	const char *Start = First;
437	// Use `LastNonWhitespace`to track if a line-continuation has ever been seen
438	// before a new-line character:
439	char LastNonWhitespace = `' '`;
440	while (First != End && !isVerticalWhitespace(c: *First)) {
441	// Iterate over strings correctly to avoid comments and newlines.
442	if (*First == `'"'` \|\|
443	(*First == `'\''` && !isQuoteCppDigitSeparator(Start, Cur: First, End))) {
444	LastTokenPtr = First;
445	if (isRawStringLiteral(First: Start, Current: First))
446	skipRawString(First, End);
447	else
448	skipString(First, End);
449	continue;
450	}
451
452	// Continue on the same line if an EOL is preceded with backslash
453	if (First + `1` < End && *First == `'\\'`) {
454	if (unsigned Len = isEOL(First: First + `1`, End)) {
455	First += `1` + Len;
456	continue;
457	}
458	}
459
460	// Iterate over comments correctly.
461	if (*First != `'/'` \|\| End - First < `2`) {
462	LastTokenPtr = First;
463	if (!isWhitespace(c: *First))
464	LastNonWhitespace = *First;
465	++First;
466	continue;
467	}
468
469	if (First[`1`] == `'/'`) {
470	// "//...".
471	skipLineComment(First, End);
472	continue;
473	}
474
475	if (First[`1`] != `'*'`) {
476	LastTokenPtr = First;
477	if (!isWhitespace(c: *First))
478	LastNonWhitespace = *First;
479	++First;
480	continue;
481	}
482
483	// "/.../".
484	skipBlockComment(First, End);
485	}
486	if (First == End)
487	return;
488
489	// Skip over the newline.
490	skipNewline(First, End);
491
492	if (LastNonWhitespace != `'\\'`)
493	break;
494	}
495	}
496
497	void Scanner::skipDirective(StringRef Name, const char *&First,
498	const char *const End) {
499	if (llvm::StringSwitch<bool>(Name)
500	.Case(S: "warning", Value: true)
501	.Case(S: "error", Value: true)
502	.Default(Value: false))
503	// Do not process quotes or comments.
504	skipToNewlineRaw(First, End);
505	else
506	skipLine(First, End);
507	}
508
509	static void skipWhitespace(const char &First, const* char *const End) {
510	for (;;) {
511	assert(First <= End);
512	skipOverSpaces(First, End);
513
514	if (End - First < `2`)
515	return;
516
517	if (*First == `'\\'`) {
518	const char *Ptr = First + `1`;
519	while (Ptr < End && isHorizontalWhitespace(c: *Ptr))
520	++Ptr;
521	if (Ptr != End && isVerticalWhitespace(c: *Ptr)) {
522	skipNewline(First&: Ptr, End);
523	First = Ptr;
524	continue;
525	}
526	return;
527	}
528
529	// Check for a non-comment character.
530	if (First[`0`] != `'/'`)
531	return;
532
533	// "// ...".
534	if (First[`1`] == `'/'`) {
535	skipLineComment(First, End);
536	return;
537	}
538
539	// Cannot be a comment.
540	if (First[`1`] != `'*'`)
541	return;
542
543	// "/.../".
544	skipBlockComment(First, End);
545	}
546	}
547
548	bool Scanner::lexModuleDirectiveBody(DirectiveKind Kind, const char *&First,
549	const char *const End) {
550	assert(Kind == DirectiveKind::cxx_export_import_decl \|\|
551	Kind == DirectiveKind::cxx_export_module_decl \|\|
552	Kind == DirectiveKind::cxx_import_decl \|\|
553	Kind == DirectiveKind::cxx_module_decl \|\|
554	Kind == DirectiveKind::decl_at_import);
555
556	const char *DirectiveLoc = Input.data() + CurDirToks.front().Offset;
557	for (;;) {
558	// Keep a copy of the First char incase it needs to be reset.
559	const char *Previous = First;
560	const dependency_directives_scan::Token &Tok = lexToken(First, End);
561	if ((Tok.is(K: tok::hash) \|\| Tok.is(K: tok::at)) &&
562	(Tok.Flags & clang::Token::StartOfLine)) {
563	CurDirToks.pop_back();
564	First = Previous;
565	return false;
566	}
567	if (Tok.isOneOf(Ks: tok::eof, Ks: tok::eod))
568	return reportError(
569	CurPtr: DirectiveLoc,
570	Err: diag::err_dep_source_scanner_missing_semi_after_at_import);
571	if (Tok.is(K: tok::semi))
572	break;
573	}
574
575	bool IsCXXModules = Kind == DirectiveKind::cxx_export_import_decl \|\|
576	Kind == DirectiveKind::cxx_export_module_decl \|\|
577	Kind == DirectiveKind::cxx_import_decl \|\|
578	Kind == DirectiveKind::cxx_module_decl;
579	if (IsCXXModules) {
580	lexPPDirectiveBody(First, End);
581	pushDirective(Kind);
582	return false;
583	}
584
585	pushDirective(Kind);
586	skipWhitespace(First, End);
587	if (First == End)
588	return false;
589	if (!isVerticalWhitespace(c: *First))
590	return reportError(
591	CurPtr: DirectiveLoc, Err: diag::err_dep_source_scanner_unexpected_tokens_at_import);
592	skipNewline(First, End);
593	return false;
594	}
595
596	dependency_directives_scan::Token &Scanner::lexToken(const char *&First,
597	const char *const End) {
598	clang::Token Tok;
599	TheLexer.LexFromRawLexer(Result&: Tok);
600	First = Input.data() + TheLexer.getCurrentBufferOffset();
601	assert(First <= End);
602
603	unsigned Offset = TheLexer.getCurrentBufferOffset() - Tok.getLength();
604	CurDirToks.emplace_back(Args&: Offset, Args: Tok.getLength(), Args: Tok.getKind(),
605	Args: Tok.getFlags());
606	return CurDirToks.back();
607	}
608
609	dependency_directives_scan::Token &
610	Scanner::lexIncludeFilename(const char &First, const* char *const End) {
611	clang::Token Tok;
612	TheLexer.LexIncludeFilename(FilenameTok&: Tok);
613	First = Input.data() + TheLexer.getCurrentBufferOffset();
614	assert(First <= End);
615
616	unsigned Offset = TheLexer.getCurrentBufferOffset() - Tok.getLength();
617	CurDirToks.emplace_back(Args&: Offset, Args: Tok.getLength(), Args: Tok.getKind(),
618	Args: Tok.getFlags());
619	return CurDirToks.back();
620	}
621
622	void Scanner::lexPPDirectiveBody(const char &First, const* char *const End) {
623	while (true) {
624	const dependency_directives_scan::Token &Tok = lexToken(First, End);
625	if (Tok.is(K: tok::eod) \|\| Tok.is(K: tok::eof))
626	break;
627	}
628	}
629
630	StringRef
631	Scanner::cleanStringIfNeeded(const dependency_directives_scan::Token &Tok) {
632	bool NeedsCleaning = Tok.Flags & clang::Token::NeedsCleaning;
633	if (LLVM_LIKELY(!NeedsCleaning))
634	return Input.slice(Start: Tok.Offset, End: Tok.getEnd());
635
636	SmallString<`64`> Spelling;
637	Spelling.resize(N: Tok.Length);
638
639	// FIXME: C++11 raw string literals need special handling (see getSpellingSlow
640	// in the Lexer). Currently we cannot see them due to our LangOpts.
641
642	unsigned SpellingLength = `0`;
643	const char *BufPtr = Input.begin() + Tok.Offset;
644	const char *AfterIdent = Input.begin() + Tok.getEnd();
645	while (BufPtr < AfterIdent) {
646	auto [Char, Size] = Lexer::getCharAndSizeNoWarn(Ptr: BufPtr, LangOpts);
647	Spelling [SpellingLength++] = Char;
648	BufPtr += Size;
649	}
650
651	return SplitIds.try_emplace(Key: StringRef(Spelling.begin(), SpellingLength), Args: `0`)
652	.first ->first();
653	}
654
655	std::optional<StringRef>
656	Scanner::tryLexIdentifierOrSkipLine(const char &First, const* char *const End) {
657	const dependency_directives_scan::Token &Tok = lexToken(First, End);
658	if (Tok.isNot(K: tok::raw_identifier)) {
659	if (!Tok.is(K: tok::eod))
660	skipLine(First, End);
661	return std::nullopt;
662	}
663
664	return cleanStringIfNeeded(Tok);
665	}
666
667	StringRef Scanner::lexIdentifier(const char &First, const* char *const End) {
668	std::optional<StringRef> Id = tryLexIdentifierOrSkipLine(First, End);
669	assert(Id && "expected identifier token");
670	return *Id;
671	}
672
673	bool Scanner::isNextIdentifierOrSkipLine(StringRef Id, const char *&First,
674	const char *const End) {
675	if (std::optional<StringRef> FoundId =
676	tryLexIdentifierOrSkipLine(First, End)) {
677	if (*FoundId == Id)
678	return true;
679	skipLine(First, End);
680	}
681	return false;
682	}
683
684	bool Scanner::isNextTokenOrSkipLine(tok::TokenKind K, const char *&First,
685	const char *const End) {
686	const dependency_directives_scan::Token &Tok = lexToken(First, End);
687	if (Tok.is(K))
688	return true;
689	skipLine(First, End);
690	return false;
691	}
692
693	std::optional<StringRef>
694	Scanner::tryLexStringLiteralOrSkipLine(const char *&First,
695	const char *const End) {
696	const dependency_directives_scan::Token &Tok = lexToken(First, End);
697	if (!tok::isStringLiteral(K: Tok.Kind)) {
698	if (!Tok.is(K: tok::eod))
699	skipLine(First, End);
700	return std::nullopt;
701	}
702
703	return cleanStringIfNeeded(Tok);
704	}
705
706	bool Scanner::lexAt(const char &First, const* char *const End) {
707	// Handle "@import".
708
709	// Lex '@'.
710	const dependency_directives_scan::Token &AtTok = lexToken(First, End);
711	assert(AtTok.is(tok::at));
712	(void)AtTok;
713
714	if (!isNextIdentifierOrSkipLine(Id: "import", First, End))
715	return false;
716	return lexModuleDirectiveBody(Kind: decl_at_import, First, End);
717	}
718
719	bool Scanner::lexModule(const char &First, const* char *const End) {
720	StringRef Id = lexIdentifier(First, End);
721	bool Export = false;
722	if (Id == "export") {
723	Export = true;
724	std::optional<StringRef> NextId = tryLexIdentifierOrSkipLine(First, End);
725	if (!NextId)
726	return false;
727	Id = *NextId;
728	}
729
730	StringRef Module =
731	ScanningPreprocessedModuleFile ? "__preprocessed_module" : "module";
732	StringRef Import =
733	ScanningPreprocessedModuleFile ? "__preprocessed_import" : "import";
734
735	if (Id != Module && Id != Import) {
736	skipLine(First, End);
737	return false;
738	}
739
740	skipWhitespace(First, End);
741
742	// Ignore this as a module directive if the next character can't be part of
743	// an import.
744
745	switch (*First) {
746	case `':'`: {
747	// `module :` is never the start of a valid module declaration.
748	if (Id == Module) {
749	skipLine(First, End);
750	return false;
751	}
752	// A module partition starts with exactly one ':'. If we have '::', this is
753	// a scope resolution instead and shouldn't be recognized as a directive
754	// per P1857R3.
755	if (First + `1` != End && First[`1`] == `':'`) {
756	skipLine(First, End);
757	return false;
758	}
759	// `import:(type)name` is a valid ObjC method decl, so check one more token.
760	(void)lexToken(First, End);
761	if (!tryLexIdentifierOrSkipLine(First, End))
762	return false;
763	break;
764	}
765	case `';'`: {
766	// Handle the global module fragment `module;`.
767	if (Id == Module && !Export)
768	break;
769	skipLine(First, End);
770	return false;
771	}
772	case `'<'`:
773	case `'"'`:
774	break;
775	default:
776	if (!isAsciiIdentifierContinue(c: *First)) {
777	skipLine(First, End);
778	return false;
779	}
780	}
781
782	TheLexer.seek(Offset: getOffsetAt(CurPtr: First), /IsAtStartOfLine/ false);
783
784	DirectiveKind Kind;
785	if (Id == Module)
786	Kind = Export ? cxx_export_module_decl : cxx_module_decl;
787	else
788	Kind = Export ? cxx_export_import_decl : cxx_import_decl;
789
790	return lexModuleDirectiveBody(Kind, First, End);
791	}
792
793	bool Scanner::lex_Pragma(const char &First, const* char *const End) {
794	if (!isNextTokenOrSkipLine(K: tok::l_paren, First, End))
795	return false;
796
797	std::optional<StringRef> Str = tryLexStringLiteralOrSkipLine(First, End);
798
799	if (!Str \|\| !isNextTokenOrSkipLine(K: tok::r_paren, First, End))
800	return false;
801
802	SmallString<`64`> Buffer(*Str);
803	prepare_PragmaString(StrVal&: Buffer);
804
805	// Use a new scanner instance since the tokens will be inside the allocated
806	// string. We should already have captured all the relevant tokens in the
807	// current scanner.
808	SmallVector<dependency_directives_scan::Token> DiscardTokens;
809	const char *Begin = Buffer.c_str();
810	Scanner PragmaScanner{StringRef(Begin, Buffer.size()), DiscardTokens, Diags,
811	InputSourceLoc};
812
813	PragmaScanner.TheLexer.setParsingPreprocessorDirective(true);
814	if (PragmaScanner.lexPragma(First&: Begin, End: Buffer.end()))
815	return true;
816
817	DirectiveKind K = PragmaScanner.topDirective();
818	if (K == pp_none) {
819	skipLine(First, End);
820	return false;
821	}
822
823	assert(Begin == Buffer.end());
824	pushDirective(Kind: K);
825	return false;
826	}
827
828	bool Scanner::lexPragma(const char &First, const* char *const End) {
829	std::optional<StringRef> FoundId = tryLexIdentifierOrSkipLine(First, End);
830	if (!FoundId)
831	return false;
832
833	StringRef Id = *FoundId;
834	auto Kind = llvm::StringSwitch<DirectiveKind>(Id)
835	.Case(S: "once", Value: pp_pragma_once)
836	.Case(S: "push_macro", Value: pp_pragma_push_macro)
837	.Case(S: "pop_macro", Value: pp_pragma_pop_macro)
838	.Case(S: "include_alias", Value: pp_pragma_include_alias)
839	.Default(Value: pp_none);
840	if (Kind != pp_none) {
841	lexPPDirectiveBody(First, End);
842	pushDirective(Kind);
843	return false;
844	}
845
846	if (Id != "clang") {
847	skipLine(First, End);
848	return false;
849	}
850
851	FoundId = tryLexIdentifierOrSkipLine(First, End);
852	if (!FoundId)
853	return false;
854	Id = *FoundId;
855
856	// #pragma clang system_header
857	if (Id == "system_header") {
858	lexPPDirectiveBody(First, End);
859	pushDirective(Kind: pp_pragma_system_header);
860	return false;
861	}
862
863	if (Id != "module") {
864	skipLine(First, End);
865	return false;
866	}
867
868	// #pragma clang module.
869	if (!isNextIdentifierOrSkipLine(Id: "import", First, End))
870	return false;
871
872	// #pragma clang module import.
873	lexPPDirectiveBody(First, End);
874	pushDirective(Kind: pp_pragma_import);
875	return false;
876	}
877
878	bool Scanner::lexEndif(const char &First, const* char *const End) {
879	// Strip out "#else" if it's empty.
880	if (topDirective() == pp_else)
881	popDirective();
882
883	// If "#ifdef" is empty, strip it and skip the "#endif".
884	//
885	// FIXME: Once/if Clang starts disallowing __has_include in macro expansions,
886	// we can skip empty `#if` and `#elif` blocks as well after scanning for a
887	// literal __has_include in the condition. Even without that rule we could
888	// drop the tokens if we scan for identifiers in the condition and find none.
889	if (topDirective() == pp_ifdef \|\| topDirective() == pp_ifndef) {
890	popDirective();
891	skipLine(First, End);
892	return false;
893	}
894
895	return lexDefault(Kind: pp_endif, First, End);
896	}
897
898	bool Scanner::lexDefault(DirectiveKind Kind, const char *&First,
899	const char *const End) {
900	lexPPDirectiveBody(First, End);
901	pushDirective(Kind);
902	return false;
903	}
904
/// Returns true if \p First is one of the characters '#', '@', 'i', 'e',
/// 'm' or '_'; lines beginning with any other character are skipped by the
/// caller without being lexed.
static bool isStartOfRelevantLine(char First) {
  switch (First) {
  case '#':
  case '@':
  case 'i':
  case 'e':
  case 'm':
  case '_':
    return true;
  }
  return false;
}
917
918	static inline bool isStartWithPreprocessedModuleDirective(const char *First,
919	const char *End) {
920	assert(First <= End);
921	if (*First == `'_'`) {
922	StringRef Str(First, End - First);
923	return Str.starts_with(
924	Prefix: tok::getPPKeywordSpelling(Kind: tok::pp___preprocessed_module)) \|\|
925	Str.starts_with(
926	Prefix: tok::getPPKeywordSpelling(Kind: tok::pp___preprocessed_import));
927	}
928	return false;
929	}
930
931	bool Scanner::lexPPLine(const char &First, const* char *const End) {
932	assert(First != End);
933
934	skipWhitespace(First, End);
935	assert(First <= End);
936	if (First == End)
937	return false;
938
939	if (!isStartOfRelevantLine(First: *First)) {
940	skipLine(First, End);
941	assert(First <= End);
942	return false;
943	}
944
945	LastTokenPtr = First;
946
947	TheLexer.seek(Offset: getOffsetAt(CurPtr: First), /IsAtStartOfLine/ true);
948
949	llvm::scope_exit ScEx1([&]() {
950	/// Clear Scanner's CurDirToks before returning, in case we didn't push a
951	/// new directive.
952	CurDirToks.clear();
953	});
954
955	// FIXME: Shoule we handle @import as a preprocessing directive?
956	if (*First == `'@'`)
957	return lexAt(First, End);
958
959	bool IsPreprocessedModule =
960	isStartWithPreprocessedModuleDirective(First, End);
961	if (*First == `'_'` && !IsPreprocessedModule) {
962	if (isNextIdentifierOrSkipLine(Id: "_Pragma", First, End))
963	return lex_Pragma(First, End);
964	return false;
965	}
966
967	// Handle preprocessing directives.
968
969	TheLexer.setParsingPreprocessorDirective(true);
970	llvm::scope_exit ScEx2(
971	[&]() { TheLexer.setParsingPreprocessorDirective(false); });
972
973	// Handle module directives for C++20 modules.
974	if (First == `'i'` \|\| First == `'e'` \|\| *First == `'m'` \|\| IsPreprocessedModule)
975	return lexModule(First, End);
976
977	// Lex '#'.
978	const dependency_directives_scan::Token &HashTok = lexToken(First, End);
979	if (HashTok.is(K: tok::hashhash)) {
980	// A \p tok::hashhash at this location is passed by the preprocessor to the
981	// parser to interpret, like any other token. So for dependency scanning
982	// skip it like a normal token not affecting the preprocessor.
983	skipLine(First, End);
984	assert(First <= End);
985	return false;
986	}
987	assert(HashTok.is(tok::hash));
988	(void)HashTok;
989
990	std::optional<StringRef> FoundId = tryLexIdentifierOrSkipLine(First, End);
991	if (!FoundId)
992	return false;
993
994	StringRef Id = *FoundId;
995
996	if (Id == "pragma")
997	return lexPragma(First, End);
998
999	auto Kind = llvm::StringSwitch<DirectiveKind>(Id)
1000	.Case(S: "include", Value: pp_include)
1001	.Case(S: "__include_macros", Value: pp___include_macros)
1002	.Case(S: "define", Value: pp_define)
1003	.Case(S: "undef", Value: pp_undef)
1004	.Case(S: "import", Value: pp_import)
1005	.Case(S: "include_next", Value: pp_include_next)
1006	.Case(S: "if", Value: pp_if)
1007	.Case(S: "ifdef", Value: pp_ifdef)
1008	.Case(S: "ifndef", Value: pp_ifndef)
1009	.Case(S: "elif", Value: pp_elif)
1010	.Case(S: "elifdef", Value: pp_elifdef)
1011	.Case(S: "elifndef", Value: pp_elifndef)
1012	.Case(S: "else", Value: pp_else)
1013	.Case(S: "endif", Value: pp_endif)
1014	.Default(Value: pp_none);
1015	if (Kind == pp_none) {
1016	skipDirective(Name: Id, First, End);
1017	return false;
1018	}
1019
1020	if (Kind == pp_endif)
1021	return lexEndif(First, End);
1022
1023	switch (Kind) {
1024	case pp_include:
1025	case pp___include_macros:
1026	case pp_include_next:
1027	case pp_import:
1028	// Ignore missing filenames in include or import directives.
1029	if (lexIncludeFilename(First, End).is(K: tok::eod)) {
1030	return false;
1031	}
1032	break;
1033	default:
1034	break;
1035	}
1036
1037	// Everything else.
1038	return lexDefault(Kind, First, End);
1039	}
1040
/// Advances \p First past a UTF-8 byte order mark (the bytes EF BB BF) if the
/// range [\p First, \p End) starts with one; otherwise leaves \p First
/// unchanged.
static void skipUTF8ByteOrderMark(const char *&First, const char *const End) {
  // The length check comes first so the three byte reads stay in range.
  if ((End - First) >= 3 && First[0] == '\xef' && First[1] == '\xbb' &&
      First[2] == '\xbf')
    First += 3;
}
1046
1047	bool Scanner::scanImpl(const char First, const* char *const End) {
1048	skipUTF8ByteOrderMark(First, End);
1049	while (First != End)
1050	if (lexPPLine(First, End))
1051	return true;
1052	return false;
1053	}
1054
1055	bool Scanner::scan(SmallVectorImpl<Directive> &Directives) {
1056	ScanningPreprocessedModuleFile = clang::isPreprocessedModuleFile(Source: Input);
1057	bool Error = scanImpl(First: Input.begin(), End: Input.end());
1058
1059	if (!Error) {
1060	// Add an EOF on success.
1061	if (LastTokenPtr &&
1062	(Tokens.empty() \|\| LastTokenPtr > Input.begin() + Tokens.back().Offset))
1063	pushDirective(Kind: tokens_present_before_eof);
1064	pushDirective(Kind: pp_eof);
1065	}
1066
1067	ArrayRef<dependency_directives_scan::Token> RemainingTokens = Tokens;
1068	for (const DirectiveWithTokens &DirWithToks : DirsWithToks) {
1069	assert(RemainingTokens.size() >= DirWithToks.NumTokens);
1070	Directives.emplace_back(Args: DirWithToks.Kind,
1071	Args: RemainingTokens.take_front(N: DirWithToks.NumTokens));
1072	RemainingTokens = RemainingTokens.drop_front(N: DirWithToks.NumTokens);
1073	}
1074	assert(RemainingTokens.empty());
1075
1076	return Error;
1077	}
1078
1079	bool clang::scanSourceForDependencyDirectives(
1080	StringRef Input, SmallVectorImpl<dependency_directives_scan::Token> &Tokens,
1081	SmallVectorImpl<Directive> &Directives, DiagnosticsEngine *Diags,
1082	SourceLocation InputSourceLoc) {
1083	return Scanner (Input, Tokens, Diags, InputSourceLoc).scan(Directives);
1084	}
1085
1086	void clang::printDependencyDirectivesAsSource(
1087	StringRef Source,
1088	ArrayRef<dependency_directives_scan::Directive> Directives,
1089	llvm::raw_ostream &OS) {
1090	// Add a space separator where it is convenient for testing purposes.
1091	auto needsSpaceSeparator =
1092	[](tok::TokenKind Prev,
1093	const dependency_directives_scan::Token &Tok) -> bool {
1094	if (Prev == Tok.Kind)
1095	return !Tok.isOneOf(Ks: tok::l_paren, Ks: tok::r_paren, Ks: tok::l_square,
1096	Ks: tok::r_square);
1097	if (Prev == tok::raw_identifier &&
1098	Tok.isOneOf(Ks: tok::hash, Ks: tok::numeric_constant, Ks: tok::string_literal,
1099	Ks: tok::char_constant, Ks: tok::header_name))
1100	return true;
1101	if (Prev == tok::r_paren &&
1102	Tok.isOneOf(Ks: tok::raw_identifier, Ks: tok::hash, Ks: tok::string_literal,
1103	Ks: tok::char_constant, Ks: tok::unknown))
1104	return true;
1105	if (Prev == tok::comma &&
1106	Tok.isOneOf(Ks: tok::l_paren, Ks: tok::string_literal, Ks: tok::less))
1107	return true;
1108	return false;
1109	};
1110
1111	for (const dependency_directives_scan::Directive &Directive : Directives) {
1112	if (Directive.Kind == tokens_present_before_eof)
1113	OS << "<TokBeforeEOF>";
1114	std::optional<tok::TokenKind> PrevTokenKind;
1115	for (const dependency_directives_scan::Token &Tok : Directive.Tokens) {
1116	if (PrevTokenKind && needsSpaceSeparator (*PrevTokenKind, Tok))
1117	OS << `' '`;
1118	PrevTokenKind = Tok.Kind;
1119	OS << Source.slice(Start: Tok.Offset, End: Tok.getEnd());
1120	}
1121	}
1122	}
1123
1124	static void skipUntilMaybeCXX20ModuleDirective(const char *&First,
1125	const char *const End) {
1126	assert(First <= End);
1127	while (First != End) {
1128	if (*First == `'#'`) {
1129	++First;
1130	skipToNewlineRaw(First, End);
1131	}
1132	skipWhitespace(First, End);
1133	if (const auto Len = isEOL(First, End)) {
1134	First += Len;
1135	continue;
1136	}
1137	break;
1138	}
1139	}
1140
1141	bool clang::scanInputForCXX20ModulesUsage(StringRef Source) {
1142	const char *First = Source.begin();
1143	const char *const End = Source.end();
1144	skipUntilMaybeCXX20ModuleDirective(First, End);
1145	if (First == End)
1146	return false;
1147
1148	// Check if the next token can even be a module directive before creating a
1149	// full lexer.
1150	if (!(First == `'i'` \|\| First == `'e'` \|\| *First == `'m'`))
1151	return false;
1152
1153	llvm::SmallVector<dependency_directives_scan::Token> Tokens;
1154	Scanner S(StringRef(First, End - First), Tokens, nullptr, SourceLocation ());
1155	S.TheLexer.setParsingPreprocessorDirective(true);
1156	if (S.lexModule(First, End))
1157	return false;
1158	auto IsCXXNamedModuleDirective = [](const DirectiveWithTokens &D) {
1159	switch (D.Kind) {
1160	case dependency_directives_scan::cxx_module_decl:
1161	case dependency_directives_scan::cxx_import_decl:
1162	case dependency_directives_scan::cxx_export_module_decl:
1163	case dependency_directives_scan::cxx_export_import_decl:
1164	return true;
1165	default:
1166	return false;
1167	}
1168	};
1169	return llvm::any_of(Range&: S.DirsWithToks, P: IsCXXNamedModuleDirective);
1170	}
1171
1172	bool clang::isPreprocessedModuleFile(StringRef Source) {
1173	const char *First = Source.begin();
1174	const char *const End = Source.end();
1175
1176	skipUntilMaybeCXX20ModuleDirective(First, End);
1177	if (First == End)
1178	return false;
1179
1180	llvm::SmallVector<dependency_directives_scan::Token> Tokens;
1181	Scanner S(StringRef(First, End - First), Tokens, nullptr, SourceLocation ());
1182	while (First != End) {
1183	if (*First == `'#'`) {
1184	++First;
1185	skipToNewlineRaw(First, End);
1186	} else if (*First == `'e'`) {
1187	S.TheLexer.seek(Offset: S.getOffsetAt(CurPtr: First), /IsAtStartOfLine=/true);
1188	StringRef Id = S.lexIdentifier(First, End);
1189	if (Id == "export") {
1190	std::optional<StringRef> NextId =
1191	S.tryLexIdentifierOrSkipLine(First, End);
1192	if (!NextId)
1193	return false;
1194	Id = *NextId;
1195	}
1196	if (Id == "__preprocessed_module" \|\| Id == "__preprocessed_import")
1197	return true;
1198	skipToNewlineRaw(First, End);
1199	} else if (isStartWithPreprocessedModuleDirective(First, End))
1200	return true;
1201	else
1202	skipToNewlineRaw(First, End);
1203
1204	skipWhitespace(First, End);
1205	if (const auto Len = isEOL(First, End)) {
1206	First += Len;
1207	continue;
1208	}
1209	break;
1210	}
1211	return false;
1212	}
1213

Browse the source code of llvm_projects/clang/lib/Lex/DependencyDirectivesScanner.cpp