Lexer.cpp source code [llvm_projects/clang/lib/Lex/Lexer.cpp]

1	//===- Lexer.cpp - C Language Family Lexer --------------------------------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	// This file implements the Lexer and Token interfaces.
10	//
11	//===----------------------------------------------------------------------===//
12
13	#include "clang/Lex/Lexer.h"
14	#include "UnicodeCharSets.h"
15	#include "clang/Basic/CharInfo.h"
16	#include "clang/Basic/Diagnostic.h"
17	#include "clang/Basic/IdentifierTable.h"
18	#include "clang/Basic/LLVM.h"
19	#include "clang/Basic/LangOptions.h"
20	#include "clang/Basic/SourceLocation.h"
21	#include "clang/Basic/SourceManager.h"
22	#include "clang/Basic/TokenKinds.h"
23	#include "clang/Lex/LexDiagnostic.h"
24	#include "clang/Lex/LiteralSupport.h"
25	#include "clang/Lex/MultipleIncludeOpt.h"
26	#include "clang/Lex/Preprocessor.h"
27	#include "clang/Lex/PreprocessorOptions.h"
28	#include "clang/Lex/Token.h"
29	#include "llvm/ADT/STLExtras.h"
30	#include "llvm/ADT/StringExtras.h"
31	#include "llvm/ADT/StringRef.h"
32	#include "llvm/ADT/StringSwitch.h"
33	#include "llvm/Support/Compiler.h"
34	#include "llvm/Support/ConvertUTF.h"
35	#include "llvm/Support/MemoryBufferRef.h"
36	#include "llvm/Support/NativeFormatting.h"
37	#include "llvm/Support/Unicode.h"
38	#include "llvm/Support/UnicodeCharRanges.h"
39	#include <algorithm>
40	#include <cassert>
41	#include <cstddef>
42	#include <cstdint>
43	#include <cstring>
44	#include <limits>
45	#include <optional>
46	#include <string>
47
48	#ifdef __SSE4_2__
49	#include <nmmintrin.h>
50	#endif
51
52	using namespace clang;
53
54	//===----------------------------------------------------------------------===//
55	// Token Class Implementation
56	//===----------------------------------------------------------------------===//
57
58	/// isObjCAtKeyword - Return true if we have an ObjC keyword identifier.
59	bool Token::isObjCAtKeyword(tok::ObjCKeywordKind objcKey) const {
60	if (isAnnotation())
61	return false;
62	if (const IdentifierInfo *II = getIdentifierInfo())
63	return II->getObjCKeywordID() == objcKey;
64	return false;
65	}
66
67	/// getObjCKeywordID - Return the ObjC keyword kind.
68	tok::ObjCKeywordKind Token::getObjCKeywordID() const {
69	if (isAnnotation())
70	return tok::objc_not_keyword;
71	const IdentifierInfo *specId = getIdentifierInfo();
72	return specId ? specId->getObjCKeywordID() : tok::objc_not_keyword;
73	}
74
75	bool Token::isModuleContextualKeyword(bool AllowExport) const {
76	if (AllowExport && is(K: tok::kw_export))
77	return true;
78	if (isOneOf(Ks: tok::kw_import, Ks: tok::kw_module))
79	return true;
80	if (isNot(K: tok::identifier))
81	return false;
82	const auto *II = getIdentifierInfo();
83	return II->isImportKeyword() \|\| II->isModuleKeyword();
84	}
85
86	/// Determine whether the token kind starts a simple-type-specifier.
87	bool Token::isSimpleTypeSpecifier(const LangOptions &LangOpts) const {
88	switch (getKind()) {
89	case tok::annot_typename:
90	case tok::annot_decltype:
91	case tok::annot_pack_indexing_type:
92	return true;
93
94	case tok::kw_short:
95	case tok::kw_long:
96	case tok::kw___int64:
97	case tok::kw___int128:
98	case tok::kw_signed:
99	case tok::kw_unsigned:
100	case tok::kw_void:
101	case tok::kw_char:
102	case tok::kw_int:
103	case tok::kw_half:
104	case tok::kw_float:
105	case tok::kw_double:
106	case tok::kw___bf16:
107	case tok::kw__Float16:
108	case tok::kw___float128:
109	case tok::kw___ibm128:
110	case tok::kw_wchar_t:
111	case tok::kw_bool:
112	case tok::kw__Bool:
113	case tok::kw__Accum:
114	case tok::kw__Fract:
115	case tok::kw__Sat:
116	#define TRANSFORM_TYPE_TRAIT_DEF(_, Trait) case tok::kw___##Trait:
117	#include "clang/Basic/TransformTypeTraits.def"
118	case tok::kw___auto_type:
119	case tok::kw_char16_t:
120	case tok::kw_char32_t:
121	case tok::kw_typeof:
122	case tok::kw_decltype:
123	case tok::kw_char8_t:
124	return getIdentifierInfo()->isKeyword(LangOpts);
125
126	default:
127	return false;
128	}
129	}
130
131	//===----------------------------------------------------------------------===//
132	// Lexer Class Implementation
133	//===----------------------------------------------------------------------===//
134
135	void Lexer::anchor() {}
136
137	void Lexer::InitLexer(const char BufStart, const* char *BufPtr,
138	const char *BufEnd) {
139	BufferStart = BufStart;
140	BufferPtr = BufPtr;
141	BufferEnd = BufEnd;
142
143	assert(BufEnd[`0`] == `0` &&
144	"We assume that the input buffer has a null character at the end"
145	" to simplify lexing!");
146
147	// Check whether we have a BOM in the beginning of the buffer. If yes - act
148	// accordingly. Right now we support only UTF-8 with and without BOM, so, just
149	// skip the UTF-8 BOM if it's present.
150	if (BufferStart == BufferPtr) {
151	// Determine the size of the BOM.
152	StringRef Buf(BufferStart, BufferEnd - BufferStart);
153	size_t BOMLength = llvm::StringSwitch<size_t>(Buf)
154	.StartsWith(S: "\xEF\xBB\xBF", Value: `3`) // UTF-8 BOM
155	.Default(Value: `0`);
156
157	// Skip the BOM.
158	BufferPtr += BOMLength;
159	}
160
161	Is_PragmaLexer = false;
162	CurrentConflictMarkerState = CMK_None;
163
164	// Start of the file is a start of line.
165	IsAtStartOfLine = true;
166	IsAtPhysicalStartOfLine = true;
167
168	HasLeadingSpace = false;
169	HasLeadingEmptyMacro = false;
170
171	// We are not after parsing a #.
172	ParsingPreprocessorDirective = false;
173
174	// We are not after parsing #include.
175	ParsingFilename = false;
176
177	// We are not in raw mode. Raw mode disables diagnostics and interpretation
178	// of tokens (e.g. identifiers, thus disabling macro expansion). It is used
179	// to quickly lex the tokens of the buffer, e.g. when handling a "#if 0" block
180	// or otherwise skipping over tokens.
181	LexingRawMode = false;
182
183	// Default to not keeping comments.
184	ExtendedTokenMode = `0`;
185
186	NewLinePtr = nullptr;
187	}
188
189	/// Lexer constructor - Create a new lexer object for the specified buffer
190	/// with the specified preprocessor managing the lexing process. This lexer
191	/// assumes that the associated file buffer and Preprocessor objects will
192	/// outlive it, so it doesn't take ownership of either of them.
193	Lexer::Lexer(FileID FID, const llvm::MemoryBufferRef &InputFile,
194	Preprocessor &PP, bool IsFirstIncludeOfFile)
195	: PreprocessorLexer (&PP, FID),
196	FileLoc(PP.getSourceManager().getLocForStartOfFile(FID)),
197	LangOpts(PP.getLangOpts()), LineComment(LangOpts.LineComment),
198	IsFirstTimeLexingFile(IsFirstIncludeOfFile) {
199	InitLexer(BufStart: InputFile.getBufferStart(), BufPtr: InputFile.getBufferStart(),
200	BufEnd: InputFile.getBufferEnd());
201
202	resetExtendedTokenMode();
203	}
204
205	/// Lexer constructor - Create a new raw lexer object. This object is only
206	/// suitable for calls to 'LexFromRawLexer'. This lexer assumes that the text
207	/// range will outlive it, so it doesn't take ownership of it.
208	Lexer::Lexer(SourceLocation fileloc, const LangOptions &langOpts,
209	const char BufStart, const* char BufPtr, const* char *BufEnd,
210	bool IsFirstIncludeOfFile)
211	: FileLoc (fileloc), LangOpts(langOpts), LineComment(LangOpts.LineComment),
212	IsFirstTimeLexingFile(IsFirstIncludeOfFile) {
213	InitLexer(BufStart, BufPtr, BufEnd);
214
215	// We are* in raw mode.*
216	LexingRawMode = true;
217	}
218
219	/// Lexer constructor - Create a new raw lexer object. This object is only
220	/// suitable for calls to 'LexFromRawLexer'. This lexer assumes that the text
221	/// range will outlive it, so it doesn't take ownership of it.
222	Lexer::Lexer(FileID FID, const llvm::MemoryBufferRef &FromFile,
223	const SourceManager &SM, const LangOptions &langOpts,
224	bool IsFirstIncludeOfFile)
225	: Lexer (SM.getLocForStartOfFile(FID), langOpts, FromFile.getBufferStart(),
226	FromFile.getBufferStart(), FromFile.getBufferEnd(),
227	IsFirstIncludeOfFile) {}
228
229	void Lexer::resetExtendedTokenMode() {
230	assert(PP && "Cannot reset token mode without a preprocessor");
231	if (LangOpts.TraditionalCPP)
232	SetKeepWhitespaceMode(true);
233	else
234	SetCommentRetentionState(PP->getCommentRetentionState());
235	}
236
237	/// Create_PragmaLexer: Lexer constructor - Create a new lexer object for
238	/// _Pragma expansion. This has a variety of magic semantics that this method
239	/// sets up. It returns a new'd Lexer that must be delete'd when done.
240	///
241	/// On entrance to this routine, TokStartLoc is a macro location which has a
242	/// spelling loc that indicates the bytes to be lexed for the token and an
243	/// expansion location that indicates where all lexed tokens should be
244	/// "expanded from".
245	///
246	/// TODO: It would really be nice to make _Pragma just be a wrapper around a
247	/// normal lexer that remaps tokens as they fly by. This would require making
248	/// Preprocessor::Lex virtual. Given that, we could just dump in a magic lexer
249	/// interface that could handle this stuff. This would pull GetMappedTokenLoc
250	/// out of the critical path of the lexer!
251	///
252	Lexer *Lexer::Create_PragmaLexer(SourceLocation SpellingLoc,
253	SourceLocation ExpansionLocStart,
254	SourceLocation ExpansionLocEnd,
255	unsigned TokLen, Preprocessor &PP) {
256	SourceManager &SM = PP.getSourceManager();
257
258	// Create the lexer as if we were going to lex the file normally.
259	FileID SpellingFID = SM.getFileID(SpellingLoc);
260	llvm::MemoryBufferRef InputFile = SM.getBufferOrFake(FID: SpellingFID);
261	Lexer L = new* Lexer (SpellingFID, InputFile, PP);
262
263	// Now that the lexer is created, change the start/end locations so that we
264	// just lex the subsection of the file that we want. This is lexing from a
265	// scratch buffer.
266	const char *StrData = SM.getCharacterData(SL: SpellingLoc);
267
268	L->BufferPtr = StrData;
269	L->BufferEnd = StrData+TokLen;
270	assert(L->BufferEnd[`0`] == `0` && "Buffer is not nul terminated!");
271
272	// Set the SourceLocation with the remapping information. This ensures that
273	// GetMappedTokenLoc will remap the tokens as they are lexed.
274	L->FileLoc = SM.createExpansionLoc(SpellingLoc: SM.getLocForStartOfFile(FID: SpellingFID),
275	ExpansionLocStart,
276	ExpansionLocEnd, Length: TokLen);
277
278	// Ensure that the lexer thinks it is inside a directive, so that end \n will
279	// return an EOD token.
280	L->ParsingPreprocessorDirective = true;
281
282	// This lexer really is for _Pragma.
283	L->Is_PragmaLexer = true;
284	return L;
285	}
286
287	void Lexer::seek(unsigned Offset, bool IsAtStartOfLine) {
288	this->IsAtPhysicalStartOfLine = IsAtStartOfLine;
289	this->IsAtStartOfLine = IsAtStartOfLine;
290	assert((BufferStart + Offset) <= BufferEnd);
291	BufferPtr = BufferStart + Offset;
292	}
293
/// Escape \p Str in place so it can be embedded in a quoted literal.
///
/// Every backslash and every occurrence of \p Quote gets a backslash inserted
/// in front of it. Line breaks become the two characters '\\' 'n': a mixed
/// pair ("\r\n" or "\n\r") collapses into a single escape, while a lone '\n'
/// or '\r' grows by one character.
///
/// \tparam T a string-like container with size(), operator[], begin() and
/// single-element insert().
/// \param Quote the quote character to escape ('"' or '\'').
template <typename T> static void StringifyImpl(T &Str, char Quote) {
  typename T::size_type i = 0, e = Str.size();
  while (i < e) {
    if (Str[i] == '\\' || Str[i] == Quote) {
      Str.insert(Str.begin() + i, '\\');
      // Skip both the inserted backslash and the escaped character.
      i += 2;
      ++e;
    } else if (Str[i] == '\n' || Str[i] == '\r') {
      // Replace '\r\n' and '\n\r' to '\\' followed by 'n'.
      if ((i < e - 1) && (Str[i + 1] == '\n' || Str[i + 1] == '\r') &&
          Str[i] != Str[i + 1]) {
        Str[i] = '\\';
        Str[i + 1] = 'n';
      } else {
        // Replace '\n' and '\r' to '\\' followed by 'n'.
        Str[i] = '\\';
        Str.insert(Str.begin() + i + 1, 'n');
        ++e;
      }
      i += 2;
    } else
      ++i;
  }
}
318
319	std::string Lexer::Stringify(StringRef Str, bool Charify) {
320	std::string Result = std::string (Str);
321	char Quote = Charify ? `'\''` : `'"'`;
322	StringifyImpl(Str&: Result, Quote);
323	return Result;
324	}
325
326	void Lexer::Stringify(SmallVectorImpl<char> &Str) { StringifyImpl(Str, Quote: `'"'`); }
327
328	//===----------------------------------------------------------------------===//
329	// Token Spelling
330	//===----------------------------------------------------------------------===//
331
332	/// Slow case of getSpelling. Extract the characters comprising the
333	/// spelling of this token from the provided input buffer.
334	static size_t getSpellingSlow(const Token &Tok, const char *BufPtr,
335	const LangOptions &LangOpts, char *Spelling) {
336	assert(Tok.needsCleaning() && "getSpellingSlow called on simple token");
337
338	size_t Length = `0`;
339	const char *BufEnd = BufPtr + Tok.getLength();
340
341	if (tok::isStringLiteral(K: Tok.getKind())) {
342	// Munch the encoding-prefix and opening double-quote.
343	while (BufPtr < BufEnd) {
344	auto CharAndSize = Lexer::getCharAndSizeNoWarn(Ptr: BufPtr, LangOpts);
345	Spelling[Length++] = CharAndSize.Char;
346	BufPtr += CharAndSize.Size;
347
348	if (Spelling[Length - `1`] == `'"'`)
349	break;
350	}
351
352	// Raw string literals need special handling; trigraph expansion and line
353	// splicing do not occur within their d-char-sequence nor within their
354	// r-char-sequence.
355	if (Length >= `2` &&
356	Spelling[Length - `2`] == `'R'` && Spelling[Length - `1`] == `'"'`) {
357	// Search backwards from the end of the token to find the matching closing
358	// quote.
359	const char *RawEnd = BufEnd;
360	do --RawEnd; while (*RawEnd != `'"'`);
361	size_t RawLength = RawEnd - BufPtr + `1`;
362
363	// Everything between the quotes is included verbatim in the spelling.
364	memcpy(dest: Spelling + Length, src: BufPtr, n: RawLength);
365	Length += RawLength;
366	BufPtr += RawLength;
367
368	// The rest of the token is lexed normally.
369	}
370	}
371
372	while (BufPtr < BufEnd) {
373	auto CharAndSize = Lexer::getCharAndSizeNoWarn(Ptr: BufPtr, LangOpts);
374	Spelling[Length++] = CharAndSize.Char;
375	BufPtr += CharAndSize.Size;
376	}
377
378	assert(Length < Tok.getLength() &&
379	"NeedsCleaning flag set on token that didn't need cleaning!");
380	return Length;
381	}
382
383	/// getSpelling() - Return the 'spelling' of this token. The spelling of a
384	/// token are the characters used to represent the token in the source file
385	/// after trigraph expansion and escaped-newline folding. In particular, this
386	/// wants to get the true, uncanonicalized, spelling of things like digraphs
387	/// UCNs, etc.
388	StringRef Lexer::getSpelling(SourceLocation loc,
389	SmallVectorImpl<char> &buffer,
390	const SourceManager &SM,
391	const LangOptions &options,
392	bool *invalid) {
393	// Break down the source location.
394	FileIDAndOffset locInfo = SM.getDecomposedLoc(Loc: loc);
395
396	// Try to the load the file buffer.
397	bool invalidTemp = false;
398	StringRef file = SM.getBufferData(FID: locInfo.first, Invalid: &invalidTemp);
399	if (invalidTemp) {
400	if (invalid) invalid = true*;
401	return {};
402	}
403
404	const char *tokenBegin = file.data() + locInfo.second;
405
406	// Lex from the start of the given location.
407	Lexer lexer(SM.getLocForStartOfFile(FID: locInfo.first), options,
408	file.begin(), tokenBegin, file.end());
409	Token token;
410	lexer.LexFromRawLexer(Result&: token);
411
412	unsigned length = token.getLength();
413
414	// Common case: no need for cleaning.
415	if (!token.needsCleaning())
416	return StringRef(tokenBegin, length);
417
418	// Hard case, we need to relex the characters into the string.
419	buffer.resize(N: length);
420	buffer.resize(N: getSpellingSlow(Tok: token, BufPtr: tokenBegin, LangOpts: options, Spelling: buffer.data()));
421	return StringRef(buffer.data(), buffer.size());
422	}
423
424	/// getSpelling() - Return the 'spelling' of this token. The spelling of a
425	/// token are the characters used to represent the token in the source file
426	/// after trigraph expansion and escaped-newline folding. In particular, this
427	/// wants to get the true, uncanonicalized, spelling of things like digraphs
428	/// UCNs, etc.
429	std::string Lexer::getSpelling(const Token &Tok, const SourceManager &SourceMgr,
430	const LangOptions &LangOpts, bool *Invalid) {
431	assert((int)Tok.getLength() >= `0` && "Token character range is bogus!");
432
433	bool CharDataInvalid = false;
434	const char *TokStart = SourceMgr.getCharacterData(SL: Tok.getLocation(),
435	Invalid: &CharDataInvalid);
436	if (Invalid)
437	*Invalid = CharDataInvalid;
438	if (CharDataInvalid)
439	return {};
440
441	// If this token contains nothing interesting, return it directly.
442	if (!Tok.needsCleaning())
443	return std::string (TokStart, TokStart + Tok.getLength());
444
445	std::string Result;
446	Result.resize(n: Tok.getLength());
447	Result.resize(n: getSpellingSlow(Tok, BufPtr: TokStart, LangOpts, Spelling: &*Result.begin()));
448	return Result;
449	}
450
451	/// getSpelling - This method is used to get the spelling of a token into a
452	/// preallocated buffer, instead of as an std::string. The caller is required
453	/// to allocate enough space for the token, which is guaranteed to be at least
454	/// Tok.getLength() bytes long. The actual length of the token is returned.
455	///
456	/// Note that this method may do two possible things: it may either fill in
457	/// the buffer specified with characters, or it may change the input pointer
458	/// to point to a constant buffer with the data already in it (avoiding a
459	/// copy). The caller is not allowed to modify the returned buffer pointer
460	/// if an internal buffer is returned.
461	unsigned Lexer::getSpelling(const Token &Tok, const char *&Buffer,
462	const SourceManager &SourceMgr,
463	const LangOptions &LangOpts, bool *Invalid) {
464	assert((int)Tok.getLength() >= `0` && "Token character range is bogus!");
465
466	const char TokStart = nullptr*;
467	// NOTE: this has to be checked before* testing for an IdentifierInfo.*
468	if (Tok.is(K: tok::raw_identifier))
469	TokStart = Tok.getRawIdentifier().data();
470	else if (!Tok.hasUCN()) {
471	if (const IdentifierInfo *II = Tok.getIdentifierInfo()) {
472	// Just return the string from the identifier table, which is very quick.
473	Buffer = II->getNameStart();
474	return II->getLength();
475	}
476	}
477
478	// NOTE: this can be checked even after testing for an IdentifierInfo.
479	if (Tok.isLiteral())
480	TokStart = Tok.getLiteralData();
481
482	if (!TokStart) {
483	// Compute the start of the token in the input lexer buffer.
484	bool CharDataInvalid = false;
485	TokStart = SourceMgr.getCharacterData(SL: Tok.getLocation(), Invalid: &CharDataInvalid);
486	if (Invalid)
487	*Invalid = CharDataInvalid;
488	if (CharDataInvalid) {
489	Buffer = "";
490	return `0`;
491	}
492	}
493
494	// If this token contains nothing interesting, return it directly.
495	if (!Tok.needsCleaning()) {
496	Buffer = TokStart;
497	return Tok.getLength();
498	}
499
500	// Otherwise, hard case, relex the characters into the string.
501	return getSpellingSlow(Tok, BufPtr: TokStart, LangOpts, Spelling: const_cast<char*>(Buffer));
502	}
503
504	/// MeasureTokenLength - Relex the token at the specified location and return
505	/// its length in bytes in the input file. If the token needs cleaning (e.g.
506	/// includes a trigraph or an escaped newline) then this count includes bytes
507	/// that are part of that.
508	unsigned Lexer::MeasureTokenLength(SourceLocation Loc,
509	const SourceManager &SM,
510	const LangOptions &LangOpts) {
511	Token TheTok;
512	if (getRawToken(Loc, Result&: TheTok, SM, LangOpts))
513	return `0`;
514	return TheTok.getLength();
515	}
516
517	/// Relex the token at the specified location.
518	/// \returns true if there was a failure, false on success.
519	bool Lexer::getRawToken(SourceLocation Loc, Token &Result,
520	const SourceManager &SM,
521	const LangOptions &LangOpts,
522	bool IgnoreWhiteSpace) {
523	// TODO: this could be special cased for common tokens like identifiers, ')',
524	// etc to make this faster, if it mattered. Just look at StrData[0] to handle
525	// all obviously single-char tokens. This could use
526	// Lexer::isObviouslySimpleCharacter for example to handle identifiers or
527	// something.
528
529	// If this comes from a macro expansion, we really do want the macro name, not
530	// the token this macro expanded to.
531	Loc = SM.getExpansionLoc(Loc);
532	FileIDAndOffset LocInfo = SM.getDecomposedLoc(Loc);
533	bool Invalid = false;
534	StringRef Buffer = SM.getBufferData(FID: LocInfo.first, Invalid: &Invalid);
535	if (Invalid)
536	return true;
537
538	const char *StrData = Buffer.data()+LocInfo.second;
539
540	if (!IgnoreWhiteSpace && isWhitespace(c: SkipEscapedNewLines(P: StrData)[`0`]))
541	return true;
542
543	// Create a lexer starting at the beginning of this token.
544	Lexer TheLexer(SM.getLocForStartOfFile(FID: LocInfo.first), LangOpts,
545	Buffer.begin(), StrData, Buffer.end());
546	TheLexer.SetCommentRetentionState(true);
547	TheLexer.LexFromRawLexer(Result);
548	return false;
549	}
550
551	/// Returns the pointer that points to the beginning of line that contains
552	/// the given offset, or null if the offset if invalid.
553	static const char findBeginningOfLine(StringRef Buffer, unsigned* Offset) {
554	const char *BufStart = Buffer.data();
555	if (Offset >= Buffer.size())
556	return nullptr;
557
558	const char *LexStart = BufStart + Offset;
559	for (; LexStart != BufStart; --LexStart) {
560	if (isVerticalWhitespace(c: LexStart[`0`]) &&
561	!Lexer::isNewLineEscaped(BufferStart: BufStart, Str: LexStart)) {
562	// LexStart should point at first character of logical line.
563	++LexStart;
564	break;
565	}
566	}
567	return LexStart;
568	}
569
570	static SourceLocation getBeginningOfFileToken(SourceLocation Loc,
571	const SourceManager &SM,
572	const LangOptions &LangOpts) {
573	assert(Loc.isFileID());
574	FileIDAndOffset LocInfo = SM.getDecomposedLoc(Loc);
575	if (LocInfo.first.isInvalid())
576	return Loc;
577
578	bool Invalid = false;
579	StringRef Buffer = SM.getBufferData(FID: LocInfo.first, Invalid: &Invalid);
580	if (Invalid)
581	return Loc;
582
583	// Back up from the current location until we hit the beginning of a line
584	// (or the buffer). We'll relex from that point.
585	const char *StrData = Buffer.data() + LocInfo.second;
586	const char *LexStart = findBeginningOfLine(Buffer, Offset: LocInfo.second);
587	if (!LexStart \|\| LexStart == StrData)
588	return Loc;
589
590	// Create a lexer starting at the beginning of this token.
591	SourceLocation LexerStartLoc = Loc.getLocWithOffset(Offset: -LocInfo.second);
592	Lexer TheLexer(LexerStartLoc, LangOpts, Buffer.data(), LexStart,
593	Buffer.end());
594	TheLexer.SetCommentRetentionState(true);
595
596	// Lex tokens until we find the token that contains the source location.
597	Token TheTok;
598	do {
599	TheLexer.LexFromRawLexer(Result&: TheTok);
600
601	if (TheLexer.getBufferLocation() > StrData) {
602	// Lexing this token has taken the lexer past the source location we're
603	// looking for. If the current token encompasses our source location,
604	// return the beginning of that token.
605	if (TheLexer.getBufferLocation() - TheTok.getLength() <= StrData)
606	return TheTok.getLocation();
607
608	// We ended up skipping over the source location entirely, which means
609	// that it points into whitespace. We're done here.
610	break;
611	}
612	} while (TheTok.getKind() != tok::eof);
613
614	// We've passed our source location; just return the original source location.
615	return Loc;
616	}
617
618	SourceLocation Lexer::GetBeginningOfToken(SourceLocation Loc,
619	const SourceManager &SM,
620	const LangOptions &LangOpts) {
621	if (Loc.isFileID())
622	return getBeginningOfFileToken(Loc, SM, LangOpts);
623
624	if (!SM.isMacroArgExpansion(Loc))
625	return Loc;
626
627	SourceLocation FileLoc = SM.getSpellingLoc(Loc);
628	SourceLocation BeginFileLoc = getBeginningOfFileToken(Loc: FileLoc, SM, LangOpts);
629	FileIDAndOffset FileLocInfo = SM.getDecomposedLoc(Loc: FileLoc);
630	FileIDAndOffset BeginFileLocInfo = SM.getDecomposedLoc(Loc: BeginFileLoc);
631	assert(FileLocInfo.first == BeginFileLocInfo.first &&
632	FileLocInfo.second >= BeginFileLocInfo.second);
633	return Loc.getLocWithOffset(Offset: BeginFileLocInfo.second - FileLocInfo.second);
634	}
635
namespace {

/// Classification of a preprocessor directive seen while scanning for the
/// end of the preamble.
enum PreambleDirectiveKind {
  /// A recognized directive; the scan skips over it and continues.
  PDK_Skipped,
  /// An unrecognized directive; the scan stops at its '#'.
  PDK_Unknown
};

} // namespace
644
645	PreambleBounds Lexer::ComputePreamble(StringRef Buffer,
646	const LangOptions &LangOpts,
647	unsigned MaxLines) {
648	// Create a lexer starting at the beginning of the file. Note that we use a
649	// "fake" file source location at offset 1 so that the lexer will track our
650	// position within the file.
651	const SourceLocation::UIntTy StartOffset = `1`;
652	SourceLocation FileLoc = SourceLocation::getFromRawEncoding(Encoding: StartOffset);
653	Lexer TheLexer(FileLoc, LangOpts, Buffer.begin(), Buffer.begin(),
654	Buffer.end());
655	TheLexer.SetCommentRetentionState(true);
656
657	bool InPreprocessorDirective = false;
658	Token TheTok;
659	SourceLocation ActiveCommentLoc;
660
661	unsigned MaxLineOffset = `0`;
662	if (MaxLines) {
663	const char *CurPtr = Buffer.begin();
664	unsigned CurLine = `0`;
665	while (CurPtr != Buffer.end()) {
666	char ch = *CurPtr++;
667	if (ch == `'\n'`) {
668	++CurLine;
669	if (CurLine == MaxLines)
670	break;
671	}
672	}
673	if (CurPtr != Buffer.end())
674	MaxLineOffset = CurPtr - Buffer.begin();
675	}
676
677	do {
678	TheLexer.LexFromRawLexer(Result&: TheTok);
679
680	if (InPreprocessorDirective) {
681	// If we've hit the end of the file, we're done.
682	if (TheTok.getKind() == tok::eof) {
683	break;
684	}
685
686	// If we haven't hit the end of the preprocessor directive, skip this
687	// token.
688	if (!TheTok.isAtStartOfLine())
689	continue;
690
691	// We've passed the end of the preprocessor directive, and will look
692	// at this token again below.
693	InPreprocessorDirective = false;
694	}
695
696	// Keep track of the # of lines in the preamble.
697	if (TheTok.isAtStartOfLine()) {
698	unsigned TokOffset = TheTok.getLocation().getRawEncoding() - StartOffset;
699
700	// If we were asked to limit the number of lines in the preamble,
701	// and we're about to exceed that limit, we're done.
702	if (MaxLineOffset && TokOffset >= MaxLineOffset)
703	break;
704	}
705
706	// Comments are okay; skip over them.
707	if (TheTok.getKind() == tok::comment) {
708	if (ActiveCommentLoc.isInvalid())
709	ActiveCommentLoc = TheTok.getLocation();
710	continue;
711	}
712
713	if (TheTok.isAtStartOfLine() && TheTok.getKind() == tok::hash) {
714	// This is the start of a preprocessor directive.
715	Token HashTok = TheTok;
716	InPreprocessorDirective = true;
717	ActiveCommentLoc = SourceLocation ();
718
719	// Figure out which directive this is. Since we're lexing raw tokens,
720	// we don't have an identifier table available. Instead, just look at
721	// the raw identifier to recognize and categorize preprocessor directives.
722	TheLexer.LexFromRawLexer(Result&: TheTok);
723	if (TheTok.getKind() == tok::raw_identifier && !TheTok.needsCleaning()) {
724	StringRef Keyword = TheTok.getRawIdentifier();
725	PreambleDirectiveKind PDK
726	= llvm::StringSwitch<PreambleDirectiveKind>(Keyword)
727	.Case(S: "include", Value: PDK_Skipped)
728	.Case(S: "__include_macros", Value: PDK_Skipped)
729	.Case(S: "define", Value: PDK_Skipped)
730	.Case(S: "undef", Value: PDK_Skipped)
731	.Case(S: "line", Value: PDK_Skipped)
732	.Case(S: "error", Value: PDK_Skipped)
733	.Case(S: "pragma", Value: PDK_Skipped)
734	.Case(S: "import", Value: PDK_Skipped)
735	.Case(S: "include_next", Value: PDK_Skipped)
736	.Case(S: "warning", Value: PDK_Skipped)
737	.Case(S: "ident", Value: PDK_Skipped)
738	.Case(S: "sccs", Value: PDK_Skipped)
739	.Case(S: "assert", Value: PDK_Skipped)
740	.Case(S: "unassert", Value: PDK_Skipped)
741	.Case(S: "if", Value: PDK_Skipped)
742	.Case(S: "ifdef", Value: PDK_Skipped)
743	.Case(S: "ifndef", Value: PDK_Skipped)
744	.Case(S: "elif", Value: PDK_Skipped)
745	.Case(S: "elifdef", Value: PDK_Skipped)
746	.Case(S: "elifndef", Value: PDK_Skipped)
747	.Case(S: "else", Value: PDK_Skipped)
748	.Case(S: "endif", Value: PDK_Skipped)
749	.Default(Value: PDK_Unknown);
750
751	switch (PDK) {
752	case PDK_Skipped:
753	continue;
754
755	case PDK_Unknown:
756	// We don't know what this directive is; stop at the '#'.
757	break;
758	}
759	}
760
761	// We only end up here if we didn't recognize the preprocessor
762	// directive or it was one that can't occur in the preamble at this
763	// point. Roll back the current token to the location of the '#'.
764	TheTok = HashTok;
765	} else if (TheTok.isAtStartOfLine() &&
766	TheTok.getKind() == tok::raw_identifier &&
767	TheTok.getRawIdentifier() == "module" &&
768	LangOpts.CPlusPlusModules) {
769	// The initial global module fragment introducer "module;" is part of
770	// the preamble, which runs up to the module declaration "module foo;".
771	Token ModuleTok = TheTok;
772	do {
773	TheLexer.LexFromRawLexer(Result&: TheTok);
774	} while (TheTok.getKind() == tok::comment);
775	if (TheTok.getKind() != tok::semi) {
776	// Not global module fragment, roll back.
777	TheTok = ModuleTok;
778	break;
779	}
780	continue;
781	}
782
783	// We hit a token that we don't recognize as being in the
784	// "preprocessing only" part of the file, so we're no longer in
785	// the preamble.
786	break;
787	} while (true);
788
789	SourceLocation End;
790	if (ActiveCommentLoc.isValid())
791	End = ActiveCommentLoc; // don't truncate a decl comment.
792	else
793	End = TheTok.getLocation();
794
795	return PreambleBounds (End.getRawEncoding() - FileLoc.getRawEncoding(),
796	TheTok.isAtStartOfLine());
797	}
798
799	unsigned Lexer::getTokenPrefixLength(SourceLocation TokStart, unsigned CharNo,
800	const SourceManager &SM,
801	const LangOptions &LangOpts) {
802	// Figure out how many physical characters away the specified expansion
803	// character is. This needs to take into consideration newlines and
804	// trigraphs.
805	bool Invalid = false;
806	const char *TokPtr = SM.getCharacterData(SL: TokStart, Invalid: &Invalid);
807
808	// If they request the first char of the token, we're trivially done.
809	if (Invalid \|\| (CharNo == `0` && Lexer::isObviouslySimpleCharacter(C: *TokPtr)))
810	return `0`;
811
812	unsigned PhysOffset = `0`;
813
814	// The usual case is that tokens don't contain anything interesting. Skip
815	// over the uninteresting characters. If a token only consists of simple
816	// chars, this method is extremely fast.
817	while (Lexer::isObviouslySimpleCharacter(C: *TokPtr)) {
818	if (CharNo == `0`)
819	return PhysOffset;
820	++TokPtr;
821	--CharNo;
822	++PhysOffset;
823	}
824
825	// If we have a character that may be a trigraph or escaped newline, use a
826	// lexer to parse it correctly.
827	for (; CharNo; --CharNo) {
828	auto CharAndSize = Lexer::getCharAndSizeNoWarn(Ptr: TokPtr, LangOpts);
829	TokPtr += CharAndSize.Size;
830	PhysOffset += CharAndSize.Size;
831	}
832
833	// Final detail: if we end up on an escaped newline, we want to return the
834	// location of the actual byte of the token. For example foo\<newline>bar
835	// advanced by 3 should return the location of b, not of \\. One compounding
836	// detail of this is that the escape may be made by a trigraph.
837	if (!Lexer::isObviouslySimpleCharacter(C: *TokPtr))
838	PhysOffset += Lexer::SkipEscapedNewLines(P: TokPtr)-TokPtr;
839
840	return PhysOffset;
841	}
842
843	/// Computes the source location just past the end of the
844	/// token at this source location.
845	///
846	/// This routine can be used to produce a source location that
847	/// points just past the end of the token referenced by \p Loc, and
848	/// is generally used when a diagnostic needs to point just after a
849	/// token where it expected something different that it received. If
850	/// the returned source location would not be meaningful (e.g., if
851	/// it points into a macro), this routine returns an invalid
852	/// source location.
853	///
854	/// \param Offset an offset from the end of the token, where the source
855	/// location should refer to. The default offset (0) produces a source
856	/// location pointing just past the end of the token; an offset of 1 produces
857	/// a source location pointing to the last character in the token, etc.
858	SourceLocation Lexer::getLocForEndOfToken(SourceLocation Loc, unsigned Offset,
859	const SourceManager &SM,
860	const LangOptions &LangOpts) {
861	if (Loc.isInvalid())
862	return {};
863
864	if (Loc.isMacroID()) {
865	// Token split (for example, splitting '>>' into two '>' tokens) is
866	// represented in SourceManager as an ExpansionInfo (see
867	// createForTokenSplit), so these locations are MacroIDs even when no user
868	// macro is involved. For split expansions, the expansion end is already
869	// the correct insertion point.
870	const FileID LocFileID = SM.getFileID(SpellingLoc: Loc);
871	if (Offset > `0` \|\| !isAtEndOfMacroExpansion(loc: Loc, SM, LangOpts, MacroEnd: &Loc))
872	return {}; // Points inside the macro expansion.
873	if (!SM.getSLocEntry(FID: LocFileID).getExpansion().isExpansionTokenRange())
874	return Loc;
875	}
876
877	unsigned Len = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
878	if (Len > Offset)
879	Len = Len - Offset;
880	else
881	return Loc;
882
883	return Loc.getLocWithOffset(Offset: Len);
884	}
885
886	/// Returns true if the given MacroID location points at the first
887	/// token of the macro expansion.
888	bool Lexer::isAtStartOfMacroExpansion(SourceLocation loc,
889	const SourceManager &SM,
890	const LangOptions &LangOpts,
891	SourceLocation *MacroBegin) {
892	assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc");
893
894	SourceLocation expansionLoc;
895	if (!SM.isAtStartOfImmediateMacroExpansion(Loc: loc, MacroBegin: &expansionLoc))
896	return false;
897
898	if (expansionLoc.isFileID()) {
899	// No other macro expansions, this is the first.
900	if (MacroBegin)
901	*MacroBegin = expansionLoc;
902	return true;
903	}
904
905	return isAtStartOfMacroExpansion(loc: expansionLoc, SM, LangOpts, MacroBegin);
906	}
907
908	/// Returns true if the given MacroID location points at the last
909	/// token of the macro expansion.
910	bool Lexer::isAtEndOfMacroExpansion(SourceLocation loc,
911	const SourceManager &SM,
912	const LangOptions &LangOpts,
913	SourceLocation *MacroEnd) {
914	assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc");
915
916	SourceLocation spellLoc = SM.getSpellingLoc(Loc: loc);
917	unsigned tokLen = MeasureTokenLength(Loc: spellLoc, SM, LangOpts);
918	if (tokLen == `0`)
919	return false;
920
921	SourceLocation afterLoc = loc.getLocWithOffset(Offset: tokLen);
922	SourceLocation expansionLoc;
923	if (!SM.isAtEndOfImmediateMacroExpansion(Loc: afterLoc, MacroEnd: &expansionLoc))
924	return false;
925
926	if (expansionLoc.isFileID()) {
927	// No other macro expansions.
928	if (MacroEnd)
929	*MacroEnd = expansionLoc;
930	return true;
931	}
932
933	return isAtEndOfMacroExpansion(loc: expansionLoc, SM, LangOpts, MacroEnd);
934	}
935
936	static CharSourceRange makeRangeFromFileLocs(CharSourceRange Range,
937	const SourceManager &SM,
938	const LangOptions &LangOpts) {
939	SourceLocation Begin = Range.getBegin();
940	SourceLocation End = Range.getEnd();
941	assert(Begin.isFileID() && End.isFileID());
942	if (Range.isTokenRange()) {
943	End = Lexer::getLocForEndOfToken(Loc: End, Offset: `0`, SM,LangOpts);
944	if (End.isInvalid())
945	return {};
946	}
947
948	// Break down the source locations.
949	auto [FID, BeginOffs] = SM.getDecomposedLoc(Loc: Begin);
950	if (FID.isInvalid())
951	return {};
952
953	unsigned EndOffs;
954	if (!SM.isInFileID(Loc: End, FID, RelativeOffset: &EndOffs) \|\|
955	BeginOffs > EndOffs)
956	return {};
957
958	return CharSourceRange::getCharRange(B: Begin, E: End);
959	}
960
961	// Assumes that `Loc` is in an expansion.
962	static bool isInExpansionTokenRange(const SourceLocation Loc,
963	const SourceManager &SM) {
964	return SM.getSLocEntry(FID: SM.getFileID(SpellingLoc: Loc))
965	.getExpansion()
966	.isExpansionTokenRange();
967	}
968
969	CharSourceRange Lexer::makeFileCharRange(CharSourceRange Range,
970	const SourceManager &SM,
971	const LangOptions &LangOpts) {
972	SourceLocation Begin = Range.getBegin();
973	SourceLocation End = Range.getEnd();
974	if (Begin.isInvalid() \|\| End.isInvalid())
975	return {};
976
977	if (Begin.isFileID() && End.isFileID())
978	return makeRangeFromFileLocs(Range, SM, LangOpts);
979
980	if (Begin.isMacroID() && End.isFileID()) {
981	if (!isAtStartOfMacroExpansion(loc: Begin, SM, LangOpts, MacroBegin: &Begin))
982	return {};
983	Range.setBegin(Begin);
984	return makeRangeFromFileLocs(Range, SM, LangOpts);
985	}
986
987	if (Begin.isFileID() && End.isMacroID()) {
988	if (Range.isTokenRange()) {
989	if (!isAtEndOfMacroExpansion(loc: End, SM, LangOpts, MacroEnd: &End))
990	return {};
991	// Use the original* end, not the expanded one in `End`.*
992	Range.setTokenRange(isInExpansionTokenRange(Loc: Range.getEnd(), SM));
993	} else if (!isAtStartOfMacroExpansion(loc: End, SM, LangOpts, MacroBegin: &End))
994	return {};
995	Range.setEnd(End);
996	return makeRangeFromFileLocs(Range, SM, LangOpts);
997	}
998
999	assert(Begin.isMacroID() && End.isMacroID());
1000	SourceLocation MacroBegin, MacroEnd;
1001	if (isAtStartOfMacroExpansion(loc: Begin, SM, LangOpts, MacroBegin: &MacroBegin) &&
1002	((Range.isTokenRange() && isAtEndOfMacroExpansion(loc: End, SM, LangOpts,
1003	MacroEnd: &MacroEnd)) \|\|
1004	(Range.isCharRange() && isAtStartOfMacroExpansion(loc: End, SM, LangOpts,
1005	MacroBegin: &MacroEnd)))) {
1006	Range.setBegin(MacroBegin);
1007	Range.setEnd(MacroEnd);
1008	// Use the original* `End`, not the expanded one in `MacroEnd`.*
1009	if (Range.isTokenRange())
1010	Range.setTokenRange(isInExpansionTokenRange(Loc: End, SM));
1011	return makeRangeFromFileLocs(Range, SM, LangOpts);
1012	}
1013
1014	bool Invalid = false;
1015	const SrcMgr::SLocEntry &BeginEntry = SM.getSLocEntry(FID: SM.getFileID(SpellingLoc: Begin),
1016	Invalid: &Invalid);
1017	if (Invalid)
1018	return {};
1019
1020	if (BeginEntry.getExpansion().isMacroArgExpansion()) {
1021	const SrcMgr::SLocEntry &EndEntry = SM.getSLocEntry(FID: SM.getFileID(SpellingLoc: End),
1022	Invalid: &Invalid);
1023	if (Invalid)
1024	return {};
1025
1026	if (EndEntry.getExpansion().isMacroArgExpansion() &&
1027	BeginEntry.getExpansion().getExpansionLocStart() ==
1028	EndEntry.getExpansion().getExpansionLocStart()) {
1029	Range.setBegin(SM.getImmediateSpellingLoc(Loc: Begin));
1030	Range.setEnd(SM.getImmediateSpellingLoc(Loc: End));
1031	return makeFileCharRange(Range, SM, LangOpts);
1032	}
1033	}
1034
1035	return {};
1036	}
1037
1038	StringRef Lexer::getSourceText(CharSourceRange Range,
1039	const SourceManager &SM,
1040	const LangOptions &LangOpts,
1041	bool *Invalid) {
1042	Range = makeFileCharRange(Range, SM, LangOpts);
1043	if (Range.isInvalid()) {
1044	if (Invalid) Invalid = true*;
1045	return {};
1046	}
1047
1048	// Break down the source location.
1049	FileIDAndOffset beginInfo = SM.getDecomposedLoc(Loc: Range.getBegin());
1050	if (beginInfo.first.isInvalid()) {
1051	if (Invalid) Invalid = true*;
1052	return {};
1053	}
1054
1055	unsigned EndOffs;
1056	if (!SM.isInFileID(Loc: Range.getEnd(), FID: beginInfo.first, RelativeOffset: &EndOffs) \|\|
1057	beginInfo.second > EndOffs) {
1058	if (Invalid) Invalid = true*;
1059	return {};
1060	}
1061
1062	// Try to the load the file buffer.
1063	bool invalidTemp = false;
1064	StringRef file = SM.getBufferData(FID: beginInfo.first, Invalid: &invalidTemp);
1065	if (invalidTemp) {
1066	if (Invalid) Invalid = true*;
1067	return {};
1068	}
1069
1070	if (Invalid) Invalid = false*;
1071	return file.substr(Start: beginInfo.second, N: EndOffs - beginInfo.second);
1072	}
1073
1074	StringRef Lexer::getImmediateMacroName(SourceLocation Loc,
1075	const SourceManager &SM,
1076	const LangOptions &LangOpts) {
1077	assert(Loc.isMacroID() && "Only reasonable to call this on macros");
1078
1079	// Find the location of the immediate macro expansion.
1080	while (true) {
1081	FileID FID = SM.getFileID(SpellingLoc: Loc);
1082	const SrcMgr::SLocEntry *E = &SM.getSLocEntry(FID);
1083	const SrcMgr::ExpansionInfo &Expansion = E->getExpansion();
1084	Loc = Expansion.getExpansionLocStart();
1085	if (!Expansion.isMacroArgExpansion())
1086	break;
1087
1088	// For macro arguments we need to check that the argument did not come
1089	// from an inner macro, e.g: "MAC1( MAC2(foo) )"
1090
1091	// Loc points to the argument id of the macro definition, move to the
1092	// macro expansion.
1093	Loc = SM.getImmediateExpansionRange(Loc).getBegin();
1094	SourceLocation SpellLoc = Expansion.getSpellingLoc();
1095	if (SpellLoc.isFileID())
1096	break; // No inner macro.
1097
1098	// If spelling location resides in the same FileID as macro expansion
1099	// location, it means there is no inner macro.
1100	FileID MacroFID = SM.getFileID(SpellingLoc: Loc);
1101	if (SM.isInFileID(Loc: SpellLoc, FID: MacroFID))
1102	break;
1103
1104	// Argument came from inner macro.
1105	Loc = SpellLoc;
1106	}
1107
1108	// Find the spelling location of the start of the non-argument expansion
1109	// range. This is where the macro name was spelled in order to begin
1110	// expanding this macro.
1111	Loc = SM.getSpellingLoc(Loc);
1112
1113	// Dig out the buffer where the macro name was spelled and the extents of the
1114	// name so that we can render it into the expansion note.
1115	FileIDAndOffset ExpansionInfo = SM.getDecomposedLoc(Loc);
1116	unsigned MacroTokenLength = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
1117	StringRef ExpansionBuffer = SM.getBufferData(FID: ExpansionInfo.first);
1118	return ExpansionBuffer.substr(Start: ExpansionInfo.second, N: MacroTokenLength);
1119	}
1120
1121	StringRef Lexer::getImmediateMacroNameForDiagnostics(
1122	SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts) {
1123	assert(Loc.isMacroID() && "Only reasonable to call this on macros");
1124	// Walk past macro argument expansions.
1125	while (SM.isMacroArgExpansion(Loc))
1126	Loc = SM.getImmediateExpansionRange(Loc).getBegin();
1127
1128	// If the macro's spelling isn't FileID or from scratch space, then it's
1129	// actually a token paste or stringization (or similar) and not a macro at
1130	// all.
1131	SourceLocation SpellLoc = SM.getSpellingLoc(Loc);
1132	if (!SpellLoc.isFileID() \|\| SM.isWrittenInScratchSpace(Loc: SpellLoc))
1133	return {};
1134
1135	// Find the spelling location of the start of the non-argument expansion
1136	// range. This is where the macro name was spelled in order to begin
1137	// expanding this macro.
1138	Loc = SM.getSpellingLoc(Loc: SM.getImmediateExpansionRange(Loc).getBegin());
1139
1140	// Dig out the buffer where the macro name was spelled and the extents of the
1141	// name so that we can render it into the expansion note.
1142	FileIDAndOffset ExpansionInfo = SM.getDecomposedLoc(Loc);
1143	unsigned MacroTokenLength = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
1144	StringRef ExpansionBuffer = SM.getBufferData(FID: ExpansionInfo.first);
1145	return ExpansionBuffer.substr(Start: ExpansionInfo.second, N: MacroTokenLength);
1146	}
1147
1148	bool Lexer::isAsciiIdentifierContinueChar(char c, const LangOptions &LangOpts) {
1149	return isAsciiIdentifierContinue(c, AllowDollar: LangOpts.DollarIdents);
1150	}
1151
1152	bool Lexer::isNewLineEscaped(const char BufferStart, const* char *Str) {
1153	assert(isVerticalWhitespace(Str[`0`]));
1154	if (Str - `1` < BufferStart)
1155	return false;
1156
1157	if ((Str[`0`] == `'\n'` && Str[-`1`] == `'\r'`) \|\|
1158	(Str[`0`] == `'\r'` && Str[-`1`] == `'\n'`)) {
1159	if (Str - `2` < BufferStart)
1160	return false;
1161	--Str;
1162	}
1163	--Str;
1164
1165	// Rewind to first non-space character:
1166	while (Str > BufferStart && isHorizontalWhitespace(c: *Str))
1167	--Str;
1168
1169	return *Str == `'\\'`;
1170	}
1171
1172	StringRef Lexer::getIndentationForLine(SourceLocation Loc,
1173	const SourceManager &SM) {
1174	if (Loc.isInvalid() \|\| Loc.isMacroID())
1175	return {};
1176	FileIDAndOffset LocInfo = SM.getDecomposedLoc(Loc);
1177	if (LocInfo.first.isInvalid())
1178	return {};
1179	bool Invalid = false;
1180	StringRef Buffer = SM.getBufferData(FID: LocInfo.first, Invalid: &Invalid);
1181	if (Invalid)
1182	return {};
1183	const char *Line = findBeginningOfLine(Buffer, Offset: LocInfo.second);
1184	if (!Line)
1185	return {};
1186	StringRef Rest = Buffer.substr(Start: Line - Buffer.data());
1187	size_t NumWhitespaceChars = Rest.find_first_not_of(Chars: " \t");
1188	return NumWhitespaceChars == StringRef::npos
1189	? ""
1190	: Rest.take_front(N: NumWhitespaceChars);
1191	}
1192
1193	//===----------------------------------------------------------------------===//
1194	// Diagnostics forwarding code.
1195	//===----------------------------------------------------------------------===//
1196
1197	/// GetMappedTokenLoc - If lexing out of a 'mapped buffer', where we pretend the
1198	/// lexer buffer was all expanded at a single point, perform the mapping.
1199	/// This is currently only used for _Pragma implementation, so it is the slow
1200	/// path of the hot getSourceLocation method. Do not allow it to be inlined.
1201	static LLVM_ATTRIBUTE_NOINLINE SourceLocation GetMappedTokenLoc(
1202	Preprocessor &PP, SourceLocation FileLoc, unsigned CharNo, unsigned TokLen);
1203	static SourceLocation GetMappedTokenLoc(Preprocessor &PP,
1204	SourceLocation FileLoc,
1205	unsigned CharNo, unsigned TokLen) {
1206	assert(FileLoc.isMacroID() && "Must be a macro expansion");
1207
1208	// Otherwise, we're lexing "mapped tokens". This is used for things like
1209	// _Pragma handling. Combine the expansion location of FileLoc with the
1210	// spelling location.
1211	SourceManager &SM = PP.getSourceManager();
1212
1213	// Create a new SLoc which is expanded from Expansion(FileLoc) but whose
1214	// characters come from spelling(FileLoc)+Offset.
1215	SourceLocation SpellingLoc = SM.getSpellingLoc(Loc: FileLoc);
1216	SpellingLoc = SpellingLoc.getLocWithOffset(Offset: CharNo);
1217
1218	// Figure out the expansion loc range, which is the range covered by the
1219	// original _Pragma(...) sequence.
1220	CharSourceRange II = SM.getImmediateExpansionRange(Loc: FileLoc);
1221
1222	return SM.createExpansionLoc(SpellingLoc, ExpansionLocStart: II.getBegin(), ExpansionLocEnd: II.getEnd(), Length: TokLen);
1223	}
1224
1225	/// getSourceLocation - Return a source location identifier for the specified
1226	/// offset in the current file.
1227	SourceLocation Lexer::getSourceLocation(const char *Loc,
1228	unsigned TokLen) const {
1229	assert(Loc >= BufferStart && Loc <= BufferEnd &&
1230	"Location out of range for this buffer!");
1231
1232	// In the normal case, we're just lexing from a simple file buffer, return
1233	// the file id from FileLoc with the offset specified.
1234	unsigned CharNo = Loc-BufferStart;
1235	if (FileLoc.isFileID())
1236	return FileLoc.getLocWithOffset(Offset: CharNo);
1237
1238	// Otherwise, this is the _Pragma lexer case, which pretends that all of the
1239	// tokens are lexed from where the _Pragma was defined.
1240	assert(PP && "This doesn't work on raw lexers");
1241	return GetMappedTokenLoc(PP&: *PP, FileLoc, CharNo, TokLen);
1242	}
1243
1244	/// Diag - Forwarding function for diagnostics. This translate a source
1245	/// position in the current buffer into a SourceLocation object for rendering.
1246	DiagnosticBuilder Lexer::Diag(const char Loc, unsigned* DiagID) const {
1247	return PP->Diag(Loc: getSourceLocation(Loc), DiagID);
1248	}
1249
1250	//===----------------------------------------------------------------------===//
1251	// Trigraph and Escaped Newline Handling Code.
1252	//===----------------------------------------------------------------------===//
1253
/// GetTrigraphCharForLetter - Given a character that occurs after a ?? pair,
/// return the decoded trigraph letter it corresponds to, or '\0' if nothing.
static char GetTrigraphCharForLetter(char Letter) {
  switch (Letter) {
  default:   return 0;
  case '=':  return '#';
  case ')':  return ']';
  case '(':  return '[';
  case '!':  return '|';
  case '\'': return '^';
  case '>':  return '}';
  case '/':  return '\\';
  case '<':  return '{';
  case '-':  return '~';
  }
}
1270
1271	/// DecodeTrigraphChar - If the specified character is a legal trigraph when
1272	/// prefixed with ??, emit a trigraph warning. If trigraphs are enabled,
1273	/// return the result character. Finally, emit a warning about trigraph use
1274	/// whether trigraphs are enabled or not.
1275	static char DecodeTrigraphChar(const char CP, Lexer L, bool Trigraphs) {
1276	char Res = GetTrigraphCharForLetter(Letter: *CP);
1277	if (!Res)
1278	return Res;
1279
1280	if (!Trigraphs) {
1281	if (L && !L->isLexingRawMode())
1282	L->Diag(Loc: CP-`2`, DiagID: diag::trigraph_ignored);
1283	return `0`;
1284	}
1285
1286	if (L && !L->isLexingRawMode())
1287	L->Diag(Loc: CP-`2`, DiagID: diag::trigraph_converted) << StringRef(&Res, `1`);
1288	return Res;
1289	}
1290
1291	/// getEscapedNewLineSize - Return the size of the specified escaped newline,
1292	/// or 0 if it is not an escaped newline. P[-1] is known to be a "\" or a
1293	/// trigraph equivalent on entry to this function.
1294	unsigned Lexer::getEscapedNewLineSize(const char *Ptr) {
1295	unsigned Size = `0`;
1296	while (isWhitespace(c: Ptr[Size])) {
1297	++Size;
1298
1299	if (Ptr[Size-`1`] != `'\n'` && Ptr[Size-`1`] != `'\r'`)
1300	continue;
1301
1302	// If this is a \r\n or \n\r, skip the other half.
1303	if ((Ptr[Size] == `'\r'` \|\| Ptr[Size] == `'\n'`) &&
1304	Ptr[Size-`1`] != Ptr[Size])
1305	++Size;
1306
1307	return Size;
1308	}
1309
1310	// Not an escaped newline, must be a \t or something else.
1311	return `0`;
1312	}
1313
1314	/// SkipEscapedNewLines - If P points to an escaped newline (or a series of
1315	/// them), skip over them and return the first non-escaped-newline found,
1316	/// otherwise return P.
1317	const char Lexer::SkipEscapedNewLines(const* char *P) {
1318	while (true) {
1319	const char *AfterEscape;
1320	if (*P == `'\\'`) {
1321	AfterEscape = P+`1`;
1322	} else if (*P == `'?'`) {
1323	// If not a trigraph for escape, bail out.
1324	if (P[`1`] != `'?'` \|\| P[`2`] != `'/'`)
1325	return P;
1326	// FIXME: Take LangOpts into account; the language might not
1327	// support trigraphs.
1328	AfterEscape = P+`3`;
1329	} else {
1330	return P;
1331	}
1332
1333	unsigned NewLineSize = Lexer::getEscapedNewLineSize(Ptr: AfterEscape);
1334	if (NewLineSize == `0`) return P;
1335	P = AfterEscape+NewLineSize;
1336	}
1337	}
1338
1339	std::optional<Token> Lexer::findNextToken(SourceLocation Loc,
1340	const SourceManager &SM,
1341	const LangOptions &LangOpts,
1342	bool IncludeComments) {
1343	if (Loc.isMacroID()) {
1344	if (!Lexer::isAtEndOfMacroExpansion(loc: Loc, SM, LangOpts, MacroEnd: &Loc))
1345	return std::nullopt;
1346	}
1347	Loc = Lexer::getLocForEndOfToken(Loc, Offset: `0`, SM, LangOpts);
1348
1349	// Break down the source location.
1350	FileIDAndOffset LocInfo = SM.getDecomposedLoc(Loc);
1351
1352	// Try to load the file buffer.
1353	bool InvalidTemp = false;
1354	StringRef File = SM.getBufferData(FID: LocInfo.first, Invalid: &InvalidTemp);
1355	if (InvalidTemp)
1356	return std::nullopt;
1357
1358	const char *TokenBegin = File.data() + LocInfo.second;
1359
1360	// Lex from the start of the given location.
1361	Lexer lexer(SM.getLocForStartOfFile(FID: LocInfo.first), LangOpts, File.begin(),
1362	TokenBegin, File.end());
1363	lexer.SetCommentRetentionState(IncludeComments);
1364	// Find the token.
1365	Token Tok;
1366	lexer.LexFromRawLexer(Result&: Tok);
1367	return Tok;
1368	}
1369
1370	std::optional<Token> Lexer::findPreviousToken(SourceLocation Loc,
1371	const SourceManager &SM,
1372	const LangOptions &LangOpts,
1373	bool IncludeComments) {
1374	const auto StartOfFile = SM.getLocForStartOfFile(FID: SM.getFileID(SpellingLoc: Loc));
1375	while (Loc != StartOfFile) {
1376	Loc = Loc.getLocWithOffset(Offset: -`1`);
1377	if (Loc.isInvalid())
1378	return std::nullopt;
1379
1380	Loc = GetBeginningOfToken(Loc, SM, LangOpts);
1381	Token Tok;
1382	if (getRawToken(Loc, Result&: Tok, SM, LangOpts))
1383	continue; // Not a token, go to prev location.
1384	if (!Tok.is(K: tok::comment) \|\| IncludeComments) {
1385	return Tok;
1386	}
1387	}
1388	return std::nullopt;
1389	}
1390
1391	/// Checks that the given token is the first token that occurs after the
1392	/// given location (this excludes comments and whitespace). Returns the location
1393	/// immediately after the specified token. If the token is not found or the
1394	/// location is inside a macro, the returned source location will be invalid.
1395	SourceLocation Lexer::findLocationAfterToken(
1396	SourceLocation Loc, tok::TokenKind TKind, const SourceManager &SM,
1397	const LangOptions &LangOpts, bool SkipTrailingWhitespaceAndNewLine) {
1398	std::optional<Token> Tok = findNextToken(Loc, SM, LangOpts);
1399	if (!Tok \|\| Tok ->isNot(K: TKind))
1400	return {};
1401	SourceLocation TokenLoc = Tok ->getLocation();
1402
1403	// Calculate how much whitespace needs to be skipped if any.
1404	unsigned NumWhitespaceChars = `0`;
1405	if (SkipTrailingWhitespaceAndNewLine) {
1406	const char *TokenEnd = SM.getCharacterData(SL: TokenLoc) + Tok ->getLength();
1407	unsigned char C = *TokenEnd;
1408	while (isHorizontalWhitespace(c: C)) {
1409	C = *(++TokenEnd);
1410	NumWhitespaceChars++;
1411	}
1412
1413	// Skip \r, \n, \r\n, or \n\r
1414	if (C == `'\n'` \|\| C == `'\r'`) {
1415	char PrevC = C;
1416	C = *(++TokenEnd);
1417	NumWhitespaceChars++;
1418	if ((C == `'\n'` \|\| C == `'\r'`) && C != PrevC)
1419	NumWhitespaceChars++;
1420	}
1421	}
1422
1423	return TokenLoc.getLocWithOffset(Offset: Tok ->getLength() + NumWhitespaceChars);
1424	}
1425
1426	/// getCharAndSizeSlow - Peek a single 'character' from the specified buffer,
1427	/// get its size, and return it. This is tricky in several cases:
1428	/// 1. If currently at the start of a trigraph, we warn about the trigraph,
1429	/// then either return the trigraph (skipping 3 chars) or the '?',
1430	/// depending on whether trigraphs are enabled or not.
1431	/// 2. If this is an escaped newline (potentially with whitespace between
1432	/// the backslash and newline), implicitly skip the newline and return
1433	/// the char after it.
1434	///
1435	/// This handles the slow/uncommon case of the getCharAndSize method. Here we
1436	/// know that we can accumulate into Size, and that we have already incremented
1437	/// Ptr by Size bytes.
1438	///
1439	/// NOTE: When this method is updated, getCharAndSizeSlowNoWarn (below) should
1440	/// be updated to match.
1441	Lexer::SizedChar Lexer::getCharAndSizeSlow(const char Ptr, Token Tok) {
1442	unsigned Size = `0`;
1443	// If we have a slash, look for an escaped newline.
1444	if (Ptr[`0`] == `'\\'`) {
1445	++Size;
1446	++Ptr;
1447	Slash:
1448	// Common case, backslash-char where the char is not whitespace.
1449	if (!isWhitespace(c: Ptr[`0`]))
1450	return {.Char: `'\\'`, .Size: Size};
1451
1452	// See if we have optional whitespace characters between the slash and
1453	// newline.
1454	if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
1455	// Remember that this token needs to be cleaned.
1456	if (Tok) Tok->setFlag(Token::NeedsCleaning);
1457
1458	// Warn if there was whitespace between the backslash and newline.
1459	if (Ptr[`0`] != `'\n'` && Ptr[`0`] != `'\r'` && Tok && !isLexingRawMode())
1460	Diag(Loc: Ptr, DiagID: diag::backslash_newline_space);
1461
1462	// Found backslash<whitespace><newline>. Parse the char after it.
1463	Size += EscapedNewLineSize;
1464	Ptr += EscapedNewLineSize;
1465
1466	// Use slow version to accumulate a correct size field.
1467	auto CharAndSize = getCharAndSizeSlow(Ptr, Tok);
1468	CharAndSize.Size += Size;
1469	return CharAndSize;
1470	}
1471
1472	// Otherwise, this is not an escaped newline, just return the slash.
1473	return {.Char: `'\\'`, .Size: Size};
1474	}
1475
1476	// If this is a trigraph, process it.
1477	if (Ptr[`0`] == `'?'` && Ptr[`1`] == `'?'`) {
1478	// If this is actually a legal trigraph (not something like "??x"), emit
1479	// a trigraph warning. If so, and if trigraphs are enabled, return it.
1480	if (char C = DecodeTrigraphChar(CP: Ptr + `2`, L: Tok ? this : nullptr,
1481	Trigraphs: LangOpts.Trigraphs)) {
1482	// Remember that this token needs to be cleaned.
1483	if (Tok) Tok->setFlag(Token::NeedsCleaning);
1484
1485	Ptr += `3`;
1486	Size += `3`;
1487	if (C == `'\\'`) goto Slash;
1488	return {.Char: C, .Size: Size};
1489	}
1490	}
1491
1492	// If this is neither, return a single character.
1493	return {.Char: *Ptr, .Size: Size + `1u`};
1494	}
1495
1496	/// getCharAndSizeSlowNoWarn - Handle the slow/uncommon case of the
1497	/// getCharAndSizeNoWarn method. Here we know that we can accumulate into Size,
1498	/// and that we have already incremented Ptr by Size bytes.
1499	///
1500	/// NOTE: When this method is updated, getCharAndSizeSlow (above) should
1501	/// be updated to match.
1502	Lexer::SizedChar Lexer::getCharAndSizeSlowNoWarn(const char *Ptr,
1503	const LangOptions &LangOpts) {
1504
1505	unsigned Size = `0`;
1506	// If we have a slash, look for an escaped newline.
1507	if (Ptr[`0`] == `'\\'`) {
1508	++Size;
1509	++Ptr;
1510	Slash:
1511	// Common case, backslash-char where the char is not whitespace.
1512	if (!isWhitespace(c: Ptr[`0`]))
1513	return {.Char: `'\\'`, .Size: Size};
1514
1515	// See if we have optional whitespace characters followed by a newline.
1516	if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
1517	// Found backslash<whitespace><newline>. Parse the char after it.
1518	Size += EscapedNewLineSize;
1519	Ptr += EscapedNewLineSize;
1520
1521	// Use slow version to accumulate a correct size field.
1522	auto CharAndSize = getCharAndSizeSlowNoWarn(Ptr, LangOpts);
1523	CharAndSize.Size += Size;
1524	return CharAndSize;
1525	}
1526
1527	// Otherwise, this is not an escaped newline, just return the slash.
1528	return {.Char: `'\\'`, .Size: Size};
1529	}
1530
1531	// If this is a trigraph, process it.
1532	if (LangOpts.Trigraphs && Ptr[`0`] == `'?'` && Ptr[`1`] == `'?'`) {
1533	// If this is actually a legal trigraph (not something like "??x"), return
1534	// it.
1535	if (char C = GetTrigraphCharForLetter(Letter: Ptr[`2`])) {
1536	Ptr += `3`;
1537	Size += `3`;
1538	if (C == `'\\'`) goto Slash;
1539	return {.Char: C, .Size: Size};
1540	}
1541	}
1542
1543	// If this is neither, return a single character.
1544	return {.Char: *Ptr, .Size: Size + `1u`};
1545	}
1546
1547	//===----------------------------------------------------------------------===//
1548	// Helper methods for lexing.
1549	//===----------------------------------------------------------------------===//
1550
1551	/// Routine that indiscriminately sets the offset into the source file.
1552	void Lexer::SetByteOffset(unsigned Offset, bool StartOfLine) {
1553	BufferPtr = BufferStart + Offset;
1554	if (BufferPtr > BufferEnd)
1555	BufferPtr = BufferEnd;
1556	// FIXME: What exactly does the StartOfLine bit mean? There are two
1557	// possible meanings for the "start" of the line: the first token on the
1558	// unexpanded line, or the first token on the expanded line.
1559	IsAtStartOfLine = StartOfLine;
1560	IsAtPhysicalStartOfLine = StartOfLine;
1561	}
1562
1563	static bool isUnicodeWhitespace(uint32_t Codepoint) {
1564	static const llvm::sys::UnicodeCharSet UnicodeWhitespaceChars(
1565	UnicodeWhitespaceCharRanges);
1566	return UnicodeWhitespaceChars.contains(C: Codepoint);
1567	}
1568
1569	static llvm::SmallString<`5`> codepointAsHexString(uint32_t C) {
1570	llvm::SmallString<`5`> CharBuf;
1571	llvm::raw_svector_ostream CharOS(CharBuf);
1572	llvm::write_hex(S&: CharOS, N: C, Style: llvm::HexPrintStyle::Upper, Width: `4`);
1573	return CharBuf;
1574	}
1575
1576	// To mitigate https://github.com/llvm/llvm-project/issues/54732,
1577	// we allow "Mathematical Notation Characters" in identifiers.
1578	// This is a proposed profile that extends the XID_Start/XID_continue
1579	// with mathematical symbols, superscipts and subscripts digits
1580	// found in some production software.
1581	// https://www.unicode.org/L2/L2022/22230-math-profile.pdf
1582	static bool isMathematicalExtensionID(uint32_t C, const LangOptions &LangOpts,
1583	bool IsStart, bool &IsExtension) {
1584	static const llvm::sys::UnicodeCharSet MathStartChars(
1585	MathematicalNotationProfileIDStartRanges);
1586	static const llvm::sys::UnicodeCharSet MathContinueChars(
1587	MathematicalNotationProfileIDContinueRanges);
1588	if (MathStartChars.contains(C) \|\|
1589	(!IsStart && MathContinueChars.contains(C))) {
1590	IsExtension = true;
1591	return true;
1592	}
1593	return false;
1594	}
1595
1596	static bool isAllowedIDChar(uint32_t C, const LangOptions &LangOpts,
1597	bool &IsExtension) {
1598	if (LangOpts.AsmPreprocessor) {
1599	return false;
1600	} else if (LangOpts.DollarIdents && `'$'` == C) {
1601	return true;
1602	} else if (LangOpts.CPlusPlus \|\| LangOpts.C23) {
1603	// A non-leading codepoint must have the XID_Continue property.
1604	// XIDContinueRanges doesn't contains characters also in XIDStartRanges,
1605	// so we need to check both tables.
1606	// '_' doesn't have the XID_Continue property but is allowed in C and C++.
1607	static const llvm::sys::UnicodeCharSet XIDStartChars(XIDStartRanges);
1608	static const llvm::sys::UnicodeCharSet XIDContinueChars(XIDContinueRanges);
1609	if (C == `'_'` \|\| XIDStartChars.contains(C) \|\| XIDContinueChars.contains(C))
1610	return true;
1611	return isMathematicalExtensionID(C, LangOpts, /IsStart=/false,
1612	IsExtension);
1613	} else if (LangOpts.C11) {
1614	static const llvm::sys::UnicodeCharSet C11AllowedIDChars(
1615	C11AllowedIDCharRanges);
1616	return C11AllowedIDChars.contains(C);
1617	} else {
1618	static const llvm::sys::UnicodeCharSet C99AllowedIDChars(
1619	C99AllowedIDCharRanges);
1620	return C99AllowedIDChars.contains(C);
1621	}
1622	}
1623
1624	static bool isAllowedInitiallyIDChar(uint32_t C, const LangOptions &LangOpts,
1625	bool &IsExtension) {
1626	assert(C > `0x7F` && "isAllowedInitiallyIDChar called with an ASCII codepoint");
1627	IsExtension = false;
1628	if (LangOpts.AsmPreprocessor) {
1629	return false;
1630	}
1631	if (LangOpts.CPlusPlus \|\| LangOpts.C23) {
1632	static const llvm::sys::UnicodeCharSet XIDStartChars(XIDStartRanges);
1633	if (XIDStartChars.contains(C))
1634	return true;
1635	return isMathematicalExtensionID(C, LangOpts, /IsStart=/true,
1636	IsExtension);
1637	}
1638	if (!isAllowedIDChar(C, LangOpts, IsExtension))
1639	return false;
1640	if (LangOpts.C11) {
1641	static const llvm::sys::UnicodeCharSet C11DisallowedInitialIDChars(
1642	C11DisallowedInitialIDCharRanges);
1643	return !C11DisallowedInitialIDChars.contains(C);
1644	}
1645	static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars(
1646	C99DisallowedInitialIDCharRanges);
1647	return !C99DisallowedInitialIDChars.contains(C);
1648	}
1649
1650	static void diagnoseExtensionInIdentifier(DiagnosticsEngine &Diags, uint32_t C,
1651	CharSourceRange Range) {
1652
1653	static const llvm::sys::UnicodeCharSet MathStartChars(
1654	MathematicalNotationProfileIDStartRanges);
1655	static const llvm::sys::UnicodeCharSet MathContinueChars(
1656	MathematicalNotationProfileIDContinueRanges);
1657
1658	(void)MathStartChars;
1659	(void)MathContinueChars;
1660	assert((MathStartChars.contains(C) \|\| MathContinueChars.contains(C)) &&
1661	"Unexpected mathematical notation codepoint");
1662	Diags.Report(Loc: Range.getBegin(), DiagID: diag::ext_mathematical_notation)
1663	<< codepointAsHexString(C) << Range;
1664	}
1665
1666	static inline CharSourceRange makeCharRange(Lexer &L, const char *Begin,
1667	const char *End) {
1668	return CharSourceRange::getCharRange(B: L.getSourceLocation(Loc: Begin),
1669	E: L.getSourceLocation(Loc: End));
1670	}
1671
1672	static void maybeDiagnoseIDCharCompat(DiagnosticsEngine &Diags, uint32_t C,
1673	CharSourceRange Range, bool IsFirst) {
1674	// Check C99 compatibility.
1675	if (!Diags.isIgnored(DiagID: diag::warn_c99_compat_unicode_id, Loc: Range.getBegin())) {
1676	enum {
1677	CannotAppearInIdentifier = `0`,
1678	CannotStartIdentifier
1679	};
1680
1681	static const llvm::sys::UnicodeCharSet C99AllowedIDChars(
1682	C99AllowedIDCharRanges);
1683	static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars(
1684	C99DisallowedInitialIDCharRanges);
1685	if (!C99AllowedIDChars.contains(C)) {
1686	Diags.Report(Loc: Range.getBegin(), DiagID: diag::warn_c99_compat_unicode_id)
1687	<< Range
1688	<< CannotAppearInIdentifier;
1689	} else if (IsFirst && C99DisallowedInitialIDChars.contains(C)) {
1690	Diags.Report(Loc: Range.getBegin(), DiagID: diag::warn_c99_compat_unicode_id)
1691	<< Range
1692	<< CannotStartIdentifier;
1693	}
1694	}
1695	}
1696
1697	/// After encountering UTF-8 character C and interpreting it as an identifier
1698	/// character, check whether it's a homoglyph for a common non-identifier
1699	/// source character that is unlikely to be an intentional identifier
1700	/// character and warn if so.
1701	static void maybeDiagnoseUTF8Homoglyph(DiagnosticsEngine &Diags, uint32_t C,
1702	CharSourceRange Range) {
1703	// FIXME: Handle Unicode quotation marks (smart quotes, fullwidth quotes).
1704	struct HomoglyphPair {
1705	uint32_t Character;
1706	char LooksLike;
1707	bool operator<(HomoglyphPair R) const { return Character < R.Character; }
1708	};
1709	static constexpr HomoglyphPair SortedHomoglyphs[] = {
1710	{.Character: U`'\u00ad'`, .LooksLike: `0`}, // SOFT HYPHEN
1711	{.Character: U`'\u01c3'`, .LooksLike: `'!'`}, // LATIN LETTER RETROFLEX CLICK
1712	{.Character: U`'\u037e'`, .LooksLike: `';'`}, // GREEK QUESTION MARK
1713	{.Character: U`'\u200b'`, .LooksLike: `0`}, // ZERO WIDTH SPACE
1714	{.Character: U`'\u200c'`, .LooksLike: `0`}, // ZERO WIDTH NON-JOINER
1715	{.Character: U`'\u200d'`, .LooksLike: `0`}, // ZERO WIDTH JOINER
1716	{.Character: U`'\u2060'`, .LooksLike: `0`}, // WORD JOINER
1717	{.Character: U`'\u2061'`, .LooksLike: `0`}, // FUNCTION APPLICATION
1718	{.Character: U`'\u2062'`, .LooksLike: `0`}, // INVISIBLE TIMES
1719	{.Character: U`'\u2063'`, .LooksLike: `0`}, // INVISIBLE SEPARATOR
1720	{.Character: U`'\u2064'`, .LooksLike: `0`}, // INVISIBLE PLUS
1721	{.Character: U`'\u2212'`, .LooksLike: `'-'`}, // MINUS SIGN
1722	{.Character: U`'\u2215'`, .LooksLike: `'/'`}, // DIVISION SLASH
1723	{.Character: U`'\u2216'`, .LooksLike: `'\\'`}, // SET MINUS
1724	{.Character: U`'\u2217'`, .LooksLike: `''`}, // ASTERISK OPERATOR*
1725	{.Character: U`'\u2223'`, .LooksLike: `'\|'`}, // DIVIDES
1726	{.Character: U`'\u2227'`, .LooksLike: `'^'`}, // LOGICAL AND
1727	{.Character: U`'\u2236'`, .LooksLike: `':'`}, // RATIO
1728	{.Character: U`'\u223c'`, .LooksLike: `'~'`}, // TILDE OPERATOR
1729	{.Character: U`'\ua789'`, .LooksLike: `':'`}, // MODIFIER LETTER COLON
1730	{.Character: U`'\ufeff'`, .LooksLike: `0`}, // ZERO WIDTH NO-BREAK SPACE
1731	{.Character: U`'\uff01'`, .LooksLike: `'!'`}, // FULLWIDTH EXCLAMATION MARK
1732	{.Character: U`'\uff03'`, .LooksLike: `'#'`}, // FULLWIDTH NUMBER SIGN
1733	{.Character: U`'\uff04'`, .LooksLike: `'$'`}, // FULLWIDTH DOLLAR SIGN
1734	{.Character: U`'\uff05'`, .LooksLike: `'%'`}, // FULLWIDTH PERCENT SIGN
1735	{.Character: U`'\uff06'`, .LooksLike: `'&'`}, // FULLWIDTH AMPERSAND
1736	{.Character: U`'\uff08'`, .LooksLike: `'('`}, // FULLWIDTH LEFT PARENTHESIS
1737	{.Character: U`'\uff09'`, .LooksLike: `')'`}, // FULLWIDTH RIGHT PARENTHESIS
1738	{.Character: U`'\uff0a'`, .LooksLike: `''`}, // FULLWIDTH ASTERISK*
1739	{.Character: U`'\uff0b'`, .LooksLike: `'+'`}, // FULLWIDTH ASTERISK
1740	{.Character: U`'\uff0c'`, .LooksLike: `','`}, // FULLWIDTH COMMA
1741	{.Character: U`'\uff0d'`, .LooksLike: `'-'`}, // FULLWIDTH HYPHEN-MINUS
1742	{.Character: U`'\uff0e'`, .LooksLike: `'.'`}, // FULLWIDTH FULL STOP
1743	{.Character: U`'\uff0f'`, .LooksLike: `'/'`}, // FULLWIDTH SOLIDUS
1744	{.Character: U`'\uff1a'`, .LooksLike: `':'`}, // FULLWIDTH COLON
1745	{.Character: U`'\uff1b'`, .LooksLike: `';'`}, // FULLWIDTH SEMICOLON
1746	{.Character: U`'\uff1c'`, .LooksLike: `'<'`}, // FULLWIDTH LESS-THAN SIGN
1747	{.Character: U`'\uff1d'`, .LooksLike: `'='`}, // FULLWIDTH EQUALS SIGN
1748	{.Character: U`'\uff1e'`, .LooksLike: `'>'`}, // FULLWIDTH GREATER-THAN SIGN
1749	{.Character: U`'\uff1f'`, .LooksLike: `'?'`}, // FULLWIDTH QUESTION MARK
1750	{.Character: U`'\uff20'`, .LooksLike: `'@'`}, // FULLWIDTH COMMERCIAL AT
1751	{.Character: U`'\uff3b'`, .LooksLike: `'['`}, // FULLWIDTH LEFT SQUARE BRACKET
1752	{.Character: U`'\uff3c'`, .LooksLike: `'\\'`}, // FULLWIDTH REVERSE SOLIDUS
1753	{.Character: U`'\uff3d'`, .LooksLike: `']'`}, // FULLWIDTH RIGHT SQUARE BRACKET
1754	{.Character: U`'\uff3e'`, .LooksLike: `'^'`}, // FULLWIDTH CIRCUMFLEX ACCENT
1755	{.Character: U`'\uff5b'`, .LooksLike: `'{'`}, // FULLWIDTH LEFT CURLY BRACKET
1756	{.Character: U`'\uff5c'`, .LooksLike: `'\|'`}, // FULLWIDTH VERTICAL LINE
1757	{.Character: U`'\uff5d'`, .LooksLike: `'}'`}, // FULLWIDTH RIGHT CURLY BRACKET
1758	{.Character: U`'\uff5e'`, .LooksLike: `'~'`}, // FULLWIDTH TILDE
1759	{.Character: `0`, .LooksLike: `0`}
1760	};
1761	auto Homoglyph =
1762	std::lower_bound(first: std::begin(arr: SortedHomoglyphs),
1763	last: std::end(arr: SortedHomoglyphs) - `1`, val: HomoglyphPair{.Character: C, .LooksLike: `'\0'`});
1764	if (Homoglyph->Character == C) {
1765	if (Homoglyph->LooksLike) {
1766	const char LooksLikeStr[] = {Homoglyph->LooksLike, `0`};
1767	Diags.Report(Loc: Range.getBegin(), DiagID: diag::warn_utf8_symbol_homoglyph)
1768	<< Range << codepointAsHexString(C) << LooksLikeStr;
1769	} else {
1770	Diags.Report(Loc: Range.getBegin(), DiagID: diag::warn_utf8_symbol_zero_width)
1771	<< Range << codepointAsHexString(C);
1772	}
1773	}
1774	}
1775
1776	static void diagnoseInvalidUnicodeCodepointInIdentifier(
1777	DiagnosticsEngine &Diags, const LangOptions &LangOpts, uint32_t CodePoint,
1778	CharSourceRange Range, bool IsFirst) {
1779	if (isASCII(c: CodePoint))
1780	return;
1781
1782	bool IsExtension;
1783	bool IsIDStart = isAllowedInitiallyIDChar(C: CodePoint, LangOpts, IsExtension);
1784	bool IsIDContinue =
1785	IsIDStart \|\| isAllowedIDChar(C: CodePoint, LangOpts, IsExtension);
1786
1787	if ((IsFirst && IsIDStart) \|\| (!IsFirst && IsIDContinue))
1788	return;
1789
1790	bool InvalidOnlyAtStart = IsFirst && !IsIDStart && IsIDContinue;
1791
1792	if (!IsFirst \|\| InvalidOnlyAtStart) {
1793	Diags.Report(Loc: Range.getBegin(), DiagID: diag::err_character_not_allowed_identifier)
1794	<< Range << codepointAsHexString(C: CodePoint) << int(InvalidOnlyAtStart)
1795	<< FixItHint::CreateRemoval(RemoveRange: Range);
1796	} else {
1797	Diags.Report(Loc: Range.getBegin(), DiagID: diag::err_character_not_allowed)
1798	<< Range << codepointAsHexString(C: CodePoint)
1799	<< FixItHint::CreateRemoval(RemoveRange: Range);
1800	}
1801	}
1802
1803	bool Lexer::tryConsumeIdentifierUCN(const char &CurPtr, unsigned* Size,
1804	Token &Result) {
1805	const char *UCNPtr = CurPtr + Size;
1806	uint32_t CodePoint = tryReadUCN(StartPtr&: UCNPtr, SlashLoc: CurPtr, /Token=/Result: nullptr);
1807	if (CodePoint == `0`) {
1808	return false;
1809	}
1810	bool IsExtension = false;
1811	if (!isAllowedIDChar(C: CodePoint, LangOpts, IsExtension)) {
1812	if (isASCII(c: CodePoint) \|\| isUnicodeWhitespace(Codepoint: CodePoint))
1813	return false;
1814	if (!isLexingRawMode() && !ParsingPreprocessorDirective &&
1815	!PP->isPreprocessedOutput())
1816	diagnoseInvalidUnicodeCodepointInIdentifier(
1817	Diags&: PP->getDiagnostics(), LangOpts, CodePoint,
1818	Range: makeCharRange(L&: *this, Begin: CurPtr, End: UCNPtr),
1819	/IsFirst=/false);
1820
1821	// We got a unicode codepoint that is neither a space nor a
1822	// a valid identifier part.
1823	// Carry on as if the codepoint was valid for recovery purposes.
1824	} else if (!isLexingRawMode()) {
1825	if (IsExtension)
1826	diagnoseExtensionInIdentifier(Diags&: PP->getDiagnostics(), C: CodePoint,
1827	Range: makeCharRange(L&: *this, Begin: CurPtr, End: UCNPtr));
1828
1829	maybeDiagnoseIDCharCompat(Diags&: PP->getDiagnostics(), C: CodePoint,
1830	Range: makeCharRange(L&: *this, Begin: CurPtr, End: UCNPtr),
1831	/IsFirst=/false);
1832	}
1833
1834	Result.setFlag(Token::HasUCN);
1835	if ((UCNPtr - CurPtr == `6` && CurPtr[`1`] == `'u'`) \|\|
1836	(UCNPtr - CurPtr == `10` && CurPtr[`1`] == `'U'`))
1837	CurPtr = UCNPtr;
1838	else
1839	while (CurPtr != UCNPtr)
1840	(void)getAndAdvanceChar(Ptr&: CurPtr, Tok&: Result);
1841	return true;
1842	}
1843
1844	bool Lexer::tryConsumeIdentifierUTF8Char(const char *&CurPtr, Token &Result) {
1845	llvm::UTF32 CodePoint;
1846
1847	// If a UTF-8 codepoint appears immediately after an escaped new line,
1848	// CurPtr may point to the splicing \ on the preceding line,
1849	// so we need to skip it.
1850	unsigned FirstCodeUnitSize;
1851	getCharAndSize(Ptr: CurPtr, Size&: FirstCodeUnitSize);
1852	const char *CharStart = CurPtr + FirstCodeUnitSize - `1`;
1853	const char *UnicodePtr = CharStart;
1854
1855	llvm::ConversionResult ConvResult = llvm::convertUTF8Sequence(
1856	source: (const llvm::UTF8 *)&UnicodePtr, sourceEnd: (const* llvm::UTF8 *)BufferEnd,
1857	target: &CodePoint, flags: llvm::strictConversion);
1858	if (ConvResult != llvm::conversionOK)
1859	return false;
1860
1861	bool IsExtension = false;
1862	if (!isAllowedIDChar(C: static_cast<uint32_t>(CodePoint), LangOpts,
1863	IsExtension)) {
1864	if (isASCII(c: CodePoint) \|\| isUnicodeWhitespace(Codepoint: CodePoint))
1865	return false;
1866
1867	if (!isLexingRawMode() && !ParsingPreprocessorDirective &&
1868	!PP->isPreprocessedOutput())
1869	diagnoseInvalidUnicodeCodepointInIdentifier(
1870	Diags&: PP->getDiagnostics(), LangOpts, CodePoint,
1871	Range: makeCharRange(L&: *this, Begin: CharStart, End: UnicodePtr), /IsFirst=/false);
1872	// We got a unicode codepoint that is neither a space nor a
1873	// a valid identifier part. Carry on as if the codepoint was
1874	// valid for recovery purposes.
1875	} else if (!isLexingRawMode()) {
1876	if (IsExtension)
1877	diagnoseExtensionInIdentifier(
1878	Diags&: PP->getDiagnostics(), C: CodePoint,
1879	Range: makeCharRange(L&: *this, Begin: CharStart, End: UnicodePtr));
1880	maybeDiagnoseIDCharCompat(Diags&: PP->getDiagnostics(), C: CodePoint,
1881	Range: makeCharRange(L&: *this, Begin: CharStart, End: UnicodePtr),
1882	/IsFirst=/false);
1883	maybeDiagnoseUTF8Homoglyph(Diags&: PP->getDiagnostics(), C: CodePoint,
1884	Range: makeCharRange(L&: *this, Begin: CharStart, End: UnicodePtr));
1885	}
1886
1887	// Once we sucessfully parsed some UTF-8,
1888	// calling ConsumeChar ensures the NeedsCleaning flag is set on the token
1889	// being lexed, and that warnings about trailing spaces are emitted.
1890	ConsumeChar(Ptr: CurPtr, Size: FirstCodeUnitSize, Tok&: Result);
1891	CurPtr = UnicodePtr;
1892	return true;
1893	}
1894
1895	bool Lexer::LexUnicodeIdentifierStart(Token &Result, uint32_t C,
1896	const char *CurPtr) {
1897	bool IsExtension = false;
1898	if (isAllowedInitiallyIDChar(C, LangOpts, IsExtension)) {
1899	if (!isLexingRawMode() && !ParsingPreprocessorDirective &&
1900	!PP->isPreprocessedOutput()) {
1901	if (IsExtension)
1902	diagnoseExtensionInIdentifier(Diags&: PP->getDiagnostics(), C,
1903	Range: makeCharRange(L&: *this, Begin: BufferPtr, End: CurPtr));
1904	maybeDiagnoseIDCharCompat(Diags&: PP->getDiagnostics(), C,
1905	Range: makeCharRange(L&: *this, Begin: BufferPtr, End: CurPtr),
1906	/IsFirst=/true);
1907	maybeDiagnoseUTF8Homoglyph(Diags&: PP->getDiagnostics(), C,
1908	Range: makeCharRange(L&: *this, Begin: BufferPtr, End: CurPtr));
1909	}
1910
1911	MIOpt.ReadToken();
1912	return LexIdentifierContinue(Result, CurPtr);
1913	}
1914
1915	if (!isLexingRawMode() && !ParsingPreprocessorDirective &&
1916	!PP->isPreprocessedOutput() && !isASCII(c: *BufferPtr) &&
1917	!isUnicodeWhitespace(Codepoint: C)) {
1918	// Non-ASCII characters tend to creep into source code unintentionally.
1919	// Instead of letting the parser complain about the unknown token,
1920	// just drop the character.
1921	// Note that we can /only/ do this when the non-ASCII character is actually
1922	// spelled as Unicode, not written as a UCN. The standard requires that
1923	// we not throw away any possible preprocessor tokens, but there's a
1924	// loophole in the mapping of Unicode characters to basic character set
1925	// characters that allows us to map these particular characters to, say,
1926	// whitespace.
1927	diagnoseInvalidUnicodeCodepointInIdentifier(
1928	Diags&: PP->getDiagnostics(), LangOpts, CodePoint: C,
1929	Range: makeCharRange(L&: *this, Begin: BufferPtr, End: CurPtr), /IsStart/ IsFirst: true);
1930	BufferPtr = CurPtr;
1931	return false;
1932	}
1933
1934	// Otherwise, we have an explicit UCN or a character that's unlikely to show
1935	// up by accident.
1936	MIOpt.ReadToken();
1937	FormTokenWithChars(Result, TokEnd: CurPtr, Kind: tok::unknown);
1938	return true;
1939	}
1940
1941	static const char *
1942	fastParseASCIIIdentifier(const char *CurPtr,
1943	[[maybe_unused]] const char *BufferEnd) {
1944	#ifdef __SSE4_2__
1945	alignas(`16`) static constexpr char AsciiIdentifierRange[`16`] = {
1946	`'_'`, `'_'`, `'A'`, `'Z'`, `'a'`, `'z'`, `'0'`, `'9'`,
1947	};
1948	constexpr ssize_t BytesPerRegister = `16`;
1949
1950	__m128i AsciiIdentifierRangeV =
1951	_mm_load_si128((const __m128i *)AsciiIdentifierRange);
1952
1953	while (LLVM_LIKELY(BufferEnd - CurPtr >= BytesPerRegister)) {
1954	__m128i Cv = _mm_loadu_si128((const __m128i *)(CurPtr));
1955
1956	int Consumed = _mm_cmpistri(AsciiIdentifierRangeV, Cv,
1957	_SIDD_LEAST_SIGNIFICANT \| _SIDD_CMP_RANGES \|
1958	_SIDD_UBYTE_OPS \| _SIDD_NEGATIVE_POLARITY);
1959	CurPtr += Consumed;
1960	if (Consumed == BytesPerRegister)
1961	continue;
1962	return CurPtr;
1963	}
1964	#endif
1965
1966	unsigned char C = *CurPtr;
1967	while (isAsciiIdentifierContinue(c: C))
1968	C = *++CurPtr;
1969	return CurPtr;
1970	}
1971
1972	bool Lexer::LexIdentifierContinue(Token &Result, const char *CurPtr) {
1973	// Match [_A-Za-z0-9], we have already matched an identifier start.*
1974
1975	while (true) {
1976
1977	CurPtr = fastParseASCIIIdentifier(CurPtr, BufferEnd);
1978
1979	unsigned Size;
1980	// Slow path: handle trigraph, unicode codepoints, UCNs.
1981	unsigned char C = getCharAndSize(Ptr: CurPtr, Size);
1982	if (isAsciiIdentifierContinue(c: C)) {
1983	CurPtr = ConsumeChar(Ptr: CurPtr, Size, Tok&: Result);
1984	continue;
1985	}
1986	if (C == `'$'`) {
1987	// If we hit a $ and they are not supported in identifiers, we are done.
1988	if (!LangOpts.DollarIdents)
1989	break;
1990	// Otherwise, emit a diagnostic and continue.
1991	if (!isLexingRawMode())
1992	Diag(Loc: CurPtr, DiagID: diag::ext_dollar_in_identifier);
1993	CurPtr = ConsumeChar(Ptr: CurPtr, Size, Tok&: Result);
1994	continue;
1995	}
1996	if (C == `'\\'` && tryConsumeIdentifierUCN(CurPtr, Size, Result))
1997	continue;
1998	if (!isASCII(c: C) && tryConsumeIdentifierUTF8Char(CurPtr, Result))
1999	continue;
2000	// Neither an expected Unicode codepoint nor a UCN.
2001	break;
2002	}
2003
2004	const char *IdStart = BufferPtr;
2005	FormTokenWithChars(Result, TokEnd: CurPtr, Kind: tok::raw_identifier);
2006	Result.setRawIdentifierData(IdStart);
2007
2008	// If we are in raw mode, return this identifier raw. There is no need to
2009	// look up identifier information or attempt to macro expand it.
2010	if (LexingRawMode)
2011	return true;
2012
2013	// Fill in Result.IdentifierInfo and update the token kind,
2014	// looking up the identifier in the identifier table.
2015	const IdentifierInfo *II = PP->LookUpIdentifierInfo(Identifier&: Result);
2016	// Note that we have to call PP->LookUpIdentifierInfo() even for code
2017	// completion, it writes IdentifierInfo into Result, and callers rely on it.
2018
2019	// If the completion point is at the end of an identifier, we want to treat
2020	// the identifier as incomplete even if it resolves to a macro or a keyword.
2021	// This allows e.g. 'class^' to complete to 'classifier'.
2022	if (isCodeCompletionPoint(CurPtr)) {
2023	// Return the code-completion token.
2024	Result.setKind(tok::code_completion);
2025	// Skip the code-completion char and all immediate identifier characters.
2026	// This ensures we get consistent behavior when completing at any point in
2027	// an identifier (i.e. at the start, in the middle, at the end). Note that
2028	// only simple cases (i.e. [a-zA-Z0-9_]) are supported to keep the code
2029	// simpler.
2030	assert(*CurPtr == `0` && "Completion character must be 0");
2031	++CurPtr;
2032	// Note that code completion token is not added as a separate character
2033	// when the completion point is at the end of the buffer. Therefore, we need
2034	// to check if the buffer has ended.
2035	if (CurPtr < BufferEnd) {
2036	while (isAsciiIdentifierContinue(c: *CurPtr))
2037	++CurPtr;
2038	}
2039	BufferPtr = CurPtr;
2040	return true;
2041	}
2042
2043	// Finally, now that we know we have an identifier, pass this off to the
2044	// preprocessor, which may macro expand it or something.
2045	if (II->isHandleIdentifierCase() \|\| II->isModuleKeyword() \|\|
2046	II->isImportKeyword() \|\| II->getTokenID() == tok::kw_export)
2047	return PP->HandleIdentifier(Identifier&: Result);
2048
2049	return true;
2050	}
2051
2052	/// isHexaLiteral - Return true if Start points to a hex constant.
2053	/// in microsoft mode (where this is supposed to be several different tokens).
2054	bool Lexer::isHexaLiteral(const char Start, const* LangOptions &LangOpts) {
2055	auto CharAndSize1 = Lexer::getCharAndSizeNoWarn(Ptr: Start, LangOpts);
2056	char C1 = CharAndSize1.Char;
2057	if (C1 != `'0'`)
2058	return false;
2059
2060	auto CharAndSize2 =
2061	Lexer::getCharAndSizeNoWarn(Ptr: Start + CharAndSize1.Size, LangOpts);
2062	char C2 = CharAndSize2.Char;
2063	return (C2 == `'x'` \|\| C2 == `'X'`);
2064	}
2065
2066	/// LexNumericConstant - Lex the remainder of a integer or floating point
2067	/// constant. From[-1] is the first character lexed. Return the end of the
2068	/// constant.
2069	bool Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {
2070	unsigned Size;
2071	char C = getCharAndSize(Ptr: CurPtr, Size);
2072	char PrevCh = `0`;
2073	while (isPreprocessingNumberBody(c: C)) {
2074	CurPtr = ConsumeChar(Ptr: CurPtr, Size, Tok&: Result);
2075	PrevCh = C;
2076	if (LangOpts.HLSL && C == `'.'` && (CurPtr == `'x'` \|\| CurPtr == `'r'`)) {
2077	CurPtr -= Size;
2078	break;
2079	}
2080	C = getCharAndSize(Ptr: CurPtr, Size);
2081	}
2082
2083	// If we fell out, check for a sign, due to 1e+12. If we have one, continue.
2084	if ((C == `'-'` \|\| C == `'+'`) && (PrevCh == `'E'` \|\| PrevCh == `'e'`)) {
2085	// If we are in Microsoft mode, don't continue if the constant is hex.
2086	// For example, MSVC will accept the following as 3 tokens: 0x1234567e+1
2087	if (!LangOpts.MicrosoftExt \|\| !isHexaLiteral(Start: BufferPtr, LangOpts))
2088	return LexNumericConstant(Result, CurPtr: ConsumeChar(Ptr: CurPtr, Size, Tok&: Result));
2089	}
2090
2091	// If we have a hex FP constant, continue.
2092	if ((C == `'-'` \|\| C == `'+'`) && (PrevCh == `'P'` \|\| PrevCh == `'p'`)) {
2093	// Outside C99 and C++17, we accept hexadecimal floating point numbers as a
2094	// not-quite-conforming extension. Only do so if this looks like it's
2095	// actually meant to be a hexfloat, and not if it has a ud-suffix.
2096	bool IsHexFloat = true;
2097	if (!LangOpts.C99) {
2098	if (!isHexaLiteral(Start: BufferPtr, LangOpts))
2099	IsHexFloat = false;
2100	else if (!LangOpts.CPlusPlus17 &&
2101	std::find(first: BufferPtr, last: CurPtr, val: `'_'`) != CurPtr)
2102	IsHexFloat = false;
2103	}
2104	if (IsHexFloat)
2105	return LexNumericConstant(Result, CurPtr: ConsumeChar(Ptr: CurPtr, Size, Tok&: Result));
2106	}
2107
2108	// If we have a digit separator, continue.
2109	if (C == `'\''` && LangOpts.AllowLiteralDigitSeparator) {
2110	auto [Next, NextSize] = getCharAndSizeNoWarn(Ptr: CurPtr + Size, LangOpts);
2111	if (isAsciiIdentifierContinue(c: Next)) {
2112	if (!isLexingRawMode())
2113	Diag(Loc: CurPtr, DiagID: LangOpts.CPlusPlus
2114	? diag::warn_cxx11_compat_digit_separator
2115	: diag::warn_c23_compat_digit_separator);
2116	CurPtr = ConsumeChar(Ptr: CurPtr, Size, Tok&: Result);
2117	CurPtr = ConsumeChar(Ptr: CurPtr, Size: NextSize, Tok&: Result);
2118	return LexNumericConstant(Result, CurPtr);
2119	}
2120	}
2121
2122	// If we have a UCN or UTF-8 character (perhaps in a ud-suffix), continue.
2123	if (C == `'\\'` && tryConsumeIdentifierUCN(CurPtr, Size, Result))
2124	return LexNumericConstant(Result, CurPtr);
2125	if (!isASCII(c: C) && tryConsumeIdentifierUTF8Char(CurPtr, Result))
2126	return LexNumericConstant(Result, CurPtr);
2127
2128	// Update the location of token as well as BufferPtr.
2129	const char *TokStart = BufferPtr;
2130	FormTokenWithChars(Result, TokEnd: CurPtr, Kind: tok::numeric_constant);
2131	Result.setLiteralData(TokStart);
2132	return true;
2133	}
2134
2135	/// LexUDSuffix - Lex the ud-suffix production for user-defined literal suffixes
2136	/// in C++11, or warn on a ud-suffix in C++98.
2137	const char Lexer::LexUDSuffix(Token &Result, const* char *CurPtr,
2138	bool IsStringLiteral) {
2139	assert(LangOpts.CPlusPlus);
2140
2141	// Maximally munch an identifier.
2142	unsigned Size;
2143	char C = getCharAndSize(Ptr: CurPtr, Size);
2144	bool Consumed = false;
2145
2146	if (!isAsciiIdentifierStart(c: C)) {
2147	if (C == `'\\'` && tryConsumeIdentifierUCN(CurPtr, Size, Result))
2148	Consumed = true;
2149	else if (!isASCII(c: C) && tryConsumeIdentifierUTF8Char(CurPtr, Result))
2150	Consumed = true;
2151	else
2152	return CurPtr;
2153	}
2154
2155	if (!LangOpts.CPlusPlus11) {
2156	if (!isLexingRawMode())
2157	Diag(Loc: CurPtr,
2158	DiagID: C == `'_'` ? diag::warn_cxx11_compat_user_defined_literal
2159	: diag::warn_cxx11_compat_reserved_user_defined_literal)
2160	<< FixItHint::CreateInsertion(InsertionLoc: getSourceLocation(Loc: CurPtr), Code: " ");
2161	return CurPtr;
2162	}
2163
2164	// C++11 [lex.ext]p10, [usrlit.suffix]p1: A program containing a ud-suffix
2165	// that does not start with an underscore is ill-formed. As a conforming
2166	// extension, we treat all such suffixes as if they had whitespace before
2167	// them. We assume a suffix beginning with a UCN or UTF-8 character is more
2168	// likely to be a ud-suffix than a macro, however, and accept that.
2169	if (!Consumed) {
2170	bool IsUDSuffix = false;
2171	if (C == `'_'`)
2172	IsUDSuffix = true;
2173	else if (IsStringLiteral && LangOpts.CPlusPlus14) {
2174	// In C++1y, we need to look ahead a few characters to see if this is a
2175	// valid suffix for a string literal or a numeric literal (this could be
2176	// the 'operator""if' defining a numeric literal operator).
2177	const unsigned MaxStandardSuffixLength = `3`;
2178	char Buffer[MaxStandardSuffixLength] = { C };
2179	unsigned Consumed = Size;
2180	unsigned Chars = `1`;
2181	while (true) {
2182	auto [Next, NextSize] =
2183	getCharAndSizeNoWarn(Ptr: CurPtr + Consumed, LangOpts);
2184	if (!isAsciiIdentifierContinue(c: Next)) {
2185	// End of suffix. Check whether this is on the allowed list.
2186	const StringRef CompleteSuffix(Buffer, Chars);
2187	IsUDSuffix =
2188	StringLiteralParser::isValidUDSuffix(LangOpts, Suffix: CompleteSuffix);
2189	break;
2190	}
2191
2192	if (Chars == MaxStandardSuffixLength)
2193	// Too long: can't be a standard suffix.
2194	break;
2195
2196	Buffer[Chars++] = Next;
2197	Consumed += NextSize;
2198	}
2199	}
2200
2201	if (!IsUDSuffix) {
2202	if (!isLexingRawMode())
2203	Diag(Loc: CurPtr, DiagID: LangOpts.MSVCCompat
2204	? diag::ext_ms_reserved_user_defined_literal
2205	: diag::ext_reserved_user_defined_literal)
2206	<< FixItHint::CreateInsertion(InsertionLoc: getSourceLocation(Loc: CurPtr), Code: " ");
2207	return CurPtr;
2208	}
2209
2210	CurPtr = ConsumeChar(Ptr: CurPtr, Size, Tok&: Result);
2211	}
2212
2213	Result.setFlag(Token::HasUDSuffix);
2214	while (true) {
2215	C = getCharAndSize(Ptr: CurPtr, Size);
2216	if (isAsciiIdentifierContinue(c: C)) {
2217	CurPtr = ConsumeChar(Ptr: CurPtr, Size, Tok&: Result);
2218	} else if (C == `'\\'` && tryConsumeIdentifierUCN(CurPtr, Size, Result)) {
2219	} else if (!isASCII(c: C) && tryConsumeIdentifierUTF8Char(CurPtr, Result)) {
2220	} else
2221	break;
2222	}
2223
2224	return CurPtr;
2225	}
2226
2227	/// LexStringLiteral - Lex the remainder of a string literal, after having lexed
2228	/// either " or L" or u8" or u" or U".
2229	bool Lexer::LexStringLiteral(Token &Result, const char *CurPtr,
2230	tok::TokenKind Kind) {
2231	const char *AfterQuote = CurPtr;
2232	// Does this string contain the \0 character?
2233	const char NulCharacter = nullptr*;
2234
2235	if (!isLexingRawMode() &&
2236	(Kind == tok::utf8_string_literal \|\|
2237	Kind == tok::utf16_string_literal \|\|
2238	Kind == tok::utf32_string_literal))
2239	Diag(Loc: BufferPtr, DiagID: LangOpts.CPlusPlus ? diag::warn_cxx98_compat_unicode_literal
2240	: diag::warn_c99_compat_unicode_literal);
2241
2242	char C = getAndAdvanceChar(Ptr&: CurPtr, Tok&: Result);
2243	while (C != `'"'`) {
2244	// Skip escaped characters. Escaped newlines will already be processed by
2245	// getAndAdvanceChar.
2246	if (C == `'\\'`)
2247	C = getAndAdvanceChar(Ptr&: CurPtr, Tok&: Result);
2248
2249	if (C == `'\n'` \|\| C == `'\r'` \|\| // Newline.
2250	(C == `0` && CurPtr-`1` == BufferEnd)) { // End of file.
2251	if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
2252	Diag(Loc: BufferPtr, DiagID: diag::ext_unterminated_char_or_string) << `1`;
2253	FormTokenWithChars(Result, TokEnd: CurPtr-`1`, Kind: tok::unknown);
2254	return true;
2255	}
2256
2257	if (C == `0`) {
2258	if (isCodeCompletionPoint(CurPtr: CurPtr-`1`)) {
2259	if (ParsingFilename)
2260	codeCompleteIncludedFile(PathStart: AfterQuote, CompletionPoint: CurPtr - `1`, /IsAngled=/false);
2261	else
2262	PP->CodeCompleteNaturalLanguage();
2263	FormTokenWithChars(Result, TokEnd: CurPtr - `1`, Kind: tok::unknown);
2264	cutOffLexing();
2265	return true;
2266	}
2267
2268	NulCharacter = CurPtr-`1`;
2269	}
2270	C = getAndAdvanceChar(Ptr&: CurPtr, Tok&: Result);
2271	}
2272
2273	// If we are in C++11, lex the optional ud-suffix.
2274	if (LangOpts.CPlusPlus)
2275	CurPtr = LexUDSuffix(Result, CurPtr, IsStringLiteral: true);
2276
2277	// If a nul character existed in the string, warn about it.
2278	if (NulCharacter && !isLexingRawMode())
2279	Diag(Loc: NulCharacter, DiagID: diag::null_in_char_or_string) << `1`;
2280
2281	// Update the location of the token as well as the BufferPtr instance var.
2282	const char *TokStart = BufferPtr;
2283	FormTokenWithChars(Result, TokEnd: CurPtr, Kind);
2284	Result.setLiteralData(TokStart);
2285	return true;
2286	}
2287
2288	/// LexRawStringLiteral - Lex the remainder of a raw string literal, after
2289	/// having lexed R", LR", u8R", uR", or UR".
2290	bool Lexer::LexRawStringLiteral(Token &Result, const char *CurPtr,
2291	tok::TokenKind Kind) {
2292	// This function doesn't use getAndAdvanceChar because C++0x [lex.pptoken]p3:
2293	// Between the initial and final double quote characters of the raw string,
2294	// any transformations performed in phases 1 and 2 (trigraphs,
2295	// universal-character-names, and line splicing) are reverted.
2296
2297	if (!isLexingRawMode())
2298	Diag(Loc: BufferPtr, DiagID: diag::warn_cxx98_compat_raw_string_literal);
2299
2300	unsigned PrefixLen = `0`;
2301
2302	while (PrefixLen != `16` && isRawStringDelimBody(c: CurPtr[PrefixLen])) {
2303	if (!isLexingRawMode() &&
2304	llvm::is_contained(Set: {`'$'`, `'@'`, '`'}, Element: CurPtr[PrefixLen])) {
2305	const char *Pos = &CurPtr[PrefixLen];
2306	Diag(Loc: Pos, DiagID: LangOpts.CPlusPlus26
2307	? diag::warn_cxx26_compat_raw_string_literal_character_set
2308	: diag::ext_cxx26_raw_string_literal_character_set)
2309	<< StringRef(Pos, `1`);
2310	}
2311	++PrefixLen;
2312	}
2313
2314	// If the last character was not a '(', then we didn't lex a valid delimiter.
2315	if (CurPtr[PrefixLen] != `'('`) {
2316	if (!isLexingRawMode()) {
2317	const char *PrefixEnd = &CurPtr[PrefixLen];
2318	if (PrefixLen == `16`) {
2319	Diag(Loc: PrefixEnd, DiagID: diag::err_raw_delim_too_long);
2320	} else if (*PrefixEnd == `'\n'`) {
2321	Diag(Loc: PrefixEnd, DiagID: diag::err_invalid_newline_raw_delim);
2322	} else {
2323	Diag(Loc: PrefixEnd, DiagID: diag::err_invalid_char_raw_delim)
2324	<< StringRef(PrefixEnd, `1`);
2325	}
2326	}
2327
2328	// Search for the next '"' in hopes of salvaging the lexer. Unfortunately,
2329	// it's possible the '"' was intended to be part of the raw string, but
2330	// there's not much we can do about that.
2331	while (true) {
2332	char C = *CurPtr++;
2333
2334	if (C == `'"'`)
2335	break;
2336	if (C == `0` && CurPtr-`1` == BufferEnd) {
2337	--CurPtr;
2338	break;
2339	}
2340	}
2341
2342	FormTokenWithChars(Result, TokEnd: CurPtr, Kind: tok::unknown);
2343	return true;
2344	}
2345
2346	// Save prefix and move CurPtr past it
2347	const char *Prefix = CurPtr;
2348	CurPtr += PrefixLen + `1`; // skip over prefix and '('
2349
2350	while (true) {
2351	char C = *CurPtr++;
2352
2353	if (C == `')'`) {
2354	// Check for prefix match and closing quote.
2355	if (strncmp(s1: CurPtr, s2: Prefix, n: PrefixLen) == `0` && CurPtr[PrefixLen] == `'"'`) {
2356	CurPtr += PrefixLen + `1`; // skip over prefix and '"'
2357	break;
2358	}
2359	} else if (C == `0` && CurPtr-`1` == BufferEnd) { // End of file.
2360	if (!isLexingRawMode())
2361	Diag(Loc: BufferPtr, DiagID: diag::err_unterminated_raw_string)
2362	<< StringRef(Prefix, PrefixLen);
2363	FormTokenWithChars(Result, TokEnd: CurPtr-`1`, Kind: tok::unknown);
2364	return true;
2365	}
2366	}
2367
2368	// If we are in C++11, lex the optional ud-suffix.
2369	if (LangOpts.CPlusPlus)
2370	CurPtr = LexUDSuffix(Result, CurPtr, IsStringLiteral: true);
2371
2372	// Update the location of token as well as BufferPtr.
2373	const char *TokStart = BufferPtr;
2374	FormTokenWithChars(Result, TokEnd: CurPtr, Kind);
2375	Result.setLiteralData(TokStart);
2376	return true;
2377	}
2378
2379	/// LexAngledStringLiteral - Lex the remainder of an angled string literal,
2380	/// after having lexed the '<' character. This is used for #include filenames.
2381	bool Lexer::LexAngledStringLiteral(Token &Result, const char *CurPtr) {
2382	// Does this string contain the \0 character?
2383	const char NulCharacter = nullptr*;
2384	const char *AfterLessPos = CurPtr;
2385	char C = getAndAdvanceChar(Ptr&: CurPtr, Tok&: Result);
2386	while (C != `'>'`) {
2387	// Skip escaped characters. Escaped newlines will already be processed by
2388	// getAndAdvanceChar.
2389	if (C == `'\\'`)
2390	C = getAndAdvanceChar(Ptr&: CurPtr, Tok&: Result);
2391
2392	if (isVerticalWhitespace(c: C) \|\| // Newline.
2393	(C == `0` && (CurPtr - `1` == BufferEnd))) { // End of file.
2394	// If the filename is unterminated, then it must just be a lone <
2395	// character. Return this as such.
2396	FormTokenWithChars(Result, TokEnd: AfterLessPos, Kind: tok::less);
2397	return true;
2398	}
2399
2400	if (C == `0`) {
2401	if (isCodeCompletionPoint(CurPtr: CurPtr - `1`)) {
2402	codeCompleteIncludedFile(PathStart: AfterLessPos, CompletionPoint: CurPtr - `1`, /IsAngled=/true);
2403	cutOffLexing();
2404	FormTokenWithChars(Result, TokEnd: CurPtr - `1`, Kind: tok::unknown);
2405	return true;
2406	}
2407	NulCharacter = CurPtr-`1`;
2408	}
2409	C = getAndAdvanceChar(Ptr&: CurPtr, Tok&: Result);
2410	}
2411
2412	// If a nul character existed in the string, warn about it.
2413	if (NulCharacter && !isLexingRawMode())
2414	Diag(Loc: NulCharacter, DiagID: diag::null_in_char_or_string) << `1`;
2415
2416	// Update the location of token as well as BufferPtr.
2417	const char *TokStart = BufferPtr;
2418	FormTokenWithChars(Result, TokEnd: CurPtr, Kind: tok::header_name);
2419	Result.setLiteralData(TokStart);
2420	return true;
2421	}
2422
2423	void Lexer::codeCompleteIncludedFile(const char *PathStart,
2424	const char *CompletionPoint,
2425	bool IsAngled) {
2426	// Completion only applies to the filename, after the last slash.
2427	StringRef PartialPath(PathStart, CompletionPoint - PathStart);
2428	llvm::StringRef SlashChars = LangOpts.MSVCCompat ? "/\\" : "/";
2429	auto Slash = PartialPath.find_last_of(Chars: SlashChars);
2430	StringRef Dir =
2431	(Slash == StringRef::npos) ? "" : PartialPath.take_front(N: Slash);
2432	const char *StartOfFilename =
2433	(Slash == StringRef::npos) ? PathStart : PathStart + Slash + `1`;
2434	// Code completion filter range is the filename only, up to completion point.
2435	PP->setCodeCompletionIdentifierInfo(&PP->getIdentifierTable().get(
2436	Name: StringRef(StartOfFilename, CompletionPoint - StartOfFilename)));
2437	// We should replace the characters up to the closing quote or closest slash,
2438	// if any.
2439	while (CompletionPoint < BufferEnd) {
2440	char Next = *(CompletionPoint + `1`);
2441	if (Next == `0` \|\| Next == `'\r'` \|\| Next == `'\n'`)
2442	break;
2443	++CompletionPoint;
2444	if (Next == (IsAngled ? `'>'` : `'"'`))
2445	break;
2446	if (SlashChars.contains(C: Next))
2447	break;
2448	}
2449
2450	PP->setCodeCompletionTokenRange(
2451	Start: FileLoc.getLocWithOffset(Offset: StartOfFilename - BufferStart),
2452	End: FileLoc.getLocWithOffset(Offset: CompletionPoint - BufferStart));
2453	PP->CodeCompleteIncludedFile(Dir, IsAngled);
2454	}
2455
2456	/// LexCharConstant - Lex the remainder of a character constant, after having
2457	/// lexed either ' or L' or u8' or u' or U'.
2458	bool Lexer::LexCharConstant(Token &Result, const char *CurPtr,
2459	tok::TokenKind Kind) {
2460	// Does this character contain the \0 character?
2461	const char NulCharacter = nullptr*;
2462
2463	if (!isLexingRawMode()) {
2464	if (Kind == tok::utf16_char_constant \|\| Kind == tok::utf32_char_constant)
2465	Diag(Loc: BufferPtr, DiagID: LangOpts.CPlusPlus
2466	? diag::warn_cxx98_compat_unicode_literal
2467	: diag::warn_c99_compat_unicode_literal);
2468	else if (Kind == tok::utf8_char_constant)
2469	Diag(Loc: BufferPtr, DiagID: LangOpts.CPlusPlus
2470	? diag::warn_cxx14_compat_u8_character_literal
2471	: diag::warn_c17_compat_u8_character_literal);
2472	}
2473
2474	char C = getAndAdvanceChar(Ptr&: CurPtr, Tok&: Result);
2475	if (C == `'\''`) {
2476	if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
2477	Diag(Loc: BufferPtr, DiagID: diag::ext_empty_character);
2478	FormTokenWithChars(Result, TokEnd: CurPtr, Kind: tok::unknown);
2479	return true;
2480	}
2481
2482	while (C != `'\''`) {
2483	// Skip escaped characters.
2484	if (C == `'\\'`)
2485	C = getAndAdvanceChar(Ptr&: CurPtr, Tok&: Result);
2486
2487	if (C == `'\n'` \|\| C == `'\r'` \|\| // Newline.
2488	(C == `0` && CurPtr-`1` == BufferEnd)) { // End of file.
2489	if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
2490	Diag(Loc: BufferPtr, DiagID: diag::ext_unterminated_char_or_string) << `0`;
2491	FormTokenWithChars(Result, TokEnd: CurPtr-`1`, Kind: tok::unknown);
2492	return true;
2493	}
2494
2495	if (C == `0`) {
2496	if (isCodeCompletionPoint(CurPtr: CurPtr-`1`)) {
2497	PP->CodeCompleteNaturalLanguage();
2498	FormTokenWithChars(Result, TokEnd: CurPtr-`1`, Kind: tok::unknown);
2499	cutOffLexing();
2500	return true;
2501	}
2502
2503	NulCharacter = CurPtr-`1`;
2504	}
2505	C = getAndAdvanceChar(Ptr&: CurPtr, Tok&: Result);
2506	}
2507
2508	// If we are in C++11, lex the optional ud-suffix.
2509	if (LangOpts.CPlusPlus)
2510	CurPtr = LexUDSuffix(Result, CurPtr, IsStringLiteral: false);
2511
2512	// If a nul character existed in the character, warn about it.
2513	if (NulCharacter && !isLexingRawMode())
2514	Diag(Loc: NulCharacter, DiagID: diag::null_in_char_or_string) << `0`;
2515
2516	// Update the location of token as well as BufferPtr.
2517	const char *TokStart = BufferPtr;
2518	FormTokenWithChars(Result, TokEnd: CurPtr, Kind);
2519	Result.setLiteralData(TokStart);
2520	return true;
2521	}
2522
2523	/// SkipWhitespace - Efficiently skip over a series of whitespace characters.
2524	/// Update BufferPtr to point to the next non-whitespace character and return.
2525	///
2526	/// This method forms a token and returns true if KeepWhitespaceMode is enabled.
2527	bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr) {
2528	// Whitespace - Skip it, then return the token after the whitespace.
2529	bool SawNewline = isVerticalWhitespace(c: CurPtr[-`1`]);
2530
2531	unsigned char Char = *CurPtr;
2532
2533	const char lastNewLine = nullptr*;
2534	auto setLastNewLine = [&](const char *Ptr) {
2535	lastNewLine = Ptr;
2536	if (!NewLinePtr)
2537	NewLinePtr = Ptr;
2538	};
2539	if (SawNewline)
2540	setLastNewLine (CurPtr - `1`);
2541
2542	// Skip consecutive spaces efficiently.
2543	while (true) {
2544	// Skip horizontal whitespace, especially space, very aggressively.
2545	while (Char == `' '` \|\| isHorizontalWhitespace(c: Char))
2546	Char = *++CurPtr;
2547
2548	// Otherwise if we have something other than whitespace, we're done.
2549	if (!isVerticalWhitespace(c: Char))
2550	break;
2551
2552	if (ParsingPreprocessorDirective) {
2553	// End of preprocessor directive line, let LexTokenInternal handle this.
2554	BufferPtr = CurPtr;
2555	return false;
2556	}
2557
2558	// OK, but handle newline.
2559	if (*CurPtr == `'\n'`)
2560	setLastNewLine (CurPtr);
2561	SawNewline = true;
2562	Char = *++CurPtr;
2563	}
2564
2565	// If the client wants us to return whitespace, return it now.
2566	if (isKeepWhitespaceMode()) {
2567	FormTokenWithChars(Result, TokEnd: CurPtr, Kind: tok::unknown);
2568	if (SawNewline) {
2569	IsAtStartOfLine = true;
2570	IsAtPhysicalStartOfLine = true;
2571	}
2572	// FIXME: The next token will not have LeadingSpace set.
2573	return true;
2574	}
2575
2576	// If this isn't immediately after a newline, there is leading space.
2577	char PrevChar = CurPtr[-`1`];
2578	bool HasLeadingSpace = !isVerticalWhitespace(c: PrevChar);
2579
2580	Result.setFlagValue(Flag: Token::LeadingSpace, Val: HasLeadingSpace);
2581	if (SawNewline) {
2582	Result.setFlag(Token::StartOfLine);
2583	Result.setFlag(Token::PhysicalStartOfLine);
2584
2585	if (NewLinePtr && lastNewLine && NewLinePtr != lastNewLine && PP) {
2586	if (auto *Handler = PP->getEmptylineHandler())
2587	Handler->HandleEmptyline(Range: SourceRange (getSourceLocation(Loc: NewLinePtr + `1`),
2588	getSourceLocation(Loc: lastNewLine)));
2589	}
2590	}
2591
2592	BufferPtr = CurPtr;
2593	return false;
2594	}
2595
2596	/// We have just read the // characters from input. Skip until we find the
2597	/// newline character that terminates the comment. Then update BufferPtr and
2598	/// return.
2599	///
2600	/// If we're in KeepCommentMode or any CommentHandler has inserted
2601	/// some tokens, this will store the first token and return true.
2602	bool Lexer::SkipLineComment(Token &Result, const char *CurPtr) {
2603	// If Line comments aren't explicitly enabled for this language, emit an
2604	// extension warning.
2605	if (!LineComment) {
2606	if (!isLexingRawMode()) // There's no PP in raw mode, so can't emit diags.
2607	Diag(Loc: BufferPtr, DiagID: diag::ext_line_comment);
2608
2609	// Mark them enabled so we only emit one warning for this translation
2610	// unit.
2611	LineComment = true;
2612	}
2613
2614	// Scan over the body of the comment. The common case, when scanning, is that
2615	// the comment contains normal ascii characters with nothing interesting in
2616	// them. As such, optimize for this case with the inner loop.
2617	//
2618	// This loop terminates with CurPtr pointing at the newline (or end of buffer)
2619	// character that ends the line comment.
2620
2621	// C++23 [lex.phases] p1
2622	// Diagnose invalid UTF-8 if the corresponding warning is enabled, emitting a
2623	// diagnostic only once per entire ill-formed subsequence to avoid
2624	// emiting to many diagnostics (see http://unicode.org/review/pr-121.html).
2625	bool UnicodeDecodingAlreadyDiagnosed = false;
2626
2627	char C;
2628	while (true) {
2629	C = *CurPtr;
2630	// Skip over characters in the fast loop.
2631	while (isASCII(c: C) && C != `0` && // Potentially EOF.
2632	C != `'\n'` && C != `'\r'`) { // Newline or DOS-style newline.
2633	C = *++CurPtr;
2634	UnicodeDecodingAlreadyDiagnosed = false;
2635	}
2636
2637	if (!isASCII(c: C)) {
2638	unsigned Length = llvm::getUTF8SequenceSize(
2639	source: (const llvm::UTF8 )CurPtr, sourceEnd: (const* llvm::UTF8 *)BufferEnd);
2640	if (Length == `0`) {
2641	if (!UnicodeDecodingAlreadyDiagnosed && !isLexingRawMode())
2642	Diag(Loc: CurPtr, DiagID: diag::warn_invalid_utf8_in_comment);
2643	UnicodeDecodingAlreadyDiagnosed = true;
2644	++CurPtr;
2645	} else {
2646	UnicodeDecodingAlreadyDiagnosed = false;
2647	CurPtr += Length;
2648	}
2649	continue;
2650	}
2651
2652	const char *NextLine = CurPtr;
2653	if (C != `0`) {
2654	// We found a newline, see if it's escaped.
2655	const char *EscapePtr = CurPtr-`1`;
2656	bool HasSpace = false;
2657	while (isHorizontalWhitespace(c: EscapePtr)) { // Skip whitespace.*
2658	--EscapePtr;
2659	HasSpace = true;
2660	}
2661
2662	if (*EscapePtr == `'\\'`)
2663	// Escaped newline.
2664	CurPtr = EscapePtr;
2665	else if (EscapePtr[`0`] == `'/'` && EscapePtr[-`1`] == `'?'` &&
2666	EscapePtr[-`2`] == `'?'` && LangOpts.Trigraphs)
2667	// Trigraph-escaped newline.
2668	CurPtr = EscapePtr-`2`;
2669	else
2670	break; // This is a newline, we're done.
2671
2672	// If there was space between the backslash and newline, warn about it.
2673	if (HasSpace && !isLexingRawMode())
2674	Diag(Loc: EscapePtr, DiagID: diag::backslash_newline_space);
2675	}
2676
2677	// Otherwise, this is a hard case. Fall back on getAndAdvanceChar to
2678	// properly decode the character. Read it in raw mode to avoid emitting
2679	// diagnostics about things like trigraphs. If we see an escaped newline,
2680	// we'll handle it below.
2681	const char *OldPtr = CurPtr;
2682	bool OldRawMode = isLexingRawMode();
2683	LexingRawMode = true;
2684	C = getAndAdvanceChar(Ptr&: CurPtr, Tok&: Result);
2685	LexingRawMode = OldRawMode;
2686
2687	// If we only read only one character, then no special handling is needed.
2688	// We're done and can skip forward to the newline.
2689	if (C != `0` && CurPtr == OldPtr+`1`) {
2690	CurPtr = NextLine;
2691	break;
2692	}
2693
2694	// If we read multiple characters, and one of those characters was a \r or
2695	// \n, then we had an escaped newline within the comment. Emit diagnostic
2696	// unless the next line is also a // comment.
2697	if (CurPtr != OldPtr + `1` && C != `'/'` &&
2698	(CurPtr == BufferEnd + `1` \|\| CurPtr[`0`] != `'/'`)) {
2699	for (; OldPtr != CurPtr; ++OldPtr)
2700	if (OldPtr[`0`] == `'\n'` \|\| OldPtr[`0`] == `'\r'`) {
2701	// Okay, we found a // comment that ends in a newline, if the next
2702	// line is also a // comment, but has spaces, don't emit a diagnostic.
2703	if (isWhitespace(c: C)) {
2704	const char *ForwardPtr = CurPtr;
2705	while (isWhitespace(c: ForwardPtr)) // Skip whitespace.*
2706	++ForwardPtr;
2707	if (ForwardPtr[`0`] == `'/'` && ForwardPtr[`1`] == `'/'`)
2708	break;
2709	}
2710
2711	if (!isLexingRawMode())
2712	Diag(Loc: OldPtr-`1`, DiagID: diag::ext_multi_line_line_comment);
2713	break;
2714	}
2715	}
2716
2717	if (C == `'\r'` \|\| C == `'\n'` \|\| CurPtr == BufferEnd + `1`) {
2718	--CurPtr;
2719	break;
2720	}
2721
2722	if (C == `'\0'` && isCodeCompletionPoint(CurPtr: CurPtr-`1`)) {
2723	PP->CodeCompleteNaturalLanguage();
2724	cutOffLexing();
2725	return false;
2726	}
2727	}
2728
2729	// Found but did not consume the newline. Notify comment handlers about the
2730	// comment unless we're in a #if 0 block.
2731	if (PP && !isLexingRawMode() &&
2732	PP->HandleComment(result&: Result, Comment: SourceRange (getSourceLocation(Loc: BufferPtr),
2733	getSourceLocation(Loc: CurPtr)))) {
2734	BufferPtr = CurPtr;
2735	return true; // A token has to be returned.
2736	}
2737
2738	// If we are returning comments as tokens, return this comment as a token.
2739	if (inKeepCommentMode())
2740	return SaveLineComment(Result, CurPtr);
2741
2742	// If we are inside a preprocessor directive and we see the end of line,
2743	// return immediately, so that the lexer can return this as an EOD token.
2744	if (ParsingPreprocessorDirective \|\| CurPtr == BufferEnd) {
2745	BufferPtr = CurPtr;
2746	return false;
2747	}
2748
2749	// Otherwise, eat the \n character. We don't care if this is a \n\r or
2750	// \r\n sequence. This is an efficiency hack (because we know the \n can't
2751	// contribute to another token), it isn't needed for correctness. Note that
2752	// this is ok even in KeepWhitespaceMode, because we would have returned the
2753	// comment above in that mode.
2754	NewLinePtr = CurPtr++;
2755
2756	// The next returned token is at the start of the line.
2757	Result.setFlag(Token::StartOfLine);
2758	Result.setFlag(Token::PhysicalStartOfLine);
2759	// No leading whitespace seen so far.
2760	Result.clearFlag(Flag: Token::LeadingSpace);
2761	BufferPtr = CurPtr;
2762	return false;
2763	}
2764
2765	/// If in save-comment mode, package up this Line comment in an appropriate
2766	/// way and return it.
2767	bool Lexer::SaveLineComment(Token &Result, const char *CurPtr) {
2768	// If we're not in a preprocessor directive, just return the // comment
2769	// directly.
2770	FormTokenWithChars(Result, TokEnd: CurPtr, Kind: tok::comment);
2771
2772	if (!ParsingPreprocessorDirective \|\| LexingRawMode)
2773	return true;
2774
2775	// If this Line-style comment is in a macro definition, transmogrify it into
2776	// a C-style block comment.
2777	bool Invalid = false;
2778	std::string Spelling = PP->getSpelling(Tok: Result, Invalid: &Invalid);
2779	if (Invalid)
2780	return true;
2781
2782	assert(Spelling[`0`] == `'/'` && Spelling[`1`] == `'/'` && "Not line comment?");
2783	Spelling [`1`] = `''`; // Change prefix to "/".
2784	Spelling += "/"; // add suffix.*
2785
2786	Result.setKind(tok::comment);
2787	PP->CreateString(Str: Spelling, Tok&: Result,
2788	ExpansionLocStart: Result.getLocation(), ExpansionLocEnd: Result.getLocation());
2789	return true;
2790	}
2791
2792	/// isBlockCommentEndOfEscapedNewLine - Return true if the specified newline
2793	/// character (either \\n or \\r) is part of an escaped newline sequence. Issue
2794	/// a diagnostic if so. We know that the newline is inside of a block comment.
2795	static bool isEndOfBlockCommentWithEscapedNewLine(const char CurPtr, Lexer L,
2796	bool Trigraphs) {
2797	assert(CurPtr[`0`] == `'\n'` \|\| CurPtr[`0`] == `'\r'`);
2798
2799	// Position of the first trigraph in the ending sequence.
2800	const char TrigraphPos = nullptr*;
2801	// Position of the first whitespace after a '\' in the ending sequence.
2802	const char SpacePos = nullptr*;
2803
2804	while (true) {
2805	// Back up off the newline.
2806	--CurPtr;
2807
2808	// If this is a two-character newline sequence, skip the other character.
2809	if (CurPtr[`0`] == `'\n'` \|\| CurPtr[`0`] == `'\r'`) {
2810	// \n\n or \r\r -> not escaped newline.
2811	if (CurPtr[`0`] == CurPtr[`1`])
2812	return false;
2813	// \n\r or \r\n -> skip the newline.
2814	--CurPtr;
2815	}
2816
2817	// If we have horizontal whitespace, skip over it. We allow whitespace
2818	// between the slash and newline.
2819	while (isHorizontalWhitespace(c: CurPtr) \|\| CurPtr == `0`) {
2820	SpacePos = CurPtr;
2821	--CurPtr;
2822	}
2823
2824	// If we have a slash, this is an escaped newline.
2825	if (*CurPtr == `'\\'`) {
2826	--CurPtr;
2827	} else if (CurPtr[`0`] == `'/'` && CurPtr[-`1`] == `'?'` && CurPtr[-`2`] == `'?'`) {
2828	// This is a trigraph encoding of a slash.
2829	TrigraphPos = CurPtr - `2`;
2830	CurPtr -= `3`;
2831	} else {
2832	return false;
2833	}
2834
2835	// If the character preceding the escaped newline is a '', then after line*
2836	// splicing we have a '/' ending the comment.*
2837	if (CurPtr == `''`)
2838	break;
2839
2840	if (CurPtr != `'\n'` && CurPtr != `'\r'`)
2841	return false;
2842	}
2843
2844	if (TrigraphPos) {
2845	// If no trigraphs are enabled, warn that we ignored this trigraph and
2846	// ignore this character.*
2847	if (!Trigraphs) {
2848	if (!L->isLexingRawMode())
2849	L->Diag(Loc: TrigraphPos, DiagID: diag::trigraph_ignored_block_comment);
2850	return false;
2851	}
2852	if (!L->isLexingRawMode())
2853	L->Diag(Loc: TrigraphPos, DiagID: diag::trigraph_ends_block_comment);
2854	}
2855
2856	// Warn about having an escaped newline between the / characters.*
2857	if (!L->isLexingRawMode())
2858	L->Diag(Loc: CurPtr + `1`, DiagID: diag::escaped_newline_block_comment_end);
2859
2860	// If there was space between the backslash and newline, warn about it.
2861	if (SpacePos && !L->isLexingRawMode())
2862	L->Diag(Loc: SpacePos, DiagID: diag::backslash_newline_space);
2863
2864	return true;
2865	}
2866
2867	#ifdef __SSE2__
2868	#include <emmintrin.h>
2869	#elif __ALTIVEC__
2870	#include <altivec.h>
2871	#undef bool
2872	#endif
2873
2874	/// We have just read from input the / and characters that started a comment.*
2875	/// Read until we find the and / characters that terminate the comment.*
2876	/// Note that we don't bother decoding trigraphs or escaped newlines in block
2877	/// comments, because they cannot cause the comment to end. The only thing
2878	/// that can happen is the comment could end with an escaped newline between
2879	/// the terminating and /.*
2880	///
2881	/// If we're in KeepCommentMode or any CommentHandler has inserted
2882	/// some tokens, this will store the first token and return true.
2883	bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr) {
2884	// Scan one character past where we should, looking for a '/' character. Once
2885	// we find it, check to see if it was preceded by a . This common*
2886	// optimization helps people who like to put a lot of characters in their*
2887	// comments.
2888
2889	// The first character we get with newlines and trigraphs skipped to handle
2890	// the degenerate // case below correctly if the * has an escaped newline*
2891	// after it.
2892	unsigned CharSize;
2893	unsigned char C = getCharAndSize(Ptr: CurPtr, Size&: CharSize);
2894	CurPtr += CharSize;
2895	if (C == `0` && CurPtr == BufferEnd+`1`) {
2896	if (!isLexingRawMode())
2897	Diag(Loc: BufferPtr, DiagID: diag::err_unterminated_block_comment);
2898	--CurPtr;
2899
2900	// KeepWhitespaceMode should return this broken comment as a token. Since
2901	// it isn't a well formed comment, just return it as an 'unknown' token.
2902	if (isKeepWhitespaceMode()) {
2903	FormTokenWithChars(Result, TokEnd: CurPtr, Kind: tok::unknown);
2904	return true;
2905	}
2906
2907	BufferPtr = CurPtr;
2908	return false;
2909	}
2910
2911	// Check to see if the first character after the '/' is another /. If so,*
2912	// then this slash does not end the block comment, it is part of it.
2913	if (C == `'/'`)
2914	C = *CurPtr++;
2915
2916	// C++23 [lex.phases] p1
2917	// Diagnose invalid UTF-8 if the corresponding warning is enabled, emitting a
2918	// diagnostic only once per entire ill-formed subsequence to avoid
2919	// emiting to many diagnostics (see http://unicode.org/review/pr-121.html).
2920	bool UnicodeDecodingAlreadyDiagnosed = false;
2921
2922	while (true) {
2923	// Skip over all non-interesting characters until we find end of buffer or a
2924	// (probably ending) '/' character.
2925	if (CurPtr + `24` < BufferEnd &&
2926	// If there is a code-completion point avoid the fast scan because it
2927	// doesn't check for '\0'.
2928	!(PP && PP->getCodeCompletionFileLoc() == FileLoc)) {
2929	// While not aligned to a 16-byte boundary.
2930	while (C != `'/'` && (intptr_t)CurPtr % `16` != `0`) {
2931	if (!isASCII(c: C))
2932	goto MultiByteUTF8;
2933	C = *CurPtr++;
2934	}
2935	if (C == `'/'`) goto FoundSlash;
2936
2937	#ifdef __SSE2__
2938	__m128i Slashes = _mm_set1_epi8(b: `'/'`);
2939	while (CurPtr + `16` < BufferEnd) {
2940	int Mask = _mm_movemask_epi8(a: (const* __m128i *)CurPtr);
2941	if (LLVM_UNLIKELY(Mask != `0`)) {
2942	goto MultiByteUTF8;
2943	}
2944	// look for slashes
2945	int cmp = _mm_movemask_epi8(a: _mm_cmpeq_epi8(a: (const* __m128i*)CurPtr,
2946	b: Slashes));
2947	if (cmp != `0`) {
2948	// Adjust the pointer to point directly after the first slash. It's
2949	// not necessary to set C here, it will be overwritten at the end of
2950	// the outer loop.
2951	CurPtr += llvm::countr_zero<unsigned>(Val: cmp) + `1`;
2952	goto FoundSlash;
2953	}
2954	CurPtr += `16`;
2955	}
2956	#elif __ALTIVEC__
2957	__vector unsigned char LongUTF = {`0x80`, `0x80`, `0x80`, `0x80`, `0x80`, `0x80`,
2958	`0x80`, `0x80`, `0x80`, `0x80`, `0x80`, `0x80`,
2959	`0x80`, `0x80`, `0x80`, `0x80`};
2960	__vector unsigned char Slashes = {
2961	`'/'`, `'/'`, `'/'`, `'/'`, `'/'`, `'/'`, `'/'`, `'/'`,
2962	`'/'`, `'/'`, `'/'`, `'/'`, `'/'`, `'/'`, `'/'`, `'/'`
2963	};
2964	while (CurPtr + `16` < BufferEnd) {
2965	if (LLVM_UNLIKELY(
2966	vec_any_ge((const* __vector unsigned char *)CurPtr, LongUTF)))
2967	goto MultiByteUTF8;
2968	if (vec_any_eq((const* __vector unsigned char *)CurPtr, Slashes)) {
2969	break;
2970	}
2971	CurPtr += `16`;
2972	}
2973
2974	#else
2975	while (CurPtr + `16` < BufferEnd) {
2976	bool HasNonASCII = false;
2977	for (unsigned I = `0`; I < `16`; ++I)
2978	HasNonASCII \|= !isASCII(CurPtr[I]);
2979
2980	if (LLVM_UNLIKELY(HasNonASCII))
2981	goto MultiByteUTF8;
2982
2983	bool HasSlash = false;
2984	for (unsigned I = `0`; I < `16`; ++I)
2985	HasSlash \|= CurPtr[I] == `'/'`;
2986	if (HasSlash)
2987	break;
2988	CurPtr += `16`;
2989	}
2990	#endif
2991
2992	// It has to be one of the bytes scanned, increment to it and read one.
2993	C = *CurPtr++;
2994	}
2995
2996	// Loop to scan the remainder, warning on invalid UTF-8
2997	// if the corresponding warning is enabled, emitting a diagnostic only once
2998	// per sequence that cannot be decoded.
2999	while (C != `'/'` && C != `'\0'`) {
3000	if (isASCII(c: C)) {
3001	UnicodeDecodingAlreadyDiagnosed = false;
3002	C = *CurPtr++;
3003	continue;
3004	}
3005	MultiByteUTF8:
3006	// CurPtr is 1 code unit past C, so to decode
3007	// the codepoint, we need to read from the previous position.
3008	unsigned Length = llvm::getUTF8SequenceSize(
3009	source: (const llvm::UTF8 )CurPtr - `1`, sourceEnd: (const* llvm::UTF8 *)BufferEnd);
3010	if (Length == `0`) {
3011	if (!UnicodeDecodingAlreadyDiagnosed && !isLexingRawMode())
3012	Diag(Loc: CurPtr - `1`, DiagID: diag::warn_invalid_utf8_in_comment);
3013	UnicodeDecodingAlreadyDiagnosed = true;
3014	} else {
3015	UnicodeDecodingAlreadyDiagnosed = false;
3016	CurPtr += Length - `1`;
3017	}
3018	C = *CurPtr++;
3019	}
3020
3021	if (C == `'/'`) {
3022	FoundSlash:
3023	if (CurPtr[-`2`] == `''`) // We found the final /. We're done!
3024	break;
3025
3026	if ((CurPtr[-`2`] == `'\n'` \|\| CurPtr[-`2`] == `'\r'`)) {
3027	if (isEndOfBlockCommentWithEscapedNewLine(CurPtr: CurPtr - `2`, L: this,
3028	Trigraphs: LangOpts.Trigraphs)) {
3029	// We found the final /, though it had an escaped newline between the*
3030	// and /. We're done!*
3031	break;
3032	}
3033	}
3034	if (CurPtr[`0`] == `'*'` && CurPtr[`1`] != `'/'`) {
3035	// If this is a / inside of the comment, emit a warning. Don't do this*
3036	// if this is a //, which will end the comment. This misses cases with*
3037	// embedded escaped newlines, but oh well.
3038	if (!isLexingRawMode())
3039	Diag(Loc: CurPtr-`1`, DiagID: diag::warn_nested_block_comment);
3040	}
3041	} else if (C == `0` && CurPtr == BufferEnd+`1`) {
3042	if (!isLexingRawMode())
3043	Diag(Loc: BufferPtr, DiagID: diag::err_unterminated_block_comment);
3044	// Note: the user probably forgot a /. We could continue immediately*
3045	// after the /, but this would involve lexing a lot of what really is the*
3046	// comment, which surely would confuse the parser.
3047	--CurPtr;
3048
3049	// KeepWhitespaceMode should return this broken comment as a token. Since
3050	// it isn't a well formed comment, just return it as an 'unknown' token.
3051	if (isKeepWhitespaceMode()) {
3052	FormTokenWithChars(Result, TokEnd: CurPtr, Kind: tok::unknown);
3053	return true;
3054	}
3055
3056	BufferPtr = CurPtr;
3057	return false;
3058	} else if (C == `'\0'` && isCodeCompletionPoint(CurPtr: CurPtr-`1`)) {
3059	PP->CodeCompleteNaturalLanguage();
3060	cutOffLexing();
3061	return false;
3062	}
3063
3064	C = *CurPtr++;
3065	}
3066
3067	// Notify comment handlers about the comment unless we're in a #if 0 block.
3068	if (PP && !isLexingRawMode() &&
3069	PP->HandleComment(result&: Result, Comment: SourceRange (getSourceLocation(Loc: BufferPtr),
3070	getSourceLocation(Loc: CurPtr)))) {
3071	BufferPtr = CurPtr;
3072	return true; // A token has to be returned.
3073	}
3074
3075	// If we are returning comments as tokens, return this comment as a token.
3076	if (inKeepCommentMode()) {
3077	FormTokenWithChars(Result, TokEnd: CurPtr, Kind: tok::comment);
3078	return true;
3079	}
3080
3081	// It is common for the tokens immediately after a // comment to be
3082	// whitespace. Instead of going through the big switch, handle it
3083	// efficiently now. This is safe even in KeepWhitespaceMode because we would
3084	// have already returned above with the comment as a token.
3085	if (isHorizontalWhitespace(c: *CurPtr)) {
3086	SkipWhitespace(Result, CurPtr: CurPtr + `1`);
3087	return false;
3088	}
3089
3090	// Otherwise, just return so that the next character will be lexed as a token.
3091	BufferPtr = CurPtr;
3092	Result.setFlag(Token::LeadingSpace);
3093	return false;
3094	}
3095
3096	//===----------------------------------------------------------------------===//
3097	// Primary Lexing Entry Points
3098	//===----------------------------------------------------------------------===//
3099
3100	/// ReadToEndOfLine - Read the rest of the current preprocessor line as an
3101	/// uninterpreted string. This switches the lexer out of directive mode.
3102	void Lexer::ReadToEndOfLine(SmallVectorImpl<char> *Result) {
3103	assert(ParsingPreprocessorDirective && ParsingFilename == false &&
3104	"Must be in a preprocessing directive!");
3105	Token Tmp;
3106	Tmp.startToken();
3107
3108	// CurPtr - Cache BufferPtr in an automatic variable.
3109	const char *CurPtr = BufferPtr;
3110	while (true) {
3111	char Char = getAndAdvanceChar(Ptr&: CurPtr, Tok&: Tmp);
3112	switch (Char) {
3113	default:
3114	if (Result)
3115	Result->push_back(Elt: Char);
3116	break;
3117	case `0`: // Null.
3118	// Found end of file?
3119	if (CurPtr-`1` != BufferEnd) {
3120	if (isCodeCompletionPoint(CurPtr: CurPtr-`1`)) {
3121	PP->CodeCompleteNaturalLanguage();
3122	cutOffLexing();
3123	return;
3124	}
3125
3126	// Nope, normal character, continue.
3127	if (Result)
3128	Result->push_back(Elt: Char);
3129	break;
3130	}
3131	// FALL THROUGH.
3132	[[fallthrough]];
3133	case `'\r'`:
3134	case `'\n'`:
3135	// Okay, we found the end of the line. First, back up past the \0, \r, \n.
3136	assert(CurPtr[-`1`] == Char && "Trigraphs for newline?");
3137	BufferPtr = CurPtr-`1`;
3138
3139	// Next, lex the character, which should handle the EOD transition.
3140	Lex(Result&: Tmp);
3141	if (Tmp.is(K: tok::code_completion)) {
3142	if (PP)
3143	PP->CodeCompleteNaturalLanguage();
3144	Lex(Result&: Tmp);
3145	}
3146	assert(Tmp.is(tok::eod) && "Unexpected token!");
3147
3148	// Finally, we're done;
3149	return;
3150	}
3151	}
3152	}
3153
3154	/// LexEndOfFile - CurPtr points to the end of this file. Handle this
3155	/// condition, reporting diagnostics and handling other edge cases as required.
3156	/// This returns true if Result contains a token, false if PP.Lex should be
3157	/// called again.
3158	bool Lexer::LexEndOfFile(Token &Result, const char *CurPtr) {
3159	// If we hit the end of the file while parsing a preprocessor directive,
3160	// end the preprocessor directive first. The next token returned will
3161	// then be the end of file.
3162	if (ParsingPreprocessorDirective) {
3163	// Done parsing the "line".
3164	ParsingPreprocessorDirective = false;
3165	// Update the location of token as well as BufferPtr.
3166	FormTokenWithChars(Result, TokEnd: CurPtr, Kind: tok::eod);
3167
3168	// Restore comment saving mode, in case it was disabled for directive.
3169	if (PP)
3170	resetExtendedTokenMode();
3171	return true; // Have a token.
3172	}
3173
3174	// If we are in raw mode, return this event as an EOF token. Let the caller
3175	// that put us in raw mode handle the event.
3176	if (isLexingRawMode()) {
3177	Result.startToken();
3178	BufferPtr = BufferEnd;
3179	FormTokenWithChars(Result, TokEnd: BufferEnd, Kind: tok::eof);
3180	return true;
3181	}
3182
3183	if (PP->isRecordingPreamble() && PP->isInPrimaryFile()) {
3184	PP->setRecordedPreambleConditionalStack(ConditionalStack);
3185	// If the preamble cuts off the end of a header guard, consider it guarded.
3186	// The guard is valid for the preamble content itself, and for tools the
3187	// most useful answer is "yes, this file has a header guard".
3188	if (!ConditionalStack.empty())
3189	MIOpt.ExitTopLevelConditional();
3190	ConditionalStack.clear();
3191	}
3192
3193	// Issue diagnostics for unterminated #if and missing newline.
3194
3195	// If we are in a #if directive, emit an error.
3196	while (!ConditionalStack.empty()) {
3197	if (PP->getCodeCompletionFileLoc() != FileLoc)
3198	PP->Diag(Loc: ConditionalStack.back().IfLoc,
3199	DiagID: diag::err_pp_unterminated_conditional);
3200	ConditionalStack.pop_back();
3201	}
3202
3203	// Before C++11 and C2y, a file not ending with a newline was UB. Both
3204	// standards changed this behavior (as a DR or equivalent), but we still have
3205	// an opt-in diagnostic to warn about it.
3206	if (CurPtr != BufferStart && (CurPtr[-`1`] != `'\n'` && CurPtr[-`1`] != `'\r'`))
3207	Diag(Loc: BufferEnd, DiagID: diag::warn_no_newline_eof)
3208	<< FixItHint::CreateInsertion(InsertionLoc: getSourceLocation(Loc: BufferEnd), Code: "\n");
3209
3210	BufferPtr = CurPtr;
3211
3212	// Finally, let the preprocessor handle this.
3213	return PP->HandleEndOfFile(Result, isEndOfMacro: isPragmaLexer());
3214	}
3215
3216	/// peekNextPPToken - Return std::nullopt if there are no more tokens in the
3217	/// buffer controlled by this lexer, otherwise return the next unexpanded
3218	/// token.
3219	std::optional<Token> Lexer::peekNextPPToken() {
3220	assert(!LexingRawMode && "How can we expand a macro from a skipping buffer?");
3221
3222	if (isDependencyDirectivesLexer()) {
3223	if (NextDepDirectiveTokenIndex == DepDirectives.front().Tokens.size())
3224	return std::nullopt;
3225	Token Result;
3226	(void)convertDependencyDirectiveToken(
3227	DDTok: DepDirectives.front().Tokens [NextDepDirectiveTokenIndex], Result);
3228	return Result;
3229	}
3230
3231	// Switch to 'skipping' mode. This will ensure that we can lex a token
3232	// without emitting diagnostics, disables macro expansion, and will cause EOF
3233	// to return an EOF token instead of popping the include stack.
3234	LexingRawMode = true;
3235
3236	// Save state that can be changed while lexing so that we can restore it.
3237	const char *TmpBufferPtr = BufferPtr;
3238	bool inPPDirectiveMode = ParsingPreprocessorDirective;
3239	bool atStartOfLine = IsAtStartOfLine;
3240	bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine;
3241	bool leadingSpace = HasLeadingSpace;
3242	MultipleIncludeOpt MIOptState = MIOpt;
3243
3244	Token Tok;
3245	Lex(Result&: Tok);
3246
3247	// Restore state that may have changed.
3248	BufferPtr = TmpBufferPtr;
3249	ParsingPreprocessorDirective = inPPDirectiveMode;
3250	HasLeadingSpace = leadingSpace;
3251	IsAtStartOfLine = atStartOfLine;
3252	IsAtPhysicalStartOfLine = atPhysicalStartOfLine;
3253	MIOpt = MIOptState;
3254	// Restore the lexer back to non-skipping mode.
3255	LexingRawMode = false;
3256
3257	if (Tok.is(K: tok::eof))
3258	return std::nullopt;
3259	return Tok;
3260	}
3261
3262	/// Find the end of a version control conflict marker.
3263	static const char FindConflictEnd(const* char CurPtr, const* char *BufferEnd,
3264	ConflictMarkerKind CMK) {
3265	const char *Terminator = CMK == CMK_Perforce ? "<<<<\n" : ">>>>>>>";
3266	size_t TermLen = CMK == CMK_Perforce ? `5` : `7`;
3267	auto RestOfBuffer = StringRef(CurPtr, BufferEnd - CurPtr).substr(Start: TermLen);
3268	size_t Pos = RestOfBuffer.find(Str: Terminator);
3269	while (Pos != StringRef::npos) {
3270	// Must occur at start of line.
3271	if (Pos == `0` \|\|
3272	(RestOfBuffer [Pos - `1`] != `'\r'` && RestOfBuffer [Pos - `1`] != `'\n'`)) {
3273	RestOfBuffer = RestOfBuffer.substr(Start: Pos+TermLen);
3274	Pos = RestOfBuffer.find(Str: Terminator);
3275	continue;
3276	}
3277	return RestOfBuffer.data()+Pos;
3278	}
3279	return nullptr;
3280	}
3281
3282	/// IsStartOfConflictMarker - If the specified pointer is the start of a version
3283	/// control conflict marker like '<<<<<<<', recognize it as such, emit an error
3284	/// and recover nicely. This returns true if it is a conflict marker and false
3285	/// if not.
3286	bool Lexer::IsStartOfConflictMarker(const char *CurPtr) {
3287	// Only a conflict marker if it starts at the beginning of a line.
3288	if (CurPtr != BufferStart &&
3289	CurPtr[-`1`] != `'\n'` && CurPtr[-`1`] != `'\r'`)
3290	return false;
3291
3292	// Check to see if we have <<<<<<< or >>>>.
3293	if (!StringRef(CurPtr, BufferEnd - CurPtr).starts_with(Prefix: "<<<<<<<") &&
3294	!StringRef(CurPtr, BufferEnd - CurPtr).starts_with(Prefix: ">>>> "))
3295	return false;
3296
3297	// If we have a situation where we don't care about conflict markers, ignore
3298	// it.
3299	if (CurrentConflictMarkerState \|\| isLexingRawMode())
3300	return false;
3301
3302	ConflictMarkerKind Kind = *CurPtr == `'<'` ? CMK_Normal : CMK_Perforce;
3303
3304	// Check to see if there is an ending marker somewhere in the buffer at the
3305	// start of a line to terminate this conflict marker.
3306	if (FindConflictEnd(CurPtr, BufferEnd, CMK: Kind)) {
3307	// We found a match. We are really in a conflict marker.
3308	// Diagnose this, and ignore to the end of line.
3309	Diag(Loc: CurPtr, DiagID: diag::err_conflict_marker);
3310	CurrentConflictMarkerState = Kind;
3311
3312	// Skip ahead to the end of line. We know this exists because the
3313	// end-of-conflict marker starts with \r or \n.
3314	while (CurPtr != `'\r'` && CurPtr != `'\n'`) {
3315	assert(CurPtr != BufferEnd && "Didn't find end of line");
3316	++CurPtr;
3317	}
3318	BufferPtr = CurPtr;
3319	return true;
3320	}
3321
3322	// No end of conflict marker found.
3323	return false;
3324	}
3325
3326	/// HandleEndOfConflictMarker - If this is a '====' or '\|\|\|\|' or '>>>>', or if
3327	/// it is '<<<<' and the conflict marker started with a '>>>>' marker, then it
3328	/// is the end of a conflict marker. Handle it by ignoring up until the end of
3329	/// the line. This returns true if it is a conflict marker and false if not.
3330	bool Lexer::HandleEndOfConflictMarker(const char *CurPtr) {
3331	// Only a conflict marker if it starts at the beginning of a line.
3332	if (CurPtr != BufferStart &&
3333	CurPtr[-`1`] != `'\n'` && CurPtr[-`1`] != `'\r'`)
3334	return false;
3335
3336	// If we have a situation where we don't care about conflict markers, ignore
3337	// it.
3338	if (!CurrentConflictMarkerState \|\| isLexingRawMode())
3339	return false;
3340
3341	// Check to see if we have the marker (4 characters in a row).
3342	for (unsigned i = `1`; i != `4`; ++i)
3343	if (CurPtr[i] != CurPtr[`0`])
3344	return false;
3345
3346	// If we do have it, search for the end of the conflict marker. This could
3347	// fail if it got skipped with a '#if 0' or something. Note that CurPtr might
3348	// be the end of conflict marker.
3349	if (const char *End = FindConflictEnd(CurPtr, BufferEnd,
3350	CMK: CurrentConflictMarkerState)) {
3351	CurPtr = End;
3352
3353	// Skip ahead to the end of line.
3354	while (CurPtr != BufferEnd && CurPtr != `'\r'` && CurPtr != `'\n'`)
3355	++CurPtr;
3356
3357	BufferPtr = CurPtr;
3358
3359	// No longer in the conflict marker.
3360	CurrentConflictMarkerState = CMK_None;
3361	return true;
3362	}
3363
3364	return false;
3365	}
3366
/// Locate the "#>" sequence that closes an editor placeholder.
///
/// Scans [CurPtr, BufferEnd) for the two-character sequence '#' '>'.
///
/// \param CurPtr    first character to examine.
/// \param BufferEnd one past the last character that may be examined.
/// \returns a pointer just past the '>' of the first "#>" found, or nullptr if
/// the range is empty or contains no such sequence.
static const char *findPlaceholderEnd(const char *CurPtr,
                                      const char *BufferEnd) {
  if (CurPtr == BufferEnd)
    return nullptr;
  // Stop one character early so that reading CurPtr[1] stays inside the
  // range.
  const char *const LastStart = BufferEnd - 1;
  for (; CurPtr != LastStart; ++CurPtr) {
    if (CurPtr[0] == '#' && CurPtr[1] == '>')
      return CurPtr + 2;
  }
  return nullptr;
}
3378
3379	bool Lexer::lexEditorPlaceholder(Token &Result, const char *CurPtr) {
3380	assert(CurPtr[-`1`] == `'<'` && CurPtr[`0`] == `'#'` && "Not a placeholder!");
3381	if (!PP \|\| !PP->getPreprocessorOpts().LexEditorPlaceholders \|\| LexingRawMode)
3382	return false;
3383	const char *End = findPlaceholderEnd(CurPtr: CurPtr + `1`, BufferEnd);
3384	if (!End)
3385	return false;
3386	const char *Start = CurPtr - `1`;
3387	if (!LangOpts.AllowEditorPlaceholders)
3388	Diag(Loc: Start, DiagID: diag::err_placeholder_in_source);
3389	Result.startToken();
3390	FormTokenWithChars(Result, TokEnd: End, Kind: tok::raw_identifier);
3391	Result.setRawIdentifierData(Start);
3392	PP->LookUpIdentifierInfo(Identifier&: Result);
3393	Result.setFlag(Token::IsEditorPlaceholder);
3394	BufferPtr = End;
3395	return true;
3396	}
3397
3398	bool Lexer::isCodeCompletionPoint(const char CurPtr) const* {
3399	if (PP && PP->isCodeCompletionEnabled()) {
3400	SourceLocation Loc = FileLoc.getLocWithOffset(Offset: CurPtr-BufferStart);
3401	return Loc == PP->getCodeCompletionLoc();
3402	}
3403
3404	return false;
3405	}
3406
3407	void Lexer::DiagnoseDelimitedOrNamedEscapeSequence(SourceLocation Loc,
3408	bool Named,
3409	const LangOptions &Opts,
3410	DiagnosticsEngine &Diags) {
3411	unsigned DiagId;
3412	if (Opts.CPlusPlus23)
3413	DiagId = diag::warn_cxx23_delimited_escape_sequence;
3414	else if (Opts.C2y && !Named)
3415	DiagId = diag::warn_c2y_delimited_escape_sequence;
3416	else
3417	DiagId = diag::ext_delimited_escape_sequence;
3418
3419	// The trailing arguments are only used by the extension warning; either this
3420	// is a C2y extension or a C++23 extension, unless it's a named escape
3421	// sequence in C, then it's a Clang extension.
3422	unsigned Ext;
3423	if (!Opts.CPlusPlus)
3424	Ext = Named ? `2` / Clang extension / : `1` / C2y extension /;
3425	else
3426	Ext = `0`; // C++23 extension
3427
3428	Diags.Report(Loc, DiagID: DiagId) << Named << Ext;
3429	}
3430
3431	std::optional<uint32_t> Lexer::tryReadNumericUCN(const char *&StartPtr,
3432	const char *SlashLoc,
3433	Token *Result) {
3434	unsigned CharSize;
3435	char Kind = getCharAndSize(Ptr: StartPtr, Size&: CharSize);
3436	assert((Kind == `'u'` \|\| Kind == `'U'`) && "expected a UCN");
3437
3438	unsigned NumHexDigits;
3439	if (Kind == `'u'`)
3440	NumHexDigits = `4`;
3441	else if (Kind == `'U'`)
3442	NumHexDigits = `8`;
3443
3444	bool Delimited = false;
3445	bool FoundEndDelimiter = false;
3446	unsigned Count = `0`;
3447	bool Diagnose = Result && !isLexingRawMode();
3448
3449	if (!LangOpts.CPlusPlus && !LangOpts.C99) {
3450	if (Diagnose)
3451	Diag(Loc: SlashLoc, DiagID: diag::warn_ucn_not_valid_in_c89);
3452	return std::nullopt;
3453	}
3454
3455	const char *CurPtr = StartPtr + CharSize;
3456	const char *KindLoc = &CurPtr[-`1`];
3457
3458	uint32_t CodePoint = `0`;
3459	while (Count != NumHexDigits \|\| Delimited) {
3460	char C = getCharAndSize(Ptr: CurPtr, Size&: CharSize);
3461	if (!Delimited && Count == `0` && C == `'{'`) {
3462	Delimited = true;
3463	CurPtr += CharSize;
3464	continue;
3465	}
3466
3467	if (Delimited && C == `'}'`) {
3468	CurPtr += CharSize;
3469	FoundEndDelimiter = true;
3470	break;
3471	}
3472
3473	unsigned Value = llvm::hexDigitValue(C);
3474	if (Value == std::numeric_limits<unsigned>::max()) {
3475	if (!Delimited)
3476	break;
3477	if (Diagnose)
3478	Diag(Loc: SlashLoc, DiagID: diag::warn_delimited_ucn_incomplete)
3479	<< StringRef(KindLoc, `1`);
3480	return std::nullopt;
3481	}
3482
3483	if (CodePoint & `0xF000'0000`) {
3484	if (Diagnose)
3485	Diag(Loc: KindLoc, DiagID: diag::err_escape_too_large) << `0`;
3486	return std::nullopt;
3487	}
3488
3489	CodePoint <<= `4`;
3490	CodePoint \|= Value;
3491	CurPtr += CharSize;
3492	Count++;
3493	}
3494
3495	if (Count == `0`) {
3496	if (Diagnose)
3497	Diag(Loc: SlashLoc, DiagID: FoundEndDelimiter ? diag::warn_delimited_ucn_empty
3498	: diag::warn_ucn_escape_no_digits)
3499	<< StringRef(KindLoc, `1`);
3500	return std::nullopt;
3501	}
3502
3503	if (Delimited && Kind == `'U'`) {
3504	if (Diagnose)
3505	Diag(Loc: SlashLoc, DiagID: diag::err_hex_escape_no_digits) << StringRef(KindLoc, `1`);
3506	return std::nullopt;
3507	}
3508
3509	if (!Delimited && Count != NumHexDigits) {
3510	if (Diagnose) {
3511	Diag(Loc: SlashLoc, DiagID: diag::warn_ucn_escape_incomplete);
3512	// If the user wrote \U1234, suggest a fixit to \u.
3513	if (Count == `4` && NumHexDigits == `8`) {
3514	CharSourceRange URange = makeCharRange(L&: *this, Begin: KindLoc, End: KindLoc + `1`);
3515	Diag(Loc: KindLoc, DiagID: diag::note_ucn_four_not_eight)
3516	<< FixItHint::CreateReplacement(RemoveRange: URange, Code: "u");
3517	}
3518	}
3519	return std::nullopt;
3520	}
3521
3522	if (Delimited && PP)
3523	DiagnoseDelimitedOrNamedEscapeSequence(Loc: getSourceLocation(Loc: SlashLoc), Named: false,
3524	Opts: PP->getLangOpts(),
3525	Diags&: PP->getDiagnostics());
3526
3527	if (Result) {
3528	Result->setFlag(Token::HasUCN);
3529	// If the UCN contains either a trigraph or a line splicing,
3530	// we need to call getAndAdvanceChar again to set the appropriate flags
3531	// on Result.
3532	if (CurPtr - StartPtr == (ptrdiff_t)(Count + `1` + (Delimited ? `2` : `0`)))
3533	StartPtr = CurPtr;
3534	else
3535	while (StartPtr != CurPtr)
3536	(void)getAndAdvanceChar(Ptr&: StartPtr, Tok&: *Result);
3537	} else {
3538	StartPtr = CurPtr;
3539	}
3540	return CodePoint;
3541	}
3542
3543	std::optional<uint32_t> Lexer::tryReadNamedUCN(const char *&StartPtr,
3544	const char *SlashLoc,
3545	Token *Result) {
3546	unsigned CharSize;
3547	bool Diagnose = Result && !isLexingRawMode();
3548
3549	char C = getCharAndSize(Ptr: StartPtr, Size&: CharSize);
3550	assert(C == `'N'` && "expected \\N{...}");
3551
3552	const char *CurPtr = StartPtr + CharSize;
3553	const char *KindLoc = &CurPtr[-`1`];
3554
3555	C = getCharAndSize(Ptr: CurPtr, Size&: CharSize);
3556	if (C != `'{'`) {
3557	if (Diagnose)
3558	Diag(Loc: SlashLoc, DiagID: diag::warn_ucn_escape_incomplete);
3559	return std::nullopt;
3560	}
3561	CurPtr += CharSize;
3562	const char *StartName = CurPtr;
3563	bool FoundEndDelimiter = false;
3564	llvm::SmallVector<char, `30`> Buffer;
3565	while (C) {
3566	C = getCharAndSize(Ptr: CurPtr, Size&: CharSize);
3567	CurPtr += CharSize;
3568	if (C == `'}'`) {
3569	FoundEndDelimiter = true;
3570	break;
3571	}
3572
3573	if (isVerticalWhitespace(c: C))
3574	break;
3575	Buffer.push_back(Elt: C);
3576	}
3577
3578	if (!FoundEndDelimiter \|\| Buffer.empty()) {
3579	if (Diagnose)
3580	Diag(Loc: SlashLoc, DiagID: FoundEndDelimiter ? diag::warn_delimited_ucn_empty
3581	: diag::warn_delimited_ucn_incomplete)
3582	<< StringRef(KindLoc, `1`);
3583	return std::nullopt;
3584	}
3585
3586	StringRef Name(Buffer.data(), Buffer.size());
3587	std::optional<char32_t> Match =
3588	llvm::sys::unicode::nameToCodepointStrict(Name);
3589	std::optional<llvm::sys::unicode::LooseMatchingResult> LooseMatch;
3590	if (!Match) {
3591	LooseMatch = llvm::sys::unicode::nameToCodepointLooseMatching(Name);
3592	if (Diagnose) {
3593	Diag(Loc: StartName, DiagID: diag::err_invalid_ucn_name)
3594	<< StringRef(Buffer.data(), Buffer.size())
3595	<< makeCharRange(L&: *this, Begin: StartName, End: CurPtr - CharSize);
3596	if (LooseMatch) {
3597	Diag(Loc: StartName, DiagID: diag::note_invalid_ucn_name_loose_matching)
3598	<< FixItHint::CreateReplacement(
3599	RemoveRange: makeCharRange(L&: *this, Begin: StartName, End: CurPtr - CharSize),
3600	Code: LooseMatch ->Name);
3601	}
3602	}
3603	// We do not offer misspelled character names suggestions here
3604	// as the set of what would be a valid suggestion depends on context,
3605	// and we should not make invalid suggestions.
3606	}
3607
3608	if (Diagnose && Match)
3609	DiagnoseDelimitedOrNamedEscapeSequence(Loc: getSourceLocation(Loc: SlashLoc), Named: true,
3610	Opts: PP->getLangOpts(),
3611	Diags&: PP->getDiagnostics());
3612
3613	// If no diagnostic has been emitted yet, likely because we are doing a
3614	// tentative lexing, we do not want to recover here to make sure the token
3615	// will not be incorrectly considered valid. This function will be called
3616	// again and a diagnostic emitted then.
3617	if (LooseMatch && Diagnose)
3618	Match = LooseMatch ->CodePoint;
3619
3620	if (Result) {
3621	Result->setFlag(Token::HasUCN);
3622	// If the UCN contains either a trigraph or a line splicing,
3623	// we need to call getAndAdvanceChar again to set the appropriate flags
3624	// on Result.
3625	if (CurPtr - StartPtr == (ptrdiff_t)(Buffer.size() + `3`))
3626	StartPtr = CurPtr;
3627	else
3628	while (StartPtr != CurPtr)
3629	(void)getAndAdvanceChar(Ptr&: StartPtr, Tok&: *Result);
3630	} else {
3631	StartPtr = CurPtr;
3632	}
3633	return Match ? std::optional<uint32_t>(*Match) : std::nullopt;
3634	}
3635
3636	uint32_t Lexer::tryReadUCN(const char &StartPtr, const* char *SlashLoc,
3637	Token *Result) {
3638
3639	unsigned CharSize;
3640	std::optional<uint32_t> CodePointOpt;
3641	char Kind = getCharAndSize(Ptr: StartPtr, Size&: CharSize);
3642	if (Kind == `'u'` \|\| Kind == `'U'`)
3643	CodePointOpt = tryReadNumericUCN(StartPtr, SlashLoc, Result);
3644	else if (Kind == `'N'`)
3645	CodePointOpt = tryReadNamedUCN(StartPtr, SlashLoc, Result);
3646
3647	if (!CodePointOpt)
3648	return `0`;
3649
3650	uint32_t CodePoint = *CodePointOpt;
3651
3652	// Don't apply C family restrictions to UCNs in assembly mode
3653	if (LangOpts.AsmPreprocessor)
3654	return CodePoint;
3655
3656	// C23 6.4.3p2: A universal character name shall not designate a code point
3657	// where the hexadecimal value is:
3658	// - in the range D800 through DFFF inclusive; or
3659	// - greater than 10FFFF.
3660	// A universal-character-name outside the c-char-sequence of a character
3661	// constant, or the s-char-sequence of a string-literal shall not designate
3662	// a control character or a character in the basic character set.
3663
3664	// C++11 [lex.charset]p2: If the hexadecimal value for a
3665	// universal-character-name corresponds to a surrogate code point (in the
3666	// range 0xD800-0xDFFF, inclusive), the program is ill-formed. Additionally,
3667	// if the hexadecimal value for a universal-character-name outside the
3668	// c-char-sequence, s-char-sequence, or r-char-sequence of a character or
3669	// string literal corresponds to a control character (in either of the
3670	// ranges 0x00-0x1F or 0x7F-0x9F, both inclusive) or to a character in the
3671	// basic source character set, the program is ill-formed.
3672	if (CodePoint < `0xA0`) {
3673	// We don't use isLexingRawMode() here because we need to warn about bad
3674	// UCNs even when skipping preprocessing tokens in a #if block.
3675	if (Result && PP) {
3676	if (CodePoint < `0x20` \|\| CodePoint >= `0x7F`)
3677	Diag(Loc: BufferPtr, DiagID: diag::err_ucn_control_character);
3678	else {
3679	char C = static_cast<char>(CodePoint);
3680	Diag(Loc: BufferPtr, DiagID: diag::err_ucn_escape_basic_scs) << StringRef(&C, `1`);
3681	}
3682	}
3683
3684	return `0`;
3685	} else if (CodePoint >= `0xD800` && CodePoint <= `0xDFFF`) {
3686	// C++03 allows UCNs representing surrogate characters. C99 and C++11 don't.
3687	// We don't use isLexingRawMode() here because we need to diagnose bad
3688	// UCNs even when skipping preprocessing tokens in a #if block.
3689	if (Result && PP) {
3690	if (LangOpts.CPlusPlus && !LangOpts.CPlusPlus11)
3691	Diag(Loc: BufferPtr, DiagID: diag::warn_ucn_escape_surrogate);
3692	else
3693	Diag(Loc: BufferPtr, DiagID: diag::err_ucn_escape_invalid);
3694	}
3695	return `0`;
3696	}
3697
3698	return CodePoint;
3699	}
3700
3701	bool Lexer::CheckUnicodeWhitespace(Token &Result, uint32_t C,
3702	const char *CurPtr) {
3703	if (!isLexingRawMode() && !PP->isPreprocessedOutput() &&
3704	isUnicodeWhitespace(Codepoint: C)) {
3705	Diag(Loc: BufferPtr, DiagID: diag::ext_unicode_whitespace)
3706	<< makeCharRange(L&: *this, Begin: BufferPtr, End: CurPtr);
3707
3708	Result.setFlag(Token::LeadingSpace);
3709	return true;
3710	}
3711	return false;
3712	}
3713
3714	void Lexer::PropagateLineStartLeadingSpaceInfo(Token &Result) {
3715	IsAtStartOfLine = Result.isAtStartOfLine();
3716	HasLeadingSpace = Result.hasLeadingSpace();
3717	HasLeadingEmptyMacro = Result.hasLeadingEmptyMacro();
3718	// Note that this doesn't affect IsAtPhysicalStartOfLine.
3719	}
3720
3721	bool Lexer::Lex(Token &Result) {
3722	assert(!isDependencyDirectivesLexer());
3723
3724	// Start a new token.
3725	Result.startToken();
3726
3727	// Set up misc whitespace flags for LexTokenInternal.
3728	if (IsAtStartOfLine) {
3729	Result.setFlag(Token::StartOfLine);
3730	IsAtStartOfLine = false;
3731	}
3732
3733	if (IsAtPhysicalStartOfLine) {
3734	Result.setFlag(Token::PhysicalStartOfLine);
3735	IsAtPhysicalStartOfLine = false;
3736	}
3737
3738	if (HasLeadingSpace) {
3739	Result.setFlag(Token::LeadingSpace);
3740	HasLeadingSpace = false;
3741	}
3742
3743	if (HasLeadingEmptyMacro) {
3744	Result.setFlag(Token::LeadingEmptyMacro);
3745	HasLeadingEmptyMacro = false;
3746	}
3747
3748	bool isRawLex = isLexingRawMode();
3749	(void) isRawLex;
3750	bool returnedToken = LexTokenInternal(Result);
3751	// (After the LexTokenInternal call, the lexer might be destroyed.)
3752	assert((returnedToken \|\| !isRawLex) && "Raw lex must succeed");
3753	return returnedToken;
3754	}
3755
3756	/// LexTokenInternal - This implements a simple C family lexer. It is an
3757	/// extremely performance critical piece of code. This assumes that the buffer
3758	/// has a null character at the end of the file. This returns a preprocessing
3759	/// token, not a normal token, as such, it is an internal interface. It assumes
3760	/// that the Flags of result have been cleared before calling this.
3761	bool Lexer::LexTokenInternal(Token &Result) {
3762	LexStart:
3763	assert(!Result.needsCleaning() && "Result needs cleaning");
3764	assert(!Result.hasPtrData() && "Result has not been reset");
3765
3766	// CurPtr - Cache BufferPtr in an automatic variable.
3767	const char *CurPtr = BufferPtr;
3768
3769	// Small amounts of horizontal whitespace is very common between tokens.
3770	// Check for space character separately to skip the expensive
3771	// isHorizontalWhitespace() check
3772	if (CurPtr == `' '` \|\| isHorizontalWhitespace(c: CurPtr)) {
3773	do {
3774	++CurPtr;
3775	} while (CurPtr == `' '` \|\| isHorizontalWhitespace(c: CurPtr));
3776
3777	// If we are keeping whitespace and other tokens, just return what we just
3778	// skipped. The next lexer invocation will return the token after the
3779	// whitespace.
3780	if (isKeepWhitespaceMode()) {
3781	FormTokenWithChars(Result, TokEnd: CurPtr, Kind: tok::unknown);
3782	// FIXME: The next token will not have LeadingSpace set.
3783	return true;
3784	}
3785
3786	BufferPtr = CurPtr;
3787	Result.setFlag(Token::LeadingSpace);
3788	}
3789
3790	unsigned SizeTmp, SizeTmp2; // Temporaries for use in cases below.
3791
3792	// Read a character, advancing over it.
3793	char Char = getAndAdvanceChar(Ptr&: CurPtr, Tok&: Result);
3794	tok::TokenKind Kind;
3795
3796	if (!isVerticalWhitespace(c: Char))
3797	NewLinePtr = nullptr;
3798
3799	switch (Char) {
3800	case `0`: // Null.
3801	// Found end of file?
3802	if (CurPtr-`1` == BufferEnd)
3803	return LexEndOfFile(Result, CurPtr: CurPtr-`1`);
3804
3805	// Check if we are performing code completion.
3806	if (isCodeCompletionPoint(CurPtr: CurPtr-`1`)) {
3807	// Return the code-completion token.
3808	Result.startToken();
3809	FormTokenWithChars(Result, TokEnd: CurPtr, Kind: tok::code_completion);
3810	return true;
3811	}
3812
3813	if (!isLexingRawMode())
3814	Diag(Loc: CurPtr-`1`, DiagID: diag::null_in_file);
3815	Result.setFlag(Token::LeadingSpace);
3816	if (SkipWhitespace(Result, CurPtr))
3817	return true; // KeepWhitespaceMode
3818
3819	// We know the lexer hasn't changed, so just try again with this lexer.
3820	// (We manually eliminate the tail call to avoid recursion.)
3821	goto LexNextToken;
3822
3823	case `26`: // DOS & CP/M EOF: "^Z".
3824	// If we're in Microsoft extensions mode, treat this as end of file.
3825	if (LangOpts.MicrosoftExt) {
3826	if (!isLexingRawMode())
3827	Diag(Loc: CurPtr-`1`, DiagID: diag::ext_ctrl_z_eof_microsoft);
3828	return LexEndOfFile(Result, CurPtr: CurPtr-`1`);
3829	}
3830
3831	// If Microsoft extensions are disabled, this is just random garbage.
3832	Kind = tok::unknown;
3833	break;
3834
3835	case `'\r'`:
3836	if (CurPtr[`0`] == `'\n'`)
3837	(void)getAndAdvanceChar(Ptr&: CurPtr, Tok&: Result);
3838	[[fallthrough]];
3839	case `'\n'`:
3840	// If we are inside a preprocessor directive and we see the end of line,
3841	// we know we are done with the directive, so return an EOD token.
3842	if (ParsingPreprocessorDirective) {
3843	// Done parsing the "line".
3844	ParsingPreprocessorDirective = false;
3845
3846	// Restore comment saving mode, in case it was disabled for directive.
3847	if (PP)
3848	resetExtendedTokenMode();
3849
3850	// Since we consumed a newline, we are back at the start of a line.
3851	IsAtStartOfLine = true;
3852	IsAtPhysicalStartOfLine = true;
3853	NewLinePtr = CurPtr - `1`;
3854
3855	Kind = tok::eod;
3856	break;
3857	}
3858
3859	// No leading whitespace seen so far.
3860	Result.clearFlag(Flag: Token::LeadingSpace);
3861
3862	if (SkipWhitespace(Result, CurPtr))
3863	return true; // KeepWhitespaceMode
3864
3865	// We only saw whitespace, so just try again with this lexer.
3866	// (We manually eliminate the tail call to avoid recursion.)
3867	goto LexNextToken;
3868	case `' '`:
3869	case `'\t'`:
3870	case `'\f'`:
3871	case `'\v'`:
3872	SkipHorizontalWhitespace:
3873	Result.setFlag(Token::LeadingSpace);
3874	if (SkipWhitespace(Result, CurPtr))
3875	return true; // KeepWhitespaceMode
3876
3877	SkipIgnoredUnits:
3878	CurPtr = BufferPtr;
3879
3880	// If the next token is obviously a // or / / comment, skip it efficiently
3881	// too (without going through the big switch stmt).
3882	if (CurPtr[`0`] == `'/'` && CurPtr[`1`] == `'/'` && !inKeepCommentMode() &&
3883	LineComment && (LangOpts.CPlusPlus \|\| !LangOpts.TraditionalCPP)) {
3884	if (SkipLineComment(Result, CurPtr: CurPtr + `2`))
3885	return true; // There is a token to return.
3886	goto SkipIgnoredUnits;
3887	} else if (CurPtr[`0`] == `'/'` && CurPtr[`1`] == `'*'` && !inKeepCommentMode()) {
3888	if (SkipBlockComment(Result, CurPtr: CurPtr + `2`))
3889	return true; // There is a token to return.
3890	goto SkipIgnoredUnits;
3891	} else if (isHorizontalWhitespace(c: *CurPtr)) {
3892	goto SkipHorizontalWhitespace;
3893	}
3894	// We only saw whitespace, so just try again with this lexer.
3895	// (We manually eliminate the tail call to avoid recursion.)
3896	goto LexNextToken;
3897
3898	// C99 6.4.4.1: Integer Constants.
3899	// C99 6.4.4.2: Floating Constants.
3900	case `'0'`: case `'1'`: case `'2'`: case `'3'`: case `'4'`:
3901	case `'5'`: case `'6'`: case `'7'`: case `'8'`: case `'9'`:
3902	// Notify MIOpt that we read a non-whitespace/non-comment token.
3903	MIOpt.ReadToken();
3904	return LexNumericConstant(Result, CurPtr);
3905
3906	// Identifier (e.g., uber), or
3907	// UTF-8 (C23/C++17) or UTF-16 (C11/C++11) character literal, or
3908	// UTF-8 or UTF-16 string literal (C11/C++11).
3909	case `'u'`:
3910	// Notify MIOpt that we read a non-whitespace/non-comment token.
3911	MIOpt.ReadToken();
3912
3913	if (LangOpts.CPlusPlus11 \|\| LangOpts.C11) {
3914	Char = getCharAndSize(Ptr: CurPtr, Size&: SizeTmp);
3915
3916	// UTF-16 string literal
3917	if (Char == `'"'`)
3918	return LexStringLiteral(Result, CurPtr: ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result),
3919	Kind: tok::utf16_string_literal);
3920
3921	// UTF-16 character constant
3922	if (Char == `'\''`)
3923	return LexCharConstant(Result, CurPtr: ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result),
3924	Kind: tok::utf16_char_constant);
3925
3926	// UTF-16 raw string literal
3927	if (Char == `'R'` && LangOpts.RawStringLiterals &&
3928	getCharAndSize(Ptr: CurPtr + SizeTmp, Size&: SizeTmp2) == `'"'`)
3929	return LexRawStringLiteral(Result,
3930	CurPtr: ConsumeChar(Ptr: ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result),
3931	Size: SizeTmp2, Tok&: Result),
3932	Kind: tok::utf16_string_literal);
3933
3934	if (Char == `'8'`) {
3935	char Char2 = getCharAndSize(Ptr: CurPtr + SizeTmp, Size&: SizeTmp2);
3936
3937	// UTF-8 string literal
3938	if (Char2 == `'"'`)
3939	return LexStringLiteral(Result,
3940	CurPtr: ConsumeChar(Ptr: ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result),
3941	Size: SizeTmp2, Tok&: Result),
3942	Kind: tok::utf8_string_literal);
3943	if (Char2 == `'\''` && (LangOpts.CPlusPlus17 \|\| LangOpts.C23))
3944	return LexCharConstant(
3945	Result, CurPtr: ConsumeChar(Ptr: ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result),
3946	Size: SizeTmp2, Tok&: Result),
3947	Kind: tok::utf8_char_constant);
3948
3949	if (Char2 == `'R'` && LangOpts.RawStringLiterals) {
3950	unsigned SizeTmp3;
3951	char Char3 = getCharAndSize(Ptr: CurPtr + SizeTmp + SizeTmp2, Size&: SizeTmp3);
3952	// UTF-8 raw string literal
3953	if (Char3 == `'"'`) {
3954	return LexRawStringLiteral(Result,
3955	CurPtr: ConsumeChar(Ptr: ConsumeChar(Ptr: ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result),
3956	Size: SizeTmp2, Tok&: Result),
3957	Size: SizeTmp3, Tok&: Result),
3958	Kind: tok::utf8_string_literal);
3959	}
3960	}
3961	}
3962	}
3963
3964	// treat u like the start of an identifier.
3965	return LexIdentifierContinue(Result, CurPtr);
3966
3967	case `'U'`: // Identifier (e.g. Uber) or C11/C++11 UTF-32 string literal
3968	// Notify MIOpt that we read a non-whitespace/non-comment token.
3969	MIOpt.ReadToken();
3970
3971	if (LangOpts.CPlusPlus11 \|\| LangOpts.C11) {
3972	Char = getCharAndSize(Ptr: CurPtr, Size&: SizeTmp);
3973
3974	// UTF-32 string literal
3975	if (Char == `'"'`)
3976	return LexStringLiteral(Result, CurPtr: ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result),
3977	Kind: tok::utf32_string_literal);
3978
3979	// UTF-32 character constant
3980	if (Char == `'\''`)
3981	return LexCharConstant(Result, CurPtr: ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result),
3982	Kind: tok::utf32_char_constant);
3983
3984	// UTF-32 raw string literal
3985	if (Char == `'R'` && LangOpts.RawStringLiterals &&
3986	getCharAndSize(Ptr: CurPtr + SizeTmp, Size&: SizeTmp2) == `'"'`)
3987	return LexRawStringLiteral(Result,
3988	CurPtr: ConsumeChar(Ptr: ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result),
3989	Size: SizeTmp2, Tok&: Result),
3990	Kind: tok::utf32_string_literal);
3991	}
3992
3993	// treat U like the start of an identifier.
3994	return LexIdentifierContinue(Result, CurPtr);
3995
3996	case `'R'`: // Identifier or C++0x raw string literal
3997	// Notify MIOpt that we read a non-whitespace/non-comment token.
3998	MIOpt.ReadToken();
3999
4000	if (LangOpts.RawStringLiterals) {
4001	Char = getCharAndSize(Ptr: CurPtr, Size&: SizeTmp);
4002
4003	if (Char == `'"'`)
4004	return LexRawStringLiteral(Result,
4005	CurPtr: ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result),
4006	Kind: tok::string_literal);
4007	}
4008
4009	// treat R like the start of an identifier.
4010	return LexIdentifierContinue(Result, CurPtr);
4011
4012	case `'L'`: // Identifier (Loony) or wide literal (L'x' or L"xyz").
4013	// Notify MIOpt that we read a non-whitespace/non-comment token.
4014	MIOpt.ReadToken();
4015	Char = getCharAndSize(Ptr: CurPtr, Size&: SizeTmp);
4016
4017	// Wide string literal.
4018	if (Char == `'"'`)
4019	return LexStringLiteral(Result, CurPtr: ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result),
4020	Kind: tok::wide_string_literal);
4021
4022	// Wide raw string literal.
4023	if (LangOpts.RawStringLiterals && Char == `'R'` &&
4024	getCharAndSize(Ptr: CurPtr + SizeTmp, Size&: SizeTmp2) == `'"'`)
4025	return LexRawStringLiteral(Result,
4026	CurPtr: ConsumeChar(Ptr: ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result),
4027	Size: SizeTmp2, Tok&: Result),
4028	Kind: tok::wide_string_literal);
4029
4030	// Wide character constant.
4031	if (Char == `'\''`)
4032	return LexCharConstant(Result, CurPtr: ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result),
4033	Kind: tok::wide_char_constant);
4034	// FALL THROUGH, treating L like the start of an identifier.
4035	[[fallthrough]];
4036
4037	// C99 6.4.2: Identifiers.
4038	case `'A'`: case `'B'`: case `'C'`: case `'D'`: case `'E'`: case `'F'`: case `'G'`:
4039	case `'H'`: case `'I'`: case `'J'`: case `'K'`: /'L'/case `'M'`: case `'N'`:
4040	case `'O'`: case `'P'`: case `'Q'`: /'R'/case `'S'`: case `'T'`: /'U'/
4041	case `'V'`: case `'W'`: case `'X'`: case `'Y'`: case `'Z'`:
4042	case `'a'`: case `'b'`: case `'c'`: case `'d'`: case `'e'`: case `'f'`: case `'g'`:
4043	case `'h'`: case `'i'`: case `'j'`: case `'k'`: case `'l'`: case `'m'`: case `'n'`:
4044	case `'o'`: case `'p'`: case `'q'`: case `'r'`: case `'s'`: case `'t'`: /'u'/
4045	case `'v'`: case `'w'`: case `'x'`: case `'y'`: case `'z'`:
4046	case `'_'`:
4047	// Notify MIOpt that we read a non-whitespace/non-comment token.
4048	MIOpt.ReadToken();
4049	return LexIdentifierContinue(Result, CurPtr);
4050	case `'$'`: // $ in identifiers.
4051	if (LangOpts.DollarIdents) {
4052	if (!isLexingRawMode())
4053	Diag(Loc: CurPtr-`1`, DiagID: diag::ext_dollar_in_identifier);
4054	// Notify MIOpt that we read a non-whitespace/non-comment token.
4055	MIOpt.ReadToken();
4056	return LexIdentifierContinue(Result, CurPtr);
4057	}
4058
4059	Kind = tok::unknown;
4060	break;
4061
4062	// C99 6.4.4: Character Constants.
4063	case `'\''`:
4064	// Notify MIOpt that we read a non-whitespace/non-comment token.
4065	MIOpt.ReadToken();
4066	return LexCharConstant(Result, CurPtr, Kind: tok::char_constant);
4067
4068	// C99 6.4.5: String Literals.
4069	case `'"'`:
4070	// Notify MIOpt that we read a non-whitespace/non-comment token.
4071	MIOpt.ReadToken();
4072	return LexStringLiteral(Result, CurPtr,
4073	Kind: ParsingFilename ? tok::header_name
4074	: tok::string_literal);
4075
4076	// C99 6.4.6: Punctuators.
4077	case `'?'`:
4078	Kind = tok::question;
4079	break;
4080	case `'['`:
4081	Kind = tok::l_square;
4082	break;
4083	case `']'`:
4084	Kind = tok::r_square;
4085	break;
4086	case `'('`:
4087	Kind = tok::l_paren;
4088	break;
4089	case `')'`:
4090	Kind = tok::r_paren;
4091	break;
4092	case `'{'`:
4093	Kind = tok::l_brace;
4094	break;
4095	case `'}'`:
4096	Kind = tok::r_brace;
4097	break;
4098	case `'.'`:
4099	Char = getCharAndSize(Ptr: CurPtr, Size&: SizeTmp);
4100	if (Char >= `'0'` && Char <= `'9'`) {
4101	// Notify MIOpt that we read a non-whitespace/non-comment token.
4102	MIOpt.ReadToken();
4103
4104	return LexNumericConstant(Result, CurPtr: ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result));
4105	} else if (LangOpts.CPlusPlus && Char == `'*'`) {
4106	Kind = tok::periodstar;
4107	CurPtr += SizeTmp;
4108	} else if (Char == `'.'` &&
4109	getCharAndSize(Ptr: CurPtr+SizeTmp, Size&: SizeTmp2) == `'.'`) {
4110	Kind = tok::ellipsis;
4111	CurPtr = ConsumeChar(Ptr: ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result),
4112	Size: SizeTmp2, Tok&: Result);
4113	} else {
4114	Kind = tok::period;
4115	}
4116	break;
4117	case `'&'`:
4118	Char = getCharAndSize(Ptr: CurPtr, Size&: SizeTmp);
4119	if (Char == `'&'`) {
4120	Kind = tok::ampamp;
4121	CurPtr = ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result);
4122	} else if (Char == `'='`) {
4123	Kind = tok::ampequal;
4124	CurPtr = ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result);
4125	} else {
4126	Kind = tok::amp;
4127	}
4128	break;
4129	case `'*'`:
4130	if (getCharAndSize(Ptr: CurPtr, Size&: SizeTmp) == `'='`) {
4131	Kind = tok::starequal;
4132	CurPtr = ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result);
4133	} else {
4134	Kind = tok::star;
4135	}
4136	break;
4137	case `'+'`:
4138	Char = getCharAndSize(Ptr: CurPtr, Size&: SizeTmp);
4139	if (Char == `'+'`) {
4140	CurPtr = ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result);
4141	Kind = tok::plusplus;
4142	} else if (Char == `'='`) {
4143	CurPtr = ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result);
4144	Kind = tok::plusequal;
4145	} else {
4146	Kind = tok::plus;
4147	}
4148	break;
4149	case `'-'`:
4150	Char = getCharAndSize(Ptr: CurPtr, Size&: SizeTmp);
4151	if (Char == `'-'`) { // --
4152	CurPtr = ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result);
4153	Kind = tok::minusminus;
4154	} else if (Char == `'>'` && LangOpts.CPlusPlus &&
4155	getCharAndSize(Ptr: CurPtr+SizeTmp, Size&: SizeTmp2) == `''`) { // C++ ->
4156	CurPtr = ConsumeChar(Ptr: ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result),
4157	Size: SizeTmp2, Tok&: Result);
4158	Kind = tok::arrowstar;
4159	} else if (Char == `'>'`) { // ->
4160	CurPtr = ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result);
4161	Kind = tok::arrow;
4162	} else if (Char == `'='`) { // -=
4163	CurPtr = ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result);
4164	Kind = tok::minusequal;
4165	} else {
4166	Kind = tok::minus;
4167	}
4168	break;
4169	case `'~'`:
4170	Kind = tok::tilde;
4171	break;
4172	case `'!'`:
4173	if (getCharAndSize(Ptr: CurPtr, Size&: SizeTmp) == `'='`) {
4174	Kind = tok::exclaimequal;
4175	CurPtr = ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result);
4176	} else {
4177	Kind = tok::exclaim;
4178	}
4179	break;
4180	case `'/'`:
4181	// 6.4.9: Comments
4182	Char = getCharAndSize(Ptr: CurPtr, Size&: SizeTmp);
4183	if (Char == `'/'`) { // Line comment.
4184	// Even if Line comments are disabled (e.g. in C89 mode), we generally
4185	// want to lex this as a comment. There is one problem with this though,
4186	// that in one particular corner case, this can change the behavior of the
4187	// resultant program. For example, In "foo //**/ bar", C89 would lex
4188	// this as "foo / bar" and languages with Line comments would lex it as
4189	// "foo". Check to see if the character after the second slash is a '*'.
4190	// If so, we will lex that as a "/" instead of the start of a comment.
4191	// However, we never do this if we are just preprocessing.
4192	bool TreatAsComment =
4193	LineComment && (LangOpts.CPlusPlus \|\| !LangOpts.TraditionalCPP);
4194	if (!TreatAsComment)
4195	if (!(PP && PP->isPreprocessedOutput()))
4196	TreatAsComment = getCharAndSize(Ptr: CurPtr+SizeTmp, Size&: SizeTmp2) != `'*'`;
4197
4198	if (TreatAsComment) {
4199	if (SkipLineComment(Result, CurPtr: ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result)))
4200	return true; // There is a token to return.
4201
4202	// It is common for the tokens immediately after a // comment to be
4203	// whitespace (indentation for the next line). Instead of going through
4204	// the big switch, handle it efficiently now.
4205	goto SkipIgnoredUnits;
4206	}
4207	}
4208
4209	if (Char == `''`) { // /*/ comment.
4210	if (SkipBlockComment(Result, CurPtr: ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result)))
4211	return true; // There is a token to return.
4212
4213	// We only saw whitespace, so just try again with this lexer.
4214	// (We manually eliminate the tail call to avoid recursion.)
4215	goto LexNextToken;
4216	}
4217
4218	if (Char == `'='`) {
4219	CurPtr = ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result);
4220	Kind = tok::slashequal;
4221	} else {
4222	Kind = tok::slash;
4223	}
4224	break;
4225	case `'%'`:
4226	Char = getCharAndSize(Ptr: CurPtr, Size&: SizeTmp);
4227	if (Char == `'='`) {
4228	Kind = tok::percentequal;
4229	CurPtr = ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result);
4230	} else if (LangOpts.Digraphs && Char == `'>'`) {
4231	Kind = tok::r_brace; // '%>' -> '}'
4232	CurPtr = ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result);
4233	} else if (LangOpts.Digraphs && Char == `':'`) {
4234	CurPtr = ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result);
4235	Char = getCharAndSize(Ptr: CurPtr, Size&: SizeTmp);
4236	if (Char == `'%'` && getCharAndSize(Ptr: CurPtr+SizeTmp, Size&: SizeTmp2) == `':'`) {
4237	Kind = tok::hashhash; // '%:%:' -> '##'
4238	CurPtr = ConsumeChar(Ptr: ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result),
4239	Size: SizeTmp2, Tok&: Result);
4240	} else if (Char == `'@'` && LangOpts.MicrosoftExt) {// %:@ -> #@ -> Charize
4241	CurPtr = ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result);
4242	if (!isLexingRawMode())
4243	Diag(Loc: BufferPtr, DiagID: diag::ext_charize_microsoft);
4244	Kind = tok::hashat;
4245	} else { // '%:' -> '#'
4246	// We parsed a # character. If this occurs at the start of the line,
4247	// it's actually the start of a preprocessing directive. Callback to
4248	// the preprocessor to handle it.
4249	// TODO: -fpreprocessed mode??
4250	if (Result.isAtPhysicalStartOfLine() && !LexingRawMode &&
4251	!Is_PragmaLexer)
4252	goto HandleDirective;
4253
4254	Kind = tok::hash;
4255	}
4256	} else {
4257	Kind = tok::percent;
4258	}
4259	break;
4260	case `'<'`:
4261	Char = getCharAndSize(Ptr: CurPtr, Size&: SizeTmp);
4262	if (ParsingFilename) {
4263	return LexAngledStringLiteral(Result, CurPtr);
4264	} else if (Char == `'<'`) {
4265	char After = getCharAndSize(Ptr: CurPtr+SizeTmp, Size&: SizeTmp2);
4266	if (After == `'='`) {
4267	Kind = tok::lesslessequal;
4268	CurPtr = ConsumeChar(Ptr: ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result),
4269	Size: SizeTmp2, Tok&: Result);
4270	} else if (After == `'<'` && IsStartOfConflictMarker(CurPtr: CurPtr-`1`)) {
4271	// If this is actually a '<<<<<<<' version control conflict marker,
4272	// recognize it as such and recover nicely.
4273	goto LexNextToken;
4274	} else if (After == `'<'` && HandleEndOfConflictMarker(CurPtr: CurPtr-`1`)) {
4275	// If this is '<<<<' and we're in a Perforce-style conflict marker,
4276	// ignore it.
4277	goto LexNextToken;
4278	} else if (LangOpts.CUDA && After == `'<'`) {
4279	Kind = tok::lesslessless;
4280	CurPtr = ConsumeChar(Ptr: ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result),
4281	Size: SizeTmp2, Tok&: Result);
4282	} else {
4283	CurPtr = ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result);
4284	Kind = tok::lessless;
4285	}
4286	} else if (Char == `'='`) {
4287	char After = getCharAndSize(Ptr: CurPtr+SizeTmp, Size&: SizeTmp2);
4288	if (After == `'>'`) {
4289	if (LangOpts.CPlusPlus20) {
4290	if (!isLexingRawMode())
4291	Diag(Loc: BufferPtr, DiagID: diag::warn_cxx17_compat_spaceship);
4292	CurPtr = ConsumeChar(Ptr: ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result),
4293	Size: SizeTmp2, Tok&: Result);
4294	Kind = tok::spaceship;
4295	break;
4296	}
4297	// Suggest adding a space between the '<=' and the '>' to avoid a
4298	// change in semantics if this turns up in C++ <=17 mode.
4299	if (LangOpts.CPlusPlus && !isLexingRawMode()) {
4300	Diag(Loc: BufferPtr, DiagID: diag::warn_cxx20_compat_spaceship)
4301	<< FixItHint::CreateInsertion(
4302	InsertionLoc: getSourceLocation(Loc: CurPtr + SizeTmp, TokLen: SizeTmp2), Code: " ");
4303	}
4304	}
4305	CurPtr = ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result);
4306	Kind = tok::lessequal;
4307	} else if (LangOpts.Digraphs && Char == `':'`) { // '<:' -> '['
4308	if (LangOpts.CPlusPlus11 &&
4309	getCharAndSize(Ptr: CurPtr + SizeTmp, Size&: SizeTmp2) == `':'`) {
4310	// C++0x [lex.pptoken]p3:
4311	// Otherwise, if the next three characters are <:: and the subsequent
4312	// character is neither : nor >, the < is treated as a preprocessor
4313	// token by itself and not as the first character of the alternative
4314	// token <:.
4315	unsigned SizeTmp3;
4316	char After = getCharAndSize(Ptr: CurPtr + SizeTmp + SizeTmp2, Size&: SizeTmp3);
4317	if (After != `':'` && After != `'>'`) {
4318	Kind = tok::less;
4319	if (!isLexingRawMode())
4320	Diag(Loc: BufferPtr, DiagID: diag::warn_cxx98_compat_less_colon_colon);
4321	break;
4322	}
4323	}
4324
4325	CurPtr = ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result);
4326	Kind = tok::l_square;
4327	} else if (LangOpts.Digraphs && Char == `'%'`) { // '<%' -> '{'
4328	CurPtr = ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result);
4329	Kind = tok::l_brace;
4330	} else if (Char == `'#'` && /Not a trigraph/ SizeTmp == `1` &&
4331	lexEditorPlaceholder(Result, CurPtr)) {
4332	return true;
4333	} else {
4334	Kind = tok::less;
4335	}
4336	break;
4337	case `'>'`:
4338	Char = getCharAndSize(Ptr: CurPtr, Size&: SizeTmp);
4339	if (Char == `'='`) {
4340	CurPtr = ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result);
4341	Kind = tok::greaterequal;
4342	} else if (Char == `'>'`) {
4343	char After = getCharAndSize(Ptr: CurPtr+SizeTmp, Size&: SizeTmp2);
4344	if (After == `'='`) {
4345	CurPtr = ConsumeChar(Ptr: ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result),
4346	Size: SizeTmp2, Tok&: Result);
4347	Kind = tok::greatergreaterequal;
4348	} else if (After == `'>'` && IsStartOfConflictMarker(CurPtr: CurPtr-`1`)) {
4349	// If this is actually a '>>>>' conflict marker, recognize it as such
4350	// and recover nicely.
4351	goto LexNextToken;
4352	} else if (After == `'>'` && HandleEndOfConflictMarker(CurPtr: CurPtr-`1`)) {
4353	// If this is '>>>>>>>' and we're in a conflict marker, ignore it.
4354	goto LexNextToken;
4355	} else if (LangOpts.CUDA && After == `'>'`) {
4356	Kind = tok::greatergreatergreater;
4357	CurPtr = ConsumeChar(Ptr: ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result),
4358	Size: SizeTmp2, Tok&: Result);
4359	} else {
4360	CurPtr = ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result);
4361	Kind = tok::greatergreater;
4362	}
4363	} else {
4364	Kind = tok::greater;
4365	}
4366	break;
4367	case `'^'`:
4368	Char = getCharAndSize(Ptr: CurPtr, Size&: SizeTmp);
4369	if (Char == `'='`) {
4370	CurPtr = ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result);
4371	Kind = tok::caretequal;
4372	} else if (LangOpts.Reflection && Char == `'^'`) {
4373	CurPtr = ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result);
4374	Kind = tok::caretcaret;
4375	} else {
4376	if (LangOpts.OpenCL && Char == `'^'`)
4377	Diag(Loc: CurPtr, DiagID: diag::err_opencl_logical_exclusive_or);
4378	Kind = tok::caret;
4379	}
4380	break;
4381	case `'\|'`:
4382	Char = getCharAndSize(Ptr: CurPtr, Size&: SizeTmp);
4383	if (Char == `'='`) {
4384	Kind = tok::pipeequal;
4385	CurPtr = ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result);
4386	} else if (Char == `'\|'`) {
4387	// If this is '\|\|\|\|\|\|\|' and we're in a conflict marker, ignore it.
4388	if (CurPtr[`1`] == `'\|'` && HandleEndOfConflictMarker(CurPtr: CurPtr-`1`))
4389	goto LexNextToken;
4390	Kind = tok::pipepipe;
4391	CurPtr = ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result);
4392	} else {
4393	Kind = tok::pipe;
4394	}
4395	break;
4396	case `':'`:
4397	Char = getCharAndSize(Ptr: CurPtr, Size&: SizeTmp);
4398	if (LangOpts.Digraphs && Char == `'>'`) {
4399	Kind = tok::r_square; // ':>' -> ']'
4400	CurPtr = ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result);
4401	} else if (Char == `':'`) {
4402	Kind = tok::coloncolon;
4403	CurPtr = ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result);
4404	} else {
4405	Kind = tok::colon;
4406	}
4407	break;
4408	case `';'`:
4409	Kind = tok::semi;
4410	break;
4411	case `'='`:
4412	Char = getCharAndSize(Ptr: CurPtr, Size&: SizeTmp);
4413	if (Char == `'='`) {
4414	// If this is '====' and we're in a conflict marker, ignore it.
4415	if (CurPtr[`1`] == `'='` && HandleEndOfConflictMarker(CurPtr: CurPtr-`1`))
4416	goto LexNextToken;
4417
4418	Kind = tok::equalequal;
4419	CurPtr = ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result);
4420	} else {
4421	Kind = tok::equal;
4422	}
4423	break;
4424	case `','`:
4425	Kind = tok::comma;
4426	break;
4427	case `'#'`:
4428	Char = getCharAndSize(Ptr: CurPtr, Size&: SizeTmp);
4429	if (Char == `'#'`) {
4430	Kind = tok::hashhash;
4431	CurPtr = ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result);
4432	} else if (Char == `'@'` && LangOpts.MicrosoftExt) { // #@ -> Charize
4433	Kind = tok::hashat;
4434	if (!isLexingRawMode())
4435	Diag(Loc: BufferPtr, DiagID: diag::ext_charize_microsoft);
4436	CurPtr = ConsumeChar(Ptr: CurPtr, Size: SizeTmp, Tok&: Result);
4437	} else {
4438	// We parsed a # character. If this occurs at the start of the line,
4439	// it's actually the start of a preprocessing directive. Callback to
4440	// the preprocessor to handle it.
4441	// TODO: -fpreprocessed mode??
4442	if (Result.isAtPhysicalStartOfLine() && !LexingRawMode && !Is_PragmaLexer)
4443	goto HandleDirective;
4444
4445	Kind = tok::hash;
4446	}
4447	break;
4448
4449	case `'@'`:
4450	// Objective C support.
4451	if (CurPtr[-`1`] == `'@'` && LangOpts.ObjC)
4452	Kind = tok::at;
4453	else
4454	Kind = tok::unknown;
4455	break;
4456
4457	// UCNs (C99 6.4.3, C++11 [lex.charset]p2)
4458	case `'\\'`:
4459	if (!LangOpts.AsmPreprocessor) {
4460	if (uint32_t CodePoint = tryReadUCN(StartPtr&: CurPtr, SlashLoc: BufferPtr, Result: &Result)) {
4461	if (CheckUnicodeWhitespace(Result, C: CodePoint, CurPtr)) {
4462	if (SkipWhitespace(Result, CurPtr))
4463	return true; // KeepWhitespaceMode
4464
4465	// We only saw whitespace, so just try again with this lexer.
4466	// (We manually eliminate the tail call to avoid recursion.)
4467	goto LexNextToken;
4468	}
4469
4470	return LexUnicodeIdentifierStart(Result, C: CodePoint, CurPtr);
4471	}
4472	}
4473
4474	Kind = tok::unknown;
4475	break;
4476
4477	default: {
4478	if (isASCII(c: Char)) {
4479	Kind = tok::unknown;
4480	break;
4481	}
4482
4483	llvm::UTF32 CodePoint;
4484
4485	// We can't just reset CurPtr to BufferPtr because BufferPtr may point to
4486	// an escaped newline.
4487	--CurPtr;
4488	llvm::ConversionResult Status =
4489	llvm::convertUTF8Sequence(source: (const llvm::UTF8 **)&CurPtr,
4490	sourceEnd: (const llvm::UTF8 *)BufferEnd,
4491	target: &CodePoint,
4492	flags: llvm::strictConversion);
4493	if (Status == llvm::conversionOK) {
4494	if (CheckUnicodeWhitespace(Result, C: CodePoint, CurPtr)) {
4495	if (SkipWhitespace(Result, CurPtr))
4496	return true; // KeepWhitespaceMode
4497
4498	// We only saw whitespace, so just try again with this lexer.
4499	// (We manually eliminate the tail call to avoid recursion.)
4500	goto LexNextToken;
4501	}
4502	return LexUnicodeIdentifierStart(Result, C: CodePoint, CurPtr);
4503	}
4504
4505	if (isLexingRawMode() \|\| ParsingPreprocessorDirective \|\|
4506	PP->isPreprocessedOutput()) {
4507	++CurPtr;
4508	Kind = tok::unknown;
4509	break;
4510	}
4511
4512	// Non-ASCII characters tend to creep into source code unintentionally.
4513	// Instead of letting the parser complain about the unknown token,
4514	// just diagnose the invalid UTF-8, then drop the character.
4515	Diag(Loc: CurPtr, DiagID: diag::err_invalid_utf8);
4516
4517	BufferPtr = CurPtr+`1`;
4518	// We're pretending the character didn't exist, so just try again with
4519	// this lexer.
4520	// (We manually eliminate the tail call to avoid recursion.)
4521	goto LexNextToken;
4522	}
4523	}
4524
4525	// Notify MIOpt that we read a non-whitespace/non-comment token.
4526	MIOpt.ReadToken();
4527
4528	// Update the location of token as well as BufferPtr.
4529	FormTokenWithChars(Result, TokEnd: CurPtr, Kind);
4530	return true;
4531
4532	HandleDirective:
4533
4534	// We parsed a # character and it's the start of a preprocessing directive.
4535	FormTokenWithChars(Result, TokEnd: CurPtr, Kind: tok::hash);
4536	PP->HandleDirective(Result);
4537
4538	if (PP->hadModuleLoaderFatalFailure())
4539	// With a fatal failure in the module loader, we abort parsing.
4540	return true;
4541
4542	// We parsed the directive; lex a token with the new state.
4543	return false;
4544
4545	LexNextToken:
4546	Result.clearFlag(Flag: Token::NeedsCleaning);
4547	goto LexStart;
4548	}
4549
4550	const char *Lexer::convertDependencyDirectiveToken(
4551	const dependency_directives_scan::Token &DDTok, Token &Result) {
4552	const char *TokPtr = BufferStart + DDTok.Offset;
4553	Result.startToken();
4554	Result.setLocation(getSourceLocation(Loc: TokPtr));
4555	Result.setKind(DDTok.Kind);
4556	Result.setFlag((Token::TokenFlags)DDTok.Flags);
4557	Result.setLength(DDTok.Length);
4558	if (Result.is(K: tok::raw_identifier))
4559	Result.setRawIdentifierData(TokPtr);
4560	else if (Result.isLiteral())
4561	Result.setLiteralData(TokPtr);
4562	BufferPtr = TokPtr + DDTok.Length;
4563	return TokPtr;
4564	}
4565
4566	bool Lexer::LexDependencyDirectiveToken(Token &Result) {
4567	assert(isDependencyDirectivesLexer());
4568
4569	using namespace dependency_directives_scan;
4570
4571	if (BufferPtr == BufferEnd)
4572	return LexEndOfFile(Result, CurPtr: BufferPtr);
4573
4574	while (NextDepDirectiveTokenIndex == DepDirectives.front().Tokens.size()) {
4575	if (DepDirectives.front().Kind == pp_eof)
4576	return LexEndOfFile(Result, CurPtr: BufferEnd);
4577	if (DepDirectives.front().Kind == tokens_present_before_eof)
4578	MIOpt.ReadToken();
4579	NextDepDirectiveTokenIndex = `0`;
4580	DepDirectives = DepDirectives.drop_front();
4581	}
4582
4583	const dependency_directives_scan::Token &DDTok =
4584	DepDirectives.front().Tokens [NextDepDirectiveTokenIndex++];
4585	if (NextDepDirectiveTokenIndex > `1` \|\| DDTok.Kind != tok::hash) {
4586	// Read something other than a preprocessor directive hash.
4587	MIOpt.ReadToken();
4588	}
4589
4590	if (ParsingFilename && DDTok.is(K: tok::less)) {
4591	BufferPtr = BufferStart + DDTok.Offset;
4592	LexAngledStringLiteral(Result, CurPtr: BufferPtr + `1`);
4593	if (Result.isNot(K: tok::header_name))
4594	return true;
4595	// Advance the index of lexed tokens.
4596	while (true) {
4597	const dependency_directives_scan::Token &NextTok =
4598	DepDirectives.front().Tokens [NextDepDirectiveTokenIndex];
4599	if (BufferStart + NextTok.Offset >= BufferPtr)
4600	break;
4601	++NextDepDirectiveTokenIndex;
4602	}
4603	return true;
4604	}
4605
4606	const char *TokPtr = convertDependencyDirectiveToken(DDTok, Result);
4607
4608	if (Result.is(K: tok::hash) && Result.isAtStartOfLine()) {
4609	PP->HandleDirective(Result);
4610	if (PP->hadModuleLoaderFatalFailure())
4611	// With a fatal failure in the module loader, we abort parsing.
4612	return true;
4613	return false;
4614	}
4615	if (Result.is(K: tok::raw_identifier)) {
4616	Result.setRawIdentifierData(TokPtr);
4617	if (!isLexingRawMode()) {
4618	const IdentifierInfo *II = PP->LookUpIdentifierInfo(Identifier&: Result);
4619	if (LangOpts.CPlusPlusModules && Result.isModuleContextualKeyword() &&
4620	PP->HandleModuleContextualKeyword(Result)) {
4621	PP->HandleDirective(Result);
4622	return false;
4623	}
4624	if (II->isHandleIdentifierCase())
4625	return PP->HandleIdentifier(Identifier&: Result);
4626	}
4627	return true;
4628	}
4629	if (Result.isLiteral())
4630	return true;
4631	if (Result.is(K: tok::colon)) {
4632	// Convert consecutive colons to 'tok::coloncolon'.
4633	if (*BufferPtr == `':'`) {
4634	assert(DepDirectives.front().Tokens[NextDepDirectiveTokenIndex].is(
4635	tok::colon));
4636	++NextDepDirectiveTokenIndex;
4637	Result.setKind(tok::coloncolon);
4638	}
4639	return true;
4640	}
4641	if (Result.is(K: tok::eod))
4642	ParsingPreprocessorDirective = false;
4643
4644	return true;
4645	}
4646
4647	bool Lexer::LexDependencyDirectiveTokenWhileSkipping(Token &Result) {
4648	assert(isDependencyDirectivesLexer());
4649
4650	using namespace dependency_directives_scan;
4651
4652	bool Stop = false;
4653	unsigned NestedIfs = `0`;
4654	do {
4655	DepDirectives = DepDirectives.drop_front();
4656	switch (DepDirectives.front().Kind) {
4657	case pp_none:
4658	llvm_unreachable("unexpected 'pp_none'");
4659	case pp_include:
4660	case pp___include_macros:
4661	case pp_define:
4662	case pp_undef:
4663	case pp_import:
4664	case pp_pragma_import:
4665	case pp_pragma_once:
4666	case pp_pragma_push_macro:
4667	case pp_pragma_pop_macro:
4668	case pp_pragma_include_alias:
4669	case pp_pragma_system_header:
4670	case pp_include_next:
4671	case decl_at_import:
4672	case cxx_module_decl:
4673	case cxx_import_decl:
4674	case cxx_export_module_decl:
4675	case cxx_export_import_decl:
4676	case tokens_present_before_eof:
4677	break;
4678	case pp_if:
4679	case pp_ifdef:
4680	case pp_ifndef:
4681	++NestedIfs;
4682	break;
4683	case pp_elif:
4684	case pp_elifdef:
4685	case pp_elifndef:
4686	case pp_else:
4687	if (!NestedIfs) {
4688	Stop = true;
4689	}
4690	break;
4691	case pp_endif:
4692	if (!NestedIfs) {
4693	Stop = true;
4694	} else {
4695	--NestedIfs;
4696	}
4697	break;
4698	case pp_eof:
4699	NextDepDirectiveTokenIndex = `0`;
4700	return LexEndOfFile(Result, CurPtr: BufferEnd);
4701	}
4702	} while (!Stop);
4703
4704	const dependency_directives_scan::Token &DDTok =
4705	DepDirectives.front().Tokens.front();
4706	assert(DDTok.is(tok::hash));
4707	NextDepDirectiveTokenIndex = `1`;
4708
4709	convertDependencyDirectiveToken(DDTok, Result);
4710	return false;
4711	}
4712

Browse the source code of llvm_projects/clang/lib/Lex/Lexer.cpp