LiteralSupport.cpp source code [llvm_projects/clang/lib/Lex/LiteralSupport.cpp]

1	//===--- LiteralSupport.cpp - Code to parse and process literals ----------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	// This file implements the NumericLiteralParser, CharLiteralParser, and
10	// StringLiteralParser interfaces.
11	//
12	//===----------------------------------------------------------------------===//
13
14	#include "clang/Lex/LiteralSupport.h"
15	#include "clang/Basic/CharInfo.h"
16	#include "clang/Basic/LangOptions.h"
17	#include "clang/Basic/SourceLocation.h"
18	#include "clang/Basic/TargetInfo.h"
19	#include "clang/Lex/LexDiagnostic.h"
20	#include "clang/Lex/Lexer.h"
21	#include "clang/Lex/Preprocessor.h"
22	#include "clang/Lex/Token.h"
23	#include "llvm/ADT/APInt.h"
24	#include "llvm/ADT/ScopeExit.h"
25	#include "llvm/ADT/SmallVector.h"
26	#include "llvm/ADT/StringExtras.h"
27	#include "llvm/ADT/StringSwitch.h"
28	#include "llvm/Support/ConvertUTF.h"
29	#include "llvm/Support/Error.h"
30	#include "llvm/Support/ErrorHandling.h"
31	#include "llvm/Support/Unicode.h"
32	#include <algorithm>
33	#include <cassert>
34	#include <cstddef>
35	#include <cstdint>
36	#include <cstring>
37	#include <string>
38
39	using namespace clang;
40
41	static unsigned getCharWidth(tok::TokenKind kind, const TargetInfo &Target) {
42	switch (kind) {
43	default: llvm_unreachable("Unknown token type!");
44	case tok::char_constant:
45	case tok::string_literal:
46	case tok::utf8_char_constant:
47	case tok::utf8_string_literal:
48	return Target.getCharWidth();
49	case tok::wide_char_constant:
50	case tok::wide_string_literal:
51	return Target.getWCharWidth();
52	case tok::utf16_char_constant:
53	case tok::utf16_string_literal:
54	return Target.getChar16Width();
55	case tok::utf32_char_constant:
56	case tok::utf32_string_literal:
57	return Target.getChar32Width();
58	}
59	}
60
61	static unsigned getEncodingPrefixLen(tok::TokenKind kind) {
62	switch (kind) {
63	default:
64	llvm_unreachable("Unknown token type!");
65	case tok::char_constant:
66	case tok::string_literal:
67	return `0`;
68	case tok::utf8_char_constant:
69	case tok::utf8_string_literal:
70	return `2`;
71	case tok::wide_char_constant:
72	case tok::wide_string_literal:
73	case tok::utf16_char_constant:
74	case tok::utf16_string_literal:
75	case tok::utf32_char_constant:
76	case tok::utf32_string_literal:
77	return `1`;
78	}
79	}
80
81	static CharSourceRange MakeCharSourceRange(const LangOptions &Features,
82	FullSourceLoc TokLoc,
83	const char *TokBegin,
84	const char *TokRangeBegin,
85	const char *TokRangeEnd) {
86	SourceLocation Begin =
87	Lexer::AdvanceToTokenCharacter(TokStart: TokLoc, Characters: TokRangeBegin - TokBegin,
88	SM: TokLoc.getManager(), LangOpts: Features);
89	SourceLocation End =
90	Lexer::AdvanceToTokenCharacter(TokStart: Begin, Characters: TokRangeEnd - TokRangeBegin,
91	SM: TokLoc.getManager(), LangOpts: Features);
92	return CharSourceRange::getCharRange(B: Begin, E: End);
93	}
94
95	/// Produce a diagnostic highlighting some portion of a literal.
96	///
97	/// Emits the diagnostic \p DiagID, highlighting the range of characters from
98	/// \p TokRangeBegin (inclusive) to \p TokRangeEnd (exclusive), which must be
99	/// a substring of a spelling buffer for the token beginning at \p TokBegin.
100	static DiagnosticBuilder Diag(DiagnosticsEngine *Diags,
101	const LangOptions &Features, FullSourceLoc TokLoc,
102	const char TokBegin, const* char *TokRangeBegin,
103	const char TokRangeEnd, unsigned* DiagID) {
104	SourceLocation Begin =
105	Lexer::AdvanceToTokenCharacter(TokStart: TokLoc, Characters: TokRangeBegin - TokBegin,
106	SM: TokLoc.getManager(), LangOpts: Features);
107	return Diags->Report(Loc: Begin, DiagID) <<
108	MakeCharSourceRange(Features, TokLoc, TokBegin, TokRangeBegin, TokRangeEnd);
109	}
110
/// Return true if \p Escape, the character following the backslash of an
/// escape sequence, names one of the simple escapes accepted in an
/// unevaluated string literal: \' \" \? \\ \a \b \f \n \r \t \v.
///
/// Every other character (including the introducers of numeric escapes such
/// as 'x' or an octal digit) yields false.
static bool IsEscapeValidInUnevaluatedStringLiteral(char Escape) {
  switch (Escape) {
  case '\'':
  case '"':
  case '?':
  case '\\':
  case 'a':
  case 'b':
  case 'f':
  case 'n':
  case 'r':
  case 't':
  case 'v':
    return true;
  }
  return false;
}
128
129	/// ProcessCharEscape - Parse a standard C escape sequence, which can occur in
130	/// either a character or a string literal.
131	static unsigned ProcessCharEscape(const char *ThisTokBegin,
132	const char *&ThisTokBuf,
133	const char ThisTokEnd, bool* &HadError,
134	FullSourceLoc Loc, unsigned CharWidth,
135	DiagnosticsEngine *Diags,
136	const LangOptions &Features,
137	StringLiteralEvalMethod EvalMethod) {
138	const char *EscapeBegin = ThisTokBuf;
139	bool Delimited = false;
140	bool EndDelimiterFound = false;
141
142	// Skip the '\' char.
143	++ThisTokBuf;
144
145	// We know that this character can't be off the end of the buffer, because
146	// that would have been \", which would not have been the end of string.
147	unsigned ResultChar = *ThisTokBuf++;
148	char Escape = ResultChar;
149	switch (ResultChar) {
150	// These map to themselves.
151	case `'\\'`: case `'\''`: case `'"'`: case `'?'`: break;
152
153	// These have fixed mappings.
154	case `'a'`:
155	// TODO: K&R: the meaning of '\\a' is different in traditional C
156	ResultChar = `7`;
157	break;
158	case `'b'`:
159	ResultChar = `8`;
160	break;
161	case `'e'`:
162	if (Diags)
163	Diag(Diags, Features, TokLoc: Loc, TokBegin: ThisTokBegin, TokRangeBegin: EscapeBegin, TokRangeEnd: ThisTokBuf,
164	DiagID: diag::ext_nonstandard_escape) << "e";
165	ResultChar = `27`;
166	break;
167	case `'E'`:
168	if (Diags)
169	Diag(Diags, Features, TokLoc: Loc, TokBegin: ThisTokBegin, TokRangeBegin: EscapeBegin, TokRangeEnd: ThisTokBuf,
170	DiagID: diag::ext_nonstandard_escape) << "E";
171	ResultChar = `27`;
172	break;
173	case `'f'`:
174	ResultChar = `12`;
175	break;
176	case `'n'`:
177	ResultChar = `10`;
178	break;
179	case `'r'`:
180	ResultChar = `13`;
181	break;
182	case `'t'`:
183	ResultChar = `9`;
184	break;
185	case `'v'`:
186	ResultChar = `11`;
187	break;
188	case `'x'`: { // Hex escape.
189	ResultChar = `0`;
190	if (ThisTokBuf != ThisTokEnd && *ThisTokBuf == `'{'`) {
191	Delimited = true;
192	ThisTokBuf++;
193	if (*ThisTokBuf == `'}'`) {
194	HadError = true;
195	if (Diags)
196	Diag(Diags, Features, TokLoc: Loc, TokBegin: ThisTokBegin, TokRangeBegin: EscapeBegin, TokRangeEnd: ThisTokBuf,
197	DiagID: diag::err_delimited_escape_empty);
198	}
199	} else if (ThisTokBuf == ThisTokEnd \|\| !isHexDigit(c: *ThisTokBuf)) {
200	if (Diags)
201	Diag(Diags, Features, TokLoc: Loc, TokBegin: ThisTokBegin, TokRangeBegin: EscapeBegin, TokRangeEnd: ThisTokBuf,
202	DiagID: diag::err_hex_escape_no_digits) << "x";
203	return ResultChar;
204	}
205
206	// Hex escapes are a maximal series of hex digits.
207	bool Overflow = false;
208	for (; ThisTokBuf != ThisTokEnd; ++ThisTokBuf) {
209	if (Delimited && *ThisTokBuf == `'}'`) {
210	ThisTokBuf++;
211	EndDelimiterFound = true;
212	break;
213	}
214	int CharVal = llvm::hexDigitValue(C: *ThisTokBuf);
215	if (CharVal == -`1`) {
216	// Non delimited hex escape sequences stop at the first non-hex digit.
217	if (!Delimited)
218	break;
219	HadError = true;
220	if (Diags)
221	Diag(Diags, Features, TokLoc: Loc, TokBegin: ThisTokBegin, TokRangeBegin: EscapeBegin, TokRangeEnd: ThisTokBuf,
222	DiagID: diag::err_delimited_escape_invalid)
223	<< StringRef(ThisTokBuf, `1`);
224	continue;
225	}
226	// About to shift out a digit?
227	if (ResultChar & `0xF0000000`)
228	Overflow = true;
229	ResultChar <<= `4`;
230	ResultChar \|= CharVal;
231	}
232	// See if any bits will be truncated when evaluated as a character.
233	if (CharWidth != `32` && (ResultChar >> CharWidth) != `0`) {
234	Overflow = true;
235	ResultChar &= ~`0U` >> (`32`-CharWidth);
236	}
237
238	// Check for overflow.
239	if (!HadError && Overflow) { // Too many digits to fit in
240	HadError = true;
241	if (Diags)
242	Diag(Diags, Features, TokLoc: Loc, TokBegin: ThisTokBegin, TokRangeBegin: EscapeBegin, TokRangeEnd: ThisTokBuf,
243	DiagID: diag::err_escape_too_large)
244	<< `0`;
245	}
246	break;
247	}
248	case `'0'`: case `'1'`: case `'2'`: case `'3'`:
249	case `'4'`: case `'5'`: case `'6'`: case `'7'`: {
250	// Octal escapes.
251	--ThisTokBuf;
252	ResultChar = `0`;
253
254	// Octal escapes are a series of octal digits with maximum length 3.
255	// "\0123" is a two digit sequence equal to "\012" "3".
256	unsigned NumDigits = `0`;
257	do {
258	ResultChar <<= `3`;
259	ResultChar \|= *ThisTokBuf++ - `'0'`;
260	++NumDigits;
261	} while (ThisTokBuf != ThisTokEnd && NumDigits < `3` &&
262	ThisTokBuf[`0`] >= `'0'` && ThisTokBuf[`0`] <= `'7'`);
263
264	// Check for overflow. Reject '\777', but not L'\777'.
265	if (CharWidth != `32` && (ResultChar >> CharWidth) != `0`) {
266	if (Diags)
267	Diag(Diags, Features, TokLoc: Loc, TokBegin: ThisTokBegin, TokRangeBegin: EscapeBegin, TokRangeEnd: ThisTokBuf,
268	DiagID: diag::err_escape_too_large) << `1`;
269	ResultChar &= ~`0U` >> (`32`-CharWidth);
270	}
271	break;
272	}
273	case `'o'`: {
274	bool Overflow = false;
275	if (ThisTokBuf == ThisTokEnd \|\| *ThisTokBuf != `'{'`) {
276	HadError = true;
277	if (Diags)
278	Diag(Diags, Features, TokLoc: Loc, TokBegin: ThisTokBegin, TokRangeBegin: EscapeBegin, TokRangeEnd: ThisTokBuf,
279	DiagID: diag::err_delimited_escape_missing_brace)
280	<< "o";
281
282	break;
283	}
284	ResultChar = `0`;
285	Delimited = true;
286	++ThisTokBuf;
287	if (*ThisTokBuf == `'}'`) {
288	HadError = true;
289	if (Diags)
290	Diag(Diags, Features, TokLoc: Loc, TokBegin: ThisTokBegin, TokRangeBegin: EscapeBegin, TokRangeEnd: ThisTokBuf,
291	DiagID: diag::err_delimited_escape_empty);
292	}
293
294	while (ThisTokBuf != ThisTokEnd) {
295	if (*ThisTokBuf == `'}'`) {
296	EndDelimiterFound = true;
297	ThisTokBuf++;
298	break;
299	}
300	if (ThisTokBuf < `'0'` \|\| ThisTokBuf > `'7'`) {
301	HadError = true;
302	if (Diags)
303	Diag(Diags, Features, TokLoc: Loc, TokBegin: ThisTokBegin, TokRangeBegin: EscapeBegin, TokRangeEnd: ThisTokBuf,
304	DiagID: diag::err_delimited_escape_invalid)
305	<< StringRef(ThisTokBuf, `1`);
306	ThisTokBuf++;
307	continue;
308	}
309	// Check if one of the top three bits is set before shifting them out.
310	if (ResultChar & `0xE0000000`)
311	Overflow = true;
312
313	ResultChar <<= `3`;
314	ResultChar \|= *ThisTokBuf++ - `'0'`;
315	}
316	// Check for overflow. Reject '\777', but not L'\777'.
317	if (!HadError &&
318	(Overflow \|\| (CharWidth != `32` && (ResultChar >> CharWidth) != `0`))) {
319	HadError = true;
320	if (Diags)
321	Diag(Diags, Features, TokLoc: Loc, TokBegin: ThisTokBegin, TokRangeBegin: EscapeBegin, TokRangeEnd: ThisTokBuf,
322	DiagID: diag::err_escape_too_large)
323	<< `1`;
324	ResultChar &= ~`0U` >> (`32` - CharWidth);
325	}
326	break;
327	}
328	// Otherwise, these are not valid escapes.
329	case `'('`: case `'{'`: case `'['`: case `'%'`:
330	// GCC accepts these as extensions. We warn about them as such though.
331	if (Diags)
332	Diag(Diags, Features, TokLoc: Loc, TokBegin: ThisTokBegin, TokRangeBegin: EscapeBegin, TokRangeEnd: ThisTokBuf,
333	DiagID: diag::ext_nonstandard_escape)
334	<< std::string (`1`, ResultChar);
335	break;
336	default:
337	if (!Diags)
338	break;
339
340	if (isPrintable(c: ResultChar))
341	Diag(Diags, Features, TokLoc: Loc, TokBegin: ThisTokBegin, TokRangeBegin: EscapeBegin, TokRangeEnd: ThisTokBuf,
342	DiagID: diag::ext_unknown_escape)
343	<< std::string (`1`, ResultChar);
344	else
345	Diag(Diags, Features, TokLoc: Loc, TokBegin: ThisTokBegin, TokRangeBegin: EscapeBegin, TokRangeEnd: ThisTokBuf,
346	DiagID: diag::ext_unknown_escape)
347	<< "x" + llvm::utohexstr(X: ResultChar);
348	break;
349	}
350
351	if (Delimited && Diags) {
352	if (!EndDelimiterFound)
353	Diag(Diags, Features, TokLoc: Loc, TokBegin: ThisTokBegin, TokRangeBegin: EscapeBegin, TokRangeEnd: ThisTokBuf,
354	DiagID: diag::err_expected)
355	<< tok::r_brace;
356	else if (!HadError) {
357	Lexer::DiagnoseDelimitedOrNamedEscapeSequence(Loc, Named: false, Opts: Features,
358	Diags&: *Diags);
359	}
360	}
361
362	if (EvalMethod == StringLiteralEvalMethod::Unevaluated &&
363	!IsEscapeValidInUnevaluatedStringLiteral(Escape)) {
364	Diag(Diags, Features, TokLoc: Loc, TokBegin: ThisTokBegin, TokRangeBegin: EscapeBegin, TokRangeEnd: ThisTokBuf,
365	DiagID: diag::err_unevaluated_string_invalid_escape_sequence)
366	<< StringRef(EscapeBegin, ThisTokBuf - EscapeBegin);
367	HadError = true;
368	}
369
370	return ResultChar;
371	}
372
373	static void appendCodePoint(unsigned Codepoint,
374	llvm::SmallVectorImpl<char> &Str) {
375	char ResultBuf[`4`];
376	char *ResultPtr = ResultBuf;
377	if (llvm::ConvertCodePointToUTF8(Source: Codepoint, ResultPtr))
378	Str.append(in_start: ResultBuf, in_end: ResultPtr);
379	}
380
381	void clang::expandUCNs(SmallVectorImpl<char> &Buf, StringRef Input) {
382	for (StringRef::iterator I = Input.begin(), E = Input.end(); I != E; ++I) {
383	if (*I != `'\\'`) {
384	Buf.push_back(Elt: *I);
385	continue;
386	}
387
388	++I;
389	char Kind = *I;
390	++I;
391
392	assert(Kind == `'u'` \|\| Kind == `'U'` \|\| Kind == `'N'`);
393	uint32_t CodePoint = `0`;
394
395	if (Kind == `'u'` && *I == `'{'`) {
396	for (++I; *I != `'}'`; ++I) {
397	unsigned Value = llvm::hexDigitValue(C: *I);
398	assert(Value != -`1U`);
399	CodePoint <<= `4`;
400	CodePoint += Value;
401	}
402	appendCodePoint(Codepoint: CodePoint, Str&: Buf);
403	continue;
404	}
405
406	if (Kind == `'N'`) {
407	assert(*I == `'{'`);
408	++I;
409	auto Delim = std::find(first: I, last: Input.end(), val: `'}'`);
410	assert(Delim != Input.end());
411	StringRef Name(I, std::distance(first: I, last: Delim));
412	std::optional<llvm::sys::unicode::LooseMatchingResult> Res =
413	llvm::sys::unicode::nameToCodepointLooseMatching(Name);
414	assert(Res && "could not find a codepoint that was previously found");
415	CodePoint = Res ->CodePoint;
416	assert(CodePoint != `0xFFFFFFFF`);
417	appendCodePoint(Codepoint: CodePoint, Str&: Buf);
418	I = Delim;
419	continue;
420	}
421
422	unsigned NumHexDigits;
423	if (Kind == `'u'`)
424	NumHexDigits = `4`;
425	else
426	NumHexDigits = `8`;
427
428	assert(I + NumHexDigits <= E);
429
430	for (; NumHexDigits != `0`; ++I, --NumHexDigits) {
431	unsigned Value = llvm::hexDigitValue(C: *I);
432	assert(Value != -`1U`);
433
434	CodePoint <<= `4`;
435	CodePoint += Value;
436	}
437
438	appendCodePoint(Codepoint: CodePoint, Str&: Buf);
439	--I;
440	}
441	}
442
443	bool clang::isFunctionLocalStringLiteralMacro(tok::TokenKind K,
444	const LangOptions &LO) {
445	return LO.MicrosoftExt &&
446	(K == tok::kw___FUNCTION__ \|\| K == tok::kw_L__FUNCTION__ \|\|
447	K == tok::kw___FUNCSIG__ \|\| K == tok::kw_L__FUNCSIG__ \|\|
448	K == tok::kw___FUNCDNAME__);
449	}
450
451	bool clang::tokenIsLikeStringLiteral(const Token &Tok, const LangOptions &LO) {
452	return tok::isStringLiteral(K: Tok.getKind()) \|\|
453	isFunctionLocalStringLiteralMacro(K: Tok.getKind(), LO);
454	}
455
456	static bool ProcessNumericUCNEscape(const char *ThisTokBegin,
457	const char *&ThisTokBuf,
458	const char *ThisTokEnd, uint32_t &UcnVal,
459	unsigned short &UcnLen, bool &Delimited,
460	FullSourceLoc Loc, DiagnosticsEngine *Diags,
461	const LangOptions &Features,
462	bool in_char_string_literal = false) {
463	const char *UcnBegin = ThisTokBuf;
464	bool HasError = false;
465	bool EndDelimiterFound = false;
466
467	// Skip the '\u' char's.
468	ThisTokBuf += `2`;
469	Delimited = false;
470	if (UcnBegin[`1`] == `'u'` && in_char_string_literal &&
471	ThisTokBuf != ThisTokEnd && *ThisTokBuf == `'{'`) {
472	Delimited = true;
473	ThisTokBuf++;
474	} else if (ThisTokBuf == ThisTokEnd \|\| !isHexDigit(c: *ThisTokBuf)) {
475	if (Diags)
476	Diag(Diags, Features, TokLoc: Loc, TokBegin: ThisTokBegin, TokRangeBegin: UcnBegin, TokRangeEnd: ThisTokBuf,
477	DiagID: diag::err_hex_escape_no_digits)
478	<< StringRef(&ThisTokBuf[-`1`], `1`);
479	return false;
480	}
481	UcnLen = (ThisTokBuf[-`1`] == `'u'` ? `4` : `8`);
482
483	bool Overflow = false;
484	unsigned short Count = `0`;
485	for (; ThisTokBuf != ThisTokEnd && (Delimited \|\| Count != UcnLen);
486	++ThisTokBuf) {
487	if (Delimited && *ThisTokBuf == `'}'`) {
488	++ThisTokBuf;
489	EndDelimiterFound = true;
490	break;
491	}
492	int CharVal = llvm::hexDigitValue(C: *ThisTokBuf);
493	if (CharVal == -`1`) {
494	HasError = true;
495	if (!Delimited)
496	break;
497	if (Diags) {
498	Diag(Diags, Features, TokLoc: Loc, TokBegin: ThisTokBegin, TokRangeBegin: UcnBegin, TokRangeEnd: ThisTokBuf,
499	DiagID: diag::err_delimited_escape_invalid)
500	<< StringRef(ThisTokBuf, `1`);
501	}
502	Count++;
503	continue;
504	}
505	if (UcnVal & `0xF0000000`) {
506	Overflow = true;
507	continue;
508	}
509	UcnVal <<= `4`;
510	UcnVal \|= CharVal;
511	Count++;
512	}
513
514	if (Overflow) {
515	if (Diags)
516	Diag(Diags, Features, TokLoc: Loc, TokBegin: ThisTokBegin, TokRangeBegin: UcnBegin, TokRangeEnd: ThisTokBuf,
517	DiagID: diag::err_escape_too_large)
518	<< `0`;
519	return false;
520	}
521
522	if (Delimited && !EndDelimiterFound) {
523	if (Diags) {
524	Diag(Diags, Features, TokLoc: Loc, TokBegin: ThisTokBegin, TokRangeBegin: UcnBegin, TokRangeEnd: ThisTokBuf,
525	DiagID: diag::err_expected)
526	<< tok::r_brace;
527	}
528	return false;
529	}
530
531	// If we didn't consume the proper number of digits, there is a problem.
532	if (Count == `0` \|\| (!Delimited && Count != UcnLen)) {
533	if (Diags)
534	Diag(Diags, Features, TokLoc: Loc, TokBegin: ThisTokBegin, TokRangeBegin: UcnBegin, TokRangeEnd: ThisTokBuf,
535	DiagID: Delimited ? diag::err_delimited_escape_empty
536	: diag::err_ucn_escape_incomplete);
537	return false;
538	}
539	return !HasError;
540	}
541
542	static void DiagnoseInvalidUnicodeCharacterName(
543	DiagnosticsEngine Diags, const* LangOptions &Features, FullSourceLoc Loc,
544	const char TokBegin, const* char TokRangeBegin, const* char *TokRangeEnd,
545	llvm::StringRef Name) {
546
547	Diag(Diags, Features, TokLoc: Loc, TokBegin, TokRangeBegin, TokRangeEnd,
548	DiagID: diag::err_invalid_ucn_name)
549	<< Name;
550
551	namespace u = llvm::sys::unicode;
552
553	std::optional<u::LooseMatchingResult> Res =
554	u::nameToCodepointLooseMatching(Name);
555	if (Res) {
556	Diag(Diags, Features, TokLoc: Loc, TokBegin, TokRangeBegin, TokRangeEnd,
557	DiagID: diag::note_invalid_ucn_name_loose_matching)
558	<< FixItHint::CreateReplacement(
559	RemoveRange: MakeCharSourceRange(Features, TokLoc: Loc, TokBegin, TokRangeBegin,
560	TokRangeEnd),
561	Code: Res ->Name);
562	return;
563	}
564
565	unsigned Distance = `0`;
566	SmallVector<u::MatchForCodepointName> Matches =
567	u::nearestMatchesForCodepointName(Pattern: Name, MaxMatchesCount: `5`);
568	assert(!Matches.empty() && "No unicode characters found");
569
570	for (const auto &Match : Matches) {
571	if (Distance == `0`)
572	Distance = Match.Distance;
573	if (std::max(a: Distance, b: Match.Distance) -
574	std::min(a: Distance, b: Match.Distance) >
575	`3`)
576	break;
577	Distance = Match.Distance;
578
579	std::string Str;
580	llvm::UTF32 V = Match.Value;
581	bool Converted =
582	llvm::convertUTF32ToUTF8String(Src: llvm::ArrayRef<llvm::UTF32>(&V, `1`), Out&: Str);
583	(void)Converted;
584	assert(Converted && "Found a match wich is not a unicode character");
585
586	Diag(Diags, Features, TokLoc: Loc, TokBegin, TokRangeBegin, TokRangeEnd,
587	DiagID: diag::note_invalid_ucn_name_candidate)
588	<< Match.Name << llvm::utohexstr(X: Match.Value)
589	<< Str // FIXME: Fix the rendering of non printable characters
590	<< FixItHint::CreateReplacement(
591	RemoveRange: MakeCharSourceRange(Features, TokLoc: Loc, TokBegin, TokRangeBegin,
592	TokRangeEnd),
593	Code: Match.Name);
594	}
595	}
596
597	static bool ProcessNamedUCNEscape(const char *ThisTokBegin,
598	const char *&ThisTokBuf,
599	const char *ThisTokEnd, uint32_t &UcnVal,
600	unsigned short &UcnLen, FullSourceLoc Loc,
601	DiagnosticsEngine *Diags,
602	const LangOptions &Features) {
603	const char *UcnBegin = ThisTokBuf;
604	assert(UcnBegin[`0`] == `'\\'` && UcnBegin[`1`] == `'N'`);
605	ThisTokBuf += `2`;
606	if (ThisTokBuf == ThisTokEnd \|\| *ThisTokBuf != `'{'`) {
607	if (Diags) {
608	Diag(Diags, Features, TokLoc: Loc, TokBegin: ThisTokBegin, TokRangeBegin: UcnBegin, TokRangeEnd: ThisTokBuf,
609	DiagID: diag::err_delimited_escape_missing_brace)
610	<< StringRef(&ThisTokBuf[-`1`], `1`);
611	}
612	return false;
613	}
614	ThisTokBuf++;
615	const char ClosingBrace = std::find_if(first: ThisTokBuf, last: ThisTokEnd, pred: [](char* C) {
616	return C == `'}'` \|\| isVerticalWhitespace(c: C);
617	});
618	bool Incomplete = ClosingBrace == ThisTokEnd;
619	bool Empty = ClosingBrace == ThisTokBuf;
620	if (Incomplete \|\| Empty) {
621	if (Diags) {
622	Diag(Diags, Features, TokLoc: Loc, TokBegin: ThisTokBegin, TokRangeBegin: UcnBegin, TokRangeEnd: ThisTokBuf,
623	DiagID: Incomplete ? diag::err_ucn_escape_incomplete
624	: diag::err_delimited_escape_empty)
625	<< StringRef(&UcnBegin[`1`], `1`);
626	}
627	ThisTokBuf = ClosingBrace == ThisTokEnd ? ClosingBrace : ClosingBrace + `1`;
628	return false;
629	}
630	StringRef Name(ThisTokBuf, ClosingBrace - ThisTokBuf);
631	ThisTokBuf = ClosingBrace + `1`;
632	std::optional<char32_t> Res = llvm::sys::unicode::nameToCodepointStrict(Name);
633	if (!Res) {
634	if (Diags)
635	DiagnoseInvalidUnicodeCharacterName(Diags, Features, Loc, TokBegin: ThisTokBegin,
636	TokRangeBegin: &UcnBegin[`3`], TokRangeEnd: ClosingBrace, Name);
637	return false;
638	}
639	UcnVal = *Res;
640	UcnLen = UcnVal > `0xFFFF` ? `8` : `4`;
641	return true;
642	}
643
644	/// ProcessUCNEscape - Read the Universal Character Name, check constraints and
645	/// return the UTF32.
646	static bool ProcessUCNEscape(const char ThisTokBegin, const* char *&ThisTokBuf,
647	const char *ThisTokEnd, uint32_t &UcnVal,
648	unsigned short &UcnLen, FullSourceLoc Loc,
649	DiagnosticsEngine *Diags,
650	const LangOptions &Features,
651	bool in_char_string_literal = false) {
652
653	bool HasError;
654	const char *UcnBegin = ThisTokBuf;
655	bool IsDelimitedEscapeSequence = false;
656	bool IsNamedEscapeSequence = false;
657	if (ThisTokBuf[`1`] == `'N'`) {
658	IsNamedEscapeSequence = true;
659	HasError = !ProcessNamedUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd,
660	UcnVal, UcnLen, Loc, Diags, Features);
661	} else {
662	HasError =
663	!ProcessNumericUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal,
664	UcnLen, Delimited&: IsDelimitedEscapeSequence, Loc, Diags,
665	Features, in_char_string_literal);
666	}
667	if (HasError)
668	return false;
669
670	// Check UCN constraints (C99 6.4.3p2) [C++11 lex.charset p2]
671	if ((`0xD800` <= UcnVal && UcnVal <= `0xDFFF`) \|\| // surrogate codepoints
672	UcnVal > `0x10FFFF`) { // maximum legal UTF32 value
673	if (Diags)
674	Diag(Diags, Features, TokLoc: Loc, TokBegin: ThisTokBegin, TokRangeBegin: UcnBegin, TokRangeEnd: ThisTokBuf,
675	DiagID: diag::err_ucn_escape_invalid);
676	return false;
677	}
678
679	// C23 and C++11 allow UCNs that refer to control characters
680	// and basic source characters inside character and string literals
681	if (UcnVal < `0xa0` &&
682	// $, @, ` are allowed in all language modes
683	(UcnVal != `0x24` && UcnVal != `0x40` && UcnVal != `0x60`)) {
684	bool IsError =
685	(!(Features.CPlusPlus11 \|\| Features.C23) \|\| !in_char_string_literal);
686	if (Diags) {
687	char BasicSCSChar = UcnVal;
688	if (UcnVal >= `0x20` && UcnVal < `0x7f`)
689	Diag(Diags, Features, TokLoc: Loc, TokBegin: ThisTokBegin, TokRangeBegin: UcnBegin, TokRangeEnd: ThisTokBuf,
690	DiagID: IsError ? diag::err_ucn_escape_basic_scs
691	: Features.CPlusPlus
692	? diag::warn_cxx98_compat_literal_ucn_escape_basic_scs
693	: diag::warn_c23_compat_literal_ucn_escape_basic_scs)
694	<< StringRef(&BasicSCSChar, `1`);
695	else
696	Diag(Diags, Features, TokLoc: Loc, TokBegin: ThisTokBegin, TokRangeBegin: UcnBegin, TokRangeEnd: ThisTokBuf,
697	DiagID: IsError ? diag::err_ucn_control_character
698	: Features.CPlusPlus
699	? diag::warn_cxx98_compat_literal_ucn_control_character
700	: diag::warn_c23_compat_literal_ucn_control_character);
701	}
702	if (IsError)
703	return false;
704	}
705
706	if (!Features.CPlusPlus && !Features.C99 && Diags)
707	Diag(Diags, Features, TokLoc: Loc, TokBegin: ThisTokBegin, TokRangeBegin: UcnBegin, TokRangeEnd: ThisTokBuf,
708	DiagID: diag::warn_ucn_not_valid_in_c89_literal);
709
710	if ((IsDelimitedEscapeSequence \|\| IsNamedEscapeSequence) && Diags)
711	Lexer::DiagnoseDelimitedOrNamedEscapeSequence(Loc, Named: IsNamedEscapeSequence,
712	Opts: Features, Diags&: *Diags);
713	return true;
714	}
715
716	/// MeasureUCNEscape - Determine the number of bytes within the resulting string
717	/// which this UCN will occupy.
718	static int MeasureUCNEscape(const char ThisTokBegin, const* char *&ThisTokBuf,
719	const char ThisTokEnd, unsigned* CharByteWidth,
720	const LangOptions &Features, bool &HadError) {
721	// UTF-32: 4 bytes per escape.
722	if (CharByteWidth == `4`)
723	return `4`;
724
725	uint32_t UcnVal = `0`;
726	unsigned short UcnLen = `0`;
727	FullSourceLoc Loc;
728
729	if (!ProcessUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal,
730	UcnLen, Loc, Diags: nullptr, Features, in_char_string_literal: true)) {
731	HadError = true;
732	return `0`;
733	}
734
735	// UTF-16: 2 bytes for BMP, 4 bytes otherwise.
736	if (CharByteWidth == `2`)
737	return UcnVal <= `0xFFFF` ? `2` : `4`;
738
739	// UTF-8.
740	if (UcnVal < `0x80`)
741	return `1`;
742	if (UcnVal < `0x800`)
743	return `2`;
744	if (UcnVal < `0x10000`)
745	return `3`;
746	return `4`;
747	}
748
749	/// EncodeUCNEscape - Read the Universal Character Name, check constraints and
750	/// convert the UTF32 to UTF8 or UTF16. This is a subroutine of
751	/// StringLiteralParser. When we decide to implement UCN's for identifiers,
752	/// we will likely rework our support for UCN's.
753	static void EncodeUCNEscape(const char ThisTokBegin, const* char *&ThisTokBuf,
754	const char *ThisTokEnd,
755	char &ResultBuf, bool* &HadError,
756	FullSourceLoc Loc, unsigned CharByteWidth,
757	DiagnosticsEngine *Diags,
758	const LangOptions &Features) {
759	typedef uint32_t UTF32;
760	UTF32 UcnVal = `0`;
761	unsigned short UcnLen = `0`;
762	if (!ProcessUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal, UcnLen,
763	Loc, Diags, Features, in_char_string_literal: true)) {
764	HadError = true;
765	return;
766	}
767
768	assert((CharByteWidth == `1` \|\| CharByteWidth == `2` \|\| CharByteWidth == `4`) &&
769	"only character widths of 1, 2, or 4 bytes supported");
770
771	(void)UcnLen;
772	assert((UcnLen== `4` \|\| UcnLen== `8`) && "only ucn length of 4 or 8 supported");
773
774	if (CharByteWidth == `4`) {
775	// FIXME: Make the type of the result buffer correct instead of
776	// using reinterpret_cast.
777	llvm::UTF32 ResultPtr = reinterpret_cast<llvm::UTF32>(ResultBuf);
778	*ResultPtr = UcnVal;
779	ResultBuf += `4`;
780	return;
781	}
782
783	if (CharByteWidth == `2`) {
784	// FIXME: Make the type of the result buffer correct instead of
785	// using reinterpret_cast.
786	llvm::UTF16 ResultPtr = reinterpret_cast<llvm::UTF16>(ResultBuf);
787
788	if (UcnVal <= (UTF32)`0xFFFF`) {
789	*ResultPtr = UcnVal;
790	ResultBuf += `2`;
791	return;
792	}
793
794	// Convert to UTF16.
795	UcnVal -= `0x10000`;
796	*ResultPtr = `0xD800` + (UcnVal >> `10`);
797	*(ResultPtr+`1`) = `0xDC00` + (UcnVal & `0x3FF`);
798	ResultBuf += `4`;
799	return;
800	}
801
802	assert(CharByteWidth == `1` && "UTF-8 encoding is only for 1 byte characters");
803
804	// Now that we've parsed/checked the UCN, we convert from UTF32->UTF8.
805	// The conversion below was inspired by:
806	// http://www.unicode.org/Public/PROGRAMS/CVTUTF/ConvertUTF.c
807	// First, we determine how many bytes the result will require.
808	typedef uint8_t UTF8;
809
810	unsigned short bytesToWrite = `0`;
811	if (UcnVal < (UTF32)`0x80`)
812	bytesToWrite = `1`;
813	else if (UcnVal < (UTF32)`0x800`)
814	bytesToWrite = `2`;
815	else if (UcnVal < (UTF32)`0x10000`)
816	bytesToWrite = `3`;
817	else
818	bytesToWrite = `4`;
819
820	const unsigned byteMask = `0xBF`;
821	const unsigned byteMark = `0x80`;
822
823	// Once the bits are split out into bytes of UTF8, this is a mask OR-ed
824	// into the first byte, depending on how many bytes follow.
825	static const UTF8 firstByteMark[`5`] = {
826	`0x00`, `0x00`, `0xC0`, `0xE0`, `0xF0`
827	};
828	// Finally, we write the bytes into ResultBuf.
829	ResultBuf += bytesToWrite;
830	switch (bytesToWrite) { // note: everything falls through.
831	case `4`:
832	*--ResultBuf = (UTF8)((UcnVal \| byteMark) & byteMask); UcnVal >>= `6`;
833	[[fallthrough]];
834	case `3`:
835	*--ResultBuf = (UTF8)((UcnVal \| byteMark) & byteMask); UcnVal >>= `6`;
836	[[fallthrough]];
837	case `2`:
838	*--ResultBuf = (UTF8)((UcnVal \| byteMark) & byteMask); UcnVal >>= `6`;
839	[[fallthrough]];
840	case `1`:
841	*--ResultBuf = (UTF8) (UcnVal \| firstByteMark[bytesToWrite]);
842	}
843	// Update the buffer.
844	ResultBuf += bytesToWrite;
845	}
846
847	/// integer-constant: [C99 6.4.4.1]
848	/// decimal-constant integer-suffix
849	/// octal-constant integer-suffix
850	/// hexadecimal-constant integer-suffix
851	/// binary-literal integer-suffix [GNU, C++1y]
852	/// user-defined-integer-literal: [C++11 lex.ext]
853	/// decimal-literal ud-suffix
854	/// octal-literal ud-suffix
855	/// hexadecimal-literal ud-suffix
856	/// binary-literal ud-suffix [GNU, C++1y]
857	/// decimal-constant:
858	/// nonzero-digit
859	/// decimal-constant digit
860	/// octal-constant:
861	/// 0
862	/// octal-constant octal-digit
863	/// hexadecimal-constant:
864	/// hexadecimal-prefix hexadecimal-digit
865	/// hexadecimal-constant hexadecimal-digit
866	/// hexadecimal-prefix: one of
867	/// 0x 0X
868	/// binary-literal:
869	/// 0b binary-digit
870	/// 0B binary-digit
871	/// binary-literal binary-digit
872	/// integer-suffix:
873	/// unsigned-suffix [long-suffix]
874	/// unsigned-suffix [long-long-suffix]
875	/// long-suffix [unsigned-suffix]
876	/// long-long-suffix [unsigned-suffix]
877	/// nonzero-digit:
878	/// 1 2 3 4 5 6 7 8 9
879	/// octal-digit:
880	/// 0 1 2 3 4 5 6 7
881	/// hexadecimal-digit:
882	/// 0 1 2 3 4 5 6 7 8 9
883	/// a b c d e f
884	/// A B C D E F
885	/// binary-digit:
886	/// 0
887	/// 1
888	/// unsigned-suffix: one of
889	/// u U
890	/// long-suffix: one of
891	/// l L
892	/// long-long-suffix: one of
893	/// ll LL
894	///
895	/// floating-constant: [C99 6.4.4.2]
896	/// TODO: add rules...
897	///
898	NumericLiteralParser::NumericLiteralParser(StringRef TokSpelling,
899	SourceLocation TokLoc,
900	const SourceManager &SM,
901	const LangOptions &LangOpts,
902	const TargetInfo &Target,
903	DiagnosticsEngine &Diags)
904	: SM(SM), LangOpts(LangOpts), Diags(Diags),
905	ThisTokBegin(TokSpelling.begin()), ThisTokEnd(TokSpelling.end()) {
906
907	s = DigitsBegin = ThisTokBegin;
908	saw_exponent = false;
909	saw_period = false;
910	saw_ud_suffix = false;
911	saw_fixed_point_suffix = false;
912	isLong = false;
913	isUnsigned = false;
914	isLongLong = false;
915	isSizeT = false;
916	isHalf = false;
917	isFloat = false;
918	isImaginary = false;
919	isFloat16 = false;
920	isFloat128 = false;
921	MicrosoftInteger = `0`;
922	isFract = false;
923	isAccum = false;
924	hadError = false;
925	isBitInt = false;
926
927	// This routine assumes that the range begin/end matches the regex for integer
928	// and FP constants (specifically, the 'pp-number' regex), and assumes that
929	// the byte at "end" is both valid and not part of the regex. Because of*
930	// this, it doesn't have to check for 'overscan' in various places.
931	// Note: For HLSL, the end token is allowed to be '.' which would be in the
932	// 'pp-number' regex. This is required to support vector swizzles on numeric
933	// constants (i.e. 1.xx or 1.5f.rrr).
934	if (isPreprocessingNumberBody(c: *ThisTokEnd) &&
935	!(LangOpts.HLSL && *ThisTokEnd == `'.'`)) {
936	Diags.Report(Loc: TokLoc, DiagID: diag::err_lexing_numeric);
937	hadError = true;
938	return;
939	}
940
941	if (s == `'0'`) { // parse radix*
942	ParseNumberStartingWithZero(TokLoc);
943	if (hadError)
944	return;
945	} else { // the first digit is non-zero
946	radix = `10`;
947	s = SkipDigits(ptr: s);
948	if (s == ThisTokEnd) {
949	// Done.
950	} else {
951	ParseDecimalOrOctalCommon(TokLoc);
952	if (hadError)
953	return;
954	}
955	}
956
957	SuffixBegin = s;
958	checkSeparator(TokLoc, Pos: s, IsAfterDigits: CSK_AfterDigits);
959
960	// Initial scan to lookahead for fixed point suffix.
961	if (LangOpts.FixedPoint) {
962	for (const char *c = s; c != ThisTokEnd; ++c) {
963	if (c == `'r'` \|\| c == `'k'` \|\| c == `'R'` \|\| c == `'K'`) {
964	saw_fixed_point_suffix = true;
965	break;
966	}
967	}
968	}
969
970	// Parse the suffix. At this point we can classify whether we have an FP or
971	// integer constant.
972	bool isFixedPointConstant = isFixedPointLiteral();
973	bool isFPConstant = isFloatingLiteral();
974	bool HasSize = false;
975	bool DoubleUnderscore = false;
976
977	// Loop over all of the characters of the suffix. If we see something bad,
978	// we break out of the loop.
979	for (; s != ThisTokEnd; ++s) {
980	switch (*s) {
981	case `'R'`:
982	case `'r'`:
983	if (!LangOpts.FixedPoint)
984	break;
985	if (isFract \|\| isAccum) break;
986	if (!(saw_period \|\| saw_exponent)) break;
987	isFract = true;
988	continue;
989	case `'K'`:
990	case `'k'`:
991	if (!LangOpts.FixedPoint)
992	break;
993	if (isFract \|\| isAccum) break;
994	if (!(saw_period \|\| saw_exponent)) break;
995	isAccum = true;
996	continue;
997	case `'h'`: // FP Suffix for "half".
998	case `'H'`:
999	// OpenCL Extension v1.2 s9.5 - h or H suffix for half type.
1000	if (!(LangOpts.Half \|\| LangOpts.FixedPoint))
1001	break;
1002	if (isIntegerLiteral()) break; // Error for integer constant.
1003	if (HasSize)
1004	break;
1005	HasSize = true;
1006	isHalf = true;
1007	continue; // Success.
1008	case `'f'`: // FP Suffix for "float"
1009	case `'F'`:
1010	if (!isFPConstant) break; // Error for integer constant.
1011	if (HasSize)
1012	break;
1013	HasSize = true;
1014
1015	// CUDA host and device may have different _Float16 support, therefore
1016	// allows f16 literals to avoid false alarm.
1017	// When we compile for OpenMP target offloading on NVPTX, f16 suffix
1018	// should also be supported.
1019	// ToDo: more precise check for CUDA.
1020	// TODO: AMDGPU might also support it in the future.
1021	if ((Target.hasFloat16Type() \|\| LangOpts.CUDA \|\|
1022	(LangOpts.OpenMPIsTargetDevice && Target.getTriple().isNVPTX())) &&
1023	s + `2` < ThisTokEnd && s[`1`] == `'1'` && s[`2`] == `'6'`) {
1024	s += `2`; // success, eat up 2 characters.
1025	isFloat16 = true;
1026	continue;
1027	}
1028
1029	isFloat = true;
1030	continue; // Success.
1031	case `'q'`: // FP Suffix for "__float128"
1032	case `'Q'`:
1033	if (!isFPConstant) break; // Error for integer constant.
1034	if (HasSize)
1035	break;
1036	HasSize = true;
1037	isFloat128 = true;
1038	continue; // Success.
1039	case `'u'`:
1040	case `'U'`:
1041	if (isFPConstant) break; // Error for floating constant.
1042	if (isUnsigned) break; // Cannot be repeated.
1043	isUnsigned = true;
1044	continue; // Success.
1045	case `'l'`:
1046	case `'L'`:
1047	if (HasSize)
1048	break;
1049	HasSize = true;
1050
1051	// Check for long long. The L's need to be adjacent and the same case.
1052	if (s[`1`] == s[`0`]) {
1053	assert(s + `1` < ThisTokEnd && "didn't maximally munch?");
1054	if (isFPConstant) break; // long long invalid for floats.
1055	isLongLong = true;
1056	++s; // Eat both of them.
1057	} else {
1058	isLong = true;
1059	}
1060	continue; // Success.
1061	case `'z'`:
1062	case `'Z'`:
1063	if (isFPConstant)
1064	break; // Invalid for floats.
1065	if (HasSize)
1066	break;
1067	HasSize = true;
1068	isSizeT = true;
1069	continue;
1070	case `'i'`:
1071	case `'I'`:
1072	if (LangOpts.MicrosoftExt && s + `1` < ThisTokEnd && !isFPConstant) {
1073	// Allow i8, i16, i32, i64, and i128. First, look ahead and check if
1074	// suffixes are Microsoft integers and not the imaginary unit.
1075	uint8_t Bits = `0`;
1076	size_t ToSkip = `0`;
1077	switch (s[`1`]) {
1078	case `'8'`: // i8 suffix
1079	Bits = `8`;
1080	ToSkip = `2`;
1081	break;
1082	case `'1'`:
1083	if (s + `2` < ThisTokEnd && s[`2`] == `'6'`) { // i16 suffix
1084	Bits = `16`;
1085	ToSkip = `3`;
1086	} else if (s + `3` < ThisTokEnd && s[`2`] == `'2'` &&
1087	s[`3`] == `'8'`) { // i128 suffix
1088	Bits = `128`;
1089	ToSkip = `4`;
1090	}
1091	break;
1092	case `'3'`:
1093	if (s + `2` < ThisTokEnd && s[`2`] == `'2'`) { // i32 suffix
1094	Bits = `32`;
1095	ToSkip = `3`;
1096	}
1097	break;
1098	case `'6'`:
1099	if (s + `2` < ThisTokEnd && s[`2`] == `'4'`) { // i64 suffix
1100	Bits = `64`;
1101	ToSkip = `3`;
1102	}
1103	break;
1104	default:
1105	break;
1106	}
1107	if (Bits) {
1108	if (HasSize)
1109	break;
1110	HasSize = true;
1111	MicrosoftInteger = Bits;
1112	s += ToSkip;
1113	assert(s <= ThisTokEnd && "didn't maximally munch?");
1114	break;
1115	}
1116	}
1117	[[fallthrough]];
1118	case `'j'`:
1119	case `'J'`:
1120	if (isImaginary) break; // Cannot be repeated.
1121	isImaginary = true;
1122	continue; // Success.
1123	case `'_'`:
1124	if (isFPConstant)
1125	break; // Invalid for floats
1126	if (HasSize)
1127	break;
1128	// There is currently no way to reach this with DoubleUnderscore set.
1129	// If new double underscope literals are added handle it here as above.
1130	assert(!DoubleUnderscore && "unhandled double underscore case");
1131	if (LangOpts.CPlusPlus && s + `2` < ThisTokEnd &&
1132	s[`1`] == `'_'`) { // s + 2 < ThisTokEnd to ensure some character exists
1133	// after __
1134	DoubleUnderscore = true;
1135	s += `2`; // Skip both '_'
1136	if (s + `1` < ThisTokEnd &&
1137	(s == `'u'` \|\| s == `'U'`)) { // Ensure some character after 'u'/'U'
1138	isUnsigned = true;
1139	++s;
1140	}
1141	if (s + `1` < ThisTokEnd &&
1142	((s == `'w'` && (++s) == `'b'`) \|\| (s == `'W'` && (++s) == `'B'`))) {
1143	isBitInt = true;
1144	HasSize = true;
1145	continue;
1146	}
1147	}
1148	break;
1149	case `'w'`:
1150	case `'W'`:
1151	if (isFPConstant)
1152	break; // Invalid for floats.
1153	if (HasSize)
1154	break; // Invalid if we already have a size for the literal.
1155
1156	// wb and WB are allowed, but a mixture of cases like Wb or wB is not. We
1157	// explicitly do not support the suffix in C++ as an extension because a
1158	// library-based UDL that resolves to a library type may be more
1159	// appropriate there. The same rules apply for __wb/__WB.
1160	if ((!LangOpts.CPlusPlus \|\| DoubleUnderscore) && s + `1` < ThisTokEnd &&
1161	((s[`0`] == `'w'` && s[`1`] == `'b'`) \|\| (s[`0`] == `'W'` && s[`1`] == `'B'`))) {
1162	isBitInt = true;
1163	HasSize = true;
1164	++s; // Skip both characters (2nd char skipped on continue).
1165	continue; // Success.
1166	}
1167	}
1168	// If we reached here, there was an error or a ud-suffix.
1169	break;
1170	}
1171
1172	// "i", "if", and "il" are user-defined suffixes in C++1y.
1173	if (s != ThisTokEnd \|\| isImaginary) {
1174	// FIXME: Don't bother expanding UCNs if !tok.hasUCN().
1175	expandUCNs(Buf&: UDSuffixBuf, Input: StringRef(SuffixBegin, ThisTokEnd - SuffixBegin));
1176	if (isValidUDSuffix(LangOpts, Suffix: UDSuffixBuf)) {
1177	if (!isImaginary) {
1178	// Any suffix pieces we might have parsed are actually part of the
1179	// ud-suffix.
1180	isLong = false;
1181	isUnsigned = false;
1182	isLongLong = false;
1183	isSizeT = false;
1184	isFloat = false;
1185	isFloat16 = false;
1186	isHalf = false;
1187	isImaginary = false;
1188	isBitInt = false;
1189	MicrosoftInteger = `0`;
1190	saw_fixed_point_suffix = false;
1191	isFract = false;
1192	isAccum = false;
1193	}
1194
1195	saw_ud_suffix = true;
1196	return;
1197	}
1198
1199	if (s != ThisTokEnd) {
1200	// Report an error if there are any.
1201	Diags.Report(Loc: Lexer::AdvanceToTokenCharacter(
1202	TokStart: TokLoc, Characters: SuffixBegin - ThisTokBegin, SM, LangOpts),
1203	DiagID: diag::err_invalid_suffix_constant)
1204	<< StringRef(SuffixBegin, ThisTokEnd - SuffixBegin)
1205	<< (isFixedPointConstant ? `2` : isFPConstant);
1206	hadError = true;
1207	}
1208	}
1209
1210	if (!hadError && saw_fixed_point_suffix) {
1211	assert(isFract \|\| isAccum);
1212	}
1213	}
1214
1215	/// ParseDecimalOrOctalCommon - This method is called for decimal or octal
1216	/// numbers. It issues an error for illegal digits, and handles floating point
1217	/// parsing. If it detects a floating point number, the radix is set to 10.
1218	void NumericLiteralParser::ParseDecimalOrOctalCommon(SourceLocation TokLoc){
1219	assert((radix == `8` \|\| radix == `10`) && "Unexpected radix");
1220
1221	// If we have a hex digit other than 'e' (which denotes a FP exponent) then
1222	// the code is using an incorrect base.
1223	if (isHexDigit(c: s) && s != `'e'` && *s != `'E'` &&
1224	!isValidUDSuffix(LangOpts, Suffix: StringRef(s, ThisTokEnd - s))) {
1225	Diags.Report(
1226	Loc: Lexer::AdvanceToTokenCharacter(TokStart: TokLoc, Characters: s - ThisTokBegin, SM, LangOpts),
1227	DiagID: diag::err_invalid_digit)
1228	<< StringRef(s, `1`) << (radix == `8` ? `1` : `0`);
1229	hadError = true;
1230	return;
1231	}
1232
1233	if (*s == `'.'`) {
1234	checkSeparator(TokLoc, Pos: s, IsAfterDigits: CSK_AfterDigits);
1235	s++;
1236	radix = `10`;
1237	saw_period = true;
1238	checkSeparator(TokLoc, Pos: s, IsAfterDigits: CSK_BeforeDigits);
1239	s = SkipDigits(ptr: s); // Skip suffix.
1240	}
1241	if (s == `'e'` \|\| s == `'E'`) { // exponent
1242	checkSeparator(TokLoc, Pos: s, IsAfterDigits: CSK_AfterDigits);
1243	const char *Exponent = s;
1244	s++;
1245	radix = `10`;
1246	saw_exponent = true;
1247	if (s != ThisTokEnd && (s == `'+'` \|\| s == `'-'`)) s++; // sign
1248	const char *first_non_digit = SkipDigits(ptr: s);
1249	if (containsDigits(Start: s, End: first_non_digit)) {
1250	checkSeparator(TokLoc, Pos: s, IsAfterDigits: CSK_BeforeDigits);
1251	s = first_non_digit;
1252	} else {
1253	if (!hadError) {
1254	Diags.Report(Loc: Lexer::AdvanceToTokenCharacter(
1255	TokStart: TokLoc, Characters: Exponent - ThisTokBegin, SM, LangOpts),
1256	DiagID: diag::err_exponent_has_no_digits);
1257	hadError = true;
1258	}
1259	return;
1260	}
1261	}
1262	}
1263
1264	/// Determine whether a suffix is a valid ud-suffix. We avoid treating reserved
1265	/// suffixes as ud-suffixes, because the diagnostic experience is better if we
1266	/// treat it as an invalid suffix.
1267	bool NumericLiteralParser::isValidUDSuffix(const LangOptions &LangOpts,
1268	StringRef Suffix) {
1269	if (!LangOpts.CPlusPlus11 \|\| Suffix.empty())
1270	return false;
1271
1272	// By C++11 [lex.ext]p10, ud-suffixes starting with an '_' are always valid.
1273	// Suffixes starting with '__' (double underscore) are for use by
1274	// the implementation.
1275	if (Suffix.starts_with(Prefix: "_") && !Suffix.starts_with(Prefix: "__"))
1276	return true;
1277
1278	// In C++11, there are no library suffixes.
1279	if (!LangOpts.CPlusPlus14)
1280	return false;
1281
1282	// In C++14, "s", "h", "min", "ms", "us", and "ns" are used in the library.
1283	// Per tweaked N3660, "il", "i", and "if" are also used in the library.
1284	// In C++2a "d" and "y" are used in the library.
1285	return llvm::StringSwitch<bool>(Suffix)
1286	.Cases(S0: "h", S1: "min", S2: "s", Value: true)
1287	.Cases(S0: "ms", S1: "us", S2: "ns", Value: true)
1288	.Cases(S0: "il", S1: "i", S2: "if", Value: true)
1289	.Cases(S0: "d", S1: "y", Value: LangOpts.CPlusPlus20)
1290	.Default(Value: false);
1291	}
1292
1293	void NumericLiteralParser::checkSeparator(SourceLocation TokLoc,
1294	const char *Pos,
1295	CheckSeparatorKind IsAfterDigits) {
1296	if (IsAfterDigits == CSK_AfterDigits) {
1297	if (Pos == ThisTokBegin)
1298	return;
1299	--Pos;
1300	} else if (Pos == ThisTokEnd)
1301	return;
1302
1303	if (isDigitSeparator(C: *Pos)) {
1304	Diags.Report(Loc: Lexer::AdvanceToTokenCharacter(TokStart: TokLoc, Characters: Pos - ThisTokBegin, SM,
1305	LangOpts),
1306	DiagID: diag::err_digit_separator_not_between_digits)
1307	<< IsAfterDigits;
1308	hadError = true;
1309	}
1310	}
1311
1312	/// ParseNumberStartingWithZero - This method is called when the first character
1313	/// of the number is found to be a zero. This means it is either an octal
1314	/// number (like '04') or a hex number ('0x123a') a binary number ('0b1010') or
1315	/// a floating point number (01239.123e4). Eat the prefix, determining the
1316	/// radix etc.
1317	void NumericLiteralParser::ParseNumberStartingWithZero(SourceLocation TokLoc) {
1318	assert(s[`0`] == `'0'` && "Invalid method call");
1319	s++;
1320
1321	int c1 = s[`0`];
1322
1323	// Handle a hex number like 0x1234.
1324	if ((c1 == `'x'` \|\| c1 == `'X'`) && (isHexDigit(c: s[`1`]) \|\| s[`1`] == `'.'`)) {
1325	s++;
1326	assert(s < ThisTokEnd && "didn't maximally munch?");
1327	radix = `16`;
1328	DigitsBegin = s;
1329	s = SkipHexDigits(ptr: s);
1330	bool HasSignificandDigits = containsDigits(Start: DigitsBegin, End: s);
1331	if (s == ThisTokEnd) {
1332	// Done.
1333	} else if (*s == `'.'`) {
1334	s++;
1335	saw_period = true;
1336	const char *floatDigitsBegin = s;
1337	s = SkipHexDigits(ptr: s);
1338	if (containsDigits(Start: floatDigitsBegin, End: s))
1339	HasSignificandDigits = true;
1340	if (HasSignificandDigits)
1341	checkSeparator(TokLoc, Pos: floatDigitsBegin, IsAfterDigits: CSK_BeforeDigits);
1342	}
1343
1344	if (!HasSignificandDigits) {
1345	Diags.Report(Loc: Lexer::AdvanceToTokenCharacter(TokStart: TokLoc, Characters: s - ThisTokBegin, SM,
1346	LangOpts),
1347	DiagID: diag::err_hex_constant_requires)
1348	<< LangOpts.CPlusPlus << `1`;
1349	hadError = true;
1350	return;
1351	}
1352
1353	// A binary exponent can appear with or with a '.'. If dotted, the
1354	// binary exponent is required.
1355	if (s == `'p'` \|\| s == `'P'`) {
1356	checkSeparator(TokLoc, Pos: s, IsAfterDigits: CSK_AfterDigits);
1357	const char *Exponent = s;
1358	s++;
1359	saw_exponent = true;
1360	if (s != ThisTokEnd && (s == `'+'` \|\| s == `'-'`)) s++; // sign
1361	const char *first_non_digit = SkipDigits(ptr: s);
1362	if (!containsDigits(Start: s, End: first_non_digit)) {
1363	if (!hadError) {
1364	Diags.Report(Loc: Lexer::AdvanceToTokenCharacter(
1365	TokStart: TokLoc, Characters: Exponent - ThisTokBegin, SM, LangOpts),
1366	DiagID: diag::err_exponent_has_no_digits);
1367	hadError = true;
1368	}
1369	return;
1370	}
1371	checkSeparator(TokLoc, Pos: s, IsAfterDigits: CSK_BeforeDigits);
1372	s = first_non_digit;
1373
1374	if (!LangOpts.HexFloats)
1375	Diags.Report(Loc: TokLoc, DiagID: LangOpts.CPlusPlus
1376	? diag::ext_hex_literal_invalid
1377	: diag::ext_hex_constant_invalid);
1378	else if (LangOpts.CPlusPlus17)
1379	Diags.Report(Loc: TokLoc, DiagID: diag::warn_cxx17_hex_literal);
1380	} else if (saw_period) {
1381	Diags.Report(Loc: Lexer::AdvanceToTokenCharacter(TokStart: TokLoc, Characters: s - ThisTokBegin, SM,
1382	LangOpts),
1383	DiagID: diag::err_hex_constant_requires)
1384	<< LangOpts.CPlusPlus << `0`;
1385	hadError = true;
1386	}
1387	return;
1388	}
1389
1390	// Handle simple binary numbers 0b01010
1391	if ((c1 == `'b'` \|\| c1 == `'B'`) && (s[`1`] == `'0'` \|\| s[`1`] == `'1'`)) {
1392	// 0b101010 is a C++14 and C23 extension.
1393	unsigned DiagId;
1394	if (LangOpts.CPlusPlus14)
1395	DiagId = diag::warn_cxx11_compat_binary_literal;
1396	else if (LangOpts.C23)
1397	DiagId = diag::warn_c23_compat_binary_literal;
1398	else if (LangOpts.CPlusPlus)
1399	DiagId = diag::ext_binary_literal_cxx14;
1400	else
1401	DiagId = diag::ext_binary_literal;
1402	Diags.Report(Loc: TokLoc, DiagID: DiagId);
1403	++s;
1404	assert(s < ThisTokEnd && "didn't maximally munch?");
1405	radix = `2`;
1406	DigitsBegin = s;
1407	s = SkipBinaryDigits(ptr: s);
1408	if (s == ThisTokEnd) {
1409	// Done.
1410	} else if (isHexDigit(c: *s) &&
1411	!isValidUDSuffix(LangOpts, Suffix: StringRef(s, ThisTokEnd - s))) {
1412	Diags.Report(Loc: Lexer::AdvanceToTokenCharacter(TokStart: TokLoc, Characters: s - ThisTokBegin, SM,
1413	LangOpts),
1414	DiagID: diag::err_invalid_digit)
1415	<< StringRef(s, `1`) << `2`;
1416	hadError = true;
1417	}
1418	// Other suffixes will be diagnosed by the caller.
1419	return;
1420	}
1421
1422	// Parse a potential octal literal prefix.
1423	bool IsSingleZero = false;
1424	if ((c1 == `'O'` \|\| c1 == `'o'`) && (s[`1`] >= `'0'` && s[`1`] <= `'7'`)) {
1425	unsigned DiagId;
1426	if (LangOpts.C2y)
1427	DiagId = diag::warn_c2y_compat_octal_literal;
1428	else if (LangOpts.CPlusPlus)
1429	DiagId = diag::ext_cpp_octal_literal;
1430	else
1431	DiagId = diag::ext_octal_literal;
1432	Diags.Report(Loc: TokLoc, DiagID: DiagId);
1433	++s;
1434	DigitsBegin = s;
1435	radix = `8`;
1436	s = SkipOctalDigits(ptr: s);
1437	if (s == ThisTokEnd) {
1438	// Done
1439	} else if ((isHexDigit(c: s) && s != `'e'` && s != `'E'` && s != `'.'`) &&
1440	!isValidUDSuffix(LangOpts, Suffix: StringRef(s, ThisTokEnd - s))) {
1441	auto InvalidDigitLoc = Lexer::AdvanceToTokenCharacter(
1442	TokStart: TokLoc, Characters: s - ThisTokBegin, SM, LangOpts);
1443	Diags.Report(Loc: InvalidDigitLoc, DiagID: diag::err_invalid_digit)
1444	<< StringRef(s, `1`) << `1`;
1445	hadError = true;
1446	}
1447	// Other suffixes will be diagnosed by the caller.
1448	return;
1449	}
1450
1451	auto _ = llvm::make_scope_exit(F: [&] {
1452	// If we still have an octal value but we did not see an octal prefix,
1453	// diagnose as being an obsolescent feature starting in C2y.
1454	if (radix == `8` && LangOpts.C2y && !hadError && !IsSingleZero)
1455	Diags.Report(Loc: TokLoc, DiagID: diag::warn_unprefixed_octal_deprecated);
1456	});
1457
1458	// For now, the radix is set to 8. If we discover that we have a
1459	// floating point constant, the radix will change to 10. Octal floating
1460	// point constants are not permitted (only decimal and hexadecimal).
1461	radix = `8`;
1462	const char *PossibleNewDigitStart = s;
1463	s = SkipOctalDigits(ptr: s);
1464	// When the value is 0 followed by a suffix (like 0wb), we want to leave 0
1465	// as the start of the digits. So if skipping octal digits does not skip
1466	// anything, we leave the digit start where it was.
1467	if (s != PossibleNewDigitStart)
1468	DigitsBegin = PossibleNewDigitStart;
1469	else
1470	IsSingleZero = (s == ThisTokEnd); // Is the only thing we've seen a 0?
1471
1472	if (s == ThisTokEnd)
1473	return; // Done, simple octal number like 01234
1474
1475	// If we have some other non-octal digit that is* a decimal digit, see if*
1476	// this is part of a floating point number like 094.123 or 09e1.
1477	if (isDigit(c: *s)) {
1478	const char *EndDecimal = SkipDigits(ptr: s);
1479	if (EndDecimal[`0`] == `'.'` \|\| EndDecimal[`0`] == `'e'` \|\| EndDecimal[`0`] == `'E'`) {
1480	s = EndDecimal;
1481	radix = `10`;
1482	}
1483	}
1484
1485	ParseDecimalOrOctalCommon(TokLoc);
1486	}
1487
1488	static bool alwaysFitsInto64Bits(unsigned Radix, unsigned NumDigits) {
1489	switch (Radix) {
1490	case `2`:
1491	return NumDigits <= `64`;
1492	case `8`:
1493	return NumDigits <= `64` / `3`; // Digits are groups of 3 bits.
1494	case `10`:
1495	return NumDigits <= `19`; // floor(log10(2^64))
1496	case `16`:
1497	return NumDigits <= `64` / `4`; // Digits are groups of 4 bits.
1498	default:
1499	llvm_unreachable("impossible Radix");
1500	}
1501	}
1502
1503	/// GetIntegerValue - Convert this numeric literal value to an APInt that
1504	/// matches Val's input width. If there is an overflow, set Val to the low bits
1505	/// of the result and return true. Otherwise, return false.
1506	bool NumericLiteralParser::GetIntegerValue(llvm::APInt &Val) {
1507	// Fast path: Compute a conservative bound on the maximum number of
1508	// bits per digit in this radix. If we can't possibly overflow a
1509	// uint64 based on that bound then do the simple conversion to
1510	// integer. This avoids the expensive overflow checking below, and
1511	// handles the common cases that matter (small decimal integers and
1512	// hex/octal values which don't overflow).
1513	const unsigned NumDigits = SuffixBegin - DigitsBegin;
1514	if (alwaysFitsInto64Bits(Radix: radix, NumDigits)) {
1515	uint64_t N = `0`;
1516	for (const char *Ptr = DigitsBegin; Ptr != SuffixBegin; ++Ptr)
1517	if (!isDigitSeparator(C: *Ptr))
1518	N = N * radix + llvm::hexDigitValue(C: *Ptr);
1519
1520	// This will truncate the value to Val's input width. Simply check
1521	// for overflow by comparing.
1522	Val = N;
1523	return Val.getZExtValue() != N;
1524	}
1525
1526	Val = `0`;
1527	const char *Ptr = DigitsBegin;
1528
1529	llvm::APInt RadixVal(Val.getBitWidth(), radix);
1530	llvm::APInt CharVal(Val.getBitWidth(), `0`);
1531	llvm::APInt OldVal = Val;
1532
1533	bool OverflowOccurred = false;
1534	while (Ptr < SuffixBegin) {
1535	if (isDigitSeparator(C: *Ptr)) {
1536	++Ptr;
1537	continue;
1538	}
1539
1540	unsigned C = llvm::hexDigitValue(C: *Ptr++);
1541
1542	// If this letter is out of bound for this radix, reject it.
1543	assert(C < radix && "NumericLiteralParser ctor should have rejected this");
1544
1545	CharVal = C;
1546
1547	// Add the digit to the value in the appropriate radix. If adding in digits
1548	// made the value smaller, then this overflowed.
1549	OldVal = Val;
1550
1551	// Multiply by radix, did overflow occur on the multiply?
1552	Val *= RadixVal;
1553	OverflowOccurred \|= Val.udiv(RHS: RadixVal) != OldVal;
1554
1555	// Add value, did overflow occur on the value?
1556	// (a + b) ult b <=> overflow
1557	Val += CharVal;
1558	OverflowOccurred \|= Val.ult(RHS: CharVal);
1559	}
1560	return OverflowOccurred;
1561	}
1562
1563	llvm::APFloat::opStatus
1564	NumericLiteralParser::GetFloatValue(llvm::APFloat &Result,
1565	llvm::RoundingMode RM) {
1566	using llvm::APFloat;
1567
1568	unsigned n = std::min(a: SuffixBegin - ThisTokBegin, b: ThisTokEnd - ThisTokBegin);
1569
1570	llvm::SmallString<`16`> Buffer;
1571	StringRef Str(ThisTokBegin, n);
1572	if (Str.contains(C: `'\''`)) {
1573	Buffer.reserve(N: n);
1574	std::remove_copy_if(first: Str.begin(), last: Str.end(), result: std::back_inserter(x&: Buffer),
1575	pred: &isDigitSeparator);
1576	Str = Buffer;
1577	}
1578
1579	auto StatusOrErr = Result.convertFromString(Str, RM);
1580	assert(StatusOrErr && "Invalid floating point representation");
1581	return !errorToBool(Err: StatusOrErr.takeError()) ? *StatusOrErr
1582	: APFloat::opInvalidOp;
1583	}
1584
/// Return true if \p c introduces an exponent: 'p'/'P' when \p isHex is true,
/// 'e'/'E' otherwise.
static inline bool IsExponentPart(char c, bool isHex) {
  if (isHex)
    return c == 'p' || c == 'P';
  return c == 'e' || c == 'E';
}
1590
1591	bool NumericLiteralParser::GetFixedPointValue(llvm::APInt &StoreVal, unsigned Scale) {
1592	assert(radix == `16` \|\| radix == `10`);
1593
1594	// Find how many digits are needed to store the whole literal.
1595	unsigned NumDigits = SuffixBegin - DigitsBegin;
1596	if (saw_period) --NumDigits;
1597
1598	// Initial scan of the exponent if it exists
1599	bool ExpOverflowOccurred = false;
1600	bool NegativeExponent = false;
1601	const char *ExponentBegin;
1602	uint64_t Exponent = `0`;
1603	int64_t BaseShift = `0`;
1604	if (saw_exponent) {
1605	const char *Ptr = DigitsBegin;
1606
1607	while (!IsExponentPart(c: *Ptr, isHex: radix == `16`))
1608	++Ptr;
1609	ExponentBegin = Ptr;
1610	++Ptr;
1611	NegativeExponent = *Ptr == `'-'`;
1612	if (NegativeExponent) ++Ptr;
1613
1614	unsigned NumExpDigits = SuffixBegin - Ptr;
1615	if (alwaysFitsInto64Bits(Radix: radix, NumDigits: NumExpDigits)) {
1616	llvm::StringRef ExpStr(Ptr, NumExpDigits);
1617	llvm::APInt ExpInt(/numBits=/`64`, ExpStr, /radix=/`10`);
1618	Exponent = ExpInt.getZExtValue();
1619	} else {
1620	ExpOverflowOccurred = true;
1621	}
1622
1623	if (NegativeExponent) BaseShift -= Exponent;
1624	else BaseShift += Exponent;
1625	}
1626
1627	// Number of bits needed for decimal literal is
1628	// ceil(NumDigits log2(10)) Integral part*
1629	// + Scale Fractional part
1630	// + ceil(Exponent log2(10)) Exponent*
1631	// --------------------------------------------------
1632	// ceil((NumDigits + Exponent) log2(10)) + Scale*
1633	//
1634	// But for simplicity in handling integers, we can round up log2(10) to 4,
1635	// making:
1636	// 4 (NumDigits + Exponent) + Scale*
1637	//
1638	// Number of digits needed for hexadecimal literal is
1639	// 4 NumDigits Integral part*
1640	// + Scale Fractional part
1641	// + Exponent Exponent
1642	// --------------------------------------------------
1643	// (4 NumDigits) + Scale + Exponent*
1644	uint64_t NumBitsNeeded;
1645	if (radix == `10`)
1646	NumBitsNeeded = `4` * (NumDigits + Exponent) + Scale;
1647	else
1648	NumBitsNeeded = `4` * NumDigits + Exponent + Scale;
1649
1650	if (NumBitsNeeded > std::numeric_limits<unsigned>::max())
1651	ExpOverflowOccurred = true;
1652	llvm::APInt Val(static_cast<unsigned>(NumBitsNeeded), `0`, /isSigned=/false);
1653
1654	bool FoundDecimal = false;
1655
1656	int64_t FractBaseShift = `0`;
1657	const char *End = saw_exponent ? ExponentBegin : SuffixBegin;
1658	for (const char *Ptr = DigitsBegin; Ptr < End; ++Ptr) {
1659	if (*Ptr == `'.'`) {
1660	FoundDecimal = true;
1661	continue;
1662	}
1663
1664	// Normal reading of an integer
1665	unsigned C = llvm::hexDigitValue(C: *Ptr);
1666	assert(C < radix && "NumericLiteralParser ctor should have rejected this");
1667
1668	Val *= radix;
1669	Val += C;
1670
1671	if (FoundDecimal)
1672	// Keep track of how much we will need to adjust this value by from the
1673	// number of digits past the radix point.
1674	--FractBaseShift;
1675	}
1676
1677	// For a radix of 16, we will be multiplying by 2 instead of 16.
1678	if (radix == `16`) FractBaseShift *= `4`;
1679	BaseShift += FractBaseShift;
1680
1681	Val <<= Scale;
1682
1683	uint64_t Base = (radix == `16`) ? `2` : `10`;
1684	if (BaseShift > `0`) {
1685	for (int64_t i = `0`; i < BaseShift; ++i) {
1686	Val *= Base;
1687	}
1688	} else if (BaseShift < `0`) {
1689	for (int64_t i = BaseShift; i < `0` && !Val.isZero(); ++i)
1690	Val = Val.udiv(RHS: Base);
1691	}
1692
1693	bool IntOverflowOccurred = false;
1694	auto MaxVal = llvm::APInt::getMaxValue(numBits: StoreVal.getBitWidth());
1695	if (Val.getBitWidth() > StoreVal.getBitWidth()) {
1696	IntOverflowOccurred \|= Val.ugt(RHS: MaxVal.zext(width: Val.getBitWidth()));
1697	StoreVal = Val.trunc(width: StoreVal.getBitWidth());
1698	} else if (Val.getBitWidth() < StoreVal.getBitWidth()) {
1699	IntOverflowOccurred \|= Val.zext(width: MaxVal.getBitWidth()).ugt(RHS: MaxVal);
1700	StoreVal = Val.zext(width: StoreVal.getBitWidth());
1701	} else {
1702	StoreVal = Val;
1703	}
1704
1705	return IntOverflowOccurred \|\| ExpOverflowOccurred;
1706	}
1707
1708	/// \verbatim
1709	/// user-defined-character-literal: [C++11 lex.ext]
1710	/// character-literal ud-suffix
1711	/// ud-suffix:
1712	/// identifier
1713	/// character-literal: [C++11 lex.ccon]
1714	/// ' c-char-sequence '
1715	/// u' c-char-sequence '
1716	/// U' c-char-sequence '
1717	/// L' c-char-sequence '
1718	/// u8' c-char-sequence ' [C++1z lex.ccon]
1719	/// c-char-sequence:
1720	/// c-char
1721	/// c-char-sequence c-char
1722	/// c-char:
1723	/// any member of the source character set except the single-quote ',
1724	/// backslash \, or new-line character
1725	/// escape-sequence
1726	/// universal-character-name
1727	/// escape-sequence:
1728	/// simple-escape-sequence
1729	/// octal-escape-sequence
1730	/// hexadecimal-escape-sequence
1731	/// simple-escape-sequence:
1732	/// one of \' \" \? \\ \a \b \f \n \r \t \v
1733	/// octal-escape-sequence:
1734	/// \ octal-digit
1735	/// \ octal-digit octal-digit
1736	/// \ octal-digit octal-digit octal-digit
1737	/// hexadecimal-escape-sequence:
1738	/// \x hexadecimal-digit
1739	/// hexadecimal-escape-sequence hexadecimal-digit
1740	/// universal-character-name: [C++11 lex.charset]
1741	/// \u hex-quad
1742	/// \U hex-quad hex-quad
1743	/// hex-quad:
1744	/// hex-digit hex-digit hex-digit hex-digit
1745	/// \endverbatim
1746	///
1747	CharLiteralParser::CharLiteralParser(const char begin, const* char *end,
1748	SourceLocation Loc, Preprocessor &PP,
1749	tok::TokenKind kind) {
1750	// At this point we know that the character matches the regex "(L\|u\|U)?'.'".*
1751	HadError = false;
1752
1753	Kind = kind;
1754
1755	const char *TokBegin = begin;
1756
1757	// Skip over wide character determinant.
1758	if (Kind != tok::char_constant)
1759	++begin;
1760	if (Kind == tok::utf8_char_constant)
1761	++begin;
1762
1763	// Skip over the entry quote.
1764	if (begin[`0`] != `'\''`) {
1765	PP.Diag(Loc, DiagID: diag::err_lexing_char);
1766	HadError = true;
1767	return;
1768	}
1769
1770	++begin;
1771
1772	// Remove an optional ud-suffix.
1773	if (end[-`1`] != `'\''`) {
1774	const char *UDSuffixEnd = end;
1775	do {
1776	--end;
1777	} while (end[-`1`] != `'\''`);
1778	// FIXME: Don't bother with this if !tok.hasUCN().
1779	expandUCNs(Buf&: UDSuffixBuf, Input: StringRef(end, UDSuffixEnd - end));
1780	UDSuffixOffset = end - TokBegin;
1781	}
1782
1783	// Trim the ending quote.
1784	assert(end != begin && "Invalid token lexed");
1785	--end;
1786
1787	// FIXME: The "Value" is an uint64_t so we can handle char literals of
1788	// up to 64-bits.
1789	// FIXME: This extensively assumes that 'char' is 8-bits.
1790	assert(PP.getTargetInfo().getCharWidth() == `8` &&
1791	"Assumes char is 8 bits");
1792	assert(PP.getTargetInfo().getIntWidth() <= `64` &&
1793	(PP.getTargetInfo().getIntWidth() & `7`) == `0` &&
1794	"Assumes sizeof(int) on target is <= 64 and a multiple of char");
1795	assert(PP.getTargetInfo().getWCharWidth() <= `64` &&
1796	"Assumes sizeof(wchar) on target is <= 64");
1797
1798	SmallVector<uint32_t, `4`> codepoint_buffer;
1799	codepoint_buffer.resize(N: end - begin);
1800	uint32_t *buffer_begin = &codepoint_buffer.front();
1801	uint32_t *buffer_end = buffer_begin + codepoint_buffer.size();
1802
1803	// Unicode escapes representing characters that cannot be correctly
1804	// represented in a single code unit are disallowed in character literals
1805	// by this implementation.
1806	uint32_t largest_character_for_kind;
1807	if (tok::wide_char_constant == Kind) {
1808	largest_character_for_kind =
1809	`0xFFFFFFFFu` >> (`32`-PP.getTargetInfo().getWCharWidth());
1810	} else if (tok::utf8_char_constant == Kind) {
1811	largest_character_for_kind = `0x7F`;
1812	} else if (tok::utf16_char_constant == Kind) {
1813	largest_character_for_kind = `0xFFFF`;
1814	} else if (tok::utf32_char_constant == Kind) {
1815	largest_character_for_kind = `0x10FFFF`;
1816	} else {
1817	largest_character_for_kind = `0x7Fu`;
1818	}
1819
1820	while (begin != end) {
1821	// Is this a span of non-escape characters?
1822	if (begin[`0`] != `'\\'`) {
1823	char const *start = begin;
1824	do {
1825	++begin;
1826	} while (begin != end && *begin != `'\\'`);
1827
1828	char const *tmp_in_start = start;
1829	uint32_t *tmp_out_start = buffer_begin;
1830	llvm::ConversionResult res =
1831	llvm::ConvertUTF8toUTF32(sourceStart: reinterpret_cast<llvm::UTF8 const **>(&start),
1832	sourceEnd: reinterpret_cast<llvm::UTF8 const *>(begin),
1833	targetStart: &buffer_begin, targetEnd: buffer_end, flags: llvm::strictConversion);
1834	if (res != llvm::conversionOK) {
1835	// If we see bad encoding for unprefixed character literals, warn and
1836	// simply copy the byte values, for compatibility with gcc and
1837	// older versions of clang.
1838	bool NoErrorOnBadEncoding = isOrdinary();
1839	unsigned Msg = diag::err_bad_character_encoding;
1840	if (NoErrorOnBadEncoding)
1841	Msg = diag::warn_bad_character_encoding;
1842	PP.Diag(Loc, DiagID: Msg);
1843	if (NoErrorOnBadEncoding) {
1844	start = tmp_in_start;
1845	buffer_begin = tmp_out_start;
1846	for (; start != begin; ++start, ++buffer_begin)
1847	buffer_begin = static_cast<uint8_t>(start);
1848	} else {
1849	HadError = true;
1850	}
1851	} else {
1852	for (; tmp_out_start < buffer_begin; ++tmp_out_start) {
1853	if (*tmp_out_start > largest_character_for_kind) {
1854	HadError = true;
1855	PP.Diag(Loc, DiagID: diag::err_character_too_large);
1856	}
1857	}
1858	}
1859
1860	continue;
1861	}
1862	// Is this a Universal Character Name escape?
1863	if (begin[`1`] == `'u'` \|\| begin[`1`] == `'U'` \|\| begin[`1`] == `'N'`) {
1864	unsigned short UcnLen = `0`;
1865	if (!ProcessUCNEscape(ThisTokBegin: TokBegin, ThisTokBuf&: begin, ThisTokEnd: end, UcnVal&: *buffer_begin, UcnLen,
1866	Loc: FullSourceLoc (Loc, PP.getSourceManager()),
1867	Diags: &PP.getDiagnostics(), Features: PP.getLangOpts(), in_char_string_literal: true)) {
1868	HadError = true;
1869	} else if (*buffer_begin > largest_character_for_kind) {
1870	HadError = true;
1871	PP.Diag(Loc, DiagID: diag::err_character_too_large);
1872	}
1873
1874	++buffer_begin;
1875	continue;
1876	}
1877	unsigned CharWidth = getCharWidth(kind: Kind, Target: PP.getTargetInfo());
1878	uint64_t result =
1879	ProcessCharEscape(ThisTokBegin: TokBegin, ThisTokBuf&: begin, ThisTokEnd: end, HadError,
1880	Loc: FullSourceLoc (Loc, PP.getSourceManager()), CharWidth,
1881	Diags: &PP.getDiagnostics(), Features: PP.getLangOpts(),
1882	EvalMethod: StringLiteralEvalMethod::Evaluated);
1883	*buffer_begin++ = result;
1884	}
1885
1886	unsigned NumCharsSoFar = buffer_begin - &codepoint_buffer.front();
1887
1888	if (NumCharsSoFar > `1`) {
1889	if (isOrdinary() && NumCharsSoFar == `4`)
1890	PP.Diag(Loc, DiagID: diag::warn_four_char_character_literal);
1891	else if (isOrdinary())
1892	PP.Diag(Loc, DiagID: diag::warn_multichar_character_literal);
1893	else {
1894	PP.Diag(Loc, DiagID: diag::err_multichar_character_literal) << (isWide() ? `0` : `1`);
1895	HadError = true;
1896	}
1897	IsMultiChar = true;
1898	} else {
1899	IsMultiChar = false;
1900	}
1901
1902	llvm::APInt LitVal(PP.getTargetInfo().getIntWidth(), `0`);
1903
1904	// Narrow character literals act as though their value is concatenated
1905	// in this implementation, but warn on overflow.
1906	bool multi_char_too_long = false;
1907	if (isOrdinary() && isMultiChar()) {
1908	LitVal = `0`;
1909	for (size_t i = `0`; i < NumCharsSoFar; ++i) {
1910	// check for enough leading zeros to shift into
1911	multi_char_too_long \|= (LitVal.countl_zero() < `8`);
1912	LitVal <<= `8`;
1913	LitVal = LitVal + (codepoint_buffer [i] & `0xFF`);
1914	}
1915	} else if (NumCharsSoFar > `0`) {
1916	// otherwise just take the last character
1917	LitVal = buffer_begin[-`1`];
1918	}
1919
1920	if (!HadError && multi_char_too_long) {
1921	PP.Diag(Loc, DiagID: diag::warn_char_constant_too_large);
1922	}
1923
1924	// Transfer the value from APInt to uint64_t
1925	Value = LitVal.getZExtValue();
1926
1927	// If this is a single narrow character, sign extend it (e.g. '\xFF' is "-1")
1928	// if 'char' is signed for this target (C99 6.4.4.4p10). Note that multiple
1929	// character constants are not sign extended in the this implementation:
1930	// '\xFF\xFF' = 65536 and '\x0\xFF' = 255, which matches GCC.
1931	if (isOrdinary() && NumCharsSoFar == `1` && (Value & `128`) &&
1932	PP.getLangOpts().CharIsSigned)
1933	Value = (signed char)Value;
1934	}
1935
1936	/// \verbatim
1937	/// string-literal: [C++0x lex.string]
1938	/// encoding-prefix " [s-char-sequence] "
1939	/// encoding-prefix R raw-string
1940	/// encoding-prefix:
1941	/// u8
1942	/// u
1943	/// U
1944	/// L
1945	/// s-char-sequence:
1946	/// s-char
1947	/// s-char-sequence s-char
1948	/// s-char:
1949	/// any member of the source character set except the double-quote ",
1950	/// backslash \, or new-line character
1951	/// escape-sequence
1952	/// universal-character-name
1953	/// raw-string:
1954	/// " d-char-sequence ( r-char-sequence ) d-char-sequence "
1955	/// r-char-sequence:
1956	/// r-char
1957	/// r-char-sequence r-char
1958	/// r-char:
1959	/// any member of the source character set, except a right parenthesis )
1960	/// followed by the initial d-char-sequence (which may be empty)
1961	/// followed by a double quote ".
1962	/// d-char-sequence:
1963	/// d-char
1964	/// d-char-sequence d-char
1965	/// d-char:
1966	/// any member of the basic source character set except:
1967	/// space, the left parenthesis (, the right parenthesis ),
1968	/// the backslash \, and the control characters representing horizontal
1969	/// tab, vertical tab, form feed, and newline.
1970	/// escape-sequence: [C++0x lex.ccon]
1971	/// simple-escape-sequence
1972	/// octal-escape-sequence
1973	/// hexadecimal-escape-sequence
1974	/// simple-escape-sequence:
1975	/// one of \' \" \? \\ \a \b \f \n \r \t \v
1976	/// octal-escape-sequence:
1977	/// \ octal-digit
1978	/// \ octal-digit octal-digit
1979	/// \ octal-digit octal-digit octal-digit
1980	/// hexadecimal-escape-sequence:
1981	/// \x hexadecimal-digit
1982	/// hexadecimal-escape-sequence hexadecimal-digit
1983	/// universal-character-name:
1984	/// \u hex-quad
1985	/// \U hex-quad hex-quad
1986	/// hex-quad:
1987	/// hex-digit hex-digit hex-digit hex-digit
1988	/// \endverbatim
1989	///
1990	StringLiteralParser::StringLiteralParser(ArrayRef<Token> StringToks,
1991	Preprocessor &PP,
1992	StringLiteralEvalMethod EvalMethod)
1993	: SM(PP.getSourceManager()), Features(PP.getLangOpts()),
1994	Target(PP.getTargetInfo()), Diags(&PP.getDiagnostics()),
1995	MaxTokenLength(`0`), SizeBound(`0`), CharByteWidth(`0`), Kind(tok::unknown),
1996	ResultPtr(ResultBuf.data()), EvalMethod(EvalMethod), hadError(false),
1997	Pascal(false) {
1998	init(StringToks);
1999	}
2000
2001	void StringLiteralParser::init(ArrayRef<Token> StringToks){
2002	// The literal token may have come from an invalid source location (e.g. due
2003	// to a PCH error), in which case the token length will be 0.
2004	if (StringToks.empty() \|\| StringToks [`0`].getLength() < `2`)
2005	return DiagnoseLexingError(Loc: SourceLocation ());
2006
2007	// Scan all of the string portions, remember the max individual token length,
2008	// computing a bound on the concatenated string length, and see whether any
2009	// piece is a wide-string. If any of the string portions is a wide-string
2010	// literal, the result is a wide-string literal [C99 6.4.5p4].
2011	assert(!StringToks.empty() && "expected at least one token");
2012	MaxTokenLength = StringToks [`0`].getLength();
2013	assert(StringToks[`0`].getLength() >= `2` && "literal token is invalid!");
2014	SizeBound = StringToks [`0`].getLength() - `2`; // -2 for "".
2015	hadError = false;
2016
2017	// Determines the kind of string from the prefix
2018	Kind = tok::string_literal;
2019
2020	/// (C99 5.1.1.2p1). The common case is only one string fragment.
2021	for (const Token &Tok : StringToks) {
2022	if (Tok.getLength() < `2`)
2023	return DiagnoseLexingError(Loc: Tok.getLocation());
2024
2025	// The string could be shorter than this if it needs cleaning, but this is a
2026	// reasonable bound, which is all we need.
2027	assert(Tok.getLength() >= `2` && "literal token is invalid!");
2028	SizeBound += Tok.getLength() - `2`; // -2 for "".
2029
2030	// Remember maximum string piece length.
2031	if (Tok.getLength() > MaxTokenLength)
2032	MaxTokenLength = Tok.getLength();
2033
2034	// Remember if we see any wide or utf-8/16/32 strings.
2035	// Also check for illegal concatenations.
2036	if (isUnevaluated() && Tok.getKind() != tok::string_literal) {
2037	if (Diags) {
2038	SourceLocation PrefixEndLoc = Lexer::AdvanceToTokenCharacter(
2039	TokStart: Tok.getLocation(), Characters: getEncodingPrefixLen(kind: Tok.getKind()), SM,
2040	LangOpts: Features);
2041	CharSourceRange Range =
2042	CharSourceRange::getCharRange(R: {Tok.getLocation(), PrefixEndLoc});
2043	StringRef Prefix(SM.getCharacterData(SL: Tok.getLocation()),
2044	getEncodingPrefixLen(kind: Tok.getKind()));
2045	Diags->Report(Loc: Tok.getLocation(),
2046	DiagID: Features.CPlusPlus26
2047	? diag::err_unevaluated_string_prefix
2048	: diag::warn_unevaluated_string_prefix)
2049	<< Prefix << Features.CPlusPlus << FixItHint::CreateRemoval(RemoveRange: Range);
2050	}
2051	if (Features.CPlusPlus26)
2052	hadError = true;
2053	} else if (Tok.isNot(K: Kind) && Tok.isNot(K: tok::string_literal)) {
2054	if (isOrdinary()) {
2055	Kind = Tok.getKind();
2056	} else {
2057	if (Diags)
2058	Diags->Report(Loc: Tok.getLocation(), DiagID: diag::err_unsupported_string_concat);
2059	hadError = true;
2060	}
2061	}
2062	}
2063
2064	// Include space for the null terminator.
2065	++SizeBound;
2066
2067	// TODO: K&R warning: "traditional C rejects string constant concatenation"
2068
2069	// Get the width in bytes of char/wchar_t/char16_t/char32_t
2070	CharByteWidth = getCharWidth(kind: Kind, Target);
2071	assert((CharByteWidth & `7`) == `0` && "Assumes character size is byte multiple");
2072	CharByteWidth /= `8`;
2073
2074	// The output buffer size needs to be large enough to hold wide characters.
2075	// This is a worst-case assumption which basically corresponds to L"" "long".
2076	SizeBound *= CharByteWidth;
2077
2078	// Size the temporary buffer to hold the result string data.
2079	ResultBuf.resize(N: SizeBound);
2080
2081	// Likewise, but for each string piece.
2082	SmallString<`512`> TokenBuf;
2083	TokenBuf.resize(N: MaxTokenLength);
2084
2085	// Loop over all the strings, getting their spelling, and expanding them to
2086	// wide strings as appropriate.
2087	ResultPtr = &ResultBuf [`0`]; // Next byte to fill in.
2088
2089	Pascal = false;
2090
2091	SourceLocation UDSuffixTokLoc;
2092
2093	for (unsigned i = `0`, e = StringToks.size(); i != e; ++i) {
2094	const char *ThisTokBuf = &TokenBuf [`0`];
2095	// Get the spelling of the token, which eliminates trigraphs, etc. We know
2096	// that ThisTokBuf points to a buffer that is big enough for the whole token
2097	// and 'spelled' tokens can only shrink.
2098	bool StringInvalid = false;
2099	unsigned ThisTokLen =
2100	Lexer::getSpelling(Tok: StringToks [i], Buffer&: ThisTokBuf, SourceMgr: SM, LangOpts: Features,
2101	Invalid: &StringInvalid);
2102	if (StringInvalid)
2103	return DiagnoseLexingError(Loc: StringToks [i].getLocation());
2104
2105	const char *ThisTokBegin = ThisTokBuf;
2106	const char *ThisTokEnd = ThisTokBuf+ThisTokLen;
2107
2108	// Remove an optional ud-suffix.
2109	if (ThisTokEnd[-`1`] != `'"'`) {
2110	const char *UDSuffixEnd = ThisTokEnd;
2111	do {
2112	--ThisTokEnd;
2113	} while (ThisTokEnd[-`1`] != `'"'`);
2114
2115	StringRef UDSuffix(ThisTokEnd, UDSuffixEnd - ThisTokEnd);
2116
2117	if (UDSuffixBuf.empty()) {
2118	if (StringToks [i].hasUCN())
2119	expandUCNs(Buf&: UDSuffixBuf, Input: UDSuffix);
2120	else
2121	UDSuffixBuf.assign(RHS: UDSuffix);
2122	UDSuffixToken = i;
2123	UDSuffixOffset = ThisTokEnd - ThisTokBuf;
2124	UDSuffixTokLoc = StringToks [i].getLocation();
2125	} else {
2126	SmallString<`32`> ExpandedUDSuffix;
2127	if (StringToks [i].hasUCN()) {
2128	expandUCNs(Buf&: ExpandedUDSuffix, Input: UDSuffix);
2129	UDSuffix = ExpandedUDSuffix;
2130	}
2131
2132	// C++11 [lex.ext]p8: At the end of phase 6, if a string literal is the
2133	// result of a concatenation involving at least one user-defined-string-
2134	// literal, all the participating user-defined-string-literals shall
2135	// have the same ud-suffix.
2136	bool UnevaluatedStringHasUDL = isUnevaluated() && !UDSuffix.empty();
2137	if (UDSuffixBuf != UDSuffix \|\| UnevaluatedStringHasUDL) {
2138	if (Diags) {
2139	SourceLocation TokLoc = StringToks [i].getLocation();
2140	if (UnevaluatedStringHasUDL) {
2141	Diags->Report(Loc: TokLoc, DiagID: diag::err_unevaluated_string_udl)
2142	<< SourceRange (TokLoc, TokLoc);
2143	} else {
2144	Diags->Report(Loc: TokLoc, DiagID: diag::err_string_concat_mixed_suffix)
2145	<< UDSuffixBuf << UDSuffix
2146	<< SourceRange (UDSuffixTokLoc, UDSuffixTokLoc);
2147	}
2148	}
2149	hadError = true;
2150	}
2151	}
2152	}
2153
2154	// Strip the end quote.
2155	--ThisTokEnd;
2156
2157	// TODO: Input character set mapping support.
2158
2159	// Skip marker for wide or unicode strings.
2160	if (ThisTokBuf[`0`] == `'L'` \|\| ThisTokBuf[`0`] == `'u'` \|\| ThisTokBuf[`0`] == `'U'`) {
2161	++ThisTokBuf;
2162	// Skip 8 of u8 marker for utf8 strings.
2163	if (ThisTokBuf[`0`] == `'8'`)
2164	++ThisTokBuf;
2165	}
2166
2167	// Check for raw string
2168	if (ThisTokBuf[`0`] == `'R'`) {
2169	if (ThisTokBuf[`1`] != `'"'`) {
2170	// The file may have come from PCH and then changed after loading the
2171	// PCH; Fail gracefully.
2172	return DiagnoseLexingError(Loc: StringToks [i].getLocation());
2173	}
2174	ThisTokBuf += `2`; // skip R"
2175
2176	// C++11 [lex.string]p2: A `d-char-sequence` shall consist of at most 16
2177	// characters.
2178	constexpr unsigned MaxRawStrDelimLen = `16`;
2179
2180	const char *Prefix = ThisTokBuf;
2181	while (static_cast<unsigned>(ThisTokBuf - Prefix) < MaxRawStrDelimLen &&
2182	ThisTokBuf[`0`] != `'('`)
2183	++ThisTokBuf;
2184	if (ThisTokBuf[`0`] != `'('`)
2185	return DiagnoseLexingError(Loc: StringToks [i].getLocation());
2186	++ThisTokBuf; // skip '('
2187
2188	// Remove same number of characters from the end
2189	ThisTokEnd -= ThisTokBuf - Prefix;
2190	if (ThisTokEnd < ThisTokBuf)
2191	return DiagnoseLexingError(Loc: StringToks [i].getLocation());
2192
2193	// C++14 [lex.string]p4: A source-file new-line in a raw string literal
2194	// results in a new-line in the resulting execution string-literal.
2195	StringRef RemainingTokenSpan(ThisTokBuf, ThisTokEnd - ThisTokBuf);
2196	while (!RemainingTokenSpan.empty()) {
2197	// Split the string literal on \r\n boundaries.
2198	size_t CRLFPos = RemainingTokenSpan.find(Str: "\r\n");
2199	StringRef BeforeCRLF = RemainingTokenSpan.substr(Start: `0`, N: CRLFPos);
2200	StringRef AfterCRLF = RemainingTokenSpan.substr(Start: CRLFPos);
2201
2202	// Copy everything before the \r\n sequence into the string literal.
2203	if (CopyStringFragment(Tok: StringToks [i], TokBegin: ThisTokBegin, Fragment: BeforeCRLF))
2204	hadError = true;
2205
2206	// Point into the \n inside the \r\n sequence and operate on the
2207	// remaining portion of the literal.
2208	RemainingTokenSpan = AfterCRLF.substr(Start: `1`);
2209	}
2210	} else {
2211	if (ThisTokBuf[`0`] != `'"'`) {
2212	// The file may have come from PCH and then changed after loading the
2213	// PCH; Fail gracefully.
2214	return DiagnoseLexingError(Loc: StringToks [i].getLocation());
2215	}
2216	++ThisTokBuf; // skip "
2217
2218	// Check if this is a pascal string
2219	if (!isUnevaluated() && Features.PascalStrings &&
2220	ThisTokBuf + `1` != ThisTokEnd && ThisTokBuf[`0`] == `'\\'` &&
2221	ThisTokBuf[`1`] == `'p'`) {
2222
2223	// If the \p sequence is found in the first token, we have a pascal string
2224	// Otherwise, if we already have a pascal string, ignore the first \p
2225	if (i == `0`) {
2226	++ThisTokBuf;
2227	Pascal = true;
2228	} else if (Pascal)
2229	ThisTokBuf += `2`;
2230	}
2231
2232	while (ThisTokBuf != ThisTokEnd) {
2233	// Is this a span of non-escape characters?
2234	if (ThisTokBuf[`0`] != `'\\'`) {
2235	const char *InStart = ThisTokBuf;
2236	do {
2237	++ThisTokBuf;
2238	} while (ThisTokBuf != ThisTokEnd && ThisTokBuf[`0`] != `'\\'`);
2239
2240	// Copy the character span over.
2241	if (CopyStringFragment(Tok: StringToks [i], TokBegin: ThisTokBegin,
2242	Fragment: StringRef(InStart, ThisTokBuf - InStart)))
2243	hadError = true;
2244	continue;
2245	}
2246	// Is this a Universal Character Name escape?
2247	if (ThisTokBuf[`1`] == `'u'` \|\| ThisTokBuf[`1`] == `'U'` \|\|
2248	ThisTokBuf[`1`] == `'N'`) {
2249	EncodeUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd,
2250	ResultBuf&: ResultPtr, HadError&: hadError,
2251	Loc: FullSourceLoc (StringToks [i].getLocation(), SM),
2252	CharByteWidth, Diags, Features);
2253	continue;
2254	}
2255	// Otherwise, this is a non-UCN escape character. Process it.
2256	unsigned ResultChar =
2257	ProcessCharEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, HadError&: hadError,
2258	Loc: FullSourceLoc (StringToks [i].getLocation(), SM),
2259	CharWidth: CharByteWidth * `8`, Diags, Features, EvalMethod);
2260
2261	if (CharByteWidth == `4`) {
2262	// FIXME: Make the type of the result buffer correct instead of
2263	// using reinterpret_cast.
2264	llvm::UTF32 ResultWidePtr = reinterpret_cast<llvm::UTF32>(ResultPtr);
2265	*ResultWidePtr = ResultChar;
2266	ResultPtr += `4`;
2267	} else if (CharByteWidth == `2`) {
2268	// FIXME: Make the type of the result buffer correct instead of
2269	// using reinterpret_cast.
2270	llvm::UTF16 ResultWidePtr = reinterpret_cast<llvm::UTF16>(ResultPtr);
2271	*ResultWidePtr = ResultChar & `0xFFFF`;
2272	ResultPtr += `2`;
2273	} else {
2274	assert(CharByteWidth == `1` && "Unexpected char width");
2275	*ResultPtr++ = ResultChar & `0xFF`;
2276	}
2277	}
2278	}
2279	}
2280
2281	assert((!Pascal \|\| !isUnevaluated()) &&
2282	"Pascal string in unevaluated context");
2283	if (Pascal) {
2284	if (CharByteWidth == `4`) {
2285	// FIXME: Make the type of the result buffer correct instead of
2286	// using reinterpret_cast.
2287	llvm::UTF32 ResultWidePtr = reinterpret_cast<llvm::UTF32>(ResultBuf.data());
2288	ResultWidePtr[`0`] = GetNumStringChars() - `1`;
2289	} else if (CharByteWidth == `2`) {
2290	// FIXME: Make the type of the result buffer correct instead of
2291	// using reinterpret_cast.
2292	llvm::UTF16 ResultWidePtr = reinterpret_cast<llvm::UTF16>(ResultBuf.data());
2293	ResultWidePtr[`0`] = GetNumStringChars() - `1`;
2294	} else {
2295	assert(CharByteWidth == `1` && "Unexpected char width");
2296	ResultBuf [`0`] = GetNumStringChars() - `1`;
2297	}
2298
2299	// Verify that pascal strings aren't too large.
2300	if (GetStringLength() > `256`) {
2301	if (Diags)
2302	Diags->Report(Loc: StringToks.front().getLocation(),
2303	DiagID: diag::err_pascal_string_too_long)
2304	<< SourceRange (StringToks.front().getLocation(),
2305	StringToks.back().getLocation());
2306	hadError = true;
2307	return;
2308	}
2309	} else if (Diags) {
2310	// Complain if this string literal has too many characters.
2311	unsigned MaxChars = Features.CPlusPlus? `65536` : Features.C99 ? `4095` : `509`;
2312
2313	if (GetNumStringChars() > MaxChars)
2314	Diags->Report(Loc: StringToks.front().getLocation(),
2315	DiagID: diag::ext_string_too_long)
2316	<< GetNumStringChars() << MaxChars
2317	<< (Features.CPlusPlus ? `2` : Features.C99 ? `1` : `0`)
2318	<< SourceRange (StringToks.front().getLocation(),
2319	StringToks.back().getLocation());
2320	}
2321	}
2322
2323	static const char resyncUTF8(const* char Err, const* char *End) {
2324	if (Err == End)
2325	return End;
2326	End = Err + std::min<unsigned>(a: llvm::getNumBytesForUTF8(firstByte: *Err), b: End-Err);
2327	while (++Err != End && (*Err & `0xC0`) == `0x80`)
2328	;
2329	return Err;
2330	}
2331
2332	/// This function copies from Fragment, which is a sequence of bytes
2333	/// within Tok's contents (which begin at TokBegin) into ResultPtr.
2334	/// Performs widening for multi-byte characters.
2335	bool StringLiteralParser::CopyStringFragment(const Token &Tok,
2336	const char *TokBegin,
2337	StringRef Fragment) {
2338	const llvm::UTF8 *ErrorPtrTmp;
2339	if (ConvertUTF8toWide(WideCharWidth: CharByteWidth, Source: Fragment, ResultPtr, ErrorPtr&: ErrorPtrTmp))
2340	return false;
2341
2342	// If we see bad encoding for unprefixed string literals, warn and
2343	// simply copy the byte values, for compatibility with gcc and older
2344	// versions of clang.
2345	bool NoErrorOnBadEncoding = isOrdinary();
2346	if (NoErrorOnBadEncoding) {
2347	memcpy(dest: ResultPtr, src: Fragment.data(), n: Fragment.size());
2348	ResultPtr += Fragment.size();
2349	}
2350
2351	if (Diags) {
2352	const char ErrorPtr = reinterpret_cast<const* char *>(ErrorPtrTmp);
2353
2354	FullSourceLoc SourceLoc(Tok.getLocation(), SM);
2355	const DiagnosticBuilder &Builder =
2356	Diag(Diags, Features, TokLoc: SourceLoc, TokBegin,
2357	TokRangeBegin: ErrorPtr, TokRangeEnd: resyncUTF8(Err: ErrorPtr, End: Fragment.end()),
2358	DiagID: NoErrorOnBadEncoding ? diag::warn_bad_string_encoding
2359	: diag::err_bad_string_encoding);
2360
2361	const char *NextStart = resyncUTF8(Err: ErrorPtr, End: Fragment.end());
2362	StringRef NextFragment(NextStart, Fragment.end()-NextStart);
2363
2364	// Decode into a dummy buffer.
2365	SmallString<`512`> Dummy;
2366	Dummy.reserve(N: Fragment.size() * CharByteWidth);
2367	char *Ptr = Dummy.data();
2368
2369	while (!ConvertUTF8toWide(WideCharWidth: CharByteWidth, Source: NextFragment, ResultPtr&: Ptr, ErrorPtr&: ErrorPtrTmp)) {
2370	const char ErrorPtr = reinterpret_cast<const* char *>(ErrorPtrTmp);
2371	NextStart = resyncUTF8(Err: ErrorPtr, End: Fragment.end());
2372	Builder << MakeCharSourceRange(Features, TokLoc: SourceLoc, TokBegin,
2373	TokRangeBegin: ErrorPtr, TokRangeEnd: NextStart);
2374	NextFragment = StringRef(NextStart, Fragment.end()-NextStart);
2375	}
2376	}
2377	return !NoErrorOnBadEncoding;
2378	}
2379
2380	void StringLiteralParser::DiagnoseLexingError(SourceLocation Loc) {
2381	hadError = true;
2382	if (Diags)
2383	Diags->Report(Loc, DiagID: diag::err_lexing_string);
2384	}
2385
2386	/// getOffsetOfStringByte - This function returns the offset of the
2387	/// specified byte of the string data represented by Token. This handles
2388	/// advancing over escape sequences in the string.
2389	unsigned StringLiteralParser::getOffsetOfStringByte(const Token &Tok,
2390	unsigned ByteNo) const {
2391	// Get the spelling of the token.
2392	SmallString<`32`> SpellingBuffer;
2393	SpellingBuffer.resize(N: Tok.getLength());
2394
2395	bool StringInvalid = false;
2396	const char *SpellingPtr = &SpellingBuffer [`0`];
2397	unsigned TokLen = Lexer::getSpelling(Tok, Buffer&: SpellingPtr, SourceMgr: SM, LangOpts: Features,
2398	Invalid: &StringInvalid);
2399	if (StringInvalid)
2400	return `0`;
2401
2402	const char *SpellingStart = SpellingPtr;
2403	const char *SpellingEnd = SpellingPtr+TokLen;
2404
2405	// Handle UTF-8 strings just like narrow strings.
2406	if (SpellingPtr[`0`] == `'u'` && SpellingPtr[`1`] == `'8'`)
2407	SpellingPtr += `2`;
2408
2409	assert(SpellingPtr[`0`] != `'L'` && SpellingPtr[`0`] != `'u'` &&
2410	SpellingPtr[`0`] != `'U'` && "Doesn't handle wide or utf strings yet");
2411
2412	// For raw string literals, this is easy.
2413	if (SpellingPtr[`0`] == `'R'`) {
2414	assert(SpellingPtr[`1`] == `'"'` && "Should be a raw string literal!");
2415	// Skip 'R"'.
2416	SpellingPtr += `2`;
2417	while (*SpellingPtr != `'('`) {
2418	++SpellingPtr;
2419	assert(SpellingPtr < SpellingEnd && "Missing ( for raw string literal");
2420	}
2421	// Skip '('.
2422	++SpellingPtr;
2423	return SpellingPtr - SpellingStart + ByteNo;
2424	}
2425
2426	// Skip over the leading quote
2427	assert(SpellingPtr[`0`] == `'"'` && "Should be a string literal!");
2428	++SpellingPtr;
2429
2430	// Skip over bytes until we find the offset we're looking for.
2431	while (ByteNo) {
2432	assert(SpellingPtr < SpellingEnd && "Didn't find byte offset!");
2433
2434	// Step over non-escapes simply.
2435	if (*SpellingPtr != `'\\'`) {
2436	++SpellingPtr;
2437	--ByteNo;
2438	continue;
2439	}
2440
2441	// Otherwise, this is an escape character. Advance over it.
2442	bool HadError = false;
2443	if (SpellingPtr[`1`] == `'u'` \|\| SpellingPtr[`1`] == `'U'` \|\|
2444	SpellingPtr[`1`] == `'N'`) {
2445	const char *EscapePtr = SpellingPtr;
2446	unsigned Len = MeasureUCNEscape(ThisTokBegin: SpellingStart, ThisTokBuf&: SpellingPtr, ThisTokEnd: SpellingEnd,
2447	CharByteWidth: `1`, Features, HadError);
2448	if (Len > ByteNo) {
2449	// ByteNo is somewhere within the escape sequence.
2450	SpellingPtr = EscapePtr;
2451	break;
2452	}
2453	ByteNo -= Len;
2454	} else {
2455	ProcessCharEscape(ThisTokBegin: SpellingStart, ThisTokBuf&: SpellingPtr, ThisTokEnd: SpellingEnd, HadError,
2456	Loc: FullSourceLoc (Tok.getLocation(), SM), CharWidth: CharByteWidth * `8`,
2457	Diags, Features, EvalMethod: StringLiteralEvalMethod::Evaluated);
2458	--ByteNo;
2459	}
2460	assert(!HadError && "This method isn't valid on erroneous strings");
2461	}
2462
2463	return SpellingPtr-SpellingStart;
2464	}
2465
2466	/// Determine whether a suffix is a valid ud-suffix. We avoid treating reserved
2467	/// suffixes as ud-suffixes, because the diagnostic experience is better if we
2468	/// treat it as an invalid suffix.
2469	bool StringLiteralParser::isValidUDSuffix(const LangOptions &LangOpts,
2470	StringRef Suffix) {
2471	return NumericLiteralParser::isValidUDSuffix(LangOpts, Suffix) \|\|
2472	Suffix == "sv";
2473	}
2474

Browse the source code of llvm_projects/clang/lib/Lex/LiteralSupport.cpp