LiteralSupport.cpp source code [llvm_projects/clang/lib/Lex/LiteralSupport.cpp]

1	//===--- LiteralSupport.cpp - Code to parse and process literals ----------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	// This file implements the NumericLiteralParser, CharLiteralParser, and
10	// StringLiteralParser interfaces.
11	//
12	//===----------------------------------------------------------------------===//
13
14	#include "clang/Lex/LiteralSupport.h"
15	#include "clang/Basic/CharInfo.h"
16	#include "clang/Basic/LangOptions.h"
17	#include "clang/Basic/SourceLocation.h"
18	#include "clang/Basic/TargetInfo.h"
19	#include "clang/Lex/LexDiagnostic.h"
20	#include "clang/Lex/Lexer.h"
21	#include "clang/Lex/Preprocessor.h"
22	#include "clang/Lex/Token.h"
23	#include "llvm/ADT/APInt.h"
24	#include "llvm/ADT/SmallVector.h"
25	#include "llvm/ADT/StringExtras.h"
26	#include "llvm/ADT/StringSwitch.h"
27	#include "llvm/Support/ConvertUTF.h"
28	#include "llvm/Support/Error.h"
29	#include "llvm/Support/ErrorHandling.h"
30	#include "llvm/Support/Unicode.h"
31	#include <algorithm>
32	#include <cassert>
33	#include <cstddef>
34	#include <cstdint>
35	#include <cstring>
36	#include <string>
37
38	using namespace clang;
39
40	static unsigned getCharWidth(tok::TokenKind kind, const TargetInfo &Target) {
41	switch (kind) {
42	default: llvm_unreachable("Unknown token type!");
43	case tok::char_constant:
44	case tok::string_literal:
45	case tok::utf8_char_constant:
46	case tok::utf8_string_literal:
47	return Target.getCharWidth();
48	case tok::wide_char_constant:
49	case tok::wide_string_literal:
50	return Target.getWCharWidth();
51	case tok::utf16_char_constant:
52	case tok::utf16_string_literal:
53	return Target.getChar16Width();
54	case tok::utf32_char_constant:
55	case tok::utf32_string_literal:
56	return Target.getChar32Width();
57	}
58	}
59
60	static unsigned getEncodingPrefixLen(tok::TokenKind kind) {
61	switch (kind) {
62	default:
63	llvm_unreachable("Unknown token type!");
64	case tok::char_constant:
65	case tok::string_literal:
66	return `0`;
67	case tok::utf8_char_constant:
68	case tok::utf8_string_literal:
69	return `2`;
70	case tok::wide_char_constant:
71	case tok::wide_string_literal:
72	case tok::utf16_char_constant:
73	case tok::utf16_string_literal:
74	case tok::utf32_char_constant:
75	case tok::utf32_string_literal:
76	return `1`;
77	}
78	}
79
80	static CharSourceRange MakeCharSourceRange(const LangOptions &Features,
81	FullSourceLoc TokLoc,
82	const char *TokBegin,
83	const char *TokRangeBegin,
84	const char *TokRangeEnd) {
85	SourceLocation Begin =
86	Lexer::AdvanceToTokenCharacter(TokStart: TokLoc, Characters: TokRangeBegin - TokBegin,
87	SM: TokLoc.getManager(), LangOpts: Features);
88	SourceLocation End =
89	Lexer::AdvanceToTokenCharacter(TokStart: Begin, Characters: TokRangeEnd - TokRangeBegin,
90	SM: TokLoc.getManager(), LangOpts: Features);
91	return CharSourceRange::getCharRange(B: Begin, E: End);
92	}
93
94	/// Produce a diagnostic highlighting some portion of a literal.
95	///
96	/// Emits the diagnostic \p DiagID, highlighting the range of characters from
97	/// \p TokRangeBegin (inclusive) to \p TokRangeEnd (exclusive), which must be
98	/// a substring of a spelling buffer for the token beginning at \p TokBegin.
99	static DiagnosticBuilder Diag(DiagnosticsEngine *Diags,
100	const LangOptions &Features, FullSourceLoc TokLoc,
101	const char TokBegin, const* char *TokRangeBegin,
102	const char TokRangeEnd, unsigned* DiagID) {
103	SourceLocation Begin =
104	Lexer::AdvanceToTokenCharacter(TokStart: TokLoc, Characters: TokRangeBegin - TokBegin,
105	SM: TokLoc.getManager(), LangOpts: Features);
106	return Diags->Report(Loc: Begin, DiagID) <<
107	MakeCharSourceRange(Features, TokLoc, TokBegin, TokRangeBegin, TokRangeEnd);
108	}
109
/// Return true if \p Escape (the character following the backslash) names one
/// of the simple escape sequences accepted in an unevaluated string literal:
/// \' \" \? \\ \a \b \f \n \r \t \v. Every other character yields false.
static bool IsEscapeValidInUnevaluatedStringLiteral(char Escape) {
  switch (Escape) {
  case '\'':
  case '"':
  case '?':
  case '\\':
  case 'a':
  case 'b':
  case 'f':
  case 'n':
  case 'r':
  case 't':
  case 'v':
    return true;
  }
  return false;
}
127
128	/// ProcessCharEscape - Parse a standard C escape sequence, which can occur in
129	/// either a character or a string literal.
130	static unsigned ProcessCharEscape(const char *ThisTokBegin,
131	const char *&ThisTokBuf,
132	const char ThisTokEnd, bool* &HadError,
133	FullSourceLoc Loc, unsigned CharWidth,
134	DiagnosticsEngine *Diags,
135	const LangOptions &Features,
136	StringLiteralEvalMethod EvalMethod) {
137	const char *EscapeBegin = ThisTokBuf;
138	bool Delimited = false;
139	bool EndDelimiterFound = false;
140
141	// Skip the '\' char.
142	++ThisTokBuf;
143
144	// We know that this character can't be off the end of the buffer, because
145	// that would have been \", which would not have been the end of string.
146	unsigned ResultChar = *ThisTokBuf++;
147	char Escape = ResultChar;
148	switch (ResultChar) {
149	// These map to themselves.
150	case `'\\'`: case `'\''`: case `'"'`: case `'?'`: break;
151
152	// These have fixed mappings.
153	case `'a'`:
154	// TODO: K&R: the meaning of '\\a' is different in traditional C
155	ResultChar = `7`;
156	break;
157	case `'b'`:
158	ResultChar = `8`;
159	break;
160	case `'e'`:
161	if (Diags)
162	Diag(Diags, Features, TokLoc: Loc, TokBegin: ThisTokBegin, TokRangeBegin: EscapeBegin, TokRangeEnd: ThisTokBuf,
163	DiagID: diag::ext_nonstandard_escape) << "e";
164	ResultChar = `27`;
165	break;
166	case `'E'`:
167	if (Diags)
168	Diag(Diags, Features, TokLoc: Loc, TokBegin: ThisTokBegin, TokRangeBegin: EscapeBegin, TokRangeEnd: ThisTokBuf,
169	DiagID: diag::ext_nonstandard_escape) << "E";
170	ResultChar = `27`;
171	break;
172	case `'f'`:
173	ResultChar = `12`;
174	break;
175	case `'n'`:
176	ResultChar = `10`;
177	break;
178	case `'r'`:
179	ResultChar = `13`;
180	break;
181	case `'t'`:
182	ResultChar = `9`;
183	break;
184	case `'v'`:
185	ResultChar = `11`;
186	break;
187	case `'x'`: { // Hex escape.
188	ResultChar = `0`;
189	if (ThisTokBuf != ThisTokEnd && *ThisTokBuf == `'{'`) {
190	Delimited = true;
191	ThisTokBuf++;
192	if (*ThisTokBuf == `'}'`) {
193	Diag(Diags, Features, TokLoc: Loc, TokBegin: ThisTokBegin, TokRangeBegin: EscapeBegin, TokRangeEnd: ThisTokBuf,
194	DiagID: diag::err_delimited_escape_empty);
195	return ResultChar;
196	}
197	} else if (ThisTokBuf == ThisTokEnd \|\| !isHexDigit(c: *ThisTokBuf)) {
198	if (Diags)
199	Diag(Diags, Features, TokLoc: Loc, TokBegin: ThisTokBegin, TokRangeBegin: EscapeBegin, TokRangeEnd: ThisTokBuf,
200	DiagID: diag::err_hex_escape_no_digits) << "x";
201	return ResultChar;
202	}
203
204	// Hex escapes are a maximal series of hex digits.
205	bool Overflow = false;
206	for (; ThisTokBuf != ThisTokEnd; ++ThisTokBuf) {
207	if (Delimited && *ThisTokBuf == `'}'`) {
208	ThisTokBuf++;
209	EndDelimiterFound = true;
210	break;
211	}
212	int CharVal = llvm::hexDigitValue(C: *ThisTokBuf);
213	if (CharVal == -`1`) {
214	// Non delimited hex escape sequences stop at the first non-hex digit.
215	if (!Delimited)
216	break;
217	HadError = true;
218	if (Diags)
219	Diag(Diags, Features, TokLoc: Loc, TokBegin: ThisTokBegin, TokRangeBegin: EscapeBegin, TokRangeEnd: ThisTokBuf,
220	DiagID: diag::err_delimited_escape_invalid)
221	<< StringRef (ThisTokBuf, `1`);
222	continue;
223	}
224	// About to shift out a digit?
225	if (ResultChar & `0xF0000000`)
226	Overflow = true;
227	ResultChar <<= `4`;
228	ResultChar \|= CharVal;
229	}
230	// See if any bits will be truncated when evaluated as a character.
231	if (CharWidth != `32` && (ResultChar >> CharWidth) != `0`) {
232	Overflow = true;
233	ResultChar &= ~`0U` >> (`32`-CharWidth);
234	}
235
236	// Check for overflow.
237	if (!HadError && Overflow) { // Too many digits to fit in
238	HadError = true;
239	if (Diags)
240	Diag(Diags, Features, TokLoc: Loc, TokBegin: ThisTokBegin, TokRangeBegin: EscapeBegin, TokRangeEnd: ThisTokBuf,
241	DiagID: diag::err_escape_too_large)
242	<< `0`;
243	}
244	break;
245	}
246	case `'0'`: case `'1'`: case `'2'`: case `'3'`:
247	case `'4'`: case `'5'`: case `'6'`: case `'7'`: {
248	// Octal escapes.
249	--ThisTokBuf;
250	ResultChar = `0`;
251
252	// Octal escapes are a series of octal digits with maximum length 3.
253	// "\0123" is a two digit sequence equal to "\012" "3".
254	unsigned NumDigits = `0`;
255	do {
256	ResultChar <<= `3`;
257	ResultChar \|= *ThisTokBuf++ - `'0'`;
258	++NumDigits;
259	} while (ThisTokBuf != ThisTokEnd && NumDigits < `3` &&
260	ThisTokBuf[`0`] >= `'0'` && ThisTokBuf[`0`] <= `'7'`);
261
262	// Check for overflow. Reject '\777', but not L'\777'.
263	if (CharWidth != `32` && (ResultChar >> CharWidth) != `0`) {
264	if (Diags)
265	Diag(Diags, Features, TokLoc: Loc, TokBegin: ThisTokBegin, TokRangeBegin: EscapeBegin, TokRangeEnd: ThisTokBuf,
266	DiagID: diag::err_escape_too_large) << `1`;
267	ResultChar &= ~`0U` >> (`32`-CharWidth);
268	}
269	break;
270	}
271	case `'o'`: {
272	bool Overflow = false;
273	if (ThisTokBuf == ThisTokEnd \|\| *ThisTokBuf != `'{'`) {
274	HadError = true;
275	if (Diags)
276	Diag(Diags, Features, TokLoc: Loc, TokBegin: ThisTokBegin, TokRangeBegin: EscapeBegin, TokRangeEnd: ThisTokBuf,
277	DiagID: diag::err_delimited_escape_missing_brace)
278	<< "o";
279
280	break;
281	}
282	ResultChar = `0`;
283	Delimited = true;
284	++ThisTokBuf;
285	if (*ThisTokBuf == `'}'`) {
286	Diag(Diags, Features, TokLoc: Loc, TokBegin: ThisTokBegin, TokRangeBegin: EscapeBegin, TokRangeEnd: ThisTokBuf,
287	DiagID: diag::err_delimited_escape_empty);
288	return ResultChar;
289	}
290
291	while (ThisTokBuf != ThisTokEnd) {
292	if (*ThisTokBuf == `'}'`) {
293	EndDelimiterFound = true;
294	ThisTokBuf++;
295	break;
296	}
297	if (ThisTokBuf < `'0'` \|\| ThisTokBuf > `'7'`) {
298	HadError = true;
299	if (Diags)
300	Diag(Diags, Features, TokLoc: Loc, TokBegin: ThisTokBegin, TokRangeBegin: EscapeBegin, TokRangeEnd: ThisTokBuf,
301	DiagID: diag::err_delimited_escape_invalid)
302	<< StringRef (ThisTokBuf, `1`);
303	ThisTokBuf++;
304	continue;
305	}
306	// Check if one of the top three bits is set before shifting them out.
307	if (ResultChar & `0xE0000000`)
308	Overflow = true;
309
310	ResultChar <<= `3`;
311	ResultChar \|= *ThisTokBuf++ - `'0'`;
312	}
313	// Check for overflow. Reject '\777', but not L'\777'.
314	if (!HadError &&
315	(Overflow \|\| (CharWidth != `32` && (ResultChar >> CharWidth) != `0`))) {
316	HadError = true;
317	if (Diags)
318	Diag(Diags, Features, TokLoc: Loc, TokBegin: ThisTokBegin, TokRangeBegin: EscapeBegin, TokRangeEnd: ThisTokBuf,
319	DiagID: diag::err_escape_too_large)
320	<< `1`;
321	ResultChar &= ~`0U` >> (`32` - CharWidth);
322	}
323	break;
324	}
325	// Otherwise, these are not valid escapes.
326	case `'('`: case `'{'`: case `'['`: case `'%'`:
327	// GCC accepts these as extensions. We warn about them as such though.
328	if (Diags)
329	Diag(Diags, Features, TokLoc: Loc, TokBegin: ThisTokBegin, TokRangeBegin: EscapeBegin, TokRangeEnd: ThisTokBuf,
330	DiagID: diag::ext_nonstandard_escape)
331	<< std::string (`1`, ResultChar);
332	break;
333	default:
334	if (!Diags)
335	break;
336
337	if (isPrintable(c: ResultChar))
338	Diag(Diags, Features, TokLoc: Loc, TokBegin: ThisTokBegin, TokRangeBegin: EscapeBegin, TokRangeEnd: ThisTokBuf,
339	DiagID: diag::ext_unknown_escape)
340	<< std::string (`1`, ResultChar);
341	else
342	Diag(Diags, Features, TokLoc: Loc, TokBegin: ThisTokBegin, TokRangeBegin: EscapeBegin, TokRangeEnd: ThisTokBuf,
343	DiagID: diag::ext_unknown_escape)
344	<< "x" + llvm::utohexstr(X: ResultChar);
345	break;
346	}
347
348	if (Delimited && Diags) {
349	if (!EndDelimiterFound)
350	Diag(Diags, Features, TokLoc: Loc, TokBegin: ThisTokBegin, TokRangeBegin: EscapeBegin, TokRangeEnd: ThisTokBuf,
351	DiagID: diag::err_expected)
352	<< tok::r_brace;
353	else if (!HadError) {
354	Diag(Diags, Features, TokLoc: Loc, TokBegin: ThisTokBegin, TokRangeBegin: EscapeBegin, TokRangeEnd: ThisTokBuf,
355	DiagID: Features.CPlusPlus23 ? diag::warn_cxx23_delimited_escape_sequence
356	: diag::ext_delimited_escape_sequence)
357	<< /delimited/ `0` << (Features.CPlusPlus ? `1` : `0`);
358	}
359	}
360
361	if (EvalMethod == StringLiteralEvalMethod::Unevaluated &&
362	!IsEscapeValidInUnevaluatedStringLiteral(Escape)) {
363	Diag(Diags, Features, TokLoc: Loc, TokBegin: ThisTokBegin, TokRangeBegin: EscapeBegin, TokRangeEnd: ThisTokBuf,
364	DiagID: diag::err_unevaluated_string_invalid_escape_sequence)
365	<< StringRef (EscapeBegin, ThisTokBuf - EscapeBegin);
366	HadError = true;
367	}
368
369	return ResultChar;
370	}
371
372	static void appendCodePoint(unsigned Codepoint,
373	llvm::SmallVectorImpl<char> &Str) {
374	char ResultBuf[`4`];
375	char *ResultPtr = ResultBuf;
376	if (llvm::ConvertCodePointToUTF8(Source: Codepoint, ResultPtr))
377	Str.append(in_start: ResultBuf, in_end: ResultPtr);
378	}
379
380	void clang::expandUCNs(SmallVectorImpl<char> &Buf, StringRef Input) {
381	for (StringRef::iterator I = Input.begin(), E = Input.end(); I != E; ++I) {
382	if (*I != `'\\'`) {
383	Buf.push_back(Elt: *I);
384	continue;
385	}
386
387	++I;
388	char Kind = *I;
389	++I;
390
391	assert(Kind == `'u'` \|\| Kind == `'U'` \|\| Kind == `'N'`);
392	uint32_t CodePoint = `0`;
393
394	if (Kind == `'u'` && *I == `'{'`) {
395	for (++I; *I != `'}'`; ++I) {
396	unsigned Value = llvm::hexDigitValue(C: *I);
397	assert(Value != -`1U`);
398	CodePoint <<= `4`;
399	CodePoint += Value;
400	}
401	appendCodePoint(Codepoint: CodePoint, Str&: Buf);
402	continue;
403	}
404
405	if (Kind == `'N'`) {
406	assert(*I == `'{'`);
407	++I;
408	auto Delim = std::find(first: I, last: Input.end(), val: `'}'`);
409	assert(Delim != Input.end());
410	StringRef Name(I, std::distance(first: I, last: Delim));
411	std::optional<llvm::sys::unicode::LooseMatchingResult> Res =
412	llvm::sys::unicode::nameToCodepointLooseMatching(Name);
413	assert(Res && "could not find a codepoint that was previously found");
414	CodePoint = Res ->CodePoint;
415	assert(CodePoint != `0xFFFFFFFF`);
416	appendCodePoint(Codepoint: CodePoint, Str&: Buf);
417	I = Delim;
418	continue;
419	}
420
421	unsigned NumHexDigits;
422	if (Kind == `'u'`)
423	NumHexDigits = `4`;
424	else
425	NumHexDigits = `8`;
426
427	assert(I + NumHexDigits <= E);
428
429	for (; NumHexDigits != `0`; ++I, --NumHexDigits) {
430	unsigned Value = llvm::hexDigitValue(C: *I);
431	assert(Value != -`1U`);
432
433	CodePoint <<= `4`;
434	CodePoint += Value;
435	}
436
437	appendCodePoint(Codepoint: CodePoint, Str&: Buf);
438	--I;
439	}
440	}
441
442	bool clang::isFunctionLocalStringLiteralMacro(tok::TokenKind K,
443	const LangOptions &LO) {
444	return LO.MicrosoftExt &&
445	(K == tok::kw___FUNCTION__ \|\| K == tok::kw_L__FUNCTION__ \|\|
446	K == tok::kw___FUNCSIG__ \|\| K == tok::kw_L__FUNCSIG__ \|\|
447	K == tok::kw___FUNCDNAME__);
448	}
449
450	bool clang::tokenIsLikeStringLiteral(const Token &Tok, const LangOptions &LO) {
451	return tok::isStringLiteral(K: Tok.getKind()) \|\|
452	isFunctionLocalStringLiteralMacro(K: Tok.getKind(), LO);
453	}
454
455	static bool ProcessNumericUCNEscape(const char *ThisTokBegin,
456	const char *&ThisTokBuf,
457	const char *ThisTokEnd, uint32_t &UcnVal,
458	unsigned short &UcnLen, bool &Delimited,
459	FullSourceLoc Loc, DiagnosticsEngine *Diags,
460	const LangOptions &Features,
461	bool in_char_string_literal = false) {
462	const char *UcnBegin = ThisTokBuf;
463	bool HasError = false;
464	bool EndDelimiterFound = false;
465
466	// Skip the '\u' char's.
467	ThisTokBuf += `2`;
468	Delimited = false;
469	if (UcnBegin[`1`] == `'u'` && in_char_string_literal &&
470	ThisTokBuf != ThisTokEnd && *ThisTokBuf == `'{'`) {
471	Delimited = true;
472	ThisTokBuf++;
473	} else if (ThisTokBuf == ThisTokEnd \|\| !isHexDigit(c: *ThisTokBuf)) {
474	if (Diags)
475	Diag(Diags, Features, TokLoc: Loc, TokBegin: ThisTokBegin, TokRangeBegin: UcnBegin, TokRangeEnd: ThisTokBuf,
476	DiagID: diag::err_hex_escape_no_digits)
477	<< StringRef (&ThisTokBuf[-`1`], `1`);
478	return false;
479	}
480	UcnLen = (ThisTokBuf[-`1`] == `'u'` ? `4` : `8`);
481
482	bool Overflow = false;
483	unsigned short Count = `0`;
484	for (; ThisTokBuf != ThisTokEnd && (Delimited \|\| Count != UcnLen);
485	++ThisTokBuf) {
486	if (Delimited && *ThisTokBuf == `'}'`) {
487	++ThisTokBuf;
488	EndDelimiterFound = true;
489	break;
490	}
491	int CharVal = llvm::hexDigitValue(C: *ThisTokBuf);
492	if (CharVal == -`1`) {
493	HasError = true;
494	if (!Delimited)
495	break;
496	if (Diags) {
497	Diag(Diags, Features, TokLoc: Loc, TokBegin: ThisTokBegin, TokRangeBegin: UcnBegin, TokRangeEnd: ThisTokBuf,
498	DiagID: diag::err_delimited_escape_invalid)
499	<< StringRef (ThisTokBuf, `1`);
500	}
501	Count++;
502	continue;
503	}
504	if (UcnVal & `0xF0000000`) {
505	Overflow = true;
506	continue;
507	}
508	UcnVal <<= `4`;
509	UcnVal \|= CharVal;
510	Count++;
511	}
512
513	if (Overflow) {
514	if (Diags)
515	Diag(Diags, Features, TokLoc: Loc, TokBegin: ThisTokBegin, TokRangeBegin: UcnBegin, TokRangeEnd: ThisTokBuf,
516	DiagID: diag::err_escape_too_large)
517	<< `0`;
518	return false;
519	}
520
521	if (Delimited && !EndDelimiterFound) {
522	if (Diags) {
523	Diag(Diags, Features, TokLoc: Loc, TokBegin: ThisTokBegin, TokRangeBegin: UcnBegin, TokRangeEnd: ThisTokBuf,
524	DiagID: diag::err_expected)
525	<< tok::r_brace;
526	}
527	return false;
528	}
529
530	// If we didn't consume the proper number of digits, there is a problem.
531	if (Count == `0` \|\| (!Delimited && Count != UcnLen)) {
532	if (Diags)
533	Diag(Diags, Features, TokLoc: Loc, TokBegin: ThisTokBegin, TokRangeBegin: UcnBegin, TokRangeEnd: ThisTokBuf,
534	DiagID: Delimited ? diag::err_delimited_escape_empty
535	: diag::err_ucn_escape_incomplete);
536	return false;
537	}
538	return !HasError;
539	}
540
541	static void DiagnoseInvalidUnicodeCharacterName(
542	DiagnosticsEngine Diags, const* LangOptions &Features, FullSourceLoc Loc,
543	const char TokBegin, const* char TokRangeBegin, const* char *TokRangeEnd,
544	llvm::StringRef Name) {
545
546	Diag(Diags, Features, TokLoc: Loc, TokBegin, TokRangeBegin, TokRangeEnd,
547	DiagID: diag::err_invalid_ucn_name)
548	<< Name;
549
550	namespace u = llvm::sys::unicode;
551
552	std::optional<u::LooseMatchingResult> Res =
553	u::nameToCodepointLooseMatching(Name);
554	if (Res) {
555	Diag(Diags, Features, TokLoc: Loc, TokBegin, TokRangeBegin, TokRangeEnd,
556	DiagID: diag::note_invalid_ucn_name_loose_matching)
557	<< FixItHint::CreateReplacement(
558	RemoveRange: MakeCharSourceRange(Features, TokLoc: Loc, TokBegin, TokRangeBegin,
559	TokRangeEnd),
560	Code: Res ->Name);
561	return;
562	}
563
564	unsigned Distance = `0`;
565	SmallVector<u::MatchForCodepointName> Matches =
566	u::nearestMatchesForCodepointName(Pattern: Name, MaxMatchesCount: `5`);
567	assert(!Matches.empty() && "No unicode characters found");
568
569	for (const auto &Match : Matches) {
570	if (Distance == `0`)
571	Distance = Match.Distance;
572	if (std::max(a: Distance, b: Match.Distance) -
573	std::min(a: Distance, b: Match.Distance) >
574	`3`)
575	break;
576	Distance = Match.Distance;
577
578	std::string Str;
579	llvm::UTF32 V = Match.Value;
580	bool Converted =
581	llvm::convertUTF32ToUTF8String(Src: llvm::ArrayRef<llvm::UTF32>(&V, `1`), Out&: Str);
582	(void)Converted;
583	assert(Converted && "Found a match wich is not a unicode character");
584
585	Diag(Diags, Features, TokLoc: Loc, TokBegin, TokRangeBegin, TokRangeEnd,
586	DiagID: diag::note_invalid_ucn_name_candidate)
587	<< Match.Name << llvm::utohexstr(X: Match.Value)
588	<< Str // FIXME: Fix the rendering of non printable characters
589	<< FixItHint::CreateReplacement(
590	RemoveRange: MakeCharSourceRange(Features, TokLoc: Loc, TokBegin, TokRangeBegin,
591	TokRangeEnd),
592	Code: Match.Name);
593	}
594	}
595
596	static bool ProcessNamedUCNEscape(const char *ThisTokBegin,
597	const char *&ThisTokBuf,
598	const char *ThisTokEnd, uint32_t &UcnVal,
599	unsigned short &UcnLen, FullSourceLoc Loc,
600	DiagnosticsEngine *Diags,
601	const LangOptions &Features) {
602	const char *UcnBegin = ThisTokBuf;
603	assert(UcnBegin[`0`] == `'\\'` && UcnBegin[`1`] == `'N'`);
604	ThisTokBuf += `2`;
605	if (ThisTokBuf == ThisTokEnd \|\| *ThisTokBuf != `'{'`) {
606	if (Diags) {
607	Diag(Diags, Features, TokLoc: Loc, TokBegin: ThisTokBegin, TokRangeBegin: UcnBegin, TokRangeEnd: ThisTokBuf,
608	DiagID: diag::err_delimited_escape_missing_brace)
609	<< StringRef (&ThisTokBuf[-`1`], `1`);
610	}
611	return false;
612	}
613	ThisTokBuf++;
614	const char ClosingBrace = std::find_if(first: ThisTokBuf, last: ThisTokEnd, pred: [](char* C) {
615	return C == `'}'` \|\| isVerticalWhitespace(c: C);
616	});
617	bool Incomplete = ClosingBrace == ThisTokEnd;
618	bool Empty = ClosingBrace == ThisTokBuf;
619	if (Incomplete \|\| Empty) {
620	if (Diags) {
621	Diag(Diags, Features, TokLoc: Loc, TokBegin: ThisTokBegin, TokRangeBegin: UcnBegin, TokRangeEnd: ThisTokBuf,
622	DiagID: Incomplete ? diag::err_ucn_escape_incomplete
623	: diag::err_delimited_escape_empty)
624	<< StringRef (&UcnBegin[`1`], `1`);
625	}
626	ThisTokBuf = ClosingBrace == ThisTokEnd ? ClosingBrace : ClosingBrace + `1`;
627	return false;
628	}
629	StringRef Name(ThisTokBuf, ClosingBrace - ThisTokBuf);
630	ThisTokBuf = ClosingBrace + `1`;
631	std::optional<char32_t> Res = llvm::sys::unicode::nameToCodepointStrict(Name);
632	if (!Res) {
633	if (Diags)
634	DiagnoseInvalidUnicodeCharacterName(Diags, Features, Loc, TokBegin: ThisTokBegin,
635	TokRangeBegin: &UcnBegin[`3`], TokRangeEnd: ClosingBrace, Name);
636	return false;
637	}
638	UcnVal = *Res;
639	UcnLen = UcnVal > `0xFFFF` ? `8` : `4`;
640	return true;
641	}
642
643	/// ProcessUCNEscape - Read the Universal Character Name, check constraints and
644	/// return the UTF32.
645	static bool ProcessUCNEscape(const char ThisTokBegin, const* char *&ThisTokBuf,
646	const char *ThisTokEnd, uint32_t &UcnVal,
647	unsigned short &UcnLen, FullSourceLoc Loc,
648	DiagnosticsEngine *Diags,
649	const LangOptions &Features,
650	bool in_char_string_literal = false) {
651
652	bool HasError;
653	const char *UcnBegin = ThisTokBuf;
654	bool IsDelimitedEscapeSequence = false;
655	bool IsNamedEscapeSequence = false;
656	if (ThisTokBuf[`1`] == `'N'`) {
657	IsNamedEscapeSequence = true;
658	HasError = !ProcessNamedUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd,
659	UcnVal, UcnLen, Loc, Diags, Features);
660	} else {
661	HasError =
662	!ProcessNumericUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal,
663	UcnLen, Delimited&: IsDelimitedEscapeSequence, Loc, Diags,
664	Features, in_char_string_literal);
665	}
666	if (HasError)
667	return false;
668
669	// Check UCN constraints (C99 6.4.3p2) [C++11 lex.charset p2]
670	if ((`0xD800` <= UcnVal && UcnVal <= `0xDFFF`) \|\| // surrogate codepoints
671	UcnVal > `0x10FFFF`) { // maximum legal UTF32 value
672	if (Diags)
673	Diag(Diags, Features, TokLoc: Loc, TokBegin: ThisTokBegin, TokRangeBegin: UcnBegin, TokRangeEnd: ThisTokBuf,
674	DiagID: diag::err_ucn_escape_invalid);
675	return false;
676	}
677
678	// C23 and C++11 allow UCNs that refer to control characters
679	// and basic source characters inside character and string literals
680	if (UcnVal < `0xa0` &&
681	// $, @, ` are allowed in all language modes
682	(UcnVal != `0x24` && UcnVal != `0x40` && UcnVal != `0x60`)) {
683	bool IsError =
684	(!(Features.CPlusPlus11 \|\| Features.C23) \|\| !in_char_string_literal);
685	if (Diags) {
686	char BasicSCSChar = UcnVal;
687	if (UcnVal >= `0x20` && UcnVal < `0x7f`)
688	Diag(Diags, Features, TokLoc: Loc, TokBegin: ThisTokBegin, TokRangeBegin: UcnBegin, TokRangeEnd: ThisTokBuf,
689	DiagID: IsError ? diag::err_ucn_escape_basic_scs
690	: Features.CPlusPlus
691	? diag::warn_cxx98_compat_literal_ucn_escape_basic_scs
692	: diag::warn_c23_compat_literal_ucn_escape_basic_scs)
693	<< StringRef (&BasicSCSChar, `1`);
694	else
695	Diag(Diags, Features, TokLoc: Loc, TokBegin: ThisTokBegin, TokRangeBegin: UcnBegin, TokRangeEnd: ThisTokBuf,
696	DiagID: IsError ? diag::err_ucn_control_character
697	: Features.CPlusPlus
698	? diag::warn_cxx98_compat_literal_ucn_control_character
699	: diag::warn_c23_compat_literal_ucn_control_character);
700	}
701	if (IsError)
702	return false;
703	}
704
705	if (!Features.CPlusPlus && !Features.C99 && Diags)
706	Diag(Diags, Features, TokLoc: Loc, TokBegin: ThisTokBegin, TokRangeBegin: UcnBegin, TokRangeEnd: ThisTokBuf,
707	DiagID: diag::warn_ucn_not_valid_in_c89_literal);
708
709	if ((IsDelimitedEscapeSequence \|\| IsNamedEscapeSequence) && Diags)
710	Diag(Diags, Features, TokLoc: Loc, TokBegin: ThisTokBegin, TokRangeBegin: UcnBegin, TokRangeEnd: ThisTokBuf,
711	DiagID: Features.CPlusPlus23 ? diag::warn_cxx23_delimited_escape_sequence
712	: diag::ext_delimited_escape_sequence)
713	<< (IsNamedEscapeSequence ? `1` : `0`) << (Features.CPlusPlus ? `1` : `0`);
714
715	return true;
716	}
717
718	/// MeasureUCNEscape - Determine the number of bytes within the resulting string
719	/// which this UCN will occupy.
720	static int MeasureUCNEscape(const char ThisTokBegin, const* char *&ThisTokBuf,
721	const char ThisTokEnd, unsigned* CharByteWidth,
722	const LangOptions &Features, bool &HadError) {
723	// UTF-32: 4 bytes per escape.
724	if (CharByteWidth == `4`)
725	return `4`;
726
727	uint32_t UcnVal = `0`;
728	unsigned short UcnLen = `0`;
729	FullSourceLoc Loc;
730
731	if (!ProcessUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal,
732	UcnLen, Loc, Diags: nullptr, Features, in_char_string_literal: true)) {
733	HadError = true;
734	return `0`;
735	}
736
737	// UTF-16: 2 bytes for BMP, 4 bytes otherwise.
738	if (CharByteWidth == `2`)
739	return UcnVal <= `0xFFFF` ? `2` : `4`;
740
741	// UTF-8.
742	if (UcnVal < `0x80`)
743	return `1`;
744	if (UcnVal < `0x800`)
745	return `2`;
746	if (UcnVal < `0x10000`)
747	return `3`;
748	return `4`;
749	}
750
751	/// EncodeUCNEscape - Read the Universal Character Name, check constraints and
752	/// convert the UTF32 to UTF8 or UTF16. This is a subroutine of
753	/// StringLiteralParser. When we decide to implement UCN's for identifiers,
754	/// we will likely rework our support for UCN's.
755	static void EncodeUCNEscape(const char ThisTokBegin, const* char *&ThisTokBuf,
756	const char *ThisTokEnd,
757	char &ResultBuf, bool* &HadError,
758	FullSourceLoc Loc, unsigned CharByteWidth,
759	DiagnosticsEngine *Diags,
760	const LangOptions &Features) {
761	typedef uint32_t UTF32;
762	UTF32 UcnVal = `0`;
763	unsigned short UcnLen = `0`;
764	if (!ProcessUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal, UcnLen,
765	Loc, Diags, Features, in_char_string_literal: true)) {
766	HadError = true;
767	return;
768	}
769
770	assert((CharByteWidth == `1` \|\| CharByteWidth == `2` \|\| CharByteWidth == `4`) &&
771	"only character widths of 1, 2, or 4 bytes supported");
772
773	(void)UcnLen;
774	assert((UcnLen== `4` \|\| UcnLen== `8`) && "only ucn length of 4 or 8 supported");
775
776	if (CharByteWidth == `4`) {
777	// FIXME: Make the type of the result buffer correct instead of
778	// using reinterpret_cast.
779	llvm::UTF32 ResultPtr = reinterpret_cast<llvm::UTF32>(ResultBuf);
780	*ResultPtr = UcnVal;
781	ResultBuf += `4`;
782	return;
783	}
784
785	if (CharByteWidth == `2`) {
786	// FIXME: Make the type of the result buffer correct instead of
787	// using reinterpret_cast.
788	llvm::UTF16 ResultPtr = reinterpret_cast<llvm::UTF16>(ResultBuf);
789
790	if (UcnVal <= (UTF32)`0xFFFF`) {
791	*ResultPtr = UcnVal;
792	ResultBuf += `2`;
793	return;
794	}
795
796	// Convert to UTF16.
797	UcnVal -= `0x10000`;
798	*ResultPtr = `0xD800` + (UcnVal >> `10`);
799	*(ResultPtr+`1`) = `0xDC00` + (UcnVal & `0x3FF`);
800	ResultBuf += `4`;
801	return;
802	}
803
804	assert(CharByteWidth == `1` && "UTF-8 encoding is only for 1 byte characters");
805
806	// Now that we've parsed/checked the UCN, we convert from UTF32->UTF8.
807	// The conversion below was inspired by:
808	// http://www.unicode.org/Public/PROGRAMS/CVTUTF/ConvertUTF.c
809	// First, we determine how many bytes the result will require.
810	typedef uint8_t UTF8;
811
812	unsigned short bytesToWrite = `0`;
813	if (UcnVal < (UTF32)`0x80`)
814	bytesToWrite = `1`;
815	else if (UcnVal < (UTF32)`0x800`)
816	bytesToWrite = `2`;
817	else if (UcnVal < (UTF32)`0x10000`)
818	bytesToWrite = `3`;
819	else
820	bytesToWrite = `4`;
821
822	const unsigned byteMask = `0xBF`;
823	const unsigned byteMark = `0x80`;
824
825	// Once the bits are split out into bytes of UTF8, this is a mask OR-ed
826	// into the first byte, depending on how many bytes follow.
827	static const UTF8 firstByteMark[`5`] = {
828	`0x00`, `0x00`, `0xC0`, `0xE0`, `0xF0`
829	};
830	// Finally, we write the bytes into ResultBuf.
831	ResultBuf += bytesToWrite;
832	switch (bytesToWrite) { // note: everything falls through.
833	case `4`:
834	*--ResultBuf = (UTF8)((UcnVal \| byteMark) & byteMask); UcnVal >>= `6`;
835	[[fallthrough]];
836	case `3`:
837	*--ResultBuf = (UTF8)((UcnVal \| byteMark) & byteMask); UcnVal >>= `6`;
838	[[fallthrough]];
839	case `2`:
840	*--ResultBuf = (UTF8)((UcnVal \| byteMark) & byteMask); UcnVal >>= `6`;
841	[[fallthrough]];
842	case `1`:
843	*--ResultBuf = (UTF8) (UcnVal \| firstByteMark[bytesToWrite]);
844	}
845	// Update the buffer.
846	ResultBuf += bytesToWrite;
847	}
848
849	/// integer-constant: [C99 6.4.4.1]
850	/// decimal-constant integer-suffix
851	/// octal-constant integer-suffix
852	/// hexadecimal-constant integer-suffix
853	/// binary-literal integer-suffix [GNU, C++1y]
854	/// user-defined-integer-literal: [C++11 lex.ext]
855	/// decimal-literal ud-suffix
856	/// octal-literal ud-suffix
857	/// hexadecimal-literal ud-suffix
858	/// binary-literal ud-suffix [GNU, C++1y]
859	/// decimal-constant:
860	/// nonzero-digit
861	/// decimal-constant digit
862	/// octal-constant:
863	/// 0
864	/// octal-constant octal-digit
865	/// hexadecimal-constant:
866	/// hexadecimal-prefix hexadecimal-digit
867	/// hexadecimal-constant hexadecimal-digit
868	/// hexadecimal-prefix: one of
869	/// 0x 0X
870	/// binary-literal:
871	/// 0b binary-digit
872	/// 0B binary-digit
873	/// binary-literal binary-digit
874	/// integer-suffix:
875	/// unsigned-suffix [long-suffix]
876	/// unsigned-suffix [long-long-suffix]
877	/// long-suffix [unsigned-suffix]
878	/// long-long-suffix [unsigned-suffix]
879	/// nonzero-digit:
880	/// 1 2 3 4 5 6 7 8 9
881	/// octal-digit:
882	/// 0 1 2 3 4 5 6 7
883	/// hexadecimal-digit:
884	/// 0 1 2 3 4 5 6 7 8 9
885	/// a b c d e f
886	/// A B C D E F
887	/// binary-digit:
888	/// 0
889	/// 1
890	/// unsigned-suffix: one of
891	/// u U
892	/// long-suffix: one of
893	/// l L
894	/// long-long-suffix: one of
895	/// ll LL
896	///
897	/// floating-constant: [C99 6.4.4.2]
898	/// TODO: add rules...
899	///
900	NumericLiteralParser::NumericLiteralParser(StringRef TokSpelling,
901	SourceLocation TokLoc,
902	const SourceManager &SM,
903	const LangOptions &LangOpts,
904	const TargetInfo &Target,
905	DiagnosticsEngine &Diags)
906	: SM(SM), LangOpts(LangOpts), Diags(Diags),
907	ThisTokBegin(TokSpelling.begin()), ThisTokEnd(TokSpelling.end()) {
908
909	s = DigitsBegin = ThisTokBegin;
910	saw_exponent = false;
911	saw_period = false;
912	saw_ud_suffix = false;
913	saw_fixed_point_suffix = false;
914	isLong = false;
915	isUnsigned = false;
916	isLongLong = false;
917	isSizeT = false;
918	isHalf = false;
919	isFloat = false;
920	isImaginary = false;
921	isFloat16 = false;
922	isFloat128 = false;
923	MicrosoftInteger = `0`;
924	isFract = false;
925	isAccum = false;
926	hadError = false;
927	isBitInt = false;
928
929	// This routine assumes that the range begin/end matches the regex for integer
930	// and FP constants (specifically, the 'pp-number' regex), and assumes that
931	// the byte at "end" is both valid and not part of the regex. Because of*
932	// this, it doesn't have to check for 'overscan' in various places.
933	// Note: For HLSL, the end token is allowed to be '.' which would be in the
934	// 'pp-number' regex. This is required to support vector swizzles on numeric
935	// constants (i.e. 1.xx or 1.5f.rrr).
936	if (isPreprocessingNumberBody(c: *ThisTokEnd) &&
937	!(LangOpts.HLSL && *ThisTokEnd == `'.'`)) {
938	Diags.Report(Loc: TokLoc, DiagID: diag::err_lexing_numeric);
939	hadError = true;
940	return;
941	}
942
943	if (s == `'0'`) { // parse radix*
944	ParseNumberStartingWithZero(TokLoc);
945	if (hadError)
946	return;
947	} else { // the first digit is non-zero
948	radix = `10`;
949	s = SkipDigits(ptr: s);
950	if (s == ThisTokEnd) {
951	// Done.
952	} else {
953	ParseDecimalOrOctalCommon(TokLoc);
954	if (hadError)
955	return;
956	}
957	}
958
959	SuffixBegin = s;
960	checkSeparator(TokLoc, Pos: s, IsAfterDigits: CSK_AfterDigits);
961
962	// Initial scan to lookahead for fixed point suffix.
963	if (LangOpts.FixedPoint) {
964	for (const char *c = s; c != ThisTokEnd; ++c) {
965	if (c == `'r'` \|\| c == `'k'` \|\| c == `'R'` \|\| c == `'K'`) {
966	saw_fixed_point_suffix = true;
967	break;
968	}
969	}
970	}
971
972	// Parse the suffix. At this point we can classify whether we have an FP or
973	// integer constant.
974	bool isFixedPointConstant = isFixedPointLiteral();
975	bool isFPConstant = isFloatingLiteral();
976	bool HasSize = false;
977	bool DoubleUnderscore = false;
978
979	// Loop over all of the characters of the suffix. If we see something bad,
980	// we break out of the loop.
981	for (; s != ThisTokEnd; ++s) {
982	switch (*s) {
983	case `'R'`:
984	case `'r'`:
985	if (!LangOpts.FixedPoint)
986	break;
987	if (isFract \|\| isAccum) break;
988	if (!(saw_period \|\| saw_exponent)) break;
989	isFract = true;
990	continue;
991	case `'K'`:
992	case `'k'`:
993	if (!LangOpts.FixedPoint)
994	break;
995	if (isFract \|\| isAccum) break;
996	if (!(saw_period \|\| saw_exponent)) break;
997	isAccum = true;
998	continue;
999	case `'h'`: // FP Suffix for "half".
1000	case `'H'`:
1001	// OpenCL Extension v1.2 s9.5 - h or H suffix for half type.
1002	if (!(LangOpts.Half \|\| LangOpts.FixedPoint))
1003	break;
1004	if (isIntegerLiteral()) break; // Error for integer constant.
1005	if (HasSize)
1006	break;
1007	HasSize = true;
1008	isHalf = true;
1009	continue; // Success.
1010	case `'f'`: // FP Suffix for "float"
1011	case `'F'`:
1012	if (!isFPConstant) break; // Error for integer constant.
1013	if (HasSize)
1014	break;
1015	HasSize = true;
1016
1017	// CUDA host and device may have different _Float16 support, therefore
1018	// allows f16 literals to avoid false alarm.
1019	// When we compile for OpenMP target offloading on NVPTX, f16 suffix
1020	// should also be supported.
1021	// ToDo: more precise check for CUDA.
1022	// TODO: AMDGPU might also support it in the future.
1023	if ((Target.hasFloat16Type() \|\| LangOpts.CUDA \|\|
1024	(LangOpts.OpenMPIsTargetDevice && Target.getTriple().isNVPTX())) &&
1025	s + `2` < ThisTokEnd && s[`1`] == `'1'` && s[`2`] == `'6'`) {
1026	s += `2`; // success, eat up 2 characters.
1027	isFloat16 = true;
1028	continue;
1029	}
1030
1031	isFloat = true;
1032	continue; // Success.
1033	case `'q'`: // FP Suffix for "__float128"
1034	case `'Q'`:
1035	if (!isFPConstant) break; // Error for integer constant.
1036	if (HasSize)
1037	break;
1038	HasSize = true;
1039	isFloat128 = true;
1040	continue; // Success.
1041	case `'u'`:
1042	case `'U'`:
1043	if (isFPConstant) break; // Error for floating constant.
1044	if (isUnsigned) break; // Cannot be repeated.
1045	isUnsigned = true;
1046	continue; // Success.
1047	case `'l'`:
1048	case `'L'`:
1049	if (HasSize)
1050	break;
1051	HasSize = true;
1052
1053	// Check for long long. The L's need to be adjacent and the same case.
1054	if (s[`1`] == s[`0`]) {
1055	assert(s + `1` < ThisTokEnd && "didn't maximally munch?");
1056	if (isFPConstant) break; // long long invalid for floats.
1057	isLongLong = true;
1058	++s; // Eat both of them.
1059	} else {
1060	isLong = true;
1061	}
1062	continue; // Success.
1063	case `'z'`:
1064	case `'Z'`:
1065	if (isFPConstant)
1066	break; // Invalid for floats.
1067	if (HasSize)
1068	break;
1069	HasSize = true;
1070	isSizeT = true;
1071	continue;
1072	case `'i'`:
1073	case `'I'`:
1074	if (LangOpts.MicrosoftExt && !isFPConstant) {
1075	// Allow i8, i16, i32, and i64. First, look ahead and check if
1076	// suffixes are Microsoft integers and not the imaginary unit.
1077	uint8_t Bits = `0`;
1078	size_t ToSkip = `0`;
1079	switch (s[`1`]) {
1080	case `'8'`: // i8 suffix
1081	Bits = `8`;
1082	ToSkip = `2`;
1083	break;
1084	case `'1'`:
1085	if (s[`2`] == `'6'`) { // i16 suffix
1086	Bits = `16`;
1087	ToSkip = `3`;
1088	}
1089	break;
1090	case `'3'`:
1091	if (s[`2`] == `'2'`) { // i32 suffix
1092	Bits = `32`;
1093	ToSkip = `3`;
1094	}
1095	break;
1096	case `'6'`:
1097	if (s[`2`] == `'4'`) { // i64 suffix
1098	Bits = `64`;
1099	ToSkip = `3`;
1100	}
1101	break;
1102	default:
1103	break;
1104	}
1105	if (Bits) {
1106	if (HasSize)
1107	break;
1108	HasSize = true;
1109	MicrosoftInteger = Bits;
1110	s += ToSkip;
1111	assert(s <= ThisTokEnd && "didn't maximally munch?");
1112	break;
1113	}
1114	}
1115	[[fallthrough]];
1116	case `'j'`:
1117	case `'J'`:
1118	if (isImaginary) break; // Cannot be repeated.
1119	isImaginary = true;
1120	continue; // Success.
1121	case `'_'`:
1122	if (isFPConstant)
1123	break; // Invalid for floats
1124	if (HasSize)
1125	break;
1126	// There is currently no way to reach this with DoubleUnderscore set.
1127	// If new double underscope literals are added handle it here as above.
1128	assert(!DoubleUnderscore && "unhandled double underscore case");
1129	if (LangOpts.CPlusPlus && s + `2` < ThisTokEnd &&
1130	s[`1`] == `'_'`) { // s + 2 < ThisTokEnd to ensure some character exists
1131	// after __
1132	DoubleUnderscore = true;
1133	s += `2`; // Skip both '_'
1134	if (s + `1` < ThisTokEnd &&
1135	(s == `'u'` \|\| s == `'U'`)) { // Ensure some character after 'u'/'U'
1136	isUnsigned = true;
1137	++s;
1138	}
1139	if (s + `1` < ThisTokEnd &&
1140	((s == `'w'` && (++s) == `'b'`) \|\| (s == `'W'` && (++s) == `'B'`))) {
1141	isBitInt = true;
1142	HasSize = true;
1143	continue;
1144	}
1145	}
1146	break;
1147	case `'w'`:
1148	case `'W'`:
1149	if (isFPConstant)
1150	break; // Invalid for floats.
1151	if (HasSize)
1152	break; // Invalid if we already have a size for the literal.
1153
1154	// wb and WB are allowed, but a mixture of cases like Wb or wB is not. We
1155	// explicitly do not support the suffix in C++ as an extension because a
1156	// library-based UDL that resolves to a library type may be more
1157	// appropriate there. The same rules apply for __wb/__WB.
1158	if ((!LangOpts.CPlusPlus \|\| DoubleUnderscore) && s + `1` < ThisTokEnd &&
1159	((s[`0`] == `'w'` && s[`1`] == `'b'`) \|\| (s[`0`] == `'W'` && s[`1`] == `'B'`))) {
1160	isBitInt = true;
1161	HasSize = true;
1162	++s; // Skip both characters (2nd char skipped on continue).
1163	continue; // Success.
1164	}
1165	}
1166	// If we reached here, there was an error or a ud-suffix.
1167	break;
1168	}
1169
1170	// "i", "if", and "il" are user-defined suffixes in C++1y.
1171	if (s != ThisTokEnd \|\| isImaginary) {
1172	// FIXME: Don't bother expanding UCNs if !tok.hasUCN().
1173	expandUCNs(Buf&: UDSuffixBuf, Input: StringRef (SuffixBegin, ThisTokEnd - SuffixBegin));
1174	if (isValidUDSuffix(LangOpts, Suffix: UDSuffixBuf)) {
1175	if (!isImaginary) {
1176	// Any suffix pieces we might have parsed are actually part of the
1177	// ud-suffix.
1178	isLong = false;
1179	isUnsigned = false;
1180	isLongLong = false;
1181	isSizeT = false;
1182	isFloat = false;
1183	isFloat16 = false;
1184	isHalf = false;
1185	isImaginary = false;
1186	isBitInt = false;
1187	MicrosoftInteger = `0`;
1188	saw_fixed_point_suffix = false;
1189	isFract = false;
1190	isAccum = false;
1191	}
1192
1193	saw_ud_suffix = true;
1194	return;
1195	}
1196
1197	if (s != ThisTokEnd) {
1198	// Report an error if there are any.
1199	Diags.Report(Loc: Lexer::AdvanceToTokenCharacter(
1200	TokStart: TokLoc, Characters: SuffixBegin - ThisTokBegin, SM, LangOpts),
1201	DiagID: diag::err_invalid_suffix_constant)
1202	<< StringRef (SuffixBegin, ThisTokEnd - SuffixBegin)
1203	<< (isFixedPointConstant ? `2` : isFPConstant);
1204	hadError = true;
1205	}
1206	}
1207
1208	if (!hadError && saw_fixed_point_suffix) {
1209	assert(isFract \|\| isAccum);
1210	}
1211	}
1212
1213	/// ParseDecimalOrOctalCommon - This method is called for decimal or octal
1214	/// numbers. It issues an error for illegal digits, and handles floating point
1215	/// parsing. If it detects a floating point number, the radix is set to 10.
1216	void NumericLiteralParser::ParseDecimalOrOctalCommon(SourceLocation TokLoc){
1217	assert((radix == `8` \|\| radix == `10`) && "Unexpected radix");
1218
1219	// If we have a hex digit other than 'e' (which denotes a FP exponent) then
1220	// the code is using an incorrect base.
1221	if (isHexDigit(c: s) && s != `'e'` && *s != `'E'` &&
1222	!isValidUDSuffix(LangOpts, Suffix: StringRef (s, ThisTokEnd - s))) {
1223	Diags.Report(
1224	Loc: Lexer::AdvanceToTokenCharacter(TokStart: TokLoc, Characters: s - ThisTokBegin, SM, LangOpts),
1225	DiagID: diag::err_invalid_digit)
1226	<< StringRef (s, `1`) << (radix == `8` ? `1` : `0`);
1227	hadError = true;
1228	return;
1229	}
1230
1231	if (*s == `'.'`) {
1232	checkSeparator(TokLoc, Pos: s, IsAfterDigits: CSK_AfterDigits);
1233	s++;
1234	radix = `10`;
1235	saw_period = true;
1236	checkSeparator(TokLoc, Pos: s, IsAfterDigits: CSK_BeforeDigits);
1237	s = SkipDigits(ptr: s); // Skip suffix.
1238	}
1239	if (s == `'e'` \|\| s == `'E'`) { // exponent
1240	checkSeparator(TokLoc, Pos: s, IsAfterDigits: CSK_AfterDigits);
1241	const char *Exponent = s;
1242	s++;
1243	radix = `10`;
1244	saw_exponent = true;
1245	if (s != ThisTokEnd && (s == `'+'` \|\| s == `'-'`)) s++; // sign
1246	const char *first_non_digit = SkipDigits(ptr: s);
1247	if (containsDigits(Start: s, End: first_non_digit)) {
1248	checkSeparator(TokLoc, Pos: s, IsAfterDigits: CSK_BeforeDigits);
1249	s = first_non_digit;
1250	} else {
1251	if (!hadError) {
1252	Diags.Report(Loc: Lexer::AdvanceToTokenCharacter(
1253	TokStart: TokLoc, Characters: Exponent - ThisTokBegin, SM, LangOpts),
1254	DiagID: diag::err_exponent_has_no_digits);
1255	hadError = true;
1256	}
1257	return;
1258	}
1259	}
1260	}
1261
1262	/// Determine whether a suffix is a valid ud-suffix. We avoid treating reserved
1263	/// suffixes as ud-suffixes, because the diagnostic experience is better if we
1264	/// treat it as an invalid suffix.
1265	bool NumericLiteralParser::isValidUDSuffix(const LangOptions &LangOpts,
1266	StringRef Suffix) {
1267	if (!LangOpts.CPlusPlus11 \|\| Suffix.empty())
1268	return false;
1269
1270	// By C++11 [lex.ext]p10, ud-suffixes starting with an '_' are always valid.
1271	// Suffixes starting with '__' (double underscore) are for use by
1272	// the implementation.
1273	if (Suffix.starts_with(Prefix: "_") && !Suffix.starts_with(Prefix: "__"))
1274	return true;
1275
1276	// In C++11, there are no library suffixes.
1277	if (!LangOpts.CPlusPlus14)
1278	return false;
1279
1280	// In C++14, "s", "h", "min", "ms", "us", and "ns" are used in the library.
1281	// Per tweaked N3660, "il", "i", and "if" are also used in the library.
1282	// In C++2a "d" and "y" are used in the library.
1283	return llvm::StringSwitch<bool>(Suffix)
1284	.Cases(S0: "h", S1: "min", S2: "s", Value: true)
1285	.Cases(S0: "ms", S1: "us", S2: "ns", Value: true)
1286	.Cases(S0: "il", S1: "i", S2: "if", Value: true)
1287	.Cases(S0: "d", S1: "y", Value: LangOpts.CPlusPlus20)
1288	.Default(Value: false);
1289	}
1290
1291	void NumericLiteralParser::checkSeparator(SourceLocation TokLoc,
1292	const char *Pos,
1293	CheckSeparatorKind IsAfterDigits) {
1294	if (IsAfterDigits == CSK_AfterDigits) {
1295	if (Pos == ThisTokBegin)
1296	return;
1297	--Pos;
1298	} else if (Pos == ThisTokEnd)
1299	return;
1300
1301	if (isDigitSeparator(C: *Pos)) {
1302	Diags.Report(Loc: Lexer::AdvanceToTokenCharacter(TokStart: TokLoc, Characters: Pos - ThisTokBegin, SM,
1303	LangOpts),
1304	DiagID: diag::err_digit_separator_not_between_digits)
1305	<< IsAfterDigits;
1306	hadError = true;
1307	}
1308	}
1309
1310	/// ParseNumberStartingWithZero - This method is called when the first character
1311	/// of the number is found to be a zero. This means it is either an octal
1312	/// number (like '04') or a hex number ('0x123a') a binary number ('0b1010') or
1313	/// a floating point number (01239.123e4). Eat the prefix, determining the
1314	/// radix etc.
1315	void NumericLiteralParser::ParseNumberStartingWithZero(SourceLocation TokLoc) {
1316	assert(s[`0`] == `'0'` && "Invalid method call");
1317	s++;
1318
1319	int c1 = s[`0`];
1320
1321	// Handle a hex number like 0x1234.
1322	if ((c1 == `'x'` \|\| c1 == `'X'`) && (isHexDigit(c: s[`1`]) \|\| s[`1`] == `'.'`)) {
1323	s++;
1324	assert(s < ThisTokEnd && "didn't maximally munch?");
1325	radix = `16`;
1326	DigitsBegin = s;
1327	s = SkipHexDigits(ptr: s);
1328	bool HasSignificandDigits = containsDigits(Start: DigitsBegin, End: s);
1329	if (s == ThisTokEnd) {
1330	// Done.
1331	} else if (*s == `'.'`) {
1332	s++;
1333	saw_period = true;
1334	const char *floatDigitsBegin = s;
1335	s = SkipHexDigits(ptr: s);
1336	if (containsDigits(Start: floatDigitsBegin, End: s))
1337	HasSignificandDigits = true;
1338	if (HasSignificandDigits)
1339	checkSeparator(TokLoc, Pos: floatDigitsBegin, IsAfterDigits: CSK_BeforeDigits);
1340	}
1341
1342	if (!HasSignificandDigits) {
1343	Diags.Report(Loc: Lexer::AdvanceToTokenCharacter(TokStart: TokLoc, Characters: s - ThisTokBegin, SM,
1344	LangOpts),
1345	DiagID: diag::err_hex_constant_requires)
1346	<< LangOpts.CPlusPlus << `1`;
1347	hadError = true;
1348	return;
1349	}
1350
1351	// A binary exponent can appear with or with a '.'. If dotted, the
1352	// binary exponent is required.
1353	if (s == `'p'` \|\| s == `'P'`) {
1354	checkSeparator(TokLoc, Pos: s, IsAfterDigits: CSK_AfterDigits);
1355	const char *Exponent = s;
1356	s++;
1357	saw_exponent = true;
1358	if (s != ThisTokEnd && (s == `'+'` \|\| s == `'-'`)) s++; // sign
1359	const char *first_non_digit = SkipDigits(ptr: s);
1360	if (!containsDigits(Start: s, End: first_non_digit)) {
1361	if (!hadError) {
1362	Diags.Report(Loc: Lexer::AdvanceToTokenCharacter(
1363	TokStart: TokLoc, Characters: Exponent - ThisTokBegin, SM, LangOpts),
1364	DiagID: diag::err_exponent_has_no_digits);
1365	hadError = true;
1366	}
1367	return;
1368	}
1369	checkSeparator(TokLoc, Pos: s, IsAfterDigits: CSK_BeforeDigits);
1370	s = first_non_digit;
1371
1372	if (!LangOpts.HexFloats)
1373	Diags.Report(Loc: TokLoc, DiagID: LangOpts.CPlusPlus
1374	? diag::ext_hex_literal_invalid
1375	: diag::ext_hex_constant_invalid);
1376	else if (LangOpts.CPlusPlus17)
1377	Diags.Report(Loc: TokLoc, DiagID: diag::warn_cxx17_hex_literal);
1378	} else if (saw_period) {
1379	Diags.Report(Loc: Lexer::AdvanceToTokenCharacter(TokStart: TokLoc, Characters: s - ThisTokBegin, SM,
1380	LangOpts),
1381	DiagID: diag::err_hex_constant_requires)
1382	<< LangOpts.CPlusPlus << `0`;
1383	hadError = true;
1384	}
1385	return;
1386	}
1387
1388	// Handle simple binary numbers 0b01010
1389	if ((c1 == `'b'` \|\| c1 == `'B'`) && (s[`1`] == `'0'` \|\| s[`1`] == `'1'`)) {
1390	// 0b101010 is a C++14 and C23 extension.
1391	unsigned DiagId;
1392	if (LangOpts.CPlusPlus14)
1393	DiagId = diag::warn_cxx11_compat_binary_literal;
1394	else if (LangOpts.C23)
1395	DiagId = diag::warn_c23_compat_binary_literal;
1396	else if (LangOpts.CPlusPlus)
1397	DiagId = diag::ext_binary_literal_cxx14;
1398	else
1399	DiagId = diag::ext_binary_literal;
1400	Diags.Report(Loc: TokLoc, DiagID: DiagId);
1401	++s;
1402	assert(s < ThisTokEnd && "didn't maximally munch?");
1403	radix = `2`;
1404	DigitsBegin = s;
1405	s = SkipBinaryDigits(ptr: s);
1406	if (s == ThisTokEnd) {
1407	// Done.
1408	} else if (isHexDigit(c: *s) &&
1409	!isValidUDSuffix(LangOpts, Suffix: StringRef (s, ThisTokEnd - s))) {
1410	Diags.Report(Loc: Lexer::AdvanceToTokenCharacter(TokStart: TokLoc, Characters: s - ThisTokBegin, SM,
1411	LangOpts),
1412	DiagID: diag::err_invalid_digit)
1413	<< StringRef (s, `1`) << `2`;
1414	hadError = true;
1415	}
1416	// Other suffixes will be diagnosed by the caller.
1417	return;
1418	}
1419
1420	// For now, the radix is set to 8. If we discover that we have a
1421	// floating point constant, the radix will change to 10. Octal floating
1422	// point constants are not permitted (only decimal and hexadecimal).
1423	radix = `8`;
1424	const char *PossibleNewDigitStart = s;
1425	s = SkipOctalDigits(ptr: s);
1426	// When the value is 0 followed by a suffix (like 0wb), we want to leave 0
1427	// as the start of the digits. So if skipping octal digits does not skip
1428	// anything, we leave the digit start where it was.
1429	if (s != PossibleNewDigitStart)
1430	DigitsBegin = PossibleNewDigitStart;
1431
1432	if (s == ThisTokEnd)
1433	return; // Done, simple octal number like 01234
1434
1435	// If we have some other non-octal digit that is* a decimal digit, see if*
1436	// this is part of a floating point number like 094.123 or 09e1.
1437	if (isDigit(c: *s)) {
1438	const char *EndDecimal = SkipDigits(ptr: s);
1439	if (EndDecimal[`0`] == `'.'` \|\| EndDecimal[`0`] == `'e'` \|\| EndDecimal[`0`] == `'E'`) {
1440	s = EndDecimal;
1441	radix = `10`;
1442	}
1443	}
1444
1445	ParseDecimalOrOctalCommon(TokLoc);
1446	}
1447
1448	static bool alwaysFitsInto64Bits(unsigned Radix, unsigned NumDigits) {
1449	switch (Radix) {
1450	case `2`:
1451	return NumDigits <= `64`;
1452	case `8`:
1453	return NumDigits <= `64` / `3`; // Digits are groups of 3 bits.
1454	case `10`:
1455	return NumDigits <= `19`; // floor(log10(2^64))
1456	case `16`:
1457	return NumDigits <= `64` / `4`; // Digits are groups of 4 bits.
1458	default:
1459	llvm_unreachable("impossible Radix");
1460	}
1461	}
1462
1463	/// GetIntegerValue - Convert this numeric literal value to an APInt that
1464	/// matches Val's input width. If there is an overflow, set Val to the low bits
1465	/// of the result and return true. Otherwise, return false.
1466	bool NumericLiteralParser::GetIntegerValue(llvm::APInt &Val) {
1467	// Fast path: Compute a conservative bound on the maximum number of
1468	// bits per digit in this radix. If we can't possibly overflow a
1469	// uint64 based on that bound then do the simple conversion to
1470	// integer. This avoids the expensive overflow checking below, and
1471	// handles the common cases that matter (small decimal integers and
1472	// hex/octal values which don't overflow).
1473	const unsigned NumDigits = SuffixBegin - DigitsBegin;
1474	if (alwaysFitsInto64Bits(Radix: radix, NumDigits)) {
1475	uint64_t N = `0`;
1476	for (const char *Ptr = DigitsBegin; Ptr != SuffixBegin; ++Ptr)
1477	if (!isDigitSeparator(C: *Ptr))
1478	N = N * radix + llvm::hexDigitValue(C: *Ptr);
1479
1480	// This will truncate the value to Val's input width. Simply check
1481	// for overflow by comparing.
1482	Val = N;
1483	return Val.getZExtValue() != N;
1484	}
1485
1486	Val = `0`;
1487	const char *Ptr = DigitsBegin;
1488
1489	llvm::APInt RadixVal(Val.getBitWidth(), radix);
1490	llvm::APInt CharVal(Val.getBitWidth(), `0`);
1491	llvm::APInt OldVal = Val;
1492
1493	bool OverflowOccurred = false;
1494	while (Ptr < SuffixBegin) {
1495	if (isDigitSeparator(C: *Ptr)) {
1496	++Ptr;
1497	continue;
1498	}
1499
1500	unsigned C = llvm::hexDigitValue(C: *Ptr++);
1501
1502	// If this letter is out of bound for this radix, reject it.
1503	assert(C < radix && "NumericLiteralParser ctor should have rejected this");
1504
1505	CharVal = C;
1506
1507	// Add the digit to the value in the appropriate radix. If adding in digits
1508	// made the value smaller, then this overflowed.
1509	OldVal = Val;
1510
1511	// Multiply by radix, did overflow occur on the multiply?
1512	Val *= RadixVal;
1513	OverflowOccurred \|= Val.udiv(RHS: RadixVal) != OldVal;
1514
1515	// Add value, did overflow occur on the value?
1516	// (a + b) ult b <=> overflow
1517	Val += CharVal;
1518	OverflowOccurred \|= Val.ult(RHS: CharVal);
1519	}
1520	return OverflowOccurred;
1521	}
1522
1523	llvm::APFloat::opStatus
1524	NumericLiteralParser::GetFloatValue(llvm::APFloat &Result,
1525	llvm::RoundingMode RM) {
1526	using llvm::APFloat;
1527
1528	unsigned n = std::min(a: SuffixBegin - ThisTokBegin, b: ThisTokEnd - ThisTokBegin);
1529
1530	llvm::SmallString<`16`> Buffer;
1531	StringRef Str(ThisTokBegin, n);
1532	if (Str.contains(C: `'\''`)) {
1533	Buffer.reserve(N: n);
1534	std::remove_copy_if(first: Str.begin(), last: Str.end(), result: std::back_inserter(x&: Buffer),
1535	pred: &isDigitSeparator);
1536	Str = Buffer;
1537	}
1538
1539	auto StatusOrErr = Result.convertFromString(Str, RM);
1540	assert(StatusOrErr && "Invalid floating point representation");
1541	return !errorToBool(Err: StatusOrErr.takeError()) ? *StatusOrErr
1542	: APFloat::opInvalidOp;
1543	}
1544
/// Return true if \p c introduces an exponent: 'p'/'P' when \p isHex is true,
/// 'e'/'E' otherwise.
static inline bool IsExponentPart(char c, bool isHex) {
  if (isHex)
    return c == 'p' || c == 'P';
  return c == 'e' || c == 'E';
}
1550
1551	bool NumericLiteralParser::GetFixedPointValue(llvm::APInt &StoreVal, unsigned Scale) {
1552	assert(radix == `16` \|\| radix == `10`);
1553
1554	// Find how many digits are needed to store the whole literal.
1555	unsigned NumDigits = SuffixBegin - DigitsBegin;
1556	if (saw_period) --NumDigits;
1557
1558	// Initial scan of the exponent if it exists
1559	bool ExpOverflowOccurred = false;
1560	bool NegativeExponent = false;
1561	const char *ExponentBegin;
1562	uint64_t Exponent = `0`;
1563	int64_t BaseShift = `0`;
1564	if (saw_exponent) {
1565	const char *Ptr = DigitsBegin;
1566
1567	while (!IsExponentPart(c: *Ptr, isHex: radix == `16`))
1568	++Ptr;
1569	ExponentBegin = Ptr;
1570	++Ptr;
1571	NegativeExponent = *Ptr == `'-'`;
1572	if (NegativeExponent) ++Ptr;
1573
1574	unsigned NumExpDigits = SuffixBegin - Ptr;
1575	if (alwaysFitsInto64Bits(Radix: radix, NumDigits: NumExpDigits)) {
1576	llvm::StringRef ExpStr(Ptr, NumExpDigits);
1577	llvm::APInt ExpInt(/numBits=/`64`, ExpStr, /radix=/`10`);
1578	Exponent = ExpInt.getZExtValue();
1579	} else {
1580	ExpOverflowOccurred = true;
1581	}
1582
1583	if (NegativeExponent) BaseShift -= Exponent;
1584	else BaseShift += Exponent;
1585	}
1586
1587	// Number of bits needed for decimal literal is
1588	// ceil(NumDigits log2(10)) Integral part*
1589	// + Scale Fractional part
1590	// + ceil(Exponent log2(10)) Exponent*
1591	// --------------------------------------------------
1592	// ceil((NumDigits + Exponent) log2(10)) + Scale*
1593	//
1594	// But for simplicity in handling integers, we can round up log2(10) to 4,
1595	// making:
1596	// 4 (NumDigits + Exponent) + Scale*
1597	//
1598	// Number of digits needed for hexadecimal literal is
1599	// 4 NumDigits Integral part*
1600	// + Scale Fractional part
1601	// + Exponent Exponent
1602	// --------------------------------------------------
1603	// (4 NumDigits) + Scale + Exponent*
1604	uint64_t NumBitsNeeded;
1605	if (radix == `10`)
1606	NumBitsNeeded = `4` * (NumDigits + Exponent) + Scale;
1607	else
1608	NumBitsNeeded = `4` * NumDigits + Exponent + Scale;
1609
1610	if (NumBitsNeeded > std::numeric_limits<unsigned>::max())
1611	ExpOverflowOccurred = true;
1612	llvm::APInt Val(static_cast<unsigned>(NumBitsNeeded), `0`, /isSigned=/false);
1613
1614	bool FoundDecimal = false;
1615
1616	int64_t FractBaseShift = `0`;
1617	const char *End = saw_exponent ? ExponentBegin : SuffixBegin;
1618	for (const char *Ptr = DigitsBegin; Ptr < End; ++Ptr) {
1619	if (*Ptr == `'.'`) {
1620	FoundDecimal = true;
1621	continue;
1622	}
1623
1624	// Normal reading of an integer
1625	unsigned C = llvm::hexDigitValue(C: *Ptr);
1626	assert(C < radix && "NumericLiteralParser ctor should have rejected this");
1627
1628	Val *= radix;
1629	Val += C;
1630
1631	if (FoundDecimal)
1632	// Keep track of how much we will need to adjust this value by from the
1633	// number of digits past the radix point.
1634	--FractBaseShift;
1635	}
1636
1637	// For a radix of 16, we will be multiplying by 2 instead of 16.
1638	if (radix == `16`) FractBaseShift *= `4`;
1639	BaseShift += FractBaseShift;
1640
1641	Val <<= Scale;
1642
1643	uint64_t Base = (radix == `16`) ? `2` : `10`;
1644	if (BaseShift > `0`) {
1645	for (int64_t i = `0`; i < BaseShift; ++i) {
1646	Val *= Base;
1647	}
1648	} else if (BaseShift < `0`) {
1649	for (int64_t i = BaseShift; i < `0` && !Val.isZero(); ++i)
1650	Val = Val.udiv(RHS: Base);
1651	}
1652
1653	bool IntOverflowOccurred = false;
1654	auto MaxVal = llvm::APInt::getMaxValue(numBits: StoreVal.getBitWidth());
1655	if (Val.getBitWidth() > StoreVal.getBitWidth()) {
1656	IntOverflowOccurred \|= Val.ugt(RHS: MaxVal.zext(width: Val.getBitWidth()));
1657	StoreVal = Val.trunc(width: StoreVal.getBitWidth());
1658	} else if (Val.getBitWidth() < StoreVal.getBitWidth()) {
1659	IntOverflowOccurred \|= Val.zext(width: MaxVal.getBitWidth()).ugt(RHS: MaxVal);
1660	StoreVal = Val.zext(width: StoreVal.getBitWidth());
1661	} else {
1662	StoreVal = Val;
1663	}
1664
1665	return IntOverflowOccurred \|\| ExpOverflowOccurred;
1666	}
1667
1668	/// \verbatim
1669	/// user-defined-character-literal: [C++11 lex.ext]
1670	/// character-literal ud-suffix
1671	/// ud-suffix:
1672	/// identifier
1673	/// character-literal: [C++11 lex.ccon]
1674	/// ' c-char-sequence '
1675	/// u' c-char-sequence '
1676	/// U' c-char-sequence '
1677	/// L' c-char-sequence '
1678	/// u8' c-char-sequence ' [C++1z lex.ccon]
1679	/// c-char-sequence:
1680	/// c-char
1681	/// c-char-sequence c-char
1682	/// c-char:
1683	/// any member of the source character set except the single-quote ',
1684	/// backslash \, or new-line character
1685	/// escape-sequence
1686	/// universal-character-name
1687	/// escape-sequence:
1688	/// simple-escape-sequence
1689	/// octal-escape-sequence
1690	/// hexadecimal-escape-sequence
1691	/// simple-escape-sequence:
1692	/// one of \' \" \? \\ \a \b \f \n \r \t \v
1693	/// octal-escape-sequence:
1694	/// \ octal-digit
1695	/// \ octal-digit octal-digit
1696	/// \ octal-digit octal-digit octal-digit
1697	/// hexadecimal-escape-sequence:
1698	/// \x hexadecimal-digit
1699	/// hexadecimal-escape-sequence hexadecimal-digit
1700	/// universal-character-name: [C++11 lex.charset]
1701	/// \u hex-quad
1702	/// \U hex-quad hex-quad
1703	/// hex-quad:
1704	/// hex-digit hex-digit hex-digit hex-digit
1705	/// \endverbatim
1706	///
1707	CharLiteralParser::CharLiteralParser(const char begin, const* char *end,
1708	SourceLocation Loc, Preprocessor &PP,
1709	tok::TokenKind kind) {
1710	// At this point we know that the character matches the regex "(L\|u\|U)?'.'".*
1711	HadError = false;
1712
1713	Kind = kind;
1714
1715	const char *TokBegin = begin;
1716
1717	// Skip over wide character determinant.
1718	if (Kind != tok::char_constant)
1719	++begin;
1720	if (Kind == tok::utf8_char_constant)
1721	++begin;
1722
1723	// Skip over the entry quote.
1724	if (begin[`0`] != `'\''`) {
1725	PP.Diag(Loc, DiagID: diag::err_lexing_char);
1726	HadError = true;
1727	return;
1728	}
1729
1730	++begin;
1731
1732	// Remove an optional ud-suffix.
1733	if (end[-`1`] != `'\''`) {
1734	const char *UDSuffixEnd = end;
1735	do {
1736	--end;
1737	} while (end[-`1`] != `'\''`);
1738	// FIXME: Don't bother with this if !tok.hasUCN().
1739	expandUCNs(Buf&: UDSuffixBuf, Input: StringRef (end, UDSuffixEnd - end));
1740	UDSuffixOffset = end - TokBegin;
1741	}
1742
1743	// Trim the ending quote.
1744	assert(end != begin && "Invalid token lexed");
1745	--end;
1746
1747	// FIXME: The "Value" is an uint64_t so we can handle char literals of
1748	// up to 64-bits.
1749	// FIXME: This extensively assumes that 'char' is 8-bits.
1750	assert(PP.getTargetInfo().getCharWidth() == `8` &&
1751	"Assumes char is 8 bits");
1752	assert(PP.getTargetInfo().getIntWidth() <= `64` &&
1753	(PP.getTargetInfo().getIntWidth() & `7`) == `0` &&
1754	"Assumes sizeof(int) on target is <= 64 and a multiple of char");
1755	assert(PP.getTargetInfo().getWCharWidth() <= `64` &&
1756	"Assumes sizeof(wchar) on target is <= 64");
1757
1758	SmallVector<uint32_t, `4`> codepoint_buffer;
1759	codepoint_buffer.resize(N: end - begin);
1760	uint32_t *buffer_begin = &codepoint_buffer.front();
1761	uint32_t *buffer_end = buffer_begin + codepoint_buffer.size();
1762
1763	// Unicode escapes representing characters that cannot be correctly
1764	// represented in a single code unit are disallowed in character literals
1765	// by this implementation.
1766	uint32_t largest_character_for_kind;
1767	if (tok::wide_char_constant == Kind) {
1768	largest_character_for_kind =
1769	`0xFFFFFFFFu` >> (`32`-PP.getTargetInfo().getWCharWidth());
1770	} else if (tok::utf8_char_constant == Kind) {
1771	largest_character_for_kind = `0x7F`;
1772	} else if (tok::utf16_char_constant == Kind) {
1773	largest_character_for_kind = `0xFFFF`;
1774	} else if (tok::utf32_char_constant == Kind) {
1775	largest_character_for_kind = `0x10FFFF`;
1776	} else {
1777	largest_character_for_kind = `0x7Fu`;
1778	}
1779
1780	while (begin != end) {
1781	// Is this a span of non-escape characters?
1782	if (begin[`0`] != `'\\'`) {
1783	char const *start = begin;
1784	do {
1785	++begin;
1786	} while (begin != end && *begin != `'\\'`);
1787
1788	char const *tmp_in_start = start;
1789	uint32_t *tmp_out_start = buffer_begin;
1790	llvm::ConversionResult res =
1791	llvm::ConvertUTF8toUTF32(sourceStart: reinterpret_cast<llvm::UTF8 const **>(&start),
1792	sourceEnd: reinterpret_cast<llvm::UTF8 const *>(begin),
1793	targetStart: &buffer_begin, targetEnd: buffer_end, flags: llvm::strictConversion);
1794	if (res != llvm::conversionOK) {
1795	// If we see bad encoding for unprefixed character literals, warn and
1796	// simply copy the byte values, for compatibility with gcc and
1797	// older versions of clang.
1798	bool NoErrorOnBadEncoding = isOrdinary();
1799	unsigned Msg = diag::err_bad_character_encoding;
1800	if (NoErrorOnBadEncoding)
1801	Msg = diag::warn_bad_character_encoding;
1802	PP.Diag(Loc, DiagID: Msg);
1803	if (NoErrorOnBadEncoding) {
1804	start = tmp_in_start;
1805	buffer_begin = tmp_out_start;
1806	for (; start != begin; ++start, ++buffer_begin)
1807	buffer_begin = static_cast<uint8_t>(start);
1808	} else {
1809	HadError = true;
1810	}
1811	} else {
1812	for (; tmp_out_start < buffer_begin; ++tmp_out_start) {
1813	if (*tmp_out_start > largest_character_for_kind) {
1814	HadError = true;
1815	PP.Diag(Loc, DiagID: diag::err_character_too_large);
1816	}
1817	}
1818	}
1819
1820	continue;
1821	}
1822	// Is this a Universal Character Name escape?
1823	if (begin[`1`] == `'u'` \|\| begin[`1`] == `'U'` \|\| begin[`1`] == `'N'`) {
1824	unsigned short UcnLen = `0`;
1825	if (!ProcessUCNEscape(ThisTokBegin: TokBegin, ThisTokBuf&: begin, ThisTokEnd: end, UcnVal&: *buffer_begin, UcnLen,
1826	Loc: FullSourceLoc (Loc, PP.getSourceManager()),
1827	Diags: &PP.getDiagnostics(), Features: PP.getLangOpts(), in_char_string_literal: true)) {
1828	HadError = true;
1829	} else if (*buffer_begin > largest_character_for_kind) {
1830	HadError = true;
1831	PP.Diag(Loc, DiagID: diag::err_character_too_large);
1832	}
1833
1834	++buffer_begin;
1835	continue;
1836	}
1837	unsigned CharWidth = getCharWidth(kind: Kind, Target: PP.getTargetInfo());
1838	uint64_t result =
1839	ProcessCharEscape(ThisTokBegin: TokBegin, ThisTokBuf&: begin, ThisTokEnd: end, HadError,
1840	Loc: FullSourceLoc (Loc, PP.getSourceManager()), CharWidth,
1841	Diags: &PP.getDiagnostics(), Features: PP.getLangOpts(),
1842	EvalMethod: StringLiteralEvalMethod::Evaluated);
1843	*buffer_begin++ = result;
1844	}
1845
1846	unsigned NumCharsSoFar = buffer_begin - &codepoint_buffer.front();
1847
1848	if (NumCharsSoFar > `1`) {
1849	if (isOrdinary() && NumCharsSoFar == `4`)
1850	PP.Diag(Loc, DiagID: diag::warn_four_char_character_literal);
1851	else if (isOrdinary())
1852	PP.Diag(Loc, DiagID: diag::warn_multichar_character_literal);
1853	else {
1854	PP.Diag(Loc, DiagID: diag::err_multichar_character_literal) << (isWide() ? `0` : `1`);
1855	HadError = true;
1856	}
1857	IsMultiChar = true;
1858	} else {
1859	IsMultiChar = false;
1860	}
1861
1862	llvm::APInt LitVal(PP.getTargetInfo().getIntWidth(), `0`);
1863
1864	// Narrow character literals act as though their value is concatenated
1865	// in this implementation, but warn on overflow.
1866	bool multi_char_too_long = false;
1867	if (isOrdinary() && isMultiChar()) {
1868	LitVal = `0`;
1869	for (size_t i = `0`; i < NumCharsSoFar; ++i) {
1870	// check for enough leading zeros to shift into
1871	multi_char_too_long \|= (LitVal.countl_zero() < `8`);
1872	LitVal <<= `8`;
1873	LitVal = LitVal + (codepoint_buffer [i] & `0xFF`);
1874	}
1875	} else if (NumCharsSoFar > `0`) {
1876	// otherwise just take the last character
1877	LitVal = buffer_begin[-`1`];
1878	}
1879
1880	if (!HadError && multi_char_too_long) {
1881	PP.Diag(Loc, DiagID: diag::warn_char_constant_too_large);
1882	}
1883
1884	// Transfer the value from APInt to uint64_t
1885	Value = LitVal.getZExtValue();
1886
1887	// If this is a single narrow character, sign extend it (e.g. '\xFF' is "-1")
1888	// if 'char' is signed for this target (C99 6.4.4.4p10). Note that multiple
1889	// character constants are not sign extended in the this implementation:
1890	// '\xFF\xFF' = 65536 and '\x0\xFF' = 255, which matches GCC.
1891	if (isOrdinary() && NumCharsSoFar == `1` && (Value & `128`) &&
1892	PP.getLangOpts().CharIsSigned)
1893	Value = (signed char)Value;
1894	}
1895
1896	/// \verbatim
1897	/// string-literal: [C++0x lex.string]
1898	/// encoding-prefix " [s-char-sequence] "
1899	/// encoding-prefix R raw-string
1900	/// encoding-prefix:
1901	/// u8
1902	/// u
1903	/// U
1904	/// L
1905	/// s-char-sequence:
1906	/// s-char
1907	/// s-char-sequence s-char
1908	/// s-char:
1909	/// any member of the source character set except the double-quote ",
1910	/// backslash \, or new-line character
1911	/// escape-sequence
1912	/// universal-character-name
1913	/// raw-string:
1914	/// " d-char-sequence ( r-char-sequence ) d-char-sequence "
1915	/// r-char-sequence:
1916	/// r-char
1917	/// r-char-sequence r-char
1918	/// r-char:
1919	/// any member of the source character set, except a right parenthesis )
1920	/// followed by the initial d-char-sequence (which may be empty)
1921	/// followed by a double quote ".
1922	/// d-char-sequence:
1923	/// d-char
1924	/// d-char-sequence d-char
1925	/// d-char:
1926	/// any member of the basic source character set except:
1927	/// space, the left parenthesis (, the right parenthesis ),
1928	/// the backslash \, and the control characters representing horizontal
1929	/// tab, vertical tab, form feed, and newline.
1930	/// escape-sequence: [C++0x lex.ccon]
1931	/// simple-escape-sequence
1932	/// octal-escape-sequence
1933	/// hexadecimal-escape-sequence
1934	/// simple-escape-sequence:
1935	/// one of \' \" \? \\ \a \b \f \n \r \t \v
1936	/// octal-escape-sequence:
1937	/// \ octal-digit
1938	/// \ octal-digit octal-digit
1939	/// \ octal-digit octal-digit octal-digit
1940	/// hexadecimal-escape-sequence:
1941	/// \x hexadecimal-digit
1942	/// hexadecimal-escape-sequence hexadecimal-digit
1943	/// universal-character-name:
1944	/// \u hex-quad
1945	/// \U hex-quad hex-quad
1946	/// hex-quad:
1947	/// hex-digit hex-digit hex-digit hex-digit
1948	/// \endverbatim
1949	///
1950	StringLiteralParser::StringLiteralParser(ArrayRef<Token> StringToks,
1951	Preprocessor &PP,
1952	StringLiteralEvalMethod EvalMethod)
1953	: SM(PP.getSourceManager()), Features(PP.getLangOpts()),
1954	Target(PP.getTargetInfo()), Diags(&PP.getDiagnostics()),
1955	MaxTokenLength(`0`), SizeBound(`0`), CharByteWidth(`0`), Kind(tok::unknown),
1956	ResultPtr(ResultBuf.data()), EvalMethod(EvalMethod), hadError(false),
1957	Pascal(false) {
1958	init(StringToks);
1959	}
1960
1961	void StringLiteralParser::init(ArrayRef<Token> StringToks){
1962	// The literal token may have come from an invalid source location (e.g. due
1963	// to a PCH error), in which case the token length will be 0.
1964	if (StringToks.empty() \|\| StringToks [`0`].getLength() < `2`)
1965	return DiagnoseLexingError(Loc: SourceLocation ());
1966
1967	// Scan all of the string portions, remember the max individual token length,
1968	// computing a bound on the concatenated string length, and see whether any
1969	// piece is a wide-string. If any of the string portions is a wide-string
1970	// literal, the result is a wide-string literal [C99 6.4.5p4].
1971	assert(!StringToks.empty() && "expected at least one token");
1972	MaxTokenLength = StringToks [`0`].getLength();
1973	assert(StringToks[`0`].getLength() >= `2` && "literal token is invalid!");
1974	SizeBound = StringToks [`0`].getLength() - `2`; // -2 for "".
1975	hadError = false;
1976
1977	// Determines the kind of string from the prefix
1978	Kind = tok::string_literal;
1979
1980	/// (C99 5.1.1.2p1). The common case is only one string fragment.
1981	for (const Token &Tok : StringToks) {
1982	if (Tok.getLength() < `2`)
1983	return DiagnoseLexingError(Loc: Tok.getLocation());
1984
1985	// The string could be shorter than this if it needs cleaning, but this is a
1986	// reasonable bound, which is all we need.
1987	assert(Tok.getLength() >= `2` && "literal token is invalid!");
1988	SizeBound += Tok.getLength() - `2`; // -2 for "".
1989
1990	// Remember maximum string piece length.
1991	if (Tok.getLength() > MaxTokenLength)
1992	MaxTokenLength = Tok.getLength();
1993
1994	// Remember if we see any wide or utf-8/16/32 strings.
1995	// Also check for illegal concatenations.
1996	if (isUnevaluated() && Tok.getKind() != tok::string_literal) {
1997	if (Diags) {
1998	SourceLocation PrefixEndLoc = Lexer::AdvanceToTokenCharacter(
1999	TokStart: Tok.getLocation(), Characters: getEncodingPrefixLen(kind: Tok.getKind()), SM,
2000	LangOpts: Features);
2001	CharSourceRange Range =
2002	CharSourceRange::getCharRange(R: {Tok.getLocation(), PrefixEndLoc});
2003	StringRef Prefix(SM.getCharacterData(SL: Tok.getLocation()),
2004	getEncodingPrefixLen(kind: Tok.getKind()));
2005	Diags->Report(Loc: Tok.getLocation(),
2006	DiagID: Features.CPlusPlus26
2007	? diag::err_unevaluated_string_prefix
2008	: diag::warn_unevaluated_string_prefix)
2009	<< Prefix << Features.CPlusPlus << FixItHint::CreateRemoval(RemoveRange: Range);
2010	}
2011	if (Features.CPlusPlus26)
2012	hadError = true;
2013	} else if (Tok.isNot(K: Kind) && Tok.isNot(K: tok::string_literal)) {
2014	if (isOrdinary()) {
2015	Kind = Tok.getKind();
2016	} else {
2017	if (Diags)
2018	Diags->Report(Loc: Tok.getLocation(), DiagID: diag::err_unsupported_string_concat);
2019	hadError = true;
2020	}
2021	}
2022	}
2023
2024	// Include space for the null terminator.
2025	++SizeBound;
2026
2027	// TODO: K&R warning: "traditional C rejects string constant concatenation"
2028
2029	// Get the width in bytes of char/wchar_t/char16_t/char32_t
2030	CharByteWidth = getCharWidth(kind: Kind, Target);
2031	assert((CharByteWidth & `7`) == `0` && "Assumes character size is byte multiple");
2032	CharByteWidth /= `8`;
2033
2034	// The output buffer size needs to be large enough to hold wide characters.
2035	// This is a worst-case assumption which basically corresponds to L"" "long".
2036	SizeBound *= CharByteWidth;
2037
2038	// Size the temporary buffer to hold the result string data.
2039	ResultBuf.resize(N: SizeBound);
2040
2041	// Likewise, but for each string piece.
2042	SmallString<`512`> TokenBuf;
2043	TokenBuf.resize(N: MaxTokenLength);
2044
2045	// Loop over all the strings, getting their spelling, and expanding them to
2046	// wide strings as appropriate.
2047	ResultPtr = &ResultBuf [`0`]; // Next byte to fill in.
2048
2049	Pascal = false;
2050
2051	SourceLocation UDSuffixTokLoc;
2052
2053	for (unsigned i = `0`, e = StringToks.size(); i != e; ++i) {
2054	const char *ThisTokBuf = &TokenBuf [`0`];
2055	// Get the spelling of the token, which eliminates trigraphs, etc. We know
2056	// that ThisTokBuf points to a buffer that is big enough for the whole token
2057	// and 'spelled' tokens can only shrink.
2058	bool StringInvalid = false;
2059	unsigned ThisTokLen =
2060	Lexer::getSpelling(Tok: StringToks [i], Buffer&: ThisTokBuf, SourceMgr: SM, LangOpts: Features,
2061	Invalid: &StringInvalid);
2062	if (StringInvalid)
2063	return DiagnoseLexingError(Loc: StringToks [i].getLocation());
2064
2065	const char *ThisTokBegin = ThisTokBuf;
2066	const char *ThisTokEnd = ThisTokBuf+ThisTokLen;
2067
2068	// Remove an optional ud-suffix.
2069	if (ThisTokEnd[-`1`] != `'"'`) {
2070	const char *UDSuffixEnd = ThisTokEnd;
2071	do {
2072	--ThisTokEnd;
2073	} while (ThisTokEnd[-`1`] != `'"'`);
2074
2075	StringRef UDSuffix(ThisTokEnd, UDSuffixEnd - ThisTokEnd);
2076
2077	if (UDSuffixBuf.empty()) {
2078	if (StringToks [i].hasUCN())
2079	expandUCNs(Buf&: UDSuffixBuf, Input: UDSuffix);
2080	else
2081	UDSuffixBuf.assign(RHS: UDSuffix);
2082	UDSuffixToken = i;
2083	UDSuffixOffset = ThisTokEnd - ThisTokBuf;
2084	UDSuffixTokLoc = StringToks [i].getLocation();
2085	} else {
2086	SmallString<`32`> ExpandedUDSuffix;
2087	if (StringToks [i].hasUCN()) {
2088	expandUCNs(Buf&: ExpandedUDSuffix, Input: UDSuffix);
2089	UDSuffix = ExpandedUDSuffix;
2090	}
2091
2092	// C++11 [lex.ext]p8: At the end of phase 6, if a string literal is the
2093	// result of a concatenation involving at least one user-defined-string-
2094	// literal, all the participating user-defined-string-literals shall
2095	// have the same ud-suffix.
2096	bool UnevaluatedStringHasUDL = isUnevaluated() && !UDSuffix.empty();
2097	if (UDSuffixBuf != UDSuffix \|\| UnevaluatedStringHasUDL) {
2098	if (Diags) {
2099	SourceLocation TokLoc = StringToks [i].getLocation();
2100	if (UnevaluatedStringHasUDL) {
2101	Diags->Report(Loc: TokLoc, DiagID: diag::err_unevaluated_string_udl)
2102	<< SourceRange (TokLoc, TokLoc);
2103	} else {
2104	Diags->Report(Loc: TokLoc, DiagID: diag::err_string_concat_mixed_suffix)
2105	<< UDSuffixBuf << UDSuffix
2106	<< SourceRange (UDSuffixTokLoc, UDSuffixTokLoc);
2107	}
2108	}
2109	hadError = true;
2110	}
2111	}
2112	}
2113
2114	// Strip the end quote.
2115	--ThisTokEnd;
2116
2117	// TODO: Input character set mapping support.
2118
2119	// Skip marker for wide or unicode strings.
2120	if (ThisTokBuf[`0`] == `'L'` \|\| ThisTokBuf[`0`] == `'u'` \|\| ThisTokBuf[`0`] == `'U'`) {
2121	++ThisTokBuf;
2122	// Skip 8 of u8 marker for utf8 strings.
2123	if (ThisTokBuf[`0`] == `'8'`)
2124	++ThisTokBuf;
2125	}
2126
2127	// Check for raw string
2128	if (ThisTokBuf[`0`] == `'R'`) {
2129	if (ThisTokBuf[`1`] != `'"'`) {
2130	// The file may have come from PCH and then changed after loading the
2131	// PCH; Fail gracefully.
2132	return DiagnoseLexingError(Loc: StringToks [i].getLocation());
2133	}
2134	ThisTokBuf += `2`; // skip R"
2135
2136	// C++11 [lex.string]p2: A `d-char-sequence` shall consist of at most 16
2137	// characters.
2138	constexpr unsigned MaxRawStrDelimLen = `16`;
2139
2140	const char *Prefix = ThisTokBuf;
2141	while (static_cast<unsigned>(ThisTokBuf - Prefix) < MaxRawStrDelimLen &&
2142	ThisTokBuf[`0`] != `'('`)
2143	++ThisTokBuf;
2144	if (ThisTokBuf[`0`] != `'('`)
2145	return DiagnoseLexingError(Loc: StringToks [i].getLocation());
2146	++ThisTokBuf; // skip '('
2147
2148	// Remove same number of characters from the end
2149	ThisTokEnd -= ThisTokBuf - Prefix;
2150	if (ThisTokEnd < ThisTokBuf)
2151	return DiagnoseLexingError(Loc: StringToks [i].getLocation());
2152
2153	// C++14 [lex.string]p4: A source-file new-line in a raw string literal
2154	// results in a new-line in the resulting execution string-literal.
2155	StringRef RemainingTokenSpan(ThisTokBuf, ThisTokEnd - ThisTokBuf);
2156	while (!RemainingTokenSpan.empty()) {
2157	// Split the string literal on \r\n boundaries.
2158	size_t CRLFPos = RemainingTokenSpan.find(Str: "\r\n");
2159	StringRef BeforeCRLF = RemainingTokenSpan.substr(Start: `0`, N: CRLFPos);
2160	StringRef AfterCRLF = RemainingTokenSpan.substr(Start: CRLFPos);
2161
2162	// Copy everything before the \r\n sequence into the string literal.
2163	if (CopyStringFragment(Tok: StringToks [i], TokBegin: ThisTokBegin, Fragment: BeforeCRLF))
2164	hadError = true;
2165
2166	// Point into the \n inside the \r\n sequence and operate on the
2167	// remaining portion of the literal.
2168	RemainingTokenSpan = AfterCRLF.substr(Start: `1`);
2169	}
2170	} else {
2171	if (ThisTokBuf[`0`] != `'"'`) {
2172	// The file may have come from PCH and then changed after loading the
2173	// PCH; Fail gracefully.
2174	return DiagnoseLexingError(Loc: StringToks [i].getLocation());
2175	}
2176	++ThisTokBuf; // skip "
2177
2178	// Check if this is a pascal string
2179	if (!isUnevaluated() && Features.PascalStrings &&
2180	ThisTokBuf + `1` != ThisTokEnd && ThisTokBuf[`0`] == `'\\'` &&
2181	ThisTokBuf[`1`] == `'p'`) {
2182
2183	// If the \p sequence is found in the first token, we have a pascal string
2184	// Otherwise, if we already have a pascal string, ignore the first \p
2185	if (i == `0`) {
2186	++ThisTokBuf;
2187	Pascal = true;
2188	} else if (Pascal)
2189	ThisTokBuf += `2`;
2190	}
2191
2192	while (ThisTokBuf != ThisTokEnd) {
2193	// Is this a span of non-escape characters?
2194	if (ThisTokBuf[`0`] != `'\\'`) {
2195	const char *InStart = ThisTokBuf;
2196	do {
2197	++ThisTokBuf;
2198	} while (ThisTokBuf != ThisTokEnd && ThisTokBuf[`0`] != `'\\'`);
2199
2200	// Copy the character span over.
2201	if (CopyStringFragment(Tok: StringToks [i], TokBegin: ThisTokBegin,
2202	Fragment: StringRef (InStart, ThisTokBuf - InStart)))
2203	hadError = true;
2204	continue;
2205	}
2206	// Is this a Universal Character Name escape?
2207	if (ThisTokBuf[`1`] == `'u'` \|\| ThisTokBuf[`1`] == `'U'` \|\|
2208	ThisTokBuf[`1`] == `'N'`) {
2209	EncodeUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd,
2210	ResultBuf&: ResultPtr, HadError&: hadError,
2211	Loc: FullSourceLoc (StringToks [i].getLocation(), SM),
2212	CharByteWidth, Diags, Features);
2213	continue;
2214	}
2215	// Otherwise, this is a non-UCN escape character. Process it.
2216	unsigned ResultChar =
2217	ProcessCharEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, HadError&: hadError,
2218	Loc: FullSourceLoc (StringToks [i].getLocation(), SM),
2219	CharWidth: CharByteWidth * `8`, Diags, Features, EvalMethod);
2220
2221	if (CharByteWidth == `4`) {
2222	// FIXME: Make the type of the result buffer correct instead of
2223	// using reinterpret_cast.
2224	llvm::UTF32 ResultWidePtr = reinterpret_cast<llvm::UTF32>(ResultPtr);
2225	*ResultWidePtr = ResultChar;
2226	ResultPtr += `4`;
2227	} else if (CharByteWidth == `2`) {
2228	// FIXME: Make the type of the result buffer correct instead of
2229	// using reinterpret_cast.
2230	llvm::UTF16 ResultWidePtr = reinterpret_cast<llvm::UTF16>(ResultPtr);
2231	*ResultWidePtr = ResultChar & `0xFFFF`;
2232	ResultPtr += `2`;
2233	} else {
2234	assert(CharByteWidth == `1` && "Unexpected char width");
2235	*ResultPtr++ = ResultChar & `0xFF`;
2236	}
2237	}
2238	}
2239	}
2240
2241	assert((!Pascal \|\| !isUnevaluated()) &&
2242	"Pascal string in unevaluated context");
2243	if (Pascal) {
2244	if (CharByteWidth == `4`) {
2245	// FIXME: Make the type of the result buffer correct instead of
2246	// using reinterpret_cast.
2247	llvm::UTF32 ResultWidePtr = reinterpret_cast<llvm::UTF32>(ResultBuf.data());
2248	ResultWidePtr[`0`] = GetNumStringChars() - `1`;
2249	} else if (CharByteWidth == `2`) {
2250	// FIXME: Make the type of the result buffer correct instead of
2251	// using reinterpret_cast.
2252	llvm::UTF16 ResultWidePtr = reinterpret_cast<llvm::UTF16>(ResultBuf.data());
2253	ResultWidePtr[`0`] = GetNumStringChars() - `1`;
2254	} else {
2255	assert(CharByteWidth == `1` && "Unexpected char width");
2256	ResultBuf [`0`] = GetNumStringChars() - `1`;
2257	}
2258
2259	// Verify that pascal strings aren't too large.
2260	if (GetStringLength() > `256`) {
2261	if (Diags)
2262	Diags->Report(Loc: StringToks.front().getLocation(),
2263	DiagID: diag::err_pascal_string_too_long)
2264	<< SourceRange (StringToks.front().getLocation(),
2265	StringToks.back().getLocation());
2266	hadError = true;
2267	return;
2268	}
2269	} else if (Diags) {
2270	// Complain if this string literal has too many characters.
2271	unsigned MaxChars = Features.CPlusPlus? `65536` : Features.C99 ? `4095` : `509`;
2272
2273	if (GetNumStringChars() > MaxChars)
2274	Diags->Report(Loc: StringToks.front().getLocation(),
2275	DiagID: diag::ext_string_too_long)
2276	<< GetNumStringChars() << MaxChars
2277	<< (Features.CPlusPlus ? `2` : Features.C99 ? `1` : `0`)
2278	<< SourceRange (StringToks.front().getLocation(),
2279	StringToks.back().getLocation());
2280	}
2281	}
2282
2283	static const char resyncUTF8(const* char Err, const* char *End) {
2284	if (Err == End)
2285	return End;
2286	End = Err + std::min<unsigned>(a: llvm::getNumBytesForUTF8(firstByte: *Err), b: End-Err);
2287	while (++Err != End && (*Err & `0xC0`) == `0x80`)
2288	;
2289	return Err;
2290	}
2291
2292	/// This function copies from Fragment, which is a sequence of bytes
2293	/// within Tok's contents (which begin at TokBegin) into ResultPtr.
2294	/// Performs widening for multi-byte characters.
2295	bool StringLiteralParser::CopyStringFragment(const Token &Tok,
2296	const char *TokBegin,
2297	StringRef Fragment) {
2298	const llvm::UTF8 *ErrorPtrTmp;
2299	if (ConvertUTF8toWide(WideCharWidth: CharByteWidth, Source: Fragment, ResultPtr, ErrorPtr&: ErrorPtrTmp))
2300	return false;
2301
2302	// If we see bad encoding for unprefixed string literals, warn and
2303	// simply copy the byte values, for compatibility with gcc and older
2304	// versions of clang.
2305	bool NoErrorOnBadEncoding = isOrdinary();
2306	if (NoErrorOnBadEncoding) {
2307	memcpy(dest: ResultPtr, src: Fragment.data(), n: Fragment.size());
2308	ResultPtr += Fragment.size();
2309	}
2310
2311	if (Diags) {
2312	const char ErrorPtr = reinterpret_cast<const* char *>(ErrorPtrTmp);
2313
2314	FullSourceLoc SourceLoc(Tok.getLocation(), SM);
2315	const DiagnosticBuilder &Builder =
2316	Diag(Diags, Features, TokLoc: SourceLoc, TokBegin,
2317	TokRangeBegin: ErrorPtr, TokRangeEnd: resyncUTF8(Err: ErrorPtr, End: Fragment.end()),
2318	DiagID: NoErrorOnBadEncoding ? diag::warn_bad_string_encoding
2319	: diag::err_bad_string_encoding);
2320
2321	const char *NextStart = resyncUTF8(Err: ErrorPtr, End: Fragment.end());
2322	StringRef NextFragment(NextStart, Fragment.end()-NextStart);
2323
2324	// Decode into a dummy buffer.
2325	SmallString<`512`> Dummy;
2326	Dummy.reserve(N: Fragment.size() * CharByteWidth);
2327	char *Ptr = Dummy.data();
2328
2329	while (!ConvertUTF8toWide(WideCharWidth: CharByteWidth, Source: NextFragment, ResultPtr&: Ptr, ErrorPtr&: ErrorPtrTmp)) {
2330	const char ErrorPtr = reinterpret_cast<const* char *>(ErrorPtrTmp);
2331	NextStart = resyncUTF8(Err: ErrorPtr, End: Fragment.end());
2332	Builder << MakeCharSourceRange(Features, TokLoc: SourceLoc, TokBegin,
2333	TokRangeBegin: ErrorPtr, TokRangeEnd: NextStart);
2334	NextFragment = StringRef (NextStart, Fragment.end()-NextStart);
2335	}
2336	}
2337	return !NoErrorOnBadEncoding;
2338	}
2339
2340	void StringLiteralParser::DiagnoseLexingError(SourceLocation Loc) {
2341	hadError = true;
2342	if (Diags)
2343	Diags->Report(Loc, DiagID: diag::err_lexing_string);
2344	}
2345
2346	/// getOffsetOfStringByte - This function returns the offset of the
2347	/// specified byte of the string data represented by Token. This handles
2348	/// advancing over escape sequences in the string.
2349	unsigned StringLiteralParser::getOffsetOfStringByte(const Token &Tok,
2350	unsigned ByteNo) const {
2351	// Get the spelling of the token.
2352	SmallString<`32`> SpellingBuffer;
2353	SpellingBuffer.resize(N: Tok.getLength());
2354
2355	bool StringInvalid = false;
2356	const char *SpellingPtr = &SpellingBuffer [`0`];
2357	unsigned TokLen = Lexer::getSpelling(Tok, Buffer&: SpellingPtr, SourceMgr: SM, LangOpts: Features,
2358	Invalid: &StringInvalid);
2359	if (StringInvalid)
2360	return `0`;
2361
2362	const char *SpellingStart = SpellingPtr;
2363	const char *SpellingEnd = SpellingPtr+TokLen;
2364
2365	// Handle UTF-8 strings just like narrow strings.
2366	if (SpellingPtr[`0`] == `'u'` && SpellingPtr[`1`] == `'8'`)
2367	SpellingPtr += `2`;
2368
2369	assert(SpellingPtr[`0`] != `'L'` && SpellingPtr[`0`] != `'u'` &&
2370	SpellingPtr[`0`] != `'U'` && "Doesn't handle wide or utf strings yet");
2371
2372	// For raw string literals, this is easy.
2373	if (SpellingPtr[`0`] == `'R'`) {
2374	assert(SpellingPtr[`1`] == `'"'` && "Should be a raw string literal!");
2375	// Skip 'R"'.
2376	SpellingPtr += `2`;
2377	while (*SpellingPtr != `'('`) {
2378	++SpellingPtr;
2379	assert(SpellingPtr < SpellingEnd && "Missing ( for raw string literal");
2380	}
2381	// Skip '('.
2382	++SpellingPtr;
2383	return SpellingPtr - SpellingStart + ByteNo;
2384	}
2385
2386	// Skip over the leading quote
2387	assert(SpellingPtr[`0`] == `'"'` && "Should be a string literal!");
2388	++SpellingPtr;
2389
2390	// Skip over bytes until we find the offset we're looking for.
2391	while (ByteNo) {
2392	assert(SpellingPtr < SpellingEnd && "Didn't find byte offset!");
2393
2394	// Step over non-escapes simply.
2395	if (*SpellingPtr != `'\\'`) {
2396	++SpellingPtr;
2397	--ByteNo;
2398	continue;
2399	}
2400
2401	// Otherwise, this is an escape character. Advance over it.
2402	bool HadError = false;
2403	if (SpellingPtr[`1`] == `'u'` \|\| SpellingPtr[`1`] == `'U'` \|\|
2404	SpellingPtr[`1`] == `'N'`) {
2405	const char *EscapePtr = SpellingPtr;
2406	unsigned Len = MeasureUCNEscape(ThisTokBegin: SpellingStart, ThisTokBuf&: SpellingPtr, ThisTokEnd: SpellingEnd,
2407	CharByteWidth: `1`, Features, HadError);
2408	if (Len > ByteNo) {
2409	// ByteNo is somewhere within the escape sequence.
2410	SpellingPtr = EscapePtr;
2411	break;
2412	}
2413	ByteNo -= Len;
2414	} else {
2415	ProcessCharEscape(ThisTokBegin: SpellingStart, ThisTokBuf&: SpellingPtr, ThisTokEnd: SpellingEnd, HadError,
2416	Loc: FullSourceLoc (Tok.getLocation(), SM), CharWidth: CharByteWidth * `8`,
2417	Diags, Features, EvalMethod: StringLiteralEvalMethod::Evaluated);
2418	--ByteNo;
2419	}
2420	assert(!HadError && "This method isn't valid on erroneous strings");
2421	}
2422
2423	return SpellingPtr-SpellingStart;
2424	}
2425
2426	/// Determine whether a suffix is a valid ud-suffix. We avoid treating reserved
2427	/// suffixes as ud-suffixes, because the diagnostic experience is better if we
2428	/// treat it as an invalid suffix.
2429	bool StringLiteralParser::isValidUDSuffix(const LangOptions &LangOpts,
2430	StringRef Suffix) {
2431	return NumericLiteralParser::isValidUDSuffix(LangOpts, Suffix) \|\|
2432	Suffix == "sv";
2433	}
2434

Browse the source code of llvm_projects/clang/lib/Lex/LiteralSupport.cpp