AsmLexer.cpp source code [llvm_projects/llvm/lib/MC/MCParser/AsmLexer.cpp]

1	//===- AsmLexer.cpp - Lexer for Assembly Files ----------------------------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	// This class implements the lexer for assembly files.
10	//
11	//===----------------------------------------------------------------------===//
12
13	#include "llvm/MC/MCParser/AsmLexer.h"
14	#include "llvm/ADT/APInt.h"
15	#include "llvm/ADT/ArrayRef.h"
16	#include "llvm/ADT/StringExtras.h"
17	#include "llvm/ADT/StringRef.h"
18	#include "llvm/MC/MCAsmInfo.h"
19	#include "llvm/MC/MCParser/AsmLexer.h"
20	#include "llvm/Support/Compiler.h"
21	#include "llvm/Support/SMLoc.h"
22	#include "llvm/Support/SaveAndRestore.h"
23	#include "llvm/Support/raw_ostream.h"
24	#include <cassert>
25	#include <cctype>
26	#include <cstdio>
27	#include <cstring>
28	#include <string>
29
30	using namespace llvm;
31
32	SMLoc AsmToken::getLoc() const { return SMLoc::getFromPointer(Ptr: Str.data()); }
33
34	SMLoc AsmToken::getEndLoc() const {
35	return SMLoc::getFromPointer(Ptr: Str.data() + Str.size());
36	}
37
38	SMRange AsmToken::getLocRange() const { return SMRange (getLoc(), getEndLoc()); }
39
40	void AsmToken::dump(raw_ostream &OS) const {
41	switch (Kind) {
42	case AsmToken::Error:
43	OS << "error";
44	break;
45	case AsmToken::Identifier:
46	OS << "identifier: " << getString();
47	break;
48	case AsmToken::Integer:
49	OS << "int: " << getString();
50	break;
51	case AsmToken::Real:
52	OS << "real: " << getString();
53	break;
54	case AsmToken::String:
55	OS << "string: " << getString();
56	break;
57
58	// clang-format off
59	case AsmToken::Amp: OS << "Amp"; break;
60	case AsmToken::AmpAmp: OS << "AmpAmp"; break;
61	case AsmToken::At: OS << "At"; break;
62	case AsmToken::BackSlash: OS << "BackSlash"; break;
63	case AsmToken::BigNum: OS << "BigNum"; break;
64	case AsmToken::Caret: OS << "Caret"; break;
65	case AsmToken::Colon: OS << "Colon"; break;
66	case AsmToken::Comma: OS << "Comma"; break;
67	case AsmToken::Comment: OS << "Comment"; break;
68	case AsmToken::Dollar: OS << "Dollar"; break;
69	case AsmToken::Dot: OS << "Dot"; break;
70	case AsmToken::EndOfStatement: OS << "EndOfStatement"; break;
71	case AsmToken::Eof: OS << "Eof"; break;
72	case AsmToken::Equal: OS << "Equal"; break;
73	case AsmToken::EqualEqual: OS << "EqualEqual"; break;
74	case AsmToken::Exclaim: OS << "Exclaim"; break;
75	case AsmToken::ExclaimEqual: OS << "ExclaimEqual"; break;
76	case AsmToken::Greater: OS << "Greater"; break;
77	case AsmToken::GreaterEqual: OS << "GreaterEqual"; break;
78	case AsmToken::GreaterGreater: OS << "GreaterGreater"; break;
79	case AsmToken::Hash: OS << "Hash"; break;
80	case AsmToken::HashDirective: OS << "HashDirective"; break;
81	case AsmToken::LBrac: OS << "LBrac"; break;
82	case AsmToken::LCurly: OS << "LCurly"; break;
83	case AsmToken::LParen: OS << "LParen"; break;
84	case AsmToken::Less: OS << "Less"; break;
85	case AsmToken::LessEqual: OS << "LessEqual"; break;
86	case AsmToken::LessGreater: OS << "LessGreater"; break;
87	case AsmToken::LessLess: OS << "LessLess"; break;
88	case AsmToken::Minus: OS << "Minus"; break;
89	case AsmToken::MinusGreater: OS << "MinusGreater"; break;
90	case AsmToken::Percent: OS << "Percent"; break;
91	case AsmToken::Pipe: OS << "Pipe"; break;
92	case AsmToken::PipePipe: OS << "PipePipe"; break;
93	case AsmToken::Plus: OS << "Plus"; break;
94	case AsmToken::Question: OS << "Question"; break;
95	case AsmToken::RBrac: OS << "RBrac"; break;
96	case AsmToken::RCurly: OS << "RCurly"; break;
97	case AsmToken::RParen: OS << "RParen"; break;
98	case AsmToken::Slash: OS << "Slash"; break;
99	case AsmToken::Space: OS << "Space"; break;
100	case AsmToken::Star: OS << "Star"; break;
101	case AsmToken::Tilde: OS << "Tilde"; break;
102	// clang-format on
103	}
104
105	// Print the token string.
106	OS << " (\"";
107	OS.write_escaped(Str: getString());
108	OS << "\")";
109	}
110
111	AsmLexer::AsmLexer(const MCAsmInfo &MAI) : MAI(MAI) {
112	// For COFF targets, this is true, while for ELF targets, it should be false.
113	// Currently, @specifier parsing depends on '@' being included in the token.
114	AllowAtInIdentifier = !StringRef(MAI.getCommentString()).starts_with(Prefix: "@") &&
115	MAI.useAtForSpecifier();
116	LexMotorolaIntegers = MAI.shouldUseMotorolaIntegers();
117
118	CurTok.emplace_back(Args: AsmToken::Space, Args: StringRef ());
119	}
120
121	void AsmLexer::setBuffer(StringRef Buf, const char *ptr,
122	bool EndStatementAtEOF) {
123	CurBuf = Buf;
124
125	if (ptr)
126	CurPtr = ptr;
127	else
128	CurPtr = CurBuf.begin();
129
130	TokStart = nullptr;
131	this->EndStatementAtEOF = EndStatementAtEOF;
132	}
133
134	/// ReturnError - Set the error to the specified string at the specified
135	/// location. This is defined to always return AsmToken::Error.
136	AsmToken AsmLexer::ReturnError(const char Loc, const* std::string &Msg) {
137	SetError(errLoc: SMLoc::getFromPointer(Ptr: Loc), err: Msg);
138
139	return AsmToken (AsmToken::Error, StringRef (Loc, CurPtr - Loc));
140	}
141
142	int AsmLexer::getNextChar() {
143	if (CurPtr == CurBuf.end())
144	return EOF;
145	return (unsigned char)*CurPtr++;
146	}
147
148	int AsmLexer::peekNextChar() {
149	if (CurPtr == CurBuf.end())
150	return EOF;
151	return (unsigned char)*CurPtr;
152	}
153
154	/// The leading integral digit sequence and dot should have already been
155	/// consumed, some or all of the fractional digit sequence can* have been*
156	/// consumed.
157	AsmToken AsmLexer::LexFloatLiteral() {
158	// Skip the fractional digit sequence.
159	while (isDigit(C: *CurPtr))
160	++CurPtr;
161
162	if (CurPtr == `'-'` \|\| CurPtr == `'+'`)
163	return ReturnError(Loc: CurPtr, Msg: "invalid sign in float literal");
164
165	// Check for exponent
166	if ((CurPtr == `'e'` \|\| CurPtr == `'E'`)) {
167	++CurPtr;
168
169	if (CurPtr == `'-'` \|\| CurPtr == `'+'`)
170	++CurPtr;
171
172	while (isDigit(C: *CurPtr))
173	++CurPtr;
174	}
175
176	return AsmToken (AsmToken::Real,
177	StringRef (TokStart, CurPtr - TokStart));
178	}
179
180	/// LexHexFloatLiteral matches essentially (.[0-9a-fA-F])?[pP][+-]?[0-9a-fA-F]+*
181	/// while making sure there are enough actual digits around for the constant to
182	/// be valid.
183	///
184	/// The leading "0x[0-9a-fA-F]" (i.e. integer part) has already been consumed*
185	/// before we get here.
186	AsmToken AsmLexer::LexHexFloatLiteral(bool NoIntDigits) {
187	assert((CurPtr == `'p'` \|\| CurPtr == `'P'` \|\| *CurPtr == `'.'`) &&
188	"unexpected parse state in floating hex");
189	bool NoFracDigits = true;
190
191	// Skip the fractional part if there is one
192	if (*CurPtr == `'.'`) {
193	++CurPtr;
194
195	const char *FracStart = CurPtr;
196	while (isHexDigit(C: *CurPtr))
197	++CurPtr;
198
199	NoFracDigits = CurPtr == FracStart;
200	}
201
202	if (NoIntDigits && NoFracDigits)
203	return ReturnError(Loc: TokStart, Msg: "invalid hexadecimal floating-point constant: "
204	"expected at least one significand digit");
205
206	// Make sure we do have some kind of proper exponent part
207	if (CurPtr != `'p'` && CurPtr != `'P'`)
208	return ReturnError(Loc: TokStart, Msg: "invalid hexadecimal floating-point constant: "
209	"expected exponent part 'p'");
210	++CurPtr;
211
212	if (CurPtr == `'+'` \|\| CurPtr == `'-'`)
213	++CurPtr;
214
215	// N.b. exponent digits are not* hex*
216	const char *ExpStart = CurPtr;
217	while (isDigit(C: *CurPtr))
218	++CurPtr;
219
220	if (CurPtr == ExpStart)
221	return ReturnError(Loc: TokStart, Msg: "invalid hexadecimal floating-point constant: "
222	"expected at least one exponent digit");
223
224	return AsmToken (AsmToken::Real, StringRef (TokStart, CurPtr - TokStart));
225	}
226
227	/// LexIdentifier: [a-zA-Z_$.@?][a-zA-Z0-9_$.@#?]*
228	static bool isIdentifierChar(char C, bool AllowAt, bool AllowHash) {
229	return isAlnum(C) \|\| C == `'_'` \|\| C == `'$'` \|\| C == `'.'` \|\| C == `'?'` \|\|
230	(AllowAt && C == `'@'`) \|\| (AllowHash && C == `'#'`);
231	}
232
233	AsmToken AsmLexer::LexIdentifier() {
234	// Check for floating point literals.
235	if (CurPtr[-`1`] == `'.'` && isDigit(C: *CurPtr)) {
236	// Disambiguate a .1243foo identifier from a floating literal.
237	while (isDigit(C: *CurPtr))
238	++CurPtr;
239
240	if (!isIdentifierChar(C: *CurPtr, AllowAt: AllowAtInIdentifier,
241	AllowHash: AllowHashInIdentifier) \|\|
242	CurPtr == `'e'` \|\| CurPtr == `'E'`)
243	return LexFloatLiteral();
244	}
245
246	while (isIdentifierChar(C: *CurPtr, AllowAt: AllowAtInIdentifier, AllowHash: AllowHashInIdentifier))
247	++CurPtr;
248
249	// Handle . as a special case.
250	if (CurPtr == TokStart+`1` && TokStart[`0`] == `'.'`)
251	return AsmToken (AsmToken::Dot, StringRef (TokStart, `1`));
252
253	return AsmToken (AsmToken::Identifier, StringRef (TokStart, CurPtr - TokStart));
254	}
255
256	/// LexSlash: Slash: /
257	/// C-Style Comment: / ... /
258	/// C-style Comment: // ...
259	AsmToken AsmLexer::LexSlash() {
260	if (!MAI.shouldAllowAdditionalComments()) {
261	IsAtStartOfStatement = false;
262	return AsmToken (AsmToken::Slash, StringRef (TokStart, `1`));
263	}
264
265	switch (*CurPtr) {
266	case `'*'`:
267	IsAtStartOfStatement = false;
268	break; // C style comment.
269	case `'/'`:
270	++CurPtr;
271	return LexLineComment();
272	default:
273	IsAtStartOfStatement = false;
274	return AsmToken (AsmToken::Slash, StringRef (TokStart, `1`));
275	}
276
277	// C Style comment.
278	++CurPtr; // skip the star.
279	const char *CommentTextStart = CurPtr;
280	while (CurPtr != CurBuf.end()) {
281	switch (*CurPtr++) {
282	case `'*'`:
283	// End of the comment?
284	if (*CurPtr != `'/'`)
285	break;
286	// If we have a CommentConsumer, notify it about the comment.
287	if (CommentConsumer) {
288	CommentConsumer->HandleComment(
289	Loc: SMLoc::getFromPointer(Ptr: CommentTextStart),
290	CommentText: StringRef (CommentTextStart, CurPtr - `1` - CommentTextStart));
291	}
292	++CurPtr; // End the /.*
293	return AsmToken (AsmToken::Comment,
294	StringRef (TokStart, CurPtr - TokStart));
295	}
296	}
297	return ReturnError(Loc: TokStart, Msg: "unterminated comment");
298	}
299
300	/// LexLineComment: Comment: #[^\n]*
301	/// : //[^\n]*
302	AsmToken AsmLexer::LexLineComment() {
303	// Mark This as an end of statement with a body of the
304	// comment. While it would be nicer to leave this two tokens,
305	// backwards compatability with TargetParsers makes keeping this in this form
306	// better.
307	const char *CommentTextStart = CurPtr;
308	int CurChar = getNextChar();
309	while (CurChar != `'\n'` && CurChar != `'\r'` && CurChar != EOF)
310	CurChar = getNextChar();
311	const char *NewlinePtr = CurPtr;
312	if (CurChar == `'\r'` && CurPtr != CurBuf.end() && *CurPtr == `'\n'`)
313	++CurPtr;
314
315	// If we have a CommentConsumer, notify it about the comment.
316	if (CommentConsumer) {
317	CommentConsumer->HandleComment(
318	Loc: SMLoc::getFromPointer(Ptr: CommentTextStart),
319	CommentText: StringRef (CommentTextStart, NewlinePtr - `1` - CommentTextStart));
320	}
321
322	IsAtStartOfLine = true;
323	// This is a whole line comment. leave newline
324	if (IsAtStartOfStatement)
325	return AsmToken (AsmToken::EndOfStatement,
326	StringRef (TokStart, CurPtr - TokStart));
327	IsAtStartOfStatement = true;
328
329	return AsmToken (AsmToken::EndOfStatement,
330	StringRef (TokStart, CurPtr - `1` - TokStart));
331	}
332
/// Advance \p CurPtr past an optional integer-literal type suffix: at most one
/// 'U' followed by at most two 'L's, matched case-insensitively.
static void SkipIgnoredIntegerSuffix(const char *&CurPtr) {
  // Skip case-insensitive ULL, UL, U, L and LL suffixes.
  if (CurPtr[0] == 'U' || CurPtr[0] == 'u')
    ++CurPtr;
  if (CurPtr[0] == 'L' || CurPtr[0] == 'l')
    ++CurPtr;
  if (CurPtr[0] == 'L' || CurPtr[0] == 'l')
    ++CurPtr;
}
342
343	// Look ahead to search for first non-hex digit, if it's [hH], then we treat the
344	// integer as a hexadecimal, possibly with leading zeroes.
345	static unsigned doHexLookAhead(const char &CurPtr, unsigned* DefaultRadix,
346	bool LexHex) {
347	const char FirstNonDec = nullptr*;
348	const char *LookAhead = CurPtr;
349	while (true) {
350	if (isDigit(C: *LookAhead)) {
351	++LookAhead;
352	} else {
353	if (!FirstNonDec)
354	FirstNonDec = LookAhead;
355
356	// Keep going if we are looking for a 'h' suffix.
357	if (LexHex && isHexDigit(C: *LookAhead))
358	++LookAhead;
359	else
360	break;
361	}
362	}
363	bool isHex = LexHex && (LookAhead == `'h'` \|\| LookAhead == `'H'`);
364	CurPtr = isHex \|\| !FirstNonDec ? LookAhead : FirstNonDec;
365	if (isHex)
366	return `16`;
367	return DefaultRadix;
368	}
369
370	static const char findLastDigit(const* char CurPtr, unsigned* DefaultRadix) {
371	while (hexDigitValue(C: *CurPtr) < DefaultRadix) {
372	++CurPtr;
373	}
374	return CurPtr;
375	}
376
377	static AsmToken intToken(StringRef Ref, APInt &Value) {
378	if (Value.isIntN(N: `64`))
379	return AsmToken (AsmToken::Integer, Ref, Value);
380	return AsmToken (AsmToken::BigNum, Ref, Value);
381	}
382
/// Return a name for \p Radix suitable for diagnostics: "binary", "octal",
/// "decimal" or "hexadecimal" for 2, 8, 10 and 16, and "base-N" otherwise.
static std::string radixName(unsigned Radix) {
  switch (Radix) {
  case 2:
    return "binary";
  case 8:
    return "octal";
  case 10:
    return "decimal";
  case 16:
    return "hexadecimal";
  default:
    return "base-" + std::to_string(Radix);
  }
}
397
398	/// LexDigit: First character is [0-9].
399	/// Local Label: [0-9][:]
400	/// Forward/Backward Label: [0-9][fb]
401	/// Binary integer: 0b[01]+
402	/// Octal integer: 0[0-7]+
403	/// Hex integer: 0x[0-9a-fA-F]+ or [0x]?[0-9][0-9a-fA-F][hH]*
404	/// Decimal integer: [1-9][0-9]*
405	AsmToken AsmLexer::LexDigit() {
406	// MASM-flavor binary integer: [01]+[yY] (if DefaultRadix < 16, [bByY])
407	// MASM-flavor octal integer: [0-7]+[oOqQ]
408	// MASM-flavor decimal integer: [0-9]+[tT] (if DefaultRadix < 16, [dDtT])
409	// MASM-flavor hexadecimal integer: [0-9][0-9a-fA-F][hH]*
410	if (LexMasmIntegers && isdigit(CurPtr[-`1`])) {
411	const char *FirstNonBinary =
412	(CurPtr[-`1`] != `'0'` && CurPtr[-`1`] != `'1'`) ? CurPtr - `1` : nullptr;
413	const char *FirstNonDecimal =
414	(CurPtr[-`1`] < `'0'` \|\| CurPtr[-`1`] > `'9'`) ? CurPtr - `1` : nullptr;
415	const char *OldCurPtr = CurPtr;
416	while (isHexDigit(C: *CurPtr)) {
417	switch (*CurPtr) {
418	default:
419	if (!FirstNonDecimal) {
420	FirstNonDecimal = CurPtr;
421	}
422	[[fallthrough]];
423	case `'9'`:
424	case `'8'`:
425	case `'7'`:
426	case `'6'`:
427	case `'5'`:
428	case `'4'`:
429	case `'3'`:
430	case `'2'`:
431	if (!FirstNonBinary) {
432	FirstNonBinary = CurPtr;
433	}
434	break;
435	case `'1'`:
436	case `'0'`:
437	break;
438	}
439	++CurPtr;
440	}
441	if (*CurPtr == `'.'`) {
442	// MASM float literals (other than hex floats) always contain a ".", and
443	// are always written in decimal.
444	++CurPtr;
445	return LexFloatLiteral();
446	}
447
448	if (LexMasmHexFloats && (CurPtr == `'r'` \|\| CurPtr == `'R'`)) {
449	++CurPtr;
450	return AsmToken (AsmToken::Real, StringRef (TokStart, CurPtr - TokStart));
451	}
452
453	unsigned Radix = `0`;
454	if (CurPtr == `'h'` \|\| CurPtr == `'H'`) {
455	// hexadecimal number
456	++CurPtr;
457	Radix = `16`;
458	} else if (CurPtr == `'t'` \|\| CurPtr == `'T'`) {
459	// decimal number
460	++CurPtr;
461	Radix = `10`;
462	} else if (CurPtr == `'o'` \|\| CurPtr == `'O'` \|\| *CurPtr == `'q'` \|\|
463	*CurPtr == `'Q'`) {
464	// octal number
465	++CurPtr;
466	Radix = `8`;
467	} else if (CurPtr == `'y'` \|\| CurPtr == `'Y'`) {
468	// binary number
469	++CurPtr;
470	Radix = `2`;
471	} else if (FirstNonDecimal && FirstNonDecimal + `1` == CurPtr &&
472	DefaultRadix < `14` &&
473	(FirstNonDecimal == `'d'` \|\| FirstNonDecimal == `'D'`)) {
474	Radix = `10`;
475	} else if (FirstNonBinary && FirstNonBinary + `1` == CurPtr &&
476	DefaultRadix < `12` &&
477	(FirstNonBinary == `'b'` \|\| FirstNonBinary == `'B'`)) {
478	Radix = `2`;
479	}
480
481	if (Radix) {
482	StringRef Result(TokStart, CurPtr - TokStart);
483	APInt Value(`128`, `0`, true);
484
485	if (Result.drop_back().getAsInteger(Radix, Result&: Value))
486	return ReturnError(Loc: TokStart, Msg: "invalid " + radixName(Radix) + " number");
487
488	// MSVC accepts and ignores type suffices on integer literals.
489	SkipIgnoredIntegerSuffix(CurPtr);
490
491	return intToken(Ref: Result, Value);
492	}
493
494	// default-radix integers, or floating point numbers, fall through
495	CurPtr = OldCurPtr;
496	}
497
498	// MASM default-radix integers: [0-9a-fA-F]+
499	// (All other integer literals have a radix specifier.)
500	if (LexMasmIntegers && UseMasmDefaultRadix) {
501	CurPtr = findLastDigit(CurPtr, DefaultRadix: `16`);
502	StringRef Result(TokStart, CurPtr - TokStart);
503
504	APInt Value(`128`, `0`, true);
505	if (Result.getAsInteger(Radix: DefaultRadix, Result&: Value)) {
506	return ReturnError(Loc: TokStart,
507	Msg: "invalid " + radixName(Radix: DefaultRadix) + " number");
508	}
509
510	return intToken(Ref: Result, Value);
511	}
512
513	// Motorola hex integers: $[0-9a-fA-F]+
514	if (LexMotorolaIntegers && CurPtr[-`1`] == `'$'`) {
515	const char *NumStart = CurPtr;
516	while (isHexDigit(C: CurPtr[`0`]))
517	++CurPtr;
518
519	APInt Result(`128`, `0`);
520	if (StringRef (NumStart, CurPtr - NumStart).getAsInteger(Radix: `16`, Result))
521	return ReturnError(Loc: TokStart, Msg: "invalid hexadecimal number");
522
523	return intToken(Ref: StringRef (TokStart, CurPtr - TokStart), Value&: Result);
524	}
525
526	// Motorola binary integers: %[01]+
527	if (LexMotorolaIntegers && CurPtr[-`1`] == `'%'`) {
528	const char *NumStart = CurPtr;
529	while (CurPtr == `'0'` \|\| CurPtr == `'1'`)
530	++CurPtr;
531
532	APInt Result(`128`, `0`);
533	if (StringRef (NumStart, CurPtr - NumStart).getAsInteger(Radix: `2`, Result))
534	return ReturnError(Loc: TokStart, Msg: "invalid binary number");
535
536	return intToken(Ref: StringRef (TokStart, CurPtr - TokStart), Value&: Result);
537	}
538
539	// Decimal integer: [1-9][0-9]*
540	// HLASM-flavour decimal integer: [0-9][0-9]*
541	// FIXME: Later on, support for fb for HLASM has to be added in
542	// as they probably would be needed for asm goto
543	if (LexHLASMIntegers \|\| CurPtr[-`1`] != `'0'` \|\| CurPtr[`0`] == `'.'`) {
544	unsigned Radix = doHexLookAhead(CurPtr, DefaultRadix: `10`, LexHex: LexMasmIntegers);
545
546	if (!LexHLASMIntegers) {
547	bool IsHex = Radix == `16`;
548	// Check for floating point literals.
549	if (!IsHex && (CurPtr == `'.'` \|\| CurPtr == `'e'` \|\| *CurPtr == `'E'`)) {
550	if (*CurPtr == `'.'`)
551	++CurPtr;
552	return LexFloatLiteral();
553	}
554	}
555
556	StringRef Result(TokStart, CurPtr - TokStart);
557
558	APInt Value(`128`, `0`, true);
559	if (Result.getAsInteger(Radix, Result&: Value))
560	return ReturnError(Loc: TokStart, Msg: "invalid " + radixName(Radix) + " number");
561
562	if (!LexHLASMIntegers)
563	// The darwin/x86 (and x86-64) assembler accepts and ignores type
564	// suffices on integer literals.
565	SkipIgnoredIntegerSuffix(CurPtr);
566
567	return intToken(Ref: Result, Value);
568	}
569
570	if (!LexMasmIntegers && ((CurPtr == `'b'`) \|\| (CurPtr == `'B'`))) {
571	++CurPtr;
572	// See if we actually have "0b" as part of something like "jmp 0b\n"
573	if (!isDigit(C: CurPtr[`0`])) {
574	--CurPtr;
575	StringRef Result(TokStart, CurPtr - TokStart);
576	return AsmToken (AsmToken::Integer, Result, `0`);
577	}
578	const char *NumStart = CurPtr;
579	while (CurPtr[`0`] == `'0'` \|\| CurPtr[`0`] == `'1'`)
580	++CurPtr;
581
582	// Requires at least one binary digit.
583	if (CurPtr == NumStart)
584	return ReturnError(Loc: TokStart, Msg: "invalid binary number");
585
586	StringRef Result(TokStart, CurPtr - TokStart);
587
588	APInt Value(`128`, `0`, true);
589	if (Result.substr(Start: `2`).getAsInteger(Radix: `2`, Result&: Value))
590	return ReturnError(Loc: TokStart, Msg: "invalid binary number");
591
592	// The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
593	// suffixes on integer literals.
594	SkipIgnoredIntegerSuffix(CurPtr);
595
596	return intToken(Ref: Result, Value);
597	}
598
599	if ((CurPtr == `'x'`) \|\| (CurPtr == `'X'`)) {
600	++CurPtr;
601	const char *NumStart = CurPtr;
602	while (isHexDigit(C: CurPtr[`0`]))
603	++CurPtr;
604
605	// "0x.0p0" is valid, and "0x0p0" (but not "0xp0" for example, which will be
606	// diagnosed by LexHexFloatLiteral).
607	if (CurPtr[`0`] == `'.'` \|\| CurPtr[`0`] == `'p'` \|\| CurPtr[`0`] == `'P'`)
608	return LexHexFloatLiteral(NoIntDigits: NumStart == CurPtr);
609
610	// Otherwise requires at least one hex digit.
611	if (CurPtr == NumStart)
612	return ReturnError(Loc: CurPtr-`2`, Msg: "invalid hexadecimal number");
613
614	APInt Result(`128`, `0`);
615	if (StringRef (TokStart, CurPtr - TokStart).getAsInteger(Radix: `0`, Result))
616	return ReturnError(Loc: TokStart, Msg: "invalid hexadecimal number");
617
618	// Consume the optional [hH].
619	if (LexMasmIntegers && (CurPtr == `'h'` \|\| CurPtr == `'H'`))
620	++CurPtr;
621
622	// The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
623	// suffixes on integer literals.
624	SkipIgnoredIntegerSuffix(CurPtr);
625
626	return intToken(Ref: StringRef (TokStart, CurPtr - TokStart), Value&: Result);
627	}
628
629	// Either octal or hexadecimal.
630	APInt Value(`128`, `0`, true);
631	unsigned Radix = doHexLookAhead(CurPtr, DefaultRadix: `8`, LexHex: LexMasmIntegers);
632	StringRef Result(TokStart, CurPtr - TokStart);
633	if (Result.getAsInteger(Radix, Result&: Value))
634	return ReturnError(Loc: TokStart, Msg: "invalid " + radixName(Radix) + " number");
635
636	// Consume the [hH].
637	if (Radix == `16`)
638	++CurPtr;
639
640	// The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
641	// suffixes on integer literals.
642	SkipIgnoredIntegerSuffix(CurPtr);
643
644	return intToken(Ref: Result, Value);
645	}
646
647	/// LexSingleQuote: Integer: 'b'
648	AsmToken AsmLexer::LexSingleQuote() {
649	int CurChar = getNextChar();
650
651	if (LexHLASMStrings)
652	return ReturnError(Loc: TokStart, Msg: "invalid usage of character literals");
653
654	if (LexMasmStrings) {
655	while (CurChar != EOF) {
656	if (CurChar != `'\''`) {
657	CurChar = getNextChar();
658	} else if (peekNextChar() == `'\''`) {
659	// In MASM single-quote strings, doubled single-quotes mean an escaped
660	// single quote, so should be lexed in.
661	(void)getNextChar();
662	CurChar = getNextChar();
663	} else {
664	break;
665	}
666	}
667	if (CurChar == EOF)
668	return ReturnError(Loc: TokStart, Msg: "unterminated string constant");
669	return AsmToken (AsmToken::String, StringRef (TokStart, CurPtr - TokStart));
670	}
671
672	if (CurChar == `'\\'`)
673	CurChar = getNextChar();
674
675	if (CurChar == EOF)
676	return ReturnError(Loc: TokStart, Msg: "unterminated single quote");
677
678	CurChar = getNextChar();
679
680	if (CurChar != `'\''`)
681	return ReturnError(Loc: TokStart, Msg: "single quote way too long");
682
683	// The idea here being that 'c' is basically just an integral
684	// constant.
685	StringRef Res = StringRef (TokStart,CurPtr - TokStart);
686	long long Value;
687
688	if (Res.starts_with(Prefix: "\'\\")) {
689	char theChar = Res [`2`];
690	switch (theChar) {
691	default: Value = theChar; break;
692	case `'\''`: Value = `'\''`; break;
693	case `'t'`: Value = `'\t'`; break;
694	case `'n'`: Value = `'\n'`; break;
695	case `'b'`: Value = `'\b'`; break;
696	case `'f'`: Value = `'\f'`; break;
697	case `'r'`: Value = `'\r'`; break;
698	}
699	} else
700	Value = TokStart[`1`];
701
702	return AsmToken (AsmToken::Integer, Res, Value);
703	}
704
705	/// LexQuote: String: "..."
706	AsmToken AsmLexer::LexQuote() {
707	int CurChar = getNextChar();
708	if (LexHLASMStrings)
709	return ReturnError(Loc: TokStart, Msg: "invalid usage of string literals");
710
711	if (LexMasmStrings) {
712	while (CurChar != EOF) {
713	if (CurChar != `'"'`) {
714	CurChar = getNextChar();
715	} else if (peekNextChar() == `'"'`) {
716	// In MASM double-quoted strings, doubled double-quotes mean an escaped
717	// double quote, so should be lexed in.
718	(void)getNextChar();
719	CurChar = getNextChar();
720	} else {
721	break;
722	}
723	}
724	if (CurChar == EOF)
725	return ReturnError(Loc: TokStart, Msg: "unterminated string constant");
726	return AsmToken (AsmToken::String, StringRef (TokStart, CurPtr - TokStart));
727	}
728
729	while (CurChar != `'"'`) {
730	if (CurChar == `'\\'`) {
731	// Allow \", etc.
732	CurChar = getNextChar();
733	}
734
735	if (CurChar == EOF)
736	return ReturnError(Loc: TokStart, Msg: "unterminated string constant");
737
738	CurChar = getNextChar();
739	}
740
741	return AsmToken (AsmToken::String, StringRef (TokStart, CurPtr - TokStart));
742	}
743
744	StringRef AsmLexer::LexUntilEndOfStatement() {
745	TokStart = CurPtr;
746
747	while (!isAtStartOfComment(Ptr: CurPtr) && // Start of line comment.
748	!isAtStatementSeparator(Ptr: CurPtr) && // End of statement marker.
749	CurPtr != `'\n'` && CurPtr != `'\r'` && CurPtr != CurBuf.end()) {
750	++CurPtr;
751	}
752	return StringRef (TokStart, CurPtr-TokStart);
753	}
754
755	StringRef AsmLexer::LexUntilEndOfLine() {
756	TokStart = CurPtr;
757
758	while (CurPtr != `'\n'` && CurPtr != `'\r'` && CurPtr != CurBuf.end()) {
759	++CurPtr;
760	}
761	return StringRef (TokStart, CurPtr-TokStart);
762	}
763
764	size_t AsmLexer::peekTokens(MutableArrayRef<AsmToken> Buf,
765	bool ShouldSkipSpace) {
766	SaveAndRestore SavedTokenStart(TokStart);
767	SaveAndRestore SavedCurPtr(CurPtr);
768	SaveAndRestore SavedAtStartOfLine(IsAtStartOfLine);
769	SaveAndRestore SavedAtStartOfStatement(IsAtStartOfStatement);
770	SaveAndRestore SavedSkipSpace(SkipSpace, ShouldSkipSpace);
771	SaveAndRestore SavedIsPeeking(IsPeeking, true);
772	std::string SavedErr = getErr();
773	SMLoc SavedErrLoc = getErrLoc();
774
775	size_t ReadCount;
776	for (ReadCount = `0`; ReadCount < Buf.size(); ++ReadCount) {
777	AsmToken Token = LexToken();
778
779	Buf [ReadCount] = Token;
780
781	if (Token.is(K: AsmToken::Eof)) {
782	ReadCount++;
783	break;
784	}
785	}
786
787	SetError(errLoc: SavedErrLoc, err: SavedErr);
788	return ReadCount;
789	}
790
791	bool AsmLexer::isAtStartOfComment(const char *Ptr) {
792	if (MAI.isHLASM() && !IsAtStartOfStatement)
793	return false;
794
795	StringRef CommentString = MAI.getCommentString();
796
797	if (CommentString.size() == `1`)
798	return CommentString [`0`] == Ptr[`0`];
799
800	// Allow # preprocessor comments also be counted as comments for "##" cases
801	if (CommentString [`1`] == `'#'`)
802	return CommentString [`0`] == Ptr[`0`];
803
804	return strncmp(s1: Ptr, s2: CommentString.data(), n: CommentString.size()) == `0`;
805	}
806
807	bool AsmLexer::isAtStatementSeparator(const char *Ptr) {
808	return strncmp(s1: Ptr, s2: MAI.getSeparatorString(),
809	n: strlen(s: MAI.getSeparatorString())) == `0`;
810	}
811
812	AsmToken AsmLexer::LexToken() {
813	TokStart = CurPtr;
814	// This always consumes at least one character.
815	int CurChar = getNextChar();
816
817	if (!IsPeeking && CurChar == `'#'` && IsAtStartOfStatement) {
818	// If this starts with a '#', this may be a cpp
819	// hash directive and otherwise a line comment.
820	AsmToken TokenBuf[`2`];
821	MutableArrayRef<AsmToken> Buf(TokenBuf, `2`);
822	size_t num = peekTokens(Buf, ShouldSkipSpace: true);
823	// There cannot be a space preceding this
824	if (IsAtStartOfLine && num == `2` && TokenBuf[`0`].is(K: AsmToken::Integer) &&
825	TokenBuf[`1`].is(K: AsmToken::String)) {
826	CurPtr = TokStart; // reset curPtr;
827	StringRef s = LexUntilEndOfLine();
828	UnLex(Token: TokenBuf[`1`]);
829	UnLex(Token: TokenBuf[`0`]);
830	return AsmToken (AsmToken::HashDirective, s);
831	}
832
833	if (MAI.shouldAllowAdditionalComments())
834	return LexLineComment();
835	}
836
837	if (isAtStartOfComment(Ptr: TokStart))
838	return LexLineComment();
839
840	if (isAtStatementSeparator(Ptr: TokStart)) {
841	CurPtr += strlen(s: MAI.getSeparatorString()) - `1`;
842	IsAtStartOfLine = true;
843	IsAtStartOfStatement = true;
844	return AsmToken (AsmToken::EndOfStatement,
845	StringRef (TokStart, strlen(s: MAI.getSeparatorString())));
846	}
847
848	// If we're missing a newline at EOF, make sure we still get an
849	// EndOfStatement token before the Eof token.
850	if (CurChar == EOF && !IsAtStartOfStatement && EndStatementAtEOF) {
851	IsAtStartOfLine = true;
852	IsAtStartOfStatement = true;
853	return AsmToken (AsmToken::EndOfStatement, StringRef (TokStart, `0`));
854	}
855	IsAtStartOfLine = false;
856	bool OldIsAtStartOfStatement = IsAtStartOfStatement;
857	IsAtStartOfStatement = false;
858	switch (CurChar) {
859	default:
860	// Handle identifier: [a-zA-Z_.$@#?][a-zA-Z0-9_.$@#?]*
861	// Whether or not the lexer accepts '$', '@', '#' and '?' at the start of
862	// an identifier is target-dependent. These characters are handled in the
863	// respective switch cases.
864	if (isalpha(CurChar) \|\| CurChar == `'_'` \|\| CurChar == `'.'`)
865	return LexIdentifier();
866
867	// Unknown character, emit an error.
868	return ReturnError(Loc: TokStart, Msg: "invalid character in input");
869	case EOF:
870	if (EndStatementAtEOF) {
871	IsAtStartOfLine = true;
872	IsAtStartOfStatement = true;
873	}
874	return AsmToken (AsmToken::Eof, StringRef (TokStart, `0`));
875	case `0`:
876	case `' '`:
877	case `'\t'`:
878	IsAtStartOfStatement = OldIsAtStartOfStatement;
879	while (CurPtr == `' '` \|\| CurPtr == `'\t'`)
880	CurPtr++;
881	if (SkipSpace)
882	return LexToken(); // Ignore whitespace.
883	else
884	return AsmToken (AsmToken::Space, StringRef (TokStart, CurPtr - TokStart));
885	case `'\r'`: {
886	IsAtStartOfLine = true;
887	IsAtStartOfStatement = true;
888	// If this is a CR followed by LF, treat that as one token.
889	if (CurPtr != CurBuf.end() && *CurPtr == `'\n'`)
890	++CurPtr;
891	return AsmToken (AsmToken::EndOfStatement,
892	StringRef (TokStart, CurPtr - TokStart));
893	}
894	case `'\n'`:
895	IsAtStartOfLine = true;
896	IsAtStartOfStatement = true;
897	return AsmToken (AsmToken::EndOfStatement, StringRef (TokStart, `1`));
898	case `':'`: return AsmToken (AsmToken::Colon, StringRef (TokStart, `1`));
899	case `'+'`: return AsmToken (AsmToken::Plus, StringRef (TokStart, `1`));
900	case `'~'`: return AsmToken (AsmToken::Tilde, StringRef (TokStart, `1`));
901	case `'('`: return AsmToken (AsmToken::LParen, StringRef (TokStart, `1`));
902	case `')'`: return AsmToken (AsmToken::RParen, StringRef (TokStart, `1`));
903	case `'['`: return AsmToken (AsmToken::LBrac, StringRef (TokStart, `1`));
904	case `']'`: return AsmToken (AsmToken::RBrac, StringRef (TokStart, `1`));
905	case `'{'`: return AsmToken (AsmToken::LCurly, StringRef (TokStart, `1`));
906	case `'}'`: return AsmToken (AsmToken::RCurly, StringRef (TokStart, `1`));
907	case `''`: return* AsmToken (AsmToken::Star, StringRef (TokStart, `1`));
908	case `','`: return AsmToken (AsmToken::Comma, StringRef (TokStart, `1`));
909	case `'$'`: {
910	if (LexMotorolaIntegers && isHexDigit(C: *CurPtr))
911	return LexDigit();
912	if (MAI.doesAllowDollarAtStartOfIdentifier())
913	return LexIdentifier();
914	return AsmToken (AsmToken::Dollar, StringRef (TokStart, `1`));
915	}
916	case `'@'`:
917	if (MAI.doesAllowAtAtStartOfIdentifier())
918	return LexIdentifier();
919	return AsmToken (AsmToken::At, StringRef (TokStart, `1`));
920	case `'#'`:
921	if (MAI.isHLASM())
922	return LexIdentifier();
923	return AsmToken (AsmToken::Hash, StringRef (TokStart, `1`));
924	case `'?'`:
925	if (MAI.doesAllowQuestionAtStartOfIdentifier())
926	return LexIdentifier();
927	return AsmToken (AsmToken::Question, StringRef (TokStart, `1`));
928	case `'\\'`: return AsmToken (AsmToken::BackSlash, StringRef (TokStart, `1`));
929	case `'='`:
930	if (*CurPtr == `'='`) {
931	++CurPtr;
932	return AsmToken (AsmToken::EqualEqual, StringRef (TokStart, `2`));
933	}
934	return AsmToken (AsmToken::Equal, StringRef (TokStart, `1`));
935	case `'-'`:
936	if (*CurPtr == `'>'`) {
937	++CurPtr;
938	return AsmToken (AsmToken::MinusGreater, StringRef (TokStart, `2`));
939	}
940	return AsmToken (AsmToken::Minus, StringRef (TokStart, `1`));
941	case `'\|'`:
942	if (*CurPtr == `'\|'`) {
943	++CurPtr;
944	return AsmToken (AsmToken::PipePipe, StringRef (TokStart, `2`));
945	}
946	return AsmToken (AsmToken::Pipe, StringRef (TokStart, `1`));
947	case `'^'`: return AsmToken (AsmToken::Caret, StringRef (TokStart, `1`));
948	case `'&'`:
949	if (*CurPtr == `'&'`) {
950	++CurPtr;
951	return AsmToken (AsmToken::AmpAmp, StringRef (TokStart, `2`));
952	}
953	return AsmToken (AsmToken::Amp, StringRef (TokStart, `1`));
954	case `'!'`:
955	if (*CurPtr == `'='`) {
956	++CurPtr;
957	return AsmToken (AsmToken::ExclaimEqual, StringRef (TokStart, `2`));
958	}
959	return AsmToken (AsmToken::Exclaim, StringRef (TokStart, `1`));
960	case `'%'`:
961	if (LexMotorolaIntegers && (CurPtr == `'0'` \|\| CurPtr == `'1'`)) {
962	return LexDigit();
963	}
964	return AsmToken (AsmToken::Percent, StringRef (TokStart, `1`));
965	case `'/'`:
966	IsAtStartOfStatement = OldIsAtStartOfStatement;
967	return LexSlash();
968	case `'\''`: return LexSingleQuote();
969	case `'"'`: return LexQuote();
970	case `'0'`: case `'1'`: case `'2'`: case `'3'`: case `'4'`:
971	case `'5'`: case `'6'`: case `'7'`: case `'8'`: case `'9'`:
972	return LexDigit();
973	case `'<'`:
974	switch (*CurPtr) {
975	case `'<'`:
976	++CurPtr;
977	return AsmToken (AsmToken::LessLess, StringRef (TokStart, `2`));
978	case `'='`:
979	++CurPtr;
980	return AsmToken (AsmToken::LessEqual, StringRef (TokStart, `2`));
981	case `'>'`:
982	++CurPtr;
983	return AsmToken (AsmToken::LessGreater, StringRef (TokStart, `2`));
984	default:
985	return AsmToken (AsmToken::Less, StringRef (TokStart, `1`));
986	}
987	case `'>'`:
988	switch (*CurPtr) {
989	case `'>'`:
990	++CurPtr;
991	return AsmToken (AsmToken::GreaterGreater, StringRef (TokStart, `2`));
992	case `'='`:
993	++CurPtr;
994	return AsmToken (AsmToken::GreaterEqual, StringRef (TokStart, `2`));
995	default:
996	return AsmToken (AsmToken::Greater, StringRef (TokStart, `1`));
997	}
998
999	// TODO: Quoted identifiers (objc methods etc)
1000	// local labels: [0-9][:]
1001	// Forward/backward labels: [0-9][fb]
1002	// Integers, fp constants, character constants.
1003	}
1004	}
1005

Browse the source code of llvm_projects/llvm/lib/MC/MCParser/AsmLexer.cpp