AsmLexer.cpp source code [llvm_projects/llvm/lib/MC/MCParser/AsmLexer.cpp]

1	//===- AsmLexer.cpp - Lexer for Assembly Files ----------------------------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	// This class implements the lexer for assembly files.
10	//
11	//===----------------------------------------------------------------------===//
12
13	#include "llvm/MC/MCParser/AsmLexer.h"
14	#include "llvm/ADT/APInt.h"
15	#include "llvm/ADT/ArrayRef.h"
16	#include "llvm/ADT/StringExtras.h"
17	#include "llvm/ADT/StringRef.h"
18	#include "llvm/MC/MCAsmInfo.h"
19	#include "llvm/Support/Compiler.h"
20	#include "llvm/Support/SMLoc.h"
21	#include "llvm/Support/SaveAndRestore.h"
22	#include "llvm/Support/raw_ostream.h"
23	#include <cassert>
24	#include <cctype>
25	#include <cstdio>
26	#include <cstring>
27	#include <string>
28
29	using namespace llvm;
30
31	SMLoc AsmToken::getLoc() const { return SMLoc::getFromPointer(Ptr: Str.data()); }
32
33	SMLoc AsmToken::getEndLoc() const {
34	return SMLoc::getFromPointer(Ptr: Str.data() + Str.size());
35	}
36
37	SMRange AsmToken::getLocRange() const { return SMRange (getLoc(), getEndLoc()); }
38
39	void AsmToken::dump(raw_ostream &OS) const {
40	switch (Kind) {
41	case AsmToken::Error:
42	OS << "error";
43	break;
44	case AsmToken::Identifier:
45	OS << "identifier: " << getString();
46	break;
47	case AsmToken::Integer:
48	OS << "int: " << getString();
49	break;
50	case AsmToken::Real:
51	OS << "real: " << getString();
52	break;
53	case AsmToken::String:
54	OS << "string: " << getString();
55	break;
56
57	// clang-format off
58	case AsmToken::Amp: OS << "Amp"; break;
59	case AsmToken::AmpAmp: OS << "AmpAmp"; break;
60	case AsmToken::At: OS << "At"; break;
61	case AsmToken::BackSlash: OS << "BackSlash"; break;
62	case AsmToken::BigNum: OS << "BigNum"; break;
63	case AsmToken::Caret: OS << "Caret"; break;
64	case AsmToken::Colon: OS << "Colon"; break;
65	case AsmToken::Comma: OS << "Comma"; break;
66	case AsmToken::Comment: OS << "Comment"; break;
67	case AsmToken::Dollar: OS << "Dollar"; break;
68	case AsmToken::Dot: OS << "Dot"; break;
69	case AsmToken::EndOfStatement: OS << "EndOfStatement"; break;
70	case AsmToken::Eof: OS << "Eof"; break;
71	case AsmToken::Equal: OS << "Equal"; break;
72	case AsmToken::EqualEqual: OS << "EqualEqual"; break;
73	case AsmToken::Exclaim: OS << "Exclaim"; break;
74	case AsmToken::ExclaimEqual: OS << "ExclaimEqual"; break;
75	case AsmToken::Greater: OS << "Greater"; break;
76	case AsmToken::GreaterEqual: OS << "GreaterEqual"; break;
77	case AsmToken::GreaterGreater: OS << "GreaterGreater"; break;
78	case AsmToken::Hash: OS << "Hash"; break;
79	case AsmToken::HashDirective: OS << "HashDirective"; break;
80	case AsmToken::LBrac: OS << "LBrac"; break;
81	case AsmToken::LCurly: OS << "LCurly"; break;
82	case AsmToken::LParen: OS << "LParen"; break;
83	case AsmToken::Less: OS << "Less"; break;
84	case AsmToken::LessEqual: OS << "LessEqual"; break;
85	case AsmToken::LessGreater: OS << "LessGreater"; break;
86	case AsmToken::LessLess: OS << "LessLess"; break;
87	case AsmToken::Minus: OS << "Minus"; break;
88	case AsmToken::MinusGreater: OS << "MinusGreater"; break;
89	case AsmToken::Percent: OS << "Percent"; break;
90	case AsmToken::Pipe: OS << "Pipe"; break;
91	case AsmToken::PipePipe: OS << "PipePipe"; break;
92	case AsmToken::Plus: OS << "Plus"; break;
93	case AsmToken::Question: OS << "Question"; break;
94	case AsmToken::RBrac: OS << "RBrac"; break;
95	case AsmToken::RCurly: OS << "RCurly"; break;
96	case AsmToken::RParen: OS << "RParen"; break;
97	case AsmToken::Slash: OS << "Slash"; break;
98	case AsmToken::Space: OS << "Space"; break;
99	case AsmToken::Star: OS << "Star"; break;
100	case AsmToken::Tilde: OS << "Tilde"; break;
101	// clang-format on
102	}
103
104	// Print the token string.
105	OS << " (\"";
106	OS.write_escaped(Str: getString());
107	OS << "\")";
108	}
109
110	AsmLexer::AsmLexer(const MCAsmInfo &MAI) : MAI(MAI) {
111	// For COFF targets, this is true, while for ELF targets, it should be false.
112	// Currently, @specifier parsing depends on '@' being included in the token.
113	AllowAtInIdentifier = !StringRef(MAI.getCommentString()).starts_with(Prefix: "@") &&
114	MAI.useAtForSpecifier();
115	LexMotorolaIntegers = MAI.shouldUseMotorolaIntegers();
116
117	CurTok.emplace_back(Args: AsmToken::Space, Args: StringRef ());
118	}
119
120	void AsmLexer::setBuffer(StringRef Buf, const char *ptr,
121	bool EndStatementAtEOF) {
122	// Buffer must be NULL-terminated. NULL terminator must reside at `Buf.end()`.
123	// It must be safe to dereference `Buf.end()`.
124	assert(*Buf.end() == `'\0'` &&
125	"Buffer provided to AsmLexer lacks null terminator.");
126
127	CurBuf = Buf;
128
129	if (ptr)
130	CurPtr = ptr;
131	else
132	CurPtr = CurBuf.begin();
133
134	TokStart = nullptr;
135	this->EndStatementAtEOF = EndStatementAtEOF;
136	}
137
138	/// ReturnError - Set the error to the specified string at the specified
139	/// location. This is defined to always return AsmToken::Error.
140	AsmToken AsmLexer::ReturnError(const char Loc, const* std::string &Msg) {
141	SetError(errLoc: SMLoc::getFromPointer(Ptr: Loc), err: Msg);
142
143	return AsmToken (AsmToken::Error, StringRef (Loc, CurPtr - Loc));
144	}
145
146	int AsmLexer::getNextChar() {
147	if (CurPtr == CurBuf.end())
148	return EOF;
149	return (unsigned char)*CurPtr++;
150	}
151
152	int AsmLexer::peekNextChar() {
153	if (CurPtr == CurBuf.end())
154	return EOF;
155	return (unsigned char)*CurPtr;
156	}
157
158	/// The leading integral digit sequence and dot should have already been
159	/// consumed, some or all of the fractional digit sequence can* have been*
160	/// consumed.
161	AsmToken AsmLexer::LexFloatLiteral() {
162	// Skip the fractional digit sequence.
163	while (isDigit(C: *CurPtr))
164	++CurPtr;
165
166	if (CurPtr == `'-'` \|\| CurPtr == `'+'`)
167	return ReturnError(Loc: CurPtr, Msg: "invalid sign in float literal");
168
169	// Check for exponent
170	if ((CurPtr == `'e'` \|\| CurPtr == `'E'`)) {
171	++CurPtr;
172
173	if (CurPtr == `'-'` \|\| CurPtr == `'+'`)
174	++CurPtr;
175
176	while (isDigit(C: *CurPtr))
177	++CurPtr;
178	}
179
180	return AsmToken (AsmToken::Real,
181	StringRef (TokStart, CurPtr - TokStart));
182	}
183
184	/// LexHexFloatLiteral matches essentially (.[0-9a-fA-F])?[pP][+-]?[0-9a-fA-F]+*
185	/// while making sure there are enough actual digits around for the constant to
186	/// be valid.
187	///
188	/// The leading "0x[0-9a-fA-F]" (i.e. integer part) has already been consumed*
189	/// before we get here.
190	AsmToken AsmLexer::LexHexFloatLiteral(bool NoIntDigits) {
191	assert((CurPtr == `'p'` \|\| CurPtr == `'P'` \|\| *CurPtr == `'.'`) &&
192	"unexpected parse state in floating hex");
193	bool NoFracDigits = true;
194
195	// Skip the fractional part if there is one
196	if (*CurPtr == `'.'`) {
197	++CurPtr;
198
199	const char *FracStart = CurPtr;
200	while (isHexDigit(C: *CurPtr))
201	++CurPtr;
202
203	NoFracDigits = CurPtr == FracStart;
204	}
205
206	if (NoIntDigits && NoFracDigits)
207	return ReturnError(Loc: TokStart, Msg: "invalid hexadecimal floating-point constant: "
208	"expected at least one significand digit");
209
210	// Make sure we do have some kind of proper exponent part
211	if (CurPtr != `'p'` && CurPtr != `'P'`)
212	return ReturnError(Loc: TokStart, Msg: "invalid hexadecimal floating-point constant: "
213	"expected exponent part 'p'");
214	++CurPtr;
215
216	if (CurPtr == `'+'` \|\| CurPtr == `'-'`)
217	++CurPtr;
218
219	// N.b. exponent digits are not* hex*
220	const char *ExpStart = CurPtr;
221	while (isDigit(C: *CurPtr))
222	++CurPtr;
223
224	if (CurPtr == ExpStart)
225	return ReturnError(Loc: TokStart, Msg: "invalid hexadecimal floating-point constant: "
226	"expected at least one exponent digit");
227
228	return AsmToken (AsmToken::Real, StringRef (TokStart, CurPtr - TokStart));
229	}
230
231	/// LexIdentifier: [a-zA-Z_$.@?][a-zA-Z0-9_$.@#?]*
232	static bool isIdentifierChar(char C, bool AllowAt, bool AllowHash) {
233	return isAlnum(C) \|\| C == `'_'` \|\| C == `'$'` \|\| C == `'.'` \|\| C == `'?'` \|\|
234	(AllowAt && C == `'@'`) \|\| (AllowHash && C == `'#'`);
235	}
236
237	AsmToken AsmLexer::LexIdentifier() {
238	// Check for floating point literals.
239	if (CurPtr[-`1`] == `'.'` && isDigit(C: *CurPtr)) {
240	// Disambiguate a .1243foo identifier from a floating literal.
241	while (isDigit(C: *CurPtr))
242	++CurPtr;
243
244	if (!isIdentifierChar(C: *CurPtr, AllowAt: AllowAtInIdentifier,
245	AllowHash: AllowHashInIdentifier) \|\|
246	CurPtr == `'e'` \|\| CurPtr == `'E'`)
247	return LexFloatLiteral();
248	}
249
250	while (isIdentifierChar(C: *CurPtr, AllowAt: AllowAtInIdentifier, AllowHash: AllowHashInIdentifier))
251	++CurPtr;
252
253	// Handle . as a special case.
254	if (CurPtr == TokStart+`1` && TokStart[`0`] == `'.'`)
255	return AsmToken (AsmToken::Dot, StringRef (TokStart, `1`));
256
257	return AsmToken (AsmToken::Identifier, StringRef (TokStart, CurPtr - TokStart));
258	}
259
260	/// LexSlash: Slash: /
261	/// C-Style Comment: / ... /
262	/// C-style Comment: // ...
263	AsmToken AsmLexer::LexSlash() {
264	if (!MAI.shouldAllowAdditionalComments()) {
265	IsAtStartOfStatement = false;
266	return AsmToken (AsmToken::Slash, StringRef (TokStart, `1`));
267	}
268
269	switch (*CurPtr) {
270	case `'*'`:
271	IsAtStartOfStatement = false;
272	break; // C style comment.
273	case `'/'`:
274	++CurPtr;
275	return LexLineComment();
276	default:
277	IsAtStartOfStatement = false;
278	return AsmToken (AsmToken::Slash, StringRef (TokStart, `1`));
279	}
280
281	// C Style comment.
282	++CurPtr; // skip the star.
283	const char *CommentTextStart = CurPtr;
284	while (CurPtr != CurBuf.end()) {
285	switch (*CurPtr++) {
286	case `'*'`:
287	// End of the comment?
288	if (*CurPtr != `'/'`)
289	break;
290	// If we have a CommentConsumer, notify it about the comment.
291	if (CommentConsumer) {
292	CommentConsumer->HandleComment(
293	Loc: SMLoc::getFromPointer(Ptr: CommentTextStart),
294	CommentText: StringRef (CommentTextStart, CurPtr - `1` - CommentTextStart));
295	}
296	++CurPtr; // End the /.*
297	return AsmToken (AsmToken::Comment,
298	StringRef (TokStart, CurPtr - TokStart));
299	}
300	}
301	return ReturnError(Loc: TokStart, Msg: "unterminated comment");
302	}
303
304	/// LexLineComment: Comment: #[^\n]*
305	/// : //[^\n]*
306	AsmToken AsmLexer::LexLineComment() {
307	// Mark This as an end of statement with a body of the
308	// comment. While it would be nicer to leave this two tokens,
309	// backwards compatability with TargetParsers makes keeping this in this form
310	// better.
311	const char *CommentTextStart = CurPtr;
312	int CurChar = getNextChar();
313	while (CurChar != `'\n'` && CurChar != `'\r'` && CurChar != EOF)
314	CurChar = getNextChar();
315	const char *NewlinePtr = CurPtr;
316	if (CurChar == `'\r'` && CurPtr != CurBuf.end() && *CurPtr == `'\n'`)
317	++CurPtr;
318
319	// If we have a CommentConsumer, notify it about the comment.
320	if (CommentConsumer) {
321	CommentConsumer->HandleComment(
322	Loc: SMLoc::getFromPointer(Ptr: CommentTextStart),
323	CommentText: StringRef (CommentTextStart, NewlinePtr - `1` - CommentTextStart));
324	}
325
326	IsAtStartOfLine = true;
327	// This is a whole line comment. leave newline
328	if (IsAtStartOfStatement)
329	return AsmToken (AsmToken::EndOfStatement,
330	StringRef (TokStart, CurPtr - TokStart));
331	IsAtStartOfStatement = true;
332
333	return AsmToken (AsmToken::EndOfStatement,
334	StringRef (TokStart, CurPtr - `1` - TokStart));
335	}
336
/// Advance \p CurPtr past an optional integer type suffix: at most one
/// 'U'/'u' followed by at most two 'L'/'l' characters.
static void SkipIgnoredIntegerSuffix(const char *&CurPtr) {
  // Skip case-insensitive ULL, UL, U, L and LL suffixes.
  if (CurPtr[0] == 'U' || CurPtr[0] == 'u')
    ++CurPtr;
  if (CurPtr[0] == 'L' || CurPtr[0] == 'l')
    ++CurPtr;
  if (CurPtr[0] == 'L' || CurPtr[0] == 'l')
    ++CurPtr;
}
346
347	// Look ahead to search for first non-hex digit, if it's [hH], then we treat the
348	// integer as a hexadecimal, possibly with leading zeroes.
349	static unsigned doHexLookAhead(const char &CurPtr, unsigned* DefaultRadix,
350	bool LexHex) {
351	const char FirstNonDec = nullptr*;
352	const char *LookAhead = CurPtr;
353	while (true) {
354	if (isDigit(C: *LookAhead)) {
355	++LookAhead;
356	} else {
357	if (!FirstNonDec)
358	FirstNonDec = LookAhead;
359
360	// Keep going if we are looking for a 'h' suffix.
361	if (LexHex && isHexDigit(C: *LookAhead))
362	++LookAhead;
363	else
364	break;
365	}
366	}
367	bool isHex = LexHex && (LookAhead == `'h'` \|\| LookAhead == `'H'`);
368	CurPtr = isHex \|\| !FirstNonDec ? LookAhead : FirstNonDec;
369	if (isHex)
370	return `16`;
371	return DefaultRadix;
372	}
373
374	static const char findLastDigit(const* char CurPtr, unsigned* DefaultRadix) {
375	while (hexDigitValue(C: *CurPtr) < DefaultRadix) {
376	++CurPtr;
377	}
378	return CurPtr;
379	}
380
381	static AsmToken intToken(StringRef Ref, APInt &Value) {
382	if (Value.isIntN(N: `64`))
383	return AsmToken (AsmToken::Integer, Ref, Value);
384	return AsmToken (AsmToken::BigNum, Ref, Value);
385	}
386
/// Return a human-readable name for \p Radix for use in diagnostics:
/// "binary", "octal", "decimal", "hexadecimal", or "base-N" for anything else.
static std::string radixName(unsigned Radix) {
  switch (Radix) {
  case 2:
    return "binary";
  case 8:
    return "octal";
  case 10:
    return "decimal";
  case 16:
    return "hexadecimal";
  default:
    return "base-" + std::to_string(Radix);
  }
}
401
402	/// LexDigit: First character is [0-9].
403	/// Local Label: [0-9][:]
404	/// Forward/Backward Label: [0-9][fb]
405	/// Binary integer: 0b[01]+
406	/// Octal integer: 0[0-7]+
407	/// Hex integer: 0x[0-9a-fA-F]+ or [0x]?[0-9][0-9a-fA-F][hH]*
408	/// Decimal integer: [1-9][0-9]*
409	AsmToken AsmLexer::LexDigit() {
410	// MASM-flavor binary integer: [01]+[yY] (if DefaultRadix < 16, [bByY])
411	// MASM-flavor octal integer: [0-7]+[oOqQ]
412	// MASM-flavor decimal integer: [0-9]+[tT] (if DefaultRadix < 16, [dDtT])
413	// MASM-flavor hexadecimal integer: [0-9][0-9a-fA-F][hH]*
414	if (LexMasmIntegers && isdigit(CurPtr[-`1`])) {
415	const char *FirstNonBinary =
416	(CurPtr[-`1`] != `'0'` && CurPtr[-`1`] != `'1'`) ? CurPtr - `1` : nullptr;
417	const char *FirstNonDecimal =
418	(CurPtr[-`1`] < `'0'` \|\| CurPtr[-`1`] > `'9'`) ? CurPtr - `1` : nullptr;
419	const char *OldCurPtr = CurPtr;
420	while (isHexDigit(C: *CurPtr)) {
421	switch (*CurPtr) {
422	default:
423	if (!FirstNonDecimal) {
424	FirstNonDecimal = CurPtr;
425	}
426	[[fallthrough]];
427	case `'9'`:
428	case `'8'`:
429	case `'7'`:
430	case `'6'`:
431	case `'5'`:
432	case `'4'`:
433	case `'3'`:
434	case `'2'`:
435	if (!FirstNonBinary) {
436	FirstNonBinary = CurPtr;
437	}
438	break;
439	case `'1'`:
440	case `'0'`:
441	break;
442	}
443	++CurPtr;
444	}
445	if (*CurPtr == `'.'`) {
446	// MASM float literals (other than hex floats) always contain a ".", and
447	// are always written in decimal.
448	++CurPtr;
449	return LexFloatLiteral();
450	}
451
452	if (LexMasmHexFloats && (CurPtr == `'r'` \|\| CurPtr == `'R'`)) {
453	++CurPtr;
454	return AsmToken (AsmToken::Real, StringRef (TokStart, CurPtr - TokStart));
455	}
456
457	unsigned Radix = `0`;
458	if (CurPtr == `'h'` \|\| CurPtr == `'H'`) {
459	// hexadecimal number
460	++CurPtr;
461	Radix = `16`;
462	} else if (CurPtr == `'t'` \|\| CurPtr == `'T'`) {
463	// decimal number
464	++CurPtr;
465	Radix = `10`;
466	} else if (CurPtr == `'o'` \|\| CurPtr == `'O'` \|\| *CurPtr == `'q'` \|\|
467	*CurPtr == `'Q'`) {
468	// octal number
469	++CurPtr;
470	Radix = `8`;
471	} else if (CurPtr == `'y'` \|\| CurPtr == `'Y'`) {
472	// binary number
473	++CurPtr;
474	Radix = `2`;
475	} else if (FirstNonDecimal && FirstNonDecimal + `1` == CurPtr &&
476	DefaultRadix < `14` &&
477	(FirstNonDecimal == `'d'` \|\| FirstNonDecimal == `'D'`)) {
478	Radix = `10`;
479	} else if (FirstNonBinary && FirstNonBinary + `1` == CurPtr &&
480	DefaultRadix < `12` &&
481	(FirstNonBinary == `'b'` \|\| FirstNonBinary == `'B'`)) {
482	Radix = `2`;
483	}
484
485	if (Radix) {
486	StringRef Result(TokStart, CurPtr - TokStart);
487	APInt Value(`128`, `0`, true);
488
489	if (Result.drop_back().getAsInteger(Radix, Result&: Value))
490	return ReturnError(Loc: TokStart, Msg: "invalid " + radixName(Radix) + " number");
491
492	// MSVC accepts and ignores type suffices on integer literals.
493	SkipIgnoredIntegerSuffix(CurPtr);
494
495	return intToken(Ref: Result, Value);
496	}
497
498	// default-radix integers, or floating point numbers, fall through
499	CurPtr = OldCurPtr;
500	}
501
502	// MASM default-radix integers: [0-9a-fA-F]+
503	// (All other integer literals have a radix specifier.)
504	if (LexMasmIntegers && UseMasmDefaultRadix) {
505	CurPtr = findLastDigit(CurPtr, DefaultRadix: `16`);
506	StringRef Result(TokStart, CurPtr - TokStart);
507
508	APInt Value(`128`, `0`, true);
509	if (Result.getAsInteger(Radix: DefaultRadix, Result&: Value)) {
510	return ReturnError(Loc: TokStart,
511	Msg: "invalid " + radixName(Radix: DefaultRadix) + " number");
512	}
513
514	return intToken(Ref: Result, Value);
515	}
516
517	// Motorola hex integers: $[0-9a-fA-F]+
518	if (LexMotorolaIntegers && CurPtr[-`1`] == `'$'`) {
519	const char *NumStart = CurPtr;
520	while (isHexDigit(C: CurPtr[`0`]))
521	++CurPtr;
522
523	APInt Result(`128`, `0`);
524	if (StringRef (NumStart, CurPtr - NumStart).getAsInteger(Radix: `16`, Result))
525	return ReturnError(Loc: TokStart, Msg: "invalid hexadecimal number");
526
527	return intToken(Ref: StringRef (TokStart, CurPtr - TokStart), Value&: Result);
528	}
529
530	// Motorola binary integers: %[01]+
531	if (LexMotorolaIntegers && CurPtr[-`1`] == `'%'`) {
532	const char *NumStart = CurPtr;
533	while (CurPtr == `'0'` \|\| CurPtr == `'1'`)
534	++CurPtr;
535
536	APInt Result(`128`, `0`);
537	if (StringRef (NumStart, CurPtr - NumStart).getAsInteger(Radix: `2`, Result))
538	return ReturnError(Loc: TokStart, Msg: "invalid binary number");
539
540	return intToken(Ref: StringRef (TokStart, CurPtr - TokStart), Value&: Result);
541	}
542
543	// Decimal integer: [1-9][0-9]*
544	// HLASM-flavour decimal integer: [0-9][0-9]*
545	// FIXME: Later on, support for fb for HLASM has to be added in
546	// as they probably would be needed for asm goto
547	if (LexHLASMIntegers \|\| CurPtr[-`1`] != `'0'` \|\| CurPtr[`0`] == `'.'`) {
548	unsigned Radix = doHexLookAhead(CurPtr, DefaultRadix: `10`, LexHex: LexMasmIntegers);
549
550	if (!LexHLASMIntegers) {
551	bool IsHex = Radix == `16`;
552	// Check for floating point literals.
553	if (!IsHex && (CurPtr == `'.'` \|\| CurPtr == `'e'` \|\| *CurPtr == `'E'`)) {
554	if (*CurPtr == `'.'`)
555	++CurPtr;
556	return LexFloatLiteral();
557	}
558	}
559
560	StringRef Result(TokStart, CurPtr - TokStart);
561
562	APInt Value(`128`, `0`, true);
563	if (Result.getAsInteger(Radix, Result&: Value))
564	return ReturnError(Loc: TokStart, Msg: "invalid " + radixName(Radix) + " number");
565
566	if (!LexHLASMIntegers)
567	// The darwin/x86 (and x86-64) assembler accepts and ignores type
568	// suffices on integer literals.
569	SkipIgnoredIntegerSuffix(CurPtr);
570
571	return intToken(Ref: Result, Value);
572	}
573
574	if (!LexMasmIntegers && ((CurPtr == `'b'`) \|\| (CurPtr == `'B'`))) {
575	++CurPtr;
576	// See if we actually have "0b" as part of something like "jmp 0b\n"
577	if (!isDigit(C: CurPtr[`0`])) {
578	--CurPtr;
579	StringRef Result(TokStart, CurPtr - TokStart);
580	return AsmToken (AsmToken::Integer, Result, `0`);
581	}
582	const char *NumStart = CurPtr;
583	while (CurPtr[`0`] == `'0'` \|\| CurPtr[`0`] == `'1'`)
584	++CurPtr;
585
586	// Requires at least one binary digit.
587	if (CurPtr == NumStart)
588	return ReturnError(Loc: TokStart, Msg: "invalid binary number");
589
590	StringRef Result(TokStart, CurPtr - TokStart);
591
592	APInt Value(`128`, `0`, true);
593	if (Result.substr(Start: `2`).getAsInteger(Radix: `2`, Result&: Value))
594	return ReturnError(Loc: TokStart, Msg: "invalid binary number");
595
596	// The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
597	// suffixes on integer literals.
598	SkipIgnoredIntegerSuffix(CurPtr);
599
600	return intToken(Ref: Result, Value);
601	}
602
603	if ((CurPtr == `'x'`) \|\| (CurPtr == `'X'`)) {
604	++CurPtr;
605	const char *NumStart = CurPtr;
606	while (isHexDigit(C: CurPtr[`0`]))
607	++CurPtr;
608
609	// "0x.0p0" is valid, and "0x0p0" (but not "0xp0" for example, which will be
610	// diagnosed by LexHexFloatLiteral).
611	if (CurPtr[`0`] == `'.'` \|\| CurPtr[`0`] == `'p'` \|\| CurPtr[`0`] == `'P'`)
612	return LexHexFloatLiteral(NoIntDigits: NumStart == CurPtr);
613
614	// Otherwise requires at least one hex digit.
615	if (CurPtr == NumStart)
616	return ReturnError(Loc: CurPtr-`2`, Msg: "invalid hexadecimal number");
617
618	APInt Result(`128`, `0`);
619	if (StringRef (TokStart, CurPtr - TokStart).getAsInteger(Radix: `0`, Result))
620	return ReturnError(Loc: TokStart, Msg: "invalid hexadecimal number");
621
622	// Consume the optional [hH].
623	if (LexMasmIntegers && (CurPtr == `'h'` \|\| CurPtr == `'H'`))
624	++CurPtr;
625
626	// The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
627	// suffixes on integer literals.
628	SkipIgnoredIntegerSuffix(CurPtr);
629
630	return intToken(Ref: StringRef (TokStart, CurPtr - TokStart), Value&: Result);
631	}
632
633	// Either octal or hexadecimal.
634	APInt Value(`128`, `0`, true);
635	unsigned Radix = doHexLookAhead(CurPtr, DefaultRadix: `8`, LexHex: LexMasmIntegers);
636	StringRef Result(TokStart, CurPtr - TokStart);
637	if (Result.getAsInteger(Radix, Result&: Value))
638	return ReturnError(Loc: TokStart, Msg: "invalid " + radixName(Radix) + " number");
639
640	// Consume the [hH].
641	if (Radix == `16`)
642	++CurPtr;
643
644	// The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
645	// suffixes on integer literals.
646	SkipIgnoredIntegerSuffix(CurPtr);
647
648	return intToken(Ref: Result, Value);
649	}
650
651	/// LexSingleQuote: Integer: 'b'
652	AsmToken AsmLexer::LexSingleQuote() {
653	int CurChar = getNextChar();
654
655	if (LexHLASMStrings)
656	return ReturnError(Loc: TokStart, Msg: "invalid usage of character literals");
657
658	if (LexMasmStrings) {
659	while (CurChar != EOF) {
660	if (CurChar != `'\''`) {
661	CurChar = getNextChar();
662	} else if (peekNextChar() == `'\''`) {
663	// In MASM single-quote strings, doubled single-quotes mean an escaped
664	// single quote, so should be lexed in.
665	(void)getNextChar();
666	CurChar = getNextChar();
667	} else {
668	break;
669	}
670	}
671	if (CurChar == EOF)
672	return ReturnError(Loc: TokStart, Msg: "unterminated string constant");
673	return AsmToken (AsmToken::String, StringRef (TokStart, CurPtr - TokStart));
674	}
675
676	if (CurChar == `'\\'`)
677	CurChar = getNextChar();
678
679	if (CurChar == EOF)
680	return ReturnError(Loc: TokStart, Msg: "unterminated single quote");
681
682	CurChar = getNextChar();
683
684	if (CurChar != `'\''`)
685	return ReturnError(Loc: TokStart, Msg: "single quote way too long");
686
687	// The idea here being that 'c' is basically just an integral
688	// constant.
689	StringRef Res = StringRef (TokStart,CurPtr - TokStart);
690	long long Value;
691
692	if (Res.starts_with(Prefix: "\'\\")) {
693	char theChar = Res [`2`];
694	switch (theChar) {
695	default: Value = theChar; break;
696	case `'\''`: Value = `'\''`; break;
697	case `'t'`: Value = `'\t'`; break;
698	case `'n'`: Value = `'\n'`; break;
699	case `'b'`: Value = `'\b'`; break;
700	case `'f'`: Value = `'\f'`; break;
701	case `'r'`: Value = `'\r'`; break;
702	}
703	} else
704	Value = TokStart[`1`];
705
706	return AsmToken (AsmToken::Integer, Res, Value);
707	}
708
709	/// LexQuote: String: "..."
710	AsmToken AsmLexer::LexQuote() {
711	int CurChar = getNextChar();
712	if (LexHLASMStrings)
713	return ReturnError(Loc: TokStart, Msg: "invalid usage of string literals");
714
715	if (LexMasmStrings) {
716	while (CurChar != EOF) {
717	if (CurChar != `'"'`) {
718	CurChar = getNextChar();
719	} else if (peekNextChar() == `'"'`) {
720	// In MASM double-quoted strings, doubled double-quotes mean an escaped
721	// double quote, so should be lexed in.
722	(void)getNextChar();
723	CurChar = getNextChar();
724	} else {
725	break;
726	}
727	}
728	if (CurChar == EOF)
729	return ReturnError(Loc: TokStart, Msg: "unterminated string constant");
730	return AsmToken (AsmToken::String, StringRef (TokStart, CurPtr - TokStart));
731	}
732
733	while (CurChar != `'"'`) {
734	if (CurChar == `'\\'`) {
735	// Allow \", etc.
736	CurChar = getNextChar();
737	}
738
739	if (CurChar == EOF)
740	return ReturnError(Loc: TokStart, Msg: "unterminated string constant");
741
742	CurChar = getNextChar();
743	}
744
745	return AsmToken (AsmToken::String, StringRef (TokStart, CurPtr - TokStart));
746	}
747
748	StringRef AsmLexer::LexUntilEndOfStatement() {
749	TokStart = CurPtr;
750
751	while (!isAtStartOfComment(Ptr: CurPtr) && // Start of line comment.
752	!isAtStatementSeparator(Ptr: CurPtr) && // End of statement marker.
753	CurPtr != `'\n'` && CurPtr != `'\r'` && CurPtr != CurBuf.end()) {
754	++CurPtr;
755	}
756	return StringRef (TokStart, CurPtr-TokStart);
757	}
758
759	StringRef AsmLexer::LexUntilEndOfLine() {
760	TokStart = CurPtr;
761
762	while (CurPtr != `'\n'` && CurPtr != `'\r'` && CurPtr != CurBuf.end()) {
763	++CurPtr;
764	}
765	return StringRef (TokStart, CurPtr-TokStart);
766	}
767
768	size_t AsmLexer::peekTokens(MutableArrayRef<AsmToken> Buf,
769	bool ShouldSkipSpace) {
770	SaveAndRestore SavedTokenStart(TokStart);
771	SaveAndRestore SavedCurPtr(CurPtr);
772	SaveAndRestore SavedAtStartOfLine(IsAtStartOfLine);
773	SaveAndRestore SavedAtStartOfStatement(IsAtStartOfStatement);
774	SaveAndRestore SavedSkipSpace(SkipSpace, ShouldSkipSpace);
775	SaveAndRestore SavedIsPeeking(IsPeeking, true);
776	std::string SavedErr = getErr();
777	SMLoc SavedErrLoc = getErrLoc();
778
779	size_t ReadCount;
780	for (ReadCount = `0`; ReadCount < Buf.size(); ++ReadCount) {
781	AsmToken Token = LexToken();
782
783	Buf [ReadCount] = Token;
784
785	if (Token.is(K: AsmToken::Eof)) {
786	ReadCount++;
787	break;
788	}
789	}
790
791	SetError(errLoc: SavedErrLoc, err: SavedErr);
792	return ReadCount;
793	}
794
795	bool AsmLexer::isAtStartOfComment(const char *Ptr) {
796	if (MAI.isHLASM() && !IsAtStartOfStatement)
797	return false;
798
799	StringRef CommentString = MAI.getCommentString();
800
801	if (CommentString.size() == `1`)
802	return CommentString [`0`] == Ptr[`0`];
803
804	// Allow # preprocessor comments also be counted as comments for "##" cases
805	if (CommentString [`1`] == `'#'`)
806	return CommentString [`0`] == Ptr[`0`];
807
808	return strncmp(s1: Ptr, s2: CommentString.data(), n: CommentString.size()) == `0`;
809	}
810
811	bool AsmLexer::isAtStatementSeparator(const char *Ptr) {
812	return strncmp(s1: Ptr, s2: MAI.getSeparatorString(),
813	n: strlen(s: MAI.getSeparatorString())) == `0`;
814	}
815
816	AsmToken AsmLexer::LexToken() {
817	TokStart = CurPtr;
818	// This always consumes at least one character.
819	int CurChar = getNextChar();
820
821	if (!IsPeeking && CurChar == `'#'` && IsAtStartOfStatement) {
822	// If this starts with a '#', this may be a cpp
823	// hash directive and otherwise a line comment.
824	AsmToken TokenBuf[`2`];
825	MutableArrayRef<AsmToken> Buf(TokenBuf, `2`);
826	size_t num = peekTokens(Buf, ShouldSkipSpace: true);
827	// There cannot be a space preceding this
828	if (IsAtStartOfLine && num == `2` && TokenBuf[`0`].is(K: AsmToken::Integer) &&
829	TokenBuf[`1`].is(K: AsmToken::String)) {
830	CurPtr = TokStart; // reset curPtr;
831	StringRef s = LexUntilEndOfLine();
832	UnLex(Token: TokenBuf[`1`]);
833	UnLex(Token: TokenBuf[`0`]);
834	return AsmToken (AsmToken::HashDirective, s);
835	}
836
837	if (MAI.shouldAllowAdditionalComments())
838	return LexLineComment();
839	}
840
841	if (isAtStartOfComment(Ptr: TokStart)) {
842	StringRef CommentString = MAI.getCommentString();
843	// For multi-char comment strings, advance CurPtr only if we matched the
844	// full string. This stops us from accidentally eating the newline if the
845	// current line ends in a single comment char.
846	if (CommentString.size() > `1` &&
847	StringRef (TokStart, CommentString.size()) == CommentString) {
848	CurPtr += CommentString.size() - `1`;
849	}
850	return LexLineComment();
851	}
852
853	if (isAtStatementSeparator(Ptr: TokStart)) {
854	CurPtr += strlen(s: MAI.getSeparatorString()) - `1`;
855	IsAtStartOfLine = true;
856	IsAtStartOfStatement = true;
857	return AsmToken (AsmToken::EndOfStatement,
858	StringRef (TokStart, strlen(s: MAI.getSeparatorString())));
859	}
860
861	// If we're missing a newline at EOF, make sure we still get an
862	// EndOfStatement token before the Eof token.
863	if (CurChar == EOF && !IsAtStartOfStatement && EndStatementAtEOF) {
864	IsAtStartOfLine = true;
865	IsAtStartOfStatement = true;
866	return AsmToken (AsmToken::EndOfStatement, StringRef (TokStart, `0`));
867	}
868	IsAtStartOfLine = false;
869	bool OldIsAtStartOfStatement = IsAtStartOfStatement;
870	IsAtStartOfStatement = false;
871	switch (CurChar) {
872	default:
873	// Handle identifier: [a-zA-Z_.$@#?][a-zA-Z0-9_.$@#?]*
874	// Whether or not the lexer accepts '$', '@', '#' and '?' at the start of
875	// an identifier is target-dependent. These characters are handled in the
876	// respective switch cases.
877	if (isalpha(CurChar) \|\| CurChar == `'_'` \|\| CurChar == `'.'`)
878	return LexIdentifier();
879
880	// Unknown character, emit an error.
881	return ReturnError(Loc: TokStart, Msg: "invalid character in input");
882	case EOF:
883	if (EndStatementAtEOF) {
884	IsAtStartOfLine = true;
885	IsAtStartOfStatement = true;
886	}
887	return AsmToken (AsmToken::Eof, StringRef (TokStart, `0`));
888	case `0`:
889	case `' '`:
890	case `'\t'`:
891	IsAtStartOfStatement = OldIsAtStartOfStatement;
892	while (CurPtr == `' '` \|\| CurPtr == `'\t'`)
893	CurPtr++;
894	if (SkipSpace)
895	return LexToken(); // Ignore whitespace.
896	else
897	return AsmToken (AsmToken::Space, StringRef (TokStart, CurPtr - TokStart));
898	case `'\r'`: {
899	IsAtStartOfLine = true;
900	IsAtStartOfStatement = true;
901	// If this is a CR followed by LF, treat that as one token.
902	if (CurPtr != CurBuf.end() && *CurPtr == `'\n'`)
903	++CurPtr;
904	return AsmToken (AsmToken::EndOfStatement,
905	StringRef (TokStart, CurPtr - TokStart));
906	}
907	case `'\n'`:
908	IsAtStartOfLine = true;
909	IsAtStartOfStatement = true;
910	return AsmToken (AsmToken::EndOfStatement, StringRef (TokStart, `1`));
911	case `':'`: return AsmToken (AsmToken::Colon, StringRef (TokStart, `1`));
912	case `'+'`: return AsmToken (AsmToken::Plus, StringRef (TokStart, `1`));
913	case `'~'`: return AsmToken (AsmToken::Tilde, StringRef (TokStart, `1`));
914	case `'('`: return AsmToken (AsmToken::LParen, StringRef (TokStart, `1`));
915	case `')'`: return AsmToken (AsmToken::RParen, StringRef (TokStart, `1`));
916	case `'['`: return AsmToken (AsmToken::LBrac, StringRef (TokStart, `1`));
917	case `']'`: return AsmToken (AsmToken::RBrac, StringRef (TokStart, `1`));
918	case `'{'`: return AsmToken (AsmToken::LCurly, StringRef (TokStart, `1`));
919	case `'}'`: return AsmToken (AsmToken::RCurly, StringRef (TokStart, `1`));
920	case `''`: return* AsmToken (AsmToken::Star, StringRef (TokStart, `1`));
921	case `','`: return AsmToken (AsmToken::Comma, StringRef (TokStart, `1`));
922	case `'$'`: {
923	if (LexMotorolaIntegers && isHexDigit(C: *CurPtr))
924	return LexDigit();
925	if (MAI.doesAllowDollarAtStartOfIdentifier())
926	return LexIdentifier();
927	return AsmToken (AsmToken::Dollar, StringRef (TokStart, `1`));
928	}
929	case `'@'`:
930	if (MAI.doesAllowAtAtStartOfIdentifier())
931	return LexIdentifier();
932	return AsmToken (AsmToken::At, StringRef (TokStart, `1`));
933	case `'#'`:
934	if (MAI.isHLASM())
935	return LexIdentifier();
936	return AsmToken (AsmToken::Hash, StringRef (TokStart, `1`));
937	case `'?'`:
938	if (MAI.doesAllowQuestionAtStartOfIdentifier())
939	return LexIdentifier();
940	return AsmToken (AsmToken::Question, StringRef (TokStart, `1`));
941	case `'\\'`: return AsmToken (AsmToken::BackSlash, StringRef (TokStart, `1`));
942	case `'='`:
943	if (*CurPtr == `'='`) {
944	++CurPtr;
945	return AsmToken (AsmToken::EqualEqual, StringRef (TokStart, `2`));
946	}
947	return AsmToken (AsmToken::Equal, StringRef (TokStart, `1`));
948	case `'-'`:
949	if (*CurPtr == `'>'`) {
950	++CurPtr;
951	return AsmToken (AsmToken::MinusGreater, StringRef (TokStart, `2`));
952	}
953	return AsmToken (AsmToken::Minus, StringRef (TokStart, `1`));
954	case `'\|'`:
955	if (*CurPtr == `'\|'`) {
956	++CurPtr;
957	return AsmToken (AsmToken::PipePipe, StringRef (TokStart, `2`));
958	}
959	return AsmToken (AsmToken::Pipe, StringRef (TokStart, `1`));
960	case `'^'`: return AsmToken (AsmToken::Caret, StringRef (TokStart, `1`));
961	case `'&'`:
962	if (*CurPtr == `'&'`) {
963	++CurPtr;
964	return AsmToken (AsmToken::AmpAmp, StringRef (TokStart, `2`));
965	}
966	return AsmToken (AsmToken::Amp, StringRef (TokStart, `1`));
967	case `'!'`:
968	if (*CurPtr == `'='`) {
969	++CurPtr;
970	return AsmToken (AsmToken::ExclaimEqual, StringRef (TokStart, `2`));
971	}
972	return AsmToken (AsmToken::Exclaim, StringRef (TokStart, `1`));
973	case `'%'`:
974	if (LexMotorolaIntegers && (CurPtr == `'0'` \|\| CurPtr == `'1'`)) {
975	return LexDigit();
976	}
977	return AsmToken (AsmToken::Percent, StringRef (TokStart, `1`));
978	case `'/'`:
979	IsAtStartOfStatement = OldIsAtStartOfStatement;
980	return LexSlash();
981	case `'\''`: return LexSingleQuote();
982	case `'"'`: return LexQuote();
983	case `'0'`: case `'1'`: case `'2'`: case `'3'`: case `'4'`:
984	case `'5'`: case `'6'`: case `'7'`: case `'8'`: case `'9'`:
985	return LexDigit();
986	case `'<'`:
987	switch (*CurPtr) {
988	case `'<'`:
989	++CurPtr;
990	return AsmToken (AsmToken::LessLess, StringRef (TokStart, `2`));
991	case `'='`:
992	++CurPtr;
993	return AsmToken (AsmToken::LessEqual, StringRef (TokStart, `2`));
994	case `'>'`:
995	++CurPtr;
996	return AsmToken (AsmToken::LessGreater, StringRef (TokStart, `2`));
997	default:
998	return AsmToken (AsmToken::Less, StringRef (TokStart, `1`));
999	}
1000	case `'>'`:
1001	switch (*CurPtr) {
1002	case `'>'`:
1003	++CurPtr;
1004	return AsmToken (AsmToken::GreaterGreater, StringRef (TokStart, `2`));
1005	case `'='`:
1006	++CurPtr;
1007	return AsmToken (AsmToken::GreaterEqual, StringRef (TokStart, `2`));
1008	default:
1009	return AsmToken (AsmToken::Greater, StringRef (TokStart, `1`));
1010	}
1011
1012	// TODO: Quoted identifiers (objc methods etc)
1013	// local labels: [0-9][:]
1014	// Forward/backward labels: [0-9][fb]
1015	// Integers, fp constants, character constants.
1016	}
1017	}
1018

Browse the source code of llvm_projects/llvm/lib/MC/MCParser/AsmLexer.cpp