1//===- AsmLexer.cpp - Lexer for Assembly Files ----------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This class implements the lexer for assembly files.
10//
11//===----------------------------------------------------------------------===//
12
13#include "llvm/MC/MCParser/AsmLexer.h"
14#include "llvm/ADT/APInt.h"
15#include "llvm/ADT/ArrayRef.h"
16#include "llvm/ADT/StringExtras.h"
17#include "llvm/ADT/StringRef.h"
18#include "llvm/MC/MCAsmInfo.h"
19#include "llvm/Support/Compiler.h"
20#include "llvm/Support/SMLoc.h"
21#include "llvm/Support/SaveAndRestore.h"
22#include "llvm/Support/raw_ostream.h"
23#include <cassert>
24#include <cctype>
25#include <cstdio>
26#include <cstring>
27#include <string>
28
29using namespace llvm;
30
31SMLoc AsmToken::getLoc() const { return SMLoc::getFromPointer(Ptr: Str.data()); }
32
33SMLoc AsmToken::getEndLoc() const {
34 return SMLoc::getFromPointer(Ptr: Str.data() + Str.size());
35}
36
37SMRange AsmToken::getLocRange() const { return SMRange(getLoc(), getEndLoc()); }
38
39void AsmToken::dump(raw_ostream &OS) const {
40 switch (Kind) {
41 case AsmToken::Error:
42 OS << "error";
43 break;
44 case AsmToken::Identifier:
45 OS << "identifier: " << getString();
46 break;
47 case AsmToken::Integer:
48 OS << "int: " << getString();
49 break;
50 case AsmToken::Real:
51 OS << "real: " << getString();
52 break;
53 case AsmToken::String:
54 OS << "string: " << getString();
55 break;
56
57 // clang-format off
58 case AsmToken::Amp: OS << "Amp"; break;
59 case AsmToken::AmpAmp: OS << "AmpAmp"; break;
60 case AsmToken::At: OS << "At"; break;
61 case AsmToken::BackSlash: OS << "BackSlash"; break;
62 case AsmToken::BigNum: OS << "BigNum"; break;
63 case AsmToken::Caret: OS << "Caret"; break;
64 case AsmToken::Colon: OS << "Colon"; break;
65 case AsmToken::Comma: OS << "Comma"; break;
66 case AsmToken::Comment: OS << "Comment"; break;
67 case AsmToken::Dollar: OS << "Dollar"; break;
68 case AsmToken::Dot: OS << "Dot"; break;
69 case AsmToken::EndOfStatement: OS << "EndOfStatement"; break;
70 case AsmToken::Eof: OS << "Eof"; break;
71 case AsmToken::Equal: OS << "Equal"; break;
72 case AsmToken::EqualEqual: OS << "EqualEqual"; break;
73 case AsmToken::Exclaim: OS << "Exclaim"; break;
74 case AsmToken::ExclaimEqual: OS << "ExclaimEqual"; break;
75 case AsmToken::Greater: OS << "Greater"; break;
76 case AsmToken::GreaterEqual: OS << "GreaterEqual"; break;
77 case AsmToken::GreaterGreater: OS << "GreaterGreater"; break;
78 case AsmToken::Hash: OS << "Hash"; break;
79 case AsmToken::HashDirective: OS << "HashDirective"; break;
80 case AsmToken::LBrac: OS << "LBrac"; break;
81 case AsmToken::LCurly: OS << "LCurly"; break;
82 case AsmToken::LParen: OS << "LParen"; break;
83 case AsmToken::Less: OS << "Less"; break;
84 case AsmToken::LessEqual: OS << "LessEqual"; break;
85 case AsmToken::LessGreater: OS << "LessGreater"; break;
86 case AsmToken::LessLess: OS << "LessLess"; break;
87 case AsmToken::Minus: OS << "Minus"; break;
88 case AsmToken::MinusGreater: OS << "MinusGreater"; break;
89 case AsmToken::Percent: OS << "Percent"; break;
90 case AsmToken::Pipe: OS << "Pipe"; break;
91 case AsmToken::PipePipe: OS << "PipePipe"; break;
92 case AsmToken::Plus: OS << "Plus"; break;
93 case AsmToken::Question: OS << "Question"; break;
94 case AsmToken::RBrac: OS << "RBrac"; break;
95 case AsmToken::RCurly: OS << "RCurly"; break;
96 case AsmToken::RParen: OS << "RParen"; break;
97 case AsmToken::Slash: OS << "Slash"; break;
98 case AsmToken::Space: OS << "Space"; break;
99 case AsmToken::Star: OS << "Star"; break;
100 case AsmToken::Tilde: OS << "Tilde"; break;
101 // clang-format on
102 }
103
104 // Print the token string.
105 OS << " (\"";
106 OS.write_escaped(Str: getString());
107 OS << "\")";
108}
109
110AsmLexer::AsmLexer(const MCAsmInfo &MAI) : MAI(MAI) {
111 // For COFF targets, this is true, while for ELF targets, it should be false.
112 // Currently, @specifier parsing depends on '@' being included in the token.
113 AllowAtInIdentifier = !StringRef(MAI.getCommentString()).starts_with(Prefix: "@") &&
114 MAI.useAtForSpecifier();
115 LexMotorolaIntegers = MAI.shouldUseMotorolaIntegers();
116
117 CurTok.emplace_back(Args: AsmToken::Space, Args: StringRef());
118}
119
120void AsmLexer::setBuffer(StringRef Buf, const char *ptr,
121 bool EndStatementAtEOF) {
122 // Buffer must be NULL-terminated. NULL terminator must reside at `Buf.end()`.
123 // It must be safe to dereference `Buf.end()`.
124 assert(*Buf.end() == '\0' &&
125 "Buffer provided to AsmLexer lacks null terminator.");
126
127 CurBuf = Buf;
128
129 if (ptr)
130 CurPtr = ptr;
131 else
132 CurPtr = CurBuf.begin();
133
134 TokStart = nullptr;
135 this->EndStatementAtEOF = EndStatementAtEOF;
136}
137
138/// ReturnError - Set the error to the specified string at the specified
139/// location. This is defined to always return AsmToken::Error.
140AsmToken AsmLexer::ReturnError(const char *Loc, const std::string &Msg) {
141 SetError(errLoc: SMLoc::getFromPointer(Ptr: Loc), err: Msg);
142
143 return AsmToken(AsmToken::Error, StringRef(Loc, CurPtr - Loc));
144}
145
146int AsmLexer::getNextChar() {
147 if (CurPtr == CurBuf.end())
148 return EOF;
149 return (unsigned char)*CurPtr++;
150}
151
152int AsmLexer::peekNextChar() {
153 if (CurPtr == CurBuf.end())
154 return EOF;
155 return (unsigned char)*CurPtr;
156}
157
158/// The leading integral digit sequence and dot should have already been
159/// consumed, some or all of the fractional digit sequence *can* have been
160/// consumed.
161AsmToken AsmLexer::LexFloatLiteral() {
162 // Skip the fractional digit sequence.
163 while (isDigit(C: *CurPtr))
164 ++CurPtr;
165
166 if (*CurPtr == '-' || *CurPtr == '+')
167 return ReturnError(Loc: CurPtr, Msg: "invalid sign in float literal");
168
169 // Check for exponent
170 if ((*CurPtr == 'e' || *CurPtr == 'E')) {
171 ++CurPtr;
172
173 if (*CurPtr == '-' || *CurPtr == '+')
174 ++CurPtr;
175
176 while (isDigit(C: *CurPtr))
177 ++CurPtr;
178 }
179
180 return AsmToken(AsmToken::Real,
181 StringRef(TokStart, CurPtr - TokStart));
182}
183
184/// LexHexFloatLiteral matches essentially (.[0-9a-fA-F]*)?[pP][+-]?[0-9a-fA-F]+
185/// while making sure there are enough actual digits around for the constant to
186/// be valid.
187///
188/// The leading "0x[0-9a-fA-F]*" (i.e. integer part) has already been consumed
189/// before we get here.
190AsmToken AsmLexer::LexHexFloatLiteral(bool NoIntDigits) {
191 assert((*CurPtr == 'p' || *CurPtr == 'P' || *CurPtr == '.') &&
192 "unexpected parse state in floating hex");
193 bool NoFracDigits = true;
194
195 // Skip the fractional part if there is one
196 if (*CurPtr == '.') {
197 ++CurPtr;
198
199 const char *FracStart = CurPtr;
200 while (isHexDigit(C: *CurPtr))
201 ++CurPtr;
202
203 NoFracDigits = CurPtr == FracStart;
204 }
205
206 if (NoIntDigits && NoFracDigits)
207 return ReturnError(Loc: TokStart, Msg: "invalid hexadecimal floating-point constant: "
208 "expected at least one significand digit");
209
210 // Make sure we do have some kind of proper exponent part
211 if (*CurPtr != 'p' && *CurPtr != 'P')
212 return ReturnError(Loc: TokStart, Msg: "invalid hexadecimal floating-point constant: "
213 "expected exponent part 'p'");
214 ++CurPtr;
215
216 if (*CurPtr == '+' || *CurPtr == '-')
217 ++CurPtr;
218
219 // N.b. exponent digits are *not* hex
220 const char *ExpStart = CurPtr;
221 while (isDigit(C: *CurPtr))
222 ++CurPtr;
223
224 if (CurPtr == ExpStart)
225 return ReturnError(Loc: TokStart, Msg: "invalid hexadecimal floating-point constant: "
226 "expected at least one exponent digit");
227
228 return AsmToken(AsmToken::Real, StringRef(TokStart, CurPtr - TokStart));
229}
230
231/// LexIdentifier: [a-zA-Z_$.@?][a-zA-Z0-9_$.@#?]*
232static bool isIdentifierChar(char C, bool AllowAt, bool AllowHash) {
233 return isAlnum(C) || C == '_' || C == '$' || C == '.' || C == '?' ||
234 (AllowAt && C == '@') || (AllowHash && C == '#');
235}
236
237AsmToken AsmLexer::LexIdentifier() {
238 // Check for floating point literals.
239 if (CurPtr[-1] == '.' && isDigit(C: *CurPtr)) {
240 // Disambiguate a .1243foo identifier from a floating literal.
241 while (isDigit(C: *CurPtr))
242 ++CurPtr;
243
244 if (!isIdentifierChar(C: *CurPtr, AllowAt: AllowAtInIdentifier,
245 AllowHash: AllowHashInIdentifier) ||
246 *CurPtr == 'e' || *CurPtr == 'E')
247 return LexFloatLiteral();
248 }
249
250 while (isIdentifierChar(C: *CurPtr, AllowAt: AllowAtInIdentifier, AllowHash: AllowHashInIdentifier))
251 ++CurPtr;
252
253 // Handle . as a special case.
254 if (CurPtr == TokStart+1 && TokStart[0] == '.')
255 return AsmToken(AsmToken::Dot, StringRef(TokStart, 1));
256
257 return AsmToken(AsmToken::Identifier, StringRef(TokStart, CurPtr - TokStart));
258}
259
260/// LexSlash: Slash: /
261/// C-Style Comment: /* ... */
262/// C-style Comment: // ...
263AsmToken AsmLexer::LexSlash() {
264 if (!MAI.shouldAllowAdditionalComments()) {
265 IsAtStartOfStatement = false;
266 return AsmToken(AsmToken::Slash, StringRef(TokStart, 1));
267 }
268
269 switch (*CurPtr) {
270 case '*':
271 IsAtStartOfStatement = false;
272 break; // C style comment.
273 case '/':
274 ++CurPtr;
275 return LexLineComment();
276 default:
277 IsAtStartOfStatement = false;
278 return AsmToken(AsmToken::Slash, StringRef(TokStart, 1));
279 }
280
281 // C Style comment.
282 ++CurPtr; // skip the star.
283 const char *CommentTextStart = CurPtr;
284 while (CurPtr != CurBuf.end()) {
285 switch (*CurPtr++) {
286 case '*':
287 // End of the comment?
288 if (*CurPtr != '/')
289 break;
290 // If we have a CommentConsumer, notify it about the comment.
291 if (CommentConsumer) {
292 CommentConsumer->HandleComment(
293 Loc: SMLoc::getFromPointer(Ptr: CommentTextStart),
294 CommentText: StringRef(CommentTextStart, CurPtr - 1 - CommentTextStart));
295 }
296 ++CurPtr; // End the */.
297 return AsmToken(AsmToken::Comment,
298 StringRef(TokStart, CurPtr - TokStart));
299 }
300 }
301 return ReturnError(Loc: TokStart, Msg: "unterminated comment");
302}
303
304/// LexLineComment: Comment: #[^\n]*
305/// : //[^\n]*
306AsmToken AsmLexer::LexLineComment() {
307 // Mark This as an end of statement with a body of the
308 // comment. While it would be nicer to leave this two tokens,
309 // backwards compatability with TargetParsers makes keeping this in this form
310 // better.
311 const char *CommentTextStart = CurPtr;
312 int CurChar = getNextChar();
313 while (CurChar != '\n' && CurChar != '\r' && CurChar != EOF)
314 CurChar = getNextChar();
315 const char *NewlinePtr = CurPtr;
316 if (CurChar == '\r' && CurPtr != CurBuf.end() && *CurPtr == '\n')
317 ++CurPtr;
318
319 // If we have a CommentConsumer, notify it about the comment.
320 if (CommentConsumer) {
321 CommentConsumer->HandleComment(
322 Loc: SMLoc::getFromPointer(Ptr: CommentTextStart),
323 CommentText: StringRef(CommentTextStart, NewlinePtr - 1 - CommentTextStart));
324 }
325
326 IsAtStartOfLine = true;
327 // This is a whole line comment. leave newline
328 if (IsAtStartOfStatement)
329 return AsmToken(AsmToken::EndOfStatement,
330 StringRef(TokStart, CurPtr - TokStart));
331 IsAtStartOfStatement = true;
332
333 return AsmToken(AsmToken::EndOfStatement,
334 StringRef(TokStart, CurPtr - 1 - TokStart));
335}
336
/// Advances \p CurPtr past an optional, case-insensitive integer type suffix:
/// at most one 'U' followed by at most two 'L's (ULL, UL, U, L and LL).
static void SkipIgnoredIntegerSuffix(const char *&CurPtr) {
  // Steps over the current character if it matches either spelling.
  auto SkipIf = [&CurPtr](char Upper, char Lower) {
    if (*CurPtr == Upper || *CurPtr == Lower)
      ++CurPtr;
  };

  SkipIf('U', 'u');
  SkipIf('L', 'l');
  SkipIf('L', 'l');
}
346
347// Look ahead to search for first non-hex digit, if it's [hH], then we treat the
348// integer as a hexadecimal, possibly with leading zeroes.
349static unsigned doHexLookAhead(const char *&CurPtr, unsigned DefaultRadix,
350 bool LexHex) {
351 const char *FirstNonDec = nullptr;
352 const char *LookAhead = CurPtr;
353 while (true) {
354 if (isDigit(C: *LookAhead)) {
355 ++LookAhead;
356 } else {
357 if (!FirstNonDec)
358 FirstNonDec = LookAhead;
359
360 // Keep going if we are looking for a 'h' suffix.
361 if (LexHex && isHexDigit(C: *LookAhead))
362 ++LookAhead;
363 else
364 break;
365 }
366 }
367 bool isHex = LexHex && (*LookAhead == 'h' || *LookAhead == 'H');
368 CurPtr = isHex || !FirstNonDec ? LookAhead : FirstNonDec;
369 if (isHex)
370 return 16;
371 return DefaultRadix;
372}
373
374static const char *findLastDigit(const char *CurPtr, unsigned DefaultRadix) {
375 while (hexDigitValue(C: *CurPtr) < DefaultRadix) {
376 ++CurPtr;
377 }
378 return CurPtr;
379}
380
381static AsmToken intToken(StringRef Ref, APInt &Value) {
382 if (Value.isIntN(N: 64))
383 return AsmToken(AsmToken::Integer, Ref, Value);
384 return AsmToken(AsmToken::BigNum, Ref, Value);
385}
386
/// Returns the English name of \p Radix for use in diagnostics: "binary",
/// "octal", "decimal" or "hexadecimal" for 2, 8, 10 and 16, and "base-N" for
/// any other value.
static std::string radixName(unsigned Radix) {
  const char *Known = nullptr;
  if (Radix == 2)
    Known = "binary";
  else if (Radix == 8)
    Known = "octal";
  else if (Radix == 10)
    Known = "decimal";
  else if (Radix == 16)
    Known = "hexadecimal";

  if (Known)
    return Known;
  return "base-" + std::to_string(Radix);
}
401
402/// LexDigit: First character is [0-9].
403/// Local Label: [0-9][:]
404/// Forward/Backward Label: [0-9][fb]
405/// Binary integer: 0b[01]+
406/// Octal integer: 0[0-7]+
407/// Hex integer: 0x[0-9a-fA-F]+ or [0x]?[0-9][0-9a-fA-F]*[hH]
408/// Decimal integer: [1-9][0-9]*
///
/// Lexes a numeric literal. The first character of the token has already been
/// consumed, so it is available as CurPtr[-1] and TokStart points at it.
/// LexToken() also calls this for a leading '$' or '%' when
/// LexMotorolaIntegers is set.
///
/// The branches below are tried in order: MASM suffixed integers, MASM
/// default-radix integers, Motorola '$'/'%' integers, decimal (or HLASM)
/// integers and decimal floats, "0b" binary, "0x" hex integers and hex floats,
/// and finally octal (or 'h'-suffixed hex).
///
/// Returns an Integer or BigNum token (see intToken), a Real token, or an
/// Error token when the digits cannot be parsed in the selected radix.
409AsmToken AsmLexer::LexDigit() {
410 // MASM-flavor binary integer: [01]+[yY] (if DefaultRadix < 16, [bByY])
411 // MASM-flavor octal integer: [0-7]+[oOqQ]
412 // MASM-flavor decimal integer: [0-9]+[tT] (if DefaultRadix < 16, [dDtT])
413 // MASM-flavor hexadecimal integer: [0-9][0-9a-fA-F]*[hH]
414 if (LexMasmIntegers && isdigit(CurPtr[-1])) {
 // Track the earliest character that is not a valid binary digit and the
 // earliest that is not a valid decimal digit, starting with the
 // already-consumed first character.
415 const char *FirstNonBinary =
416 (CurPtr[-1] != '0' && CurPtr[-1] != '1') ? CurPtr - 1 : nullptr;
417 const char *FirstNonDecimal =
418 (CurPtr[-1] < '0' || CurPtr[-1] > '9') ? CurPtr - 1 : nullptr;
 // Remembered so the scan can be undone if no radix suffix is found.
419 const char *OldCurPtr = CurPtr;
 // Scan every hex digit; the radix is decided afterwards from the suffix.
420 while (isHexDigit(C: *CurPtr)) {
421 switch (*CurPtr) {
422 default:
 // A hex letter: neither decimal nor (via fallthrough) binary.
423 if (!FirstNonDecimal) {
424 FirstNonDecimal = CurPtr;
425 }
426 [[fallthrough]];
427 case '9':
428 case '8':
429 case '7':
430 case '6':
431 case '5':
432 case '4':
433 case '3':
434 case '2':
435 if (!FirstNonBinary) {
436 FirstNonBinary = CurPtr;
437 }
438 break;
439 case '1':
440 case '0':
441 break;
442 }
443 ++CurPtr;
444 }
445 if (*CurPtr == '.') {
446 // MASM float literals (other than hex floats) always contain a ".", and
447 // are always written in decimal.
448 ++CurPtr;
449 return LexFloatLiteral();
450 }
451
 // With LexMasmHexFloats set, a trailing 'r'/'R' makes the whole token a
 // Real.
452 if (LexMasmHexFloats && (*CurPtr == 'r' || *CurPtr == 'R')) {
453 ++CurPtr;
454 return AsmToken(AsmToken::Real, StringRef(TokStart, CurPtr - TokStart));
455 }
456
 // Radix stays 0 when no radix suffix is recognized.
457 unsigned Radix = 0;
458 if (*CurPtr == 'h' || *CurPtr == 'H') {
459 // hexadecimal number
460 ++CurPtr;
461 Radix = 16;
462 } else if (*CurPtr == 't' || *CurPtr == 'T') {
463 // decimal number
464 ++CurPtr;
465 Radix = 10;
466 } else if (*CurPtr == 'o' || *CurPtr == 'O' || *CurPtr == 'q' ||
467 *CurPtr == 'Q') {
468 // octal number
469 ++CurPtr;
470 Radix = 8;
471 } else if (*CurPtr == 'y' || *CurPtr == 'Y') {
472 // binary number
473 ++CurPtr;
474 Radix = 2;
 // 'd'/'D' and 'b'/'B' are themselves hex digits, so the scan above has
 // already consumed them. They count as a radix suffix only when they are
 // the last character scanned, the first character invalid for that radix,
 // and DefaultRadix is below the bound checked here.
475 } else if (FirstNonDecimal && FirstNonDecimal + 1 == CurPtr &&
476 DefaultRadix < 14 &&
477 (*FirstNonDecimal == 'd' || *FirstNonDecimal == 'D')) {
478 Radix = 10;
479 } else if (FirstNonBinary && FirstNonBinary + 1 == CurPtr &&
480 DefaultRadix < 12 &&
481 (*FirstNonBinary == 'b' || *FirstNonBinary == 'B')) {
482 Radix = 2;
483 }
484
485 if (Radix) {
486 StringRef Result(TokStart, CurPtr - TokStart);
487 APInt Value(128, 0, true);
488
 // drop_back() strips the one-character radix suffix before parsing.
489 if (Result.drop_back().getAsInteger(Radix, Result&: Value))
490 return ReturnError(Loc: TokStart, Msg: "invalid " + radixName(Radix) + " number");
491
492 // MSVC accepts and ignores type suffices on integer literals.
493 SkipIgnoredIntegerSuffix(CurPtr);
494
495 return intToken(Ref: Result, Value);
496 }
497
498 // default-radix integers, or floating point numbers, fall through
 // Rewind to just after the first character so later branches rescan.
499 CurPtr = OldCurPtr;
500 }
501
502 // MASM default-radix integers: [0-9a-fA-F]+
503 // (All other integer literals have a radix specifier.)
504 if (LexMasmIntegers && UseMasmDefaultRadix) {
 // Consume every hex digit, then parse the run in DefaultRadix; digits
 // that are invalid in DefaultRadix surface as a parse error below.
505 CurPtr = findLastDigit(CurPtr, DefaultRadix: 16);
506 StringRef Result(TokStart, CurPtr - TokStart);
507
508 APInt Value(128, 0, true);
509 if (Result.getAsInteger(Radix: DefaultRadix, Result&: Value)) {
510 return ReturnError(Loc: TokStart,
511 Msg: "invalid " + radixName(Radix: DefaultRadix) + " number");
512 }
513
514 return intToken(Ref: Result, Value);
515 }
516
517 // Motorola hex integers: $[0-9a-fA-F]+
518 if (LexMotorolaIntegers && CurPtr[-1] == '$') {
 // The value is parsed from the digits only; the token text includes the
 // leading '$'.
519 const char *NumStart = CurPtr;
520 while (isHexDigit(C: CurPtr[0]))
521 ++CurPtr;
522
523 APInt Result(128, 0);
524 if (StringRef(NumStart, CurPtr - NumStart).getAsInteger(Radix: 16, Result))
525 return ReturnError(Loc: TokStart, Msg: "invalid hexadecimal number");
526
527 return intToken(Ref: StringRef(TokStart, CurPtr - TokStart), Value&: Result);
528 }
529
530 // Motorola binary integers: %[01]+
531 if (LexMotorolaIntegers && CurPtr[-1] == '%') {
 // As above: digits are parsed alone, the token text includes the '%'.
532 const char *NumStart = CurPtr;
533 while (*CurPtr == '0' || *CurPtr == '1')
534 ++CurPtr;
535
536 APInt Result(128, 0);
537 if (StringRef(NumStart, CurPtr - NumStart).getAsInteger(Radix: 2, Result))
538 return ReturnError(Loc: TokStart, Msg: "invalid binary number");
539
540 return intToken(Ref: StringRef(TokStart, CurPtr - TokStart), Value&: Result);
541 }
542
543 // Decimal integer: [1-9][0-9]*
544 // HLASM-flavour decimal integer: [0-9][0-9]*
545 // FIXME: Later on, support for fb for HLASM has to be added in
546 // as they probably would be needed for asm goto
 // Also taken for a leading '0' that is immediately followed by '.'.
547 if (LexHLASMIntegers || CurPtr[-1] != '0' || CurPtr[0] == '.') {
548 unsigned Radix = doHexLookAhead(CurPtr, DefaultRadix: 10, LexHex: LexMasmIntegers);
549
550 if (!LexHLASMIntegers) {
551 bool IsHex = Radix == 16;
552 // Check for floating point literals.
553 if (!IsHex && (*CurPtr == '.' || *CurPtr == 'e' || *CurPtr == 'E')) {
 // Only the dot is consumed here; LexFloatLiteral handles the exponent.
554 if (*CurPtr == '.')
555 ++CurPtr;
556 return LexFloatLiteral();
557 }
558 }
559
560 StringRef Result(TokStart, CurPtr - TokStart);
561
562 APInt Value(128, 0, true);
563 if (Result.getAsInteger(Radix, Result&: Value))
564 return ReturnError(Loc: TokStart, Msg: "invalid " + radixName(Radix) + " number");
565
566 if (!LexHLASMIntegers)
567 // The darwin/x86 (and x86-64) assembler accepts and ignores type
568 // suffices on integer literals.
569 SkipIgnoredIntegerSuffix(CurPtr);
570
571 return intToken(Ref: Result, Value);
572 }
573
 // From here on the first character is '0'.
574 if (!LexMasmIntegers && ((*CurPtr == 'b') || (*CurPtr == 'B'))) {
575 ++CurPtr;
576 // See if we actually have "0b" as part of something like "jmp 0b\n"
577 if (!isDigit(C: CurPtr[0])) {
 // Not a binary literal: un-consume the 'b' and return the integer 0.
578 --CurPtr;
579 StringRef Result(TokStart, CurPtr - TokStart);
580 return AsmToken(AsmToken::Integer, Result, 0);
581 }
582 const char *NumStart = CurPtr;
583 while (CurPtr[0] == '0' || CurPtr[0] == '1')
584 ++CurPtr;
585
586 // Requires at least one binary digit.
587 if (CurPtr == NumStart)
588 return ReturnError(Loc: TokStart, Msg: "invalid binary number");
589
590 StringRef Result(TokStart, CurPtr - TokStart);
591
592 APInt Value(128, 0, true);
 // substr(2) skips the "0b" prefix.
593 if (Result.substr(Start: 2).getAsInteger(Radix: 2, Result&: Value))
594 return ReturnError(Loc: TokStart, Msg: "invalid binary number");
595
596 // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
597 // suffixes on integer literals.
598 SkipIgnoredIntegerSuffix(CurPtr);
599
600 return intToken(Ref: Result, Value);
601 }
602
603 if ((*CurPtr == 'x') || (*CurPtr == 'X')) {
604 ++CurPtr;
605 const char *NumStart = CurPtr;
606 while (isHexDigit(C: CurPtr[0]))
607 ++CurPtr;
608
609 // "0x.0p0" is valid, and "0x0p0" (but not "0xp0" for example, which will be
610 // diagnosed by LexHexFloatLiteral).
611 if (CurPtr[0] == '.' || CurPtr[0] == 'p' || CurPtr[0] == 'P')
612 return LexHexFloatLiteral(NoIntDigits: NumStart == CurPtr);
613
614 // Otherwise requires at least one hex digit.
615 if (CurPtr == NumStart)
616 return ReturnError(Loc: CurPtr-2, Msg: "invalid hexadecimal number");
617
618 APInt Result(128, 0);
 // The whole token, "0x" prefix included, is parsed with radix 0.
 // NOTE(review): presumably radix 0 makes getAsInteger infer base 16 from
 // the prefix -- confirm against the StringRef documentation.
619 if (StringRef(TokStart, CurPtr - TokStart).getAsInteger(Radix: 0, Result))
620 return ReturnError(Loc: TokStart, Msg: "invalid hexadecimal number");
621
622 // Consume the optional [hH].
623 if (LexMasmIntegers && (*CurPtr == 'h' || *CurPtr == 'H'))
624 ++CurPtr;
625
626 // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
627 // suffixes on integer literals.
628 SkipIgnoredIntegerSuffix(CurPtr);
629
630 return intToken(Ref: StringRef(TokStart, CurPtr - TokStart), Value&: Result);
631 }
632
633 // Either octal or hexadecimal.
634 APInt Value(128, 0, true);
635 unsigned Radix = doHexLookAhead(CurPtr, DefaultRadix: 8, LexHex: LexMasmIntegers);
636 StringRef Result(TokStart, CurPtr - TokStart);
637 if (Result.getAsInteger(Radix, Result&: Value))
638 return ReturnError(Loc: TokStart, Msg: "invalid " + radixName(Radix) + " number");
639
640 // Consume the [hH].
 // doHexLookAhead leaves CurPtr on the suffix when it returns 16, so the
 // suffix is not part of Result.
641 if (Radix == 16)
642 ++CurPtr;
643
644 // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
645 // suffixes on integer literals.
646 SkipIgnoredIntegerSuffix(CurPtr);
647
648 return intToken(Ref: Result, Value);
649}
650
651/// LexSingleQuote: Integer: 'b'
652AsmToken AsmLexer::LexSingleQuote() {
653 int CurChar = getNextChar();
654
655 if (LexHLASMStrings)
656 return ReturnError(Loc: TokStart, Msg: "invalid usage of character literals");
657
658 if (LexMasmStrings) {
659 while (CurChar != EOF) {
660 if (CurChar != '\'') {
661 CurChar = getNextChar();
662 } else if (peekNextChar() == '\'') {
663 // In MASM single-quote strings, doubled single-quotes mean an escaped
664 // single quote, so should be lexed in.
665 (void)getNextChar();
666 CurChar = getNextChar();
667 } else {
668 break;
669 }
670 }
671 if (CurChar == EOF)
672 return ReturnError(Loc: TokStart, Msg: "unterminated string constant");
673 return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart));
674 }
675
676 if (CurChar == '\\')
677 CurChar = getNextChar();
678
679 if (CurChar == EOF)
680 return ReturnError(Loc: TokStart, Msg: "unterminated single quote");
681
682 CurChar = getNextChar();
683
684 if (CurChar != '\'')
685 return ReturnError(Loc: TokStart, Msg: "single quote way too long");
686
687 // The idea here being that 'c' is basically just an integral
688 // constant.
689 StringRef Res = StringRef(TokStart,CurPtr - TokStart);
690 long long Value;
691
692 if (Res.starts_with(Prefix: "\'\\")) {
693 char theChar = Res[2];
694 switch (theChar) {
695 default: Value = theChar; break;
696 case '\'': Value = '\''; break;
697 case 't': Value = '\t'; break;
698 case 'n': Value = '\n'; break;
699 case 'b': Value = '\b'; break;
700 case 'f': Value = '\f'; break;
701 case 'r': Value = '\r'; break;
702 }
703 } else
704 Value = TokStart[1];
705
706 return AsmToken(AsmToken::Integer, Res, Value);
707}
708
709/// LexQuote: String: "..."
710AsmToken AsmLexer::LexQuote() {
711 int CurChar = getNextChar();
712 if (LexHLASMStrings)
713 return ReturnError(Loc: TokStart, Msg: "invalid usage of string literals");
714
715 if (LexMasmStrings) {
716 while (CurChar != EOF) {
717 if (CurChar != '"') {
718 CurChar = getNextChar();
719 } else if (peekNextChar() == '"') {
720 // In MASM double-quoted strings, doubled double-quotes mean an escaped
721 // double quote, so should be lexed in.
722 (void)getNextChar();
723 CurChar = getNextChar();
724 } else {
725 break;
726 }
727 }
728 if (CurChar == EOF)
729 return ReturnError(Loc: TokStart, Msg: "unterminated string constant");
730 return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart));
731 }
732
733 while (CurChar != '"') {
734 if (CurChar == '\\') {
735 // Allow \", etc.
736 CurChar = getNextChar();
737 }
738
739 if (CurChar == EOF)
740 return ReturnError(Loc: TokStart, Msg: "unterminated string constant");
741
742 CurChar = getNextChar();
743 }
744
745 return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart));
746}
747
748StringRef AsmLexer::LexUntilEndOfStatement() {
749 TokStart = CurPtr;
750
751 while (!isAtStartOfComment(Ptr: CurPtr) && // Start of line comment.
752 !isAtStatementSeparator(Ptr: CurPtr) && // End of statement marker.
753 *CurPtr != '\n' && *CurPtr != '\r' && CurPtr != CurBuf.end()) {
754 ++CurPtr;
755 }
756 return StringRef(TokStart, CurPtr-TokStart);
757}
758
759StringRef AsmLexer::LexUntilEndOfLine() {
760 TokStart = CurPtr;
761
762 while (*CurPtr != '\n' && *CurPtr != '\r' && CurPtr != CurBuf.end()) {
763 ++CurPtr;
764 }
765 return StringRef(TokStart, CurPtr-TokStart);
766}
767
768size_t AsmLexer::peekTokens(MutableArrayRef<AsmToken> Buf,
769 bool ShouldSkipSpace) {
770 SaveAndRestore SavedTokenStart(TokStart);
771 SaveAndRestore SavedCurPtr(CurPtr);
772 SaveAndRestore SavedAtStartOfLine(IsAtStartOfLine);
773 SaveAndRestore SavedAtStartOfStatement(IsAtStartOfStatement);
774 SaveAndRestore SavedSkipSpace(SkipSpace, ShouldSkipSpace);
775 SaveAndRestore SavedIsPeeking(IsPeeking, true);
776 std::string SavedErr = getErr();
777 SMLoc SavedErrLoc = getErrLoc();
778
779 size_t ReadCount;
780 for (ReadCount = 0; ReadCount < Buf.size(); ++ReadCount) {
781 AsmToken Token = LexToken();
782
783 Buf[ReadCount] = Token;
784
785 if (Token.is(K: AsmToken::Eof)) {
786 ReadCount++;
787 break;
788 }
789 }
790
791 SetError(errLoc: SavedErrLoc, err: SavedErr);
792 return ReadCount;
793}
794
795bool AsmLexer::isAtStartOfComment(const char *Ptr) {
796 if (MAI.isHLASM() && !IsAtStartOfStatement)
797 return false;
798
799 StringRef CommentString = MAI.getCommentString();
800
801 if (CommentString.size() == 1)
802 return CommentString[0] == Ptr[0];
803
804 // Allow # preprocessor comments also be counted as comments for "##" cases
805 if (CommentString[1] == '#')
806 return CommentString[0] == Ptr[0];
807
808 return strncmp(s1: Ptr, s2: CommentString.data(), n: CommentString.size()) == 0;
809}
810
811bool AsmLexer::isAtStatementSeparator(const char *Ptr) {
812 return strncmp(s1: Ptr, s2: MAI.getSeparatorString(),
813 n: strlen(s: MAI.getSeparatorString())) == 0;
814}
815
/// Lexes and returns the next token starting at CurPtr.
///
/// Sets TokStart to the start of the token and updates IsAtStartOfLine and
/// IsAtStartOfStatement to describe the position after the token. Checks are
/// made in this order: a '#' at the start of a statement (cpp hash directive
/// or line comment), the target comment string, the target statement
/// separator, a synthesized EndOfStatement at end of buffer, and finally a
/// dispatch on the first character.
816AsmToken AsmLexer::LexToken() {
817 TokStart = CurPtr;
818 // This always consumes at least one character.
 // NOTE: at the end of the buffer getNextChar() returns EOF without
 // advancing, so nothing is consumed in that case.
819 int CurChar = getNextChar();
820
 // Skipped while peeking so that the peekTokens() call below does not
 // re-enter this branch.
821 if (!IsPeeking && CurChar == '#' && IsAtStartOfStatement) {
822 // If this starts with a '#', this may be a cpp
823 // hash directive and otherwise a line comment.
824 AsmToken TokenBuf[2];
825 MutableArrayRef<AsmToken> Buf(TokenBuf, 2);
826 size_t num = peekTokens(Buf, ShouldSkipSpace: true);
827 // There cannot be a space preceding this
 // A hash directive is '#' at the very start of a line followed by an
 // Integer token and a String token.
828 if (IsAtStartOfLine && num == 2 && TokenBuf[0].is(K: AsmToken::Integer) &&
829 TokenBuf[1].is(K: AsmToken::String)) {
830 CurPtr = TokStart; // reset curPtr;
831 StringRef s = LexUntilEndOfLine();
 // NOTE(review): presumably UnLex pushes tokens back so that they are
 // returned by later lexing, last pushed first -- confirm against its
 // declaration.
832 UnLex(Token: TokenBuf[1]);
833 UnLex(Token: TokenBuf[0]);
834 return AsmToken(AsmToken::HashDirective, s);
835 }
836
837 if (MAI.shouldAllowAdditionalComments())
838 return LexLineComment();
839 }
840
841 if (isAtStartOfComment(Ptr: TokStart)) {
842 StringRef CommentString = MAI.getCommentString();
843 // For multi-char comment strings, advance CurPtr only if we matched the
844 // full string. This stops us from accidentally eating the newline if the
845 // current line ends in a single comment char.
846 if (CommentString.size() > 1 &&
847 StringRef(TokStart, CommentString.size()) == CommentString) {
 // The first character was already consumed by getNextChar() above.
848 CurPtr += CommentString.size() - 1;
849 }
850 return LexLineComment();
851 }
852
853 if (isAtStatementSeparator(Ptr: TokStart)) {
 // Skip the rest of the separator; its first character is already consumed.
854 CurPtr += strlen(s: MAI.getSeparatorString()) - 1;
855 IsAtStartOfLine = true;
856 IsAtStartOfStatement = true;
857 return AsmToken(AsmToken::EndOfStatement,
858 StringRef(TokStart, strlen(s: MAI.getSeparatorString())));
859 }
860
861 // If we're missing a newline at EOF, make sure we still get an
862 // EndOfStatement token before the Eof token.
863 if (CurChar == EOF && !IsAtStartOfStatement && EndStatementAtEOF) {
864 IsAtStartOfLine = true;
865 IsAtStartOfStatement = true;
866 return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 0));
867 }
 // By default a token ends the start-of-line/start-of-statement state. The
 // old statement flag is kept for the cases below that restore it
 // (whitespace and '/').
868 IsAtStartOfLine = false;
869 bool OldIsAtStartOfStatement = IsAtStartOfStatement;
870 IsAtStartOfStatement = false;
871 switch (CurChar) {
872 default:
873 // Handle identifier: [a-zA-Z_.$@#?][a-zA-Z0-9_.$@#?]*
874 // Whether or not the lexer accepts '$', '@', '#' and '?' at the start of
875 // an identifier is target-dependent. These characters are handled in the
876 // respective switch cases.
877 if (isalpha(CurChar) || CurChar == '_' || CurChar == '.')
878 return LexIdentifier();
879
880 // Unknown character, emit an error.
881 return ReturnError(Loc: TokStart, Msg: "invalid character in input");
882 case EOF:
883 if (EndStatementAtEOF) {
884 IsAtStartOfLine = true;
885 IsAtStartOfStatement = true;
886 }
887 return AsmToken(AsmToken::Eof, StringRef(TokStart, 0));
 // A NUL character is handled like a space or tab.
888 case 0:
889 case ' ':
890 case '\t':
 // Whitespace does not change whether we are at the start of a statement.
891 IsAtStartOfStatement = OldIsAtStartOfStatement;
892 while (*CurPtr == ' ' || *CurPtr == '\t')
893 CurPtr++;
894 if (SkipSpace)
895 return LexToken(); // Ignore whitespace.
896 else
897 return AsmToken(AsmToken::Space, StringRef(TokStart, CurPtr - TokStart));
898 case '\r': {
899 IsAtStartOfLine = true;
900 IsAtStartOfStatement = true;
901 // If this is a CR followed by LF, treat that as one token.
902 if (CurPtr != CurBuf.end() && *CurPtr == '\n')
903 ++CurPtr;
904 return AsmToken(AsmToken::EndOfStatement,
905 StringRef(TokStart, CurPtr - TokStart));
906 }
907 case '\n':
908 IsAtStartOfLine = true;
909 IsAtStartOfStatement = true;
910 return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 1));
911 case ':': return AsmToken(AsmToken::Colon, StringRef(TokStart, 1));
912 case '+': return AsmToken(AsmToken::Plus, StringRef(TokStart, 1));
913 case '~': return AsmToken(AsmToken::Tilde, StringRef(TokStart, 1));
914 case '(': return AsmToken(AsmToken::LParen, StringRef(TokStart, 1));
915 case ')': return AsmToken(AsmToken::RParen, StringRef(TokStart, 1));
916 case '[': return AsmToken(AsmToken::LBrac, StringRef(TokStart, 1));
917 case ']': return AsmToken(AsmToken::RBrac, StringRef(TokStart, 1));
918 case '{': return AsmToken(AsmToken::LCurly, StringRef(TokStart, 1));
919 case '}': return AsmToken(AsmToken::RCurly, StringRef(TokStart, 1));
920 case '*': return AsmToken(AsmToken::Star, StringRef(TokStart, 1));
921 case ',': return AsmToken(AsmToken::Comma, StringRef(TokStart, 1));
922 case '$': {
 // '$' starts a Motorola hex integer, an identifier, or is a Dollar token,
 // checked in that order.
923 if (LexMotorolaIntegers && isHexDigit(C: *CurPtr))
924 return LexDigit();
925 if (MAI.doesAllowDollarAtStartOfIdentifier())
926 return LexIdentifier();
927 return AsmToken(AsmToken::Dollar, StringRef(TokStart, 1));
928 }
929 case '@':
930 if (MAI.doesAllowAtAtStartOfIdentifier())
931 return LexIdentifier();
932 return AsmToken(AsmToken::At, StringRef(TokStart, 1));
933 case '#':
934 if (MAI.isHLASM())
935 return LexIdentifier();
936 return AsmToken(AsmToken::Hash, StringRef(TokStart, 1));
937 case '?':
938 if (MAI.doesAllowQuestionAtStartOfIdentifier())
939 return LexIdentifier();
940 return AsmToken(AsmToken::Question, StringRef(TokStart, 1));
941 case '\\': return AsmToken(AsmToken::BackSlash, StringRef(TokStart, 1));
942 case '=':
943 if (*CurPtr == '=') {
944 ++CurPtr;
945 return AsmToken(AsmToken::EqualEqual, StringRef(TokStart, 2));
946 }
947 return AsmToken(AsmToken::Equal, StringRef(TokStart, 1));
948 case '-':
949 if (*CurPtr == '>') {
950 ++CurPtr;
951 return AsmToken(AsmToken::MinusGreater, StringRef(TokStart, 2));
952 }
953 return AsmToken(AsmToken::Minus, StringRef(TokStart, 1));
954 case '|':
955 if (*CurPtr == '|') {
956 ++CurPtr;
957 return AsmToken(AsmToken::PipePipe, StringRef(TokStart, 2));
958 }
959 return AsmToken(AsmToken::Pipe, StringRef(TokStart, 1));
960 case '^': return AsmToken(AsmToken::Caret, StringRef(TokStart, 1));
961 case '&':
962 if (*CurPtr == '&') {
963 ++CurPtr;
964 return AsmToken(AsmToken::AmpAmp, StringRef(TokStart, 2));
965 }
966 return AsmToken(AsmToken::Amp, StringRef(TokStart, 1));
967 case '!':
968 if (*CurPtr == '=') {
969 ++CurPtr;
970 return AsmToken(AsmToken::ExclaimEqual, StringRef(TokStart, 2));
971 }
972 return AsmToken(AsmToken::Exclaim, StringRef(TokStart, 1));
973 case '%':
 // '%' followed by a binary digit is a Motorola binary integer.
974 if (LexMotorolaIntegers && (*CurPtr == '0' || *CurPtr == '1')) {
975 return LexDigit();
976 }
977 return AsmToken(AsmToken::Percent, StringRef(TokStart, 1));
978 case '/':
 // LexSlash() decides how the flag changes; give it the original value.
979 IsAtStartOfStatement = OldIsAtStartOfStatement;
980 return LexSlash();
981 case '\'': return LexSingleQuote();
982 case '"': return LexQuote();
983 case '0': case '1': case '2': case '3': case '4':
984 case '5': case '6': case '7': case '8': case '9':
985 return LexDigit();
986 case '<':
987 switch (*CurPtr) {
988 case '<':
989 ++CurPtr;
990 return AsmToken(AsmToken::LessLess, StringRef(TokStart, 2));
991 case '=':
992 ++CurPtr;
993 return AsmToken(AsmToken::LessEqual, StringRef(TokStart, 2));
994 case '>':
995 ++CurPtr;
996 return AsmToken(AsmToken::LessGreater, StringRef(TokStart, 2));
997 default:
998 return AsmToken(AsmToken::Less, StringRef(TokStart, 1));
999 }
1000 case '>':
1001 switch (*CurPtr) {
1002 case '>':
1003 ++CurPtr;
1004 return AsmToken(AsmToken::GreaterGreater, StringRef(TokStart, 2));
1005 case '=':
1006 ++CurPtr;
1007 return AsmToken(AsmToken::GreaterEqual, StringRef(TokStart, 2));
1008 default:
1009 return AsmToken(AsmToken::Greater, StringRef(TokStart, 1));
1010 }
1011
1012 // TODO: Quoted identifiers (objc methods etc)
1013 // local labels: [0-9][:]
1014 // Forward/backward labels: [0-9][fb]
1015 // Integers, fp constants, character constants.
1016 }
1017}
1018