1//===- AsmLexer.cpp - Lexer for Assembly Files ----------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This class implements the lexer for assembly files.
10//
11//===----------------------------------------------------------------------===//
12
13#include "llvm/MC/MCParser/AsmLexer.h"
14#include "llvm/ADT/APInt.h"
15#include "llvm/ADT/ArrayRef.h"
16#include "llvm/ADT/StringExtras.h"
17#include "llvm/ADT/StringRef.h"
18#include "llvm/MC/MCAsmInfo.h"
19#include "llvm/MC/MCParser/AsmLexer.h"
20#include "llvm/Support/Compiler.h"
21#include "llvm/Support/SMLoc.h"
22#include "llvm/Support/SaveAndRestore.h"
23#include "llvm/Support/raw_ostream.h"
24#include <cassert>
25#include <cctype>
26#include <cstdio>
27#include <cstring>
28#include <string>
29
30using namespace llvm;
31
32SMLoc AsmToken::getLoc() const { return SMLoc::getFromPointer(Ptr: Str.data()); }
33
34SMLoc AsmToken::getEndLoc() const {
35 return SMLoc::getFromPointer(Ptr: Str.data() + Str.size());
36}
37
38SMRange AsmToken::getLocRange() const { return SMRange(getLoc(), getEndLoc()); }
39
40void AsmToken::dump(raw_ostream &OS) const {
41 switch (Kind) {
42 case AsmToken::Error:
43 OS << "error";
44 break;
45 case AsmToken::Identifier:
46 OS << "identifier: " << getString();
47 break;
48 case AsmToken::Integer:
49 OS << "int: " << getString();
50 break;
51 case AsmToken::Real:
52 OS << "real: " << getString();
53 break;
54 case AsmToken::String:
55 OS << "string: " << getString();
56 break;
57
58 // clang-format off
59 case AsmToken::Amp: OS << "Amp"; break;
60 case AsmToken::AmpAmp: OS << "AmpAmp"; break;
61 case AsmToken::At: OS << "At"; break;
62 case AsmToken::BackSlash: OS << "BackSlash"; break;
63 case AsmToken::BigNum: OS << "BigNum"; break;
64 case AsmToken::Caret: OS << "Caret"; break;
65 case AsmToken::Colon: OS << "Colon"; break;
66 case AsmToken::Comma: OS << "Comma"; break;
67 case AsmToken::Comment: OS << "Comment"; break;
68 case AsmToken::Dollar: OS << "Dollar"; break;
69 case AsmToken::Dot: OS << "Dot"; break;
70 case AsmToken::EndOfStatement: OS << "EndOfStatement"; break;
71 case AsmToken::Eof: OS << "Eof"; break;
72 case AsmToken::Equal: OS << "Equal"; break;
73 case AsmToken::EqualEqual: OS << "EqualEqual"; break;
74 case AsmToken::Exclaim: OS << "Exclaim"; break;
75 case AsmToken::ExclaimEqual: OS << "ExclaimEqual"; break;
76 case AsmToken::Greater: OS << "Greater"; break;
77 case AsmToken::GreaterEqual: OS << "GreaterEqual"; break;
78 case AsmToken::GreaterGreater: OS << "GreaterGreater"; break;
79 case AsmToken::Hash: OS << "Hash"; break;
80 case AsmToken::HashDirective: OS << "HashDirective"; break;
81 case AsmToken::LBrac: OS << "LBrac"; break;
82 case AsmToken::LCurly: OS << "LCurly"; break;
83 case AsmToken::LParen: OS << "LParen"; break;
84 case AsmToken::Less: OS << "Less"; break;
85 case AsmToken::LessEqual: OS << "LessEqual"; break;
86 case AsmToken::LessGreater: OS << "LessGreater"; break;
87 case AsmToken::LessLess: OS << "LessLess"; break;
88 case AsmToken::Minus: OS << "Minus"; break;
89 case AsmToken::MinusGreater: OS << "MinusGreater"; break;
90 case AsmToken::Percent: OS << "Percent"; break;
91 case AsmToken::Pipe: OS << "Pipe"; break;
92 case AsmToken::PipePipe: OS << "PipePipe"; break;
93 case AsmToken::Plus: OS << "Plus"; break;
94 case AsmToken::Question: OS << "Question"; break;
95 case AsmToken::RBrac: OS << "RBrac"; break;
96 case AsmToken::RCurly: OS << "RCurly"; break;
97 case AsmToken::RParen: OS << "RParen"; break;
98 case AsmToken::Slash: OS << "Slash"; break;
99 case AsmToken::Space: OS << "Space"; break;
100 case AsmToken::Star: OS << "Star"; break;
101 case AsmToken::Tilde: OS << "Tilde"; break;
102 // clang-format on
103 }
104
105 // Print the token string.
106 OS << " (\"";
107 OS.write_escaped(Str: getString());
108 OS << "\")";
109}
110
111AsmLexer::AsmLexer(const MCAsmInfo &MAI) : MAI(MAI) {
112 // For COFF targets, this is true, while for ELF targets, it should be false.
113 // Currently, @specifier parsing depends on '@' being included in the token.
114 AllowAtInIdentifier = !StringRef(MAI.getCommentString()).starts_with(Prefix: "@") &&
115 MAI.useAtForSpecifier();
116 LexMotorolaIntegers = MAI.shouldUseMotorolaIntegers();
117
118 CurTok.emplace_back(Args: AsmToken::Space, Args: StringRef());
119}
120
121void AsmLexer::setBuffer(StringRef Buf, const char *ptr,
122 bool EndStatementAtEOF) {
123 CurBuf = Buf;
124
125 if (ptr)
126 CurPtr = ptr;
127 else
128 CurPtr = CurBuf.begin();
129
130 TokStart = nullptr;
131 this->EndStatementAtEOF = EndStatementAtEOF;
132}
133
134/// ReturnError - Set the error to the specified string at the specified
135/// location. This is defined to always return AsmToken::Error.
136AsmToken AsmLexer::ReturnError(const char *Loc, const std::string &Msg) {
137 SetError(errLoc: SMLoc::getFromPointer(Ptr: Loc), err: Msg);
138
139 return AsmToken(AsmToken::Error, StringRef(Loc, CurPtr - Loc));
140}
141
142int AsmLexer::getNextChar() {
143 if (CurPtr == CurBuf.end())
144 return EOF;
145 return (unsigned char)*CurPtr++;
146}
147
148int AsmLexer::peekNextChar() {
149 if (CurPtr == CurBuf.end())
150 return EOF;
151 return (unsigned char)*CurPtr;
152}
153
154/// The leading integral digit sequence and dot should have already been
155/// consumed, some or all of the fractional digit sequence *can* have been
156/// consumed.
157AsmToken AsmLexer::LexFloatLiteral() {
158 // Skip the fractional digit sequence.
159 while (isDigit(C: *CurPtr))
160 ++CurPtr;
161
162 if (*CurPtr == '-' || *CurPtr == '+')
163 return ReturnError(Loc: CurPtr, Msg: "invalid sign in float literal");
164
165 // Check for exponent
166 if ((*CurPtr == 'e' || *CurPtr == 'E')) {
167 ++CurPtr;
168
169 if (*CurPtr == '-' || *CurPtr == '+')
170 ++CurPtr;
171
172 while (isDigit(C: *CurPtr))
173 ++CurPtr;
174 }
175
176 return AsmToken(AsmToken::Real,
177 StringRef(TokStart, CurPtr - TokStart));
178}
179
180/// LexHexFloatLiteral matches essentially (.[0-9a-fA-F]*)?[pP][+-]?[0-9a-fA-F]+
181/// while making sure there are enough actual digits around for the constant to
182/// be valid.
183///
184/// The leading "0x[0-9a-fA-F]*" (i.e. integer part) has already been consumed
185/// before we get here.
186AsmToken AsmLexer::LexHexFloatLiteral(bool NoIntDigits) {
187 assert((*CurPtr == 'p' || *CurPtr == 'P' || *CurPtr == '.') &&
188 "unexpected parse state in floating hex");
189 bool NoFracDigits = true;
190
191 // Skip the fractional part if there is one
192 if (*CurPtr == '.') {
193 ++CurPtr;
194
195 const char *FracStart = CurPtr;
196 while (isHexDigit(C: *CurPtr))
197 ++CurPtr;
198
199 NoFracDigits = CurPtr == FracStart;
200 }
201
202 if (NoIntDigits && NoFracDigits)
203 return ReturnError(Loc: TokStart, Msg: "invalid hexadecimal floating-point constant: "
204 "expected at least one significand digit");
205
206 // Make sure we do have some kind of proper exponent part
207 if (*CurPtr != 'p' && *CurPtr != 'P')
208 return ReturnError(Loc: TokStart, Msg: "invalid hexadecimal floating-point constant: "
209 "expected exponent part 'p'");
210 ++CurPtr;
211
212 if (*CurPtr == '+' || *CurPtr == '-')
213 ++CurPtr;
214
215 // N.b. exponent digits are *not* hex
216 const char *ExpStart = CurPtr;
217 while (isDigit(C: *CurPtr))
218 ++CurPtr;
219
220 if (CurPtr == ExpStart)
221 return ReturnError(Loc: TokStart, Msg: "invalid hexadecimal floating-point constant: "
222 "expected at least one exponent digit");
223
224 return AsmToken(AsmToken::Real, StringRef(TokStart, CurPtr - TokStart));
225}
226
227/// LexIdentifier: [a-zA-Z_$.@?][a-zA-Z0-9_$.@#?]*
228static bool isIdentifierChar(char C, bool AllowAt, bool AllowHash) {
229 return isAlnum(C) || C == '_' || C == '$' || C == '.' || C == '?' ||
230 (AllowAt && C == '@') || (AllowHash && C == '#');
231}
232
233AsmToken AsmLexer::LexIdentifier() {
234 // Check for floating point literals.
235 if (CurPtr[-1] == '.' && isDigit(C: *CurPtr)) {
236 // Disambiguate a .1243foo identifier from a floating literal.
237 while (isDigit(C: *CurPtr))
238 ++CurPtr;
239
240 if (!isIdentifierChar(C: *CurPtr, AllowAt: AllowAtInIdentifier,
241 AllowHash: AllowHashInIdentifier) ||
242 *CurPtr == 'e' || *CurPtr == 'E')
243 return LexFloatLiteral();
244 }
245
246 while (isIdentifierChar(C: *CurPtr, AllowAt: AllowAtInIdentifier, AllowHash: AllowHashInIdentifier))
247 ++CurPtr;
248
249 // Handle . as a special case.
250 if (CurPtr == TokStart+1 && TokStart[0] == '.')
251 return AsmToken(AsmToken::Dot, StringRef(TokStart, 1));
252
253 return AsmToken(AsmToken::Identifier, StringRef(TokStart, CurPtr - TokStart));
254}
255
256/// LexSlash: Slash: /
257/// C-Style Comment: /* ... */
258/// C-style Comment: // ...
259AsmToken AsmLexer::LexSlash() {
260 if (!MAI.shouldAllowAdditionalComments()) {
261 IsAtStartOfStatement = false;
262 return AsmToken(AsmToken::Slash, StringRef(TokStart, 1));
263 }
264
265 switch (*CurPtr) {
266 case '*':
267 IsAtStartOfStatement = false;
268 break; // C style comment.
269 case '/':
270 ++CurPtr;
271 return LexLineComment();
272 default:
273 IsAtStartOfStatement = false;
274 return AsmToken(AsmToken::Slash, StringRef(TokStart, 1));
275 }
276
277 // C Style comment.
278 ++CurPtr; // skip the star.
279 const char *CommentTextStart = CurPtr;
280 while (CurPtr != CurBuf.end()) {
281 switch (*CurPtr++) {
282 case '*':
283 // End of the comment?
284 if (*CurPtr != '/')
285 break;
286 // If we have a CommentConsumer, notify it about the comment.
287 if (CommentConsumer) {
288 CommentConsumer->HandleComment(
289 Loc: SMLoc::getFromPointer(Ptr: CommentTextStart),
290 CommentText: StringRef(CommentTextStart, CurPtr - 1 - CommentTextStart));
291 }
292 ++CurPtr; // End the */.
293 return AsmToken(AsmToken::Comment,
294 StringRef(TokStart, CurPtr - TokStart));
295 }
296 }
297 return ReturnError(Loc: TokStart, Msg: "unterminated comment");
298}
299
300/// LexLineComment: Comment: #[^\n]*
301/// : //[^\n]*
302AsmToken AsmLexer::LexLineComment() {
303 // Mark This as an end of statement with a body of the
304 // comment. While it would be nicer to leave this two tokens,
305 // backwards compatability with TargetParsers makes keeping this in this form
306 // better.
307 const char *CommentTextStart = CurPtr;
308 int CurChar = getNextChar();
309 while (CurChar != '\n' && CurChar != '\r' && CurChar != EOF)
310 CurChar = getNextChar();
311 const char *NewlinePtr = CurPtr;
312 if (CurChar == '\r' && CurPtr != CurBuf.end() && *CurPtr == '\n')
313 ++CurPtr;
314
315 // If we have a CommentConsumer, notify it about the comment.
316 if (CommentConsumer) {
317 CommentConsumer->HandleComment(
318 Loc: SMLoc::getFromPointer(Ptr: CommentTextStart),
319 CommentText: StringRef(CommentTextStart, NewlinePtr - 1 - CommentTextStart));
320 }
321
322 IsAtStartOfLine = true;
323 // This is a whole line comment. leave newline
324 if (IsAtStartOfStatement)
325 return AsmToken(AsmToken::EndOfStatement,
326 StringRef(TokStart, CurPtr - TokStart));
327 IsAtStartOfStatement = true;
328
329 return AsmToken(AsmToken::EndOfStatement,
330 StringRef(TokStart, CurPtr - 1 - TokStart));
331}
332
/// Advances \p CurPtr past an optional integer type suffix: at most one
/// 'U'/'u' followed by at most two 'L'/'l' characters (case-insensitive
/// U, L, LL, UL and ULL).
static void SkipIgnoredIntegerSuffix(const char *&CurPtr) {
  const auto SkipIf = [&CurPtr](char Upper, char Lower) {
    if (*CurPtr == Upper || *CurPtr == Lower)
      ++CurPtr;
  };
  SkipIf('U', 'u');
  SkipIf('L', 'l');
  SkipIf('L', 'l');
}
342
343// Look ahead to search for first non-hex digit, if it's [hH], then we treat the
344// integer as a hexadecimal, possibly with leading zeroes.
345static unsigned doHexLookAhead(const char *&CurPtr, unsigned DefaultRadix,
346 bool LexHex) {
347 const char *FirstNonDec = nullptr;
348 const char *LookAhead = CurPtr;
349 while (true) {
350 if (isDigit(C: *LookAhead)) {
351 ++LookAhead;
352 } else {
353 if (!FirstNonDec)
354 FirstNonDec = LookAhead;
355
356 // Keep going if we are looking for a 'h' suffix.
357 if (LexHex && isHexDigit(C: *LookAhead))
358 ++LookAhead;
359 else
360 break;
361 }
362 }
363 bool isHex = LexHex && (*LookAhead == 'h' || *LookAhead == 'H');
364 CurPtr = isHex || !FirstNonDec ? LookAhead : FirstNonDec;
365 if (isHex)
366 return 16;
367 return DefaultRadix;
368}
369
370static const char *findLastDigit(const char *CurPtr, unsigned DefaultRadix) {
371 while (hexDigitValue(C: *CurPtr) < DefaultRadix) {
372 ++CurPtr;
373 }
374 return CurPtr;
375}
376
377static AsmToken intToken(StringRef Ref, APInt &Value) {
378 if (Value.isIntN(N: 64))
379 return AsmToken(AsmToken::Integer, Ref, Value);
380 return AsmToken(AsmToken::BigNum, Ref, Value);
381}
382
// Returns the name of Radix for use in diagnostics: "binary", "octal",
// "decimal" or "hexadecimal" for 2, 8, 10 and 16, and "base-<N>" for any
// other value.
static std::string radixName(unsigned Radix) {
  if (Radix == 2)
    return "binary";
  if (Radix == 8)
    return "octal";
  if (Radix == 10)
    return "decimal";
  if (Radix == 16)
    return "hexadecimal";
  return "base-" + std::to_string(Radix);
}
397
398/// LexDigit: First character is [0-9].
399/// Local Label: [0-9][:]
400/// Forward/Backward Label: [0-9][fb]
401/// Binary integer: 0b[01]+
402/// Octal integer: 0[0-7]+
403/// Hex integer: 0x[0-9a-fA-F]+ or [0x]?[0-9][0-9a-fA-F]*[hH]
404/// Decimal integer: [1-9][0-9]*
405AsmToken AsmLexer::LexDigit() {
406 // MASM-flavor binary integer: [01]+[yY] (if DefaultRadix < 16, [bByY])
407 // MASM-flavor octal integer: [0-7]+[oOqQ]
408 // MASM-flavor decimal integer: [0-9]+[tT] (if DefaultRadix < 16, [dDtT])
409 // MASM-flavor hexadecimal integer: [0-9][0-9a-fA-F]*[hH]
410 if (LexMasmIntegers && isdigit(CurPtr[-1])) {
411 const char *FirstNonBinary =
412 (CurPtr[-1] != '0' && CurPtr[-1] != '1') ? CurPtr - 1 : nullptr;
413 const char *FirstNonDecimal =
414 (CurPtr[-1] < '0' || CurPtr[-1] > '9') ? CurPtr - 1 : nullptr;
415 const char *OldCurPtr = CurPtr;
416 while (isHexDigit(C: *CurPtr)) {
417 switch (*CurPtr) {
418 default:
419 if (!FirstNonDecimal) {
420 FirstNonDecimal = CurPtr;
421 }
422 [[fallthrough]];
423 case '9':
424 case '8':
425 case '7':
426 case '6':
427 case '5':
428 case '4':
429 case '3':
430 case '2':
431 if (!FirstNonBinary) {
432 FirstNonBinary = CurPtr;
433 }
434 break;
435 case '1':
436 case '0':
437 break;
438 }
439 ++CurPtr;
440 }
441 if (*CurPtr == '.') {
442 // MASM float literals (other than hex floats) always contain a ".", and
443 // are always written in decimal.
444 ++CurPtr;
445 return LexFloatLiteral();
446 }
447
448 if (LexMasmHexFloats && (*CurPtr == 'r' || *CurPtr == 'R')) {
449 ++CurPtr;
450 return AsmToken(AsmToken::Real, StringRef(TokStart, CurPtr - TokStart));
451 }
452
453 unsigned Radix = 0;
454 if (*CurPtr == 'h' || *CurPtr == 'H') {
455 // hexadecimal number
456 ++CurPtr;
457 Radix = 16;
458 } else if (*CurPtr == 't' || *CurPtr == 'T') {
459 // decimal number
460 ++CurPtr;
461 Radix = 10;
462 } else if (*CurPtr == 'o' || *CurPtr == 'O' || *CurPtr == 'q' ||
463 *CurPtr == 'Q') {
464 // octal number
465 ++CurPtr;
466 Radix = 8;
467 } else if (*CurPtr == 'y' || *CurPtr == 'Y') {
468 // binary number
469 ++CurPtr;
470 Radix = 2;
471 } else if (FirstNonDecimal && FirstNonDecimal + 1 == CurPtr &&
472 DefaultRadix < 14 &&
473 (*FirstNonDecimal == 'd' || *FirstNonDecimal == 'D')) {
474 Radix = 10;
475 } else if (FirstNonBinary && FirstNonBinary + 1 == CurPtr &&
476 DefaultRadix < 12 &&
477 (*FirstNonBinary == 'b' || *FirstNonBinary == 'B')) {
478 Radix = 2;
479 }
480
481 if (Radix) {
482 StringRef Result(TokStart, CurPtr - TokStart);
483 APInt Value(128, 0, true);
484
485 if (Result.drop_back().getAsInteger(Radix, Result&: Value))
486 return ReturnError(Loc: TokStart, Msg: "invalid " + radixName(Radix) + " number");
487
488 // MSVC accepts and ignores type suffices on integer literals.
489 SkipIgnoredIntegerSuffix(CurPtr);
490
491 return intToken(Ref: Result, Value);
492 }
493
494 // default-radix integers, or floating point numbers, fall through
495 CurPtr = OldCurPtr;
496 }
497
498 // MASM default-radix integers: [0-9a-fA-F]+
499 // (All other integer literals have a radix specifier.)
500 if (LexMasmIntegers && UseMasmDefaultRadix) {
501 CurPtr = findLastDigit(CurPtr, DefaultRadix: 16);
502 StringRef Result(TokStart, CurPtr - TokStart);
503
504 APInt Value(128, 0, true);
505 if (Result.getAsInteger(Radix: DefaultRadix, Result&: Value)) {
506 return ReturnError(Loc: TokStart,
507 Msg: "invalid " + radixName(Radix: DefaultRadix) + " number");
508 }
509
510 return intToken(Ref: Result, Value);
511 }
512
513 // Motorola hex integers: $[0-9a-fA-F]+
514 if (LexMotorolaIntegers && CurPtr[-1] == '$') {
515 const char *NumStart = CurPtr;
516 while (isHexDigit(C: CurPtr[0]))
517 ++CurPtr;
518
519 APInt Result(128, 0);
520 if (StringRef(NumStart, CurPtr - NumStart).getAsInteger(Radix: 16, Result))
521 return ReturnError(Loc: TokStart, Msg: "invalid hexadecimal number");
522
523 return intToken(Ref: StringRef(TokStart, CurPtr - TokStart), Value&: Result);
524 }
525
526 // Motorola binary integers: %[01]+
527 if (LexMotorolaIntegers && CurPtr[-1] == '%') {
528 const char *NumStart = CurPtr;
529 while (*CurPtr == '0' || *CurPtr == '1')
530 ++CurPtr;
531
532 APInt Result(128, 0);
533 if (StringRef(NumStart, CurPtr - NumStart).getAsInteger(Radix: 2, Result))
534 return ReturnError(Loc: TokStart, Msg: "invalid binary number");
535
536 return intToken(Ref: StringRef(TokStart, CurPtr - TokStart), Value&: Result);
537 }
538
539 // Decimal integer: [1-9][0-9]*
540 // HLASM-flavour decimal integer: [0-9][0-9]*
541 // FIXME: Later on, support for fb for HLASM has to be added in
542 // as they probably would be needed for asm goto
543 if (LexHLASMIntegers || CurPtr[-1] != '0' || CurPtr[0] == '.') {
544 unsigned Radix = doHexLookAhead(CurPtr, DefaultRadix: 10, LexHex: LexMasmIntegers);
545
546 if (!LexHLASMIntegers) {
547 bool IsHex = Radix == 16;
548 // Check for floating point literals.
549 if (!IsHex && (*CurPtr == '.' || *CurPtr == 'e' || *CurPtr == 'E')) {
550 if (*CurPtr == '.')
551 ++CurPtr;
552 return LexFloatLiteral();
553 }
554 }
555
556 StringRef Result(TokStart, CurPtr - TokStart);
557
558 APInt Value(128, 0, true);
559 if (Result.getAsInteger(Radix, Result&: Value))
560 return ReturnError(Loc: TokStart, Msg: "invalid " + radixName(Radix) + " number");
561
562 if (!LexHLASMIntegers)
563 // The darwin/x86 (and x86-64) assembler accepts and ignores type
564 // suffices on integer literals.
565 SkipIgnoredIntegerSuffix(CurPtr);
566
567 return intToken(Ref: Result, Value);
568 }
569
570 if (!LexMasmIntegers && ((*CurPtr == 'b') || (*CurPtr == 'B'))) {
571 ++CurPtr;
572 // See if we actually have "0b" as part of something like "jmp 0b\n"
573 if (!isDigit(C: CurPtr[0])) {
574 --CurPtr;
575 StringRef Result(TokStart, CurPtr - TokStart);
576 return AsmToken(AsmToken::Integer, Result, 0);
577 }
578 const char *NumStart = CurPtr;
579 while (CurPtr[0] == '0' || CurPtr[0] == '1')
580 ++CurPtr;
581
582 // Requires at least one binary digit.
583 if (CurPtr == NumStart)
584 return ReturnError(Loc: TokStart, Msg: "invalid binary number");
585
586 StringRef Result(TokStart, CurPtr - TokStart);
587
588 APInt Value(128, 0, true);
589 if (Result.substr(Start: 2).getAsInteger(Radix: 2, Result&: Value))
590 return ReturnError(Loc: TokStart, Msg: "invalid binary number");
591
592 // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
593 // suffixes on integer literals.
594 SkipIgnoredIntegerSuffix(CurPtr);
595
596 return intToken(Ref: Result, Value);
597 }
598
599 if ((*CurPtr == 'x') || (*CurPtr == 'X')) {
600 ++CurPtr;
601 const char *NumStart = CurPtr;
602 while (isHexDigit(C: CurPtr[0]))
603 ++CurPtr;
604
605 // "0x.0p0" is valid, and "0x0p0" (but not "0xp0" for example, which will be
606 // diagnosed by LexHexFloatLiteral).
607 if (CurPtr[0] == '.' || CurPtr[0] == 'p' || CurPtr[0] == 'P')
608 return LexHexFloatLiteral(NoIntDigits: NumStart == CurPtr);
609
610 // Otherwise requires at least one hex digit.
611 if (CurPtr == NumStart)
612 return ReturnError(Loc: CurPtr-2, Msg: "invalid hexadecimal number");
613
614 APInt Result(128, 0);
615 if (StringRef(TokStart, CurPtr - TokStart).getAsInteger(Radix: 0, Result))
616 return ReturnError(Loc: TokStart, Msg: "invalid hexadecimal number");
617
618 // Consume the optional [hH].
619 if (LexMasmIntegers && (*CurPtr == 'h' || *CurPtr == 'H'))
620 ++CurPtr;
621
622 // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
623 // suffixes on integer literals.
624 SkipIgnoredIntegerSuffix(CurPtr);
625
626 return intToken(Ref: StringRef(TokStart, CurPtr - TokStart), Value&: Result);
627 }
628
629 // Either octal or hexadecimal.
630 APInt Value(128, 0, true);
631 unsigned Radix = doHexLookAhead(CurPtr, DefaultRadix: 8, LexHex: LexMasmIntegers);
632 StringRef Result(TokStart, CurPtr - TokStart);
633 if (Result.getAsInteger(Radix, Result&: Value))
634 return ReturnError(Loc: TokStart, Msg: "invalid " + radixName(Radix) + " number");
635
636 // Consume the [hH].
637 if (Radix == 16)
638 ++CurPtr;
639
640 // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
641 // suffixes on integer literals.
642 SkipIgnoredIntegerSuffix(CurPtr);
643
644 return intToken(Ref: Result, Value);
645}
646
647/// LexSingleQuote: Integer: 'b'
648AsmToken AsmLexer::LexSingleQuote() {
649 int CurChar = getNextChar();
650
651 if (LexHLASMStrings)
652 return ReturnError(Loc: TokStart, Msg: "invalid usage of character literals");
653
654 if (LexMasmStrings) {
655 while (CurChar != EOF) {
656 if (CurChar != '\'') {
657 CurChar = getNextChar();
658 } else if (peekNextChar() == '\'') {
659 // In MASM single-quote strings, doubled single-quotes mean an escaped
660 // single quote, so should be lexed in.
661 (void)getNextChar();
662 CurChar = getNextChar();
663 } else {
664 break;
665 }
666 }
667 if (CurChar == EOF)
668 return ReturnError(Loc: TokStart, Msg: "unterminated string constant");
669 return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart));
670 }
671
672 if (CurChar == '\\')
673 CurChar = getNextChar();
674
675 if (CurChar == EOF)
676 return ReturnError(Loc: TokStart, Msg: "unterminated single quote");
677
678 CurChar = getNextChar();
679
680 if (CurChar != '\'')
681 return ReturnError(Loc: TokStart, Msg: "single quote way too long");
682
683 // The idea here being that 'c' is basically just an integral
684 // constant.
685 StringRef Res = StringRef(TokStart,CurPtr - TokStart);
686 long long Value;
687
688 if (Res.starts_with(Prefix: "\'\\")) {
689 char theChar = Res[2];
690 switch (theChar) {
691 default: Value = theChar; break;
692 case '\'': Value = '\''; break;
693 case 't': Value = '\t'; break;
694 case 'n': Value = '\n'; break;
695 case 'b': Value = '\b'; break;
696 case 'f': Value = '\f'; break;
697 case 'r': Value = '\r'; break;
698 }
699 } else
700 Value = TokStart[1];
701
702 return AsmToken(AsmToken::Integer, Res, Value);
703}
704
705/// LexQuote: String: "..."
706AsmToken AsmLexer::LexQuote() {
707 int CurChar = getNextChar();
708 if (LexHLASMStrings)
709 return ReturnError(Loc: TokStart, Msg: "invalid usage of string literals");
710
711 if (LexMasmStrings) {
712 while (CurChar != EOF) {
713 if (CurChar != '"') {
714 CurChar = getNextChar();
715 } else if (peekNextChar() == '"') {
716 // In MASM double-quoted strings, doubled double-quotes mean an escaped
717 // double quote, so should be lexed in.
718 (void)getNextChar();
719 CurChar = getNextChar();
720 } else {
721 break;
722 }
723 }
724 if (CurChar == EOF)
725 return ReturnError(Loc: TokStart, Msg: "unterminated string constant");
726 return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart));
727 }
728
729 while (CurChar != '"') {
730 if (CurChar == '\\') {
731 // Allow \", etc.
732 CurChar = getNextChar();
733 }
734
735 if (CurChar == EOF)
736 return ReturnError(Loc: TokStart, Msg: "unterminated string constant");
737
738 CurChar = getNextChar();
739 }
740
741 return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart));
742}
743
744StringRef AsmLexer::LexUntilEndOfStatement() {
745 TokStart = CurPtr;
746
747 while (!isAtStartOfComment(Ptr: CurPtr) && // Start of line comment.
748 !isAtStatementSeparator(Ptr: CurPtr) && // End of statement marker.
749 *CurPtr != '\n' && *CurPtr != '\r' && CurPtr != CurBuf.end()) {
750 ++CurPtr;
751 }
752 return StringRef(TokStart, CurPtr-TokStart);
753}
754
755StringRef AsmLexer::LexUntilEndOfLine() {
756 TokStart = CurPtr;
757
758 while (*CurPtr != '\n' && *CurPtr != '\r' && CurPtr != CurBuf.end()) {
759 ++CurPtr;
760 }
761 return StringRef(TokStart, CurPtr-TokStart);
762}
763
764size_t AsmLexer::peekTokens(MutableArrayRef<AsmToken> Buf,
765 bool ShouldSkipSpace) {
766 SaveAndRestore SavedTokenStart(TokStart);
767 SaveAndRestore SavedCurPtr(CurPtr);
768 SaveAndRestore SavedAtStartOfLine(IsAtStartOfLine);
769 SaveAndRestore SavedAtStartOfStatement(IsAtStartOfStatement);
770 SaveAndRestore SavedSkipSpace(SkipSpace, ShouldSkipSpace);
771 SaveAndRestore SavedIsPeeking(IsPeeking, true);
772 std::string SavedErr = getErr();
773 SMLoc SavedErrLoc = getErrLoc();
774
775 size_t ReadCount;
776 for (ReadCount = 0; ReadCount < Buf.size(); ++ReadCount) {
777 AsmToken Token = LexToken();
778
779 Buf[ReadCount] = Token;
780
781 if (Token.is(K: AsmToken::Eof)) {
782 ReadCount++;
783 break;
784 }
785 }
786
787 SetError(errLoc: SavedErrLoc, err: SavedErr);
788 return ReadCount;
789}
790
791bool AsmLexer::isAtStartOfComment(const char *Ptr) {
792 if (MAI.isHLASM() && !IsAtStartOfStatement)
793 return false;
794
795 StringRef CommentString = MAI.getCommentString();
796
797 if (CommentString.size() == 1)
798 return CommentString[0] == Ptr[0];
799
800 // Allow # preprocessor comments also be counted as comments for "##" cases
801 if (CommentString[1] == '#')
802 return CommentString[0] == Ptr[0];
803
804 return strncmp(s1: Ptr, s2: CommentString.data(), n: CommentString.size()) == 0;
805}
806
807bool AsmLexer::isAtStatementSeparator(const char *Ptr) {
808 return strncmp(s1: Ptr, s2: MAI.getSeparatorString(),
809 n: strlen(s: MAI.getSeparatorString())) == 0;
810}
811
812AsmToken AsmLexer::LexToken() {
813 TokStart = CurPtr;
814 // This always consumes at least one character.
815 int CurChar = getNextChar();
816
817 if (!IsPeeking && CurChar == '#' && IsAtStartOfStatement) {
818 // If this starts with a '#', this may be a cpp
819 // hash directive and otherwise a line comment.
820 AsmToken TokenBuf[2];
821 MutableArrayRef<AsmToken> Buf(TokenBuf, 2);
822 size_t num = peekTokens(Buf, ShouldSkipSpace: true);
823 // There cannot be a space preceding this
824 if (IsAtStartOfLine && num == 2 && TokenBuf[0].is(K: AsmToken::Integer) &&
825 TokenBuf[1].is(K: AsmToken::String)) {
826 CurPtr = TokStart; // reset curPtr;
827 StringRef s = LexUntilEndOfLine();
828 UnLex(Token: TokenBuf[1]);
829 UnLex(Token: TokenBuf[0]);
830 return AsmToken(AsmToken::HashDirective, s);
831 }
832
833 if (MAI.shouldAllowAdditionalComments())
834 return LexLineComment();
835 }
836
837 if (isAtStartOfComment(Ptr: TokStart))
838 return LexLineComment();
839
840 if (isAtStatementSeparator(Ptr: TokStart)) {
841 CurPtr += strlen(s: MAI.getSeparatorString()) - 1;
842 IsAtStartOfLine = true;
843 IsAtStartOfStatement = true;
844 return AsmToken(AsmToken::EndOfStatement,
845 StringRef(TokStart, strlen(s: MAI.getSeparatorString())));
846 }
847
848 // If we're missing a newline at EOF, make sure we still get an
849 // EndOfStatement token before the Eof token.
850 if (CurChar == EOF && !IsAtStartOfStatement && EndStatementAtEOF) {
851 IsAtStartOfLine = true;
852 IsAtStartOfStatement = true;
853 return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 0));
854 }
855 IsAtStartOfLine = false;
856 bool OldIsAtStartOfStatement = IsAtStartOfStatement;
857 IsAtStartOfStatement = false;
858 switch (CurChar) {
859 default:
860 // Handle identifier: [a-zA-Z_.$@#?][a-zA-Z0-9_.$@#?]*
861 // Whether or not the lexer accepts '$', '@', '#' and '?' at the start of
862 // an identifier is target-dependent. These characters are handled in the
863 // respective switch cases.
864 if (isalpha(CurChar) || CurChar == '_' || CurChar == '.')
865 return LexIdentifier();
866
867 // Unknown character, emit an error.
868 return ReturnError(Loc: TokStart, Msg: "invalid character in input");
869 case EOF:
870 if (EndStatementAtEOF) {
871 IsAtStartOfLine = true;
872 IsAtStartOfStatement = true;
873 }
874 return AsmToken(AsmToken::Eof, StringRef(TokStart, 0));
875 case 0:
876 case ' ':
877 case '\t':
878 IsAtStartOfStatement = OldIsAtStartOfStatement;
879 while (*CurPtr == ' ' || *CurPtr == '\t')
880 CurPtr++;
881 if (SkipSpace)
882 return LexToken(); // Ignore whitespace.
883 else
884 return AsmToken(AsmToken::Space, StringRef(TokStart, CurPtr - TokStart));
885 case '\r': {
886 IsAtStartOfLine = true;
887 IsAtStartOfStatement = true;
888 // If this is a CR followed by LF, treat that as one token.
889 if (CurPtr != CurBuf.end() && *CurPtr == '\n')
890 ++CurPtr;
891 return AsmToken(AsmToken::EndOfStatement,
892 StringRef(TokStart, CurPtr - TokStart));
893 }
894 case '\n':
895 IsAtStartOfLine = true;
896 IsAtStartOfStatement = true;
897 return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 1));
898 case ':': return AsmToken(AsmToken::Colon, StringRef(TokStart, 1));
899 case '+': return AsmToken(AsmToken::Plus, StringRef(TokStart, 1));
900 case '~': return AsmToken(AsmToken::Tilde, StringRef(TokStart, 1));
901 case '(': return AsmToken(AsmToken::LParen, StringRef(TokStart, 1));
902 case ')': return AsmToken(AsmToken::RParen, StringRef(TokStart, 1));
903 case '[': return AsmToken(AsmToken::LBrac, StringRef(TokStart, 1));
904 case ']': return AsmToken(AsmToken::RBrac, StringRef(TokStart, 1));
905 case '{': return AsmToken(AsmToken::LCurly, StringRef(TokStart, 1));
906 case '}': return AsmToken(AsmToken::RCurly, StringRef(TokStart, 1));
907 case '*': return AsmToken(AsmToken::Star, StringRef(TokStart, 1));
908 case ',': return AsmToken(AsmToken::Comma, StringRef(TokStart, 1));
909 case '$': {
910 if (LexMotorolaIntegers && isHexDigit(C: *CurPtr))
911 return LexDigit();
912 if (MAI.doesAllowDollarAtStartOfIdentifier())
913 return LexIdentifier();
914 return AsmToken(AsmToken::Dollar, StringRef(TokStart, 1));
915 }
916 case '@':
917 if (MAI.doesAllowAtAtStartOfIdentifier())
918 return LexIdentifier();
919 return AsmToken(AsmToken::At, StringRef(TokStart, 1));
920 case '#':
921 if (MAI.isHLASM())
922 return LexIdentifier();
923 return AsmToken(AsmToken::Hash, StringRef(TokStart, 1));
924 case '?':
925 if (MAI.doesAllowQuestionAtStartOfIdentifier())
926 return LexIdentifier();
927 return AsmToken(AsmToken::Question, StringRef(TokStart, 1));
928 case '\\': return AsmToken(AsmToken::BackSlash, StringRef(TokStart, 1));
929 case '=':
930 if (*CurPtr == '=') {
931 ++CurPtr;
932 return AsmToken(AsmToken::EqualEqual, StringRef(TokStart, 2));
933 }
934 return AsmToken(AsmToken::Equal, StringRef(TokStart, 1));
935 case '-':
936 if (*CurPtr == '>') {
937 ++CurPtr;
938 return AsmToken(AsmToken::MinusGreater, StringRef(TokStart, 2));
939 }
940 return AsmToken(AsmToken::Minus, StringRef(TokStart, 1));
941 case '|':
942 if (*CurPtr == '|') {
943 ++CurPtr;
944 return AsmToken(AsmToken::PipePipe, StringRef(TokStart, 2));
945 }
946 return AsmToken(AsmToken::Pipe, StringRef(TokStart, 1));
947 case '^': return AsmToken(AsmToken::Caret, StringRef(TokStart, 1));
948 case '&':
949 if (*CurPtr == '&') {
950 ++CurPtr;
951 return AsmToken(AsmToken::AmpAmp, StringRef(TokStart, 2));
952 }
953 return AsmToken(AsmToken::Amp, StringRef(TokStart, 1));
954 case '!':
955 if (*CurPtr == '=') {
956 ++CurPtr;
957 return AsmToken(AsmToken::ExclaimEqual, StringRef(TokStart, 2));
958 }
959 return AsmToken(AsmToken::Exclaim, StringRef(TokStart, 1));
960 case '%':
961 if (LexMotorolaIntegers && (*CurPtr == '0' || *CurPtr == '1')) {
962 return LexDigit();
963 }
964 return AsmToken(AsmToken::Percent, StringRef(TokStart, 1));
965 case '/':
966 IsAtStartOfStatement = OldIsAtStartOfStatement;
967 return LexSlash();
968 case '\'': return LexSingleQuote();
969 case '"': return LexQuote();
970 case '0': case '1': case '2': case '3': case '4':
971 case '5': case '6': case '7': case '8': case '9':
972 return LexDigit();
973 case '<':
974 switch (*CurPtr) {
975 case '<':
976 ++CurPtr;
977 return AsmToken(AsmToken::LessLess, StringRef(TokStart, 2));
978 case '=':
979 ++CurPtr;
980 return AsmToken(AsmToken::LessEqual, StringRef(TokStart, 2));
981 case '>':
982 ++CurPtr;
983 return AsmToken(AsmToken::LessGreater, StringRef(TokStart, 2));
984 default:
985 return AsmToken(AsmToken::Less, StringRef(TokStart, 1));
986 }
987 case '>':
988 switch (*CurPtr) {
989 case '>':
990 ++CurPtr;
991 return AsmToken(AsmToken::GreaterGreater, StringRef(TokStart, 2));
992 case '=':
993 ++CurPtr;
994 return AsmToken(AsmToken::GreaterEqual, StringRef(TokStart, 2));
995 default:
996 return AsmToken(AsmToken::Greater, StringRef(TokStart, 1));
997 }
998
999 // TODO: Quoted identifiers (objc methods etc)
1000 // local labels: [0-9][:]
1001 // Forward/backward labels: [0-9][fb]
1002 // Integers, fp constants, character constants.
1003 }
1004}
1005