//===- AsmLexer.cpp - Lexer for Assembly Files ----------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This class implements the lexer for assembly files.
//
//===----------------------------------------------------------------------===//
12
13#include "llvm/MC/MCParser/AsmLexer.h"
14#include "llvm/ADT/APInt.h"
15#include "llvm/ADT/ArrayRef.h"
16#include "llvm/ADT/StringExtras.h"
17#include "llvm/ADT/StringRef.h"
18#include "llvm/ADT/StringSwitch.h"
19#include "llvm/MC/MCAsmInfo.h"
20#include "llvm/MC/MCParser/MCAsmLexer.h"
21#include "llvm/Support/Compiler.h"
22#include "llvm/Support/SMLoc.h"
23#include "llvm/Support/SaveAndRestore.h"
24#include <cassert>
25#include <cctype>
26#include <cstdio>
27#include <cstring>
28#include <string>
29#include <tuple>
30#include <utility>
31
32using namespace llvm;
33
34AsmLexer::AsmLexer(const MCAsmInfo &MAI) : MAI(MAI) {
35 AllowAtInIdentifier = !StringRef(MAI.getCommentString()).starts_with(Prefix: "@");
36 LexMotorolaIntegers = MAI.shouldUseMotorolaIntegers();
37}
38
39AsmLexer::~AsmLexer() = default;
40
41void AsmLexer::setBuffer(StringRef Buf, const char *ptr,
42 bool EndStatementAtEOF) {
43 CurBuf = Buf;
44
45 if (ptr)
46 CurPtr = ptr;
47 else
48 CurPtr = CurBuf.begin();
49
50 TokStart = nullptr;
51 this->EndStatementAtEOF = EndStatementAtEOF;
52}
53
54/// ReturnError - Set the error to the specified string at the specified
55/// location. This is defined to always return AsmToken::Error.
56AsmToken AsmLexer::ReturnError(const char *Loc, const std::string &Msg) {
57 SetError(errLoc: SMLoc::getFromPointer(Ptr: Loc), err: Msg);
58
59 return AsmToken(AsmToken::Error, StringRef(Loc, CurPtr - Loc));
60}
61
62int AsmLexer::getNextChar() {
63 if (CurPtr == CurBuf.end())
64 return EOF;
65 return (unsigned char)*CurPtr++;
66}
67
68int AsmLexer::peekNextChar() {
69 if (CurPtr == CurBuf.end())
70 return EOF;
71 return (unsigned char)*CurPtr;
72}
73
74/// The leading integral digit sequence and dot should have already been
75/// consumed, some or all of the fractional digit sequence *can* have been
76/// consumed.
77AsmToken AsmLexer::LexFloatLiteral() {
78 // Skip the fractional digit sequence.
79 while (isDigit(C: *CurPtr))
80 ++CurPtr;
81
82 if (*CurPtr == '-' || *CurPtr == '+')
83 return ReturnError(Loc: CurPtr, Msg: "invalid sign in float literal");
84
85 // Check for exponent
86 if ((*CurPtr == 'e' || *CurPtr == 'E')) {
87 ++CurPtr;
88
89 if (*CurPtr == '-' || *CurPtr == '+')
90 ++CurPtr;
91
92 while (isDigit(C: *CurPtr))
93 ++CurPtr;
94 }
95
96 return AsmToken(AsmToken::Real,
97 StringRef(TokStart, CurPtr - TokStart));
98}
99
100/// LexHexFloatLiteral matches essentially (.[0-9a-fA-F]*)?[pP][+-]?[0-9a-fA-F]+
101/// while making sure there are enough actual digits around for the constant to
102/// be valid.
103///
104/// The leading "0x[0-9a-fA-F]*" (i.e. integer part) has already been consumed
105/// before we get here.
106AsmToken AsmLexer::LexHexFloatLiteral(bool NoIntDigits) {
107 assert((*CurPtr == 'p' || *CurPtr == 'P' || *CurPtr == '.') &&
108 "unexpected parse state in floating hex");
109 bool NoFracDigits = true;
110
111 // Skip the fractional part if there is one
112 if (*CurPtr == '.') {
113 ++CurPtr;
114
115 const char *FracStart = CurPtr;
116 while (isHexDigit(C: *CurPtr))
117 ++CurPtr;
118
119 NoFracDigits = CurPtr == FracStart;
120 }
121
122 if (NoIntDigits && NoFracDigits)
123 return ReturnError(Loc: TokStart, Msg: "invalid hexadecimal floating-point constant: "
124 "expected at least one significand digit");
125
126 // Make sure we do have some kind of proper exponent part
127 if (*CurPtr != 'p' && *CurPtr != 'P')
128 return ReturnError(Loc: TokStart, Msg: "invalid hexadecimal floating-point constant: "
129 "expected exponent part 'p'");
130 ++CurPtr;
131
132 if (*CurPtr == '+' || *CurPtr == '-')
133 ++CurPtr;
134
135 // N.b. exponent digits are *not* hex
136 const char *ExpStart = CurPtr;
137 while (isDigit(C: *CurPtr))
138 ++CurPtr;
139
140 if (CurPtr == ExpStart)
141 return ReturnError(Loc: TokStart, Msg: "invalid hexadecimal floating-point constant: "
142 "expected at least one exponent digit");
143
144 return AsmToken(AsmToken::Real, StringRef(TokStart, CurPtr - TokStart));
145}
146
147/// LexIdentifier: [a-zA-Z_$.@?][a-zA-Z0-9_$.@#?]*
148static bool isIdentifierChar(char C, bool AllowAt, bool AllowHash) {
149 return isAlnum(C) || C == '_' || C == '$' || C == '.' || C == '?' ||
150 (AllowAt && C == '@') || (AllowHash && C == '#');
151}
152
153AsmToken AsmLexer::LexIdentifier() {
154 // Check for floating point literals.
155 if (CurPtr[-1] == '.' && isDigit(C: *CurPtr)) {
156 // Disambiguate a .1243foo identifier from a floating literal.
157 while (isDigit(C: *CurPtr))
158 ++CurPtr;
159
160 if (!isIdentifierChar(C: *CurPtr, AllowAt: AllowAtInIdentifier,
161 AllowHash: AllowHashInIdentifier) ||
162 *CurPtr == 'e' || *CurPtr == 'E')
163 return LexFloatLiteral();
164 }
165
166 while (isIdentifierChar(C: *CurPtr, AllowAt: AllowAtInIdentifier, AllowHash: AllowHashInIdentifier))
167 ++CurPtr;
168
169 // Handle . as a special case.
170 if (CurPtr == TokStart+1 && TokStart[0] == '.')
171 return AsmToken(AsmToken::Dot, StringRef(TokStart, 1));
172
173 return AsmToken(AsmToken::Identifier, StringRef(TokStart, CurPtr - TokStart));
174}
175
176/// LexSlash: Slash: /
177/// C-Style Comment: /* ... */
178/// C-style Comment: // ...
179AsmToken AsmLexer::LexSlash() {
180 if (!MAI.shouldAllowAdditionalComments()) {
181 IsAtStartOfStatement = false;
182 return AsmToken(AsmToken::Slash, StringRef(TokStart, 1));
183 }
184
185 switch (*CurPtr) {
186 case '*':
187 IsAtStartOfStatement = false;
188 break; // C style comment.
189 case '/':
190 ++CurPtr;
191 return LexLineComment();
192 default:
193 IsAtStartOfStatement = false;
194 return AsmToken(AsmToken::Slash, StringRef(TokStart, 1));
195 }
196
197 // C Style comment.
198 ++CurPtr; // skip the star.
199 const char *CommentTextStart = CurPtr;
200 while (CurPtr != CurBuf.end()) {
201 switch (*CurPtr++) {
202 case '*':
203 // End of the comment?
204 if (*CurPtr != '/')
205 break;
206 // If we have a CommentConsumer, notify it about the comment.
207 if (CommentConsumer) {
208 CommentConsumer->HandleComment(
209 Loc: SMLoc::getFromPointer(Ptr: CommentTextStart),
210 CommentText: StringRef(CommentTextStart, CurPtr - 1 - CommentTextStart));
211 }
212 ++CurPtr; // End the */.
213 return AsmToken(AsmToken::Comment,
214 StringRef(TokStart, CurPtr - TokStart));
215 }
216 }
217 return ReturnError(Loc: TokStart, Msg: "unterminated comment");
218}
219
220/// LexLineComment: Comment: #[^\n]*
221/// : //[^\n]*
222AsmToken AsmLexer::LexLineComment() {
223 // Mark This as an end of statement with a body of the
224 // comment. While it would be nicer to leave this two tokens,
225 // backwards compatability with TargetParsers makes keeping this in this form
226 // better.
227 const char *CommentTextStart = CurPtr;
228 int CurChar = getNextChar();
229 while (CurChar != '\n' && CurChar != '\r' && CurChar != EOF)
230 CurChar = getNextChar();
231 const char *NewlinePtr = CurPtr;
232 if (CurChar == '\r' && CurPtr != CurBuf.end() && *CurPtr == '\n')
233 ++CurPtr;
234
235 // If we have a CommentConsumer, notify it about the comment.
236 if (CommentConsumer) {
237 CommentConsumer->HandleComment(
238 Loc: SMLoc::getFromPointer(Ptr: CommentTextStart),
239 CommentText: StringRef(CommentTextStart, NewlinePtr - 1 - CommentTextStart));
240 }
241
242 IsAtStartOfLine = true;
243 // This is a whole line comment. leave newline
244 if (IsAtStartOfStatement)
245 return AsmToken(AsmToken::EndOfStatement,
246 StringRef(TokStart, CurPtr - TokStart));
247 IsAtStartOfStatement = true;
248
249 return AsmToken(AsmToken::EndOfStatement,
250 StringRef(TokStart, CurPtr - 1 - TokStart));
251}
252
/// Advance \p CurPtr past an optional integer type suffix.
///
/// Skips, case-insensitively, at most one 'U' followed by at most two 'L's
/// (i.e. ULL, UL, U, L and LL). \p CurPtr must point into a NUL-terminated
/// buffer; it is left unchanged when no suffix is present.
static void SkipIgnoredIntegerSuffix(const char *&CurPtr) {
  // Skip case-insensitive ULL, UL, U, L and LL suffixes.
  if (CurPtr[0] == 'U' || CurPtr[0] == 'u')
    ++CurPtr;
  if (CurPtr[0] == 'L' || CurPtr[0] == 'l')
    ++CurPtr;
  if (CurPtr[0] == 'L' || CurPtr[0] == 'l')
    ++CurPtr;
}
262
263// Look ahead to search for first non-hex digit, if it's [hH], then we treat the
264// integer as a hexadecimal, possibly with leading zeroes.
265static unsigned doHexLookAhead(const char *&CurPtr, unsigned DefaultRadix,
266 bool LexHex) {
267 const char *FirstNonDec = nullptr;
268 const char *LookAhead = CurPtr;
269 while (true) {
270 if (isDigit(C: *LookAhead)) {
271 ++LookAhead;
272 } else {
273 if (!FirstNonDec)
274 FirstNonDec = LookAhead;
275
276 // Keep going if we are looking for a 'h' suffix.
277 if (LexHex && isHexDigit(C: *LookAhead))
278 ++LookAhead;
279 else
280 break;
281 }
282 }
283 bool isHex = LexHex && (*LookAhead == 'h' || *LookAhead == 'H');
284 CurPtr = isHex || !FirstNonDec ? LookAhead : FirstNonDec;
285 if (isHex)
286 return 16;
287 return DefaultRadix;
288}
289
290static const char *findLastDigit(const char *CurPtr, unsigned DefaultRadix) {
291 while (hexDigitValue(C: *CurPtr) < DefaultRadix) {
292 ++CurPtr;
293 }
294 return CurPtr;
295}
296
297static AsmToken intToken(StringRef Ref, APInt &Value) {
298 if (Value.isIntN(N: 64))
299 return AsmToken(AsmToken::Integer, Ref, Value);
300 return AsmToken(AsmToken::BigNum, Ref, Value);
301}
302
/// Return a human-readable name for \p Radix, for use in diagnostics.
///
/// \returns "binary", "octal", "decimal" or "hexadecimal" for radix 2, 8, 10
///          and 16 respectively, and "base-<N>" for any other radix.
static std::string radixName(unsigned Radix) {
  switch (Radix) {
  case 2:
    return "binary";
  case 8:
    return "octal";
  case 10:
    return "decimal";
  case 16:
    return "hexadecimal";
  default:
    return "base-" + std::to_string(Radix);
  }
}
317
318/// LexDigit: First character is [0-9].
319/// Local Label: [0-9][:]
320/// Forward/Backward Label: [0-9][fb]
321/// Binary integer: 0b[01]+
322/// Octal integer: 0[0-7]+
323/// Hex integer: 0x[0-9a-fA-F]+ or [0x]?[0-9][0-9a-fA-F]*[hH]
324/// Decimal integer: [1-9][0-9]*
325AsmToken AsmLexer::LexDigit() {
326 // MASM-flavor binary integer: [01]+[yY] (if DefaultRadix < 16, [bByY])
327 // MASM-flavor octal integer: [0-7]+[oOqQ]
328 // MASM-flavor decimal integer: [0-9]+[tT] (if DefaultRadix < 16, [dDtT])
329 // MASM-flavor hexadecimal integer: [0-9][0-9a-fA-F]*[hH]
330 if (LexMasmIntegers && isdigit(CurPtr[-1])) {
331 const char *FirstNonBinary =
332 (CurPtr[-1] != '0' && CurPtr[-1] != '1') ? CurPtr - 1 : nullptr;
333 const char *FirstNonDecimal =
334 (CurPtr[-1] < '0' || CurPtr[-1] > '9') ? CurPtr - 1 : nullptr;
335 const char *OldCurPtr = CurPtr;
336 while (isHexDigit(C: *CurPtr)) {
337 switch (*CurPtr) {
338 default:
339 if (!FirstNonDecimal) {
340 FirstNonDecimal = CurPtr;
341 }
342 [[fallthrough]];
343 case '9':
344 case '8':
345 case '7':
346 case '6':
347 case '5':
348 case '4':
349 case '3':
350 case '2':
351 if (!FirstNonBinary) {
352 FirstNonBinary = CurPtr;
353 }
354 break;
355 case '1':
356 case '0':
357 break;
358 }
359 ++CurPtr;
360 }
361 if (*CurPtr == '.') {
362 // MASM float literals (other than hex floats) always contain a ".", and
363 // are always written in decimal.
364 ++CurPtr;
365 return LexFloatLiteral();
366 }
367
368 if (LexMasmHexFloats && (*CurPtr == 'r' || *CurPtr == 'R')) {
369 ++CurPtr;
370 return AsmToken(AsmToken::Real, StringRef(TokStart, CurPtr - TokStart));
371 }
372
373 unsigned Radix = 0;
374 if (*CurPtr == 'h' || *CurPtr == 'H') {
375 // hexadecimal number
376 ++CurPtr;
377 Radix = 16;
378 } else if (*CurPtr == 't' || *CurPtr == 'T') {
379 // decimal number
380 ++CurPtr;
381 Radix = 10;
382 } else if (*CurPtr == 'o' || *CurPtr == 'O' || *CurPtr == 'q' ||
383 *CurPtr == 'Q') {
384 // octal number
385 ++CurPtr;
386 Radix = 8;
387 } else if (*CurPtr == 'y' || *CurPtr == 'Y') {
388 // binary number
389 ++CurPtr;
390 Radix = 2;
391 } else if (FirstNonDecimal && FirstNonDecimal + 1 == CurPtr &&
392 DefaultRadix < 14 &&
393 (*FirstNonDecimal == 'd' || *FirstNonDecimal == 'D')) {
394 Radix = 10;
395 } else if (FirstNonBinary && FirstNonBinary + 1 == CurPtr &&
396 DefaultRadix < 12 &&
397 (*FirstNonBinary == 'b' || *FirstNonBinary == 'B')) {
398 Radix = 2;
399 }
400
401 if (Radix) {
402 StringRef Result(TokStart, CurPtr - TokStart);
403 APInt Value(128, 0, true);
404
405 if (Result.drop_back().getAsInteger(Radix, Result&: Value))
406 return ReturnError(Loc: TokStart, Msg: "invalid " + radixName(Radix) + " number");
407
408 // MSVC accepts and ignores type suffices on integer literals.
409 SkipIgnoredIntegerSuffix(CurPtr);
410
411 return intToken(Ref: Result, Value);
412 }
413
414 // default-radix integers, or floating point numbers, fall through
415 CurPtr = OldCurPtr;
416 }
417
418 // MASM default-radix integers: [0-9a-fA-F]+
419 // (All other integer literals have a radix specifier.)
420 if (LexMasmIntegers && UseMasmDefaultRadix) {
421 CurPtr = findLastDigit(CurPtr, DefaultRadix: 16);
422 StringRef Result(TokStart, CurPtr - TokStart);
423
424 APInt Value(128, 0, true);
425 if (Result.getAsInteger(Radix: DefaultRadix, Result&: Value)) {
426 return ReturnError(Loc: TokStart,
427 Msg: "invalid " + radixName(Radix: DefaultRadix) + " number");
428 }
429
430 return intToken(Ref: Result, Value);
431 }
432
433 // Motorola hex integers: $[0-9a-fA-F]+
434 if (LexMotorolaIntegers && CurPtr[-1] == '$') {
435 const char *NumStart = CurPtr;
436 while (isHexDigit(C: CurPtr[0]))
437 ++CurPtr;
438
439 APInt Result(128, 0);
440 if (StringRef(NumStart, CurPtr - NumStart).getAsInteger(Radix: 16, Result))
441 return ReturnError(Loc: TokStart, Msg: "invalid hexadecimal number");
442
443 return intToken(Ref: StringRef(TokStart, CurPtr - TokStart), Value&: Result);
444 }
445
446 // Motorola binary integers: %[01]+
447 if (LexMotorolaIntegers && CurPtr[-1] == '%') {
448 const char *NumStart = CurPtr;
449 while (*CurPtr == '0' || *CurPtr == '1')
450 ++CurPtr;
451
452 APInt Result(128, 0);
453 if (StringRef(NumStart, CurPtr - NumStart).getAsInteger(Radix: 2, Result))
454 return ReturnError(Loc: TokStart, Msg: "invalid binary number");
455
456 return intToken(Ref: StringRef(TokStart, CurPtr - TokStart), Value&: Result);
457 }
458
459 // Decimal integer: [1-9][0-9]*
460 // HLASM-flavour decimal integer: [0-9][0-9]*
461 // FIXME: Later on, support for fb for HLASM has to be added in
462 // as they probably would be needed for asm goto
463 if (LexHLASMIntegers || CurPtr[-1] != '0' || CurPtr[0] == '.') {
464 unsigned Radix = doHexLookAhead(CurPtr, DefaultRadix: 10, LexHex: LexMasmIntegers);
465
466 if (!LexHLASMIntegers) {
467 bool IsHex = Radix == 16;
468 // Check for floating point literals.
469 if (!IsHex && (*CurPtr == '.' || *CurPtr == 'e' || *CurPtr == 'E')) {
470 if (*CurPtr == '.')
471 ++CurPtr;
472 return LexFloatLiteral();
473 }
474 }
475
476 StringRef Result(TokStart, CurPtr - TokStart);
477
478 APInt Value(128, 0, true);
479 if (Result.getAsInteger(Radix, Result&: Value))
480 return ReturnError(Loc: TokStart, Msg: "invalid " + radixName(Radix) + " number");
481
482 if (!LexHLASMIntegers)
483 // The darwin/x86 (and x86-64) assembler accepts and ignores type
484 // suffices on integer literals.
485 SkipIgnoredIntegerSuffix(CurPtr);
486
487 return intToken(Ref: Result, Value);
488 }
489
490 if (!LexMasmIntegers && ((*CurPtr == 'b') || (*CurPtr == 'B'))) {
491 ++CurPtr;
492 // See if we actually have "0b" as part of something like "jmp 0b\n"
493 if (!isDigit(C: CurPtr[0])) {
494 --CurPtr;
495 StringRef Result(TokStart, CurPtr - TokStart);
496 return AsmToken(AsmToken::Integer, Result, 0);
497 }
498 const char *NumStart = CurPtr;
499 while (CurPtr[0] == '0' || CurPtr[0] == '1')
500 ++CurPtr;
501
502 // Requires at least one binary digit.
503 if (CurPtr == NumStart)
504 return ReturnError(Loc: TokStart, Msg: "invalid binary number");
505
506 StringRef Result(TokStart, CurPtr - TokStart);
507
508 APInt Value(128, 0, true);
509 if (Result.substr(Start: 2).getAsInteger(Radix: 2, Result&: Value))
510 return ReturnError(Loc: TokStart, Msg: "invalid binary number");
511
512 // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
513 // suffixes on integer literals.
514 SkipIgnoredIntegerSuffix(CurPtr);
515
516 return intToken(Ref: Result, Value);
517 }
518
519 if ((*CurPtr == 'x') || (*CurPtr == 'X')) {
520 ++CurPtr;
521 const char *NumStart = CurPtr;
522 while (isHexDigit(C: CurPtr[0]))
523 ++CurPtr;
524
525 // "0x.0p0" is valid, and "0x0p0" (but not "0xp0" for example, which will be
526 // diagnosed by LexHexFloatLiteral).
527 if (CurPtr[0] == '.' || CurPtr[0] == 'p' || CurPtr[0] == 'P')
528 return LexHexFloatLiteral(NoIntDigits: NumStart == CurPtr);
529
530 // Otherwise requires at least one hex digit.
531 if (CurPtr == NumStart)
532 return ReturnError(Loc: CurPtr-2, Msg: "invalid hexadecimal number");
533
534 APInt Result(128, 0);
535 if (StringRef(TokStart, CurPtr - TokStart).getAsInteger(Radix: 0, Result))
536 return ReturnError(Loc: TokStart, Msg: "invalid hexadecimal number");
537
538 // Consume the optional [hH].
539 if (LexMasmIntegers && (*CurPtr == 'h' || *CurPtr == 'H'))
540 ++CurPtr;
541
542 // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
543 // suffixes on integer literals.
544 SkipIgnoredIntegerSuffix(CurPtr);
545
546 return intToken(Ref: StringRef(TokStart, CurPtr - TokStart), Value&: Result);
547 }
548
549 // Either octal or hexadecimal.
550 APInt Value(128, 0, true);
551 unsigned Radix = doHexLookAhead(CurPtr, DefaultRadix: 8, LexHex: LexMasmIntegers);
552 StringRef Result(TokStart, CurPtr - TokStart);
553 if (Result.getAsInteger(Radix, Result&: Value))
554 return ReturnError(Loc: TokStart, Msg: "invalid " + radixName(Radix) + " number");
555
556 // Consume the [hH].
557 if (Radix == 16)
558 ++CurPtr;
559
560 // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
561 // suffixes on integer literals.
562 SkipIgnoredIntegerSuffix(CurPtr);
563
564 return intToken(Ref: Result, Value);
565}
566
567/// LexSingleQuote: Integer: 'b'
568AsmToken AsmLexer::LexSingleQuote() {
569 int CurChar = getNextChar();
570
571 if (LexHLASMStrings)
572 return ReturnError(Loc: TokStart, Msg: "invalid usage of character literals");
573
574 if (LexMasmStrings) {
575 while (CurChar != EOF) {
576 if (CurChar != '\'') {
577 CurChar = getNextChar();
578 } else if (peekNextChar() == '\'') {
579 // In MASM single-quote strings, doubled single-quotes mean an escaped
580 // single quote, so should be lexed in.
581 (void)getNextChar();
582 CurChar = getNextChar();
583 } else {
584 break;
585 }
586 }
587 if (CurChar == EOF)
588 return ReturnError(Loc: TokStart, Msg: "unterminated string constant");
589 return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart));
590 }
591
592 if (CurChar == '\\')
593 CurChar = getNextChar();
594
595 if (CurChar == EOF)
596 return ReturnError(Loc: TokStart, Msg: "unterminated single quote");
597
598 CurChar = getNextChar();
599
600 if (CurChar != '\'')
601 return ReturnError(Loc: TokStart, Msg: "single quote way too long");
602
603 // The idea here being that 'c' is basically just an integral
604 // constant.
605 StringRef Res = StringRef(TokStart,CurPtr - TokStart);
606 long long Value;
607
608 if (Res.starts_with(Prefix: "\'\\")) {
609 char theChar = Res[2];
610 switch (theChar) {
611 default: Value = theChar; break;
612 case '\'': Value = '\''; break;
613 case 't': Value = '\t'; break;
614 case 'n': Value = '\n'; break;
615 case 'b': Value = '\b'; break;
616 case 'f': Value = '\f'; break;
617 case 'r': Value = '\r'; break;
618 }
619 } else
620 Value = TokStart[1];
621
622 return AsmToken(AsmToken::Integer, Res, Value);
623}
624
625/// LexQuote: String: "..."
626AsmToken AsmLexer::LexQuote() {
627 int CurChar = getNextChar();
628 if (LexHLASMStrings)
629 return ReturnError(Loc: TokStart, Msg: "invalid usage of string literals");
630
631 if (LexMasmStrings) {
632 while (CurChar != EOF) {
633 if (CurChar != '"') {
634 CurChar = getNextChar();
635 } else if (peekNextChar() == '"') {
636 // In MASM double-quoted strings, doubled double-quotes mean an escaped
637 // double quote, so should be lexed in.
638 (void)getNextChar();
639 CurChar = getNextChar();
640 } else {
641 break;
642 }
643 }
644 if (CurChar == EOF)
645 return ReturnError(Loc: TokStart, Msg: "unterminated string constant");
646 return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart));
647 }
648
649 while (CurChar != '"') {
650 if (CurChar == '\\') {
651 // Allow \", etc.
652 CurChar = getNextChar();
653 }
654
655 if (CurChar == EOF)
656 return ReturnError(Loc: TokStart, Msg: "unterminated string constant");
657
658 CurChar = getNextChar();
659 }
660
661 return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart));
662}
663
664StringRef AsmLexer::LexUntilEndOfStatement() {
665 TokStart = CurPtr;
666
667 while (!isAtStartOfComment(Ptr: CurPtr) && // Start of line comment.
668 !isAtStatementSeparator(Ptr: CurPtr) && // End of statement marker.
669 *CurPtr != '\n' && *CurPtr != '\r' && CurPtr != CurBuf.end()) {
670 ++CurPtr;
671 }
672 return StringRef(TokStart, CurPtr-TokStart);
673}
674
675StringRef AsmLexer::LexUntilEndOfLine() {
676 TokStart = CurPtr;
677
678 while (*CurPtr != '\n' && *CurPtr != '\r' && CurPtr != CurBuf.end()) {
679 ++CurPtr;
680 }
681 return StringRef(TokStart, CurPtr-TokStart);
682}
683
684size_t AsmLexer::peekTokens(MutableArrayRef<AsmToken> Buf,
685 bool ShouldSkipSpace) {
686 SaveAndRestore SavedTokenStart(TokStart);
687 SaveAndRestore SavedCurPtr(CurPtr);
688 SaveAndRestore SavedAtStartOfLine(IsAtStartOfLine);
689 SaveAndRestore SavedAtStartOfStatement(IsAtStartOfStatement);
690 SaveAndRestore SavedSkipSpace(SkipSpace, ShouldSkipSpace);
691 SaveAndRestore SavedIsPeeking(IsPeeking, true);
692 std::string SavedErr = getErr();
693 SMLoc SavedErrLoc = getErrLoc();
694
695 size_t ReadCount;
696 for (ReadCount = 0; ReadCount < Buf.size(); ++ReadCount) {
697 AsmToken Token = LexToken();
698
699 Buf[ReadCount] = Token;
700
701 if (Token.is(K: AsmToken::Eof))
702 break;
703 }
704
705 SetError(errLoc: SavedErrLoc, err: SavedErr);
706 return ReadCount;
707}
708
709bool AsmLexer::isAtStartOfComment(const char *Ptr) {
710 if (MAI.getRestrictCommentStringToStartOfStatement() && !IsAtStartOfStatement)
711 return false;
712
713 StringRef CommentString = MAI.getCommentString();
714
715 if (CommentString.size() == 1)
716 return CommentString[0] == Ptr[0];
717
718 // Allow # preprocessor comments also be counted as comments for "##" cases
719 if (CommentString[1] == '#')
720 return CommentString[0] == Ptr[0];
721
722 return strncmp(s1: Ptr, s2: CommentString.data(), n: CommentString.size()) == 0;
723}
724
725bool AsmLexer::isAtStatementSeparator(const char *Ptr) {
726 return strncmp(s1: Ptr, s2: MAI.getSeparatorString(),
727 n: strlen(s: MAI.getSeparatorString())) == 0;
728}
729
730AsmToken AsmLexer::LexToken() {
731 TokStart = CurPtr;
732 // This always consumes at least one character.
733 int CurChar = getNextChar();
734
735 if (!IsPeeking && CurChar == '#' && IsAtStartOfStatement) {
736 // If this starts with a '#', this may be a cpp
737 // hash directive and otherwise a line comment.
738 AsmToken TokenBuf[2];
739 MutableArrayRef<AsmToken> Buf(TokenBuf, 2);
740 size_t num = peekTokens(Buf, ShouldSkipSpace: true);
741 // There cannot be a space preceding this
742 if (IsAtStartOfLine && num == 2 && TokenBuf[0].is(K: AsmToken::Integer) &&
743 TokenBuf[1].is(K: AsmToken::String)) {
744 CurPtr = TokStart; // reset curPtr;
745 StringRef s = LexUntilEndOfLine();
746 UnLex(Token: TokenBuf[1]);
747 UnLex(Token: TokenBuf[0]);
748 return AsmToken(AsmToken::HashDirective, s);
749 }
750
751 if (MAI.shouldAllowAdditionalComments())
752 return LexLineComment();
753 }
754
755 if (isAtStartOfComment(Ptr: TokStart))
756 return LexLineComment();
757
758 if (isAtStatementSeparator(Ptr: TokStart)) {
759 CurPtr += strlen(s: MAI.getSeparatorString()) - 1;
760 IsAtStartOfLine = true;
761 IsAtStartOfStatement = true;
762 return AsmToken(AsmToken::EndOfStatement,
763 StringRef(TokStart, strlen(s: MAI.getSeparatorString())));
764 }
765
766 // If we're missing a newline at EOF, make sure we still get an
767 // EndOfStatement token before the Eof token.
768 if (CurChar == EOF && !IsAtStartOfStatement && EndStatementAtEOF) {
769 IsAtStartOfLine = true;
770 IsAtStartOfStatement = true;
771 return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 0));
772 }
773 IsAtStartOfLine = false;
774 bool OldIsAtStartOfStatement = IsAtStartOfStatement;
775 IsAtStartOfStatement = false;
776 switch (CurChar) {
777 default:
778 // Handle identifier: [a-zA-Z_.$@#?][a-zA-Z0-9_.$@#?]*
779 // Whether or not the lexer accepts '$', '@', '#' and '?' at the start of
780 // an identifier is target-dependent. These characters are handled in the
781 // respective switch cases.
782 if (isalpha(CurChar) || CurChar == '_' || CurChar == '.')
783 return LexIdentifier();
784
785 // Unknown character, emit an error.
786 return ReturnError(Loc: TokStart, Msg: "invalid character in input");
787 case EOF:
788 if (EndStatementAtEOF) {
789 IsAtStartOfLine = true;
790 IsAtStartOfStatement = true;
791 }
792 return AsmToken(AsmToken::Eof, StringRef(TokStart, 0));
793 case 0:
794 case ' ':
795 case '\t':
796 IsAtStartOfStatement = OldIsAtStartOfStatement;
797 while (*CurPtr == ' ' || *CurPtr == '\t')
798 CurPtr++;
799 if (SkipSpace)
800 return LexToken(); // Ignore whitespace.
801 else
802 return AsmToken(AsmToken::Space, StringRef(TokStart, CurPtr - TokStart));
803 case '\r': {
804 IsAtStartOfLine = true;
805 IsAtStartOfStatement = true;
806 // If this is a CR followed by LF, treat that as one token.
807 if (CurPtr != CurBuf.end() && *CurPtr == '\n')
808 ++CurPtr;
809 return AsmToken(AsmToken::EndOfStatement,
810 StringRef(TokStart, CurPtr - TokStart));
811 }
812 case '\n':
813 IsAtStartOfLine = true;
814 IsAtStartOfStatement = true;
815 return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 1));
816 case ':': return AsmToken(AsmToken::Colon, StringRef(TokStart, 1));
817 case '+': return AsmToken(AsmToken::Plus, StringRef(TokStart, 1));
818 case '~': return AsmToken(AsmToken::Tilde, StringRef(TokStart, 1));
819 case '(': return AsmToken(AsmToken::LParen, StringRef(TokStart, 1));
820 case ')': return AsmToken(AsmToken::RParen, StringRef(TokStart, 1));
821 case '[': return AsmToken(AsmToken::LBrac, StringRef(TokStart, 1));
822 case ']': return AsmToken(AsmToken::RBrac, StringRef(TokStart, 1));
823 case '{': return AsmToken(AsmToken::LCurly, StringRef(TokStart, 1));
824 case '}': return AsmToken(AsmToken::RCurly, StringRef(TokStart, 1));
825 case '*': return AsmToken(AsmToken::Star, StringRef(TokStart, 1));
826 case ',': return AsmToken(AsmToken::Comma, StringRef(TokStart, 1));
827 case '$': {
828 if (LexMotorolaIntegers && isHexDigit(C: *CurPtr))
829 return LexDigit();
830 if (MAI.doesAllowDollarAtStartOfIdentifier())
831 return LexIdentifier();
832 return AsmToken(AsmToken::Dollar, StringRef(TokStart, 1));
833 }
834 case '@':
835 if (MAI.doesAllowAtAtStartOfIdentifier())
836 return LexIdentifier();
837 return AsmToken(AsmToken::At, StringRef(TokStart, 1));
838 case '#':
839 if (MAI.doesAllowHashAtStartOfIdentifier())
840 return LexIdentifier();
841 return AsmToken(AsmToken::Hash, StringRef(TokStart, 1));
842 case '?':
843 if (MAI.doesAllowQuestionAtStartOfIdentifier())
844 return LexIdentifier();
845 return AsmToken(AsmToken::Question, StringRef(TokStart, 1));
846 case '\\': return AsmToken(AsmToken::BackSlash, StringRef(TokStart, 1));
847 case '=':
848 if (*CurPtr == '=') {
849 ++CurPtr;
850 return AsmToken(AsmToken::EqualEqual, StringRef(TokStart, 2));
851 }
852 return AsmToken(AsmToken::Equal, StringRef(TokStart, 1));
853 case '-':
854 if (*CurPtr == '>') {
855 ++CurPtr;
856 return AsmToken(AsmToken::MinusGreater, StringRef(TokStart, 2));
857 }
858 return AsmToken(AsmToken::Minus, StringRef(TokStart, 1));
859 case '|':
860 if (*CurPtr == '|') {
861 ++CurPtr;
862 return AsmToken(AsmToken::PipePipe, StringRef(TokStart, 2));
863 }
864 return AsmToken(AsmToken::Pipe, StringRef(TokStart, 1));
865 case '^': return AsmToken(AsmToken::Caret, StringRef(TokStart, 1));
866 case '&':
867 if (*CurPtr == '&') {
868 ++CurPtr;
869 return AsmToken(AsmToken::AmpAmp, StringRef(TokStart, 2));
870 }
871 return AsmToken(AsmToken::Amp, StringRef(TokStart, 1));
872 case '!':
873 if (*CurPtr == '=') {
874 ++CurPtr;
875 return AsmToken(AsmToken::ExclaimEqual, StringRef(TokStart, 2));
876 }
877 return AsmToken(AsmToken::Exclaim, StringRef(TokStart, 1));
878 case '%':
879 if (LexMotorolaIntegers && (*CurPtr == '0' || *CurPtr == '1')) {
880 return LexDigit();
881 }
882
883 if (MAI.hasMipsExpressions()) {
884 AsmToken::TokenKind Operator;
885 unsigned OperatorLength;
886
887 std::tie(args&: Operator, args&: OperatorLength) =
888 StringSwitch<std::pair<AsmToken::TokenKind, unsigned>>(
889 StringRef(CurPtr))
890 .StartsWith(S: "call16", Value: {AsmToken::PercentCall16, 7})
891 .StartsWith(S: "call_hi", Value: {AsmToken::PercentCall_Hi, 8})
892 .StartsWith(S: "call_lo", Value: {AsmToken::PercentCall_Lo, 8})
893 .StartsWith(S: "dtprel_hi", Value: {AsmToken::PercentDtprel_Hi, 10})
894 .StartsWith(S: "dtprel_lo", Value: {AsmToken::PercentDtprel_Lo, 10})
895 .StartsWith(S: "got_disp", Value: {AsmToken::PercentGot_Disp, 9})
896 .StartsWith(S: "got_hi", Value: {AsmToken::PercentGot_Hi, 7})
897 .StartsWith(S: "got_lo", Value: {AsmToken::PercentGot_Lo, 7})
898 .StartsWith(S: "got_ofst", Value: {AsmToken::PercentGot_Ofst, 9})
899 .StartsWith(S: "got_page", Value: {AsmToken::PercentGot_Page, 9})
900 .StartsWith(S: "gottprel", Value: {AsmToken::PercentGottprel, 9})
901 .StartsWith(S: "got", Value: {AsmToken::PercentGot, 4})
902 .StartsWith(S: "gp_rel", Value: {AsmToken::PercentGp_Rel, 7})
903 .StartsWith(S: "higher", Value: {AsmToken::PercentHigher, 7})
904 .StartsWith(S: "highest", Value: {AsmToken::PercentHighest, 8})
905 .StartsWith(S: "hi", Value: {AsmToken::PercentHi, 3})
906 .StartsWith(S: "lo", Value: {AsmToken::PercentLo, 3})
907 .StartsWith(S: "neg", Value: {AsmToken::PercentNeg, 4})
908 .StartsWith(S: "pcrel_hi", Value: {AsmToken::PercentPcrel_Hi, 9})
909 .StartsWith(S: "pcrel_lo", Value: {AsmToken::PercentPcrel_Lo, 9})
910 .StartsWith(S: "tlsgd", Value: {AsmToken::PercentTlsgd, 6})
911 .StartsWith(S: "tlsldm", Value: {AsmToken::PercentTlsldm, 7})
912 .StartsWith(S: "tprel_hi", Value: {AsmToken::PercentTprel_Hi, 9})
913 .StartsWith(S: "tprel_lo", Value: {AsmToken::PercentTprel_Lo, 9})
914 .Default(Value: {AsmToken::Percent, 1});
915
916 if (Operator != AsmToken::Percent) {
917 CurPtr += OperatorLength - 1;
918 return AsmToken(Operator, StringRef(TokStart, OperatorLength));
919 }
920 }
921 return AsmToken(AsmToken::Percent, StringRef(TokStart, 1));
922 case '/':
923 IsAtStartOfStatement = OldIsAtStartOfStatement;
924 return LexSlash();
925 case '\'': return LexSingleQuote();
926 case '"': return LexQuote();
927 case '0': case '1': case '2': case '3': case '4':
928 case '5': case '6': case '7': case '8': case '9':
929 return LexDigit();
930 case '<':
931 switch (*CurPtr) {
932 case '<':
933 ++CurPtr;
934 return AsmToken(AsmToken::LessLess, StringRef(TokStart, 2));
935 case '=':
936 ++CurPtr;
937 return AsmToken(AsmToken::LessEqual, StringRef(TokStart, 2));
938 case '>':
939 ++CurPtr;
940 return AsmToken(AsmToken::LessGreater, StringRef(TokStart, 2));
941 default:
942 return AsmToken(AsmToken::Less, StringRef(TokStart, 1));
943 }
944 case '>':
945 switch (*CurPtr) {
946 case '>':
947 ++CurPtr;
948 return AsmToken(AsmToken::GreaterGreater, StringRef(TokStart, 2));
949 case '=':
950 ++CurPtr;
951 return AsmToken(AsmToken::GreaterEqual, StringRef(TokStart, 2));
952 default:
953 return AsmToken(AsmToken::Greater, StringRef(TokStart, 1));
954 }
955
956 // TODO: Quoted identifiers (objc methods etc)
957 // local labels: [0-9][:]
958 // Forward/backward labels: [0-9][fb]
959 // Integers, fp constants, character constants.
960 }
961}
962