1 | //===- AsmLexer.cpp - Lexer for Assembly Files ----------------------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This class implements the lexer for assembly files. |
10 | // |
11 | //===----------------------------------------------------------------------===// |
12 | |
13 | #include "llvm/MC/MCParser/AsmLexer.h" |
14 | #include "llvm/ADT/APInt.h" |
15 | #include "llvm/ADT/ArrayRef.h" |
16 | #include "llvm/ADT/StringExtras.h" |
17 | #include "llvm/ADT/StringRef.h" |
18 | #include "llvm/ADT/StringSwitch.h" |
19 | #include "llvm/MC/MCAsmInfo.h" |
20 | #include "llvm/MC/MCParser/MCAsmLexer.h" |
21 | #include "llvm/Support/Compiler.h" |
22 | #include "llvm/Support/SMLoc.h" |
23 | #include "llvm/Support/SaveAndRestore.h" |
24 | #include <cassert> |
25 | #include <cctype> |
26 | #include <cstdio> |
27 | #include <cstring> |
28 | #include <string> |
29 | #include <tuple> |
30 | #include <utility> |
31 | |
32 | using namespace llvm; |
33 | |
34 | AsmLexer::AsmLexer(const MCAsmInfo &MAI) : MAI(MAI) { |
35 | AllowAtInIdentifier = !StringRef(MAI.getCommentString()).starts_with(Prefix: "@" ); |
36 | LexMotorolaIntegers = MAI.shouldUseMotorolaIntegers(); |
37 | } |
38 | |
39 | AsmLexer::~AsmLexer() = default; |
40 | |
41 | void AsmLexer::setBuffer(StringRef Buf, const char *ptr, |
42 | bool EndStatementAtEOF) { |
43 | CurBuf = Buf; |
44 | |
45 | if (ptr) |
46 | CurPtr = ptr; |
47 | else |
48 | CurPtr = CurBuf.begin(); |
49 | |
50 | TokStart = nullptr; |
51 | this->EndStatementAtEOF = EndStatementAtEOF; |
52 | } |
53 | |
54 | /// ReturnError - Set the error to the specified string at the specified |
55 | /// location. This is defined to always return AsmToken::Error. |
56 | AsmToken AsmLexer::ReturnError(const char *Loc, const std::string &Msg) { |
57 | SetError(errLoc: SMLoc::getFromPointer(Ptr: Loc), err: Msg); |
58 | |
59 | return AsmToken(AsmToken::Error, StringRef(Loc, CurPtr - Loc)); |
60 | } |
61 | |
62 | int AsmLexer::getNextChar() { |
63 | if (CurPtr == CurBuf.end()) |
64 | return EOF; |
65 | return (unsigned char)*CurPtr++; |
66 | } |
67 | |
68 | int AsmLexer::peekNextChar() { |
69 | if (CurPtr == CurBuf.end()) |
70 | return EOF; |
71 | return (unsigned char)*CurPtr; |
72 | } |
73 | |
74 | /// The leading integral digit sequence and dot should have already been |
75 | /// consumed, some or all of the fractional digit sequence *can* have been |
76 | /// consumed. |
77 | AsmToken AsmLexer::LexFloatLiteral() { |
78 | // Skip the fractional digit sequence. |
79 | while (isDigit(C: *CurPtr)) |
80 | ++CurPtr; |
81 | |
82 | if (*CurPtr == '-' || *CurPtr == '+') |
83 | return ReturnError(Loc: CurPtr, Msg: "invalid sign in float literal" ); |
84 | |
85 | // Check for exponent |
86 | if ((*CurPtr == 'e' || *CurPtr == 'E')) { |
87 | ++CurPtr; |
88 | |
89 | if (*CurPtr == '-' || *CurPtr == '+') |
90 | ++CurPtr; |
91 | |
92 | while (isDigit(C: *CurPtr)) |
93 | ++CurPtr; |
94 | } |
95 | |
96 | return AsmToken(AsmToken::Real, |
97 | StringRef(TokStart, CurPtr - TokStart)); |
98 | } |
99 | |
100 | /// LexHexFloatLiteral matches essentially (.[0-9a-fA-F]*)?[pP][+-]?[0-9a-fA-F]+ |
101 | /// while making sure there are enough actual digits around for the constant to |
102 | /// be valid. |
103 | /// |
104 | /// The leading "0x[0-9a-fA-F]*" (i.e. integer part) has already been consumed |
105 | /// before we get here. |
106 | AsmToken AsmLexer::LexHexFloatLiteral(bool NoIntDigits) { |
107 | assert((*CurPtr == 'p' || *CurPtr == 'P' || *CurPtr == '.') && |
108 | "unexpected parse state in floating hex" ); |
109 | bool NoFracDigits = true; |
110 | |
111 | // Skip the fractional part if there is one |
112 | if (*CurPtr == '.') { |
113 | ++CurPtr; |
114 | |
115 | const char *FracStart = CurPtr; |
116 | while (isHexDigit(C: *CurPtr)) |
117 | ++CurPtr; |
118 | |
119 | NoFracDigits = CurPtr == FracStart; |
120 | } |
121 | |
122 | if (NoIntDigits && NoFracDigits) |
123 | return ReturnError(Loc: TokStart, Msg: "invalid hexadecimal floating-point constant: " |
124 | "expected at least one significand digit" ); |
125 | |
126 | // Make sure we do have some kind of proper exponent part |
127 | if (*CurPtr != 'p' && *CurPtr != 'P') |
128 | return ReturnError(Loc: TokStart, Msg: "invalid hexadecimal floating-point constant: " |
129 | "expected exponent part 'p'" ); |
130 | ++CurPtr; |
131 | |
132 | if (*CurPtr == '+' || *CurPtr == '-') |
133 | ++CurPtr; |
134 | |
135 | // N.b. exponent digits are *not* hex |
136 | const char *ExpStart = CurPtr; |
137 | while (isDigit(C: *CurPtr)) |
138 | ++CurPtr; |
139 | |
140 | if (CurPtr == ExpStart) |
141 | return ReturnError(Loc: TokStart, Msg: "invalid hexadecimal floating-point constant: " |
142 | "expected at least one exponent digit" ); |
143 | |
144 | return AsmToken(AsmToken::Real, StringRef(TokStart, CurPtr - TokStart)); |
145 | } |
146 | |
147 | /// LexIdentifier: [a-zA-Z_$.@?][a-zA-Z0-9_$.@#?]* |
148 | static bool isIdentifierChar(char C, bool AllowAt, bool AllowHash) { |
149 | return isAlnum(C) || C == '_' || C == '$' || C == '.' || C == '?' || |
150 | (AllowAt && C == '@') || (AllowHash && C == '#'); |
151 | } |
152 | |
153 | AsmToken AsmLexer::LexIdentifier() { |
154 | // Check for floating point literals. |
155 | if (CurPtr[-1] == '.' && isDigit(C: *CurPtr)) { |
156 | // Disambiguate a .1243foo identifier from a floating literal. |
157 | while (isDigit(C: *CurPtr)) |
158 | ++CurPtr; |
159 | |
160 | if (!isIdentifierChar(C: *CurPtr, AllowAt: AllowAtInIdentifier, |
161 | AllowHash: AllowHashInIdentifier) || |
162 | *CurPtr == 'e' || *CurPtr == 'E') |
163 | return LexFloatLiteral(); |
164 | } |
165 | |
166 | while (isIdentifierChar(C: *CurPtr, AllowAt: AllowAtInIdentifier, AllowHash: AllowHashInIdentifier)) |
167 | ++CurPtr; |
168 | |
169 | // Handle . as a special case. |
170 | if (CurPtr == TokStart+1 && TokStart[0] == '.') |
171 | return AsmToken(AsmToken::Dot, StringRef(TokStart, 1)); |
172 | |
173 | return AsmToken(AsmToken::Identifier, StringRef(TokStart, CurPtr - TokStart)); |
174 | } |
175 | |
176 | /// LexSlash: Slash: / |
177 | /// C-Style Comment: /* ... */ |
178 | /// C-style Comment: // ... |
179 | AsmToken AsmLexer::LexSlash() { |
180 | if (!MAI.shouldAllowAdditionalComments()) { |
181 | IsAtStartOfStatement = false; |
182 | return AsmToken(AsmToken::Slash, StringRef(TokStart, 1)); |
183 | } |
184 | |
185 | switch (*CurPtr) { |
186 | case '*': |
187 | IsAtStartOfStatement = false; |
188 | break; // C style comment. |
189 | case '/': |
190 | ++CurPtr; |
191 | return LexLineComment(); |
192 | default: |
193 | IsAtStartOfStatement = false; |
194 | return AsmToken(AsmToken::Slash, StringRef(TokStart, 1)); |
195 | } |
196 | |
197 | // C Style comment. |
198 | ++CurPtr; // skip the star. |
199 | const char * = CurPtr; |
200 | while (CurPtr != CurBuf.end()) { |
201 | switch (*CurPtr++) { |
202 | case '*': |
203 | // End of the comment? |
204 | if (*CurPtr != '/') |
205 | break; |
206 | // If we have a CommentConsumer, notify it about the comment. |
207 | if (CommentConsumer) { |
208 | CommentConsumer->HandleComment( |
209 | Loc: SMLoc::getFromPointer(Ptr: CommentTextStart), |
210 | CommentText: StringRef(CommentTextStart, CurPtr - 1 - CommentTextStart)); |
211 | } |
212 | ++CurPtr; // End the */. |
213 | return AsmToken(AsmToken::Comment, |
214 | StringRef(TokStart, CurPtr - TokStart)); |
215 | } |
216 | } |
217 | return ReturnError(Loc: TokStart, Msg: "unterminated comment" ); |
218 | } |
219 | |
220 | /// LexLineComment: Comment: #[^\n]* |
221 | /// : //[^\n]* |
222 | AsmToken AsmLexer::() { |
223 | // Mark This as an end of statement with a body of the |
224 | // comment. While it would be nicer to leave this two tokens, |
225 | // backwards compatability with TargetParsers makes keeping this in this form |
226 | // better. |
227 | const char * = CurPtr; |
228 | int CurChar = getNextChar(); |
229 | while (CurChar != '\n' && CurChar != '\r' && CurChar != EOF) |
230 | CurChar = getNextChar(); |
231 | const char *NewlinePtr = CurPtr; |
232 | if (CurChar == '\r' && CurPtr != CurBuf.end() && *CurPtr == '\n') |
233 | ++CurPtr; |
234 | |
235 | // If we have a CommentConsumer, notify it about the comment. |
236 | if (CommentConsumer) { |
237 | CommentConsumer->HandleComment( |
238 | Loc: SMLoc::getFromPointer(Ptr: CommentTextStart), |
239 | CommentText: StringRef(CommentTextStart, NewlinePtr - 1 - CommentTextStart)); |
240 | } |
241 | |
242 | IsAtStartOfLine = true; |
243 | // This is a whole line comment. leave newline |
244 | if (IsAtStartOfStatement) |
245 | return AsmToken(AsmToken::EndOfStatement, |
246 | StringRef(TokStart, CurPtr - TokStart)); |
247 | IsAtStartOfStatement = true; |
248 | |
249 | return AsmToken(AsmToken::EndOfStatement, |
250 | StringRef(TokStart, CurPtr - 1 - TokStart)); |
251 | } |
252 | |
/// Advance \p CurPtr past an optional integer type suffix: an optional 'U'
/// followed by up to two 'L's, all matched case-insensitively.
static void SkipIgnoredIntegerSuffix(const char *&CurPtr) {
  const auto SkipOne = [&CurPtr](char Upper, char Lower) {
    if (*CurPtr == Upper || *CurPtr == Lower)
      ++CurPtr;
  };

  SkipOne('U', 'u');
  SkipOne('L', 'l');
  SkipOne('L', 'l');
}
262 | |
263 | // Look ahead to search for first non-hex digit, if it's [hH], then we treat the |
264 | // integer as a hexadecimal, possibly with leading zeroes. |
265 | static unsigned doHexLookAhead(const char *&CurPtr, unsigned DefaultRadix, |
266 | bool LexHex) { |
267 | const char *FirstNonDec = nullptr; |
268 | const char *LookAhead = CurPtr; |
269 | while (true) { |
270 | if (isDigit(C: *LookAhead)) { |
271 | ++LookAhead; |
272 | } else { |
273 | if (!FirstNonDec) |
274 | FirstNonDec = LookAhead; |
275 | |
276 | // Keep going if we are looking for a 'h' suffix. |
277 | if (LexHex && isHexDigit(C: *LookAhead)) |
278 | ++LookAhead; |
279 | else |
280 | break; |
281 | } |
282 | } |
283 | bool isHex = LexHex && (*LookAhead == 'h' || *LookAhead == 'H'); |
284 | CurPtr = isHex || !FirstNonDec ? LookAhead : FirstNonDec; |
285 | if (isHex) |
286 | return 16; |
287 | return DefaultRadix; |
288 | } |
289 | |
290 | static const char *findLastDigit(const char *CurPtr, unsigned DefaultRadix) { |
291 | while (hexDigitValue(C: *CurPtr) < DefaultRadix) { |
292 | ++CurPtr; |
293 | } |
294 | return CurPtr; |
295 | } |
296 | |
297 | static AsmToken intToken(StringRef Ref, APInt &Value) { |
298 | if (Value.isIntN(N: 64)) |
299 | return AsmToken(AsmToken::Integer, Ref, Value); |
300 | return AsmToken(AsmToken::BigNum, Ref, Value); |
301 | } |
302 | |
/// Return a human readable name for \p Radix for use in diagnostics:
/// "binary", "octal", "decimal" or "hexadecimal" for 2, 8, 10 and 16, and
/// "base-N" for any other value.
static std::string radixName(unsigned Radix) {
  switch (Radix) {
  case 2:
    return "binary";
  case 8:
    return "octal";
  case 10:
    return "decimal";
  case 16:
    return "hexadecimal";
  default:
    return "base-" + std::to_string(Radix);
  }
}
317 | |
318 | /// LexDigit: First character is [0-9]. |
319 | /// Local Label: [0-9][:] |
320 | /// Forward/Backward Label: [0-9][fb] |
321 | /// Binary integer: 0b[01]+ |
322 | /// Octal integer: 0[0-7]+ |
323 | /// Hex integer: 0x[0-9a-fA-F]+ or [0x]?[0-9][0-9a-fA-F]*[hH] |
324 | /// Decimal integer: [1-9][0-9]* |
325 | AsmToken AsmLexer::LexDigit() { |
326 | // MASM-flavor binary integer: [01]+[yY] (if DefaultRadix < 16, [bByY]) |
327 | // MASM-flavor octal integer: [0-7]+[oOqQ] |
328 | // MASM-flavor decimal integer: [0-9]+[tT] (if DefaultRadix < 16, [dDtT]) |
329 | // MASM-flavor hexadecimal integer: [0-9][0-9a-fA-F]*[hH] |
330 | if (LexMasmIntegers && isdigit(CurPtr[-1])) { |
331 | const char *FirstNonBinary = |
332 | (CurPtr[-1] != '0' && CurPtr[-1] != '1') ? CurPtr - 1 : nullptr; |
333 | const char *FirstNonDecimal = |
334 | (CurPtr[-1] < '0' || CurPtr[-1] > '9') ? CurPtr - 1 : nullptr; |
335 | const char *OldCurPtr = CurPtr; |
336 | while (isHexDigit(C: *CurPtr)) { |
337 | switch (*CurPtr) { |
338 | default: |
339 | if (!FirstNonDecimal) { |
340 | FirstNonDecimal = CurPtr; |
341 | } |
342 | [[fallthrough]]; |
343 | case '9': |
344 | case '8': |
345 | case '7': |
346 | case '6': |
347 | case '5': |
348 | case '4': |
349 | case '3': |
350 | case '2': |
351 | if (!FirstNonBinary) { |
352 | FirstNonBinary = CurPtr; |
353 | } |
354 | break; |
355 | case '1': |
356 | case '0': |
357 | break; |
358 | } |
359 | ++CurPtr; |
360 | } |
361 | if (*CurPtr == '.') { |
362 | // MASM float literals (other than hex floats) always contain a ".", and |
363 | // are always written in decimal. |
364 | ++CurPtr; |
365 | return LexFloatLiteral(); |
366 | } |
367 | |
368 | if (LexMasmHexFloats && (*CurPtr == 'r' || *CurPtr == 'R')) { |
369 | ++CurPtr; |
370 | return AsmToken(AsmToken::Real, StringRef(TokStart, CurPtr - TokStart)); |
371 | } |
372 | |
373 | unsigned Radix = 0; |
374 | if (*CurPtr == 'h' || *CurPtr == 'H') { |
375 | // hexadecimal number |
376 | ++CurPtr; |
377 | Radix = 16; |
378 | } else if (*CurPtr == 't' || *CurPtr == 'T') { |
379 | // decimal number |
380 | ++CurPtr; |
381 | Radix = 10; |
382 | } else if (*CurPtr == 'o' || *CurPtr == 'O' || *CurPtr == 'q' || |
383 | *CurPtr == 'Q') { |
384 | // octal number |
385 | ++CurPtr; |
386 | Radix = 8; |
387 | } else if (*CurPtr == 'y' || *CurPtr == 'Y') { |
388 | // binary number |
389 | ++CurPtr; |
390 | Radix = 2; |
391 | } else if (FirstNonDecimal && FirstNonDecimal + 1 == CurPtr && |
392 | DefaultRadix < 14 && |
393 | (*FirstNonDecimal == 'd' || *FirstNonDecimal == 'D')) { |
394 | Radix = 10; |
395 | } else if (FirstNonBinary && FirstNonBinary + 1 == CurPtr && |
396 | DefaultRadix < 12 && |
397 | (*FirstNonBinary == 'b' || *FirstNonBinary == 'B')) { |
398 | Radix = 2; |
399 | } |
400 | |
401 | if (Radix) { |
402 | StringRef Result(TokStart, CurPtr - TokStart); |
403 | APInt Value(128, 0, true); |
404 | |
405 | if (Result.drop_back().getAsInteger(Radix, Result&: Value)) |
406 | return ReturnError(Loc: TokStart, Msg: "invalid " + radixName(Radix) + " number" ); |
407 | |
408 | // MSVC accepts and ignores type suffices on integer literals. |
409 | SkipIgnoredIntegerSuffix(CurPtr); |
410 | |
411 | return intToken(Ref: Result, Value); |
412 | } |
413 | |
414 | // default-radix integers, or floating point numbers, fall through |
415 | CurPtr = OldCurPtr; |
416 | } |
417 | |
418 | // MASM default-radix integers: [0-9a-fA-F]+ |
419 | // (All other integer literals have a radix specifier.) |
420 | if (LexMasmIntegers && UseMasmDefaultRadix) { |
421 | CurPtr = findLastDigit(CurPtr, DefaultRadix: 16); |
422 | StringRef Result(TokStart, CurPtr - TokStart); |
423 | |
424 | APInt Value(128, 0, true); |
425 | if (Result.getAsInteger(Radix: DefaultRadix, Result&: Value)) { |
426 | return ReturnError(Loc: TokStart, |
427 | Msg: "invalid " + radixName(Radix: DefaultRadix) + " number" ); |
428 | } |
429 | |
430 | return intToken(Ref: Result, Value); |
431 | } |
432 | |
433 | // Motorola hex integers: $[0-9a-fA-F]+ |
434 | if (LexMotorolaIntegers && CurPtr[-1] == '$') { |
435 | const char *NumStart = CurPtr; |
436 | while (isHexDigit(C: CurPtr[0])) |
437 | ++CurPtr; |
438 | |
439 | APInt Result(128, 0); |
440 | if (StringRef(NumStart, CurPtr - NumStart).getAsInteger(Radix: 16, Result)) |
441 | return ReturnError(Loc: TokStart, Msg: "invalid hexadecimal number" ); |
442 | |
443 | return intToken(Ref: StringRef(TokStart, CurPtr - TokStart), Value&: Result); |
444 | } |
445 | |
446 | // Motorola binary integers: %[01]+ |
447 | if (LexMotorolaIntegers && CurPtr[-1] == '%') { |
448 | const char *NumStart = CurPtr; |
449 | while (*CurPtr == '0' || *CurPtr == '1') |
450 | ++CurPtr; |
451 | |
452 | APInt Result(128, 0); |
453 | if (StringRef(NumStart, CurPtr - NumStart).getAsInteger(Radix: 2, Result)) |
454 | return ReturnError(Loc: TokStart, Msg: "invalid binary number" ); |
455 | |
456 | return intToken(Ref: StringRef(TokStart, CurPtr - TokStart), Value&: Result); |
457 | } |
458 | |
459 | // Decimal integer: [1-9][0-9]* |
460 | // HLASM-flavour decimal integer: [0-9][0-9]* |
461 | // FIXME: Later on, support for fb for HLASM has to be added in |
462 | // as they probably would be needed for asm goto |
463 | if (LexHLASMIntegers || CurPtr[-1] != '0' || CurPtr[0] == '.') { |
464 | unsigned Radix = doHexLookAhead(CurPtr, DefaultRadix: 10, LexHex: LexMasmIntegers); |
465 | |
466 | if (!LexHLASMIntegers) { |
467 | bool IsHex = Radix == 16; |
468 | // Check for floating point literals. |
469 | if (!IsHex && (*CurPtr == '.' || *CurPtr == 'e' || *CurPtr == 'E')) { |
470 | if (*CurPtr == '.') |
471 | ++CurPtr; |
472 | return LexFloatLiteral(); |
473 | } |
474 | } |
475 | |
476 | StringRef Result(TokStart, CurPtr - TokStart); |
477 | |
478 | APInt Value(128, 0, true); |
479 | if (Result.getAsInteger(Radix, Result&: Value)) |
480 | return ReturnError(Loc: TokStart, Msg: "invalid " + radixName(Radix) + " number" ); |
481 | |
482 | if (!LexHLASMIntegers) |
483 | // The darwin/x86 (and x86-64) assembler accepts and ignores type |
484 | // suffices on integer literals. |
485 | SkipIgnoredIntegerSuffix(CurPtr); |
486 | |
487 | return intToken(Ref: Result, Value); |
488 | } |
489 | |
490 | if (!LexMasmIntegers && ((*CurPtr == 'b') || (*CurPtr == 'B'))) { |
491 | ++CurPtr; |
492 | // See if we actually have "0b" as part of something like "jmp 0b\n" |
493 | if (!isDigit(C: CurPtr[0])) { |
494 | --CurPtr; |
495 | StringRef Result(TokStart, CurPtr - TokStart); |
496 | return AsmToken(AsmToken::Integer, Result, 0); |
497 | } |
498 | const char *NumStart = CurPtr; |
499 | while (CurPtr[0] == '0' || CurPtr[0] == '1') |
500 | ++CurPtr; |
501 | |
502 | // Requires at least one binary digit. |
503 | if (CurPtr == NumStart) |
504 | return ReturnError(Loc: TokStart, Msg: "invalid binary number" ); |
505 | |
506 | StringRef Result(TokStart, CurPtr - TokStart); |
507 | |
508 | APInt Value(128, 0, true); |
509 | if (Result.substr(Start: 2).getAsInteger(Radix: 2, Result&: Value)) |
510 | return ReturnError(Loc: TokStart, Msg: "invalid binary number" ); |
511 | |
512 | // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL |
513 | // suffixes on integer literals. |
514 | SkipIgnoredIntegerSuffix(CurPtr); |
515 | |
516 | return intToken(Ref: Result, Value); |
517 | } |
518 | |
519 | if ((*CurPtr == 'x') || (*CurPtr == 'X')) { |
520 | ++CurPtr; |
521 | const char *NumStart = CurPtr; |
522 | while (isHexDigit(C: CurPtr[0])) |
523 | ++CurPtr; |
524 | |
525 | // "0x.0p0" is valid, and "0x0p0" (but not "0xp0" for example, which will be |
526 | // diagnosed by LexHexFloatLiteral). |
527 | if (CurPtr[0] == '.' || CurPtr[0] == 'p' || CurPtr[0] == 'P') |
528 | return LexHexFloatLiteral(NoIntDigits: NumStart == CurPtr); |
529 | |
530 | // Otherwise requires at least one hex digit. |
531 | if (CurPtr == NumStart) |
532 | return ReturnError(Loc: CurPtr-2, Msg: "invalid hexadecimal number" ); |
533 | |
534 | APInt Result(128, 0); |
535 | if (StringRef(TokStart, CurPtr - TokStart).getAsInteger(Radix: 0, Result)) |
536 | return ReturnError(Loc: TokStart, Msg: "invalid hexadecimal number" ); |
537 | |
538 | // Consume the optional [hH]. |
539 | if (LexMasmIntegers && (*CurPtr == 'h' || *CurPtr == 'H')) |
540 | ++CurPtr; |
541 | |
542 | // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL |
543 | // suffixes on integer literals. |
544 | SkipIgnoredIntegerSuffix(CurPtr); |
545 | |
546 | return intToken(Ref: StringRef(TokStart, CurPtr - TokStart), Value&: Result); |
547 | } |
548 | |
549 | // Either octal or hexadecimal. |
550 | APInt Value(128, 0, true); |
551 | unsigned Radix = doHexLookAhead(CurPtr, DefaultRadix: 8, LexHex: LexMasmIntegers); |
552 | StringRef Result(TokStart, CurPtr - TokStart); |
553 | if (Result.getAsInteger(Radix, Result&: Value)) |
554 | return ReturnError(Loc: TokStart, Msg: "invalid " + radixName(Radix) + " number" ); |
555 | |
556 | // Consume the [hH]. |
557 | if (Radix == 16) |
558 | ++CurPtr; |
559 | |
560 | // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL |
561 | // suffixes on integer literals. |
562 | SkipIgnoredIntegerSuffix(CurPtr); |
563 | |
564 | return intToken(Ref: Result, Value); |
565 | } |
566 | |
567 | /// LexSingleQuote: Integer: 'b' |
568 | AsmToken AsmLexer::LexSingleQuote() { |
569 | int CurChar = getNextChar(); |
570 | |
571 | if (LexHLASMStrings) |
572 | return ReturnError(Loc: TokStart, Msg: "invalid usage of character literals" ); |
573 | |
574 | if (LexMasmStrings) { |
575 | while (CurChar != EOF) { |
576 | if (CurChar != '\'') { |
577 | CurChar = getNextChar(); |
578 | } else if (peekNextChar() == '\'') { |
579 | // In MASM single-quote strings, doubled single-quotes mean an escaped |
580 | // single quote, so should be lexed in. |
581 | (void)getNextChar(); |
582 | CurChar = getNextChar(); |
583 | } else { |
584 | break; |
585 | } |
586 | } |
587 | if (CurChar == EOF) |
588 | return ReturnError(Loc: TokStart, Msg: "unterminated string constant" ); |
589 | return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart)); |
590 | } |
591 | |
592 | if (CurChar == '\\') |
593 | CurChar = getNextChar(); |
594 | |
595 | if (CurChar == EOF) |
596 | return ReturnError(Loc: TokStart, Msg: "unterminated single quote" ); |
597 | |
598 | CurChar = getNextChar(); |
599 | |
600 | if (CurChar != '\'') |
601 | return ReturnError(Loc: TokStart, Msg: "single quote way too long" ); |
602 | |
603 | // The idea here being that 'c' is basically just an integral |
604 | // constant. |
605 | StringRef Res = StringRef(TokStart,CurPtr - TokStart); |
606 | long long Value; |
607 | |
608 | if (Res.starts_with(Prefix: "\'\\" )) { |
609 | char theChar = Res[2]; |
610 | switch (theChar) { |
611 | default: Value = theChar; break; |
612 | case '\'': Value = '\''; break; |
613 | case 't': Value = '\t'; break; |
614 | case 'n': Value = '\n'; break; |
615 | case 'b': Value = '\b'; break; |
616 | case 'f': Value = '\f'; break; |
617 | case 'r': Value = '\r'; break; |
618 | } |
619 | } else |
620 | Value = TokStart[1]; |
621 | |
622 | return AsmToken(AsmToken::Integer, Res, Value); |
623 | } |
624 | |
625 | /// LexQuote: String: "..." |
626 | AsmToken AsmLexer::LexQuote() { |
627 | int CurChar = getNextChar(); |
628 | if (LexHLASMStrings) |
629 | return ReturnError(Loc: TokStart, Msg: "invalid usage of string literals" ); |
630 | |
631 | if (LexMasmStrings) { |
632 | while (CurChar != EOF) { |
633 | if (CurChar != '"') { |
634 | CurChar = getNextChar(); |
635 | } else if (peekNextChar() == '"') { |
636 | // In MASM double-quoted strings, doubled double-quotes mean an escaped |
637 | // double quote, so should be lexed in. |
638 | (void)getNextChar(); |
639 | CurChar = getNextChar(); |
640 | } else { |
641 | break; |
642 | } |
643 | } |
644 | if (CurChar == EOF) |
645 | return ReturnError(Loc: TokStart, Msg: "unterminated string constant" ); |
646 | return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart)); |
647 | } |
648 | |
649 | while (CurChar != '"') { |
650 | if (CurChar == '\\') { |
651 | // Allow \", etc. |
652 | CurChar = getNextChar(); |
653 | } |
654 | |
655 | if (CurChar == EOF) |
656 | return ReturnError(Loc: TokStart, Msg: "unterminated string constant" ); |
657 | |
658 | CurChar = getNextChar(); |
659 | } |
660 | |
661 | return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart)); |
662 | } |
663 | |
664 | StringRef AsmLexer::LexUntilEndOfStatement() { |
665 | TokStart = CurPtr; |
666 | |
667 | while (!isAtStartOfComment(Ptr: CurPtr) && // Start of line comment. |
668 | !isAtStatementSeparator(Ptr: CurPtr) && // End of statement marker. |
669 | *CurPtr != '\n' && *CurPtr != '\r' && CurPtr != CurBuf.end()) { |
670 | ++CurPtr; |
671 | } |
672 | return StringRef(TokStart, CurPtr-TokStart); |
673 | } |
674 | |
675 | StringRef AsmLexer::LexUntilEndOfLine() { |
676 | TokStart = CurPtr; |
677 | |
678 | while (*CurPtr != '\n' && *CurPtr != '\r' && CurPtr != CurBuf.end()) { |
679 | ++CurPtr; |
680 | } |
681 | return StringRef(TokStart, CurPtr-TokStart); |
682 | } |
683 | |
684 | size_t AsmLexer::peekTokens(MutableArrayRef<AsmToken> Buf, |
685 | bool ShouldSkipSpace) { |
686 | SaveAndRestore SavedTokenStart(TokStart); |
687 | SaveAndRestore SavedCurPtr(CurPtr); |
688 | SaveAndRestore SavedAtStartOfLine(IsAtStartOfLine); |
689 | SaveAndRestore SavedAtStartOfStatement(IsAtStartOfStatement); |
690 | SaveAndRestore SavedSkipSpace(SkipSpace, ShouldSkipSpace); |
691 | SaveAndRestore SavedIsPeeking(IsPeeking, true); |
692 | std::string SavedErr = getErr(); |
693 | SMLoc SavedErrLoc = getErrLoc(); |
694 | |
695 | size_t ReadCount; |
696 | for (ReadCount = 0; ReadCount < Buf.size(); ++ReadCount) { |
697 | AsmToken Token = LexToken(); |
698 | |
699 | Buf[ReadCount] = Token; |
700 | |
701 | if (Token.is(K: AsmToken::Eof)) |
702 | break; |
703 | } |
704 | |
705 | SetError(errLoc: SavedErrLoc, err: SavedErr); |
706 | return ReadCount; |
707 | } |
708 | |
709 | bool AsmLexer::(const char *Ptr) { |
710 | if (MAI.getRestrictCommentStringToStartOfStatement() && !IsAtStartOfStatement) |
711 | return false; |
712 | |
713 | StringRef = MAI.getCommentString(); |
714 | |
715 | if (CommentString.size() == 1) |
716 | return CommentString[0] == Ptr[0]; |
717 | |
718 | // Allow # preprocessor comments also be counted as comments for "##" cases |
719 | if (CommentString[1] == '#') |
720 | return CommentString[0] == Ptr[0]; |
721 | |
722 | return strncmp(s1: Ptr, s2: CommentString.data(), n: CommentString.size()) == 0; |
723 | } |
724 | |
725 | bool AsmLexer::isAtStatementSeparator(const char *Ptr) { |
726 | return strncmp(s1: Ptr, s2: MAI.getSeparatorString(), |
727 | n: strlen(s: MAI.getSeparatorString())) == 0; |
728 | } |
729 | |
730 | AsmToken AsmLexer::LexToken() { |
731 | TokStart = CurPtr; |
732 | // This always consumes at least one character. |
733 | int CurChar = getNextChar(); |
734 | |
735 | if (!IsPeeking && CurChar == '#' && IsAtStartOfStatement) { |
736 | // If this starts with a '#', this may be a cpp |
737 | // hash directive and otherwise a line comment. |
738 | AsmToken TokenBuf[2]; |
739 | MutableArrayRef<AsmToken> Buf(TokenBuf, 2); |
740 | size_t num = peekTokens(Buf, ShouldSkipSpace: true); |
741 | // There cannot be a space preceding this |
742 | if (IsAtStartOfLine && num == 2 && TokenBuf[0].is(K: AsmToken::Integer) && |
743 | TokenBuf[1].is(K: AsmToken::String)) { |
744 | CurPtr = TokStart; // reset curPtr; |
745 | StringRef s = LexUntilEndOfLine(); |
746 | UnLex(Token: TokenBuf[1]); |
747 | UnLex(Token: TokenBuf[0]); |
748 | return AsmToken(AsmToken::HashDirective, s); |
749 | } |
750 | |
751 | if (MAI.shouldAllowAdditionalComments()) |
752 | return LexLineComment(); |
753 | } |
754 | |
755 | if (isAtStartOfComment(Ptr: TokStart)) |
756 | return LexLineComment(); |
757 | |
758 | if (isAtStatementSeparator(Ptr: TokStart)) { |
759 | CurPtr += strlen(s: MAI.getSeparatorString()) - 1; |
760 | IsAtStartOfLine = true; |
761 | IsAtStartOfStatement = true; |
762 | return AsmToken(AsmToken::EndOfStatement, |
763 | StringRef(TokStart, strlen(s: MAI.getSeparatorString()))); |
764 | } |
765 | |
766 | // If we're missing a newline at EOF, make sure we still get an |
767 | // EndOfStatement token before the Eof token. |
768 | if (CurChar == EOF && !IsAtStartOfStatement && EndStatementAtEOF) { |
769 | IsAtStartOfLine = true; |
770 | IsAtStartOfStatement = true; |
771 | return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 0)); |
772 | } |
773 | IsAtStartOfLine = false; |
774 | bool OldIsAtStartOfStatement = IsAtStartOfStatement; |
775 | IsAtStartOfStatement = false; |
776 | switch (CurChar) { |
777 | default: |
778 | // Handle identifier: [a-zA-Z_.$@#?][a-zA-Z0-9_.$@#?]* |
779 | // Whether or not the lexer accepts '$', '@', '#' and '?' at the start of |
780 | // an identifier is target-dependent. These characters are handled in the |
781 | // respective switch cases. |
782 | if (isalpha(CurChar) || CurChar == '_' || CurChar == '.') |
783 | return LexIdentifier(); |
784 | |
785 | // Unknown character, emit an error. |
786 | return ReturnError(Loc: TokStart, Msg: "invalid character in input" ); |
787 | case EOF: |
788 | if (EndStatementAtEOF) { |
789 | IsAtStartOfLine = true; |
790 | IsAtStartOfStatement = true; |
791 | } |
792 | return AsmToken(AsmToken::Eof, StringRef(TokStart, 0)); |
793 | case 0: |
794 | case ' ': |
795 | case '\t': |
796 | IsAtStartOfStatement = OldIsAtStartOfStatement; |
797 | while (*CurPtr == ' ' || *CurPtr == '\t') |
798 | CurPtr++; |
799 | if (SkipSpace) |
800 | return LexToken(); // Ignore whitespace. |
801 | else |
802 | return AsmToken(AsmToken::Space, StringRef(TokStart, CurPtr - TokStart)); |
803 | case '\r': { |
804 | IsAtStartOfLine = true; |
805 | IsAtStartOfStatement = true; |
806 | // If this is a CR followed by LF, treat that as one token. |
807 | if (CurPtr != CurBuf.end() && *CurPtr == '\n') |
808 | ++CurPtr; |
809 | return AsmToken(AsmToken::EndOfStatement, |
810 | StringRef(TokStart, CurPtr - TokStart)); |
811 | } |
812 | case '\n': |
813 | IsAtStartOfLine = true; |
814 | IsAtStartOfStatement = true; |
815 | return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 1)); |
816 | case ':': return AsmToken(AsmToken::Colon, StringRef(TokStart, 1)); |
817 | case '+': return AsmToken(AsmToken::Plus, StringRef(TokStart, 1)); |
818 | case '~': return AsmToken(AsmToken::Tilde, StringRef(TokStart, 1)); |
819 | case '(': return AsmToken(AsmToken::LParen, StringRef(TokStart, 1)); |
820 | case ')': return AsmToken(AsmToken::RParen, StringRef(TokStart, 1)); |
821 | case '[': return AsmToken(AsmToken::LBrac, StringRef(TokStart, 1)); |
822 | case ']': return AsmToken(AsmToken::RBrac, StringRef(TokStart, 1)); |
823 | case '{': return AsmToken(AsmToken::LCurly, StringRef(TokStart, 1)); |
824 | case '}': return AsmToken(AsmToken::RCurly, StringRef(TokStart, 1)); |
825 | case '*': return AsmToken(AsmToken::Star, StringRef(TokStart, 1)); |
826 | case ',': return AsmToken(AsmToken::Comma, StringRef(TokStart, 1)); |
827 | case '$': { |
828 | if (LexMotorolaIntegers && isHexDigit(C: *CurPtr)) |
829 | return LexDigit(); |
830 | if (MAI.doesAllowDollarAtStartOfIdentifier()) |
831 | return LexIdentifier(); |
832 | return AsmToken(AsmToken::Dollar, StringRef(TokStart, 1)); |
833 | } |
834 | case '@': |
835 | if (MAI.doesAllowAtAtStartOfIdentifier()) |
836 | return LexIdentifier(); |
837 | return AsmToken(AsmToken::At, StringRef(TokStart, 1)); |
838 | case '#': |
839 | if (MAI.doesAllowHashAtStartOfIdentifier()) |
840 | return LexIdentifier(); |
841 | return AsmToken(AsmToken::Hash, StringRef(TokStart, 1)); |
842 | case '?': |
843 | if (MAI.doesAllowQuestionAtStartOfIdentifier()) |
844 | return LexIdentifier(); |
845 | return AsmToken(AsmToken::Question, StringRef(TokStart, 1)); |
846 | case '\\': return AsmToken(AsmToken::BackSlash, StringRef(TokStart, 1)); |
847 | case '=': |
848 | if (*CurPtr == '=') { |
849 | ++CurPtr; |
850 | return AsmToken(AsmToken::EqualEqual, StringRef(TokStart, 2)); |
851 | } |
852 | return AsmToken(AsmToken::Equal, StringRef(TokStart, 1)); |
853 | case '-': |
854 | if (*CurPtr == '>') { |
855 | ++CurPtr; |
856 | return AsmToken(AsmToken::MinusGreater, StringRef(TokStart, 2)); |
857 | } |
858 | return AsmToken(AsmToken::Minus, StringRef(TokStart, 1)); |
859 | case '|': |
860 | if (*CurPtr == '|') { |
861 | ++CurPtr; |
862 | return AsmToken(AsmToken::PipePipe, StringRef(TokStart, 2)); |
863 | } |
864 | return AsmToken(AsmToken::Pipe, StringRef(TokStart, 1)); |
865 | case '^': return AsmToken(AsmToken::Caret, StringRef(TokStart, 1)); |
866 | case '&': |
867 | if (*CurPtr == '&') { |
868 | ++CurPtr; |
869 | return AsmToken(AsmToken::AmpAmp, StringRef(TokStart, 2)); |
870 | } |
871 | return AsmToken(AsmToken::Amp, StringRef(TokStart, 1)); |
872 | case '!': |
873 | if (*CurPtr == '=') { |
874 | ++CurPtr; |
875 | return AsmToken(AsmToken::ExclaimEqual, StringRef(TokStart, 2)); |
876 | } |
877 | return AsmToken(AsmToken::Exclaim, StringRef(TokStart, 1)); |
878 | case '%': |
879 | if (LexMotorolaIntegers && (*CurPtr == '0' || *CurPtr == '1')) { |
880 | return LexDigit(); |
881 | } |
882 | |
883 | if (MAI.hasMipsExpressions()) { |
884 | AsmToken::TokenKind Operator; |
885 | unsigned OperatorLength; |
886 | |
887 | std::tie(args&: Operator, args&: OperatorLength) = |
888 | StringSwitch<std::pair<AsmToken::TokenKind, unsigned>>( |
889 | StringRef(CurPtr)) |
890 | .StartsWith(S: "call16" , Value: {AsmToken::PercentCall16, 7}) |
891 | .StartsWith(S: "call_hi" , Value: {AsmToken::PercentCall_Hi, 8}) |
892 | .StartsWith(S: "call_lo" , Value: {AsmToken::PercentCall_Lo, 8}) |
893 | .StartsWith(S: "dtprel_hi" , Value: {AsmToken::PercentDtprel_Hi, 10}) |
894 | .StartsWith(S: "dtprel_lo" , Value: {AsmToken::PercentDtprel_Lo, 10}) |
895 | .StartsWith(S: "got_disp" , Value: {AsmToken::PercentGot_Disp, 9}) |
896 | .StartsWith(S: "got_hi" , Value: {AsmToken::PercentGot_Hi, 7}) |
897 | .StartsWith(S: "got_lo" , Value: {AsmToken::PercentGot_Lo, 7}) |
898 | .StartsWith(S: "got_ofst" , Value: {AsmToken::PercentGot_Ofst, 9}) |
899 | .StartsWith(S: "got_page" , Value: {AsmToken::PercentGot_Page, 9}) |
900 | .StartsWith(S: "gottprel" , Value: {AsmToken::PercentGottprel, 9}) |
901 | .StartsWith(S: "got" , Value: {AsmToken::PercentGot, 4}) |
902 | .StartsWith(S: "gp_rel" , Value: {AsmToken::PercentGp_Rel, 7}) |
903 | .StartsWith(S: "higher" , Value: {AsmToken::PercentHigher, 7}) |
904 | .StartsWith(S: "highest" , Value: {AsmToken::PercentHighest, 8}) |
905 | .StartsWith(S: "hi" , Value: {AsmToken::PercentHi, 3}) |
906 | .StartsWith(S: "lo" , Value: {AsmToken::PercentLo, 3}) |
907 | .StartsWith(S: "neg" , Value: {AsmToken::PercentNeg, 4}) |
908 | .StartsWith(S: "pcrel_hi" , Value: {AsmToken::PercentPcrel_Hi, 9}) |
909 | .StartsWith(S: "pcrel_lo" , Value: {AsmToken::PercentPcrel_Lo, 9}) |
910 | .StartsWith(S: "tlsgd" , Value: {AsmToken::PercentTlsgd, 6}) |
911 | .StartsWith(S: "tlsldm" , Value: {AsmToken::PercentTlsldm, 7}) |
912 | .StartsWith(S: "tprel_hi" , Value: {AsmToken::PercentTprel_Hi, 9}) |
913 | .StartsWith(S: "tprel_lo" , Value: {AsmToken::PercentTprel_Lo, 9}) |
914 | .Default(Value: {AsmToken::Percent, 1}); |
915 | |
916 | if (Operator != AsmToken::Percent) { |
917 | CurPtr += OperatorLength - 1; |
918 | return AsmToken(Operator, StringRef(TokStart, OperatorLength)); |
919 | } |
920 | } |
921 | return AsmToken(AsmToken::Percent, StringRef(TokStart, 1)); |
922 | case '/': |
923 | IsAtStartOfStatement = OldIsAtStartOfStatement; |
924 | return LexSlash(); |
925 | case '\'': return LexSingleQuote(); |
926 | case '"': return LexQuote(); |
927 | case '0': case '1': case '2': case '3': case '4': |
928 | case '5': case '6': case '7': case '8': case '9': |
929 | return LexDigit(); |
930 | case '<': |
931 | switch (*CurPtr) { |
932 | case '<': |
933 | ++CurPtr; |
934 | return AsmToken(AsmToken::LessLess, StringRef(TokStart, 2)); |
935 | case '=': |
936 | ++CurPtr; |
937 | return AsmToken(AsmToken::LessEqual, StringRef(TokStart, 2)); |
938 | case '>': |
939 | ++CurPtr; |
940 | return AsmToken(AsmToken::LessGreater, StringRef(TokStart, 2)); |
941 | default: |
942 | return AsmToken(AsmToken::Less, StringRef(TokStart, 1)); |
943 | } |
944 | case '>': |
945 | switch (*CurPtr) { |
946 | case '>': |
947 | ++CurPtr; |
948 | return AsmToken(AsmToken::GreaterGreater, StringRef(TokStart, 2)); |
949 | case '=': |
950 | ++CurPtr; |
951 | return AsmToken(AsmToken::GreaterEqual, StringRef(TokStart, 2)); |
952 | default: |
953 | return AsmToken(AsmToken::Greater, StringRef(TokStart, 1)); |
954 | } |
955 | |
956 | // TODO: Quoted identifiers (objc methods etc) |
957 | // local labels: [0-9][:] |
958 | // Forward/backward labels: [0-9][fb] |
959 | // Integers, fp constants, character constants. |
960 | } |
961 | } |
962 | |