1 | //===- AsmLexer.cpp - Lexer for Assembly Files ----------------------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This class implements the lexer for assembly files. |
10 | // |
11 | //===----------------------------------------------------------------------===// |
12 | |
13 | #include "llvm/MC/MCParser/AsmLexer.h" |
14 | #include "llvm/ADT/APInt.h" |
15 | #include "llvm/ADT/ArrayRef.h" |
16 | #include "llvm/ADT/StringExtras.h" |
17 | #include "llvm/ADT/StringRef.h" |
18 | #include "llvm/MC/MCAsmInfo.h" |
19 | #include "llvm/MC/MCParser/AsmLexer.h" |
20 | #include "llvm/Support/Compiler.h" |
21 | #include "llvm/Support/SMLoc.h" |
22 | #include "llvm/Support/SaveAndRestore.h" |
23 | #include "llvm/Support/raw_ostream.h" |
24 | #include <cassert> |
25 | #include <cctype> |
26 | #include <cstdio> |
27 | #include <cstring> |
28 | #include <string> |
29 | |
30 | using namespace llvm; |
31 | |
32 | SMLoc AsmToken::getLoc() const { return SMLoc::getFromPointer(Ptr: Str.data()); } |
33 | |
34 | SMLoc AsmToken::getEndLoc() const { |
35 | return SMLoc::getFromPointer(Ptr: Str.data() + Str.size()); |
36 | } |
37 | |
38 | SMRange AsmToken::getLocRange() const { return SMRange(getLoc(), getEndLoc()); } |
39 | |
40 | void AsmToken::dump(raw_ostream &OS) const { |
41 | switch (Kind) { |
42 | case AsmToken::Error: |
43 | OS << "error" ; |
44 | break; |
45 | case AsmToken::Identifier: |
46 | OS << "identifier: " << getString(); |
47 | break; |
48 | case AsmToken::Integer: |
49 | OS << "int: " << getString(); |
50 | break; |
51 | case AsmToken::Real: |
52 | OS << "real: " << getString(); |
53 | break; |
54 | case AsmToken::String: |
55 | OS << "string: " << getString(); |
56 | break; |
57 | |
58 | // clang-format off |
59 | case AsmToken::Amp: OS << "Amp" ; break; |
60 | case AsmToken::AmpAmp: OS << "AmpAmp" ; break; |
61 | case AsmToken::At: OS << "At" ; break; |
62 | case AsmToken::BackSlash: OS << "BackSlash" ; break; |
63 | case AsmToken::BigNum: OS << "BigNum" ; break; |
64 | case AsmToken::Caret: OS << "Caret" ; break; |
65 | case AsmToken::Colon: OS << "Colon" ; break; |
66 | case AsmToken::Comma: OS << "Comma" ; break; |
67 | case AsmToken::Comment: OS << "Comment" ; break; |
68 | case AsmToken::Dollar: OS << "Dollar" ; break; |
69 | case AsmToken::Dot: OS << "Dot" ; break; |
70 | case AsmToken::EndOfStatement: OS << "EndOfStatement" ; break; |
71 | case AsmToken::Eof: OS << "Eof" ; break; |
72 | case AsmToken::Equal: OS << "Equal" ; break; |
73 | case AsmToken::EqualEqual: OS << "EqualEqual" ; break; |
74 | case AsmToken::Exclaim: OS << "Exclaim" ; break; |
75 | case AsmToken::ExclaimEqual: OS << "ExclaimEqual" ; break; |
76 | case AsmToken::Greater: OS << "Greater" ; break; |
77 | case AsmToken::GreaterEqual: OS << "GreaterEqual" ; break; |
78 | case AsmToken::GreaterGreater: OS << "GreaterGreater" ; break; |
79 | case AsmToken::Hash: OS << "Hash" ; break; |
80 | case AsmToken::HashDirective: OS << "HashDirective" ; break; |
81 | case AsmToken::LBrac: OS << "LBrac" ; break; |
82 | case AsmToken::LCurly: OS << "LCurly" ; break; |
83 | case AsmToken::LParen: OS << "LParen" ; break; |
84 | case AsmToken::Less: OS << "Less" ; break; |
85 | case AsmToken::LessEqual: OS << "LessEqual" ; break; |
86 | case AsmToken::LessGreater: OS << "LessGreater" ; break; |
87 | case AsmToken::LessLess: OS << "LessLess" ; break; |
88 | case AsmToken::Minus: OS << "Minus" ; break; |
89 | case AsmToken::MinusGreater: OS << "MinusGreater" ; break; |
90 | case AsmToken::Percent: OS << "Percent" ; break; |
91 | case AsmToken::Pipe: OS << "Pipe" ; break; |
92 | case AsmToken::PipePipe: OS << "PipePipe" ; break; |
93 | case AsmToken::Plus: OS << "Plus" ; break; |
94 | case AsmToken::Question: OS << "Question" ; break; |
95 | case AsmToken::RBrac: OS << "RBrac" ; break; |
96 | case AsmToken::RCurly: OS << "RCurly" ; break; |
97 | case AsmToken::RParen: OS << "RParen" ; break; |
98 | case AsmToken::Slash: OS << "Slash" ; break; |
99 | case AsmToken::Space: OS << "Space" ; break; |
100 | case AsmToken::Star: OS << "Star" ; break; |
101 | case AsmToken::Tilde: OS << "Tilde" ; break; |
102 | // clang-format on |
103 | } |
104 | |
105 | // Print the token string. |
106 | OS << " (\"" ; |
107 | OS.write_escaped(Str: getString()); |
108 | OS << "\")" ; |
109 | } |
110 | |
111 | AsmLexer::AsmLexer(const MCAsmInfo &MAI) : MAI(MAI) { |
112 | // For COFF targets, this is true, while for ELF targets, it should be false. |
113 | // Currently, @specifier parsing depends on '@' being included in the token. |
114 | AllowAtInIdentifier = !StringRef(MAI.getCommentString()).starts_with(Prefix: "@" ) && |
115 | MAI.useAtForSpecifier(); |
116 | LexMotorolaIntegers = MAI.shouldUseMotorolaIntegers(); |
117 | |
118 | CurTok.emplace_back(Args: AsmToken::Space, Args: StringRef()); |
119 | } |
120 | |
121 | void AsmLexer::setBuffer(StringRef Buf, const char *ptr, |
122 | bool EndStatementAtEOF) { |
123 | CurBuf = Buf; |
124 | |
125 | if (ptr) |
126 | CurPtr = ptr; |
127 | else |
128 | CurPtr = CurBuf.begin(); |
129 | |
130 | TokStart = nullptr; |
131 | this->EndStatementAtEOF = EndStatementAtEOF; |
132 | } |
133 | |
134 | /// ReturnError - Set the error to the specified string at the specified |
135 | /// location. This is defined to always return AsmToken::Error. |
136 | AsmToken AsmLexer::ReturnError(const char *Loc, const std::string &Msg) { |
137 | SetError(errLoc: SMLoc::getFromPointer(Ptr: Loc), err: Msg); |
138 | |
139 | return AsmToken(AsmToken::Error, StringRef(Loc, CurPtr - Loc)); |
140 | } |
141 | |
142 | int AsmLexer::getNextChar() { |
143 | if (CurPtr == CurBuf.end()) |
144 | return EOF; |
145 | return (unsigned char)*CurPtr++; |
146 | } |
147 | |
148 | int AsmLexer::peekNextChar() { |
149 | if (CurPtr == CurBuf.end()) |
150 | return EOF; |
151 | return (unsigned char)*CurPtr; |
152 | } |
153 | |
154 | /// The leading integral digit sequence and dot should have already been |
155 | /// consumed, some or all of the fractional digit sequence *can* have been |
156 | /// consumed. |
157 | AsmToken AsmLexer::LexFloatLiteral() { |
158 | // Skip the fractional digit sequence. |
159 | while (isDigit(C: *CurPtr)) |
160 | ++CurPtr; |
161 | |
162 | if (*CurPtr == '-' || *CurPtr == '+') |
163 | return ReturnError(Loc: CurPtr, Msg: "invalid sign in float literal" ); |
164 | |
165 | // Check for exponent |
166 | if ((*CurPtr == 'e' || *CurPtr == 'E')) { |
167 | ++CurPtr; |
168 | |
169 | if (*CurPtr == '-' || *CurPtr == '+') |
170 | ++CurPtr; |
171 | |
172 | while (isDigit(C: *CurPtr)) |
173 | ++CurPtr; |
174 | } |
175 | |
176 | return AsmToken(AsmToken::Real, |
177 | StringRef(TokStart, CurPtr - TokStart)); |
178 | } |
179 | |
180 | /// LexHexFloatLiteral matches essentially (.[0-9a-fA-F]*)?[pP][+-]?[0-9a-fA-F]+ |
181 | /// while making sure there are enough actual digits around for the constant to |
182 | /// be valid. |
183 | /// |
184 | /// The leading "0x[0-9a-fA-F]*" (i.e. integer part) has already been consumed |
185 | /// before we get here. |
186 | AsmToken AsmLexer::LexHexFloatLiteral(bool NoIntDigits) { |
187 | assert((*CurPtr == 'p' || *CurPtr == 'P' || *CurPtr == '.') && |
188 | "unexpected parse state in floating hex" ); |
189 | bool NoFracDigits = true; |
190 | |
191 | // Skip the fractional part if there is one |
192 | if (*CurPtr == '.') { |
193 | ++CurPtr; |
194 | |
195 | const char *FracStart = CurPtr; |
196 | while (isHexDigit(C: *CurPtr)) |
197 | ++CurPtr; |
198 | |
199 | NoFracDigits = CurPtr == FracStart; |
200 | } |
201 | |
202 | if (NoIntDigits && NoFracDigits) |
203 | return ReturnError(Loc: TokStart, Msg: "invalid hexadecimal floating-point constant: " |
204 | "expected at least one significand digit" ); |
205 | |
206 | // Make sure we do have some kind of proper exponent part |
207 | if (*CurPtr != 'p' && *CurPtr != 'P') |
208 | return ReturnError(Loc: TokStart, Msg: "invalid hexadecimal floating-point constant: " |
209 | "expected exponent part 'p'" ); |
210 | ++CurPtr; |
211 | |
212 | if (*CurPtr == '+' || *CurPtr == '-') |
213 | ++CurPtr; |
214 | |
215 | // N.b. exponent digits are *not* hex |
216 | const char *ExpStart = CurPtr; |
217 | while (isDigit(C: *CurPtr)) |
218 | ++CurPtr; |
219 | |
220 | if (CurPtr == ExpStart) |
221 | return ReturnError(Loc: TokStart, Msg: "invalid hexadecimal floating-point constant: " |
222 | "expected at least one exponent digit" ); |
223 | |
224 | return AsmToken(AsmToken::Real, StringRef(TokStart, CurPtr - TokStart)); |
225 | } |
226 | |
227 | /// LexIdentifier: [a-zA-Z_$.@?][a-zA-Z0-9_$.@#?]* |
228 | static bool isIdentifierChar(char C, bool AllowAt, bool AllowHash) { |
229 | return isAlnum(C) || C == '_' || C == '$' || C == '.' || C == '?' || |
230 | (AllowAt && C == '@') || (AllowHash && C == '#'); |
231 | } |
232 | |
233 | AsmToken AsmLexer::LexIdentifier() { |
234 | // Check for floating point literals. |
235 | if (CurPtr[-1] == '.' && isDigit(C: *CurPtr)) { |
236 | // Disambiguate a .1243foo identifier from a floating literal. |
237 | while (isDigit(C: *CurPtr)) |
238 | ++CurPtr; |
239 | |
240 | if (!isIdentifierChar(C: *CurPtr, AllowAt: AllowAtInIdentifier, |
241 | AllowHash: AllowHashInIdentifier) || |
242 | *CurPtr == 'e' || *CurPtr == 'E') |
243 | return LexFloatLiteral(); |
244 | } |
245 | |
246 | while (isIdentifierChar(C: *CurPtr, AllowAt: AllowAtInIdentifier, AllowHash: AllowHashInIdentifier)) |
247 | ++CurPtr; |
248 | |
249 | // Handle . as a special case. |
250 | if (CurPtr == TokStart+1 && TokStart[0] == '.') |
251 | return AsmToken(AsmToken::Dot, StringRef(TokStart, 1)); |
252 | |
253 | return AsmToken(AsmToken::Identifier, StringRef(TokStart, CurPtr - TokStart)); |
254 | } |
255 | |
256 | /// LexSlash: Slash: / |
257 | /// C-Style Comment: /* ... */ |
258 | /// C-style Comment: // ... |
259 | AsmToken AsmLexer::LexSlash() { |
260 | if (!MAI.shouldAllowAdditionalComments()) { |
261 | IsAtStartOfStatement = false; |
262 | return AsmToken(AsmToken::Slash, StringRef(TokStart, 1)); |
263 | } |
264 | |
265 | switch (*CurPtr) { |
266 | case '*': |
267 | IsAtStartOfStatement = false; |
268 | break; // C style comment. |
269 | case '/': |
270 | ++CurPtr; |
271 | return LexLineComment(); |
272 | default: |
273 | IsAtStartOfStatement = false; |
274 | return AsmToken(AsmToken::Slash, StringRef(TokStart, 1)); |
275 | } |
276 | |
277 | // C Style comment. |
278 | ++CurPtr; // skip the star. |
279 | const char * = CurPtr; |
280 | while (CurPtr != CurBuf.end()) { |
281 | switch (*CurPtr++) { |
282 | case '*': |
283 | // End of the comment? |
284 | if (*CurPtr != '/') |
285 | break; |
286 | // If we have a CommentConsumer, notify it about the comment. |
287 | if (CommentConsumer) { |
288 | CommentConsumer->HandleComment( |
289 | Loc: SMLoc::getFromPointer(Ptr: CommentTextStart), |
290 | CommentText: StringRef(CommentTextStart, CurPtr - 1 - CommentTextStart)); |
291 | } |
292 | ++CurPtr; // End the */. |
293 | return AsmToken(AsmToken::Comment, |
294 | StringRef(TokStart, CurPtr - TokStart)); |
295 | } |
296 | } |
297 | return ReturnError(Loc: TokStart, Msg: "unterminated comment" ); |
298 | } |
299 | |
300 | /// LexLineComment: Comment: #[^\n]* |
301 | /// : //[^\n]* |
302 | AsmToken AsmLexer::() { |
303 | // Mark This as an end of statement with a body of the |
304 | // comment. While it would be nicer to leave this two tokens, |
305 | // backwards compatability with TargetParsers makes keeping this in this form |
306 | // better. |
307 | const char * = CurPtr; |
308 | int CurChar = getNextChar(); |
309 | while (CurChar != '\n' && CurChar != '\r' && CurChar != EOF) |
310 | CurChar = getNextChar(); |
311 | const char *NewlinePtr = CurPtr; |
312 | if (CurChar == '\r' && CurPtr != CurBuf.end() && *CurPtr == '\n') |
313 | ++CurPtr; |
314 | |
315 | // If we have a CommentConsumer, notify it about the comment. |
316 | if (CommentConsumer) { |
317 | CommentConsumer->HandleComment( |
318 | Loc: SMLoc::getFromPointer(Ptr: CommentTextStart), |
319 | CommentText: StringRef(CommentTextStart, NewlinePtr - 1 - CommentTextStart)); |
320 | } |
321 | |
322 | IsAtStartOfLine = true; |
323 | // This is a whole line comment. leave newline |
324 | if (IsAtStartOfStatement) |
325 | return AsmToken(AsmToken::EndOfStatement, |
326 | StringRef(TokStart, CurPtr - TokStart)); |
327 | IsAtStartOfStatement = true; |
328 | |
329 | return AsmToken(AsmToken::EndOfStatement, |
330 | StringRef(TokStart, CurPtr - 1 - TokStart)); |
331 | } |
332 | |
/// Advances \p CurPtr past an optional integer type suffix: at most one 'U'
/// followed by at most two 'L's, all case-insensitive (U, L, LL, UL, ULL).
static void SkipIgnoredIntegerSuffix(const char *&CurPtr) {
  // Skip case-insensitive ULL, UL, U, L and LL suffixes.
  if (CurPtr[0] == 'U' || CurPtr[0] == 'u')
    ++CurPtr;
  if (CurPtr[0] == 'L' || CurPtr[0] == 'l')
    ++CurPtr;
  if (CurPtr[0] == 'L' || CurPtr[0] == 'l')
    ++CurPtr;
}
342 | |
343 | // Look ahead to search for first non-hex digit, if it's [hH], then we treat the |
344 | // integer as a hexadecimal, possibly with leading zeroes. |
345 | static unsigned doHexLookAhead(const char *&CurPtr, unsigned DefaultRadix, |
346 | bool LexHex) { |
347 | const char *FirstNonDec = nullptr; |
348 | const char *LookAhead = CurPtr; |
349 | while (true) { |
350 | if (isDigit(C: *LookAhead)) { |
351 | ++LookAhead; |
352 | } else { |
353 | if (!FirstNonDec) |
354 | FirstNonDec = LookAhead; |
355 | |
356 | // Keep going if we are looking for a 'h' suffix. |
357 | if (LexHex && isHexDigit(C: *LookAhead)) |
358 | ++LookAhead; |
359 | else |
360 | break; |
361 | } |
362 | } |
363 | bool isHex = LexHex && (*LookAhead == 'h' || *LookAhead == 'H'); |
364 | CurPtr = isHex || !FirstNonDec ? LookAhead : FirstNonDec; |
365 | if (isHex) |
366 | return 16; |
367 | return DefaultRadix; |
368 | } |
369 | |
370 | static const char *findLastDigit(const char *CurPtr, unsigned DefaultRadix) { |
371 | while (hexDigitValue(C: *CurPtr) < DefaultRadix) { |
372 | ++CurPtr; |
373 | } |
374 | return CurPtr; |
375 | } |
376 | |
377 | static AsmToken intToken(StringRef Ref, APInt &Value) { |
378 | if (Value.isIntN(N: 64)) |
379 | return AsmToken(AsmToken::Integer, Ref, Value); |
380 | return AsmToken(AsmToken::BigNum, Ref, Value); |
381 | } |
382 | |
/// Returns the name of \p Radix for use in diagnostics: "binary", "octal",
/// "decimal" or "hexadecimal" for 2, 8, 10 and 16, and "base-N" for any other
/// value N.
static std::string radixName(unsigned Radix) {
  switch (Radix) {
  case 2:
    return "binary";
  case 8:
    return "octal";
  case 10:
    return "decimal";
  case 16:
    return "hexadecimal";
  default:
    return "base-" + std::to_string(Radix);
  }
}
397 | |
398 | /// LexDigit: First character is [0-9]. |
399 | /// Local Label: [0-9][:] |
400 | /// Forward/Backward Label: [0-9][fb] |
401 | /// Binary integer: 0b[01]+ |
402 | /// Octal integer: 0[0-7]+ |
403 | /// Hex integer: 0x[0-9a-fA-F]+ or [0x]?[0-9][0-9a-fA-F]*[hH] |
404 | /// Decimal integer: [1-9][0-9]* |
405 | AsmToken AsmLexer::LexDigit() { |
406 | // MASM-flavor binary integer: [01]+[yY] (if DefaultRadix < 16, [bByY]) |
407 | // MASM-flavor octal integer: [0-7]+[oOqQ] |
408 | // MASM-flavor decimal integer: [0-9]+[tT] (if DefaultRadix < 16, [dDtT]) |
409 | // MASM-flavor hexadecimal integer: [0-9][0-9a-fA-F]*[hH] |
410 | if (LexMasmIntegers && isdigit(CurPtr[-1])) { |
411 | const char *FirstNonBinary = |
412 | (CurPtr[-1] != '0' && CurPtr[-1] != '1') ? CurPtr - 1 : nullptr; |
413 | const char *FirstNonDecimal = |
414 | (CurPtr[-1] < '0' || CurPtr[-1] > '9') ? CurPtr - 1 : nullptr; |
415 | const char *OldCurPtr = CurPtr; |
416 | while (isHexDigit(C: *CurPtr)) { |
417 | switch (*CurPtr) { |
418 | default: |
419 | if (!FirstNonDecimal) { |
420 | FirstNonDecimal = CurPtr; |
421 | } |
422 | [[fallthrough]]; |
423 | case '9': |
424 | case '8': |
425 | case '7': |
426 | case '6': |
427 | case '5': |
428 | case '4': |
429 | case '3': |
430 | case '2': |
431 | if (!FirstNonBinary) { |
432 | FirstNonBinary = CurPtr; |
433 | } |
434 | break; |
435 | case '1': |
436 | case '0': |
437 | break; |
438 | } |
439 | ++CurPtr; |
440 | } |
441 | if (*CurPtr == '.') { |
442 | // MASM float literals (other than hex floats) always contain a ".", and |
443 | // are always written in decimal. |
444 | ++CurPtr; |
445 | return LexFloatLiteral(); |
446 | } |
447 | |
448 | if (LexMasmHexFloats && (*CurPtr == 'r' || *CurPtr == 'R')) { |
449 | ++CurPtr; |
450 | return AsmToken(AsmToken::Real, StringRef(TokStart, CurPtr - TokStart)); |
451 | } |
452 | |
453 | unsigned Radix = 0; |
454 | if (*CurPtr == 'h' || *CurPtr == 'H') { |
455 | // hexadecimal number |
456 | ++CurPtr; |
457 | Radix = 16; |
458 | } else if (*CurPtr == 't' || *CurPtr == 'T') { |
459 | // decimal number |
460 | ++CurPtr; |
461 | Radix = 10; |
462 | } else if (*CurPtr == 'o' || *CurPtr == 'O' || *CurPtr == 'q' || |
463 | *CurPtr == 'Q') { |
464 | // octal number |
465 | ++CurPtr; |
466 | Radix = 8; |
467 | } else if (*CurPtr == 'y' || *CurPtr == 'Y') { |
468 | // binary number |
469 | ++CurPtr; |
470 | Radix = 2; |
471 | } else if (FirstNonDecimal && FirstNonDecimal + 1 == CurPtr && |
472 | DefaultRadix < 14 && |
473 | (*FirstNonDecimal == 'd' || *FirstNonDecimal == 'D')) { |
474 | Radix = 10; |
475 | } else if (FirstNonBinary && FirstNonBinary + 1 == CurPtr && |
476 | DefaultRadix < 12 && |
477 | (*FirstNonBinary == 'b' || *FirstNonBinary == 'B')) { |
478 | Radix = 2; |
479 | } |
480 | |
481 | if (Radix) { |
482 | StringRef Result(TokStart, CurPtr - TokStart); |
483 | APInt Value(128, 0, true); |
484 | |
485 | if (Result.drop_back().getAsInteger(Radix, Result&: Value)) |
486 | return ReturnError(Loc: TokStart, Msg: "invalid " + radixName(Radix) + " number" ); |
487 | |
488 | // MSVC accepts and ignores type suffices on integer literals. |
489 | SkipIgnoredIntegerSuffix(CurPtr); |
490 | |
491 | return intToken(Ref: Result, Value); |
492 | } |
493 | |
494 | // default-radix integers, or floating point numbers, fall through |
495 | CurPtr = OldCurPtr; |
496 | } |
497 | |
498 | // MASM default-radix integers: [0-9a-fA-F]+ |
499 | // (All other integer literals have a radix specifier.) |
500 | if (LexMasmIntegers && UseMasmDefaultRadix) { |
501 | CurPtr = findLastDigit(CurPtr, DefaultRadix: 16); |
502 | StringRef Result(TokStart, CurPtr - TokStart); |
503 | |
504 | APInt Value(128, 0, true); |
505 | if (Result.getAsInteger(Radix: DefaultRadix, Result&: Value)) { |
506 | return ReturnError(Loc: TokStart, |
507 | Msg: "invalid " + radixName(Radix: DefaultRadix) + " number" ); |
508 | } |
509 | |
510 | return intToken(Ref: Result, Value); |
511 | } |
512 | |
513 | // Motorola hex integers: $[0-9a-fA-F]+ |
514 | if (LexMotorolaIntegers && CurPtr[-1] == '$') { |
515 | const char *NumStart = CurPtr; |
516 | while (isHexDigit(C: CurPtr[0])) |
517 | ++CurPtr; |
518 | |
519 | APInt Result(128, 0); |
520 | if (StringRef(NumStart, CurPtr - NumStart).getAsInteger(Radix: 16, Result)) |
521 | return ReturnError(Loc: TokStart, Msg: "invalid hexadecimal number" ); |
522 | |
523 | return intToken(Ref: StringRef(TokStart, CurPtr - TokStart), Value&: Result); |
524 | } |
525 | |
526 | // Motorola binary integers: %[01]+ |
527 | if (LexMotorolaIntegers && CurPtr[-1] == '%') { |
528 | const char *NumStart = CurPtr; |
529 | while (*CurPtr == '0' || *CurPtr == '1') |
530 | ++CurPtr; |
531 | |
532 | APInt Result(128, 0); |
533 | if (StringRef(NumStart, CurPtr - NumStart).getAsInteger(Radix: 2, Result)) |
534 | return ReturnError(Loc: TokStart, Msg: "invalid binary number" ); |
535 | |
536 | return intToken(Ref: StringRef(TokStart, CurPtr - TokStart), Value&: Result); |
537 | } |
538 | |
539 | // Decimal integer: [1-9][0-9]* |
540 | // HLASM-flavour decimal integer: [0-9][0-9]* |
541 | // FIXME: Later on, support for fb for HLASM has to be added in |
542 | // as they probably would be needed for asm goto |
543 | if (LexHLASMIntegers || CurPtr[-1] != '0' || CurPtr[0] == '.') { |
544 | unsigned Radix = doHexLookAhead(CurPtr, DefaultRadix: 10, LexHex: LexMasmIntegers); |
545 | |
546 | if (!LexHLASMIntegers) { |
547 | bool IsHex = Radix == 16; |
548 | // Check for floating point literals. |
549 | if (!IsHex && (*CurPtr == '.' || *CurPtr == 'e' || *CurPtr == 'E')) { |
550 | if (*CurPtr == '.') |
551 | ++CurPtr; |
552 | return LexFloatLiteral(); |
553 | } |
554 | } |
555 | |
556 | StringRef Result(TokStart, CurPtr - TokStart); |
557 | |
558 | APInt Value(128, 0, true); |
559 | if (Result.getAsInteger(Radix, Result&: Value)) |
560 | return ReturnError(Loc: TokStart, Msg: "invalid " + radixName(Radix) + " number" ); |
561 | |
562 | if (!LexHLASMIntegers) |
563 | // The darwin/x86 (and x86-64) assembler accepts and ignores type |
564 | // suffices on integer literals. |
565 | SkipIgnoredIntegerSuffix(CurPtr); |
566 | |
567 | return intToken(Ref: Result, Value); |
568 | } |
569 | |
570 | if (!LexMasmIntegers && ((*CurPtr == 'b') || (*CurPtr == 'B'))) { |
571 | ++CurPtr; |
572 | // See if we actually have "0b" as part of something like "jmp 0b\n" |
573 | if (!isDigit(C: CurPtr[0])) { |
574 | --CurPtr; |
575 | StringRef Result(TokStart, CurPtr - TokStart); |
576 | return AsmToken(AsmToken::Integer, Result, 0); |
577 | } |
578 | const char *NumStart = CurPtr; |
579 | while (CurPtr[0] == '0' || CurPtr[0] == '1') |
580 | ++CurPtr; |
581 | |
582 | // Requires at least one binary digit. |
583 | if (CurPtr == NumStart) |
584 | return ReturnError(Loc: TokStart, Msg: "invalid binary number" ); |
585 | |
586 | StringRef Result(TokStart, CurPtr - TokStart); |
587 | |
588 | APInt Value(128, 0, true); |
589 | if (Result.substr(Start: 2).getAsInteger(Radix: 2, Result&: Value)) |
590 | return ReturnError(Loc: TokStart, Msg: "invalid binary number" ); |
591 | |
592 | // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL |
593 | // suffixes on integer literals. |
594 | SkipIgnoredIntegerSuffix(CurPtr); |
595 | |
596 | return intToken(Ref: Result, Value); |
597 | } |
598 | |
599 | if ((*CurPtr == 'x') || (*CurPtr == 'X')) { |
600 | ++CurPtr; |
601 | const char *NumStart = CurPtr; |
602 | while (isHexDigit(C: CurPtr[0])) |
603 | ++CurPtr; |
604 | |
605 | // "0x.0p0" is valid, and "0x0p0" (but not "0xp0" for example, which will be |
606 | // diagnosed by LexHexFloatLiteral). |
607 | if (CurPtr[0] == '.' || CurPtr[0] == 'p' || CurPtr[0] == 'P') |
608 | return LexHexFloatLiteral(NoIntDigits: NumStart == CurPtr); |
609 | |
610 | // Otherwise requires at least one hex digit. |
611 | if (CurPtr == NumStart) |
612 | return ReturnError(Loc: CurPtr-2, Msg: "invalid hexadecimal number" ); |
613 | |
614 | APInt Result(128, 0); |
615 | if (StringRef(TokStart, CurPtr - TokStart).getAsInteger(Radix: 0, Result)) |
616 | return ReturnError(Loc: TokStart, Msg: "invalid hexadecimal number" ); |
617 | |
618 | // Consume the optional [hH]. |
619 | if (LexMasmIntegers && (*CurPtr == 'h' || *CurPtr == 'H')) |
620 | ++CurPtr; |
621 | |
622 | // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL |
623 | // suffixes on integer literals. |
624 | SkipIgnoredIntegerSuffix(CurPtr); |
625 | |
626 | return intToken(Ref: StringRef(TokStart, CurPtr - TokStart), Value&: Result); |
627 | } |
628 | |
629 | // Either octal or hexadecimal. |
630 | APInt Value(128, 0, true); |
631 | unsigned Radix = doHexLookAhead(CurPtr, DefaultRadix: 8, LexHex: LexMasmIntegers); |
632 | StringRef Result(TokStart, CurPtr - TokStart); |
633 | if (Result.getAsInteger(Radix, Result&: Value)) |
634 | return ReturnError(Loc: TokStart, Msg: "invalid " + radixName(Radix) + " number" ); |
635 | |
636 | // Consume the [hH]. |
637 | if (Radix == 16) |
638 | ++CurPtr; |
639 | |
640 | // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL |
641 | // suffixes on integer literals. |
642 | SkipIgnoredIntegerSuffix(CurPtr); |
643 | |
644 | return intToken(Ref: Result, Value); |
645 | } |
646 | |
647 | /// LexSingleQuote: Integer: 'b' |
648 | AsmToken AsmLexer::LexSingleQuote() { |
649 | int CurChar = getNextChar(); |
650 | |
651 | if (LexHLASMStrings) |
652 | return ReturnError(Loc: TokStart, Msg: "invalid usage of character literals" ); |
653 | |
654 | if (LexMasmStrings) { |
655 | while (CurChar != EOF) { |
656 | if (CurChar != '\'') { |
657 | CurChar = getNextChar(); |
658 | } else if (peekNextChar() == '\'') { |
659 | // In MASM single-quote strings, doubled single-quotes mean an escaped |
660 | // single quote, so should be lexed in. |
661 | (void)getNextChar(); |
662 | CurChar = getNextChar(); |
663 | } else { |
664 | break; |
665 | } |
666 | } |
667 | if (CurChar == EOF) |
668 | return ReturnError(Loc: TokStart, Msg: "unterminated string constant" ); |
669 | return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart)); |
670 | } |
671 | |
672 | if (CurChar == '\\') |
673 | CurChar = getNextChar(); |
674 | |
675 | if (CurChar == EOF) |
676 | return ReturnError(Loc: TokStart, Msg: "unterminated single quote" ); |
677 | |
678 | CurChar = getNextChar(); |
679 | |
680 | if (CurChar != '\'') |
681 | return ReturnError(Loc: TokStart, Msg: "single quote way too long" ); |
682 | |
683 | // The idea here being that 'c' is basically just an integral |
684 | // constant. |
685 | StringRef Res = StringRef(TokStart,CurPtr - TokStart); |
686 | long long Value; |
687 | |
688 | if (Res.starts_with(Prefix: "\'\\" )) { |
689 | char theChar = Res[2]; |
690 | switch (theChar) { |
691 | default: Value = theChar; break; |
692 | case '\'': Value = '\''; break; |
693 | case 't': Value = '\t'; break; |
694 | case 'n': Value = '\n'; break; |
695 | case 'b': Value = '\b'; break; |
696 | case 'f': Value = '\f'; break; |
697 | case 'r': Value = '\r'; break; |
698 | } |
699 | } else |
700 | Value = TokStart[1]; |
701 | |
702 | return AsmToken(AsmToken::Integer, Res, Value); |
703 | } |
704 | |
705 | /// LexQuote: String: "..." |
706 | AsmToken AsmLexer::LexQuote() { |
707 | int CurChar = getNextChar(); |
708 | if (LexHLASMStrings) |
709 | return ReturnError(Loc: TokStart, Msg: "invalid usage of string literals" ); |
710 | |
711 | if (LexMasmStrings) { |
712 | while (CurChar != EOF) { |
713 | if (CurChar != '"') { |
714 | CurChar = getNextChar(); |
715 | } else if (peekNextChar() == '"') { |
716 | // In MASM double-quoted strings, doubled double-quotes mean an escaped |
717 | // double quote, so should be lexed in. |
718 | (void)getNextChar(); |
719 | CurChar = getNextChar(); |
720 | } else { |
721 | break; |
722 | } |
723 | } |
724 | if (CurChar == EOF) |
725 | return ReturnError(Loc: TokStart, Msg: "unterminated string constant" ); |
726 | return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart)); |
727 | } |
728 | |
729 | while (CurChar != '"') { |
730 | if (CurChar == '\\') { |
731 | // Allow \", etc. |
732 | CurChar = getNextChar(); |
733 | } |
734 | |
735 | if (CurChar == EOF) |
736 | return ReturnError(Loc: TokStart, Msg: "unterminated string constant" ); |
737 | |
738 | CurChar = getNextChar(); |
739 | } |
740 | |
741 | return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart)); |
742 | } |
743 | |
744 | StringRef AsmLexer::LexUntilEndOfStatement() { |
745 | TokStart = CurPtr; |
746 | |
747 | while (!isAtStartOfComment(Ptr: CurPtr) && // Start of line comment. |
748 | !isAtStatementSeparator(Ptr: CurPtr) && // End of statement marker. |
749 | *CurPtr != '\n' && *CurPtr != '\r' && CurPtr != CurBuf.end()) { |
750 | ++CurPtr; |
751 | } |
752 | return StringRef(TokStart, CurPtr-TokStart); |
753 | } |
754 | |
755 | StringRef AsmLexer::LexUntilEndOfLine() { |
756 | TokStart = CurPtr; |
757 | |
758 | while (*CurPtr != '\n' && *CurPtr != '\r' && CurPtr != CurBuf.end()) { |
759 | ++CurPtr; |
760 | } |
761 | return StringRef(TokStart, CurPtr-TokStart); |
762 | } |
763 | |
764 | size_t AsmLexer::peekTokens(MutableArrayRef<AsmToken> Buf, |
765 | bool ShouldSkipSpace) { |
766 | SaveAndRestore SavedTokenStart(TokStart); |
767 | SaveAndRestore SavedCurPtr(CurPtr); |
768 | SaveAndRestore SavedAtStartOfLine(IsAtStartOfLine); |
769 | SaveAndRestore SavedAtStartOfStatement(IsAtStartOfStatement); |
770 | SaveAndRestore SavedSkipSpace(SkipSpace, ShouldSkipSpace); |
771 | SaveAndRestore SavedIsPeeking(IsPeeking, true); |
772 | std::string SavedErr = getErr(); |
773 | SMLoc SavedErrLoc = getErrLoc(); |
774 | |
775 | size_t ReadCount; |
776 | for (ReadCount = 0; ReadCount < Buf.size(); ++ReadCount) { |
777 | AsmToken Token = LexToken(); |
778 | |
779 | Buf[ReadCount] = Token; |
780 | |
781 | if (Token.is(K: AsmToken::Eof)) { |
782 | ReadCount++; |
783 | break; |
784 | } |
785 | } |
786 | |
787 | SetError(errLoc: SavedErrLoc, err: SavedErr); |
788 | return ReadCount; |
789 | } |
790 | |
791 | bool AsmLexer::(const char *Ptr) { |
792 | if (MAI.isHLASM() && !IsAtStartOfStatement) |
793 | return false; |
794 | |
795 | StringRef = MAI.getCommentString(); |
796 | |
797 | if (CommentString.size() == 1) |
798 | return CommentString[0] == Ptr[0]; |
799 | |
800 | // Allow # preprocessor comments also be counted as comments for "##" cases |
801 | if (CommentString[1] == '#') |
802 | return CommentString[0] == Ptr[0]; |
803 | |
804 | return strncmp(s1: Ptr, s2: CommentString.data(), n: CommentString.size()) == 0; |
805 | } |
806 | |
807 | bool AsmLexer::isAtStatementSeparator(const char *Ptr) { |
808 | return strncmp(s1: Ptr, s2: MAI.getSeparatorString(), |
809 | n: strlen(s: MAI.getSeparatorString())) == 0; |
810 | } |
811 | |
812 | AsmToken AsmLexer::LexToken() { |
813 | TokStart = CurPtr; |
814 | // This always consumes at least one character. |
815 | int CurChar = getNextChar(); |
816 | |
817 | if (!IsPeeking && CurChar == '#' && IsAtStartOfStatement) { |
818 | // If this starts with a '#', this may be a cpp |
819 | // hash directive and otherwise a line comment. |
820 | AsmToken TokenBuf[2]; |
821 | MutableArrayRef<AsmToken> Buf(TokenBuf, 2); |
822 | size_t num = peekTokens(Buf, ShouldSkipSpace: true); |
823 | // There cannot be a space preceding this |
824 | if (IsAtStartOfLine && num == 2 && TokenBuf[0].is(K: AsmToken::Integer) && |
825 | TokenBuf[1].is(K: AsmToken::String)) { |
826 | CurPtr = TokStart; // reset curPtr; |
827 | StringRef s = LexUntilEndOfLine(); |
828 | UnLex(Token: TokenBuf[1]); |
829 | UnLex(Token: TokenBuf[0]); |
830 | return AsmToken(AsmToken::HashDirective, s); |
831 | } |
832 | |
833 | if (MAI.shouldAllowAdditionalComments()) |
834 | return LexLineComment(); |
835 | } |
836 | |
837 | if (isAtStartOfComment(Ptr: TokStart)) |
838 | return LexLineComment(); |
839 | |
840 | if (isAtStatementSeparator(Ptr: TokStart)) { |
841 | CurPtr += strlen(s: MAI.getSeparatorString()) - 1; |
842 | IsAtStartOfLine = true; |
843 | IsAtStartOfStatement = true; |
844 | return AsmToken(AsmToken::EndOfStatement, |
845 | StringRef(TokStart, strlen(s: MAI.getSeparatorString()))); |
846 | } |
847 | |
848 | // If we're missing a newline at EOF, make sure we still get an |
849 | // EndOfStatement token before the Eof token. |
850 | if (CurChar == EOF && !IsAtStartOfStatement && EndStatementAtEOF) { |
851 | IsAtStartOfLine = true; |
852 | IsAtStartOfStatement = true; |
853 | return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 0)); |
854 | } |
855 | IsAtStartOfLine = false; |
856 | bool OldIsAtStartOfStatement = IsAtStartOfStatement; |
857 | IsAtStartOfStatement = false; |
858 | switch (CurChar) { |
859 | default: |
860 | // Handle identifier: [a-zA-Z_.$@#?][a-zA-Z0-9_.$@#?]* |
861 | // Whether or not the lexer accepts '$', '@', '#' and '?' at the start of |
862 | // an identifier is target-dependent. These characters are handled in the |
863 | // respective switch cases. |
864 | if (isalpha(CurChar) || CurChar == '_' || CurChar == '.') |
865 | return LexIdentifier(); |
866 | |
867 | // Unknown character, emit an error. |
868 | return ReturnError(Loc: TokStart, Msg: "invalid character in input" ); |
869 | case EOF: |
870 | if (EndStatementAtEOF) { |
871 | IsAtStartOfLine = true; |
872 | IsAtStartOfStatement = true; |
873 | } |
874 | return AsmToken(AsmToken::Eof, StringRef(TokStart, 0)); |
875 | case 0: |
876 | case ' ': |
877 | case '\t': |
878 | IsAtStartOfStatement = OldIsAtStartOfStatement; |
879 | while (*CurPtr == ' ' || *CurPtr == '\t') |
880 | CurPtr++; |
881 | if (SkipSpace) |
882 | return LexToken(); // Ignore whitespace. |
883 | else |
884 | return AsmToken(AsmToken::Space, StringRef(TokStart, CurPtr - TokStart)); |
885 | case '\r': { |
886 | IsAtStartOfLine = true; |
887 | IsAtStartOfStatement = true; |
888 | // If this is a CR followed by LF, treat that as one token. |
889 | if (CurPtr != CurBuf.end() && *CurPtr == '\n') |
890 | ++CurPtr; |
891 | return AsmToken(AsmToken::EndOfStatement, |
892 | StringRef(TokStart, CurPtr - TokStart)); |
893 | } |
894 | case '\n': |
895 | IsAtStartOfLine = true; |
896 | IsAtStartOfStatement = true; |
897 | return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 1)); |
898 | case ':': return AsmToken(AsmToken::Colon, StringRef(TokStart, 1)); |
899 | case '+': return AsmToken(AsmToken::Plus, StringRef(TokStart, 1)); |
900 | case '~': return AsmToken(AsmToken::Tilde, StringRef(TokStart, 1)); |
901 | case '(': return AsmToken(AsmToken::LParen, StringRef(TokStart, 1)); |
902 | case ')': return AsmToken(AsmToken::RParen, StringRef(TokStart, 1)); |
903 | case '[': return AsmToken(AsmToken::LBrac, StringRef(TokStart, 1)); |
904 | case ']': return AsmToken(AsmToken::RBrac, StringRef(TokStart, 1)); |
905 | case '{': return AsmToken(AsmToken::LCurly, StringRef(TokStart, 1)); |
906 | case '}': return AsmToken(AsmToken::RCurly, StringRef(TokStart, 1)); |
907 | case '*': return AsmToken(AsmToken::Star, StringRef(TokStart, 1)); |
908 | case ',': return AsmToken(AsmToken::Comma, StringRef(TokStart, 1)); |
909 | case '$': { |
910 | if (LexMotorolaIntegers && isHexDigit(C: *CurPtr)) |
911 | return LexDigit(); |
912 | if (MAI.doesAllowDollarAtStartOfIdentifier()) |
913 | return LexIdentifier(); |
914 | return AsmToken(AsmToken::Dollar, StringRef(TokStart, 1)); |
915 | } |
916 | case '@': |
917 | if (MAI.doesAllowAtAtStartOfIdentifier()) |
918 | return LexIdentifier(); |
919 | return AsmToken(AsmToken::At, StringRef(TokStart, 1)); |
920 | case '#': |
921 | if (MAI.isHLASM()) |
922 | return LexIdentifier(); |
923 | return AsmToken(AsmToken::Hash, StringRef(TokStart, 1)); |
924 | case '?': |
925 | if (MAI.doesAllowQuestionAtStartOfIdentifier()) |
926 | return LexIdentifier(); |
927 | return AsmToken(AsmToken::Question, StringRef(TokStart, 1)); |
928 | case '\\': return AsmToken(AsmToken::BackSlash, StringRef(TokStart, 1)); |
929 | case '=': |
930 | if (*CurPtr == '=') { |
931 | ++CurPtr; |
932 | return AsmToken(AsmToken::EqualEqual, StringRef(TokStart, 2)); |
933 | } |
934 | return AsmToken(AsmToken::Equal, StringRef(TokStart, 1)); |
935 | case '-': |
936 | if (*CurPtr == '>') { |
937 | ++CurPtr; |
938 | return AsmToken(AsmToken::MinusGreater, StringRef(TokStart, 2)); |
939 | } |
940 | return AsmToken(AsmToken::Minus, StringRef(TokStart, 1)); |
941 | case '|': |
942 | if (*CurPtr == '|') { |
943 | ++CurPtr; |
944 | return AsmToken(AsmToken::PipePipe, StringRef(TokStart, 2)); |
945 | } |
946 | return AsmToken(AsmToken::Pipe, StringRef(TokStart, 1)); |
947 | case '^': return AsmToken(AsmToken::Caret, StringRef(TokStart, 1)); |
948 | case '&': |
949 | if (*CurPtr == '&') { |
950 | ++CurPtr; |
951 | return AsmToken(AsmToken::AmpAmp, StringRef(TokStart, 2)); |
952 | } |
953 | return AsmToken(AsmToken::Amp, StringRef(TokStart, 1)); |
954 | case '!': |
955 | if (*CurPtr == '=') { |
956 | ++CurPtr; |
957 | return AsmToken(AsmToken::ExclaimEqual, StringRef(TokStart, 2)); |
958 | } |
959 | return AsmToken(AsmToken::Exclaim, StringRef(TokStart, 1)); |
960 | case '%': |
961 | if (LexMotorolaIntegers && (*CurPtr == '0' || *CurPtr == '1')) { |
962 | return LexDigit(); |
963 | } |
964 | return AsmToken(AsmToken::Percent, StringRef(TokStart, 1)); |
965 | case '/': |
966 | IsAtStartOfStatement = OldIsAtStartOfStatement; |
967 | return LexSlash(); |
968 | case '\'': return LexSingleQuote(); |
969 | case '"': return LexQuote(); |
970 | case '0': case '1': case '2': case '3': case '4': |
971 | case '5': case '6': case '7': case '8': case '9': |
972 | return LexDigit(); |
973 | case '<': |
974 | switch (*CurPtr) { |
975 | case '<': |
976 | ++CurPtr; |
977 | return AsmToken(AsmToken::LessLess, StringRef(TokStart, 2)); |
978 | case '=': |
979 | ++CurPtr; |
980 | return AsmToken(AsmToken::LessEqual, StringRef(TokStart, 2)); |
981 | case '>': |
982 | ++CurPtr; |
983 | return AsmToken(AsmToken::LessGreater, StringRef(TokStart, 2)); |
984 | default: |
985 | return AsmToken(AsmToken::Less, StringRef(TokStart, 1)); |
986 | } |
987 | case '>': |
988 | switch (*CurPtr) { |
989 | case '>': |
990 | ++CurPtr; |
991 | return AsmToken(AsmToken::GreaterGreater, StringRef(TokStart, 2)); |
992 | case '=': |
993 | ++CurPtr; |
994 | return AsmToken(AsmToken::GreaterEqual, StringRef(TokStart, 2)); |
995 | default: |
996 | return AsmToken(AsmToken::Greater, StringRef(TokStart, 1)); |
997 | } |
998 | |
999 | // TODO: Quoted identifiers (objc methods etc) |
1000 | // local labels: [0-9][:] |
1001 | // Forward/backward labels: [0-9][fb] |
1002 | // Integers, fp constants, character constants. |
1003 | } |
1004 | } |
1005 | |