LLVM 22.0.0git
AsmLexer.cpp
Go to the documentation of this file.
1//===- AsmLexer.cpp - Lexer for Assembly Files ----------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This class implements the lexer for assembly files.
10//
11//===----------------------------------------------------------------------===//
12
14#include "llvm/ADT/APInt.h"
15#include "llvm/ADT/ArrayRef.h"
17#include "llvm/ADT/StringRef.h"
18#include "llvm/MC/MCAsmInfo.h"
20#include "llvm/Support/SMLoc.h"
23#include <cassert>
24#include <cctype>
25#include <cstdio>
26#include <cstring>
27#include <string>
28
29using namespace llvm;
30
31SMLoc AsmToken::getLoc() const { return SMLoc::getFromPointer(Str.data()); }
32
// NOTE(review): the signature line for this body (embedded line 33) is absent
// from this listing. The visible statement returns the location one past the
// last character of Str, i.e. the end of the token's text.
34 return SMLoc::getFromPointer(Str.data() + Str.size());
35}
36
38
/// Debug printer: writes a label for this token's Kind to \p OS. A few kinds
/// also print getString() after the label; every kind is followed by a
/// parenthesised, double-quoted trailer.
///
/// NOTE(review): several lines of this function are missing from this listing
/// (gaps in the embedded numbering); each gap is flagged below.
39void AsmToken::dump(raw_ostream &OS) const {
40 switch (Kind) {
41 case AsmToken::Error:
42 OS << "error";
43 break;
// NOTE(review): embedded line 44 (the case label for the "identifier: " arm)
// is missing from this listing.
45 OS << "identifier: " << getString();
46 break;
// NOTE(review): embedded line 47 (the case label for the "int: " arm) is
// missing from this listing.
48 OS << "int: " << getString();
49 break;
50 case AsmToken::Real:
51 OS << "real: " << getString();
52 break;
// NOTE(review): embedded line 53 (the case label for the "string: " arm) is
// missing from this listing.
54 OS << "string: " << getString();
55 break;
56
// The remaining kinds print only their enumerator name.
57 // clang-format off
58 case AsmToken::Amp: OS << "Amp"; break;
59 case AsmToken::AmpAmp: OS << "AmpAmp"; break;
60 case AsmToken::At: OS << "At"; break;
61 case AsmToken::BackSlash: OS << "BackSlash"; break;
62 case AsmToken::BigNum: OS << "BigNum"; break;
63 case AsmToken::Caret: OS << "Caret"; break;
64 case AsmToken::Colon: OS << "Colon"; break;
65 case AsmToken::Comma: OS << "Comma"; break;
66 case AsmToken::Comment: OS << "Comment"; break;
67 case AsmToken::Dollar: OS << "Dollar"; break;
68 case AsmToken::Dot: OS << "Dot"; break;
69 case AsmToken::EndOfStatement: OS << "EndOfStatement"; break;
70 case AsmToken::Eof: OS << "Eof"; break;
71 case AsmToken::Equal: OS << "Equal"; break;
72 case AsmToken::EqualEqual: OS << "EqualEqual"; break;
73 case AsmToken::Exclaim: OS << "Exclaim"; break;
74 case AsmToken::ExclaimEqual: OS << "ExclaimEqual"; break;
75 case AsmToken::Greater: OS << "Greater"; break;
76 case AsmToken::GreaterEqual: OS << "GreaterEqual"; break;
77 case AsmToken::GreaterGreater: OS << "GreaterGreater"; break;
78 case AsmToken::Hash: OS << "Hash"; break;
79 case AsmToken::HashDirective: OS << "HashDirective"; break;
80 case AsmToken::LBrac: OS << "LBrac"; break;
81 case AsmToken::LCurly: OS << "LCurly"; break;
82 case AsmToken::LParen: OS << "LParen"; break;
83 case AsmToken::Less: OS << "Less"; break;
84 case AsmToken::LessEqual: OS << "LessEqual"; break;
85 case AsmToken::LessGreater: OS << "LessGreater"; break;
86 case AsmToken::LessLess: OS << "LessLess"; break;
87 case AsmToken::Minus: OS << "Minus"; break;
88 case AsmToken::MinusGreater: OS << "MinusGreater"; break;
89 case AsmToken::Percent: OS << "Percent"; break;
90 case AsmToken::Pipe: OS << "Pipe"; break;
91 case AsmToken::PipePipe: OS << "PipePipe"; break;
92 case AsmToken::Plus: OS << "Plus"; break;
93 case AsmToken::Question: OS << "Question"; break;
94 case AsmToken::RBrac: OS << "RBrac"; break;
95 case AsmToken::RCurly: OS << "RCurly"; break;
96 case AsmToken::RParen: OS << "RParen"; break;
97 case AsmToken::Slash: OS << "Slash"; break;
98 case AsmToken::Space: OS << "Space"; break;
99 case AsmToken::Star: OS << "Star"; break;
100 case AsmToken::Tilde: OS << "Tilde"; break;
101 // clang-format on
102 }
103
104 // Print the token string.
105 OS << " (\"";
// NOTE(review): embedded line 106 (the statement that writes the token string
// between the quotes) is missing from this listing.
107 OS << "\")";
108}
109
110AsmLexer::AsmLexer(const MCAsmInfo &MAI) : MAI(MAI) {
111 // For COFF targets, this is true, while for ELF targets, it should be false.
112 // Currently, @specifier parsing depends on '@' being included in the token.
113 AllowAtInIdentifier = !StringRef(MAI.getCommentString()).starts_with("@") &&
114 MAI.useAtForSpecifier();
115 LexMotorolaIntegers = MAI.shouldUseMotorolaIntegers();
116
117 CurTok.emplace_back(AsmToken::Space, StringRef());
118}
119
120void AsmLexer::setBuffer(StringRef Buf, const char *ptr,
121 bool EndStatementAtEOF) {
122 CurBuf = Buf;
123
124 if (ptr)
125 CurPtr = ptr;
126 else
127 CurPtr = CurBuf.begin();
128
129 TokStart = nullptr;
130 this->EndStatementAtEOF = EndStatementAtEOF;
131}
132
133/// ReturnError - Set the error to the specified string at the specified
134/// location. This is defined to always return AsmToken::Error.
135AsmToken AsmLexer::ReturnError(const char *Loc, const std::string &Msg) {
136 SetError(SMLoc::getFromPointer(Loc), Msg);
137
138 return AsmToken(AsmToken::Error, StringRef(Loc, CurPtr - Loc));
139}
140
141int AsmLexer::getNextChar() {
142 if (CurPtr == CurBuf.end())
143 return EOF;
144 return (unsigned char)*CurPtr++;
145}
146
147int AsmLexer::peekNextChar() {
148 if (CurPtr == CurBuf.end())
149 return EOF;
150 return (unsigned char)*CurPtr;
151}
152
153/// The leading integral digit sequence and dot should have already been
154/// consumed, some or all of the fractional digit sequence *can* have been
155/// consumed.
156AsmToken AsmLexer::LexFloatLiteral() {
157 // Skip the fractional digit sequence.
158 while (isDigit(*CurPtr))
159 ++CurPtr;
160
161 if (*CurPtr == '-' || *CurPtr == '+')
162 return ReturnError(CurPtr, "invalid sign in float literal");
163
164 // Check for exponent
165 if ((*CurPtr == 'e' || *CurPtr == 'E')) {
166 ++CurPtr;
167
168 if (*CurPtr == '-' || *CurPtr == '+')
169 ++CurPtr;
170
171 while (isDigit(*CurPtr))
172 ++CurPtr;
173 }
174
175 return AsmToken(AsmToken::Real,
176 StringRef(TokStart, CurPtr - TokStart));
177}
178
179/// LexHexFloatLiteral matches essentially (.[0-9a-fA-F]*)?[pP][+-]?[0-9a-fA-F]+
180/// while making sure there are enough actual digits around for the constant to
181/// be valid.
182///
183/// The leading "0x[0-9a-fA-F]*" (i.e. integer part) has already been consumed
184/// before we get here.
185AsmToken AsmLexer::LexHexFloatLiteral(bool NoIntDigits) {
186 assert((*CurPtr == 'p' || *CurPtr == 'P' || *CurPtr == '.') &&
187 "unexpected parse state in floating hex");
188 bool NoFracDigits = true;
189
190 // Skip the fractional part if there is one
191 if (*CurPtr == '.') {
192 ++CurPtr;
193
194 const char *FracStart = CurPtr;
195 while (isHexDigit(*CurPtr))
196 ++CurPtr;
197
198 NoFracDigits = CurPtr == FracStart;
199 }
200
201 if (NoIntDigits && NoFracDigits)
202 return ReturnError(TokStart, "invalid hexadecimal floating-point constant: "
203 "expected at least one significand digit");
204
205 // Make sure we do have some kind of proper exponent part
206 if (*CurPtr != 'p' && *CurPtr != 'P')
207 return ReturnError(TokStart, "invalid hexadecimal floating-point constant: "
208 "expected exponent part 'p'");
209 ++CurPtr;
210
211 if (*CurPtr == '+' || *CurPtr == '-')
212 ++CurPtr;
213
214 // N.b. exponent digits are *not* hex
215 const char *ExpStart = CurPtr;
216 while (isDigit(*CurPtr))
217 ++CurPtr;
218
219 if (CurPtr == ExpStart)
220 return ReturnError(TokStart, "invalid hexadecimal floating-point constant: "
221 "expected at least one exponent digit");
222
223 return AsmToken(AsmToken::Real, StringRef(TokStart, CurPtr - TokStart));
224}
225
226/// LexIdentifier: [a-zA-Z_$.@?][a-zA-Z0-9_$.@#?]*
227static bool isIdentifierChar(char C, bool AllowAt, bool AllowHash) {
228 return isAlnum(C) || C == '_' || C == '$' || C == '.' || C == '?' ||
229 (AllowAt && C == '@') || (AllowHash && C == '#');
230}
231
232AsmToken AsmLexer::LexIdentifier() {
233 // Check for floating point literals.
234 if (CurPtr[-1] == '.' && isDigit(*CurPtr)) {
235 // Disambiguate a .1243foo identifier from a floating literal.
236 while (isDigit(*CurPtr))
237 ++CurPtr;
238
239 if (!isIdentifierChar(*CurPtr, AllowAtInIdentifier,
240 AllowHashInIdentifier) ||
241 *CurPtr == 'e' || *CurPtr == 'E')
242 return LexFloatLiteral();
243 }
244
245 while (isIdentifierChar(*CurPtr, AllowAtInIdentifier, AllowHashInIdentifier))
246 ++CurPtr;
247
248 // Handle . as a special case.
249 if (CurPtr == TokStart+1 && TokStart[0] == '.')
250 return AsmToken(AsmToken::Dot, StringRef(TokStart, 1));
251
252 return AsmToken(AsmToken::Identifier, StringRef(TokStart, CurPtr - TokStart));
253}
254
255/// LexSlash: Slash: /
256/// C-Style Comment: /* ... */
257/// C-style Comment: // ...
258AsmToken AsmLexer::LexSlash() {
259 if (!MAI.shouldAllowAdditionalComments()) {
260 IsAtStartOfStatement = false;
261 return AsmToken(AsmToken::Slash, StringRef(TokStart, 1));
262 }
263
264 switch (*CurPtr) {
265 case '*':
266 IsAtStartOfStatement = false;
267 break; // C style comment.
268 case '/':
269 ++CurPtr;
270 return LexLineComment();
271 default:
272 IsAtStartOfStatement = false;
273 return AsmToken(AsmToken::Slash, StringRef(TokStart, 1));
274 }
275
276 // C Style comment.
277 ++CurPtr; // skip the star.
278 const char *CommentTextStart = CurPtr;
279 while (CurPtr != CurBuf.end()) {
280 switch (*CurPtr++) {
281 case '*':
282 // End of the comment?
283 if (*CurPtr != '/')
284 break;
285 // If we have a CommentConsumer, notify it about the comment.
286 if (CommentConsumer) {
287 CommentConsumer->HandleComment(
288 SMLoc::getFromPointer(CommentTextStart),
289 StringRef(CommentTextStart, CurPtr - 1 - CommentTextStart));
290 }
291 ++CurPtr; // End the */.
292 return AsmToken(AsmToken::Comment,
293 StringRef(TokStart, CurPtr - TokStart));
294 }
295 }
296 return ReturnError(TokStart, "unterminated comment");
297}
298
299/// LexLineComment: Comment: #[^\n]*
300/// : //[^\n]*
301AsmToken AsmLexer::LexLineComment() {
302 // Mark This as an end of statement with a body of the
303 // comment. While it would be nicer to leave this two tokens,
304 // backwards compatability with TargetParsers makes keeping this in this form
305 // better.
306 const char *CommentTextStart = CurPtr;
307 int CurChar = getNextChar();
308 while (CurChar != '\n' && CurChar != '\r' && CurChar != EOF)
309 CurChar = getNextChar();
310 const char *NewlinePtr = CurPtr;
311 if (CurChar == '\r' && CurPtr != CurBuf.end() && *CurPtr == '\n')
312 ++CurPtr;
313
314 // If we have a CommentConsumer, notify it about the comment.
315 if (CommentConsumer) {
316 CommentConsumer->HandleComment(
317 SMLoc::getFromPointer(CommentTextStart),
318 StringRef(CommentTextStart, NewlinePtr - 1 - CommentTextStart));
319 }
320
321 IsAtStartOfLine = true;
322 // This is a whole line comment. leave newline
323 if (IsAtStartOfStatement)
324 return AsmToken(AsmToken::EndOfStatement,
325 StringRef(TokStart, CurPtr - TokStart));
326 IsAtStartOfStatement = true;
327
328 return AsmToken(AsmToken::EndOfStatement,
329 StringRef(TokStart, CurPtr - 1 - TokStart));
330}
331
/// Advances \p CurPtr past an optional integer-literal type suffix: an
/// optional 'U' followed by up to two 'L's, all case-insensitive (U, L, UL,
/// LL, ULL).
static void SkipIgnoredIntegerSuffix(const char *&CurPtr) {
  if (*CurPtr == 'U' || *CurPtr == 'u')
    ++CurPtr;

  for (int Remaining = 2; Remaining > 0; --Remaining) {
    if (*CurPtr != 'L' && *CurPtr != 'l')
      break;
    ++CurPtr;
  }
}
341
342// Look ahead to search for first non-hex digit, if it's [hH], then we treat the
343// integer as a hexadecimal, possibly with leading zeroes.
344static unsigned doHexLookAhead(const char *&CurPtr, unsigned DefaultRadix,
345 bool LexHex) {
346 const char *FirstNonDec = nullptr;
347 const char *LookAhead = CurPtr;
348 while (true) {
349 if (isDigit(*LookAhead)) {
350 ++LookAhead;
351 } else {
352 if (!FirstNonDec)
353 FirstNonDec = LookAhead;
354
355 // Keep going if we are looking for a 'h' suffix.
356 if (LexHex && isHexDigit(*LookAhead))
357 ++LookAhead;
358 else
359 break;
360 }
361 }
362 bool isHex = LexHex && (*LookAhead == 'h' || *LookAhead == 'H');
363 CurPtr = isHex || !FirstNonDec ? LookAhead : FirstNonDec;
364 if (isHex)
365 return 16;
366 return DefaultRadix;
367}
368
369static const char *findLastDigit(const char *CurPtr, unsigned DefaultRadix) {
370 while (hexDigitValue(*CurPtr) < DefaultRadix) {
371 ++CurPtr;
372 }
373 return CurPtr;
374}
375
// NOTE(review): only two lines of this function survive in this listing; the
// signature (embedded line 376) and the statements on embedded lines 378-379
// are missing. What is visible is a check of whether Value fits in 64 bits
// (Value.isIntN(64)) — presumably selecting between two token kinds; confirm
// against the upstream source before relying on this.
377 if (Value.isIntN(64))
380}
381
/// Returns a human-readable name for \p Radix, used when building diagnostic
/// messages: "binary", "octal", "decimal" or "hexadecimal" for 2, 8, 10 and
/// 16, and "base-<N>" for anything else.
static std::string radixName(unsigned Radix) {
  if (Radix == 2)
    return "binary";
  if (Radix == 8)
    return "octal";
  if (Radix == 10)
    return "decimal";
  if (Radix == 16)
    return "hexadecimal";

  std::string Name = "base-";
  Name += std::to_string(Radix);
  return Name;
}
396
/// Lexes a numeric literal. The first character (a digit, or '$' / '%' for
/// Motorola-style literals) has already been consumed, so CurPtr[-1] is that
/// character. The flavours are tried in this order: MASM radix-suffixed
/// integers and floats, MASM default-radix integers, Motorola hex/binary,
/// decimal (or HLASM) integers and decimal floats, "0b" binary, "0x" hex
/// integers and hex floats, and finally octal / h-suffixed hex.
///
/// NOTE(review): five statements are missing from this listing (gaps in the
/// embedded numbering at 488, 564, 593, 623 and 641); each is flagged below.
397/// LexDigit: First character is [0-9].
398/// Local Label: [0-9][:]
399/// Forward/Backward Label: [0-9][fb]
400/// Binary integer: 0b[01]+
401/// Octal integer: 0[0-7]+
402/// Hex integer: 0x[0-9a-fA-F]+ or [0x]?[0-9][0-9a-fA-F]*[hH]
403/// Decimal integer: [1-9][0-9]*
404AsmToken AsmLexer::LexDigit() {
405 // MASM-flavor binary integer: [01]+[yY] (if DefaultRadix < 16, [bByY])
406 // MASM-flavor octal integer: [0-7]+[oOqQ]
407 // MASM-flavor decimal integer: [0-9]+[tT] (if DefaultRadix < 16, [dDtT])
408 // MASM-flavor hexadecimal integer: [0-9][0-9a-fA-F]*[hH]
409 if (LexMasmIntegers && isdigit(CurPtr[-1])) {
// Track where the literal first stops being valid binary / decimal, so that
// a trailing 'b' or 'd' can later be told apart from a hex digit.
410 const char *FirstNonBinary =
411 (CurPtr[-1] != '0' && CurPtr[-1] != '1') ? CurPtr - 1 : nullptr;
412 const char *FirstNonDecimal =
413 (CurPtr[-1] < '0' || CurPtr[-1] > '9') ? CurPtr - 1 : nullptr;
// Remembered so the scan can be undone if no radix suffix is found.
414 const char *OldCurPtr = CurPtr;
415 while (isHexDigit(*CurPtr)) {
416 switch (*CurPtr) {
417 default:
418 if (!FirstNonDecimal) {
419 FirstNonDecimal = CurPtr;
420 }
421 [[fallthrough]];
422 case '9':
423 case '8':
424 case '7':
425 case '6':
426 case '5':
427 case '4':
428 case '3':
429 case '2':
430 if (!FirstNonBinary) {
431 FirstNonBinary = CurPtr;
432 }
433 break;
434 case '1':
435 case '0':
436 break;
437 }
438 ++CurPtr;
439 }
440 if (*CurPtr == '.') {
441 // MASM float literals (other than hex floats) always contain a ".", and
442 // are always written in decimal.
443 ++CurPtr;
444 return LexFloatLiteral();
445 }
446
// A hex-digit run terminated by [rR] is returned as a Real token when MASM
// hex floats are enabled.
447 if (LexMasmHexFloats && (*CurPtr == 'r' || *CurPtr == 'R')) {
448 ++CurPtr;
449 return AsmToken(AsmToken::Real, StringRef(TokStart, CurPtr - TokStart));
450 }
451
452 unsigned Radix = 0;
453 if (*CurPtr == 'h' || *CurPtr == 'H') {
454 // hexadecimal number
455 ++CurPtr;
456 Radix = 16;
457 } else if (*CurPtr == 't' || *CurPtr == 'T') {
458 // decimal number
459 ++CurPtr;
460 Radix = 10;
461 } else if (*CurPtr == 'o' || *CurPtr == 'O' || *CurPtr == 'q' ||
462 *CurPtr == 'Q') {
463 // octal number
464 ++CurPtr;
465 Radix = 8;
466 } else if (*CurPtr == 'y' || *CurPtr == 'Y') {
467 // binary number
468 ++CurPtr;
469 Radix = 2;
// The 'd' / 'b' suffixes were already consumed by the hex-digit scan above;
// they count as radix suffixes only when they are the last character scanned
// and the default radix is small enough that they are not digits themselves.
470 } else if (FirstNonDecimal && FirstNonDecimal + 1 == CurPtr &&
471 DefaultRadix < 14 &&
472 (*FirstNonDecimal == 'd' || *FirstNonDecimal == 'D')) {
473 Radix = 10;
474 } else if (FirstNonBinary && FirstNonBinary + 1 == CurPtr &&
475 DefaultRadix < 12 &&
476 (*FirstNonBinary == 'b' || *FirstNonBinary == 'B')) {
477 Radix = 2;
478 }
479
480 if (Radix) {
481 StringRef Result(TokStart, CurPtr - TokStart);
482 APInt Value(128, 0, true);
483
// drop_back() strips the radix suffix character before parsing.
484 if (Result.drop_back().getAsInteger(Radix, Value))
485 return ReturnError(TokStart, "invalid " + radixName(Radix) + " number");
486
487 // MSVC accepts and ignores type suffixes on integer literals.
// NOTE(review): embedded line 488 is missing from this listing — presumably a
// SkipIgnoredIntegerSuffix(CurPtr) call; confirm against upstream.
489
490 return intToken(Result, Value);
491 }
492
493 // default-radix integers, or floating point numbers, fall through
494 CurPtr = OldCurPtr;
495 }
496
497 // MASM default-radix integers: [0-9a-fA-F]+
498 // (All other integer literals have a radix specifier.)
499 if (LexMasmIntegers && UseMasmDefaultRadix) {
500 CurPtr = findLastDigit(CurPtr, 16);
501 StringRef Result(TokStart, CurPtr - TokStart);
502
503 APInt Value(128, 0, true);
504 if (Result.getAsInteger(DefaultRadix, Value)) {
505 return ReturnError(TokStart,
506 "invalid " + radixName(DefaultRadix) + " number");
507 }
508
509 return intToken(Result, Value);
510 }
511
512 // Motorola hex integers: $[0-9a-fA-F]+
513 if (LexMotorolaIntegers && CurPtr[-1] == '$') {
514 const char *NumStart = CurPtr;
515 while (isHexDigit(CurPtr[0]))
516 ++CurPtr;
517
518 APInt Result(128, 0);
519 if (StringRef(NumStart, CurPtr - NumStart).getAsInteger(16, Result))
520 return ReturnError(TokStart, "invalid hexadecimal number");
521
522 return intToken(StringRef(TokStart, CurPtr - TokStart), Result);
523 }
524
525 // Motorola binary integers: %[01]+
526 if (LexMotorolaIntegers && CurPtr[-1] == '%') {
527 const char *NumStart = CurPtr;
528 while (*CurPtr == '0' || *CurPtr == '1')
529 ++CurPtr;
530
531 APInt Result(128, 0);
532 if (StringRef(NumStart, CurPtr - NumStart).getAsInteger(2, Result))
533 return ReturnError(TokStart, "invalid binary number");
534
535 return intToken(StringRef(TokStart, CurPtr - TokStart), Result);
536 }
537
538 // Decimal integer: [1-9][0-9]*
539 // HLASM-flavour decimal integer: [0-9][0-9]*
540 // FIXME: Later on, support for fb for HLASM has to be added in
541 // as they probably would be needed for asm goto
542 if (LexHLASMIntegers || CurPtr[-1] != '0' || CurPtr[0] == '.') {
543 unsigned Radix = doHexLookAhead(CurPtr, 10, LexMasmIntegers);
544
545 if (!LexHLASMIntegers) {
546 bool IsHex = Radix == 16;
547 // Check for floating point literals.
548 if (!IsHex && (*CurPtr == '.' || *CurPtr == 'e' || *CurPtr == 'E')) {
549 if (*CurPtr == '.')
550 ++CurPtr;
551 return LexFloatLiteral();
552 }
553 }
554
555 StringRef Result(TokStart, CurPtr - TokStart);
556
557 APInt Value(128, 0, true);
558 if (Result.getAsInteger(Radix, Value))
559 return ReturnError(TokStart, "invalid " + radixName(Radix) + " number");
560
561 if (!LexHLASMIntegers)
562 // The darwin/x86 (and x86-64) assembler accepts and ignores type
563 // suffixes on integer literals.
// NOTE(review): embedded line 564 — the statement guarded by the `if` above —
// is missing from this listing; as shown, the `if` would instead govern the
// `return` below. Presumably a SkipIgnoredIntegerSuffix(CurPtr) call; confirm
// against upstream.
565
566 return intToken(Result, Value);
567 }
568
569 if (!LexMasmIntegers && ((*CurPtr == 'b') || (*CurPtr == 'B'))) {
570 ++CurPtr;
571 // See if we actually have "0b" as part of something like "jmp 0b\n"
572 if (!isDigit(CurPtr[0])) {
573 --CurPtr;
574 StringRef Result(TokStart, CurPtr - TokStart);
575 return AsmToken(AsmToken::Integer, Result, 0);
576 }
577 const char *NumStart = CurPtr;
578 while (CurPtr[0] == '0' || CurPtr[0] == '1')
579 ++CurPtr;
580
581 // Requires at least one binary digit.
582 if (CurPtr == NumStart)
583 return ReturnError(TokStart, "invalid binary number");
584
585 StringRef Result(TokStart, CurPtr - TokStart);
586
587 APInt Value(128, 0, true);
// substr(2) skips the "0b" prefix.
588 if (Result.substr(2).getAsInteger(2, Value))
589 return ReturnError(TokStart, "invalid binary number");
590
591 // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
592 // suffixes on integer literals.
// NOTE(review): embedded line 593 is missing from this listing — presumably a
// SkipIgnoredIntegerSuffix(CurPtr) call; confirm against upstream.
594
595 return intToken(Result, Value);
596 }
597
598 if ((*CurPtr == 'x') || (*CurPtr == 'X')) {
599 ++CurPtr;
600 const char *NumStart = CurPtr;
601 while (isHexDigit(CurPtr[0]))
602 ++CurPtr;
603
604 // "0x.0p0" is valid, and "0x0p0" (but not "0xp0" for example, which will be
605 // diagnosed by LexHexFloatLiteral).
606 if (CurPtr[0] == '.' || CurPtr[0] == 'p' || CurPtr[0] == 'P')
607 return LexHexFloatLiteral(NumStart == CurPtr);
608
609 // Otherwise requires at least one hex digit.
610 if (CurPtr == NumStart)
611 return ReturnError(CurPtr-2, "invalid hexadecimal number");
612
613 APInt Result(128, 0);
// Radix 0 makes getAsInteger auto-detect the radix from the "0x" prefix,
// which is still part of the string being parsed here.
614 if (StringRef(TokStart, CurPtr - TokStart).getAsInteger(0, Result))
615 return ReturnError(TokStart, "invalid hexadecimal number");
616
617 // Consume the optional [hH].
618 if (LexMasmIntegers && (*CurPtr == 'h' || *CurPtr == 'H'))
619 ++CurPtr;
620
621 // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
622 // suffixes on integer literals.
// NOTE(review): embedded line 623 is missing from this listing — presumably a
// SkipIgnoredIntegerSuffix(CurPtr) call; confirm against upstream.
624
625 return intToken(StringRef(TokStart, CurPtr - TokStart), Result);
626 }
627
628 // Either octal or hexadecimal.
629 APInt Value(128, 0, true);
630 unsigned Radix = doHexLookAhead(CurPtr, 8, LexMasmIntegers);
631 StringRef Result(TokStart, CurPtr - TokStart);
632 if (Result.getAsInteger(Radix, Value))
633 return ReturnError(TokStart, "invalid " + radixName(Radix) + " number");
634
635 // Consume the [hH].
636 if (Radix == 16)
637 ++CurPtr;
638
639 // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
640 // suffixes on integer literals.
// NOTE(review): embedded line 641 is missing from this listing — presumably a
// SkipIgnoredIntegerSuffix(CurPtr) call; confirm against upstream.
642
643 return intToken(Result, Value);
644}
645
646/// LexSingleQuote: Integer: 'b'
647AsmToken AsmLexer::LexSingleQuote() {
648 int CurChar = getNextChar();
649
650 if (LexHLASMStrings)
651 return ReturnError(TokStart, "invalid usage of character literals");
652
653 if (LexMasmStrings) {
654 while (CurChar != EOF) {
655 if (CurChar != '\'') {
656 CurChar = getNextChar();
657 } else if (peekNextChar() == '\'') {
658 // In MASM single-quote strings, doubled single-quotes mean an escaped
659 // single quote, so should be lexed in.
660 (void)getNextChar();
661 CurChar = getNextChar();
662 } else {
663 break;
664 }
665 }
666 if (CurChar == EOF)
667 return ReturnError(TokStart, "unterminated string constant");
668 return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart));
669 }
670
671 if (CurChar == '\\')
672 CurChar = getNextChar();
673
674 if (CurChar == EOF)
675 return ReturnError(TokStart, "unterminated single quote");
676
677 CurChar = getNextChar();
678
679 if (CurChar != '\'')
680 return ReturnError(TokStart, "single quote way too long");
681
682 // The idea here being that 'c' is basically just an integral
683 // constant.
684 StringRef Res = StringRef(TokStart,CurPtr - TokStart);
685 long long Value;
686
687 if (Res.starts_with("\'\\")) {
688 char theChar = Res[2];
689 switch (theChar) {
690 default: Value = theChar; break;
691 case '\'': Value = '\''; break;
692 case 't': Value = '\t'; break;
693 case 'n': Value = '\n'; break;
694 case 'b': Value = '\b'; break;
695 case 'f': Value = '\f'; break;
696 case 'r': Value = '\r'; break;
697 }
698 } else
699 Value = TokStart[1];
700
701 return AsmToken(AsmToken::Integer, Res, Value);
702}
703
704/// LexQuote: String: "..."
705AsmToken AsmLexer::LexQuote() {
706 int CurChar = getNextChar();
707 if (LexHLASMStrings)
708 return ReturnError(TokStart, "invalid usage of string literals");
709
710 if (LexMasmStrings) {
711 while (CurChar != EOF) {
712 if (CurChar != '"') {
713 CurChar = getNextChar();
714 } else if (peekNextChar() == '"') {
715 // In MASM double-quoted strings, doubled double-quotes mean an escaped
716 // double quote, so should be lexed in.
717 (void)getNextChar();
718 CurChar = getNextChar();
719 } else {
720 break;
721 }
722 }
723 if (CurChar == EOF)
724 return ReturnError(TokStart, "unterminated string constant");
725 return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart));
726 }
727
728 while (CurChar != '"') {
729 if (CurChar == '\\') {
730 // Allow \", etc.
731 CurChar = getNextChar();
732 }
733
734 if (CurChar == EOF)
735 return ReturnError(TokStart, "unterminated string constant");
736
737 CurChar = getNextChar();
738 }
739
740 return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart));
741}
742
// NOTE(review): the signature line for this body (embedded line 743) is
// missing from this listing.
//
// Starting at the current position, advances CurPtr until it reaches the
// start of a comment, a statement separator, a '\n' or '\r', or the end of
// the buffer, and returns the text that was skipped. TokStart is reset to the
// starting position; the terminating character is not consumed.
744 TokStart = CurPtr;
745
746 while (!isAtStartOfComment(CurPtr) && // Start of line comment.
747 !isAtStatementSeparator(CurPtr) && // End of statement marker.
748 *CurPtr != '\n' && *CurPtr != '\r' && CurPtr != CurBuf.end()) {
749 ++CurPtr;
750 }
751 return StringRef(TokStart, CurPtr-TokStart);
752}
753
754StringRef AsmLexer::LexUntilEndOfLine() {
755 TokStart = CurPtr;
756
757 while (*CurPtr != '\n' && *CurPtr != '\r' && CurPtr != CurBuf.end()) {
758 ++CurPtr;
759 }
760 return StringRef(TokStart, CurPtr-TokStart);
761}
762
// NOTE(review): the first signature line for this function (embedded line
// 763) is missing from this listing.
//
// Lexes ahead to fill Buf with upcoming tokens without disturbing the lexer:
// the position, start-of-line/statement flags, SkipSpace and IsPeeking are
// held in SaveAndRestore objects for the duration of the call, and the error
// string/location captured on entry are reinstalled before returning.
// Returns the number of tokens written; lexing stops early once an Eof token
// has been stored (the Eof is counted).
764 bool ShouldSkipSpace) {
765 SaveAndRestore SavedTokenStart(TokStart);
766 SaveAndRestore SavedCurPtr(CurPtr);
767 SaveAndRestore SavedAtStartOfLine(IsAtStartOfLine);
768 SaveAndRestore SavedAtStartOfStatement(IsAtStartOfStatement);
// These two also install a new value for the duration of the peek.
769 SaveAndRestore SavedSkipSpace(SkipSpace, ShouldSkipSpace);
770 SaveAndRestore SavedIsPeeking(IsPeeking, true);
771 std::string SavedErr = getErr();
772 SMLoc SavedErrLoc = getErrLoc();
773
774 size_t ReadCount;
775 for (ReadCount = 0; ReadCount < Buf.size(); ++ReadCount) {
776 AsmToken Token = LexToken();
777
778 Buf[ReadCount] = Token;
779
780 if (Token.is(AsmToken::Eof)) {
// Count the Eof token itself before leaving the loop.
781 ReadCount++;
782 break;
783 }
784 }
785
// Discard any error recorded while peeking.
786 SetError(SavedErrLoc, SavedErr);
787 return ReadCount;
788}
789
790bool AsmLexer::isAtStartOfComment(const char *Ptr) {
791 if (MAI.isHLASM() && !IsAtStartOfStatement)
792 return false;
793
794 StringRef CommentString = MAI.getCommentString();
795
796 if (CommentString.size() == 1)
797 return CommentString[0] == Ptr[0];
798
799 // Allow # preprocessor comments also be counted as comments for "##" cases
800 if (CommentString[1] == '#')
801 return CommentString[0] == Ptr[0];
802
803 return strncmp(Ptr, CommentString.data(), CommentString.size()) == 0;
804}
805
806bool AsmLexer::isAtStatementSeparator(const char *Ptr) {
807 return strncmp(Ptr, MAI.getSeparatorString(),
808 strlen(MAI.getSeparatorString())) == 0;
809}
810
/// Lexes and returns the next token starting at CurPtr.
///
/// TokStart is set to the token's first character and one character is
/// consumed up front. Before dispatching on that character the function
/// handles, in order: a '#' at the start of a statement (cpp-style hash
/// directive or line comment), the target's comment string, the target's
/// statement separator, and synthesising an EndOfStatement when the buffer
/// ends mid-statement. IsAtStartOfLine / IsAtStartOfStatement are updated to
/// describe the position after the returned token.
811AsmToken AsmLexer::LexToken() {
812 TokStart = CurPtr;
813 // This always consumes at least one character.
814 int CurChar = getNextChar();
815
816 if (!IsPeeking && CurChar == '#' && IsAtStartOfStatement) {
817 // If this starts with a '#', this may be a cpp
818 // hash directive and otherwise a line comment.
819 AsmToken TokenBuf[2];
820 MutableArrayRef<AsmToken> Buf(TokenBuf, 2);
821 size_t num = peekTokens(Buf, true);
822 // There cannot be a space preceding this
823 if (IsAtStartOfLine && num == 2 && TokenBuf[0].is(AsmToken::Integer) &&
824 TokenBuf[1].is(AsmToken::String)) {
825 CurPtr = TokStart; // reset curPtr;
826 StringRef s = LexUntilEndOfLine();
// Push the two peeked tokens back (in reverse order) and return the whole
// line as the HashDirective token.
827 UnLex(TokenBuf[1]);
828 UnLex(TokenBuf[0]);
829 return AsmToken(AsmToken::HashDirective, s);
830 }
831
832 if (MAI.shouldAllowAdditionalComments())
833 return LexLineComment();
834 }
835
836 if (isAtStartOfComment(TokStart)) {
837 StringRef CommentString = MAI.getCommentString();
838 // For multi-char comment strings, advance CurPtr only if we matched the
839 // full string. This stops us from accidentally eating the newline if the
840 // current line ends in a single comment char.
841 if (CommentString.size() > 1 &&
842 StringRef(TokStart, CommentString.size()) == CommentString) {
// One character of the comment string was already consumed above.
843 CurPtr += CommentString.size() - 1;
844 }
845 return LexLineComment();
846 }
847
848 if (isAtStatementSeparator(TokStart)) {
// One character of the separator was already consumed above.
849 CurPtr += strlen(MAI.getSeparatorString()) - 1;
850 IsAtStartOfLine = true;
851 IsAtStartOfStatement = true;
852 return AsmToken(AsmToken::EndOfStatement,
853 StringRef(TokStart, strlen(MAI.getSeparatorString())));
854 }
855
856 // If we're missing a newline at EOF, make sure we still get an
857 // EndOfStatement token before the Eof token.
858 if (CurChar == EOF && !IsAtStartOfStatement && EndStatementAtEOF) {
859 IsAtStartOfLine = true;
860 IsAtStartOfStatement = true;
861 return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 0));
862 }
// Default for ordinary tokens: no longer at the start of a line or statement.
// The old statement flag is kept so that whitespace and '/' (possible
// comment) can restore it below.
863 IsAtStartOfLine = false;
864 bool OldIsAtStartOfStatement = IsAtStartOfStatement;
865 IsAtStartOfStatement = false;
866 switch (CurChar) {
867 default:
868 // Handle identifier: [a-zA-Z_.$@#?][a-zA-Z0-9_.$@#?]*
869 // Whether or not the lexer accepts '$', '@', '#' and '?' at the start of
870 // an identifier is target-dependent. These characters are handled in the
871 // respective switch cases.
872 if (isalpha(CurChar) || CurChar == '_' || CurChar == '.')
873 return LexIdentifier();
874
875 // Unknown character, emit an error.
876 return ReturnError(TokStart, "invalid character in input");
877 case EOF:
878 if (EndStatementAtEOF) {
879 IsAtStartOfLine = true;
880 IsAtStartOfStatement = true;
881 }
882 return AsmToken(AsmToken::Eof, StringRef(TokStart, 0));
// A NUL byte is handled the same way as a space or tab.
883 case 0:
884 case ' ':
885 case '\t':
886 IsAtStartOfStatement = OldIsAtStartOfStatement;
887 while (*CurPtr == ' ' || *CurPtr == '\t')
888 CurPtr++;
889 if (SkipSpace)
890 return LexToken(); // Ignore whitespace.
891 else
892 return AsmToken(AsmToken::Space, StringRef(TokStart, CurPtr - TokStart));
893 case '\r': {
894 IsAtStartOfLine = true;
895 IsAtStartOfStatement = true;
896 // If this is a CR followed by LF, treat that as one token.
897 if (CurPtr != CurBuf.end() && *CurPtr == '\n')
898 ++CurPtr;
899 return AsmToken(AsmToken::EndOfStatement,
900 StringRef(TokStart, CurPtr - TokStart));
901 }
902 case '\n':
903 IsAtStartOfLine = true;
904 IsAtStartOfStatement = true;
905 return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 1));
906 case ':': return AsmToken(AsmToken::Colon, StringRef(TokStart, 1));
907 case '+': return AsmToken(AsmToken::Plus, StringRef(TokStart, 1));
908 case '~': return AsmToken(AsmToken::Tilde, StringRef(TokStart, 1));
909 case '(': return AsmToken(AsmToken::LParen, StringRef(TokStart, 1));
910 case ')': return AsmToken(AsmToken::RParen, StringRef(TokStart, 1));
911 case '[': return AsmToken(AsmToken::LBrac, StringRef(TokStart, 1));
912 case ']': return AsmToken(AsmToken::RBrac, StringRef(TokStart, 1));
913 case '{': return AsmToken(AsmToken::LCurly, StringRef(TokStart, 1));
914 case '}': return AsmToken(AsmToken::RCurly, StringRef(TokStart, 1));
915 case '*': return AsmToken(AsmToken::Star, StringRef(TokStart, 1));
916 case ',': return AsmToken(AsmToken::Comma, StringRef(TokStart, 1));
917 case '$': {
// '$' can start a Motorola hex literal, start an identifier, or stand alone,
// depending on lexer and target settings.
918 if (LexMotorolaIntegers && isHexDigit(*CurPtr))
919 return LexDigit();
920 if (MAI.doesAllowDollarAtStartOfIdentifier())
921 return LexIdentifier();
922 return AsmToken(AsmToken::Dollar, StringRef(TokStart, 1));
923 }
924 case '@':
925 if (MAI.doesAllowAtAtStartOfIdentifier())
926 return LexIdentifier();
927 return AsmToken(AsmToken::At, StringRef(TokStart, 1));
928 case '#':
929 if (MAI.isHLASM())
930 return LexIdentifier();
931 return AsmToken(AsmToken::Hash, StringRef(TokStart, 1));
932 case '?':
933 if (MAI.doesAllowQuestionAtStartOfIdentifier())
934 return LexIdentifier();
935 return AsmToken(AsmToken::Question, StringRef(TokStart, 1));
936 case '\\': return AsmToken(AsmToken::BackSlash, StringRef(TokStart, 1));
937 case '=':
938 if (*CurPtr == '=') {
939 ++CurPtr;
940 return AsmToken(AsmToken::EqualEqual, StringRef(TokStart, 2));
941 }
942 return AsmToken(AsmToken::Equal, StringRef(TokStart, 1));
943 case '-':
944 if (*CurPtr == '>') {
945 ++CurPtr;
946 return AsmToken(AsmToken::MinusGreater, StringRef(TokStart, 2));
947 }
948 return AsmToken(AsmToken::Minus, StringRef(TokStart, 1));
949 case '|':
950 if (*CurPtr == '|') {
951 ++CurPtr;
952 return AsmToken(AsmToken::PipePipe, StringRef(TokStart, 2));
953 }
954 return AsmToken(AsmToken::Pipe, StringRef(TokStart, 1));
955 case '^': return AsmToken(AsmToken::Caret, StringRef(TokStart, 1));
956 case '&':
957 if (*CurPtr == '&') {
958 ++CurPtr;
959 return AsmToken(AsmToken::AmpAmp, StringRef(TokStart, 2));
960 }
961 return AsmToken(AsmToken::Amp, StringRef(TokStart, 1));
962 case '!':
963 if (*CurPtr == '=') {
964 ++CurPtr;
965 return AsmToken(AsmToken::ExclaimEqual, StringRef(TokStart, 2));
966 }
967 return AsmToken(AsmToken::Exclaim, StringRef(TokStart, 1));
968 case '%':
// '%' followed by a binary digit is a Motorola binary literal when enabled.
969 if (LexMotorolaIntegers && (*CurPtr == '0' || *CurPtr == '1')) {
970 return LexDigit();
971 }
972 return AsmToken(AsmToken::Percent, StringRef(TokStart, 1));
973 case '/':
// Restore the statement flag before LexSlash, which may lex a comment.
974 IsAtStartOfStatement = OldIsAtStartOfStatement;
975 return LexSlash();
976 case '\'': return LexSingleQuote();
977 case '"': return LexQuote();
978 case '0': case '1': case '2': case '3': case '4':
979 case '5': case '6': case '7': case '8': case '9':
980 return LexDigit();
981 case '<':
982 switch (*CurPtr) {
983 case '<':
984 ++CurPtr;
985 return AsmToken(AsmToken::LessLess, StringRef(TokStart, 2));
986 case '=':
987 ++CurPtr;
988 return AsmToken(AsmToken::LessEqual, StringRef(TokStart, 2));
989 case '>':
990 ++CurPtr;
991 return AsmToken(AsmToken::LessGreater, StringRef(TokStart, 2));
992 default:
993 return AsmToken(AsmToken::Less, StringRef(TokStart, 1));
994 }
995 case '>':
996 switch (*CurPtr) {
997 case '>':
998 ++CurPtr;
999 return AsmToken(AsmToken::GreaterGreater, StringRef(TokStart, 2));
1000 case '=':
1001 ++CurPtr;
1002 return AsmToken(AsmToken::GreaterEqual, StringRef(TokStart, 2));
1003 default:
1004 return AsmToken(AsmToken::Greater, StringRef(TokStart, 1));
1005 }
1006
1007 // TODO: Quoted identifiers (objc methods etc)
1008 // local labels: [0-9][:]
1009 // Forward/backward labels: [0-9][fb]
1010 // Integers, fp constants, character constants.
1011 }
1012}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
This file implements a class to represent arbitrary precision integral constant values and operations...
static std::string radixName(unsigned Radix)
Definition AsmLexer.cpp:382
static void SkipIgnoredIntegerSuffix(const char *&CurPtr)
Definition AsmLexer.cpp:332
static unsigned doHexLookAhead(const char *&CurPtr, unsigned DefaultRadix, bool LexHex)
Definition AsmLexer.cpp:344
static AsmToken intToken(StringRef Ref, APInt &Value)
Definition AsmLexer.cpp:376
static const char * findLastDigit(const char *CurPtr, unsigned DefaultRadix)
Definition AsmLexer.cpp:369
static bool isIdentifierChar(char C)
Return true if the given character satisfies the following regular expression: [-a-zA-Z$._0-9].
Definition MILexer.cpp:118
This file provides utility classes that use RAII to save and restore values.
This file contains some functions that are useful when dealing with strings.
Class for arbitrary precision integers.
Definition APInt.h:78
size_t size() const
size - Get the array size.
Definition ArrayRef.h:143
LLVM_ABI AsmLexer(const MCAsmInfo &MAI)
Definition AsmLexer.cpp:110
void UnLex(AsmToken const &Token)
Definition AsmLexer.h:106
bool is(AsmToken::TokenKind K) const
Check if the current token has kind K.
Definition AsmLexer.h:147
SMLoc getErrLoc()
Get the current error location.
Definition AsmLexer.h:138
const std::string & getErr()
Get the current error string.
Definition AsmLexer.h:141
LLVM_ABI StringRef LexUntilEndOfStatement()
Definition AsmLexer.cpp:743
LLVM_ABI void setBuffer(StringRef Buf, const char *ptr=nullptr, bool EndStatementAtEOF=true)
Definition AsmLexer.cpp:120
LLVM_ABI size_t peekTokens(MutableArrayRef< AsmToken > Buf, bool ShouldSkipSpace=true)
Look ahead an arbitrary number of tokens.
Definition AsmLexer.cpp:763
Target independent representation for an assembler token.
Definition MCAsmMacro.h:22
LLVM_ABI SMLoc getLoc() const
Definition AsmLexer.cpp:31
StringRef getString() const
Get the string for the current token, this includes all characters (for example, the quotes on strings) in the token.
Definition MCAsmMacro.h:103
bool is(TokenKind K) const
Definition MCAsmMacro.h:75
LLVM_ABI SMLoc getEndLoc() const
Definition AsmLexer.cpp:33
LLVM_ABI void dump(raw_ostream &OS) const
Definition AsmLexer.cpp:39
LLVM_ABI SMRange getLocRange() const
Definition AsmLexer.cpp:37
This class is intended to be used as a base class for asm properties and features specific to the target.
Definition MCAsmInfo.h:64
bool isHLASM() const
Definition MCAsmInfo.h:520
StringRef getCommentString() const
Definition MCAsmInfo.h:538
const char * getSeparatorString() const
Definition MCAsmInfo.h:533
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memory), i.e. a start pointer and a length.
Definition ArrayRef.h:299
Represents a location in source code.
Definition SMLoc.h:22
static SMLoc getFromPointer(const char *Ptr)
Definition SMLoc.h:35
Represents a range in source code.
Definition SMLoc.h:47
StringRef - Represent a constant reference to a string, i.e. a character array and a length, which need not be null terminated.
Definition StringRef.h:55
bool starts_with(StringRef Prefix) const
Check if this string starts with the given Prefix.
Definition StringRef.h:261
constexpr size_t size() const
size - Get the string size.
Definition StringRef.h:146
constexpr const char * data() const
data - Get a pointer to the start of the string (which may not be null terminated).
Definition StringRef.h:140
iterator end() const
Definition StringRef.h:114
LLVM Value Representation.
Definition Value.h:75
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53
raw_ostream & write_escaped(StringRef Str, bool UseHexEscapes=false)
Output Str, turning '\', '\t', '\n', '"', and anything that doesn't satisfy llvm::isPrint into an escape sequence.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
This is an optimization pass for GlobalISel generic memory operations.
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
unsigned hexDigitValue(char C)
Interpret the given character C as a hexadecimal digit and return its value.
bool isDigit(char C)
Checks if character C is one of the 10 decimal digits.
bool isAlnum(char C)
Checks whether character C is either a decimal digit or an uppercase or lowercase letter as classified by the "C" locale.
MutableArrayRef(T &OneElt) -> MutableArrayRef< T >
@ Ref
The access may reference the value stored in memory.
Definition ModRef.h:32
bool isHexDigit(char C)
Checks if character C is a hexadecimal numeric character.
A utility class that uses RAII to save and restore the value of a variable.