| 1 | //===- TGLexer.h - Lexer for TableGen Files ---------------------*- C++ -*-===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | // |
| 9 | // This class represents the Lexer for tablegen files. |
| 10 | // |
| 11 | //===----------------------------------------------------------------------===// |
| 12 | |
| 13 | #ifndef LLVM_LIB_TABLEGEN_TGLEXER_H |
| 14 | #define LLVM_LIB_TABLEGEN_TGLEXER_H |
| 15 | |
| 16 | #include "llvm/ADT/SmallVector.h" |
| 17 | #include "llvm/ADT/StringRef.h" |
| 18 | #include "llvm/ADT/StringSet.h" |
| 19 | #include "llvm/Support/DataTypes.h" |
| 20 | #include "llvm/Support/SMLoc.h" |
| 21 | #include <cassert> |
| 22 | #include <memory> |
| 23 | #include <set> |
| 24 | #include <string> |
| 25 | |
| 26 | namespace llvm { |
| 27 | template <typename T> class ArrayRef; |
| 28 | class SourceMgr; |
| 29 | class Twine; |
| 30 | |
| 31 | namespace tgtok { |
/// Kinds of token produced (or used internally) by the TableGen lexer.
///
/// Enumerator order is significant: the *_FIRST / *_LAST aliases delimit
/// contiguous ranges that the isBangOperator(), isObjectStart() and
/// isStringValue() predicates test with simple comparisons. New enumerators
/// must be added inside the matching range.
enum TokKind {
  // Marker kinds.
  Eof,
  Error,

  // Punctuation tokens; no extra information is attached to them.
  minus,     // -
  plus,      // +
  l_square,  // [
  r_square,  // ]
  l_brace,   // {
  r_brace,   // }
  l_paren,   // (
  r_paren,   // )
  less,      // <
  greater,   // >
  colon,     // :
  semi,      // ;
  comma,     // ,
  dot,       // .
  equal,     // =
  question,  // ?
  paste,     // #
  dotdotdot, // ...

  // Boolean literal tokens.
  TrueVal,
  FalseVal,

  // Integer literal token.
  IntVal,

  // Binary constant token. Its size follows from the number of bits that
  // were written in the literal.
  BinaryIntVal,

  // Preprocessing directive kinds. The lexer uses these internally only;
  // Lex() never hands them back to its caller.
  Ifdef,
  Ifndef,
  Else,
  Endif,
  Define,

  // Reserved keywords. The keyword 'else' is spelled 'ElseKW' here so that it
  // does not clash with 'Else' above, which stands for the preprocessor
  // directive #else.
  Bit,
  Bits,
  Code,
  Dag,
  ElseKW,
  Field,
  In,
  Include,
  Int,
  List,
  String,
  Then,

  // Tokens that may start an object; range
  // [OBJECT_START_FIRST, OBJECT_START_LAST].
  OBJECT_START_FIRST,
  Assert = OBJECT_START_FIRST,
  Class,
  Def,
  Defm,
  Defset,
  Deftype,
  Defvar,
  Dump,
  Foreach,
  If,
  Let,
  MultiClass,
  OBJECT_START_LAST = MultiClass,

  // Bang operators; range [BANG_OPERATOR_FIRST, BANG_OPERATOR_LAST].
  BANG_OPERATOR_FIRST,
  XConcat = BANG_OPERATOR_FIRST,
  XADD,
  XSUB,
  XMUL,
  XDIV,
  XNOT,
  XLOG2,
  XAND,
  XOR,
  XXOR,
  XSRA,
  XSRL,
  XSHL,
  XListConcat,
  XListFlatten,
  XListSplat,
  XStrConcat,
  XInterleave,
  XSubstr,
  XFind,
  XMatch,
  XCast,
  XSubst,
  XForEach,
  XFilter,
  XFoldl,
  XHead,
  XTail,
  XSize,
  XEmpty,
  XInitialized,
  XInstances,
  XIf,
  XCond,
  XEq,
  XIsA,
  XDag,
  XNe,
  XLe,
  XLt,
  XGe,
  XGt,
  XSetDagOp,
  XGetDagOp,
  XExists,
  XListRemove,
  XToLower,
  XToUpper,
  XRange,
  XGetDagArg,
  XGetDagName,
  XSetDagArg,
  XSetDagName,
  XRepr,
  BANG_OPERATOR_LAST = XRepr,

  // Tokens that carry a string value; range
  // [STRING_VALUE_FIRST, STRING_VALUE_LAST].
  STRING_VALUE_FIRST,
  Id = STRING_VALUE_FIRST,
  StrVal,
  VarName,
  CodeFragment,
  STRING_VALUE_LAST = CodeFragment,
};
| 173 | |
| 174 | /// isBangOperator - Return true if this is a bang operator. |
| 175 | static inline bool isBangOperator(tgtok::TokKind Kind) { |
| 176 | return tgtok::BANG_OPERATOR_FIRST <= Kind && Kind <= BANG_OPERATOR_LAST; |
| 177 | } |
| 178 | |
| 179 | /// isObjectStart - Return true if this is a valid first token for a statement. |
| 180 | static inline bool isObjectStart(tgtok::TokKind Kind) { |
| 181 | return tgtok::OBJECT_START_FIRST <= Kind && Kind <= OBJECT_START_LAST; |
| 182 | } |
| 183 | |
| 184 | /// isStringValue - Return true if this is a string value. |
| 185 | static inline bool isStringValue(tgtok::TokKind Kind) { |
| 186 | return tgtok::STRING_VALUE_FIRST <= Kind && Kind <= STRING_VALUE_LAST; |
| 187 | } |
| 188 | } // namespace tgtok |
| 189 | |
| 190 | /// TGLexer - TableGen Lexer class. |
| 191 | class TGLexer { |
| 192 | SourceMgr &SrcMgr; |
| 193 | |
| 194 | const char *CurPtr = nullptr; |
| 195 | StringRef CurBuf; |
| 196 | |
| 197 | // Information about the current token. |
| 198 | const char *TokStart = nullptr; |
| 199 | tgtok::TokKind CurCode = tgtok::TokKind::Eof; |
| 200 | std::string CurStrVal; // This is valid for Id, StrVal, VarName, CodeFragment |
| 201 | int64_t CurIntVal = 0; // This is valid for IntVal. |
| 202 | |
| 203 | /// CurBuffer - This is the current buffer index we're lexing from as managed |
| 204 | /// by the SourceMgr object. |
| 205 | unsigned CurBuffer = 0; |
| 206 | |
| 207 | public: |
| 208 | typedef std::set<std::string> DependenciesSetTy; |
| 209 | |
| 210 | private: |
| 211 | /// Dependencies - This is the list of all included files. |
| 212 | DependenciesSetTy Dependencies; |
| 213 | |
| 214 | public: |
| 215 | TGLexer(SourceMgr &SrcMgr, ArrayRef<std::string> Macros); |
| 216 | |
| 217 | tgtok::TokKind Lex() { |
| 218 | return CurCode = LexToken(FileOrLineStart: CurPtr == CurBuf.begin()); |
| 219 | } |
| 220 | |
| 221 | const DependenciesSetTy &getDependencies() const { |
| 222 | return Dependencies; |
| 223 | } |
| 224 | |
| 225 | tgtok::TokKind getCode() const { return CurCode; } |
| 226 | |
| 227 | const std::string &getCurStrVal() const { |
| 228 | assert(tgtok::isStringValue(CurCode) && |
| 229 | "This token doesn't have a string value" ); |
| 230 | return CurStrVal; |
| 231 | } |
| 232 | int64_t getCurIntVal() const { |
| 233 | assert(CurCode == tgtok::IntVal && "This token isn't an integer" ); |
| 234 | return CurIntVal; |
| 235 | } |
| 236 | std::pair<int64_t, unsigned> getCurBinaryIntVal() const { |
| 237 | assert(CurCode == tgtok::BinaryIntVal && |
| 238 | "This token isn't a binary integer" ); |
| 239 | return {CurIntVal, (CurPtr - TokStart) - 2}; |
| 240 | } |
| 241 | |
| 242 | SMLoc getLoc() const; |
| 243 | SMRange getLocRange() const; |
| 244 | |
| 245 | private: |
| 246 | /// LexToken - Read the next token and return its code. |
| 247 | tgtok::TokKind LexToken(bool FileOrLineStart = false); |
| 248 | |
| 249 | tgtok::TokKind ReturnError(SMLoc Loc, const Twine &Msg); |
| 250 | tgtok::TokKind ReturnError(const char *Loc, const Twine &Msg); |
| 251 | |
| 252 | int getNextChar(); |
| 253 | int peekNextChar(int Index) const; |
| 254 | void (); |
| 255 | bool (); |
| 256 | tgtok::TokKind LexIdentifier(); |
| 257 | bool LexInclude(); |
| 258 | tgtok::TokKind LexString(); |
| 259 | tgtok::TokKind LexVarName(); |
| 260 | tgtok::TokKind LexNumber(); |
| 261 | tgtok::TokKind LexBracket(); |
| 262 | tgtok::TokKind LexExclaim(); |
| 263 | |
| 264 | // Process EOF encountered in LexToken(). |
| 265 | // If EOF is met in an include file, then the method will update |
| 266 | // CurPtr, CurBuf and preprocessing include stack, and return true. |
| 267 | // If EOF is met in the top-level file, then the method will |
| 268 | // update and check the preprocessing include stack, and return false. |
| 269 | bool processEOF(); |
| 270 | |
| 271 | // *** Structures and methods for preprocessing support *** |
| 272 | |
| 273 | // A set of macro names that are defined either via command line or |
| 274 | // by using: |
| 275 | // #define NAME |
| 276 | StringSet<> DefinedMacros; |
| 277 | |
| 278 | // Each of #ifdef and #else directives has a descriptor associated |
| 279 | // with it. |
| 280 | // |
| 281 | // An ordered list of preprocessing controls defined by #ifdef/#else |
| 282 | // directives that are in effect currently is called preprocessing |
| 283 | // control stack. It is represented as a vector of PreprocessorControlDesc's. |
| 284 | // |
| 285 | // The control stack is updated according to the following rules: |
| 286 | // |
| 287 | // For each #ifdef we add an element to the control stack. |
| 288 | // For each #else we replace the top element with a descriptor |
| 289 | // with an inverted IsDefined value. |
| 290 | // For each #endif we pop the top element from the control stack. |
| 291 | // |
| 292 | // When CurPtr reaches the current buffer's end, the control stack |
| 293 | // must be empty, i.e. #ifdef and the corresponding #endif |
| 294 | // must be located in the same file. |
| 295 | struct PreprocessorControlDesc { |
| 296 | // Either tgtok::Ifdef or tgtok::Else. |
| 297 | tgtok::TokKind Kind; |
| 298 | |
| 299 | // True, if the condition for this directive is true, false - otherwise. |
| 300 | // Examples: |
| 301 | // #ifdef NAME : true, if NAME is defined, false - otherwise. |
| 302 | // ... |
| 303 | // #else : false, if NAME is defined, true - otherwise. |
| 304 | bool IsDefined; |
| 305 | |
| 306 | // Pointer into CurBuf to the beginning of the preprocessing directive |
| 307 | // word, e.g.: |
| 308 | // #ifdef NAME |
| 309 | // ^ - SrcPos |
| 310 | SMLoc SrcPos; |
| 311 | }; |
| 312 | |
| 313 | // We want to disallow code like this: |
| 314 | // file1.td: |
| 315 | // #define NAME |
| 316 | // #ifdef NAME |
| 317 | // include "file2.td" |
| 318 | // EOF |
| 319 | // file2.td: |
| 320 | // #endif |
| 321 | // EOF |
| 322 | // |
| 323 | // To do this, we clear the preprocessing control stack on entry |
| 324 | // to each of the included file. PrepIncludeStack is used to store |
| 325 | // preprocessing control stacks for the current file and all its |
| 326 | // parent files. The back() element is the preprocessing control |
| 327 | // stack for the current file. |
| 328 | SmallVector<SmallVector<PreprocessorControlDesc>> PrepIncludeStack; |
| 329 | |
| 330 | // Validate that the current preprocessing control stack is empty, |
| 331 | // since we are about to exit a file, and pop the include stack. |
| 332 | // |
| 333 | // If IncludeStackMustBeEmpty is true, the include stack must be empty |
| 334 | // after the popping, otherwise, the include stack must not be empty |
| 335 | // after the popping. Basically, the include stack must be empty |
| 336 | // only if we exit the "top-level" file (i.e. finish lexing). |
| 337 | // |
| 338 | // The method returns false, if the current preprocessing control stack |
| 339 | // is not empty (e.g. there is an unterminated #ifdef/#else), |
| 340 | // true - otherwise. |
| 341 | bool prepExitInclude(bool IncludeStackMustBeEmpty); |
| 342 | |
| 343 | // Look ahead for a preprocessing directive starting from CurPtr. The caller |
| 344 | // must only call this method, if *(CurPtr - 1) is '#'. If the method matches |
| 345 | // a preprocessing directive word followed by a whitespace, then it returns |
| 346 | // one of the internal token kinds, i.e. Ifdef, Else, Endif, Define. |
| 347 | // |
| 348 | // CurPtr is not adjusted by this method. |
| 349 | tgtok::TokKind prepIsDirective() const; |
| 350 | |
| 351 | // Given a preprocessing token kind, adjusts CurPtr to the end |
| 352 | // of the preprocessing directive word. |
| 353 | // |
| 354 | // We use look-ahead prepIsDirective() and prepEatPreprocessorDirective() |
| 355 | // to avoid adjusting CurPtr before we are sure that '#' is followed |
| 356 | // by a preprocessing directive. If it is not, then we fall back to |
| 357 | // tgtok::paste interpretation of '#'. |
| 358 | void prepEatPreprocessorDirective(tgtok::TokKind Kind); |
| 359 | |
| 360 | // The main "exit" point from the token parsing to preprocessor. |
| 361 | // |
| 362 | // The method is called for CurPtr, when prepIsDirective() returns |
| 363 | // true. The first parameter matches the result of prepIsDirective(), |
| 364 | // denoting the actual preprocessor directive to be processed. |
| 365 | // |
| 366 | // If the preprocessing directive disables the tokens processing, e.g.: |
| 367 | // #ifdef NAME // NAME is undefined |
| 368 | // then lexPreprocessor() enters the lines-skipping mode. |
| 369 | // In this mode, it does not parse any tokens, because the code under |
| 370 | // the #ifdef may not even be a correct tablegen code. The preprocessor |
| 371 | // looks for lines containing other preprocessing directives, which |
| 372 | // may be prepended with whitespaces and C-style comments. If the line |
| 373 | // does not contain a preprocessing directive, it is skipped completely. |
| 374 | // Otherwise, the preprocessing directive is processed by recursively |
| 375 | // calling lexPreprocessor(). The processing of the encountered |
| 376 | // preprocessing directives includes updating preprocessing control stack |
| 377 | // and adding new macros into DefinedMacros set. |
| 378 | // |
| 379 | // The second parameter controls whether lexPreprocessor() is called from |
| 380 | // LexToken() (true) or recursively from lexPreprocessor() (false). |
| 381 | // |
| 382 | // If ReturnNextLiveToken is true, the method returns the next |
| 383 | // LEX token following the current directive or following the end |
| 384 | // of the disabled preprocessing region corresponding to this directive. |
| 385 | // If ReturnNextLiveToken is false, the method returns the first parameter, |
| 386 | // unless there were errors encountered in the disabled preprocessing |
| 387 | // region - in this case, it returns tgtok::Error. |
| 388 | tgtok::TokKind lexPreprocessor(tgtok::TokKind Kind, |
| 389 | bool ReturnNextLiveToken = true); |
| 390 | |
| 391 | // Worker method for lexPreprocessor() to skip lines after some |
| 392 | // preprocessing directive up to the buffer end or to the directive |
| 393 | // that re-enables token processing. The method returns true |
| 394 | // upon processing the next directive that re-enables tokens |
| 395 | // processing. False is returned if an error was encountered. |
| 396 | // |
| 397 | // Note that prepSkipRegion() calls lexPreprocessor() to process |
| 398 | // encountered preprocessing directives. In this case, the second |
| 399 | // parameter to lexPreprocessor() is set to false. Being passed |
| 400 | // false ReturnNextLiveToken, lexPreprocessor() must never call |
| 401 | // prepSkipRegion(). We assert this by passing ReturnNextLiveToken |
| 402 | // to prepSkipRegion() and checking that it is never set to false. |
| 403 | bool prepSkipRegion(bool MustNeverBeFalse); |
| 404 | |
| 405 | // Lex name of the macro after either #ifdef or #define. We could have used |
| 406 | // LexIdentifier(), but it has special handling of "include" word, which |
| 407 | // could result in awkward diagnostic errors. Consider: |
| 408 | // ---- |
| 409 | // #ifdef include |
| 410 | // class ... |
| 411 | // ---- |
| 412 | // LexIdentifier() will engage LexInclude(), which will complain about |
| 413 | // missing file with name "class". Instead, prepLexMacroName() will treat |
| 414 | // "include" as a normal macro name. |
| 415 | // |
| 416 | // On entry, CurPtr points to the end of a preprocessing directive word. |
| 417 | // The method allows for whitespaces between the preprocessing directive |
| 418 | // and the macro name. The allowed whitespaces are ' ' and '\t'. |
| 419 | // |
| 420 | // If the first non-whitespace symbol after the preprocessing directive |
| 421 | // is a valid start symbol for an identifier (i.e. [a-zA-Z_]), then |
| 422 | // the method updates TokStart to the position of the first non-whitespace |
| 423 | // symbol, sets CurPtr to the position of the macro name's last symbol, |
| 424 | // and returns a string reference to the macro name. Otherwise, |
| 425 | // TokStart is set to the first non-whitespace symbol after the preprocessing |
| 426 | // directive, and the method returns an empty string reference. |
| 427 | // |
| 428 | // In all cases, TokStart may be used to point to the word following |
| 429 | // the preprocessing directive. |
| 430 | StringRef prepLexMacroName(); |
| 431 | |
| 432 | // Skip any whitespaces starting from CurPtr. The method is used |
| 433 | // only in the lines-skipping mode to find the first non-whitespace |
| 434 | // symbol after or at CurPtr. Allowed whitespaces are ' ', '\t', '\n' |
| 435 | // and '\r'. The method skips C-style comments as well, because |
| 436 | // it is used to find the beginning of the preprocessing directive. |
| 437 | // If we do not handle C-style comments the following code would |
| 438 | // result in incorrect detection of a preprocessing directive: |
| 439 | // /* |
| 440 | // #ifdef NAME |
| 441 | // */ |
| 442 | // As long as we skip C-style comments, the following code is correctly |
| 443 | // recognized as a preprocessing directive: |
| 444 | // /* first line comment |
| 445 | // second line comment */ #ifdef NAME |
| 446 | // |
| 447 | // The method returns true upon reaching the first non-whitespace symbol |
| 448 | // or EOF, CurPtr is set to point to this symbol. The method returns false, |
| 449 | // if an error occurred during skipping of a C-style comment. |
| 450 | bool prepSkipLineBegin(); |
| 451 | |
| 452 | // Skip any whitespaces or comments after a preprocessing directive. |
| 453 | // The method returns true upon reaching either end of the line |
| 454 | // or end of the file. If there is a multiline C-style comment |
| 455 | // after the preprocessing directive, the method skips |
| 456 | // the comment, so the final CurPtr may point to one of the next lines. |
| 457 | // The method returns false, if an error occurred during skipping |
| 458 | // C- or C++-style comment, or a non-whitespace symbol appears |
| 459 | // after the preprocessing directive. |
| 460 | // |
| 461 | // The method maybe called both during lines-skipping and tokens |
| 462 | // processing. It actually verifies that only whitespaces or/and |
| 463 | // comments follow a preprocessing directive. |
| 464 | // |
| 465 | // After the execution of this mehod, CurPtr points either to new line |
| 466 | // symbol, buffer end or non-whitespace symbol following the preprocesing |
| 467 | // directive. |
| 468 | bool prepSkipDirectiveEnd(); |
| 469 | |
| 470 | // Return true, if the current preprocessor control stack is such that |
| 471 | // we should allow lexer to process the next token, false - otherwise. |
| 472 | // |
| 473 | // In particular, the method returns true, if all the #ifdef/#else |
| 474 | // controls on the stack have their IsDefined member set to true. |
| 475 | bool prepIsProcessingEnabled(); |
| 476 | |
| 477 | // Report an error, if we reach EOF with non-empty preprocessing control |
| 478 | // stack. This means there is no matching #endif for the previous |
| 479 | // #ifdef/#else. |
| 480 | void prepReportPreprocessorStackError(); |
| 481 | }; |
| 482 | |
| 483 | } // end namespace llvm |
| 484 | |
| 485 | #endif |
| 486 | |