Home | History | Annotate | Line # | Download | only in TableGen
      1 //===- TGLexer.h - Lexer for TableGen Files ---------------------*- C++ -*-===//
      2 //
      3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
      4 // See https://llvm.org/LICENSE.txt for license information.
      5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
      6 //
      7 //===----------------------------------------------------------------------===//
      8 //
      9 // This class represents the Lexer for tablegen files.
     10 //
     11 //===----------------------------------------------------------------------===//
     12 
     13 #ifndef LLVM_LIB_TABLEGEN_TGLEXER_H
     14 #define LLVM_LIB_TABLEGEN_TGLEXER_H
     15 
     16 #include "llvm/ADT/StringRef.h"
     17 #include "llvm/ADT/StringSet.h"
     18 #include "llvm/Support/DataTypes.h"
     19 #include "llvm/Support/SMLoc.h"
     20 #include <cassert>
     21 #include <memory>
     22 #include <set>
     23 #include <string>
     24 #include <vector>
     25 
     26 namespace llvm {
     27 template <typename T> class ArrayRef;
     28 class SourceMgr;
     29 class Twine;
     30 
namespace tgtok {
/// TokKind - The kinds of tokens known to the TableGen lexer.
///
/// The enumerators are grouped by category; the order of the enumerators
/// determines their numeric values.
enum TokKind {
  // Marker kinds.
  Eof,
  Error,

  // Tokens that carry no additional information.
  minus,     // -
  plus,      // +
  l_square,  // [
  r_square,  // ]
  l_brace,   // {
  r_brace,   // }
  l_paren,   // (
  r_paren,   // )
  less,      // <
  greater,   // >
  colon,     // :
  semi,      // ;
  comma,     // ,
  dot,       // .
  equal,     // =
  question,  // ?
  paste,     // #
  dotdotdot, // ...

  // Reserved keywords.  'ElseKW' carries the KW suffix so that it does not
  // clash with 'Else' below, which stands for the preprocessor #else.
  Assert,
  Bit,
  Bits,
  Class,
  Code,
  Dag,
  Def,
  Defm,
  Defset,
  Defvar,
  ElseKW,
  FalseKW,
  Field,
  Foreach,
  If,
  In,
  Include,
  Int,
  Let,
  List,
  MultiClass,
  String,
  Then,
  TrueKW,

  // Bang operators.
  XConcat,
  XADD,
  XSUB,
  XMUL,
  XNOT,
  XAND,
  XOR,
  XXOR,
  XSRA,
  XSRL,
  XSHL,
  XListConcat,
  XListSplat,
  XStrConcat,
  XInterleave,
  XSubstr,
  XFind,
  XCast,
  XSubst,
  XForEach,
  XFilter,
  XFoldl,
  XHead,
  XTail,
  XSize,
  XEmpty,
  XIf,
  XCond,
  XEq,
  XIsA,
  XDag,
  XNe,
  XLe,
  XLt,
  XGe,
  XGt,
  XSetDagOp,
  XGetDagOp,

  // Boolean literals.
  TrueVal,
  FalseVal,

  // Integer value.
  IntVal,

  // Binary constant.  Such a constant is sized according to the number of
  // bits that were written.
  BinaryIntVal,

  // Tokens that carry a string value.
  Id,
  StrVal,
  VarName,
  CodeFragment,

  // Preprocessing tokens, used internally by the lexer only.  Lex() never
  // returns any of them.
  Ifdef,
  Ifndef,
  Else,
  Endif,
  Define
};
} // end namespace tgtok
     78 
     79 /// TGLexer - TableGen Lexer class.
     80 class TGLexer {
     81   SourceMgr &SrcMgr;
     82 
     83   const char *CurPtr = nullptr;
     84   StringRef CurBuf;
     85 
     86   // Information about the current token.
     87   const char *TokStart = nullptr;
     88   tgtok::TokKind CurCode = tgtok::TokKind::Eof;
     89   std::string CurStrVal; // This is valid for Id, StrVal, VarName, CodeFragment
     90   int64_t CurIntVal = 0; // This is valid for IntVal.
     91 
     92   /// CurBuffer - This is the current buffer index we're lexing from as managed
     93   /// by the SourceMgr object.
     94   unsigned CurBuffer = 0;
     95 
     96 public:
     97   typedef std::set<std::string> DependenciesSetTy;
     98 
     99 private:
    100   /// Dependencies - This is the list of all included files.
    101   DependenciesSetTy Dependencies;
    102 
    103 public:
    104   TGLexer(SourceMgr &SrcMgr, ArrayRef<std::string> Macros);
    105 
    106   tgtok::TokKind Lex() {
    107     return CurCode = LexToken(CurPtr == CurBuf.begin());
    108   }
    109 
    110   const DependenciesSetTy &getDependencies() const {
    111     return Dependencies;
    112   }
    113 
    114   tgtok::TokKind getCode() const { return CurCode; }
    115 
    116   const std::string &getCurStrVal() const {
    117     assert((CurCode == tgtok::Id || CurCode == tgtok::StrVal ||
    118             CurCode == tgtok::VarName || CurCode == tgtok::CodeFragment) &&
    119            "This token doesn't have a string value");
    120     return CurStrVal;
    121   }
    122   int64_t getCurIntVal() const {
    123     assert(CurCode == tgtok::IntVal && "This token isn't an integer");
    124     return CurIntVal;
    125   }
    126   std::pair<int64_t, unsigned> getCurBinaryIntVal() const {
    127     assert(CurCode == tgtok::BinaryIntVal &&
    128            "This token isn't a binary integer");
    129     return std::make_pair(CurIntVal, (CurPtr - TokStart)-2);
    130   }
    131 
    132   SMLoc getLoc() const;
    133 
    134 private:
    135   /// LexToken - Read the next token and return its code.
    136   tgtok::TokKind LexToken(bool FileOrLineStart = false);
    137 
    138   tgtok::TokKind ReturnError(SMLoc Loc, const Twine &Msg);
    139   tgtok::TokKind ReturnError(const char *Loc, const Twine &Msg);
    140 
    141   int getNextChar();
    142   int peekNextChar(int Index) const;
    143   void SkipBCPLComment();
    144   bool SkipCComment();
    145   tgtok::TokKind LexIdentifier();
    146   bool LexInclude();
    147   tgtok::TokKind LexString();
    148   tgtok::TokKind LexVarName();
    149   tgtok::TokKind LexNumber();
    150   tgtok::TokKind LexBracket();
    151   tgtok::TokKind LexExclaim();
    152 
    153   // Process EOF encountered in LexToken().
    154   // If EOF is met in an include file, then the method will update
    155   // CurPtr, CurBuf and preprocessing include stack, and return true.
    156   // If EOF is met in the top-level file, then the method will
    157   // update and check the preprocessing include stack, and return false.
    158   bool processEOF();
    159 
    160   // *** Structures and methods for preprocessing support ***
    161 
    162   // A set of macro names that are defined either via command line or
    163   // by using:
    164   //     #define NAME
    165   StringSet<> DefinedMacros;
    166 
    167   // Each of #ifdef and #else directives has a descriptor associated
    168   // with it.
    169   //
    170   // An ordered list of preprocessing controls defined by #ifdef/#else
    171   // directives that are in effect currently is called preprocessing
    172   // control stack.  It is represented as a vector of PreprocessorControlDesc's.
    173   //
    174   // The control stack is updated according to the following rules:
    175   //
    176   // For each #ifdef we add an element to the control stack.
    177   // For each #else we replace the top element with a descriptor
    178   // with an inverted IsDefined value.
    179   // For each #endif we pop the top element from the control stack.
    180   //
    181   // When CurPtr reaches the current buffer's end, the control stack
    182   // must be empty, i.e. #ifdef and the corresponding #endif
    183   // must be located in the same file.
    184   struct PreprocessorControlDesc {
    185     // Either tgtok::Ifdef or tgtok::Else.
    186     tgtok::TokKind Kind;
    187 
    188     // True, if the condition for this directive is true, false - otherwise.
    189     // Examples:
    190     //     #ifdef NAME       : true, if NAME is defined, false - otherwise.
    191     //     ...
    192     //     #else             : false, if NAME is defined, true - otherwise.
    193     bool IsDefined;
    194 
    195     // Pointer into CurBuf to the beginning of the preprocessing directive
    196     // word, e.g.:
    197     //     #ifdef NAME
    198     //      ^ - SrcPos
    199     SMLoc SrcPos;
    200   };
    201 
    202   // We want to disallow code like this:
    203   //     file1.td:
    204   //         #define NAME
    205   //         #ifdef NAME
    206   //         include "file2.td"
    207   //     EOF
    208   //     file2.td:
    209   //         #endif
    210   //     EOF
    211   //
    212   // To do this, we clear the preprocessing control stack on entry
    213   // to each of the included file.  PrepIncludeStack is used to store
    214   // preprocessing control stacks for the current file and all its
    215   // parent files.  The back() element is the preprocessing control
    216   // stack for the current file.
    217   std::vector<std::unique_ptr<std::vector<PreprocessorControlDesc>>>
    218       PrepIncludeStack;
    219 
    220   // Validate that the current preprocessing control stack is empty,
    221   // since we are about to exit a file, and pop the include stack.
    222   //
    223   // If IncludeStackMustBeEmpty is true, the include stack must be empty
    224   // after the popping, otherwise, the include stack must not be empty
    225   // after the popping.  Basically, the include stack must be empty
    226   // only if we exit the "top-level" file (i.e. finish lexing).
    227   //
    228   // The method returns false, if the current preprocessing control stack
    229   // is not empty (e.g. there is an unterminated #ifdef/#else),
    230   // true - otherwise.
    231   bool prepExitInclude(bool IncludeStackMustBeEmpty);
    232 
    233   // Look ahead for a preprocessing directive starting from CurPtr.  The caller
    234   // must only call this method, if *(CurPtr - 1) is '#'.  If the method matches
    235   // a preprocessing directive word followed by a whitespace, then it returns
    236   // one of the internal token kinds, i.e. Ifdef, Else, Endif, Define.
    237   //
    238   // CurPtr is not adjusted by this method.
    239   tgtok::TokKind prepIsDirective() const;
    240 
    241   // Given a preprocessing token kind, adjusts CurPtr to the end
    242   // of the preprocessing directive word.  Returns true, unless
    243   // an unsupported token kind is passed in.
    244   //
    245   // We use look-ahead prepIsDirective() and prepEatPreprocessorDirective()
    246   // to avoid adjusting CurPtr before we are sure that '#' is followed
    247   // by a preprocessing directive.  If it is not, then we fall back to
    248   // tgtok::paste interpretation of '#'.
    249   bool prepEatPreprocessorDirective(tgtok::TokKind Kind);
    250 
    251   // The main "exit" point from the token parsing to preprocessor.
    252   //
    253   // The method is called for CurPtr, when prepIsDirective() returns
    254   // true.  The first parameter matches the result of prepIsDirective(),
    255   // denoting the actual preprocessor directive to be processed.
    256   //
    257   // If the preprocessing directive disables the tokens processing, e.g.:
    258   //     #ifdef NAME // NAME is undefined
    259   // then lexPreprocessor() enters the lines-skipping mode.
    260   // In this mode, it does not parse any tokens, because the code under
    261   // the #ifdef may not even be a correct tablegen code.  The preprocessor
    262   // looks for lines containing other preprocessing directives, which
    263   // may be prepended with whitespaces and C-style comments.  If the line
    264   // does not contain a preprocessing directive, it is skipped completely.
    265   // Otherwise, the preprocessing directive is processed by recursively
    266   // calling lexPreprocessor().  The processing of the encountered
    267   // preprocessing directives includes updating preprocessing control stack
    268   // and adding new macros into DefinedMacros set.
    269   //
    270   // The second parameter controls whether lexPreprocessor() is called from
    271   // LexToken() (true) or recursively from lexPreprocessor() (false).
    272   //
    273   // If ReturnNextLiveToken is true, the method returns the next
    274   // LEX token following the current directive or following the end
    275   // of the disabled preprocessing region corresponding to this directive.
    276   // If ReturnNextLiveToken is false, the method returns the first parameter,
    277   // unless there were errors encountered in the disabled preprocessing
    278   // region - in this case, it returns tgtok::Error.
    279   tgtok::TokKind lexPreprocessor(tgtok::TokKind Kind,
    280                                  bool ReturnNextLiveToken = true);
    281 
    282   // Worker method for lexPreprocessor() to skip lines after some
    283   // preprocessing directive up to the buffer end or to the directive
    284   // that re-enables token processing.  The method returns true
    285   // upon processing the next directive that re-enables tokens
    286   // processing.  False is returned if an error was encountered.
    287   //
    288   // Note that prepSkipRegion() calls lexPreprocessor() to process
    289   // encountered preprocessing directives.  In this case, the second
    290   // parameter to lexPreprocessor() is set to false.  Being passed
    291   // false ReturnNextLiveToken, lexPreprocessor() must never call
    292   // prepSkipRegion().  We assert this by passing ReturnNextLiveToken
    293   // to prepSkipRegion() and checking that it is never set to false.
    294   bool prepSkipRegion(bool MustNeverBeFalse);
    295 
    296   // Lex name of the macro after either #ifdef or #define.  We could have used
    297   // LexIdentifier(), but it has special handling of "include" word, which
    298   // could result in awkward diagnostic errors.  Consider:
    299   // ----
    300   // #ifdef include
    301   // class ...
    302   // ----
    303   // LexIdentifier() will engage LexInclude(), which will complain about
    304   // missing file with name "class".  Instead, prepLexMacroName() will treat
    305   // "include" as a normal macro name.
    306   //
    307   // On entry, CurPtr points to the end of a preprocessing directive word.
    308   // The method allows for whitespaces between the preprocessing directive
    309   // and the macro name.  The allowed whitespaces are ' ' and '\t'.
    310   //
    311   // If the first non-whitespace symbol after the preprocessing directive
    312   // is a valid start symbol for an identifier (i.e. [a-zA-Z_]), then
    313   // the method updates TokStart to the position of the first non-whitespace
    314   // symbol, sets CurPtr to the position of the macro name's last symbol,
    315   // and returns a string reference to the macro name.  Otherwise,
    316   // TokStart is set to the first non-whitespace symbol after the preprocessing
    317   // directive, and the method returns an empty string reference.
    318   //
    319   // In all cases, TokStart may be used to point to the word following
    320   // the preprocessing directive.
    321   StringRef prepLexMacroName();
    322 
    323   // Skip any whitespaces starting from CurPtr.  The method is used
    324   // only in the lines-skipping mode to find the first non-whitespace
    325   // symbol after or at CurPtr.  Allowed whitespaces are ' ', '\t', '\n'
    326   // and '\r'.  The method skips C-style comments as well, because
    327   // it is used to find the beginning of the preprocessing directive.
    328   // If we do not handle C-style comments the following code would
    329   // result in incorrect detection of a preprocessing directive:
    330   //     /*
    331   //     #ifdef NAME
    332   //     */
    333   // As long as we skip C-style comments, the following code is correctly
    334   // recognized as a preprocessing directive:
    335   //     /* first line comment
    336   //        second line comment */ #ifdef NAME
    337   //
    338   // The method returns true upon reaching the first non-whitespace symbol
    339   // or EOF, CurPtr is set to point to this symbol.  The method returns false,
    340   // if an error occured during skipping of a C-style comment.
    341   bool prepSkipLineBegin();
    342 
    343   // Skip any whitespaces or comments after a preprocessing directive.
    344   // The method returns true upon reaching either end of the line
    345   // or end of the file.  If there is a multiline C-style comment
    346   // after the preprocessing directive, the method skips
    347   // the comment, so the final CurPtr may point to one of the next lines.
    348   // The method returns false, if an error occured during skipping
    349   // C- or C++-style comment, or a non-whitespace symbol appears
    350   // after the preprocessing directive.
    351   //
    352   // The method maybe called both during lines-skipping and tokens
    353   // processing.  It actually verifies that only whitespaces or/and
    354   // comments follow a preprocessing directive.
    355   //
    356   // After the execution of this mehod, CurPtr points either to new line
    357   // symbol, buffer end or non-whitespace symbol following the preprocesing
    358   // directive.
    359   bool prepSkipDirectiveEnd();
    360 
    361   // Skip all symbols to the end of the line/file.
    362   // The method adjusts CurPtr, so that it points to either new line
    363   // symbol in the current line or the buffer end.
    364   void prepSkipToLineEnd();
    365 
    366   // Return true, if the current preprocessor control stack is such that
    367   // we should allow lexer to process the next token, false - otherwise.
    368   //
    369   // In particular, the method returns true, if all the #ifdef/#else
    370   // controls on the stack have their IsDefined member set to true.
    371   bool prepIsProcessingEnabled();
    372 
    373   // Report an error, if we reach EOF with non-empty preprocessing control
    374   // stack.  This means there is no matching #endif for the previous
    375   // #ifdef/#else.
    376   void prepReportPreprocessorStackError();
    377 };
    378 
    379 } // end namespace llvm
    380 
    381 #endif
    382