Home | History | Annotate | Line # | Download | only in Syntax
      1 //===- Tokens.h - collect tokens from preprocessing --------------*- C++-*-===//
      2 //
      3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
      4 // See https://llvm.org/LICENSE.txt for license information.
      5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
      6 //
      7 //===----------------------------------------------------------------------===//
      8 // Record tokens that a preprocessor emits and define operations to map between
      9 // the tokens written in a file and tokens produced by the preprocessor.
     10 //
     11 // When running the compiler, there are two token streams we are interested in:
     12 //   - "spelled" tokens directly correspond to a substring written in some
     13 //     source file.
     14 //   - "expanded" tokens represent the result of preprocessing, parses consumes
     15 //     this token stream to produce the AST.
     16 //
     17 // Expanded tokens correspond directly to locations found in the AST, allowing
     18 // to find subranges of the token stream covered by various AST nodes. Spelled
     19 // tokens correspond directly to the source code written by the user.
     20 //
     21 // To allow composing these two use-cases, we also define operations that map
     22 // between expanded and spelled tokens that produced them (macro calls,
     23 // directives, etc).
     24 //
     25 //===----------------------------------------------------------------------===//
     26 
     27 #ifndef LLVM_CLANG_TOOLING_SYNTAX_TOKENS_H
     28 #define LLVM_CLANG_TOOLING_SYNTAX_TOKENS_H
     29 
     30 #include "clang/Basic/FileManager.h"
     31 #include "clang/Basic/LangOptions.h"
     32 #include "clang/Basic/SourceLocation.h"
     33 #include "clang/Basic/SourceManager.h"
     34 #include "clang/Basic/TokenKinds.h"
     35 #include "clang/Lex/Token.h"
     36 #include "llvm/ADT/ArrayRef.h"
     37 #include "llvm/ADT/DenseMap.h"
     38 #include "llvm/ADT/Optional.h"
     39 #include "llvm/ADT/StringRef.h"
     40 #include "llvm/Support/Compiler.h"
     41 #include "llvm/Support/raw_ostream.h"
     42 #include <cstdint>
     43 #include <tuple>
     44 
     45 namespace clang {
     46 class Preprocessor;
     47 
     48 namespace syntax {
     49 
     50 /// A half-open character range inside a particular file, the start offset is
     51 /// included and the end offset is excluded from the range.
     52 struct FileRange {
     53   /// EXPECTS: File.isValid() && Begin <= End.
     54   FileRange(FileID File, unsigned BeginOffset, unsigned EndOffset);
     55   /// EXPECTS: BeginLoc.isValid() && BeginLoc.isFileID().
     56   FileRange(const SourceManager &SM, SourceLocation BeginLoc, unsigned Length);
     57   /// EXPECTS: BeginLoc.isValid() && BeginLoc.isFileID(), Begin <= End and files
     58   ///          are the same.
     59   FileRange(const SourceManager &SM, SourceLocation BeginLoc,
     60             SourceLocation EndLoc);
     61 
     62   FileID file() const { return File; }
     63   /// Start is a start offset (inclusive) in the corresponding file.
     64   unsigned beginOffset() const { return Begin; }
     65   /// End offset (exclusive) in the corresponding file.
     66   unsigned endOffset() const { return End; }
     67 
     68   unsigned length() const { return End - Begin; }
     69 
     70   /// Check if \p Offset is inside the range.
     71   bool contains(unsigned Offset) const {
     72     return Begin <= Offset && Offset < End;
     73   }
     74   /// Check \p Offset is inside the range or equal to its endpoint.
     75   bool touches(unsigned Offset) const {
     76     return Begin <= Offset && Offset <= End;
     77   }
     78 
     79   /// Gets the substring that this FileRange refers to.
     80   llvm::StringRef text(const SourceManager &SM) const;
     81 
     82   /// Convert to the clang range. The returned range is always a char range,
     83   /// never a token range.
     84   CharSourceRange toCharRange(const SourceManager &SM) const;
     85 
     86   friend bool operator==(const FileRange &L, const FileRange &R) {
     87     return std::tie(L.File, L.Begin, L.End) == std::tie(R.File, R.Begin, R.End);
     88   }
     89   friend bool operator!=(const FileRange &L, const FileRange &R) {
     90     return !(L == R);
     91   }
     92 
     93 private:
     94   FileID File;
     95   unsigned Begin;
     96   unsigned End;
     97 };
     98 
     99 /// For debugging purposes.
    100 llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const FileRange &R);
    101 
    102 /// A token coming directly from a file or from a macro invocation. Has just
    103 /// enough information to locate the token in the source code.
    104 /// Can represent both expanded and spelled tokens.
    105 class Token {
    106 public:
    107   Token(SourceLocation Location, unsigned Length, tok::TokenKind Kind);
    108   /// EXPECTS: clang::Token is not an annotation token.
    109   explicit Token(const clang::Token &T);
    110 
    111   tok::TokenKind kind() const { return Kind; }
    112   /// Location of the first character of a token.
    113   SourceLocation location() const { return Location; }
    114   /// Location right after the last character of a token.
    115   SourceLocation endLocation() const {
    116     return Location.getLocWithOffset(Length);
    117   }
    118   unsigned length() const { return Length; }
    119 
    120   /// Get the substring covered by the token. Note that will include all
    121   /// digraphs, newline continuations, etc. E.g. tokens for 'int' and
    122   ///    in\
    123   ///    t
    124   /// both have the same kind tok::kw_int, but results of text() are different.
    125   llvm::StringRef text(const SourceManager &SM) const;
    126 
    127   /// Gets a range of this token.
    128   /// EXPECTS: token comes from a file, not from a macro expansion.
    129   FileRange range(const SourceManager &SM) const;
    130 
    131   /// Given two tokens inside the same file, returns a file range that starts at
    132   /// \p First and ends at \p Last.
    133   /// EXPECTS: First and Last are file tokens from the same file, Last starts
    134   ///          after First.
    135   static FileRange range(const SourceManager &SM, const syntax::Token &First,
    136                          const syntax::Token &Last);
    137 
    138   std::string dumpForTests(const SourceManager &SM) const;
    139   /// For debugging purposes.
    140   std::string str() const;
    141 
    142 private:
    143   SourceLocation Location;
    144   unsigned Length;
    145   tok::TokenKind Kind;
    146 };
    147 /// For debugging purposes. Equivalent to a call to Token::str().
    148 llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const Token &T);
    149 
    150 /// A list of tokens obtained by preprocessing a text buffer and operations to
    151 /// map between the expanded and spelled tokens, i.e. TokenBuffer has
    152 /// information about two token streams:
    153 ///    1. Expanded tokens: tokens produced by the preprocessor after all macro
    154 ///       replacements,
    155 ///    2. Spelled tokens: corresponding directly to the source code of a file
    156 ///       before any macro replacements occurred.
    157 /// Here's an example to illustrate a difference between those two:
    158 ///     #define FOO 10
    159 ///     int a = FOO;
    160 ///
    161 /// Spelled tokens are {'#','define','FOO','10','int','a','=','FOO',';'}.
    162 /// Expanded tokens are {'int','a','=','10',';','eof'}.
    163 ///
    164 /// Note that the expanded token stream has a tok::eof token at the end, the
    165 /// spelled tokens never store a 'eof' token.
    166 ///
    167 /// The full list expanded tokens can be obtained with expandedTokens(). Spelled
    168 /// tokens for each of the files can be obtained via spelledTokens(FileID).
    169 ///
    170 /// To map between the expanded and spelled tokens use findSpelledByExpanded().
    171 ///
    172 /// To build a token buffer use the TokenCollector class. You can also compute
    173 /// the spelled tokens of a file using the tokenize() helper.
    174 ///
    175 /// FIXME: allow mappings into macro arguments.
    176 class TokenBuffer {
    177 public:
    178   TokenBuffer(const SourceManager &SourceMgr) : SourceMgr(&SourceMgr) {}
    179 
    180   TokenBuffer(TokenBuffer &&) = default;
    181   TokenBuffer(const TokenBuffer &) = delete;
    182   TokenBuffer &operator=(TokenBuffer &&) = default;
    183   TokenBuffer &operator=(const TokenBuffer &) = delete;
    184 
    185   /// All tokens produced by the preprocessor after all macro replacements,
    186   /// directives, etc. Source locations found in the clang AST will always
    187   /// point to one of these tokens.
    188   /// Tokens are in TU order (per SourceManager::isBeforeInTranslationUnit()).
    189   /// FIXME: figure out how to handle token splitting, e.g. '>>' can be split
    190   ///        into two '>' tokens by the parser. However, TokenBuffer currently
    191   ///        keeps it as a single '>>' token.
    192   llvm::ArrayRef<syntax::Token> expandedTokens() const {
    193     return ExpandedTokens;
    194   }
    195 
    196   /// Builds a cache to make future calls to expandedToken(SourceRange) faster.
    197   /// Creates an index only once. Further calls to it will be no-op.
    198   void indexExpandedTokens();
    199 
    200   /// Returns the subrange of expandedTokens() corresponding to the closed
    201   /// token range R.
    202   /// Consider calling indexExpandedTokens() before for faster lookups.
    203   llvm::ArrayRef<syntax::Token> expandedTokens(SourceRange R) const;
    204 
    205   /// Returns the subrange of spelled tokens corresponding to AST node spanning
    206   /// \p Expanded. This is the text that should be replaced if a refactoring
    207   /// were to rewrite the node. If \p Expanded is empty, the returned value is
    208   /// llvm::None.
    209   ///
    210   /// Will fail if the expanded tokens do not correspond to a sequence of
    211   /// spelled tokens. E.g. for the following example:
    212   ///
    213   ///   #define FIRST f1 f2 f3
    214   ///   #define SECOND s1 s2 s3
    215   ///   #define ID2(X, Y) X Y
    216   ///
    217   ///   a FIRST b SECOND c // expanded tokens are: a f1 f2 f3 b s1 s2 s3 c
    218   ///   d ID2(e f g, h) i  // expanded tokens are: d e f g h i
    219   ///
    220   /// the results would be:
    221   ///   expanded   => spelled
    222   ///   ------------------------
    223   ///            a => a
    224   ///     s1 s2 s3 => SECOND
    225   ///   a f1 f2 f3 => a FIRST
    226   ///         a f1 => can't map
    227   ///        s1 s2 => can't map
    228   ///         e f  => e f
    229   ///         g h  => can't map
    230   ///
    231   /// EXPECTS: \p Expanded is a subrange of expandedTokens().
    232   /// Complexity is logarithmic.
    233   llvm::Optional<llvm::ArrayRef<syntax::Token>>
    234   spelledForExpanded(llvm::ArrayRef<syntax::Token> Expanded) const;
    235 
    236   /// Find the subranges of expanded tokens, corresponding to \p Spelled.
    237   ///
    238   /// Some spelled tokens may not be present in the expanded token stream, so
    239   /// this function can return an empty vector, e.g. for tokens of macro
    240   /// directives or disabled preprocessor branches.
    241   ///
    242   /// Some spelled tokens can be duplicated in the expanded token stream
    243   /// multiple times and this function will return multiple results in those
    244   /// cases. This happens when \p Spelled is inside a macro argument.
    245   ///
    246   /// FIXME: return correct results on macro arguments. For now, we return an
    247   ///        empty list.
    248   ///
    249   /// (!) will return empty vector on tokens from #define body:
    250   /// E.g. for the following example:
    251   ///
    252   ///   #define FIRST(A) f1 A = A f2
    253   ///   #define SECOND s
    254   ///
    255   ///   a FIRST(arg) b SECOND c // expanded tokens are: a f1 arg = arg f2 b s
    256   /// The results would be
    257   ///   spelled           => expanded
    258   ///   ------------------------
    259   ///   #define FIRST     => {}
    260   ///   a FIRST(arg)      => {a f1 arg = arg f2}
    261   ///   arg               => {arg, arg} // arg #1 is before `=` and arg #2 is
    262   ///                                   // after `=` in the expanded tokens.
    263   llvm::SmallVector<llvm::ArrayRef<syntax::Token>, 1>
    264   expandedForSpelled(llvm::ArrayRef<syntax::Token> Spelled) const;
    265 
    266   /// An expansion produced by the preprocessor, includes macro expansions and
    267   /// preprocessor directives. Preprocessor always maps a non-empty range of
    268   /// spelled tokens to a (possibly empty) range of expanded tokens. Here is a
    269   /// few examples of expansions:
    270   ///    #pragma once      // Expands to an empty range.
    271   ///    #define FOO 1 2 3 // Expands an empty range.
    272   ///    FOO               // Expands to "1 2 3".
    273   /// FIXME(ibiryukov): implement this, currently #include expansions are empty.
    274   ///    #include <vector> // Expands to tokens produced by the include.
    275   struct Expansion {
    276     llvm::ArrayRef<syntax::Token> Spelled;
    277     llvm::ArrayRef<syntax::Token> Expanded;
    278   };
    279   /// If \p Spelled starts a mapping (e.g. if it's a macro name or '#' starting
    280   /// a preprocessor directive) return the subrange of expanded tokens that the
    281   /// macro expands to.
    282   llvm::Optional<Expansion>
    283   expansionStartingAt(const syntax::Token *Spelled) const;
    284   /// Returns all expansions (partially) expanded from the specified tokens.
    285   /// This is the expansions whose Spelled range intersects \p Spelled.
    286   std::vector<Expansion>
    287   expansionsOverlapping(llvm::ArrayRef<syntax::Token> Spelled) const;
    288 
    289   /// Lexed tokens of a file before preprocessing. E.g. for the following input
    290   ///     #define DECL(name) int name = 10
    291   ///     DECL(a);
    292   /// spelledTokens() returns
    293   ///    {"#", "define", "DECL", "(", "name", ")", "int", "name", "=", "10",
    294   ///     "DECL", "(", "a", ")", ";"}
    295   llvm::ArrayRef<syntax::Token> spelledTokens(FileID FID) const;
    296 
    297   /// Returns the spelled Token starting at Loc, if there are no such tokens
    298   /// returns nullptr.
    299   const syntax::Token *spelledTokenAt(SourceLocation Loc) const;
    300 
    301   /// Get all tokens that expand a macro in \p FID. For the following input
    302   ///     #define FOO B
    303   ///     #define FOO2(X) int X
    304   ///     FOO2(XY)
    305   ///     int B;
    306   ///     FOO;
    307   /// macroExpansions() returns {"FOO2", "FOO"} (from line 3 and 5
    308   /// respecitvely).
    309   std::vector<const syntax::Token *> macroExpansions(FileID FID) const;
    310 
    311   const SourceManager &sourceManager() const { return *SourceMgr; }
    312 
    313   std::string dumpForTests() const;
    314 
    315 private:
    316   /// Describes a mapping between a continuous subrange of spelled tokens and
    317   /// expanded tokens. Represents macro expansions, preprocessor directives,
    318   /// conditionally disabled pp regions, etc.
    319   ///   #define FOO 1+2
    320   ///   #define BAR(a) a + 1
    321   ///   FOO    // invocation #1, tokens = {'1','+','2'}, macroTokens = {'FOO'}.
    322   ///   BAR(1) // invocation #2, tokens = {'a', '+', '1'},
    323   ///                            macroTokens = {'BAR', '(', '1', ')'}.
    324   struct Mapping {
    325     // Positions in the corresponding spelled token stream. The corresponding
    326     // range is never empty.
    327     unsigned BeginSpelled = 0;
    328     unsigned EndSpelled = 0;
    329     // Positions in the expanded token stream. The corresponding range can be
    330     // empty.
    331     unsigned BeginExpanded = 0;
    332     unsigned EndExpanded = 0;
    333 
    334     /// For debugging purposes.
    335     std::string str() const;
    336   };
    337   /// Spelled tokens of the file with information about the subranges.
    338   struct MarkedFile {
    339     /// Lexed, but not preprocessed, tokens of the file. These map directly to
    340     /// text in the corresponding files and include tokens of all preprocessor
    341     /// directives.
    342     /// FIXME: spelled tokens don't change across FileID that map to the same
    343     ///        FileEntry. We could consider deduplicating them to save memory.
    344     std::vector<syntax::Token> SpelledTokens;
    345     /// A sorted list to convert between the spelled and expanded token streams.
    346     std::vector<Mapping> Mappings;
    347     /// The first expanded token produced for this FileID.
    348     unsigned BeginExpanded = 0;
    349     unsigned EndExpanded = 0;
    350   };
    351 
    352   friend class TokenCollector;
    353 
    354   /// Maps a single expanded token to its spelled counterpart or a mapping that
    355   /// produced it.
    356   std::pair<const syntax::Token *, const Mapping *>
    357   spelledForExpandedToken(const syntax::Token *Expanded) const;
    358 
    359   /// Returns a mapping starting before \p Spelled token, or nullptr if no
    360   /// such mapping exists.
    361   static const Mapping *
    362   mappingStartingBeforeSpelled(const MarkedFile &F,
    363                                const syntax::Token *Spelled);
    364 
    365   /// Convert a private Mapping to a public Expansion.
    366   Expansion makeExpansion(const MarkedFile &, const Mapping &) const;
    367   /// Returns the file that the Spelled tokens are taken from.
    368   /// Asserts that they are non-empty, from a tracked file, and in-bounds.
    369   const MarkedFile &fileForSpelled(llvm::ArrayRef<syntax::Token> Spelled) const;
    370 
    371   /// Token stream produced after preprocessing, conceputally this captures the
    372   /// same stream as 'clang -E' (excluding the preprocessor directives like
    373   /// #file, etc.).
    374   std::vector<syntax::Token> ExpandedTokens;
    375   // Index of ExpandedTokens for faster lookups by SourceLocation.
    376   llvm::DenseMap<SourceLocation, unsigned> ExpandedTokIndex;
    377   llvm::DenseMap<FileID, MarkedFile> Files;
    378   // The value is never null, pointer instead of reference to avoid disabling
    379   // implicit assignment operator.
    380   const SourceManager *SourceMgr;
    381 };
    382 
    383 /// The spelled tokens that overlap or touch a spelling location Loc.
    384 /// This always returns 0-2 tokens.
    385 llvm::ArrayRef<syntax::Token>
    386 spelledTokensTouching(SourceLocation Loc, const syntax::TokenBuffer &Tokens);
    387 llvm::ArrayRef<syntax::Token>
    388 spelledTokensTouching(SourceLocation Loc, llvm::ArrayRef<syntax::Token> Tokens);
    389 
    390 /// The identifier token that overlaps or touches a spelling location Loc.
    391 /// If there is none, returns nullptr.
    392 const syntax::Token *
    393 spelledIdentifierTouching(SourceLocation Loc,
    394                           llvm::ArrayRef<syntax::Token> Tokens);
    395 const syntax::Token *
    396 spelledIdentifierTouching(SourceLocation Loc,
    397                           const syntax::TokenBuffer &Tokens);
    398 
    399 /// Lex the text buffer, corresponding to \p FID, in raw mode and record the
    400 /// resulting spelled tokens. Does minimal post-processing on raw identifiers,
    401 /// setting the appropriate token kind (instead of the raw_identifier reported
    402 /// by lexer in raw mode). This is a very low-level function, most users should
    403 /// prefer to use TokenCollector. Lexing in raw mode produces wildly different
    404 /// results from what one might expect when running a C++ frontend, e.g.
    405 /// preprocessor does not run at all.
    406 /// The result will *not* have a 'eof' token at the end.
    407 std::vector<syntax::Token> tokenize(FileID FID, const SourceManager &SM,
    408                                     const LangOptions &LO);
    409 /// Similar to one above, instead of whole file tokenizes a part of it. Note
    410 /// that, the first token might be incomplete if FR.startOffset is not at the
    411 /// beginning of a token, and the last token returned will start before the
    412 /// FR.endOffset but might end after it.
    413 std::vector<syntax::Token>
    414 tokenize(const FileRange &FR, const SourceManager &SM, const LangOptions &LO);
    415 
    416 /// Collects tokens for the main file while running the frontend action. An
    417 /// instance of this object should be created on
    418 /// FrontendAction::BeginSourceFile() and the results should be consumed after
    419 /// FrontendAction::Execute() finishes.
    420 class TokenCollector {
    421 public:
    422   /// Adds the hooks to collect the tokens. Should be called before the
    423   /// preprocessing starts, i.e. as a part of BeginSourceFile() or
    424   /// CreateASTConsumer().
    425   TokenCollector(Preprocessor &P);
    426 
    427   /// Finalizes token collection. Should be called after preprocessing is
    428   /// finished, i.e. after running Execute().
    429   LLVM_NODISCARD TokenBuffer consume() &&;
    430 
    431 private:
    432   /// Maps from a start to an end spelling location of transformations
    433   /// performed by the preprocessor. These include:
    434   ///   1. range from '#' to the last token in the line for PP directives,
    435   ///   2. macro name and arguments for macro expansions.
    436   /// Note that we record only top-level macro expansions, intermediate
    437   /// expansions (e.g. inside macro arguments) are ignored.
    438   ///
    439   /// Used to find correct boundaries of macro calls and directives when
    440   /// building mappings from spelled to expanded tokens.
    441   ///
    442   /// Logically, at each point of the preprocessor execution there is a stack of
    443   /// macro expansions being processed and we could use it to recover the
    444   /// location information we need. However, the public preprocessor API only
    445   /// exposes the points when macro expansions start (when we push a macro onto
    446   /// the stack) and not when they end (when we pop a macro from the stack).
    447   /// To workaround this limitation, we rely on source location information
    448   /// stored in this map.
    449   using PPExpansions = llvm::DenseMap<SourceLocation, SourceLocation>;
    450   class Builder;
    451   class CollectPPExpansions;
    452 
    453   std::vector<syntax::Token> Expanded;
    454   // FIXME: we only store macro expansions, also add directives(#pragma, etc.)
    455   PPExpansions Expansions;
    456   Preprocessor &PP;
    457   CollectPPExpansions *Collector;
    458 };
    459 
    460 } // namespace syntax
    461 } // namespace clang
    462 
    463 #endif
    464