Home | History | Annotate | Line # | Download | only in Lex
      1 //===- Lexer.h - C Language Family Lexer ------------------------*- C++ -*-===//
      2 //
      3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
      4 // See https://llvm.org/LICENSE.txt for license information.
      5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
      6 //
      7 //===----------------------------------------------------------------------===//
      8 //
      9 //  This file defines the Lexer interface.
     10 //
     11 //===----------------------------------------------------------------------===//
     12 
     13 #ifndef LLVM_CLANG_LEX_LEXER_H
     14 #define LLVM_CLANG_LEX_LEXER_H
     15 
     16 #include "clang/Basic/LangOptions.h"
     17 #include "clang/Basic/SourceLocation.h"
     18 #include "clang/Basic/TokenKinds.h"
     19 #include "clang/Lex/PreprocessorLexer.h"
     20 #include "clang/Lex/Token.h"
     21 #include "llvm/ADT/Optional.h"
     22 #include "llvm/ADT/SmallVector.h"
     23 #include "llvm/ADT/StringRef.h"
     24 #include <cassert>
     25 #include <cstdint>
     26 #include <string>
     27 
     28 namespace llvm {
     29 
     30 class MemoryBufferRef;
     31 
     32 } // namespace llvm
     33 
     34 namespace clang {
     35 
     36 class DiagnosticBuilder;
     37 class Preprocessor;
     38 class SourceManager;
     39 
/// Identifies which flavor of version-control conflict marker, if any, the
/// lexer is currently recovering from.
enum ConflictMarkerKind {
  /// The lexer is not inside a conflict-marker region.
  CMK_None = 0,

  /// Normal or diff3 style: opened by at least 7 "<"s, divided by at least
  /// 7 "="s or "|"s, and closed by at least 7 ">"s.
  CMK_Normal = 1,

  /// Perforce style: opened by 4 ">"s, divided by 4 "="s, and closed by
  /// 4 "<"s.
  CMK_Perforce = 2
};
     54 
/// The extent of a file's preamble, expressed as a byte count, together with
/// the flag required by PreprocessorOptions::PrecompiledPreambleBytes.
/// A BOM, when present, is counted as part of the preamble.
struct PreambleBounds {
  /// Number of bytes occupied by the preamble.
  unsigned Size;

  /// True when the preamble stops exactly at the start of a new line.
  ///
  /// Tells the lexer whether it is positioned at the beginning of a line once
  /// the preamble has been skipped.
  bool PreambleEndsAtStartOfLine;

  /// Stores both values verbatim.
  PreambleBounds(unsigned Size, bool PreambleEndsAtStartOfLine)
      : Size{Size}, PreambleEndsAtStartOfLine{PreambleEndsAtStartOfLine} {}
};
     71 
     72 /// Lexer - This provides a simple interface that turns a text buffer into a
     73 /// stream of tokens.  This provides no support for file reading or buffering,
     74 /// or buffering/seeking of tokens, only forward lexing is supported.  It relies
     75 /// on the specified Preprocessor object to handle preprocessor directives, etc.
     76 class Lexer : public PreprocessorLexer {
     77   friend class Preprocessor;
     78 
     79   void anchor() override;
     80 
     81   //===--------------------------------------------------------------------===//
     82   // Constant configuration values for this lexer.
     83 
     84   // Start of the buffer.
     85   const char *BufferStart;
     86 
     87   // End of the buffer.
     88   const char *BufferEnd;
     89 
     90   // Location for start of file.
     91   SourceLocation FileLoc;
     92 
     93   // LangOpts enabled by this language (cache).
     94   LangOptions LangOpts;
     95 
     96   // True if lexer for _Pragma handling.
     97   bool Is_PragmaLexer;
     98 
     99   //===--------------------------------------------------------------------===//
    100   // Context-specific lexing flags set by the preprocessor.
    101   //
    102 
    103   /// ExtendedTokenMode - The lexer can optionally keep comments and whitespace
    104   /// and return them as tokens.  This is used for -C and -CC modes, and
    105   /// whitespace preservation can be useful for some clients that want to lex
    106   /// the file in raw mode and get every character from the file.
    107   ///
    108   /// When this is set to 2 it returns comments and whitespace.  When set to 1
    109   /// it returns comments, when it is set to 0 it returns normal tokens only.
    110   unsigned char ExtendedTokenMode;
    111 
    112   //===--------------------------------------------------------------------===//
    113   // Context that changes as the file is lexed.
    114   // NOTE: any state that mutates when in raw mode must have save/restore code
    115   // in Lexer::isNextPPTokenLParen.
    116 
    117   // BufferPtr - Current pointer into the buffer.  This is the next character
    118   // to be lexed.
    119   const char *BufferPtr;
    120 
    121   // IsAtStartOfLine - True if the next lexed token should get the "start of
    122   // line" flag set on it.
    123   bool IsAtStartOfLine;
    124 
    125   bool IsAtPhysicalStartOfLine;
    126 
    127   bool HasLeadingSpace;
    128 
    129   bool HasLeadingEmptyMacro;
    130 
    131   // NewLinePtr - A pointer to new line character '\n' being lexed. For '\r\n',
    132   // it also points to '\n'.
    133   const char *NewLinePtr;
    134 
    135   // CurrentConflictMarkerState - The kind of conflict marker we are handling.
    136   ConflictMarkerKind CurrentConflictMarkerState;
    137 
    138   void InitLexer(const char *BufStart, const char *BufPtr, const char *BufEnd);
    139 
    140 public:
    141   /// Lexer constructor - Create a new lexer object for the specified buffer
    142   /// with the specified preprocessor managing the lexing process.  This lexer
    143   /// assumes that the associated file buffer and Preprocessor objects will
    144   /// outlive it, so it doesn't take ownership of either of them.
    145   Lexer(FileID FID, const llvm::MemoryBufferRef &InputFile, Preprocessor &PP);
    146 
    147   /// Lexer constructor - Create a new raw lexer object.  This object is only
    148   /// suitable for calls to 'LexFromRawLexer'.  This lexer assumes that the
    149   /// text range will outlive it, so it doesn't take ownership of it.
    150   Lexer(SourceLocation FileLoc, const LangOptions &LangOpts,
    151         const char *BufStart, const char *BufPtr, const char *BufEnd);
    152 
    153   /// Lexer constructor - Create a new raw lexer object.  This object is only
    154   /// suitable for calls to 'LexFromRawLexer'.  This lexer assumes that the
    155   /// text range will outlive it, so it doesn't take ownership of it.
    156   Lexer(FileID FID, const llvm::MemoryBufferRef &FromFile,
    157         const SourceManager &SM, const LangOptions &LangOpts);
    158 
    159   Lexer(const Lexer &) = delete;
    160   Lexer &operator=(const Lexer &) = delete;
    161 
    162   /// Create_PragmaLexer: Lexer constructor - Create a new lexer object for
    163   /// _Pragma expansion.  This has a variety of magic semantics that this method
    164   /// sets up.  It returns a new'd Lexer that must be delete'd when done.
    165   static Lexer *Create_PragmaLexer(SourceLocation SpellingLoc,
    166                                    SourceLocation ExpansionLocStart,
    167                                    SourceLocation ExpansionLocEnd,
    168                                    unsigned TokLen, Preprocessor &PP);
    169 
    170   /// getLangOpts - Return the language features currently enabled.
    171   /// NOTE: this lexer modifies features as a file is parsed!
    172   const LangOptions &getLangOpts() const { return LangOpts; }
    173 
    174   /// getFileLoc - Return the File Location for the file we are lexing out of.
    175   /// The physical location encodes the location where the characters come from,
    176   /// the virtual location encodes where we should *claim* the characters came
    177   /// from.  Currently this is only used by _Pragma handling.
    178   SourceLocation getFileLoc() const { return FileLoc; }
    179 
    180 private:
    181   /// Lex - Return the next token in the file.  If this is the end of file, it
    182   /// returns the tok::eof token.  This implicitly involves the preprocessor.
    183   bool Lex(Token &Result);
    184 
    185 public:
    186   /// isPragmaLexer - Returns true if this Lexer is being used to lex a pragma.
    187   bool isPragmaLexer() const { return Is_PragmaLexer; }
    188 
    189 private:
    190   /// IndirectLex - An indirect call to 'Lex' that can be invoked via
    191   ///  the PreprocessorLexer interface.
    192   void IndirectLex(Token &Result) override { Lex(Result); }
    193 
    194 public:
    195   /// LexFromRawLexer - Lex a token from a designated raw lexer (one with no
    196   /// associated preprocessor object.  Return true if the 'next character to
    197   /// read' pointer points at the end of the lexer buffer, false otherwise.
    198   bool LexFromRawLexer(Token &Result) {
    199     assert(LexingRawMode && "Not already in raw mode!");
    200     Lex(Result);
    201     // Note that lexing to the end of the buffer doesn't implicitly delete the
    202     // lexer when in raw mode.
    203     return BufferPtr == BufferEnd;
    204   }
    205 
    206   /// isKeepWhitespaceMode - Return true if the lexer should return tokens for
    207   /// every character in the file, including whitespace and comments.  This
    208   /// should only be used in raw mode, as the preprocessor is not prepared to
    209   /// deal with the excess tokens.
    210   bool isKeepWhitespaceMode() const {
    211     return ExtendedTokenMode > 1;
    212   }
    213 
    214   /// SetKeepWhitespaceMode - This method lets clients enable or disable
    215   /// whitespace retention mode.
    216   void SetKeepWhitespaceMode(bool Val) {
    217     assert((!Val || LexingRawMode || LangOpts.TraditionalCPP) &&
    218            "Can only retain whitespace in raw mode or -traditional-cpp");
    219     ExtendedTokenMode = Val ? 2 : 0;
    220   }
    221 
    222   /// inKeepCommentMode - Return true if the lexer should return comments as
    223   /// tokens.
    224   bool inKeepCommentMode() const {
    225     return ExtendedTokenMode > 0;
    226   }
    227 
    228   /// SetCommentRetentionMode - Change the comment retention mode of the lexer
    229   /// to the specified mode.  This is really only useful when lexing in raw
    230   /// mode, because otherwise the lexer needs to manage this.
    231   void SetCommentRetentionState(bool Mode) {
    232     assert(!isKeepWhitespaceMode() &&
    233            "Can't play with comment retention state when retaining whitespace");
    234     ExtendedTokenMode = Mode ? 1 : 0;
    235   }
    236 
    237   /// Sets the extended token mode back to its initial value, according to the
    238   /// language options and preprocessor. This controls whether the lexer
    239   /// produces comment and whitespace tokens.
    240   ///
    241   /// This requires the lexer to have an associated preprocessor. A standalone
    242   /// lexer has nothing to reset to.
    243   void resetExtendedTokenMode();
    244 
    245   /// Gets source code buffer.
    246   StringRef getBuffer() const {
    247     return StringRef(BufferStart, BufferEnd - BufferStart);
    248   }
    249 
    250   /// ReadToEndOfLine - Read the rest of the current preprocessor line as an
    251   /// uninterpreted string.  This switches the lexer out of directive mode.
    252   void ReadToEndOfLine(SmallVectorImpl<char> *Result = nullptr);
    253 
    254 
    255   /// Diag - Forwarding function for diagnostics.  This translates a source
    256   /// position in the current buffer into a SourceLocation object for rendering.
    257   DiagnosticBuilder Diag(const char *Loc, unsigned DiagID) const;
    258 
    259   /// getSourceLocation - Return a source location identifier for the specified
    260   /// offset in the current file.
    261   SourceLocation getSourceLocation(const char *Loc, unsigned TokLen = 1) const;
    262 
    263   /// getSourceLocation - Return a source location for the next character in
    264   /// the current file.
    265   SourceLocation getSourceLocation() override {
    266     return getSourceLocation(BufferPtr);
    267   }
    268 
    269   /// Return the current location in the buffer.
    270   const char *getBufferLocation() const { return BufferPtr; }
    271 
    272   /// Returns the current lexing offset.
    273   unsigned getCurrentBufferOffset() {
    274     assert(BufferPtr >= BufferStart && "Invalid buffer state");
    275     return BufferPtr - BufferStart;
    276   }
    277 
    278   /// Skip over \p NumBytes bytes.
    279   ///
    280   /// If the skip is successful, the next token will be lexed from the new
    281   /// offset. The lexer also assumes that we skipped to the start of the line.
    282   ///
    283   /// \returns true if the skip failed (new offset would have been past the
    284   /// end of the buffer), false otherwise.
    285   bool skipOver(unsigned NumBytes);
    286 
    287   /// Stringify - Convert the specified string into a C string by i) escaping
    288   /// '\\' and " characters and ii) replacing newline character(s) with "\\n".
    289   /// If Charify is true, this escapes the ' character instead of ".
    290   static std::string Stringify(StringRef Str, bool Charify = false);
    291 
    292   /// Stringify - Convert the specified string into a C string by i) escaping
    293   /// '\\' and " characters and ii) replacing newline character(s) with "\\n".
    294   static void Stringify(SmallVectorImpl<char> &Str);
    295 
    296   /// getSpelling - This method is used to get the spelling of a token into a
    297   /// preallocated buffer, instead of as an std::string.  The caller is required
    298   /// to allocate enough space for the token, which is guaranteed to be at least
    299   /// Tok.getLength() bytes long.  The length of the actual result is returned.
    300   ///
    301   /// Note that this method may do two possible things: it may either fill in
    302   /// the buffer specified with characters, or it may *change the input pointer*
    303   /// to point to a constant buffer with the data already in it (avoiding a
    304   /// copy).  The caller is not allowed to modify the returned buffer pointer
    305   /// if an internal buffer is returned.
    306   static unsigned getSpelling(const Token &Tok, const char *&Buffer,
    307                               const SourceManager &SourceMgr,
    308                               const LangOptions &LangOpts,
    309                               bool *Invalid = nullptr);
    310 
    311   /// getSpelling() - Return the 'spelling' of the Tok token.  The spelling of a
    312   /// token is the characters used to represent the token in the source file
    313   /// after trigraph expansion and escaped-newline folding.  In particular, this
    314   /// wants to get the true, uncanonicalized, spelling of things like digraphs,
    315   /// UCNs, etc.
    316   static std::string getSpelling(const Token &Tok,
    317                                  const SourceManager &SourceMgr,
    318                                  const LangOptions &LangOpts,
    319                                  bool *Invalid = nullptr);
    320 
    321   /// getSpelling - This method is used to get the spelling of the
    322   /// token at the given source location.  If, as is usually true, it
    323   /// is not necessary to copy any data, then the returned string may
    324   /// not point into the provided buffer.
    325   ///
    326   /// This method lexes at the expansion depth of the given
    327   /// location and does not jump to the expansion or spelling
    328   /// location.
    329   static StringRef getSpelling(SourceLocation loc,
    330                                SmallVectorImpl<char> &buffer,
    331                                const SourceManager &SM,
    332                                const LangOptions &options,
    333                                bool *invalid = nullptr);
    334 
    335   /// MeasureTokenLength - Relex the token at the specified location and return
    336   /// its length in bytes in the input file.  If the token needs cleaning (e.g.
    337   /// includes a trigraph or an escaped newline) then this count includes bytes
    338   /// that are part of that.
    339   static unsigned MeasureTokenLength(SourceLocation Loc,
    340                                      const SourceManager &SM,
    341                                      const LangOptions &LangOpts);
    342 
    343   /// Relex the token at the specified location.
    344   /// \returns true if there was a failure, false on success.
    345   static bool getRawToken(SourceLocation Loc, Token &Result,
    346                           const SourceManager &SM,
    347                           const LangOptions &LangOpts,
    348                           bool IgnoreWhiteSpace = false);
    349 
    350   /// Given a location anywhere in a source buffer, find the location
    351   /// that corresponds to the beginning of the token in which the original
    352   /// source location lands.
    353   static SourceLocation GetBeginningOfToken(SourceLocation Loc,
    354                                             const SourceManager &SM,
    355                                             const LangOptions &LangOpts);
    356 
    357   /// Get the physical length (including trigraphs and escaped newlines) of the
    358   /// first \p CharNo characters of the token starting at TokStart.
    359   static unsigned getTokenPrefixLength(SourceLocation TokStart,
    360                                        unsigned CharNo,
    361                                        const SourceManager &SM,
    362                                        const LangOptions &LangOpts);
    363 
    364   /// AdvanceToTokenCharacter - If the current SourceLocation specifies a
    365   /// location at the start of a token, return a new location that specifies a
    366   /// character within the token.  This handles trigraphs and escaped newlines.
    367   static SourceLocation AdvanceToTokenCharacter(SourceLocation TokStart,
    368                                                 unsigned Characters,
    369                                                 const SourceManager &SM,
    370                                                 const LangOptions &LangOpts) {
    371     return TokStart.getLocWithOffset(
    372         getTokenPrefixLength(TokStart, Characters, SM, LangOpts));
    373   }
    374 
    375   /// Computes the source location just past the end of the
    376   /// token at this source location.
    377   ///
    378   /// This routine can be used to produce a source location that
    379   /// points just past the end of the token referenced by \p Loc, and
    380   /// is generally used when a diagnostic needs to point just after a
    381   /// token where it expected something different than it received. If
    382   /// the returned source location would not be meaningful (e.g., if
    383   /// it points into a macro), this routine returns an invalid
    384   /// source location.
    385   ///
    386   /// \param Offset an offset from the end of the token, where the source
    387   /// location should refer to. The default offset (0) produces a source
    388   /// location pointing just past the end of the token; an offset of 1 produces
    389   /// a source location pointing to the last character in the token, etc.
    390   static SourceLocation getLocForEndOfToken(SourceLocation Loc, unsigned Offset,
    391                                             const SourceManager &SM,
    392                                             const LangOptions &LangOpts);
    393 
    394   /// Given a token range, produce a corresponding CharSourceRange that
    395   /// is not a token range. This allows the source range to be used by
    396   /// components that don't have access to the lexer and thus can't find the
    397   /// end of the range for themselves.
    398   static CharSourceRange getAsCharRange(SourceRange Range,
    399                                         const SourceManager &SM,
    400                                         const LangOptions &LangOpts) {
    401     SourceLocation End = getLocForEndOfToken(Range.getEnd(), 0, SM, LangOpts);
    402     return End.isInvalid() ? CharSourceRange()
    403                            : CharSourceRange::getCharRange(
    404                                  Range.getBegin(), End);
    405   }
    406   static CharSourceRange getAsCharRange(CharSourceRange Range,
    407                                         const SourceManager &SM,
    408                                         const LangOptions &LangOpts) {
    409     return Range.isTokenRange()
    410                ? getAsCharRange(Range.getAsRange(), SM, LangOpts)
    411                : Range;
    412   }
    413 
    414   /// Returns true if the given MacroID location points at the first
    415   /// token of the macro expansion.
    416   ///
    417   /// \param MacroBegin If non-null and function returns true, it is set to
    418   /// begin location of the macro.
    419   static bool isAtStartOfMacroExpansion(SourceLocation loc,
    420                                         const SourceManager &SM,
    421                                         const LangOptions &LangOpts,
    422                                         SourceLocation *MacroBegin = nullptr);
    423 
    424   /// Returns true if the given MacroID location points at the last
    425   /// token of the macro expansion.
    426   ///
    427   /// \param MacroEnd If non-null and function returns true, it is set to
    428   /// end location of the macro.
    429   static bool isAtEndOfMacroExpansion(SourceLocation loc,
    430                                       const SourceManager &SM,
    431                                       const LangOptions &LangOpts,
    432                                       SourceLocation *MacroEnd = nullptr);
    433 
    434   /// Accepts a range and returns a character range with file locations.
    435   ///
    436   /// Returns a null range if a part of the range resides inside a macro
    437   /// expansion or the range does not reside on the same FileID.
    438   ///
    439   /// This function is trying to deal with macros and return a range based on
    440   /// file locations. The cases where it can successfully handle macros are:
    441   ///
    442   /// -begin or end range lies at the start or end of a macro expansion, in
    443   ///  which case the location will be set to the expansion point, e.g:
    444   ///    \#define M 1 2
    445   ///    a M
    446   /// If you have a range [a, 2] (where 2 came from the macro), the function
    447   /// will return a range for "a M"
    448   /// if you have range [a, 1], the function will fail because the range
    449   /// overlaps with only a part of the macro
    450   ///
    451   /// -The macro is a function macro and the range can be mapped to the macro
    452   ///  arguments, e.g:
    453   ///    \#define M 1 2
    454   ///    \#define FM(x) x
    455   ///    FM(a b M)
    456   /// if you have range [b, 2], the function will return the file range "b M"
    457   /// inside the macro arguments.
    458   /// if you have range [a, 2], the function will return the file range
    459   /// "FM(a b M)" since the range includes all of the macro expansion.
    460   static CharSourceRange makeFileCharRange(CharSourceRange Range,
    461                                            const SourceManager &SM,
    462                                            const LangOptions &LangOpts);
    463 
    464   /// Returns a string for the source that the range encompasses.
    465   static StringRef getSourceText(CharSourceRange Range,
    466                                  const SourceManager &SM,
    467                                  const LangOptions &LangOpts,
    468                                  bool *Invalid = nullptr);
    469 
    470   /// Retrieve the name of the immediate macro expansion.
    471   ///
    472   /// This routine starts from a source location, and finds the name of the macro
    473   /// responsible for its immediate expansion. It looks through any intervening
    474   /// macro argument expansions to compute this. It returns a StringRef which
    475   /// refers to the SourceManager-owned buffer of the source where that macro
    476   /// name is spelled. Thus, the result shouldn't out-live that SourceManager.
    477   static StringRef getImmediateMacroName(SourceLocation Loc,
    478                                          const SourceManager &SM,
    479                                          const LangOptions &LangOpts);
    480 
    481   /// Retrieve the name of the immediate macro expansion.
    482   ///
    483   /// This routine starts from a source location, and finds the name of the
    484   /// macro responsible for its immediate expansion. It looks through any
    485   /// intervening macro argument expansions to compute this. It returns a
    486   /// StringRef which refers to the SourceManager-owned buffer of the source
    487   /// where that macro name is spelled. Thus, the result shouldn't out-live
    488   /// that SourceManager.
    489   ///
    490   /// This differs from Lexer::getImmediateMacroName in that any macro argument
    491   /// location will result in the topmost function macro that accepted it.
    492   /// e.g.
    493   /// \code
    494   ///   MAC1( MAC2(foo) )
    495   /// \endcode
    496   /// for location of 'foo' token, this function will return "MAC1" while
    497   /// Lexer::getImmediateMacroName will return "MAC2".
    498   static StringRef getImmediateMacroNameForDiagnostics(
    499       SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts);
    500 
    501   /// Compute the preamble of the given file.
    502   ///
    503   /// The preamble of a file contains the initial comments, include directives,
    504   /// and other preprocessor directives that occur before the code in this
    505   /// particular file actually begins. The preamble of the main source file is
    506   /// a potential prefix header.
    507   ///
    508   /// \param Buffer The memory buffer containing the file's contents.
    509   ///
    510   /// \param MaxLines If non-zero, restrict the length of the preamble
    511   /// to fewer than this number of lines.
    512   ///
    513   /// \returns The offset into the file where the preamble ends and the rest
    514   /// of the file begins along with a boolean value indicating whether
    515   /// the preamble ends at the beginning of a new line.
    516   static PreambleBounds ComputePreamble(StringRef Buffer,
    517                                         const LangOptions &LangOpts,
    518                                         unsigned MaxLines = 0);
    519 
    520   /// Finds the token that comes right after the given location.
    521   ///
    522   /// Returns the next token, or none if the location is inside a macro.
    523   static Optional<Token> findNextToken(SourceLocation Loc,
    524                                        const SourceManager &SM,
    525                                        const LangOptions &LangOpts);
    526 
    527   /// Checks that the given token is the first token that occurs after
    528   /// the given location (this excludes comments and whitespace). Returns the
    529   /// location immediately after the specified token. If the token is not found
    530   /// or the location is inside a macro, the returned source location will be
    531   /// invalid.
    532   static SourceLocation findLocationAfterToken(SourceLocation loc,
    533                                          tok::TokenKind TKind,
    534                                          const SourceManager &SM,
    535                                          const LangOptions &LangOpts,
    536                                          bool SkipTrailingWhitespaceAndNewLine);
    537 
    538   /// Returns true if the given character could appear in an identifier.
    539   static bool isIdentifierBodyChar(char c, const LangOptions &LangOpts);
    540 
    541   /// Checks whether the new line pointed to by Str is preceded by an escape
    542   /// sequence.
    543   static bool isNewLineEscaped(const char *BufferStart, const char *Str);
    544 
    545   /// getCharAndSizeNoWarn - Like the getCharAndSize method, but does not ever
    546   /// emit a warning.
    547   static inline char getCharAndSizeNoWarn(const char *Ptr, unsigned &Size,
    548                                           const LangOptions &LangOpts) {
    549     // If this is not a trigraph and not a UCN or escaped newline, return
    550     // quickly.
    551     if (isObviouslySimpleCharacter(Ptr[0])) {
    552       Size = 1;
    553       return *Ptr;
    554     }
    555 
    556     Size = 0;
    557     return getCharAndSizeSlowNoWarn(Ptr, Size, LangOpts);
    558   }
    559 
    560   /// Returns the leading whitespace for line that corresponds to the given
    561   /// location \p Loc.
    562   static StringRef getIndentationForLine(SourceLocation Loc,
    563                                          const SourceManager &SM);
    564 
    565 private:
    566   //===--------------------------------------------------------------------===//
    567   // Internal implementation interfaces.
    568 
    569   /// LexTokenInternal - Internal interface to lex a preprocessing token. Called
    570   /// by Lex.
    571   ///
    572   bool LexTokenInternal(Token &Result, bool TokAtPhysicalStartOfLine);
    573 
    574   bool CheckUnicodeWhitespace(Token &Result, uint32_t C, const char *CurPtr);
    575 
    576   /// Given that a token begins with the Unicode character \p C, figure out
    577   /// what kind of token it is and dispatch to the appropriate lexing helper
    578   /// function.
    579   bool LexUnicode(Token &Result, uint32_t C, const char *CurPtr);
    580 
    581   /// FormTokenWithChars - When we lex a token, we have identified a span
    582   /// starting at BufferPtr, going to TokEnd that forms the token.  This method
    583   /// takes that range and assigns it to the token as its location and size.  In
    584   /// addition, since tokens cannot overlap, this also updates BufferPtr to be
    585   /// TokEnd.
    586   void FormTokenWithChars(Token &Result, const char *TokEnd,
    587                           tok::TokenKind Kind) {
    588     unsigned TokLen = TokEnd-BufferPtr;
    589     Result.setLength(TokLen);
    590     Result.setLocation(getSourceLocation(BufferPtr, TokLen));
    591     Result.setKind(Kind);
    592     BufferPtr = TokEnd;
    593   }
    594 
    595   /// isNextPPTokenLParen - Return 1 if the next unexpanded token will return a
    596   /// tok::l_paren token, 0 if it is something else and 2 if there are no more
    597   /// tokens in the buffer controlled by this lexer.
    598   unsigned isNextPPTokenLParen();
    599 
    600   //===--------------------------------------------------------------------===//
    601   // Lexer character reading interfaces.
    602 
    603   // This lexer is built on two interfaces for reading characters, both of which
    604   // automatically provide phase 1/2 translation.  getAndAdvanceChar is used
    605   // when we know that we will be reading a character from the input buffer and
    606   // that this character will be part of the result token. This occurs in (e.g.)
    607   // string processing, because we know we need to read until we find the
    608   // closing '"' character.
    609   //
    610   // The second interface is the combination of getCharAndSize with
    611   // ConsumeChar.  getCharAndSize reads a phase 1/2 translated character,
    612   // returning it and its size.  If the lexer decides that this character is
    613   // part of the current token, it calls ConsumeChar on it.  This two stage
    614   // approach allows us to emit diagnostics for characters (e.g. warnings about
    615   // trigraphs), knowing that they only are emitted if the character is
    616   // consumed.
    617 
    618   /// isObviouslySimpleCharacter - Return true if the specified character is
    619   /// obviously the same in translation phase 1 and translation phase 3.  This
    620   /// can return false for characters that end up being the same, but it will
    621   /// never return true for something that needs to be mapped.
    622   static bool isObviouslySimpleCharacter(char C) {
    623     return C != '?' && C != '\\';
    624   }
    625 
    626   /// getAndAdvanceChar - Read a single 'character' from the specified buffer,
    627   /// advance over it, and return it.  This is tricky in several cases.  Here we
    628   /// just handle the trivial case and fall-back to the non-inlined
    629   /// getCharAndSizeSlow method to handle the hard case.
    630   inline char getAndAdvanceChar(const char *&Ptr, Token &Tok) {
    631     // If this is not a trigraph and not a UCN or escaped newline, return
    632     // quickly.
    633     if (isObviouslySimpleCharacter(Ptr[0])) return *Ptr++;
    634 
    635     unsigned Size = 0;
    636     char C = getCharAndSizeSlow(Ptr, Size, &Tok);
    637     Ptr += Size;
    638     return C;
    639   }
    640 
    641   /// ConsumeChar - When a character (identified by getCharAndSize) is consumed
    642   /// and added to a given token, check to see if there are diagnostics that
    643   /// need to be emitted or flags that need to be set on the token.  If so, do
    644   /// it.
    645   const char *ConsumeChar(const char *Ptr, unsigned Size, Token &Tok) {
    646     // Normal case, we consumed exactly one token.  Just return it.
    647     if (Size == 1)
    648       return Ptr+Size;
    649 
    650     // Otherwise, re-lex the character with a current token, allowing
    651     // diagnostics to be emitted and flags to be set.
    652     Size = 0;
    653     getCharAndSizeSlow(Ptr, Size, &Tok);
    654     return Ptr+Size;
    655   }
    656 
    657   /// getCharAndSize - Peek a single 'character' from the specified buffer,
    658   /// get its size, and return it.  This is tricky in several cases.  Here we
    659   /// just handle the trivial case and fall-back to the non-inlined
    660   /// getCharAndSizeSlow method to handle the hard case.
    661   inline char getCharAndSize(const char *Ptr, unsigned &Size) {
    662     // If this is not a trigraph and not a UCN or escaped newline, return
    663     // quickly.
    664     if (isObviouslySimpleCharacter(Ptr[0])) {
    665       Size = 1;
    666       return *Ptr;
    667     }
    668 
    669     Size = 0;
    670     return getCharAndSizeSlow(Ptr, Size);
    671   }
    672 
    673   /// getCharAndSizeSlow - Handle the slow/uncommon case of the getCharAndSize
    674   /// method.
          ///
          /// \param Ptr  Start of the character to decode.
          /// \param Size Reports the character's size; the inline callers in this
          ///             class zero it before the call and advance by it afterwards.
          /// \param Tok  Optional token. ConsumeChar passes one so that diagnostics
          ///             can be emitted and flags set; getCharAndSize omits it.
          /// \return The decoded character.
    675   char getCharAndSizeSlow(const char *Ptr, unsigned &Size,
    676                           Token *Tok = nullptr);
    677 
    678   /// getEscapedNewLineSize - Return the size of the specified escaped newline,
    679   /// or 0 if it is not an escaped newline. P[-1] is known to be a "\" on entry
    680   /// to this function, i.e. \p P points just past the backslash.
    681   static unsigned getEscapedNewLineSize(const char *P);
    682 
    683   /// SkipEscapedNewLines - If \p P points to an escaped newline (or a series of
    684   /// them), skip over them and return a pointer to the first
    685   /// non-escaped-newline character found; otherwise return \p P unchanged.
    686   static const char *SkipEscapedNewLines(const char *P);
    687 
    688   /// getCharAndSizeSlowNoWarn - Same as getCharAndSizeSlow, but never emits a
    689   /// diagnostic.
          /// Being static, it has no Lexer instance to consult and takes the language
          /// options explicitly through \p LangOpts.
    690   static char getCharAndSizeSlowNoWarn(const char *Ptr, unsigned &Size,
    691                                        const LangOptions &LangOpts);
    692 
    693   //===--------------------------------------------------------------------===//
    694   // Other lexer functions.
    695 
          // NOTE(review): none of the next three declarations is documented in this
          // header. The remarks below are inferred from names and signatures only;
          // confirm them against the definitions.
          //
          // SetByteOffset: presumably moves the lexing position to \p Offset bytes
          // into the buffer and records whether that position starts a line.
    696   void SetByteOffset(unsigned Offset, bool StartOfLine);
    697 
          // Presumably copies start-of-line / leading-space state onto \p Result.
    698   void PropagateLineStartLeadingSpaceInfo(Token &Result);
    699 
          // Presumably lexes a user-defined-literal suffix starting at \p CurPtr and
          // returns the position after it.
    700   const char *LexUDSuffix(Token &Result, const char *CurPtr,
    701                           bool IsStringLiteral);
    702 
    703   // Helper functions to lex the remainder of a token of the specific type.
          // Each receives the token being formed (\p Result) and a position in the
          // buffer (\p CurPtr). NOTE(review): the meaning of the bool results is not
          // documented in this header; confirm against the definitions.
    704   bool LexIdentifier         (Token &Result, const char *CurPtr);
    705   bool LexNumericConstant    (Token &Result, const char *CurPtr);
    706   bool LexStringLiteral      (Token &Result, const char *CurPtr,
    707                               tok::TokenKind Kind);
    708   bool LexRawStringLiteral   (Token &Result, const char *CurPtr,
    709                               tok::TokenKind Kind);
    710   bool LexAngledStringLiteral(Token &Result, const char *CurPtr);
    711   bool LexCharConstant       (Token &Result, const char *CurPtr,
    712                               tok::TokenKind Kind);
    713   bool LexEndOfFile          (Token &Result, const char *CurPtr);
    714   bool SkipWhitespace        (Token &Result, const char *CurPtr,
    715                               bool &TokAtPhysicalStartOfLine);
    716   bool SkipLineComment       (Token &Result, const char *CurPtr,
    717                               bool &TokAtPhysicalStartOfLine);
    718   bool SkipBlockComment      (Token &Result, const char *CurPtr,
    719                               bool &TokAtPhysicalStartOfLine);
    720   bool SaveLineComment       (Token &Result, const char *CurPtr);
    721 
          // NOTE(review): the declarations from here to codeCompleteIncludedFile are
          // undocumented in this header; the remarks marked "presumably" are inferred
          // from names and signatures and need confirming against the definitions.
          //
          // Presumably these detect and recover from version-control conflict
          // markers (see ConflictMarkerKind).
    722   bool IsStartOfConflictMarker(const char *CurPtr);
    723   bool HandleEndOfConflictMarker(const char *CurPtr);
    724 
          // Presumably lexes an editor placeholder token into \p Result.
    725   bool lexEditorPlaceholder(Token &Result, const char *CurPtr);
    726 
          // Presumably tests whether \p CurPtr is the code-completion location.
    727   bool isCodeCompletionPoint(const char *CurPtr) const;
          // Moves BufferPtr to BufferEnd.
    728   void cutOffLexing() { BufferPtr = BufferEnd; }
    729 
          // Presumably tests whether the text at \p Start is a hexadecimal literal.
    730   bool isHexaLiteral(const char *Start, const LangOptions &LangOpts);
    731 
          // Presumably offers completions for the path of an #include; \p IsAngled
          // presumably distinguishes <...> from "..." includes.
    732   void codeCompleteIncludedFile(const char *PathStart,
    733                                 const char *CompletionPoint, bool IsAngled);
    734 
    735   /// Read a universal character name.
    736   ///
    737   /// \param StartPtr The position in the source buffer after the initial '\'.
    738   ///                 If the UCN is syntactically well-formed (but not
    739   ///                 necessarily valid), this parameter will be updated to
    740   ///                 point to the character after the UCN.
    741   /// \param SlashLoc The position in the source buffer of the '\'.
    742   /// \param Result   The token being formed. Pass \c nullptr to suppress
    743   ///                 diagnostics and handle token formation in the caller.
    744   ///
    745   /// \return The Unicode codepoint specified by the UCN, or 0 if the UCN is
    746   ///         invalid.
          ///
          /// NOTE(review): whether \p StartPtr is left untouched when the UCN is not
          /// well-formed is implied but not stated above; confirm in the definition.
    747   uint32_t tryReadUCN(const char *&StartPtr, const char *SlashLoc, Token *Result);
    748 
    749   /// Try to consume a UCN as part of an identifier at the current
    750   /// location.
          ///
    751   /// \param CurPtr Initially points to the range of characters in the source
    752   ///               buffer containing the '\'. Updated to point past the end of
    753   ///               the UCN on success.
    754   /// \param Size The number of characters occupied by the '\' (including
    755   ///             trigraphs and escaped newlines).
    756   /// \param Result The token being produced. Marked as containing a UCN on
    757   ///               success.
    758   /// \return \c true if a UCN was lexed and it produced an acceptable
    759   ///         identifier character, \c false otherwise.
          ///
          /// NOTE(review): the state of \p CurPtr and \p Result after a \c false
          /// return is not documented here; confirm in the definition.
    760   bool tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size,
    761                                Token &Result);
    762 
    763   /// Try to consume an identifier character encoded in UTF-8.
          ///
    764   /// \param CurPtr Points to the start of the (potential) UTF-8 code unit
    765   ///        sequence. On success, updated to point past the end of it.
    766   /// \return \c true if a UTF-8 sequence mapping to an acceptable identifier
    767   ///         character was lexed, \c false otherwise.
          ///
          /// NOTE(review): the state of \p CurPtr after a \c false return is not
          /// documented here; confirm in the definition.
    768   bool tryConsumeIdentifierUTF8Char(const char *&CurPtr);
    769 };
    770 
    771 } // namespace clang
    772 
    773 #endif // LLVM_CLANG_LEX_LEXER_H
    774