Home | History | Annotate | Line # | Download | only in AST
      1 //===--- CommentLexer.h - Lexer for structured comments ---------*- C++ -*-===//
      2 //
      3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
      4 // See https://llvm.org/LICENSE.txt for license information.
      5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
      6 //
      7 //===----------------------------------------------------------------------===//
      8 //
//  This file defines the lexer for structured comments and the supporting
//  token class.
     10 //
     11 //===----------------------------------------------------------------------===//
     12 
     13 #ifndef LLVM_CLANG_AST_COMMENTLEXER_H
     14 #define LLVM_CLANG_AST_COMMENTLEXER_H
     15 
     16 #include "clang/Basic/Diagnostic.h"
     17 #include "clang/Basic/SourceManager.h"
     18 #include "llvm/ADT/SmallString.h"
     19 #include "llvm/ADT/StringRef.h"
     20 #include "llvm/Support/Allocator.h"
     21 #include "llvm/Support/raw_ostream.h"
     22 
     23 namespace clang {
     24 namespace comments {
     25 
     26 class Lexer;
     27 class TextTokenRetokenizer;
     28 struct CommandInfo;
     29 class CommandTraits;
     30 
namespace tok {
// Kind tag stored in every comment Token (see Token::getKind()).
//
// Enumerators rely on implicit numbering; keep their order stable.
enum TokenKind {
  eof,
  newline,

  // Token with a text payload (Token::getText()).
  text,

  // Command that does not have an ID.
  unknown_command,

  // Command with an ID, that used backslash marker.
  backslash_command,

  // Command with an ID, that used 'at' marker.
  at_command,

  // Carries a verbatim block ID (Token::getVerbatimBlockID()).
  verbatim_block_begin,

  // Carries the text of one verbatim block line
  // (Token::getVerbatimBlockText()).
  verbatim_block_line,

  // Carries a verbatim block ID (Token::getVerbatimBlockID()).
  verbatim_block_end,

  // Carries a verbatim line ID (Token::getVerbatimLineID()).
  verbatim_line_name,

  // Carries the verbatim line text (Token::getVerbatimLineText()).
  verbatim_line_text,

  // <tag
  html_start_tag,

  // attr
  html_ident,

  // =
  html_equals,

  // "blah\"blah" or 'blah\'blah'
  html_quoted_string,

  // >
  html_greater,

  // />
  html_slash_greater,

  // </tag
  html_end_tag
};
} // end namespace tok
     53 
     54 /// Comment token.
     55 class Token {
     56   friend class Lexer;
     57   friend class TextTokenRetokenizer;
     58 
     59   /// The location of the token.
     60   SourceLocation Loc;
     61 
     62   /// The actual kind of the token.
     63   tok::TokenKind Kind;
     64 
     65   /// Integer value associated with a token.
     66   ///
     67   /// If the token is a known command, contains command ID and TextPtr is
     68   /// unused (command spelling can be found with CommandTraits).  Otherwise,
     69   /// contains the length of the string that starts at TextPtr.
     70   unsigned IntVal;
     71 
     72   /// Length of the token spelling in comment.  Can be 0 for synthenized
     73   /// tokens.
     74   unsigned Length;
     75 
     76   /// Contains text value associated with a token.
     77   const char *TextPtr;
     78 
     79 public:
     80   SourceLocation getLocation() const LLVM_READONLY { return Loc; }
     81   void setLocation(SourceLocation SL) { Loc = SL; }
     82 
     83   SourceLocation getEndLocation() const LLVM_READONLY {
     84     if (Length == 0 || Length == 1)
     85       return Loc;
     86     return Loc.getLocWithOffset(Length - 1);
     87   }
     88 
     89   tok::TokenKind getKind() const LLVM_READONLY { return Kind; }
     90   void setKind(tok::TokenKind K) { Kind = K; }
     91 
     92   bool is(tok::TokenKind K) const LLVM_READONLY { return Kind == K; }
     93   bool isNot(tok::TokenKind K) const LLVM_READONLY { return Kind != K; }
     94 
     95   unsigned getLength() const LLVM_READONLY { return Length; }
     96   void setLength(unsigned L) { Length = L; }
     97 
     98   StringRef getText() const LLVM_READONLY {
     99     assert(is(tok::text));
    100     return StringRef(TextPtr, IntVal);
    101   }
    102 
    103   void setText(StringRef Text) {
    104     assert(is(tok::text));
    105     TextPtr = Text.data();
    106     IntVal = Text.size();
    107   }
    108 
    109   StringRef getUnknownCommandName() const LLVM_READONLY {
    110     assert(is(tok::unknown_command));
    111     return StringRef(TextPtr, IntVal);
    112   }
    113 
    114   void setUnknownCommandName(StringRef Name) {
    115     assert(is(tok::unknown_command));
    116     TextPtr = Name.data();
    117     IntVal = Name.size();
    118   }
    119 
    120   unsigned getCommandID() const LLVM_READONLY {
    121     assert(is(tok::backslash_command) || is(tok::at_command));
    122     return IntVal;
    123   }
    124 
    125   void setCommandID(unsigned ID) {
    126     assert(is(tok::backslash_command) || is(tok::at_command));
    127     IntVal = ID;
    128   }
    129 
    130   unsigned getVerbatimBlockID() const LLVM_READONLY {
    131     assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end));
    132     return IntVal;
    133   }
    134 
    135   void setVerbatimBlockID(unsigned ID) {
    136     assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end));
    137     IntVal = ID;
    138   }
    139 
    140   StringRef getVerbatimBlockText() const LLVM_READONLY {
    141     assert(is(tok::verbatim_block_line));
    142     return StringRef(TextPtr, IntVal);
    143   }
    144 
    145   void setVerbatimBlockText(StringRef Text) {
    146     assert(is(tok::verbatim_block_line));
    147     TextPtr = Text.data();
    148     IntVal = Text.size();
    149   }
    150 
    151   unsigned getVerbatimLineID() const LLVM_READONLY {
    152     assert(is(tok::verbatim_line_name));
    153     return IntVal;
    154   }
    155 
    156   void setVerbatimLineID(unsigned ID) {
    157     assert(is(tok::verbatim_line_name));
    158     IntVal = ID;
    159   }
    160 
    161   StringRef getVerbatimLineText() const LLVM_READONLY {
    162     assert(is(tok::verbatim_line_text));
    163     return StringRef(TextPtr, IntVal);
    164   }
    165 
    166   void setVerbatimLineText(StringRef Text) {
    167     assert(is(tok::verbatim_line_text));
    168     TextPtr = Text.data();
    169     IntVal = Text.size();
    170   }
    171 
    172   StringRef getHTMLTagStartName() const LLVM_READONLY {
    173     assert(is(tok::html_start_tag));
    174     return StringRef(TextPtr, IntVal);
    175   }
    176 
    177   void setHTMLTagStartName(StringRef Name) {
    178     assert(is(tok::html_start_tag));
    179     TextPtr = Name.data();
    180     IntVal = Name.size();
    181   }
    182 
    183   StringRef getHTMLIdent() const LLVM_READONLY {
    184     assert(is(tok::html_ident));
    185     return StringRef(TextPtr, IntVal);
    186   }
    187 
    188   void setHTMLIdent(StringRef Name) {
    189     assert(is(tok::html_ident));
    190     TextPtr = Name.data();
    191     IntVal = Name.size();
    192   }
    193 
    194   StringRef getHTMLQuotedString() const LLVM_READONLY {
    195     assert(is(tok::html_quoted_string));
    196     return StringRef(TextPtr, IntVal);
    197   }
    198 
    199   void setHTMLQuotedString(StringRef Str) {
    200     assert(is(tok::html_quoted_string));
    201     TextPtr = Str.data();
    202     IntVal = Str.size();
    203   }
    204 
    205   StringRef getHTMLTagEndName() const LLVM_READONLY {
    206     assert(is(tok::html_end_tag));
    207     return StringRef(TextPtr, IntVal);
    208   }
    209 
    210   void setHTMLTagEndName(StringRef Name) {
    211     assert(is(tok::html_end_tag));
    212     TextPtr = Name.data();
    213     IntVal = Name.size();
    214   }
    215 
    216   void dump(const Lexer &L, const SourceManager &SM) const;
    217 };
    218 
    219 /// Comment lexer.
    220 class Lexer {
    221 private:
    222   Lexer(const Lexer &) = delete;
    223   void operator=(const Lexer &) = delete;
    224 
    225   /// Allocator for strings that are semantic values of tokens and have to be
    226   /// computed (for example, resolved decimal character references).
    227   llvm::BumpPtrAllocator &Allocator;
    228 
    229   DiagnosticsEngine &Diags;
    230 
    231   const CommandTraits &Traits;
    232 
    233   const char *const BufferStart;
    234   const char *const BufferEnd;
    235 
    236   const char *BufferPtr;
    237 
    238   /// One past end pointer for the current comment.  For BCPL comments points
    239   /// to newline or BufferEnd, for C comments points to star in '*/'.
    240   const char *CommentEnd;
    241 
    242   SourceLocation FileLoc;
    243 
    244   /// If true, the commands, html tags, etc will be parsed and reported as
    245   /// separate tokens inside the comment body. If false, the comment text will
    246   /// be parsed into text and newline tokens.
    247   bool ParseCommands;
    248 
    249   enum LexerCommentState : uint8_t {
    250     LCS_BeforeComment,
    251     LCS_InsideBCPLComment,
    252     LCS_InsideCComment,
    253     LCS_BetweenComments
    254   };
    255 
    256   /// Low-level lexer state, track if we are inside or outside of comment.
    257   LexerCommentState CommentState;
    258 
    259   enum LexerState : uint8_t {
    260     /// Lexing normal comment text
    261     LS_Normal,
    262 
    263     /// Finished lexing verbatim block beginning command, will lex first body
    264     /// line.
    265     LS_VerbatimBlockFirstLine,
    266 
    267     /// Lexing verbatim block body line-by-line, skipping line-starting
    268     /// decorations.
    269     LS_VerbatimBlockBody,
    270 
    271     /// Finished lexing verbatim line beginning command, will lex text (one
    272     /// line).
    273     LS_VerbatimLineText,
    274 
    275     /// Finished lexing \verbatim <TAG \endverbatim part, lexing tag attributes.
    276     LS_HTMLStartTag,
    277 
    278     /// Finished lexing \verbatim </TAG \endverbatim part, lexing '>'.
    279     LS_HTMLEndTag
    280   };
    281 
    282   /// Current lexing mode.
    283   LexerState State;
    284 
    285   /// If State is LS_VerbatimBlock, contains the name of verbatim end
    286   /// command, including command marker.
    287   SmallString<16> VerbatimBlockEndCommandName;
    288 
    289   /// Given a character reference name (e.g., "lt"), return the character that
    290   /// it stands for (e.g., "<").
    291   StringRef resolveHTMLNamedCharacterReference(StringRef Name) const;
    292 
    293   /// Given a Unicode codepoint as base-10 integer, return the character.
    294   StringRef resolveHTMLDecimalCharacterReference(StringRef Name) const;
    295 
    296   /// Given a Unicode codepoint as base-16 integer, return the character.
    297   StringRef resolveHTMLHexCharacterReference(StringRef Name) const;
    298 
    299   void formTokenWithChars(Token &Result, const char *TokEnd,
    300                           tok::TokenKind Kind);
    301 
    302   void formTextToken(Token &Result, const char *TokEnd) {
    303     StringRef Text(BufferPtr, TokEnd - BufferPtr);
    304     formTokenWithChars(Result, TokEnd, tok::text);
    305     Result.setText(Text);
    306   }
    307 
    308   SourceLocation getSourceLocation(const char *Loc) const {
    309     assert(Loc >= BufferStart && Loc <= BufferEnd &&
    310            "Location out of range for this buffer!");
    311 
    312     const unsigned CharNo = Loc - BufferStart;
    313     return FileLoc.getLocWithOffset(CharNo);
    314   }
    315 
    316   DiagnosticBuilder Diag(SourceLocation Loc, unsigned DiagID) {
    317     return Diags.Report(Loc, DiagID);
    318   }
    319 
    320   /// Eat string matching regexp \code \s*\* \endcode.
    321   void skipLineStartingDecorations();
    322 
    323   /// Lex comment text, including commands if ParseCommands is set to true.
    324   void lexCommentText(Token &T);
    325 
    326   void setupAndLexVerbatimBlock(Token &T, const char *TextBegin, char Marker,
    327                                 const CommandInfo *Info);
    328 
    329   void lexVerbatimBlockFirstLine(Token &T);
    330 
    331   void lexVerbatimBlockBody(Token &T);
    332 
    333   void setupAndLexVerbatimLine(Token &T, const char *TextBegin,
    334                                const CommandInfo *Info);
    335 
    336   void lexVerbatimLineText(Token &T);
    337 
    338   void lexHTMLCharacterReference(Token &T);
    339 
    340   void setupAndLexHTMLStartTag(Token &T);
    341 
    342   void lexHTMLStartTag(Token &T);
    343 
    344   void setupAndLexHTMLEndTag(Token &T);
    345 
    346   void lexHTMLEndTag(Token &T);
    347 
    348 public:
    349   Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags,
    350         const CommandTraits &Traits, SourceLocation FileLoc,
    351         const char *BufferStart, const char *BufferEnd,
    352         bool ParseCommands = true);
    353 
    354   void lex(Token &T);
    355 
    356   StringRef getSpelling(const Token &Tok, const SourceManager &SourceMgr) const;
    357 };
    358 
    359 } // end namespace comments
    360 } // end namespace clang
    361 
    362 #endif
    363 
    364