Home | History | Annotate | Line # | Download | only in AST
      1 //===--- CommentLexer.cpp -------------------------------------------------===//
      2 //
      3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
      4 // See https://llvm.org/LICENSE.txt for license information.
      5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
      6 //
      7 //===----------------------------------------------------------------------===//
      8 
      9 #include "clang/AST/CommentLexer.h"
     10 #include "clang/AST/CommentCommandTraits.h"
     11 #include "clang/AST/CommentDiagnostic.h"
     12 #include "clang/Basic/CharInfo.h"
     13 #include "llvm/ADT/StringExtras.h"
     14 #include "llvm/ADT/StringSwitch.h"
     15 #include "llvm/Support/ConvertUTF.h"
     16 #include "llvm/Support/ErrorHandling.h"
     17 
     18 namespace clang {
     19 namespace comments {
     20 
     21 void Token::dump(const Lexer &L, const SourceManager &SM) const {
     22   llvm::errs() << "comments::Token Kind=" << Kind << " ";
     23   Loc.print(llvm::errs(), SM);
     24   llvm::errs() << " " << Length << " \"" << L.getSpelling(*this, SM) << "\"\n";
     25 }
     26 
/// Returns true if \p C may appear in the name part of a named HTML character
/// reference (the text between '&' and ';').  Delegates to isLetter().
static inline bool isHTMLNamedCharacterReferenceCharacter(char C) {
  return isLetter(C);
}
     30 
/// Returns true if \p C may appear in the numeric part of a decimal HTML
/// character reference (the text after "&#").  Delegates to isDigit().
static inline bool isHTMLDecimalCharacterReferenceCharacter(char C) {
  return isDigit(C);
}
     34 
/// Returns true if \p C may appear in the numeric part of a hexadecimal HTML
/// character reference (the text after "&#x").  Delegates to isHexDigit().
static inline bool isHTMLHexCharacterReferenceCharacter(char C) {
  return isHexDigit(C);
}
     38 
     39 static inline StringRef convertCodePointToUTF8(
     40                                       llvm::BumpPtrAllocator &Allocator,
     41                                       unsigned CodePoint) {
     42   char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
     43   char *ResolvedPtr = Resolved;
     44   if (llvm::ConvertCodePointToUTF8(CodePoint, ResolvedPtr))
     45     return StringRef(Resolved, ResolvedPtr - Resolved);
     46   else
     47     return StringRef();
     48 }
     49 
     50 namespace {
     51 
     52 #include "clang/AST/CommentHTMLTags.inc"
     53 #include "clang/AST/CommentHTMLNamedCharacterReferences.inc"
     54 
     55 } // end anonymous namespace
     56 
     57 StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const {
     58   // Fast path, first check a few most widely used named character references.
     59   return llvm::StringSwitch<StringRef>(Name)
     60       .Case("amp", "&")
     61       .Case("lt", "<")
     62       .Case("gt", ">")
     63       .Case("quot", "\"")
     64       .Case("apos", "\'")
     65       // Slow path.
     66       .Default(translateHTMLNamedCharacterReferenceToUTF8(Name));
     67 }
     68 
     69 StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const {
     70   unsigned CodePoint = 0;
     71   for (unsigned i = 0, e = Name.size(); i != e; ++i) {
     72     assert(isHTMLDecimalCharacterReferenceCharacter(Name[i]));
     73     CodePoint *= 10;
     74     CodePoint += Name[i] - '0';
     75   }
     76   return convertCodePointToUTF8(Allocator, CodePoint);
     77 }
     78 
     79 StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const {
     80   unsigned CodePoint = 0;
     81   for (unsigned i = 0, e = Name.size(); i != e; ++i) {
     82     CodePoint *= 16;
     83     const char C = Name[i];
     84     assert(isHTMLHexCharacterReferenceCharacter(C));
     85     CodePoint += llvm::hexDigitValue(C);
     86   }
     87   return convertCodePointToUTF8(Allocator, CodePoint);
     88 }
     89 
     90 void Lexer::skipLineStartingDecorations() {
     91   // This function should be called only for C comments
     92   assert(CommentState == LCS_InsideCComment);
     93 
     94   if (BufferPtr == CommentEnd)
     95     return;
     96 
     97   switch (*BufferPtr) {
     98   case ' ':
     99   case '\t':
    100   case '\f':
    101   case '\v': {
    102     const char *NewBufferPtr = BufferPtr;
    103     NewBufferPtr++;
    104     if (NewBufferPtr == CommentEnd)
    105       return;
    106 
    107     char C = *NewBufferPtr;
    108     while (isHorizontalWhitespace(C)) {
    109       NewBufferPtr++;
    110       if (NewBufferPtr == CommentEnd)
    111         return;
    112       C = *NewBufferPtr;
    113     }
    114     if (C == '*')
    115       BufferPtr = NewBufferPtr + 1;
    116     break;
    117   }
    118   case '*':
    119     BufferPtr++;
    120     break;
    121   }
    122 }
    123 
    124 namespace {
    125 /// Returns pointer to the first newline character in the string.
    126 const char *findNewline(const char *BufferPtr, const char *BufferEnd) {
    127   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
    128     if (isVerticalWhitespace(*BufferPtr))
    129       return BufferPtr;
    130   }
    131   return BufferEnd;
    132 }
    133 
/// Step over one newline sequence ("\n", "\r" or "\r\n") at \p BufferPtr.
///
/// \pre \p BufferPtr is \p BufferEnd or points at '\n' or '\r'.
/// \returns the position just past the newline, or \p BufferPtr unchanged
/// when it already equals \p BufferEnd.
const char *skipNewline(const char *BufferPtr, const char *BufferEnd) {
  if (BufferPtr == BufferEnd)
    return BufferPtr;

  const char First = *BufferPtr;
  ++BufferPtr;
  if (First == '\n')
    return BufferPtr;

  assert(First == '\r');
  // Treat "\r\n" as a single newline.
  if (BufferPtr != BufferEnd && *BufferPtr == '\n')
    ++BufferPtr;
  return BufferPtr;
}
    148 
    149 const char *skipNamedCharacterReference(const char *BufferPtr,
    150                                         const char *BufferEnd) {
    151   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
    152     if (!isHTMLNamedCharacterReferenceCharacter(*BufferPtr))
    153       return BufferPtr;
    154   }
    155   return BufferEnd;
    156 }
    157 
    158 const char *skipDecimalCharacterReference(const char *BufferPtr,
    159                                           const char *BufferEnd) {
    160   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
    161     if (!isHTMLDecimalCharacterReferenceCharacter(*BufferPtr))
    162       return BufferPtr;
    163   }
    164   return BufferEnd;
    165 }
    166 
    167 const char *skipHexCharacterReference(const char *BufferPtr,
    168                                       const char *BufferEnd) {
    169   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
    170     if (!isHTMLHexCharacterReferenceCharacter(*BufferPtr))
    171       return BufferPtr;
    172   }
    173   return BufferEnd;
    174 }
    175 
/// Returns true if \p C can be the first character of an HTML identifier
/// (tag or attribute name).  Delegates to isLetter().
bool isHTMLIdentifierStartingCharacter(char C) {
  return isLetter(C);
}
    179 
/// Returns true if \p C can appear inside an HTML identifier (tag or
/// attribute name).  Delegates to isAlphanumeric().
bool isHTMLIdentifierCharacter(char C) {
  return isAlphanumeric(C);
}
    183 
    184 const char *skipHTMLIdentifier(const char *BufferPtr, const char *BufferEnd) {
    185   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
    186     if (!isHTMLIdentifierCharacter(*BufferPtr))
    187       return BufferPtr;
    188   }
    189   return BufferEnd;
    190 }
    191 
/// Skip an HTML string delimited by single or double quotes.  A quote that is
/// immediately preceded by a backslash does not terminate the string.
///
/// \pre \p BufferPtr points at the opening quote.
/// \returns a pointer to the closing quote, or \p BufferEnd if the string is
/// not terminated.
const char *skipHTMLQuotedString(const char *BufferPtr, const char *BufferEnd)
{
  const char Delimiter = *BufferPtr;
  assert(Delimiter == '\"' || Delimiter == '\'');

  const char *Cursor = BufferPtr + 1;
  while (Cursor != BufferEnd) {
    // Only the single preceding character is inspected for an escape.
    if (*Cursor == Delimiter && Cursor[-1] != '\\')
      return Cursor;
    ++Cursor;
  }
  return BufferEnd;
}
    209 
    210 const char *skipWhitespace(const char *BufferPtr, const char *BufferEnd) {
    211   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
    212     if (!isWhitespace(*BufferPtr))
    213       return BufferPtr;
    214   }
    215   return BufferEnd;
    216 }
    217 
/// Returns true if the range [BufferPtr, BufferEnd) consists only of
/// whitespace, i.e. skipWhitespace() runs all the way to \p BufferEnd.
/// An empty range therefore counts as whitespace.
bool isWhitespace(const char *BufferPtr, const char *BufferEnd) {
  return skipWhitespace(BufferPtr, BufferEnd) == BufferEnd;
}
    221 
/// Returns true if \p C can be the first character of a comment command name
/// (the text following '\\' or '@').  Delegates to isLetter().
bool isCommandNameStartCharacter(char C) {
  return isLetter(C);
}
    225 
/// Returns true if \p C can appear inside a comment command name.
/// Delegates to isAlphanumeric().
bool isCommandNameCharacter(char C) {
  return isAlphanumeric(C);
}
    229 
    230 const char *skipCommandName(const char *BufferPtr, const char *BufferEnd) {
    231   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
    232     if (!isCommandNameCharacter(*BufferPtr))
    233       return BufferPtr;
    234   }
    235   return BufferEnd;
    236 }
    237 
    238 /// Return the one past end pointer for BCPL comments.
    239 /// Handles newlines escaped with backslash or trigraph for backslahs.
    240 const char *findBCPLCommentEnd(const char *BufferPtr, const char *BufferEnd) {
    241   const char *CurPtr = BufferPtr;
    242   while (CurPtr != BufferEnd) {
    243     while (!isVerticalWhitespace(*CurPtr)) {
    244       CurPtr++;
    245       if (CurPtr == BufferEnd)
    246         return BufferEnd;
    247     }
    248     // We found a newline, check if it is escaped.
    249     const char *EscapePtr = CurPtr - 1;
    250     while(isHorizontalWhitespace(*EscapePtr))
    251       EscapePtr--;
    252 
    253     if (*EscapePtr == '\\' ||
    254         (EscapePtr - 2 >= BufferPtr && EscapePtr[0] == '/' &&
    255          EscapePtr[-1] == '?' && EscapePtr[-2] == '?')) {
    256       // We found an escaped newline.
    257       CurPtr = skipNewline(CurPtr, BufferEnd);
    258     } else
    259       return CurPtr; // Not an escaped newline.
    260   }
    261   return BufferEnd;
    262 }
    263 
    264 /// Return the one past end pointer for C comments.
    265 /// Very dumb, does not handle escaped newlines or trigraphs.
    266 const char *findCCommentEnd(const char *BufferPtr, const char *BufferEnd) {
    267   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
    268     if (*BufferPtr == '*') {
    269       assert(BufferPtr + 1 != BufferEnd);
    270       if (*(BufferPtr + 1) == '/')
    271         return BufferPtr;
    272     }
    273   }
    274   llvm_unreachable("buffer end hit before '*/' was seen");
    275 }
    276 
    277 } // end anonymous namespace
    278 
    279 void Lexer::formTokenWithChars(Token &Result, const char *TokEnd,
    280                                tok::TokenKind Kind) {
    281   const unsigned TokLen = TokEnd - BufferPtr;
    282   Result.setLocation(getSourceLocation(BufferPtr));
    283   Result.setKind(Kind);
    284   Result.setLength(TokLen);
    285 #ifndef NDEBUG
    286   Result.TextPtr = "<UNSET>";
    287   Result.IntVal = 7;
    288 #endif
    289   BufferPtr = TokEnd;
    290 }
    291 
/// Lex one token from inside a comment, starting at BufferPtr.
///
/// When command parsing is disabled only text and newline tokens are formed.
/// Otherwise the current lexer State selects a specialised sub-lexer
/// (verbatim block/line, HTML start/end tag); in LS_Normal the first
/// character decides between a command ('\\' or '@'), an HTML character
/// reference ('&'), an HTML tag ('<') and plain text.
///
/// \pre The lexer is inside a comment and BufferPtr is before CommentEnd.
void Lexer::lexCommentText(Token &T) {
  assert(CommentState == LCS_InsideBCPLComment ||
         CommentState == LCS_InsideCComment);

  // Handles lexing non-command text, i.e. text and newline.
  auto HandleNonCommandToken = [&]() -> void {
    assert(State == LS_Normal);

    const char *TokenPtr = BufferPtr;
    assert(TokenPtr < CommentEnd);
    switch (*TokenPtr) {
      case '\n':
      case '\r':
          TokenPtr = skipNewline(TokenPtr, CommentEnd);
          formTokenWithChars(T, TokenPtr, tok::newline);

          // In a C comment the next line may begin with " * " decoration.
          if (CommentState == LCS_InsideCComment)
            skipLineStartingDecorations();
          return;

      default: {
          // The text token runs up to the next character that could start a
          // different kind of token; with command parsing off only newlines
          // end it.
          StringRef TokStartSymbols = ParseCommands ? "\n\r\\@&<" : "\n\r";
          size_t End = StringRef(TokenPtr, CommentEnd - TokenPtr)
                           .find_first_of(TokStartSymbols);
          if (End != StringRef::npos)
            TokenPtr += End;
          else
            TokenPtr = CommentEnd;
          formTextToken(T, TokenPtr);
          return;
      }
    }
  };

  if (!ParseCommands)
    return HandleNonCommandToken();

  // Dispatch to the sub-lexer for the construct we are in the middle of.
  switch (State) {
  case LS_Normal:
    break;
  case LS_VerbatimBlockFirstLine:
    lexVerbatimBlockFirstLine(T);
    return;
  case LS_VerbatimBlockBody:
    lexVerbatimBlockBody(T);
    return;
  case LS_VerbatimLineText:
    lexVerbatimLineText(T);
    return;
  case LS_HTMLStartTag:
    lexHTMLStartTag(T);
    return;
  case LS_HTMLEndTag:
    lexHTMLEndTag(T);
    return;
  }

  assert(State == LS_Normal);
  const char *TokenPtr = BufferPtr;
  assert(TokenPtr < CommentEnd);
  switch(*TokenPtr) {
    case '\\':
    case '@': {
      // Commands that start with a backslash and commands that start with
      // 'at' have equivalent semantics.  But we keep information about the
      // exact syntax in AST for comments.
      tok::TokenKind CommandKind =
          (*TokenPtr == '@') ? tok::at_command : tok::backslash_command;
      TokenPtr++;
      // A lone marker at the very end of the comment is just text.
      if (TokenPtr == CommentEnd) {
        formTextToken(T, TokenPtr);
        return;
      }
      char C = *TokenPtr;
      switch (C) {
      default:
        break;

      case '\\': case '@': case '&': case '$':
      case '#':  case '<': case '>': case '%':
      case '\"': case '.': case ':':
        // This is one of \\ \@ \& \$ etc escape sequences.
        TokenPtr++;
        if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') {
          // This is the \:: escape sequence.
          TokenPtr++;
        }
        // The token spans the marker too, but its text is only the escaped
        // character(s) after the marker.
        StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1));
        formTokenWithChars(T, TokenPtr, tok::text);
        T.setText(UnescapedText);
        return;
      }

      // Don't make zero-length commands.
      if (!isCommandNameStartCharacter(*TokenPtr)) {
        formTextToken(T, TokenPtr);
        return;
      }

      TokenPtr = skipCommandName(TokenPtr, CommentEnd);
      unsigned Length = TokenPtr - (BufferPtr + 1);

      // Hardcoded support for lexing LaTeX formula commands
      // \f$ \f[ \f] \f{ \f} as a single command.
      if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) {
        C = *TokenPtr;
        if (C == '$' || C == '[' || C == ']' || C == '{' || C == '}') {
          TokenPtr++;
          Length++;
        }
      }

      StringRef CommandName(BufferPtr + 1, Length);

      const CommandInfo *Info = Traits.getCommandInfoOrNULL(CommandName);
      if (!Info) {
        if ((Info = Traits.getTypoCorrectCommandInfo(CommandName))) {
          // A typo correction was found: warn with a fix-it covering the
          // name (without the marker) and continue with the corrected
          // command's info.
          StringRef CorrectedName = Info->Name;
          SourceLocation Loc = getSourceLocation(BufferPtr);
          SourceLocation EndLoc = getSourceLocation(TokenPtr);
          SourceRange FullRange = SourceRange(Loc, EndLoc);
          SourceRange CommandRange(Loc.getLocWithOffset(1), EndLoc);
          Diag(Loc, diag::warn_correct_comment_command_name)
            << FullRange << CommandName << CorrectedName
            << FixItHint::CreateReplacement(CommandRange, CorrectedName);
        } else {
          // No known command and no correction: emit an unknown_command
          // token carrying the spelled name, and warn.
          formTokenWithChars(T, TokenPtr, tok::unknown_command);
          T.setUnknownCommandName(CommandName);
          Diag(T.getLocation(), diag::warn_unknown_comment_command_name)
              << SourceRange(T.getLocation(), T.getEndLocation());
          return;
        }
      }
      if (Info->IsVerbatimBlockCommand) {
        // *BufferPtr is the marker ('\\' or '@') that introduced the command.
        setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, Info);
        return;
      }
      if (Info->IsVerbatimLineCommand) {
        setupAndLexVerbatimLine(T, TokenPtr, Info);
        return;
      }
      formTokenWithChars(T, TokenPtr, CommandKind);
      T.setCommandID(Info->getID());
      return;
    }

    case '&':
      lexHTMLCharacterReference(T);
      return;

    case '<': {
      TokenPtr++;
      if (TokenPtr == CommentEnd) {
        formTextToken(T, TokenPtr);
        return;
      }
      // "<x" starts a start tag, "</" starts an end tag; anything else makes
      // the '<' plain text.
      const char C = *TokenPtr;
      if (isHTMLIdentifierStartingCharacter(C))
        setupAndLexHTMLStartTag(T);
      else if (C == '/')
        setupAndLexHTMLEndTag(T);
      else
        formTextToken(T, TokenPtr);
      return;
    }

    default:
      return HandleNonCommandToken();
  }
}
    462 
    463 void Lexer::setupAndLexVerbatimBlock(Token &T,
    464                                      const char *TextBegin,
    465                                      char Marker, const CommandInfo *Info) {
    466   assert(Info->IsVerbatimBlockCommand);
    467 
    468   VerbatimBlockEndCommandName.clear();
    469   VerbatimBlockEndCommandName.append(Marker == '\\' ? "\\" : "@");
    470   VerbatimBlockEndCommandName.append(Info->EndCommandName);
    471 
    472   formTokenWithChars(T, TextBegin, tok::verbatim_block_begin);
    473   T.setVerbatimBlockID(Info->getID());
    474 
    475   // If there is a newline following the verbatim opening command, skip the
    476   // newline so that we don't create an tok::verbatim_block_line with empty
    477   // text content.
    478   if (BufferPtr != CommentEnd &&
    479       isVerticalWhitespace(*BufferPtr)) {
    480     BufferPtr = skipNewline(BufferPtr, CommentEnd);
    481     State = LS_VerbatimBlockBody;
    482     return;
    483   }
    484 
    485   State = LS_VerbatimBlockFirstLine;
    486 }
    487 
    488 void Lexer::lexVerbatimBlockFirstLine(Token &T) {
    489 again:
    490   assert(BufferPtr < CommentEnd);
    491 
    492   // FIXME: It would be better to scan the text once, finding either the block
    493   // end command or newline.
    494   //
    495   // Extract current line.
    496   const char *Newline = findNewline(BufferPtr, CommentEnd);
    497   StringRef Line(BufferPtr, Newline - BufferPtr);
    498 
    499   // Look for end command in current line.
    500   size_t Pos = Line.find(VerbatimBlockEndCommandName);
    501   const char *TextEnd;
    502   const char *NextLine;
    503   if (Pos == StringRef::npos) {
    504     // Current line is completely verbatim.
    505     TextEnd = Newline;
    506     NextLine = skipNewline(Newline, CommentEnd);
    507   } else if (Pos == 0) {
    508     // Current line contains just an end command.
    509     const char *End = BufferPtr + VerbatimBlockEndCommandName.size();
    510     StringRef Name(BufferPtr + 1, End - (BufferPtr + 1));
    511     formTokenWithChars(T, End, tok::verbatim_block_end);
    512     T.setVerbatimBlockID(Traits.getCommandInfo(Name)->getID());
    513     State = LS_Normal;
    514     return;
    515   } else {
    516     // There is some text, followed by end command.  Extract text first.
    517     TextEnd = BufferPtr + Pos;
    518     NextLine = TextEnd;
    519     // If there is only whitespace before end command, skip whitespace.
    520     if (isWhitespace(BufferPtr, TextEnd)) {
    521       BufferPtr = TextEnd;
    522       goto again;
    523     }
    524   }
    525 
    526   StringRef Text(BufferPtr, TextEnd - BufferPtr);
    527   formTokenWithChars(T, NextLine, tok::verbatim_block_line);
    528   T.setVerbatimBlockText(Text);
    529 
    530   State = LS_VerbatimBlockBody;
    531 }
    532 
    533 void Lexer::lexVerbatimBlockBody(Token &T) {
    534   assert(State == LS_VerbatimBlockBody);
    535 
    536   if (CommentState == LCS_InsideCComment)
    537     skipLineStartingDecorations();
    538 
    539   if (BufferPtr == CommentEnd) {
    540     formTokenWithChars(T, BufferPtr, tok::verbatim_block_line);
    541     T.setVerbatimBlockText("");
    542     return;
    543   }
    544 
    545   lexVerbatimBlockFirstLine(T);
    546 }
    547 
/// Emit the verbatim_line_name token for a verbatim line command and switch
/// the lexer to LS_VerbatimLineText so the rest of the line is lexed as
/// verbatim text.
///
/// \param T         Token to fill.
/// \param TextBegin First character after the command name; the token ends
///                  here.
/// \param Info      Description of the command; must be a verbatim line
///                  command.
void Lexer::setupAndLexVerbatimLine(Token &T, const char *TextBegin,
                                    const CommandInfo *Info) {
  assert(Info->IsVerbatimLineCommand);
  formTokenWithChars(T, TextBegin, tok::verbatim_line_name);
  T.setVerbatimLineID(Info->getID());

  State = LS_VerbatimLineText;
}
    556 
    557 void Lexer::lexVerbatimLineText(Token &T) {
    558   assert(State == LS_VerbatimLineText);
    559 
    560   // Extract current line.
    561   const char *Newline = findNewline(BufferPtr, CommentEnd);
    562   StringRef Text(BufferPtr, Newline - BufferPtr);
    563   formTokenWithChars(T, Newline, tok::verbatim_line_text);
    564   T.setVerbatimLineText(Text);
    565 
    566   State = LS_Normal;
    567 }
    568 
    569 void Lexer::lexHTMLCharacterReference(Token &T) {
    570   const char *TokenPtr = BufferPtr;
    571   assert(*TokenPtr == '&');
    572   TokenPtr++;
    573   if (TokenPtr == CommentEnd) {
    574     formTextToken(T, TokenPtr);
    575     return;
    576   }
    577   const char *NamePtr;
    578   bool isNamed = false;
    579   bool isDecimal = false;
    580   char C = *TokenPtr;
    581   if (isHTMLNamedCharacterReferenceCharacter(C)) {
    582     NamePtr = TokenPtr;
    583     TokenPtr = skipNamedCharacterReference(TokenPtr, CommentEnd);
    584     isNamed = true;
    585   } else if (C == '#') {
    586     TokenPtr++;
    587     if (TokenPtr == CommentEnd) {
    588       formTextToken(T, TokenPtr);
    589       return;
    590     }
    591     C = *TokenPtr;
    592     if (isHTMLDecimalCharacterReferenceCharacter(C)) {
    593       NamePtr = TokenPtr;
    594       TokenPtr = skipDecimalCharacterReference(TokenPtr, CommentEnd);
    595       isDecimal = true;
    596     } else if (C == 'x' || C == 'X') {
    597       TokenPtr++;
    598       NamePtr = TokenPtr;
    599       TokenPtr = skipHexCharacterReference(TokenPtr, CommentEnd);
    600     } else {
    601       formTextToken(T, TokenPtr);
    602       return;
    603     }
    604   } else {
    605     formTextToken(T, TokenPtr);
    606     return;
    607   }
    608   if (NamePtr == TokenPtr || TokenPtr == CommentEnd ||
    609       *TokenPtr != ';') {
    610     formTextToken(T, TokenPtr);
    611     return;
    612   }
    613   StringRef Name(NamePtr, TokenPtr - NamePtr);
    614   TokenPtr++; // Skip semicolon.
    615   StringRef Resolved;
    616   if (isNamed)
    617     Resolved = resolveHTMLNamedCharacterReference(Name);
    618   else if (isDecimal)
    619     Resolved = resolveHTMLDecimalCharacterReference(Name);
    620   else
    621     Resolved = resolveHTMLHexCharacterReference(Name);
    622 
    623   if (Resolved.empty()) {
    624     formTextToken(T, TokenPtr);
    625     return;
    626   }
    627   formTokenWithChars(T, TokenPtr, tok::text);
    628   T.setText(Resolved);
    629 }
    630 
    631 void Lexer::setupAndLexHTMLStartTag(Token &T) {
    632   assert(BufferPtr[0] == '<' &&
    633          isHTMLIdentifierStartingCharacter(BufferPtr[1]));
    634   const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd);
    635   StringRef Name(BufferPtr + 1, TagNameEnd - (BufferPtr + 1));
    636   if (!isHTMLTagName(Name)) {
    637     formTextToken(T, TagNameEnd);
    638     return;
    639   }
    640 
    641   formTokenWithChars(T, TagNameEnd, tok::html_start_tag);
    642   T.setHTMLTagStartName(Name);
    643 
    644   BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
    645 
    646   const char C = *BufferPtr;
    647   if (BufferPtr != CommentEnd &&
    648       (C == '>' || C == '/' || isHTMLIdentifierStartingCharacter(C)))
    649     State = LS_HTMLStartTag;
    650 }
    651 
    652 void Lexer::lexHTMLStartTag(Token &T) {
    653   assert(State == LS_HTMLStartTag);
    654 
    655   const char *TokenPtr = BufferPtr;
    656   char C = *TokenPtr;
    657   if (isHTMLIdentifierCharacter(C)) {
    658     TokenPtr = skipHTMLIdentifier(TokenPtr, CommentEnd);
    659     StringRef Ident(BufferPtr, TokenPtr - BufferPtr);
    660     formTokenWithChars(T, TokenPtr, tok::html_ident);
    661     T.setHTMLIdent(Ident);
    662   } else {
    663     switch (C) {
    664     case '=':
    665       TokenPtr++;
    666       formTokenWithChars(T, TokenPtr, tok::html_equals);
    667       break;
    668     case '\"':
    669     case '\'': {
    670       const char *OpenQuote = TokenPtr;
    671       TokenPtr = skipHTMLQuotedString(TokenPtr, CommentEnd);
    672       const char *ClosingQuote = TokenPtr;
    673       if (TokenPtr != CommentEnd) // Skip closing quote.
    674         TokenPtr++;
    675       formTokenWithChars(T, TokenPtr, tok::html_quoted_string);
    676       T.setHTMLQuotedString(StringRef(OpenQuote + 1,
    677                                       ClosingQuote - (OpenQuote + 1)));
    678       break;
    679     }
    680     case '>':
    681       TokenPtr++;
    682       formTokenWithChars(T, TokenPtr, tok::html_greater);
    683       State = LS_Normal;
    684       return;
    685     case '/':
    686       TokenPtr++;
    687       if (TokenPtr != CommentEnd && *TokenPtr == '>') {
    688         TokenPtr++;
    689         formTokenWithChars(T, TokenPtr, tok::html_slash_greater);
    690       } else
    691         formTextToken(T, TokenPtr);
    692 
    693       State = LS_Normal;
    694       return;
    695     }
    696   }
    697 
    698   // Now look ahead and return to normal state if we don't see any HTML tokens
    699   // ahead.
    700   BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
    701   if (BufferPtr == CommentEnd) {
    702     State = LS_Normal;
    703     return;
    704   }
    705 
    706   C = *BufferPtr;
    707   if (!isHTMLIdentifierStartingCharacter(C) &&
    708       C != '=' && C != '\"' && C != '\'' && C != '>') {
    709     State = LS_Normal;
    710     return;
    711   }
    712 }
    713 
    714 void Lexer::setupAndLexHTMLEndTag(Token &T) {
    715   assert(BufferPtr[0] == '<' && BufferPtr[1] == '/');
    716 
    717   const char *TagNameBegin = skipWhitespace(BufferPtr + 2, CommentEnd);
    718   const char *TagNameEnd = skipHTMLIdentifier(TagNameBegin, CommentEnd);
    719   StringRef Name(TagNameBegin, TagNameEnd - TagNameBegin);
    720   if (!isHTMLTagName(Name)) {
    721     formTextToken(T, TagNameEnd);
    722     return;
    723   }
    724 
    725   const char *End = skipWhitespace(TagNameEnd, CommentEnd);
    726 
    727   formTokenWithChars(T, End, tok::html_end_tag);
    728   T.setHTMLTagEndName(Name);
    729 
    730   if (BufferPtr != CommentEnd && *BufferPtr == '>')
    731     State = LS_HTMLEndTag;
    732 }
    733 
/// Lex the closing '>' of an HTML end tag as an html_greater token and return
/// the lexer to LS_Normal.
///
/// \pre BufferPtr points at '>' (setupAndLexHTMLEndTag() only enters
/// LS_HTMLEndTag in that case).
void Lexer::lexHTMLEndTag(Token &T) {
  assert(BufferPtr != CommentEnd && *BufferPtr == '>');

  formTokenWithChars(T, BufferPtr + 1, tok::html_greater);
  State = LS_Normal;
}
    740 
/// Construct a comment lexer over the text [BufferStart, BufferEnd).
///
/// Lexing starts at \p BufferStart in the LCS_BeforeComment / LS_Normal
/// state.  CommentEnd is not set by this initializer list; lex() assigns it
/// when it enters a comment.
///
/// \param Allocator     Allocator used for text produced while lexing (e.g.
///                      resolved character references).
/// \param Diags         Diagnostics engine stored for reporting warnings.
/// \param Traits        Command table used to look up comment commands.
/// \param FileLoc       Source location stored alongside the buffer
///                      (presumably that of BufferStart -- confirm against
///                      getSourceLocation()).
/// \param ParseCommands When false, only text and newline tokens are lexed.
Lexer::Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags,
             const CommandTraits &Traits, SourceLocation FileLoc,
             const char *BufferStart, const char *BufferEnd, bool ParseCommands)
    : Allocator(Allocator), Diags(Diags), Traits(Traits),
      BufferStart(BufferStart), BufferEnd(BufferEnd), BufferPtr(BufferStart),
      FileLoc(FileLoc), ParseCommands(ParseCommands),
      CommentState(LCS_BeforeComment), State(LS_Normal) {}
    748 
/// Lex the next token from the buffer into \p T.
///
/// The buffer is expected to hold one or more comments separated only by
/// whitespace.  CommentState tracks whether the lexer is before a comment,
/// inside a BCPL ("//") or C ("/* */") comment, or between two comments;
/// states that produce no token of their own jump back to 'again'.
/// A tok::eof token is produced once the buffer is exhausted.
void Lexer::lex(Token &T) {
again:
  switch (CommentState) {
  case LCS_BeforeComment:
    if (BufferPtr == BufferEnd) {
      formTokenWithChars(T, BufferPtr, tok::eof);
      return;
    }

    assert(*BufferPtr == '/');
    BufferPtr++; // Skip first slash.
    switch(*BufferPtr) {
    case '/': { // BCPL comment.
      BufferPtr++; // Skip second slash.

      if (BufferPtr != BufferEnd) {
        // Skip Doxygen magic marker, if it is present.
        // It might be missing because of a typo //< or /*<, or because we
        // merged this non-Doxygen comment into a bunch of Doxygen comments
        // around it: /** ... */ /* ... */ /** ... */
        const char C = *BufferPtr;
        if (C == '/' || C == '!')
          BufferPtr++;
      }

      // Skip less-than symbol that marks trailing comments.
      // Skip it even if the comment is not a Doxygen one, because //< and /*<
      // are frequent typos.
      if (BufferPtr != BufferEnd && *BufferPtr == '<')
        BufferPtr++;

      CommentState = LCS_InsideBCPLComment;
      // A verbatim block may span consecutive BCPL comments, so those two
      // states are kept; every other state is reset.
      if (State != LS_VerbatimBlockBody && State != LS_VerbatimBlockFirstLine)
        State = LS_Normal;
      CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd);
      goto again;
    }
    case '*': { // C comment.
      BufferPtr++; // Skip star.

      // Skip Doxygen magic marker.
      // A '*' directly followed by '/' is the closing "*/" of an empty
      // comment, not a marker, so it is left in place.
      const char C = *BufferPtr;
      if ((C == '*' && *(BufferPtr + 1) != '/') || C == '!')
        BufferPtr++;

      // Skip less-than symbol that marks trailing comments.
      if (BufferPtr != BufferEnd && *BufferPtr == '<')
        BufferPtr++;

      CommentState = LCS_InsideCComment;
      State = LS_Normal;
      CommentEnd = findCCommentEnd(BufferPtr, BufferEnd);
      goto again;
    }
    default:
      llvm_unreachable("second character of comment should be '/' or '*'");
    }

  // Not reached by fallthrough: every path through the case above returns,
  // jumps to 'again' or is unreachable.
  case LCS_BetweenComments: {
    // Consecutive comments are extracted only if there is only whitespace
    // between them.  So we can search for the start of the next comment.
    const char *EndWhitespace = BufferPtr;
    while(EndWhitespace != BufferEnd && *EndWhitespace != '/')
      EndWhitespace++;

    // Turn any whitespace between comments (and there is only whitespace
    // between them -- guaranteed by comment extraction) into a newline.  We
    // have two newlines between C comments in total (first one was synthesized
    // after a comment).
    formTokenWithChars(T, EndWhitespace, tok::newline);

    CommentState = LCS_BeforeComment;
    break;
  }

  case LCS_InsideBCPLComment:
  case LCS_InsideCComment:
    if (BufferPtr != CommentEnd) {
      lexCommentText(T);
      break;
    } else {
      // Skip C comment closing sequence.
      if (CommentState == LCS_InsideCComment) {
        assert(BufferPtr[0] == '*' && BufferPtr[1] == '/');
        BufferPtr += 2;
        assert(BufferPtr <= BufferEnd);

        // Synthesize a newline just after the C comment, regardless of
        // whether there is actually a newline.
        formTokenWithChars(T, BufferPtr, tok::newline);

        CommentState = LCS_BetweenComments;
        break;
      } else {
        // Don't synthesize a newline after a BCPL comment.
        CommentState = LCS_BetweenComments;
        goto again;
      }
    }
  }
}
    850 
    851 StringRef Lexer::getSpelling(const Token &Tok,
    852                              const SourceManager &SourceMgr) const {
    853   SourceLocation Loc = Tok.getLocation();
    854   std::pair<FileID, unsigned> LocInfo = SourceMgr.getDecomposedLoc(Loc);
    855 
    856   bool InvalidTemp = false;
    857   StringRef File = SourceMgr.getBufferData(LocInfo.first, &InvalidTemp);
    858   if (InvalidTemp)
    859     return StringRef();
    860 
    861   const char *Begin = File.data() + LocInfo.second;
    862   return StringRef(Begin, Tok.getLength());
    863 }
    864 
    865 } // end namespace comments
    866 } // end namespace clang
    867