1 1.1 joerg //===--- CommentLexer.cpp -------------------------------------------------===// 2 1.1 joerg // 3 1.1 joerg // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 1.1 joerg // See https://llvm.org/LICENSE.txt for license information. 5 1.1 joerg // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 1.1 joerg // 7 1.1 joerg //===----------------------------------------------------------------------===// 8 1.1 joerg 9 1.1 joerg #include "clang/AST/CommentLexer.h" 10 1.1 joerg #include "clang/AST/CommentCommandTraits.h" 11 1.1 joerg #include "clang/AST/CommentDiagnostic.h" 12 1.1 joerg #include "clang/Basic/CharInfo.h" 13 1.1 joerg #include "llvm/ADT/StringExtras.h" 14 1.1 joerg #include "llvm/ADT/StringSwitch.h" 15 1.1 joerg #include "llvm/Support/ConvertUTF.h" 16 1.1 joerg #include "llvm/Support/ErrorHandling.h" 17 1.1 joerg 18 1.1 joerg namespace clang { 19 1.1 joerg namespace comments { 20 1.1 joerg 21 1.1 joerg void Token::dump(const Lexer &L, const SourceManager &SM) const { 22 1.1 joerg llvm::errs() << "comments::Token Kind=" << Kind << " "; 23 1.1 joerg Loc.print(llvm::errs(), SM); 24 1.1 joerg llvm::errs() << " " << Length << " \"" << L.getSpelling(*this, SM) << "\"\n"; 25 1.1 joerg } 26 1.1 joerg 27 1.1 joerg static inline bool isHTMLNamedCharacterReferenceCharacter(char C) { 28 1.1 joerg return isLetter(C); 29 1.1 joerg } 30 1.1 joerg 31 1.1 joerg static inline bool isHTMLDecimalCharacterReferenceCharacter(char C) { 32 1.1 joerg return isDigit(C); 33 1.1 joerg } 34 1.1 joerg 35 1.1 joerg static inline bool isHTMLHexCharacterReferenceCharacter(char C) { 36 1.1 joerg return isHexDigit(C); 37 1.1 joerg } 38 1.1 joerg 39 1.1 joerg static inline StringRef convertCodePointToUTF8( 40 1.1 joerg llvm::BumpPtrAllocator &Allocator, 41 1.1 joerg unsigned CodePoint) { 42 1.1 joerg char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT); 43 1.1 joerg char *ResolvedPtr = Resolved; 44 1.1 joerg if (llvm::ConvertCodePointToUTF8(CodePoint, ResolvedPtr)) 45 1.1 joerg return StringRef(Resolved, ResolvedPtr - Resolved); 46 1.1 joerg else 47 1.1 joerg return StringRef(); 48 1.1 joerg } 49 1.1 joerg 50 1.1 joerg namespace { 51 1.1 joerg 52 1.1 joerg #include "clang/AST/CommentHTMLTags.inc" 53 1.1 joerg #include "clang/AST/CommentHTMLNamedCharacterReferences.inc" 54 1.1 joerg 55 1.1 joerg } // end anonymous namespace 56 1.1 joerg 57 1.1 joerg StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const { 58 1.1 joerg // Fast path, first check a few most widely used named character references. 59 1.1 joerg return llvm::StringSwitch<StringRef>(Name) 60 1.1 joerg .Case("amp", "&") 61 1.1 joerg .Case("lt", "<") 62 1.1 joerg .Case("gt", ">") 63 1.1 joerg .Case("quot", "\"") 64 1.1 joerg .Case("apos", "\'") 65 1.1 joerg // Slow path. 66 1.1 joerg .Default(translateHTMLNamedCharacterReferenceToUTF8(Name)); 67 1.1 joerg } 68 1.1 joerg 69 1.1 joerg StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const { 70 1.1 joerg unsigned CodePoint = 0; 71 1.1 joerg for (unsigned i = 0, e = Name.size(); i != e; ++i) { 72 1.1 joerg assert(isHTMLDecimalCharacterReferenceCharacter(Name[i])); 73 1.1 joerg CodePoint *= 10; 74 1.1 joerg CodePoint += Name[i] - '0'; 75 1.1 joerg } 76 1.1 joerg return convertCodePointToUTF8(Allocator, CodePoint); 77 1.1 joerg } 78 1.1 joerg 79 1.1 joerg StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const { 80 1.1 joerg unsigned CodePoint = 0; 81 1.1 joerg for (unsigned i = 0, e = Name.size(); i != e; ++i) { 82 1.1 joerg CodePoint *= 16; 83 1.1 joerg const char C = Name[i]; 84 1.1 joerg assert(isHTMLHexCharacterReferenceCharacter(C)); 85 1.1 joerg CodePoint += llvm::hexDigitValue(C); 86 1.1 joerg } 87 1.1 joerg return convertCodePointToUTF8(Allocator, CodePoint); 88 1.1 joerg } 89 1.1 joerg 90 1.1 joerg void Lexer::skipLineStartingDecorations() { 91 1.1 joerg // This function should be called only for C comments 92 1.1 joerg assert(CommentState == LCS_InsideCComment); 93 1.1 joerg 94 1.1 joerg if (BufferPtr == CommentEnd) 95 1.1 joerg return; 96 1.1 joerg 97 1.1 joerg switch (*BufferPtr) { 98 1.1 joerg case ' ': 99 1.1 joerg case '\t': 100 1.1 joerg case '\f': 101 1.1 joerg case '\v': { 102 1.1 joerg const char *NewBufferPtr = BufferPtr; 103 1.1 joerg NewBufferPtr++; 104 1.1 joerg if (NewBufferPtr == CommentEnd) 105 1.1 joerg return; 106 1.1 joerg 107 1.1 joerg char C = *NewBufferPtr; 108 1.1 joerg while (isHorizontalWhitespace(C)) { 109 1.1 joerg NewBufferPtr++; 110 1.1 joerg if (NewBufferPtr == CommentEnd) 111 1.1 joerg return; 112 1.1 joerg C = *NewBufferPtr; 113 1.1 joerg } 114 1.1 joerg if (C == '*') 115 1.1 joerg BufferPtr = NewBufferPtr + 1; 116 1.1 joerg break; 117 1.1 joerg } 118 1.1 joerg case '*': 119 1.1 joerg BufferPtr++; 120 1.1 joerg break; 121 1.1 joerg } 122 1.1 joerg } 123 1.1 joerg 124 1.1 joerg namespace { 125 1.1 joerg /// Returns pointer to the first newline character in the string. 126 1.1 joerg const char *findNewline(const char *BufferPtr, const char *BufferEnd) { 127 1.1 joerg for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 128 1.1 joerg if (isVerticalWhitespace(*BufferPtr)) 129 1.1 joerg return BufferPtr; 130 1.1 joerg } 131 1.1 joerg return BufferEnd; 132 1.1 joerg } 133 1.1 joerg 134 1.1 joerg const char *skipNewline(const char *BufferPtr, const char *BufferEnd) { 135 1.1 joerg if (BufferPtr == BufferEnd) 136 1.1 joerg return BufferPtr; 137 1.1 joerg 138 1.1 joerg if (*BufferPtr == '\n') 139 1.1 joerg BufferPtr++; 140 1.1 joerg else { 141 1.1 joerg assert(*BufferPtr == '\r'); 142 1.1 joerg BufferPtr++; 143 1.1 joerg if (BufferPtr != BufferEnd && *BufferPtr == '\n') 144 1.1 joerg BufferPtr++; 145 1.1 joerg } 146 1.1 joerg return BufferPtr; 147 1.1 joerg } 148 1.1 joerg 149 1.1 joerg const char *skipNamedCharacterReference(const char *BufferPtr, 150 1.1 joerg const char *BufferEnd) { 151 1.1 joerg for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 152 1.1 joerg if (!isHTMLNamedCharacterReferenceCharacter(*BufferPtr)) 153 1.1 joerg return BufferPtr; 154 1.1 joerg } 155 1.1 joerg return BufferEnd; 156 1.1 joerg } 157 1.1 joerg 158 1.1 joerg const char *skipDecimalCharacterReference(const char *BufferPtr, 159 1.1 joerg const char *BufferEnd) { 160 1.1 joerg for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 161 1.1 joerg if (!isHTMLDecimalCharacterReferenceCharacter(*BufferPtr)) 162 1.1 joerg return BufferPtr; 163 1.1 joerg } 164 1.1 joerg return BufferEnd; 165 1.1 joerg } 166 1.1 joerg 167 1.1 joerg const char *skipHexCharacterReference(const char *BufferPtr, 168 1.1 joerg const char *BufferEnd) { 169 1.1 joerg for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 170 1.1 joerg if (!isHTMLHexCharacterReferenceCharacter(*BufferPtr)) 171 1.1 joerg return BufferPtr; 172 1.1 joerg } 173 1.1 joerg return BufferEnd; 174 1.1 joerg } 175 1.1 joerg 176 1.1 joerg bool isHTMLIdentifierStartingCharacter(char C) { 177 1.1 joerg return isLetter(C); 178 1.1 joerg } 179 1.1 joerg 180 1.1 joerg bool isHTMLIdentifierCharacter(char C) { 181 1.1 joerg return isAlphanumeric(C); 182 1.1 joerg } 183 1.1 joerg 184 1.1 joerg const char *skipHTMLIdentifier(const char *BufferPtr, const char *BufferEnd) { 185 1.1 joerg for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 186 1.1 joerg if (!isHTMLIdentifierCharacter(*BufferPtr)) 187 1.1 joerg return BufferPtr; 188 1.1 joerg } 189 1.1 joerg return BufferEnd; 190 1.1 joerg } 191 1.1 joerg 192 1.1 joerg /// Skip HTML string quoted in single or double quotes. Escaping quotes inside 193 1.1 joerg /// string allowed. 194 1.1 joerg /// 195 1.1 joerg /// Returns pointer to closing quote. 196 1.1 joerg const char *skipHTMLQuotedString(const char *BufferPtr, const char *BufferEnd) 197 1.1 joerg { 198 1.1 joerg const char Quote = *BufferPtr; 199 1.1 joerg assert(Quote == '\"' || Quote == '\''); 200 1.1 joerg 201 1.1 joerg BufferPtr++; 202 1.1 joerg for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 203 1.1 joerg const char C = *BufferPtr; 204 1.1 joerg if (C == Quote && BufferPtr[-1] != '\\') 205 1.1 joerg return BufferPtr; 206 1.1 joerg } 207 1.1 joerg return BufferEnd; 208 1.1 joerg } 209 1.1 joerg 210 1.1 joerg const char *skipWhitespace(const char *BufferPtr, const char *BufferEnd) { 211 1.1 joerg for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 212 1.1 joerg if (!isWhitespace(*BufferPtr)) 213 1.1 joerg return BufferPtr; 214 1.1 joerg } 215 1.1 joerg return BufferEnd; 216 1.1 joerg } 217 1.1 joerg 218 1.1 joerg bool isWhitespace(const char *BufferPtr, const char *BufferEnd) { 219 1.1 joerg return skipWhitespace(BufferPtr, BufferEnd) == BufferEnd; 220 1.1 joerg } 221 1.1 joerg 222 1.1 joerg bool isCommandNameStartCharacter(char C) { 223 1.1 joerg return isLetter(C); 224 1.1 joerg } 225 1.1 joerg 226 1.1 joerg bool isCommandNameCharacter(char C) { 227 1.1 joerg return isAlphanumeric(C); 228 1.1 joerg } 229 1.1 joerg 230 1.1 joerg const char *skipCommandName(const char *BufferPtr, const char *BufferEnd) { 231 1.1 joerg for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 232 1.1 joerg if (!isCommandNameCharacter(*BufferPtr)) 233 1.1 joerg return BufferPtr; 234 1.1 joerg } 235 1.1 joerg return BufferEnd; 236 1.1 joerg } 237 1.1 joerg 238 1.1 joerg /// Return the one past end pointer for BCPL comments. 239 1.1 joerg /// Handles newlines escaped with backslash or trigraph for backslahs. 240 1.1 joerg const char *findBCPLCommentEnd(const char *BufferPtr, const char *BufferEnd) { 241 1.1 joerg const char *CurPtr = BufferPtr; 242 1.1 joerg while (CurPtr != BufferEnd) { 243 1.1 joerg while (!isVerticalWhitespace(*CurPtr)) { 244 1.1 joerg CurPtr++; 245 1.1 joerg if (CurPtr == BufferEnd) 246 1.1 joerg return BufferEnd; 247 1.1 joerg } 248 1.1 joerg // We found a newline, check if it is escaped. 249 1.1 joerg const char *EscapePtr = CurPtr - 1; 250 1.1 joerg while(isHorizontalWhitespace(*EscapePtr)) 251 1.1 joerg EscapePtr--; 252 1.1 joerg 253 1.1 joerg if (*EscapePtr == '\\' || 254 1.1 joerg (EscapePtr - 2 >= BufferPtr && EscapePtr[0] == '/' && 255 1.1 joerg EscapePtr[-1] == '?' && EscapePtr[-2] == '?')) { 256 1.1 joerg // We found an escaped newline. 257 1.1 joerg CurPtr = skipNewline(CurPtr, BufferEnd); 258 1.1 joerg } else 259 1.1 joerg return CurPtr; // Not an escaped newline. 260 1.1 joerg } 261 1.1 joerg return BufferEnd; 262 1.1 joerg } 263 1.1 joerg 264 1.1 joerg /// Return the one past end pointer for C comments. 265 1.1 joerg /// Very dumb, does not handle escaped newlines or trigraphs. 266 1.1 joerg const char *findCCommentEnd(const char *BufferPtr, const char *BufferEnd) { 267 1.1 joerg for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 268 1.1 joerg if (*BufferPtr == '*') { 269 1.1 joerg assert(BufferPtr + 1 != BufferEnd); 270 1.1 joerg if (*(BufferPtr + 1) == '/') 271 1.1 joerg return BufferPtr; 272 1.1 joerg } 273 1.1 joerg } 274 1.1 joerg llvm_unreachable("buffer end hit before '*/' was seen"); 275 1.1 joerg } 276 1.1 joerg 277 1.1 joerg } // end anonymous namespace 278 1.1 joerg 279 1.1 joerg void Lexer::formTokenWithChars(Token &Result, const char *TokEnd, 280 1.1 joerg tok::TokenKind Kind) { 281 1.1 joerg const unsigned TokLen = TokEnd - BufferPtr; 282 1.1 joerg Result.setLocation(getSourceLocation(BufferPtr)); 283 1.1 joerg Result.setKind(Kind); 284 1.1 joerg Result.setLength(TokLen); 285 1.1 joerg #ifndef NDEBUG 286 1.1 joerg Result.TextPtr = "<UNSET>"; 287 1.1 joerg Result.IntVal = 7; 288 1.1 joerg #endif 289 1.1 joerg BufferPtr = TokEnd; 290 1.1 joerg } 291 1.1 joerg 292 1.1 joerg void Lexer::lexCommentText(Token &T) { 293 1.1 joerg assert(CommentState == LCS_InsideBCPLComment || 294 1.1 joerg CommentState == LCS_InsideCComment); 295 1.1 joerg 296 1.1 joerg // Handles lexing non-command text, i.e. text and newline. 297 1.1 joerg auto HandleNonCommandToken = [&]() -> void { 298 1.1 joerg assert(State == LS_Normal); 299 1.1 joerg 300 1.1 joerg const char *TokenPtr = BufferPtr; 301 1.1 joerg assert(TokenPtr < CommentEnd); 302 1.1 joerg switch (*TokenPtr) { 303 1.1 joerg case '\n': 304 1.1 joerg case '\r': 305 1.1 joerg TokenPtr = skipNewline(TokenPtr, CommentEnd); 306 1.1 joerg formTokenWithChars(T, TokenPtr, tok::newline); 307 1.1 joerg 308 1.1 joerg if (CommentState == LCS_InsideCComment) 309 1.1 joerg skipLineStartingDecorations(); 310 1.1 joerg return; 311 1.1 joerg 312 1.1 joerg default: { 313 1.1 joerg StringRef TokStartSymbols = ParseCommands ? "\n\r\\@&<" : "\n\r"; 314 1.1 joerg size_t End = StringRef(TokenPtr, CommentEnd - TokenPtr) 315 1.1 joerg .find_first_of(TokStartSymbols); 316 1.1 joerg if (End != StringRef::npos) 317 1.1 joerg TokenPtr += End; 318 1.1 joerg else 319 1.1 joerg TokenPtr = CommentEnd; 320 1.1 joerg formTextToken(T, TokenPtr); 321 1.1 joerg return; 322 1.1 joerg } 323 1.1 joerg } 324 1.1 joerg }; 325 1.1 joerg 326 1.1 joerg if (!ParseCommands) 327 1.1 joerg return HandleNonCommandToken(); 328 1.1 joerg 329 1.1 joerg switch (State) { 330 1.1 joerg case LS_Normal: 331 1.1 joerg break; 332 1.1 joerg case LS_VerbatimBlockFirstLine: 333 1.1 joerg lexVerbatimBlockFirstLine(T); 334 1.1 joerg return; 335 1.1 joerg case LS_VerbatimBlockBody: 336 1.1 joerg lexVerbatimBlockBody(T); 337 1.1 joerg return; 338 1.1 joerg case LS_VerbatimLineText: 339 1.1 joerg lexVerbatimLineText(T); 340 1.1 joerg return; 341 1.1 joerg case LS_HTMLStartTag: 342 1.1 joerg lexHTMLStartTag(T); 343 1.1 joerg return; 344 1.1 joerg case LS_HTMLEndTag: 345 1.1 joerg lexHTMLEndTag(T); 346 1.1 joerg return; 347 1.1 joerg } 348 1.1 joerg 349 1.1 joerg assert(State == LS_Normal); 350 1.1 joerg const char *TokenPtr = BufferPtr; 351 1.1 joerg assert(TokenPtr < CommentEnd); 352 1.1 joerg switch(*TokenPtr) { 353 1.1 joerg case '\\': 354 1.1 joerg case '@': { 355 1.1 joerg // Commands that start with a backslash and commands that start with 356 1.1 joerg // 'at' have equivalent semantics. But we keep information about the 357 1.1 joerg // exact syntax in AST for comments. 358 1.1 joerg tok::TokenKind CommandKind = 359 1.1 joerg (*TokenPtr == '@') ? tok::at_command : tok::backslash_command; 360 1.1 joerg TokenPtr++; 361 1.1 joerg if (TokenPtr == CommentEnd) { 362 1.1 joerg formTextToken(T, TokenPtr); 363 1.1 joerg return; 364 1.1 joerg } 365 1.1 joerg char C = *TokenPtr; 366 1.1 joerg switch (C) { 367 1.1 joerg default: 368 1.1 joerg break; 369 1.1 joerg 370 1.1 joerg case '\\': case '@': case '&': case '$': 371 1.1 joerg case '#': case '<': case '>': case '%': 372 1.1 joerg case '\"': case '.': case ':': 373 1.1 joerg // This is one of \\ \@ \& \$ etc escape sequences. 374 1.1 joerg TokenPtr++; 375 1.1 joerg if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') { 376 1.1 joerg // This is the \:: escape sequence. 377 1.1 joerg TokenPtr++; 378 1.1 joerg } 379 1.1 joerg StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1)); 380 1.1 joerg formTokenWithChars(T, TokenPtr, tok::text); 381 1.1 joerg T.setText(UnescapedText); 382 1.1 joerg return; 383 1.1 joerg } 384 1.1 joerg 385 1.1 joerg // Don't make zero-length commands. 386 1.1 joerg if (!isCommandNameStartCharacter(*TokenPtr)) { 387 1.1 joerg formTextToken(T, TokenPtr); 388 1.1 joerg return; 389 1.1 joerg } 390 1.1 joerg 391 1.1 joerg TokenPtr = skipCommandName(TokenPtr, CommentEnd); 392 1.1 joerg unsigned Length = TokenPtr - (BufferPtr + 1); 393 1.1 joerg 394 1.1 joerg // Hardcoded support for lexing LaTeX formula commands 395 1.1 joerg // \f$ \f[ \f] \f{ \f} as a single command. 396 1.1 joerg if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) { 397 1.1 joerg C = *TokenPtr; 398 1.1 joerg if (C == '$' || C == '[' || C == ']' || C == '{' || C == '}') { 399 1.1 joerg TokenPtr++; 400 1.1 joerg Length++; 401 1.1 joerg } 402 1.1 joerg } 403 1.1 joerg 404 1.1 joerg StringRef CommandName(BufferPtr + 1, Length); 405 1.1 joerg 406 1.1 joerg const CommandInfo *Info = Traits.getCommandInfoOrNULL(CommandName); 407 1.1 joerg if (!Info) { 408 1.1 joerg if ((Info = Traits.getTypoCorrectCommandInfo(CommandName))) { 409 1.1 joerg StringRef CorrectedName = Info->Name; 410 1.1 joerg SourceLocation Loc = getSourceLocation(BufferPtr); 411 1.1 joerg SourceLocation EndLoc = getSourceLocation(TokenPtr); 412 1.1 joerg SourceRange FullRange = SourceRange(Loc, EndLoc); 413 1.1 joerg SourceRange CommandRange(Loc.getLocWithOffset(1), EndLoc); 414 1.1 joerg Diag(Loc, diag::warn_correct_comment_command_name) 415 1.1 joerg << FullRange << CommandName << CorrectedName 416 1.1 joerg << FixItHint::CreateReplacement(CommandRange, CorrectedName); 417 1.1 joerg } else { 418 1.1 joerg formTokenWithChars(T, TokenPtr, tok::unknown_command); 419 1.1 joerg T.setUnknownCommandName(CommandName); 420 1.1 joerg Diag(T.getLocation(), diag::warn_unknown_comment_command_name) 421 1.1 joerg << SourceRange(T.getLocation(), T.getEndLocation()); 422 1.1 joerg return; 423 1.1 joerg } 424 1.1 joerg } 425 1.1 joerg if (Info->IsVerbatimBlockCommand) { 426 1.1 joerg setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, Info); 427 1.1 joerg return; 428 1.1 joerg } 429 1.1 joerg if (Info->IsVerbatimLineCommand) { 430 1.1 joerg setupAndLexVerbatimLine(T, TokenPtr, Info); 431 1.1 joerg return; 432 1.1 joerg } 433 1.1 joerg formTokenWithChars(T, TokenPtr, CommandKind); 434 1.1 joerg T.setCommandID(Info->getID()); 435 1.1 joerg return; 436 1.1 joerg } 437 1.1 joerg 438 1.1 joerg case '&': 439 1.1 joerg lexHTMLCharacterReference(T); 440 1.1 joerg return; 441 1.1 joerg 442 1.1 joerg case '<': { 443 1.1 joerg TokenPtr++; 444 1.1 joerg if (TokenPtr == CommentEnd) { 445 1.1 joerg formTextToken(T, TokenPtr); 446 1.1 joerg return; 447 1.1 joerg } 448 1.1 joerg const char C = *TokenPtr; 449 1.1 joerg if (isHTMLIdentifierStartingCharacter(C)) 450 1.1 joerg setupAndLexHTMLStartTag(T); 451 1.1 joerg else if (C == '/') 452 1.1 joerg setupAndLexHTMLEndTag(T); 453 1.1 joerg else 454 1.1 joerg formTextToken(T, TokenPtr); 455 1.1 joerg return; 456 1.1 joerg } 457 1.1 joerg 458 1.1 joerg default: 459 1.1 joerg return HandleNonCommandToken(); 460 1.1 joerg } 461 1.1 joerg } 462 1.1 joerg 463 1.1 joerg void Lexer::setupAndLexVerbatimBlock(Token &T, 464 1.1 joerg const char *TextBegin, 465 1.1 joerg char Marker, const CommandInfo *Info) { 466 1.1 joerg assert(Info->IsVerbatimBlockCommand); 467 1.1 joerg 468 1.1 joerg VerbatimBlockEndCommandName.clear(); 469 1.1 joerg VerbatimBlockEndCommandName.append(Marker == '\\' ? "\\" : "@"); 470 1.1 joerg VerbatimBlockEndCommandName.append(Info->EndCommandName); 471 1.1 joerg 472 1.1 joerg formTokenWithChars(T, TextBegin, tok::verbatim_block_begin); 473 1.1 joerg T.setVerbatimBlockID(Info->getID()); 474 1.1 joerg 475 1.1 joerg // If there is a newline following the verbatim opening command, skip the 476 1.1 joerg // newline so that we don't create an tok::verbatim_block_line with empty 477 1.1 joerg // text content. 478 1.1 joerg if (BufferPtr != CommentEnd && 479 1.1 joerg isVerticalWhitespace(*BufferPtr)) { 480 1.1 joerg BufferPtr = skipNewline(BufferPtr, CommentEnd); 481 1.1 joerg State = LS_VerbatimBlockBody; 482 1.1 joerg return; 483 1.1 joerg } 484 1.1 joerg 485 1.1 joerg State = LS_VerbatimBlockFirstLine; 486 1.1 joerg } 487 1.1 joerg 488 1.1 joerg void Lexer::lexVerbatimBlockFirstLine(Token &T) { 489 1.1 joerg again: 490 1.1 joerg assert(BufferPtr < CommentEnd); 491 1.1 joerg 492 1.1 joerg // FIXME: It would be better to scan the text once, finding either the block 493 1.1 joerg // end command or newline. 494 1.1 joerg // 495 1.1 joerg // Extract current line. 496 1.1 joerg const char *Newline = findNewline(BufferPtr, CommentEnd); 497 1.1 joerg StringRef Line(BufferPtr, Newline - BufferPtr); 498 1.1 joerg 499 1.1 joerg // Look for end command in current line. 500 1.1 joerg size_t Pos = Line.find(VerbatimBlockEndCommandName); 501 1.1 joerg const char *TextEnd; 502 1.1 joerg const char *NextLine; 503 1.1 joerg if (Pos == StringRef::npos) { 504 1.1 joerg // Current line is completely verbatim. 505 1.1 joerg TextEnd = Newline; 506 1.1 joerg NextLine = skipNewline(Newline, CommentEnd); 507 1.1 joerg } else if (Pos == 0) { 508 1.1 joerg // Current line contains just an end command. 509 1.1 joerg const char *End = BufferPtr + VerbatimBlockEndCommandName.size(); 510 1.1 joerg StringRef Name(BufferPtr + 1, End - (BufferPtr + 1)); 511 1.1 joerg formTokenWithChars(T, End, tok::verbatim_block_end); 512 1.1 joerg T.setVerbatimBlockID(Traits.getCommandInfo(Name)->getID()); 513 1.1 joerg State = LS_Normal; 514 1.1 joerg return; 515 1.1 joerg } else { 516 1.1 joerg // There is some text, followed by end command. Extract text first. 517 1.1 joerg TextEnd = BufferPtr + Pos; 518 1.1 joerg NextLine = TextEnd; 519 1.1 joerg // If there is only whitespace before end command, skip whitespace. 520 1.1 joerg if (isWhitespace(BufferPtr, TextEnd)) { 521 1.1 joerg BufferPtr = TextEnd; 522 1.1 joerg goto again; 523 1.1 joerg } 524 1.1 joerg } 525 1.1 joerg 526 1.1 joerg StringRef Text(BufferPtr, TextEnd - BufferPtr); 527 1.1 joerg formTokenWithChars(T, NextLine, tok::verbatim_block_line); 528 1.1 joerg T.setVerbatimBlockText(Text); 529 1.1 joerg 530 1.1 joerg State = LS_VerbatimBlockBody; 531 1.1 joerg } 532 1.1 joerg 533 1.1 joerg void Lexer::lexVerbatimBlockBody(Token &T) { 534 1.1 joerg assert(State == LS_VerbatimBlockBody); 535 1.1 joerg 536 1.1 joerg if (CommentState == LCS_InsideCComment) 537 1.1 joerg skipLineStartingDecorations(); 538 1.1 joerg 539 1.1 joerg if (BufferPtr == CommentEnd) { 540 1.1 joerg formTokenWithChars(T, BufferPtr, tok::verbatim_block_line); 541 1.1 joerg T.setVerbatimBlockText(""); 542 1.1 joerg return; 543 1.1 joerg } 544 1.1 joerg 545 1.1 joerg lexVerbatimBlockFirstLine(T); 546 1.1 joerg } 547 1.1 joerg 548 1.1 joerg void Lexer::setupAndLexVerbatimLine(Token &T, const char *TextBegin, 549 1.1 joerg const CommandInfo *Info) { 550 1.1 joerg assert(Info->IsVerbatimLineCommand); 551 1.1 joerg formTokenWithChars(T, TextBegin, tok::verbatim_line_name); 552 1.1 joerg T.setVerbatimLineID(Info->getID()); 553 1.1 joerg 554 1.1 joerg State = LS_VerbatimLineText; 555 1.1 joerg } 556 1.1 joerg 557 1.1 joerg void Lexer::lexVerbatimLineText(Token &T) { 558 1.1 joerg assert(State == LS_VerbatimLineText); 559 1.1 joerg 560 1.1 joerg // Extract current line. 561 1.1 joerg const char *Newline = findNewline(BufferPtr, CommentEnd); 562 1.1 joerg StringRef Text(BufferPtr, Newline - BufferPtr); 563 1.1 joerg formTokenWithChars(T, Newline, tok::verbatim_line_text); 564 1.1 joerg T.setVerbatimLineText(Text); 565 1.1 joerg 566 1.1 joerg State = LS_Normal; 567 1.1 joerg } 568 1.1 joerg 569 1.1 joerg void Lexer::lexHTMLCharacterReference(Token &T) { 570 1.1 joerg const char *TokenPtr = BufferPtr; 571 1.1 joerg assert(*TokenPtr == '&'); 572 1.1 joerg TokenPtr++; 573 1.1 joerg if (TokenPtr == CommentEnd) { 574 1.1 joerg formTextToken(T, TokenPtr); 575 1.1 joerg return; 576 1.1 joerg } 577 1.1 joerg const char *NamePtr; 578 1.1 joerg bool isNamed = false; 579 1.1 joerg bool isDecimal = false; 580 1.1 joerg char C = *TokenPtr; 581 1.1 joerg if (isHTMLNamedCharacterReferenceCharacter(C)) { 582 1.1 joerg NamePtr = TokenPtr; 583 1.1 joerg TokenPtr = skipNamedCharacterReference(TokenPtr, CommentEnd); 584 1.1 joerg isNamed = true; 585 1.1 joerg } else if (C == '#') { 586 1.1 joerg TokenPtr++; 587 1.1 joerg if (TokenPtr == CommentEnd) { 588 1.1 joerg formTextToken(T, TokenPtr); 589 1.1 joerg return; 590 1.1 joerg } 591 1.1 joerg C = *TokenPtr; 592 1.1 joerg if (isHTMLDecimalCharacterReferenceCharacter(C)) { 593 1.1 joerg NamePtr = TokenPtr; 594 1.1 joerg TokenPtr = skipDecimalCharacterReference(TokenPtr, CommentEnd); 595 1.1 joerg isDecimal = true; 596 1.1 joerg } else if (C == 'x' || C == 'X') { 597 1.1 joerg TokenPtr++; 598 1.1 joerg NamePtr = TokenPtr; 599 1.1 joerg TokenPtr = skipHexCharacterReference(TokenPtr, CommentEnd); 600 1.1 joerg } else { 601 1.1 joerg formTextToken(T, TokenPtr); 602 1.1 joerg return; 603 1.1 joerg } 604 1.1 joerg } else { 605 1.1 joerg formTextToken(T, TokenPtr); 606 1.1 joerg return; 607 1.1 joerg } 608 1.1 joerg if (NamePtr == TokenPtr || TokenPtr == CommentEnd || 609 1.1 joerg *TokenPtr != ';') { 610 1.1 joerg formTextToken(T, TokenPtr); 611 1.1 joerg return; 612 1.1 joerg } 613 1.1 joerg StringRef Name(NamePtr, TokenPtr - NamePtr); 614 1.1 joerg TokenPtr++; // Skip semicolon. 615 1.1 joerg StringRef Resolved; 616 1.1 joerg if (isNamed) 617 1.1 joerg Resolved = resolveHTMLNamedCharacterReference(Name); 618 1.1 joerg else if (isDecimal) 619 1.1 joerg Resolved = resolveHTMLDecimalCharacterReference(Name); 620 1.1 joerg else 621 1.1 joerg Resolved = resolveHTMLHexCharacterReference(Name); 622 1.1 joerg 623 1.1 joerg if (Resolved.empty()) { 624 1.1 joerg formTextToken(T, TokenPtr); 625 1.1 joerg return; 626 1.1 joerg } 627 1.1 joerg formTokenWithChars(T, TokenPtr, tok::text); 628 1.1 joerg T.setText(Resolved); 629 1.1 joerg } 630 1.1 joerg 631 1.1 joerg void Lexer::setupAndLexHTMLStartTag(Token &T) { 632 1.1 joerg assert(BufferPtr[0] == '<' && 633 1.1 joerg isHTMLIdentifierStartingCharacter(BufferPtr[1])); 634 1.1 joerg const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd); 635 1.1 joerg StringRef Name(BufferPtr + 1, TagNameEnd - (BufferPtr + 1)); 636 1.1 joerg if (!isHTMLTagName(Name)) { 637 1.1 joerg formTextToken(T, TagNameEnd); 638 1.1 joerg return; 639 1.1 joerg } 640 1.1 joerg 641 1.1 joerg formTokenWithChars(T, TagNameEnd, tok::html_start_tag); 642 1.1 joerg T.setHTMLTagStartName(Name); 643 1.1 joerg 644 1.1 joerg BufferPtr = skipWhitespace(BufferPtr, CommentEnd); 645 1.1 joerg 646 1.1 joerg const char C = *BufferPtr; 647 1.1 joerg if (BufferPtr != CommentEnd && 648 1.1 joerg (C == '>' || C == '/' || isHTMLIdentifierStartingCharacter(C))) 649 1.1 joerg State = LS_HTMLStartTag; 650 1.1 joerg } 651 1.1 joerg 652 1.1 joerg void Lexer::lexHTMLStartTag(Token &T) { 653 1.1 joerg assert(State == LS_HTMLStartTag); 654 1.1 joerg 655 1.1 joerg const char *TokenPtr = BufferPtr; 656 1.1 joerg char C = *TokenPtr; 657 1.1 joerg if (isHTMLIdentifierCharacter(C)) { 658 1.1 joerg TokenPtr = skipHTMLIdentifier(TokenPtr, CommentEnd); 659 1.1 joerg StringRef Ident(BufferPtr, TokenPtr - BufferPtr); 660 1.1 joerg formTokenWithChars(T, TokenPtr, tok::html_ident); 661 1.1 joerg T.setHTMLIdent(Ident); 662 1.1 joerg } else { 663 1.1 joerg switch (C) { 664 1.1 joerg case '=': 665 1.1 joerg TokenPtr++; 666 1.1 joerg formTokenWithChars(T, TokenPtr, tok::html_equals); 667 1.1 joerg break; 668 1.1 joerg case '\"': 669 1.1 joerg case '\'': { 670 1.1 joerg const char *OpenQuote = TokenPtr; 671 1.1 joerg TokenPtr = skipHTMLQuotedString(TokenPtr, CommentEnd); 672 1.1 joerg const char *ClosingQuote = TokenPtr; 673 1.1 joerg if (TokenPtr != CommentEnd) // Skip closing quote. 674 1.1 joerg TokenPtr++; 675 1.1 joerg formTokenWithChars(T, TokenPtr, tok::html_quoted_string); 676 1.1 joerg T.setHTMLQuotedString(StringRef(OpenQuote + 1, 677 1.1 joerg ClosingQuote - (OpenQuote + 1))); 678 1.1 joerg break; 679 1.1 joerg } 680 1.1 joerg case '>': 681 1.1 joerg TokenPtr++; 682 1.1 joerg formTokenWithChars(T, TokenPtr, tok::html_greater); 683 1.1 joerg State = LS_Normal; 684 1.1 joerg return; 685 1.1 joerg case '/': 686 1.1 joerg TokenPtr++; 687 1.1 joerg if (TokenPtr != CommentEnd && *TokenPtr == '>') { 688 1.1 joerg TokenPtr++; 689 1.1 joerg formTokenWithChars(T, TokenPtr, tok::html_slash_greater); 690 1.1 joerg } else 691 1.1 joerg formTextToken(T, TokenPtr); 692 1.1 joerg 693 1.1 joerg State = LS_Normal; 694 1.1 joerg return; 695 1.1 joerg } 696 1.1 joerg } 697 1.1 joerg 698 1.1 joerg // Now look ahead and return to normal state if we don't see any HTML tokens 699 1.1 joerg // ahead. 700 1.1 joerg BufferPtr = skipWhitespace(BufferPtr, CommentEnd); 701 1.1 joerg if (BufferPtr == CommentEnd) { 702 1.1 joerg State = LS_Normal; 703 1.1 joerg return; 704 1.1 joerg } 705 1.1 joerg 706 1.1 joerg C = *BufferPtr; 707 1.1 joerg if (!isHTMLIdentifierStartingCharacter(C) && 708 1.1 joerg C != '=' && C != '\"' && C != '\'' && C != '>') { 709 1.1 joerg State = LS_Normal; 710 1.1 joerg return; 711 1.1 joerg } 712 1.1 joerg } 713 1.1 joerg 714 1.1 joerg void Lexer::setupAndLexHTMLEndTag(Token &T) { 715 1.1 joerg assert(BufferPtr[0] == '<' && BufferPtr[1] == '/'); 716 1.1 joerg 717 1.1 joerg const char *TagNameBegin = skipWhitespace(BufferPtr + 2, CommentEnd); 718 1.1 joerg const char *TagNameEnd = skipHTMLIdentifier(TagNameBegin, CommentEnd); 719 1.1 joerg StringRef Name(TagNameBegin, TagNameEnd - TagNameBegin); 720 1.1 joerg if (!isHTMLTagName(Name)) { 721 1.1 joerg formTextToken(T, TagNameEnd); 722 1.1 joerg return; 723 1.1 joerg } 724 1.1 joerg 725 1.1 joerg const char *End = skipWhitespace(TagNameEnd, CommentEnd); 726 1.1 joerg 727 1.1 joerg formTokenWithChars(T, End, tok::html_end_tag); 728 1.1 joerg T.setHTMLTagEndName(Name); 729 1.1 joerg 730 1.1 joerg if (BufferPtr != CommentEnd && *BufferPtr == '>') 731 1.1 joerg State = LS_HTMLEndTag; 732 1.1 joerg } 733 1.1 joerg 734 1.1 joerg void Lexer::lexHTMLEndTag(Token &T) { 735 1.1 joerg assert(BufferPtr != CommentEnd && *BufferPtr == '>'); 736 1.1 joerg 737 1.1 joerg formTokenWithChars(T, BufferPtr + 1, tok::html_greater); 738 1.1 joerg State = LS_Normal; 739 1.1 joerg } 740 1.1 joerg 741 1.1 joerg Lexer::Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags, 742 1.1 joerg const CommandTraits &Traits, SourceLocation FileLoc, 743 1.1.1.2 joerg const char *BufferStart, const char *BufferEnd, bool ParseCommands) 744 1.1 joerg : Allocator(Allocator), Diags(Diags), Traits(Traits), 745 1.1.1.2 joerg BufferStart(BufferStart), BufferEnd(BufferEnd), BufferPtr(BufferStart), 746 1.1.1.2 joerg FileLoc(FileLoc), ParseCommands(ParseCommands), 747 1.1.1.2 joerg CommentState(LCS_BeforeComment), State(LS_Normal) {} 748 1.1 joerg 749 1.1 joerg void Lexer::lex(Token &T) { 750 1.1 joerg again: 751 1.1 joerg switch (CommentState) { 752 1.1 joerg case LCS_BeforeComment: 753 1.1 joerg if (BufferPtr == BufferEnd) { 754 1.1 joerg formTokenWithChars(T, BufferPtr, tok::eof); 755 1.1 joerg return; 756 1.1 joerg } 757 1.1 joerg 758 1.1 joerg assert(*BufferPtr == '/'); 759 1.1 joerg BufferPtr++; // Skip first slash. 760 1.1 joerg switch(*BufferPtr) { 761 1.1 joerg case '/': { // BCPL comment. 762 1.1 joerg BufferPtr++; // Skip second slash. 763 1.1 joerg 764 1.1 joerg if (BufferPtr != BufferEnd) { 765 1.1 joerg // Skip Doxygen magic marker, if it is present. 766 1.1 joerg // It might be missing because of a typo //< or /*<, or because we 767 1.1 joerg // merged this non-Doxygen comment into a bunch of Doxygen comments 768 1.1 joerg // around it: /** ... */ /* ... */ /** ... */ 769 1.1 joerg const char C = *BufferPtr; 770 1.1 joerg if (C == '/' || C == '!') 771 1.1 joerg BufferPtr++; 772 1.1 joerg } 773 1.1 joerg 774 1.1 joerg // Skip less-than symbol that marks trailing comments. 775 1.1 joerg // Skip it even if the comment is not a Doxygen one, because //< and /*< 776 1.1 joerg // are frequent typos. 777 1.1 joerg if (BufferPtr != BufferEnd && *BufferPtr == '<') 778 1.1 joerg BufferPtr++; 779 1.1 joerg 780 1.1 joerg CommentState = LCS_InsideBCPLComment; 781 1.1 joerg if (State != LS_VerbatimBlockBody && State != LS_VerbatimBlockFirstLine) 782 1.1 joerg State = LS_Normal; 783 1.1 joerg CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd); 784 1.1 joerg goto again; 785 1.1 joerg } 786 1.1 joerg case '*': { // C comment. 787 1.1 joerg BufferPtr++; // Skip star. 788 1.1 joerg 789 1.1 joerg // Skip Doxygen magic marker. 790 1.1 joerg const char C = *BufferPtr; 791 1.1 joerg if ((C == '*' && *(BufferPtr + 1) != '/') || C == '!') 792 1.1 joerg BufferPtr++; 793 1.1 joerg 794 1.1 joerg // Skip less-than symbol that marks trailing comments. 795 1.1 joerg if (BufferPtr != BufferEnd && *BufferPtr == '<') 796 1.1 joerg BufferPtr++; 797 1.1 joerg 798 1.1 joerg CommentState = LCS_InsideCComment; 799 1.1 joerg State = LS_Normal; 800 1.1 joerg CommentEnd = findCCommentEnd(BufferPtr, BufferEnd); 801 1.1 joerg goto again; 802 1.1 joerg } 803 1.1 joerg default: 804 1.1 joerg llvm_unreachable("second character of comment should be '/' or '*'"); 805 1.1 joerg } 806 1.1 joerg 807 1.1 joerg case LCS_BetweenComments: { 808 1.1 joerg // Consecutive comments are extracted only if there is only whitespace 809 1.1 joerg // between them. So we can search for the start of the next comment. 810 1.1 joerg const char *EndWhitespace = BufferPtr; 811 1.1 joerg while(EndWhitespace != BufferEnd && *EndWhitespace != '/') 812 1.1 joerg EndWhitespace++; 813 1.1 joerg 814 1.1 joerg // Turn any whitespace between comments (and there is only whitespace 815 1.1 joerg // between them -- guaranteed by comment extraction) into a newline. We 816 1.1 joerg // have two newlines between C comments in total (first one was synthesized 817 1.1 joerg // after a comment). 818 1.1 joerg formTokenWithChars(T, EndWhitespace, tok::newline); 819 1.1 joerg 820 1.1 joerg CommentState = LCS_BeforeComment; 821 1.1 joerg break; 822 1.1 joerg } 823 1.1 joerg 824 1.1 joerg case LCS_InsideBCPLComment: 825 1.1 joerg case LCS_InsideCComment: 826 1.1 joerg if (BufferPtr != CommentEnd) { 827 1.1 joerg lexCommentText(T); 828 1.1 joerg break; 829 1.1 joerg } else { 830 1.1 joerg // Skip C comment closing sequence. 831 1.1 joerg if (CommentState == LCS_InsideCComment) { 832 1.1 joerg assert(BufferPtr[0] == '*' && BufferPtr[1] == '/'); 833 1.1 joerg BufferPtr += 2; 834 1.1 joerg assert(BufferPtr <= BufferEnd); 835 1.1 joerg 836 1.1 joerg // Synthenize newline just after the C comment, regardless if there is 837 1.1 joerg // actually a newline. 838 1.1 joerg formTokenWithChars(T, BufferPtr, tok::newline); 839 1.1 joerg 840 1.1 joerg CommentState = LCS_BetweenComments; 841 1.1 joerg break; 842 1.1 joerg } else { 843 1.1 joerg // Don't synthesized a newline after BCPL comment. 844 1.1 joerg CommentState = LCS_BetweenComments; 845 1.1 joerg goto again; 846 1.1 joerg } 847 1.1 joerg } 848 1.1 joerg } 849 1.1 joerg } 850 1.1 joerg 851 1.1 joerg StringRef Lexer::getSpelling(const Token &Tok, 852 1.1 joerg const SourceManager &SourceMgr) const { 853 1.1 joerg SourceLocation Loc = Tok.getLocation(); 854 1.1 joerg std::pair<FileID, unsigned> LocInfo = SourceMgr.getDecomposedLoc(Loc); 855 1.1 joerg 856 1.1 joerg bool InvalidTemp = false; 857 1.1 joerg StringRef File = SourceMgr.getBufferData(LocInfo.first, &InvalidTemp); 858 1.1 joerg if (InvalidTemp) 859 1.1 joerg return StringRef(); 860 1.1 joerg 861 1.1 joerg const char *Begin = File.data() + LocInfo.second; 862 1.1 joerg return StringRef(Begin, Tok.getLength()); 863 1.1 joerg } 864 1.1 joerg 865 1.1 joerg } // end namespace comments 866 1.1 joerg } // end namespace clang 867