CommentLexer.cpp revision 1.1.1.1 1 1.1 joerg //===--- CommentLexer.cpp -------------------------------------------------===//
2 1.1 joerg //
3 1.1 joerg // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 1.1 joerg // See https://llvm.org/LICENSE.txt for license information.
5 1.1 joerg // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 1.1 joerg //
7 1.1 joerg //===----------------------------------------------------------------------===//
8 1.1 joerg
9 1.1 joerg #include "clang/AST/CommentLexer.h"
10 1.1 joerg #include "clang/AST/CommentCommandTraits.h"
11 1.1 joerg #include "clang/AST/CommentDiagnostic.h"
12 1.1 joerg #include "clang/Basic/CharInfo.h"
13 1.1 joerg #include "llvm/ADT/StringExtras.h"
14 1.1 joerg #include "llvm/ADT/StringSwitch.h"
15 1.1 joerg #include "llvm/Support/ConvertUTF.h"
16 1.1 joerg #include "llvm/Support/ErrorHandling.h"
17 1.1 joerg
18 1.1 joerg namespace clang {
19 1.1 joerg namespace comments {
20 1.1 joerg
21 1.1 joerg void Token::dump(const Lexer &L, const SourceManager &SM) const {
22 1.1 joerg llvm::errs() << "comments::Token Kind=" << Kind << " ";
23 1.1 joerg Loc.print(llvm::errs(), SM);
24 1.1 joerg llvm::errs() << " " << Length << " \"" << L.getSpelling(*this, SM) << "\"\n";
25 1.1 joerg }
26 1.1 joerg
27 1.1 joerg static inline bool isHTMLNamedCharacterReferenceCharacter(char C) {
28 1.1 joerg return isLetter(C);
29 1.1 joerg }
30 1.1 joerg
31 1.1 joerg static inline bool isHTMLDecimalCharacterReferenceCharacter(char C) {
32 1.1 joerg return isDigit(C);
33 1.1 joerg }
34 1.1 joerg
35 1.1 joerg static inline bool isHTMLHexCharacterReferenceCharacter(char C) {
36 1.1 joerg return isHexDigit(C);
37 1.1 joerg }
38 1.1 joerg
39 1.1 joerg static inline StringRef convertCodePointToUTF8(
40 1.1 joerg llvm::BumpPtrAllocator &Allocator,
41 1.1 joerg unsigned CodePoint) {
42 1.1 joerg char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
43 1.1 joerg char *ResolvedPtr = Resolved;
44 1.1 joerg if (llvm::ConvertCodePointToUTF8(CodePoint, ResolvedPtr))
45 1.1 joerg return StringRef(Resolved, ResolvedPtr - Resolved);
46 1.1 joerg else
47 1.1 joerg return StringRef();
48 1.1 joerg }
49 1.1 joerg
50 1.1 joerg namespace {
51 1.1 joerg
52 1.1 joerg #include "clang/AST/CommentHTMLTags.inc"
53 1.1 joerg #include "clang/AST/CommentHTMLNamedCharacterReferences.inc"
54 1.1 joerg
55 1.1 joerg } // end anonymous namespace
56 1.1 joerg
57 1.1 joerg StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const {
58 1.1 joerg // Fast path, first check a few most widely used named character references.
59 1.1 joerg return llvm::StringSwitch<StringRef>(Name)
60 1.1 joerg .Case("amp", "&")
61 1.1 joerg .Case("lt", "<")
62 1.1 joerg .Case("gt", ">")
63 1.1 joerg .Case("quot", "\"")
64 1.1 joerg .Case("apos", "\'")
65 1.1 joerg // Slow path.
66 1.1 joerg .Default(translateHTMLNamedCharacterReferenceToUTF8(Name));
67 1.1 joerg }
68 1.1 joerg
69 1.1 joerg StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const {
70 1.1 joerg unsigned CodePoint = 0;
71 1.1 joerg for (unsigned i = 0, e = Name.size(); i != e; ++i) {
72 1.1 joerg assert(isHTMLDecimalCharacterReferenceCharacter(Name[i]));
73 1.1 joerg CodePoint *= 10;
74 1.1 joerg CodePoint += Name[i] - '0';
75 1.1 joerg }
76 1.1 joerg return convertCodePointToUTF8(Allocator, CodePoint);
77 1.1 joerg }
78 1.1 joerg
79 1.1 joerg StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const {
80 1.1 joerg unsigned CodePoint = 0;
81 1.1 joerg for (unsigned i = 0, e = Name.size(); i != e; ++i) {
82 1.1 joerg CodePoint *= 16;
83 1.1 joerg const char C = Name[i];
84 1.1 joerg assert(isHTMLHexCharacterReferenceCharacter(C));
85 1.1 joerg CodePoint += llvm::hexDigitValue(C);
86 1.1 joerg }
87 1.1 joerg return convertCodePointToUTF8(Allocator, CodePoint);
88 1.1 joerg }
89 1.1 joerg
90 1.1 joerg void Lexer::skipLineStartingDecorations() {
91 1.1 joerg // This function should be called only for C comments
92 1.1 joerg assert(CommentState == LCS_InsideCComment);
93 1.1 joerg
94 1.1 joerg if (BufferPtr == CommentEnd)
95 1.1 joerg return;
96 1.1 joerg
97 1.1 joerg switch (*BufferPtr) {
98 1.1 joerg case ' ':
99 1.1 joerg case '\t':
100 1.1 joerg case '\f':
101 1.1 joerg case '\v': {
102 1.1 joerg const char *NewBufferPtr = BufferPtr;
103 1.1 joerg NewBufferPtr++;
104 1.1 joerg if (NewBufferPtr == CommentEnd)
105 1.1 joerg return;
106 1.1 joerg
107 1.1 joerg char C = *NewBufferPtr;
108 1.1 joerg while (isHorizontalWhitespace(C)) {
109 1.1 joerg NewBufferPtr++;
110 1.1 joerg if (NewBufferPtr == CommentEnd)
111 1.1 joerg return;
112 1.1 joerg C = *NewBufferPtr;
113 1.1 joerg }
114 1.1 joerg if (C == '*')
115 1.1 joerg BufferPtr = NewBufferPtr + 1;
116 1.1 joerg break;
117 1.1 joerg }
118 1.1 joerg case '*':
119 1.1 joerg BufferPtr++;
120 1.1 joerg break;
121 1.1 joerg }
122 1.1 joerg }
123 1.1 joerg
124 1.1 joerg namespace {
125 1.1 joerg /// Returns pointer to the first newline character in the string.
126 1.1 joerg const char *findNewline(const char *BufferPtr, const char *BufferEnd) {
127 1.1 joerg for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
128 1.1 joerg if (isVerticalWhitespace(*BufferPtr))
129 1.1 joerg return BufferPtr;
130 1.1 joerg }
131 1.1 joerg return BufferEnd;
132 1.1 joerg }
133 1.1 joerg
/// Skip one line terminator ("\n", "\r" or "\r\n") starting at \p BufferPtr.
///
/// \p BufferPtr must either equal \p BufferEnd (in which case it is returned
/// unchanged) or point at '\n' or '\r'.
///
/// \returns a pointer just past the terminator.
const char *skipNewline(const char *BufferPtr, const char *BufferEnd) {
  if (BufferPtr == BufferEnd)
    return BufferPtr;

  const char First = *BufferPtr;
  ++BufferPtr;
  if (First == '\n')
    return BufferPtr;

  assert(First == '\r');
  // Treat "\r\n" as a single terminator.
  if (BufferPtr != BufferEnd && *BufferPtr == '\n')
    ++BufferPtr;
  return BufferPtr;
}
148 1.1 joerg
149 1.1 joerg const char *skipNamedCharacterReference(const char *BufferPtr,
150 1.1 joerg const char *BufferEnd) {
151 1.1 joerg for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
152 1.1 joerg if (!isHTMLNamedCharacterReferenceCharacter(*BufferPtr))
153 1.1 joerg return BufferPtr;
154 1.1 joerg }
155 1.1 joerg return BufferEnd;
156 1.1 joerg }
157 1.1 joerg
158 1.1 joerg const char *skipDecimalCharacterReference(const char *BufferPtr,
159 1.1 joerg const char *BufferEnd) {
160 1.1 joerg for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
161 1.1 joerg if (!isHTMLDecimalCharacterReferenceCharacter(*BufferPtr))
162 1.1 joerg return BufferPtr;
163 1.1 joerg }
164 1.1 joerg return BufferEnd;
165 1.1 joerg }
166 1.1 joerg
167 1.1 joerg const char *skipHexCharacterReference(const char *BufferPtr,
168 1.1 joerg const char *BufferEnd) {
169 1.1 joerg for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
170 1.1 joerg if (!isHTMLHexCharacterReferenceCharacter(*BufferPtr))
171 1.1 joerg return BufferPtr;
172 1.1 joerg }
173 1.1 joerg return BufferEnd;
174 1.1 joerg }
175 1.1 joerg
176 1.1 joerg bool isHTMLIdentifierStartingCharacter(char C) {
177 1.1 joerg return isLetter(C);
178 1.1 joerg }
179 1.1 joerg
180 1.1 joerg bool isHTMLIdentifierCharacter(char C) {
181 1.1 joerg return isAlphanumeric(C);
182 1.1 joerg }
183 1.1 joerg
184 1.1 joerg const char *skipHTMLIdentifier(const char *BufferPtr, const char *BufferEnd) {
185 1.1 joerg for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
186 1.1 joerg if (!isHTMLIdentifierCharacter(*BufferPtr))
187 1.1 joerg return BufferPtr;
188 1.1 joerg }
189 1.1 joerg return BufferEnd;
190 1.1 joerg }
191 1.1 joerg
/// Skip an HTML string enclosed in single or double quotes.  A quote
/// character directly preceded by a backslash does not end the string.
///
/// \p BufferPtr must point at the opening quote.
///
/// \returns a pointer to the closing quote, or \p BufferEnd if the string is
/// not terminated.
const char *skipHTMLQuotedString(const char *BufferPtr, const char *BufferEnd)
{
  const char Quote = *BufferPtr;
  assert(Quote == '\"' || Quote == '\'');

  const char *Cur = BufferPtr + 1;
  while (Cur != BufferEnd) {
    const bool IsUnescapedQuote = *Cur == Quote && Cur[-1] != '\\';
    if (IsUnescapedQuote)
      return Cur;
    ++Cur;
  }
  return BufferEnd;
}
209 1.1 joerg
210 1.1 joerg const char *skipWhitespace(const char *BufferPtr, const char *BufferEnd) {
211 1.1 joerg for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
212 1.1 joerg if (!isWhitespace(*BufferPtr))
213 1.1 joerg return BufferPtr;
214 1.1 joerg }
215 1.1 joerg return BufferEnd;
216 1.1 joerg }
217 1.1 joerg
218 1.1 joerg bool isWhitespace(const char *BufferPtr, const char *BufferEnd) {
219 1.1 joerg return skipWhitespace(BufferPtr, BufferEnd) == BufferEnd;
220 1.1 joerg }
221 1.1 joerg
222 1.1 joerg bool isCommandNameStartCharacter(char C) {
223 1.1 joerg return isLetter(C);
224 1.1 joerg }
225 1.1 joerg
226 1.1 joerg bool isCommandNameCharacter(char C) {
227 1.1 joerg return isAlphanumeric(C);
228 1.1 joerg }
229 1.1 joerg
230 1.1 joerg const char *skipCommandName(const char *BufferPtr, const char *BufferEnd) {
231 1.1 joerg for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
232 1.1 joerg if (!isCommandNameCharacter(*BufferPtr))
233 1.1 joerg return BufferPtr;
234 1.1 joerg }
235 1.1 joerg return BufferEnd;
236 1.1 joerg }
237 1.1 joerg
238 1.1 joerg /// Return the one past end pointer for BCPL comments.
239 1.1 joerg /// Handles newlines escaped with backslash or trigraph for backslahs.
240 1.1 joerg const char *findBCPLCommentEnd(const char *BufferPtr, const char *BufferEnd) {
241 1.1 joerg const char *CurPtr = BufferPtr;
242 1.1 joerg while (CurPtr != BufferEnd) {
243 1.1 joerg while (!isVerticalWhitespace(*CurPtr)) {
244 1.1 joerg CurPtr++;
245 1.1 joerg if (CurPtr == BufferEnd)
246 1.1 joerg return BufferEnd;
247 1.1 joerg }
248 1.1 joerg // We found a newline, check if it is escaped.
249 1.1 joerg const char *EscapePtr = CurPtr - 1;
250 1.1 joerg while(isHorizontalWhitespace(*EscapePtr))
251 1.1 joerg EscapePtr--;
252 1.1 joerg
253 1.1 joerg if (*EscapePtr == '\\' ||
254 1.1 joerg (EscapePtr - 2 >= BufferPtr && EscapePtr[0] == '/' &&
255 1.1 joerg EscapePtr[-1] == '?' && EscapePtr[-2] == '?')) {
256 1.1 joerg // We found an escaped newline.
257 1.1 joerg CurPtr = skipNewline(CurPtr, BufferEnd);
258 1.1 joerg } else
259 1.1 joerg return CurPtr; // Not an escaped newline.
260 1.1 joerg }
261 1.1 joerg return BufferEnd;
262 1.1 joerg }
263 1.1 joerg
264 1.1 joerg /// Return the one past end pointer for C comments.
265 1.1 joerg /// Very dumb, does not handle escaped newlines or trigraphs.
266 1.1 joerg const char *findCCommentEnd(const char *BufferPtr, const char *BufferEnd) {
267 1.1 joerg for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
268 1.1 joerg if (*BufferPtr == '*') {
269 1.1 joerg assert(BufferPtr + 1 != BufferEnd);
270 1.1 joerg if (*(BufferPtr + 1) == '/')
271 1.1 joerg return BufferPtr;
272 1.1 joerg }
273 1.1 joerg }
274 1.1 joerg llvm_unreachable("buffer end hit before '*/' was seen");
275 1.1 joerg }
276 1.1 joerg
277 1.1 joerg } // end anonymous namespace
278 1.1 joerg
279 1.1 joerg void Lexer::formTokenWithChars(Token &Result, const char *TokEnd,
280 1.1 joerg tok::TokenKind Kind) {
281 1.1 joerg const unsigned TokLen = TokEnd - BufferPtr;
282 1.1 joerg Result.setLocation(getSourceLocation(BufferPtr));
283 1.1 joerg Result.setKind(Kind);
284 1.1 joerg Result.setLength(TokLen);
285 1.1 joerg #ifndef NDEBUG
286 1.1 joerg Result.TextPtr = "<UNSET>";
287 1.1 joerg Result.IntVal = 7;
288 1.1 joerg #endif
289 1.1 joerg BufferPtr = TokEnd;
290 1.1 joerg }
291 1.1 joerg
292 1.1 joerg void Lexer::lexCommentText(Token &T) {
293 1.1 joerg assert(CommentState == LCS_InsideBCPLComment ||
294 1.1 joerg CommentState == LCS_InsideCComment);
295 1.1 joerg
296 1.1 joerg // Handles lexing non-command text, i.e. text and newline.
297 1.1 joerg auto HandleNonCommandToken = [&]() -> void {
298 1.1 joerg assert(State == LS_Normal);
299 1.1 joerg
300 1.1 joerg const char *TokenPtr = BufferPtr;
301 1.1 joerg assert(TokenPtr < CommentEnd);
302 1.1 joerg switch (*TokenPtr) {
303 1.1 joerg case '\n':
304 1.1 joerg case '\r':
305 1.1 joerg TokenPtr = skipNewline(TokenPtr, CommentEnd);
306 1.1 joerg formTokenWithChars(T, TokenPtr, tok::newline);
307 1.1 joerg
308 1.1 joerg if (CommentState == LCS_InsideCComment)
309 1.1 joerg skipLineStartingDecorations();
310 1.1 joerg return;
311 1.1 joerg
312 1.1 joerg default: {
313 1.1 joerg StringRef TokStartSymbols = ParseCommands ? "\n\r\\@&<" : "\n\r";
314 1.1 joerg size_t End = StringRef(TokenPtr, CommentEnd - TokenPtr)
315 1.1 joerg .find_first_of(TokStartSymbols);
316 1.1 joerg if (End != StringRef::npos)
317 1.1 joerg TokenPtr += End;
318 1.1 joerg else
319 1.1 joerg TokenPtr = CommentEnd;
320 1.1 joerg formTextToken(T, TokenPtr);
321 1.1 joerg return;
322 1.1 joerg }
323 1.1 joerg }
324 1.1 joerg };
325 1.1 joerg
326 1.1 joerg if (!ParseCommands)
327 1.1 joerg return HandleNonCommandToken();
328 1.1 joerg
329 1.1 joerg switch (State) {
330 1.1 joerg case LS_Normal:
331 1.1 joerg break;
332 1.1 joerg case LS_VerbatimBlockFirstLine:
333 1.1 joerg lexVerbatimBlockFirstLine(T);
334 1.1 joerg return;
335 1.1 joerg case LS_VerbatimBlockBody:
336 1.1 joerg lexVerbatimBlockBody(T);
337 1.1 joerg return;
338 1.1 joerg case LS_VerbatimLineText:
339 1.1 joerg lexVerbatimLineText(T);
340 1.1 joerg return;
341 1.1 joerg case LS_HTMLStartTag:
342 1.1 joerg lexHTMLStartTag(T);
343 1.1 joerg return;
344 1.1 joerg case LS_HTMLEndTag:
345 1.1 joerg lexHTMLEndTag(T);
346 1.1 joerg return;
347 1.1 joerg }
348 1.1 joerg
349 1.1 joerg assert(State == LS_Normal);
350 1.1 joerg const char *TokenPtr = BufferPtr;
351 1.1 joerg assert(TokenPtr < CommentEnd);
352 1.1 joerg switch(*TokenPtr) {
353 1.1 joerg case '\\':
354 1.1 joerg case '@': {
355 1.1 joerg // Commands that start with a backslash and commands that start with
356 1.1 joerg // 'at' have equivalent semantics. But we keep information about the
357 1.1 joerg // exact syntax in AST for comments.
358 1.1 joerg tok::TokenKind CommandKind =
359 1.1 joerg (*TokenPtr == '@') ? tok::at_command : tok::backslash_command;
360 1.1 joerg TokenPtr++;
361 1.1 joerg if (TokenPtr == CommentEnd) {
362 1.1 joerg formTextToken(T, TokenPtr);
363 1.1 joerg return;
364 1.1 joerg }
365 1.1 joerg char C = *TokenPtr;
366 1.1 joerg switch (C) {
367 1.1 joerg default:
368 1.1 joerg break;
369 1.1 joerg
370 1.1 joerg case '\\': case '@': case '&': case '$':
371 1.1 joerg case '#': case '<': case '>': case '%':
372 1.1 joerg case '\"': case '.': case ':':
373 1.1 joerg // This is one of \\ \@ \& \$ etc escape sequences.
374 1.1 joerg TokenPtr++;
375 1.1 joerg if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') {
376 1.1 joerg // This is the \:: escape sequence.
377 1.1 joerg TokenPtr++;
378 1.1 joerg }
379 1.1 joerg StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1));
380 1.1 joerg formTokenWithChars(T, TokenPtr, tok::text);
381 1.1 joerg T.setText(UnescapedText);
382 1.1 joerg return;
383 1.1 joerg }
384 1.1 joerg
385 1.1 joerg // Don't make zero-length commands.
386 1.1 joerg if (!isCommandNameStartCharacter(*TokenPtr)) {
387 1.1 joerg formTextToken(T, TokenPtr);
388 1.1 joerg return;
389 1.1 joerg }
390 1.1 joerg
391 1.1 joerg TokenPtr = skipCommandName(TokenPtr, CommentEnd);
392 1.1 joerg unsigned Length = TokenPtr - (BufferPtr + 1);
393 1.1 joerg
394 1.1 joerg // Hardcoded support for lexing LaTeX formula commands
395 1.1 joerg // \f$ \f[ \f] \f{ \f} as a single command.
396 1.1 joerg if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) {
397 1.1 joerg C = *TokenPtr;
398 1.1 joerg if (C == '$' || C == '[' || C == ']' || C == '{' || C == '}') {
399 1.1 joerg TokenPtr++;
400 1.1 joerg Length++;
401 1.1 joerg }
402 1.1 joerg }
403 1.1 joerg
404 1.1 joerg StringRef CommandName(BufferPtr + 1, Length);
405 1.1 joerg
406 1.1 joerg const CommandInfo *Info = Traits.getCommandInfoOrNULL(CommandName);
407 1.1 joerg if (!Info) {
408 1.1 joerg if ((Info = Traits.getTypoCorrectCommandInfo(CommandName))) {
409 1.1 joerg StringRef CorrectedName = Info->Name;
410 1.1 joerg SourceLocation Loc = getSourceLocation(BufferPtr);
411 1.1 joerg SourceLocation EndLoc = getSourceLocation(TokenPtr);
412 1.1 joerg SourceRange FullRange = SourceRange(Loc, EndLoc);
413 1.1 joerg SourceRange CommandRange(Loc.getLocWithOffset(1), EndLoc);
414 1.1 joerg Diag(Loc, diag::warn_correct_comment_command_name)
415 1.1 joerg << FullRange << CommandName << CorrectedName
416 1.1 joerg << FixItHint::CreateReplacement(CommandRange, CorrectedName);
417 1.1 joerg } else {
418 1.1 joerg formTokenWithChars(T, TokenPtr, tok::unknown_command);
419 1.1 joerg T.setUnknownCommandName(CommandName);
420 1.1 joerg Diag(T.getLocation(), diag::warn_unknown_comment_command_name)
421 1.1 joerg << SourceRange(T.getLocation(), T.getEndLocation());
422 1.1 joerg return;
423 1.1 joerg }
424 1.1 joerg }
425 1.1 joerg if (Info->IsVerbatimBlockCommand) {
426 1.1 joerg setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, Info);
427 1.1 joerg return;
428 1.1 joerg }
429 1.1 joerg if (Info->IsVerbatimLineCommand) {
430 1.1 joerg setupAndLexVerbatimLine(T, TokenPtr, Info);
431 1.1 joerg return;
432 1.1 joerg }
433 1.1 joerg formTokenWithChars(T, TokenPtr, CommandKind);
434 1.1 joerg T.setCommandID(Info->getID());
435 1.1 joerg return;
436 1.1 joerg }
437 1.1 joerg
438 1.1 joerg case '&':
439 1.1 joerg lexHTMLCharacterReference(T);
440 1.1 joerg return;
441 1.1 joerg
442 1.1 joerg case '<': {
443 1.1 joerg TokenPtr++;
444 1.1 joerg if (TokenPtr == CommentEnd) {
445 1.1 joerg formTextToken(T, TokenPtr);
446 1.1 joerg return;
447 1.1 joerg }
448 1.1 joerg const char C = *TokenPtr;
449 1.1 joerg if (isHTMLIdentifierStartingCharacter(C))
450 1.1 joerg setupAndLexHTMLStartTag(T);
451 1.1 joerg else if (C == '/')
452 1.1 joerg setupAndLexHTMLEndTag(T);
453 1.1 joerg else
454 1.1 joerg formTextToken(T, TokenPtr);
455 1.1 joerg return;
456 1.1 joerg }
457 1.1 joerg
458 1.1 joerg default:
459 1.1 joerg return HandleNonCommandToken();
460 1.1 joerg }
461 1.1 joerg }
462 1.1 joerg
463 1.1 joerg void Lexer::setupAndLexVerbatimBlock(Token &T,
464 1.1 joerg const char *TextBegin,
465 1.1 joerg char Marker, const CommandInfo *Info) {
466 1.1 joerg assert(Info->IsVerbatimBlockCommand);
467 1.1 joerg
468 1.1 joerg VerbatimBlockEndCommandName.clear();
469 1.1 joerg VerbatimBlockEndCommandName.append(Marker == '\\' ? "\\" : "@");
470 1.1 joerg VerbatimBlockEndCommandName.append(Info->EndCommandName);
471 1.1 joerg
472 1.1 joerg formTokenWithChars(T, TextBegin, tok::verbatim_block_begin);
473 1.1 joerg T.setVerbatimBlockID(Info->getID());
474 1.1 joerg
475 1.1 joerg // If there is a newline following the verbatim opening command, skip the
476 1.1 joerg // newline so that we don't create an tok::verbatim_block_line with empty
477 1.1 joerg // text content.
478 1.1 joerg if (BufferPtr != CommentEnd &&
479 1.1 joerg isVerticalWhitespace(*BufferPtr)) {
480 1.1 joerg BufferPtr = skipNewline(BufferPtr, CommentEnd);
481 1.1 joerg State = LS_VerbatimBlockBody;
482 1.1 joerg return;
483 1.1 joerg }
484 1.1 joerg
485 1.1 joerg State = LS_VerbatimBlockFirstLine;
486 1.1 joerg }
487 1.1 joerg
488 1.1 joerg void Lexer::lexVerbatimBlockFirstLine(Token &T) {
489 1.1 joerg again:
490 1.1 joerg assert(BufferPtr < CommentEnd);
491 1.1 joerg
492 1.1 joerg // FIXME: It would be better to scan the text once, finding either the block
493 1.1 joerg // end command or newline.
494 1.1 joerg //
495 1.1 joerg // Extract current line.
496 1.1 joerg const char *Newline = findNewline(BufferPtr, CommentEnd);
497 1.1 joerg StringRef Line(BufferPtr, Newline - BufferPtr);
498 1.1 joerg
499 1.1 joerg // Look for end command in current line.
500 1.1 joerg size_t Pos = Line.find(VerbatimBlockEndCommandName);
501 1.1 joerg const char *TextEnd;
502 1.1 joerg const char *NextLine;
503 1.1 joerg if (Pos == StringRef::npos) {
504 1.1 joerg // Current line is completely verbatim.
505 1.1 joerg TextEnd = Newline;
506 1.1 joerg NextLine = skipNewline(Newline, CommentEnd);
507 1.1 joerg } else if (Pos == 0) {
508 1.1 joerg // Current line contains just an end command.
509 1.1 joerg const char *End = BufferPtr + VerbatimBlockEndCommandName.size();
510 1.1 joerg StringRef Name(BufferPtr + 1, End - (BufferPtr + 1));
511 1.1 joerg formTokenWithChars(T, End, tok::verbatim_block_end);
512 1.1 joerg T.setVerbatimBlockID(Traits.getCommandInfo(Name)->getID());
513 1.1 joerg State = LS_Normal;
514 1.1 joerg return;
515 1.1 joerg } else {
516 1.1 joerg // There is some text, followed by end command. Extract text first.
517 1.1 joerg TextEnd = BufferPtr + Pos;
518 1.1 joerg NextLine = TextEnd;
519 1.1 joerg // If there is only whitespace before end command, skip whitespace.
520 1.1 joerg if (isWhitespace(BufferPtr, TextEnd)) {
521 1.1 joerg BufferPtr = TextEnd;
522 1.1 joerg goto again;
523 1.1 joerg }
524 1.1 joerg }
525 1.1 joerg
526 1.1 joerg StringRef Text(BufferPtr, TextEnd - BufferPtr);
527 1.1 joerg formTokenWithChars(T, NextLine, tok::verbatim_block_line);
528 1.1 joerg T.setVerbatimBlockText(Text);
529 1.1 joerg
530 1.1 joerg State = LS_VerbatimBlockBody;
531 1.1 joerg }
532 1.1 joerg
533 1.1 joerg void Lexer::lexVerbatimBlockBody(Token &T) {
534 1.1 joerg assert(State == LS_VerbatimBlockBody);
535 1.1 joerg
536 1.1 joerg if (CommentState == LCS_InsideCComment)
537 1.1 joerg skipLineStartingDecorations();
538 1.1 joerg
539 1.1 joerg if (BufferPtr == CommentEnd) {
540 1.1 joerg formTokenWithChars(T, BufferPtr, tok::verbatim_block_line);
541 1.1 joerg T.setVerbatimBlockText("");
542 1.1 joerg return;
543 1.1 joerg }
544 1.1 joerg
545 1.1 joerg lexVerbatimBlockFirstLine(T);
546 1.1 joerg }
547 1.1 joerg
548 1.1 joerg void Lexer::setupAndLexVerbatimLine(Token &T, const char *TextBegin,
549 1.1 joerg const CommandInfo *Info) {
550 1.1 joerg assert(Info->IsVerbatimLineCommand);
551 1.1 joerg formTokenWithChars(T, TextBegin, tok::verbatim_line_name);
552 1.1 joerg T.setVerbatimLineID(Info->getID());
553 1.1 joerg
554 1.1 joerg State = LS_VerbatimLineText;
555 1.1 joerg }
556 1.1 joerg
557 1.1 joerg void Lexer::lexVerbatimLineText(Token &T) {
558 1.1 joerg assert(State == LS_VerbatimLineText);
559 1.1 joerg
560 1.1 joerg // Extract current line.
561 1.1 joerg const char *Newline = findNewline(BufferPtr, CommentEnd);
562 1.1 joerg StringRef Text(BufferPtr, Newline - BufferPtr);
563 1.1 joerg formTokenWithChars(T, Newline, tok::verbatim_line_text);
564 1.1 joerg T.setVerbatimLineText(Text);
565 1.1 joerg
566 1.1 joerg State = LS_Normal;
567 1.1 joerg }
568 1.1 joerg
569 1.1 joerg void Lexer::lexHTMLCharacterReference(Token &T) {
570 1.1 joerg const char *TokenPtr = BufferPtr;
571 1.1 joerg assert(*TokenPtr == '&');
572 1.1 joerg TokenPtr++;
573 1.1 joerg if (TokenPtr == CommentEnd) {
574 1.1 joerg formTextToken(T, TokenPtr);
575 1.1 joerg return;
576 1.1 joerg }
577 1.1 joerg const char *NamePtr;
578 1.1 joerg bool isNamed = false;
579 1.1 joerg bool isDecimal = false;
580 1.1 joerg char C = *TokenPtr;
581 1.1 joerg if (isHTMLNamedCharacterReferenceCharacter(C)) {
582 1.1 joerg NamePtr = TokenPtr;
583 1.1 joerg TokenPtr = skipNamedCharacterReference(TokenPtr, CommentEnd);
584 1.1 joerg isNamed = true;
585 1.1 joerg } else if (C == '#') {
586 1.1 joerg TokenPtr++;
587 1.1 joerg if (TokenPtr == CommentEnd) {
588 1.1 joerg formTextToken(T, TokenPtr);
589 1.1 joerg return;
590 1.1 joerg }
591 1.1 joerg C = *TokenPtr;
592 1.1 joerg if (isHTMLDecimalCharacterReferenceCharacter(C)) {
593 1.1 joerg NamePtr = TokenPtr;
594 1.1 joerg TokenPtr = skipDecimalCharacterReference(TokenPtr, CommentEnd);
595 1.1 joerg isDecimal = true;
596 1.1 joerg } else if (C == 'x' || C == 'X') {
597 1.1 joerg TokenPtr++;
598 1.1 joerg NamePtr = TokenPtr;
599 1.1 joerg TokenPtr = skipHexCharacterReference(TokenPtr, CommentEnd);
600 1.1 joerg } else {
601 1.1 joerg formTextToken(T, TokenPtr);
602 1.1 joerg return;
603 1.1 joerg }
604 1.1 joerg } else {
605 1.1 joerg formTextToken(T, TokenPtr);
606 1.1 joerg return;
607 1.1 joerg }
608 1.1 joerg if (NamePtr == TokenPtr || TokenPtr == CommentEnd ||
609 1.1 joerg *TokenPtr != ';') {
610 1.1 joerg formTextToken(T, TokenPtr);
611 1.1 joerg return;
612 1.1 joerg }
613 1.1 joerg StringRef Name(NamePtr, TokenPtr - NamePtr);
614 1.1 joerg TokenPtr++; // Skip semicolon.
615 1.1 joerg StringRef Resolved;
616 1.1 joerg if (isNamed)
617 1.1 joerg Resolved = resolveHTMLNamedCharacterReference(Name);
618 1.1 joerg else if (isDecimal)
619 1.1 joerg Resolved = resolveHTMLDecimalCharacterReference(Name);
620 1.1 joerg else
621 1.1 joerg Resolved = resolveHTMLHexCharacterReference(Name);
622 1.1 joerg
623 1.1 joerg if (Resolved.empty()) {
624 1.1 joerg formTextToken(T, TokenPtr);
625 1.1 joerg return;
626 1.1 joerg }
627 1.1 joerg formTokenWithChars(T, TokenPtr, tok::text);
628 1.1 joerg T.setText(Resolved);
629 1.1 joerg }
630 1.1 joerg
631 1.1 joerg void Lexer::setupAndLexHTMLStartTag(Token &T) {
632 1.1 joerg assert(BufferPtr[0] == '<' &&
633 1.1 joerg isHTMLIdentifierStartingCharacter(BufferPtr[1]));
634 1.1 joerg const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd);
635 1.1 joerg StringRef Name(BufferPtr + 1, TagNameEnd - (BufferPtr + 1));
636 1.1 joerg if (!isHTMLTagName(Name)) {
637 1.1 joerg formTextToken(T, TagNameEnd);
638 1.1 joerg return;
639 1.1 joerg }
640 1.1 joerg
641 1.1 joerg formTokenWithChars(T, TagNameEnd, tok::html_start_tag);
642 1.1 joerg T.setHTMLTagStartName(Name);
643 1.1 joerg
644 1.1 joerg BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
645 1.1 joerg
646 1.1 joerg const char C = *BufferPtr;
647 1.1 joerg if (BufferPtr != CommentEnd &&
648 1.1 joerg (C == '>' || C == '/' || isHTMLIdentifierStartingCharacter(C)))
649 1.1 joerg State = LS_HTMLStartTag;
650 1.1 joerg }
651 1.1 joerg
652 1.1 joerg void Lexer::lexHTMLStartTag(Token &T) {
653 1.1 joerg assert(State == LS_HTMLStartTag);
654 1.1 joerg
655 1.1 joerg const char *TokenPtr = BufferPtr;
656 1.1 joerg char C = *TokenPtr;
657 1.1 joerg if (isHTMLIdentifierCharacter(C)) {
658 1.1 joerg TokenPtr = skipHTMLIdentifier(TokenPtr, CommentEnd);
659 1.1 joerg StringRef Ident(BufferPtr, TokenPtr - BufferPtr);
660 1.1 joerg formTokenWithChars(T, TokenPtr, tok::html_ident);
661 1.1 joerg T.setHTMLIdent(Ident);
662 1.1 joerg } else {
663 1.1 joerg switch (C) {
664 1.1 joerg case '=':
665 1.1 joerg TokenPtr++;
666 1.1 joerg formTokenWithChars(T, TokenPtr, tok::html_equals);
667 1.1 joerg break;
668 1.1 joerg case '\"':
669 1.1 joerg case '\'': {
670 1.1 joerg const char *OpenQuote = TokenPtr;
671 1.1 joerg TokenPtr = skipHTMLQuotedString(TokenPtr, CommentEnd);
672 1.1 joerg const char *ClosingQuote = TokenPtr;
673 1.1 joerg if (TokenPtr != CommentEnd) // Skip closing quote.
674 1.1 joerg TokenPtr++;
675 1.1 joerg formTokenWithChars(T, TokenPtr, tok::html_quoted_string);
676 1.1 joerg T.setHTMLQuotedString(StringRef(OpenQuote + 1,
677 1.1 joerg ClosingQuote - (OpenQuote + 1)));
678 1.1 joerg break;
679 1.1 joerg }
680 1.1 joerg case '>':
681 1.1 joerg TokenPtr++;
682 1.1 joerg formTokenWithChars(T, TokenPtr, tok::html_greater);
683 1.1 joerg State = LS_Normal;
684 1.1 joerg return;
685 1.1 joerg case '/':
686 1.1 joerg TokenPtr++;
687 1.1 joerg if (TokenPtr != CommentEnd && *TokenPtr == '>') {
688 1.1 joerg TokenPtr++;
689 1.1 joerg formTokenWithChars(T, TokenPtr, tok::html_slash_greater);
690 1.1 joerg } else
691 1.1 joerg formTextToken(T, TokenPtr);
692 1.1 joerg
693 1.1 joerg State = LS_Normal;
694 1.1 joerg return;
695 1.1 joerg }
696 1.1 joerg }
697 1.1 joerg
698 1.1 joerg // Now look ahead and return to normal state if we don't see any HTML tokens
699 1.1 joerg // ahead.
700 1.1 joerg BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
701 1.1 joerg if (BufferPtr == CommentEnd) {
702 1.1 joerg State = LS_Normal;
703 1.1 joerg return;
704 1.1 joerg }
705 1.1 joerg
706 1.1 joerg C = *BufferPtr;
707 1.1 joerg if (!isHTMLIdentifierStartingCharacter(C) &&
708 1.1 joerg C != '=' && C != '\"' && C != '\'' && C != '>') {
709 1.1 joerg State = LS_Normal;
710 1.1 joerg return;
711 1.1 joerg }
712 1.1 joerg }
713 1.1 joerg
714 1.1 joerg void Lexer::setupAndLexHTMLEndTag(Token &T) {
715 1.1 joerg assert(BufferPtr[0] == '<' && BufferPtr[1] == '/');
716 1.1 joerg
717 1.1 joerg const char *TagNameBegin = skipWhitespace(BufferPtr + 2, CommentEnd);
718 1.1 joerg const char *TagNameEnd = skipHTMLIdentifier(TagNameBegin, CommentEnd);
719 1.1 joerg StringRef Name(TagNameBegin, TagNameEnd - TagNameBegin);
720 1.1 joerg if (!isHTMLTagName(Name)) {
721 1.1 joerg formTextToken(T, TagNameEnd);
722 1.1 joerg return;
723 1.1 joerg }
724 1.1 joerg
725 1.1 joerg const char *End = skipWhitespace(TagNameEnd, CommentEnd);
726 1.1 joerg
727 1.1 joerg formTokenWithChars(T, End, tok::html_end_tag);
728 1.1 joerg T.setHTMLTagEndName(Name);
729 1.1 joerg
730 1.1 joerg if (BufferPtr != CommentEnd && *BufferPtr == '>')
731 1.1 joerg State = LS_HTMLEndTag;
732 1.1 joerg }
733 1.1 joerg
734 1.1 joerg void Lexer::lexHTMLEndTag(Token &T) {
735 1.1 joerg assert(BufferPtr != CommentEnd && *BufferPtr == '>');
736 1.1 joerg
737 1.1 joerg formTokenWithChars(T, BufferPtr + 1, tok::html_greater);
738 1.1 joerg State = LS_Normal;
739 1.1 joerg }
740 1.1 joerg
741 1.1 joerg Lexer::Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags,
742 1.1 joerg const CommandTraits &Traits, SourceLocation FileLoc,
743 1.1 joerg const char *BufferStart, const char *BufferEnd,
744 1.1 joerg bool ParseCommands)
745 1.1 joerg : Allocator(Allocator), Diags(Diags), Traits(Traits),
746 1.1 joerg BufferStart(BufferStart), BufferEnd(BufferEnd), FileLoc(FileLoc),
747 1.1 joerg BufferPtr(BufferStart), CommentState(LCS_BeforeComment), State(LS_Normal),
748 1.1 joerg ParseCommands(ParseCommands) {}
749 1.1 joerg
750 1.1 joerg void Lexer::lex(Token &T) {
751 1.1 joerg again:
752 1.1 joerg switch (CommentState) {
753 1.1 joerg case LCS_BeforeComment:
754 1.1 joerg if (BufferPtr == BufferEnd) {
755 1.1 joerg formTokenWithChars(T, BufferPtr, tok::eof);
756 1.1 joerg return;
757 1.1 joerg }
758 1.1 joerg
759 1.1 joerg assert(*BufferPtr == '/');
760 1.1 joerg BufferPtr++; // Skip first slash.
761 1.1 joerg switch(*BufferPtr) {
762 1.1 joerg case '/': { // BCPL comment.
763 1.1 joerg BufferPtr++; // Skip second slash.
764 1.1 joerg
765 1.1 joerg if (BufferPtr != BufferEnd) {
766 1.1 joerg // Skip Doxygen magic marker, if it is present.
767 1.1 joerg // It might be missing because of a typo //< or /*<, or because we
768 1.1 joerg // merged this non-Doxygen comment into a bunch of Doxygen comments
769 1.1 joerg // around it: /** ... */ /* ... */ /** ... */
770 1.1 joerg const char C = *BufferPtr;
771 1.1 joerg if (C == '/' || C == '!')
772 1.1 joerg BufferPtr++;
773 1.1 joerg }
774 1.1 joerg
775 1.1 joerg // Skip less-than symbol that marks trailing comments.
776 1.1 joerg // Skip it even if the comment is not a Doxygen one, because //< and /*<
777 1.1 joerg // are frequent typos.
778 1.1 joerg if (BufferPtr != BufferEnd && *BufferPtr == '<')
779 1.1 joerg BufferPtr++;
780 1.1 joerg
781 1.1 joerg CommentState = LCS_InsideBCPLComment;
782 1.1 joerg if (State != LS_VerbatimBlockBody && State != LS_VerbatimBlockFirstLine)
783 1.1 joerg State = LS_Normal;
784 1.1 joerg CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd);
785 1.1 joerg goto again;
786 1.1 joerg }
787 1.1 joerg case '*': { // C comment.
788 1.1 joerg BufferPtr++; // Skip star.
789 1.1 joerg
790 1.1 joerg // Skip Doxygen magic marker.
791 1.1 joerg const char C = *BufferPtr;
792 1.1 joerg if ((C == '*' && *(BufferPtr + 1) != '/') || C == '!')
793 1.1 joerg BufferPtr++;
794 1.1 joerg
795 1.1 joerg // Skip less-than symbol that marks trailing comments.
796 1.1 joerg if (BufferPtr != BufferEnd && *BufferPtr == '<')
797 1.1 joerg BufferPtr++;
798 1.1 joerg
799 1.1 joerg CommentState = LCS_InsideCComment;
800 1.1 joerg State = LS_Normal;
801 1.1 joerg CommentEnd = findCCommentEnd(BufferPtr, BufferEnd);
802 1.1 joerg goto again;
803 1.1 joerg }
804 1.1 joerg default:
805 1.1 joerg llvm_unreachable("second character of comment should be '/' or '*'");
806 1.1 joerg }
807 1.1 joerg
808 1.1 joerg case LCS_BetweenComments: {
809 1.1 joerg // Consecutive comments are extracted only if there is only whitespace
810 1.1 joerg // between them. So we can search for the start of the next comment.
811 1.1 joerg const char *EndWhitespace = BufferPtr;
812 1.1 joerg while(EndWhitespace != BufferEnd && *EndWhitespace != '/')
813 1.1 joerg EndWhitespace++;
814 1.1 joerg
815 1.1 joerg // Turn any whitespace between comments (and there is only whitespace
816 1.1 joerg // between them -- guaranteed by comment extraction) into a newline. We
817 1.1 joerg // have two newlines between C comments in total (first one was synthesized
818 1.1 joerg // after a comment).
819 1.1 joerg formTokenWithChars(T, EndWhitespace, tok::newline);
820 1.1 joerg
821 1.1 joerg CommentState = LCS_BeforeComment;
822 1.1 joerg break;
823 1.1 joerg }
824 1.1 joerg
825 1.1 joerg case LCS_InsideBCPLComment:
826 1.1 joerg case LCS_InsideCComment:
827 1.1 joerg if (BufferPtr != CommentEnd) {
828 1.1 joerg lexCommentText(T);
829 1.1 joerg break;
830 1.1 joerg } else {
831 1.1 joerg // Skip C comment closing sequence.
832 1.1 joerg if (CommentState == LCS_InsideCComment) {
833 1.1 joerg assert(BufferPtr[0] == '*' && BufferPtr[1] == '/');
834 1.1 joerg BufferPtr += 2;
835 1.1 joerg assert(BufferPtr <= BufferEnd);
836 1.1 joerg
837 1.1 joerg // Synthenize newline just after the C comment, regardless if there is
838 1.1 joerg // actually a newline.
839 1.1 joerg formTokenWithChars(T, BufferPtr, tok::newline);
840 1.1 joerg
841 1.1 joerg CommentState = LCS_BetweenComments;
842 1.1 joerg break;
843 1.1 joerg } else {
844 1.1 joerg // Don't synthesized a newline after BCPL comment.
845 1.1 joerg CommentState = LCS_BetweenComments;
846 1.1 joerg goto again;
847 1.1 joerg }
848 1.1 joerg }
849 1.1 joerg }
850 1.1 joerg }
851 1.1 joerg
852 1.1 joerg StringRef Lexer::getSpelling(const Token &Tok,
853 1.1 joerg const SourceManager &SourceMgr) const {
854 1.1 joerg SourceLocation Loc = Tok.getLocation();
855 1.1 joerg std::pair<FileID, unsigned> LocInfo = SourceMgr.getDecomposedLoc(Loc);
856 1.1 joerg
857 1.1 joerg bool InvalidTemp = false;
858 1.1 joerg StringRef File = SourceMgr.getBufferData(LocInfo.first, &InvalidTemp);
859 1.1 joerg if (InvalidTemp)
860 1.1 joerg return StringRef();
861 1.1 joerg
862 1.1 joerg const char *Begin = File.data() + LocInfo.second;
863 1.1 joerg return StringRef(Begin, Tok.getLength());
864 1.1 joerg }
865 1.1 joerg
866 1.1 joerg } // end namespace comments
867 1.1 joerg } // end namespace clang
868