Home | History | Annotate | Line # | Download | only in Format
      1 //===--- FormatTokenLexer.h - Format C++ code ----------------*- C++ ----*-===//
      2 //
      3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
      4 // See https://llvm.org/LICENSE.txt for license information.
      5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
      6 //
      7 //===----------------------------------------------------------------------===//
      8 ///
      9 /// \file
     10 /// This file contains FormatTokenLexer, which tokenizes a source file
     11 /// into a token stream suitable for ClangFormat.
     12 ///
     13 //===----------------------------------------------------------------------===//
     14 
     15 #ifndef LLVM_CLANG_LIB_FORMAT_FORMATTOKENLEXER_H
     16 #define LLVM_CLANG_LIB_FORMAT_FORMATTOKENLEXER_H
     17 
     18 #include "Encoding.h"
     19 #include "FormatToken.h"
     20 #include "clang/Basic/SourceLocation.h"
     21 #include "clang/Basic/SourceManager.h"
     22 #include "clang/Format/Format.h"
     23 #include "llvm/ADT/MapVector.h"
     24 #include "llvm/ADT/StringSet.h"
     25 #include "llvm/Support/Regex.h"
     26 
     27 #include <stack>
     28 
     29 namespace clang {
     30 namespace format {
     31 
/// Lexing contexts; FormatTokenLexer keeps a std::stack of these in its
/// StateStack member.
enum LexerState {
  /// NOTE(review): presumably the default context outside of any template
  /// string - confirm against the implementation file.
  NORMAL = 0,
  /// NOTE(review): presumably active while lexing the body of a JavaScript
  /// template string (see FormatTokenLexer::handleTemplateStrings) - confirm.
  TEMPLATE_STRING = 1,
  /// NOTE(review): presumably indicates that a token was set aside and is to
  /// be produced by FormatTokenLexer::getStashedToken - confirm.
  TOKEN_STASHED = 2,
};
     37 
     38 class FormatTokenLexer {
     39 public:
     40   FormatTokenLexer(const SourceManager &SourceMgr, FileID ID, unsigned Column,
     41                    const FormatStyle &Style, encoding::Encoding Encoding,
     42                    llvm::SpecificBumpPtrAllocator<FormatToken> &Allocator,
     43                    IdentifierTable &IdentTable);
     44 
     45   ArrayRef<FormatToken *> lex();
     46 
     47   const AdditionalKeywords &getKeywords() { return Keywords; }
     48 
     49 private:
     50   void tryMergePreviousTokens();
     51 
     52   bool tryMergeLessLess();
     53   bool tryMergeNSStringLiteral();
     54   bool tryMergeJSPrivateIdentifier();
     55   bool tryMergeCSharpStringLiteral();
     56   bool tryMergeCSharpKeywordVariables();
     57   bool tryMergeNullishCoalescingEqual();
     58   bool tryTransformCSharpForEach();
     59   bool tryMergeForEach();
     60   bool tryTransformTryUsageForC();
     61 
     62   bool tryMergeTokens(ArrayRef<tok::TokenKind> Kinds, TokenType NewType);
     63 
     64   // Returns \c true if \p Tok can only be followed by an operand in JavaScript.
     65   bool precedesOperand(FormatToken *Tok);
     66 
     67   bool canPrecedeRegexLiteral(FormatToken *Prev);
     68 
     69   // Tries to parse a JavaScript Regex literal starting at the current token,
     70   // if that begins with a slash and is in a location where JavaScript allows
     71   // regex literals. Changes the current token to a regex literal and updates
     72   // its text if successful.
     73   void tryParseJSRegexLiteral();
     74 
     75   // Handles JavaScript template strings.
     76   //
     77   // JavaScript template strings use backticks ('`') as delimiters, and allow
     78   // embedding expressions nested in ${expr-here}. Template strings can be
     79   // nested recursively, i.e. expressions can contain template strings in turn.
     80   //
     81   // The code below parses starting from a backtick, up to a closing backtick or
     82   // an opening ${. It also maintains a stack of lexing contexts to handle
     83   // nested template parts by balancing curly braces.
     84   void handleTemplateStrings();
     85 
     86   void handleCSharpVerbatimAndInterpolatedStrings();
     87 
     88   void tryParsePythonComment();
     89 
     90   bool tryMerge_TMacro();
     91 
     92   bool tryMergeConflictMarkers();
     93 
     94   FormatToken *getStashedToken();
     95 
     96   FormatToken *getNextToken();
     97 
     98   FormatToken *FormatTok;
     99   bool IsFirstToken;
    100   std::stack<LexerState> StateStack;
    101   unsigned Column;
    102   unsigned TrailingWhitespace;
    103   std::unique_ptr<Lexer> Lex;
    104   const SourceManager &SourceMgr;
    105   FileID ID;
    106   const FormatStyle &Style;
    107   IdentifierTable &IdentTable;
    108   AdditionalKeywords Keywords;
    109   encoding::Encoding Encoding;
    110   llvm::SpecificBumpPtrAllocator<FormatToken> &Allocator;
    111   // Index (in 'Tokens') of the last token that starts a new line.
    112   unsigned FirstInLineIndex;
    113   SmallVector<FormatToken *, 16> Tokens;
    114 
    115   llvm::SmallMapVector<IdentifierInfo *, TokenType, 8> Macros;
    116 
    117   bool FormattingDisabled;
    118 
    119   llvm::Regex MacroBlockBeginRegex;
    120   llvm::Regex MacroBlockEndRegex;
    121 
    122   // Targets that may appear inside a C# attribute.
    123   static const llvm::StringSet<> CSharpAttributeTargets;
    124 
    125   void readRawToken(FormatToken &Tok);
    126 
    127   void resetLexer(unsigned Offset);
    128 };
    129 
    130 } // namespace format
    131 } // namespace clang
    132 
    133 #endif
    134