Home | History | Annotate | Line # | Download | only in Support
      1 //===- YAMLParser.cpp - Simple YAML parser --------------------------------===//
      2 //
      3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
      4 // See https://llvm.org/LICENSE.txt for license information.
      5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
      6 //
      7 //===----------------------------------------------------------------------===//
      8 //
      9 //  This file implements a YAML parser.
     10 //
     11 //===----------------------------------------------------------------------===//
     12 
     13 #include "llvm/Support/YAMLParser.h"
     14 #include "llvm/ADT/AllocatorList.h"
     15 #include "llvm/ADT/ArrayRef.h"
     16 #include "llvm/ADT/None.h"
     17 #include "llvm/ADT/STLExtras.h"
     18 #include "llvm/ADT/SmallString.h"
     19 #include "llvm/ADT/SmallVector.h"
     20 #include "llvm/ADT/StringExtras.h"
     21 #include "llvm/ADT/StringRef.h"
     22 #include "llvm/ADT/Twine.h"
     23 #include "llvm/Support/Compiler.h"
     24 #include "llvm/Support/ErrorHandling.h"
     25 #include "llvm/Support/MemoryBuffer.h"
     26 #include "llvm/Support/SMLoc.h"
     27 #include "llvm/Support/SourceMgr.h"
     28 #include "llvm/Support/Unicode.h"
     29 #include "llvm/Support/raw_ostream.h"
     30 #include <algorithm>
     31 #include <cassert>
     32 #include <cstddef>
     33 #include <cstdint>
     34 #include <map>
     35 #include <memory>
     36 #include <string>
     37 #include <system_error>
     38 #include <utility>
     39 
     40 using namespace llvm;
     41 using namespace yaml;
     42 
     43 enum UnicodeEncodingForm {
     44   UEF_UTF32_LE, ///< UTF-32 Little Endian
     45   UEF_UTF32_BE, ///< UTF-32 Big Endian
     46   UEF_UTF16_LE, ///< UTF-16 Little Endian
     47   UEF_UTF16_BE, ///< UTF-16 Big Endian
     48   UEF_UTF8,     ///< UTF-8 or ascii.
     49   UEF_Unknown   ///< Not a valid Unicode encoding.
     50 };
     51 
     52 /// EncodingInfo - Holds the encoding type and length of the byte order mark if
     53 ///                it exists. Length is in {0, 2, 3, 4}.
     54 using EncodingInfo = std::pair<UnicodeEncodingForm, unsigned>;
     55 
     56 /// getUnicodeEncoding - Reads up to the first 4 bytes to determine the Unicode
     57 ///                      encoding form of \a Input.
     58 ///
     59 /// @param Input A string of length 0 or more.
     60 /// @returns An EncodingInfo indicating the Unicode encoding form of the input
     61 ///          and how long the byte order mark is if one exists.
     62 static EncodingInfo getUnicodeEncoding(StringRef Input) {
     63   if (Input.empty())
     64     return std::make_pair(UEF_Unknown, 0);
     65 
     66   switch (uint8_t(Input[0])) {
     67   case 0x00:
     68     if (Input.size() >= 4) {
     69       if (  Input[1] == 0
     70          && uint8_t(Input[2]) == 0xFE
     71          && uint8_t(Input[3]) == 0xFF)
     72         return std::make_pair(UEF_UTF32_BE, 4);
     73       if (Input[1] == 0 && Input[2] == 0 && Input[3] != 0)
     74         return std::make_pair(UEF_UTF32_BE, 0);
     75     }
     76 
     77     if (Input.size() >= 2 && Input[1] != 0)
     78       return std::make_pair(UEF_UTF16_BE, 0);
     79     return std::make_pair(UEF_Unknown, 0);
     80   case 0xFF:
     81     if (  Input.size() >= 4
     82        && uint8_t(Input[1]) == 0xFE
     83        && Input[2] == 0
     84        && Input[3] == 0)
     85       return std::make_pair(UEF_UTF32_LE, 4);
     86 
     87     if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFE)
     88       return std::make_pair(UEF_UTF16_LE, 2);
     89     return std::make_pair(UEF_Unknown, 0);
     90   case 0xFE:
     91     if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFF)
     92       return std::make_pair(UEF_UTF16_BE, 2);
     93     return std::make_pair(UEF_Unknown, 0);
     94   case 0xEF:
     95     if (  Input.size() >= 3
     96        && uint8_t(Input[1]) == 0xBB
     97        && uint8_t(Input[2]) == 0xBF)
     98       return std::make_pair(UEF_UTF8, 3);
     99     return std::make_pair(UEF_Unknown, 0);
    100   }
    101 
    102   // It could still be utf-32 or utf-16.
    103   if (Input.size() >= 4 && Input[1] == 0 && Input[2] == 0 && Input[3] == 0)
    104     return std::make_pair(UEF_UTF32_LE, 0);
    105 
    106   if (Input.size() >= 2 && Input[1] == 0)
    107     return std::make_pair(UEF_UTF16_LE, 0);
    108 
    109   return std::make_pair(UEF_UTF8, 0);
    110 }
    111 
    112 /// Pin the vtables to this file.
    113 void Node::anchor() {}
    114 void NullNode::anchor() {}
    115 void ScalarNode::anchor() {}
    116 void BlockScalarNode::anchor() {}
    117 void KeyValueNode::anchor() {}
    118 void MappingNode::anchor() {}
    119 void SequenceNode::anchor() {}
    120 void AliasNode::anchor() {}
    121 
    122 namespace llvm {
    123 namespace yaml {
    124 
    125 /// Token - A single YAML token.
    126 struct Token {
    127   enum TokenKind {
    128     TK_Error, // Uninitialized token.
    129     TK_StreamStart,
    130     TK_StreamEnd,
    131     TK_VersionDirective,
    132     TK_TagDirective,
    133     TK_DocumentStart,
    134     TK_DocumentEnd,
    135     TK_BlockEntry,
    136     TK_BlockEnd,
    137     TK_BlockSequenceStart,
    138     TK_BlockMappingStart,
    139     TK_FlowEntry,
    140     TK_FlowSequenceStart,
    141     TK_FlowSequenceEnd,
    142     TK_FlowMappingStart,
    143     TK_FlowMappingEnd,
    144     TK_Key,
    145     TK_Value,
    146     TK_Scalar,
    147     TK_BlockScalar,
    148     TK_Alias,
    149     TK_Anchor,
    150     TK_Tag
    151   } Kind = TK_Error;
    152 
    153   /// A string of length 0 or more whose begin() points to the logical location
    154   /// of the token in the input.
    155   StringRef Range;
    156 
    157   /// The value of a block scalar node.
    158   std::string Value;
    159 
    160   Token() = default;
    161 };
    162 
    163 } // end namespace yaml
    164 } // end namespace llvm
    165 
    166 using TokenQueueT = BumpPtrList<Token>;
    167 
    168 namespace {
    169 
    170 /// This struct is used to track simple keys.
    171 ///
    172 /// Simple keys are handled by creating an entry in SimpleKeys for each Token
    173 /// which could legally be the start of a simple key. When peekNext is called,
    174 /// if the Token To be returned is referenced by a SimpleKey, we continue
    175 /// tokenizing until that potential simple key has either been found to not be
    176 /// a simple key (we moved on to the next line or went further than 1024 chars).
    177 /// Or when we run into a Value, and then insert a Key token (and possibly
    178 /// others) before the SimpleKey's Tok.
    179 struct SimpleKey {
    180   TokenQueueT::iterator Tok;
    181   unsigned Column = 0;
    182   unsigned Line = 0;
    183   unsigned FlowLevel = 0;
    184   bool IsRequired = false;
    185 
    186   bool operator ==(const SimpleKey &Other) {
    187     return Tok == Other.Tok;
    188   }
    189 };
    190 
    191 } // end anonymous namespace
    192 
    193 /// The Unicode scalar value of a UTF-8 minimal well-formed code unit
    194 ///        subsequence and the subsequence's length in code units (uint8_t).
    195 ///        A length of 0 represents an error.
    196 using UTF8Decoded = std::pair<uint32_t, unsigned>;
    197 
    198 static UTF8Decoded decodeUTF8(StringRef Range) {
    199   StringRef::iterator Position= Range.begin();
    200   StringRef::iterator End = Range.end();
    201   // 1 byte: [0x00, 0x7f]
    202   // Bit pattern: 0xxxxxxx
    203   if (Position < End && (*Position & 0x80) == 0) {
    204     return std::make_pair(*Position, 1);
    205   }
    206   // 2 bytes: [0x80, 0x7ff]
    207   // Bit pattern: 110xxxxx 10xxxxxx
    208   if (Position + 1 < End && ((*Position & 0xE0) == 0xC0) &&
    209       ((*(Position + 1) & 0xC0) == 0x80)) {
    210     uint32_t codepoint = ((*Position & 0x1F) << 6) |
    211                           (*(Position + 1) & 0x3F);
    212     if (codepoint >= 0x80)
    213       return std::make_pair(codepoint, 2);
    214   }
    215   // 3 bytes: [0x8000, 0xffff]
    216   // Bit pattern: 1110xxxx 10xxxxxx 10xxxxxx
    217   if (Position + 2 < End && ((*Position & 0xF0) == 0xE0) &&
    218       ((*(Position + 1) & 0xC0) == 0x80) &&
    219       ((*(Position + 2) & 0xC0) == 0x80)) {
    220     uint32_t codepoint = ((*Position & 0x0F) << 12) |
    221                          ((*(Position + 1) & 0x3F) << 6) |
    222                           (*(Position + 2) & 0x3F);
    223     // Codepoints between 0xD800 and 0xDFFF are invalid, as
    224     // they are high / low surrogate halves used by UTF-16.
    225     if (codepoint >= 0x800 &&
    226         (codepoint < 0xD800 || codepoint > 0xDFFF))
    227       return std::make_pair(codepoint, 3);
    228   }
    229   // 4 bytes: [0x10000, 0x10FFFF]
    230   // Bit pattern: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    231   if (Position + 3 < End && ((*Position & 0xF8) == 0xF0) &&
    232       ((*(Position + 1) & 0xC0) == 0x80) &&
    233       ((*(Position + 2) & 0xC0) == 0x80) &&
    234       ((*(Position + 3) & 0xC0) == 0x80)) {
    235     uint32_t codepoint = ((*Position & 0x07) << 18) |
    236                          ((*(Position + 1) & 0x3F) << 12) |
    237                          ((*(Position + 2) & 0x3F) << 6) |
    238                           (*(Position + 3) & 0x3F);
    239     if (codepoint >= 0x10000 && codepoint <= 0x10FFFF)
    240       return std::make_pair(codepoint, 4);
    241   }
    242   return std::make_pair(0, 0);
    243 }
    244 
    245 namespace llvm {
    246 namespace yaml {
    247 
    248 /// Scans YAML tokens from a MemoryBuffer.
    249 class Scanner {
    250 public:
    251   Scanner(StringRef Input, SourceMgr &SM, bool ShowColors = true,
    252           std::error_code *EC = nullptr);
    253   Scanner(MemoryBufferRef Buffer, SourceMgr &SM_, bool ShowColors = true,
    254           std::error_code *EC = nullptr);
    255 
    256   /// Parse the next token and return it without popping it.
    257   Token &peekNext();
    258 
    259   /// Parse the next token and pop it from the queue.
    260   Token getNext();
    261 
    262   void printError(SMLoc Loc, SourceMgr::DiagKind Kind, const Twine &Message,
    263                   ArrayRef<SMRange> Ranges = None) {
    264     SM.PrintMessage(Loc, Kind, Message, Ranges, /* FixIts= */ None, ShowColors);
    265   }
    266 
    267   void setError(const Twine &Message, StringRef::iterator Position) {
    268     if (Position >= End)
    269       Position = End - 1;
    270 
    271     // propagate the error if possible
    272     if (EC)
    273       *EC = make_error_code(std::errc::invalid_argument);
    274 
    275     // Don't print out more errors after the first one we encounter. The rest
    276     // are just the result of the first, and have no meaning.
    277     if (!Failed)
    278       printError(SMLoc::getFromPointer(Position), SourceMgr::DK_Error, Message);
    279     Failed = true;
    280   }
    281 
    282   /// Returns true if an error occurred while parsing.
    283   bool failed() {
    284     return Failed;
    285   }
    286 
    287 private:
    288   void init(MemoryBufferRef Buffer);
    289 
    290   StringRef currentInput() {
    291     return StringRef(Current, End - Current);
    292   }
    293 
    294   /// Decode a UTF-8 minimal well-formed code unit subsequence starting
    295   ///        at \a Position.
    296   ///
    297   /// If the UTF-8 code units starting at Position do not form a well-formed
    298   /// code unit subsequence, then the Unicode scalar value is 0, and the length
    299   /// is 0.
    300   UTF8Decoded decodeUTF8(StringRef::iterator Position) {
    301     return ::decodeUTF8(StringRef(Position, End - Position));
    302   }
    303 
    304   // The following functions are based on the gramar rules in the YAML spec. The
    305   // style of the function names it meant to closely match how they are written
    306   // in the spec. The number within the [] is the number of the grammar rule in
    307   // the spec.
    308   //
    309   // See 4.2 [Production Naming Conventions] for the meaning of the prefixes.
    310   //
    311   // c-
    312   //   A production starting and ending with a special character.
    313   // b-
    314   //   A production matching a single line break.
    315   // nb-
    316   //   A production starting and ending with a non-break character.
    317   // s-
    318   //   A production starting and ending with a white space character.
    319   // ns-
    320   //   A production starting and ending with a non-space character.
    321   // l-
    322   //   A production matching complete line(s).
    323 
    324   /// Skip a single nb-char[27] starting at Position.
    325   ///
    326   /// A nb-char is 0x9 | [0x20-0x7E] | 0x85 | [0xA0-0xD7FF] | [0xE000-0xFEFE]
    327   ///                  | [0xFF00-0xFFFD] | [0x10000-0x10FFFF]
    328   ///
    329   /// @returns The code unit after the nb-char, or Position if it's not an
    330   ///          nb-char.
    331   StringRef::iterator skip_nb_char(StringRef::iterator Position);
    332 
    333   /// Skip a single b-break[28] starting at Position.
    334   ///
    335   /// A b-break is 0xD 0xA | 0xD | 0xA
    336   ///
    337   /// @returns The code unit after the b-break, or Position if it's not a
    338   ///          b-break.
    339   StringRef::iterator skip_b_break(StringRef::iterator Position);
    340 
    341   /// Skip a single s-space[31] starting at Position.
    342   ///
    343   /// An s-space is 0x20
    344   ///
    345   /// @returns The code unit after the s-space, or Position if it's not a
    346   ///          s-space.
    347   StringRef::iterator skip_s_space(StringRef::iterator Position);
    348 
    349   /// Skip a single s-white[33] starting at Position.
    350   ///
    351   /// A s-white is 0x20 | 0x9
    352   ///
    353   /// @returns The code unit after the s-white, or Position if it's not a
    354   ///          s-white.
    355   StringRef::iterator skip_s_white(StringRef::iterator Position);
    356 
    357   /// Skip a single ns-char[34] starting at Position.
    358   ///
    359   /// A ns-char is nb-char - s-white
    360   ///
    361   /// @returns The code unit after the ns-char, or Position if it's not a
    362   ///          ns-char.
    363   StringRef::iterator skip_ns_char(StringRef::iterator Position);
    364 
    365   using SkipWhileFunc = StringRef::iterator (Scanner::*)(StringRef::iterator);
    366 
    367   /// Skip minimal well-formed code unit subsequences until Func
    368   ///        returns its input.
    369   ///
    370   /// @returns The code unit after the last minimal well-formed code unit
    371   ///          subsequence that Func accepted.
    372   StringRef::iterator skip_while( SkipWhileFunc Func
    373                                 , StringRef::iterator Position);
    374 
    375   /// Skip minimal well-formed code unit subsequences until Func returns its
    376   /// input.
    377   void advanceWhile(SkipWhileFunc Func);
    378 
    379   /// Scan ns-uri-char[39]s starting at Cur.
    380   ///
    381   /// This updates Cur and Column while scanning.
    382   void scan_ns_uri_char();
    383 
    384   /// Consume a minimal well-formed code unit subsequence starting at
    385   ///        \a Cur. Return false if it is not the same Unicode scalar value as
    386   ///        \a Expected. This updates \a Column.
    387   bool consume(uint32_t Expected);
    388 
    389   /// Skip \a Distance UTF-8 code units. Updates \a Cur and \a Column.
    390   void skip(uint32_t Distance);
    391 
    392   /// Return true if the minimal well-formed code unit subsequence at
    393   ///        Pos is whitespace or a new line
    394   bool isBlankOrBreak(StringRef::iterator Position);
    395 
    396   /// Consume a single b-break[28] if it's present at the current position.
    397   ///
    398   /// Return false if the code unit at the current position isn't a line break.
    399   bool consumeLineBreakIfPresent();
    400 
    401   /// If IsSimpleKeyAllowed, create and push_back a new SimpleKey.
    402   void saveSimpleKeyCandidate( TokenQueueT::iterator Tok
    403                              , unsigned AtColumn
    404                              , bool IsRequired);
    405 
    406   /// Remove simple keys that can no longer be valid simple keys.
    407   ///
    408   /// Invalid simple keys are not on the current line or are further than 1024
    409   /// columns back.
    410   void removeStaleSimpleKeyCandidates();
    411 
    412   /// Remove all simple keys on FlowLevel \a Level.
    413   void removeSimpleKeyCandidatesOnFlowLevel(unsigned Level);
    414 
    415   /// Unroll indentation in \a Indents back to \a Col. Creates BlockEnd
    416   ///        tokens if needed.
    417   bool unrollIndent(int ToColumn);
    418 
    419   /// Increase indent to \a Col. Creates \a Kind token at \a InsertPoint
    420   ///        if needed.
    421   bool rollIndent( int ToColumn
    422                  , Token::TokenKind Kind
    423                  , TokenQueueT::iterator InsertPoint);
    424 
    425   /// Skip a single-line comment when the comment starts at the current
    426   /// position of the scanner.
    427   void skipComment();
    428 
    429   /// Skip whitespace and comments until the start of the next token.
    430   void scanToNextToken();
    431 
    432   /// Must be the first token generated.
    433   bool scanStreamStart();
    434 
    435   /// Generate tokens needed to close out the stream.
    436   bool scanStreamEnd();
    437 
    438   /// Scan a %BLAH directive.
    439   bool scanDirective();
    440 
    441   /// Scan a ... or ---.
    442   bool scanDocumentIndicator(bool IsStart);
    443 
    444   /// Scan a [ or { and generate the proper flow collection start token.
    445   bool scanFlowCollectionStart(bool IsSequence);
    446 
    447   /// Scan a ] or } and generate the proper flow collection end token.
    448   bool scanFlowCollectionEnd(bool IsSequence);
    449 
    450   /// Scan the , that separates entries in a flow collection.
    451   bool scanFlowEntry();
    452 
    453   /// Scan the - that starts block sequence entries.
    454   bool scanBlockEntry();
    455 
    456   /// Scan an explicit ? indicating a key.
    457   bool scanKey();
    458 
    459   /// Scan an explicit : indicating a value.
    460   bool scanValue();
    461 
    462   /// Scan a quoted scalar.
    463   bool scanFlowScalar(bool IsDoubleQuoted);
    464 
    465   /// Scan an unquoted scalar.
    466   bool scanPlainScalar();
    467 
    468   /// Scan an Alias or Anchor starting with * or &.
    469   bool scanAliasOrAnchor(bool IsAlias);
    470 
    471   /// Scan a block scalar starting with | or >.
    472   bool scanBlockScalar(bool IsLiteral);
    473 
    474   /// Scan a chomping indicator in a block scalar header.
    475   char scanBlockChompingIndicator();
    476 
    477   /// Scan an indentation indicator in a block scalar header.
    478   unsigned scanBlockIndentationIndicator();
    479 
    480   /// Scan a block scalar header.
    481   ///
    482   /// Return false if an error occurred.
    483   bool scanBlockScalarHeader(char &ChompingIndicator, unsigned &IndentIndicator,
    484                              bool &IsDone);
    485 
    486   /// Look for the indentation level of a block scalar.
    487   ///
    488   /// Return false if an error occurred.
    489   bool findBlockScalarIndent(unsigned &BlockIndent, unsigned BlockExitIndent,
    490                              unsigned &LineBreaks, bool &IsDone);
    491 
    492   /// Scan the indentation of a text line in a block scalar.
    493   ///
    494   /// Return false if an error occurred.
    495   bool scanBlockScalarIndent(unsigned BlockIndent, unsigned BlockExitIndent,
    496                              bool &IsDone);
    497 
    498   /// Scan a tag of the form !stuff.
    499   bool scanTag();
    500 
    501   /// Dispatch to the next scanning function based on \a *Cur.
    502   bool fetchMoreTokens();
    503 
    504   /// The SourceMgr used for diagnostics and buffer management.
    505   SourceMgr &SM;
    506 
    507   /// The original input.
    508   MemoryBufferRef InputBuffer;
    509 
    510   /// The current position of the scanner.
    511   StringRef::iterator Current;
    512 
    513   /// The end of the input (one past the last character).
    514   StringRef::iterator End;
    515 
    516   /// Current YAML indentation level in spaces.
    517   int Indent;
    518 
    519   /// Current column number in Unicode code points.
    520   unsigned Column;
    521 
    522   /// Current line number.
    523   unsigned Line;
    524 
    525   /// How deep we are in flow style containers. 0 Means at block level.
    526   unsigned FlowLevel;
    527 
    528   /// Are we at the start of the stream?
    529   bool IsStartOfStream;
    530 
    531   /// Can the next token be the start of a simple key?
    532   bool IsSimpleKeyAllowed;
    533 
    534   /// True if an error has occurred.
    535   bool Failed;
    536 
    537   /// Should colors be used when printing out the diagnostic messages?
    538   bool ShowColors;
    539 
    540   /// Queue of tokens. This is required to queue up tokens while looking
    541   ///        for the end of a simple key. And for cases where a single character
    542   ///        can produce multiple tokens (e.g. BlockEnd).
    543   TokenQueueT TokenQueue;
    544 
    545   /// Indentation levels.
    546   SmallVector<int, 4> Indents;
    547 
    548   /// Potential simple keys.
    549   SmallVector<SimpleKey, 4> SimpleKeys;
    550 
    551   std::error_code *EC;
    552 };
    553 
    554 } // end namespace yaml
    555 } // end namespace llvm
    556 
    557 /// encodeUTF8 - Encode \a UnicodeScalarValue in UTF-8 and append it to result.
    558 static void encodeUTF8( uint32_t UnicodeScalarValue
    559                       , SmallVectorImpl<char> &Result) {
    560   if (UnicodeScalarValue <= 0x7F) {
    561     Result.push_back(UnicodeScalarValue & 0x7F);
    562   } else if (UnicodeScalarValue <= 0x7FF) {
    563     uint8_t FirstByte = 0xC0 | ((UnicodeScalarValue & 0x7C0) >> 6);
    564     uint8_t SecondByte = 0x80 | (UnicodeScalarValue & 0x3F);
    565     Result.push_back(FirstByte);
    566     Result.push_back(SecondByte);
    567   } else if (UnicodeScalarValue <= 0xFFFF) {
    568     uint8_t FirstByte = 0xE0 | ((UnicodeScalarValue & 0xF000) >> 12);
    569     uint8_t SecondByte = 0x80 | ((UnicodeScalarValue & 0xFC0) >> 6);
    570     uint8_t ThirdByte = 0x80 | (UnicodeScalarValue & 0x3F);
    571     Result.push_back(FirstByte);
    572     Result.push_back(SecondByte);
    573     Result.push_back(ThirdByte);
    574   } else if (UnicodeScalarValue <= 0x10FFFF) {
    575     uint8_t FirstByte = 0xF0 | ((UnicodeScalarValue & 0x1F0000) >> 18);
    576     uint8_t SecondByte = 0x80 | ((UnicodeScalarValue & 0x3F000) >> 12);
    577     uint8_t ThirdByte = 0x80 | ((UnicodeScalarValue & 0xFC0) >> 6);
    578     uint8_t FourthByte = 0x80 | (UnicodeScalarValue & 0x3F);
    579     Result.push_back(FirstByte);
    580     Result.push_back(SecondByte);
    581     Result.push_back(ThirdByte);
    582     Result.push_back(FourthByte);
    583   }
    584 }
    585 
    586 bool yaml::dumpTokens(StringRef Input, raw_ostream &OS) {
    587   SourceMgr SM;
    588   Scanner scanner(Input, SM);
    589   while (true) {
    590     Token T = scanner.getNext();
    591     switch (T.Kind) {
    592     case Token::TK_StreamStart:
    593       OS << "Stream-Start: ";
    594       break;
    595     case Token::TK_StreamEnd:
    596       OS << "Stream-End: ";
    597       break;
    598     case Token::TK_VersionDirective:
    599       OS << "Version-Directive: ";
    600       break;
    601     case Token::TK_TagDirective:
    602       OS << "Tag-Directive: ";
    603       break;
    604     case Token::TK_DocumentStart:
    605       OS << "Document-Start: ";
    606       break;
    607     case Token::TK_DocumentEnd:
    608       OS << "Document-End: ";
    609       break;
    610     case Token::TK_BlockEntry:
    611       OS << "Block-Entry: ";
    612       break;
    613     case Token::TK_BlockEnd:
    614       OS << "Block-End: ";
    615       break;
    616     case Token::TK_BlockSequenceStart:
    617       OS << "Block-Sequence-Start: ";
    618       break;
    619     case Token::TK_BlockMappingStart:
    620       OS << "Block-Mapping-Start: ";
    621       break;
    622     case Token::TK_FlowEntry:
    623       OS << "Flow-Entry: ";
    624       break;
    625     case Token::TK_FlowSequenceStart:
    626       OS << "Flow-Sequence-Start: ";
    627       break;
    628     case Token::TK_FlowSequenceEnd:
    629       OS << "Flow-Sequence-End: ";
    630       break;
    631     case Token::TK_FlowMappingStart:
    632       OS << "Flow-Mapping-Start: ";
    633       break;
    634     case Token::TK_FlowMappingEnd:
    635       OS << "Flow-Mapping-End: ";
    636       break;
    637     case Token::TK_Key:
    638       OS << "Key: ";
    639       break;
    640     case Token::TK_Value:
    641       OS << "Value: ";
    642       break;
    643     case Token::TK_Scalar:
    644       OS << "Scalar: ";
    645       break;
    646     case Token::TK_BlockScalar:
    647       OS << "Block Scalar: ";
    648       break;
    649     case Token::TK_Alias:
    650       OS << "Alias: ";
    651       break;
    652     case Token::TK_Anchor:
    653       OS << "Anchor: ";
    654       break;
    655     case Token::TK_Tag:
    656       OS << "Tag: ";
    657       break;
    658     case Token::TK_Error:
    659       break;
    660     }
    661     OS << T.Range << "\n";
    662     if (T.Kind == Token::TK_StreamEnd)
    663       break;
    664     else if (T.Kind == Token::TK_Error)
    665       return false;
    666   }
    667   return true;
    668 }
    669 
    670 bool yaml::scanTokens(StringRef Input) {
    671   SourceMgr SM;
    672   Scanner scanner(Input, SM);
    673   while (true) {
    674     Token T = scanner.getNext();
    675     if (T.Kind == Token::TK_StreamEnd)
    676       break;
    677     else if (T.Kind == Token::TK_Error)
    678       return false;
    679   }
    680   return true;
    681 }
    682 
    683 std::string yaml::escape(StringRef Input, bool EscapePrintable) {
    684   std::string EscapedInput;
    685   for (StringRef::iterator i = Input.begin(), e = Input.end(); i != e; ++i) {
    686     if (*i == '\\')
    687       EscapedInput += "\\\\";
    688     else if (*i == '"')
    689       EscapedInput += "\\\"";
    690     else if (*i == 0)
    691       EscapedInput += "\\0";
    692     else if (*i == 0x07)
    693       EscapedInput += "\\a";
    694     else if (*i == 0x08)
    695       EscapedInput += "\\b";
    696     else if (*i == 0x09)
    697       EscapedInput += "\\t";
    698     else if (*i == 0x0A)
    699       EscapedInput += "\\n";
    700     else if (*i == 0x0B)
    701       EscapedInput += "\\v";
    702     else if (*i == 0x0C)
    703       EscapedInput += "\\f";
    704     else if (*i == 0x0D)
    705       EscapedInput += "\\r";
    706     else if (*i == 0x1B)
    707       EscapedInput += "\\e";
    708     else if ((unsigned char)*i < 0x20) { // Control characters not handled above.
    709       std::string HexStr = utohexstr(*i);
    710       EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr;
    711     } else if (*i & 0x80) { // UTF-8 multiple code unit subsequence.
    712       UTF8Decoded UnicodeScalarValue
    713         = decodeUTF8(StringRef(i, Input.end() - i));
    714       if (UnicodeScalarValue.second == 0) {
    715         // Found invalid char.
    716         SmallString<4> Val;
    717         encodeUTF8(0xFFFD, Val);
    718         llvm::append_range(EscapedInput, Val);
    719         // FIXME: Error reporting.
    720         return EscapedInput;
    721       }
    722       if (UnicodeScalarValue.first == 0x85)
    723         EscapedInput += "\\N";
    724       else if (UnicodeScalarValue.first == 0xA0)
    725         EscapedInput += "\\_";
    726       else if (UnicodeScalarValue.first == 0x2028)
    727         EscapedInput += "\\L";
    728       else if (UnicodeScalarValue.first == 0x2029)
    729         EscapedInput += "\\P";
    730       else if (!EscapePrintable &&
    731                sys::unicode::isPrintable(UnicodeScalarValue.first))
    732         EscapedInput += StringRef(i, UnicodeScalarValue.second);
    733       else {
    734         std::string HexStr = utohexstr(UnicodeScalarValue.first);
    735         if (HexStr.size() <= 2)
    736           EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr;
    737         else if (HexStr.size() <= 4)
    738           EscapedInput += "\\u" + std::string(4 - HexStr.size(), '0') + HexStr;
    739         else if (HexStr.size() <= 8)
    740           EscapedInput += "\\U" + std::string(8 - HexStr.size(), '0') + HexStr;
    741       }
    742       i += UnicodeScalarValue.second - 1;
    743     } else
    744       EscapedInput.push_back(*i);
    745   }
    746   return EscapedInput;
    747 }
    748 
    749 llvm::Optional<bool> yaml::parseBool(StringRef S) {
    750   switch (S.size()) {
    751   case 1:
    752     switch (S.front()) {
    753     case 'y':
    754     case 'Y':
    755       return true;
    756     case 'n':
    757     case 'N':
    758       return false;
    759     default:
    760       return None;
    761     }
    762   case 2:
    763     switch (S.front()) {
    764     case 'O':
    765       if (S[1] == 'N') // ON
    766         return true;
    767       LLVM_FALLTHROUGH;
    768     case 'o':
    769       if (S[1] == 'n') //[Oo]n
    770         return true;
    771       return None;
    772     case 'N':
    773       if (S[1] == 'O') // NO
    774         return false;
    775       LLVM_FALLTHROUGH;
    776     case 'n':
    777       if (S[1] == 'o') //[Nn]o
    778         return false;
    779       return None;
    780     default:
    781       return None;
    782     }
    783   case 3:
    784     switch (S.front()) {
    785     case 'O':
    786       if (S.drop_front() == "FF") // OFF
    787         return false;
    788       LLVM_FALLTHROUGH;
    789     case 'o':
    790       if (S.drop_front() == "ff") //[Oo]ff
    791         return false;
    792       return None;
    793     case 'Y':
    794       if (S.drop_front() == "ES") // YES
    795         return true;
    796       LLVM_FALLTHROUGH;
    797     case 'y':
    798       if (S.drop_front() == "es") //[Yy]es
    799         return true;
    800       return None;
    801     default:
    802       return None;
    803     }
    804   case 4:
    805     switch (S.front()) {
    806     case 'T':
    807       if (S.drop_front() == "RUE") // TRUE
    808         return true;
    809       LLVM_FALLTHROUGH;
    810     case 't':
    811       if (S.drop_front() == "rue") //[Tt]rue
    812         return true;
    813       return None;
    814     default:
    815       return None;
    816     }
    817   case 5:
    818     switch (S.front()) {
    819     case 'F':
    820       if (S.drop_front() == "ALSE") // FALSE
    821         return false;
    822       LLVM_FALLTHROUGH;
    823     case 'f':
    824       if (S.drop_front() == "alse") //[Ff]alse
    825         return false;
    826       return None;
    827     default:
    828       return None;
    829     }
    830   default:
    831     return None;
    832   }
    833 }
    834 
    835 Scanner::Scanner(StringRef Input, SourceMgr &sm, bool ShowColors,
    836                  std::error_code *EC)
    837     : SM(sm), ShowColors(ShowColors), EC(EC) {
    838   init(MemoryBufferRef(Input, "YAML"));
    839 }
    840 
    841 Scanner::Scanner(MemoryBufferRef Buffer, SourceMgr &SM_, bool ShowColors,
    842                  std::error_code *EC)
    843     : SM(SM_), ShowColors(ShowColors), EC(EC) {
    844   init(Buffer);
    845 }
    846 
    847 void Scanner::init(MemoryBufferRef Buffer) {
    848   InputBuffer = Buffer;
    849   Current = InputBuffer.getBufferStart();
    850   End = InputBuffer.getBufferEnd();
    851   Indent = -1;
    852   Column = 0;
    853   Line = 0;
    854   FlowLevel = 0;
    855   IsStartOfStream = true;
    856   IsSimpleKeyAllowed = true;
    857   Failed = false;
    858   std::unique_ptr<MemoryBuffer> InputBufferOwner =
    859       MemoryBuffer::getMemBuffer(Buffer, /*RequiresNullTerminator=*/false);
    860   SM.AddNewSourceBuffer(std::move(InputBufferOwner), SMLoc());
    861 }
    862 
    863 Token &Scanner::peekNext() {
    864   // If the current token is a possible simple key, keep parsing until we
    865   // can confirm.
    866   bool NeedMore = false;
    867   while (true) {
    868     if (TokenQueue.empty() || NeedMore) {
    869       if (!fetchMoreTokens()) {
    870         TokenQueue.clear();
    871         SimpleKeys.clear();
    872         TokenQueue.push_back(Token());
    873         return TokenQueue.front();
    874       }
    875     }
    876     assert(!TokenQueue.empty() &&
    877             "fetchMoreTokens lied about getting tokens!");
    878 
    879     removeStaleSimpleKeyCandidates();
    880     SimpleKey SK;
    881     SK.Tok = TokenQueue.begin();
    882     if (!is_contained(SimpleKeys, SK))
    883       break;
    884     else
    885       NeedMore = true;
    886   }
    887   return TokenQueue.front();
    888 }
    889 
    890 Token Scanner::getNext() {
    891   Token Ret = peekNext();
    892   // TokenQueue can be empty if there was an error getting the next token.
    893   if (!TokenQueue.empty())
    894     TokenQueue.pop_front();
    895 
    896   // There cannot be any referenced Token's if the TokenQueue is empty. So do a
    897   // quick deallocation of them all.
    898   if (TokenQueue.empty())
    899     TokenQueue.resetAlloc();
    900 
    901   return Ret;
    902 }
    903 
    904 StringRef::iterator Scanner::skip_nb_char(StringRef::iterator Position) {
    905   if (Position == End)
    906     return Position;
    907   // Check 7 bit c-printable - b-char.
    908   if (   *Position == 0x09
    909       || (*Position >= 0x20 && *Position <= 0x7E))
    910     return Position + 1;
    911 
    912   // Check for valid UTF-8.
    913   if (uint8_t(*Position) & 0x80) {
    914     UTF8Decoded u8d = decodeUTF8(Position);
    915     if (   u8d.second != 0
    916         && u8d.first != 0xFEFF
    917         && ( u8d.first == 0x85
    918           || ( u8d.first >= 0xA0
    919             && u8d.first <= 0xD7FF)
    920           || ( u8d.first >= 0xE000
    921             && u8d.first <= 0xFFFD)
    922           || ( u8d.first >= 0x10000
    923             && u8d.first <= 0x10FFFF)))
    924       return Position + u8d.second;
    925   }
    926   return Position;
    927 }
    928 
    929 StringRef::iterator Scanner::skip_b_break(StringRef::iterator Position) {
    930   if (Position == End)
    931     return Position;
    932   if (*Position == 0x0D) {
    933     if (Position + 1 != End && *(Position + 1) == 0x0A)
    934       return Position + 2;
    935     return Position + 1;
    936   }
    937 
    938   if (*Position == 0x0A)
    939     return Position + 1;
    940   return Position;
    941 }
    942 
    943 StringRef::iterator Scanner::skip_s_space(StringRef::iterator Position) {
    944   if (Position == End)
    945     return Position;
    946   if (*Position == ' ')
    947     return Position + 1;
    948   return Position;
    949 }
    950 
    951 StringRef::iterator Scanner::skip_s_white(StringRef::iterator Position) {
    952   if (Position == End)
    953     return Position;
    954   if (*Position == ' ' || *Position == '\t')
    955     return Position + 1;
    956   return Position;
    957 }
    958 
    959 StringRef::iterator Scanner::skip_ns_char(StringRef::iterator Position) {
    960   if (Position == End)
    961     return Position;
    962   if (*Position == ' ' || *Position == '\t')
    963     return Position;
    964   return skip_nb_char(Position);
    965 }
    966 
    967 StringRef::iterator Scanner::skip_while( SkipWhileFunc Func
    968                                        , StringRef::iterator Position) {
    969   while (true) {
    970     StringRef::iterator i = (this->*Func)(Position);
    971     if (i == Position)
    972       break;
    973     Position = i;
    974   }
    975   return Position;
    976 }
    977 
    978 void Scanner::advanceWhile(SkipWhileFunc Func) {
    979   auto Final = skip_while(Func, Current);
    980   Column += Final - Current;
    981   Current = Final;
    982 }
    983 
    984 static bool is_ns_hex_digit(const char C) { return isAlnum(C); }
    985 
    986 static bool is_ns_word_char(const char C) { return C == '-' || isAlpha(C); }
    987 
    988 void Scanner::scan_ns_uri_char() {
    989   while (true) {
    990     if (Current == End)
    991       break;
    992     if ((   *Current == '%'
    993           && Current + 2 < End
    994           && is_ns_hex_digit(*(Current + 1))
    995           && is_ns_hex_digit(*(Current + 2)))
    996         || is_ns_word_char(*Current)
    997         || StringRef(Current, 1).find_first_of("#;/?:@&=+$,_.!~*'()[]")
    998           != StringRef::npos) {
    999       ++Current;
   1000       ++Column;
   1001     } else
   1002       break;
   1003   }
   1004 }
   1005 
   1006 bool Scanner::consume(uint32_t Expected) {
   1007   if (Expected >= 0x80) {
   1008     setError("Cannot consume non-ascii characters", Current);
   1009     return false;
   1010   }
   1011   if (Current == End)
   1012     return false;
   1013   if (uint8_t(*Current) >= 0x80) {
   1014     setError("Cannot consume non-ascii characters", Current);
   1015     return false;
   1016   }
   1017   if (uint8_t(*Current) == Expected) {
   1018     ++Current;
   1019     ++Column;
   1020     return true;
   1021   }
   1022   return false;
   1023 }
   1024 
   1025 void Scanner::skip(uint32_t Distance) {
   1026   Current += Distance;
   1027   Column += Distance;
   1028   assert(Current <= End && "Skipped past the end");
   1029 }
   1030 
   1031 bool Scanner::isBlankOrBreak(StringRef::iterator Position) {
   1032   if (Position == End)
   1033     return false;
   1034   return *Position == ' ' || *Position == '\t' || *Position == '\r' ||
   1035          *Position == '\n';
   1036 }
   1037 
   1038 bool Scanner::consumeLineBreakIfPresent() {
   1039   auto Next = skip_b_break(Current);
   1040   if (Next == Current)
   1041     return false;
   1042   Column = 0;
   1043   ++Line;
   1044   Current = Next;
   1045   return true;
   1046 }
   1047 
   1048 void Scanner::saveSimpleKeyCandidate( TokenQueueT::iterator Tok
   1049                                     , unsigned AtColumn
   1050                                     , bool IsRequired) {
   1051   if (IsSimpleKeyAllowed) {
   1052     SimpleKey SK;
   1053     SK.Tok = Tok;
   1054     SK.Line = Line;
   1055     SK.Column = AtColumn;
   1056     SK.IsRequired = IsRequired;
   1057     SK.FlowLevel = FlowLevel;
   1058     SimpleKeys.push_back(SK);
   1059   }
   1060 }
   1061 
   1062 void Scanner::removeStaleSimpleKeyCandidates() {
   1063   for (SmallVectorImpl<SimpleKey>::iterator i = SimpleKeys.begin();
   1064                                             i != SimpleKeys.end();) {
   1065     if (i->Line != Line || i->Column + 1024 < Column) {
   1066       if (i->IsRequired)
   1067         setError( "Could not find expected : for simple key"
   1068                 , i->Tok->Range.begin());
   1069       i = SimpleKeys.erase(i);
   1070     } else
   1071       ++i;
   1072   }
   1073 }
   1074 
   1075 void Scanner::removeSimpleKeyCandidatesOnFlowLevel(unsigned Level) {
   1076   if (!SimpleKeys.empty() && (SimpleKeys.end() - 1)->FlowLevel == Level)
   1077     SimpleKeys.pop_back();
   1078 }
   1079 
   1080 bool Scanner::unrollIndent(int ToColumn) {
   1081   Token T;
   1082   // Indentation is ignored in flow.
   1083   if (FlowLevel != 0)
   1084     return true;
   1085 
   1086   while (Indent > ToColumn) {
   1087     T.Kind = Token::TK_BlockEnd;
   1088     T.Range = StringRef(Current, 1);
   1089     TokenQueue.push_back(T);
   1090     Indent = Indents.pop_back_val();
   1091   }
   1092 
   1093   return true;
   1094 }
   1095 
   1096 bool Scanner::rollIndent( int ToColumn
   1097                         , Token::TokenKind Kind
   1098                         , TokenQueueT::iterator InsertPoint) {
   1099   if (FlowLevel)
   1100     return true;
   1101   if (Indent < ToColumn) {
   1102     Indents.push_back(Indent);
   1103     Indent = ToColumn;
   1104 
   1105     Token T;
   1106     T.Kind = Kind;
   1107     T.Range = StringRef(Current, 0);
   1108     TokenQueue.insert(InsertPoint, T);
   1109   }
   1110   return true;
   1111 }
   1112 
   1113 void Scanner::skipComment() {
   1114   if (Current == End || *Current != '#')
   1115     return;
   1116   while (true) {
   1117     // This may skip more than one byte, thus Column is only incremented
   1118     // for code points.
   1119     StringRef::iterator I = skip_nb_char(Current);
   1120     if (I == Current)
   1121       break;
   1122     Current = I;
   1123     ++Column;
   1124   }
   1125 }
   1126 
   1127 void Scanner::scanToNextToken() {
   1128   while (true) {
   1129     while (Current != End && (*Current == ' ' || *Current == '\t')) {
   1130       skip(1);
   1131     }
   1132 
   1133     skipComment();
   1134 
   1135     // Skip EOL.
   1136     StringRef::iterator i = skip_b_break(Current);
   1137     if (i == Current)
   1138       break;
   1139     Current = i;
   1140     ++Line;
   1141     Column = 0;
   1142     // New lines may start a simple key.
   1143     if (!FlowLevel)
   1144       IsSimpleKeyAllowed = true;
   1145   }
   1146 }
   1147 
   1148 bool Scanner::scanStreamStart() {
   1149   IsStartOfStream = false;
   1150 
   1151   EncodingInfo EI = getUnicodeEncoding(currentInput());
   1152 
   1153   Token T;
   1154   T.Kind = Token::TK_StreamStart;
   1155   T.Range = StringRef(Current, EI.second);
   1156   TokenQueue.push_back(T);
   1157   Current += EI.second;
   1158   return true;
   1159 }
   1160 
   1161 bool Scanner::scanStreamEnd() {
   1162   // Force an ending new line if one isn't present.
   1163   if (Column != 0) {
   1164     Column = 0;
   1165     ++Line;
   1166   }
   1167 
   1168   unrollIndent(-1);
   1169   SimpleKeys.clear();
   1170   IsSimpleKeyAllowed = false;
   1171 
   1172   Token T;
   1173   T.Kind = Token::TK_StreamEnd;
   1174   T.Range = StringRef(Current, 0);
   1175   TokenQueue.push_back(T);
   1176   return true;
   1177 }
   1178 
   1179 bool Scanner::scanDirective() {
   1180   // Reset the indentation level.
   1181   unrollIndent(-1);
   1182   SimpleKeys.clear();
   1183   IsSimpleKeyAllowed = false;
   1184 
   1185   StringRef::iterator Start = Current;
   1186   consume('%');
   1187   StringRef::iterator NameStart = Current;
   1188   Current = skip_while(&Scanner::skip_ns_char, Current);
   1189   StringRef Name(NameStart, Current - NameStart);
   1190   Current = skip_while(&Scanner::skip_s_white, Current);
   1191 
   1192   Token T;
   1193   if (Name == "YAML") {
   1194     Current = skip_while(&Scanner::skip_ns_char, Current);
   1195     T.Kind = Token::TK_VersionDirective;
   1196     T.Range = StringRef(Start, Current - Start);
   1197     TokenQueue.push_back(T);
   1198     return true;
   1199   } else if(Name == "TAG") {
   1200     Current = skip_while(&Scanner::skip_ns_char, Current);
   1201     Current = skip_while(&Scanner::skip_s_white, Current);
   1202     Current = skip_while(&Scanner::skip_ns_char, Current);
   1203     T.Kind = Token::TK_TagDirective;
   1204     T.Range = StringRef(Start, Current - Start);
   1205     TokenQueue.push_back(T);
   1206     return true;
   1207   }
   1208   return false;
   1209 }
   1210 
   1211 bool Scanner::scanDocumentIndicator(bool IsStart) {
   1212   unrollIndent(-1);
   1213   SimpleKeys.clear();
   1214   IsSimpleKeyAllowed = false;
   1215 
   1216   Token T;
   1217   T.Kind = IsStart ? Token::TK_DocumentStart : Token::TK_DocumentEnd;
   1218   T.Range = StringRef(Current, 3);
   1219   skip(3);
   1220   TokenQueue.push_back(T);
   1221   return true;
   1222 }
   1223 
   1224 bool Scanner::scanFlowCollectionStart(bool IsSequence) {
   1225   Token T;
   1226   T.Kind = IsSequence ? Token::TK_FlowSequenceStart
   1227                       : Token::TK_FlowMappingStart;
   1228   T.Range = StringRef(Current, 1);
   1229   skip(1);
   1230   TokenQueue.push_back(T);
   1231 
   1232   // [ and { may begin a simple key.
   1233   saveSimpleKeyCandidate(--TokenQueue.end(), Column - 1, false);
   1234 
   1235   // And may also be followed by a simple key.
   1236   IsSimpleKeyAllowed = true;
   1237   ++FlowLevel;
   1238   return true;
   1239 }
   1240 
   1241 bool Scanner::scanFlowCollectionEnd(bool IsSequence) {
   1242   removeSimpleKeyCandidatesOnFlowLevel(FlowLevel);
   1243   IsSimpleKeyAllowed = false;
   1244   Token T;
   1245   T.Kind = IsSequence ? Token::TK_FlowSequenceEnd
   1246                       : Token::TK_FlowMappingEnd;
   1247   T.Range = StringRef(Current, 1);
   1248   skip(1);
   1249   TokenQueue.push_back(T);
   1250   if (FlowLevel)
   1251     --FlowLevel;
   1252   return true;
   1253 }
   1254 
   1255 bool Scanner::scanFlowEntry() {
   1256   removeSimpleKeyCandidatesOnFlowLevel(FlowLevel);
   1257   IsSimpleKeyAllowed = true;
   1258   Token T;
   1259   T.Kind = Token::TK_FlowEntry;
   1260   T.Range = StringRef(Current, 1);
   1261   skip(1);
   1262   TokenQueue.push_back(T);
   1263   return true;
   1264 }
   1265 
   1266 bool Scanner::scanBlockEntry() {
   1267   rollIndent(Column, Token::TK_BlockSequenceStart, TokenQueue.end());
   1268   removeSimpleKeyCandidatesOnFlowLevel(FlowLevel);
   1269   IsSimpleKeyAllowed = true;
   1270   Token T;
   1271   T.Kind = Token::TK_BlockEntry;
   1272   T.Range = StringRef(Current, 1);
   1273   skip(1);
   1274   TokenQueue.push_back(T);
   1275   return true;
   1276 }
   1277 
   1278 bool Scanner::scanKey() {
   1279   if (!FlowLevel)
   1280     rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end());
   1281 
   1282   removeSimpleKeyCandidatesOnFlowLevel(FlowLevel);
   1283   IsSimpleKeyAllowed = !FlowLevel;
   1284 
   1285   Token T;
   1286   T.Kind = Token::TK_Key;
   1287   T.Range = StringRef(Current, 1);
   1288   skip(1);
   1289   TokenQueue.push_back(T);
   1290   return true;
   1291 }
   1292 
   1293 bool Scanner::scanValue() {
   1294   // If the previous token could have been a simple key, insert the key token
   1295   // into the token queue.
   1296   if (!SimpleKeys.empty()) {
   1297     SimpleKey SK = SimpleKeys.pop_back_val();
   1298     Token T;
   1299     T.Kind = Token::TK_Key;
   1300     T.Range = SK.Tok->Range;
   1301     TokenQueueT::iterator i, e;
   1302     for (i = TokenQueue.begin(), e = TokenQueue.end(); i != e; ++i) {
   1303       if (i == SK.Tok)
   1304         break;
   1305     }
   1306     if (i == e) {
   1307       Failed = true;
   1308       return false;
   1309     }
   1310     i = TokenQueue.insert(i, T);
   1311 
   1312     // We may also need to add a Block-Mapping-Start token.
   1313     rollIndent(SK.Column, Token::TK_BlockMappingStart, i);
   1314 
   1315     IsSimpleKeyAllowed = false;
   1316   } else {
   1317     if (!FlowLevel)
   1318       rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end());
   1319     IsSimpleKeyAllowed = !FlowLevel;
   1320   }
   1321 
   1322   Token T;
   1323   T.Kind = Token::TK_Value;
   1324   T.Range = StringRef(Current, 1);
   1325   skip(1);
   1326   TokenQueue.push_back(T);
   1327   return true;
   1328 }
   1329 
   1330 // Forbidding inlining improves performance by roughly 20%.
   1331 // FIXME: Remove once llvm optimizes this to the faster version without hints.
   1332 LLVM_ATTRIBUTE_NOINLINE static bool
   1333 wasEscaped(StringRef::iterator First, StringRef::iterator Position);
   1334 
   1335 // Returns whether a character at 'Position' was escaped with a leading '\'.
   1336 // 'First' specifies the position of the first character in the string.
   1337 static bool wasEscaped(StringRef::iterator First,
   1338                        StringRef::iterator Position) {
   1339   assert(Position - 1 >= First);
   1340   StringRef::iterator I = Position - 1;
   1341   // We calculate the number of consecutive '\'s before the current position
   1342   // by iterating backwards through our string.
   1343   while (I >= First && *I == '\\') --I;
   1344   // (Position - 1 - I) now contains the number of '\'s before the current
   1345   // position. If it is odd, the character at 'Position' was escaped.
   1346   return (Position - 1 - I) % 2 == 1;
   1347 }
   1348 
   1349 bool Scanner::scanFlowScalar(bool IsDoubleQuoted) {
   1350   StringRef::iterator Start = Current;
   1351   unsigned ColStart = Column;
   1352   if (IsDoubleQuoted) {
   1353     do {
   1354       ++Current;
   1355       while (Current != End && *Current != '"')
   1356         ++Current;
   1357       // Repeat until the previous character was not a '\' or was an escaped
   1358       // backslash.
   1359     } while (   Current != End
   1360              && *(Current - 1) == '\\'
   1361              && wasEscaped(Start + 1, Current));
   1362   } else {
   1363     skip(1);
   1364     while (Current != End) {
   1365       // Skip a ' followed by another '.
   1366       if (Current + 1 < End && *Current == '\'' && *(Current + 1) == '\'') {
   1367         skip(2);
   1368         continue;
   1369       } else if (*Current == '\'')
   1370         break;
   1371       StringRef::iterator i = skip_nb_char(Current);
   1372       if (i == Current) {
   1373         i = skip_b_break(Current);
   1374         if (i == Current)
   1375           break;
   1376         Current = i;
   1377         Column = 0;
   1378         ++Line;
   1379       } else {
   1380         if (i == End)
   1381           break;
   1382         Current = i;
   1383         ++Column;
   1384       }
   1385     }
   1386   }
   1387 
   1388   if (Current == End) {
   1389     setError("Expected quote at end of scalar", Current);
   1390     return false;
   1391   }
   1392 
   1393   skip(1); // Skip ending quote.
   1394   Token T;
   1395   T.Kind = Token::TK_Scalar;
   1396   T.Range = StringRef(Start, Current - Start);
   1397   TokenQueue.push_back(T);
   1398 
   1399   saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false);
   1400 
   1401   IsSimpleKeyAllowed = false;
   1402 
   1403   return true;
   1404 }
   1405 
   1406 bool Scanner::scanPlainScalar() {
   1407   StringRef::iterator Start = Current;
   1408   unsigned ColStart = Column;
   1409   unsigned LeadingBlanks = 0;
   1410   assert(Indent >= -1 && "Indent must be >= -1 !");
   1411   unsigned indent = static_cast<unsigned>(Indent + 1);
   1412   while (Current != End) {
   1413     if (*Current == '#')
   1414       break;
   1415 
   1416     while (Current != End && !isBlankOrBreak(Current)) {
   1417       if (FlowLevel && *Current == ':' &&
   1418           (Current + 1 == End ||
   1419            !(isBlankOrBreak(Current + 1) || *(Current + 1) == ','))) {
   1420         setError("Found unexpected ':' while scanning a plain scalar", Current);
   1421         return false;
   1422       }
   1423 
   1424       // Check for the end of the plain scalar.
   1425       if (  (*Current == ':' && isBlankOrBreak(Current + 1))
   1426           || (  FlowLevel
   1427           && (StringRef(Current, 1).find_first_of(",:?[]{}")
   1428               != StringRef::npos)))
   1429         break;
   1430 
   1431       StringRef::iterator i = skip_nb_char(Current);
   1432       if (i == Current)
   1433         break;
   1434       Current = i;
   1435       ++Column;
   1436     }
   1437 
   1438     // Are we at the end?
   1439     if (!isBlankOrBreak(Current))
   1440       break;
   1441 
   1442     // Eat blanks.
   1443     StringRef::iterator Tmp = Current;
   1444     while (isBlankOrBreak(Tmp)) {
   1445       StringRef::iterator i = skip_s_white(Tmp);
   1446       if (i != Tmp) {
   1447         if (LeadingBlanks && (Column < indent) && *Tmp == '\t') {
   1448           setError("Found invalid tab character in indentation", Tmp);
   1449           return false;
   1450         }
   1451         Tmp = i;
   1452         ++Column;
   1453       } else {
   1454         i = skip_b_break(Tmp);
   1455         if (!LeadingBlanks)
   1456           LeadingBlanks = 1;
   1457         Tmp = i;
   1458         Column = 0;
   1459         ++Line;
   1460       }
   1461     }
   1462 
   1463     if (!FlowLevel && Column < indent)
   1464       break;
   1465 
   1466     Current = Tmp;
   1467   }
   1468   if (Start == Current) {
   1469     setError("Got empty plain scalar", Start);
   1470     return false;
   1471   }
   1472   Token T;
   1473   T.Kind = Token::TK_Scalar;
   1474   T.Range = StringRef(Start, Current - Start);
   1475   TokenQueue.push_back(T);
   1476 
   1477   // Plain scalars can be simple keys.
   1478   saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false);
   1479 
   1480   IsSimpleKeyAllowed = false;
   1481 
   1482   return true;
   1483 }
   1484 
   1485 bool Scanner::scanAliasOrAnchor(bool IsAlias) {
   1486   StringRef::iterator Start = Current;
   1487   unsigned ColStart = Column;
   1488   skip(1);
   1489   while (Current != End) {
   1490     if (   *Current == '[' || *Current == ']'
   1491         || *Current == '{' || *Current == '}'
   1492         || *Current == ','
   1493         || *Current == ':')
   1494       break;
   1495     StringRef::iterator i = skip_ns_char(Current);
   1496     if (i == Current)
   1497       break;
   1498     Current = i;
   1499     ++Column;
   1500   }
   1501 
   1502   if (Start + 1 == Current) {
   1503     setError("Got empty alias or anchor", Start);
   1504     return false;
   1505   }
   1506 
   1507   Token T;
   1508   T.Kind = IsAlias ? Token::TK_Alias : Token::TK_Anchor;
   1509   T.Range = StringRef(Start, Current - Start);
   1510   TokenQueue.push_back(T);
   1511 
   1512   // Alias and anchors can be simple keys.
   1513   saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false);
   1514 
   1515   IsSimpleKeyAllowed = false;
   1516 
   1517   return true;
   1518 }
   1519 
   1520 char Scanner::scanBlockChompingIndicator() {
   1521   char Indicator = ' ';
   1522   if (Current != End && (*Current == '+' || *Current == '-')) {
   1523     Indicator = *Current;
   1524     skip(1);
   1525   }
   1526   return Indicator;
   1527 }
   1528 
   1529 /// Get the number of line breaks after chomping.
   1530 ///
   1531 /// Return the number of trailing line breaks to emit, depending on
   1532 /// \p ChompingIndicator.
   1533 static unsigned getChompedLineBreaks(char ChompingIndicator,
   1534                                      unsigned LineBreaks, StringRef Str) {
   1535   if (ChompingIndicator == '-') // Strip all line breaks.
   1536     return 0;
   1537   if (ChompingIndicator == '+') // Keep all line breaks.
   1538     return LineBreaks;
   1539   // Clip trailing lines.
   1540   return Str.empty() ? 0 : 1;
   1541 }
   1542 
   1543 unsigned Scanner::scanBlockIndentationIndicator() {
   1544   unsigned Indent = 0;
   1545   if (Current != End && (*Current >= '1' && *Current <= '9')) {
   1546     Indent = unsigned(*Current - '0');
   1547     skip(1);
   1548   }
   1549   return Indent;
   1550 }
   1551 
   1552 bool Scanner::scanBlockScalarHeader(char &ChompingIndicator,
   1553                                     unsigned &IndentIndicator, bool &IsDone) {
   1554   auto Start = Current;
   1555 
   1556   ChompingIndicator = scanBlockChompingIndicator();
   1557   IndentIndicator = scanBlockIndentationIndicator();
   1558   // Check for the chomping indicator once again.
   1559   if (ChompingIndicator == ' ')
   1560     ChompingIndicator = scanBlockChompingIndicator();
   1561   Current = skip_while(&Scanner::skip_s_white, Current);
   1562   skipComment();
   1563 
   1564   if (Current == End) { // EOF, we have an empty scalar.
   1565     Token T;
   1566     T.Kind = Token::TK_BlockScalar;
   1567     T.Range = StringRef(Start, Current - Start);
   1568     TokenQueue.push_back(T);
   1569     IsDone = true;
   1570     return true;
   1571   }
   1572 
   1573   if (!consumeLineBreakIfPresent()) {
   1574     setError("Expected a line break after block scalar header", Current);
   1575     return false;
   1576   }
   1577   return true;
   1578 }
   1579 
   1580 bool Scanner::findBlockScalarIndent(unsigned &BlockIndent,
   1581                                     unsigned BlockExitIndent,
   1582                                     unsigned &LineBreaks, bool &IsDone) {
   1583   unsigned MaxAllSpaceLineCharacters = 0;
   1584   StringRef::iterator LongestAllSpaceLine;
   1585 
   1586   while (true) {
   1587     advanceWhile(&Scanner::skip_s_space);
   1588     if (skip_nb_char(Current) != Current) {
   1589       // This line isn't empty, so try and find the indentation.
   1590       if (Column <= BlockExitIndent) { // End of the block literal.
   1591         IsDone = true;
   1592         return true;
   1593       }
   1594       // We found the block's indentation.
   1595       BlockIndent = Column;
   1596       if (MaxAllSpaceLineCharacters > BlockIndent) {
   1597         setError(
   1598             "Leading all-spaces line must be smaller than the block indent",
   1599             LongestAllSpaceLine);
   1600         return false;
   1601       }
   1602       return true;
   1603     }
   1604     if (skip_b_break(Current) != Current &&
   1605         Column > MaxAllSpaceLineCharacters) {
   1606       // Record the longest all-space line in case it's longer than the
   1607       // discovered block indent.
   1608       MaxAllSpaceLineCharacters = Column;
   1609       LongestAllSpaceLine = Current;
   1610     }
   1611 
   1612     // Check for EOF.
   1613     if (Current == End) {
   1614       IsDone = true;
   1615       return true;
   1616     }
   1617 
   1618     if (!consumeLineBreakIfPresent()) {
   1619       IsDone = true;
   1620       return true;
   1621     }
   1622     ++LineBreaks;
   1623   }
   1624   return true;
   1625 }
   1626 
   1627 bool Scanner::scanBlockScalarIndent(unsigned BlockIndent,
   1628                                     unsigned BlockExitIndent, bool &IsDone) {
   1629   // Skip the indentation.
   1630   while (Column < BlockIndent) {
   1631     auto I = skip_s_space(Current);
   1632     if (I == Current)
   1633       break;
   1634     Current = I;
   1635     ++Column;
   1636   }
   1637 
   1638   if (skip_nb_char(Current) == Current)
   1639     return true;
   1640 
   1641   if (Column <= BlockExitIndent) { // End of the block literal.
   1642     IsDone = true;
   1643     return true;
   1644   }
   1645 
   1646   if (Column < BlockIndent) {
   1647     if (Current != End && *Current == '#') { // Trailing comment.
   1648       IsDone = true;
   1649       return true;
   1650     }
   1651     setError("A text line is less indented than the block scalar", Current);
   1652     return false;
   1653   }
   1654   return true; // A normal text line.
   1655 }
   1656 
   1657 bool Scanner::scanBlockScalar(bool IsLiteral) {
   1658   // Eat '|' or '>'
   1659   assert(*Current == '|' || *Current == '>');
   1660   skip(1);
   1661 
   1662   char ChompingIndicator;
   1663   unsigned BlockIndent;
   1664   bool IsDone = false;
   1665   if (!scanBlockScalarHeader(ChompingIndicator, BlockIndent, IsDone))
   1666     return false;
   1667   if (IsDone)
   1668     return true;
   1669 
   1670   auto Start = Current;
   1671   unsigned BlockExitIndent = Indent < 0 ? 0 : (unsigned)Indent;
   1672   unsigned LineBreaks = 0;
   1673   if (BlockIndent == 0) {
   1674     if (!findBlockScalarIndent(BlockIndent, BlockExitIndent, LineBreaks,
   1675                                IsDone))
   1676       return false;
   1677   }
   1678 
   1679   // Scan the block's scalars body.
   1680   SmallString<256> Str;
   1681   while (!IsDone) {
   1682     if (!scanBlockScalarIndent(BlockIndent, BlockExitIndent, IsDone))
   1683       return false;
   1684     if (IsDone)
   1685       break;
   1686 
   1687     // Parse the current line.
   1688     auto LineStart = Current;
   1689     advanceWhile(&Scanner::skip_nb_char);
   1690     if (LineStart != Current) {
   1691       Str.append(LineBreaks, '\n');
   1692       Str.append(StringRef(LineStart, Current - LineStart));
   1693       LineBreaks = 0;
   1694     }
   1695 
   1696     // Check for EOF.
   1697     if (Current == End)
   1698       break;
   1699 
   1700     if (!consumeLineBreakIfPresent())
   1701       break;
   1702     ++LineBreaks;
   1703   }
   1704 
   1705   if (Current == End && !LineBreaks)
   1706     // Ensure that there is at least one line break before the end of file.
   1707     LineBreaks = 1;
   1708   Str.append(getChompedLineBreaks(ChompingIndicator, LineBreaks, Str), '\n');
   1709 
   1710   // New lines may start a simple key.
   1711   if (!FlowLevel)
   1712     IsSimpleKeyAllowed = true;
   1713 
   1714   Token T;
   1715   T.Kind = Token::TK_BlockScalar;
   1716   T.Range = StringRef(Start, Current - Start);
   1717   T.Value = std::string(Str);
   1718   TokenQueue.push_back(T);
   1719   return true;
   1720 }
   1721 
   1722 bool Scanner::scanTag() {
   1723   StringRef::iterator Start = Current;
   1724   unsigned ColStart = Column;
   1725   skip(1); // Eat !.
   1726   if (Current == End || isBlankOrBreak(Current)); // An empty tag.
   1727   else if (*Current == '<') {
   1728     skip(1);
   1729     scan_ns_uri_char();
   1730     if (!consume('>'))
   1731       return false;
   1732   } else {
   1733     // FIXME: Actually parse the c-ns-shorthand-tag rule.
   1734     Current = skip_while(&Scanner::skip_ns_char, Current);
   1735   }
   1736 
   1737   Token T;
   1738   T.Kind = Token::TK_Tag;
   1739   T.Range = StringRef(Start, Current - Start);
   1740   TokenQueue.push_back(T);
   1741 
   1742   // Tags can be simple keys.
   1743   saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false);
   1744 
   1745   IsSimpleKeyAllowed = false;
   1746 
   1747   return true;
   1748 }
   1749 
   1750 bool Scanner::fetchMoreTokens() {
   1751   if (IsStartOfStream)
   1752     return scanStreamStart();
   1753 
   1754   scanToNextToken();
   1755 
   1756   if (Current == End)
   1757     return scanStreamEnd();
   1758 
   1759   removeStaleSimpleKeyCandidates();
   1760 
   1761   unrollIndent(Column);
   1762 
   1763   if (Column == 0 && *Current == '%')
   1764     return scanDirective();
   1765 
   1766   if (Column == 0 && Current + 4 <= End
   1767       && *Current == '-'
   1768       && *(Current + 1) == '-'
   1769       && *(Current + 2) == '-'
   1770       && (Current + 3 == End || isBlankOrBreak(Current + 3)))
   1771     return scanDocumentIndicator(true);
   1772 
   1773   if (Column == 0 && Current + 4 <= End
   1774       && *Current == '.'
   1775       && *(Current + 1) == '.'
   1776       && *(Current + 2) == '.'
   1777       && (Current + 3 == End || isBlankOrBreak(Current + 3)))
   1778     return scanDocumentIndicator(false);
   1779 
   1780   if (*Current == '[')
   1781     return scanFlowCollectionStart(true);
   1782 
   1783   if (*Current == '{')
   1784     return scanFlowCollectionStart(false);
   1785 
   1786   if (*Current == ']')
   1787     return scanFlowCollectionEnd(true);
   1788 
   1789   if (*Current == '}')
   1790     return scanFlowCollectionEnd(false);
   1791 
   1792   if (*Current == ',')
   1793     return scanFlowEntry();
   1794 
   1795   if (*Current == '-' && isBlankOrBreak(Current + 1))
   1796     return scanBlockEntry();
   1797 
   1798   if (*Current == '?' && (FlowLevel || isBlankOrBreak(Current + 1)))
   1799     return scanKey();
   1800 
   1801   if (*Current == ':' && (FlowLevel || isBlankOrBreak(Current + 1)))
   1802     return scanValue();
   1803 
   1804   if (*Current == '*')
   1805     return scanAliasOrAnchor(true);
   1806 
   1807   if (*Current == '&')
   1808     return scanAliasOrAnchor(false);
   1809 
   1810   if (*Current == '!')
   1811     return scanTag();
   1812 
   1813   if (*Current == '|' && !FlowLevel)
   1814     return scanBlockScalar(true);
   1815 
   1816   if (*Current == '>' && !FlowLevel)
   1817     return scanBlockScalar(false);
   1818 
   1819   if (*Current == '\'')
   1820     return scanFlowScalar(false);
   1821 
   1822   if (*Current == '"')
   1823     return scanFlowScalar(true);
   1824 
   1825   // Get a plain scalar.
   1826   StringRef FirstChar(Current, 1);
   1827   if (!(isBlankOrBreak(Current)
   1828         || FirstChar.find_first_of("-?:,[]{}#&*!|>'\"%@`") != StringRef::npos)
   1829       || (*Current == '-' && !isBlankOrBreak(Current + 1))
   1830       || (!FlowLevel && (*Current == '?' || *Current == ':')
   1831           && isBlankOrBreak(Current + 1))
   1832       || (!FlowLevel && *Current == ':'
   1833                       && Current + 2 < End
   1834                       && *(Current + 1) == ':'
   1835                       && !isBlankOrBreak(Current + 2)))
   1836     return scanPlainScalar();
   1837 
   1838   setError("Unrecognized character while tokenizing.", Current);
   1839   return false;
   1840 }
   1841 
   1842 Stream::Stream(StringRef Input, SourceMgr &SM, bool ShowColors,
   1843                std::error_code *EC)
   1844     : scanner(new Scanner(Input, SM, ShowColors, EC)), CurrentDoc() {}
   1845 
   1846 Stream::Stream(MemoryBufferRef InputBuffer, SourceMgr &SM, bool ShowColors,
   1847                std::error_code *EC)
   1848     : scanner(new Scanner(InputBuffer, SM, ShowColors, EC)), CurrentDoc() {}
   1849 
   1850 Stream::~Stream() = default;
   1851 
   1852 bool Stream::failed() { return scanner->failed(); }
   1853 
   1854 void Stream::printError(Node *N, const Twine &Msg, SourceMgr::DiagKind Kind) {
   1855   printError(N ? N->getSourceRange() : SMRange(), Msg, Kind);
   1856 }
   1857 
   1858 void Stream::printError(const SMRange &Range, const Twine &Msg,
   1859                         SourceMgr::DiagKind Kind) {
   1860   scanner->printError(Range.Start, Kind, Msg, Range);
   1861 }
   1862 
   1863 document_iterator Stream::begin() {
   1864   if (CurrentDoc)
   1865     report_fatal_error("Can only iterate over the stream once");
   1866 
   1867   // Skip Stream-Start.
   1868   scanner->getNext();
   1869 
   1870   CurrentDoc.reset(new Document(*this));
   1871   return document_iterator(CurrentDoc);
   1872 }
   1873 
   1874 document_iterator Stream::end() {
   1875   return document_iterator();
   1876 }
   1877 
   1878 void Stream::skip() {
   1879   for (document_iterator i = begin(), e = end(); i != e; ++i)
   1880     i->skip();
   1881 }
   1882 
   1883 Node::Node(unsigned int Type, std::unique_ptr<Document> &D, StringRef A,
   1884            StringRef T)
   1885     : Doc(D), TypeID(Type), Anchor(A), Tag(T) {
   1886   SMLoc Start = SMLoc::getFromPointer(peekNext().Range.begin());
   1887   SourceRange = SMRange(Start, Start);
   1888 }
   1889 
   1890 std::string Node::getVerbatimTag() const {
   1891   StringRef Raw = getRawTag();
   1892   if (!Raw.empty() && Raw != "!") {
   1893     std::string Ret;
   1894     if (Raw.find_last_of('!') == 0) {
   1895       Ret = std::string(Doc->getTagMap().find("!")->second);
   1896       Ret += Raw.substr(1);
   1897       return Ret;
   1898     } else if (Raw.startswith("!!")) {
   1899       Ret = std::string(Doc->getTagMap().find("!!")->second);
   1900       Ret += Raw.substr(2);
   1901       return Ret;
   1902     } else {
   1903       StringRef TagHandle = Raw.substr(0, Raw.find_last_of('!') + 1);
   1904       std::map<StringRef, StringRef>::const_iterator It =
   1905           Doc->getTagMap().find(TagHandle);
   1906       if (It != Doc->getTagMap().end())
   1907         Ret = std::string(It->second);
   1908       else {
   1909         Token T;
   1910         T.Kind = Token::TK_Tag;
   1911         T.Range = TagHandle;
   1912         setError(Twine("Unknown tag handle ") + TagHandle, T);
   1913       }
   1914       Ret += Raw.substr(Raw.find_last_of('!') + 1);
   1915       return Ret;
   1916     }
   1917   }
   1918 
   1919   switch (getType()) {
   1920   case NK_Null:
   1921     return "tag:yaml.org,2002:null";
   1922   case NK_Scalar:
   1923   case NK_BlockScalar:
   1924     // TODO: Tag resolution.
   1925     return "tag:yaml.org,2002:str";
   1926   case NK_Mapping:
   1927     return "tag:yaml.org,2002:map";
   1928   case NK_Sequence:
   1929     return "tag:yaml.org,2002:seq";
   1930   }
   1931 
   1932   return "";
   1933 }
   1934 
   1935 Token &Node::peekNext() {
   1936   return Doc->peekNext();
   1937 }
   1938 
   1939 Token Node::getNext() {
   1940   return Doc->getNext();
   1941 }
   1942 
   1943 Node *Node::parseBlockNode() {
   1944   return Doc->parseBlockNode();
   1945 }
   1946 
   1947 BumpPtrAllocator &Node::getAllocator() {
   1948   return Doc->NodeAllocator;
   1949 }
   1950 
   1951 void Node::setError(const Twine &Msg, Token &Tok) const {
   1952   Doc->setError(Msg, Tok);
   1953 }
   1954 
   1955 bool Node::failed() const {
   1956   return Doc->failed();
   1957 }
   1958 
   1959 StringRef ScalarNode::getValue(SmallVectorImpl<char> &Storage) const {
   1960   // TODO: Handle newlines properly. We need to remove leading whitespace.
   1961   if (Value[0] == '"') { // Double quoted.
   1962     // Pull off the leading and trailing "s.
   1963     StringRef UnquotedValue = Value.substr(1, Value.size() - 2);
   1964     // Search for characters that would require unescaping the value.
   1965     StringRef::size_type i = UnquotedValue.find_first_of("\\\r\n");
   1966     if (i != StringRef::npos)
   1967       return unescapeDoubleQuoted(UnquotedValue, i, Storage);
   1968     return UnquotedValue;
   1969   } else if (Value[0] == '\'') { // Single quoted.
   1970     // Pull off the leading and trailing 's.
   1971     StringRef UnquotedValue = Value.substr(1, Value.size() - 2);
   1972     StringRef::size_type i = UnquotedValue.find('\'');
   1973     if (i != StringRef::npos) {
   1974       // We're going to need Storage.
   1975       Storage.clear();
   1976       Storage.reserve(UnquotedValue.size());
   1977       for (; i != StringRef::npos; i = UnquotedValue.find('\'')) {
   1978         StringRef Valid(UnquotedValue.begin(), i);
   1979         llvm::append_range(Storage, Valid);
   1980         Storage.push_back('\'');
   1981         UnquotedValue = UnquotedValue.substr(i + 2);
   1982       }
   1983       llvm::append_range(Storage, UnquotedValue);
   1984       return StringRef(Storage.begin(), Storage.size());
   1985     }
   1986     return UnquotedValue;
   1987   }
   1988   // Plain or block.
   1989   return Value.rtrim(' ');
   1990 }
   1991 
   1992 StringRef ScalarNode::unescapeDoubleQuoted( StringRef UnquotedValue
   1993                                           , StringRef::size_type i
   1994                                           , SmallVectorImpl<char> &Storage)
   1995                                           const {
   1996   // Use Storage to build proper value.
   1997   Storage.clear();
   1998   Storage.reserve(UnquotedValue.size());
   1999   for (; i != StringRef::npos; i = UnquotedValue.find_first_of("\\\r\n")) {
   2000     // Insert all previous chars into Storage.
   2001     StringRef Valid(UnquotedValue.begin(), i);
   2002     llvm::append_range(Storage, Valid);
   2003     // Chop off inserted chars.
   2004     UnquotedValue = UnquotedValue.substr(i);
   2005 
   2006     assert(!UnquotedValue.empty() && "Can't be empty!");
   2007 
   2008     // Parse escape or line break.
   2009     switch (UnquotedValue[0]) {
   2010     case '\r':
   2011     case '\n':
   2012       Storage.push_back('\n');
   2013       if (   UnquotedValue.size() > 1
   2014           && (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n'))
   2015         UnquotedValue = UnquotedValue.substr(1);
   2016       UnquotedValue = UnquotedValue.substr(1);
   2017       break;
   2018     default:
   2019       if (UnquotedValue.size() == 1) {
   2020         Token T;
   2021         T.Range = StringRef(UnquotedValue.begin(), 1);
   2022         setError("Unrecognized escape code", T);
   2023         return "";
   2024       }
   2025       UnquotedValue = UnquotedValue.substr(1);
   2026       switch (UnquotedValue[0]) {
   2027       default: {
   2028           Token T;
   2029           T.Range = StringRef(UnquotedValue.begin(), 1);
   2030           setError("Unrecognized escape code", T);
   2031           return "";
   2032         }
   2033       case '\r':
   2034       case '\n':
   2035         // Remove the new line.
   2036         if (   UnquotedValue.size() > 1
   2037             && (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n'))
   2038           UnquotedValue = UnquotedValue.substr(1);
   2039         // If this was just a single byte newline, it will get skipped
   2040         // below.
   2041         break;
   2042       case '0':
   2043         Storage.push_back(0x00);
   2044         break;
   2045       case 'a':
   2046         Storage.push_back(0x07);
   2047         break;
   2048       case 'b':
   2049         Storage.push_back(0x08);
   2050         break;
   2051       case 't':
   2052       case 0x09:
   2053         Storage.push_back(0x09);
   2054         break;
   2055       case 'n':
   2056         Storage.push_back(0x0A);
   2057         break;
   2058       case 'v':
   2059         Storage.push_back(0x0B);
   2060         break;
   2061       case 'f':
   2062         Storage.push_back(0x0C);
   2063         break;
   2064       case 'r':
   2065         Storage.push_back(0x0D);
   2066         break;
   2067       case 'e':
   2068         Storage.push_back(0x1B);
   2069         break;
   2070       case ' ':
   2071         Storage.push_back(0x20);
   2072         break;
   2073       case '"':
   2074         Storage.push_back(0x22);
   2075         break;
   2076       case '/':
   2077         Storage.push_back(0x2F);
   2078         break;
   2079       case '\\':
   2080         Storage.push_back(0x5C);
   2081         break;
   2082       case 'N':
   2083         encodeUTF8(0x85, Storage);
   2084         break;
   2085       case '_':
   2086         encodeUTF8(0xA0, Storage);
   2087         break;
   2088       case 'L':
   2089         encodeUTF8(0x2028, Storage);
   2090         break;
   2091       case 'P':
   2092         encodeUTF8(0x2029, Storage);
   2093         break;
   2094       case 'x': {
   2095           if (UnquotedValue.size() < 3)
   2096             // TODO: Report error.
   2097             break;
   2098           unsigned int UnicodeScalarValue;
   2099           if (UnquotedValue.substr(1, 2).getAsInteger(16, UnicodeScalarValue))
   2100             // TODO: Report error.
   2101             UnicodeScalarValue = 0xFFFD;
   2102           encodeUTF8(UnicodeScalarValue, Storage);
   2103           UnquotedValue = UnquotedValue.substr(2);
   2104           break;
   2105         }
   2106       case 'u': {
   2107           if (UnquotedValue.size() < 5)
   2108             // TODO: Report error.
   2109             break;
   2110           unsigned int UnicodeScalarValue;
   2111           if (UnquotedValue.substr(1, 4).getAsInteger(16, UnicodeScalarValue))
   2112             // TODO: Report error.
   2113             UnicodeScalarValue = 0xFFFD;
   2114           encodeUTF8(UnicodeScalarValue, Storage);
   2115           UnquotedValue = UnquotedValue.substr(4);
   2116           break;
   2117         }
   2118       case 'U': {
   2119           if (UnquotedValue.size() < 9)
   2120             // TODO: Report error.
   2121             break;
   2122           unsigned int UnicodeScalarValue;
   2123           if (UnquotedValue.substr(1, 8).getAsInteger(16, UnicodeScalarValue))
   2124             // TODO: Report error.
   2125             UnicodeScalarValue = 0xFFFD;
   2126           encodeUTF8(UnicodeScalarValue, Storage);
   2127           UnquotedValue = UnquotedValue.substr(8);
   2128           break;
   2129         }
   2130       }
   2131       UnquotedValue = UnquotedValue.substr(1);
   2132     }
   2133   }
   2134   llvm::append_range(Storage, UnquotedValue);
   2135   return StringRef(Storage.begin(), Storage.size());
   2136 }
   2137 
   2138 Node *KeyValueNode::getKey() {
   2139   if (Key)
   2140     return Key;
   2141   // Handle implicit null keys.
   2142   {
   2143     Token &t = peekNext();
   2144     if (   t.Kind == Token::TK_BlockEnd
   2145         || t.Kind == Token::TK_Value
   2146         || t.Kind == Token::TK_Error) {
   2147       return Key = new (getAllocator()) NullNode(Doc);
   2148     }
   2149     if (t.Kind == Token::TK_Key)
   2150       getNext(); // skip TK_Key.
   2151   }
   2152 
   2153   // Handle explicit null keys.
   2154   Token &t = peekNext();
   2155   if (t.Kind == Token::TK_BlockEnd || t.Kind == Token::TK_Value) {
   2156     return Key = new (getAllocator()) NullNode(Doc);
   2157   }
   2158 
   2159   // We've got a normal key.
   2160   return Key = parseBlockNode();
   2161 }
   2162 
   2163 Node *KeyValueNode::getValue() {
   2164   if (Value)
   2165     return Value;
   2166 
   2167   if (Node* Key = getKey())
   2168     Key->skip();
   2169   else {
   2170     setError("Null key in Key Value.", peekNext());
   2171     return Value = new (getAllocator()) NullNode(Doc);
   2172   }
   2173 
   2174   if (failed())
   2175     return Value = new (getAllocator()) NullNode(Doc);
   2176 
   2177   // Handle implicit null values.
   2178   {
   2179     Token &t = peekNext();
   2180     if (   t.Kind == Token::TK_BlockEnd
   2181         || t.Kind == Token::TK_FlowMappingEnd
   2182         || t.Kind == Token::TK_Key
   2183         || t.Kind == Token::TK_FlowEntry
   2184         || t.Kind == Token::TK_Error) {
   2185       return Value = new (getAllocator()) NullNode(Doc);
   2186     }
   2187 
   2188     if (t.Kind != Token::TK_Value) {
   2189       setError("Unexpected token in Key Value.", t);
   2190       return Value = new (getAllocator()) NullNode(Doc);
   2191     }
   2192     getNext(); // skip TK_Value.
   2193   }
   2194 
   2195   // Handle explicit null values.
   2196   Token &t = peekNext();
   2197   if (t.Kind == Token::TK_BlockEnd || t.Kind == Token::TK_Key) {
   2198     return Value = new (getAllocator()) NullNode(Doc);
   2199   }
   2200 
   2201   // We got a normal value.
   2202   return Value = parseBlockNode();
   2203 }
   2204 
   2205 void MappingNode::increment() {
   2206   if (failed()) {
   2207     IsAtEnd = true;
   2208     CurrentEntry = nullptr;
   2209     return;
   2210   }
   2211   if (CurrentEntry) {
   2212     CurrentEntry->skip();
   2213     if (Type == MT_Inline) {
   2214       IsAtEnd = true;
   2215       CurrentEntry = nullptr;
   2216       return;
   2217     }
   2218   }
   2219   Token T = peekNext();
   2220   if (T.Kind == Token::TK_Key || T.Kind == Token::TK_Scalar) {
   2221     // KeyValueNode eats the TK_Key. That way it can detect null keys.
   2222     CurrentEntry = new (getAllocator()) KeyValueNode(Doc);
   2223   } else if (Type == MT_Block) {
   2224     switch (T.Kind) {
   2225     case Token::TK_BlockEnd:
   2226       getNext();
   2227       IsAtEnd = true;
   2228       CurrentEntry = nullptr;
   2229       break;
   2230     default:
   2231       setError("Unexpected token. Expected Key or Block End", T);
   2232       LLVM_FALLTHROUGH;
   2233     case Token::TK_Error:
   2234       IsAtEnd = true;
   2235       CurrentEntry = nullptr;
   2236     }
   2237   } else {
   2238     switch (T.Kind) {
   2239     case Token::TK_FlowEntry:
   2240       // Eat the flow entry and recurse.
   2241       getNext();
   2242       return increment();
   2243     case Token::TK_FlowMappingEnd:
   2244       getNext();
   2245       LLVM_FALLTHROUGH;
   2246     case Token::TK_Error:
   2247       // Set this to end iterator.
   2248       IsAtEnd = true;
   2249       CurrentEntry = nullptr;
   2250       break;
   2251     default:
   2252       setError( "Unexpected token. Expected Key, Flow Entry, or Flow "
   2253                 "Mapping End."
   2254               , T);
   2255       IsAtEnd = true;
   2256       CurrentEntry = nullptr;
   2257     }
   2258   }
   2259 }
   2260 
   2261 void SequenceNode::increment() {
   2262   if (failed()) {
   2263     IsAtEnd = true;
   2264     CurrentEntry = nullptr;
   2265     return;
   2266   }
   2267   if (CurrentEntry)
   2268     CurrentEntry->skip();
   2269   Token T = peekNext();
   2270   if (SeqType == ST_Block) {
   2271     switch (T.Kind) {
   2272     case Token::TK_BlockEntry:
   2273       getNext();
   2274       CurrentEntry = parseBlockNode();
   2275       if (!CurrentEntry) { // An error occurred.
   2276         IsAtEnd = true;
   2277         CurrentEntry = nullptr;
   2278       }
   2279       break;
   2280     case Token::TK_BlockEnd:
   2281       getNext();
   2282       IsAtEnd = true;
   2283       CurrentEntry = nullptr;
   2284       break;
   2285     default:
   2286       setError( "Unexpected token. Expected Block Entry or Block End."
   2287               , T);
   2288       LLVM_FALLTHROUGH;
   2289     case Token::TK_Error:
   2290       IsAtEnd = true;
   2291       CurrentEntry = nullptr;
   2292     }
   2293   } else if (SeqType == ST_Indentless) {
   2294     switch (T.Kind) {
   2295     case Token::TK_BlockEntry:
   2296       getNext();
   2297       CurrentEntry = parseBlockNode();
   2298       if (!CurrentEntry) { // An error occurred.
   2299         IsAtEnd = true;
   2300         CurrentEntry = nullptr;
   2301       }
   2302       break;
   2303     default:
   2304     case Token::TK_Error:
   2305       IsAtEnd = true;
   2306       CurrentEntry = nullptr;
   2307     }
   2308   } else if (SeqType == ST_Flow) {
   2309     switch (T.Kind) {
   2310     case Token::TK_FlowEntry:
   2311       // Eat the flow entry and recurse.
   2312       getNext();
   2313       WasPreviousTokenFlowEntry = true;
   2314       return increment();
   2315     case Token::TK_FlowSequenceEnd:
   2316       getNext();
   2317       LLVM_FALLTHROUGH;
   2318     case Token::TK_Error:
   2319       // Set this to end iterator.
   2320       IsAtEnd = true;
   2321       CurrentEntry = nullptr;
   2322       break;
   2323     case Token::TK_StreamEnd:
   2324     case Token::TK_DocumentEnd:
   2325     case Token::TK_DocumentStart:
   2326       setError("Could not find closing ]!", T);
   2327       // Set this to end iterator.
   2328       IsAtEnd = true;
   2329       CurrentEntry = nullptr;
   2330       break;
   2331     default:
   2332       if (!WasPreviousTokenFlowEntry) {
   2333         setError("Expected , between entries!", T);
   2334         IsAtEnd = true;
   2335         CurrentEntry = nullptr;
   2336         break;
   2337       }
   2338       // Otherwise it must be a flow entry.
   2339       CurrentEntry = parseBlockNode();
   2340       if (!CurrentEntry) {
   2341         IsAtEnd = true;
   2342       }
   2343       WasPreviousTokenFlowEntry = false;
   2344       break;
   2345     }
   2346   }
   2347 }
   2348 
   2349 Document::Document(Stream &S) : stream(S), Root(nullptr) {
   2350   // Tag maps starts with two default mappings.
   2351   TagMap["!"] = "!";
   2352   TagMap["!!"] = "tag:yaml.org,2002:";
   2353 
   2354   if (parseDirectives())
   2355     expectToken(Token::TK_DocumentStart);
   2356   Token &T = peekNext();
   2357   if (T.Kind == Token::TK_DocumentStart)
   2358     getNext();
   2359 }
   2360 
   2361 bool Document::skip()  {
   2362   if (stream.scanner->failed())
   2363     return false;
   2364   if (!Root && !getRoot())
   2365     return false;
   2366   Root->skip();
   2367   Token &T = peekNext();
   2368   if (T.Kind == Token::TK_StreamEnd)
   2369     return false;
   2370   if (T.Kind == Token::TK_DocumentEnd) {
   2371     getNext();
   2372     return skip();
   2373   }
   2374   return true;
   2375 }
   2376 
   2377 Token &Document::peekNext() {
   2378   return stream.scanner->peekNext();
   2379 }
   2380 
   2381 Token Document::getNext() {
   2382   return stream.scanner->getNext();
   2383 }
   2384 
   2385 void Document::setError(const Twine &Message, Token &Location) const {
   2386   stream.scanner->setError(Message, Location.Range.begin());
   2387 }
   2388 
   2389 bool Document::failed() const {
   2390   return stream.scanner->failed();
   2391 }
   2392 
   2393 Node *Document::parseBlockNode() {
   2394   Token T = peekNext();
   2395   // Handle properties.
   2396   Token AnchorInfo;
   2397   Token TagInfo;
   2398 parse_property:
   2399   switch (T.Kind) {
   2400   case Token::TK_Alias:
   2401     getNext();
   2402     return new (NodeAllocator) AliasNode(stream.CurrentDoc, T.Range.substr(1));
   2403   case Token::TK_Anchor:
   2404     if (AnchorInfo.Kind == Token::TK_Anchor) {
   2405       setError("Already encountered an anchor for this node!", T);
   2406       return nullptr;
   2407     }
   2408     AnchorInfo = getNext(); // Consume TK_Anchor.
   2409     T = peekNext();
   2410     goto parse_property;
   2411   case Token::TK_Tag:
   2412     if (TagInfo.Kind == Token::TK_Tag) {
   2413       setError("Already encountered a tag for this node!", T);
   2414       return nullptr;
   2415     }
   2416     TagInfo = getNext(); // Consume TK_Tag.
   2417     T = peekNext();
   2418     goto parse_property;
   2419   default:
   2420     break;
   2421   }
   2422 
   2423   switch (T.Kind) {
   2424   case Token::TK_BlockEntry:
   2425     // We got an unindented BlockEntry sequence. This is not terminated with
   2426     // a BlockEnd.
   2427     // Don't eat the TK_BlockEntry, SequenceNode needs it.
   2428     return new (NodeAllocator) SequenceNode( stream.CurrentDoc
   2429                                            , AnchorInfo.Range.substr(1)
   2430                                            , TagInfo.Range
   2431                                            , SequenceNode::ST_Indentless);
   2432   case Token::TK_BlockSequenceStart:
   2433     getNext();
   2434     return new (NodeAllocator)
   2435       SequenceNode( stream.CurrentDoc
   2436                   , AnchorInfo.Range.substr(1)
   2437                   , TagInfo.Range
   2438                   , SequenceNode::ST_Block);
   2439   case Token::TK_BlockMappingStart:
   2440     getNext();
   2441     return new (NodeAllocator)
   2442       MappingNode( stream.CurrentDoc
   2443                  , AnchorInfo.Range.substr(1)
   2444                  , TagInfo.Range
   2445                  , MappingNode::MT_Block);
   2446   case Token::TK_FlowSequenceStart:
   2447     getNext();
   2448     return new (NodeAllocator)
   2449       SequenceNode( stream.CurrentDoc
   2450                   , AnchorInfo.Range.substr(1)
   2451                   , TagInfo.Range
   2452                   , SequenceNode::ST_Flow);
   2453   case Token::TK_FlowMappingStart:
   2454     getNext();
   2455     return new (NodeAllocator)
   2456       MappingNode( stream.CurrentDoc
   2457                  , AnchorInfo.Range.substr(1)
   2458                  , TagInfo.Range
   2459                  , MappingNode::MT_Flow);
   2460   case Token::TK_Scalar:
   2461     getNext();
   2462     return new (NodeAllocator)
   2463       ScalarNode( stream.CurrentDoc
   2464                 , AnchorInfo.Range.substr(1)
   2465                 , TagInfo.Range
   2466                 , T.Range);
   2467   case Token::TK_BlockScalar: {
   2468     getNext();
   2469     StringRef NullTerminatedStr(T.Value.c_str(), T.Value.length() + 1);
   2470     StringRef StrCopy = NullTerminatedStr.copy(NodeAllocator).drop_back();
   2471     return new (NodeAllocator)
   2472         BlockScalarNode(stream.CurrentDoc, AnchorInfo.Range.substr(1),
   2473                         TagInfo.Range, StrCopy, T.Range);
   2474   }
   2475   case Token::TK_Key:
   2476     // Don't eat the TK_Key, KeyValueNode expects it.
   2477     return new (NodeAllocator)
   2478       MappingNode( stream.CurrentDoc
   2479                  , AnchorInfo.Range.substr(1)
   2480                  , TagInfo.Range
   2481                  , MappingNode::MT_Inline);
   2482   case Token::TK_DocumentStart:
   2483   case Token::TK_DocumentEnd:
   2484   case Token::TK_StreamEnd:
   2485   default:
   2486     // TODO: Properly handle tags. "[!!str ]" should resolve to !!str "", not
   2487     //       !!null null.
   2488     return new (NodeAllocator) NullNode(stream.CurrentDoc);
   2489   case Token::TK_FlowMappingEnd:
   2490   case Token::TK_FlowSequenceEnd:
   2491   case Token::TK_FlowEntry: {
   2492     if (Root && (isa<MappingNode>(Root) || isa<SequenceNode>(Root)))
   2493       return new (NodeAllocator) NullNode(stream.CurrentDoc);
   2494 
   2495     setError("Unexpected token", T);
   2496     return nullptr;
   2497   }
   2498   case Token::TK_Error:
   2499     return nullptr;
   2500   }
   2501   llvm_unreachable("Control flow shouldn't reach here.");
   2502   return nullptr;
   2503 }
   2504 
   2505 bool Document::parseDirectives() {
   2506   bool isDirective = false;
   2507   while (true) {
   2508     Token T = peekNext();
   2509     if (T.Kind == Token::TK_TagDirective) {
   2510       parseTAGDirective();
   2511       isDirective = true;
   2512     } else if (T.Kind == Token::TK_VersionDirective) {
   2513       parseYAMLDirective();
   2514       isDirective = true;
   2515     } else
   2516       break;
   2517   }
   2518   return isDirective;
   2519 }
   2520 
   2521 void Document::parseYAMLDirective() {
   2522   getNext(); // Eat %YAML <version>
   2523 }
   2524 
   2525 void Document::parseTAGDirective() {
   2526   Token Tag = getNext(); // %TAG <handle> <prefix>
   2527   StringRef T = Tag.Range;
   2528   // Strip %TAG
   2529   T = T.substr(T.find_first_of(" \t")).ltrim(" \t");
   2530   std::size_t HandleEnd = T.find_first_of(" \t");
   2531   StringRef TagHandle = T.substr(0, HandleEnd);
   2532   StringRef TagPrefix = T.substr(HandleEnd).ltrim(" \t");
   2533   TagMap[TagHandle] = TagPrefix;
   2534 }
   2535 
   2536 bool Document::expectToken(int TK) {
   2537   Token T = getNext();
   2538   if (T.Kind != TK) {
   2539     setError("Unexpected token", T);
   2540     return false;
   2541   }
   2542   return true;
   2543 }
   2544