Home | History | Annotate | Line # | Download | only in Support
      1 //===- YAMLParser.h - Simple YAML parser ------------------------*- C++ -*-===//
      2 //
      3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
      4 // See https://llvm.org/LICENSE.txt for license information.
      5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
      6 //
      7 //===----------------------------------------------------------------------===//
      8 //
      9 //  This is a YAML 1.2 parser.
     10 //
     11 //  See http://www.yaml.org/spec/1.2/spec.html for the full standard.
     12 //
     13 //  This currently does not implement the following:
     14 //    * Multi-line literal folding.
     15 //    * Tag resolution.
     16 //    * UTF-16.
     17 //    * BOMs anywhere other than the first Unicode scalar value in the file.
     18 //
     19 //  The most important class here is Stream. This represents a YAML stream with
     20 //  0, 1, or many documents.
     21 //
     22 //  SourceMgr sm;
     23 //  StringRef input = getInput();
     24 //  yaml::Stream stream(input, sm);
     25 //
     26 //  for (yaml::document_iterator di = stream.begin(), de = stream.end();
     27 //       di != de; ++di) {
     28 //    yaml::Node *n = di->getRoot();
     29 //    if (n) {
     30 //      // Do something with n...
     31 //    } else
     32 //      break;
     33 //  }
     34 //
     35 //===----------------------------------------------------------------------===//
     36 
     37 #ifndef LLVM_SUPPORT_YAMLPARSER_H
     38 #define LLVM_SUPPORT_YAMLPARSER_H
     39 
     40 #include "llvm/ADT/StringRef.h"
     41 #include "llvm/Support/Allocator.h"
     42 #include "llvm/Support/SMLoc.h"
     43 #include "llvm/Support/SourceMgr.h"
     44 #include <cassert>
     45 #include <cstddef>
     46 #include <iterator>
     47 #include <map>
     48 #include <memory>
     49 #include <string>
     50 #include <system_error>
     51 
     52 namespace llvm {
     53 
     54 class MemoryBufferRef;
     55 class raw_ostream;
     56 class Twine;
     57 
     58 namespace yaml {
     59 
     60 class Document;
     61 class document_iterator;
     62 class Node;
     63 class Scanner;
     64 struct Token;
     65 
     66 /// Dump all the tokens in this stream to OS.
     67 /// \returns true if there was an error, false otherwise.
     68 bool dumpTokens(StringRef Input, raw_ostream &);
     69 
     70 /// Scans all tokens in input without outputting anything. This is used
     71 ///        for benchmarking the tokenizer.
     72 /// \returns true if there was an error, false otherwise.
     73 bool scanTokens(StringRef Input);
     74 
     75 /// Escape \a Input for a double quoted scalar; if \p EscapePrintable
     76 /// is true, all UTF8 sequences will be escaped, if \p EscapePrintable is
     77 /// false, those UTF8 sequences encoding printable unicode scalars will not be
     78 /// escaped, but emitted verbatim.
     79 std::string escape(StringRef Input, bool EscapePrintable = true);
     80 
     81 /// Parse \p S as a bool according to https://yaml.org/type/bool.html.
     82 llvm::Optional<bool> parseBool(StringRef S);
     83 
     84 /// This class represents a YAML stream potentially containing multiple
     85 ///        documents.
     86 class Stream {
     87 public:
     88   /// This keeps a reference to the string referenced by \p Input.
     89   Stream(StringRef Input, SourceMgr &, bool ShowColors = true,
     90          std::error_code *EC = nullptr);
     91 
     92   Stream(MemoryBufferRef InputBuffer, SourceMgr &, bool ShowColors = true,
     93          std::error_code *EC = nullptr);
     94   ~Stream();
     95 
     96   document_iterator begin();
     97   document_iterator end();
     98   void skip();
     99   bool failed();
    100 
    101   bool validate() {
    102     skip();
    103     return !failed();
    104   }
    105 
    106   void printError(Node *N, const Twine &Msg,
    107                   SourceMgr::DiagKind Kind = SourceMgr::DK_Error);
    108   void printError(const SMRange &Range, const Twine &Msg,
    109                   SourceMgr::DiagKind Kind = SourceMgr::DK_Error);
    110 
    111 private:
    112   friend class Document;
    113 
    114   std::unique_ptr<Scanner> scanner;
    115   std::unique_ptr<Document> CurrentDoc;
    116 };
    117 
    118 /// Abstract base class for all Nodes.
    119 class Node {
    120   virtual void anchor();
    121 
    122 public:
    123   enum NodeKind {
    124     NK_Null,
    125     NK_Scalar,
    126     NK_BlockScalar,
    127     NK_KeyValue,
    128     NK_Mapping,
    129     NK_Sequence,
    130     NK_Alias
    131   };
    132 
    133   Node(unsigned int Type, std::unique_ptr<Document> &, StringRef Anchor,
    134        StringRef Tag);
    135 
    136   // It's not safe to copy YAML nodes; the document is streamed and the position
    137   // is part of the state.
    138   Node(const Node &) = delete;
    139   void operator=(const Node &) = delete;
    140 
    141   void *operator new(size_t Size, BumpPtrAllocator &Alloc,
    142                      size_t Alignment = 16) noexcept {
    143     return Alloc.Allocate(Size, Alignment);
    144   }
    145 
    146   void operator delete(void *Ptr, BumpPtrAllocator &Alloc,
    147                        size_t Size) noexcept {
    148     Alloc.Deallocate(Ptr, Size, 0);
    149   }
    150 
    151   void operator delete(void *) noexcept = delete;
    152 
    153   /// Get the value of the anchor attached to this node. If it does not
    154   ///        have one, getAnchor().size() will be 0.
    155   StringRef getAnchor() const { return Anchor; }
    156 
    157   /// Get the tag as it was written in the document. This does not
    158   ///   perform tag resolution.
    159   StringRef getRawTag() const { return Tag; }
    160 
    161   /// Get the verbatium tag for a given Node. This performs tag resoluton
    162   ///   and substitution.
    163   std::string getVerbatimTag() const;
    164 
    165   SMRange getSourceRange() const { return SourceRange; }
    166   void setSourceRange(SMRange SR) { SourceRange = SR; }
    167 
    168   // These functions forward to Document and Scanner.
    169   Token &peekNext();
    170   Token getNext();
    171   Node *parseBlockNode();
    172   BumpPtrAllocator &getAllocator();
    173   void setError(const Twine &Message, Token &Location) const;
    174   bool failed() const;
    175 
    176   virtual void skip() {}
    177 
    178   unsigned int getType() const { return TypeID; }
    179 
    180 protected:
    181   std::unique_ptr<Document> &Doc;
    182   SMRange SourceRange;
    183 
    184   ~Node() = default;
    185 
    186 private:
    187   unsigned int TypeID;
    188   StringRef Anchor;
    189   /// The tag as typed in the document.
    190   StringRef Tag;
    191 };
    192 
    193 /// A null value.
    194 ///
    195 /// Example:
    196 ///   !!null null
    197 class NullNode final : public Node {
    198   void anchor() override;
    199 
    200 public:
    201   NullNode(std::unique_ptr<Document> &D)
    202       : Node(NK_Null, D, StringRef(), StringRef()) {}
    203 
    204   static bool classof(const Node *N) { return N->getType() == NK_Null; }
    205 };
    206 
    207 /// A scalar node is an opaque datum that can be presented as a
    208 ///        series of zero or more Unicode scalar values.
    209 ///
    210 /// Example:
    211 ///   Adena
    212 class ScalarNode final : public Node {
    213   void anchor() override;
    214 
    215 public:
    216   ScalarNode(std::unique_ptr<Document> &D, StringRef Anchor, StringRef Tag,
    217              StringRef Val)
    218       : Node(NK_Scalar, D, Anchor, Tag), Value(Val) {
    219     SMLoc Start = SMLoc::getFromPointer(Val.begin());
    220     SMLoc End = SMLoc::getFromPointer(Val.end());
    221     SourceRange = SMRange(Start, End);
    222   }
    223 
    224   // Return Value without any escaping or folding or other fun YAML stuff. This
    225   // is the exact bytes that are contained in the file (after conversion to
    226   // utf8).
    227   StringRef getRawValue() const { return Value; }
    228 
    229   /// Gets the value of this node as a StringRef.
    230   ///
    231   /// \param Storage is used to store the content of the returned StringRef if
    232   ///        it requires any modification from how it appeared in the source.
    233   ///        This happens with escaped characters and multi-line literals.
    234   StringRef getValue(SmallVectorImpl<char> &Storage) const;
    235 
    236   static bool classof(const Node *N) {
    237     return N->getType() == NK_Scalar;
    238   }
    239 
    240 private:
    241   StringRef Value;
    242 
    243   StringRef unescapeDoubleQuoted(StringRef UnquotedValue,
    244                                  StringRef::size_type Start,
    245                                  SmallVectorImpl<char> &Storage) const;
    246 };
    247 
    248 /// A block scalar node is an opaque datum that can be presented as a
    249 ///        series of zero or more Unicode scalar values.
    250 ///
    251 /// Example:
    252 ///   |
    253 ///     Hello
    254 ///     World
    255 class BlockScalarNode final : public Node {
    256   void anchor() override;
    257 
    258 public:
    259   BlockScalarNode(std::unique_ptr<Document> &D, StringRef Anchor, StringRef Tag,
    260                   StringRef Value, StringRef RawVal)
    261       : Node(NK_BlockScalar, D, Anchor, Tag), Value(Value) {
    262     SMLoc Start = SMLoc::getFromPointer(RawVal.begin());
    263     SMLoc End = SMLoc::getFromPointer(RawVal.end());
    264     SourceRange = SMRange(Start, End);
    265   }
    266 
    267   /// Gets the value of this node as a StringRef.
    268   StringRef getValue() const { return Value; }
    269 
    270   static bool classof(const Node *N) {
    271     return N->getType() == NK_BlockScalar;
    272   }
    273 
    274 private:
    275   StringRef Value;
    276 };
    277 
    278 /// A key and value pair. While not technically a Node under the YAML
    279 ///        representation graph, it is easier to treat them this way.
    280 ///
    281 /// TODO: Consider making this not a child of Node.
    282 ///
    283 /// Example:
    284 ///   Section: .text
    285 class KeyValueNode final : public Node {
    286   void anchor() override;
    287 
    288 public:
    289   KeyValueNode(std::unique_ptr<Document> &D)
    290       : Node(NK_KeyValue, D, StringRef(), StringRef()) {}
    291 
    292   /// Parse and return the key.
    293   ///
    294   /// This may be called multiple times.
    295   ///
    296   /// \returns The key, or nullptr if failed() == true.
    297   Node *getKey();
    298 
    299   /// Parse and return the value.
    300   ///
    301   /// This may be called multiple times.
    302   ///
    303   /// \returns The value, or nullptr if failed() == true.
    304   Node *getValue();
    305 
    306   void skip() override {
    307     if (Node *Key = getKey()) {
    308       Key->skip();
    309       if (Node *Val = getValue())
    310         Val->skip();
    311     }
    312   }
    313 
    314   static bool classof(const Node *N) {
    315     return N->getType() == NK_KeyValue;
    316   }
    317 
    318 private:
    319   Node *Key = nullptr;
    320   Node *Value = nullptr;
    321 };
    322 
    323 /// This is an iterator abstraction over YAML collections shared by both
    324 ///        sequences and maps.
    325 ///
    326 /// BaseT must have a ValueT* member named CurrentEntry and a member function
    327 /// increment() which must set CurrentEntry to 0 to create an end iterator.
    328 template <class BaseT, class ValueT> class basic_collection_iterator {
    329 public:
    330   using iterator_category = std::input_iterator_tag;
    331   using value_type = ValueT;
    332   using difference_type = std::ptrdiff_t;
    333   using pointer = value_type *;
    334   using reference = value_type &;
    335 
    336   basic_collection_iterator() = default;
    337   basic_collection_iterator(BaseT *B) : Base(B) {}
    338 
    339   ValueT *operator->() const {
    340     assert(Base && Base->CurrentEntry && "Attempted to access end iterator!");
    341     return Base->CurrentEntry;
    342   }
    343 
    344   ValueT &operator*() const {
    345     assert(Base && Base->CurrentEntry &&
    346            "Attempted to dereference end iterator!");
    347     return *Base->CurrentEntry;
    348   }
    349 
    350   operator ValueT *() const {
    351     assert(Base && Base->CurrentEntry && "Attempted to access end iterator!");
    352     return Base->CurrentEntry;
    353   }
    354 
    355   /// Note on EqualityComparable:
    356   ///
    357   /// The iterator is not re-entrant,
    358   /// it is meant to be used for parsing YAML on-demand
    359   /// Once iteration started - it can point only to one entry at a time
    360   /// hence Base.CurrentEntry and Other.Base.CurrentEntry are equal
    361   /// iff Base and Other.Base are equal.
    362   bool operator==(const basic_collection_iterator &Other) const {
    363     if (Base && (Base == Other.Base)) {
    364       assert((Base->CurrentEntry == Other.Base->CurrentEntry)
    365              && "Equal Bases expected to point to equal Entries");
    366     }
    367 
    368     return Base == Other.Base;
    369   }
    370 
    371   bool operator!=(const basic_collection_iterator &Other) const {
    372     return !(Base == Other.Base);
    373   }
    374 
    375   basic_collection_iterator &operator++() {
    376     assert(Base && "Attempted to advance iterator past end!");
    377     Base->increment();
    378     // Create an end iterator.
    379     if (!Base->CurrentEntry)
    380       Base = nullptr;
    381     return *this;
    382   }
    383 
    384 private:
    385   BaseT *Base = nullptr;
    386 };
    387 
    388 // The following two templates are used for both MappingNode and Sequence Node.
    389 template <class CollectionType>
    390 typename CollectionType::iterator begin(CollectionType &C) {
    391   assert(C.IsAtBeginning && "You may only iterate over a collection once!");
    392   C.IsAtBeginning = false;
    393   typename CollectionType::iterator ret(&C);
    394   ++ret;
    395   return ret;
    396 }
    397 
    398 template <class CollectionType> void skip(CollectionType &C) {
    399   // TODO: support skipping from the middle of a parsed collection ;/
    400   assert((C.IsAtBeginning || C.IsAtEnd) && "Cannot skip mid parse!");
    401   if (C.IsAtBeginning)
    402     for (typename CollectionType::iterator i = begin(C), e = C.end(); i != e;
    403          ++i)
    404       i->skip();
    405 }
    406 
    407 /// Represents a YAML map created from either a block map for a flow map.
    408 ///
    409 /// This parses the YAML stream as increment() is called.
    410 ///
    411 /// Example:
    412 ///   Name: _main
    413 ///   Scope: Global
    414 class MappingNode final : public Node {
    415   void anchor() override;
    416 
    417 public:
    418   enum MappingType {
    419     MT_Block,
    420     MT_Flow,
    421     MT_Inline ///< An inline mapping node is used for "[key: value]".
    422   };
    423 
    424   MappingNode(std::unique_ptr<Document> &D, StringRef Anchor, StringRef Tag,
    425               MappingType MT)
    426       : Node(NK_Mapping, D, Anchor, Tag), Type(MT) {}
    427 
    428   friend class basic_collection_iterator<MappingNode, KeyValueNode>;
    429 
    430   using iterator = basic_collection_iterator<MappingNode, KeyValueNode>;
    431 
    432   template <class T> friend typename T::iterator yaml::begin(T &);
    433   template <class T> friend void yaml::skip(T &);
    434 
    435   iterator begin() { return yaml::begin(*this); }
    436 
    437   iterator end() { return iterator(); }
    438 
    439   void skip() override { yaml::skip(*this); }
    440 
    441   static bool classof(const Node *N) {
    442     return N->getType() == NK_Mapping;
    443   }
    444 
    445 private:
    446   MappingType Type;
    447   bool IsAtBeginning = true;
    448   bool IsAtEnd = false;
    449   KeyValueNode *CurrentEntry = nullptr;
    450 
    451   void increment();
    452 };
    453 
    454 /// Represents a YAML sequence created from either a block sequence for a
    455 ///        flow sequence.
    456 ///
    457 /// This parses the YAML stream as increment() is called.
    458 ///
    459 /// Example:
    460 ///   - Hello
    461 ///   - World
    462 class SequenceNode final : public Node {
    463   void anchor() override;
    464 
    465 public:
    466   enum SequenceType {
    467     ST_Block,
    468     ST_Flow,
    469     // Use for:
    470     //
    471     // key:
    472     // - val1
    473     // - val2
    474     //
    475     // As a BlockMappingEntry and BlockEnd are not created in this case.
    476     ST_Indentless
    477   };
    478 
    479   SequenceNode(std::unique_ptr<Document> &D, StringRef Anchor, StringRef Tag,
    480                SequenceType ST)
    481       : Node(NK_Sequence, D, Anchor, Tag), SeqType(ST) {}
    482 
    483   friend class basic_collection_iterator<SequenceNode, Node>;
    484 
    485   using iterator = basic_collection_iterator<SequenceNode, Node>;
    486 
    487   template <class T> friend typename T::iterator yaml::begin(T &);
    488   template <class T> friend void yaml::skip(T &);
    489 
    490   void increment();
    491 
    492   iterator begin() { return yaml::begin(*this); }
    493 
    494   iterator end() { return iterator(); }
    495 
    496   void skip() override { yaml::skip(*this); }
    497 
    498   static bool classof(const Node *N) {
    499     return N->getType() == NK_Sequence;
    500   }
    501 
    502 private:
    503   SequenceType SeqType;
    504   bool IsAtBeginning = true;
    505   bool IsAtEnd = false;
    506   bool WasPreviousTokenFlowEntry = true; // Start with an imaginary ','.
    507   Node *CurrentEntry = nullptr;
    508 };
    509 
    510 /// Represents an alias to a Node with an anchor.
    511 ///
    512 /// Example:
    513 ///   *AnchorName
    514 class AliasNode final : public Node {
    515   void anchor() override;
    516 
    517 public:
    518   AliasNode(std::unique_ptr<Document> &D, StringRef Val)
    519       : Node(NK_Alias, D, StringRef(), StringRef()), Name(Val) {}
    520 
    521   StringRef getName() const { return Name; }
    522 
    523   static bool classof(const Node *N) { return N->getType() == NK_Alias; }
    524 
    525 private:
    526   StringRef Name;
    527 };
    528 
    529 /// A YAML Stream is a sequence of Documents. A document contains a root
    530 ///        node.
    531 class Document {
    532 public:
    533   Document(Stream &ParentStream);
    534 
    535   /// Root for parsing a node. Returns a single node.
    536   Node *parseBlockNode();
    537 
    538   /// Finish parsing the current document and return true if there are
    539   ///        more. Return false otherwise.
    540   bool skip();
    541 
    542   /// Parse and return the root level node.
    543   Node *getRoot() {
    544     if (Root)
    545       return Root;
    546     return Root = parseBlockNode();
    547   }
    548 
    549   const std::map<StringRef, StringRef> &getTagMap() const { return TagMap; }
    550 
    551 private:
    552   friend class Node;
    553   friend class document_iterator;
    554 
    555   /// Stream to read tokens from.
    556   Stream &stream;
    557 
    558   /// Used to allocate nodes to. All are destroyed without calling their
    559   ///        destructor when the document is destroyed.
    560   BumpPtrAllocator NodeAllocator;
    561 
    562   /// The root node. Used to support skipping a partially parsed
    563   ///        document.
    564   Node *Root;
    565 
    566   /// Maps tag prefixes to their expansion.
    567   std::map<StringRef, StringRef> TagMap;
    568 
    569   Token &peekNext();
    570   Token getNext();
    571   void setError(const Twine &Message, Token &Location) const;
    572   bool failed() const;
    573 
    574   /// Parse %BLAH directives and return true if any were encountered.
    575   bool parseDirectives();
    576 
    577   /// Parse %YAML
    578   void parseYAMLDirective();
    579 
    580   /// Parse %TAG
    581   void parseTAGDirective();
    582 
    583   /// Consume the next token and error if it is not \a TK.
    584   bool expectToken(int TK);
    585 };
    586 
    587 /// Iterator abstraction for Documents over a Stream.
    588 class document_iterator {
    589 public:
    590   document_iterator() = default;
    591   document_iterator(std::unique_ptr<Document> &D) : Doc(&D) {}
    592 
    593   bool operator==(const document_iterator &Other) const {
    594     if (isAtEnd() || Other.isAtEnd())
    595       return isAtEnd() && Other.isAtEnd();
    596 
    597     return Doc == Other.Doc;
    598   }
    599   bool operator!=(const document_iterator &Other) const {
    600     return !(*this == Other);
    601   }
    602 
    603   document_iterator operator++() {
    604     assert(Doc && "incrementing iterator past the end.");
    605     if (!(*Doc)->skip()) {
    606       Doc->reset(nullptr);
    607     } else {
    608       Stream &S = (*Doc)->stream;
    609       Doc->reset(new Document(S));
    610     }
    611     return *this;
    612   }
    613 
    614   Document &operator*() { return *Doc->get(); }
    615 
    616   std::unique_ptr<Document> &operator->() { return *Doc; }
    617 
    618 private:
    619   bool isAtEnd() const { return !Doc || !*Doc; }
    620 
    621   std::unique_ptr<Document> *Doc = nullptr;
    622 };
    623 
    624 } // end namespace yaml
    625 
    626 } // end namespace llvm
    627 
    628 #endif // LLVM_SUPPORT_YAMLPARSER_H
    629