1 | //===- YAMLParser.h - Simple YAML parser ------------------------*- C++ -*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This is a YAML 1.2 parser. |
10 | // |
11 | // See http://www.yaml.org/spec/1.2/spec.html for the full standard. |
12 | // |
13 | // This currently does not implement the following: |
14 | // * Tag resolution. |
15 | // * UTF-16. |
16 | // * BOMs anywhere other than the first Unicode scalar value in the file. |
17 | // |
18 | // The most important class here is Stream. This represents a YAML stream with |
19 | // 0, 1, or many documents. |
20 | // |
21 | // SourceMgr sm; |
22 | // StringRef input = getInput(); |
23 | // yaml::Stream stream(input, sm); |
24 | // |
25 | // for (yaml::document_iterator di = stream.begin(), de = stream.end(); |
26 | // di != de; ++di) { |
27 | // yaml::Node *n = di->getRoot(); |
28 | // if (n) { |
29 | // // Do something with n... |
30 | // } else |
31 | // break; |
32 | // } |
33 | // |
34 | //===----------------------------------------------------------------------===// |
35 | |
36 | #ifndef LLVM_SUPPORT_YAMLPARSER_H |
37 | #define LLVM_SUPPORT_YAMLPARSER_H |
38 | |
39 | #include "llvm/ADT/StringRef.h" |
40 | #include "llvm/Support/Allocator.h" |
41 | #include "llvm/Support/SMLoc.h" |
42 | #include "llvm/Support/SourceMgr.h" |
43 | #include <cassert> |
44 | #include <cstddef> |
45 | #include <iterator> |
46 | #include <map> |
47 | #include <memory> |
48 | #include <optional> |
49 | #include <string> |
50 | #include <system_error> |
51 | |
52 | namespace llvm { |
53 | |
54 | class MemoryBufferRef; |
55 | class raw_ostream; |
56 | class Twine; |
57 | |
58 | namespace yaml { |
59 | |
60 | class Document; |
61 | class document_iterator; |
62 | class Node; |
63 | class Scanner; |
64 | struct Token; |
65 | |
66 | /// Dump all the tokens in this stream to OS. |
67 | /// \returns true if there was an error, false otherwise. |
68 | bool dumpTokens(StringRef Input, raw_ostream &); |
69 | |
70 | /// Scans all tokens in input without outputting anything. This is used |
71 | /// for benchmarking the tokenizer. |
72 | /// \returns true if there was an error, false otherwise. |
73 | bool scanTokens(StringRef Input); |
74 | |
75 | /// Escape \a Input for a double quoted scalar; if \p EscapePrintable |
76 | /// is true, all UTF8 sequences will be escaped, if \p EscapePrintable is |
77 | /// false, those UTF8 sequences encoding printable unicode scalars will not be |
78 | /// escaped, but emitted verbatim. |
79 | std::string escape(StringRef Input, bool EscapePrintable = true); |
80 | |
81 | /// Parse \p S as a bool according to https://yaml.org/type/bool.html. |
82 | std::optional<bool> parseBool(StringRef S); |
83 | |
84 | /// This class represents a YAML stream potentially containing multiple |
85 | /// documents. |
86 | class Stream { |
87 | public: |
88 | /// This keeps a reference to the string referenced by \p Input. |
89 | Stream(StringRef Input, SourceMgr &, bool ShowColors = true, |
90 | std::error_code *EC = nullptr); |
91 | |
92 | Stream(MemoryBufferRef InputBuffer, SourceMgr &, bool ShowColors = true, |
93 | std::error_code *EC = nullptr); |
94 | ~Stream(); |
95 | |
96 | document_iterator begin(); |
97 | document_iterator end(); |
98 | void skip(); |
99 | bool failed(); |
100 | |
101 | bool validate() { |
102 | skip(); |
103 | return !failed(); |
104 | } |
105 | |
106 | void printError(Node *N, const Twine &Msg, |
107 | SourceMgr::DiagKind Kind = SourceMgr::DK_Error); |
108 | void printError(const SMRange &Range, const Twine &Msg, |
109 | SourceMgr::DiagKind Kind = SourceMgr::DK_Error); |
110 | |
111 | private: |
112 | friend class Document; |
113 | |
114 | std::unique_ptr<Scanner> scanner; |
115 | std::unique_ptr<Document> CurrentDoc; |
116 | }; |
117 | |
118 | /// Abstract base class for all Nodes. |
119 | class Node { |
120 | virtual void anchor(); |
121 | |
122 | public: |
123 | enum NodeKind { |
124 | NK_Null, |
125 | NK_Scalar, |
126 | NK_BlockScalar, |
127 | NK_KeyValue, |
128 | NK_Mapping, |
129 | NK_Sequence, |
130 | NK_Alias |
131 | }; |
132 | |
133 | Node(unsigned int Type, std::unique_ptr<Document> &, StringRef Anchor, |
134 | StringRef Tag); |
135 | |
136 | // It's not safe to copy YAML nodes; the document is streamed and the position |
137 | // is part of the state. |
138 | Node(const Node &) = delete; |
139 | void operator=(const Node &) = delete; |
140 | |
141 | void *operator new(size_t Size, BumpPtrAllocator &Alloc, |
142 | size_t Alignment = 16) noexcept { |
143 | return Alloc.Allocate(Size, Alignment); |
144 | } |
145 | |
146 | void operator delete(void *Ptr, BumpPtrAllocator &Alloc, |
147 | size_t Size) noexcept { |
148 | Alloc.Deallocate(Ptr, Size, 0); |
149 | } |
150 | |
151 | void operator delete(void *) noexcept = delete; |
152 | |
153 | /// Get the value of the anchor attached to this node. If it does not |
154 | /// have one, getAnchor().size() will be 0. |
155 | StringRef getAnchor() const { return Anchor; } |
156 | |
157 | /// Get the tag as it was written in the document. This does not |
158 | /// perform tag resolution. |
159 | StringRef getRawTag() const { return Tag; } |
160 | |
161 | /// Get the verbatium tag for a given Node. This performs tag resoluton |
162 | /// and substitution. |
163 | std::string getVerbatimTag() const; |
164 | |
165 | SMRange getSourceRange() const { return SourceRange; } |
166 | void setSourceRange(SMRange SR) { SourceRange = SR; } |
167 | |
168 | // These functions forward to Document and Scanner. |
169 | Token &peekNext(); |
170 | Token getNext(); |
171 | Node *parseBlockNode(); |
172 | BumpPtrAllocator &getAllocator(); |
173 | void setError(const Twine &Message, Token &Location) const; |
174 | bool failed() const; |
175 | |
176 | virtual void skip() {} |
177 | |
178 | unsigned int getType() const { return TypeID; } |
179 | |
180 | protected: |
181 | std::unique_ptr<Document> &Doc; |
182 | SMRange SourceRange; |
183 | |
184 | ~Node() = default; |
185 | |
186 | private: |
187 | unsigned int TypeID; |
188 | StringRef Anchor; |
189 | /// The tag as typed in the document. |
190 | StringRef Tag; |
191 | }; |
192 | |
193 | /// A null value. |
194 | /// |
195 | /// Example: |
196 | /// !!null null |
197 | class NullNode final : public Node { |
198 | void anchor() override; |
199 | |
200 | public: |
201 | NullNode(std::unique_ptr<Document> &D) |
202 | : Node(NK_Null, D, StringRef(), StringRef()) {} |
203 | |
204 | static bool classof(const Node *N) { return N->getType() == NK_Null; } |
205 | }; |
206 | |
207 | /// A scalar node is an opaque datum that can be presented as a |
208 | /// series of zero or more Unicode scalar values. |
209 | /// |
210 | /// Example: |
211 | /// Adena |
212 | class ScalarNode final : public Node { |
213 | void anchor() override; |
214 | |
215 | public: |
216 | ScalarNode(std::unique_ptr<Document> &D, StringRef Anchor, StringRef Tag, |
217 | StringRef Val) |
218 | : Node(NK_Scalar, D, Anchor, Tag), Value(Val) { |
219 | SMLoc Start = SMLoc::getFromPointer(Ptr: Val.begin()); |
220 | SMLoc End = SMLoc::getFromPointer(Ptr: Val.end()); |
221 | SourceRange = SMRange(Start, End); |
222 | } |
223 | |
224 | // Return Value without any escaping or folding or other fun YAML stuff. This |
225 | // is the exact bytes that are contained in the file (after conversion to |
226 | // utf8). |
227 | StringRef getRawValue() const { return Value; } |
228 | |
229 | /// Gets the value of this node as a StringRef. |
230 | /// |
231 | /// \param Storage is used to store the content of the returned StringRef if |
232 | /// it requires any modification from how it appeared in the source. |
233 | /// This happens with escaped characters and multi-line literals. |
234 | StringRef getValue(SmallVectorImpl<char> &Storage) const; |
235 | |
236 | static bool classof(const Node *N) { |
237 | return N->getType() == NK_Scalar; |
238 | } |
239 | |
240 | private: |
241 | StringRef Value; |
242 | |
243 | StringRef getDoubleQuotedValue(StringRef UnquotedValue, |
244 | SmallVectorImpl<char> &Storage) const; |
245 | |
246 | static StringRef getSingleQuotedValue(StringRef RawValue, |
247 | SmallVectorImpl<char> &Storage); |
248 | |
249 | static StringRef getPlainValue(StringRef RawValue, |
250 | SmallVectorImpl<char> &Storage); |
251 | }; |
252 | |
253 | /// A block scalar node is an opaque datum that can be presented as a |
254 | /// series of zero or more Unicode scalar values. |
255 | /// |
256 | /// Example: |
257 | /// | |
258 | /// Hello |
259 | /// World |
260 | class BlockScalarNode final : public Node { |
261 | void anchor() override; |
262 | |
263 | public: |
264 | BlockScalarNode(std::unique_ptr<Document> &D, StringRef Anchor, StringRef Tag, |
265 | StringRef Value, StringRef RawVal) |
266 | : Node(NK_BlockScalar, D, Anchor, Tag), Value(Value) { |
267 | SMLoc Start = SMLoc::getFromPointer(Ptr: RawVal.begin()); |
268 | SMLoc End = SMLoc::getFromPointer(Ptr: RawVal.end()); |
269 | SourceRange = SMRange(Start, End); |
270 | } |
271 | |
272 | /// Gets the value of this node as a StringRef. |
273 | StringRef getValue() const { return Value; } |
274 | |
275 | static bool classof(const Node *N) { |
276 | return N->getType() == NK_BlockScalar; |
277 | } |
278 | |
279 | private: |
280 | StringRef Value; |
281 | }; |
282 | |
283 | /// A key and value pair. While not technically a Node under the YAML |
284 | /// representation graph, it is easier to treat them this way. |
285 | /// |
286 | /// TODO: Consider making this not a child of Node. |
287 | /// |
288 | /// Example: |
289 | /// Section: .text |
290 | class KeyValueNode final : public Node { |
291 | void anchor() override; |
292 | |
293 | public: |
294 | KeyValueNode(std::unique_ptr<Document> &D) |
295 | : Node(NK_KeyValue, D, StringRef(), StringRef()) {} |
296 | |
297 | /// Parse and return the key. |
298 | /// |
299 | /// This may be called multiple times. |
300 | /// |
301 | /// \returns The key, or nullptr if failed() == true. |
302 | Node *getKey(); |
303 | |
304 | /// Parse and return the value. |
305 | /// |
306 | /// This may be called multiple times. |
307 | /// |
308 | /// \returns The value, or nullptr if failed() == true. |
309 | Node *getValue(); |
310 | |
311 | void skip() override { |
312 | if (Node *Key = getKey()) { |
313 | Key->skip(); |
314 | if (Node *Val = getValue()) |
315 | Val->skip(); |
316 | } |
317 | } |
318 | |
319 | static bool classof(const Node *N) { |
320 | return N->getType() == NK_KeyValue; |
321 | } |
322 | |
323 | private: |
324 | Node *Key = nullptr; |
325 | Node *Value = nullptr; |
326 | }; |
327 | |
328 | /// This is an iterator abstraction over YAML collections shared by both |
329 | /// sequences and maps. |
330 | /// |
331 | /// BaseT must have a ValueT* member named CurrentEntry and a member function |
332 | /// increment() which must set CurrentEntry to 0 to create an end iterator. |
333 | template <class BaseT, class ValueT> class basic_collection_iterator { |
334 | public: |
335 | using iterator_category = std::input_iterator_tag; |
336 | using value_type = ValueT; |
337 | using difference_type = std::ptrdiff_t; |
338 | using pointer = value_type *; |
339 | using reference = value_type &; |
340 | |
341 | basic_collection_iterator() = default; |
342 | basic_collection_iterator(BaseT *B) : Base(B) {} |
343 | |
344 | ValueT *operator->() const { |
345 | assert(Base && Base->CurrentEntry && "Attempted to access end iterator!" ); |
346 | return Base->CurrentEntry; |
347 | } |
348 | |
349 | ValueT &operator*() const { |
350 | assert(Base && Base->CurrentEntry && |
351 | "Attempted to dereference end iterator!" ); |
352 | return *Base->CurrentEntry; |
353 | } |
354 | |
355 | operator ValueT *() const { |
356 | assert(Base && Base->CurrentEntry && "Attempted to access end iterator!" ); |
357 | return Base->CurrentEntry; |
358 | } |
359 | |
360 | /// Note on EqualityComparable: |
361 | /// |
362 | /// The iterator is not re-entrant, |
363 | /// it is meant to be used for parsing YAML on-demand |
364 | /// Once iteration started - it can point only to one entry at a time |
365 | /// hence Base.CurrentEntry and Other.Base.CurrentEntry are equal |
366 | /// iff Base and Other.Base are equal. |
367 | bool operator==(const basic_collection_iterator &Other) const { |
368 | if (Base && (Base == Other.Base)) { |
369 | assert((Base->CurrentEntry == Other.Base->CurrentEntry) |
370 | && "Equal Bases expected to point to equal Entries" ); |
371 | } |
372 | |
373 | return Base == Other.Base; |
374 | } |
375 | |
376 | bool operator!=(const basic_collection_iterator &Other) const { |
377 | return !(Base == Other.Base); |
378 | } |
379 | |
380 | basic_collection_iterator &operator++() { |
381 | assert(Base && "Attempted to advance iterator past end!" ); |
382 | Base->increment(); |
383 | // Create an end iterator. |
384 | if (!Base->CurrentEntry) |
385 | Base = nullptr; |
386 | return *this; |
387 | } |
388 | |
389 | private: |
390 | BaseT *Base = nullptr; |
391 | }; |
392 | |
393 | // The following two templates are used for both MappingNode and Sequence Node. |
394 | template <class CollectionType> |
395 | typename CollectionType::iterator begin(CollectionType &C) { |
396 | assert(C.IsAtBeginning && "You may only iterate over a collection once!" ); |
397 | C.IsAtBeginning = false; |
398 | typename CollectionType::iterator ret(&C); |
399 | ++ret; |
400 | return ret; |
401 | } |
402 | |
403 | template <class CollectionType> void skip(CollectionType &C) { |
404 | // TODO: support skipping from the middle of a parsed collection ;/ |
405 | assert((C.IsAtBeginning || C.IsAtEnd) && "Cannot skip mid parse!" ); |
406 | if (C.IsAtBeginning) |
407 | for (typename CollectionType::iterator i = begin(C), e = C.end(); i != e; |
408 | ++i) |
409 | i->skip(); |
410 | } |
411 | |
412 | /// Represents a YAML map created from either a block map for a flow map. |
413 | /// |
414 | /// This parses the YAML stream as increment() is called. |
415 | /// |
416 | /// Example: |
417 | /// Name: _main |
418 | /// Scope: Global |
419 | class MappingNode final : public Node { |
420 | void anchor() override; |
421 | |
422 | public: |
423 | enum MappingType { |
424 | MT_Block, |
425 | MT_Flow, |
426 | MT_Inline ///< An inline mapping node is used for "[key: value]". |
427 | }; |
428 | |
429 | MappingNode(std::unique_ptr<Document> &D, StringRef Anchor, StringRef Tag, |
430 | MappingType MT) |
431 | : Node(NK_Mapping, D, Anchor, Tag), Type(MT) {} |
432 | |
433 | friend class basic_collection_iterator<MappingNode, KeyValueNode>; |
434 | |
435 | using iterator = basic_collection_iterator<MappingNode, KeyValueNode>; |
436 | |
437 | template <class T> friend typename T::iterator yaml::begin(T &); |
438 | template <class T> friend void yaml::skip(T &); |
439 | |
440 | iterator begin() { return yaml::begin(C&: *this); } |
441 | |
442 | iterator end() { return iterator(); } |
443 | |
444 | void skip() override { yaml::skip(C&: *this); } |
445 | |
446 | static bool classof(const Node *N) { |
447 | return N->getType() == NK_Mapping; |
448 | } |
449 | |
450 | private: |
451 | MappingType Type; |
452 | bool IsAtBeginning = true; |
453 | bool IsAtEnd = false; |
454 | KeyValueNode *CurrentEntry = nullptr; |
455 | |
456 | void increment(); |
457 | }; |
458 | |
459 | /// Represents a YAML sequence created from either a block sequence for a |
460 | /// flow sequence. |
461 | /// |
462 | /// This parses the YAML stream as increment() is called. |
463 | /// |
464 | /// Example: |
465 | /// - Hello |
466 | /// - World |
467 | class SequenceNode final : public Node { |
468 | void anchor() override; |
469 | |
470 | public: |
471 | enum SequenceType { |
472 | ST_Block, |
473 | ST_Flow, |
474 | // Use for: |
475 | // |
476 | // key: |
477 | // - val1 |
478 | // - val2 |
479 | // |
480 | // As a BlockMappingEntry and BlockEnd are not created in this case. |
481 | ST_Indentless |
482 | }; |
483 | |
484 | SequenceNode(std::unique_ptr<Document> &D, StringRef Anchor, StringRef Tag, |
485 | SequenceType ST) |
486 | : Node(NK_Sequence, D, Anchor, Tag), SeqType(ST) {} |
487 | |
488 | friend class basic_collection_iterator<SequenceNode, Node>; |
489 | |
490 | using iterator = basic_collection_iterator<SequenceNode, Node>; |
491 | |
492 | template <class T> friend typename T::iterator yaml::begin(T &); |
493 | template <class T> friend void yaml::skip(T &); |
494 | |
495 | void increment(); |
496 | |
497 | iterator begin() { return yaml::begin(C&: *this); } |
498 | |
499 | iterator end() { return iterator(); } |
500 | |
501 | void skip() override { yaml::skip(C&: *this); } |
502 | |
503 | static bool classof(const Node *N) { |
504 | return N->getType() == NK_Sequence; |
505 | } |
506 | |
507 | private: |
508 | SequenceType SeqType; |
509 | bool IsAtBeginning = true; |
510 | bool IsAtEnd = false; |
511 | bool WasPreviousTokenFlowEntry = true; // Start with an imaginary ','. |
512 | Node *CurrentEntry = nullptr; |
513 | }; |
514 | |
515 | /// Represents an alias to a Node with an anchor. |
516 | /// |
517 | /// Example: |
518 | /// *AnchorName |
519 | class AliasNode final : public Node { |
520 | void anchor() override; |
521 | |
522 | public: |
523 | AliasNode(std::unique_ptr<Document> &D, StringRef Val) |
524 | : Node(NK_Alias, D, StringRef(), StringRef()), Name(Val) {} |
525 | |
526 | StringRef getName() const { return Name; } |
527 | |
528 | static bool classof(const Node *N) { return N->getType() == NK_Alias; } |
529 | |
530 | private: |
531 | StringRef Name; |
532 | }; |
533 | |
534 | /// A YAML Stream is a sequence of Documents. A document contains a root |
535 | /// node. |
536 | class Document { |
537 | public: |
538 | Document(Stream &ParentStream); |
539 | |
540 | /// Root for parsing a node. Returns a single node. |
541 | Node *parseBlockNode(); |
542 | |
543 | /// Finish parsing the current document and return true if there are |
544 | /// more. Return false otherwise. |
545 | bool skip(); |
546 | |
547 | /// Parse and return the root level node. |
548 | Node *getRoot() { |
549 | if (Root) |
550 | return Root; |
551 | return Root = parseBlockNode(); |
552 | } |
553 | |
554 | const std::map<StringRef, StringRef> &getTagMap() const { return TagMap; } |
555 | |
556 | private: |
557 | friend class Node; |
558 | friend class document_iterator; |
559 | |
560 | /// Stream to read tokens from. |
561 | Stream &stream; |
562 | |
563 | /// Used to allocate nodes to. All are destroyed without calling their |
564 | /// destructor when the document is destroyed. |
565 | BumpPtrAllocator NodeAllocator; |
566 | |
567 | /// The root node. Used to support skipping a partially parsed |
568 | /// document. |
569 | Node *Root; |
570 | |
571 | /// Maps tag prefixes to their expansion. |
572 | std::map<StringRef, StringRef> TagMap; |
573 | |
574 | Token &peekNext(); |
575 | Token getNext(); |
576 | void setError(const Twine &Message, Token &Location) const; |
577 | bool failed() const; |
578 | |
579 | /// Parse %BLAH directives and return true if any were encountered. |
580 | bool parseDirectives(); |
581 | |
582 | /// Parse %YAML |
583 | void parseYAMLDirective(); |
584 | |
585 | /// Parse %TAG |
586 | void parseTAGDirective(); |
587 | |
588 | /// Consume the next token and error if it is not \a TK. |
589 | bool expectToken(int TK); |
590 | }; |
591 | |
592 | /// Iterator abstraction for Documents over a Stream. |
593 | class document_iterator { |
594 | public: |
595 | document_iterator() = default; |
596 | document_iterator(std::unique_ptr<Document> &D) : Doc(&D) {} |
597 | |
598 | bool operator==(const document_iterator &Other) const { |
599 | if (isAtEnd() || Other.isAtEnd()) |
600 | return isAtEnd() && Other.isAtEnd(); |
601 | |
602 | return Doc == Other.Doc; |
603 | } |
604 | bool operator!=(const document_iterator &Other) const { |
605 | return !(*this == Other); |
606 | } |
607 | |
608 | document_iterator operator++() { |
609 | assert(Doc && "incrementing iterator past the end." ); |
610 | if (!(*Doc)->skip()) { |
611 | Doc->reset(p: nullptr); |
612 | } else { |
613 | Stream &S = (*Doc)->stream; |
614 | Doc->reset(p: new Document(S)); |
615 | } |
616 | return *this; |
617 | } |
618 | |
619 | Document &operator*() { return **Doc; } |
620 | |
621 | std::unique_ptr<Document> &operator->() { return *Doc; } |
622 | |
623 | private: |
624 | bool isAtEnd() const { return !Doc || !*Doc; } |
625 | |
626 | std::unique_ptr<Document> *Doc = nullptr; |
627 | }; |
628 | |
629 | } // end namespace yaml |
630 | |
631 | } // end namespace llvm |
632 | |
633 | #endif // LLVM_SUPPORT_YAMLPARSER_H |
634 | |