1 | //===- YAMLParser.h - Simple YAML parser ------------------------*- C++ -*-===// |
2 | // |
3 | // The LLVM Compiler Infrastructure |
4 | // |
5 | // This file is distributed under the University of Illinois Open Source |
6 | // License. See LICENSE.TXT for details. |
7 | // |
8 | //===----------------------------------------------------------------------===// |
9 | // |
10 | // This is a YAML 1.2 parser. |
11 | // |
12 | // See http://www.yaml.org/spec/1.2/spec.html for the full standard. |
13 | // |
14 | // This currently does not implement the following: |
15 | // * Multi-line literal folding. |
16 | // * Tag resolution. |
17 | // * UTF-16. |
18 | // * BOMs anywhere other than the first Unicode scalar value in the file. |
19 | // |
20 | // The most important class here is Stream. This represents a YAML stream with |
21 | // 0, 1, or many documents. |
22 | // |
23 | // SourceMgr sm; |
24 | // StringRef input = getInput(); |
25 | // yaml::Stream stream(input, sm); |
26 | // |
27 | // for (yaml::document_iterator di = stream.begin(), de = stream.end(); |
28 | // di != de; ++di) { |
29 | // yaml::Node *n = di->getRoot(); |
30 | // if (n) { |
31 | // // Do something with n... |
32 | // } else |
33 | // break; |
34 | // } |
35 | // |
36 | //===----------------------------------------------------------------------===// |
37 | |
38 | #ifndef LLVM_SUPPORT_YAMLPARSER_H |
39 | #define LLVM_SUPPORT_YAMLPARSER_H |
40 | |
41 | #include "llvm/ADT/StringRef.h" |
42 | #include "llvm/Support/Allocator.h" |
43 | #include "llvm/Support/SMLoc.h" |
44 | #include <cassert> |
45 | #include <cstddef> |
46 | #include <iterator> |
47 | #include <map> |
48 | #include <memory> |
49 | #include <string> |
50 | #include <system_error> |
51 | |
52 | namespace llvm { |
53 | |
54 | class MemoryBufferRef; |
55 | class SourceMgr; |
56 | class raw_ostream; |
57 | class Twine; |
58 | |
59 | namespace yaml { |
60 | |
61 | class Document; |
62 | class document_iterator; |
63 | class Node; |
64 | class Scanner; |
65 | struct Token; |
66 | |
67 | /// Dump all the tokens in this stream to OS. |
68 | /// \returns true if there was an error, false otherwise. |
69 | bool dumpTokens(StringRef Input, raw_ostream &); |
70 | |
71 | /// Scans all tokens in input without outputting anything. This is used |
72 | /// for benchmarking the tokenizer. |
73 | /// \returns true if there was an error, false otherwise. |
74 | bool scanTokens(StringRef Input); |
75 | |
76 | /// Escape \a Input for a double quoted scalar; if \p EscapePrintable |
77 | /// is true, all UTF8 sequences will be escaped, if \p EscapePrintable is |
78 | /// false, those UTF8 sequences encoding printable unicode scalars will not be |
79 | /// escaped, but emitted verbatim. |
80 | std::string escape(StringRef Input, bool EscapePrintable = true); |
81 | |
82 | /// This class represents a YAML stream potentially containing multiple |
83 | /// documents. |
84 | class Stream { |
85 | public: |
86 | /// This keeps a reference to the string referenced by \p Input. |
87 | Stream(StringRef Input, SourceMgr &, bool ShowColors = true, |
88 | std::error_code *EC = nullptr); |
89 | |
90 | Stream(MemoryBufferRef InputBuffer, SourceMgr &, bool ShowColors = true, |
91 | std::error_code *EC = nullptr); |
92 | ~Stream(); |
93 | |
94 | document_iterator begin(); |
95 | document_iterator end(); |
96 | void skip(); |
97 | bool failed(); |
98 | |
99 | bool validate() { |
100 | skip(); |
101 | return !failed(); |
102 | } |
103 | |
104 | void printError(Node *N, const Twine &Msg); |
105 | |
106 | private: |
107 | friend class Document; |
108 | |
109 | std::unique_ptr<Scanner> scanner; |
110 | std::unique_ptr<Document> CurrentDoc; |
111 | }; |
112 | |
113 | /// Abstract base class for all Nodes. |
114 | class Node { |
115 | virtual void anchor(); |
116 | |
117 | public: |
118 | enum NodeKind { |
119 | NK_Null, |
120 | NK_Scalar, |
121 | NK_BlockScalar, |
122 | NK_KeyValue, |
123 | NK_Mapping, |
124 | NK_Sequence, |
125 | NK_Alias |
126 | }; |
127 | |
128 | Node(unsigned int Type, std::unique_ptr<Document> &, StringRef Anchor, |
129 | StringRef Tag); |
130 | |
131 | // It's not safe to copy YAML nodes; the document is streamed and the position |
132 | // is part of the state. |
133 | Node(const Node &) = delete; |
134 | void operator=(const Node &) = delete; |
135 | |
136 | void *operator new(size_t Size, BumpPtrAllocator &Alloc, |
137 | size_t Alignment = 16) noexcept { |
138 | return Alloc.Allocate(Size, Alignment); |
139 | } |
140 | |
141 | void operator delete(void *Ptr, BumpPtrAllocator &Alloc, |
142 | size_t Size) noexcept { |
143 | Alloc.Deallocate(Ptr, Size); |
144 | } |
145 | |
146 | void operator delete(void *) noexcept = delete; |
147 | |
148 | /// Get the value of the anchor attached to this node. If it does not |
149 | /// have one, getAnchor().size() will be 0. |
150 | StringRef getAnchor() const { return Anchor; } |
151 | |
152 | /// Get the tag as it was written in the document. This does not |
153 | /// perform tag resolution. |
154 | StringRef getRawTag() const { return Tag; } |
155 | |
156 | /// Get the verbatium tag for a given Node. This performs tag resoluton |
157 | /// and substitution. |
158 | std::string getVerbatimTag() const; |
159 | |
160 | SMRange getSourceRange() const { return SourceRange; } |
161 | void setSourceRange(SMRange SR) { SourceRange = SR; } |
162 | |
163 | // These functions forward to Document and Scanner. |
164 | Token &peekNext(); |
165 | Token getNext(); |
166 | Node *parseBlockNode(); |
167 | BumpPtrAllocator &getAllocator(); |
168 | void setError(const Twine &Message, Token &Location) const; |
169 | bool failed() const; |
170 | |
171 | virtual void skip() {} |
172 | |
173 | unsigned int getType() const { return TypeID; } |
174 | |
175 | protected: |
176 | std::unique_ptr<Document> &Doc; |
177 | SMRange SourceRange; |
178 | |
179 | ~Node() = default; |
180 | |
181 | private: |
182 | unsigned int TypeID; |
183 | StringRef Anchor; |
184 | /// The tag as typed in the document. |
185 | StringRef Tag; |
186 | }; |
187 | |
188 | /// A null value. |
189 | /// |
190 | /// Example: |
191 | /// !!null null |
192 | class NullNode final : public Node { |
193 | void anchor() override; |
194 | |
195 | public: |
196 | NullNode(std::unique_ptr<Document> &D) |
197 | : Node(NK_Null, D, StringRef(), StringRef()) {} |
198 | |
199 | static bool classof(const Node *N) { return N->getType() == NK_Null; } |
200 | }; |
201 | |
202 | /// A scalar node is an opaque datum that can be presented as a |
203 | /// series of zero or more Unicode scalar values. |
204 | /// |
205 | /// Example: |
206 | /// Adena |
207 | class ScalarNode final : public Node { |
208 | void anchor() override; |
209 | |
210 | public: |
211 | ScalarNode(std::unique_ptr<Document> &D, StringRef Anchor, StringRef Tag, |
212 | StringRef Val) |
213 | : Node(NK_Scalar, D, Anchor, Tag), Value(Val) { |
214 | SMLoc Start = SMLoc::getFromPointer(Val.begin()); |
215 | SMLoc End = SMLoc::getFromPointer(Val.end()); |
216 | SourceRange = SMRange(Start, End); |
217 | } |
218 | |
219 | // Return Value without any escaping or folding or other fun YAML stuff. This |
220 | // is the exact bytes that are contained in the file (after conversion to |
221 | // utf8). |
222 | StringRef getRawValue() const { return Value; } |
223 | |
224 | /// Gets the value of this node as a StringRef. |
225 | /// |
226 | /// \param Storage is used to store the content of the returned StringRef iff |
227 | /// it requires any modification from how it appeared in the source. |
228 | /// This happens with escaped characters and multi-line literals. |
229 | StringRef getValue(SmallVectorImpl<char> &Storage) const; |
230 | |
231 | static bool classof(const Node *N) { |
232 | return N->getType() == NK_Scalar; |
233 | } |
234 | |
235 | private: |
236 | StringRef Value; |
237 | |
238 | StringRef unescapeDoubleQuoted(StringRef UnquotedValue, |
239 | StringRef::size_type Start, |
240 | SmallVectorImpl<char> &Storage) const; |
241 | }; |
242 | |
243 | /// A block scalar node is an opaque datum that can be presented as a |
244 | /// series of zero or more Unicode scalar values. |
245 | /// |
246 | /// Example: |
247 | /// | |
248 | /// Hello |
249 | /// World |
250 | class BlockScalarNode final : public Node { |
251 | void anchor() override; |
252 | |
253 | public: |
254 | BlockScalarNode(std::unique_ptr<Document> &D, StringRef Anchor, StringRef Tag, |
255 | StringRef Value, StringRef RawVal) |
256 | : Node(NK_BlockScalar, D, Anchor, Tag), Value(Value) { |
257 | SMLoc Start = SMLoc::getFromPointer(RawVal.begin()); |
258 | SMLoc End = SMLoc::getFromPointer(RawVal.end()); |
259 | SourceRange = SMRange(Start, End); |
260 | } |
261 | |
262 | /// Gets the value of this node as a StringRef. |
263 | StringRef getValue() const { return Value; } |
264 | |
265 | static bool classof(const Node *N) { |
266 | return N->getType() == NK_BlockScalar; |
267 | } |
268 | |
269 | private: |
270 | StringRef Value; |
271 | }; |
272 | |
273 | /// A key and value pair. While not technically a Node under the YAML |
274 | /// representation graph, it is easier to treat them this way. |
275 | /// |
276 | /// TODO: Consider making this not a child of Node. |
277 | /// |
278 | /// Example: |
279 | /// Section: .text |
280 | class KeyValueNode final : public Node { |
281 | void anchor() override; |
282 | |
283 | public: |
284 | KeyValueNode(std::unique_ptr<Document> &D) |
285 | : Node(NK_KeyValue, D, StringRef(), StringRef()) {} |
286 | |
287 | /// Parse and return the key. |
288 | /// |
289 | /// This may be called multiple times. |
290 | /// |
291 | /// \returns The key, or nullptr if failed() == true. |
292 | Node *getKey(); |
293 | |
294 | /// Parse and return the value. |
295 | /// |
296 | /// This may be called multiple times. |
297 | /// |
298 | /// \returns The value, or nullptr if failed() == true. |
299 | Node *getValue(); |
300 | |
301 | void skip() override { |
302 | if (Node *Key = getKey()) { |
303 | Key->skip(); |
304 | if (Node *Val = getValue()) |
305 | Val->skip(); |
306 | } |
307 | } |
308 | |
309 | static bool classof(const Node *N) { |
310 | return N->getType() == NK_KeyValue; |
311 | } |
312 | |
313 | private: |
314 | Node *Key = nullptr; |
315 | Node *Value = nullptr; |
316 | }; |
317 | |
318 | /// This is an iterator abstraction over YAML collections shared by both |
319 | /// sequences and maps. |
320 | /// |
321 | /// BaseT must have a ValueT* member named CurrentEntry and a member function |
322 | /// increment() which must set CurrentEntry to 0 to create an end iterator. |
323 | template <class BaseT, class ValueT> |
324 | class basic_collection_iterator |
325 | : public std::iterator<std::input_iterator_tag, ValueT> { |
326 | public: |
327 | basic_collection_iterator() = default; |
328 | basic_collection_iterator(BaseT *B) : Base(B) {} |
329 | |
330 | ValueT *operator->() const { |
331 | assert(Base && Base->CurrentEntry && "Attempted to access end iterator!" ); |
332 | return Base->CurrentEntry; |
333 | } |
334 | |
335 | ValueT &operator*() const { |
336 | assert(Base && Base->CurrentEntry && |
337 | "Attempted to dereference end iterator!" ); |
338 | return *Base->CurrentEntry; |
339 | } |
340 | |
341 | operator ValueT *() const { |
342 | assert(Base && Base->CurrentEntry && "Attempted to access end iterator!" ); |
343 | return Base->CurrentEntry; |
344 | } |
345 | |
346 | /// Note on EqualityComparable: |
347 | /// |
348 | /// The iterator is not re-entrant, |
349 | /// it is meant to be used for parsing YAML on-demand |
350 | /// Once iteration started - it can point only to one entry at a time |
351 | /// hence Base.CurrentEntry and Other.Base.CurrentEntry are equal |
352 | /// iff Base and Other.Base are equal. |
353 | bool operator==(const basic_collection_iterator &Other) const { |
354 | if (Base && (Base == Other.Base)) { |
355 | assert((Base->CurrentEntry == Other.Base->CurrentEntry) |
356 | && "Equal Bases expected to point to equal Entries" ); |
357 | } |
358 | |
359 | return Base == Other.Base; |
360 | } |
361 | |
362 | bool operator!=(const basic_collection_iterator &Other) const { |
363 | return !(Base == Other.Base); |
364 | } |
365 | |
366 | basic_collection_iterator &operator++() { |
367 | assert(Base && "Attempted to advance iterator past end!" ); |
368 | Base->increment(); |
369 | // Create an end iterator. |
370 | if (!Base->CurrentEntry) |
371 | Base = nullptr; |
372 | return *this; |
373 | } |
374 | |
375 | private: |
376 | BaseT *Base = nullptr; |
377 | }; |
378 | |
379 | // The following two templates are used for both MappingNode and Sequence Node. |
380 | template <class CollectionType> |
381 | typename CollectionType::iterator begin(CollectionType &C) { |
382 | assert(C.IsAtBeginning && "You may only iterate over a collection once!" ); |
383 | C.IsAtBeginning = false; |
384 | typename CollectionType::iterator ret(&C); |
385 | ++ret; |
386 | return ret; |
387 | } |
388 | |
389 | template <class CollectionType> void skip(CollectionType &C) { |
390 | // TODO: support skipping from the middle of a parsed collection ;/ |
391 | assert((C.IsAtBeginning || C.IsAtEnd) && "Cannot skip mid parse!" ); |
392 | if (C.IsAtBeginning) |
393 | for (typename CollectionType::iterator i = begin(C), e = C.end(); i != e; |
394 | ++i) |
395 | i->skip(); |
396 | } |
397 | |
398 | /// Represents a YAML map created from either a block map for a flow map. |
399 | /// |
400 | /// This parses the YAML stream as increment() is called. |
401 | /// |
402 | /// Example: |
403 | /// Name: _main |
404 | /// Scope: Global |
405 | class MappingNode final : public Node { |
406 | void anchor() override; |
407 | |
408 | public: |
409 | enum MappingType { |
410 | MT_Block, |
411 | MT_Flow, |
412 | MT_Inline ///< An inline mapping node is used for "[key: value]". |
413 | }; |
414 | |
415 | MappingNode(std::unique_ptr<Document> &D, StringRef Anchor, StringRef Tag, |
416 | MappingType MT) |
417 | : Node(NK_Mapping, D, Anchor, Tag), Type(MT) {} |
418 | |
419 | friend class basic_collection_iterator<MappingNode, KeyValueNode>; |
420 | |
421 | using iterator = basic_collection_iterator<MappingNode, KeyValueNode>; |
422 | |
423 | template <class T> friend typename T::iterator yaml::begin(T &); |
424 | template <class T> friend void yaml::skip(T &); |
425 | |
426 | iterator begin() { return yaml::begin(*this); } |
427 | |
428 | iterator end() { return iterator(); } |
429 | |
430 | void skip() override { yaml::skip(*this); } |
431 | |
432 | static bool classof(const Node *N) { |
433 | return N->getType() == NK_Mapping; |
434 | } |
435 | |
436 | private: |
437 | MappingType Type; |
438 | bool IsAtBeginning = true; |
439 | bool IsAtEnd = false; |
440 | KeyValueNode *CurrentEntry = nullptr; |
441 | |
442 | void increment(); |
443 | }; |
444 | |
445 | /// Represents a YAML sequence created from either a block sequence for a |
446 | /// flow sequence. |
447 | /// |
448 | /// This parses the YAML stream as increment() is called. |
449 | /// |
450 | /// Example: |
451 | /// - Hello |
452 | /// - World |
453 | class SequenceNode final : public Node { |
454 | void anchor() override; |
455 | |
456 | public: |
457 | enum SequenceType { |
458 | ST_Block, |
459 | ST_Flow, |
460 | // Use for: |
461 | // |
462 | // key: |
463 | // - val1 |
464 | // - val2 |
465 | // |
466 | // As a BlockMappingEntry and BlockEnd are not created in this case. |
467 | ST_Indentless |
468 | }; |
469 | |
470 | SequenceNode(std::unique_ptr<Document> &D, StringRef Anchor, StringRef Tag, |
471 | SequenceType ST) |
472 | : Node(NK_Sequence, D, Anchor, Tag), SeqType(ST) {} |
473 | |
474 | friend class basic_collection_iterator<SequenceNode, Node>; |
475 | |
476 | using iterator = basic_collection_iterator<SequenceNode, Node>; |
477 | |
478 | template <class T> friend typename T::iterator yaml::begin(T &); |
479 | template <class T> friend void yaml::skip(T &); |
480 | |
481 | void increment(); |
482 | |
483 | iterator begin() { return yaml::begin(*this); } |
484 | |
485 | iterator end() { return iterator(); } |
486 | |
487 | void skip() override { yaml::skip(*this); } |
488 | |
489 | static bool classof(const Node *N) { |
490 | return N->getType() == NK_Sequence; |
491 | } |
492 | |
493 | private: |
494 | SequenceType SeqType; |
495 | bool IsAtBeginning = true; |
496 | bool IsAtEnd = false; |
497 | bool WasPreviousTokenFlowEntry = true; // Start with an imaginary ','. |
498 | Node *CurrentEntry = nullptr; |
499 | }; |
500 | |
501 | /// Represents an alias to a Node with an anchor. |
502 | /// |
503 | /// Example: |
504 | /// *AnchorName |
505 | class AliasNode final : public Node { |
506 | void anchor() override; |
507 | |
508 | public: |
509 | AliasNode(std::unique_ptr<Document> &D, StringRef Val) |
510 | : Node(NK_Alias, D, StringRef(), StringRef()), Name(Val) {} |
511 | |
512 | StringRef getName() const { return Name; } |
513 | Node *getTarget(); |
514 | |
515 | static bool classof(const Node *N) { return N->getType() == NK_Alias; } |
516 | |
517 | private: |
518 | StringRef Name; |
519 | }; |
520 | |
521 | /// A YAML Stream is a sequence of Documents. A document contains a root |
522 | /// node. |
523 | class Document { |
524 | public: |
525 | Document(Stream &ParentStream); |
526 | |
527 | /// Root for parsing a node. Returns a single node. |
528 | Node *parseBlockNode(); |
529 | |
530 | /// Finish parsing the current document and return true if there are |
531 | /// more. Return false otherwise. |
532 | bool skip(); |
533 | |
534 | /// Parse and return the root level node. |
535 | Node *getRoot() { |
536 | if (Root) |
537 | return Root; |
538 | return Root = parseBlockNode(); |
539 | } |
540 | |
541 | const std::map<StringRef, StringRef> &getTagMap() const { return TagMap; } |
542 | |
543 | private: |
544 | friend class Node; |
545 | friend class document_iterator; |
546 | |
547 | /// Stream to read tokens from. |
548 | Stream &stream; |
549 | |
550 | /// Used to allocate nodes to. All are destroyed without calling their |
551 | /// destructor when the document is destroyed. |
552 | BumpPtrAllocator NodeAllocator; |
553 | |
554 | /// The root node. Used to support skipping a partially parsed |
555 | /// document. |
556 | Node *Root; |
557 | |
558 | /// Maps tag prefixes to their expansion. |
559 | std::map<StringRef, StringRef> TagMap; |
560 | |
561 | Token &peekNext(); |
562 | Token getNext(); |
563 | void setError(const Twine &Message, Token &Location) const; |
564 | bool failed() const; |
565 | |
566 | /// Parse %BLAH directives and return true if any were encountered. |
567 | bool parseDirectives(); |
568 | |
569 | /// Parse %YAML |
570 | void parseYAMLDirective(); |
571 | |
572 | /// Parse %TAG |
573 | void parseTAGDirective(); |
574 | |
575 | /// Consume the next token and error if it is not \a TK. |
576 | bool expectToken(int TK); |
577 | }; |
578 | |
579 | /// Iterator abstraction for Documents over a Stream. |
580 | class document_iterator { |
581 | public: |
582 | document_iterator() = default; |
583 | document_iterator(std::unique_ptr<Document> &D) : Doc(&D) {} |
584 | |
585 | bool operator==(const document_iterator &Other) const { |
586 | if (isAtEnd() || Other.isAtEnd()) |
587 | return isAtEnd() && Other.isAtEnd(); |
588 | |
589 | return Doc == Other.Doc; |
590 | } |
591 | bool operator!=(const document_iterator &Other) const { |
592 | return !(*this == Other); |
593 | } |
594 | |
595 | document_iterator operator++() { |
596 | assert(Doc && "incrementing iterator past the end." ); |
597 | if (!(*Doc)->skip()) { |
598 | Doc->reset(nullptr); |
599 | } else { |
600 | Stream &S = (*Doc)->stream; |
601 | Doc->reset(new Document(S)); |
602 | } |
603 | return *this; |
604 | } |
605 | |
606 | Document &operator*() { return *Doc->get(); } |
607 | |
608 | std::unique_ptr<Document> &operator->() { return *Doc; } |
609 | |
610 | private: |
611 | bool isAtEnd() const { return !Doc || !*Doc; } |
612 | |
613 | std::unique_ptr<Document> *Doc = nullptr; |
614 | }; |
615 | |
616 | } // end namespace yaml |
617 | |
618 | } // end namespace llvm |
619 | |
620 | #endif // LLVM_SUPPORT_YAMLPARSER_H |
621 | |