1 | //===- YAMLParser.cpp - Simple YAML parser --------------------------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This file implements a YAML parser. |
10 | // |
11 | //===----------------------------------------------------------------------===// |
12 | |
13 | #include "llvm/Support/YAMLParser.h" |
14 | #include "llvm/ADT/AllocatorList.h" |
15 | #include "llvm/ADT/ArrayRef.h" |
16 | #include "llvm/ADT/STLExtras.h" |
17 | #include "llvm/ADT/SmallString.h" |
18 | #include "llvm/ADT/SmallVector.h" |
19 | #include "llvm/ADT/StringExtras.h" |
20 | #include "llvm/ADT/StringRef.h" |
21 | #include "llvm/ADT/Twine.h" |
22 | #include "llvm/Support/Compiler.h" |
23 | #include "llvm/Support/ErrorHandling.h" |
24 | #include "llvm/Support/MemoryBuffer.h" |
25 | #include "llvm/Support/SMLoc.h" |
26 | #include "llvm/Support/SourceMgr.h" |
27 | #include "llvm/Support/Unicode.h" |
28 | #include "llvm/Support/raw_ostream.h" |
29 | #include <cassert> |
30 | #include <cstddef> |
31 | #include <cstdint> |
32 | #include <map> |
33 | #include <memory> |
34 | #include <string> |
35 | #include <system_error> |
36 | #include <utility> |
37 | |
38 | using namespace llvm; |
39 | using namespace yaml; |
40 | |
/// The Unicode encoding forms that getUnicodeEncoding can report.
enum UnicodeEncodingForm {
  UEF_UTF32_LE, ///< UTF-32, little-endian byte order.
  UEF_UTF32_BE, ///< UTF-32, big-endian byte order.
  UEF_UTF16_LE, ///< UTF-16, little-endian byte order.
  UEF_UTF16_BE, ///< UTF-16, big-endian byte order.
  UEF_UTF8,     ///< UTF-8 (which also covers plain ASCII).
  UEF_Unknown   ///< Not a valid Unicode encoding.
};

/// EncodingInfo - Pairs the detected encoding form (first) with the length in
/// bytes of the byte order mark (second), if one exists. The length is one of
/// {0, 2, 3, 4}.
using EncodingInfo = std::pair<UnicodeEncodingForm, unsigned>;
53 | |
54 | /// getUnicodeEncoding - Reads up to the first 4 bytes to determine the Unicode |
55 | /// encoding form of \a Input. |
56 | /// |
57 | /// @param Input A string of length 0 or more. |
58 | /// @returns An EncodingInfo indicating the Unicode encoding form of the input |
59 | /// and how long the byte order mark is if one exists. |
60 | static EncodingInfo getUnicodeEncoding(StringRef Input) { |
61 | if (Input.empty()) |
62 | return std::make_pair(x: UEF_Unknown, y: 0); |
63 | |
64 | switch (uint8_t(Input[0])) { |
65 | case 0x00: |
66 | if (Input.size() >= 4) { |
67 | if ( Input[1] == 0 |
68 | && uint8_t(Input[2]) == 0xFE |
69 | && uint8_t(Input[3]) == 0xFF) |
70 | return std::make_pair(x: UEF_UTF32_BE, y: 4); |
71 | if (Input[1] == 0 && Input[2] == 0 && Input[3] != 0) |
72 | return std::make_pair(x: UEF_UTF32_BE, y: 0); |
73 | } |
74 | |
75 | if (Input.size() >= 2 && Input[1] != 0) |
76 | return std::make_pair(x: UEF_UTF16_BE, y: 0); |
77 | return std::make_pair(x: UEF_Unknown, y: 0); |
78 | case 0xFF: |
79 | if ( Input.size() >= 4 |
80 | && uint8_t(Input[1]) == 0xFE |
81 | && Input[2] == 0 |
82 | && Input[3] == 0) |
83 | return std::make_pair(x: UEF_UTF32_LE, y: 4); |
84 | |
85 | if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFE) |
86 | return std::make_pair(x: UEF_UTF16_LE, y: 2); |
87 | return std::make_pair(x: UEF_Unknown, y: 0); |
88 | case 0xFE: |
89 | if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFF) |
90 | return std::make_pair(x: UEF_UTF16_BE, y: 2); |
91 | return std::make_pair(x: UEF_Unknown, y: 0); |
92 | case 0xEF: |
93 | if ( Input.size() >= 3 |
94 | && uint8_t(Input[1]) == 0xBB |
95 | && uint8_t(Input[2]) == 0xBF) |
96 | return std::make_pair(x: UEF_UTF8, y: 3); |
97 | return std::make_pair(x: UEF_Unknown, y: 0); |
98 | } |
99 | |
100 | // It could still be utf-32 or utf-16. |
101 | if (Input.size() >= 4 && Input[1] == 0 && Input[2] == 0 && Input[3] == 0) |
102 | return std::make_pair(x: UEF_UTF32_LE, y: 0); |
103 | |
104 | if (Input.size() >= 2 && Input[1] == 0) |
105 | return std::make_pair(x: UEF_UTF16_LE, y: 0); |
106 | |
107 | return std::make_pair(x: UEF_UTF8, y: 0); |
108 | } |
109 | |
110 | /// Pin the vtables to this file. |
111 | void Node::anchor() {} |
112 | void NullNode::anchor() {} |
113 | void ScalarNode::anchor() {} |
114 | void BlockScalarNode::anchor() {} |
115 | void KeyValueNode::anchor() {} |
116 | void MappingNode::anchor() {} |
117 | void SequenceNode::anchor() {} |
118 | void AliasNode::anchor() {} |
119 | |
120 | namespace llvm { |
121 | namespace yaml { |
122 | |
123 | /// Token - A single YAML token. |
124 | struct Token { |
125 | enum TokenKind { |
126 | TK_Error, // Uninitialized token. |
127 | TK_StreamStart, |
128 | TK_StreamEnd, |
129 | TK_VersionDirective, |
130 | TK_TagDirective, |
131 | TK_DocumentStart, |
132 | TK_DocumentEnd, |
133 | TK_BlockEntry, |
134 | TK_BlockEnd, |
135 | TK_BlockSequenceStart, |
136 | TK_BlockMappingStart, |
137 | TK_FlowEntry, |
138 | TK_FlowSequenceStart, |
139 | TK_FlowSequenceEnd, |
140 | TK_FlowMappingStart, |
141 | TK_FlowMappingEnd, |
142 | TK_Key, |
143 | TK_Value, |
144 | TK_Scalar, |
145 | TK_BlockScalar, |
146 | TK_Alias, |
147 | TK_Anchor, |
148 | TK_Tag |
149 | } Kind = TK_Error; |
150 | |
151 | /// A string of length 0 or more whose begin() points to the logical location |
152 | /// of the token in the input. |
153 | StringRef Range; |
154 | |
155 | /// The value of a block scalar node. |
156 | std::string Value; |
157 | |
158 | Token() = default; |
159 | }; |
160 | |
161 | } // end namespace yaml |
162 | } // end namespace llvm |
163 | |
164 | using TokenQueueT = BumpPtrList<Token>; |
165 | |
166 | namespace { |
167 | |
168 | /// This struct is used to track simple keys. |
169 | /// |
170 | /// Simple keys are handled by creating an entry in SimpleKeys for each Token |
171 | /// which could legally be the start of a simple key. When peekNext is called, |
172 | /// if the Token To be returned is referenced by a SimpleKey, we continue |
173 | /// tokenizing until that potential simple key has either been found to not be |
174 | /// a simple key (we moved on to the next line or went further than 1024 chars). |
175 | /// Or when we run into a Value, and then insert a Key token (and possibly |
176 | /// others) before the SimpleKey's Tok. |
177 | struct SimpleKey { |
178 | TokenQueueT::iterator Tok; |
179 | unsigned Column = 0; |
180 | unsigned Line = 0; |
181 | unsigned FlowLevel = 0; |
182 | bool IsRequired = false; |
183 | |
184 | bool operator ==(const SimpleKey &Other) { |
185 | return Tok == Other.Tok; |
186 | } |
187 | }; |
188 | |
189 | } // end anonymous namespace |
190 | |
191 | /// The Unicode scalar value of a UTF-8 minimal well-formed code unit |
192 | /// subsequence and the subsequence's length in code units (uint8_t). |
193 | /// A length of 0 represents an error. |
194 | using UTF8Decoded = std::pair<uint32_t, unsigned>; |
195 | |
196 | static UTF8Decoded decodeUTF8(StringRef Range) { |
197 | StringRef::iterator Position= Range.begin(); |
198 | StringRef::iterator End = Range.end(); |
199 | // 1 byte: [0x00, 0x7f] |
200 | // Bit pattern: 0xxxxxxx |
201 | if (Position < End && (*Position & 0x80) == 0) { |
202 | return std::make_pair(x: *Position, y: 1); |
203 | } |
204 | // 2 bytes: [0x80, 0x7ff] |
205 | // Bit pattern: 110xxxxx 10xxxxxx |
206 | if (Position + 1 < End && ((*Position & 0xE0) == 0xC0) && |
207 | ((*(Position + 1) & 0xC0) == 0x80)) { |
208 | uint32_t codepoint = ((*Position & 0x1F) << 6) | |
209 | (*(Position + 1) & 0x3F); |
210 | if (codepoint >= 0x80) |
211 | return std::make_pair(x&: codepoint, y: 2); |
212 | } |
213 | // 3 bytes: [0x8000, 0xffff] |
214 | // Bit pattern: 1110xxxx 10xxxxxx 10xxxxxx |
215 | if (Position + 2 < End && ((*Position & 0xF0) == 0xE0) && |
216 | ((*(Position + 1) & 0xC0) == 0x80) && |
217 | ((*(Position + 2) & 0xC0) == 0x80)) { |
218 | uint32_t codepoint = ((*Position & 0x0F) << 12) | |
219 | ((*(Position + 1) & 0x3F) << 6) | |
220 | (*(Position + 2) & 0x3F); |
221 | // Codepoints between 0xD800 and 0xDFFF are invalid, as |
222 | // they are high / low surrogate halves used by UTF-16. |
223 | if (codepoint >= 0x800 && |
224 | (codepoint < 0xD800 || codepoint > 0xDFFF)) |
225 | return std::make_pair(x&: codepoint, y: 3); |
226 | } |
227 | // 4 bytes: [0x10000, 0x10FFFF] |
228 | // Bit pattern: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx |
229 | if (Position + 3 < End && ((*Position & 0xF8) == 0xF0) && |
230 | ((*(Position + 1) & 0xC0) == 0x80) && |
231 | ((*(Position + 2) & 0xC0) == 0x80) && |
232 | ((*(Position + 3) & 0xC0) == 0x80)) { |
233 | uint32_t codepoint = ((*Position & 0x07) << 18) | |
234 | ((*(Position + 1) & 0x3F) << 12) | |
235 | ((*(Position + 2) & 0x3F) << 6) | |
236 | (*(Position + 3) & 0x3F); |
237 | if (codepoint >= 0x10000 && codepoint <= 0x10FFFF) |
238 | return std::make_pair(x&: codepoint, y: 4); |
239 | } |
240 | return std::make_pair(x: 0, y: 0); |
241 | } |
242 | |
243 | namespace llvm { |
244 | namespace yaml { |
245 | |
246 | /// Scans YAML tokens from a MemoryBuffer. |
247 | class Scanner { |
248 | public: |
249 | Scanner(StringRef Input, SourceMgr &SM, bool ShowColors = true, |
250 | std::error_code *EC = nullptr); |
251 | Scanner(MemoryBufferRef Buffer, SourceMgr &SM_, bool ShowColors = true, |
252 | std::error_code *EC = nullptr); |
253 | |
254 | /// Parse the next token and return it without popping it. |
255 | Token &peekNext(); |
256 | |
257 | /// Parse the next token and pop it from the queue. |
258 | Token getNext(); |
259 | |
260 | void printError(SMLoc Loc, SourceMgr::DiagKind Kind, const Twine &Message, |
261 | ArrayRef<SMRange> Ranges = std::nullopt) { |
262 | SM.PrintMessage(Loc, Kind, Msg: Message, Ranges, /* FixIts= */ std::nullopt, |
263 | ShowColors); |
264 | } |
265 | |
266 | void setError(const Twine &Message, StringRef::iterator Position) { |
267 | if (Position >= End) |
268 | Position = End - 1; |
269 | |
270 | // propagate the error if possible |
271 | if (EC) |
272 | *EC = make_error_code(e: std::errc::invalid_argument); |
273 | |
274 | // Don't print out more errors after the first one we encounter. The rest |
275 | // are just the result of the first, and have no meaning. |
276 | if (!Failed) |
277 | printError(Loc: SMLoc::getFromPointer(Ptr: Position), Kind: SourceMgr::DK_Error, Message); |
278 | Failed = true; |
279 | } |
280 | |
281 | /// Returns true if an error occurred while parsing. |
282 | bool failed() { |
283 | return Failed; |
284 | } |
285 | |
286 | private: |
287 | void init(MemoryBufferRef Buffer); |
288 | |
289 | StringRef currentInput() { |
290 | return StringRef(Current, End - Current); |
291 | } |
292 | |
293 | /// Decode a UTF-8 minimal well-formed code unit subsequence starting |
294 | /// at \a Position. |
295 | /// |
296 | /// If the UTF-8 code units starting at Position do not form a well-formed |
297 | /// code unit subsequence, then the Unicode scalar value is 0, and the length |
298 | /// is 0. |
299 | UTF8Decoded decodeUTF8(StringRef::iterator Position) { |
300 | return ::decodeUTF8(Range: StringRef(Position, End - Position)); |
301 | } |
302 | |
303 | // The following functions are based on the gramar rules in the YAML spec. The |
304 | // style of the function names it meant to closely match how they are written |
305 | // in the spec. The number within the [] is the number of the grammar rule in |
306 | // the spec. |
307 | // |
308 | // See 4.2 [Production Naming Conventions] for the meaning of the prefixes. |
309 | // |
310 | // c- |
311 | // A production starting and ending with a special character. |
312 | // b- |
313 | // A production matching a single line break. |
314 | // nb- |
315 | // A production starting and ending with a non-break character. |
316 | // s- |
317 | // A production starting and ending with a white space character. |
318 | // ns- |
319 | // A production starting and ending with a non-space character. |
320 | // l- |
321 | // A production matching complete line(s). |
322 | |
323 | /// Skip a single nb-char[27] starting at Position. |
324 | /// |
325 | /// A nb-char is 0x9 | [0x20-0x7E] | 0x85 | [0xA0-0xD7FF] | [0xE000-0xFEFE] |
326 | /// | [0xFF00-0xFFFD] | [0x10000-0x10FFFF] |
327 | /// |
328 | /// @returns The code unit after the nb-char, or Position if it's not an |
329 | /// nb-char. |
330 | StringRef::iterator skip_nb_char(StringRef::iterator Position); |
331 | |
332 | /// Skip a single b-break[28] starting at Position. |
333 | /// |
334 | /// A b-break is 0xD 0xA | 0xD | 0xA |
335 | /// |
336 | /// @returns The code unit after the b-break, or Position if it's not a |
337 | /// b-break. |
338 | StringRef::iterator skip_b_break(StringRef::iterator Position); |
339 | |
340 | /// Skip a single s-space[31] starting at Position. |
341 | /// |
342 | /// An s-space is 0x20 |
343 | /// |
344 | /// @returns The code unit after the s-space, or Position if it's not a |
345 | /// s-space. |
346 | StringRef::iterator skip_s_space(StringRef::iterator Position); |
347 | |
348 | /// Skip a single s-white[33] starting at Position. |
349 | /// |
350 | /// A s-white is 0x20 | 0x9 |
351 | /// |
352 | /// @returns The code unit after the s-white, or Position if it's not a |
353 | /// s-white. |
354 | StringRef::iterator skip_s_white(StringRef::iterator Position); |
355 | |
356 | /// Skip a single ns-char[34] starting at Position. |
357 | /// |
358 | /// A ns-char is nb-char - s-white |
359 | /// |
360 | /// @returns The code unit after the ns-char, or Position if it's not a |
361 | /// ns-char. |
362 | StringRef::iterator skip_ns_char(StringRef::iterator Position); |
363 | |
364 | using SkipWhileFunc = StringRef::iterator (Scanner::*)(StringRef::iterator); |
365 | |
366 | /// Skip minimal well-formed code unit subsequences until Func |
367 | /// returns its input. |
368 | /// |
369 | /// @returns The code unit after the last minimal well-formed code unit |
370 | /// subsequence that Func accepted. |
371 | StringRef::iterator skip_while( SkipWhileFunc Func |
372 | , StringRef::iterator Position); |
373 | |
374 | /// Skip minimal well-formed code unit subsequences until Func returns its |
375 | /// input. |
376 | void advanceWhile(SkipWhileFunc Func); |
377 | |
378 | /// Scan ns-uri-char[39]s starting at Cur. |
379 | /// |
380 | /// This updates Cur and Column while scanning. |
381 | void scan_ns_uri_char(); |
382 | |
383 | /// Consume a minimal well-formed code unit subsequence starting at |
384 | /// \a Cur. Return false if it is not the same Unicode scalar value as |
385 | /// \a Expected. This updates \a Column. |
386 | bool consume(uint32_t Expected); |
387 | |
388 | /// Skip \a Distance UTF-8 code units. Updates \a Cur and \a Column. |
389 | void skip(uint32_t Distance); |
390 | |
391 | /// Return true if the minimal well-formed code unit subsequence at |
392 | /// Pos is whitespace or a new line |
393 | bool isBlankOrBreak(StringRef::iterator Position); |
394 | |
395 | /// Return true if the minimal well-formed code unit subsequence at |
396 | /// Pos is considered a "safe" character for plain scalars. |
397 | bool isPlainSafeNonBlank(StringRef::iterator Position); |
398 | |
399 | /// Return true if the line is a line break, false otherwise. |
400 | bool isLineEmpty(StringRef Line); |
401 | |
402 | /// Consume a single b-break[28] if it's present at the current position. |
403 | /// |
404 | /// Return false if the code unit at the current position isn't a line break. |
405 | bool consumeLineBreakIfPresent(); |
406 | |
407 | /// If IsSimpleKeyAllowed, create and push_back a new SimpleKey. |
408 | void saveSimpleKeyCandidate( TokenQueueT::iterator Tok |
409 | , unsigned AtColumn |
410 | , bool IsRequired); |
411 | |
412 | /// Remove simple keys that can no longer be valid simple keys. |
413 | /// |
414 | /// Invalid simple keys are not on the current line or are further than 1024 |
415 | /// columns back. |
416 | void removeStaleSimpleKeyCandidates(); |
417 | |
418 | /// Remove all simple keys on FlowLevel \a Level. |
419 | void removeSimpleKeyCandidatesOnFlowLevel(unsigned Level); |
420 | |
421 | /// Unroll indentation in \a Indents back to \a Col. Creates BlockEnd |
422 | /// tokens if needed. |
423 | bool unrollIndent(int ToColumn); |
424 | |
425 | /// Increase indent to \a Col. Creates \a Kind token at \a InsertPoint |
426 | /// if needed. |
427 | bool rollIndent( int ToColumn |
428 | , Token::TokenKind Kind |
429 | , TokenQueueT::iterator InsertPoint); |
430 | |
431 | /// Skip a single-line comment when the comment starts at the current |
432 | /// position of the scanner. |
433 | void skipComment(); |
434 | |
435 | /// Skip whitespace and comments until the start of the next token. |
436 | void scanToNextToken(); |
437 | |
438 | /// Must be the first token generated. |
439 | bool scanStreamStart(); |
440 | |
441 | /// Generate tokens needed to close out the stream. |
442 | bool scanStreamEnd(); |
443 | |
444 | /// Scan a %BLAH directive. |
445 | bool scanDirective(); |
446 | |
447 | /// Scan a ... or ---. |
448 | bool scanDocumentIndicator(bool IsStart); |
449 | |
450 | /// Scan a [ or { and generate the proper flow collection start token. |
451 | bool scanFlowCollectionStart(bool IsSequence); |
452 | |
453 | /// Scan a ] or } and generate the proper flow collection end token. |
454 | bool scanFlowCollectionEnd(bool IsSequence); |
455 | |
456 | /// Scan the , that separates entries in a flow collection. |
457 | bool scanFlowEntry(); |
458 | |
459 | /// Scan the - that starts block sequence entries. |
460 | bool scanBlockEntry(); |
461 | |
462 | /// Scan an explicit ? indicating a key. |
463 | bool scanKey(); |
464 | |
465 | /// Scan an explicit : indicating a value. |
466 | bool scanValue(); |
467 | |
468 | /// Scan a quoted scalar. |
469 | bool scanFlowScalar(bool IsDoubleQuoted); |
470 | |
471 | /// Scan an unquoted scalar. |
472 | bool scanPlainScalar(); |
473 | |
474 | /// Scan an Alias or Anchor starting with * or &. |
475 | bool scanAliasOrAnchor(bool IsAlias); |
476 | |
477 | /// Scan a block scalar starting with | or >. |
478 | bool scanBlockScalar(bool IsLiteral); |
479 | |
480 | /// Scan a block scalar style indicator and header. |
481 | /// |
482 | /// Note: This is distinct from scanBlockScalarHeader to mirror the fact that |
483 | /// YAML does not consider the style indicator to be a part of the header. |
484 | /// |
485 | /// Return false if an error occurred. |
486 | bool scanBlockScalarIndicators(char &StyleIndicator, char &ChompingIndicator, |
487 | unsigned &IndentIndicator, bool &IsDone); |
488 | |
489 | /// Scan a style indicator in a block scalar header. |
490 | char scanBlockStyleIndicator(); |
491 | |
492 | /// Scan a chomping indicator in a block scalar header. |
493 | char scanBlockChompingIndicator(); |
494 | |
495 | /// Scan an indentation indicator in a block scalar header. |
496 | unsigned scanBlockIndentationIndicator(); |
497 | |
498 | /// Scan a block scalar header. |
499 | /// |
500 | /// Return false if an error occurred. |
501 | bool scanBlockScalarHeader(char &ChompingIndicator, unsigned &IndentIndicator, |
502 | bool &IsDone); |
503 | |
504 | /// Look for the indentation level of a block scalar. |
505 | /// |
506 | /// Return false if an error occurred. |
507 | bool findBlockScalarIndent(unsigned &BlockIndent, unsigned BlockExitIndent, |
508 | unsigned &LineBreaks, bool &IsDone); |
509 | |
510 | /// Scan the indentation of a text line in a block scalar. |
511 | /// |
512 | /// Return false if an error occurred. |
513 | bool scanBlockScalarIndent(unsigned BlockIndent, unsigned BlockExitIndent, |
514 | bool &IsDone); |
515 | |
516 | /// Scan a tag of the form !stuff. |
517 | bool scanTag(); |
518 | |
519 | /// Dispatch to the next scanning function based on \a *Cur. |
520 | bool fetchMoreTokens(); |
521 | |
522 | /// The SourceMgr used for diagnostics and buffer management. |
523 | SourceMgr &SM; |
524 | |
525 | /// The original input. |
526 | MemoryBufferRef InputBuffer; |
527 | |
528 | /// The current position of the scanner. |
529 | StringRef::iterator Current; |
530 | |
531 | /// The end of the input (one past the last character). |
532 | StringRef::iterator End; |
533 | |
534 | /// Current YAML indentation level in spaces. |
535 | int Indent; |
536 | |
537 | /// Current column number in Unicode code points. |
538 | unsigned Column; |
539 | |
540 | /// Current line number. |
541 | unsigned Line; |
542 | |
543 | /// How deep we are in flow style containers. 0 Means at block level. |
544 | unsigned FlowLevel; |
545 | |
546 | /// Are we at the start of the stream? |
547 | bool IsStartOfStream; |
548 | |
549 | /// Can the next token be the start of a simple key? |
550 | bool IsSimpleKeyAllowed; |
551 | |
552 | /// Can the next token be a value indicator even if it does not have a |
553 | /// trailing space? |
554 | bool IsAdjacentValueAllowedInFlow; |
555 | |
556 | /// True if an error has occurred. |
557 | bool Failed; |
558 | |
559 | /// Should colors be used when printing out the diagnostic messages? |
560 | bool ShowColors; |
561 | |
562 | /// Queue of tokens. This is required to queue up tokens while looking |
563 | /// for the end of a simple key. And for cases where a single character |
564 | /// can produce multiple tokens (e.g. BlockEnd). |
565 | TokenQueueT TokenQueue; |
566 | |
567 | /// Indentation levels. |
568 | SmallVector<int, 4> Indents; |
569 | |
570 | /// Potential simple keys. |
571 | SmallVector<SimpleKey, 4> SimpleKeys; |
572 | |
573 | std::error_code *EC; |
574 | }; |
575 | |
576 | } // end namespace yaml |
577 | } // end namespace llvm |
578 | |
579 | /// encodeUTF8 - Encode \a UnicodeScalarValue in UTF-8 and append it to result. |
580 | static void encodeUTF8( uint32_t UnicodeScalarValue |
581 | , SmallVectorImpl<char> &Result) { |
582 | if (UnicodeScalarValue <= 0x7F) { |
583 | Result.push_back(Elt: UnicodeScalarValue & 0x7F); |
584 | } else if (UnicodeScalarValue <= 0x7FF) { |
585 | uint8_t FirstByte = 0xC0 | ((UnicodeScalarValue & 0x7C0) >> 6); |
586 | uint8_t SecondByte = 0x80 | (UnicodeScalarValue & 0x3F); |
587 | Result.push_back(Elt: FirstByte); |
588 | Result.push_back(Elt: SecondByte); |
589 | } else if (UnicodeScalarValue <= 0xFFFF) { |
590 | uint8_t FirstByte = 0xE0 | ((UnicodeScalarValue & 0xF000) >> 12); |
591 | uint8_t SecondByte = 0x80 | ((UnicodeScalarValue & 0xFC0) >> 6); |
592 | uint8_t ThirdByte = 0x80 | (UnicodeScalarValue & 0x3F); |
593 | Result.push_back(Elt: FirstByte); |
594 | Result.push_back(Elt: SecondByte); |
595 | Result.push_back(Elt: ThirdByte); |
596 | } else if (UnicodeScalarValue <= 0x10FFFF) { |
597 | uint8_t FirstByte = 0xF0 | ((UnicodeScalarValue & 0x1F0000) >> 18); |
598 | uint8_t SecondByte = 0x80 | ((UnicodeScalarValue & 0x3F000) >> 12); |
599 | uint8_t ThirdByte = 0x80 | ((UnicodeScalarValue & 0xFC0) >> 6); |
600 | uint8_t FourthByte = 0x80 | (UnicodeScalarValue & 0x3F); |
601 | Result.push_back(Elt: FirstByte); |
602 | Result.push_back(Elt: SecondByte); |
603 | Result.push_back(Elt: ThirdByte); |
604 | Result.push_back(Elt: FourthByte); |
605 | } |
606 | } |
607 | |
608 | bool yaml::dumpTokens(StringRef Input, raw_ostream &OS) { |
609 | SourceMgr SM; |
610 | Scanner scanner(Input, SM); |
611 | while (true) { |
612 | Token T = scanner.getNext(); |
613 | switch (T.Kind) { |
614 | case Token::TK_StreamStart: |
615 | OS << "Stream-Start: " ; |
616 | break; |
617 | case Token::TK_StreamEnd: |
618 | OS << "Stream-End: " ; |
619 | break; |
620 | case Token::TK_VersionDirective: |
621 | OS << "Version-Directive: " ; |
622 | break; |
623 | case Token::TK_TagDirective: |
624 | OS << "Tag-Directive: " ; |
625 | break; |
626 | case Token::TK_DocumentStart: |
627 | OS << "Document-Start: " ; |
628 | break; |
629 | case Token::TK_DocumentEnd: |
630 | OS << "Document-End: " ; |
631 | break; |
632 | case Token::TK_BlockEntry: |
633 | OS << "Block-Entry: " ; |
634 | break; |
635 | case Token::TK_BlockEnd: |
636 | OS << "Block-End: " ; |
637 | break; |
638 | case Token::TK_BlockSequenceStart: |
639 | OS << "Block-Sequence-Start: " ; |
640 | break; |
641 | case Token::TK_BlockMappingStart: |
642 | OS << "Block-Mapping-Start: " ; |
643 | break; |
644 | case Token::TK_FlowEntry: |
645 | OS << "Flow-Entry: " ; |
646 | break; |
647 | case Token::TK_FlowSequenceStart: |
648 | OS << "Flow-Sequence-Start: " ; |
649 | break; |
650 | case Token::TK_FlowSequenceEnd: |
651 | OS << "Flow-Sequence-End: " ; |
652 | break; |
653 | case Token::TK_FlowMappingStart: |
654 | OS << "Flow-Mapping-Start: " ; |
655 | break; |
656 | case Token::TK_FlowMappingEnd: |
657 | OS << "Flow-Mapping-End: " ; |
658 | break; |
659 | case Token::TK_Key: |
660 | OS << "Key: " ; |
661 | break; |
662 | case Token::TK_Value: |
663 | OS << "Value: " ; |
664 | break; |
665 | case Token::TK_Scalar: |
666 | OS << "Scalar: " ; |
667 | break; |
668 | case Token::TK_BlockScalar: |
669 | OS << "Block Scalar: " ; |
670 | break; |
671 | case Token::TK_Alias: |
672 | OS << "Alias: " ; |
673 | break; |
674 | case Token::TK_Anchor: |
675 | OS << "Anchor: " ; |
676 | break; |
677 | case Token::TK_Tag: |
678 | OS << "Tag: " ; |
679 | break; |
680 | case Token::TK_Error: |
681 | break; |
682 | } |
683 | OS << T.Range << "\n" ; |
684 | if (T.Kind == Token::TK_StreamEnd) |
685 | break; |
686 | else if (T.Kind == Token::TK_Error) |
687 | return false; |
688 | } |
689 | return true; |
690 | } |
691 | |
692 | bool yaml::scanTokens(StringRef Input) { |
693 | SourceMgr SM; |
694 | Scanner scanner(Input, SM); |
695 | while (true) { |
696 | Token T = scanner.getNext(); |
697 | if (T.Kind == Token::TK_StreamEnd) |
698 | break; |
699 | else if (T.Kind == Token::TK_Error) |
700 | return false; |
701 | } |
702 | return true; |
703 | } |
704 | |
705 | std::string yaml::escape(StringRef Input, bool EscapePrintable) { |
706 | std::string EscapedInput; |
707 | for (StringRef::iterator i = Input.begin(), e = Input.end(); i != e; ++i) { |
708 | if (*i == '\\') |
709 | EscapedInput += "\\\\" ; |
710 | else if (*i == '"') |
711 | EscapedInput += "\\\"" ; |
712 | else if (*i == 0) |
713 | EscapedInput += "\\0" ; |
714 | else if (*i == 0x07) |
715 | EscapedInput += "\\a" ; |
716 | else if (*i == 0x08) |
717 | EscapedInput += "\\b" ; |
718 | else if (*i == 0x09) |
719 | EscapedInput += "\\t" ; |
720 | else if (*i == 0x0A) |
721 | EscapedInput += "\\n" ; |
722 | else if (*i == 0x0B) |
723 | EscapedInput += "\\v" ; |
724 | else if (*i == 0x0C) |
725 | EscapedInput += "\\f" ; |
726 | else if (*i == 0x0D) |
727 | EscapedInput += "\\r" ; |
728 | else if (*i == 0x1B) |
729 | EscapedInput += "\\e" ; |
730 | else if ((unsigned char)*i < 0x20) { // Control characters not handled above. |
731 | std::string HexStr = utohexstr(X: *i); |
732 | EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr; |
733 | } else if (*i & 0x80) { // UTF-8 multiple code unit subsequence. |
734 | UTF8Decoded UnicodeScalarValue |
735 | = decodeUTF8(Range: StringRef(i, Input.end() - i)); |
736 | if (UnicodeScalarValue.second == 0) { |
737 | // Found invalid char. |
738 | SmallString<4> Val; |
739 | encodeUTF8(UnicodeScalarValue: 0xFFFD, Result&: Val); |
740 | llvm::append_range(C&: EscapedInput, R&: Val); |
741 | // FIXME: Error reporting. |
742 | return EscapedInput; |
743 | } |
744 | if (UnicodeScalarValue.first == 0x85) |
745 | EscapedInput += "\\N" ; |
746 | else if (UnicodeScalarValue.first == 0xA0) |
747 | EscapedInput += "\\_" ; |
748 | else if (UnicodeScalarValue.first == 0x2028) |
749 | EscapedInput += "\\L" ; |
750 | else if (UnicodeScalarValue.first == 0x2029) |
751 | EscapedInput += "\\P" ; |
752 | else if (!EscapePrintable && |
753 | sys::unicode::isPrintable(UCS: UnicodeScalarValue.first)) |
754 | EscapedInput += StringRef(i, UnicodeScalarValue.second); |
755 | else { |
756 | std::string HexStr = utohexstr(X: UnicodeScalarValue.first); |
757 | if (HexStr.size() <= 2) |
758 | EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr; |
759 | else if (HexStr.size() <= 4) |
760 | EscapedInput += "\\u" + std::string(4 - HexStr.size(), '0') + HexStr; |
761 | else if (HexStr.size() <= 8) |
762 | EscapedInput += "\\U" + std::string(8 - HexStr.size(), '0') + HexStr; |
763 | } |
764 | i += UnicodeScalarValue.second - 1; |
765 | } else |
766 | EscapedInput.push_back(c: *i); |
767 | } |
768 | return EscapedInput; |
769 | } |
770 | |
771 | std::optional<bool> yaml::parseBool(StringRef S) { |
772 | switch (S.size()) { |
773 | case 1: |
774 | switch (S.front()) { |
775 | case 'y': |
776 | case 'Y': |
777 | return true; |
778 | case 'n': |
779 | case 'N': |
780 | return false; |
781 | default: |
782 | return std::nullopt; |
783 | } |
784 | case 2: |
785 | switch (S.front()) { |
786 | case 'O': |
787 | if (S[1] == 'N') // ON |
788 | return true; |
789 | [[fallthrough]]; |
790 | case 'o': |
791 | if (S[1] == 'n') //[Oo]n |
792 | return true; |
793 | return std::nullopt; |
794 | case 'N': |
795 | if (S[1] == 'O') // NO |
796 | return false; |
797 | [[fallthrough]]; |
798 | case 'n': |
799 | if (S[1] == 'o') //[Nn]o |
800 | return false; |
801 | return std::nullopt; |
802 | default: |
803 | return std::nullopt; |
804 | } |
805 | case 3: |
806 | switch (S.front()) { |
807 | case 'O': |
808 | if (S.drop_front() == "FF" ) // OFF |
809 | return false; |
810 | [[fallthrough]]; |
811 | case 'o': |
812 | if (S.drop_front() == "ff" ) //[Oo]ff |
813 | return false; |
814 | return std::nullopt; |
815 | case 'Y': |
816 | if (S.drop_front() == "ES" ) // YES |
817 | return true; |
818 | [[fallthrough]]; |
819 | case 'y': |
820 | if (S.drop_front() == "es" ) //[Yy]es |
821 | return true; |
822 | return std::nullopt; |
823 | default: |
824 | return std::nullopt; |
825 | } |
826 | case 4: |
827 | switch (S.front()) { |
828 | case 'T': |
829 | if (S.drop_front() == "RUE" ) // TRUE |
830 | return true; |
831 | [[fallthrough]]; |
832 | case 't': |
833 | if (S.drop_front() == "rue" ) //[Tt]rue |
834 | return true; |
835 | return std::nullopt; |
836 | default: |
837 | return std::nullopt; |
838 | } |
839 | case 5: |
840 | switch (S.front()) { |
841 | case 'F': |
842 | if (S.drop_front() == "ALSE" ) // FALSE |
843 | return false; |
844 | [[fallthrough]]; |
845 | case 'f': |
846 | if (S.drop_front() == "alse" ) //[Ff]alse |
847 | return false; |
848 | return std::nullopt; |
849 | default: |
850 | return std::nullopt; |
851 | } |
852 | default: |
853 | return std::nullopt; |
854 | } |
855 | } |
856 | |
857 | Scanner::Scanner(StringRef Input, SourceMgr &sm, bool ShowColors, |
858 | std::error_code *EC) |
859 | : SM(sm), ShowColors(ShowColors), EC(EC) { |
860 | init(Buffer: MemoryBufferRef(Input, "YAML" )); |
861 | } |
862 | |
863 | Scanner::Scanner(MemoryBufferRef Buffer, SourceMgr &SM_, bool ShowColors, |
864 | std::error_code *EC) |
865 | : SM(SM_), ShowColors(ShowColors), EC(EC) { |
866 | init(Buffer); |
867 | } |
868 | |
869 | void Scanner::init(MemoryBufferRef Buffer) { |
870 | InputBuffer = Buffer; |
871 | Current = InputBuffer.getBufferStart(); |
872 | End = InputBuffer.getBufferEnd(); |
873 | Indent = -1; |
874 | Column = 0; |
875 | Line = 0; |
876 | FlowLevel = 0; |
877 | IsStartOfStream = true; |
878 | IsSimpleKeyAllowed = true; |
879 | IsAdjacentValueAllowedInFlow = false; |
880 | Failed = false; |
881 | std::unique_ptr<MemoryBuffer> InputBufferOwner = |
882 | MemoryBuffer::getMemBuffer(Ref: Buffer, /*RequiresNullTerminator=*/false); |
883 | SM.AddNewSourceBuffer(F: std::move(InputBufferOwner), IncludeLoc: SMLoc()); |
884 | } |
885 | |
886 | Token &Scanner::peekNext() { |
887 | // If the current token is a possible simple key, keep parsing until we |
888 | // can confirm. |
889 | bool NeedMore = false; |
890 | while (true) { |
891 | if (TokenQueue.empty() || NeedMore) { |
892 | if (!fetchMoreTokens()) { |
893 | TokenQueue.clear(); |
894 | SimpleKeys.clear(); |
895 | TokenQueue.push_back(V: Token()); |
896 | return TokenQueue.front(); |
897 | } |
898 | } |
899 | assert(!TokenQueue.empty() && |
900 | "fetchMoreTokens lied about getting tokens!" ); |
901 | |
902 | removeStaleSimpleKeyCandidates(); |
903 | SimpleKey SK; |
904 | SK.Tok = TokenQueue.begin(); |
905 | if (!is_contained(Range&: SimpleKeys, Element: SK)) |
906 | break; |
907 | else |
908 | NeedMore = true; |
909 | } |
910 | return TokenQueue.front(); |
911 | } |
912 | |
913 | Token Scanner::getNext() { |
914 | Token Ret = peekNext(); |
915 | // TokenQueue can be empty if there was an error getting the next token. |
916 | if (!TokenQueue.empty()) |
917 | TokenQueue.pop_front(); |
918 | |
919 | // There cannot be any referenced Token's if the TokenQueue is empty. So do a |
920 | // quick deallocation of them all. |
921 | if (TokenQueue.empty()) |
922 | TokenQueue.resetAlloc(); |
923 | |
924 | return Ret; |
925 | } |
926 | |
927 | StringRef::iterator Scanner::skip_nb_char(StringRef::iterator Position) { |
928 | if (Position == End) |
929 | return Position; |
930 | // Check 7 bit c-printable - b-char. |
931 | if ( *Position == 0x09 |
932 | || (*Position >= 0x20 && *Position <= 0x7E)) |
933 | return Position + 1; |
934 | |
935 | // Check for valid UTF-8. |
936 | if (uint8_t(*Position) & 0x80) { |
937 | UTF8Decoded u8d = decodeUTF8(Position); |
938 | if ( u8d.second != 0 |
939 | && u8d.first != 0xFEFF |
940 | && ( u8d.first == 0x85 |
941 | || ( u8d.first >= 0xA0 |
942 | && u8d.first <= 0xD7FF) |
943 | || ( u8d.first >= 0xE000 |
944 | && u8d.first <= 0xFFFD) |
945 | || ( u8d.first >= 0x10000 |
946 | && u8d.first <= 0x10FFFF))) |
947 | return Position + u8d.second; |
948 | } |
949 | return Position; |
950 | } |
951 | |
952 | StringRef::iterator Scanner::skip_b_break(StringRef::iterator Position) { |
953 | if (Position == End) |
954 | return Position; |
955 | if (*Position == 0x0D) { |
956 | if (Position + 1 != End && *(Position + 1) == 0x0A) |
957 | return Position + 2; |
958 | return Position + 1; |
959 | } |
960 | |
961 | if (*Position == 0x0A) |
962 | return Position + 1; |
963 | return Position; |
964 | } |
965 | |
966 | StringRef::iterator Scanner::skip_s_space(StringRef::iterator Position) { |
967 | if (Position == End) |
968 | return Position; |
969 | if (*Position == ' ') |
970 | return Position + 1; |
971 | return Position; |
972 | } |
973 | |
974 | StringRef::iterator Scanner::skip_s_white(StringRef::iterator Position) { |
975 | if (Position == End) |
976 | return Position; |
977 | if (*Position == ' ' || *Position == '\t') |
978 | return Position + 1; |
979 | return Position; |
980 | } |
981 | |
982 | StringRef::iterator Scanner::skip_ns_char(StringRef::iterator Position) { |
983 | if (Position == End) |
984 | return Position; |
985 | if (*Position == ' ' || *Position == '\t') |
986 | return Position; |
987 | return skip_nb_char(Position); |
988 | } |
989 | |
990 | StringRef::iterator Scanner::skip_while( SkipWhileFunc Func |
991 | , StringRef::iterator Position) { |
992 | while (true) { |
993 | StringRef::iterator i = (this->*Func)(Position); |
994 | if (i == Position) |
995 | break; |
996 | Position = i; |
997 | } |
998 | return Position; |
999 | } |
1000 | |
1001 | void Scanner::advanceWhile(SkipWhileFunc Func) { |
1002 | auto Final = skip_while(Func, Position: Current); |
1003 | Column += Final - Current; |
1004 | Current = Final; |
1005 | } |
1006 | |
/// Return true if \p C is a hexadecimal digit, i.e. one of [0-9A-Fa-f].
///
/// Spelled out as explicit ASCII ranges so that letters outside a-f (which
/// are alphanumeric but not hexadecimal) are rejected.
static bool is_ns_hex_digit(const char C) {
  return (C >= '0' && C <= '9') || (C >= 'a' && C <= 'f') ||
         (C >= 'A' && C <= 'F');
}
1008 | |
/// Return true if \p C is a word character: a decimal digit, an ASCII letter,
/// or '-'.
static bool is_ns_word_char(const char C) {
  return C == '-' || (C >= '0' && C <= '9') || (C >= 'a' && C <= 'z') ||
         (C >= 'A' && C <= 'Z');
}
1010 | |
1011 | void Scanner::scan_ns_uri_char() { |
1012 | while (true) { |
1013 | if (Current == End) |
1014 | break; |
1015 | if (( *Current == '%' |
1016 | && Current + 2 < End |
1017 | && is_ns_hex_digit(C: *(Current + 1)) |
1018 | && is_ns_hex_digit(C: *(Current + 2))) |
1019 | || is_ns_word_char(C: *Current) |
1020 | || StringRef(Current, 1).find_first_of(Chars: "#;/?:@&=+$,_.!~*'()[]" ) |
1021 | != StringRef::npos) { |
1022 | ++Current; |
1023 | ++Column; |
1024 | } else |
1025 | break; |
1026 | } |
1027 | } |
1028 | |
1029 | bool Scanner::consume(uint32_t Expected) { |
1030 | if (Expected >= 0x80) { |
1031 | setError(Message: "Cannot consume non-ascii characters" , Position: Current); |
1032 | return false; |
1033 | } |
1034 | if (Current == End) |
1035 | return false; |
1036 | if (uint8_t(*Current) >= 0x80) { |
1037 | setError(Message: "Cannot consume non-ascii characters" , Position: Current); |
1038 | return false; |
1039 | } |
1040 | if (uint8_t(*Current) == Expected) { |
1041 | ++Current; |
1042 | ++Column; |
1043 | return true; |
1044 | } |
1045 | return false; |
1046 | } |
1047 | |
1048 | void Scanner::skip(uint32_t Distance) { |
1049 | Current += Distance; |
1050 | Column += Distance; |
1051 | assert(Current <= End && "Skipped past the end" ); |
1052 | } |
1053 | |
1054 | bool Scanner::isBlankOrBreak(StringRef::iterator Position) { |
1055 | if (Position == End) |
1056 | return false; |
1057 | return *Position == ' ' || *Position == '\t' || *Position == '\r' || |
1058 | *Position == '\n'; |
1059 | } |
1060 | |
1061 | bool Scanner::isPlainSafeNonBlank(StringRef::iterator Position) { |
1062 | if (Position == End || isBlankOrBreak(Position)) |
1063 | return false; |
1064 | if (FlowLevel && |
1065 | StringRef(Position, 1).find_first_of(Chars: ",[]{}" ) != StringRef::npos) |
1066 | return false; |
1067 | return true; |
1068 | } |
1069 | |
1070 | bool Scanner::isLineEmpty(StringRef Line) { |
1071 | for (const auto *Position = Line.begin(); Position != Line.end(); ++Position) |
1072 | if (!isBlankOrBreak(Position)) |
1073 | return false; |
1074 | return true; |
1075 | } |
1076 | |
1077 | bool Scanner::consumeLineBreakIfPresent() { |
1078 | auto Next = skip_b_break(Position: Current); |
1079 | if (Next == Current) |
1080 | return false; |
1081 | Column = 0; |
1082 | ++Line; |
1083 | Current = Next; |
1084 | return true; |
1085 | } |
1086 | |
1087 | void Scanner::saveSimpleKeyCandidate( TokenQueueT::iterator Tok |
1088 | , unsigned AtColumn |
1089 | , bool IsRequired) { |
1090 | if (IsSimpleKeyAllowed) { |
1091 | SimpleKey SK; |
1092 | SK.Tok = Tok; |
1093 | SK.Line = Line; |
1094 | SK.Column = AtColumn; |
1095 | SK.IsRequired = IsRequired; |
1096 | SK.FlowLevel = FlowLevel; |
1097 | SimpleKeys.push_back(Elt: SK); |
1098 | } |
1099 | } |
1100 | |
1101 | void Scanner::removeStaleSimpleKeyCandidates() { |
1102 | for (SmallVectorImpl<SimpleKey>::iterator i = SimpleKeys.begin(); |
1103 | i != SimpleKeys.end();) { |
1104 | if (i->Line != Line || i->Column + 1024 < Column) { |
1105 | if (i->IsRequired) |
1106 | setError( Message: "Could not find expected : for simple key" |
1107 | , Position: i->Tok->Range.begin()); |
1108 | i = SimpleKeys.erase(CI: i); |
1109 | } else |
1110 | ++i; |
1111 | } |
1112 | } |
1113 | |
1114 | void Scanner::removeSimpleKeyCandidatesOnFlowLevel(unsigned Level) { |
1115 | if (!SimpleKeys.empty() && (SimpleKeys.end() - 1)->FlowLevel == Level) |
1116 | SimpleKeys.pop_back(); |
1117 | } |
1118 | |
1119 | bool Scanner::unrollIndent(int ToColumn) { |
1120 | Token T; |
1121 | // Indentation is ignored in flow. |
1122 | if (FlowLevel != 0) |
1123 | return true; |
1124 | |
1125 | while (Indent > ToColumn) { |
1126 | T.Kind = Token::TK_BlockEnd; |
1127 | T.Range = StringRef(Current, 1); |
1128 | TokenQueue.push_back(V: T); |
1129 | Indent = Indents.pop_back_val(); |
1130 | } |
1131 | |
1132 | return true; |
1133 | } |
1134 | |
1135 | bool Scanner::rollIndent( int ToColumn |
1136 | , Token::TokenKind Kind |
1137 | , TokenQueueT::iterator InsertPoint) { |
1138 | if (FlowLevel) |
1139 | return true; |
1140 | if (Indent < ToColumn) { |
1141 | Indents.push_back(Elt: Indent); |
1142 | Indent = ToColumn; |
1143 | |
1144 | Token T; |
1145 | T.Kind = Kind; |
1146 | T.Range = StringRef(Current, 0); |
1147 | TokenQueue.insert(I: InsertPoint, V: T); |
1148 | } |
1149 | return true; |
1150 | } |
1151 | |
1152 | void Scanner::() { |
1153 | if (Current == End || *Current != '#') |
1154 | return; |
1155 | while (true) { |
1156 | // This may skip more than one byte, thus Column is only incremented |
1157 | // for code points. |
1158 | StringRef::iterator I = skip_nb_char(Position: Current); |
1159 | if (I == Current) |
1160 | break; |
1161 | Current = I; |
1162 | ++Column; |
1163 | } |
1164 | } |
1165 | |
1166 | void Scanner::scanToNextToken() { |
1167 | while (true) { |
1168 | while (Current != End && (*Current == ' ' || *Current == '\t')) { |
1169 | skip(Distance: 1); |
1170 | } |
1171 | |
1172 | skipComment(); |
1173 | |
1174 | // Skip EOL. |
1175 | StringRef::iterator i = skip_b_break(Position: Current); |
1176 | if (i == Current) |
1177 | break; |
1178 | Current = i; |
1179 | ++Line; |
1180 | Column = 0; |
1181 | // New lines may start a simple key. |
1182 | if (!FlowLevel) |
1183 | IsSimpleKeyAllowed = true; |
1184 | } |
1185 | } |
1186 | |
1187 | bool Scanner::scanStreamStart() { |
1188 | IsStartOfStream = false; |
1189 | |
1190 | EncodingInfo EI = getUnicodeEncoding(Input: currentInput()); |
1191 | |
1192 | Token T; |
1193 | T.Kind = Token::TK_StreamStart; |
1194 | T.Range = StringRef(Current, EI.second); |
1195 | TokenQueue.push_back(V: T); |
1196 | Current += EI.second; |
1197 | return true; |
1198 | } |
1199 | |
1200 | bool Scanner::scanStreamEnd() { |
1201 | // Force an ending new line if one isn't present. |
1202 | if (Column != 0) { |
1203 | Column = 0; |
1204 | ++Line; |
1205 | } |
1206 | |
1207 | unrollIndent(ToColumn: -1); |
1208 | SimpleKeys.clear(); |
1209 | IsSimpleKeyAllowed = false; |
1210 | IsAdjacentValueAllowedInFlow = false; |
1211 | |
1212 | Token T; |
1213 | T.Kind = Token::TK_StreamEnd; |
1214 | T.Range = StringRef(Current, 0); |
1215 | TokenQueue.push_back(V: T); |
1216 | return true; |
1217 | } |
1218 | |
1219 | bool Scanner::scanDirective() { |
1220 | // Reset the indentation level. |
1221 | unrollIndent(ToColumn: -1); |
1222 | SimpleKeys.clear(); |
1223 | IsSimpleKeyAllowed = false; |
1224 | IsAdjacentValueAllowedInFlow = false; |
1225 | |
1226 | StringRef::iterator Start = Current; |
1227 | consume(Expected: '%'); |
1228 | StringRef::iterator NameStart = Current; |
1229 | Current = skip_while(Func: &Scanner::skip_ns_char, Position: Current); |
1230 | StringRef Name(NameStart, Current - NameStart); |
1231 | Current = skip_while(Func: &Scanner::skip_s_white, Position: Current); |
1232 | |
1233 | Token T; |
1234 | if (Name == "YAML" ) { |
1235 | Current = skip_while(Func: &Scanner::skip_ns_char, Position: Current); |
1236 | T.Kind = Token::TK_VersionDirective; |
1237 | T.Range = StringRef(Start, Current - Start); |
1238 | TokenQueue.push_back(V: T); |
1239 | return true; |
1240 | } else if(Name == "TAG" ) { |
1241 | Current = skip_while(Func: &Scanner::skip_ns_char, Position: Current); |
1242 | Current = skip_while(Func: &Scanner::skip_s_white, Position: Current); |
1243 | Current = skip_while(Func: &Scanner::skip_ns_char, Position: Current); |
1244 | T.Kind = Token::TK_TagDirective; |
1245 | T.Range = StringRef(Start, Current - Start); |
1246 | TokenQueue.push_back(V: T); |
1247 | return true; |
1248 | } |
1249 | return false; |
1250 | } |
1251 | |
1252 | bool Scanner::scanDocumentIndicator(bool IsStart) { |
1253 | unrollIndent(ToColumn: -1); |
1254 | SimpleKeys.clear(); |
1255 | IsSimpleKeyAllowed = false; |
1256 | IsAdjacentValueAllowedInFlow = false; |
1257 | |
1258 | Token T; |
1259 | T.Kind = IsStart ? Token::TK_DocumentStart : Token::TK_DocumentEnd; |
1260 | T.Range = StringRef(Current, 3); |
1261 | skip(Distance: 3); |
1262 | TokenQueue.push_back(V: T); |
1263 | return true; |
1264 | } |
1265 | |
1266 | bool Scanner::scanFlowCollectionStart(bool IsSequence) { |
1267 | Token T; |
1268 | T.Kind = IsSequence ? Token::TK_FlowSequenceStart |
1269 | : Token::TK_FlowMappingStart; |
1270 | T.Range = StringRef(Current, 1); |
1271 | skip(Distance: 1); |
1272 | TokenQueue.push_back(V: T); |
1273 | |
1274 | // [ and { may begin a simple key. |
1275 | saveSimpleKeyCandidate(Tok: --TokenQueue.end(), AtColumn: Column - 1, IsRequired: false); |
1276 | |
1277 | // And may also be followed by a simple key. |
1278 | IsSimpleKeyAllowed = true; |
1279 | // Adjacent values are allowed in flows only after JSON-style keys. |
1280 | IsAdjacentValueAllowedInFlow = false; |
1281 | ++FlowLevel; |
1282 | return true; |
1283 | } |
1284 | |
1285 | bool Scanner::scanFlowCollectionEnd(bool IsSequence) { |
1286 | removeSimpleKeyCandidatesOnFlowLevel(Level: FlowLevel); |
1287 | IsSimpleKeyAllowed = false; |
1288 | IsAdjacentValueAllowedInFlow = true; |
1289 | Token T; |
1290 | T.Kind = IsSequence ? Token::TK_FlowSequenceEnd |
1291 | : Token::TK_FlowMappingEnd; |
1292 | T.Range = StringRef(Current, 1); |
1293 | skip(Distance: 1); |
1294 | TokenQueue.push_back(V: T); |
1295 | if (FlowLevel) |
1296 | --FlowLevel; |
1297 | return true; |
1298 | } |
1299 | |
1300 | bool Scanner::scanFlowEntry() { |
1301 | removeSimpleKeyCandidatesOnFlowLevel(Level: FlowLevel); |
1302 | IsSimpleKeyAllowed = true; |
1303 | IsAdjacentValueAllowedInFlow = false; |
1304 | Token T; |
1305 | T.Kind = Token::TK_FlowEntry; |
1306 | T.Range = StringRef(Current, 1); |
1307 | skip(Distance: 1); |
1308 | TokenQueue.push_back(V: T); |
1309 | return true; |
1310 | } |
1311 | |
1312 | bool Scanner::scanBlockEntry() { |
1313 | rollIndent(ToColumn: Column, Kind: Token::TK_BlockSequenceStart, InsertPoint: TokenQueue.end()); |
1314 | removeSimpleKeyCandidatesOnFlowLevel(Level: FlowLevel); |
1315 | IsSimpleKeyAllowed = true; |
1316 | IsAdjacentValueAllowedInFlow = false; |
1317 | Token T; |
1318 | T.Kind = Token::TK_BlockEntry; |
1319 | T.Range = StringRef(Current, 1); |
1320 | skip(Distance: 1); |
1321 | TokenQueue.push_back(V: T); |
1322 | return true; |
1323 | } |
1324 | |
1325 | bool Scanner::scanKey() { |
1326 | if (!FlowLevel) |
1327 | rollIndent(ToColumn: Column, Kind: Token::TK_BlockMappingStart, InsertPoint: TokenQueue.end()); |
1328 | |
1329 | removeSimpleKeyCandidatesOnFlowLevel(Level: FlowLevel); |
1330 | IsSimpleKeyAllowed = !FlowLevel; |
1331 | IsAdjacentValueAllowedInFlow = false; |
1332 | |
1333 | Token T; |
1334 | T.Kind = Token::TK_Key; |
1335 | T.Range = StringRef(Current, 1); |
1336 | skip(Distance: 1); |
1337 | TokenQueue.push_back(V: T); |
1338 | return true; |
1339 | } |
1340 | |
1341 | bool Scanner::scanValue() { |
1342 | // If the previous token could have been a simple key, insert the key token |
1343 | // into the token queue. |
1344 | if (!SimpleKeys.empty()) { |
1345 | SimpleKey SK = SimpleKeys.pop_back_val(); |
1346 | Token T; |
1347 | T.Kind = Token::TK_Key; |
1348 | T.Range = SK.Tok->Range; |
1349 | TokenQueueT::iterator i, e; |
1350 | for (i = TokenQueue.begin(), e = TokenQueue.end(); i != e; ++i) { |
1351 | if (i == SK.Tok) |
1352 | break; |
1353 | } |
1354 | if (i == e) { |
1355 | Failed = true; |
1356 | return false; |
1357 | } |
1358 | i = TokenQueue.insert(I: i, V: T); |
1359 | |
1360 | // We may also need to add a Block-Mapping-Start token. |
1361 | rollIndent(ToColumn: SK.Column, Kind: Token::TK_BlockMappingStart, InsertPoint: i); |
1362 | |
1363 | IsSimpleKeyAllowed = false; |
1364 | } else { |
1365 | if (!FlowLevel) |
1366 | rollIndent(ToColumn: Column, Kind: Token::TK_BlockMappingStart, InsertPoint: TokenQueue.end()); |
1367 | IsSimpleKeyAllowed = !FlowLevel; |
1368 | } |
1369 | IsAdjacentValueAllowedInFlow = false; |
1370 | |
1371 | Token T; |
1372 | T.Kind = Token::TK_Value; |
1373 | T.Range = StringRef(Current, 1); |
1374 | skip(Distance: 1); |
1375 | TokenQueue.push_back(V: T); |
1376 | return true; |
1377 | } |
1378 | |
1379 | // Forbidding inlining improves performance by roughly 20%. |
1380 | // FIXME: Remove once llvm optimizes this to the faster version without hints. |
1381 | LLVM_ATTRIBUTE_NOINLINE static bool |
1382 | wasEscaped(StringRef::iterator First, StringRef::iterator Position); |
1383 | |
1384 | // Returns whether a character at 'Position' was escaped with a leading '\'. |
1385 | // 'First' specifies the position of the first character in the string. |
1386 | static bool wasEscaped(StringRef::iterator First, |
1387 | StringRef::iterator Position) { |
1388 | assert(Position - 1 >= First); |
1389 | StringRef::iterator I = Position - 1; |
1390 | // We calculate the number of consecutive '\'s before the current position |
1391 | // by iterating backwards through our string. |
1392 | while (I >= First && *I == '\\') --I; |
1393 | // (Position - 1 - I) now contains the number of '\'s before the current |
1394 | // position. If it is odd, the character at 'Position' was escaped. |
1395 | return (Position - 1 - I) % 2 == 1; |
1396 | } |
1397 | |
1398 | bool Scanner::scanFlowScalar(bool IsDoubleQuoted) { |
1399 | StringRef::iterator Start = Current; |
1400 | unsigned ColStart = Column; |
1401 | if (IsDoubleQuoted) { |
1402 | do { |
1403 | ++Current; |
1404 | while (Current != End && *Current != '"') |
1405 | ++Current; |
1406 | // Repeat until the previous character was not a '\' or was an escaped |
1407 | // backslash. |
1408 | } while ( Current != End |
1409 | && *(Current - 1) == '\\' |
1410 | && wasEscaped(First: Start + 1, Position: Current)); |
1411 | } else { |
1412 | skip(Distance: 1); |
1413 | while (Current != End) { |
1414 | // Skip a ' followed by another '. |
1415 | if (Current + 1 < End && *Current == '\'' && *(Current + 1) == '\'') { |
1416 | skip(Distance: 2); |
1417 | continue; |
1418 | } else if (*Current == '\'') |
1419 | break; |
1420 | StringRef::iterator i = skip_nb_char(Position: Current); |
1421 | if (i == Current) { |
1422 | i = skip_b_break(Position: Current); |
1423 | if (i == Current) |
1424 | break; |
1425 | Current = i; |
1426 | Column = 0; |
1427 | ++Line; |
1428 | } else { |
1429 | if (i == End) |
1430 | break; |
1431 | Current = i; |
1432 | ++Column; |
1433 | } |
1434 | } |
1435 | } |
1436 | |
1437 | if (Current == End) { |
1438 | setError(Message: "Expected quote at end of scalar" , Position: Current); |
1439 | return false; |
1440 | } |
1441 | |
1442 | skip(Distance: 1); // Skip ending quote. |
1443 | Token T; |
1444 | T.Kind = Token::TK_Scalar; |
1445 | T.Range = StringRef(Start, Current - Start); |
1446 | TokenQueue.push_back(V: T); |
1447 | |
1448 | saveSimpleKeyCandidate(Tok: --TokenQueue.end(), AtColumn: ColStart, IsRequired: false); |
1449 | |
1450 | IsSimpleKeyAllowed = false; |
1451 | IsAdjacentValueAllowedInFlow = true; |
1452 | |
1453 | return true; |
1454 | } |
1455 | |
1456 | bool Scanner::scanPlainScalar() { |
1457 | StringRef::iterator Start = Current; |
1458 | unsigned ColStart = Column; |
1459 | unsigned LeadingBlanks = 0; |
1460 | assert(Indent >= -1 && "Indent must be >= -1 !" ); |
1461 | unsigned indent = static_cast<unsigned>(Indent + 1); |
1462 | while (Current != End) { |
1463 | if (*Current == '#') |
1464 | break; |
1465 | |
1466 | while (Current != End && |
1467 | ((*Current != ':' && isPlainSafeNonBlank(Position: Current)) || |
1468 | (*Current == ':' && isPlainSafeNonBlank(Position: Current + 1)))) { |
1469 | StringRef::iterator i = skip_nb_char(Position: Current); |
1470 | if (i == Current) |
1471 | break; |
1472 | Current = i; |
1473 | ++Column; |
1474 | } |
1475 | |
1476 | // Are we at the end? |
1477 | if (!isBlankOrBreak(Position: Current)) |
1478 | break; |
1479 | |
1480 | // Eat blanks. |
1481 | StringRef::iterator Tmp = Current; |
1482 | while (isBlankOrBreak(Position: Tmp)) { |
1483 | StringRef::iterator i = skip_s_white(Position: Tmp); |
1484 | if (i != Tmp) { |
1485 | if (LeadingBlanks && (Column < indent) && *Tmp == '\t') { |
1486 | setError(Message: "Found invalid tab character in indentation" , Position: Tmp); |
1487 | return false; |
1488 | } |
1489 | Tmp = i; |
1490 | ++Column; |
1491 | } else { |
1492 | i = skip_b_break(Position: Tmp); |
1493 | if (!LeadingBlanks) |
1494 | LeadingBlanks = 1; |
1495 | Tmp = i; |
1496 | Column = 0; |
1497 | ++Line; |
1498 | } |
1499 | } |
1500 | |
1501 | if (!FlowLevel && Column < indent) |
1502 | break; |
1503 | |
1504 | Current = Tmp; |
1505 | } |
1506 | if (Start == Current) { |
1507 | setError(Message: "Got empty plain scalar" , Position: Start); |
1508 | return false; |
1509 | } |
1510 | Token T; |
1511 | T.Kind = Token::TK_Scalar; |
1512 | T.Range = StringRef(Start, Current - Start); |
1513 | TokenQueue.push_back(V: T); |
1514 | |
1515 | // Plain scalars can be simple keys. |
1516 | saveSimpleKeyCandidate(Tok: --TokenQueue.end(), AtColumn: ColStart, IsRequired: false); |
1517 | |
1518 | IsSimpleKeyAllowed = false; |
1519 | IsAdjacentValueAllowedInFlow = false; |
1520 | |
1521 | return true; |
1522 | } |
1523 | |
1524 | bool Scanner::scanAliasOrAnchor(bool IsAlias) { |
1525 | StringRef::iterator Start = Current; |
1526 | unsigned ColStart = Column; |
1527 | skip(Distance: 1); |
1528 | while (Current != End) { |
1529 | if ( *Current == '[' || *Current == ']' |
1530 | || *Current == '{' || *Current == '}' |
1531 | || *Current == ',' |
1532 | || *Current == ':') |
1533 | break; |
1534 | StringRef::iterator i = skip_ns_char(Position: Current); |
1535 | if (i == Current) |
1536 | break; |
1537 | Current = i; |
1538 | ++Column; |
1539 | } |
1540 | |
1541 | if (Start + 1 == Current) { |
1542 | setError(Message: "Got empty alias or anchor" , Position: Start); |
1543 | return false; |
1544 | } |
1545 | |
1546 | Token T; |
1547 | T.Kind = IsAlias ? Token::TK_Alias : Token::TK_Anchor; |
1548 | T.Range = StringRef(Start, Current - Start); |
1549 | TokenQueue.push_back(V: T); |
1550 | |
1551 | // Alias and anchors can be simple keys. |
1552 | saveSimpleKeyCandidate(Tok: --TokenQueue.end(), AtColumn: ColStart, IsRequired: false); |
1553 | |
1554 | IsSimpleKeyAllowed = false; |
1555 | IsAdjacentValueAllowedInFlow = false; |
1556 | |
1557 | return true; |
1558 | } |
1559 | |
1560 | bool Scanner::scanBlockScalarIndicators(char &StyleIndicator, |
1561 | char &ChompingIndicator, |
1562 | unsigned &IndentIndicator, |
1563 | bool &IsDone) { |
1564 | StyleIndicator = scanBlockStyleIndicator(); |
1565 | if (!scanBlockScalarHeader(ChompingIndicator, IndentIndicator, IsDone)) |
1566 | return false; |
1567 | return true; |
1568 | } |
1569 | |
1570 | char Scanner::scanBlockStyleIndicator() { |
1571 | char Indicator = ' '; |
1572 | if (Current != End && (*Current == '>' || *Current == '|')) { |
1573 | Indicator = *Current; |
1574 | skip(Distance: 1); |
1575 | } |
1576 | return Indicator; |
1577 | } |
1578 | |
1579 | char Scanner::scanBlockChompingIndicator() { |
1580 | char Indicator = ' '; |
1581 | if (Current != End && (*Current == '+' || *Current == '-')) { |
1582 | Indicator = *Current; |
1583 | skip(Distance: 1); |
1584 | } |
1585 | return Indicator; |
1586 | } |
1587 | |
1588 | /// Get the number of line breaks after chomping. |
1589 | /// |
1590 | /// Return the number of trailing line breaks to emit, depending on |
1591 | /// \p ChompingIndicator. |
1592 | static unsigned getChompedLineBreaks(char ChompingIndicator, |
1593 | unsigned LineBreaks, StringRef Str) { |
1594 | if (ChompingIndicator == '-') // Strip all line breaks. |
1595 | return 0; |
1596 | if (ChompingIndicator == '+') // Keep all line breaks. |
1597 | return LineBreaks; |
1598 | // Clip trailing lines. |
1599 | return Str.empty() ? 0 : 1; |
1600 | } |
1601 | |
1602 | unsigned Scanner::scanBlockIndentationIndicator() { |
1603 | unsigned Indent = 0; |
1604 | if (Current != End && (*Current >= '1' && *Current <= '9')) { |
1605 | Indent = unsigned(*Current - '0'); |
1606 | skip(Distance: 1); |
1607 | } |
1608 | return Indent; |
1609 | } |
1610 | |
1611 | bool Scanner::(char &ChompingIndicator, |
1612 | unsigned &IndentIndicator, bool &IsDone) { |
1613 | auto Start = Current; |
1614 | |
1615 | ChompingIndicator = scanBlockChompingIndicator(); |
1616 | IndentIndicator = scanBlockIndentationIndicator(); |
1617 | // Check for the chomping indicator once again. |
1618 | if (ChompingIndicator == ' ') |
1619 | ChompingIndicator = scanBlockChompingIndicator(); |
1620 | Current = skip_while(Func: &Scanner::skip_s_white, Position: Current); |
1621 | skipComment(); |
1622 | |
1623 | if (Current == End) { // EOF, we have an empty scalar. |
1624 | Token T; |
1625 | T.Kind = Token::TK_BlockScalar; |
1626 | T.Range = StringRef(Start, Current - Start); |
1627 | TokenQueue.push_back(V: T); |
1628 | IsDone = true; |
1629 | return true; |
1630 | } |
1631 | |
1632 | if (!consumeLineBreakIfPresent()) { |
1633 | setError(Message: "Expected a line break after block scalar header" , Position: Current); |
1634 | return false; |
1635 | } |
1636 | return true; |
1637 | } |
1638 | |
1639 | bool Scanner::findBlockScalarIndent(unsigned &BlockIndent, |
1640 | unsigned BlockExitIndent, |
1641 | unsigned &LineBreaks, bool &IsDone) { |
1642 | unsigned MaxAllSpaceLineCharacters = 0; |
1643 | StringRef::iterator LongestAllSpaceLine; |
1644 | |
1645 | while (true) { |
1646 | advanceWhile(Func: &Scanner::skip_s_space); |
1647 | if (skip_nb_char(Position: Current) != Current) { |
1648 | // This line isn't empty, so try and find the indentation. |
1649 | if (Column <= BlockExitIndent) { // End of the block literal. |
1650 | IsDone = true; |
1651 | return true; |
1652 | } |
1653 | // We found the block's indentation. |
1654 | BlockIndent = Column; |
1655 | if (MaxAllSpaceLineCharacters > BlockIndent) { |
1656 | setError( |
1657 | Message: "Leading all-spaces line must be smaller than the block indent" , |
1658 | Position: LongestAllSpaceLine); |
1659 | return false; |
1660 | } |
1661 | return true; |
1662 | } |
1663 | if (skip_b_break(Position: Current) != Current && |
1664 | Column > MaxAllSpaceLineCharacters) { |
1665 | // Record the longest all-space line in case it's longer than the |
1666 | // discovered block indent. |
1667 | MaxAllSpaceLineCharacters = Column; |
1668 | LongestAllSpaceLine = Current; |
1669 | } |
1670 | |
1671 | // Check for EOF. |
1672 | if (Current == End) { |
1673 | IsDone = true; |
1674 | return true; |
1675 | } |
1676 | |
1677 | if (!consumeLineBreakIfPresent()) { |
1678 | IsDone = true; |
1679 | return true; |
1680 | } |
1681 | ++LineBreaks; |
1682 | } |
1683 | return true; |
1684 | } |
1685 | |
1686 | bool Scanner::scanBlockScalarIndent(unsigned BlockIndent, |
1687 | unsigned BlockExitIndent, bool &IsDone) { |
1688 | // Skip the indentation. |
1689 | while (Column < BlockIndent) { |
1690 | auto I = skip_s_space(Position: Current); |
1691 | if (I == Current) |
1692 | break; |
1693 | Current = I; |
1694 | ++Column; |
1695 | } |
1696 | |
1697 | if (skip_nb_char(Position: Current) == Current) |
1698 | return true; |
1699 | |
1700 | if (Column <= BlockExitIndent) { // End of the block literal. |
1701 | IsDone = true; |
1702 | return true; |
1703 | } |
1704 | |
1705 | if (Column < BlockIndent) { |
1706 | if (Current != End && *Current == '#') { // Trailing comment. |
1707 | IsDone = true; |
1708 | return true; |
1709 | } |
1710 | setError(Message: "A text line is less indented than the block scalar" , Position: Current); |
1711 | return false; |
1712 | } |
1713 | return true; // A normal text line. |
1714 | } |
1715 | |
1716 | bool Scanner::scanBlockScalar(bool IsLiteral) { |
1717 | assert(*Current == '|' || *Current == '>'); |
1718 | char StyleIndicator; |
1719 | char ChompingIndicator; |
1720 | unsigned BlockIndent; |
1721 | bool IsDone = false; |
1722 | if (!scanBlockScalarIndicators(StyleIndicator, ChompingIndicator, IndentIndicator&: BlockIndent, |
1723 | IsDone)) |
1724 | return false; |
1725 | if (IsDone) |
1726 | return true; |
1727 | bool IsFolded = StyleIndicator == '>'; |
1728 | |
1729 | const auto *Start = Current; |
1730 | unsigned BlockExitIndent = Indent < 0 ? 0 : (unsigned)Indent; |
1731 | unsigned LineBreaks = 0; |
1732 | if (BlockIndent == 0) { |
1733 | if (!findBlockScalarIndent(BlockIndent, BlockExitIndent, LineBreaks, |
1734 | IsDone)) |
1735 | return false; |
1736 | } |
1737 | |
1738 | // Scan the block's scalars body. |
1739 | SmallString<256> Str; |
1740 | while (!IsDone) { |
1741 | if (!scanBlockScalarIndent(BlockIndent, BlockExitIndent, IsDone)) |
1742 | return false; |
1743 | if (IsDone) |
1744 | break; |
1745 | |
1746 | // Parse the current line. |
1747 | auto LineStart = Current; |
1748 | advanceWhile(Func: &Scanner::skip_nb_char); |
1749 | if (LineStart != Current) { |
1750 | if (LineBreaks && IsFolded && !Scanner::isLineEmpty(Line: Str)) { |
1751 | // The folded style "folds" any single line break between content into a |
1752 | // single space, except when that content is "empty" (only contains |
1753 | // whitespace) in which case the line break is left as-is. |
1754 | if (LineBreaks == 1) { |
1755 | Str.append(NumInputs: LineBreaks, |
1756 | Elt: isLineEmpty(Line: StringRef(LineStart, Current - LineStart)) |
1757 | ? '\n' |
1758 | : ' '); |
1759 | } |
1760 | // If we saw a single line break, we are completely replacing it and so |
1761 | // want `LineBreaks == 0`. Otherwise this decrement accounts for the |
1762 | // fact that the first line break is "trimmed", only being used to |
1763 | // signal a sequence of line breaks which should not be folded. |
1764 | LineBreaks--; |
1765 | } |
1766 | Str.append(NumInputs: LineBreaks, Elt: '\n'); |
1767 | Str.append(RHS: StringRef(LineStart, Current - LineStart)); |
1768 | LineBreaks = 0; |
1769 | } |
1770 | |
1771 | // Check for EOF. |
1772 | if (Current == End) |
1773 | break; |
1774 | |
1775 | if (!consumeLineBreakIfPresent()) |
1776 | break; |
1777 | ++LineBreaks; |
1778 | } |
1779 | |
1780 | if (Current == End && !LineBreaks) |
1781 | // Ensure that there is at least one line break before the end of file. |
1782 | LineBreaks = 1; |
1783 | Str.append(NumInputs: getChompedLineBreaks(ChompingIndicator, LineBreaks, Str), Elt: '\n'); |
1784 | |
1785 | // New lines may start a simple key. |
1786 | if (!FlowLevel) |
1787 | IsSimpleKeyAllowed = true; |
1788 | IsAdjacentValueAllowedInFlow = false; |
1789 | |
1790 | Token T; |
1791 | T.Kind = Token::TK_BlockScalar; |
1792 | T.Range = StringRef(Start, Current - Start); |
1793 | T.Value = std::string(Str); |
1794 | TokenQueue.push_back(V: T); |
1795 | return true; |
1796 | } |
1797 | |
1798 | bool Scanner::scanTag() { |
1799 | StringRef::iterator Start = Current; |
1800 | unsigned ColStart = Column; |
1801 | skip(Distance: 1); // Eat !. |
1802 | if (Current == End || isBlankOrBreak(Position: Current)); // An empty tag. |
1803 | else if (*Current == '<') { |
1804 | skip(Distance: 1); |
1805 | scan_ns_uri_char(); |
1806 | if (!consume(Expected: '>')) |
1807 | return false; |
1808 | } else { |
1809 | // FIXME: Actually parse the c-ns-shorthand-tag rule. |
1810 | Current = skip_while(Func: &Scanner::skip_ns_char, Position: Current); |
1811 | } |
1812 | |
1813 | Token T; |
1814 | T.Kind = Token::TK_Tag; |
1815 | T.Range = StringRef(Start, Current - Start); |
1816 | TokenQueue.push_back(V: T); |
1817 | |
1818 | // Tags can be simple keys. |
1819 | saveSimpleKeyCandidate(Tok: --TokenQueue.end(), AtColumn: ColStart, IsRequired: false); |
1820 | |
1821 | IsSimpleKeyAllowed = false; |
1822 | IsAdjacentValueAllowedInFlow = false; |
1823 | |
1824 | return true; |
1825 | } |
1826 | |
1827 | bool Scanner::fetchMoreTokens() { |
1828 | if (IsStartOfStream) |
1829 | return scanStreamStart(); |
1830 | |
1831 | scanToNextToken(); |
1832 | |
1833 | if (Current == End) |
1834 | return scanStreamEnd(); |
1835 | |
1836 | removeStaleSimpleKeyCandidates(); |
1837 | |
1838 | unrollIndent(ToColumn: Column); |
1839 | |
1840 | if (Column == 0 && *Current == '%') |
1841 | return scanDirective(); |
1842 | |
1843 | if (Column == 0 && Current + 4 <= End |
1844 | && *Current == '-' |
1845 | && *(Current + 1) == '-' |
1846 | && *(Current + 2) == '-' |
1847 | && (Current + 3 == End || isBlankOrBreak(Position: Current + 3))) |
1848 | return scanDocumentIndicator(IsStart: true); |
1849 | |
1850 | if (Column == 0 && Current + 4 <= End |
1851 | && *Current == '.' |
1852 | && *(Current + 1) == '.' |
1853 | && *(Current + 2) == '.' |
1854 | && (Current + 3 == End || isBlankOrBreak(Position: Current + 3))) |
1855 | return scanDocumentIndicator(IsStart: false); |
1856 | |
1857 | if (*Current == '[') |
1858 | return scanFlowCollectionStart(IsSequence: true); |
1859 | |
1860 | if (*Current == '{') |
1861 | return scanFlowCollectionStart(IsSequence: false); |
1862 | |
1863 | if (*Current == ']') |
1864 | return scanFlowCollectionEnd(IsSequence: true); |
1865 | |
1866 | if (*Current == '}') |
1867 | return scanFlowCollectionEnd(IsSequence: false); |
1868 | |
1869 | if (*Current == ',') |
1870 | return scanFlowEntry(); |
1871 | |
1872 | if (*Current == '-' && (isBlankOrBreak(Position: Current + 1) || Current + 1 == End)) |
1873 | return scanBlockEntry(); |
1874 | |
1875 | if (*Current == '?' && (Current + 1 == End || isBlankOrBreak(Position: Current + 1))) |
1876 | return scanKey(); |
1877 | |
1878 | if (*Current == ':' && |
1879 | (!isPlainSafeNonBlank(Position: Current + 1) || IsAdjacentValueAllowedInFlow)) |
1880 | return scanValue(); |
1881 | |
1882 | if (*Current == '*') |
1883 | return scanAliasOrAnchor(IsAlias: true); |
1884 | |
1885 | if (*Current == '&') |
1886 | return scanAliasOrAnchor(IsAlias: false); |
1887 | |
1888 | if (*Current == '!') |
1889 | return scanTag(); |
1890 | |
1891 | if (*Current == '|' && !FlowLevel) |
1892 | return scanBlockScalar(IsLiteral: true); |
1893 | |
1894 | if (*Current == '>' && !FlowLevel) |
1895 | return scanBlockScalar(IsLiteral: false); |
1896 | |
1897 | if (*Current == '\'') |
1898 | return scanFlowScalar(IsDoubleQuoted: false); |
1899 | |
1900 | if (*Current == '"') |
1901 | return scanFlowScalar(IsDoubleQuoted: true); |
1902 | |
1903 | // Get a plain scalar. |
1904 | StringRef FirstChar(Current, 1); |
1905 | if ((!isBlankOrBreak(Position: Current) && |
1906 | FirstChar.find_first_of(Chars: "-?:,[]{}#&*!|>'\"%@`" ) == StringRef::npos) || |
1907 | (FirstChar.find_first_of(Chars: "?:-" ) != StringRef::npos && |
1908 | isPlainSafeNonBlank(Position: Current + 1))) |
1909 | return scanPlainScalar(); |
1910 | |
1911 | setError(Message: "Unrecognized character while tokenizing." , Position: Current); |
1912 | return false; |
1913 | } |
1914 | |
1915 | Stream::Stream(StringRef Input, SourceMgr &SM, bool ShowColors, |
1916 | std::error_code *EC) |
1917 | : scanner(new Scanner(Input, SM, ShowColors, EC)) {} |
1918 | |
1919 | Stream::Stream(MemoryBufferRef InputBuffer, SourceMgr &SM, bool ShowColors, |
1920 | std::error_code *EC) |
1921 | : scanner(new Scanner(InputBuffer, SM, ShowColors, EC)) {} |
1922 | |
/// Out-of-line defaulted destructor.
Stream::~Stream() = default;
1924 | |
/// \returns whatever the underlying scanner's failed() reports.
bool Stream::failed() { return scanner->failed(); }
1926 | |
1927 | void Stream::printError(Node *N, const Twine &Msg, SourceMgr::DiagKind Kind) { |
1928 | printError(Range: N ? N->getSourceRange() : SMRange(), Msg, Kind); |
1929 | } |
1930 | |
1931 | void Stream::printError(const SMRange &Range, const Twine &Msg, |
1932 | SourceMgr::DiagKind Kind) { |
1933 | scanner->printError(Loc: Range.Start, Kind, Message: Msg, Ranges: Range); |
1934 | } |
1935 | |
1936 | document_iterator Stream::begin() { |
1937 | if (CurrentDoc) |
1938 | report_fatal_error(reason: "Can only iterate over the stream once" ); |
1939 | |
1940 | // Skip Stream-Start. |
1941 | scanner->getNext(); |
1942 | |
1943 | CurrentDoc.reset(p: new Document(*this)); |
1944 | return document_iterator(CurrentDoc); |
1945 | } |
1946 | |
/// \returns a default-constructed document_iterator, used as the end marker.
document_iterator Stream::end() {
  return document_iterator();
}
1950 | |
/// Iterate over every document of this stream and call skip() on each one.
void Stream::skip() {
  for (Document &Doc : *this)
    Doc.skip();
}
1955 | |
1956 | Node::Node(unsigned int Type, std::unique_ptr<Document> &D, StringRef A, |
1957 | StringRef T) |
1958 | : Doc(D), TypeID(Type), Anchor(A), Tag(T) { |
1959 | SMLoc Start = SMLoc::getFromPointer(Ptr: peekNext().Range.begin()); |
1960 | SourceRange = SMRange(Start, Start); |
1961 | } |
1962 | |
1963 | std::string Node::getVerbatimTag() const { |
1964 | StringRef Raw = getRawTag(); |
1965 | if (!Raw.empty() && Raw != "!" ) { |
1966 | std::string Ret; |
1967 | if (Raw.find_last_of(C: '!') == 0) { |
1968 | Ret = std::string(Doc->getTagMap().find(x: "!" )->second); |
1969 | Ret += Raw.substr(Start: 1); |
1970 | return Ret; |
1971 | } else if (Raw.starts_with(Prefix: "!!" )) { |
1972 | Ret = std::string(Doc->getTagMap().find(x: "!!" )->second); |
1973 | Ret += Raw.substr(Start: 2); |
1974 | return Ret; |
1975 | } else { |
1976 | StringRef TagHandle = Raw.substr(Start: 0, N: Raw.find_last_of(C: '!') + 1); |
1977 | std::map<StringRef, StringRef>::const_iterator It = |
1978 | Doc->getTagMap().find(x: TagHandle); |
1979 | if (It != Doc->getTagMap().end()) |
1980 | Ret = std::string(It->second); |
1981 | else { |
1982 | Token T; |
1983 | T.Kind = Token::TK_Tag; |
1984 | T.Range = TagHandle; |
1985 | setError(Message: Twine("Unknown tag handle " ) + TagHandle, Location&: T); |
1986 | } |
1987 | Ret += Raw.substr(Start: Raw.find_last_of(C: '!') + 1); |
1988 | return Ret; |
1989 | } |
1990 | } |
1991 | |
1992 | switch (getType()) { |
1993 | case NK_Null: |
1994 | return "tag:yaml.org,2002:null" ; |
1995 | case NK_Scalar: |
1996 | case NK_BlockScalar: |
1997 | // TODO: Tag resolution. |
1998 | return "tag:yaml.org,2002:str" ; |
1999 | case NK_Mapping: |
2000 | return "tag:yaml.org,2002:map" ; |
2001 | case NK_Sequence: |
2002 | return "tag:yaml.org,2002:seq" ; |
2003 | } |
2004 | |
2005 | return "" ; |
2006 | } |
2007 | |
/// Forward to the owning document's peekNext().
Token &Node::peekNext() {
  return Doc->peekNext();
}
2011 | |
/// Forward to the owning document's getNext().
Token Node::getNext() {
  return Doc->getNext();
}
2015 | |
/// Forward to the owning document's parseBlockNode().
Node *Node::parseBlockNode() {
  return Doc->parseBlockNode();
}
2019 | |
/// \returns the owning document's NodeAllocator.
BumpPtrAllocator &Node::getAllocator() {
  return Doc->NodeAllocator;
}
2023 | |
2024 | void Node::setError(const Twine &Msg, Token &Tok) const { |
2025 | Doc->setError(Message: Msg, Location&: Tok); |
2026 | } |
2027 | |
/// \returns whatever the owning document's failed() reports.
bool Node::failed() const {
  return Doc->failed();
}
2031 | |
2032 | StringRef ScalarNode::getValue(SmallVectorImpl<char> &Storage) const { |
2033 | if (Value[0] == '"') |
2034 | return getDoubleQuotedValue(UnquotedValue: Value, Storage); |
2035 | if (Value[0] == '\'') |
2036 | return getSingleQuotedValue(RawValue: Value, Storage); |
2037 | return getPlainValue(RawValue: Value, Storage); |
2038 | } |
2039 | |
2040 | /// parseScalarValue - A common parsing routine for all flow scalar styles. |
2041 | /// It handles line break characters by itself, adds regular content characters |
2042 | /// to the result, and forwards escaped sequences to the provided routine for |
2043 | /// the style-specific processing. |
2044 | /// |
2045 | /// \param UnquotedValue - An input value without quotation marks. |
2046 | /// \param Storage - A storage for the result if the input value is multiline or |
2047 | /// contains escaped characters. |
2048 | /// \param LookupChars - A set of special characters to search in the input |
2049 | /// string. Should include line break characters and the escape character |
2050 | /// specific for the processing scalar style, if any. |
2051 | /// \param UnescapeCallback - This is called when the escape character is found |
2052 | /// in the input. |
2053 | /// \returns - The unfolded and unescaped value. |
2054 | static StringRef |
2055 | parseScalarValue(StringRef UnquotedValue, SmallVectorImpl<char> &Storage, |
2056 | StringRef LookupChars, |
2057 | std::function<StringRef(StringRef, SmallVectorImpl<char> &)> |
2058 | UnescapeCallback) { |
2059 | size_t I = UnquotedValue.find_first_of(Chars: LookupChars); |
2060 | if (I == StringRef::npos) |
2061 | return UnquotedValue; |
2062 | |
2063 | Storage.clear(); |
2064 | Storage.reserve(N: UnquotedValue.size()); |
2065 | char LastNewLineAddedAs = '\0'; |
2066 | for (; I != StringRef::npos; I = UnquotedValue.find_first_of(Chars: LookupChars)) { |
2067 | if (UnquotedValue[I] != '\r' && UnquotedValue[I] != '\n') { |
2068 | llvm::append_range(C&: Storage, R: UnquotedValue.take_front(N: I)); |
2069 | UnquotedValue = UnescapeCallback(UnquotedValue.drop_front(N: I), Storage); |
2070 | LastNewLineAddedAs = '\0'; |
2071 | continue; |
2072 | } |
2073 | if (size_t LastNonSWhite = UnquotedValue.find_last_not_of(Chars: " \t" , From: I); |
2074 | LastNonSWhite != StringRef::npos) { |
2075 | llvm::append_range(C&: Storage, R: UnquotedValue.take_front(N: LastNonSWhite + 1)); |
2076 | Storage.push_back(Elt: ' '); |
2077 | LastNewLineAddedAs = ' '; |
2078 | } else { |
2079 | // Note: we can't just check if the last character in Storage is ' ', |
2080 | // '\n', or something else; that would give a wrong result for double |
2081 | // quoted values containing an escaped space character before a new-line |
2082 | // character. |
2083 | switch (LastNewLineAddedAs) { |
2084 | case ' ': |
2085 | assert(!Storage.empty() && Storage.back() == ' '); |
2086 | Storage.back() = '\n'; |
2087 | LastNewLineAddedAs = '\n'; |
2088 | break; |
2089 | case '\n': |
2090 | assert(!Storage.empty() && Storage.back() == '\n'); |
2091 | Storage.push_back(Elt: '\n'); |
2092 | break; |
2093 | default: |
2094 | Storage.push_back(Elt: ' '); |
2095 | LastNewLineAddedAs = ' '; |
2096 | break; |
2097 | } |
2098 | } |
2099 | // Handle Windows-style EOL |
2100 | if (UnquotedValue.substr(Start: I, N: 2) == "\r\n" ) |
2101 | I++; |
2102 | UnquotedValue = UnquotedValue.drop_front(N: I + 1).ltrim(Chars: " \t" ); |
2103 | } |
2104 | llvm::append_range(C&: Storage, R&: UnquotedValue); |
2105 | return StringRef(Storage.begin(), Storage.size()); |
2106 | } |
2107 | |
2108 | StringRef |
2109 | ScalarNode::getDoubleQuotedValue(StringRef RawValue, |
2110 | SmallVectorImpl<char> &Storage) const { |
2111 | assert(RawValue.size() >= 2 && RawValue.front() == '"' && |
2112 | RawValue.back() == '"'); |
2113 | StringRef UnquotedValue = RawValue.substr(Start: 1, N: RawValue.size() - 2); |
2114 | |
2115 | auto UnescapeFunc = [this](StringRef UnquotedValue, |
2116 | SmallVectorImpl<char> &Storage) { |
2117 | assert(UnquotedValue.take_front(1) == "\\" ); |
2118 | if (UnquotedValue.size() == 1) { |
2119 | Token T; |
2120 | T.Range = UnquotedValue; |
2121 | setError(Msg: "Unrecognized escape code" , Tok&: T); |
2122 | Storage.clear(); |
2123 | return StringRef(); |
2124 | } |
2125 | UnquotedValue = UnquotedValue.drop_front(N: 1); |
2126 | switch (UnquotedValue[0]) { |
2127 | default: { |
2128 | Token T; |
2129 | T.Range = UnquotedValue.take_front(N: 1); |
2130 | setError(Msg: "Unrecognized escape code" , Tok&: T); |
2131 | Storage.clear(); |
2132 | return StringRef(); |
2133 | } |
2134 | case '\r': |
2135 | // Shrink the Windows-style EOL. |
2136 | if (UnquotedValue.size() >= 2 && UnquotedValue[1] == '\n') |
2137 | UnquotedValue = UnquotedValue.drop_front(N: 1); |
2138 | [[fallthrough]]; |
2139 | case '\n': |
2140 | return UnquotedValue.drop_front(N: 1).ltrim(Chars: " \t" ); |
2141 | case '0': |
2142 | Storage.push_back(Elt: 0x00); |
2143 | break; |
2144 | case 'a': |
2145 | Storage.push_back(Elt: 0x07); |
2146 | break; |
2147 | case 'b': |
2148 | Storage.push_back(Elt: 0x08); |
2149 | break; |
2150 | case 't': |
2151 | case 0x09: |
2152 | Storage.push_back(Elt: 0x09); |
2153 | break; |
2154 | case 'n': |
2155 | Storage.push_back(Elt: 0x0A); |
2156 | break; |
2157 | case 'v': |
2158 | Storage.push_back(Elt: 0x0B); |
2159 | break; |
2160 | case 'f': |
2161 | Storage.push_back(Elt: 0x0C); |
2162 | break; |
2163 | case 'r': |
2164 | Storage.push_back(Elt: 0x0D); |
2165 | break; |
2166 | case 'e': |
2167 | Storage.push_back(Elt: 0x1B); |
2168 | break; |
2169 | case ' ': |
2170 | Storage.push_back(Elt: 0x20); |
2171 | break; |
2172 | case '"': |
2173 | Storage.push_back(Elt: 0x22); |
2174 | break; |
2175 | case '/': |
2176 | Storage.push_back(Elt: 0x2F); |
2177 | break; |
2178 | case '\\': |
2179 | Storage.push_back(Elt: 0x5C); |
2180 | break; |
2181 | case 'N': |
2182 | encodeUTF8(UnicodeScalarValue: 0x85, Result&: Storage); |
2183 | break; |
2184 | case '_': |
2185 | encodeUTF8(UnicodeScalarValue: 0xA0, Result&: Storage); |
2186 | break; |
2187 | case 'L': |
2188 | encodeUTF8(UnicodeScalarValue: 0x2028, Result&: Storage); |
2189 | break; |
2190 | case 'P': |
2191 | encodeUTF8(UnicodeScalarValue: 0x2029, Result&: Storage); |
2192 | break; |
2193 | case 'x': { |
2194 | if (UnquotedValue.size() < 3) |
2195 | // TODO: Report error. |
2196 | break; |
2197 | unsigned int UnicodeScalarValue; |
2198 | if (UnquotedValue.substr(Start: 1, N: 2).getAsInteger(Radix: 16, Result&: UnicodeScalarValue)) |
2199 | // TODO: Report error. |
2200 | UnicodeScalarValue = 0xFFFD; |
2201 | encodeUTF8(UnicodeScalarValue, Result&: Storage); |
2202 | return UnquotedValue.drop_front(N: 3); |
2203 | } |
2204 | case 'u': { |
2205 | if (UnquotedValue.size() < 5) |
2206 | // TODO: Report error. |
2207 | break; |
2208 | unsigned int UnicodeScalarValue; |
2209 | if (UnquotedValue.substr(Start: 1, N: 4).getAsInteger(Radix: 16, Result&: UnicodeScalarValue)) |
2210 | // TODO: Report error. |
2211 | UnicodeScalarValue = 0xFFFD; |
2212 | encodeUTF8(UnicodeScalarValue, Result&: Storage); |
2213 | return UnquotedValue.drop_front(N: 5); |
2214 | } |
2215 | case 'U': { |
2216 | if (UnquotedValue.size() < 9) |
2217 | // TODO: Report error. |
2218 | break; |
2219 | unsigned int UnicodeScalarValue; |
2220 | if (UnquotedValue.substr(Start: 1, N: 8).getAsInteger(Radix: 16, Result&: UnicodeScalarValue)) |
2221 | // TODO: Report error. |
2222 | UnicodeScalarValue = 0xFFFD; |
2223 | encodeUTF8(UnicodeScalarValue, Result&: Storage); |
2224 | return UnquotedValue.drop_front(N: 9); |
2225 | } |
2226 | } |
2227 | return UnquotedValue.drop_front(N: 1); |
2228 | }; |
2229 | |
2230 | return parseScalarValue(UnquotedValue, Storage, LookupChars: "\\\r\n" , UnescapeCallback: UnescapeFunc); |
2231 | } |
2232 | |
2233 | StringRef ScalarNode::getSingleQuotedValue(StringRef RawValue, |
2234 | SmallVectorImpl<char> &Storage) { |
2235 | assert(RawValue.size() >= 2 && RawValue.front() == '\'' && |
2236 | RawValue.back() == '\''); |
2237 | StringRef UnquotedValue = RawValue.substr(Start: 1, N: RawValue.size() - 2); |
2238 | |
2239 | auto UnescapeFunc = [](StringRef UnquotedValue, |
2240 | SmallVectorImpl<char> &Storage) { |
2241 | assert(UnquotedValue.take_front(2) == "''" ); |
2242 | Storage.push_back(Elt: '\''); |
2243 | return UnquotedValue.drop_front(N: 2); |
2244 | }; |
2245 | |
2246 | return parseScalarValue(UnquotedValue, Storage, LookupChars: "'\r\n" , UnescapeCallback: UnescapeFunc); |
2247 | } |
2248 | |
2249 | StringRef ScalarNode::getPlainValue(StringRef RawValue, |
2250 | SmallVectorImpl<char> &Storage) { |
2251 | // Trim trailing whitespace ('b-char' and 's-white'). |
2252 | // NOTE: Alternatively we could change the scanner to not include whitespace |
2253 | // here in the first place. |
2254 | RawValue = RawValue.rtrim(Chars: "\r\n \t" ); |
2255 | return parseScalarValue(UnquotedValue: RawValue, Storage, LookupChars: "\r\n" , UnescapeCallback: nullptr); |
2256 | } |
2257 | |
2258 | Node *KeyValueNode::getKey() { |
2259 | if (Key) |
2260 | return Key; |
2261 | // Handle implicit null keys. |
2262 | { |
2263 | Token &t = peekNext(); |
2264 | if ( t.Kind == Token::TK_BlockEnd |
2265 | || t.Kind == Token::TK_Value |
2266 | || t.Kind == Token::TK_Error) { |
2267 | return Key = new (getAllocator()) NullNode(Doc); |
2268 | } |
2269 | if (t.Kind == Token::TK_Key) |
2270 | getNext(); // skip TK_Key. |
2271 | } |
2272 | |
2273 | // Handle explicit null keys. |
2274 | Token &t = peekNext(); |
2275 | if (t.Kind == Token::TK_BlockEnd || t.Kind == Token::TK_Value) { |
2276 | return Key = new (getAllocator()) NullNode(Doc); |
2277 | } |
2278 | |
2279 | // We've got a normal key. |
2280 | return Key = parseBlockNode(); |
2281 | } |
2282 | |
2283 | Node *KeyValueNode::getValue() { |
2284 | if (Value) |
2285 | return Value; |
2286 | |
2287 | if (Node* Key = getKey()) |
2288 | Key->skip(); |
2289 | else { |
2290 | setError(Msg: "Null key in Key Value." , Tok&: peekNext()); |
2291 | return Value = new (getAllocator()) NullNode(Doc); |
2292 | } |
2293 | |
2294 | if (failed()) |
2295 | return Value = new (getAllocator()) NullNode(Doc); |
2296 | |
2297 | // Handle implicit null values. |
2298 | { |
2299 | Token &t = peekNext(); |
2300 | if ( t.Kind == Token::TK_BlockEnd |
2301 | || t.Kind == Token::TK_FlowMappingEnd |
2302 | || t.Kind == Token::TK_Key |
2303 | || t.Kind == Token::TK_FlowEntry |
2304 | || t.Kind == Token::TK_Error) { |
2305 | return Value = new (getAllocator()) NullNode(Doc); |
2306 | } |
2307 | |
2308 | if (t.Kind != Token::TK_Value) { |
2309 | setError(Msg: "Unexpected token in Key Value." , Tok&: t); |
2310 | return Value = new (getAllocator()) NullNode(Doc); |
2311 | } |
2312 | getNext(); // skip TK_Value. |
2313 | } |
2314 | |
2315 | // Handle explicit null values. |
2316 | Token &t = peekNext(); |
2317 | if (t.Kind == Token::TK_BlockEnd || t.Kind == Token::TK_Key) { |
2318 | return Value = new (getAllocator()) NullNode(Doc); |
2319 | } |
2320 | |
2321 | // We got a normal value. |
2322 | return Value = parseBlockNode(); |
2323 | } |
2324 | |
2325 | void MappingNode::increment() { |
2326 | if (failed()) { |
2327 | IsAtEnd = true; |
2328 | CurrentEntry = nullptr; |
2329 | return; |
2330 | } |
2331 | if (CurrentEntry) { |
2332 | CurrentEntry->skip(); |
2333 | if (Type == MT_Inline) { |
2334 | IsAtEnd = true; |
2335 | CurrentEntry = nullptr; |
2336 | return; |
2337 | } |
2338 | } |
2339 | Token T = peekNext(); |
2340 | if (T.Kind == Token::TK_Key || T.Kind == Token::TK_Scalar) { |
2341 | // KeyValueNode eats the TK_Key. That way it can detect null keys. |
2342 | CurrentEntry = new (getAllocator()) KeyValueNode(Doc); |
2343 | } else if (Type == MT_Block) { |
2344 | switch (T.Kind) { |
2345 | case Token::TK_BlockEnd: |
2346 | getNext(); |
2347 | IsAtEnd = true; |
2348 | CurrentEntry = nullptr; |
2349 | break; |
2350 | default: |
2351 | setError(Msg: "Unexpected token. Expected Key or Block End" , Tok&: T); |
2352 | [[fallthrough]]; |
2353 | case Token::TK_Error: |
2354 | IsAtEnd = true; |
2355 | CurrentEntry = nullptr; |
2356 | } |
2357 | } else { |
2358 | switch (T.Kind) { |
2359 | case Token::TK_FlowEntry: |
2360 | // Eat the flow entry and recurse. |
2361 | getNext(); |
2362 | return increment(); |
2363 | case Token::TK_FlowMappingEnd: |
2364 | getNext(); |
2365 | [[fallthrough]]; |
2366 | case Token::TK_Error: |
2367 | // Set this to end iterator. |
2368 | IsAtEnd = true; |
2369 | CurrentEntry = nullptr; |
2370 | break; |
2371 | default: |
2372 | setError( Msg: "Unexpected token. Expected Key, Flow Entry, or Flow " |
2373 | "Mapping End." |
2374 | , Tok&: T); |
2375 | IsAtEnd = true; |
2376 | CurrentEntry = nullptr; |
2377 | } |
2378 | } |
2379 | } |
2380 | |
2381 | void SequenceNode::increment() { |
2382 | if (failed()) { |
2383 | IsAtEnd = true; |
2384 | CurrentEntry = nullptr; |
2385 | return; |
2386 | } |
2387 | if (CurrentEntry) |
2388 | CurrentEntry->skip(); |
2389 | Token T = peekNext(); |
2390 | if (SeqType == ST_Block) { |
2391 | switch (T.Kind) { |
2392 | case Token::TK_BlockEntry: |
2393 | getNext(); |
2394 | CurrentEntry = parseBlockNode(); |
2395 | if (!CurrentEntry) { // An error occurred. |
2396 | IsAtEnd = true; |
2397 | CurrentEntry = nullptr; |
2398 | } |
2399 | break; |
2400 | case Token::TK_BlockEnd: |
2401 | getNext(); |
2402 | IsAtEnd = true; |
2403 | CurrentEntry = nullptr; |
2404 | break; |
2405 | default: |
2406 | setError( Msg: "Unexpected token. Expected Block Entry or Block End." |
2407 | , Tok&: T); |
2408 | [[fallthrough]]; |
2409 | case Token::TK_Error: |
2410 | IsAtEnd = true; |
2411 | CurrentEntry = nullptr; |
2412 | } |
2413 | } else if (SeqType == ST_Indentless) { |
2414 | switch (T.Kind) { |
2415 | case Token::TK_BlockEntry: |
2416 | getNext(); |
2417 | CurrentEntry = parseBlockNode(); |
2418 | if (!CurrentEntry) { // An error occurred. |
2419 | IsAtEnd = true; |
2420 | CurrentEntry = nullptr; |
2421 | } |
2422 | break; |
2423 | default: |
2424 | case Token::TK_Error: |
2425 | IsAtEnd = true; |
2426 | CurrentEntry = nullptr; |
2427 | } |
2428 | } else if (SeqType == ST_Flow) { |
2429 | switch (T.Kind) { |
2430 | case Token::TK_FlowEntry: |
2431 | // Eat the flow entry and recurse. |
2432 | getNext(); |
2433 | WasPreviousTokenFlowEntry = true; |
2434 | return increment(); |
2435 | case Token::TK_FlowSequenceEnd: |
2436 | getNext(); |
2437 | [[fallthrough]]; |
2438 | case Token::TK_Error: |
2439 | // Set this to end iterator. |
2440 | IsAtEnd = true; |
2441 | CurrentEntry = nullptr; |
2442 | break; |
2443 | case Token::TK_StreamEnd: |
2444 | case Token::TK_DocumentEnd: |
2445 | case Token::TK_DocumentStart: |
2446 | setError(Msg: "Could not find closing ]!" , Tok&: T); |
2447 | // Set this to end iterator. |
2448 | IsAtEnd = true; |
2449 | CurrentEntry = nullptr; |
2450 | break; |
2451 | default: |
2452 | if (!WasPreviousTokenFlowEntry) { |
2453 | setError(Msg: "Expected , between entries!" , Tok&: T); |
2454 | IsAtEnd = true; |
2455 | CurrentEntry = nullptr; |
2456 | break; |
2457 | } |
2458 | // Otherwise it must be a flow entry. |
2459 | CurrentEntry = parseBlockNode(); |
2460 | if (!CurrentEntry) { |
2461 | IsAtEnd = true; |
2462 | } |
2463 | WasPreviousTokenFlowEntry = false; |
2464 | break; |
2465 | } |
2466 | } |
2467 | } |
2468 | |
2469 | Document::Document(Stream &S) : stream(S), Root(nullptr) { |
2470 | // Tag maps starts with two default mappings. |
2471 | TagMap["!" ] = "!" ; |
2472 | TagMap["!!" ] = "tag:yaml.org,2002:" ; |
2473 | |
2474 | if (parseDirectives()) |
2475 | expectToken(TK: Token::TK_DocumentStart); |
2476 | Token &T = peekNext(); |
2477 | if (T.Kind == Token::TK_DocumentStart) |
2478 | getNext(); |
2479 | } |
2480 | |
2481 | bool Document::skip() { |
2482 | if (stream.scanner->failed()) |
2483 | return false; |
2484 | if (!Root && !getRoot()) |
2485 | return false; |
2486 | Root->skip(); |
2487 | Token &T = peekNext(); |
2488 | if (T.Kind == Token::TK_StreamEnd) |
2489 | return false; |
2490 | if (T.Kind == Token::TK_DocumentEnd) { |
2491 | getNext(); |
2492 | return skip(); |
2493 | } |
2494 | return true; |
2495 | } |
2496 | |
/// Forward to the stream scanner's peekNext().
Token &Document::peekNext() {
  return stream.scanner->peekNext();
}
2500 | |
/// Forward to the stream scanner's getNext().
Token Document::getNext() {
  return stream.scanner->getNext();
}
2504 | |
2505 | void Document::setError(const Twine &Message, Token &Location) const { |
2506 | stream.scanner->setError(Message, Position: Location.Range.begin()); |
2507 | } |
2508 | |
/// \returns whatever the stream scanner's failed() reports.
bool Document::failed() const {
  return stream.scanner->failed();
}
2512 | |
2513 | Node *Document::parseBlockNode() { |
2514 | Token T = peekNext(); |
2515 | // Handle properties. |
2516 | Token AnchorInfo; |
2517 | Token TagInfo; |
2518 | parse_property: |
2519 | switch (T.Kind) { |
2520 | case Token::TK_Alias: |
2521 | getNext(); |
2522 | return new (NodeAllocator) AliasNode(stream.CurrentDoc, T.Range.substr(Start: 1)); |
2523 | case Token::TK_Anchor: |
2524 | if (AnchorInfo.Kind == Token::TK_Anchor) { |
2525 | setError(Message: "Already encountered an anchor for this node!" , Location&: T); |
2526 | return nullptr; |
2527 | } |
2528 | AnchorInfo = getNext(); // Consume TK_Anchor. |
2529 | T = peekNext(); |
2530 | goto parse_property; |
2531 | case Token::TK_Tag: |
2532 | if (TagInfo.Kind == Token::TK_Tag) { |
2533 | setError(Message: "Already encountered a tag for this node!" , Location&: T); |
2534 | return nullptr; |
2535 | } |
2536 | TagInfo = getNext(); // Consume TK_Tag. |
2537 | T = peekNext(); |
2538 | goto parse_property; |
2539 | default: |
2540 | break; |
2541 | } |
2542 | |
2543 | switch (T.Kind) { |
2544 | case Token::TK_BlockEntry: |
2545 | // We got an unindented BlockEntry sequence. This is not terminated with |
2546 | // a BlockEnd. |
2547 | // Don't eat the TK_BlockEntry, SequenceNode needs it. |
2548 | return new (NodeAllocator) SequenceNode( stream.CurrentDoc |
2549 | , AnchorInfo.Range.substr(Start: 1) |
2550 | , TagInfo.Range |
2551 | , SequenceNode::ST_Indentless); |
2552 | case Token::TK_BlockSequenceStart: |
2553 | getNext(); |
2554 | return new (NodeAllocator) |
2555 | SequenceNode( stream.CurrentDoc |
2556 | , AnchorInfo.Range.substr(Start: 1) |
2557 | , TagInfo.Range |
2558 | , SequenceNode::ST_Block); |
2559 | case Token::TK_BlockMappingStart: |
2560 | getNext(); |
2561 | return new (NodeAllocator) |
2562 | MappingNode( stream.CurrentDoc |
2563 | , AnchorInfo.Range.substr(Start: 1) |
2564 | , TagInfo.Range |
2565 | , MappingNode::MT_Block); |
2566 | case Token::TK_FlowSequenceStart: |
2567 | getNext(); |
2568 | return new (NodeAllocator) |
2569 | SequenceNode( stream.CurrentDoc |
2570 | , AnchorInfo.Range.substr(Start: 1) |
2571 | , TagInfo.Range |
2572 | , SequenceNode::ST_Flow); |
2573 | case Token::TK_FlowMappingStart: |
2574 | getNext(); |
2575 | return new (NodeAllocator) |
2576 | MappingNode( stream.CurrentDoc |
2577 | , AnchorInfo.Range.substr(Start: 1) |
2578 | , TagInfo.Range |
2579 | , MappingNode::MT_Flow); |
2580 | case Token::TK_Scalar: |
2581 | getNext(); |
2582 | return new (NodeAllocator) |
2583 | ScalarNode( stream.CurrentDoc |
2584 | , AnchorInfo.Range.substr(Start: 1) |
2585 | , TagInfo.Range |
2586 | , T.Range); |
2587 | case Token::TK_BlockScalar: { |
2588 | getNext(); |
2589 | StringRef NullTerminatedStr(T.Value.c_str(), T.Value.length() + 1); |
2590 | StringRef StrCopy = NullTerminatedStr.copy(A&: NodeAllocator).drop_back(); |
2591 | return new (NodeAllocator) |
2592 | BlockScalarNode(stream.CurrentDoc, AnchorInfo.Range.substr(Start: 1), |
2593 | TagInfo.Range, StrCopy, T.Range); |
2594 | } |
2595 | case Token::TK_Key: |
2596 | // Don't eat the TK_Key, KeyValueNode expects it. |
2597 | return new (NodeAllocator) |
2598 | MappingNode( stream.CurrentDoc |
2599 | , AnchorInfo.Range.substr(Start: 1) |
2600 | , TagInfo.Range |
2601 | , MappingNode::MT_Inline); |
2602 | case Token::TK_DocumentStart: |
2603 | case Token::TK_DocumentEnd: |
2604 | case Token::TK_StreamEnd: |
2605 | default: |
2606 | // TODO: Properly handle tags. "[!!str ]" should resolve to !!str "", not |
2607 | // !!null null. |
2608 | return new (NodeAllocator) NullNode(stream.CurrentDoc); |
2609 | case Token::TK_FlowMappingEnd: |
2610 | case Token::TK_FlowSequenceEnd: |
2611 | case Token::TK_FlowEntry: { |
2612 | if (Root && (isa<MappingNode>(Val: Root) || isa<SequenceNode>(Val: Root))) |
2613 | return new (NodeAllocator) NullNode(stream.CurrentDoc); |
2614 | |
2615 | setError(Message: "Unexpected token" , Location&: T); |
2616 | return nullptr; |
2617 | } |
2618 | case Token::TK_Error: |
2619 | return nullptr; |
2620 | } |
2621 | llvm_unreachable("Control flow shouldn't reach here." ); |
2622 | return nullptr; |
2623 | } |
2624 | |
2625 | bool Document::parseDirectives() { |
2626 | bool isDirective = false; |
2627 | while (true) { |
2628 | Token T = peekNext(); |
2629 | if (T.Kind == Token::TK_TagDirective) { |
2630 | parseTAGDirective(); |
2631 | isDirective = true; |
2632 | } else if (T.Kind == Token::TK_VersionDirective) { |
2633 | parseYAMLDirective(); |
2634 | isDirective = true; |
2635 | } else |
2636 | break; |
2637 | } |
2638 | return isDirective; |
2639 | } |
2640 | |
/// Consume a %YAML directive token. The token is discarded; nothing in it is
/// inspected here.
void Document::parseYAMLDirective() {
  getNext(); // Eat %YAML <version>
}
2644 | |
2645 | void Document::parseTAGDirective() { |
2646 | Token Tag = getNext(); // %TAG <handle> <prefix> |
2647 | StringRef T = Tag.Range; |
2648 | // Strip %TAG |
2649 | T = T.substr(Start: T.find_first_of(Chars: " \t" )).ltrim(Chars: " \t" ); |
2650 | std::size_t HandleEnd = T.find_first_of(Chars: " \t" ); |
2651 | StringRef TagHandle = T.substr(Start: 0, N: HandleEnd); |
2652 | StringRef TagPrefix = T.substr(Start: HandleEnd).ltrim(Chars: " \t" ); |
2653 | TagMap[TagHandle] = TagPrefix; |
2654 | } |
2655 | |
2656 | bool Document::expectToken(int TK) { |
2657 | Token T = getNext(); |
2658 | if (T.Kind != TK) { |
2659 | setError(Message: "Unexpected token" , Location&: T); |
2660 | return false; |
2661 | } |
2662 | return true; |
2663 | } |
2664 | |