1 | //===--- CommentLexer.h - Lexer for structured comments ---------*- C++ -*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This file defines lexer for structured comments and supporting token class. |
10 | // |
11 | //===----------------------------------------------------------------------===// |
12 | |
13 | #ifndef LLVM_CLANG_AST_COMMENTLEXER_H |
14 | #define |
15 | |
16 | #include "clang/Basic/Diagnostic.h" |
17 | #include "clang/Basic/SourceManager.h" |
18 | #include "llvm/ADT/SmallString.h" |
19 | #include "llvm/ADT/StringRef.h" |
20 | #include "llvm/Support/Allocator.h" |
21 | #include "llvm/Support/raw_ostream.h" |
22 | |
23 | namespace clang { |
24 | namespace comments { |
25 | |
// Forward declarations. Within this header these types are only named in
// friend declarations or used through pointers and references before their
// definitions, so complete types are not required here.
class Lexer;
class TextTokenRetokenizer;
struct CommandInfo;
class CommandTraits;
30 | |
namespace tok {
/// Kinds of tokens that can appear in a structured comment.
///
/// The kind of a token determines which of its accessors may be called; every
/// kind-specific accessor asserts on the kind it expects.
enum TokenKind {
  eof,
  newline,
  text,
  unknown_command,   // Command that does not have an ID.
  backslash_command, // Command with an ID, that used backslash marker.
  at_command,        // Command with an ID, that used 'at' marker.
  verbatim_block_begin,
  verbatim_block_line,
  verbatim_block_end,
  verbatim_line_name,
  verbatim_line_text,
  html_start_tag,     // <tag
  html_ident,         // attr
  html_equals,        // =
  html_quoted_string, // "blah\"blah" or 'blah\'blah'
  html_greater,       // >
  html_slash_greater, // />
  html_end_tag        // </tag
};
} // end namespace tok
53 | |
54 | /// Comment token. |
55 | class { |
56 | friend class Lexer; |
57 | friend class TextTokenRetokenizer; |
58 | |
59 | /// The location of the token. |
60 | SourceLocation ; |
61 | |
62 | /// The actual kind of the token. |
63 | tok::TokenKind ; |
64 | |
65 | /// Integer value associated with a token. |
66 | /// |
67 | /// If the token is a known command, contains command ID and TextPtr is |
68 | /// unused (command spelling can be found with CommandTraits). Otherwise, |
69 | /// contains the length of the string that starts at TextPtr. |
70 | unsigned ; |
71 | |
72 | /// Length of the token spelling in comment. Can be 0 for synthenized |
73 | /// tokens. |
74 | unsigned ; |
75 | |
76 | /// Contains text value associated with a token. |
77 | const char *; |
78 | |
79 | public: |
80 | SourceLocation () const LLVM_READONLY { return Loc; } |
81 | void (SourceLocation SL) { Loc = SL; } |
82 | |
83 | SourceLocation () const LLVM_READONLY { |
84 | if (Length == 0 || Length == 1) |
85 | return Loc; |
86 | return Loc.getLocWithOffset(Offset: Length - 1); |
87 | } |
88 | |
89 | tok::TokenKind () const LLVM_READONLY { return Kind; } |
90 | void (tok::TokenKind K) { Kind = K; } |
91 | |
92 | bool (tok::TokenKind K) const LLVM_READONLY { return Kind == K; } |
93 | bool (tok::TokenKind K) const LLVM_READONLY { return Kind != K; } |
94 | |
95 | unsigned () const LLVM_READONLY { return Length; } |
96 | void (unsigned L) { Length = L; } |
97 | |
98 | StringRef () const LLVM_READONLY { |
99 | assert(is(tok::text)); |
100 | return StringRef(TextPtr, IntVal); |
101 | } |
102 | |
103 | void (StringRef Text) { |
104 | assert(is(tok::text)); |
105 | TextPtr = Text.data(); |
106 | IntVal = Text.size(); |
107 | } |
108 | |
109 | StringRef getUnknownCommandName() const LLVM_READONLY { |
110 | assert(is(tok::unknown_command)); |
111 | return StringRef(TextPtr, IntVal); |
112 | } |
113 | |
114 | void setUnknownCommandName(StringRef Name) { |
115 | assert(is(tok::unknown_command)); |
116 | TextPtr = Name.data(); |
117 | IntVal = Name.size(); |
118 | } |
119 | |
120 | unsigned getCommandID() const LLVM_READONLY { |
121 | assert(is(tok::backslash_command) || is(tok::at_command)); |
122 | return IntVal; |
123 | } |
124 | |
125 | void setCommandID(unsigned ID) { |
126 | assert(is(tok::backslash_command) || is(tok::at_command)); |
127 | IntVal = ID; |
128 | } |
129 | |
130 | unsigned () const LLVM_READONLY { |
131 | assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end)); |
132 | return IntVal; |
133 | } |
134 | |
135 | void (unsigned ID) { |
136 | assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end)); |
137 | IntVal = ID; |
138 | } |
139 | |
140 | StringRef () const LLVM_READONLY { |
141 | assert(is(tok::verbatim_block_line)); |
142 | return StringRef(TextPtr, IntVal); |
143 | } |
144 | |
145 | void (StringRef Text) { |
146 | assert(is(tok::verbatim_block_line)); |
147 | TextPtr = Text.data(); |
148 | IntVal = Text.size(); |
149 | } |
150 | |
151 | unsigned () const LLVM_READONLY { |
152 | assert(is(tok::verbatim_line_name)); |
153 | return IntVal; |
154 | } |
155 | |
156 | void (unsigned ID) { |
157 | assert(is(tok::verbatim_line_name)); |
158 | IntVal = ID; |
159 | } |
160 | |
161 | StringRef () const LLVM_READONLY { |
162 | assert(is(tok::verbatim_line_text)); |
163 | return StringRef(TextPtr, IntVal); |
164 | } |
165 | |
166 | void (StringRef Text) { |
167 | assert(is(tok::verbatim_line_text)); |
168 | TextPtr = Text.data(); |
169 | IntVal = Text.size(); |
170 | } |
171 | |
172 | StringRef () const LLVM_READONLY { |
173 | assert(is(tok::html_start_tag)); |
174 | return StringRef(TextPtr, IntVal); |
175 | } |
176 | |
177 | void (StringRef Name) { |
178 | assert(is(tok::html_start_tag)); |
179 | TextPtr = Name.data(); |
180 | IntVal = Name.size(); |
181 | } |
182 | |
183 | StringRef () const LLVM_READONLY { |
184 | assert(is(tok::html_ident)); |
185 | return StringRef(TextPtr, IntVal); |
186 | } |
187 | |
188 | void (StringRef Name) { |
189 | assert(is(tok::html_ident)); |
190 | TextPtr = Name.data(); |
191 | IntVal = Name.size(); |
192 | } |
193 | |
194 | StringRef () const LLVM_READONLY { |
195 | assert(is(tok::html_quoted_string)); |
196 | return StringRef(TextPtr, IntVal); |
197 | } |
198 | |
199 | void (StringRef Str) { |
200 | assert(is(tok::html_quoted_string)); |
201 | TextPtr = Str.data(); |
202 | IntVal = Str.size(); |
203 | } |
204 | |
205 | StringRef () const LLVM_READONLY { |
206 | assert(is(tok::html_end_tag)); |
207 | return StringRef(TextPtr, IntVal); |
208 | } |
209 | |
210 | void (StringRef Name) { |
211 | assert(is(tok::html_end_tag)); |
212 | TextPtr = Name.data(); |
213 | IntVal = Name.size(); |
214 | } |
215 | |
216 | void (const Lexer &L, const SourceManager &SM) const; |
217 | }; |
218 | |
219 | /// Comment lexer. |
220 | class { |
221 | private: |
222 | (const Lexer &) = delete; |
223 | void (const Lexer &) = delete; |
224 | |
225 | /// Allocator for strings that are semantic values of tokens and have to be |
226 | /// computed (for example, resolved decimal character references). |
227 | llvm::BumpPtrAllocator &; |
228 | |
229 | DiagnosticsEngine &; |
230 | |
231 | const CommandTraits &; |
232 | |
233 | const char *const ; |
234 | const char *const ; |
235 | |
236 | const char *; |
237 | |
238 | /// One past end pointer for the current comment. For BCPL comments points |
239 | /// to newline or BufferEnd, for C comments points to star in '*/'. |
240 | const char *; |
241 | |
242 | SourceLocation ; |
243 | |
244 | /// If true, the commands, html tags, etc will be parsed and reported as |
245 | /// separate tokens inside the comment body. If false, the comment text will |
246 | /// be parsed into text and newline tokens. |
247 | bool ParseCommands; |
248 | |
249 | enum : uint8_t { |
250 | , |
251 | , |
252 | , |
253 | |
254 | }; |
255 | |
256 | /// Low-level lexer state, track if we are inside or outside of comment. |
257 | LexerCommentState ; |
258 | |
259 | enum : uint8_t { |
260 | /// Lexing normal comment text |
261 | , |
262 | |
263 | /// Finished lexing verbatim block beginning command, will lex first body |
264 | /// line. |
265 | , |
266 | |
267 | /// Lexing verbatim block body line-by-line, skipping line-starting |
268 | /// decorations. |
269 | LS_VerbatimBlockBody, |
270 | |
271 | /// Finished lexing verbatim line beginning command, will lex text (one |
272 | /// line). |
273 | , |
274 | |
275 | /// Finished lexing \verbatim <TAG \endverbatim part, lexing tag attributes. |
276 | , |
277 | |
278 | /// Finished lexing \verbatim </TAG \endverbatim part, lexing '>'. |
279 | |
280 | }; |
281 | |
282 | /// Current lexing mode. |
283 | LexerState ; |
284 | |
285 | /// If State is LS_VerbatimBlock, contains the name of verbatim end |
286 | /// command, including command marker. |
287 | SmallString<16> VerbatimBlockEndCommandName; |
288 | |
289 | /// Given a character reference name (e.g., "lt"), return the character that |
290 | /// it stands for (e.g., "<"). |
291 | StringRef (StringRef Name) const; |
292 | |
293 | /// Given a Unicode codepoint as base-10 integer, return the character. |
294 | StringRef (StringRef Name) const; |
295 | |
296 | /// Given a Unicode codepoint as base-16 integer, return the character. |
297 | StringRef (StringRef Name) const; |
298 | |
299 | void (Token &Result, const char *TokEnd, |
300 | tok::TokenKind Kind); |
301 | |
302 | void (Token &Result, const char *TokEnd) { |
303 | StringRef Text(BufferPtr, TokEnd - BufferPtr); |
304 | formTokenWithChars(Result, TokEnd, Kind: tok::text); |
305 | Result.setText(Text); |
306 | } |
307 | |
308 | SourceLocation (const char *Loc) const { |
309 | assert(Loc >= BufferStart && Loc <= BufferEnd && |
310 | "Location out of range for this buffer!" ); |
311 | |
312 | const unsigned CharNo = Loc - BufferStart; |
313 | return FileLoc.getLocWithOffset(Offset: CharNo); |
314 | } |
315 | |
316 | DiagnosticBuilder (SourceLocation Loc, unsigned DiagID) { |
317 | return Diags.Report(Loc, DiagID); |
318 | } |
319 | |
320 | /// Eat string matching regexp \code \s*\* \endcode. |
321 | void (); |
322 | |
323 | /// Skip over pure text. |
324 | const char *(); |
325 | |
326 | /// Lex comment text, including commands if ParseCommands is set to true. |
327 | void (Token &T); |
328 | |
329 | void setupAndLexVerbatimBlock(Token &T, const char *TextBegin, char Marker, |
330 | const CommandInfo *Info); |
331 | |
332 | void (Token &T); |
333 | |
334 | void lexVerbatimBlockBody(Token &T); |
335 | |
336 | void setupAndLexVerbatimLine(Token &T, const char *TextBegin, |
337 | const CommandInfo *Info); |
338 | |
339 | void (Token &T); |
340 | |
341 | void (Token &T); |
342 | |
343 | void setupAndLexHTMLStartTag(Token &T); |
344 | |
345 | void (Token &T); |
346 | |
347 | void setupAndLexHTMLEndTag(Token &T); |
348 | |
349 | void (Token &T); |
350 | |
351 | public: |
352 | Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags, |
353 | const CommandTraits &Traits, SourceLocation FileLoc, |
354 | const char *BufferStart, const char *BufferEnd, |
355 | bool ParseCommands = true); |
356 | |
357 | void (Token &T); |
358 | |
359 | StringRef (const Token &Tok, const SourceManager &SourceMgr) const; |
360 | }; |
361 | |
362 | } // end namespace comments |
363 | } // end namespace clang |
364 | |
365 | #endif |
366 | |
367 | |