1//===--- CommentLexer.h - Lexer for structured comments ---------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
// This file defines the lexer for structured comments and the supporting
// token class.
10//
11//===----------------------------------------------------------------------===//
12
13#ifndef LLVM_CLANG_AST_COMMENTLEXER_H
14#define LLVM_CLANG_AST_COMMENTLEXER_H
15
16#include "clang/Basic/Diagnostic.h"
17#include "clang/Basic/SourceManager.h"
18#include "llvm/ADT/SmallString.h"
19#include "llvm/ADT/StringRef.h"
20#include "llvm/Support/Allocator.h"
21#include "llvm/Support/raw_ostream.h"
22
23namespace clang {
24namespace comments {
25
// Forward declarations.

// Command description types. This header only refers to them through a
// pointer (CommandInfo) or a reference (CommandTraits), so incomplete types
// are sufficient here.
struct CommandInfo;
class CommandTraits;

// Classes that Token befriends. Lexer is defined later in this header;
// TextTokenRetokenizer is not defined in this header.
class Lexer;
class TextTokenRetokenizer;
30
namespace tok {
/// Kinds of comment tokens. The value is stored in Token::Kind and selects
/// which of the typed Token accessors may be used on a given token.
enum TokenKind {
  //
  // Plain tokens.
  //
  eof,
  newline,
  text,

  //
  // Commands.
  //

  /// Command that does not have an ID.
  unknown_command,
  /// Command with an ID, that used backslash marker.
  backslash_command,
  /// Command with an ID, that used 'at' marker.
  at_command,

  //
  // Verbatim blocks and verbatim lines.
  //
  verbatim_block_begin,
  verbatim_block_line,
  verbatim_block_end,
  verbatim_line_name,
  verbatim_line_text,

  //
  // HTML.
  //

  /// <tag
  html_start_tag,
  /// attr
  html_ident,
  /// =
  html_equals,
  /// "blah\"blah" or 'blah\'blah'
  html_quoted_string,
  /// >
  html_greater,
  /// />
  html_slash_greater,
  /// </tag
  html_end_tag
};
} // end namespace tok
53
54/// Comment token.
55class Token {
56 friend class Lexer;
57 friend class TextTokenRetokenizer;
58
59 /// The location of the token.
60 SourceLocation Loc;
61
62 /// The actual kind of the token.
63 tok::TokenKind Kind;
64
65 /// Integer value associated with a token.
66 ///
67 /// If the token is a known command, contains command ID and TextPtr is
68 /// unused (command spelling can be found with CommandTraits). Otherwise,
69 /// contains the length of the string that starts at TextPtr.
70 unsigned IntVal;
71
72 /// Length of the token spelling in comment. Can be 0 for synthenized
73 /// tokens.
74 unsigned Length;
75
76 /// Contains text value associated with a token.
77 const char *TextPtr;
78
79public:
80 SourceLocation getLocation() const LLVM_READONLY { return Loc; }
81 void setLocation(SourceLocation SL) { Loc = SL; }
82
83 SourceLocation getEndLocation() const LLVM_READONLY {
84 if (Length == 0 || Length == 1)
85 return Loc;
86 return Loc.getLocWithOffset(Offset: Length - 1);
87 }
88
89 tok::TokenKind getKind() const LLVM_READONLY { return Kind; }
90 void setKind(tok::TokenKind K) { Kind = K; }
91
92 bool is(tok::TokenKind K) const LLVM_READONLY { return Kind == K; }
93 bool isNot(tok::TokenKind K) const LLVM_READONLY { return Kind != K; }
94
95 unsigned getLength() const LLVM_READONLY { return Length; }
96 void setLength(unsigned L) { Length = L; }
97
98 StringRef getText() const LLVM_READONLY {
99 assert(is(tok::text));
100 return StringRef(TextPtr, IntVal);
101 }
102
103 void setText(StringRef Text) {
104 assert(is(tok::text));
105 TextPtr = Text.data();
106 IntVal = Text.size();
107 }
108
109 StringRef getUnknownCommandName() const LLVM_READONLY {
110 assert(is(tok::unknown_command));
111 return StringRef(TextPtr, IntVal);
112 }
113
114 void setUnknownCommandName(StringRef Name) {
115 assert(is(tok::unknown_command));
116 TextPtr = Name.data();
117 IntVal = Name.size();
118 }
119
120 unsigned getCommandID() const LLVM_READONLY {
121 assert(is(tok::backslash_command) || is(tok::at_command));
122 return IntVal;
123 }
124
125 void setCommandID(unsigned ID) {
126 assert(is(tok::backslash_command) || is(tok::at_command));
127 IntVal = ID;
128 }
129
130 unsigned getVerbatimBlockID() const LLVM_READONLY {
131 assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end));
132 return IntVal;
133 }
134
135 void setVerbatimBlockID(unsigned ID) {
136 assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end));
137 IntVal = ID;
138 }
139
140 StringRef getVerbatimBlockText() const LLVM_READONLY {
141 assert(is(tok::verbatim_block_line));
142 return StringRef(TextPtr, IntVal);
143 }
144
145 void setVerbatimBlockText(StringRef Text) {
146 assert(is(tok::verbatim_block_line));
147 TextPtr = Text.data();
148 IntVal = Text.size();
149 }
150
151 unsigned getVerbatimLineID() const LLVM_READONLY {
152 assert(is(tok::verbatim_line_name));
153 return IntVal;
154 }
155
156 void setVerbatimLineID(unsigned ID) {
157 assert(is(tok::verbatim_line_name));
158 IntVal = ID;
159 }
160
161 StringRef getVerbatimLineText() const LLVM_READONLY {
162 assert(is(tok::verbatim_line_text));
163 return StringRef(TextPtr, IntVal);
164 }
165
166 void setVerbatimLineText(StringRef Text) {
167 assert(is(tok::verbatim_line_text));
168 TextPtr = Text.data();
169 IntVal = Text.size();
170 }
171
172 StringRef getHTMLTagStartName() const LLVM_READONLY {
173 assert(is(tok::html_start_tag));
174 return StringRef(TextPtr, IntVal);
175 }
176
177 void setHTMLTagStartName(StringRef Name) {
178 assert(is(tok::html_start_tag));
179 TextPtr = Name.data();
180 IntVal = Name.size();
181 }
182
183 StringRef getHTMLIdent() const LLVM_READONLY {
184 assert(is(tok::html_ident));
185 return StringRef(TextPtr, IntVal);
186 }
187
188 void setHTMLIdent(StringRef Name) {
189 assert(is(tok::html_ident));
190 TextPtr = Name.data();
191 IntVal = Name.size();
192 }
193
194 StringRef getHTMLQuotedString() const LLVM_READONLY {
195 assert(is(tok::html_quoted_string));
196 return StringRef(TextPtr, IntVal);
197 }
198
199 void setHTMLQuotedString(StringRef Str) {
200 assert(is(tok::html_quoted_string));
201 TextPtr = Str.data();
202 IntVal = Str.size();
203 }
204
205 StringRef getHTMLTagEndName() const LLVM_READONLY {
206 assert(is(tok::html_end_tag));
207 return StringRef(TextPtr, IntVal);
208 }
209
210 void setHTMLTagEndName(StringRef Name) {
211 assert(is(tok::html_end_tag));
212 TextPtr = Name.data();
213 IntVal = Name.size();
214 }
215
216 void dump(const Lexer &L, const SourceManager &SM) const;
217};
218
219/// Comment lexer.
220class Lexer {
221private:
222 Lexer(const Lexer &) = delete;
223 void operator=(const Lexer &) = delete;
224
225 /// Allocator for strings that are semantic values of tokens and have to be
226 /// computed (for example, resolved decimal character references).
227 llvm::BumpPtrAllocator &Allocator;
228
229 DiagnosticsEngine &Diags;
230
231 const CommandTraits &Traits;
232
233 const char *const BufferStart;
234 const char *const BufferEnd;
235
236 const char *BufferPtr;
237
238 /// One past end pointer for the current comment. For BCPL comments points
239 /// to newline or BufferEnd, for C comments points to star in '*/'.
240 const char *CommentEnd;
241
242 SourceLocation FileLoc;
243
244 /// If true, the commands, html tags, etc will be parsed and reported as
245 /// separate tokens inside the comment body. If false, the comment text will
246 /// be parsed into text and newline tokens.
247 bool ParseCommands;
248
249 enum LexerCommentState : uint8_t {
250 LCS_BeforeComment,
251 LCS_InsideBCPLComment,
252 LCS_InsideCComment,
253 LCS_BetweenComments
254 };
255
256 /// Low-level lexer state, track if we are inside or outside of comment.
257 LexerCommentState CommentState;
258
259 enum LexerState : uint8_t {
260 /// Lexing normal comment text
261 LS_Normal,
262
263 /// Finished lexing verbatim block beginning command, will lex first body
264 /// line.
265 LS_VerbatimBlockFirstLine,
266
267 /// Lexing verbatim block body line-by-line, skipping line-starting
268 /// decorations.
269 LS_VerbatimBlockBody,
270
271 /// Finished lexing verbatim line beginning command, will lex text (one
272 /// line).
273 LS_VerbatimLineText,
274
275 /// Finished lexing \verbatim <TAG \endverbatim part, lexing tag attributes.
276 LS_HTMLStartTag,
277
278 /// Finished lexing \verbatim </TAG \endverbatim part, lexing '>'.
279 LS_HTMLEndTag
280 };
281
282 /// Current lexing mode.
283 LexerState State;
284
285 /// If State is LS_VerbatimBlock, contains the name of verbatim end
286 /// command, including command marker.
287 SmallString<16> VerbatimBlockEndCommandName;
288
289 /// Given a character reference name (e.g., "lt"), return the character that
290 /// it stands for (e.g., "<").
291 StringRef resolveHTMLNamedCharacterReference(StringRef Name) const;
292
293 /// Given a Unicode codepoint as base-10 integer, return the character.
294 StringRef resolveHTMLDecimalCharacterReference(StringRef Name) const;
295
296 /// Given a Unicode codepoint as base-16 integer, return the character.
297 StringRef resolveHTMLHexCharacterReference(StringRef Name) const;
298
299 void formTokenWithChars(Token &Result, const char *TokEnd,
300 tok::TokenKind Kind);
301
302 void formTextToken(Token &Result, const char *TokEnd) {
303 StringRef Text(BufferPtr, TokEnd - BufferPtr);
304 formTokenWithChars(Result, TokEnd, Kind: tok::text);
305 Result.setText(Text);
306 }
307
308 SourceLocation getSourceLocation(const char *Loc) const {
309 assert(Loc >= BufferStart && Loc <= BufferEnd &&
310 "Location out of range for this buffer!");
311
312 const unsigned CharNo = Loc - BufferStart;
313 return FileLoc.getLocWithOffset(Offset: CharNo);
314 }
315
316 DiagnosticBuilder Diag(SourceLocation Loc, unsigned DiagID) {
317 return Diags.Report(Loc, DiagID);
318 }
319
320 /// Eat string matching regexp \code \s*\* \endcode.
321 void skipLineStartingDecorations();
322
323 /// Skip over pure text.
324 const char *skipTextToken();
325
326 /// Lex comment text, including commands if ParseCommands is set to true.
327 void lexCommentText(Token &T);
328
329 void setupAndLexVerbatimBlock(Token &T, const char *TextBegin, char Marker,
330 const CommandInfo *Info);
331
332 void lexVerbatimBlockFirstLine(Token &T);
333
334 void lexVerbatimBlockBody(Token &T);
335
336 void setupAndLexVerbatimLine(Token &T, const char *TextBegin,
337 const CommandInfo *Info);
338
339 void lexVerbatimLineText(Token &T);
340
341 void lexHTMLCharacterReference(Token &T);
342
343 void setupAndLexHTMLStartTag(Token &T);
344
345 void lexHTMLStartTag(Token &T);
346
347 void setupAndLexHTMLEndTag(Token &T);
348
349 void lexHTMLEndTag(Token &T);
350
351public:
352 Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags,
353 const CommandTraits &Traits, SourceLocation FileLoc,
354 const char *BufferStart, const char *BufferEnd,
355 bool ParseCommands = true);
356
357 void lex(Token &T);
358
359 StringRef getSpelling(const Token &Tok, const SourceManager &SourceMgr) const;
360};
361
362} // end namespace comments
363} // end namespace clang
364
365#endif
366
367