//===--- RawCommentList.cpp - Processing raw comments -----------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
8
#include "clang/AST/RawCommentList.h"
#include "clang/AST/ASTContext.h"
#include "clang/AST/Comment.h"
#include "clang/AST/CommentBriefParser.h"
#include "clang/AST/CommentCommandTraits.h"
#include "clang/AST/CommentLexer.h"
#include "clang/AST/CommentParser.h"
#include "clang/AST/CommentSema.h"
#include "clang/Basic/CharInfo.h"
#include "llvm/Support/Allocator.h"
#include <algorithm>
#include <cassert>
#include <cstring>

using namespace clang;
21
22namespace {
23/// Get comment kind and bool describing if it is a trailing comment.
24std::pair<RawComment::CommentKind, bool> getCommentKind(StringRef Comment,
25 bool ParseAllComments) {
26 const size_t MinCommentLength = ParseAllComments ? 2 : 3;
27 if ((Comment.size() < MinCommentLength) || Comment[0] != '/')
28 return std::make_pair(x: RawComment::RCK_Invalid, y: false);
29
30 RawComment::CommentKind K;
31 if (Comment[1] == '/') {
32 if (Comment.size() < 3)
33 return std::make_pair(x: RawComment::RCK_OrdinaryBCPL, y: false);
34
35 if (Comment[2] == '/')
36 K = RawComment::RCK_BCPLSlash;
37 else if (Comment[2] == '!')
38 K = RawComment::RCK_BCPLExcl;
39 else
40 return std::make_pair(x: RawComment::RCK_OrdinaryBCPL, y: false);
41 } else {
42 assert(Comment.size() >= 4);
43
44 // Comment lexer does not understand escapes in comment markers, so pretend
45 // that this is not a comment.
46 if (Comment[1] != '*' ||
47 Comment[Comment.size() - 2] != '*' ||
48 Comment[Comment.size() - 1] != '/')
49 return std::make_pair(x: RawComment::RCK_Invalid, y: false);
50
51 if (Comment[2] == '*')
52 K = RawComment::RCK_JavaDoc;
53 else if (Comment[2] == '!')
54 K = RawComment::RCK_Qt;
55 else
56 return std::make_pair(x: RawComment::RCK_OrdinaryC, y: false);
57 }
58 const bool TrailingComment = (Comment.size() > 3) && (Comment[3] == '<');
59 return std::make_pair(x&: K, y: TrailingComment);
60}
61
62bool mergedCommentIsTrailingComment(StringRef Comment) {
63 return (Comment.size() > 3) && (Comment[3] == '<');
64}
65
66/// Returns true if R1 and R2 both have valid locations that start on the same
67/// column.
68bool commentsStartOnSameColumn(const SourceManager &SM, const RawComment &R1,
69 const RawComment &R2) {
70 SourceLocation L1 = R1.getBeginLoc();
71 SourceLocation L2 = R2.getBeginLoc();
72 bool Invalid = false;
73 unsigned C1 = SM.getPresumedColumnNumber(Loc: L1, Invalid: &Invalid);
74 if (!Invalid) {
75 unsigned C2 = SM.getPresumedColumnNumber(Loc: L2, Invalid: &Invalid);
76 return !Invalid && (C1 == C2);
77 }
78 return false;
79}
80} // unnamed namespace
81
82/// Determines whether there is only whitespace in `Buffer` between `P`
83/// and the previous line.
84/// \param Buffer The buffer to search in.
85/// \param P The offset from the beginning of `Buffer` to start from.
86/// \return true if all of the characters in `Buffer` ranging from the closest
87/// line-ending character before `P` (or the beginning of `Buffer`) to `P - 1`
88/// are whitespace.
89static bool onlyWhitespaceOnLineBefore(const char *Buffer, unsigned P) {
90 // Search backwards until we see linefeed or carriage return.
91 for (unsigned I = P; I != 0; --I) {
92 char C = Buffer[I - 1];
93 if (isVerticalWhitespace(c: C))
94 return true;
95 if (!isHorizontalWhitespace(c: C))
96 return false;
97 }
98 // We hit the beginning of the buffer.
99 return true;
100}
101
102/// Returns whether `K` is an ordinary comment kind.
103static bool isOrdinaryKind(RawComment::CommentKind K) {
104 return (K == RawComment::RCK_OrdinaryBCPL) ||
105 (K == RawComment::RCK_OrdinaryC);
106}
107
108RawComment::RawComment(const SourceManager &SourceMgr, SourceRange SR,
109 const CommentOptions &CommentOpts, bool Merged) :
110 Range(SR), RawTextValid(false), BriefTextValid(false),
111 IsAttached(false), IsTrailingComment(false),
112 IsAlmostTrailingComment(false) {
113 // Extract raw comment text, if possible.
114 if (SR.getBegin() == SR.getEnd() || getRawText(SourceMgr).empty()) {
115 Kind = RCK_Invalid;
116 return;
117 }
118
119 // Guess comment kind.
120 std::pair<CommentKind, bool> K =
121 getCommentKind(Comment: RawText, ParseAllComments: CommentOpts.ParseAllComments);
122
123 // Guess whether an ordinary comment is trailing.
124 if (CommentOpts.ParseAllComments && isOrdinaryKind(K: K.first)) {
125 FileID BeginFileID;
126 unsigned BeginOffset;
127 std::tie(args&: BeginFileID, args&: BeginOffset) =
128 SourceMgr.getDecomposedLoc(Loc: Range.getBegin());
129 if (BeginOffset != 0) {
130 bool Invalid = false;
131 const char *Buffer =
132 SourceMgr.getBufferData(FID: BeginFileID, Invalid: &Invalid).data();
133 IsTrailingComment |=
134 (!Invalid && !onlyWhitespaceOnLineBefore(Buffer, P: BeginOffset));
135 }
136 }
137
138 if (!Merged) {
139 Kind = K.first;
140 IsTrailingComment |= K.second;
141
142 IsAlmostTrailingComment =
143 RawText.starts_with(Prefix: "//<") || RawText.starts_with(Prefix: "/*<");
144 } else {
145 Kind = RCK_Merged;
146 IsTrailingComment =
147 IsTrailingComment || mergedCommentIsTrailingComment(Comment: RawText);
148 }
149}
150
151StringRef RawComment::getRawTextSlow(const SourceManager &SourceMgr) const {
152 FileID BeginFileID;
153 FileID EndFileID;
154 unsigned BeginOffset;
155 unsigned EndOffset;
156
157 std::tie(args&: BeginFileID, args&: BeginOffset) =
158 SourceMgr.getDecomposedLoc(Loc: Range.getBegin());
159 std::tie(args&: EndFileID, args&: EndOffset) = SourceMgr.getDecomposedLoc(Loc: Range.getEnd());
160
161 const unsigned Length = EndOffset - BeginOffset;
162 if (Length < 2)
163 return StringRef();
164
165 // The comment can't begin in one file and end in another.
166 assert(BeginFileID == EndFileID);
167
168 bool Invalid = false;
169 const char *BufferStart = SourceMgr.getBufferData(FID: BeginFileID,
170 Invalid: &Invalid).data();
171 if (Invalid)
172 return StringRef();
173
174 return StringRef(BufferStart + BeginOffset, Length);
175}
176
177const char *RawComment::extractBriefText(const ASTContext &Context) const {
178 // Lazily initialize RawText using the accessor before using it.
179 (void)getRawText(SourceMgr: Context.getSourceManager());
180
181 // Since we will be copying the resulting text, all allocations made during
182 // parsing are garbage after resulting string is formed. Thus we can use
183 // a separate allocator for all temporary stuff.
184 llvm::BumpPtrAllocator Allocator;
185
186 comments::Lexer L(Allocator, Context.getDiagnostics(),
187 Context.getCommentCommandTraits(),
188 Range.getBegin(),
189 RawText.begin(), RawText.end());
190 comments::BriefParser P(L, Context.getCommentCommandTraits());
191
192 const std::string Result = P.Parse();
193 const unsigned BriefTextLength = Result.size();
194 char *BriefTextPtr = new (Context) char[BriefTextLength + 1];
195 memcpy(dest: BriefTextPtr, src: Result.c_str(), n: BriefTextLength + 1);
196 BriefText = BriefTextPtr;
197 BriefTextValid = true;
198
199 return BriefTextPtr;
200}
201
202comments::FullComment *RawComment::parse(const ASTContext &Context,
203 const Preprocessor *PP,
204 const Decl *D) const {
205 if (D->isInvalidDecl())
206 return nullptr;
207
208 // Lazily initialize RawText using the accessor before using it.
209 (void)getRawText(SourceMgr: Context.getSourceManager());
210
211 comments::Lexer L(Context.getAllocator(), Context.getDiagnostics(),
212 Context.getCommentCommandTraits(),
213 getSourceRange().getBegin(),
214 RawText.begin(), RawText.end());
215 comments::Sema S(Context.getAllocator(), Context.getSourceManager(),
216 Context.getDiagnostics(),
217 Context.getCommentCommandTraits(),
218 PP);
219 S.setDecl(D);
220 comments::Parser P(L, S, Context.getAllocator(), Context.getSourceManager(),
221 Context.getDiagnostics(),
222 Context.getCommentCommandTraits());
223
224 return P.parseFullComment();
225}
226
227static bool onlyWhitespaceBetween(SourceManager &SM,
228 SourceLocation Loc1, SourceLocation Loc2,
229 unsigned MaxNewlinesAllowed) {
230 FileIDAndOffset Loc1Info = SM.getDecomposedLoc(Loc: Loc1);
231 FileIDAndOffset Loc2Info = SM.getDecomposedLoc(Loc: Loc2);
232
233 // Question does not make sense if locations are in different files.
234 if (Loc1Info.first != Loc2Info.first)
235 return false;
236
237 bool Invalid = false;
238 const char *Buffer = SM.getBufferData(FID: Loc1Info.first, Invalid: &Invalid).data();
239 if (Invalid)
240 return false;
241
242 unsigned NumNewlines = 0;
243 assert(Loc1Info.second <= Loc2Info.second && "Loc1 after Loc2!");
244 // Look for non-whitespace characters and remember any newlines seen.
245 for (unsigned I = Loc1Info.second; I != Loc2Info.second; ++I) {
246 switch (Buffer[I]) {
247 default:
248 return false;
249 case ' ':
250 case '\t':
251 case '\f':
252 case '\v':
253 break;
254 case '\r':
255 case '\n':
256 ++NumNewlines;
257
258 // Check if we have found more than the maximum allowed number of
259 // newlines.
260 if (NumNewlines > MaxNewlinesAllowed)
261 return false;
262
263 // Collapse \r\n and \n\r into a single newline.
264 if (I + 1 != Loc2Info.second &&
265 (Buffer[I + 1] == '\n' || Buffer[I + 1] == '\r') &&
266 Buffer[I] != Buffer[I + 1])
267 ++I;
268 break;
269 }
270 }
271
272 return true;
273}
274
275void RawCommentList::addComment(const RawComment &RC,
276 const CommentOptions &CommentOpts,
277 llvm::BumpPtrAllocator &Allocator) {
278 if (RC.isInvalid())
279 return;
280
281 // Ordinary comments are not interesting for us.
282 if (RC.isOrdinary() && !CommentOpts.ParseAllComments)
283 return;
284
285 FileIDAndOffset Loc = SourceMgr.getDecomposedLoc(Loc: RC.getBeginLoc());
286
287 const FileID CommentFile = Loc.first;
288 const unsigned CommentOffset = Loc.second;
289
290 // If this is the first Doxygen comment, save it (because there isn't
291 // anything to merge it with).
292 auto &OC = OrderedComments[CommentFile];
293 if (OC.empty()) {
294 OC[CommentOffset] = new (Allocator) RawComment(RC);
295 return;
296 }
297
298 const RawComment &C1 = *OC.rbegin()->second;
299 const RawComment &C2 = RC;
300
301 // Merge comments only if there is only whitespace between them.
302 // Can't merge trailing and non-trailing comments unless the second is
303 // non-trailing ordinary in the same column, as in the case:
304 // int x; // documents x
305 // // more text
306 // versus:
307 // int x; // documents x
308 // int y; // documents y
309 // or:
310 // int x; // documents x
311 // // documents y
312 // int y;
313 // Merge comments if they are on same or consecutive lines.
314 if ((C1.isTrailingComment() == C2.isTrailingComment() ||
315 (C1.isTrailingComment() && !C2.isTrailingComment() &&
316 isOrdinaryKind(K: C2.getKind()) &&
317 commentsStartOnSameColumn(SM: SourceMgr, R1: C1, R2: C2))) &&
318 onlyWhitespaceBetween(SM&: SourceMgr, Loc1: C1.getEndLoc(), Loc2: C2.getBeginLoc(),
319 /*MaxNewlinesAllowed=*/1)) {
320 SourceRange MergedRange(C1.getBeginLoc(), C2.getEndLoc());
321 *OrderedComments[CommentFile].rbegin()->second =
322 RawComment(SourceMgr, MergedRange, CommentOpts, true);
323 } else {
324 OrderedComments[CommentFile][CommentOffset] =
325 new (Allocator) RawComment(RC);
326 }
327}
328
329const std::map<unsigned, RawComment *> *
330RawCommentList::getCommentsInFile(FileID File) const {
331 auto CommentsInFile = OrderedComments.find(Val: File);
332 if (CommentsInFile == OrderedComments.end())
333 return nullptr;
334
335 return &CommentsInFile->second;
336}
337
338bool RawCommentList::empty() const { return OrderedComments.empty(); }
339
340unsigned RawCommentList::getCommentBeginLine(RawComment *C, FileID File,
341 unsigned Offset) const {
342 auto Cached = CommentBeginLine.find(Val: C);
343 if (Cached != CommentBeginLine.end())
344 return Cached->second;
345 const unsigned Line = SourceMgr.getLineNumber(FID: File, FilePos: Offset);
346 CommentBeginLine[C] = Line;
347 return Line;
348}
349
350unsigned RawCommentList::getCommentEndOffset(RawComment *C) const {
351 auto Cached = CommentEndOffset.find(Val: C);
352 if (Cached != CommentEndOffset.end())
353 return Cached->second;
354 const unsigned Offset =
355 SourceMgr.getDecomposedLoc(Loc: C->getSourceRange().getEnd()).second;
356 CommentEndOffset[C] = Offset;
357 return Offset;
358}
359
360std::string RawComment::getFormattedText(const SourceManager &SourceMgr,
361 DiagnosticsEngine &Diags) const {
362 llvm::StringRef CommentText = getRawText(SourceMgr);
363 if (CommentText.empty())
364 return "";
365
366 std::string Result;
367 for (const RawComment::CommentLine &Line :
368 getFormattedLines(SourceMgr, Diags))
369 Result += Line.Text + "\n";
370
371 auto LastChar = Result.find_last_not_of(c: '\n');
372 Result.erase(pos: LastChar + 1, n: Result.size());
373
374 return Result;
375}
376
377std::vector<RawComment::CommentLine>
378RawComment::getFormattedLines(const SourceManager &SourceMgr,
379 DiagnosticsEngine &Diags) const {
380 llvm::StringRef CommentText = getRawText(SourceMgr);
381 if (CommentText.empty())
382 return {};
383
384 llvm::BumpPtrAllocator Allocator;
385 // We do not parse any commands, so CommentOptions are ignored by
386 // comments::Lexer. Therefore, we just use default-constructed options.
387 CommentOptions DefOpts;
388 comments::CommandTraits EmptyTraits(Allocator, DefOpts);
389 comments::Lexer L(Allocator, Diags, EmptyTraits, getSourceRange().getBegin(),
390 CommentText.begin(), CommentText.end(),
391 /*ParseCommands=*/false);
392
393 std::vector<RawComment::CommentLine> Result;
394 // A column number of the first non-whitespace token in the comment text.
395 // We skip whitespace up to this column, but keep the whitespace after this
396 // column. IndentColumn is calculated when lexing the first line and reused
397 // for the rest of lines.
398 unsigned IndentColumn = 0;
399
400 // Record the line number of the last processed comment line.
401 // For block-style comments, an extra newline token will be produced after
402 // the end-comment marker, e.g.:
403 // /** This is a multi-line comment block.
404 // The lexer will produce two newline tokens here > */
405 // previousLine will record the line number when we previously saw a newline
406 // token and recorded a comment line. If we see another newline token on the
407 // same line, don't record anything in between.
408 unsigned PreviousLine = 0;
409
410 // Processes one line of the comment and adds it to the result.
411 // Handles skipping the indent at the start of the line.
412 // Returns false when eof is reached and true otherwise.
413 auto LexLine = [&](bool IsFirstLine) -> bool {
414 comments::Token Tok;
415 // Lex the first token on the line. We handle it separately, because we to
416 // fix up its indentation.
417 L.lex(T&: Tok);
418 if (Tok.is(K: comments::tok::eof))
419 return false;
420 if (Tok.is(K: comments::tok::newline)) {
421 PresumedLoc Loc = SourceMgr.getPresumedLoc(Loc: Tok.getLocation());
422 if (Loc.getLine() != PreviousLine) {
423 Result.emplace_back(args: "", args&: Loc, args&: Loc);
424 PreviousLine = Loc.getLine();
425 }
426 return true;
427 }
428 SmallString<124> Line;
429 llvm::StringRef TokText = L.getSpelling(Tok, SourceMgr);
430 bool LocInvalid = false;
431 unsigned TokColumn =
432 SourceMgr.getSpellingColumnNumber(Loc: Tok.getLocation(), Invalid: &LocInvalid);
433 assert(!LocInvalid && "getFormattedText for invalid location");
434
435 // Amount of leading whitespace in TokText.
436 size_t WhitespaceLen = TokText.find_first_not_of(Chars: " \t");
437 if (WhitespaceLen == StringRef::npos)
438 WhitespaceLen = TokText.size();
439 // Remember the amount of whitespace we skipped in the first line to remove
440 // indent up to that column in the following lines.
441 if (IsFirstLine)
442 IndentColumn = TokColumn + WhitespaceLen;
443
444 // Amount of leading whitespace we actually want to skip.
445 // For the first line we skip all the whitespace.
446 // For the rest of the lines, we skip whitespace up to IndentColumn.
447 unsigned SkipLen =
448 IsFirstLine
449 ? WhitespaceLen
450 : std::min<size_t>(
451 a: WhitespaceLen,
452 b: std::max<int>(a: static_cast<int>(IndentColumn) - TokColumn, b: 0));
453 llvm::StringRef Trimmed = TokText.drop_front(N: SkipLen);
454 Line += Trimmed;
455 // Get the beginning location of the adjusted comment line.
456 PresumedLoc Begin =
457 SourceMgr.getPresumedLoc(Loc: Tok.getLocation().getLocWithOffset(Offset: SkipLen));
458
459 // Lex all tokens in the rest of the line.
460 for (L.lex(T&: Tok); Tok.isNot(K: comments::tok::eof); L.lex(T&: Tok)) {
461 if (Tok.is(K: comments::tok::newline)) {
462 // Get the ending location of the comment line.
463 PresumedLoc End = SourceMgr.getPresumedLoc(Loc: Tok.getLocation());
464 if (End.getLine() != PreviousLine) {
465 Result.emplace_back(args&: Line, args&: Begin, args&: End);
466 PreviousLine = End.getLine();
467 }
468 return true;
469 }
470 Line += L.getSpelling(Tok, SourceMgr);
471 }
472 PresumedLoc End = SourceMgr.getPresumedLoc(Loc: Tok.getLocation());
473 Result.emplace_back(args&: Line, args&: Begin, args&: End);
474 // We've reached the end of file token.
475 return false;
476 };
477
478 // Process first line separately to remember indent for the following lines.
479 if (!LexLine(/*IsFirstLine=*/true))
480 return Result;
481 // Process the rest of the lines.
482 while (LexLine(/*IsFirstLine=*/false))
483 ;
484 return Result;
485}
486