1 | //===--- SourceCode.cpp - Source code manipulation routines -----*- C++ -*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This file provides functions that simplify extraction of source code. |
10 | // |
11 | //===----------------------------------------------------------------------===// |
12 | #include "clang/Tooling/Transformer/SourceCode.h" |
13 | #include "clang/AST/ASTContext.h" |
14 | #include "clang/AST/Attr.h" |
15 | #include "clang/AST/Comment.h" |
16 | #include "clang/AST/Decl.h" |
17 | #include "clang/AST/DeclCXX.h" |
18 | #include "clang/AST/DeclTemplate.h" |
19 | #include "clang/AST/Expr.h" |
20 | #include "clang/Basic/SourceManager.h" |
21 | #include "clang/Lex/Lexer.h" |
22 | #include "llvm/Support/Errc.h" |
23 | #include "llvm/Support/Error.h" |
24 | #include <set> |
25 | |
26 | using namespace clang; |
27 | |
28 | using llvm::errc; |
29 | using llvm::StringError; |
30 | |
31 | StringRef clang::tooling::getText(CharSourceRange Range, |
32 | const ASTContext &Context) { |
33 | return Lexer::getSourceText(Range, SM: Context.getSourceManager(), |
34 | LangOpts: Context.getLangOpts()); |
35 | } |
36 | |
37 | CharSourceRange clang::tooling::maybeExtendRange(CharSourceRange Range, |
38 | tok::TokenKind Next, |
39 | ASTContext &Context) { |
40 | CharSourceRange R = Lexer::getAsCharRange(Range, SM: Context.getSourceManager(), |
41 | LangOpts: Context.getLangOpts()); |
42 | if (R.isInvalid()) |
43 | return Range; |
44 | Token Tok; |
45 | bool Err = |
46 | Lexer::getRawToken(Loc: R.getEnd(), Result&: Tok, SM: Context.getSourceManager(), |
47 | LangOpts: Context.getLangOpts(), /*IgnoreWhiteSpace=*/true); |
48 | if (Err || !Tok.is(K: Next)) |
49 | return Range; |
50 | return CharSourceRange::getTokenRange(B: Range.getBegin(), E: Tok.getLocation()); |
51 | } |
52 | |
53 | llvm::Error clang::tooling::validateRange(const CharSourceRange &Range, |
54 | const SourceManager &SM, |
55 | bool ) { |
56 | if (Range.isInvalid()) |
57 | return llvm::make_error<StringError>(Args: errc::invalid_argument, |
58 | Args: "Invalid range" ); |
59 | |
60 | if (Range.getBegin().isMacroID() || Range.getEnd().isMacroID()) |
61 | return llvm::make_error<StringError>( |
62 | Args: errc::invalid_argument, Args: "Range starts or ends in a macro expansion" ); |
63 | |
64 | if (!AllowSystemHeaders) { |
65 | if (SM.isInSystemHeader(Loc: Range.getBegin()) || |
66 | SM.isInSystemHeader(Loc: Range.getEnd())) |
67 | return llvm::make_error<StringError>(Args: errc::invalid_argument, |
68 | Args: "Range is in system header" ); |
69 | } |
70 | |
71 | std::pair<FileID, unsigned> BeginInfo = SM.getDecomposedLoc(Loc: Range.getBegin()); |
72 | std::pair<FileID, unsigned> EndInfo = SM.getDecomposedLoc(Loc: Range.getEnd()); |
73 | if (BeginInfo.first != EndInfo.first) |
74 | return llvm::make_error<StringError>( |
75 | Args: errc::invalid_argument, Args: "Range begins and ends in different files" ); |
76 | |
77 | if (BeginInfo.second > EndInfo.second) |
78 | return llvm::make_error<StringError>(Args: errc::invalid_argument, |
79 | Args: "Range's begin is past its end" ); |
80 | |
81 | return llvm::Error::success(); |
82 | } |
83 | |
84 | llvm::Error clang::tooling::validateEditRange(const CharSourceRange &Range, |
85 | const SourceManager &SM) { |
86 | return validateRange(Range, SM, /*AllowSystemHeaders=*/false); |
87 | } |
88 | |
89 | static bool spelledInMacroDefinition(SourceLocation Loc, |
90 | const SourceManager &SM) { |
91 | while (Loc.isMacroID()) { |
92 | const auto &Expansion = SM.getSLocEntry(FID: SM.getFileID(SpellingLoc: Loc)).getExpansion(); |
93 | if (Expansion.isMacroArgExpansion()) { |
94 | // Check the spelling location of the macro arg, in case the arg itself is |
95 | // in a macro expansion. |
96 | Loc = Expansion.getSpellingLoc(); |
97 | } else { |
98 | return true; |
99 | } |
100 | } |
101 | return false; |
102 | } |
103 | |
104 | // Returns the expansion char-range of `Loc` if `Loc` is a split token. For |
105 | // example, `>>` in nested templates needs the first `>` to be split, otherwise |
106 | // the `SourceLocation` of the token would lex as `>>` instead of `>`. |
107 | static std::optional<CharSourceRange> |
108 | getExpansionForSplitToken(SourceLocation Loc, const SourceManager &SM, |
109 | const LangOptions &LangOpts) { |
110 | if (Loc.isMacroID()) { |
111 | bool Invalid = false; |
112 | auto &SLoc = SM.getSLocEntry(FID: SM.getFileID(SpellingLoc: Loc), Invalid: &Invalid); |
113 | if (Invalid) |
114 | return std::nullopt; |
115 | if (auto &Expansion = SLoc.getExpansion(); |
116 | !Expansion.isExpansionTokenRange()) { |
117 | // A char-range expansion is only used where a token-range would be |
118 | // incorrect, and so identifies this as a split token (and importantly, |
119 | // not as a macro). |
120 | return Expansion.getExpansionLocRange(); |
121 | } |
122 | } |
123 | return std::nullopt; |
124 | } |
125 | |
126 | // If `Range` covers a split token, returns the expansion range, otherwise |
127 | // returns `Range`. |
128 | static CharSourceRange getRangeForSplitTokens(CharSourceRange Range, |
129 | const SourceManager &SM, |
130 | const LangOptions &LangOpts) { |
131 | if (Range.isTokenRange()) { |
132 | auto BeginToken = getExpansionForSplitToken(Loc: Range.getBegin(), SM, LangOpts); |
133 | auto EndToken = getExpansionForSplitToken(Loc: Range.getEnd(), SM, LangOpts); |
134 | if (EndToken) { |
135 | SourceLocation BeginLoc = |
136 | BeginToken ? BeginToken->getBegin() : Range.getBegin(); |
137 | // We can't use the expansion location with a token-range, because that |
138 | // will incorrectly lex the end token, so use a char-range that ends at |
139 | // the split. |
140 | return CharSourceRange::getCharRange(B: BeginLoc, E: EndToken->getEnd()); |
141 | } else if (BeginToken) { |
142 | // Since the end token is not split, the whole range covers the split, so |
143 | // the only adjustment we make is to use the expansion location of the |
144 | // begin token. |
145 | return CharSourceRange::getTokenRange(B: BeginToken->getBegin(), |
146 | E: Range.getEnd()); |
147 | } |
148 | } |
149 | return Range; |
150 | } |
151 | |
152 | static CharSourceRange getRange(const CharSourceRange &EditRange, |
153 | const SourceManager &SM, |
154 | const LangOptions &LangOpts, |
155 | bool IncludeMacroExpansion) { |
156 | CharSourceRange Range; |
157 | if (IncludeMacroExpansion) { |
158 | Range = Lexer::makeFileCharRange(Range: EditRange, SM, LangOpts); |
159 | } else { |
160 | auto AdjustedRange = getRangeForSplitTokens(Range: EditRange, SM, LangOpts); |
161 | if (spelledInMacroDefinition(Loc: AdjustedRange.getBegin(), SM) || |
162 | spelledInMacroDefinition(Loc: AdjustedRange.getEnd(), SM)) |
163 | return {}; |
164 | |
165 | auto B = SM.getSpellingLoc(Loc: AdjustedRange.getBegin()); |
166 | auto E = SM.getSpellingLoc(Loc: AdjustedRange.getEnd()); |
167 | if (AdjustedRange.isTokenRange()) |
168 | E = Lexer::getLocForEndOfToken(Loc: E, Offset: 0, SM, LangOpts); |
169 | Range = CharSourceRange::getCharRange(B, E); |
170 | } |
171 | return Range; |
172 | } |
173 | |
174 | std::optional<CharSourceRange> clang::tooling::getFileRangeForEdit( |
175 | const CharSourceRange &EditRange, const SourceManager &SM, |
176 | const LangOptions &LangOpts, bool IncludeMacroExpansion) { |
177 | CharSourceRange Range = |
178 | getRange(EditRange, SM, LangOpts, IncludeMacroExpansion); |
179 | bool IsInvalid = llvm::errorToBool(Err: validateEditRange(Range, SM)); |
180 | if (IsInvalid) |
181 | return std::nullopt; |
182 | return Range; |
183 | } |
184 | |
185 | std::optional<CharSourceRange> clang::tooling::getFileRange( |
186 | const CharSourceRange &EditRange, const SourceManager &SM, |
187 | const LangOptions &LangOpts, bool IncludeMacroExpansion) { |
188 | CharSourceRange Range = |
189 | getRange(EditRange, SM, LangOpts, IncludeMacroExpansion); |
190 | bool IsInvalid = |
191 | llvm::errorToBool(Err: validateRange(Range, SM, /*AllowSystemHeaders=*/true)); |
192 | if (IsInvalid) |
193 | return std::nullopt; |
194 | return Range; |
195 | } |
196 | |
197 | static bool startsWithNewline(const SourceManager &SM, const Token &Tok) { |
198 | return isVerticalWhitespace(c: SM.getCharacterData(SL: Tok.getLocation())[0]); |
199 | } |
200 | |
201 | static bool contains(const std::set<tok::TokenKind> &Terminators, |
202 | const Token &Tok) { |
203 | return Terminators.count(x: Tok.getKind()) > 0; |
204 | } |
205 | |
206 | // Returns the exclusive, *file* end location of the entity whose last token is |
207 | // at location 'EntityLast'. That is, it returns the location one past the last |
208 | // relevant character. |
209 | // |
210 | // Associated tokens include comments, horizontal whitespace and 'Terminators' |
211 | // -- optional tokens, which, if any are found, will be included; if |
212 | // 'Terminators' is empty, we will not include any extra tokens beyond comments |
213 | // and horizontal whitespace. |
214 | static SourceLocation |
215 | getEntityEndLoc(const SourceManager &SM, SourceLocation EntityLast, |
216 | const std::set<tok::TokenKind> &Terminators, |
217 | const LangOptions &LangOpts) { |
218 | assert(EntityLast.isValid() && "Invalid end location found." ); |
219 | |
220 | // We remember the last location of a non-horizontal-whitespace token we have |
221 | // lexed; this is the location up to which we will want to delete. |
222 | // FIXME: Support using the spelling loc here for cases where we want to |
223 | // analyze the macro text. |
224 | |
225 | CharSourceRange ExpansionRange = SM.getExpansionRange(Loc: EntityLast); |
226 | // FIXME: Should check isTokenRange(), for the (rare) case that |
227 | // `ExpansionRange` is a character range. |
228 | std::unique_ptr<Lexer> Lexer = [&]() { |
229 | bool Invalid = false; |
230 | auto FileOffset = SM.getDecomposedLoc(Loc: ExpansionRange.getEnd()); |
231 | llvm::StringRef File = SM.getBufferData(FID: FileOffset.first, Invalid: &Invalid); |
232 | assert(!Invalid && "Cannot get file/offset" ); |
233 | return std::make_unique<clang::Lexer>( |
234 | args: SM.getLocForStartOfFile(FID: FileOffset.first), args: LangOpts, args: File.begin(), |
235 | args: File.data() + FileOffset.second, args: File.end()); |
236 | }(); |
237 | |
238 | // Tell Lexer to return whitespace as pseudo-tokens (kind is tok::unknown). |
239 | Lexer->SetKeepWhitespaceMode(true); |
240 | |
241 | // Generally, the code we want to include looks like this ([] are optional), |
242 | // If Terminators is empty: |
243 | // [ <comment> ] [ <newline> ] |
244 | // Otherwise: |
245 | // ... <terminator> [ <comment> ] [ <newline> ] |
246 | |
247 | Token Tok; |
248 | bool Terminated = false; |
249 | |
250 | // First, lex to the current token (which is the last token of the range that |
251 | // is definitely associated with the decl). Then, we process the first token |
252 | // separately from the rest based on conditions that hold specifically for |
253 | // that first token. |
254 | // |
255 | // We do not search for a terminator if none is required or we've already |
256 | // encountered it. Otherwise, if the original `EntityLast` location was in a |
257 | // macro expansion, we don't have visibility into the text, so we assume we've |
258 | // already terminated. However, we note this assumption with |
259 | // `TerminatedByMacro`, because we'll want to handle it somewhat differently |
260 | // for the terminators semicolon and comma. These terminators can be safely |
261 | // associated with the entity when they appear after the macro -- extra |
262 | // semicolons have no effect on the program and a well-formed program won't |
263 | // have multiple commas in a row, so we're guaranteed that there is only one. |
264 | // |
265 | // FIXME: This handling of macros is more conservative than necessary. When |
266 | // the end of the expansion coincides with the end of the node, we can still |
267 | // safely analyze the code. But, it is more complicated, because we need to |
268 | // start by lexing the spelling loc for the first token and then switch to the |
269 | // expansion loc. |
270 | bool TerminatedByMacro = false; |
271 | Lexer->LexFromRawLexer(Result&: Tok); |
272 | if (Terminators.empty() || contains(Terminators, Tok)) |
273 | Terminated = true; |
274 | else if (EntityLast.isMacroID()) { |
275 | Terminated = true; |
276 | TerminatedByMacro = true; |
277 | } |
278 | |
279 | // We save the most recent candidate for the exclusive end location. |
280 | SourceLocation End = Tok.getEndLoc(); |
281 | |
282 | while (!Terminated) { |
283 | // Lex the next token we want to possibly expand the range with. |
284 | Lexer->LexFromRawLexer(Result&: Tok); |
285 | |
286 | switch (Tok.getKind()) { |
287 | case tok::eof: |
288 | // Unexpected separators. |
289 | case tok::l_brace: |
290 | case tok::r_brace: |
291 | case tok::comma: |
292 | return End; |
293 | // Whitespace pseudo-tokens. |
294 | case tok::unknown: |
295 | if (startsWithNewline(SM, Tok)) |
296 | // Include at least until the end of the line. |
297 | End = Tok.getEndLoc(); |
298 | break; |
299 | default: |
300 | if (contains(Terminators, Tok)) |
301 | Terminated = true; |
302 | End = Tok.getEndLoc(); |
303 | break; |
304 | } |
305 | } |
306 | |
307 | do { |
308 | // Lex the next token we want to possibly expand the range with. |
309 | Lexer->LexFromRawLexer(Result&: Tok); |
310 | |
311 | switch (Tok.getKind()) { |
312 | case tok::unknown: |
313 | if (startsWithNewline(SM, Tok)) |
314 | // We're done, but include this newline. |
315 | return Tok.getEndLoc(); |
316 | break; |
317 | case tok::comment: |
318 | // Include any comments we find on the way. |
319 | End = Tok.getEndLoc(); |
320 | break; |
321 | case tok::semi: |
322 | case tok::comma: |
323 | if (TerminatedByMacro && contains(Terminators, Tok)) { |
324 | End = Tok.getEndLoc(); |
325 | // We've found a real terminator. |
326 | TerminatedByMacro = false; |
327 | break; |
328 | } |
329 | // Found an unrelated token; stop and don't include it. |
330 | return End; |
331 | default: |
332 | // Found an unrelated token; stop and don't include it. |
333 | return End; |
334 | } |
335 | } while (true); |
336 | } |
337 | |
338 | // Returns the expected terminator tokens for the given declaration. |
339 | // |
340 | // If we do not know the correct terminator token, returns an empty set. |
341 | // |
342 | // There are cases where we have more than one possible terminator (for example, |
343 | // we find either a comma or a semicolon after a VarDecl). |
344 | static std::set<tok::TokenKind> getTerminators(const Decl &D) { |
345 | if (llvm::isa<RecordDecl>(Val: D) || llvm::isa<UsingDecl>(Val: D)) |
346 | return {tok::semi}; |
347 | |
348 | if (llvm::isa<FunctionDecl>(Val: D) || llvm::isa<LinkageSpecDecl>(Val: D)) |
349 | return {tok::r_brace, tok::semi}; |
350 | |
351 | if (llvm::isa<VarDecl>(Val: D) || llvm::isa<FieldDecl>(Val: D)) |
352 | return {tok::comma, tok::semi}; |
353 | |
354 | return {}; |
355 | } |
356 | |
357 | // Starting from `Loc`, skips whitespace up to, and including, a single |
358 | // newline. Returns the (exclusive) end of any skipped whitespace (that is, the |
359 | // location immediately after the whitespace). |
360 | static SourceLocation skipWhitespaceAndNewline(const SourceManager &SM, |
361 | SourceLocation Loc, |
362 | const LangOptions &LangOpts) { |
363 | const char *LocChars = SM.getCharacterData(SL: Loc); |
364 | int i = 0; |
365 | while (isHorizontalWhitespace(c: LocChars[i])) |
366 | ++i; |
367 | if (isVerticalWhitespace(c: LocChars[i])) |
368 | ++i; |
369 | return Loc.getLocWithOffset(Offset: i); |
370 | } |
371 | |
372 | // Is `Loc` separated from any following decl by something meaningful (e.g. an |
373 | // empty line, a comment), ignoring horizontal whitespace? Since this is a |
374 | // heuristic, we return false when in doubt. `Loc` cannot be the first location |
375 | // in the file. |
376 | static bool atOrBeforeSeparation(const SourceManager &SM, SourceLocation Loc, |
377 | const LangOptions &LangOpts) { |
378 | // If the preceding character is a newline, we'll check for an empty line as a |
379 | // separator. However, we can't identify an empty line using tokens, so we |
380 | // analyse the characters. If we try to use tokens, we'll just end up with a |
381 | // whitespace token, whose characters we'd have to analyse anyhow. |
382 | bool Invalid = false; |
383 | const char *LocChars = |
384 | SM.getCharacterData(SL: Loc.getLocWithOffset(Offset: -1), Invalid: &Invalid); |
385 | assert(!Invalid && |
386 | "Loc must be a valid character and not the first of the source file." ); |
387 | if (isVerticalWhitespace(c: LocChars[0])) { |
388 | for (int i = 1; isWhitespace(c: LocChars[i]); ++i) |
389 | if (isVerticalWhitespace(c: LocChars[i])) |
390 | return true; |
391 | } |
392 | // We didn't find an empty line, so lex the next token, skipping past any |
393 | // whitespace we just scanned. |
394 | Token Tok; |
395 | bool Failed = Lexer::getRawToken(Loc, Result&: Tok, SM, LangOpts, |
396 | /*IgnoreWhiteSpace=*/true); |
397 | if (Failed) |
398 | // Any text that confuses the lexer seems fair to consider a separation. |
399 | return true; |
400 | |
401 | switch (Tok.getKind()) { |
402 | case tok::comment: |
403 | case tok::l_brace: |
404 | case tok::r_brace: |
405 | case tok::eof: |
406 | return true; |
407 | default: |
408 | return false; |
409 | } |
410 | } |
411 | |
412 | CharSourceRange tooling::getAssociatedRange(const Decl &Decl, |
413 | ASTContext &Context) { |
414 | const SourceManager &SM = Context.getSourceManager(); |
415 | const LangOptions &LangOpts = Context.getLangOpts(); |
416 | CharSourceRange Range = CharSourceRange::getTokenRange(R: Decl.getSourceRange()); |
417 | |
418 | // First, expand to the start of the template<> declaration if necessary. |
419 | if (const auto *Record = llvm::dyn_cast<CXXRecordDecl>(Val: &Decl)) { |
420 | if (const auto *T = Record->getDescribedClassTemplate()) |
421 | if (SM.isBeforeInTranslationUnit(LHS: T->getBeginLoc(), RHS: Range.getBegin())) |
422 | Range.setBegin(T->getBeginLoc()); |
423 | } else if (const auto *F = llvm::dyn_cast<FunctionDecl>(Val: &Decl)) { |
424 | if (const auto *T = F->getDescribedFunctionTemplate()) |
425 | if (SM.isBeforeInTranslationUnit(LHS: T->getBeginLoc(), RHS: Range.getBegin())) |
426 | Range.setBegin(T->getBeginLoc()); |
427 | } |
428 | |
429 | // Next, expand the end location past trailing comments to include a potential |
430 | // newline at the end of the decl's line. |
431 | Range.setEnd( |
432 | getEntityEndLoc(SM, EntityLast: Decl.getEndLoc(), Terminators: getTerminators(D: Decl), LangOpts)); |
433 | Range.setTokenRange(false); |
434 | |
435 | // Expand to include preceeding associated comments. We ignore any comments |
436 | // that are not preceeding the decl, since we've already skipped trailing |
437 | // comments with getEntityEndLoc. |
438 | if (const RawComment * = |
439 | Decl.getASTContext().getRawCommentForDeclNoCache(D: &Decl)) |
440 | // Only include a preceding comment if: |
441 | // * it is *not* separate from the declaration (not including any newline |
442 | // that immediately follows the comment), |
443 | // * the decl *is* separate from any following entity (so, there are no |
444 | // other entities the comment could refer to), and |
445 | // * it is not a IfThisThenThat lint check. |
446 | if (SM.isBeforeInTranslationUnit(LHS: Comment->getBeginLoc(), |
447 | RHS: Range.getBegin()) && |
448 | !atOrBeforeSeparation( |
449 | SM, Loc: skipWhitespaceAndNewline(SM, Loc: Comment->getEndLoc(), LangOpts), |
450 | LangOpts) && |
451 | atOrBeforeSeparation(SM, Loc: Range.getEnd(), LangOpts)) { |
452 | const StringRef = Comment->getRawText(SourceMgr: SM); |
453 | if (!CommentText.contains(Other: "LINT.IfChange" ) && |
454 | !CommentText.contains(Other: "LINT.ThenChange" )) |
455 | Range.setBegin(Comment->getBeginLoc()); |
456 | } |
457 | // Add leading attributes. |
458 | for (auto *Attr : Decl.attrs()) { |
459 | if (Attr->getLocation().isInvalid() || |
460 | !SM.isBeforeInTranslationUnit(LHS: Attr->getLocation(), RHS: Range.getBegin())) |
461 | continue; |
462 | Range.setBegin(Attr->getLocation()); |
463 | |
464 | // Extend to the left '[[' or '__attribute((' if we saw the attribute, |
465 | // unless it is not a valid location. |
466 | bool Invalid; |
467 | StringRef Source = |
468 | SM.getBufferData(FID: SM.getFileID(SpellingLoc: Range.getBegin()), Invalid: &Invalid); |
469 | if (Invalid) |
470 | continue; |
471 | llvm::StringRef BeforeAttr = |
472 | Source.substr(Start: 0, N: SM.getFileOffset(SpellingLoc: Range.getBegin())); |
473 | llvm::StringRef BeforeAttrStripped = BeforeAttr.rtrim(); |
474 | |
475 | for (llvm::StringRef Prefix : {"[[" , "__attribute__((" }) { |
476 | // Handle whitespace between attribute prefix and attribute value. |
477 | if (BeforeAttrStripped.ends_with(Suffix: Prefix)) { |
478 | // Move start to start position of prefix, which is |
479 | // length(BeforeAttr) - length(BeforeAttrStripped) + length(Prefix) |
480 | // positions to the left. |
481 | Range.setBegin(Range.getBegin().getLocWithOffset(Offset: static_cast<int>( |
482 | -BeforeAttr.size() + BeforeAttrStripped.size() - Prefix.size()))); |
483 | break; |
484 | // If we didn't see '[[' or '__attribute' it's probably coming from a |
485 | // macro expansion which is already handled by makeFileCharRange(), |
486 | // below. |
487 | } |
488 | } |
489 | } |
490 | |
491 | // Range.getEnd() is already fully un-expanded by getEntityEndLoc. But, |
492 | // Range.getBegin() may be inside an expansion. |
493 | return Lexer::makeFileCharRange(Range, SM, LangOpts); |
494 | } |
495 | |