1//===--- HeaderIncludes.cpp - Insert/Delete #includes --*- C++ -*----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#include "clang/Tooling/Inclusions/HeaderIncludes.h"
10#include "clang/Basic/SourceManager.h"
11#include "clang/Lex/Lexer.h"
12#include "llvm/Support/FormatVariadic.h"
13#include "llvm/Support/Path.h"
14#include <optional>
15
16namespace clang {
17namespace tooling {
18namespace {
19
20LangOptions createLangOpts() {
21 LangOptions LangOpts;
22 LangOpts.CPlusPlus = 1;
23 LangOpts.CPlusPlus11 = 1;
24 LangOpts.CPlusPlus14 = 1;
25 LangOpts.LineComment = 1;
26 LangOpts.CXXOperatorNames = 1;
27 LangOpts.Bool = 1;
28 LangOpts.ObjC = 1;
29 LangOpts.MicrosoftExt = 1; // To get kw___try, kw___finally.
30 LangOpts.DeclSpecKeyword = 1; // To get __declspec.
31 LangOpts.WChar = 1; // To get wchar_t
32 return LangOpts;
33}
34
35// Create a new lexer on the given \p Code and calls \p Callback with the
36// created source manager and lexer. \p Callback must be a callable object that
37// could be invoked with (const SourceManager &, Lexer &). This function returns
38// whatever \p Callback returns.
39template <typename F>
40auto withLexer(StringRef FileName, StringRef Code, const IncludeStyle &Style,
41 F &&Callback)
42 -> std::invoke_result_t<F, const SourceManager &, Lexer &> {
43 SourceManagerForFile VirtualSM(FileName, Code);
44 SourceManager &SM = VirtualSM.get();
45 LangOptions LangOpts = createLangOpts();
46 Lexer Lex(SM.getMainFileID(), SM.getBufferOrFake(FID: SM.getMainFileID()), SM,
47 LangOpts);
48 return std::invoke(std::forward<F>(Callback), std::as_const(t&: SM), Lex);
49}
50
51// Returns the offset after skipping a sequence of tokens, matched by \p
52// GetOffsetAfterSequence, from the start of the code.
53// \p GetOffsetAfterSequence should be a function that matches a sequence of
54// tokens and returns an offset after the sequence.
55unsigned getOffsetAfterTokenSequence(
56 StringRef FileName, StringRef Code, const IncludeStyle &Style,
57 llvm::function_ref<unsigned(const SourceManager &, Lexer &, Token &)>
58 GetOffsetAfterSequence) {
59 return withLexer(FileName, Code, Style,
60 Callback: [&](const SourceManager &SM, Lexer &Lex) {
61 Token Tok;
62 // Get the first token.
63 Lex.LexFromRawLexer(Result&: Tok);
64 return GetOffsetAfterSequence(SM, Lex, Tok);
65 });
66}
67
68// Check if a sequence of tokens is like "#<Name> <raw_identifier>". If it is,
69// \p Tok will be the token after this directive; otherwise, it can be any token
70// after the given \p Tok (including \p Tok). If \p RawIDName is provided, the
71// (second) raw_identifier name is checked.
72bool checkAndConsumeDirectiveWithName(
73 Lexer &Lex, StringRef Name, Token &Tok,
74 std::optional<StringRef> RawIDName = std::nullopt) {
75 bool Matched = Tok.is(K: tok::hash) && !Lex.LexFromRawLexer(Result&: Tok) &&
76 Tok.is(K: tok::raw_identifier) &&
77 Tok.getRawIdentifier() == Name && !Lex.LexFromRawLexer(Result&: Tok) &&
78 Tok.is(K: tok::raw_identifier) &&
79 (!RawIDName || Tok.getRawIdentifier() == *RawIDName);
80 if (Matched)
81 Lex.LexFromRawLexer(Result&: Tok);
82 return Matched;
83}
84
85void skipComments(Lexer &Lex, Token &Tok) {
86 while (Tok.is(K: tok::comment))
87 if (Lex.LexFromRawLexer(Result&: Tok))
88 return;
89}
90
91bool checkAndConsumeModuleDecl(const SourceManager &SM, Lexer &Lex,
92 Token &Tok) {
93 bool Matched = Tok.is(K: tok::raw_identifier) &&
94 Tok.getRawIdentifier() == "module" &&
95 !Lex.LexFromRawLexer(Result&: Tok) && Tok.is(K: tok::semi) &&
96 !Lex.LexFromRawLexer(Result&: Tok);
97 return Matched;
98}
99
100// Determines the minimum offset into the file where we want to insert header
101// includes. This will be put (when available):
102// - after `#pragma once`
103// - after header guards (`#ifdef` and `#define`)
104// - after opening global module (`module;`)
105// - after any comments at the start of the file or immediately following one of
106// the above constructs
107unsigned getMinHeaderInsertionOffset(StringRef FileName, StringRef Code,
108 const IncludeStyle &Style) {
109 // \p Consume returns location after header guard or 0 if no header guard is
110 // found.
111 auto ConsumeHeaderGuardAndComment =
112 [&](std::function<unsigned(const SourceManager &SM, Lexer &Lex,
113 Token Tok)>
114 Consume) {
115 return getOffsetAfterTokenSequence(
116 FileName, Code, Style,
117 GetOffsetAfterSequence: [&Consume](const SourceManager &SM, Lexer &Lex, Token Tok) {
118 skipComments(Lex, Tok);
119 unsigned InitialOffset = SM.getFileOffset(SpellingLoc: Tok.getLocation());
120 return std::max(a: InitialOffset, b: Consume(SM, Lex, Tok));
121 });
122 };
123
124 auto ModuleDecl = ConsumeHeaderGuardAndComment(
125 [](const SourceManager &SM, Lexer &Lex, Token Tok) -> unsigned {
126 if (checkAndConsumeModuleDecl(SM, Lex, Tok)) {
127 skipComments(Lex, Tok);
128 return SM.getFileOffset(SpellingLoc: Tok.getLocation());
129 }
130 return 0;
131 });
132
133 auto HeaderAndPPOffset = std::max(
134 // #ifndef/#define
135 a: ConsumeHeaderGuardAndComment(
136 [](const SourceManager &SM, Lexer &Lex, Token Tok) -> unsigned {
137 if (checkAndConsumeDirectiveWithName(Lex, Name: "ifndef", Tok)) {
138 skipComments(Lex, Tok);
139 if (checkAndConsumeDirectiveWithName(Lex, Name: "define", Tok) &&
140 Tok.isAtStartOfLine())
141 return SM.getFileOffset(SpellingLoc: Tok.getLocation());
142 }
143 return 0;
144 }),
145 // #pragma once
146 b: ConsumeHeaderGuardAndComment(
147 [](const SourceManager &SM, Lexer &Lex, Token Tok) -> unsigned {
148 if (checkAndConsumeDirectiveWithName(Lex, Name: "pragma", Tok,
149 RawIDName: StringRef("once")))
150 return SM.getFileOffset(SpellingLoc: Tok.getLocation());
151 return 0;
152 }));
153 return std::max(a: HeaderAndPPOffset, b: ModuleDecl);
154}
155
156// Check if a sequence of tokens is like
157// "#include ("header.h" | <header.h>)".
158// If it is, \p Tok will be the token after this directive; otherwise, it can be
159// any token after the given \p Tok (including \p Tok).
160bool checkAndConsumeInclusiveDirective(Lexer &Lex, Token &Tok) {
161 auto Matched = [&]() {
162 Lex.LexFromRawLexer(Result&: Tok);
163 return true;
164 };
165 if (Tok.is(K: tok::hash) && !Lex.LexFromRawLexer(Result&: Tok) &&
166 Tok.is(K: tok::raw_identifier) && Tok.getRawIdentifier() == "include") {
167 if (Lex.LexFromRawLexer(Result&: Tok))
168 return false;
169 if (Tok.is(K: tok::string_literal))
170 return Matched();
171 if (Tok.is(K: tok::less)) {
172 while (!Lex.LexFromRawLexer(Result&: Tok) && Tok.isNot(K: tok::greater)) {
173 }
174 if (Tok.is(K: tok::greater))
175 return Matched();
176 }
177 }
178 return false;
179}
180
181// Returns the offset of the last #include directive after which a new
182// #include can be inserted. This ignores #include's after the #include block(s)
183// in the beginning of a file to avoid inserting headers into code sections
184// where new #include's should not be added by default.
185// These code sections include:
186// - raw string literals (containing #include).
187// - #if blocks.
188// - Special #include's among declarations (e.g. functions).
189//
190// If no #include after which a new #include can be inserted, this returns the
191// offset after skipping all comments from the start of the code.
192// Inserting after an #include is not allowed if it comes after code that is not
193// #include (e.g. pre-processing directive that is not #include, declarations).
194unsigned getMaxHeaderInsertionOffset(StringRef FileName, StringRef Code,
195 const IncludeStyle &Style) {
196 return getOffsetAfterTokenSequence(
197 FileName, Code, Style,
198 GetOffsetAfterSequence: [](const SourceManager &SM, Lexer &Lex, Token Tok) {
199 skipComments(Lex, Tok);
200 unsigned MaxOffset = SM.getFileOffset(SpellingLoc: Tok.getLocation());
201 while (checkAndConsumeInclusiveDirective(Lex, Tok))
202 MaxOffset = SM.getFileOffset(SpellingLoc: Tok.getLocation());
203 return MaxOffset;
204 });
205}
206
207// Check whether the first declaration in the code is a C++20 module
208// declaration, and it is not preceded by any preprocessor directives.
209bool isFirstDeclModuleDecl(StringRef FileName, StringRef Code,
210 const IncludeStyle &Style) {
211 return withLexer(
212 FileName, Code, Style, Callback: [](const SourceManager &SM, Lexer &Lex) {
213 // Let the lexer skip any comments and whitespaces for us.
214 Lex.SetKeepWhitespaceMode(false);
215 Lex.SetCommentRetentionState(false);
216
217 Token tok;
218 if (Lex.LexFromRawLexer(Result&: tok))
219 return false;
220
221 // A module declaration is made up of the following token sequence:
222 // export? module <ident> ('.' <ident>)* <partition> <attr> ;
223 //
224 // For convenience, we don't actually lex the whole declaration -- it's
225 // enough to distinguish a module declaration to just ensure an <ident>
226 // is following the "module" keyword.
227
228 // Lex the optional "export" keyword.
229 if (tok.is(K: tok::raw_identifier) && tok.getRawIdentifier() == "export") {
230 if (Lex.LexFromRawLexer(Result&: tok))
231 return false;
232 }
233
234 // Lex the "module" keyword.
235 if (!tok.is(K: tok::raw_identifier) ||
236 tok.getRawIdentifier() != "module" || Lex.LexFromRawLexer(Result&: tok))
237 return false;
238
239 // Make sure an identifier follows the "module" keyword.
240 return tok.is(K: tok::raw_identifier);
241 });
242}
243
244inline StringRef trimInclude(StringRef IncludeName) {
245 return IncludeName.trim(Chars: "\"<>");
246}
247
248const char IncludeRegexPattern[] =
249 R"(^[\t\ ]*#[\t\ ]*(import|include)[^"<]*(["<][^">]*[">]))";
250
251// The filename of Path excluding extension.
252// Used to match implementation with headers, this differs from sys::path::stem:
253// - in names with multiple dots (foo.cu.cc) it terminates at the *first*
254// - an empty stem is never returned: /foo/.bar.x => .bar
255// - we don't bother to handle . and .. specially
256StringRef matchingStem(llvm::StringRef Path) {
257 StringRef Name = llvm::sys::path::filename(path: Path);
258 return Name.substr(Start: 0, N: Name.find(C: '.', From: 1));
259}
260
261} // anonymous namespace
262
263IncludeCategoryManager::IncludeCategoryManager(const IncludeStyle &Style,
264 StringRef FileName)
265 : Style(Style), FileName(FileName) {
266 for (const auto &Category : Style.IncludeCategories) {
267 CategoryRegexs.emplace_back(Args: Category.Regex, Args: Category.RegexIsCaseSensitive
268 ? llvm::Regex::NoFlags
269 : llvm::Regex::IgnoreCase);
270 }
271 IsMainFile = FileName.ends_with(Suffix: ".c") || FileName.ends_with(Suffix: ".cc") ||
272 FileName.ends_with(Suffix: ".cpp") || FileName.ends_with(Suffix: ".c++") ||
273 FileName.ends_with(Suffix: ".cxx") || FileName.ends_with(Suffix: ".m") ||
274 FileName.ends_with(Suffix: ".mm");
275 if (!Style.IncludeIsMainSourceRegex.empty()) {
276 llvm::Regex MainFileRegex(Style.IncludeIsMainSourceRegex);
277 IsMainFile |= MainFileRegex.match(String: FileName);
278 }
279}
280
281int IncludeCategoryManager::getIncludePriority(StringRef IncludeName,
282 bool CheckMainHeader) const {
283 int Ret = INT_MAX;
284 for (unsigned i = 0, e = CategoryRegexs.size(); i != e; ++i)
285 if (CategoryRegexs[i].match(String: IncludeName)) {
286 Ret = Style.IncludeCategories[i].Priority;
287 break;
288 }
289 if (CheckMainHeader && IsMainFile && Ret > 0 && isMainHeader(IncludeName))
290 Ret = 0;
291 return Ret;
292}
293
294int IncludeCategoryManager::getSortIncludePriority(StringRef IncludeName,
295 bool CheckMainHeader) const {
296 int Ret = INT_MAX;
297 for (unsigned i = 0, e = CategoryRegexs.size(); i != e; ++i)
298 if (CategoryRegexs[i].match(String: IncludeName)) {
299 Ret = Style.IncludeCategories[i].SortPriority;
300 if (Ret == 0)
301 Ret = Style.IncludeCategories[i].Priority;
302 break;
303 }
304 if (CheckMainHeader && IsMainFile && Ret > 0 && isMainHeader(IncludeName))
305 Ret = 0;
306 return Ret;
307}
308bool IncludeCategoryManager::isMainHeader(StringRef IncludeName) const {
309 switch (Style.MainIncludeChar) {
310 case IncludeStyle::MICD_Quote:
311 if (!IncludeName.starts_with(Prefix: "\""))
312 return false;
313 break;
314 case IncludeStyle::MICD_AngleBracket:
315 if (!IncludeName.starts_with(Prefix: "<"))
316 return false;
317 break;
318 case IncludeStyle::MICD_Any:
319 break;
320 }
321
322 IncludeName =
323 IncludeName.drop_front(N: 1).drop_back(N: 1); // remove the surrounding "" or <>
324 // Not matchingStem: implementation files may have compound extensions but
325 // headers may not.
326 StringRef HeaderStem = llvm::sys::path::stem(path: IncludeName);
327 StringRef FileStem = llvm::sys::path::stem(path: FileName); // foo.cu for foo.cu.cc
328 StringRef MatchingFileStem = matchingStem(Path: FileName); // foo for foo.cu.cc
329 // main-header examples:
330 // 1) foo.h => foo.cc
331 // 2) foo.h => foo.cu.cc
332 // 3) foo.proto.h => foo.proto.cc
333 //
334 // non-main-header examples:
335 // 1) foo.h => bar.cc
336 // 2) foo.proto.h => foo.cc
337 StringRef Matching;
338 if (MatchingFileStem.starts_with_insensitive(Prefix: HeaderStem))
339 Matching = MatchingFileStem; // example 1), 2)
340 else if (FileStem.equals_insensitive(RHS: HeaderStem))
341 Matching = FileStem; // example 3)
342 if (!Matching.empty()) {
343 llvm::Regex MainIncludeRegex(HeaderStem.str() + Style.IncludeIsMainRegex,
344 llvm::Regex::IgnoreCase);
345 if (MainIncludeRegex.match(String: Matching))
346 return true;
347 }
348 return false;
349}
350
351const llvm::Regex HeaderIncludes::IncludeRegex(IncludeRegexPattern);
352
353HeaderIncludes::HeaderIncludes(StringRef FileName, StringRef Code,
354 const IncludeStyle &Style)
355 : FileName(FileName), Code(Code), FirstIncludeOffset(-1),
356 MinInsertOffset(getMinHeaderInsertionOffset(FileName, Code, Style)),
357 MaxInsertOffset(MinInsertOffset +
358 getMaxHeaderInsertionOffset(
359 FileName, Code: Code.drop_front(N: MinInsertOffset), Style)),
360 MainIncludeFound(false),
361 ShouldInsertGlobalModuleFragmentDecl(
362 isFirstDeclModuleDecl(FileName, Code, Style)),
363 Categories(Style, FileName) {
364 // Add 0 for main header and INT_MAX for headers that are not in any
365 // category.
366 Priorities = {0, INT_MAX};
367 for (const auto &Category : Style.IncludeCategories)
368 Priorities.insert(x: Category.Priority);
369 SmallVector<StringRef, 32> Lines;
370 Code.drop_front(N: MinInsertOffset).split(A&: Lines, Separator: "\n");
371
372 unsigned Offset = MinInsertOffset;
373 unsigned NextLineOffset;
374 SmallVector<StringRef, 4> Matches;
375 for (auto Line : Lines) {
376 NextLineOffset = std::min(a: Code.size(), b: Offset + Line.size() + 1);
377 if (IncludeRegex.match(String: Line, Matches: &Matches)) {
378 // If this is the last line without trailing newline, we need to make
379 // sure we don't delete across the file boundary.
380 addExistingInclude(
381 IncludeToAdd: Include(Matches[2],
382 tooling::Range(
383 Offset, std::min(a: Line.size() + 1, b: Code.size() - Offset)),
384 Matches[1] == "import" ? tooling::IncludeDirective::Import
385 : tooling::IncludeDirective::Include),
386 NextLineOffset);
387 }
388 Offset = NextLineOffset;
389 }
390
391 // Populate CategoryEndOfssets:
392 // - Ensure that CategoryEndOffset[Highest] is always populated.
393 // - If CategoryEndOffset[Priority] isn't set, use the next higher value
394 // that is set, up to CategoryEndOffset[Highest].
395 auto Highest = Priorities.begin();
396 auto [It, Inserted] = CategoryEndOffsets.try_emplace(k: *Highest);
397 if (Inserted)
398 It->second = FirstIncludeOffset >= 0 ? FirstIncludeOffset : MinInsertOffset;
399 // By this point, CategoryEndOffset[Highest] is always set appropriately:
400 // - to an appropriate location before/after existing #includes, or
401 // - to right after the header guard, or
402 // - to the beginning of the file.
403 for (auto I = ++Priorities.begin(), E = Priorities.end(); I != E; ++I)
404 if (CategoryEndOffsets.find(x: *I) == CategoryEndOffsets.end())
405 CategoryEndOffsets[*I] = CategoryEndOffsets[*std::prev(x: I)];
406}
407
408// \p Offset: the start of the line following this include directive.
409void HeaderIncludes::addExistingInclude(Include IncludeToAdd,
410 unsigned NextLineOffset) {
411 auto &Incs = ExistingIncludes[trimInclude(IncludeName: IncludeToAdd.Name)];
412 Incs.push_back(x: std::move(IncludeToAdd));
413 auto &CurInclude = Incs.back();
414 // The header name with quotes or angle brackets.
415 // Only record the offset of current #include if we can insert after it.
416 if (CurInclude.R.getOffset() <= MaxInsertOffset) {
417 int Priority = Categories.getIncludePriority(
418 IncludeName: CurInclude.Name, /*CheckMainHeader=*/!MainIncludeFound);
419 if (Priority == 0)
420 MainIncludeFound = true;
421 CategoryEndOffsets[Priority] = NextLineOffset;
422 IncludesByPriority[Priority].push_back(Elt: &CurInclude);
423 if (FirstIncludeOffset < 0)
424 FirstIncludeOffset = CurInclude.R.getOffset();
425 }
426}
427
428std::optional<tooling::Replacement>
429HeaderIncludes::insert(llvm::StringRef IncludeName, bool IsAngled,
430 IncludeDirective Directive) const {
431 assert(IncludeName == trimInclude(IncludeName));
432 // If a <header> ("header") already exists in code, "header" (<header>) with
433 // different quotation and/or directive will still be inserted.
434 // FIXME: figure out if this is the best behavior.
435 auto It = ExistingIncludes.find(Key: IncludeName);
436 if (It != ExistingIncludes.end()) {
437 for (const auto &Inc : It->second)
438 if (Inc.Directive == Directive &&
439 ((IsAngled && StringRef(Inc.Name).starts_with(Prefix: "<")) ||
440 (!IsAngled && StringRef(Inc.Name).starts_with(Prefix: "\""))))
441 return std::nullopt;
442 }
443 std::string Quoted =
444 std::string(llvm::formatv(Fmt: IsAngled ? "<{0}>" : "\"{0}\"", Vals&: IncludeName));
445 StringRef QuotedName = Quoted;
446 int Priority = Categories.getIncludePriority(
447 IncludeName: QuotedName, /*CheckMainHeader=*/!MainIncludeFound);
448 auto CatOffset = CategoryEndOffsets.find(x: Priority);
449 assert(CatOffset != CategoryEndOffsets.end());
450 unsigned InsertOffset = CatOffset->second; // Fall back offset
451 auto Iter = IncludesByPriority.find(x: Priority);
452 if (Iter != IncludesByPriority.end()) {
453 for (const auto *Inc : Iter->second) {
454 if (QuotedName < Inc->Name) {
455 InsertOffset = Inc->R.getOffset();
456 break;
457 }
458 }
459 }
460 assert(InsertOffset <= Code.size());
461 llvm::StringRef DirectiveSpelling =
462 Directive == IncludeDirective::Include ? "include" : "import";
463 std::string NewInclude =
464 llvm::formatv(Fmt: "#{0} {1}\n", Vals&: DirectiveSpelling, Vals&: QuotedName);
465 // When inserting headers at end of the code, also append '\n' to the code
466 // if it does not end with '\n'.
467 // FIXME: when inserting multiple #includes at the end of code, only one
468 // newline should be added.
469 if (InsertOffset == Code.size() && (!Code.empty() && Code.back() != '\n'))
470 NewInclude = "\n" + NewInclude;
471 if (ShouldInsertGlobalModuleFragmentDecl)
472 NewInclude = "module;\n" + NewInclude;
473 return tooling::Replacement(FileName, InsertOffset, 0, NewInclude);
474}
475
476tooling::Replacements HeaderIncludes::remove(llvm::StringRef IncludeName,
477 bool IsAngled) const {
478 assert(IncludeName == trimInclude(IncludeName));
479 tooling::Replacements Result;
480 auto Iter = ExistingIncludes.find(Key: IncludeName);
481 if (Iter == ExistingIncludes.end())
482 return Result;
483 for (const auto &Inc : Iter->second) {
484 if ((IsAngled && StringRef(Inc.Name).starts_with(Prefix: "\"")) ||
485 (!IsAngled && StringRef(Inc.Name).starts_with(Prefix: "<")))
486 continue;
487 llvm::Error Err = Result.add(R: tooling::Replacement(
488 FileName, Inc.R.getOffset(), Inc.R.getLength(), ""));
489 if (Err) {
490 auto ErrMsg = "Unexpected conflicts in #include deletions: " +
491 llvm::toString(E: std::move(Err));
492 llvm_unreachable(ErrMsg.c_str());
493 }
494 }
495 return Result;
496}
497
498} // namespace tooling
499} // namespace clang
500