| 1 | //===--- HeaderIncludes.cpp - Insert/Delete #includes --*- C++ -*----------===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | |
| 9 | #include "clang/Tooling/Inclusions/HeaderIncludes.h" |
| 10 | #include "clang/Basic/SourceManager.h" |
| 11 | #include "clang/Lex/Lexer.h" |
| 12 | #include "llvm/Support/FormatVariadic.h" |
| 13 | #include "llvm/Support/Path.h" |
| 14 | #include <optional> |
| 15 | |
| 16 | namespace clang { |
| 17 | namespace tooling { |
| 18 | namespace { |
| 19 | |
| 20 | LangOptions createLangOpts() { |
| 21 | LangOptions LangOpts; |
| 22 | LangOpts.CPlusPlus = 1; |
| 23 | LangOpts.CPlusPlus11 = 1; |
| 24 | LangOpts.CPlusPlus14 = 1; |
| 25 | LangOpts.LineComment = 1; |
| 26 | LangOpts.CXXOperatorNames = 1; |
| 27 | LangOpts.Bool = 1; |
| 28 | LangOpts.ObjC = 1; |
| 29 | LangOpts.MicrosoftExt = 1; // To get kw___try, kw___finally. |
| 30 | LangOpts.DeclSpecKeyword = 1; // To get __declspec. |
| 31 | LangOpts.WChar = 1; // To get wchar_t |
| 32 | return LangOpts; |
| 33 | } |
| 34 | |
| 35 | // Returns the offset after skipping a sequence of tokens, matched by \p |
| 36 | // GetOffsetAfterSequence, from the start of the code. |
| 37 | // \p GetOffsetAfterSequence should be a function that matches a sequence of |
| 38 | // tokens and returns an offset after the sequence. |
| 39 | unsigned getOffsetAfterTokenSequence( |
| 40 | StringRef FileName, StringRef Code, const IncludeStyle &Style, |
| 41 | llvm::function_ref<unsigned(const SourceManager &, Lexer &, Token &)> |
| 42 | GetOffsetAfterSequence) { |
| 43 | SourceManagerForFile VirtualSM(FileName, Code); |
| 44 | SourceManager &SM = VirtualSM.get(); |
| 45 | LangOptions LangOpts = createLangOpts(); |
| 46 | Lexer Lex(SM.getMainFileID(), SM.getBufferOrFake(FID: SM.getMainFileID()), SM, |
| 47 | LangOpts); |
| 48 | Token Tok; |
| 49 | // Get the first token. |
| 50 | Lex.LexFromRawLexer(Result&: Tok); |
| 51 | return GetOffsetAfterSequence(SM, Lex, Tok); |
| 52 | } |
| 53 | |
| 54 | // Check if a sequence of tokens is like "#<Name> <raw_identifier>". If it is, |
| 55 | // \p Tok will be the token after this directive; otherwise, it can be any token |
| 56 | // after the given \p Tok (including \p Tok). If \p RawIDName is provided, the |
| 57 | // (second) raw_identifier name is checked. |
| 58 | bool checkAndConsumeDirectiveWithName( |
| 59 | Lexer &Lex, StringRef Name, Token &Tok, |
| 60 | std::optional<StringRef> RawIDName = std::nullopt) { |
| 61 | bool Matched = Tok.is(K: tok::hash) && !Lex.LexFromRawLexer(Result&: Tok) && |
| 62 | Tok.is(K: tok::raw_identifier) && |
| 63 | Tok.getRawIdentifier() == Name && !Lex.LexFromRawLexer(Result&: Tok) && |
| 64 | Tok.is(K: tok::raw_identifier) && |
| 65 | (!RawIDName || Tok.getRawIdentifier() == *RawIDName); |
| 66 | if (Matched) |
| 67 | Lex.LexFromRawLexer(Result&: Tok); |
| 68 | return Matched; |
| 69 | } |
| 70 | |
| 71 | void (Lexer &Lex, Token &Tok) { |
| 72 | while (Tok.is(K: tok::comment)) |
| 73 | if (Lex.LexFromRawLexer(Result&: Tok)) |
| 74 | return; |
| 75 | } |
| 76 | |
| 77 | // Returns the offset after header guard directives and any comments |
| 78 | // before/after header guards (e.g. #ifndef/#define pair, #pragma once). If no |
| 79 | // header guard is present in the code, this will return the offset after |
| 80 | // skipping all comments from the start of the code. |
| 81 | unsigned getOffsetAfterHeaderGuardsAndComments(StringRef FileName, |
| 82 | StringRef Code, |
| 83 | const IncludeStyle &Style) { |
| 84 | // \p Consume returns location after header guard or 0 if no header guard is |
| 85 | // found. |
| 86 | auto ConsumeHeaderGuardAndComment = |
| 87 | [&](std::function<unsigned(const SourceManager &SM, Lexer &Lex, |
| 88 | Token Tok)> |
| 89 | Consume) { |
| 90 | return getOffsetAfterTokenSequence( |
| 91 | FileName, Code, Style, |
| 92 | GetOffsetAfterSequence: [&Consume](const SourceManager &SM, Lexer &Lex, Token Tok) { |
| 93 | skipComments(Lex, Tok); |
| 94 | unsigned InitialOffset = SM.getFileOffset(SpellingLoc: Tok.getLocation()); |
| 95 | return std::max(a: InitialOffset, b: Consume(SM, Lex, Tok)); |
| 96 | }); |
| 97 | }; |
| 98 | return std::max( |
| 99 | // #ifndef/#define |
| 100 | a: ConsumeHeaderGuardAndComment( |
| 101 | [](const SourceManager &SM, Lexer &Lex, Token Tok) -> unsigned { |
| 102 | if (checkAndConsumeDirectiveWithName(Lex, Name: "ifndef" , Tok)) { |
| 103 | skipComments(Lex, Tok); |
| 104 | if (checkAndConsumeDirectiveWithName(Lex, Name: "define" , Tok) && |
| 105 | Tok.isAtStartOfLine()) |
| 106 | return SM.getFileOffset(SpellingLoc: Tok.getLocation()); |
| 107 | } |
| 108 | return 0; |
| 109 | }), |
| 110 | // #pragma once |
| 111 | b: ConsumeHeaderGuardAndComment( |
| 112 | [](const SourceManager &SM, Lexer &Lex, Token Tok) -> unsigned { |
| 113 | if (checkAndConsumeDirectiveWithName(Lex, Name: "pragma" , Tok, |
| 114 | RawIDName: StringRef("once" ))) |
| 115 | return SM.getFileOffset(SpellingLoc: Tok.getLocation()); |
| 116 | return 0; |
| 117 | })); |
| 118 | } |
| 119 | |
| 120 | // Check if a sequence of tokens is like |
| 121 | // "#include ("header.h" | <header.h>)". |
| 122 | // If it is, \p Tok will be the token after this directive; otherwise, it can be |
| 123 | // any token after the given \p Tok (including \p Tok). |
| 124 | bool checkAndConsumeInclusiveDirective(Lexer &Lex, Token &Tok) { |
| 125 | auto Matched = [&]() { |
| 126 | Lex.LexFromRawLexer(Result&: Tok); |
| 127 | return true; |
| 128 | }; |
| 129 | if (Tok.is(K: tok::hash) && !Lex.LexFromRawLexer(Result&: Tok) && |
| 130 | Tok.is(K: tok::raw_identifier) && Tok.getRawIdentifier() == "include" ) { |
| 131 | if (Lex.LexFromRawLexer(Result&: Tok)) |
| 132 | return false; |
| 133 | if (Tok.is(K: tok::string_literal)) |
| 134 | return Matched(); |
| 135 | if (Tok.is(K: tok::less)) { |
| 136 | while (!Lex.LexFromRawLexer(Result&: Tok) && Tok.isNot(K: tok::greater)) { |
| 137 | } |
| 138 | if (Tok.is(K: tok::greater)) |
| 139 | return Matched(); |
| 140 | } |
| 141 | } |
| 142 | return false; |
| 143 | } |
| 144 | |
| 145 | // Returns the offset of the last #include directive after which a new |
| 146 | // #include can be inserted. This ignores #include's after the #include block(s) |
| 147 | // in the beginning of a file to avoid inserting headers into code sections |
| 148 | // where new #include's should not be added by default. |
| 149 | // These code sections include: |
| 150 | // - raw string literals (containing #include). |
| 151 | // - #if blocks. |
| 152 | // - Special #include's among declarations (e.g. functions). |
| 153 | // |
| 154 | // If no #include after which a new #include can be inserted, this returns the |
| 155 | // offset after skipping all comments from the start of the code. |
| 156 | // Inserting after an #include is not allowed if it comes after code that is not |
| 157 | // #include (e.g. pre-processing directive that is not #include, declarations). |
| 158 | unsigned (StringRef FileName, StringRef Code, |
| 159 | const IncludeStyle &Style) { |
| 160 | return getOffsetAfterTokenSequence( |
| 161 | FileName, Code, Style, |
| 162 | GetOffsetAfterSequence: [](const SourceManager &SM, Lexer &Lex, Token Tok) { |
| 163 | skipComments(Lex, Tok); |
| 164 | unsigned MaxOffset = SM.getFileOffset(SpellingLoc: Tok.getLocation()); |
| 165 | while (checkAndConsumeInclusiveDirective(Lex, Tok)) |
| 166 | MaxOffset = SM.getFileOffset(SpellingLoc: Tok.getLocation()); |
| 167 | return MaxOffset; |
| 168 | }); |
| 169 | } |
| 170 | |
| 171 | inline StringRef trimInclude(StringRef IncludeName) { |
| 172 | return IncludeName.trim(Chars: "\"<>" ); |
| 173 | } |
| 174 | |
| 175 | const char IncludeRegexPattern[] = |
| 176 | R"(^[\t\ ]*#[\t\ ]*(import|include)[^"<]*(["<][^">]*[">]))" ; |
| 177 | |
| 178 | // The filename of Path excluding extension. |
| 179 | // Used to match implementation with headers, this differs from sys::path::stem: |
| 180 | // - in names with multiple dots (foo.cu.cc) it terminates at the *first* |
| 181 | // - an empty stem is never returned: /foo/.bar.x => .bar |
| 182 | // - we don't bother to handle . and .. specially |
| 183 | StringRef matchingStem(llvm::StringRef Path) { |
| 184 | StringRef Name = llvm::sys::path::filename(path: Path); |
| 185 | return Name.substr(Start: 0, N: Name.find(C: '.', From: 1)); |
| 186 | } |
| 187 | |
| 188 | } // anonymous namespace |
| 189 | |
| 190 | IncludeCategoryManager::IncludeCategoryManager(const IncludeStyle &Style, |
| 191 | StringRef FileName) |
| 192 | : Style(Style), FileName(FileName) { |
| 193 | for (const auto &Category : Style.IncludeCategories) { |
| 194 | CategoryRegexs.emplace_back(Args: Category.Regex, Args: Category.RegexIsCaseSensitive |
| 195 | ? llvm::Regex::NoFlags |
| 196 | : llvm::Regex::IgnoreCase); |
| 197 | } |
| 198 | IsMainFile = FileName.ends_with(Suffix: ".c" ) || FileName.ends_with(Suffix: ".cc" ) || |
| 199 | FileName.ends_with(Suffix: ".cpp" ) || FileName.ends_with(Suffix: ".c++" ) || |
| 200 | FileName.ends_with(Suffix: ".cxx" ) || FileName.ends_with(Suffix: ".m" ) || |
| 201 | FileName.ends_with(Suffix: ".mm" ); |
| 202 | if (!Style.IncludeIsMainSourceRegex.empty()) { |
| 203 | llvm::Regex MainFileRegex(Style.IncludeIsMainSourceRegex); |
| 204 | IsMainFile |= MainFileRegex.match(String: FileName); |
| 205 | } |
| 206 | } |
| 207 | |
| 208 | int IncludeCategoryManager::getIncludePriority(StringRef IncludeName, |
| 209 | bool CheckMainHeader) const { |
| 210 | int Ret = INT_MAX; |
| 211 | for (unsigned i = 0, e = CategoryRegexs.size(); i != e; ++i) |
| 212 | if (CategoryRegexs[i].match(String: IncludeName)) { |
| 213 | Ret = Style.IncludeCategories[i].Priority; |
| 214 | break; |
| 215 | } |
| 216 | if (CheckMainHeader && IsMainFile && Ret > 0 && isMainHeader(IncludeName)) |
| 217 | Ret = 0; |
| 218 | return Ret; |
| 219 | } |
| 220 | |
| 221 | int IncludeCategoryManager::getSortIncludePriority(StringRef IncludeName, |
| 222 | bool CheckMainHeader) const { |
| 223 | int Ret = INT_MAX; |
| 224 | for (unsigned i = 0, e = CategoryRegexs.size(); i != e; ++i) |
| 225 | if (CategoryRegexs[i].match(String: IncludeName)) { |
| 226 | Ret = Style.IncludeCategories[i].SortPriority; |
| 227 | if (Ret == 0) |
| 228 | Ret = Style.IncludeCategories[i].Priority; |
| 229 | break; |
| 230 | } |
| 231 | if (CheckMainHeader && IsMainFile && Ret > 0 && isMainHeader(IncludeName)) |
| 232 | Ret = 0; |
| 233 | return Ret; |
| 234 | } |
| 235 | bool IncludeCategoryManager::isMainHeader(StringRef IncludeName) const { |
| 236 | switch (Style.MainIncludeChar) { |
| 237 | case IncludeStyle::MICD_Quote: |
| 238 | if (!IncludeName.starts_with(Prefix: "\"" )) |
| 239 | return false; |
| 240 | break; |
| 241 | case IncludeStyle::MICD_AngleBracket: |
| 242 | if (!IncludeName.starts_with(Prefix: "<" )) |
| 243 | return false; |
| 244 | break; |
| 245 | case IncludeStyle::MICD_Any: |
| 246 | break; |
| 247 | } |
| 248 | |
| 249 | IncludeName = |
| 250 | IncludeName.drop_front(N: 1).drop_back(N: 1); // remove the surrounding "" or <> |
| 251 | // Not matchingStem: implementation files may have compound extensions but |
| 252 | // headers may not. |
| 253 | StringRef = llvm::sys::path::stem(path: IncludeName); |
| 254 | StringRef FileStem = llvm::sys::path::stem(path: FileName); // foo.cu for foo.cu.cc |
| 255 | StringRef MatchingFileStem = matchingStem(Path: FileName); // foo for foo.cu.cc |
| 256 | // main-header examples: |
| 257 | // 1) foo.h => foo.cc |
| 258 | // 2) foo.h => foo.cu.cc |
| 259 | // 3) foo.proto.h => foo.proto.cc |
| 260 | // |
| 261 | // non-main-header examples: |
| 262 | // 1) foo.h => bar.cc |
| 263 | // 2) foo.proto.h => foo.cc |
| 264 | StringRef Matching; |
| 265 | if (MatchingFileStem.starts_with_insensitive(Prefix: HeaderStem)) |
| 266 | Matching = MatchingFileStem; // example 1), 2) |
| 267 | else if (FileStem.equals_insensitive(RHS: HeaderStem)) |
| 268 | Matching = FileStem; // example 3) |
| 269 | if (!Matching.empty()) { |
| 270 | llvm::Regex MainIncludeRegex(HeaderStem.str() + Style.IncludeIsMainRegex, |
| 271 | llvm::Regex::IgnoreCase); |
| 272 | if (MainIncludeRegex.match(String: Matching)) |
| 273 | return true; |
| 274 | } |
| 275 | return false; |
| 276 | } |
| 277 | |
| 278 | const llvm::Regex HeaderIncludes::(IncludeRegexPattern); |
| 279 | |
| 280 | HeaderIncludes::(StringRef FileName, StringRef Code, |
| 281 | const IncludeStyle &Style) |
| 282 | : FileName(FileName), Code(Code), FirstIncludeOffset(-1), |
| 283 | MinInsertOffset( |
| 284 | getOffsetAfterHeaderGuardsAndComments(FileName, Code, Style)), |
| 285 | MaxInsertOffset(MinInsertOffset + |
| 286 | getMaxHeaderInsertionOffset( |
| 287 | FileName, Code: Code.drop_front(N: MinInsertOffset), Style)), |
| 288 | MainIncludeFound(false), |
| 289 | Categories(Style, FileName) { |
| 290 | // Add 0 for main header and INT_MAX for headers that are not in any |
| 291 | // category. |
| 292 | Priorities = {0, INT_MAX}; |
| 293 | for (const auto &Category : Style.IncludeCategories) |
| 294 | Priorities.insert(x: Category.Priority); |
| 295 | SmallVector<StringRef, 32> Lines; |
| 296 | Code.drop_front(N: MinInsertOffset).split(A&: Lines, Separator: "\n" ); |
| 297 | |
| 298 | unsigned Offset = MinInsertOffset; |
| 299 | unsigned NextLineOffset; |
| 300 | SmallVector<StringRef, 4> Matches; |
| 301 | for (auto Line : Lines) { |
| 302 | NextLineOffset = std::min(a: Code.size(), b: Offset + Line.size() + 1); |
| 303 | if (IncludeRegex.match(String: Line, Matches: &Matches)) { |
| 304 | // If this is the last line without trailing newline, we need to make |
| 305 | // sure we don't delete across the file boundary. |
| 306 | addExistingInclude( |
| 307 | IncludeToAdd: Include(Matches[2], |
| 308 | tooling::Range( |
| 309 | Offset, std::min(a: Line.size() + 1, b: Code.size() - Offset)), |
| 310 | Matches[1] == "import" ? tooling::IncludeDirective::Import |
| 311 | : tooling::IncludeDirective::Include), |
| 312 | NextLineOffset); |
| 313 | } |
| 314 | Offset = NextLineOffset; |
| 315 | } |
| 316 | |
| 317 | // Populate CategoryEndOfssets: |
| 318 | // - Ensure that CategoryEndOffset[Highest] is always populated. |
| 319 | // - If CategoryEndOffset[Priority] isn't set, use the next higher value |
| 320 | // that is set, up to CategoryEndOffset[Highest]. |
| 321 | auto Highest = Priorities.begin(); |
| 322 | auto [It, Inserted] = CategoryEndOffsets.try_emplace(k: *Highest); |
| 323 | if (Inserted) |
| 324 | It->second = FirstIncludeOffset >= 0 ? FirstIncludeOffset : MinInsertOffset; |
| 325 | // By this point, CategoryEndOffset[Highest] is always set appropriately: |
| 326 | // - to an appropriate location before/after existing #includes, or |
| 327 | // - to right after the header guard, or |
| 328 | // - to the beginning of the file. |
| 329 | for (auto I = ++Priorities.begin(), E = Priorities.end(); I != E; ++I) |
| 330 | if (CategoryEndOffsets.find(x: *I) == CategoryEndOffsets.end()) |
| 331 | CategoryEndOffsets[*I] = CategoryEndOffsets[*std::prev(x: I)]; |
| 332 | } |
| 333 | |
| 334 | // \p Offset: the start of the line following this include directive. |
| 335 | void HeaderIncludes::(Include IncludeToAdd, |
| 336 | unsigned NextLineOffset) { |
| 337 | auto &Incs = ExistingIncludes[trimInclude(IncludeName: IncludeToAdd.Name)]; |
| 338 | Incs.push_back(x: std::move(IncludeToAdd)); |
| 339 | auto &CurInclude = Incs.back(); |
| 340 | // The header name with quotes or angle brackets. |
| 341 | // Only record the offset of current #include if we can insert after it. |
| 342 | if (CurInclude.R.getOffset() <= MaxInsertOffset) { |
| 343 | int Priority = Categories.getIncludePriority( |
| 344 | IncludeName: CurInclude.Name, /*CheckMainHeader=*/!MainIncludeFound); |
| 345 | if (Priority == 0) |
| 346 | MainIncludeFound = true; |
| 347 | CategoryEndOffsets[Priority] = NextLineOffset; |
| 348 | IncludesByPriority[Priority].push_back(Elt: &CurInclude); |
| 349 | if (FirstIncludeOffset < 0) |
| 350 | FirstIncludeOffset = CurInclude.R.getOffset(); |
| 351 | } |
| 352 | } |
| 353 | |
| 354 | std::optional<tooling::Replacement> |
| 355 | HeaderIncludes::(llvm::StringRef IncludeName, bool IsAngled, |
| 356 | IncludeDirective Directive) const { |
| 357 | assert(IncludeName == trimInclude(IncludeName)); |
| 358 | // If a <header> ("header") already exists in code, "header" (<header>) with |
| 359 | // different quotation and/or directive will still be inserted. |
| 360 | // FIXME: figure out if this is the best behavior. |
| 361 | auto It = ExistingIncludes.find(Key: IncludeName); |
| 362 | if (It != ExistingIncludes.end()) { |
| 363 | for (const auto &Inc : It->second) |
| 364 | if (Inc.Directive == Directive && |
| 365 | ((IsAngled && StringRef(Inc.Name).starts_with(Prefix: "<" )) || |
| 366 | (!IsAngled && StringRef(Inc.Name).starts_with(Prefix: "\"" )))) |
| 367 | return std::nullopt; |
| 368 | } |
| 369 | std::string Quoted = |
| 370 | std::string(llvm::formatv(Fmt: IsAngled ? "<{0}>" : "\"{0}\"" , Vals&: IncludeName)); |
| 371 | StringRef QuotedName = Quoted; |
| 372 | int Priority = Categories.getIncludePriority( |
| 373 | IncludeName: QuotedName, /*CheckMainHeader=*/!MainIncludeFound); |
| 374 | auto CatOffset = CategoryEndOffsets.find(x: Priority); |
| 375 | assert(CatOffset != CategoryEndOffsets.end()); |
| 376 | unsigned InsertOffset = CatOffset->second; // Fall back offset |
| 377 | auto Iter = IncludesByPriority.find(x: Priority); |
| 378 | if (Iter != IncludesByPriority.end()) { |
| 379 | for (const auto *Inc : Iter->second) { |
| 380 | if (QuotedName < Inc->Name) { |
| 381 | InsertOffset = Inc->R.getOffset(); |
| 382 | break; |
| 383 | } |
| 384 | } |
| 385 | } |
| 386 | assert(InsertOffset <= Code.size()); |
| 387 | llvm::StringRef DirectiveSpelling = |
| 388 | Directive == IncludeDirective::Include ? "include" : "import" ; |
| 389 | std::string NewInclude = |
| 390 | llvm::formatv(Fmt: "#{0} {1}\n" , Vals&: DirectiveSpelling, Vals&: QuotedName); |
| 391 | // When inserting headers at end of the code, also append '\n' to the code |
| 392 | // if it does not end with '\n'. |
| 393 | // FIXME: when inserting multiple #includes at the end of code, only one |
| 394 | // newline should be added. |
| 395 | if (InsertOffset == Code.size() && (!Code.empty() && Code.back() != '\n')) |
| 396 | NewInclude = "\n" + NewInclude; |
| 397 | return tooling::Replacement(FileName, InsertOffset, 0, NewInclude); |
| 398 | } |
| 399 | |
| 400 | tooling::Replacements HeaderIncludes::(llvm::StringRef IncludeName, |
| 401 | bool IsAngled) const { |
| 402 | assert(IncludeName == trimInclude(IncludeName)); |
| 403 | tooling::Replacements Result; |
| 404 | auto Iter = ExistingIncludes.find(Key: IncludeName); |
| 405 | if (Iter == ExistingIncludes.end()) |
| 406 | return Result; |
| 407 | for (const auto &Inc : Iter->second) { |
| 408 | if ((IsAngled && StringRef(Inc.Name).starts_with(Prefix: "\"" )) || |
| 409 | (!IsAngled && StringRef(Inc.Name).starts_with(Prefix: "<" ))) |
| 410 | continue; |
| 411 | llvm::Error Err = Result.add(R: tooling::Replacement( |
| 412 | FileName, Inc.R.getOffset(), Inc.R.getLength(), "" )); |
| 413 | if (Err) { |
| 414 | auto ErrMsg = "Unexpected conflicts in #include deletions: " + |
| 415 | llvm::toString(E: std::move(Err)); |
| 416 | llvm_unreachable(ErrMsg.c_str()); |
| 417 | } |
| 418 | } |
| 419 | return Result; |
| 420 | } |
| 421 | |
| 422 | } // namespace tooling |
| 423 | } // namespace clang |
| 424 | |