| 1 | //===- SampleProfile.cpp - Incorporate sample profiles into the IR --------===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | // |
| 9 | // This file implements the SampleProfileLoader transformation. This pass |
| 10 | // reads a profile file generated by a sampling profiler (e.g. Linux Perf - |
| 11 | // http://perf.wiki.kernel.org/) and generates IR metadata to reflect the |
| 12 | // profile information in the given profile. |
| 13 | // |
| 14 | // This pass generates branch weight annotations on the IR: |
| 15 | // |
| 16 | // - prof: Represents branch weights. This annotation is added to branches |
| 17 | // to indicate the weights of each edge coming out of the branch. |
| 18 | // The weight of each edge is the weight of the target block for |
| 19 | // that edge. The weight of a block B is computed as the maximum |
| 20 | // number of samples found in B. |
| 21 | // |
| 22 | //===----------------------------------------------------------------------===// |
| 23 | |
| 24 | #include "llvm/Transforms/IPO/SampleProfile.h" |
| 25 | #include "llvm/ADT/ArrayRef.h" |
| 26 | #include "llvm/ADT/DenseMap.h" |
| 27 | #include "llvm/ADT/DenseSet.h" |
| 28 | #include "llvm/ADT/MapVector.h" |
| 29 | #include "llvm/ADT/PriorityQueue.h" |
| 30 | #include "llvm/ADT/SCCIterator.h" |
| 31 | #include "llvm/ADT/SmallVector.h" |
| 32 | #include "llvm/ADT/Statistic.h" |
| 33 | #include "llvm/ADT/StringRef.h" |
| 34 | #include "llvm/ADT/Twine.h" |
| 35 | #include "llvm/Analysis/AssumptionCache.h" |
| 36 | #include "llvm/Analysis/BlockFrequencyInfoImpl.h" |
| 37 | #include "llvm/Analysis/InlineAdvisor.h" |
| 38 | #include "llvm/Analysis/InlineCost.h" |
| 39 | #include "llvm/Analysis/LazyCallGraph.h" |
| 40 | #include "llvm/Analysis/OptimizationRemarkEmitter.h" |
| 41 | #include "llvm/Analysis/ProfileSummaryInfo.h" |
| 42 | #include "llvm/Analysis/ReplayInlineAdvisor.h" |
| 43 | #include "llvm/Analysis/TargetLibraryInfo.h" |
| 44 | #include "llvm/Analysis/TargetTransformInfo.h" |
| 45 | #include "llvm/IR/BasicBlock.h" |
| 46 | #include "llvm/IR/DebugLoc.h" |
| 47 | #include "llvm/IR/DiagnosticInfo.h" |
| 48 | #include "llvm/IR/Function.h" |
| 49 | #include "llvm/IR/GlobalValue.h" |
| 50 | #include "llvm/IR/InstrTypes.h" |
| 51 | #include "llvm/IR/Instruction.h" |
| 52 | #include "llvm/IR/Instructions.h" |
| 53 | #include "llvm/IR/IntrinsicInst.h" |
| 54 | #include "llvm/IR/LLVMContext.h" |
| 55 | #include "llvm/IR/MDBuilder.h" |
| 56 | #include "llvm/IR/Module.h" |
| 57 | #include "llvm/IR/PassManager.h" |
| 58 | #include "llvm/IR/ProfDataUtils.h" |
| 59 | #include "llvm/IR/PseudoProbe.h" |
| 60 | #include "llvm/IR/ValueSymbolTable.h" |
| 61 | #include "llvm/ProfileData/InstrProf.h" |
| 62 | #include "llvm/ProfileData/SampleProf.h" |
| 63 | #include "llvm/ProfileData/SampleProfReader.h" |
| 64 | #include "llvm/Support/Casting.h" |
| 65 | #include "llvm/Support/CommandLine.h" |
| 66 | #include "llvm/Support/Debug.h" |
| 67 | #include "llvm/Support/ErrorOr.h" |
| 68 | #include "llvm/Support/VirtualFileSystem.h" |
| 69 | #include "llvm/Support/raw_ostream.h" |
| 70 | #include "llvm/Transforms/IPO.h" |
| 71 | #include "llvm/Transforms/IPO/ProfiledCallGraph.h" |
| 72 | #include "llvm/Transforms/IPO/SampleContextTracker.h" |
| 73 | #include "llvm/Transforms/IPO/SampleProfileMatcher.h" |
| 74 | #include "llvm/Transforms/IPO/SampleProfileProbe.h" |
| 75 | #include "llvm/Transforms/Utils/CallPromotionUtils.h" |
| 76 | #include "llvm/Transforms/Utils/Cloning.h" |
| 77 | #include "llvm/Transforms/Utils/Instrumentation.h" |
| 78 | #include "llvm/Transforms/Utils/MisExpect.h" |
| 79 | #include "llvm/Transforms/Utils/SampleProfileLoaderBaseImpl.h" |
| 80 | #include "llvm/Transforms/Utils/SampleProfileLoaderBaseUtil.h" |
| 81 | #include <algorithm> |
| 82 | #include <cassert> |
| 83 | #include <cstdint> |
| 84 | #include <functional> |
| 85 | #include <limits> |
| 86 | #include <map> |
| 87 | #include <memory> |
| 88 | #include <queue> |
| 89 | #include <string> |
| 90 | #include <system_error> |
| 91 | #include <utility> |
| 92 | #include <vector> |
| 93 | |
| 94 | using namespace llvm; |
| 95 | using namespace sampleprof; |
| 96 | using namespace llvm::sampleprofutil; |
| 97 | using ProfileCount = Function::ProfileCount; |
| 98 | #define DEBUG_TYPE "sample-profile" |
| 99 | #define CSINLINE_DEBUG DEBUG_TYPE "-inline" |
| 100 | |
| 101 | STATISTIC(NumCSInlined, |
| 102 | "Number of functions inlined with context sensitive profile" ); |
| 103 | STATISTIC(NumCSNotInlined, |
| 104 | "Number of functions not inlined with context sensitive profile" ); |
| 105 | STATISTIC(NumMismatchedProfile, |
| 106 | "Number of functions with CFG mismatched profile" ); |
| 107 | STATISTIC(NumMatchedProfile, "Number of functions with CFG matched profile" ); |
| 108 | STATISTIC(NumDuplicatedInlinesite, |
| 109 | "Number of inlined callsites with a partial distribution factor" ); |
| 110 | |
| 111 | STATISTIC(NumCSInlinedHitMinLimit, |
| 112 | "Number of functions with FDO inline stopped due to min size limit" ); |
| 113 | STATISTIC(NumCSInlinedHitMaxLimit, |
| 114 | "Number of functions with FDO inline stopped due to max size limit" ); |
| 115 | STATISTIC( |
| 116 | NumCSInlinedHitGrowthLimit, |
| 117 | "Number of functions with FDO inline stopped due to growth size limit" ); |
| 118 | |
| 119 | // Command line option to specify the file to read samples from. This is |
| 120 | // mainly used for debugging. |
| 121 | static cl::opt<std::string> SampleProfileFile( |
| 122 | "sample-profile-file" , cl::init(Val: "" ), cl::value_desc("filename" ), |
| 123 | cl::desc("Profile file loaded by -sample-profile" ), cl::Hidden); |
| 124 | |
| 125 | // The named file contains a set of transformations that may have been applied |
| 126 | // to the symbol names between the program from which the sample data was |
| 127 | // collected and the current program's symbols. |
| 128 | static cl::opt<std::string> SampleProfileRemappingFile( |
| 129 | "sample-profile-remapping-file" , cl::init(Val: "" ), cl::value_desc("filename" ), |
| 130 | cl::desc("Profile remapping file loaded by -sample-profile" ), cl::Hidden); |
| 131 | |
| 132 | cl::opt<bool> SalvageStaleProfile( |
| 133 | "salvage-stale-profile" , cl::Hidden, cl::init(Val: false), |
| 134 | cl::desc("Salvage stale profile by fuzzy matching and use the remapped " |
| 135 | "location for sample profile query." )); |
| 136 | cl::opt<bool> |
| 137 | SalvageUnusedProfile("salvage-unused-profile" , cl::Hidden, cl::init(Val: false), |
| 138 | cl::desc("Salvage unused profile by matching with new " |
| 139 | "functions on call graph." )); |
| 140 | |
| 141 | cl::opt<bool> ReportProfileStaleness( |
| 142 | "report-profile-staleness" , cl::Hidden, cl::init(Val: false), |
| 143 | cl::desc("Compute and report stale profile statistical metrics." )); |
| 144 | |
| 145 | cl::opt<bool> PersistProfileStaleness( |
| 146 | "persist-profile-staleness" , cl::Hidden, cl::init(Val: false), |
| 147 | cl::desc("Compute stale profile statistical metrics and write it into the " |
| 148 | "native object file(.llvm_stats section)." )); |
| 149 | |
| 150 | static cl::opt<bool> ProfileSampleAccurate( |
| 151 | "profile-sample-accurate" , cl::Hidden, cl::init(Val: false), |
| 152 | cl::desc("If the sample profile is accurate, we will mark all un-sampled " |
| 153 | "callsite and function as having 0 samples. Otherwise, treat " |
| 154 | "un-sampled callsites and functions conservatively as unknown. " )); |
| 155 | |
| 156 | static cl::opt<bool> ProfileSampleBlockAccurate( |
| 157 | "profile-sample-block-accurate" , cl::Hidden, cl::init(Val: false), |
| 158 | cl::desc("If the sample profile is accurate, we will mark all un-sampled " |
| 159 | "branches and calls as having 0 samples. Otherwise, treat " |
| 160 | "them conservatively as unknown. " )); |
| 161 | |
| 162 | static cl::opt<bool> ProfileAccurateForSymsInList( |
| 163 | "profile-accurate-for-symsinlist" , cl::Hidden, cl::init(Val: true), |
| 164 | cl::desc("For symbols in profile symbol list, regard their profiles to " |
| 165 | "be accurate. It may be overridden by profile-sample-accurate. " )); |
| 166 | |
| 167 | static cl::opt<bool> ProfileMergeInlinee( |
| 168 | "sample-profile-merge-inlinee" , cl::Hidden, cl::init(Val: true), |
| 169 | cl::desc("Merge past inlinee's profile to outline version if sample " |
| 170 | "profile loader decided not to inline a call site. It will " |
| 171 | "only be enabled when top-down order of profile loading is " |
| 172 | "enabled. " )); |
| 173 | |
| 174 | static cl::opt<bool> ProfileTopDownLoad( |
| 175 | "sample-profile-top-down-load" , cl::Hidden, cl::init(Val: true), |
| 176 | cl::desc("Do profile annotation and inlining for functions in top-down " |
| 177 | "order of call graph during sample profile loading. It only " |
| 178 | "works for new pass manager. " )); |
| 179 | |
| 180 | static cl::opt<bool> |
| 181 | UseProfiledCallGraph("use-profiled-call-graph" , cl::init(Val: true), cl::Hidden, |
| 182 | cl::desc("Process functions in a top-down order " |
| 183 | "defined by the profiled call graph when " |
| 184 | "-sample-profile-top-down-load is on." )); |
| 185 | |
| 186 | static cl::opt<bool> ProfileSizeInline( |
| 187 | "sample-profile-inline-size" , cl::Hidden, cl::init(Val: false), |
| 188 | cl::desc("Inline cold call sites in profile loader if it's beneficial " |
| 189 | "for code size." )); |
| 190 | |
| 191 | // Since profiles are consumed by many passes, turning on this option has |
| 192 | // side effects. For instance, pre-link SCC inliner would see merged profiles |
| 193 | // and inline the hot functions (that are skipped in this pass). |
| 194 | static cl::opt<bool> DisableSampleLoaderInlining( |
| 195 | "disable-sample-loader-inlining" , cl::Hidden, cl::init(Val: false), |
| 196 | cl::desc( |
| 197 | "If true, artificially skip inline transformation in sample-loader " |
| 198 | "pass, and merge (or scale) profiles (as configured by " |
| 199 | "--sample-profile-merge-inlinee)." )); |
| 200 | |
| 201 | namespace llvm { |
| 202 | cl::opt<bool> |
| 203 | SortProfiledSCC("sort-profiled-scc-member" , cl::init(Val: true), cl::Hidden, |
| 204 | cl::desc("Sort profiled recursion by edge weights." )); |
| 205 | |
| 206 | cl::opt<int> ProfileInlineGrowthLimit( |
| 207 | "sample-profile-inline-growth-limit" , cl::Hidden, cl::init(Val: 12), |
| 208 | cl::desc("The size growth ratio limit for proirity-based sample profile " |
| 209 | "loader inlining." )); |
| 210 | |
| 211 | cl::opt<int> ProfileInlineLimitMin( |
| 212 | "sample-profile-inline-limit-min" , cl::Hidden, cl::init(Val: 100), |
| 213 | cl::desc("The lower bound of size growth limit for " |
| 214 | "proirity-based sample profile loader inlining." )); |
| 215 | |
| 216 | cl::opt<int> ProfileInlineLimitMax( |
| 217 | "sample-profile-inline-limit-max" , cl::Hidden, cl::init(Val: 10000), |
| 218 | cl::desc("The upper bound of size growth limit for " |
| 219 | "proirity-based sample profile loader inlining." )); |
| 220 | |
| 221 | cl::opt<int> SampleHotCallSiteThreshold( |
| 222 | "sample-profile-hot-inline-threshold" , cl::Hidden, cl::init(Val: 3000), |
| 223 | cl::desc("Hot callsite threshold for proirity-based sample profile loader " |
| 224 | "inlining." )); |
| 225 | |
| 226 | cl::opt<int> SampleColdCallSiteThreshold( |
| 227 | "sample-profile-cold-inline-threshold" , cl::Hidden, cl::init(Val: 45), |
| 228 | cl::desc("Threshold for inlining cold callsites" )); |
| 229 | } // namespace llvm |
| 230 | |
| 231 | static cl::opt<unsigned> ProfileICPRelativeHotness( |
| 232 | "sample-profile-icp-relative-hotness" , cl::Hidden, cl::init(Val: 25), |
| 233 | cl::desc( |
| 234 | "Relative hotness percentage threshold for indirect " |
| 235 | "call promotion in proirity-based sample profile loader inlining." )); |
| 236 | |
| 237 | static cl::opt<unsigned> ProfileICPRelativeHotnessSkip( |
| 238 | "sample-profile-icp-relative-hotness-skip" , cl::Hidden, cl::init(Val: 1), |
| 239 | cl::desc( |
| 240 | "Skip relative hotness check for ICP up to given number of targets." )); |
| 241 | |
| 242 | static cl::opt<unsigned> HotFuncCutoffForStalenessError( |
| 243 | "hot-func-cutoff-for-staleness-error" , cl::Hidden, cl::init(Val: 800000), |
| 244 | cl::desc("A function is considered hot for staleness error check if its " |
| 245 | "total sample count is above the specified percentile" )); |
| 246 | |
| 247 | static cl::opt<unsigned> MinfuncsForStalenessError( |
| 248 | "min-functions-for-staleness-error" , cl::Hidden, cl::init(Val: 50), |
| 249 | cl::desc("Skip the check if the number of hot functions is smaller than " |
| 250 | "the specified number." )); |
| 251 | |
| 252 | static cl::opt<unsigned> PrecentMismatchForStalenessError( |
| 253 | "precent-mismatch-for-staleness-error" , cl::Hidden, cl::init(Val: 80), |
| 254 | cl::desc("Reject the profile if the mismatch percent is higher than the " |
| 255 | "given number." )); |
| 256 | |
| 257 | static cl::opt<bool> CallsitePrioritizedInline( |
| 258 | "sample-profile-prioritized-inline" , cl::Hidden, |
| 259 | cl::desc("Use call site prioritized inlining for sample profile loader. " |
| 260 | "Currently only CSSPGO is supported." )); |
| 261 | |
| 262 | static cl::opt<bool> UsePreInlinerDecision( |
| 263 | "sample-profile-use-preinliner" , cl::Hidden, |
| 264 | cl::desc("Use the preinliner decisions stored in profile context." )); |
| 265 | |
| 266 | static cl::opt<bool> AllowRecursiveInline( |
| 267 | "sample-profile-recursive-inline" , cl::Hidden, |
| 268 | cl::desc("Allow sample loader inliner to inline recursive calls." )); |
| 269 | |
| 270 | static cl::opt<bool> RemoveProbeAfterProfileAnnotation( |
| 271 | "sample-profile-remove-probe" , cl::Hidden, cl::init(Val: false), |
| 272 | cl::desc("Remove pseudo-probe after sample profile annotation." )); |
| 273 | |
| 274 | static cl::opt<std::string> ProfileInlineReplayFile( |
| 275 | "sample-profile-inline-replay" , cl::init(Val: "" ), cl::value_desc("filename" ), |
| 276 | cl::desc( |
| 277 | "Optimization remarks file containing inline remarks to be replayed " |
| 278 | "by inlining from sample profile loader." ), |
| 279 | cl::Hidden); |
| 280 | |
| 281 | static cl::opt<ReplayInlinerSettings::Scope> ProfileInlineReplayScope( |
| 282 | "sample-profile-inline-replay-scope" , |
| 283 | cl::init(Val: ReplayInlinerSettings::Scope::Function), |
| 284 | cl::values(clEnumValN(ReplayInlinerSettings::Scope::Function, "Function" , |
| 285 | "Replay on functions that have remarks associated " |
| 286 | "with them (default)" ), |
| 287 | clEnumValN(ReplayInlinerSettings::Scope::Module, "Module" , |
| 288 | "Replay on the entire module" )), |
| 289 | cl::desc("Whether inline replay should be applied to the entire " |
| 290 | "Module or just the Functions (default) that are present as " |
| 291 | "callers in remarks during sample profile inlining." ), |
| 292 | cl::Hidden); |
| 293 | |
| 294 | static cl::opt<ReplayInlinerSettings::Fallback> ProfileInlineReplayFallback( |
| 295 | "sample-profile-inline-replay-fallback" , |
| 296 | cl::init(Val: ReplayInlinerSettings::Fallback::Original), |
| 297 | cl::values( |
| 298 | clEnumValN( |
| 299 | ReplayInlinerSettings::Fallback::Original, "Original" , |
| 300 | "All decisions not in replay send to original advisor (default)" ), |
| 301 | clEnumValN(ReplayInlinerSettings::Fallback::AlwaysInline, |
| 302 | "AlwaysInline" , "All decisions not in replay are inlined" ), |
| 303 | clEnumValN(ReplayInlinerSettings::Fallback::NeverInline, "NeverInline" , |
| 304 | "All decisions not in replay are not inlined" )), |
| 305 | cl::desc("How sample profile inline replay treats sites that don't come " |
| 306 | "from the replay. Original: defers to original advisor, " |
| 307 | "AlwaysInline: inline all sites not in replay, NeverInline: " |
| 308 | "inline no sites not in replay" ), |
| 309 | cl::Hidden); |
| 310 | |
| 311 | static cl::opt<CallSiteFormat::Format> ProfileInlineReplayFormat( |
| 312 | "sample-profile-inline-replay-format" , |
| 313 | cl::init(Val: CallSiteFormat::Format::LineColumnDiscriminator), |
| 314 | cl::values( |
| 315 | clEnumValN(CallSiteFormat::Format::Line, "Line" , "<Line Number>" ), |
| 316 | clEnumValN(CallSiteFormat::Format::LineColumn, "LineColumn" , |
| 317 | "<Line Number>:<Column Number>" ), |
| 318 | clEnumValN(CallSiteFormat::Format::LineDiscriminator, |
| 319 | "LineDiscriminator" , "<Line Number>.<Discriminator>" ), |
| 320 | clEnumValN(CallSiteFormat::Format::LineColumnDiscriminator, |
| 321 | "LineColumnDiscriminator" , |
| 322 | "<Line Number>:<Column Number>.<Discriminator> (default)" )), |
| 323 | cl::desc("How sample profile inline replay file is formatted" ), cl::Hidden); |
| 324 | |
| 325 | static cl::opt<unsigned> |
| 326 | MaxNumPromotions("sample-profile-icp-max-prom" , cl::init(Val: 3), cl::Hidden, |
| 327 | cl::desc("Max number of promotions for a single indirect " |
| 328 | "call callsite in sample profile loader" )); |
| 329 | |
| 330 | static cl::opt<bool> OverwriteExistingWeights( |
| 331 | "overwrite-existing-weights" , cl::Hidden, cl::init(Val: false), |
| 332 | cl::desc("Ignore existing branch weights on IR and always overwrite." )); |
| 333 | |
| 334 | static cl::opt<bool> AnnotateSampleProfileInlinePhase( |
| 335 | "annotate-sample-profile-inline-phase" , cl::Hidden, cl::init(Val: false), |
| 336 | cl::desc("Annotate LTO phase (prelink / postlink), or main (no LTO) for " |
| 337 | "sample-profile inline pass name." )); |
| 338 | |
| 339 | namespace llvm { |
| 340 | extern cl::opt<bool> EnableExtTspBlockPlacement; |
| 341 | } |
| 342 | |
| 343 | namespace { |
| 344 | |
| 345 | using BlockWeightMap = DenseMap<const BasicBlock *, uint64_t>; |
| 346 | using EquivalenceClassMap = DenseMap<const BasicBlock *, const BasicBlock *>; |
| 347 | using Edge = std::pair<const BasicBlock *, const BasicBlock *>; |
| 348 | using EdgeWeightMap = DenseMap<Edge, uint64_t>; |
| 349 | using BlockEdgeMap = |
| 350 | DenseMap<const BasicBlock *, SmallVector<const BasicBlock *, 8>>; |
| 351 | |
| 352 | class GUIDToFuncNameMapper { |
| 353 | public: |
| 354 | GUIDToFuncNameMapper(Module &M, SampleProfileReader &Reader, |
| 355 | DenseMap<uint64_t, StringRef> &GUIDToFuncNameMap) |
| 356 | : CurrentReader(Reader), CurrentModule(M), |
| 357 | CurrentGUIDToFuncNameMap(GUIDToFuncNameMap) { |
| 358 | if (!CurrentReader.useMD5()) |
| 359 | return; |
| 360 | |
| 361 | for (const auto &F : CurrentModule) { |
| 362 | StringRef OrigName = F.getName(); |
| 363 | CurrentGUIDToFuncNameMap.insert( |
| 364 | KV: {Function::getGUIDAssumingExternalLinkage(GlobalName: OrigName), OrigName}); |
| 365 | |
| 366 | // Local to global var promotion used by optimization like thinlto |
| 367 | // will rename the var and add suffix like ".llvm.xxx" to the |
| 368 | // original local name. In sample profile, the suffixes of function |
| 369 | // names are all stripped. Since it is possible that the mapper is |
| 370 | // built in post-thin-link phase and var promotion has been done, |
| 371 | // we need to add the substring of function name without the suffix |
| 372 | // into the GUIDToFuncNameMap. |
| 373 | StringRef CanonName = FunctionSamples::getCanonicalFnName(F); |
| 374 | if (CanonName != OrigName) |
| 375 | CurrentGUIDToFuncNameMap.insert( |
| 376 | KV: {Function::getGUIDAssumingExternalLinkage(GlobalName: CanonName), CanonName}); |
| 377 | } |
| 378 | |
| 379 | // Update GUIDToFuncNameMap for each function including inlinees. |
| 380 | SetGUIDToFuncNameMapForAll(&CurrentGUIDToFuncNameMap); |
| 381 | } |
| 382 | |
| 383 | ~GUIDToFuncNameMapper() { |
| 384 | if (!CurrentReader.useMD5()) |
| 385 | return; |
| 386 | |
| 387 | CurrentGUIDToFuncNameMap.clear(); |
| 388 | |
| 389 | // Reset GUIDToFuncNameMap for of each function as they're no |
| 390 | // longer valid at this point. |
| 391 | SetGUIDToFuncNameMapForAll(nullptr); |
| 392 | } |
| 393 | |
| 394 | private: |
| 395 | void SetGUIDToFuncNameMapForAll(DenseMap<uint64_t, StringRef> *Map) { |
| 396 | std::queue<FunctionSamples *> FSToUpdate; |
| 397 | for (auto &IFS : CurrentReader.getProfiles()) { |
| 398 | FSToUpdate.push(x: &IFS.second); |
| 399 | } |
| 400 | |
| 401 | while (!FSToUpdate.empty()) { |
| 402 | FunctionSamples *FS = FSToUpdate.front(); |
| 403 | FSToUpdate.pop(); |
| 404 | FS->GUIDToFuncNameMap = Map; |
| 405 | for (const auto &ICS : FS->getCallsiteSamples()) { |
| 406 | const FunctionSamplesMap &FSMap = ICS.second; |
| 407 | for (const auto &IFS : FSMap) { |
| 408 | FunctionSamples &FS = const_cast<FunctionSamples &>(IFS.second); |
| 409 | FSToUpdate.push(x: &FS); |
| 410 | } |
| 411 | } |
| 412 | } |
| 413 | } |
| 414 | |
| 415 | SampleProfileReader &CurrentReader; |
| 416 | Module &CurrentModule; |
| 417 | DenseMap<uint64_t, StringRef> &CurrentGUIDToFuncNameMap; |
| 418 | }; |
| 419 | |
| 420 | // Inline candidate used by iterative callsite prioritized inliner |
| 421 | struct InlineCandidate { |
| 422 | CallBase *CallInstr; |
| 423 | const FunctionSamples *CalleeSamples; |
| 424 | // Prorated callsite count, which will be used to guide inlining. For example, |
| 425 | // if a callsite is duplicated in LTO prelink, then in LTO postlink the two |
| 426 | // copies will get their own distribution factors and their prorated counts |
| 427 | // will be used to decide if they should be inlined independently. |
| 428 | uint64_t CallsiteCount; |
| 429 | // Call site distribution factor to prorate the profile samples for a |
| 430 | // duplicated callsite. Default value is 1.0. |
| 431 | float CallsiteDistribution; |
| 432 | }; |
| 433 | |
| 434 | // Inline candidate comparer using call site weight |
| 435 | struct CandidateComparer { |
| 436 | bool operator()(const InlineCandidate &LHS, const InlineCandidate &RHS) { |
| 437 | if (LHS.CallsiteCount != RHS.CallsiteCount) |
| 438 | return LHS.CallsiteCount < RHS.CallsiteCount; |
| 439 | |
| 440 | const FunctionSamples *LCS = LHS.CalleeSamples; |
| 441 | const FunctionSamples *RCS = RHS.CalleeSamples; |
| 442 | // In inline replay mode, CalleeSamples may be null and the order doesn't |
| 443 | // matter. |
| 444 | if (!LCS || !RCS) |
| 445 | return LCS; |
| 446 | |
| 447 | // Tie breaker using number of samples try to favor smaller functions first |
| 448 | if (LCS->getBodySamples().size() != RCS->getBodySamples().size()) |
| 449 | return LCS->getBodySamples().size() > RCS->getBodySamples().size(); |
| 450 | |
| 451 | // Tie breaker using GUID so we have stable/deterministic inlining order |
| 452 | return LCS->getGUID() < RCS->getGUID(); |
| 453 | } |
| 454 | }; |
| 455 | |
| 456 | using CandidateQueue = |
| 457 | PriorityQueue<InlineCandidate, std::vector<InlineCandidate>, |
| 458 | CandidateComparer>; |
| 459 | |
| 460 | /// Sample profile pass. |
| 461 | /// |
| 462 | /// This pass reads profile data from the file specified by |
| 463 | /// -sample-profile-file and annotates every affected function with the |
| 464 | /// profile information found in that file. |
| 465 | class SampleProfileLoader final : public SampleProfileLoaderBaseImpl<Function> { |
| 466 | public: |
| 467 | SampleProfileLoader( |
| 468 | StringRef Name, StringRef RemapName, ThinOrFullLTOPhase LTOPhase, |
| 469 | IntrusiveRefCntPtr<vfs::FileSystem> FS, |
| 470 | std::function<AssumptionCache &(Function &)> GetAssumptionCache, |
| 471 | std::function<TargetTransformInfo &(Function &)> GetTargetTransformInfo, |
| 472 | std::function<const TargetLibraryInfo &(Function &)> GetTLI, |
| 473 | LazyCallGraph &CG, bool DisableSampleProfileInlining, |
| 474 | bool UseFlattenedProfile) |
| 475 | : SampleProfileLoaderBaseImpl(std::string(Name), std::string(RemapName), |
| 476 | std::move(FS)), |
| 477 | GetAC(std::move(GetAssumptionCache)), |
| 478 | GetTTI(std::move(GetTargetTransformInfo)), GetTLI(std::move(GetTLI)), |
| 479 | CG(CG), LTOPhase(LTOPhase), |
| 480 | AnnotatedPassName(AnnotateSampleProfileInlinePhase |
| 481 | ? llvm::AnnotateInlinePassName(IC: InlineContext{ |
| 482 | .LTOPhase: LTOPhase, .Pass: InlinePass::SampleProfileInliner}) |
| 483 | : CSINLINE_DEBUG), |
| 484 | DisableSampleProfileInlining(DisableSampleProfileInlining), |
| 485 | UseFlattenedProfile(UseFlattenedProfile) {} |
| 486 | |
| 487 | bool doInitialization(Module &M, FunctionAnalysisManager *FAM = nullptr); |
| 488 | bool runOnModule(Module &M, ModuleAnalysisManager *AM, |
| 489 | ProfileSummaryInfo *_PSI); |
| 490 | |
| 491 | protected: |
| 492 | bool runOnFunction(Function &F, ModuleAnalysisManager *AM); |
| 493 | bool emitAnnotations(Function &F); |
| 494 | ErrorOr<uint64_t> getInstWeight(const Instruction &I) override; |
| 495 | const FunctionSamples *findCalleeFunctionSamples(const CallBase &I) const; |
| 496 | const FunctionSamples * |
| 497 | findFunctionSamples(const Instruction &I) const override; |
| 498 | std::vector<const FunctionSamples *> |
| 499 | findIndirectCallFunctionSamples(const Instruction &I, uint64_t &Sum) const; |
| 500 | void findExternalInlineCandidate(CallBase *CB, const FunctionSamples *Samples, |
| 501 | DenseSet<GlobalValue::GUID> &InlinedGUIDs, |
| 502 | uint64_t Threshold); |
| 503 | // Attempt to promote indirect call and also inline the promoted call |
| 504 | bool tryPromoteAndInlineCandidate( |
| 505 | Function &F, InlineCandidate &Candidate, uint64_t SumOrigin, |
| 506 | uint64_t &Sum, SmallVector<CallBase *, 8> *InlinedCallSites = nullptr); |
| 507 | |
| 508 | bool inlineHotFunctions(Function &F, |
| 509 | DenseSet<GlobalValue::GUID> &InlinedGUIDs); |
| 510 | std::optional<InlineCost> getExternalInlineAdvisorCost(CallBase &CB); |
| 511 | bool getExternalInlineAdvisorShouldInline(CallBase &CB); |
| 512 | InlineCost shouldInlineCandidate(InlineCandidate &Candidate); |
| 513 | bool getInlineCandidate(InlineCandidate *NewCandidate, CallBase *CB); |
| 514 | bool |
| 515 | tryInlineCandidate(InlineCandidate &Candidate, |
| 516 | SmallVector<CallBase *, 8> *InlinedCallSites = nullptr); |
| 517 | bool |
| 518 | inlineHotFunctionsWithPriority(Function &F, |
| 519 | DenseSet<GlobalValue::GUID> &InlinedGUIDs); |
| 520 | // Inline cold/small functions in addition to hot ones |
| 521 | bool shouldInlineColdCallee(CallBase &CallInst); |
| 522 | void emitOptimizationRemarksForInlineCandidates( |
| 523 | const SmallVectorImpl<CallBase *> &Candidates, const Function &F, |
| 524 | bool Hot); |
| 525 | void promoteMergeNotInlinedContextSamples( |
| 526 | MapVector<CallBase *, const FunctionSamples *> NonInlinedCallSites, |
| 527 | const Function &F); |
| 528 | std::vector<Function *> buildFunctionOrder(Module &M, LazyCallGraph &CG); |
| 529 | std::unique_ptr<ProfiledCallGraph> buildProfiledCallGraph(Module &M); |
| 530 | void generateMDProfMetadata(Function &F); |
| 531 | bool rejectHighStalenessProfile(Module &M, ProfileSummaryInfo *PSI, |
| 532 | const SampleProfileMap &Profiles); |
| 533 | void removePseudoProbeInstsDiscriminator(Module &M); |
| 534 | |
| 535 | /// Map from function name to Function *. Used to find the function from |
| 536 | /// the function name. If the function name contains suffix, additional |
| 537 | /// entry is added to map from the stripped name to the function if there |
| 538 | /// is one-to-one mapping. |
| 539 | HashKeyMap<std::unordered_map, FunctionId, Function *> SymbolMap; |
| 540 | |
| 541 | /// Map from function name to profile name generated by call-graph based |
| 542 | /// profile fuzzy matching(--salvage-unused-profile). |
| 543 | HashKeyMap<std::unordered_map, FunctionId, FunctionId> FuncNameToProfNameMap; |
| 544 | |
| 545 | std::function<AssumptionCache &(Function &)> GetAC; |
| 546 | std::function<TargetTransformInfo &(Function &)> GetTTI; |
| 547 | std::function<const TargetLibraryInfo &(Function &)> GetTLI; |
| 548 | LazyCallGraph &CG; |
| 549 | |
| 550 | /// Profile tracker for different context. |
| 551 | std::unique_ptr<SampleContextTracker> ContextTracker; |
| 552 | |
| 553 | /// Flag indicating which LTO/ThinLTO phase the pass is invoked in. |
| 554 | /// |
| 555 | /// We need to know the LTO phase because for example in ThinLTOPrelink |
| 556 | /// phase, in annotation, we should not promote indirect calls. Instead, |
| 557 | /// we will mark GUIDs that needs to be annotated to the function. |
| 558 | const ThinOrFullLTOPhase LTOPhase; |
| 559 | const std::string AnnotatedPassName; |
| 560 | |
| 561 | /// Profle Symbol list tells whether a function name appears in the binary |
| 562 | /// used to generate the current profile. |
| 563 | std::shared_ptr<ProfileSymbolList> PSL; |
| 564 | |
| 565 | // Information recorded when we declined to inline a call site |
| 566 | // because we have determined it is too cold is accumulated for |
| 567 | // each callee function. Initially this is just the entry count. |
| 568 | struct NotInlinedProfileInfo { |
| 569 | uint64_t entryCount; |
| 570 | }; |
| 571 | DenseMap<Function *, NotInlinedProfileInfo> notInlinedCallInfo; |
| 572 | |
| 573 | // GUIDToFuncNameMap saves the mapping from GUID to the symbol name, for |
| 574 | // all the function symbols defined or declared in current module. |
| 575 | DenseMap<uint64_t, StringRef> GUIDToFuncNameMap; |
| 576 | |
| 577 | // All the Names used in FunctionSamples including outline function |
| 578 | // names, inline instance names and call target names. |
| 579 | StringSet<> NamesInProfile; |
| 580 | // MD5 version of NamesInProfile. Either NamesInProfile or GUIDsInProfile is |
| 581 | // populated, depends on whether the profile uses MD5. Because the name table |
| 582 | // generally contains several magnitude more entries than the number of |
| 583 | // functions, we do not want to convert all names from one form to another. |
| 584 | llvm::DenseSet<uint64_t> GUIDsInProfile; |
| 585 | |
| 586 | // For symbol in profile symbol list, whether to regard their profiles |
| 587 | // to be accurate. It is mainly decided by existance of profile symbol |
| 588 | // list and -profile-accurate-for-symsinlist flag, but it can be |
| 589 | // overriden by -profile-sample-accurate or profile-sample-accurate |
| 590 | // attribute. |
| 591 | bool ProfAccForSymsInList; |
| 592 | |
| 593 | bool DisableSampleProfileInlining; |
| 594 | |
| 595 | bool UseFlattenedProfile; |
| 596 | |
| 597 | // External inline advisor used to replay inline decision from remarks. |
| 598 | std::unique_ptr<InlineAdvisor> ExternalInlineAdvisor; |
| 599 | |
| 600 | // A helper to implement the sample profile matching algorithm. |
| 601 | std::unique_ptr<SampleProfileMatcher> MatchingManager; |
| 602 | |
| 603 | private: |
| 604 | const char *() const { |
| 605 | return AnnotatedPassName.c_str(); |
| 606 | } |
| 607 | }; |
| 608 | } // end anonymous namespace |
| 609 | |
| 610 | namespace llvm { |
| 611 | template <> |
| 612 | inline bool SampleProfileInference<Function>::isExit(const BasicBlock *BB) { |
| 613 | return succ_empty(BB); |
| 614 | } |
| 615 | |
| 616 | template <> |
| 617 | inline void SampleProfileInference<Function>::findUnlikelyJumps( |
| 618 | const std::vector<const BasicBlockT *> &BasicBlocks, |
| 619 | BlockEdgeMap &Successors, FlowFunction &Func) { |
| 620 | for (auto &Jump : Func.Jumps) { |
| 621 | const auto *BB = BasicBlocks[Jump.Source]; |
| 622 | const auto *Succ = BasicBlocks[Jump.Target]; |
| 623 | const Instruction *TI = BB->getTerminator(); |
| 624 | // Check if a block ends with InvokeInst and mark non-taken branch unlikely. |
| 625 | // In that case block Succ should be a landing pad |
| 626 | const auto &Succs = Successors[BB]; |
| 627 | if (Succs.size() == 2 && Succs.back() == Succ) { |
| 628 | if (isa<InvokeInst>(Val: TI)) { |
| 629 | Jump.IsUnlikely = true; |
| 630 | } |
| 631 | } |
| 632 | const Instruction *SuccTI = Succ->getTerminator(); |
| 633 | // Check if the target block contains UnreachableInst and mark it unlikely |
| 634 | if (SuccTI->getNumSuccessors() == 0) { |
| 635 | if (isa<UnreachableInst>(Val: SuccTI)) { |
| 636 | Jump.IsUnlikely = true; |
| 637 | } |
| 638 | } |
| 639 | } |
| 640 | } |
| 641 | |
| 642 | template <> |
| 643 | void SampleProfileLoaderBaseImpl<Function>::computeDominanceAndLoopInfo( |
| 644 | Function &F) { |
| 645 | DT.reset(p: new DominatorTree); |
| 646 | DT->recalculate(Func&: F); |
| 647 | |
| 648 | PDT.reset(p: new PostDominatorTree(F)); |
| 649 | |
| 650 | LI.reset(p: new LoopInfo); |
| 651 | LI->analyze(DomTree: *DT); |
| 652 | } |
| 653 | } // namespace llvm |
| 654 | |
| 655 | ErrorOr<uint64_t> SampleProfileLoader::getInstWeight(const Instruction &Inst) { |
| 656 | if (FunctionSamples::ProfileIsProbeBased) |
| 657 | return getProbeWeight(Inst); |
| 658 | |
| 659 | const DebugLoc &DLoc = Inst.getDebugLoc(); |
| 660 | if (!DLoc) |
| 661 | return std::error_code(); |
| 662 | |
| 663 | // Ignore all intrinsics, phinodes and branch instructions. |
| 664 | // Branch and phinodes instruction usually contains debug info from sources |
| 665 | // outside of the residing basic block, thus we ignore them during annotation. |
| 666 | if (isa<BranchInst>(Val: Inst) || isa<IntrinsicInst>(Val: Inst) || isa<PHINode>(Val: Inst)) |
| 667 | return std::error_code(); |
| 668 | |
| 669 | // For non-CS profile, if a direct call/invoke instruction is inlined in |
| 670 | // profile (findCalleeFunctionSamples returns non-empty result), but not |
| 671 | // inlined here, it means that the inlined callsite has no sample, thus the |
| 672 | // call instruction should have 0 count. |
| 673 | // For CS profile, the callsite count of previously inlined callees is |
| 674 | // populated with the entry count of the callees. |
| 675 | if (!FunctionSamples::ProfileIsCS) |
| 676 | if (const auto *CB = dyn_cast<CallBase>(Val: &Inst)) |
| 677 | if (!CB->isIndirectCall() && findCalleeFunctionSamples(I: *CB)) |
| 678 | return 0; |
| 679 | |
| 680 | return getInstWeightImpl(Inst); |
| 681 | } |
| 682 | |
| 683 | /// Get the FunctionSamples for a call instruction. |
| 684 | /// |
| 685 | /// The FunctionSamples of a call/invoke instruction \p Inst is the inlined |
| 686 | /// instance in which that call instruction is calling to. It contains |
| 687 | /// all samples that resides in the inlined instance. We first find the |
| 688 | /// inlined instance in which the call instruction is from, then we |
| 689 | /// traverse its children to find the callsite with the matching |
| 690 | /// location. |
| 691 | /// |
| 692 | /// \param Inst Call/Invoke instruction to query. |
| 693 | /// |
| 694 | /// \returns The FunctionSamples pointer to the inlined instance. |
| 695 | const FunctionSamples * |
| 696 | SampleProfileLoader::findCalleeFunctionSamples(const CallBase &Inst) const { |
| 697 | const DILocation *DIL = Inst.getDebugLoc(); |
| 698 | if (!DIL) { |
| 699 | return nullptr; |
| 700 | } |
| 701 | |
| 702 | StringRef CalleeName; |
| 703 | if (Function *Callee = Inst.getCalledFunction()) |
| 704 | CalleeName = Callee->getName(); |
| 705 | |
| 706 | if (FunctionSamples::ProfileIsCS) |
| 707 | return ContextTracker->getCalleeContextSamplesFor(Inst, CalleeName); |
| 708 | |
| 709 | const FunctionSamples *FS = findFunctionSamples(I: Inst); |
| 710 | if (FS == nullptr) |
| 711 | return nullptr; |
| 712 | |
| 713 | return FS->findFunctionSamplesAt(Loc: FunctionSamples::getCallSiteIdentifier(DIL), |
| 714 | CalleeName, Remapper: Reader->getRemapper(), |
| 715 | FuncNameToProfNameMap: &FuncNameToProfNameMap); |
| 716 | } |
| 717 | |
| 718 | /// Returns a vector of FunctionSamples that are the indirect call targets |
| 719 | /// of \p Inst. The vector is sorted by the total number of samples. Stores |
| 720 | /// the total call count of the indirect call in \p Sum. |
| 721 | std::vector<const FunctionSamples *> |
| 722 | SampleProfileLoader::findIndirectCallFunctionSamples( |
| 723 | const Instruction &Inst, uint64_t &Sum) const { |
| 724 | const DILocation *DIL = Inst.getDebugLoc(); |
| 725 | std::vector<const FunctionSamples *> R; |
| 726 | |
| 727 | if (!DIL) { |
| 728 | return R; |
| 729 | } |
| 730 | |
| 731 | auto FSCompare = [](const FunctionSamples *L, const FunctionSamples *R) { |
| 732 | assert(L && R && "Expect non-null FunctionSamples" ); |
| 733 | if (L->getHeadSamplesEstimate() != R->getHeadSamplesEstimate()) |
| 734 | return L->getHeadSamplesEstimate() > R->getHeadSamplesEstimate(); |
| 735 | return L->getGUID() < R->getGUID(); |
| 736 | }; |
| 737 | |
| 738 | if (FunctionSamples::ProfileIsCS) { |
| 739 | auto CalleeSamples = |
| 740 | ContextTracker->getIndirectCalleeContextSamplesFor(DIL); |
| 741 | if (CalleeSamples.empty()) |
| 742 | return R; |
| 743 | |
| 744 | // For CSSPGO, we only use target context profile's entry count |
| 745 | // as that already includes both inlined callee and non-inlined ones.. |
| 746 | Sum = 0; |
| 747 | for (const auto *const FS : CalleeSamples) { |
| 748 | Sum += FS->getHeadSamplesEstimate(); |
| 749 | R.push_back(x: FS); |
| 750 | } |
| 751 | llvm::sort(C&: R, Comp: FSCompare); |
| 752 | return R; |
| 753 | } |
| 754 | |
| 755 | const FunctionSamples *FS = findFunctionSamples(I: Inst); |
| 756 | if (FS == nullptr) |
| 757 | return R; |
| 758 | |
| 759 | auto CallSite = FunctionSamples::getCallSiteIdentifier(DIL); |
| 760 | Sum = 0; |
| 761 | if (auto T = FS->findCallTargetMapAt(CallSite)) |
| 762 | for (const auto &T_C : *T) |
| 763 | Sum += T_C.second; |
| 764 | if (const FunctionSamplesMap *M = FS->findFunctionSamplesMapAt(Loc: CallSite)) { |
| 765 | if (M->empty()) |
| 766 | return R; |
| 767 | for (const auto &NameFS : *M) { |
| 768 | Sum += NameFS.second.getHeadSamplesEstimate(); |
| 769 | R.push_back(x: &NameFS.second); |
| 770 | } |
| 771 | llvm::sort(C&: R, Comp: FSCompare); |
| 772 | } |
| 773 | return R; |
| 774 | } |
| 775 | |
| 776 | const FunctionSamples * |
| 777 | SampleProfileLoader::findFunctionSamples(const Instruction &Inst) const { |
| 778 | if (FunctionSamples::ProfileIsProbeBased) { |
| 779 | std::optional<PseudoProbe> Probe = extractProbe(Inst); |
| 780 | if (!Probe) |
| 781 | return nullptr; |
| 782 | } |
| 783 | |
| 784 | const DILocation *DIL = Inst.getDebugLoc(); |
| 785 | if (!DIL) |
| 786 | return Samples; |
| 787 | |
| 788 | auto it = DILocation2SampleMap.try_emplace(Key: DIL,Args: nullptr); |
| 789 | if (it.second) { |
| 790 | if (FunctionSamples::ProfileIsCS) |
| 791 | it.first->second = ContextTracker->getContextSamplesFor(DIL); |
| 792 | else |
| 793 | it.first->second = Samples->findFunctionSamples( |
| 794 | DIL, Remapper: Reader->getRemapper(), FuncNameToProfNameMap: &FuncNameToProfNameMap); |
| 795 | } |
| 796 | return it.first->second; |
| 797 | } |
| 798 | |
| 799 | /// Check whether the indirect call promotion history of \p Inst allows |
| 800 | /// the promotion for \p Candidate. |
| 801 | /// If the profile count for the promotion candidate \p Candidate is |
| 802 | /// NOMORE_ICP_MAGICNUM, it means \p Candidate has already been promoted |
| 803 | /// for \p Inst. If we already have at least MaxNumPromotions |
| 804 | /// NOMORE_ICP_MAGICNUM count values in the value profile of \p Inst, we |
| 805 | /// cannot promote for \p Inst anymore. |
| 806 | static bool doesHistoryAllowICP(const Instruction &Inst, StringRef Candidate) { |
| 807 | uint64_t TotalCount = 0; |
| 808 | auto ValueData = getValueProfDataFromInst(Inst, ValueKind: IPVK_IndirectCallTarget, |
| 809 | MaxNumValueData: MaxNumPromotions, TotalC&: TotalCount, GetNoICPValue: true); |
| 810 | // No valid value profile so no promoted targets have been recorded |
| 811 | // before. Ok to do ICP. |
| 812 | if (ValueData.empty()) |
| 813 | return true; |
| 814 | |
| 815 | unsigned NumPromoted = 0; |
| 816 | for (const auto &V : ValueData) { |
| 817 | if (V.Count != NOMORE_ICP_MAGICNUM) |
| 818 | continue; |
| 819 | |
| 820 | // If the promotion candidate has NOMORE_ICP_MAGICNUM count in the |
| 821 | // metadata, it means the candidate has been promoted for this |
| 822 | // indirect call. |
| 823 | if (V.Value == Function::getGUIDAssumingExternalLinkage(GlobalName: Candidate)) |
| 824 | return false; |
| 825 | NumPromoted++; |
| 826 | // If already have MaxNumPromotions promotion, don't do it anymore. |
| 827 | if (NumPromoted == MaxNumPromotions) |
| 828 | return false; |
| 829 | } |
| 830 | return true; |
| 831 | } |
| 832 | |
| 833 | /// Update indirect call target profile metadata for \p Inst. |
| 834 | /// Usually \p Sum is the sum of counts of all the targets for \p Inst. |
| 835 | /// If it is 0, it means updateIDTMetaData is used to mark a |
| 836 | /// certain target to be promoted already. If it is not zero, |
| 837 | /// we expect to use it to update the total count in the value profile. |
| 838 | static void |
| 839 | updateIDTMetaData(Instruction &Inst, |
| 840 | const SmallVectorImpl<InstrProfValueData> &CallTargets, |
| 841 | uint64_t Sum) { |
| 842 | // Bail out early if MaxNumPromotions is zero. |
| 843 | // This prevents allocating an array of zero length below. |
| 844 | // |
| 845 | // Note `updateIDTMetaData` is called in two places so check |
| 846 | // `MaxNumPromotions` inside it. |
| 847 | if (MaxNumPromotions == 0) |
| 848 | return; |
| 849 | // OldSum is the existing total count in the value profile data. |
| 850 | uint64_t OldSum = 0; |
| 851 | auto ValueData = getValueProfDataFromInst(Inst, ValueKind: IPVK_IndirectCallTarget, |
| 852 | MaxNumValueData: MaxNumPromotions, TotalC&: OldSum, GetNoICPValue: true); |
| 853 | |
| 854 | DenseMap<uint64_t, uint64_t> ValueCountMap; |
| 855 | if (Sum == 0) { |
| 856 | assert((CallTargets.size() == 1 && |
| 857 | CallTargets[0].Count == NOMORE_ICP_MAGICNUM) && |
| 858 | "If sum is 0, assume only one element in CallTargets " |
| 859 | "with count being NOMORE_ICP_MAGICNUM" ); |
| 860 | // Initialize ValueCountMap with existing value profile data. |
| 861 | for (const auto &V : ValueData) |
| 862 | ValueCountMap[V.Value] = V.Count; |
| 863 | auto Pair = |
| 864 | ValueCountMap.try_emplace(Key: CallTargets[0].Value, Args: CallTargets[0].Count); |
| 865 | // If the target already exists in value profile, decrease the total |
| 866 | // count OldSum and reset the target's count to NOMORE_ICP_MAGICNUM. |
| 867 | if (!Pair.second) { |
| 868 | OldSum -= Pair.first->second; |
| 869 | Pair.first->second = NOMORE_ICP_MAGICNUM; |
| 870 | } |
| 871 | Sum = OldSum; |
| 872 | } else { |
| 873 | // Initialize ValueCountMap with existing NOMORE_ICP_MAGICNUM |
| 874 | // counts in the value profile. |
| 875 | for (const auto &V : ValueData) { |
| 876 | if (V.Count == NOMORE_ICP_MAGICNUM) |
| 877 | ValueCountMap[V.Value] = V.Count; |
| 878 | } |
| 879 | |
| 880 | for (const auto &Data : CallTargets) { |
| 881 | auto Pair = ValueCountMap.try_emplace(Key: Data.Value, Args: Data.Count); |
| 882 | if (Pair.second) |
| 883 | continue; |
| 884 | // The target represented by Data.Value has already been promoted. |
| 885 | // Keep the count as NOMORE_ICP_MAGICNUM in the profile and decrease |
| 886 | // Sum by Data.Count. |
| 887 | assert(Sum >= Data.Count && "Sum should never be less than Data.Count" ); |
| 888 | Sum -= Data.Count; |
| 889 | } |
| 890 | } |
| 891 | |
| 892 | SmallVector<InstrProfValueData, 8> NewCallTargets; |
| 893 | for (const auto &ValueCount : ValueCountMap) { |
| 894 | NewCallTargets.emplace_back( |
| 895 | Args: InstrProfValueData{.Value: ValueCount.first, .Count: ValueCount.second}); |
| 896 | } |
| 897 | |
| 898 | llvm::sort(C&: NewCallTargets, |
| 899 | Comp: [](const InstrProfValueData &L, const InstrProfValueData &R) { |
| 900 | return std::tie(args: L.Count, args: L.Value) > std::tie(args: R.Count, args: R.Value); |
| 901 | }); |
| 902 | |
| 903 | uint32_t MaxMDCount = |
| 904 | std::min(a: NewCallTargets.size(), b: static_cast<size_t>(MaxNumPromotions)); |
| 905 | annotateValueSite(M&: *Inst.getParent()->getParent()->getParent(), Inst, |
| 906 | VDs: NewCallTargets, Sum, ValueKind: IPVK_IndirectCallTarget, MaxMDCount); |
| 907 | } |
| 908 | |
| 909 | /// Attempt to promote indirect call and also inline the promoted call. |
| 910 | /// |
| 911 | /// \param F Caller function. |
| 912 | /// \param Candidate ICP and inline candidate. |
| 913 | /// \param SumOrigin Original sum of target counts for indirect call before |
| 914 | /// promoting given candidate. |
| 915 | /// \param Sum Prorated sum of remaining target counts for indirect call |
| 916 | /// after promoting given candidate. |
| 917 | /// \param InlinedCallSite Output vector for new call sites exposed after |
| 918 | /// inlining. |
| 919 | bool SampleProfileLoader::tryPromoteAndInlineCandidate( |
| 920 | Function &F, InlineCandidate &Candidate, uint64_t SumOrigin, uint64_t &Sum, |
| 921 | SmallVector<CallBase *, 8> *InlinedCallSite) { |
| 922 | // Bail out early if sample-loader inliner is disabled. |
| 923 | if (DisableSampleProfileInlining) |
| 924 | return false; |
| 925 | |
| 926 | // Bail out early if MaxNumPromotions is zero. |
| 927 | // This prevents allocating an array of zero length in callees below. |
| 928 | if (MaxNumPromotions == 0) |
| 929 | return false; |
| 930 | auto CalleeFunctionName = Candidate.CalleeSamples->getFunction(); |
| 931 | auto R = SymbolMap.find(Key: CalleeFunctionName); |
| 932 | if (R == SymbolMap.end() || !R->second) |
| 933 | return false; |
| 934 | |
| 935 | auto &CI = *Candidate.CallInstr; |
| 936 | if (!doesHistoryAllowICP(Inst: CI, Candidate: R->second->getName())) |
| 937 | return false; |
| 938 | |
| 939 | const char *Reason = "Callee function not available" ; |
| 940 | // R->getValue() != &F is to prevent promoting a recursive call. |
| 941 | // If it is a recursive call, we do not inline it as it could bloat |
| 942 | // the code exponentially. There is way to better handle this, e.g. |
| 943 | // clone the caller first, and inline the cloned caller if it is |
| 944 | // recursive. As llvm does not inline recursive calls, we will |
| 945 | // simply ignore it instead of handling it explicitly. |
| 946 | if (!R->second->isDeclaration() && R->second->getSubprogram() && |
| 947 | R->second->hasFnAttribute(Kind: "use-sample-profile" ) && |
| 948 | R->second != &F && isLegalToPromote(CB: CI, Callee: R->second, FailureReason: &Reason)) { |
| 949 | // For promoted target, set its value with NOMORE_ICP_MAGICNUM count |
| 950 | // in the value profile metadata so the target won't be promoted again. |
| 951 | SmallVector<InstrProfValueData, 1> SortedCallTargets = {InstrProfValueData{ |
| 952 | .Value: Function::getGUIDAssumingExternalLinkage(GlobalName: R->second->getName()), |
| 953 | .Count: NOMORE_ICP_MAGICNUM}}; |
| 954 | updateIDTMetaData(Inst&: CI, CallTargets: SortedCallTargets, Sum: 0); |
| 955 | |
| 956 | auto *DI = &pgo::promoteIndirectCall( |
| 957 | CB&: CI, F: R->second, Count: Candidate.CallsiteCount, TotalCount: Sum, AttachProfToDirectCall: false, ORE); |
| 958 | if (DI) { |
| 959 | Sum -= Candidate.CallsiteCount; |
| 960 | // Do not prorate the indirect callsite distribution since the original |
| 961 | // distribution will be used to scale down non-promoted profile target |
| 962 | // counts later. By doing this we lose track of the real callsite count |
| 963 | // for the leftover indirect callsite as a trade off for accurate call |
| 964 | // target counts. |
| 965 | // TODO: Ideally we would have two separate factors, one for call site |
| 966 | // counts and one is used to prorate call target counts. |
| 967 | // Do not update the promoted direct callsite distribution at this |
| 968 | // point since the original distribution combined with the callee profile |
| 969 | // will be used to prorate callsites from the callee if inlined. Once not |
| 970 | // inlined, the direct callsite distribution should be prorated so that |
| 971 | // the it will reflect the real callsite counts. |
| 972 | Candidate.CallInstr = DI; |
| 973 | if (isa<CallInst>(Val: DI) || isa<InvokeInst>(Val: DI)) { |
| 974 | bool Inlined = tryInlineCandidate(Candidate, InlinedCallSites: InlinedCallSite); |
| 975 | if (!Inlined) { |
| 976 | // Prorate the direct callsite distribution so that it reflects real |
| 977 | // callsite counts. |
| 978 | setProbeDistributionFactor( |
| 979 | Inst&: *DI, Factor: static_cast<float>(Candidate.CallsiteCount) / SumOrigin); |
| 980 | } |
| 981 | return Inlined; |
| 982 | } |
| 983 | } |
| 984 | } else { |
| 985 | LLVM_DEBUG(dbgs() << "\nFailed to promote indirect call to " |
| 986 | << FunctionSamples::getCanonicalFnName( |
| 987 | Candidate.CallInstr->getName())<< " because " |
| 988 | << Reason << "\n" ); |
| 989 | } |
| 990 | return false; |
| 991 | } |
| 992 | |
| 993 | bool SampleProfileLoader::shouldInlineColdCallee(CallBase &CallInst) { |
| 994 | if (!ProfileSizeInline) |
| 995 | return false; |
| 996 | |
| 997 | Function *Callee = CallInst.getCalledFunction(); |
| 998 | if (Callee == nullptr) |
| 999 | return false; |
| 1000 | |
| 1001 | InlineCost Cost = getInlineCost(Call&: CallInst, Params: getInlineParams(), CalleeTTI&: GetTTI(*Callee), |
| 1002 | GetAssumptionCache: GetAC, GetTLI); |
| 1003 | |
| 1004 | if (Cost.isNever()) |
| 1005 | return false; |
| 1006 | |
| 1007 | if (Cost.isAlways()) |
| 1008 | return true; |
| 1009 | |
| 1010 | return Cost.getCost() <= SampleColdCallSiteThreshold; |
| 1011 | } |
| 1012 | |
| 1013 | void SampleProfileLoader::emitOptimizationRemarksForInlineCandidates( |
| 1014 | const SmallVectorImpl<CallBase *> &Candidates, const Function &F, |
| 1015 | bool Hot) { |
| 1016 | for (auto *I : Candidates) { |
| 1017 | Function *CalledFunction = I->getCalledFunction(); |
| 1018 | if (CalledFunction) { |
| 1019 | ORE->emit(OptDiag: OptimizationRemarkAnalysis(getAnnotatedRemarkPassName(), |
| 1020 | "InlineAttempt" , I->getDebugLoc(), |
| 1021 | I->getParent()) |
| 1022 | << "previous inlining reattempted for " |
| 1023 | << (Hot ? "hotness: '" : "size: '" ) |
| 1024 | << ore::NV("Callee" , CalledFunction) << "' into '" |
| 1025 | << ore::NV("Caller" , &F) << "'" ); |
| 1026 | } |
| 1027 | } |
| 1028 | } |
| 1029 | |
| 1030 | void SampleProfileLoader::findExternalInlineCandidate( |
| 1031 | CallBase *CB, const FunctionSamples *Samples, |
| 1032 | DenseSet<GlobalValue::GUID> &InlinedGUIDs, uint64_t Threshold) { |
| 1033 | |
| 1034 | // If ExternalInlineAdvisor(ReplayInlineAdvisor) wants to inline an external |
| 1035 | // function make sure it's imported |
| 1036 | if (CB && getExternalInlineAdvisorShouldInline(CB&: *CB)) { |
| 1037 | // Samples may not exist for replayed function, if so |
| 1038 | // just add the direct GUID and move on |
| 1039 | if (!Samples) { |
| 1040 | InlinedGUIDs.insert(V: Function::getGUIDAssumingExternalLinkage( |
| 1041 | GlobalName: CB->getCalledFunction()->getName())); |
| 1042 | return; |
| 1043 | } |
| 1044 | // Otherwise, drop the threshold to import everything that we can |
| 1045 | Threshold = 0; |
| 1046 | } |
| 1047 | |
| 1048 | // In some rare cases, call instruction could be changed after being pushed |
| 1049 | // into inline candidate queue, this is because earlier inlining may expose |
| 1050 | // constant propagation which can change indirect call to direct call. When |
| 1051 | // this happens, we may fail to find matching function samples for the |
| 1052 | // candidate later, even if a match was found when the candidate was enqueued. |
| 1053 | if (!Samples) |
| 1054 | return; |
| 1055 | |
| 1056 | // For AutoFDO profile, retrieve candidate profiles by walking over |
| 1057 | // the nested inlinee profiles. |
| 1058 | if (!FunctionSamples::ProfileIsCS) { |
| 1059 | // Set threshold to zero to honor pre-inliner decision. |
| 1060 | if (UsePreInlinerDecision) |
| 1061 | Threshold = 0; |
| 1062 | Samples->findInlinedFunctions(S&: InlinedGUIDs, SymbolMap, Threshold); |
| 1063 | return; |
| 1064 | } |
| 1065 | |
| 1066 | ContextTrieNode *Caller = ContextTracker->getContextNodeForProfile(FSamples: Samples); |
| 1067 | std::queue<ContextTrieNode *> CalleeList; |
| 1068 | CalleeList.push(x: Caller); |
| 1069 | while (!CalleeList.empty()) { |
| 1070 | ContextTrieNode *Node = CalleeList.front(); |
| 1071 | CalleeList.pop(); |
| 1072 | FunctionSamples *CalleeSample = Node->getFunctionSamples(); |
| 1073 | // For CSSPGO profile, retrieve candidate profile by walking over the |
| 1074 | // trie built for context profile. Note that also take call targets |
| 1075 | // even if callee doesn't have a corresponding context profile. |
| 1076 | if (!CalleeSample) |
| 1077 | continue; |
| 1078 | |
| 1079 | // If pre-inliner decision is used, honor that for importing as well. |
| 1080 | bool PreInline = |
| 1081 | UsePreInlinerDecision && |
| 1082 | CalleeSample->getContext().hasAttribute(A: ContextShouldBeInlined); |
| 1083 | if (!PreInline && CalleeSample->getHeadSamplesEstimate() < Threshold) |
| 1084 | continue; |
| 1085 | |
| 1086 | Function *Func = SymbolMap.lookup(Key: CalleeSample->getFunction()); |
| 1087 | // Add to the import list only when it's defined out of module. |
| 1088 | if (!Func || Func->isDeclaration()) |
| 1089 | InlinedGUIDs.insert(V: CalleeSample->getGUID()); |
| 1090 | |
| 1091 | // Import hot CallTargets, which may not be available in IR because full |
| 1092 | // profile annotation cannot be done until backend compilation in ThinLTO. |
| 1093 | for (const auto &BS : CalleeSample->getBodySamples()) |
| 1094 | for (const auto &TS : BS.second.getCallTargets()) |
| 1095 | if (TS.second > Threshold) { |
| 1096 | const Function *Callee = SymbolMap.lookup(Key: TS.first); |
| 1097 | if (!Callee || Callee->isDeclaration()) |
| 1098 | InlinedGUIDs.insert(V: TS.first.getHashCode()); |
| 1099 | } |
| 1100 | |
| 1101 | // Import hot child context profile associted with callees. Note that this |
| 1102 | // may have some overlap with the call target loop above, but doing this |
| 1103 | // based child context profile again effectively allow us to use the max of |
| 1104 | // entry count and call target count to determine importing. |
| 1105 | for (auto &Child : Node->getAllChildContext()) { |
| 1106 | ContextTrieNode *CalleeNode = &Child.second; |
| 1107 | CalleeList.push(x: CalleeNode); |
| 1108 | } |
| 1109 | } |
| 1110 | } |
| 1111 | |
| 1112 | /// Iteratively inline hot callsites of a function. |
| 1113 | /// |
| 1114 | /// Iteratively traverse all callsites of the function \p F, so as to |
| 1115 | /// find out callsites with corresponding inline instances. |
| 1116 | /// |
| 1117 | /// For such callsites, |
| 1118 | /// - If it is hot enough, inline the callsites and adds callsites of the callee |
| 1119 | /// into the caller. If the call is an indirect call, first promote |
| 1120 | /// it to direct call. Each indirect call is limited with a single target. |
| 1121 | /// |
| 1122 | /// - If a callsite is not inlined, merge the its profile to the outline |
| 1123 | /// version (if --sample-profile-merge-inlinee is true), or scale the |
| 1124 | /// counters of standalone function based on the profile of inlined |
| 1125 | /// instances (if --sample-profile-merge-inlinee is false). |
| 1126 | /// |
| 1127 | /// Later passes may consume the updated profiles. |
| 1128 | /// |
| 1129 | /// \param F function to perform iterative inlining. |
| 1130 | /// \param InlinedGUIDs a set to be updated to include all GUIDs that are |
| 1131 | /// inlined in the profiled binary. |
| 1132 | /// |
| 1133 | /// \returns True if there is any inline happened. |
| 1134 | bool SampleProfileLoader::inlineHotFunctions( |
| 1135 | Function &F, DenseSet<GlobalValue::GUID> &InlinedGUIDs) { |
| 1136 | // ProfAccForSymsInList is used in callsiteIsHot. The assertion makes sure |
| 1137 | // Profile symbol list is ignored when profile-sample-accurate is on. |
| 1138 | assert((!ProfAccForSymsInList || |
| 1139 | (!ProfileSampleAccurate && |
| 1140 | !F.hasFnAttribute("profile-sample-accurate" ))) && |
| 1141 | "ProfAccForSymsInList should be false when profile-sample-accurate " |
| 1142 | "is enabled" ); |
| 1143 | |
| 1144 | MapVector<CallBase *, const FunctionSamples *> LocalNotInlinedCallSites; |
| 1145 | bool Changed = false; |
| 1146 | bool LocalChanged = true; |
| 1147 | while (LocalChanged) { |
| 1148 | LocalChanged = false; |
| 1149 | SmallVector<CallBase *, 10> CIS; |
| 1150 | for (auto &BB : F) { |
| 1151 | bool Hot = false; |
| 1152 | SmallVector<CallBase *, 10> AllCandidates; |
| 1153 | SmallVector<CallBase *, 10> ColdCandidates; |
| 1154 | for (auto &I : BB) { |
| 1155 | const FunctionSamples *FS = nullptr; |
| 1156 | if (auto *CB = dyn_cast<CallBase>(Val: &I)) { |
| 1157 | if (!isa<IntrinsicInst>(Val: I)) { |
| 1158 | if ((FS = findCalleeFunctionSamples(Inst: *CB))) { |
| 1159 | assert((!FunctionSamples::UseMD5 || FS->GUIDToFuncNameMap) && |
| 1160 | "GUIDToFuncNameMap has to be populated" ); |
| 1161 | AllCandidates.push_back(Elt: CB); |
| 1162 | if (FS->getHeadSamplesEstimate() > 0 || |
| 1163 | FunctionSamples::ProfileIsCS) |
| 1164 | LocalNotInlinedCallSites.insert(KV: {CB, FS}); |
| 1165 | if (callsiteIsHot(CallsiteFS: FS, PSI, ProfAccForSymsInList)) |
| 1166 | Hot = true; |
| 1167 | else if (shouldInlineColdCallee(CallInst&: *CB)) |
| 1168 | ColdCandidates.push_back(Elt: CB); |
| 1169 | } else if (getExternalInlineAdvisorShouldInline(CB&: *CB)) { |
| 1170 | AllCandidates.push_back(Elt: CB); |
| 1171 | } |
| 1172 | } |
| 1173 | } |
| 1174 | } |
| 1175 | if (Hot || ExternalInlineAdvisor) { |
| 1176 | CIS.insert(I: CIS.begin(), From: AllCandidates.begin(), To: AllCandidates.end()); |
| 1177 | emitOptimizationRemarksForInlineCandidates(Candidates: AllCandidates, F, Hot: true); |
| 1178 | } else { |
| 1179 | CIS.insert(I: CIS.begin(), From: ColdCandidates.begin(), To: ColdCandidates.end()); |
| 1180 | emitOptimizationRemarksForInlineCandidates(Candidates: ColdCandidates, F, Hot: false); |
| 1181 | } |
| 1182 | } |
| 1183 | for (CallBase *I : CIS) { |
| 1184 | Function *CalledFunction = I->getCalledFunction(); |
| 1185 | InlineCandidate Candidate = {.CallInstr: I, .CalleeSamples: LocalNotInlinedCallSites.lookup(Key: I), |
| 1186 | .CallsiteCount: 0 /* dummy count */, |
| 1187 | .CallsiteDistribution: 1.0 /* dummy distribution factor */}; |
| 1188 | // Do not inline recursive calls. |
| 1189 | if (CalledFunction == &F) |
| 1190 | continue; |
| 1191 | if (I->isIndirectCall()) { |
| 1192 | uint64_t Sum; |
| 1193 | for (const auto *FS : findIndirectCallFunctionSamples(Inst: *I, Sum)) { |
| 1194 | uint64_t SumOrigin = Sum; |
| 1195 | if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) { |
| 1196 | findExternalInlineCandidate(CB: I, Samples: FS, InlinedGUIDs, |
| 1197 | Threshold: PSI->getOrCompHotCountThreshold()); |
| 1198 | continue; |
| 1199 | } |
| 1200 | if (!callsiteIsHot(CallsiteFS: FS, PSI, ProfAccForSymsInList)) |
| 1201 | continue; |
| 1202 | |
| 1203 | Candidate = {.CallInstr: I, .CalleeSamples: FS, .CallsiteCount: FS->getHeadSamplesEstimate(), .CallsiteDistribution: 1.0}; |
| 1204 | if (tryPromoteAndInlineCandidate(F, Candidate, SumOrigin, Sum)) { |
| 1205 | LocalNotInlinedCallSites.erase(Key: I); |
| 1206 | LocalChanged = true; |
| 1207 | } |
| 1208 | } |
| 1209 | } else if (CalledFunction && CalledFunction->getSubprogram() && |
| 1210 | !CalledFunction->isDeclaration()) { |
| 1211 | if (tryInlineCandidate(Candidate)) { |
| 1212 | LocalNotInlinedCallSites.erase(Key: I); |
| 1213 | LocalChanged = true; |
| 1214 | } |
| 1215 | } else if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) { |
| 1216 | findExternalInlineCandidate(CB: I, Samples: findCalleeFunctionSamples(Inst: *I), |
| 1217 | InlinedGUIDs, |
| 1218 | Threshold: PSI->getOrCompHotCountThreshold()); |
| 1219 | } |
| 1220 | } |
| 1221 | Changed |= LocalChanged; |
| 1222 | } |
| 1223 | |
| 1224 | // For CS profile, profile for not inlined context will be merged when |
| 1225 | // base profile is being retrieved. |
| 1226 | if (!FunctionSamples::ProfileIsCS) |
| 1227 | promoteMergeNotInlinedContextSamples(NonInlinedCallSites: LocalNotInlinedCallSites, F); |
| 1228 | return Changed; |
| 1229 | } |
| 1230 | |
| 1231 | bool SampleProfileLoader::tryInlineCandidate( |
| 1232 | InlineCandidate &Candidate, SmallVector<CallBase *, 8> *InlinedCallSites) { |
| 1233 | // Do not attempt to inline a candidate if |
| 1234 | // --disable-sample-loader-inlining is true. |
| 1235 | if (DisableSampleProfileInlining) |
| 1236 | return false; |
| 1237 | |
| 1238 | CallBase &CB = *Candidate.CallInstr; |
| 1239 | Function *CalledFunction = CB.getCalledFunction(); |
| 1240 | assert(CalledFunction && "Expect a callee with definition" ); |
| 1241 | DebugLoc DLoc = CB.getDebugLoc(); |
| 1242 | BasicBlock *BB = CB.getParent(); |
| 1243 | |
| 1244 | InlineCost Cost = shouldInlineCandidate(Candidate); |
| 1245 | if (Cost.isNever()) { |
| 1246 | ORE->emit(OptDiag: OptimizationRemarkAnalysis(getAnnotatedRemarkPassName(), |
| 1247 | "InlineFail" , DLoc, BB) |
| 1248 | << "incompatible inlining" ); |
| 1249 | return false; |
| 1250 | } |
| 1251 | |
| 1252 | if (!Cost) |
| 1253 | return false; |
| 1254 | |
| 1255 | InlineFunctionInfo IFI(GetAC); |
| 1256 | IFI.UpdateProfile = false; |
| 1257 | InlineResult IR = InlineFunction(CB, IFI, |
| 1258 | /*MergeAttributes=*/true); |
| 1259 | if (!IR.isSuccess()) |
| 1260 | return false; |
| 1261 | |
| 1262 | // The call to InlineFunction erases I, so we can't pass it here. |
| 1263 | emitInlinedIntoBasedOnCost(ORE&: *ORE, DLoc, Block: BB, Callee: *CalledFunction, Caller: *BB->getParent(), |
| 1264 | IC: Cost, ForProfileContext: true, PassName: getAnnotatedRemarkPassName()); |
| 1265 | |
| 1266 | // Now populate the list of newly exposed call sites. |
| 1267 | if (InlinedCallSites) { |
| 1268 | InlinedCallSites->clear(); |
| 1269 | llvm::append_range(C&: *InlinedCallSites, R&: IFI.InlinedCallSites); |
| 1270 | } |
| 1271 | |
| 1272 | if (FunctionSamples::ProfileIsCS) |
| 1273 | ContextTracker->markContextSamplesInlined(InlinedSamples: Candidate.CalleeSamples); |
| 1274 | ++NumCSInlined; |
| 1275 | |
| 1276 | // Prorate inlined probes for a duplicated inlining callsite which probably |
| 1277 | // has a distribution less than 100%. Samples for an inlinee should be |
| 1278 | // distributed among the copies of the original callsite based on each |
| 1279 | // callsite's distribution factor for counts accuracy. Note that an inlined |
| 1280 | // probe may come with its own distribution factor if it has been duplicated |
| 1281 | // in the inlinee body. The two factor are multiplied to reflect the |
| 1282 | // aggregation of duplication. |
| 1283 | if (Candidate.CallsiteDistribution < 1) { |
| 1284 | for (auto &I : IFI.InlinedCallSites) { |
| 1285 | if (std::optional<PseudoProbe> Probe = extractProbe(Inst: *I)) |
| 1286 | setProbeDistributionFactor(Inst&: *I, Factor: Probe->Factor * |
| 1287 | Candidate.CallsiteDistribution); |
| 1288 | } |
| 1289 | NumDuplicatedInlinesite++; |
| 1290 | } |
| 1291 | |
| 1292 | return true; |
| 1293 | } |
| 1294 | |
| 1295 | bool SampleProfileLoader::getInlineCandidate(InlineCandidate *NewCandidate, |
| 1296 | CallBase *CB) { |
| 1297 | assert(CB && "Expect non-null call instruction" ); |
| 1298 | |
| 1299 | if (isa<IntrinsicInst>(Val: CB)) |
| 1300 | return false; |
| 1301 | |
| 1302 | // Find the callee's profile. For indirect call, find hottest target profile. |
| 1303 | const FunctionSamples *CalleeSamples = findCalleeFunctionSamples(Inst: *CB); |
| 1304 | // If ExternalInlineAdvisor wants to inline this site, do so even |
| 1305 | // if Samples are not present. |
| 1306 | if (!CalleeSamples && !getExternalInlineAdvisorShouldInline(CB&: *CB)) |
| 1307 | return false; |
| 1308 | |
| 1309 | float Factor = 1.0; |
| 1310 | if (std::optional<PseudoProbe> Probe = extractProbe(Inst: *CB)) |
| 1311 | Factor = Probe->Factor; |
| 1312 | |
| 1313 | uint64_t CallsiteCount = |
| 1314 | CalleeSamples ? CalleeSamples->getHeadSamplesEstimate() * Factor : 0; |
| 1315 | *NewCandidate = {.CallInstr: CB, .CalleeSamples: CalleeSamples, .CallsiteCount: CallsiteCount, .CallsiteDistribution: Factor}; |
| 1316 | return true; |
| 1317 | } |
| 1318 | |
| 1319 | std::optional<InlineCost> |
| 1320 | SampleProfileLoader::getExternalInlineAdvisorCost(CallBase &CB) { |
| 1321 | std::unique_ptr<InlineAdvice> Advice = nullptr; |
| 1322 | if (ExternalInlineAdvisor) { |
| 1323 | Advice = ExternalInlineAdvisor->getAdvice(CB); |
| 1324 | if (Advice) { |
| 1325 | if (!Advice->isInliningRecommended()) { |
| 1326 | Advice->recordUnattemptedInlining(); |
| 1327 | return InlineCost::getNever(Reason: "not previously inlined" ); |
| 1328 | } |
| 1329 | Advice->recordInlining(); |
| 1330 | return InlineCost::getAlways(Reason: "previously inlined" ); |
| 1331 | } |
| 1332 | } |
| 1333 | |
| 1334 | return {}; |
| 1335 | } |
| 1336 | |
| 1337 | bool SampleProfileLoader::getExternalInlineAdvisorShouldInline(CallBase &CB) { |
| 1338 | std::optional<InlineCost> Cost = getExternalInlineAdvisorCost(CB); |
| 1339 | return Cost ? !!*Cost : false; |
| 1340 | } |
| 1341 | |
| 1342 | InlineCost |
| 1343 | SampleProfileLoader::shouldInlineCandidate(InlineCandidate &Candidate) { |
| 1344 | if (std::optional<InlineCost> ReplayCost = |
| 1345 | getExternalInlineAdvisorCost(CB&: *Candidate.CallInstr)) |
| 1346 | return *ReplayCost; |
| 1347 | // Adjust threshold based on call site hotness, only do this for callsite |
| 1348 | // prioritized inliner because otherwise cost-benefit check is done earlier. |
| 1349 | int SampleThreshold = SampleColdCallSiteThreshold; |
| 1350 | if (CallsitePrioritizedInline) { |
| 1351 | if (Candidate.CallsiteCount > PSI->getHotCountThreshold()) |
| 1352 | SampleThreshold = SampleHotCallSiteThreshold; |
| 1353 | else if (!ProfileSizeInline) |
| 1354 | return InlineCost::getNever(Reason: "cold callsite" ); |
| 1355 | } |
| 1356 | |
| 1357 | Function *Callee = Candidate.CallInstr->getCalledFunction(); |
| 1358 | assert(Callee && "Expect a definition for inline candidate of direct call" ); |
| 1359 | |
| 1360 | InlineParams Params = getInlineParams(); |
| 1361 | // We will ignore the threshold from inline cost, so always get full cost. |
| 1362 | Params.ComputeFullInlineCost = true; |
| 1363 | Params.AllowRecursiveCall = AllowRecursiveInline; |
| 1364 | // Checks if there is anything in the reachable portion of the callee at |
| 1365 | // this callsite that makes this inlining potentially illegal. Need to |
| 1366 | // set ComputeFullInlineCost, otherwise getInlineCost may return early |
| 1367 | // when cost exceeds threshold without checking all IRs in the callee. |
| 1368 | // The acutal cost does not matter because we only checks isNever() to |
| 1369 | // see if it is legal to inline the callsite. |
| 1370 | InlineCost Cost = getInlineCost(Call&: *Candidate.CallInstr, Callee, Params, |
| 1371 | CalleeTTI&: GetTTI(*Callee), GetAssumptionCache: GetAC, GetTLI); |
| 1372 | |
| 1373 | // Honor always inline and never inline from call analyzer |
| 1374 | if (Cost.isNever() || Cost.isAlways()) |
| 1375 | return Cost; |
| 1376 | |
| 1377 | // With CSSPGO, the preinliner in llvm-profgen can estimate global inline |
| 1378 | // decisions based on hotness as well as accurate function byte sizes for |
| 1379 | // given context using function/inlinee sizes from previous build. It |
| 1380 | // stores the decision in profile, and also adjust/merge context profile |
| 1381 | // aiming at better context-sensitive post-inline profile quality, assuming |
| 1382 | // all inline decision estimates are going to be honored by compiler. Here |
| 1383 | // we replay that inline decision under `sample-profile-use-preinliner`. |
| 1384 | // Note that we don't need to handle negative decision from preinliner as |
| 1385 | // context profile for not inlined calls are merged by preinliner already. |
| 1386 | if (UsePreInlinerDecision && Candidate.CalleeSamples) { |
| 1387 | // Once two node are merged due to promotion, we're losing some context |
| 1388 | // so the original context-sensitive preinliner decision should be ignored |
| 1389 | // for SyntheticContext. |
| 1390 | SampleContext &Context = Candidate.CalleeSamples->getContext(); |
| 1391 | if (!Context.hasState(S: SyntheticContext) && |
| 1392 | Context.hasAttribute(A: ContextShouldBeInlined)) |
| 1393 | return InlineCost::getAlways(Reason: "preinliner" ); |
| 1394 | } |
| 1395 | |
| 1396 | // For old FDO inliner, we inline the call site if it is below hot threshold, |
| 1397 | // even if the function is hot based on sample profile data. This is to |
| 1398 | // prevent huge functions from being inlined. |
| 1399 | if (!CallsitePrioritizedInline) { |
| 1400 | return InlineCost::get(Cost: Cost.getCost(), Threshold: SampleHotCallSiteThreshold); |
| 1401 | } |
| 1402 | |
| 1403 | // Otherwise only use the cost from call analyzer, but overwite threshold with |
| 1404 | // Sample PGO threshold. |
| 1405 | return InlineCost::get(Cost: Cost.getCost(), Threshold: SampleThreshold); |
| 1406 | } |
| 1407 | |
| 1408 | bool SampleProfileLoader::inlineHotFunctionsWithPriority( |
| 1409 | Function &F, DenseSet<GlobalValue::GUID> &InlinedGUIDs) { |
| 1410 | // ProfAccForSymsInList is used in callsiteIsHot. The assertion makes sure |
| 1411 | // Profile symbol list is ignored when profile-sample-accurate is on. |
| 1412 | assert((!ProfAccForSymsInList || |
| 1413 | (!ProfileSampleAccurate && |
| 1414 | !F.hasFnAttribute("profile-sample-accurate" ))) && |
| 1415 | "ProfAccForSymsInList should be false when profile-sample-accurate " |
| 1416 | "is enabled" ); |
| 1417 | |
| 1418 | // Populating worklist with initial call sites from root inliner, along |
| 1419 | // with call site weights. |
| 1420 | CandidateQueue CQueue; |
| 1421 | InlineCandidate NewCandidate; |
| 1422 | for (auto &BB : F) { |
| 1423 | for (auto &I : BB) { |
| 1424 | auto *CB = dyn_cast<CallBase>(Val: &I); |
| 1425 | if (!CB) |
| 1426 | continue; |
| 1427 | if (getInlineCandidate(NewCandidate: &NewCandidate, CB)) |
| 1428 | CQueue.push(x: NewCandidate); |
| 1429 | } |
| 1430 | } |
| 1431 | |
| 1432 | // Cap the size growth from profile guided inlining. This is needed even |
| 1433 | // though cost of each inline candidate already accounts for callee size, |
| 1434 | // because with top-down inlining, we can grow inliner size significantly |
| 1435 | // with large number of smaller inlinees each pass the cost check. |
| 1436 | assert(ProfileInlineLimitMax >= ProfileInlineLimitMin && |
| 1437 | "Max inline size limit should not be smaller than min inline size " |
| 1438 | "limit." ); |
| 1439 | unsigned SizeLimit = F.getInstructionCount() * ProfileInlineGrowthLimit; |
| 1440 | SizeLimit = std::min(a: SizeLimit, b: (unsigned)ProfileInlineLimitMax); |
| 1441 | SizeLimit = std::max(a: SizeLimit, b: (unsigned)ProfileInlineLimitMin); |
| 1442 | if (ExternalInlineAdvisor) |
| 1443 | SizeLimit = std::numeric_limits<unsigned>::max(); |
| 1444 | |
| 1445 | MapVector<CallBase *, const FunctionSamples *> LocalNotInlinedCallSites; |
| 1446 | |
| 1447 | // Perform iterative BFS call site prioritized inlining |
| 1448 | bool Changed = false; |
| 1449 | while (!CQueue.empty() && F.getInstructionCount() < SizeLimit) { |
| 1450 | InlineCandidate Candidate = CQueue.top(); |
| 1451 | CQueue.pop(); |
| 1452 | CallBase *I = Candidate.CallInstr; |
| 1453 | Function *CalledFunction = I->getCalledFunction(); |
| 1454 | |
| 1455 | if (CalledFunction == &F) |
| 1456 | continue; |
| 1457 | if (I->isIndirectCall()) { |
| 1458 | uint64_t Sum = 0; |
| 1459 | auto CalleeSamples = findIndirectCallFunctionSamples(Inst: *I, Sum); |
| 1460 | uint64_t SumOrigin = Sum; |
| 1461 | Sum *= Candidate.CallsiteDistribution; |
| 1462 | unsigned ICPCount = 0; |
| 1463 | for (const auto *FS : CalleeSamples) { |
| 1464 | // TODO: Consider disable pre-lTO ICP for MonoLTO as well |
| 1465 | if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) { |
| 1466 | findExternalInlineCandidate(CB: I, Samples: FS, InlinedGUIDs, |
| 1467 | Threshold: PSI->getOrCompHotCountThreshold()); |
| 1468 | continue; |
| 1469 | } |
| 1470 | uint64_t EntryCountDistributed = |
| 1471 | FS->getHeadSamplesEstimate() * Candidate.CallsiteDistribution; |
| 1472 | // In addition to regular inline cost check, we also need to make sure |
| 1473 | // ICP isn't introducing excessive speculative checks even if individual |
| 1474 | // target looks beneficial to promote and inline. That means we should |
| 1475 | // only do ICP when there's a small number dominant targets. |
| 1476 | if (ICPCount >= ProfileICPRelativeHotnessSkip && |
| 1477 | EntryCountDistributed * 100 < SumOrigin * ProfileICPRelativeHotness) |
| 1478 | break; |
| 1479 | // TODO: Fix CallAnalyzer to handle all indirect calls. |
| 1480 | // For indirect call, we don't run CallAnalyzer to get InlineCost |
| 1481 | // before actual inlining. This is because we could see two different |
| 1482 | // types from the same definition, which makes CallAnalyzer choke as |
| 1483 | // it's expecting matching parameter type on both caller and callee |
| 1484 | // side. See example from PR18962 for the triggering cases (the bug was |
| 1485 | // fixed, but we generate different types). |
| 1486 | if (!PSI->isHotCount(C: EntryCountDistributed)) |
| 1487 | break; |
| 1488 | SmallVector<CallBase *, 8> InlinedCallSites; |
| 1489 | // Attach function profile for promoted indirect callee, and update |
| 1490 | // call site count for the promoted inline candidate too. |
| 1491 | Candidate = {.CallInstr: I, .CalleeSamples: FS, .CallsiteCount: EntryCountDistributed, |
| 1492 | .CallsiteDistribution: Candidate.CallsiteDistribution}; |
| 1493 | if (tryPromoteAndInlineCandidate(F, Candidate, SumOrigin, Sum, |
| 1494 | InlinedCallSite: &InlinedCallSites)) { |
| 1495 | for (auto *CB : InlinedCallSites) { |
| 1496 | if (getInlineCandidate(NewCandidate: &NewCandidate, CB)) |
| 1497 | CQueue.emplace(args&: NewCandidate); |
| 1498 | } |
| 1499 | ICPCount++; |
| 1500 | Changed = true; |
| 1501 | } else if (!ContextTracker) { |
| 1502 | LocalNotInlinedCallSites.insert(KV: {I, FS}); |
| 1503 | } |
| 1504 | } |
| 1505 | } else if (CalledFunction && CalledFunction->getSubprogram() && |
| 1506 | !CalledFunction->isDeclaration()) { |
| 1507 | SmallVector<CallBase *, 8> InlinedCallSites; |
| 1508 | if (tryInlineCandidate(Candidate, InlinedCallSites: &InlinedCallSites)) { |
| 1509 | for (auto *CB : InlinedCallSites) { |
| 1510 | if (getInlineCandidate(NewCandidate: &NewCandidate, CB)) |
| 1511 | CQueue.emplace(args&: NewCandidate); |
| 1512 | } |
| 1513 | Changed = true; |
| 1514 | } else if (!ContextTracker) { |
| 1515 | LocalNotInlinedCallSites.insert(KV: {I, Candidate.CalleeSamples}); |
| 1516 | } |
| 1517 | } else if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) { |
| 1518 | findExternalInlineCandidate(CB: I, Samples: findCalleeFunctionSamples(Inst: *I), |
| 1519 | InlinedGUIDs, |
| 1520 | Threshold: PSI->getOrCompHotCountThreshold()); |
| 1521 | } |
| 1522 | } |
| 1523 | |
| 1524 | if (!CQueue.empty()) { |
| 1525 | if (SizeLimit == (unsigned)ProfileInlineLimitMax) |
| 1526 | ++NumCSInlinedHitMaxLimit; |
| 1527 | else if (SizeLimit == (unsigned)ProfileInlineLimitMin) |
| 1528 | ++NumCSInlinedHitMinLimit; |
| 1529 | else |
| 1530 | ++NumCSInlinedHitGrowthLimit; |
| 1531 | } |
| 1532 | |
| 1533 | // For CS profile, profile for not inlined context will be merged when |
| 1534 | // base profile is being retrieved. |
| 1535 | if (!FunctionSamples::ProfileIsCS) |
| 1536 | promoteMergeNotInlinedContextSamples(NonInlinedCallSites: LocalNotInlinedCallSites, F); |
| 1537 | return Changed; |
| 1538 | } |
| 1539 | |
| 1540 | void SampleProfileLoader::promoteMergeNotInlinedContextSamples( |
| 1541 | MapVector<CallBase *, const FunctionSamples *> NonInlinedCallSites, |
| 1542 | const Function &F) { |
| 1543 | // Accumulate not inlined callsite information into notInlinedSamples |
| 1544 | for (const auto &Pair : NonInlinedCallSites) { |
| 1545 | CallBase *I = Pair.first; |
| 1546 | Function *Callee = I->getCalledFunction(); |
| 1547 | if (!Callee || Callee->isDeclaration()) |
| 1548 | continue; |
| 1549 | |
| 1550 | ORE->emit( |
| 1551 | OptDiag: OptimizationRemarkAnalysis(getAnnotatedRemarkPassName(), "NotInline" , |
| 1552 | I->getDebugLoc(), I->getParent()) |
| 1553 | << "previous inlining not repeated: '" << ore::NV("Callee" , Callee) |
| 1554 | << "' into '" << ore::NV("Caller" , &F) << "'" ); |
| 1555 | |
| 1556 | ++NumCSNotInlined; |
| 1557 | const FunctionSamples *FS = Pair.second; |
| 1558 | if (FS->getTotalSamples() == 0 && FS->getHeadSamplesEstimate() == 0) { |
| 1559 | continue; |
| 1560 | } |
| 1561 | |
| 1562 | // Do not merge a context that is already duplicated into the base profile. |
| 1563 | if (FS->getContext().hasAttribute(A: sampleprof::ContextDuplicatedIntoBase)) |
| 1564 | continue; |
| 1565 | |
| 1566 | if (ProfileMergeInlinee) { |
| 1567 | // A function call can be replicated by optimizations like callsite |
| 1568 | // splitting or jump threading and the replicates end up sharing the |
| 1569 | // sample nested callee profile instead of slicing the original |
| 1570 | // inlinee's profile. We want to do merge exactly once by filtering out |
| 1571 | // callee profiles with a non-zero head sample count. |
| 1572 | if (FS->getHeadSamples() == 0) { |
| 1573 | // Use entry samples as head samples during the merge, as inlinees |
| 1574 | // don't have head samples. |
| 1575 | const_cast<FunctionSamples *>(FS)->addHeadSamples( |
| 1576 | Num: FS->getHeadSamplesEstimate()); |
| 1577 | |
| 1578 | // Note that we have to do the merge right after processing function. |
| 1579 | // This allows OutlineFS's profile to be used for annotation during |
| 1580 | // top-down processing of functions' annotation. |
| 1581 | FunctionSamples *OutlineFS = Reader->getSamplesFor(F: *Callee); |
| 1582 | // If outlined function does not exist in the profile, add it to a |
| 1583 | // separate map so that it does not rehash the original profile. |
| 1584 | if (!OutlineFS) |
| 1585 | OutlineFS = &OutlineFunctionSamples[ |
| 1586 | FunctionId(FunctionSamples::getCanonicalFnName(FnName: Callee->getName()))]; |
| 1587 | OutlineFS->merge(Other: *FS, Weight: 1); |
| 1588 | // Set outlined profile to be synthetic to not bias the inliner. |
| 1589 | OutlineFS->setContextSynthetic(); |
| 1590 | } |
| 1591 | } else { |
| 1592 | auto pair = |
| 1593 | notInlinedCallInfo.try_emplace(Key: Callee, Args: NotInlinedProfileInfo{.entryCount: 0}); |
| 1594 | pair.first->second.entryCount += FS->getHeadSamplesEstimate(); |
| 1595 | } |
| 1596 | } |
| 1597 | } |
| 1598 | |
| 1599 | /// Returns the sorted CallTargetMap \p M by count in descending order. |
| 1600 | static SmallVector<InstrProfValueData, 2> |
| 1601 | GetSortedValueDataFromCallTargets(const SampleRecord::CallTargetMap &M) { |
| 1602 | SmallVector<InstrProfValueData, 2> R; |
| 1603 | for (const auto &I : SampleRecord::sortCallTargets(Targets: M)) { |
| 1604 | R.emplace_back( |
| 1605 | Args: InstrProfValueData{.Value: I.first.getHashCode(), .Count: I.second}); |
| 1606 | } |
| 1607 | return R; |
| 1608 | } |
| 1609 | |
| 1610 | // Generate MD_prof metadata for every branch instruction using the |
| 1611 | // edge weights computed during propagation. |
| 1612 | void SampleProfileLoader::generateMDProfMetadata(Function &F) { |
| 1613 | // Generate MD_prof metadata for every branch instruction using the |
| 1614 | // edge weights computed during propagation. |
| 1615 | LLVM_DEBUG(dbgs() << "\nPropagation complete. Setting branch weights\n" ); |
| 1616 | LLVMContext &Ctx = F.getContext(); |
| 1617 | MDBuilder MDB(Ctx); |
| 1618 | for (auto &BI : F) { |
| 1619 | BasicBlock *BB = &BI; |
| 1620 | |
| 1621 | if (BlockWeights[BB]) { |
| 1622 | for (auto &I : *BB) { |
| 1623 | if (!isa<CallInst>(Val: I) && !isa<InvokeInst>(Val: I)) |
| 1624 | continue; |
| 1625 | if (!cast<CallBase>(Val&: I).getCalledFunction()) { |
| 1626 | const DebugLoc &DLoc = I.getDebugLoc(); |
| 1627 | if (!DLoc) |
| 1628 | continue; |
| 1629 | const DILocation *DIL = DLoc; |
| 1630 | const FunctionSamples *FS = findFunctionSamples(Inst: I); |
| 1631 | if (!FS) |
| 1632 | continue; |
| 1633 | auto CallSite = FunctionSamples::getCallSiteIdentifier(DIL); |
| 1634 | ErrorOr<SampleRecord::CallTargetMap> T = |
| 1635 | FS->findCallTargetMapAt(CallSite); |
| 1636 | if (!T || T.get().empty()) |
| 1637 | continue; |
| 1638 | if (FunctionSamples::ProfileIsProbeBased) { |
| 1639 | // Prorate the callsite counts based on the pre-ICP distribution |
| 1640 | // factor to reflect what is already done to the callsite before |
| 1641 | // ICP, such as calliste cloning. |
| 1642 | if (std::optional<PseudoProbe> Probe = extractProbe(Inst: I)) { |
| 1643 | if (Probe->Factor < 1) |
| 1644 | T = SampleRecord::adjustCallTargets(Targets: T.get(), DistributionFactor: Probe->Factor); |
| 1645 | } |
| 1646 | } |
| 1647 | SmallVector<InstrProfValueData, 2> SortedCallTargets = |
| 1648 | GetSortedValueDataFromCallTargets(M: T.get()); |
| 1649 | uint64_t Sum = 0; |
| 1650 | for (const auto &C : T.get()) |
| 1651 | Sum += C.second; |
| 1652 | // With CSSPGO all indirect call targets are counted torwards the |
| 1653 | // original indirect call site in the profile, including both |
| 1654 | // inlined and non-inlined targets. |
| 1655 | if (!FunctionSamples::ProfileIsCS) { |
| 1656 | if (const FunctionSamplesMap *M = |
| 1657 | FS->findFunctionSamplesMapAt(Loc: CallSite)) { |
| 1658 | for (const auto &NameFS : *M) |
| 1659 | Sum += NameFS.second.getHeadSamplesEstimate(); |
| 1660 | } |
| 1661 | } |
| 1662 | if (Sum) |
| 1663 | updateIDTMetaData(Inst&: I, CallTargets: SortedCallTargets, Sum); |
| 1664 | else if (OverwriteExistingWeights) |
| 1665 | I.setMetadata(KindID: LLVMContext::MD_prof, Node: nullptr); |
| 1666 | } else if (!isa<IntrinsicInst>(Val: &I)) { |
| 1667 | setBranchWeights(I, Weights: {static_cast<uint32_t>(BlockWeights[BB])}, |
| 1668 | /*IsExpected=*/false); |
| 1669 | } |
| 1670 | } |
| 1671 | } else if (OverwriteExistingWeights || ProfileSampleBlockAccurate) { |
| 1672 | // Set profile metadata (possibly annotated by LTO prelink) to zero or |
| 1673 | // clear it for cold code. |
| 1674 | for (auto &I : *BB) { |
| 1675 | if (isa<CallInst>(Val: I) || isa<InvokeInst>(Val: I)) { |
| 1676 | if (cast<CallBase>(Val&: I).isIndirectCall()) { |
| 1677 | I.setMetadata(KindID: LLVMContext::MD_prof, Node: nullptr); |
| 1678 | } else { |
| 1679 | setBranchWeights(I, Weights: {uint32_t(0)}, /*IsExpected=*/false); |
| 1680 | } |
| 1681 | } |
| 1682 | } |
| 1683 | } |
| 1684 | |
| 1685 | Instruction *TI = BB->getTerminator(); |
| 1686 | if (TI->getNumSuccessors() == 1) |
| 1687 | continue; |
| 1688 | if (!isa<BranchInst>(Val: TI) && !isa<SwitchInst>(Val: TI) && |
| 1689 | !isa<IndirectBrInst>(Val: TI)) |
| 1690 | continue; |
| 1691 | |
| 1692 | DebugLoc BranchLoc = TI->getDebugLoc(); |
| 1693 | LLVM_DEBUG(dbgs() << "\nGetting weights for branch at line " |
| 1694 | << ((BranchLoc) ? Twine(BranchLoc.getLine()) |
| 1695 | : Twine("<UNKNOWN LOCATION>" )) |
| 1696 | << ".\n" ); |
| 1697 | SmallVector<uint32_t, 4> Weights; |
| 1698 | uint32_t MaxWeight = 0; |
| 1699 | Instruction *MaxDestInst; |
| 1700 | // Since profi treats multiple edges (multiway branches) as a single edge, |
| 1701 | // we need to distribute the computed weight among the branches. We do |
| 1702 | // this by evenly splitting the edge weight among destinations. |
| 1703 | DenseMap<const BasicBlock *, uint64_t> EdgeMultiplicity; |
| 1704 | std::vector<uint64_t> EdgeIndex; |
| 1705 | if (SampleProfileUseProfi) { |
| 1706 | EdgeIndex.resize(new_size: TI->getNumSuccessors()); |
| 1707 | for (unsigned I = 0; I < TI->getNumSuccessors(); ++I) { |
| 1708 | const BasicBlock *Succ = TI->getSuccessor(Idx: I); |
| 1709 | EdgeIndex[I] = EdgeMultiplicity[Succ]; |
| 1710 | EdgeMultiplicity[Succ]++; |
| 1711 | } |
| 1712 | } |
| 1713 | for (unsigned I = 0; I < TI->getNumSuccessors(); ++I) { |
| 1714 | BasicBlock *Succ = TI->getSuccessor(Idx: I); |
| 1715 | Edge E = std::make_pair(x&: BB, y&: Succ); |
| 1716 | uint64_t Weight = EdgeWeights[E]; |
| 1717 | LLVM_DEBUG(dbgs() << "\t" ; printEdgeWeight(dbgs(), E)); |
| 1718 | // Use uint32_t saturated arithmetic to adjust the incoming weights, |
| 1719 | // if needed. Sample counts in profiles are 64-bit unsigned values, |
| 1720 | // but internally branch weights are expressed as 32-bit values. |
| 1721 | if (Weight > std::numeric_limits<uint32_t>::max()) { |
| 1722 | LLVM_DEBUG(dbgs() << " (saturated due to uint32_t overflow)\n" ); |
| 1723 | Weight = std::numeric_limits<uint32_t>::max(); |
| 1724 | } |
| 1725 | if (!SampleProfileUseProfi) { |
| 1726 | // Weight is added by one to avoid propagation errors introduced by |
| 1727 | // 0 weights. |
| 1728 | Weights.push_back(Elt: static_cast<uint32_t>( |
| 1729 | Weight == std::numeric_limits<uint32_t>::max() ? Weight |
| 1730 | : Weight + 1)); |
| 1731 | } else { |
| 1732 | // Profi creates proper weights that do not require "+1" adjustments but |
| 1733 | // we evenly split the weight among branches with the same destination. |
| 1734 | uint64_t W = Weight / EdgeMultiplicity[Succ]; |
| 1735 | // Rounding up, if needed, so that first branches are hotter. |
| 1736 | if (EdgeIndex[I] < Weight % EdgeMultiplicity[Succ]) |
| 1737 | W++; |
| 1738 | Weights.push_back(Elt: static_cast<uint32_t>(W)); |
| 1739 | } |
| 1740 | if (Weight != 0) { |
| 1741 | if (Weight > MaxWeight) { |
| 1742 | MaxWeight = Weight; |
| 1743 | MaxDestInst = &*Succ->getFirstNonPHIOrDbgOrLifetime(); |
| 1744 | } |
| 1745 | } |
| 1746 | } |
| 1747 | |
| 1748 | misexpect::checkExpectAnnotations(I&: *TI, ExistingWeights: Weights, /*IsFrontend=*/false); |
| 1749 | |
| 1750 | uint64_t TempWeight; |
| 1751 | // Only set weights if there is at least one non-zero weight. |
| 1752 | // In any other case, let the analyzer set weights. |
| 1753 | // Do not set weights if the weights are present unless under |
| 1754 | // OverwriteExistingWeights. In ThinLTO, the profile annotation is done |
| 1755 | // twice. If the first annotation already set the weights, the second pass |
| 1756 | // does not need to set it. With OverwriteExistingWeights, Blocks with zero |
| 1757 | // weight should have their existing metadata (possibly annotated by LTO |
| 1758 | // prelink) cleared. |
| 1759 | if (MaxWeight > 0 && |
| 1760 | (!TI->extractProfTotalWeight(TotalVal&: TempWeight) || OverwriteExistingWeights)) { |
| 1761 | LLVM_DEBUG(dbgs() << "SUCCESS. Found non-zero weights.\n" ); |
| 1762 | setBranchWeights(I&: *TI, Weights, /*IsExpected=*/false); |
| 1763 | ORE->emit(RemarkBuilder: [&]() { |
| 1764 | return OptimizationRemark(DEBUG_TYPE, "PopularDest" , MaxDestInst) |
| 1765 | << "most popular destination for conditional branches at " |
| 1766 | << ore::NV("CondBranchesLoc" , BranchLoc); |
| 1767 | }); |
| 1768 | } else { |
| 1769 | if (OverwriteExistingWeights) { |
| 1770 | TI->setMetadata(KindID: LLVMContext::MD_prof, Node: nullptr); |
| 1771 | LLVM_DEBUG(dbgs() << "CLEARED. All branch weights are zero.\n" ); |
| 1772 | } else { |
| 1773 | LLVM_DEBUG(dbgs() << "SKIPPED. All branch weights are zero.\n" ); |
| 1774 | } |
| 1775 | } |
| 1776 | } |
| 1777 | } |
| 1778 | |
| 1779 | /// Once all the branch weights are computed, we emit the MD_prof |
| 1780 | /// metadata on BB using the computed values for each of its branches. |
| 1781 | /// |
| 1782 | /// \param F The function to query. |
| 1783 | /// |
| 1784 | /// \returns true if \p F was modified. Returns false, otherwise. |
| 1785 | bool SampleProfileLoader::emitAnnotations(Function &F) { |
| 1786 | bool Changed = false; |
| 1787 | |
| 1788 | if (FunctionSamples::ProfileIsProbeBased) { |
| 1789 | LLVM_DEBUG({ |
| 1790 | if (!ProbeManager->getDesc(F)) |
| 1791 | dbgs() << "Probe descriptor missing for Function " << F.getName() |
| 1792 | << "\n" ; |
| 1793 | }); |
| 1794 | |
| 1795 | if (ProbeManager->profileIsValid(F, Samples: *Samples)) { |
| 1796 | ++NumMatchedProfile; |
| 1797 | } else { |
| 1798 | ++NumMismatchedProfile; |
| 1799 | LLVM_DEBUG( |
| 1800 | dbgs() << "Profile is invalid due to CFG mismatch for Function " |
| 1801 | << F.getName() << "\n" ); |
| 1802 | if (!SalvageStaleProfile) |
| 1803 | return false; |
| 1804 | } |
| 1805 | } else { |
| 1806 | if (getFunctionLoc(F) == 0) |
| 1807 | return false; |
| 1808 | |
| 1809 | LLVM_DEBUG(dbgs() << "Line number for the first instruction in " |
| 1810 | << F.getName() << ": " << getFunctionLoc(F) << "\n" ); |
| 1811 | } |
| 1812 | |
| 1813 | DenseSet<GlobalValue::GUID> InlinedGUIDs; |
| 1814 | if (CallsitePrioritizedInline) |
| 1815 | Changed |= inlineHotFunctionsWithPriority(F, InlinedGUIDs); |
| 1816 | else |
| 1817 | Changed |= inlineHotFunctions(F, InlinedGUIDs); |
| 1818 | |
| 1819 | Changed |= computeAndPropagateWeights(F, InlinedGUIDs); |
| 1820 | |
| 1821 | if (Changed) |
| 1822 | generateMDProfMetadata(F); |
| 1823 | |
| 1824 | emitCoverageRemarks(F); |
| 1825 | return Changed; |
| 1826 | } |
| 1827 | |
| 1828 | std::unique_ptr<ProfiledCallGraph> |
| 1829 | SampleProfileLoader::buildProfiledCallGraph(Module &M) { |
| 1830 | std::unique_ptr<ProfiledCallGraph> ProfiledCG; |
| 1831 | if (FunctionSamples::ProfileIsCS) |
| 1832 | ProfiledCG = std::make_unique<ProfiledCallGraph>(args&: *ContextTracker); |
| 1833 | else |
| 1834 | ProfiledCG = std::make_unique<ProfiledCallGraph>(args&: Reader->getProfiles()); |
| 1835 | |
| 1836 | // Add all functions into the profiled call graph even if they are not in |
| 1837 | // the profile. This makes sure functions missing from the profile still |
| 1838 | // gets a chance to be processed. |
| 1839 | for (Function &F : M) { |
| 1840 | if (skipProfileForFunction(F)) |
| 1841 | continue; |
| 1842 | ProfiledCG->addProfiledFunction( |
| 1843 | Name: getRepInFormat(Name: FunctionSamples::getCanonicalFnName(F))); |
| 1844 | } |
| 1845 | |
| 1846 | return ProfiledCG; |
| 1847 | } |
| 1848 | |
| 1849 | std::vector<Function *> |
| 1850 | SampleProfileLoader::buildFunctionOrder(Module &M, LazyCallGraph &CG) { |
| 1851 | std::vector<Function *> FunctionOrderList; |
| 1852 | FunctionOrderList.reserve(n: M.size()); |
| 1853 | |
| 1854 | if (!ProfileTopDownLoad && UseProfiledCallGraph) |
| 1855 | errs() << "WARNING: -use-profiled-call-graph ignored, should be used " |
| 1856 | "together with -sample-profile-top-down-load.\n" ; |
| 1857 | |
| 1858 | if (!ProfileTopDownLoad) { |
| 1859 | if (ProfileMergeInlinee) { |
| 1860 | // Disable ProfileMergeInlinee if profile is not loaded in top down order, |
| 1861 | // because the profile for a function may be used for the profile |
| 1862 | // annotation of its outline copy before the profile merging of its |
| 1863 | // non-inlined inline instances, and that is not the way how |
| 1864 | // ProfileMergeInlinee is supposed to work. |
| 1865 | ProfileMergeInlinee = false; |
| 1866 | } |
| 1867 | |
| 1868 | for (Function &F : M) |
| 1869 | if (!skipProfileForFunction(F)) |
| 1870 | FunctionOrderList.push_back(x: &F); |
| 1871 | return FunctionOrderList; |
| 1872 | } |
| 1873 | |
| 1874 | if (UseProfiledCallGraph || (FunctionSamples::ProfileIsCS && |
| 1875 | !UseProfiledCallGraph.getNumOccurrences())) { |
| 1876 | // Use profiled call edges to augment the top-down order. There are cases |
| 1877 | // that the top-down order computed based on the static call graph doesn't |
| 1878 | // reflect real execution order. For example |
| 1879 | // |
| 1880 | // 1. Incomplete static call graph due to unknown indirect call targets. |
| 1881 | // Adjusting the order by considering indirect call edges from the |
| 1882 | // profile can enable the inlining of indirect call targets by allowing |
| 1883 | // the caller processed before them. |
| 1884 | // 2. Mutual call edges in an SCC. The static processing order computed for |
| 1885 | // an SCC may not reflect the call contexts in the context-sensitive |
| 1886 | // profile, thus may cause potential inlining to be overlooked. The |
| 1887 | // function order in one SCC is being adjusted to a top-down order based |
| 1888 | // on the profile to favor more inlining. This is only a problem with CS |
| 1889 | // profile. |
| 1890 | // 3. Transitive indirect call edges due to inlining. When a callee function |
| 1891 | // (say B) is inlined into a caller function (say A) in LTO prelink, |
| 1892 | // every call edge originated from the callee B will be transferred to |
| 1893 | // the caller A. If any transferred edge (say A->C) is indirect, the |
| 1894 | // original profiled indirect edge B->C, even if considered, would not |
| 1895 | // enforce a top-down order from the caller A to the potential indirect |
| 1896 | // call target C in LTO postlink since the inlined callee B is gone from |
| 1897 | // the static call graph. |
| 1898 | // 4. #3 can happen even for direct call targets, due to functions defined |
| 1899 | // in header files. A header function (say A), when included into source |
| 1900 | // files, is defined multiple times but only one definition survives due |
| 1901 | // to ODR. Therefore, the LTO prelink inlining done on those dropped |
| 1902 | // definitions can be useless based on a local file scope. More |
| 1903 | // importantly, the inlinee (say B), once fully inlined to a |
| 1904 | // to-be-dropped A, will have no profile to consume when its outlined |
| 1905 | // version is compiled. This can lead to a profile-less prelink |
| 1906 | // compilation for the outlined version of B which may be called from |
| 1907 | // external modules. while this isn't easy to fix, we rely on the |
| 1908 | // postlink AutoFDO pipeline to optimize B. Since the survived copy of |
| 1909 | // the A can be inlined in its local scope in prelink, it may not exist |
| 1910 | // in the merged IR in postlink, and we'll need the profiled call edges |
| 1911 | // to enforce a top-down order for the rest of the functions. |
| 1912 | // |
| 1913 | // Considering those cases, a profiled call graph completely independent of |
| 1914 | // the static call graph is constructed based on profile data, where |
| 1915 | // function objects are not even needed to handle case #3 and case 4. |
| 1916 | // |
| 1917 | // Note that static callgraph edges are completely ignored since they |
| 1918 | // can be conflicting with profiled edges for cyclic SCCs and may result in |
| 1919 | // an SCC order incompatible with profile-defined one. Using strictly |
| 1920 | // profile order ensures a maximum inlining experience. On the other hand, |
| 1921 | // static call edges are not so important when they don't correspond to a |
| 1922 | // context in the profile. |
| 1923 | |
| 1924 | std::unique_ptr<ProfiledCallGraph> ProfiledCG = buildProfiledCallGraph(M); |
| 1925 | scc_iterator<ProfiledCallGraph *> CGI = scc_begin(G: ProfiledCG.get()); |
| 1926 | while (!CGI.isAtEnd()) { |
| 1927 | auto Range = *CGI; |
| 1928 | if (SortProfiledSCC) { |
| 1929 | // Sort nodes in one SCC based on callsite hotness. |
| 1930 | scc_member_iterator<ProfiledCallGraph *> SI(*CGI); |
| 1931 | Range = *SI; |
| 1932 | } |
| 1933 | for (auto *Node : Range) { |
| 1934 | Function *F = SymbolMap.lookup(Key: Node->Name); |
| 1935 | if (F && !skipProfileForFunction(F: *F)) |
| 1936 | FunctionOrderList.push_back(x: F); |
| 1937 | } |
| 1938 | ++CGI; |
| 1939 | } |
| 1940 | std::reverse(first: FunctionOrderList.begin(), last: FunctionOrderList.end()); |
| 1941 | } else |
| 1942 | buildTopDownFuncOrder(CG, FunctionOrderList); |
| 1943 | |
| 1944 | LLVM_DEBUG({ |
| 1945 | dbgs() << "Function processing order:\n" ; |
| 1946 | for (auto F : FunctionOrderList) { |
| 1947 | dbgs() << F->getName() << "\n" ; |
| 1948 | } |
| 1949 | }); |
| 1950 | |
| 1951 | return FunctionOrderList; |
| 1952 | } |
| 1953 | |
| 1954 | bool SampleProfileLoader::doInitialization(Module &M, |
| 1955 | FunctionAnalysisManager *FAM) { |
| 1956 | auto &Ctx = M.getContext(); |
| 1957 | |
| 1958 | auto ReaderOrErr = SampleProfileReader::create( |
| 1959 | Filename, C&: Ctx, FS&: *FS, P: FSDiscriminatorPass::Base, RemapFilename: RemappingFilename); |
| 1960 | if (std::error_code EC = ReaderOrErr.getError()) { |
| 1961 | std::string Msg = "Could not open profile: " + EC.message(); |
| 1962 | Ctx.diagnose(DI: DiagnosticInfoSampleProfile(Filename, Msg)); |
| 1963 | return false; |
| 1964 | } |
| 1965 | Reader = std::move(ReaderOrErr.get()); |
| 1966 | Reader->setSkipFlatProf(LTOPhase == ThinOrFullLTOPhase::ThinLTOPostLink); |
| 1967 | // set module before reading the profile so reader may be able to only |
| 1968 | // read the function profiles which are used by the current module. |
| 1969 | Reader->setModule(&M); |
| 1970 | if (std::error_code EC = Reader->read()) { |
| 1971 | std::string Msg = "profile reading failed: " + EC.message(); |
| 1972 | Ctx.diagnose(DI: DiagnosticInfoSampleProfile(Filename, Msg)); |
| 1973 | return false; |
| 1974 | } |
| 1975 | |
| 1976 | PSL = Reader->getProfileSymbolList(); |
| 1977 | |
| 1978 | if (DisableSampleLoaderInlining.getNumOccurrences()) |
| 1979 | DisableSampleProfileInlining = DisableSampleLoaderInlining; |
| 1980 | |
| 1981 | if (UseFlattenedProfile) |
| 1982 | ProfileConverter::flattenProfile(ProfileMap&: Reader->getProfiles(), |
| 1983 | ProfileIsCS: Reader->profileIsCS()); |
| 1984 | |
| 1985 | // While profile-sample-accurate is on, ignore symbol list. |
| 1986 | ProfAccForSymsInList = |
| 1987 | ProfileAccurateForSymsInList && PSL && !ProfileSampleAccurate; |
| 1988 | if (ProfAccForSymsInList) { |
| 1989 | NamesInProfile.clear(); |
| 1990 | GUIDsInProfile.clear(); |
| 1991 | if (auto NameTable = Reader->getNameTable()) { |
| 1992 | if (FunctionSamples::UseMD5) { |
| 1993 | for (auto Name : *NameTable) |
| 1994 | GUIDsInProfile.insert(V: Name.getHashCode()); |
| 1995 | } else { |
| 1996 | for (auto Name : *NameTable) |
| 1997 | NamesInProfile.insert(key: Name.stringRef()); |
| 1998 | } |
| 1999 | } |
| 2000 | CoverageTracker.setProfAccForSymsInList(true); |
| 2001 | } |
| 2002 | |
| 2003 | if (FAM && !ProfileInlineReplayFile.empty()) { |
| 2004 | ExternalInlineAdvisor = getReplayInlineAdvisor( |
| 2005 | M, FAM&: *FAM, Context&: Ctx, /*OriginalAdvisor=*/nullptr, |
| 2006 | ReplaySettings: ReplayInlinerSettings{.ReplayFile: ProfileInlineReplayFile, |
| 2007 | .ReplayScope: ProfileInlineReplayScope, |
| 2008 | .ReplayFallback: ProfileInlineReplayFallback, |
| 2009 | .ReplayFormat: {.OutputFormat: ProfileInlineReplayFormat}}, |
| 2010 | /*EmitRemarks=*/false, IC: InlineContext{.LTOPhase: LTOPhase, .Pass: InlinePass::ReplaySampleProfileInliner}); |
| 2011 | } |
| 2012 | |
| 2013 | // Apply tweaks if context-sensitive or probe-based profile is available. |
| 2014 | if (Reader->profileIsCS() || Reader->profileIsPreInlined() || |
| 2015 | Reader->profileIsProbeBased()) { |
| 2016 | if (!UseIterativeBFIInference.getNumOccurrences()) |
| 2017 | UseIterativeBFIInference = true; |
| 2018 | if (!SampleProfileUseProfi.getNumOccurrences()) |
| 2019 | SampleProfileUseProfi = true; |
| 2020 | if (!EnableExtTspBlockPlacement.getNumOccurrences()) |
| 2021 | EnableExtTspBlockPlacement = true; |
| 2022 | // Enable priority-base inliner and size inline by default for CSSPGO. |
| 2023 | if (!ProfileSizeInline.getNumOccurrences()) |
| 2024 | ProfileSizeInline = true; |
| 2025 | if (!CallsitePrioritizedInline.getNumOccurrences()) |
| 2026 | CallsitePrioritizedInline = true; |
| 2027 | // For CSSPGO, we also allow recursive inline to best use context profile. |
| 2028 | if (!AllowRecursiveInline.getNumOccurrences()) |
| 2029 | AllowRecursiveInline = true; |
| 2030 | |
| 2031 | if (Reader->profileIsPreInlined()) { |
| 2032 | if (!UsePreInlinerDecision.getNumOccurrences()) |
| 2033 | UsePreInlinerDecision = true; |
| 2034 | } |
| 2035 | |
| 2036 | // Enable stale profile matching by default for probe-based profile. |
| 2037 | // Currently the matching relies on if the checksum mismatch is detected, |
| 2038 | // which is currently only available for pseudo-probe mode. Removing the |
| 2039 | // checksum check could cause regressions for some cases, so further tuning |
| 2040 | // might be needed if we want to enable it for all cases. |
| 2041 | if (Reader->profileIsProbeBased()) { |
| 2042 | if (!SalvageStaleProfile.getNumOccurrences()) |
| 2043 | SalvageStaleProfile = true; |
| 2044 | if (!SalvageUnusedProfile.getNumOccurrences()) |
| 2045 | SalvageUnusedProfile = true; |
| 2046 | } |
| 2047 | |
| 2048 | if (!Reader->profileIsCS()) { |
| 2049 | // Non-CS profile should be fine without a function size budget for the |
| 2050 | // inliner since the contexts in the profile are either all from inlining |
| 2051 | // in the prevoius build or pre-computed by the preinliner with a size |
| 2052 | // cap, thus they are bounded. |
| 2053 | if (!ProfileInlineLimitMin.getNumOccurrences()) |
| 2054 | ProfileInlineLimitMin = std::numeric_limits<unsigned>::max(); |
| 2055 | if (!ProfileInlineLimitMax.getNumOccurrences()) |
| 2056 | ProfileInlineLimitMax = std::numeric_limits<unsigned>::max(); |
| 2057 | } |
| 2058 | } |
| 2059 | |
| 2060 | if (Reader->profileIsCS()) { |
| 2061 | // Tracker for profiles under different context |
| 2062 | ContextTracker = std::make_unique<SampleContextTracker>( |
| 2063 | args&: Reader->getProfiles(), args: &GUIDToFuncNameMap); |
| 2064 | } |
| 2065 | |
| 2066 | // Load pseudo probe descriptors for probe-based function samples. |
| 2067 | if (Reader->profileIsProbeBased()) { |
| 2068 | ProbeManager = std::make_unique<PseudoProbeManager>(args&: M); |
| 2069 | if (!ProbeManager->moduleIsProbed(M)) { |
| 2070 | const char *Msg = |
| 2071 | "Pseudo-probe-based profile requires SampleProfileProbePass" ; |
| 2072 | Ctx.diagnose(DI: DiagnosticInfoSampleProfile(M.getModuleIdentifier(), Msg, |
| 2073 | DS_Warning)); |
| 2074 | return false; |
| 2075 | } |
| 2076 | } |
| 2077 | |
| 2078 | if (ReportProfileStaleness || PersistProfileStaleness || |
| 2079 | SalvageStaleProfile) { |
| 2080 | MatchingManager = std::make_unique<SampleProfileMatcher>( |
| 2081 | args&: M, args&: *Reader, args&: CG, args: ProbeManager.get(), args: LTOPhase, args&: SymbolMap, args&: PSL, |
| 2082 | args&: FuncNameToProfNameMap); |
| 2083 | } |
| 2084 | |
| 2085 | return true; |
| 2086 | } |
| 2087 | |
| 2088 | // Note that this is a module-level check. Even if one module is errored out, |
| 2089 | // the entire build will be errored out. However, the user could make big |
| 2090 | // changes to functions in single module but those changes might not be |
| 2091 | // performance significant to the whole binary. Therefore, to avoid those false |
| 2092 | // positives, we select a reasonable big set of hot functions that are supposed |
| 2093 | // to be globally performance significant, only compute and check the mismatch |
| 2094 | // within those functions. The function selection is based on two criteria: |
| 2095 | // 1) The function is hot enough, which is tuned by a hotness-based |
| 2096 | // flag(HotFuncCutoffForStalenessError). 2) The num of function is large enough |
| 2097 | // which is tuned by the MinfuncsForStalenessError flag. |
| 2098 | bool SampleProfileLoader::rejectHighStalenessProfile( |
| 2099 | Module &M, ProfileSummaryInfo *PSI, const SampleProfileMap &Profiles) { |
| 2100 | assert(FunctionSamples::ProfileIsProbeBased && |
| 2101 | "Only support for probe-based profile" ); |
| 2102 | uint64_t TotalHotFunc = 0; |
| 2103 | uint64_t NumMismatchedFunc = 0; |
| 2104 | for (const auto &I : Profiles) { |
| 2105 | const auto &FS = I.second; |
| 2106 | const auto *FuncDesc = ProbeManager->getDesc(GUID: FS.getGUID()); |
| 2107 | if (!FuncDesc) |
| 2108 | continue; |
| 2109 | |
| 2110 | // Use a hotness-based threshold to control the function selection. |
| 2111 | if (!PSI->isHotCountNthPercentile(PercentileCutoff: HotFuncCutoffForStalenessError, |
| 2112 | C: FS.getTotalSamples())) |
| 2113 | continue; |
| 2114 | |
| 2115 | TotalHotFunc++; |
| 2116 | if (ProbeManager->profileIsHashMismatched(FuncDesc: *FuncDesc, Samples: FS)) |
| 2117 | NumMismatchedFunc++; |
| 2118 | } |
| 2119 | // Make sure that the num of selected function is not too small to distinguish |
| 2120 | // from the user's benign changes. |
| 2121 | if (TotalHotFunc < MinfuncsForStalenessError) |
| 2122 | return false; |
| 2123 | |
| 2124 | // Finally check the mismatch percentage against the threshold. |
| 2125 | if (NumMismatchedFunc * 100 >= |
| 2126 | TotalHotFunc * PrecentMismatchForStalenessError) { |
| 2127 | auto &Ctx = M.getContext(); |
| 2128 | const char *Msg = |
| 2129 | "The input profile significantly mismatches current source code. " |
| 2130 | "Please recollect profile to avoid performance regression." ; |
| 2131 | Ctx.diagnose(DI: DiagnosticInfoSampleProfile(M.getModuleIdentifier(), Msg)); |
| 2132 | return true; |
| 2133 | } |
| 2134 | return false; |
| 2135 | } |
| 2136 | |
| 2137 | void SampleProfileLoader::removePseudoProbeInstsDiscriminator(Module &M) { |
| 2138 | for (auto &F : M) { |
| 2139 | std::vector<Instruction *> InstsToDel; |
| 2140 | for (auto &BB : F) { |
| 2141 | for (auto &I : BB) { |
| 2142 | if (isa<PseudoProbeInst>(Val: &I)) |
| 2143 | InstsToDel.push_back(x: &I); |
| 2144 | else if (isa<CallBase>(Val: &I)) |
| 2145 | if (const DILocation *DIL = I.getDebugLoc().get()) { |
| 2146 | // Restore dwarf discriminator for call. |
| 2147 | unsigned Discriminator = DIL->getDiscriminator(); |
| 2148 | if (DILocation::isPseudoProbeDiscriminator(Discriminator)) { |
| 2149 | std::optional<uint32_t> DwarfDiscriminator = |
| 2150 | PseudoProbeDwarfDiscriminator::extractDwarfBaseDiscriminator( |
| 2151 | Value: Discriminator); |
| 2152 | I.setDebugLoc( |
| 2153 | DIL->cloneWithDiscriminator(Discriminator: DwarfDiscriminator.value_or(u: 0))); |
| 2154 | } |
| 2155 | } |
| 2156 | } |
| 2157 | } |
| 2158 | for (auto *I : InstsToDel) |
| 2159 | I->eraseFromParent(); |
| 2160 | } |
| 2161 | } |
| 2162 | |
| 2163 | bool SampleProfileLoader::runOnModule(Module &M, ModuleAnalysisManager *AM, |
| 2164 | ProfileSummaryInfo *_PSI) { |
| 2165 | GUIDToFuncNameMapper Mapper(M, *Reader, GUIDToFuncNameMap); |
| 2166 | |
| 2167 | PSI = _PSI; |
| 2168 | if (M.getProfileSummary(/* IsCS */ false) == nullptr) { |
| 2169 | M.setProfileSummary(M: Reader->getSummary().getMD(Context&: M.getContext()), |
| 2170 | Kind: ProfileSummary::PSK_Sample); |
| 2171 | PSI->refresh(); |
| 2172 | } |
| 2173 | |
| 2174 | if (FunctionSamples::ProfileIsProbeBased && |
| 2175 | rejectHighStalenessProfile(M, PSI, Profiles: Reader->getProfiles())) |
| 2176 | return false; |
| 2177 | |
| 2178 | auto Remapper = Reader->getRemapper(); |
| 2179 | // Populate the symbol map. |
| 2180 | for (const auto &N_F : M.getValueSymbolTable()) { |
| 2181 | StringRef OrigName = N_F.getKey(); |
| 2182 | Function *F = dyn_cast<Function>(Val: N_F.getValue()); |
| 2183 | if (F == nullptr || OrigName.empty()) |
| 2184 | continue; |
| 2185 | SymbolMap[FunctionId(OrigName)] = F; |
| 2186 | StringRef NewName = FunctionSamples::getCanonicalFnName(F: *F); |
| 2187 | if (OrigName != NewName && !NewName.empty()) { |
| 2188 | auto r = SymbolMap.emplace(Args: FunctionId(NewName), Args&: F); |
| 2189 | // Failiing to insert means there is already an entry in SymbolMap, |
| 2190 | // thus there are multiple functions that are mapped to the same |
| 2191 | // stripped name. In this case of name conflicting, set the value |
| 2192 | // to nullptr to avoid confusion. |
| 2193 | if (!r.second) |
| 2194 | r.first->second = nullptr; |
| 2195 | OrigName = NewName; |
| 2196 | } |
| 2197 | // Insert the remapped names into SymbolMap. |
| 2198 | if (Remapper) { |
| 2199 | if (auto MapName = Remapper->lookUpNameInProfile(FunctionName: OrigName)) { |
| 2200 | if (*MapName != OrigName && !MapName->empty()) |
| 2201 | SymbolMap.emplace(Args: FunctionId(*MapName), Args&: F); |
| 2202 | } |
| 2203 | } |
| 2204 | } |
| 2205 | |
| 2206 | // Stale profile matching. |
| 2207 | if (ReportProfileStaleness || PersistProfileStaleness || |
| 2208 | SalvageStaleProfile) { |
| 2209 | MatchingManager->runOnModule(); |
| 2210 | MatchingManager->clearMatchingData(); |
| 2211 | } |
| 2212 | assert(SymbolMap.count(FunctionId()) == 0 && |
| 2213 | "No empty StringRef should be added in SymbolMap" ); |
| 2214 | assert((SalvageUnusedProfile || FuncNameToProfNameMap.empty()) && |
| 2215 | "FuncNameToProfNameMap is not empty when --salvage-unused-profile is " |
| 2216 | "not enabled" ); |
| 2217 | |
| 2218 | bool retval = false; |
| 2219 | for (auto *F : buildFunctionOrder(M, CG)) { |
| 2220 | assert(!F->isDeclaration()); |
| 2221 | clearFunctionData(); |
| 2222 | retval |= runOnFunction(F&: *F, AM); |
| 2223 | } |
| 2224 | |
| 2225 | // Account for cold calls not inlined.... |
| 2226 | if (!FunctionSamples::ProfileIsCS) |
| 2227 | for (const std::pair<Function *, NotInlinedProfileInfo> &pair : |
| 2228 | notInlinedCallInfo) |
| 2229 | updateProfileCallee(Callee: pair.first, EntryDelta: pair.second.entryCount); |
| 2230 | |
| 2231 | if (RemoveProbeAfterProfileAnnotation && |
| 2232 | FunctionSamples::ProfileIsProbeBased) { |
| 2233 | removePseudoProbeInstsDiscriminator(M); |
| 2234 | if (auto *FuncInfo = M.getNamedMetadata(Name: PseudoProbeDescMetadataName)) |
| 2235 | M.eraseNamedMetadata(NMD: FuncInfo); |
| 2236 | } |
| 2237 | |
| 2238 | return retval; |
| 2239 | } |
| 2240 | |
| 2241 | bool SampleProfileLoader::runOnFunction(Function &F, ModuleAnalysisManager *AM) { |
| 2242 | LLVM_DEBUG(dbgs() << "\n\nProcessing Function " << F.getName() << "\n" ); |
| 2243 | DILocation2SampleMap.clear(); |
| 2244 | // By default the entry count is initialized to -1, which will be treated |
| 2245 | // conservatively by getEntryCount as the same as unknown (None). This is |
| 2246 | // to avoid newly added code to be treated as cold. If we have samples |
| 2247 | // this will be overwritten in emitAnnotations. |
| 2248 | uint64_t initialEntryCount = -1; |
| 2249 | |
| 2250 | ProfAccForSymsInList = ProfileAccurateForSymsInList && PSL; |
| 2251 | if (ProfileSampleAccurate || F.hasFnAttribute(Kind: "profile-sample-accurate" )) { |
| 2252 | // initialize all the function entry counts to 0. It means all the |
| 2253 | // functions without profile will be regarded as cold. |
| 2254 | initialEntryCount = 0; |
| 2255 | // profile-sample-accurate is a user assertion which has a higher precedence |
| 2256 | // than symbol list. When profile-sample-accurate is on, ignore symbol list. |
| 2257 | ProfAccForSymsInList = false; |
| 2258 | } |
| 2259 | CoverageTracker.setProfAccForSymsInList(ProfAccForSymsInList); |
| 2260 | |
| 2261 | // PSL -- profile symbol list include all the symbols in sampled binary. |
| 2262 | // If ProfileAccurateForSymsInList is enabled, PSL is used to treat |
| 2263 | // old functions without samples being cold, without having to worry |
| 2264 | // about new and hot functions being mistakenly treated as cold. |
| 2265 | if (ProfAccForSymsInList) { |
| 2266 | // Initialize the entry count to 0 for functions in the list. |
| 2267 | if (PSL->contains(Name: F.getName())) |
| 2268 | initialEntryCount = 0; |
| 2269 | |
| 2270 | // Function in the symbol list but without sample will be regarded as |
| 2271 | // cold. To minimize the potential negative performance impact it could |
| 2272 | // have, we want to be a little conservative here saying if a function |
| 2273 | // shows up in the profile, no matter as outline function, inline instance |
| 2274 | // or call targets, treat the function as not being cold. This will handle |
| 2275 | // the cases such as most callsites of a function are inlined in sampled |
| 2276 | // binary but not inlined in current build (because of source code drift, |
| 2277 | // imprecise debug information, or the callsites are all cold individually |
| 2278 | // but not cold accumulatively...), so the outline function showing up as |
| 2279 | // cold in sampled binary will actually not be cold after current build. |
| 2280 | StringRef CanonName = FunctionSamples::getCanonicalFnName(F); |
| 2281 | if ((FunctionSamples::UseMD5 && |
| 2282 | GUIDsInProfile.count( |
| 2283 | V: Function::getGUIDAssumingExternalLinkage(GlobalName: CanonName))) || |
| 2284 | (!FunctionSamples::UseMD5 && NamesInProfile.count(Key: CanonName))) |
| 2285 | initialEntryCount = -1; |
| 2286 | } |
| 2287 | |
| 2288 | // Initialize entry count when the function has no existing entry |
| 2289 | // count value. |
| 2290 | if (!F.getEntryCount()) |
| 2291 | F.setEntryCount(Count: ProfileCount(initialEntryCount, Function::PCT_Real)); |
| 2292 | std::unique_ptr<OptimizationRemarkEmitter> OwnedORE; |
| 2293 | if (AM) { |
| 2294 | auto &FAM = |
| 2295 | AM->getResult<FunctionAnalysisManagerModuleProxy>(IR&: *F.getParent()) |
| 2296 | .getManager(); |
| 2297 | ORE = &FAM.getResult<OptimizationRemarkEmitterAnalysis>(IR&: F); |
| 2298 | } else { |
| 2299 | OwnedORE = std::make_unique<OptimizationRemarkEmitter>(args: &F); |
| 2300 | ORE = OwnedORE.get(); |
| 2301 | } |
| 2302 | |
| 2303 | if (FunctionSamples::ProfileIsCS) |
| 2304 | Samples = ContextTracker->getBaseSamplesFor(Func: F); |
| 2305 | else { |
| 2306 | Samples = Reader->getSamplesFor(F); |
| 2307 | // Try search in previously inlined functions that were split or duplicated |
| 2308 | // into base. |
| 2309 | if (!Samples) { |
| 2310 | StringRef CanonName = FunctionSamples::getCanonicalFnName(F); |
| 2311 | auto It = OutlineFunctionSamples.find(x: FunctionId(CanonName)); |
| 2312 | if (It != OutlineFunctionSamples.end()) { |
| 2313 | Samples = &It->second; |
| 2314 | } else if (auto Remapper = Reader->getRemapper()) { |
| 2315 | if (auto RemppedName = Remapper->lookUpNameInProfile(FunctionName: CanonName)) { |
| 2316 | It = OutlineFunctionSamples.find(x: FunctionId(*RemppedName)); |
| 2317 | if (It != OutlineFunctionSamples.end()) |
| 2318 | Samples = &It->second; |
| 2319 | } |
| 2320 | } |
| 2321 | } |
| 2322 | } |
| 2323 | |
| 2324 | if (Samples && !Samples->empty()) |
| 2325 | return emitAnnotations(F); |
| 2326 | return false; |
| 2327 | } |
| 2328 | SampleProfileLoaderPass::SampleProfileLoaderPass( |
| 2329 | std::string File, std::string RemappingFile, ThinOrFullLTOPhase LTOPhase, |
| 2330 | IntrusiveRefCntPtr<vfs::FileSystem> FS, bool DisableSampleProfileInlining, |
| 2331 | bool UseFlattenedProfile) |
| 2332 | : ProfileFileName(File), ProfileRemappingFileName(RemappingFile), |
| 2333 | LTOPhase(LTOPhase), FS(std::move(FS)), |
| 2334 | DisableSampleProfileInlining(DisableSampleProfileInlining), |
| 2335 | UseFlattenedProfile(UseFlattenedProfile) {} |
| 2336 | |
| 2337 | PreservedAnalyses SampleProfileLoaderPass::run(Module &M, |
| 2338 | ModuleAnalysisManager &AM) { |
| 2339 | FunctionAnalysisManager &FAM = |
| 2340 | AM.getResult<FunctionAnalysisManagerModuleProxy>(IR&: M).getManager(); |
| 2341 | |
| 2342 | auto GetAssumptionCache = [&](Function &F) -> AssumptionCache & { |
| 2343 | return FAM.getResult<AssumptionAnalysis>(IR&: F); |
| 2344 | }; |
| 2345 | auto GetTTI = [&](Function &F) -> TargetTransformInfo & { |
| 2346 | return FAM.getResult<TargetIRAnalysis>(IR&: F); |
| 2347 | }; |
| 2348 | auto GetTLI = [&](Function &F) -> const TargetLibraryInfo & { |
| 2349 | return FAM.getResult<TargetLibraryAnalysis>(IR&: F); |
| 2350 | }; |
| 2351 | |
| 2352 | if (!FS) |
| 2353 | FS = vfs::getRealFileSystem(); |
| 2354 | LazyCallGraph &CG = AM.getResult<LazyCallGraphAnalysis>(IR&: M); |
| 2355 | |
| 2356 | SampleProfileLoader SampleLoader( |
| 2357 | ProfileFileName.empty() ? SampleProfileFile : ProfileFileName, |
| 2358 | ProfileRemappingFileName.empty() ? SampleProfileRemappingFile |
| 2359 | : ProfileRemappingFileName, |
| 2360 | LTOPhase, FS, GetAssumptionCache, GetTTI, GetTLI, CG, |
| 2361 | DisableSampleProfileInlining, UseFlattenedProfile); |
| 2362 | if (!SampleLoader.doInitialization(M, FAM: &FAM)) |
| 2363 | return PreservedAnalyses::all(); |
| 2364 | |
| 2365 | ProfileSummaryInfo *PSI = &AM.getResult<ProfileSummaryAnalysis>(IR&: M); |
| 2366 | if (!SampleLoader.runOnModule(M, AM: &AM, PSI: PSI)) |
| 2367 | return PreservedAnalyses::all(); |
| 2368 | |
| 2369 | return PreservedAnalyses::none(); |
| 2370 | } |
| 2371 | |