1 | //===- SampleProfile.cpp - Incorporate sample profiles into the IR --------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This file implements the SampleProfileLoader transformation. This pass |
10 | // reads a profile file generated by a sampling profiler (e.g. Linux Perf - |
11 | // http://perf.wiki.kernel.org/) and generates IR metadata to reflect the |
12 | // profile information in the given profile. |
13 | // |
14 | // This pass generates branch weight annotations on the IR: |
15 | // |
16 | // - prof: Represents branch weights. This annotation is added to branches |
17 | // to indicate the weights of each edge coming out of the branch. |
18 | // The weight of each edge is the weight of the target block for |
19 | // that edge. The weight of a block B is computed as the maximum |
20 | // number of samples found in B. |
21 | // |
22 | //===----------------------------------------------------------------------===// |
23 | |
24 | #include "llvm/Transforms/IPO/SampleProfile.h" |
25 | #include "llvm/ADT/ArrayRef.h" |
26 | #include "llvm/ADT/DenseMap.h" |
27 | #include "llvm/ADT/DenseSet.h" |
28 | #include "llvm/ADT/MapVector.h" |
29 | #include "llvm/ADT/PriorityQueue.h" |
30 | #include "llvm/ADT/SCCIterator.h" |
31 | #include "llvm/ADT/SmallVector.h" |
32 | #include "llvm/ADT/Statistic.h" |
33 | #include "llvm/ADT/StringMap.h" |
34 | #include "llvm/ADT/StringRef.h" |
35 | #include "llvm/ADT/Twine.h" |
36 | #include "llvm/Analysis/AssumptionCache.h" |
37 | #include "llvm/Analysis/BlockFrequencyInfoImpl.h" |
38 | #include "llvm/Analysis/InlineAdvisor.h" |
39 | #include "llvm/Analysis/InlineCost.h" |
40 | #include "llvm/Analysis/LazyCallGraph.h" |
41 | #include "llvm/Analysis/OptimizationRemarkEmitter.h" |
42 | #include "llvm/Analysis/ProfileSummaryInfo.h" |
43 | #include "llvm/Analysis/ReplayInlineAdvisor.h" |
44 | #include "llvm/Analysis/TargetLibraryInfo.h" |
45 | #include "llvm/Analysis/TargetTransformInfo.h" |
46 | #include "llvm/IR/BasicBlock.h" |
47 | #include "llvm/IR/DebugLoc.h" |
48 | #include "llvm/IR/DiagnosticInfo.h" |
49 | #include "llvm/IR/Function.h" |
50 | #include "llvm/IR/GlobalValue.h" |
51 | #include "llvm/IR/InstrTypes.h" |
52 | #include "llvm/IR/Instruction.h" |
53 | #include "llvm/IR/Instructions.h" |
54 | #include "llvm/IR/IntrinsicInst.h" |
55 | #include "llvm/IR/LLVMContext.h" |
56 | #include "llvm/IR/MDBuilder.h" |
57 | #include "llvm/IR/Module.h" |
58 | #include "llvm/IR/PassManager.h" |
59 | #include "llvm/IR/ProfDataUtils.h" |
60 | #include "llvm/IR/PseudoProbe.h" |
61 | #include "llvm/IR/ValueSymbolTable.h" |
62 | #include "llvm/ProfileData/InstrProf.h" |
63 | #include "llvm/ProfileData/SampleProf.h" |
64 | #include "llvm/ProfileData/SampleProfReader.h" |
65 | #include "llvm/Support/Casting.h" |
66 | #include "llvm/Support/CommandLine.h" |
67 | #include "llvm/Support/Debug.h" |
68 | #include "llvm/Support/ErrorOr.h" |
69 | #include "llvm/Support/VirtualFileSystem.h" |
70 | #include "llvm/Support/raw_ostream.h" |
71 | #include "llvm/Transforms/IPO.h" |
72 | #include "llvm/Transforms/IPO/ProfiledCallGraph.h" |
73 | #include "llvm/Transforms/IPO/SampleContextTracker.h" |
74 | #include "llvm/Transforms/IPO/SampleProfileMatcher.h" |
75 | #include "llvm/Transforms/IPO/SampleProfileProbe.h" |
76 | #include "llvm/Transforms/Instrumentation.h" |
77 | #include "llvm/Transforms/Utils/CallPromotionUtils.h" |
78 | #include "llvm/Transforms/Utils/Cloning.h" |
79 | #include "llvm/Transforms/Utils/MisExpect.h" |
80 | #include "llvm/Transforms/Utils/SampleProfileLoaderBaseImpl.h" |
81 | #include "llvm/Transforms/Utils/SampleProfileLoaderBaseUtil.h" |
82 | #include <algorithm> |
83 | #include <cassert> |
84 | #include <cstdint> |
85 | #include <functional> |
86 | #include <limits> |
87 | #include <map> |
88 | #include <memory> |
89 | #include <queue> |
90 | #include <string> |
91 | #include <system_error> |
92 | #include <utility> |
93 | #include <vector> |
94 | |
95 | using namespace llvm; |
96 | using namespace sampleprof; |
97 | using namespace llvm::sampleprofutil; |
98 | using ProfileCount = Function::ProfileCount; |
99 | #define DEBUG_TYPE "sample-profile" |
100 | #define CSINLINE_DEBUG DEBUG_TYPE "-inline" |
101 | |
102 | STATISTIC(NumCSInlined, |
103 | "Number of functions inlined with context sensitive profile" ); |
104 | STATISTIC(NumCSNotInlined, |
105 | "Number of functions not inlined with context sensitive profile" ); |
106 | STATISTIC(NumMismatchedProfile, |
107 | "Number of functions with CFG mismatched profile" ); |
108 | STATISTIC(NumMatchedProfile, "Number of functions with CFG matched profile" ); |
109 | STATISTIC(NumDuplicatedInlinesite, |
110 | "Number of inlined callsites with a partial distribution factor" ); |
111 | |
112 | STATISTIC(NumCSInlinedHitMinLimit, |
113 | "Number of functions with FDO inline stopped due to min size limit" ); |
114 | STATISTIC(NumCSInlinedHitMaxLimit, |
115 | "Number of functions with FDO inline stopped due to max size limit" ); |
116 | STATISTIC( |
117 | NumCSInlinedHitGrowthLimit, |
118 | "Number of functions with FDO inline stopped due to growth size limit" ); |
119 | |
120 | // Command line option to specify the file to read samples from. This is |
121 | // mainly used for debugging. |
122 | static cl::opt<std::string> SampleProfileFile( |
123 | "sample-profile-file" , cl::init(Val: "" ), cl::value_desc("filename" ), |
124 | cl::desc("Profile file loaded by -sample-profile" ), cl::Hidden); |
125 | |
126 | // The named file contains a set of transformations that may have been applied |
127 | // to the symbol names between the program from which the sample data was |
128 | // collected and the current program's symbols. |
129 | static cl::opt<std::string> SampleProfileRemappingFile( |
130 | "sample-profile-remapping-file" , cl::init(Val: "" ), cl::value_desc("filename" ), |
131 | cl::desc("Profile remapping file loaded by -sample-profile" ), cl::Hidden); |
132 | |
133 | cl::opt<bool> SalvageStaleProfile( |
134 | "salvage-stale-profile" , cl::Hidden, cl::init(Val: false), |
135 | cl::desc("Salvage stale profile by fuzzy matching and use the remapped " |
136 | "location for sample profile query." )); |
137 | cl::opt<bool> |
138 | SalvageUnusedProfile("salvage-unused-profile" , cl::Hidden, cl::init(Val: false), |
139 | cl::desc("Salvage unused profile by matching with new " |
140 | "functions on call graph." )); |
141 | |
142 | cl::opt<bool> ReportProfileStaleness( |
143 | "report-profile-staleness" , cl::Hidden, cl::init(Val: false), |
144 | cl::desc("Compute and report stale profile statistical metrics." )); |
145 | |
146 | cl::opt<bool> PersistProfileStaleness( |
147 | "persist-profile-staleness" , cl::Hidden, cl::init(Val: false), |
148 | cl::desc("Compute stale profile statistical metrics and write it into the " |
149 | "native object file(.llvm_stats section)." )); |
150 | |
151 | static cl::opt<bool> ProfileSampleAccurate( |
152 | "profile-sample-accurate" , cl::Hidden, cl::init(Val: false), |
153 | cl::desc("If the sample profile is accurate, we will mark all un-sampled " |
154 | "callsite and function as having 0 samples. Otherwise, treat " |
155 | "un-sampled callsites and functions conservatively as unknown. " )); |
156 | |
157 | static cl::opt<bool> ProfileSampleBlockAccurate( |
158 | "profile-sample-block-accurate" , cl::Hidden, cl::init(Val: false), |
159 | cl::desc("If the sample profile is accurate, we will mark all un-sampled " |
160 | "branches and calls as having 0 samples. Otherwise, treat " |
161 | "them conservatively as unknown. " )); |
162 | |
163 | static cl::opt<bool> ProfileAccurateForSymsInList( |
164 | "profile-accurate-for-symsinlist" , cl::Hidden, cl::init(Val: true), |
165 | cl::desc("For symbols in profile symbol list, regard their profiles to " |
166 | "be accurate. It may be overriden by profile-sample-accurate. " )); |
167 | |
168 | static cl::opt<bool> ProfileMergeInlinee( |
169 | "sample-profile-merge-inlinee" , cl::Hidden, cl::init(Val: true), |
170 | cl::desc("Merge past inlinee's profile to outline version if sample " |
171 | "profile loader decided not to inline a call site. It will " |
172 | "only be enabled when top-down order of profile loading is " |
173 | "enabled. " )); |
174 | |
175 | static cl::opt<bool> ProfileTopDownLoad( |
176 | "sample-profile-top-down-load" , cl::Hidden, cl::init(Val: true), |
177 | cl::desc("Do profile annotation and inlining for functions in top-down " |
178 | "order of call graph during sample profile loading. It only " |
179 | "works for new pass manager. " )); |
180 | |
181 | static cl::opt<bool> |
182 | UseProfiledCallGraph("use-profiled-call-graph" , cl::init(Val: true), cl::Hidden, |
183 | cl::desc("Process functions in a top-down order " |
184 | "defined by the profiled call graph when " |
185 | "-sample-profile-top-down-load is on." )); |
186 | |
187 | static cl::opt<bool> ProfileSizeInline( |
188 | "sample-profile-inline-size" , cl::Hidden, cl::init(Val: false), |
189 | cl::desc("Inline cold call sites in profile loader if it's beneficial " |
190 | "for code size." )); |
191 | |
192 | // Since profiles are consumed by many passes, turning on this option has |
193 | // side effects. For instance, pre-link SCC inliner would see merged profiles |
194 | // and inline the hot functions (that are skipped in this pass). |
195 | static cl::opt<bool> DisableSampleLoaderInlining( |
196 | "disable-sample-loader-inlining" , cl::Hidden, cl::init(Val: false), |
197 | cl::desc("If true, artifically skip inline transformation in sample-loader " |
198 | "pass, and merge (or scale) profiles (as configured by " |
199 | "--sample-profile-merge-inlinee)." )); |
200 | |
201 | namespace llvm { |
202 | cl::opt<bool> |
203 | SortProfiledSCC("sort-profiled-scc-member" , cl::init(Val: true), cl::Hidden, |
204 | cl::desc("Sort profiled recursion by edge weights." )); |
205 | |
206 | cl::opt<int> ProfileInlineGrowthLimit( |
207 | "sample-profile-inline-growth-limit" , cl::Hidden, cl::init(Val: 12), |
208 | cl::desc("The size growth ratio limit for proirity-based sample profile " |
209 | "loader inlining." )); |
210 | |
211 | cl::opt<int> ProfileInlineLimitMin( |
212 | "sample-profile-inline-limit-min" , cl::Hidden, cl::init(Val: 100), |
213 | cl::desc("The lower bound of size growth limit for " |
214 | "proirity-based sample profile loader inlining." )); |
215 | |
216 | cl::opt<int> ProfileInlineLimitMax( |
217 | "sample-profile-inline-limit-max" , cl::Hidden, cl::init(Val: 10000), |
218 | cl::desc("The upper bound of size growth limit for " |
219 | "proirity-based sample profile loader inlining." )); |
220 | |
221 | cl::opt<int> SampleHotCallSiteThreshold( |
222 | "sample-profile-hot-inline-threshold" , cl::Hidden, cl::init(Val: 3000), |
223 | cl::desc("Hot callsite threshold for proirity-based sample profile loader " |
224 | "inlining." )); |
225 | |
226 | cl::opt<int> SampleColdCallSiteThreshold( |
227 | "sample-profile-cold-inline-threshold" , cl::Hidden, cl::init(Val: 45), |
228 | cl::desc("Threshold for inlining cold callsites" )); |
229 | } // namespace llvm |
230 | |
231 | static cl::opt<unsigned> ProfileICPRelativeHotness( |
232 | "sample-profile-icp-relative-hotness" , cl::Hidden, cl::init(Val: 25), |
233 | cl::desc( |
234 | "Relative hotness percentage threshold for indirect " |
235 | "call promotion in proirity-based sample profile loader inlining." )); |
236 | |
237 | static cl::opt<unsigned> ProfileICPRelativeHotnessSkip( |
238 | "sample-profile-icp-relative-hotness-skip" , cl::Hidden, cl::init(Val: 1), |
239 | cl::desc( |
240 | "Skip relative hotness check for ICP up to given number of targets." )); |
241 | |
242 | static cl::opt<unsigned> HotFuncCutoffForStalenessError( |
243 | "hot-func-cutoff-for-staleness-error" , cl::Hidden, cl::init(Val: 800000), |
244 | cl::desc("A function is considered hot for staleness error check if its " |
245 | "total sample count is above the specified percentile" )); |
246 | |
247 | static cl::opt<unsigned> MinfuncsForStalenessError( |
248 | "min-functions-for-staleness-error" , cl::Hidden, cl::init(Val: 50), |
249 | cl::desc("Skip the check if the number of hot functions is smaller than " |
250 | "the specified number." )); |
251 | |
252 | static cl::opt<unsigned> PrecentMismatchForStalenessError( |
253 | "precent-mismatch-for-staleness-error" , cl::Hidden, cl::init(Val: 80), |
254 | cl::desc("Reject the profile if the mismatch percent is higher than the " |
255 | "given number." )); |
256 | |
257 | static cl::opt<bool> CallsitePrioritizedInline( |
258 | "sample-profile-prioritized-inline" , cl::Hidden, |
259 | cl::desc("Use call site prioritized inlining for sample profile loader." |
260 | "Currently only CSSPGO is supported." )); |
261 | |
262 | static cl::opt<bool> UsePreInlinerDecision( |
263 | "sample-profile-use-preinliner" , cl::Hidden, |
264 | cl::desc("Use the preinliner decisions stored in profile context." )); |
265 | |
266 | static cl::opt<bool> AllowRecursiveInline( |
267 | "sample-profile-recursive-inline" , cl::Hidden, |
268 | cl::desc("Allow sample loader inliner to inline recursive calls." )); |
269 | |
270 | static cl::opt<bool> RemoveProbeAfterProfileAnnotation( |
271 | "sample-profile-remove-probe" , cl::Hidden, cl::init(Val: false), |
272 | cl::desc("Remove pseudo-probe after sample profile annotation." )); |
273 | |
274 | static cl::opt<std::string> ProfileInlineReplayFile( |
275 | "sample-profile-inline-replay" , cl::init(Val: "" ), cl::value_desc("filename" ), |
276 | cl::desc( |
277 | "Optimization remarks file containing inline remarks to be replayed " |
278 | "by inlining from sample profile loader." ), |
279 | cl::Hidden); |
280 | |
281 | static cl::opt<ReplayInlinerSettings::Scope> ProfileInlineReplayScope( |
282 | "sample-profile-inline-replay-scope" , |
283 | cl::init(Val: ReplayInlinerSettings::Scope::Function), |
284 | cl::values(clEnumValN(ReplayInlinerSettings::Scope::Function, "Function" , |
285 | "Replay on functions that have remarks associated " |
286 | "with them (default)" ), |
287 | clEnumValN(ReplayInlinerSettings::Scope::Module, "Module" , |
288 | "Replay on the entire module" )), |
289 | cl::desc("Whether inline replay should be applied to the entire " |
290 | "Module or just the Functions (default) that are present as " |
291 | "callers in remarks during sample profile inlining." ), |
292 | cl::Hidden); |
293 | |
294 | static cl::opt<ReplayInlinerSettings::Fallback> ProfileInlineReplayFallback( |
295 | "sample-profile-inline-replay-fallback" , |
296 | cl::init(Val: ReplayInlinerSettings::Fallback::Original), |
297 | cl::values( |
298 | clEnumValN( |
299 | ReplayInlinerSettings::Fallback::Original, "Original" , |
300 | "All decisions not in replay send to original advisor (default)" ), |
301 | clEnumValN(ReplayInlinerSettings::Fallback::AlwaysInline, |
302 | "AlwaysInline" , "All decisions not in replay are inlined" ), |
303 | clEnumValN(ReplayInlinerSettings::Fallback::NeverInline, "NeverInline" , |
304 | "All decisions not in replay are not inlined" )), |
305 | cl::desc("How sample profile inline replay treats sites that don't come " |
306 | "from the replay. Original: defers to original advisor, " |
307 | "AlwaysInline: inline all sites not in replay, NeverInline: " |
308 | "inline no sites not in replay" ), |
309 | cl::Hidden); |
310 | |
311 | static cl::opt<CallSiteFormat::Format> ProfileInlineReplayFormat( |
312 | "sample-profile-inline-replay-format" , |
313 | cl::init(Val: CallSiteFormat::Format::LineColumnDiscriminator), |
314 | cl::values( |
315 | clEnumValN(CallSiteFormat::Format::Line, "Line" , "<Line Number>" ), |
316 | clEnumValN(CallSiteFormat::Format::LineColumn, "LineColumn" , |
317 | "<Line Number>:<Column Number>" ), |
318 | clEnumValN(CallSiteFormat::Format::LineDiscriminator, |
319 | "LineDiscriminator" , "<Line Number>.<Discriminator>" ), |
320 | clEnumValN(CallSiteFormat::Format::LineColumnDiscriminator, |
321 | "LineColumnDiscriminator" , |
322 | "<Line Number>:<Column Number>.<Discriminator> (default)" )), |
323 | cl::desc("How sample profile inline replay file is formatted" ), cl::Hidden); |
324 | |
325 | static cl::opt<unsigned> |
326 | MaxNumPromotions("sample-profile-icp-max-prom" , cl::init(Val: 3), cl::Hidden, |
327 | cl::desc("Max number of promotions for a single indirect " |
328 | "call callsite in sample profile loader" )); |
329 | |
330 | static cl::opt<bool> OverwriteExistingWeights( |
331 | "overwrite-existing-weights" , cl::Hidden, cl::init(Val: false), |
332 | cl::desc("Ignore existing branch weights on IR and always overwrite." )); |
333 | |
334 | static cl::opt<bool> AnnotateSampleProfileInlinePhase( |
335 | "annotate-sample-profile-inline-phase" , cl::Hidden, cl::init(Val: false), |
336 | cl::desc("Annotate LTO phase (prelink / postlink), or main (no LTO) for " |
337 | "sample-profile inline pass name." )); |
338 | |
339 | namespace llvm { |
340 | extern cl::opt<bool> EnableExtTspBlockPlacement; |
341 | } |
342 | |
343 | namespace { |
344 | |
345 | using BlockWeightMap = DenseMap<const BasicBlock *, uint64_t>; |
346 | using EquivalenceClassMap = DenseMap<const BasicBlock *, const BasicBlock *>; |
347 | using Edge = std::pair<const BasicBlock *, const BasicBlock *>; |
348 | using EdgeWeightMap = DenseMap<Edge, uint64_t>; |
349 | using BlockEdgeMap = |
350 | DenseMap<const BasicBlock *, SmallVector<const BasicBlock *, 8>>; |
351 | |
352 | class GUIDToFuncNameMapper { |
353 | public: |
354 | GUIDToFuncNameMapper(Module &M, SampleProfileReader &Reader, |
355 | DenseMap<uint64_t, StringRef> &GUIDToFuncNameMap) |
356 | : CurrentReader(Reader), CurrentModule(M), |
357 | CurrentGUIDToFuncNameMap(GUIDToFuncNameMap) { |
358 | if (!CurrentReader.useMD5()) |
359 | return; |
360 | |
361 | for (const auto &F : CurrentModule) { |
362 | StringRef OrigName = F.getName(); |
363 | CurrentGUIDToFuncNameMap.insert( |
364 | KV: {Function::getGUID(GlobalName: OrigName), OrigName}); |
365 | |
366 | // Local to global var promotion used by optimization like thinlto |
367 | // will rename the var and add suffix like ".llvm.xxx" to the |
368 | // original local name. In sample profile, the suffixes of function |
369 | // names are all stripped. Since it is possible that the mapper is |
370 | // built in post-thin-link phase and var promotion has been done, |
371 | // we need to add the substring of function name without the suffix |
372 | // into the GUIDToFuncNameMap. |
373 | StringRef CanonName = FunctionSamples::getCanonicalFnName(F); |
374 | if (CanonName != OrigName) |
375 | CurrentGUIDToFuncNameMap.insert( |
376 | KV: {Function::getGUID(GlobalName: CanonName), CanonName}); |
377 | } |
378 | |
379 | // Update GUIDToFuncNameMap for each function including inlinees. |
380 | SetGUIDToFuncNameMapForAll(&CurrentGUIDToFuncNameMap); |
381 | } |
382 | |
383 | ~GUIDToFuncNameMapper() { |
384 | if (!CurrentReader.useMD5()) |
385 | return; |
386 | |
387 | CurrentGUIDToFuncNameMap.clear(); |
388 | |
389 | // Reset GUIDToFuncNameMap for of each function as they're no |
390 | // longer valid at this point. |
391 | SetGUIDToFuncNameMapForAll(nullptr); |
392 | } |
393 | |
394 | private: |
395 | void SetGUIDToFuncNameMapForAll(DenseMap<uint64_t, StringRef> *Map) { |
396 | std::queue<FunctionSamples *> FSToUpdate; |
397 | for (auto &IFS : CurrentReader.getProfiles()) { |
398 | FSToUpdate.push(x: &IFS.second); |
399 | } |
400 | |
401 | while (!FSToUpdate.empty()) { |
402 | FunctionSamples *FS = FSToUpdate.front(); |
403 | FSToUpdate.pop(); |
404 | FS->GUIDToFuncNameMap = Map; |
405 | for (const auto &ICS : FS->getCallsiteSamples()) { |
406 | const FunctionSamplesMap &FSMap = ICS.second; |
407 | for (const auto &IFS : FSMap) { |
408 | FunctionSamples &FS = const_cast<FunctionSamples &>(IFS.second); |
409 | FSToUpdate.push(x: &FS); |
410 | } |
411 | } |
412 | } |
413 | } |
414 | |
415 | SampleProfileReader &CurrentReader; |
416 | Module &CurrentModule; |
417 | DenseMap<uint64_t, StringRef> &CurrentGUIDToFuncNameMap; |
418 | }; |
419 | |
420 | // Inline candidate used by iterative callsite prioritized inliner |
421 | struct InlineCandidate { |
422 | CallBase *CallInstr; |
423 | const FunctionSamples *CalleeSamples; |
424 | // Prorated callsite count, which will be used to guide inlining. For example, |
425 | // if a callsite is duplicated in LTO prelink, then in LTO postlink the two |
426 | // copies will get their own distribution factors and their prorated counts |
427 | // will be used to decide if they should be inlined independently. |
428 | uint64_t CallsiteCount; |
429 | // Call site distribution factor to prorate the profile samples for a |
430 | // duplicated callsite. Default value is 1.0. |
431 | float CallsiteDistribution; |
432 | }; |
433 | |
434 | // Inline candidate comparer using call site weight |
435 | struct CandidateComparer { |
436 | bool operator()(const InlineCandidate &LHS, const InlineCandidate &RHS) { |
437 | if (LHS.CallsiteCount != RHS.CallsiteCount) |
438 | return LHS.CallsiteCount < RHS.CallsiteCount; |
439 | |
440 | const FunctionSamples *LCS = LHS.CalleeSamples; |
441 | const FunctionSamples *RCS = RHS.CalleeSamples; |
442 | // In inline replay mode, CalleeSamples may be null and the order doesn't |
443 | // matter. |
444 | if (!LCS || !RCS) |
445 | return LCS; |
446 | |
447 | // Tie breaker using number of samples try to favor smaller functions first |
448 | if (LCS->getBodySamples().size() != RCS->getBodySamples().size()) |
449 | return LCS->getBodySamples().size() > RCS->getBodySamples().size(); |
450 | |
451 | // Tie breaker using GUID so we have stable/deterministic inlining order |
452 | return LCS->getGUID() < RCS->getGUID(); |
453 | } |
454 | }; |
455 | |
456 | using CandidateQueue = |
457 | PriorityQueue<InlineCandidate, std::vector<InlineCandidate>, |
458 | CandidateComparer>; |
459 | |
460 | /// Sample profile pass. |
461 | /// |
462 | /// This pass reads profile data from the file specified by |
463 | /// -sample-profile-file and annotates every affected function with the |
464 | /// profile information found in that file. |
465 | class SampleProfileLoader final : public SampleProfileLoaderBaseImpl<Function> { |
466 | public: |
467 | SampleProfileLoader( |
468 | StringRef Name, StringRef RemapName, ThinOrFullLTOPhase LTOPhase, |
469 | IntrusiveRefCntPtr<vfs::FileSystem> FS, |
470 | std::function<AssumptionCache &(Function &)> GetAssumptionCache, |
471 | std::function<TargetTransformInfo &(Function &)> GetTargetTransformInfo, |
472 | std::function<const TargetLibraryInfo &(Function &)> GetTLI, |
473 | LazyCallGraph &CG) |
474 | : SampleProfileLoaderBaseImpl(std::string(Name), std::string(RemapName), |
475 | std::move(FS)), |
476 | GetAC(std::move(GetAssumptionCache)), |
477 | GetTTI(std::move(GetTargetTransformInfo)), GetTLI(std::move(GetTLI)), |
478 | CG(CG), LTOPhase(LTOPhase), |
479 | AnnotatedPassName(AnnotateSampleProfileInlinePhase |
480 | ? llvm::AnnotateInlinePassName(IC: InlineContext{ |
481 | .LTOPhase: LTOPhase, .Pass: InlinePass::SampleProfileInliner}) |
482 | : CSINLINE_DEBUG) {} |
483 | |
484 | bool doInitialization(Module &M, FunctionAnalysisManager *FAM = nullptr); |
485 | bool runOnModule(Module &M, ModuleAnalysisManager *AM, |
486 | ProfileSummaryInfo *_PSI); |
487 | |
488 | protected: |
489 | bool runOnFunction(Function &F, ModuleAnalysisManager *AM); |
490 | bool emitAnnotations(Function &F); |
491 | ErrorOr<uint64_t> getInstWeight(const Instruction &I) override; |
492 | const FunctionSamples *findCalleeFunctionSamples(const CallBase &I) const; |
493 | const FunctionSamples * |
494 | findFunctionSamples(const Instruction &I) const override; |
495 | std::vector<const FunctionSamples *> |
496 | findIndirectCallFunctionSamples(const Instruction &I, uint64_t &Sum) const; |
497 | void findExternalInlineCandidate(CallBase *CB, const FunctionSamples *Samples, |
498 | DenseSet<GlobalValue::GUID> &InlinedGUIDs, |
499 | uint64_t Threshold); |
500 | // Attempt to promote indirect call and also inline the promoted call |
501 | bool tryPromoteAndInlineCandidate( |
502 | Function &F, InlineCandidate &Candidate, uint64_t SumOrigin, |
503 | uint64_t &Sum, SmallVector<CallBase *, 8> *InlinedCallSites = nullptr); |
504 | |
505 | bool inlineHotFunctions(Function &F, |
506 | DenseSet<GlobalValue::GUID> &InlinedGUIDs); |
507 | std::optional<InlineCost> getExternalInlineAdvisorCost(CallBase &CB); |
508 | bool getExternalInlineAdvisorShouldInline(CallBase &CB); |
509 | InlineCost shouldInlineCandidate(InlineCandidate &Candidate); |
510 | bool getInlineCandidate(InlineCandidate *NewCandidate, CallBase *CB); |
511 | bool |
512 | tryInlineCandidate(InlineCandidate &Candidate, |
513 | SmallVector<CallBase *, 8> *InlinedCallSites = nullptr); |
514 | bool |
515 | inlineHotFunctionsWithPriority(Function &F, |
516 | DenseSet<GlobalValue::GUID> &InlinedGUIDs); |
517 | // Inline cold/small functions in addition to hot ones |
518 | bool shouldInlineColdCallee(CallBase &CallInst); |
519 | void emitOptimizationRemarksForInlineCandidates( |
520 | const SmallVectorImpl<CallBase *> &Candidates, const Function &F, |
521 | bool Hot); |
522 | void promoteMergeNotInlinedContextSamples( |
523 | MapVector<CallBase *, const FunctionSamples *> NonInlinedCallSites, |
524 | const Function &F); |
525 | std::vector<Function *> buildFunctionOrder(Module &M, LazyCallGraph &CG); |
526 | std::unique_ptr<ProfiledCallGraph> buildProfiledCallGraph(Module &M); |
527 | void generateMDProfMetadata(Function &F); |
528 | bool rejectHighStalenessProfile(Module &M, ProfileSummaryInfo *PSI, |
529 | const SampleProfileMap &Profiles); |
530 | void removePseudoProbeInsts(Module &M); |
531 | |
532 | /// Map from function name to Function *. Used to find the function from |
533 | /// the function name. If the function name contains suffix, additional |
534 | /// entry is added to map from the stripped name to the function if there |
535 | /// is one-to-one mapping. |
536 | HashKeyMap<std::unordered_map, FunctionId, Function *> SymbolMap; |
537 | |
538 | /// Map from function name to profile name generated by call-graph based |
539 | /// profile fuzzy matching(--salvage-unused-profile). |
540 | HashKeyMap<std::unordered_map, FunctionId, FunctionId> FuncNameToProfNameMap; |
541 | |
542 | std::function<AssumptionCache &(Function &)> GetAC; |
543 | std::function<TargetTransformInfo &(Function &)> GetTTI; |
544 | std::function<const TargetLibraryInfo &(Function &)> GetTLI; |
545 | LazyCallGraph &CG; |
546 | |
547 | /// Profile tracker for different context. |
548 | std::unique_ptr<SampleContextTracker> ContextTracker; |
549 | |
550 | /// Flag indicating which LTO/ThinLTO phase the pass is invoked in. |
551 | /// |
552 | /// We need to know the LTO phase because for example in ThinLTOPrelink |
553 | /// phase, in annotation, we should not promote indirect calls. Instead, |
554 | /// we will mark GUIDs that needs to be annotated to the function. |
555 | const ThinOrFullLTOPhase LTOPhase; |
556 | const std::string AnnotatedPassName; |
557 | |
558 | /// Profle Symbol list tells whether a function name appears in the binary |
559 | /// used to generate the current profile. |
560 | std::shared_ptr<ProfileSymbolList> PSL; |
561 | |
562 | /// Total number of samples collected in this profile. |
563 | /// |
564 | /// This is the sum of all the samples collected in all the functions executed |
565 | /// at runtime. |
566 | uint64_t TotalCollectedSamples = 0; |
567 | |
568 | // Information recorded when we declined to inline a call site |
569 | // because we have determined it is too cold is accumulated for |
570 | // each callee function. Initially this is just the entry count. |
571 | struct NotInlinedProfileInfo { |
572 | uint64_t entryCount; |
573 | }; |
574 | DenseMap<Function *, NotInlinedProfileInfo> notInlinedCallInfo; |
575 | |
576 | // GUIDToFuncNameMap saves the mapping from GUID to the symbol name, for |
577 | // all the function symbols defined or declared in current module. |
578 | DenseMap<uint64_t, StringRef> GUIDToFuncNameMap; |
579 | |
580 | // All the Names used in FunctionSamples including outline function |
581 | // names, inline instance names and call target names. |
582 | StringSet<> NamesInProfile; |
583 | // MD5 version of NamesInProfile. Either NamesInProfile or GUIDsInProfile is |
584 | // populated, depends on whether the profile uses MD5. Because the name table |
585 | // generally contains several magnitude more entries than the number of |
586 | // functions, we do not want to convert all names from one form to another. |
587 | llvm::DenseSet<uint64_t> GUIDsInProfile; |
588 | |
589 | // For symbol in profile symbol list, whether to regard their profiles |
590 | // to be accurate. It is mainly decided by existance of profile symbol |
591 | // list and -profile-accurate-for-symsinlist flag, but it can be |
592 | // overriden by -profile-sample-accurate or profile-sample-accurate |
593 | // attribute. |
594 | bool ProfAccForSymsInList; |
595 | |
596 | // External inline advisor used to replay inline decision from remarks. |
597 | std::unique_ptr<InlineAdvisor> ExternalInlineAdvisor; |
598 | |
599 | // A helper to implement the sample profile matching algorithm. |
600 | std::unique_ptr<SampleProfileMatcher> MatchingManager; |
601 | |
602 | private: |
603 | const char *() const { |
604 | return AnnotatedPassName.c_str(); |
605 | } |
606 | }; |
607 | } // end anonymous namespace |
608 | |
609 | namespace llvm { |
610 | template <> |
611 | inline bool SampleProfileInference<Function>::isExit(const BasicBlock *BB) { |
612 | return succ_empty(BB); |
613 | } |
614 | |
615 | template <> |
616 | inline void SampleProfileInference<Function>::findUnlikelyJumps( |
617 | const std::vector<const BasicBlockT *> &BasicBlocks, |
618 | BlockEdgeMap &Successors, FlowFunction &Func) { |
619 | for (auto &Jump : Func.Jumps) { |
620 | const auto *BB = BasicBlocks[Jump.Source]; |
621 | const auto *Succ = BasicBlocks[Jump.Target]; |
622 | const Instruction *TI = BB->getTerminator(); |
623 | // Check if a block ends with InvokeInst and mark non-taken branch unlikely. |
624 | // In that case block Succ should be a landing pad |
625 | if (Successors[BB].size() == 2 && Successors[BB].back() == Succ) { |
626 | if (isa<InvokeInst>(Val: TI)) { |
627 | Jump.IsUnlikely = true; |
628 | } |
629 | } |
630 | const Instruction *SuccTI = Succ->getTerminator(); |
631 | // Check if the target block contains UnreachableInst and mark it unlikely |
632 | if (SuccTI->getNumSuccessors() == 0) { |
633 | if (isa<UnreachableInst>(Val: SuccTI)) { |
634 | Jump.IsUnlikely = true; |
635 | } |
636 | } |
637 | } |
638 | } |
639 | |
640 | template <> |
641 | void SampleProfileLoaderBaseImpl<Function>::computeDominanceAndLoopInfo( |
642 | Function &F) { |
643 | DT.reset(p: new DominatorTree); |
644 | DT->recalculate(Func&: F); |
645 | |
646 | PDT.reset(p: new PostDominatorTree(F)); |
647 | |
648 | LI.reset(p: new LoopInfo); |
649 | LI->analyze(DomTree: *DT); |
650 | } |
651 | } // namespace llvm |
652 | |
653 | ErrorOr<uint64_t> SampleProfileLoader::getInstWeight(const Instruction &Inst) { |
654 | if (FunctionSamples::ProfileIsProbeBased) |
655 | return getProbeWeight(Inst); |
656 | |
657 | const DebugLoc &DLoc = Inst.getDebugLoc(); |
658 | if (!DLoc) |
659 | return std::error_code(); |
660 | |
661 | // Ignore all intrinsics, phinodes and branch instructions. |
662 | // Branch and phinodes instruction usually contains debug info from sources |
663 | // outside of the residing basic block, thus we ignore them during annotation. |
664 | if (isa<BranchInst>(Val: Inst) || isa<IntrinsicInst>(Val: Inst) || isa<PHINode>(Val: Inst)) |
665 | return std::error_code(); |
666 | |
667 | // For non-CS profile, if a direct call/invoke instruction is inlined in |
668 | // profile (findCalleeFunctionSamples returns non-empty result), but not |
669 | // inlined here, it means that the inlined callsite has no sample, thus the |
670 | // call instruction should have 0 count. |
671 | // For CS profile, the callsite count of previously inlined callees is |
672 | // populated with the entry count of the callees. |
673 | if (!FunctionSamples::ProfileIsCS) |
674 | if (const auto *CB = dyn_cast<CallBase>(Val: &Inst)) |
675 | if (!CB->isIndirectCall() && findCalleeFunctionSamples(I: *CB)) |
676 | return 0; |
677 | |
678 | return getInstWeightImpl(Inst); |
679 | } |
680 | |
681 | /// Get the FunctionSamples for a call instruction. |
682 | /// |
683 | /// The FunctionSamples of a call/invoke instruction \p Inst is the inlined |
684 | /// instance in which that call instruction is calling to. It contains |
685 | /// all samples that resides in the inlined instance. We first find the |
686 | /// inlined instance in which the call instruction is from, then we |
687 | /// traverse its children to find the callsite with the matching |
688 | /// location. |
689 | /// |
690 | /// \param Inst Call/Invoke instruction to query. |
691 | /// |
692 | /// \returns The FunctionSamples pointer to the inlined instance. |
693 | const FunctionSamples * |
694 | SampleProfileLoader::findCalleeFunctionSamples(const CallBase &Inst) const { |
695 | const DILocation *DIL = Inst.getDebugLoc(); |
696 | if (!DIL) { |
697 | return nullptr; |
698 | } |
699 | |
700 | StringRef CalleeName; |
701 | if (Function *Callee = Inst.getCalledFunction()) |
702 | CalleeName = Callee->getName(); |
703 | |
704 | if (FunctionSamples::ProfileIsCS) |
705 | return ContextTracker->getCalleeContextSamplesFor(Inst, CalleeName); |
706 | |
707 | const FunctionSamples *FS = findFunctionSamples(I: Inst); |
708 | if (FS == nullptr) |
709 | return nullptr; |
710 | |
711 | return FS->findFunctionSamplesAt(Loc: FunctionSamples::getCallSiteIdentifier(DIL), |
712 | CalleeName, Remapper: Reader->getRemapper(), |
713 | FuncNameToProfNameMap: &FuncNameToProfNameMap); |
714 | } |
715 | |
716 | /// Returns a vector of FunctionSamples that are the indirect call targets |
717 | /// of \p Inst. The vector is sorted by the total number of samples. Stores |
718 | /// the total call count of the indirect call in \p Sum. |
719 | std::vector<const FunctionSamples *> |
720 | SampleProfileLoader::findIndirectCallFunctionSamples( |
721 | const Instruction &Inst, uint64_t &Sum) const { |
722 | const DILocation *DIL = Inst.getDebugLoc(); |
723 | std::vector<const FunctionSamples *> R; |
724 | |
725 | if (!DIL) { |
726 | return R; |
727 | } |
728 | |
729 | auto FSCompare = [](const FunctionSamples *L, const FunctionSamples *R) { |
730 | assert(L && R && "Expect non-null FunctionSamples" ); |
731 | if (L->getHeadSamplesEstimate() != R->getHeadSamplesEstimate()) |
732 | return L->getHeadSamplesEstimate() > R->getHeadSamplesEstimate(); |
733 | return L->getGUID() < R->getGUID(); |
734 | }; |
735 | |
736 | if (FunctionSamples::ProfileIsCS) { |
737 | auto CalleeSamples = |
738 | ContextTracker->getIndirectCalleeContextSamplesFor(DIL); |
739 | if (CalleeSamples.empty()) |
740 | return R; |
741 | |
742 | // For CSSPGO, we only use target context profile's entry count |
743 | // as that already includes both inlined callee and non-inlined ones.. |
744 | Sum = 0; |
745 | for (const auto *const FS : CalleeSamples) { |
746 | Sum += FS->getHeadSamplesEstimate(); |
747 | R.push_back(x: FS); |
748 | } |
749 | llvm::sort(C&: R, Comp: FSCompare); |
750 | return R; |
751 | } |
752 | |
753 | const FunctionSamples *FS = findFunctionSamples(I: Inst); |
754 | if (FS == nullptr) |
755 | return R; |
756 | |
757 | auto CallSite = FunctionSamples::getCallSiteIdentifier(DIL); |
758 | Sum = 0; |
759 | if (auto T = FS->findCallTargetMapAt(CallSite)) |
760 | for (const auto &T_C : *T) |
761 | Sum += T_C.second; |
762 | if (const FunctionSamplesMap *M = FS->findFunctionSamplesMapAt(Loc: CallSite)) { |
763 | if (M->empty()) |
764 | return R; |
765 | for (const auto &NameFS : *M) { |
766 | Sum += NameFS.second.getHeadSamplesEstimate(); |
767 | R.push_back(x: &NameFS.second); |
768 | } |
769 | llvm::sort(C&: R, Comp: FSCompare); |
770 | } |
771 | return R; |
772 | } |
773 | |
774 | const FunctionSamples * |
775 | SampleProfileLoader::findFunctionSamples(const Instruction &Inst) const { |
776 | if (FunctionSamples::ProfileIsProbeBased) { |
777 | std::optional<PseudoProbe> Probe = extractProbe(Inst); |
778 | if (!Probe) |
779 | return nullptr; |
780 | } |
781 | |
782 | const DILocation *DIL = Inst.getDebugLoc(); |
783 | if (!DIL) |
784 | return Samples; |
785 | |
786 | auto it = DILocation2SampleMap.try_emplace(Key: DIL,Args: nullptr); |
787 | if (it.second) { |
788 | if (FunctionSamples::ProfileIsCS) |
789 | it.first->second = ContextTracker->getContextSamplesFor(DIL); |
790 | else |
791 | it.first->second = Samples->findFunctionSamples( |
792 | DIL, Remapper: Reader->getRemapper(), FuncNameToProfNameMap: &FuncNameToProfNameMap); |
793 | } |
794 | return it.first->second; |
795 | } |
796 | |
797 | /// Check whether the indirect call promotion history of \p Inst allows |
798 | /// the promotion for \p Candidate. |
799 | /// If the profile count for the promotion candidate \p Candidate is |
800 | /// NOMORE_ICP_MAGICNUM, it means \p Candidate has already been promoted |
801 | /// for \p Inst. If we already have at least MaxNumPromotions |
802 | /// NOMORE_ICP_MAGICNUM count values in the value profile of \p Inst, we |
803 | /// cannot promote for \p Inst anymore. |
804 | static bool doesHistoryAllowICP(const Instruction &Inst, StringRef Candidate) { |
805 | uint64_t TotalCount = 0; |
806 | auto ValueData = getValueProfDataFromInst(Inst, ValueKind: IPVK_IndirectCallTarget, |
807 | MaxNumValueData: MaxNumPromotions, TotalC&: TotalCount, GetNoICPValue: true); |
808 | // No valid value profile so no promoted targets have been recorded |
809 | // before. Ok to do ICP. |
810 | if (ValueData.empty()) |
811 | return true; |
812 | |
813 | unsigned NumPromoted = 0; |
814 | for (const auto &V : ValueData) { |
815 | if (V.Count != NOMORE_ICP_MAGICNUM) |
816 | continue; |
817 | |
818 | // If the promotion candidate has NOMORE_ICP_MAGICNUM count in the |
819 | // metadata, it means the candidate has been promoted for this |
820 | // indirect call. |
821 | if (V.Value == Function::getGUID(GlobalName: Candidate)) |
822 | return false; |
823 | NumPromoted++; |
824 | // If already have MaxNumPromotions promotion, don't do it anymore. |
825 | if (NumPromoted == MaxNumPromotions) |
826 | return false; |
827 | } |
828 | return true; |
829 | } |
830 | |
831 | /// Update indirect call target profile metadata for \p Inst. |
832 | /// Usually \p Sum is the sum of counts of all the targets for \p Inst. |
833 | /// If it is 0, it means updateIDTMetaData is used to mark a |
834 | /// certain target to be promoted already. If it is not zero, |
835 | /// we expect to use it to update the total count in the value profile. |
836 | static void |
837 | updateIDTMetaData(Instruction &Inst, |
838 | const SmallVectorImpl<InstrProfValueData> &CallTargets, |
839 | uint64_t Sum) { |
840 | // Bail out early if MaxNumPromotions is zero. |
841 | // This prevents allocating an array of zero length below. |
842 | // |
843 | // Note `updateIDTMetaData` is called in two places so check |
844 | // `MaxNumPromotions` inside it. |
845 | if (MaxNumPromotions == 0) |
846 | return; |
847 | // OldSum is the existing total count in the value profile data. |
848 | uint64_t OldSum = 0; |
849 | auto ValueData = getValueProfDataFromInst(Inst, ValueKind: IPVK_IndirectCallTarget, |
850 | MaxNumValueData: MaxNumPromotions, TotalC&: OldSum, GetNoICPValue: true); |
851 | |
852 | DenseMap<uint64_t, uint64_t> ValueCountMap; |
853 | if (Sum == 0) { |
854 | assert((CallTargets.size() == 1 && |
855 | CallTargets[0].Count == NOMORE_ICP_MAGICNUM) && |
856 | "If sum is 0, assume only one element in CallTargets " |
857 | "with count being NOMORE_ICP_MAGICNUM" ); |
858 | // Initialize ValueCountMap with existing value profile data. |
859 | for (const auto &V : ValueData) |
860 | ValueCountMap[V.Value] = V.Count; |
861 | auto Pair = |
862 | ValueCountMap.try_emplace(Key: CallTargets[0].Value, Args: CallTargets[0].Count); |
863 | // If the target already exists in value profile, decrease the total |
864 | // count OldSum and reset the target's count to NOMORE_ICP_MAGICNUM. |
865 | if (!Pair.second) { |
866 | OldSum -= Pair.first->second; |
867 | Pair.first->second = NOMORE_ICP_MAGICNUM; |
868 | } |
869 | Sum = OldSum; |
870 | } else { |
871 | // Initialize ValueCountMap with existing NOMORE_ICP_MAGICNUM |
872 | // counts in the value profile. |
873 | for (const auto &V : ValueData) { |
874 | if (V.Count == NOMORE_ICP_MAGICNUM) |
875 | ValueCountMap[V.Value] = V.Count; |
876 | } |
877 | |
878 | for (const auto &Data : CallTargets) { |
879 | auto Pair = ValueCountMap.try_emplace(Key: Data.Value, Args: Data.Count); |
880 | if (Pair.second) |
881 | continue; |
882 | // The target represented by Data.Value has already been promoted. |
883 | // Keep the count as NOMORE_ICP_MAGICNUM in the profile and decrease |
884 | // Sum by Data.Count. |
885 | assert(Sum >= Data.Count && "Sum should never be less than Data.Count" ); |
886 | Sum -= Data.Count; |
887 | } |
888 | } |
889 | |
890 | SmallVector<InstrProfValueData, 8> NewCallTargets; |
891 | for (const auto &ValueCount : ValueCountMap) { |
892 | NewCallTargets.emplace_back( |
893 | Args: InstrProfValueData{.Value: ValueCount.first, .Count: ValueCount.second}); |
894 | } |
895 | |
896 | llvm::sort(C&: NewCallTargets, |
897 | Comp: [](const InstrProfValueData &L, const InstrProfValueData &R) { |
898 | if (L.Count != R.Count) |
899 | return L.Count > R.Count; |
900 | return L.Value > R.Value; |
901 | }); |
902 | |
903 | uint32_t MaxMDCount = |
904 | std::min(a: NewCallTargets.size(), b: static_cast<size_t>(MaxNumPromotions)); |
905 | annotateValueSite(M&: *Inst.getParent()->getParent()->getParent(), Inst, |
906 | VDs: NewCallTargets, Sum, ValueKind: IPVK_IndirectCallTarget, MaxMDCount); |
907 | } |
908 | |
909 | /// Attempt to promote indirect call and also inline the promoted call. |
910 | /// |
911 | /// \param F Caller function. |
912 | /// \param Candidate ICP and inline candidate. |
913 | /// \param SumOrigin Original sum of target counts for indirect call before |
914 | /// promoting given candidate. |
915 | /// \param Sum Prorated sum of remaining target counts for indirect call |
916 | /// after promoting given candidate. |
917 | /// \param InlinedCallSite Output vector for new call sites exposed after |
918 | /// inlining. |
919 | bool SampleProfileLoader::tryPromoteAndInlineCandidate( |
920 | Function &F, InlineCandidate &Candidate, uint64_t SumOrigin, uint64_t &Sum, |
921 | SmallVector<CallBase *, 8> *InlinedCallSite) { |
922 | // Bail out early if sample-loader inliner is disabled. |
923 | if (DisableSampleLoaderInlining) |
924 | return false; |
925 | |
926 | // Bail out early if MaxNumPromotions is zero. |
927 | // This prevents allocating an array of zero length in callees below. |
928 | if (MaxNumPromotions == 0) |
929 | return false; |
930 | auto CalleeFunctionName = Candidate.CalleeSamples->getFunction(); |
931 | auto R = SymbolMap.find(Key: CalleeFunctionName); |
932 | if (R == SymbolMap.end() || !R->second) |
933 | return false; |
934 | |
935 | auto &CI = *Candidate.CallInstr; |
936 | if (!doesHistoryAllowICP(Inst: CI, Candidate: R->second->getName())) |
937 | return false; |
938 | |
939 | const char *Reason = "Callee function not available" ; |
940 | // R->getValue() != &F is to prevent promoting a recursive call. |
941 | // If it is a recursive call, we do not inline it as it could bloat |
942 | // the code exponentially. There is way to better handle this, e.g. |
943 | // clone the caller first, and inline the cloned caller if it is |
944 | // recursive. As llvm does not inline recursive calls, we will |
945 | // simply ignore it instead of handling it explicitly. |
946 | if (!R->second->isDeclaration() && R->second->getSubprogram() && |
947 | R->second->hasFnAttribute(Kind: "use-sample-profile" ) && |
948 | R->second != &F && isLegalToPromote(CB: CI, Callee: R->second, FailureReason: &Reason)) { |
949 | // For promoted target, set its value with NOMORE_ICP_MAGICNUM count |
950 | // in the value profile metadata so the target won't be promoted again. |
951 | SmallVector<InstrProfValueData, 1> SortedCallTargets = {InstrProfValueData{ |
952 | .Value: Function::getGUID(GlobalName: R->second->getName()), .Count: NOMORE_ICP_MAGICNUM}}; |
953 | updateIDTMetaData(Inst&: CI, CallTargets: SortedCallTargets, Sum: 0); |
954 | |
955 | auto *DI = &pgo::promoteIndirectCall( |
956 | CB&: CI, F: R->second, Count: Candidate.CallsiteCount, TotalCount: Sum, AttachProfToDirectCall: false, ORE); |
957 | if (DI) { |
958 | Sum -= Candidate.CallsiteCount; |
959 | // Do not prorate the indirect callsite distribution since the original |
960 | // distribution will be used to scale down non-promoted profile target |
961 | // counts later. By doing this we lose track of the real callsite count |
962 | // for the leftover indirect callsite as a trade off for accurate call |
963 | // target counts. |
964 | // TODO: Ideally we would have two separate factors, one for call site |
965 | // counts and one is used to prorate call target counts. |
966 | // Do not update the promoted direct callsite distribution at this |
967 | // point since the original distribution combined with the callee profile |
968 | // will be used to prorate callsites from the callee if inlined. Once not |
969 | // inlined, the direct callsite distribution should be prorated so that |
970 | // the it will reflect the real callsite counts. |
971 | Candidate.CallInstr = DI; |
972 | if (isa<CallInst>(Val: DI) || isa<InvokeInst>(Val: DI)) { |
973 | bool Inlined = tryInlineCandidate(Candidate, InlinedCallSites: InlinedCallSite); |
974 | if (!Inlined) { |
975 | // Prorate the direct callsite distribution so that it reflects real |
976 | // callsite counts. |
977 | setProbeDistributionFactor( |
978 | Inst&: *DI, Factor: static_cast<float>(Candidate.CallsiteCount) / SumOrigin); |
979 | } |
980 | return Inlined; |
981 | } |
982 | } |
983 | } else { |
984 | LLVM_DEBUG(dbgs() << "\nFailed to promote indirect call to " |
985 | << FunctionSamples::getCanonicalFnName( |
986 | Candidate.CallInstr->getName())<< " because " |
987 | << Reason << "\n" ); |
988 | } |
989 | return false; |
990 | } |
991 | |
992 | bool SampleProfileLoader::shouldInlineColdCallee(CallBase &CallInst) { |
993 | if (!ProfileSizeInline) |
994 | return false; |
995 | |
996 | Function *Callee = CallInst.getCalledFunction(); |
997 | if (Callee == nullptr) |
998 | return false; |
999 | |
1000 | InlineCost Cost = getInlineCost(Call&: CallInst, Params: getInlineParams(), CalleeTTI&: GetTTI(*Callee), |
1001 | GetAssumptionCache: GetAC, GetTLI); |
1002 | |
1003 | if (Cost.isNever()) |
1004 | return false; |
1005 | |
1006 | if (Cost.isAlways()) |
1007 | return true; |
1008 | |
1009 | return Cost.getCost() <= SampleColdCallSiteThreshold; |
1010 | } |
1011 | |
1012 | void SampleProfileLoader::emitOptimizationRemarksForInlineCandidates( |
1013 | const SmallVectorImpl<CallBase *> &Candidates, const Function &F, |
1014 | bool Hot) { |
1015 | for (auto *I : Candidates) { |
1016 | Function *CalledFunction = I->getCalledFunction(); |
1017 | if (CalledFunction) { |
1018 | ORE->emit(OptDiag&: OptimizationRemarkAnalysis(getAnnotatedRemarkPassName(), |
1019 | "InlineAttempt" , I->getDebugLoc(), |
1020 | I->getParent()) |
1021 | << "previous inlining reattempted for " |
1022 | << (Hot ? "hotness: '" : "size: '" ) |
1023 | << ore::NV("Callee" , CalledFunction) << "' into '" |
1024 | << ore::NV("Caller" , &F) << "'" ); |
1025 | } |
1026 | } |
1027 | } |
1028 | |
1029 | void SampleProfileLoader::findExternalInlineCandidate( |
1030 | CallBase *CB, const FunctionSamples *Samples, |
1031 | DenseSet<GlobalValue::GUID> &InlinedGUIDs, uint64_t Threshold) { |
1032 | |
1033 | // If ExternalInlineAdvisor(ReplayInlineAdvisor) wants to inline an external |
1034 | // function make sure it's imported |
1035 | if (CB && getExternalInlineAdvisorShouldInline(CB&: *CB)) { |
1036 | // Samples may not exist for replayed function, if so |
1037 | // just add the direct GUID and move on |
1038 | if (!Samples) { |
1039 | InlinedGUIDs.insert( |
1040 | V: Function::getGUID(GlobalName: CB->getCalledFunction()->getName())); |
1041 | return; |
1042 | } |
1043 | // Otherwise, drop the threshold to import everything that we can |
1044 | Threshold = 0; |
1045 | } |
1046 | |
1047 | // In some rare cases, call instruction could be changed after being pushed |
1048 | // into inline candidate queue, this is because earlier inlining may expose |
1049 | // constant propagation which can change indirect call to direct call. When |
1050 | // this happens, we may fail to find matching function samples for the |
1051 | // candidate later, even if a match was found when the candidate was enqueued. |
1052 | if (!Samples) |
1053 | return; |
1054 | |
1055 | // For AutoFDO profile, retrieve candidate profiles by walking over |
1056 | // the nested inlinee profiles. |
1057 | if (!FunctionSamples::ProfileIsCS) { |
1058 | // Set threshold to zero to honor pre-inliner decision. |
1059 | if (UsePreInlinerDecision) |
1060 | Threshold = 0; |
1061 | Samples->findInlinedFunctions(S&: InlinedGUIDs, SymbolMap, Threshold); |
1062 | return; |
1063 | } |
1064 | |
1065 | ContextTrieNode *Caller = ContextTracker->getContextNodeForProfile(FSamples: Samples); |
1066 | std::queue<ContextTrieNode *> CalleeList; |
1067 | CalleeList.push(x: Caller); |
1068 | while (!CalleeList.empty()) { |
1069 | ContextTrieNode *Node = CalleeList.front(); |
1070 | CalleeList.pop(); |
1071 | FunctionSamples *CalleeSample = Node->getFunctionSamples(); |
1072 | // For CSSPGO profile, retrieve candidate profile by walking over the |
1073 | // trie built for context profile. Note that also take call targets |
1074 | // even if callee doesn't have a corresponding context profile. |
1075 | if (!CalleeSample) |
1076 | continue; |
1077 | |
1078 | // If pre-inliner decision is used, honor that for importing as well. |
1079 | bool PreInline = |
1080 | UsePreInlinerDecision && |
1081 | CalleeSample->getContext().hasAttribute(A: ContextShouldBeInlined); |
1082 | if (!PreInline && CalleeSample->getHeadSamplesEstimate() < Threshold) |
1083 | continue; |
1084 | |
1085 | Function *Func = SymbolMap.lookup(Key: CalleeSample->getFunction()); |
1086 | // Add to the import list only when it's defined out of module. |
1087 | if (!Func || Func->isDeclaration()) |
1088 | InlinedGUIDs.insert(V: CalleeSample->getGUID()); |
1089 | |
1090 | // Import hot CallTargets, which may not be available in IR because full |
1091 | // profile annotation cannot be done until backend compilation in ThinLTO. |
1092 | for (const auto &BS : CalleeSample->getBodySamples()) |
1093 | for (const auto &TS : BS.second.getCallTargets()) |
1094 | if (TS.second > Threshold) { |
1095 | const Function *Callee = SymbolMap.lookup(Key: TS.first); |
1096 | if (!Callee || Callee->isDeclaration()) |
1097 | InlinedGUIDs.insert(V: TS.first.getHashCode()); |
1098 | } |
1099 | |
1100 | // Import hot child context profile associted with callees. Note that this |
1101 | // may have some overlap with the call target loop above, but doing this |
1102 | // based child context profile again effectively allow us to use the max of |
1103 | // entry count and call target count to determine importing. |
1104 | for (auto &Child : Node->getAllChildContext()) { |
1105 | ContextTrieNode *CalleeNode = &Child.second; |
1106 | CalleeList.push(x: CalleeNode); |
1107 | } |
1108 | } |
1109 | } |
1110 | |
1111 | /// Iteratively inline hot callsites of a function. |
1112 | /// |
1113 | /// Iteratively traverse all callsites of the function \p F, so as to |
1114 | /// find out callsites with corresponding inline instances. |
1115 | /// |
1116 | /// For such callsites, |
1117 | /// - If it is hot enough, inline the callsites and adds callsites of the callee |
1118 | /// into the caller. If the call is an indirect call, first promote |
1119 | /// it to direct call. Each indirect call is limited with a single target. |
1120 | /// |
1121 | /// - If a callsite is not inlined, merge the its profile to the outline |
1122 | /// version (if --sample-profile-merge-inlinee is true), or scale the |
1123 | /// counters of standalone function based on the profile of inlined |
1124 | /// instances (if --sample-profile-merge-inlinee is false). |
1125 | /// |
1126 | /// Later passes may consume the updated profiles. |
1127 | /// |
1128 | /// \param F function to perform iterative inlining. |
1129 | /// \param InlinedGUIDs a set to be updated to include all GUIDs that are |
1130 | /// inlined in the profiled binary. |
1131 | /// |
1132 | /// \returns True if there is any inline happened. |
1133 | bool SampleProfileLoader::inlineHotFunctions( |
1134 | Function &F, DenseSet<GlobalValue::GUID> &InlinedGUIDs) { |
1135 | // ProfAccForSymsInList is used in callsiteIsHot. The assertion makes sure |
1136 | // Profile symbol list is ignored when profile-sample-accurate is on. |
1137 | assert((!ProfAccForSymsInList || |
1138 | (!ProfileSampleAccurate && |
1139 | !F.hasFnAttribute("profile-sample-accurate" ))) && |
1140 | "ProfAccForSymsInList should be false when profile-sample-accurate " |
1141 | "is enabled" ); |
1142 | |
1143 | MapVector<CallBase *, const FunctionSamples *> LocalNotInlinedCallSites; |
1144 | bool Changed = false; |
1145 | bool LocalChanged = true; |
1146 | while (LocalChanged) { |
1147 | LocalChanged = false; |
1148 | SmallVector<CallBase *, 10> CIS; |
1149 | for (auto &BB : F) { |
1150 | bool Hot = false; |
1151 | SmallVector<CallBase *, 10> AllCandidates; |
1152 | SmallVector<CallBase *, 10> ColdCandidates; |
1153 | for (auto &I : BB) { |
1154 | const FunctionSamples *FS = nullptr; |
1155 | if (auto *CB = dyn_cast<CallBase>(Val: &I)) { |
1156 | if (!isa<IntrinsicInst>(Val: I)) { |
1157 | if ((FS = findCalleeFunctionSamples(Inst: *CB))) { |
1158 | assert((!FunctionSamples::UseMD5 || FS->GUIDToFuncNameMap) && |
1159 | "GUIDToFuncNameMap has to be populated" ); |
1160 | AllCandidates.push_back(Elt: CB); |
1161 | if (FS->getHeadSamplesEstimate() > 0 || |
1162 | FunctionSamples::ProfileIsCS) |
1163 | LocalNotInlinedCallSites.insert(KV: {CB, FS}); |
1164 | if (callsiteIsHot(CallsiteFS: FS, PSI, ProfAccForSymsInList)) |
1165 | Hot = true; |
1166 | else if (shouldInlineColdCallee(CallInst&: *CB)) |
1167 | ColdCandidates.push_back(Elt: CB); |
1168 | } else if (getExternalInlineAdvisorShouldInline(CB&: *CB)) { |
1169 | AllCandidates.push_back(Elt: CB); |
1170 | } |
1171 | } |
1172 | } |
1173 | } |
1174 | if (Hot || ExternalInlineAdvisor) { |
1175 | CIS.insert(I: CIS.begin(), From: AllCandidates.begin(), To: AllCandidates.end()); |
1176 | emitOptimizationRemarksForInlineCandidates(Candidates: AllCandidates, F, Hot: true); |
1177 | } else { |
1178 | CIS.insert(I: CIS.begin(), From: ColdCandidates.begin(), To: ColdCandidates.end()); |
1179 | emitOptimizationRemarksForInlineCandidates(Candidates: ColdCandidates, F, Hot: false); |
1180 | } |
1181 | } |
1182 | for (CallBase *I : CIS) { |
1183 | Function *CalledFunction = I->getCalledFunction(); |
1184 | InlineCandidate Candidate = {.CallInstr: I, .CalleeSamples: LocalNotInlinedCallSites.lookup(Key: I), |
1185 | .CallsiteCount: 0 /* dummy count */, |
1186 | .CallsiteDistribution: 1.0 /* dummy distribution factor */}; |
1187 | // Do not inline recursive calls. |
1188 | if (CalledFunction == &F) |
1189 | continue; |
1190 | if (I->isIndirectCall()) { |
1191 | uint64_t Sum; |
1192 | for (const auto *FS : findIndirectCallFunctionSamples(Inst: *I, Sum)) { |
1193 | uint64_t SumOrigin = Sum; |
1194 | if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) { |
1195 | findExternalInlineCandidate(CB: I, Samples: FS, InlinedGUIDs, |
1196 | Threshold: PSI->getOrCompHotCountThreshold()); |
1197 | continue; |
1198 | } |
1199 | if (!callsiteIsHot(CallsiteFS: FS, PSI, ProfAccForSymsInList)) |
1200 | continue; |
1201 | |
1202 | Candidate = {.CallInstr: I, .CalleeSamples: FS, .CallsiteCount: FS->getHeadSamplesEstimate(), .CallsiteDistribution: 1.0}; |
1203 | if (tryPromoteAndInlineCandidate(F, Candidate, SumOrigin, Sum)) { |
1204 | LocalNotInlinedCallSites.erase(Key: I); |
1205 | LocalChanged = true; |
1206 | } |
1207 | } |
1208 | } else if (CalledFunction && CalledFunction->getSubprogram() && |
1209 | !CalledFunction->isDeclaration()) { |
1210 | if (tryInlineCandidate(Candidate)) { |
1211 | LocalNotInlinedCallSites.erase(Key: I); |
1212 | LocalChanged = true; |
1213 | } |
1214 | } else if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) { |
1215 | findExternalInlineCandidate(CB: I, Samples: findCalleeFunctionSamples(Inst: *I), |
1216 | InlinedGUIDs, |
1217 | Threshold: PSI->getOrCompHotCountThreshold()); |
1218 | } |
1219 | } |
1220 | Changed |= LocalChanged; |
1221 | } |
1222 | |
1223 | // For CS profile, profile for not inlined context will be merged when |
1224 | // base profile is being retrieved. |
1225 | if (!FunctionSamples::ProfileIsCS) |
1226 | promoteMergeNotInlinedContextSamples(NonInlinedCallSites: LocalNotInlinedCallSites, F); |
1227 | return Changed; |
1228 | } |
1229 | |
1230 | bool SampleProfileLoader::tryInlineCandidate( |
1231 | InlineCandidate &Candidate, SmallVector<CallBase *, 8> *InlinedCallSites) { |
1232 | // Do not attempt to inline a candidate if |
1233 | // --disable-sample-loader-inlining is true. |
1234 | if (DisableSampleLoaderInlining) |
1235 | return false; |
1236 | |
1237 | CallBase &CB = *Candidate.CallInstr; |
1238 | Function *CalledFunction = CB.getCalledFunction(); |
1239 | assert(CalledFunction && "Expect a callee with definition" ); |
1240 | DebugLoc DLoc = CB.getDebugLoc(); |
1241 | BasicBlock *BB = CB.getParent(); |
1242 | |
1243 | InlineCost Cost = shouldInlineCandidate(Candidate); |
1244 | if (Cost.isNever()) { |
1245 | ORE->emit(OptDiag&: OptimizationRemarkAnalysis(getAnnotatedRemarkPassName(), |
1246 | "InlineFail" , DLoc, BB) |
1247 | << "incompatible inlining" ); |
1248 | return false; |
1249 | } |
1250 | |
1251 | if (!Cost) |
1252 | return false; |
1253 | |
1254 | InlineFunctionInfo IFI(GetAC); |
1255 | IFI.UpdateProfile = false; |
1256 | InlineResult IR = InlineFunction(CB, IFI, |
1257 | /*MergeAttributes=*/true); |
1258 | if (!IR.isSuccess()) |
1259 | return false; |
1260 | |
1261 | // The call to InlineFunction erases I, so we can't pass it here. |
1262 | emitInlinedIntoBasedOnCost(ORE&: *ORE, DLoc, Block: BB, Callee: *CalledFunction, Caller: *BB->getParent(), |
1263 | IC: Cost, ForProfileContext: true, PassName: getAnnotatedRemarkPassName()); |
1264 | |
1265 | // Now populate the list of newly exposed call sites. |
1266 | if (InlinedCallSites) { |
1267 | InlinedCallSites->clear(); |
1268 | for (auto &I : IFI.InlinedCallSites) |
1269 | InlinedCallSites->push_back(Elt: I); |
1270 | } |
1271 | |
1272 | if (FunctionSamples::ProfileIsCS) |
1273 | ContextTracker->markContextSamplesInlined(InlinedSamples: Candidate.CalleeSamples); |
1274 | ++NumCSInlined; |
1275 | |
1276 | // Prorate inlined probes for a duplicated inlining callsite which probably |
1277 | // has a distribution less than 100%. Samples for an inlinee should be |
1278 | // distributed among the copies of the original callsite based on each |
1279 | // callsite's distribution factor for counts accuracy. Note that an inlined |
1280 | // probe may come with its own distribution factor if it has been duplicated |
1281 | // in the inlinee body. The two factor are multiplied to reflect the |
1282 | // aggregation of duplication. |
1283 | if (Candidate.CallsiteDistribution < 1) { |
1284 | for (auto &I : IFI.InlinedCallSites) { |
1285 | if (std::optional<PseudoProbe> Probe = extractProbe(Inst: *I)) |
1286 | setProbeDistributionFactor(Inst&: *I, Factor: Probe->Factor * |
1287 | Candidate.CallsiteDistribution); |
1288 | } |
1289 | NumDuplicatedInlinesite++; |
1290 | } |
1291 | |
1292 | return true; |
1293 | } |
1294 | |
1295 | bool SampleProfileLoader::getInlineCandidate(InlineCandidate *NewCandidate, |
1296 | CallBase *CB) { |
1297 | assert(CB && "Expect non-null call instruction" ); |
1298 | |
1299 | if (isa<IntrinsicInst>(Val: CB)) |
1300 | return false; |
1301 | |
1302 | // Find the callee's profile. For indirect call, find hottest target profile. |
1303 | const FunctionSamples *CalleeSamples = findCalleeFunctionSamples(Inst: *CB); |
1304 | // If ExternalInlineAdvisor wants to inline this site, do so even |
1305 | // if Samples are not present. |
1306 | if (!CalleeSamples && !getExternalInlineAdvisorShouldInline(CB&: *CB)) |
1307 | return false; |
1308 | |
1309 | float Factor = 1.0; |
1310 | if (std::optional<PseudoProbe> Probe = extractProbe(Inst: *CB)) |
1311 | Factor = Probe->Factor; |
1312 | |
1313 | uint64_t CallsiteCount = |
1314 | CalleeSamples ? CalleeSamples->getHeadSamplesEstimate() * Factor : 0; |
1315 | *NewCandidate = {.CallInstr: CB, .CalleeSamples: CalleeSamples, .CallsiteCount: CallsiteCount, .CallsiteDistribution: Factor}; |
1316 | return true; |
1317 | } |
1318 | |
1319 | std::optional<InlineCost> |
1320 | SampleProfileLoader::getExternalInlineAdvisorCost(CallBase &CB) { |
1321 | std::unique_ptr<InlineAdvice> Advice = nullptr; |
1322 | if (ExternalInlineAdvisor) { |
1323 | Advice = ExternalInlineAdvisor->getAdvice(CB); |
1324 | if (Advice) { |
1325 | if (!Advice->isInliningRecommended()) { |
1326 | Advice->recordUnattemptedInlining(); |
1327 | return InlineCost::getNever(Reason: "not previously inlined" ); |
1328 | } |
1329 | Advice->recordInlining(); |
1330 | return InlineCost::getAlways(Reason: "previously inlined" ); |
1331 | } |
1332 | } |
1333 | |
1334 | return {}; |
1335 | } |
1336 | |
1337 | bool SampleProfileLoader::getExternalInlineAdvisorShouldInline(CallBase &CB) { |
1338 | std::optional<InlineCost> Cost = getExternalInlineAdvisorCost(CB); |
1339 | return Cost ? !!*Cost : false; |
1340 | } |
1341 | |
1342 | InlineCost |
1343 | SampleProfileLoader::shouldInlineCandidate(InlineCandidate &Candidate) { |
1344 | if (std::optional<InlineCost> ReplayCost = |
1345 | getExternalInlineAdvisorCost(CB&: *Candidate.CallInstr)) |
1346 | return *ReplayCost; |
1347 | // Adjust threshold based on call site hotness, only do this for callsite |
1348 | // prioritized inliner because otherwise cost-benefit check is done earlier. |
1349 | int SampleThreshold = SampleColdCallSiteThreshold; |
1350 | if (CallsitePrioritizedInline) { |
1351 | if (Candidate.CallsiteCount > PSI->getHotCountThreshold()) |
1352 | SampleThreshold = SampleHotCallSiteThreshold; |
1353 | else if (!ProfileSizeInline) |
1354 | return InlineCost::getNever(Reason: "cold callsite" ); |
1355 | } |
1356 | |
1357 | Function *Callee = Candidate.CallInstr->getCalledFunction(); |
1358 | assert(Callee && "Expect a definition for inline candidate of direct call" ); |
1359 | |
1360 | InlineParams Params = getInlineParams(); |
1361 | // We will ignore the threshold from inline cost, so always get full cost. |
1362 | Params.ComputeFullInlineCost = true; |
1363 | Params.AllowRecursiveCall = AllowRecursiveInline; |
1364 | // Checks if there is anything in the reachable portion of the callee at |
1365 | // this callsite that makes this inlining potentially illegal. Need to |
1366 | // set ComputeFullInlineCost, otherwise getInlineCost may return early |
1367 | // when cost exceeds threshold without checking all IRs in the callee. |
1368 | // The acutal cost does not matter because we only checks isNever() to |
1369 | // see if it is legal to inline the callsite. |
1370 | InlineCost Cost = getInlineCost(Call&: *Candidate.CallInstr, Callee, Params, |
1371 | CalleeTTI&: GetTTI(*Callee), GetAssumptionCache: GetAC, GetTLI); |
1372 | |
1373 | // Honor always inline and never inline from call analyzer |
1374 | if (Cost.isNever() || Cost.isAlways()) |
1375 | return Cost; |
1376 | |
1377 | // With CSSPGO, the preinliner in llvm-profgen can estimate global inline |
1378 | // decisions based on hotness as well as accurate function byte sizes for |
1379 | // given context using function/inlinee sizes from previous build. It |
1380 | // stores the decision in profile, and also adjust/merge context profile |
1381 | // aiming at better context-sensitive post-inline profile quality, assuming |
1382 | // all inline decision estimates are going to be honored by compiler. Here |
1383 | // we replay that inline decision under `sample-profile-use-preinliner`. |
1384 | // Note that we don't need to handle negative decision from preinliner as |
1385 | // context profile for not inlined calls are merged by preinliner already. |
1386 | if (UsePreInlinerDecision && Candidate.CalleeSamples) { |
1387 | // Once two node are merged due to promotion, we're losing some context |
1388 | // so the original context-sensitive preinliner decision should be ignored |
1389 | // for SyntheticContext. |
1390 | SampleContext &Context = Candidate.CalleeSamples->getContext(); |
1391 | if (!Context.hasState(S: SyntheticContext) && |
1392 | Context.hasAttribute(A: ContextShouldBeInlined)) |
1393 | return InlineCost::getAlways(Reason: "preinliner" ); |
1394 | } |
1395 | |
1396 | // For old FDO inliner, we inline the call site if it is below hot threshold, |
1397 | // even if the function is hot based on sample profile data. This is to |
1398 | // prevent huge functions from being inlined. |
1399 | if (!CallsitePrioritizedInline) { |
1400 | return InlineCost::get(Cost: Cost.getCost(), Threshold: SampleHotCallSiteThreshold); |
1401 | } |
1402 | |
1403 | // Otherwise only use the cost from call analyzer, but overwite threshold with |
1404 | // Sample PGO threshold. |
1405 | return InlineCost::get(Cost: Cost.getCost(), Threshold: SampleThreshold); |
1406 | } |
1407 | |
1408 | bool SampleProfileLoader::inlineHotFunctionsWithPriority( |
1409 | Function &F, DenseSet<GlobalValue::GUID> &InlinedGUIDs) { |
1410 | // ProfAccForSymsInList is used in callsiteIsHot. The assertion makes sure |
1411 | // Profile symbol list is ignored when profile-sample-accurate is on. |
1412 | assert((!ProfAccForSymsInList || |
1413 | (!ProfileSampleAccurate && |
1414 | !F.hasFnAttribute("profile-sample-accurate" ))) && |
1415 | "ProfAccForSymsInList should be false when profile-sample-accurate " |
1416 | "is enabled" ); |
1417 | |
1418 | // Populating worklist with initial call sites from root inliner, along |
1419 | // with call site weights. |
1420 | CandidateQueue CQueue; |
1421 | InlineCandidate NewCandidate; |
1422 | for (auto &BB : F) { |
1423 | for (auto &I : BB) { |
1424 | auto *CB = dyn_cast<CallBase>(Val: &I); |
1425 | if (!CB) |
1426 | continue; |
1427 | if (getInlineCandidate(NewCandidate: &NewCandidate, CB)) |
1428 | CQueue.push(x: NewCandidate); |
1429 | } |
1430 | } |
1431 | |
1432 | // Cap the size growth from profile guided inlining. This is needed even |
1433 | // though cost of each inline candidate already accounts for callee size, |
1434 | // because with top-down inlining, we can grow inliner size significantly |
1435 | // with large number of smaller inlinees each pass the cost check. |
1436 | assert(ProfileInlineLimitMax >= ProfileInlineLimitMin && |
1437 | "Max inline size limit should not be smaller than min inline size " |
1438 | "limit." ); |
1439 | unsigned SizeLimit = F.getInstructionCount() * ProfileInlineGrowthLimit; |
1440 | SizeLimit = std::min(a: SizeLimit, b: (unsigned)ProfileInlineLimitMax); |
1441 | SizeLimit = std::max(a: SizeLimit, b: (unsigned)ProfileInlineLimitMin); |
1442 | if (ExternalInlineAdvisor) |
1443 | SizeLimit = std::numeric_limits<unsigned>::max(); |
1444 | |
1445 | MapVector<CallBase *, const FunctionSamples *> LocalNotInlinedCallSites; |
1446 | |
1447 | // Perform iterative BFS call site prioritized inlining |
1448 | bool Changed = false; |
1449 | while (!CQueue.empty() && F.getInstructionCount() < SizeLimit) { |
1450 | InlineCandidate Candidate = CQueue.top(); |
1451 | CQueue.pop(); |
1452 | CallBase *I = Candidate.CallInstr; |
1453 | Function *CalledFunction = I->getCalledFunction(); |
1454 | |
1455 | if (CalledFunction == &F) |
1456 | continue; |
1457 | if (I->isIndirectCall()) { |
1458 | uint64_t Sum = 0; |
1459 | auto CalleeSamples = findIndirectCallFunctionSamples(Inst: *I, Sum); |
1460 | uint64_t SumOrigin = Sum; |
1461 | Sum *= Candidate.CallsiteDistribution; |
1462 | unsigned ICPCount = 0; |
1463 | for (const auto *FS : CalleeSamples) { |
1464 | // TODO: Consider disable pre-lTO ICP for MonoLTO as well |
1465 | if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) { |
1466 | findExternalInlineCandidate(CB: I, Samples: FS, InlinedGUIDs, |
1467 | Threshold: PSI->getOrCompHotCountThreshold()); |
1468 | continue; |
1469 | } |
1470 | uint64_t EntryCountDistributed = |
1471 | FS->getHeadSamplesEstimate() * Candidate.CallsiteDistribution; |
1472 | // In addition to regular inline cost check, we also need to make sure |
1473 | // ICP isn't introducing excessive speculative checks even if individual |
1474 | // target looks beneficial to promote and inline. That means we should |
1475 | // only do ICP when there's a small number dominant targets. |
1476 | if (ICPCount >= ProfileICPRelativeHotnessSkip && |
1477 | EntryCountDistributed * 100 < SumOrigin * ProfileICPRelativeHotness) |
1478 | break; |
1479 | // TODO: Fix CallAnalyzer to handle all indirect calls. |
1480 | // For indirect call, we don't run CallAnalyzer to get InlineCost |
1481 | // before actual inlining. This is because we could see two different |
1482 | // types from the same definition, which makes CallAnalyzer choke as |
1483 | // it's expecting matching parameter type on both caller and callee |
1484 | // side. See example from PR18962 for the triggering cases (the bug was |
1485 | // fixed, but we generate different types). |
1486 | if (!PSI->isHotCount(C: EntryCountDistributed)) |
1487 | break; |
1488 | SmallVector<CallBase *, 8> InlinedCallSites; |
1489 | // Attach function profile for promoted indirect callee, and update |
1490 | // call site count for the promoted inline candidate too. |
1491 | Candidate = {.CallInstr: I, .CalleeSamples: FS, .CallsiteCount: EntryCountDistributed, |
1492 | .CallsiteDistribution: Candidate.CallsiteDistribution}; |
1493 | if (tryPromoteAndInlineCandidate(F, Candidate, SumOrigin, Sum, |
1494 | InlinedCallSite: &InlinedCallSites)) { |
1495 | for (auto *CB : InlinedCallSites) { |
1496 | if (getInlineCandidate(NewCandidate: &NewCandidate, CB)) |
1497 | CQueue.emplace(args&: NewCandidate); |
1498 | } |
1499 | ICPCount++; |
1500 | Changed = true; |
1501 | } else if (!ContextTracker) { |
1502 | LocalNotInlinedCallSites.insert(KV: {I, FS}); |
1503 | } |
1504 | } |
1505 | } else if (CalledFunction && CalledFunction->getSubprogram() && |
1506 | !CalledFunction->isDeclaration()) { |
1507 | SmallVector<CallBase *, 8> InlinedCallSites; |
1508 | if (tryInlineCandidate(Candidate, InlinedCallSites: &InlinedCallSites)) { |
1509 | for (auto *CB : InlinedCallSites) { |
1510 | if (getInlineCandidate(NewCandidate: &NewCandidate, CB)) |
1511 | CQueue.emplace(args&: NewCandidate); |
1512 | } |
1513 | Changed = true; |
1514 | } else if (!ContextTracker) { |
1515 | LocalNotInlinedCallSites.insert(KV: {I, Candidate.CalleeSamples}); |
1516 | } |
1517 | } else if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) { |
1518 | findExternalInlineCandidate(CB: I, Samples: findCalleeFunctionSamples(Inst: *I), |
1519 | InlinedGUIDs, |
1520 | Threshold: PSI->getOrCompHotCountThreshold()); |
1521 | } |
1522 | } |
1523 | |
1524 | if (!CQueue.empty()) { |
1525 | if (SizeLimit == (unsigned)ProfileInlineLimitMax) |
1526 | ++NumCSInlinedHitMaxLimit; |
1527 | else if (SizeLimit == (unsigned)ProfileInlineLimitMin) |
1528 | ++NumCSInlinedHitMinLimit; |
1529 | else |
1530 | ++NumCSInlinedHitGrowthLimit; |
1531 | } |
1532 | |
1533 | // For CS profile, profile for not inlined context will be merged when |
1534 | // base profile is being retrieved. |
1535 | if (!FunctionSamples::ProfileIsCS) |
1536 | promoteMergeNotInlinedContextSamples(NonInlinedCallSites: LocalNotInlinedCallSites, F); |
1537 | return Changed; |
1538 | } |
1539 | |
1540 | void SampleProfileLoader::promoteMergeNotInlinedContextSamples( |
1541 | MapVector<CallBase *, const FunctionSamples *> NonInlinedCallSites, |
1542 | const Function &F) { |
1543 | // Accumulate not inlined callsite information into notInlinedSamples |
1544 | for (const auto &Pair : NonInlinedCallSites) { |
1545 | CallBase *I = Pair.first; |
1546 | Function *Callee = I->getCalledFunction(); |
1547 | if (!Callee || Callee->isDeclaration()) |
1548 | continue; |
1549 | |
1550 | ORE->emit( |
1551 | OptDiag&: OptimizationRemarkAnalysis(getAnnotatedRemarkPassName(), "NotInline" , |
1552 | I->getDebugLoc(), I->getParent()) |
1553 | << "previous inlining not repeated: '" << ore::NV("Callee" , Callee) |
1554 | << "' into '" << ore::NV("Caller" , &F) << "'" ); |
1555 | |
1556 | ++NumCSNotInlined; |
1557 | const FunctionSamples *FS = Pair.second; |
1558 | if (FS->getTotalSamples() == 0 && FS->getHeadSamplesEstimate() == 0) { |
1559 | continue; |
1560 | } |
1561 | |
1562 | // Do not merge a context that is already duplicated into the base profile. |
1563 | if (FS->getContext().hasAttribute(A: sampleprof::ContextDuplicatedIntoBase)) |
1564 | continue; |
1565 | |
1566 | if (ProfileMergeInlinee) { |
1567 | // A function call can be replicated by optimizations like callsite |
1568 | // splitting or jump threading and the replicates end up sharing the |
1569 | // sample nested callee profile instead of slicing the original |
1570 | // inlinee's profile. We want to do merge exactly once by filtering out |
1571 | // callee profiles with a non-zero head sample count. |
1572 | if (FS->getHeadSamples() == 0) { |
1573 | // Use entry samples as head samples during the merge, as inlinees |
1574 | // don't have head samples. |
1575 | const_cast<FunctionSamples *>(FS)->addHeadSamples( |
1576 | Num: FS->getHeadSamplesEstimate()); |
1577 | |
1578 | // Note that we have to do the merge right after processing function. |
1579 | // This allows OutlineFS's profile to be used for annotation during |
1580 | // top-down processing of functions' annotation. |
1581 | FunctionSamples *OutlineFS = Reader->getSamplesFor(F: *Callee); |
1582 | // If outlined function does not exist in the profile, add it to a |
1583 | // separate map so that it does not rehash the original profile. |
1584 | if (!OutlineFS) |
1585 | OutlineFS = &OutlineFunctionSamples[ |
1586 | FunctionId(FunctionSamples::getCanonicalFnName(FnName: Callee->getName()))]; |
1587 | OutlineFS->merge(Other: *FS, Weight: 1); |
1588 | // Set outlined profile to be synthetic to not bias the inliner. |
1589 | OutlineFS->setContextSynthetic(); |
1590 | } |
1591 | } else { |
1592 | auto pair = |
1593 | notInlinedCallInfo.try_emplace(Key: Callee, Args: NotInlinedProfileInfo{.entryCount: 0}); |
1594 | pair.first->second.entryCount += FS->getHeadSamplesEstimate(); |
1595 | } |
1596 | } |
1597 | } |
1598 | |
1599 | /// Returns the sorted CallTargetMap \p M by count in descending order. |
1600 | static SmallVector<InstrProfValueData, 2> |
1601 | GetSortedValueDataFromCallTargets(const SampleRecord::CallTargetMap &M) { |
1602 | SmallVector<InstrProfValueData, 2> R; |
1603 | for (const auto &I : SampleRecord::sortCallTargets(Targets: M)) { |
1604 | R.emplace_back( |
1605 | Args: InstrProfValueData{.Value: I.first.getHashCode(), .Count: I.second}); |
1606 | } |
1607 | return R; |
1608 | } |
1609 | |
1610 | // Generate MD_prof metadata for every branch instruction using the |
1611 | // edge weights computed during propagation. |
1612 | void SampleProfileLoader::generateMDProfMetadata(Function &F) { |
1613 | // Generate MD_prof metadata for every branch instruction using the |
1614 | // edge weights computed during propagation. |
1615 | LLVM_DEBUG(dbgs() << "\nPropagation complete. Setting branch weights\n" ); |
1616 | LLVMContext &Ctx = F.getContext(); |
1617 | MDBuilder MDB(Ctx); |
1618 | for (auto &BI : F) { |
1619 | BasicBlock *BB = &BI; |
1620 | |
1621 | if (BlockWeights[BB]) { |
1622 | for (auto &I : *BB) { |
1623 | if (!isa<CallInst>(Val: I) && !isa<InvokeInst>(Val: I)) |
1624 | continue; |
1625 | if (!cast<CallBase>(Val&: I).getCalledFunction()) { |
1626 | const DebugLoc &DLoc = I.getDebugLoc(); |
1627 | if (!DLoc) |
1628 | continue; |
1629 | const DILocation *DIL = DLoc; |
1630 | const FunctionSamples *FS = findFunctionSamples(Inst: I); |
1631 | if (!FS) |
1632 | continue; |
1633 | auto CallSite = FunctionSamples::getCallSiteIdentifier(DIL); |
1634 | ErrorOr<SampleRecord::CallTargetMap> T = |
1635 | FS->findCallTargetMapAt(CallSite); |
1636 | if (!T || T.get().empty()) |
1637 | continue; |
1638 | if (FunctionSamples::ProfileIsProbeBased) { |
1639 | // Prorate the callsite counts based on the pre-ICP distribution |
1640 | // factor to reflect what is already done to the callsite before |
1641 | // ICP, such as calliste cloning. |
1642 | if (std::optional<PseudoProbe> Probe = extractProbe(Inst: I)) { |
1643 | if (Probe->Factor < 1) |
1644 | T = SampleRecord::adjustCallTargets(Targets: T.get(), DistributionFactor: Probe->Factor); |
1645 | } |
1646 | } |
1647 | SmallVector<InstrProfValueData, 2> SortedCallTargets = |
1648 | GetSortedValueDataFromCallTargets(M: T.get()); |
1649 | uint64_t Sum = 0; |
1650 | for (const auto &C : T.get()) |
1651 | Sum += C.second; |
1652 | // With CSSPGO all indirect call targets are counted torwards the |
1653 | // original indirect call site in the profile, including both |
1654 | // inlined and non-inlined targets. |
1655 | if (!FunctionSamples::ProfileIsCS) { |
1656 | if (const FunctionSamplesMap *M = |
1657 | FS->findFunctionSamplesMapAt(Loc: CallSite)) { |
1658 | for (const auto &NameFS : *M) |
1659 | Sum += NameFS.second.getHeadSamplesEstimate(); |
1660 | } |
1661 | } |
1662 | if (Sum) |
1663 | updateIDTMetaData(Inst&: I, CallTargets: SortedCallTargets, Sum); |
1664 | else if (OverwriteExistingWeights) |
1665 | I.setMetadata(KindID: LLVMContext::MD_prof, Node: nullptr); |
1666 | } else if (!isa<IntrinsicInst>(Val: &I)) { |
1667 | setBranchWeights(I, Weights: {static_cast<uint32_t>(BlockWeights[BB])}, |
1668 | /*IsExpected=*/false); |
1669 | } |
1670 | } |
1671 | } else if (OverwriteExistingWeights || ProfileSampleBlockAccurate) { |
1672 | // Set profile metadata (possibly annotated by LTO prelink) to zero or |
1673 | // clear it for cold code. |
1674 | for (auto &I : *BB) { |
1675 | if (isa<CallInst>(Val: I) || isa<InvokeInst>(Val: I)) { |
1676 | if (cast<CallBase>(Val&: I).isIndirectCall()) { |
1677 | I.setMetadata(KindID: LLVMContext::MD_prof, Node: nullptr); |
1678 | } else { |
1679 | setBranchWeights(I, Weights: {uint32_t(0)}, /*IsExpected=*/false); |
1680 | } |
1681 | } |
1682 | } |
1683 | } |
1684 | |
1685 | Instruction *TI = BB->getTerminator(); |
1686 | if (TI->getNumSuccessors() == 1) |
1687 | continue; |
1688 | if (!isa<BranchInst>(Val: TI) && !isa<SwitchInst>(Val: TI) && |
1689 | !isa<IndirectBrInst>(Val: TI)) |
1690 | continue; |
1691 | |
1692 | DebugLoc BranchLoc = TI->getDebugLoc(); |
1693 | LLVM_DEBUG(dbgs() << "\nGetting weights for branch at line " |
1694 | << ((BranchLoc) ? Twine(BranchLoc.getLine()) |
1695 | : Twine("<UNKNOWN LOCATION>" )) |
1696 | << ".\n" ); |
1697 | SmallVector<uint32_t, 4> Weights; |
1698 | uint32_t MaxWeight = 0; |
1699 | Instruction *MaxDestInst; |
1700 | // Since profi treats multiple edges (multiway branches) as a single edge, |
1701 | // we need to distribute the computed weight among the branches. We do |
1702 | // this by evenly splitting the edge weight among destinations. |
1703 | DenseMap<const BasicBlock *, uint64_t> EdgeMultiplicity; |
1704 | std::vector<uint64_t> EdgeIndex; |
1705 | if (SampleProfileUseProfi) { |
1706 | EdgeIndex.resize(new_size: TI->getNumSuccessors()); |
1707 | for (unsigned I = 0; I < TI->getNumSuccessors(); ++I) { |
1708 | const BasicBlock *Succ = TI->getSuccessor(Idx: I); |
1709 | EdgeIndex[I] = EdgeMultiplicity[Succ]; |
1710 | EdgeMultiplicity[Succ]++; |
1711 | } |
1712 | } |
1713 | for (unsigned I = 0; I < TI->getNumSuccessors(); ++I) { |
1714 | BasicBlock *Succ = TI->getSuccessor(Idx: I); |
1715 | Edge E = std::make_pair(x&: BB, y&: Succ); |
1716 | uint64_t Weight = EdgeWeights[E]; |
1717 | LLVM_DEBUG(dbgs() << "\t" ; printEdgeWeight(dbgs(), E)); |
1718 | // Use uint32_t saturated arithmetic to adjust the incoming weights, |
1719 | // if needed. Sample counts in profiles are 64-bit unsigned values, |
1720 | // but internally branch weights are expressed as 32-bit values. |
1721 | if (Weight > std::numeric_limits<uint32_t>::max()) { |
1722 | LLVM_DEBUG(dbgs() << " (saturated due to uint32_t overflow)\n" ); |
1723 | Weight = std::numeric_limits<uint32_t>::max(); |
1724 | } |
1725 | if (!SampleProfileUseProfi) { |
1726 | // Weight is added by one to avoid propagation errors introduced by |
1727 | // 0 weights. |
1728 | Weights.push_back(Elt: static_cast<uint32_t>( |
1729 | Weight == std::numeric_limits<uint32_t>::max() ? Weight |
1730 | : Weight + 1)); |
1731 | } else { |
1732 | // Profi creates proper weights that do not require "+1" adjustments but |
1733 | // we evenly split the weight among branches with the same destination. |
1734 | uint64_t W = Weight / EdgeMultiplicity[Succ]; |
1735 | // Rounding up, if needed, so that first branches are hotter. |
1736 | if (EdgeIndex[I] < Weight % EdgeMultiplicity[Succ]) |
1737 | W++; |
1738 | Weights.push_back(Elt: static_cast<uint32_t>(W)); |
1739 | } |
1740 | if (Weight != 0) { |
1741 | if (Weight > MaxWeight) { |
1742 | MaxWeight = Weight; |
1743 | MaxDestInst = Succ->getFirstNonPHIOrDbgOrLifetime(); |
1744 | } |
1745 | } |
1746 | } |
1747 | |
1748 | misexpect::checkExpectAnnotations(I&: *TI, ExistingWeights: Weights, /*IsFrontend=*/false); |
1749 | |
1750 | uint64_t TempWeight; |
1751 | // Only set weights if there is at least one non-zero weight. |
1752 | // In any other case, let the analyzer set weights. |
1753 | // Do not set weights if the weights are present unless under |
1754 | // OverwriteExistingWeights. In ThinLTO, the profile annotation is done |
1755 | // twice. If the first annotation already set the weights, the second pass |
1756 | // does not need to set it. With OverwriteExistingWeights, Blocks with zero |
1757 | // weight should have their existing metadata (possibly annotated by LTO |
1758 | // prelink) cleared. |
1759 | if (MaxWeight > 0 && |
1760 | (!TI->extractProfTotalWeight(TotalVal&: TempWeight) || OverwriteExistingWeights)) { |
1761 | LLVM_DEBUG(dbgs() << "SUCCESS. Found non-zero weights.\n" ); |
1762 | setBranchWeights(I&: *TI, Weights, /*IsExpected=*/false); |
1763 | ORE->emit(RemarkBuilder: [&]() { |
1764 | return OptimizationRemark(DEBUG_TYPE, "PopularDest" , MaxDestInst) |
1765 | << "most popular destination for conditional branches at " |
1766 | << ore::NV("CondBranchesLoc" , BranchLoc); |
1767 | }); |
1768 | } else { |
1769 | if (OverwriteExistingWeights) { |
1770 | TI->setMetadata(KindID: LLVMContext::MD_prof, Node: nullptr); |
1771 | LLVM_DEBUG(dbgs() << "CLEARED. All branch weights are zero.\n" ); |
1772 | } else { |
1773 | LLVM_DEBUG(dbgs() << "SKIPPED. All branch weights are zero.\n" ); |
1774 | } |
1775 | } |
1776 | } |
1777 | } |
1778 | |
1779 | /// Once all the branch weights are computed, we emit the MD_prof |
1780 | /// metadata on BB using the computed values for each of its branches. |
1781 | /// |
1782 | /// \param F The function to query. |
1783 | /// |
1784 | /// \returns true if \p F was modified. Returns false, otherwise. |
1785 | bool SampleProfileLoader::emitAnnotations(Function &F) { |
1786 | bool Changed = false; |
1787 | |
1788 | if (FunctionSamples::ProfileIsProbeBased) { |
1789 | LLVM_DEBUG({ |
1790 | if (!ProbeManager->getDesc(F)) |
1791 | dbgs() << "Probe descriptor missing for Function " << F.getName() |
1792 | << "\n" ; |
1793 | }); |
1794 | |
1795 | if (ProbeManager->profileIsValid(F, Samples: *Samples)) { |
1796 | ++NumMatchedProfile; |
1797 | } else { |
1798 | ++NumMismatchedProfile; |
1799 | LLVM_DEBUG( |
1800 | dbgs() << "Profile is invalid due to CFG mismatch for Function " |
1801 | << F.getName() << "\n" ); |
1802 | if (!SalvageStaleProfile) |
1803 | return false; |
1804 | } |
1805 | } else { |
1806 | if (getFunctionLoc(F) == 0) |
1807 | return false; |
1808 | |
1809 | LLVM_DEBUG(dbgs() << "Line number for the first instruction in " |
1810 | << F.getName() << ": " << getFunctionLoc(F) << "\n" ); |
1811 | } |
1812 | |
1813 | DenseSet<GlobalValue::GUID> InlinedGUIDs; |
1814 | if (CallsitePrioritizedInline) |
1815 | Changed |= inlineHotFunctionsWithPriority(F, InlinedGUIDs); |
1816 | else |
1817 | Changed |= inlineHotFunctions(F, InlinedGUIDs); |
1818 | |
1819 | Changed |= computeAndPropagateWeights(F, InlinedGUIDs); |
1820 | |
1821 | if (Changed) |
1822 | generateMDProfMetadata(F); |
1823 | |
1824 | emitCoverageRemarks(F); |
1825 | return Changed; |
1826 | } |
1827 | |
1828 | std::unique_ptr<ProfiledCallGraph> |
1829 | SampleProfileLoader::buildProfiledCallGraph(Module &M) { |
1830 | std::unique_ptr<ProfiledCallGraph> ProfiledCG; |
1831 | if (FunctionSamples::ProfileIsCS) |
1832 | ProfiledCG = std::make_unique<ProfiledCallGraph>(args&: *ContextTracker); |
1833 | else |
1834 | ProfiledCG = std::make_unique<ProfiledCallGraph>(args&: Reader->getProfiles()); |
1835 | |
1836 | // Add all functions into the profiled call graph even if they are not in |
1837 | // the profile. This makes sure functions missing from the profile still |
1838 | // gets a chance to be processed. |
1839 | for (Function &F : M) { |
1840 | if (skipProfileForFunction(F)) |
1841 | continue; |
1842 | ProfiledCG->addProfiledFunction( |
1843 | Name: getRepInFormat(Name: FunctionSamples::getCanonicalFnName(F))); |
1844 | } |
1845 | |
1846 | return ProfiledCG; |
1847 | } |
1848 | |
1849 | std::vector<Function *> |
1850 | SampleProfileLoader::buildFunctionOrder(Module &M, LazyCallGraph &CG) { |
1851 | std::vector<Function *> FunctionOrderList; |
1852 | FunctionOrderList.reserve(n: M.size()); |
1853 | |
1854 | if (!ProfileTopDownLoad && UseProfiledCallGraph) |
1855 | errs() << "WARNING: -use-profiled-call-graph ignored, should be used " |
1856 | "together with -sample-profile-top-down-load.\n" ; |
1857 | |
1858 | if (!ProfileTopDownLoad) { |
1859 | if (ProfileMergeInlinee) { |
1860 | // Disable ProfileMergeInlinee if profile is not loaded in top down order, |
1861 | // because the profile for a function may be used for the profile |
1862 | // annotation of its outline copy before the profile merging of its |
1863 | // non-inlined inline instances, and that is not the way how |
1864 | // ProfileMergeInlinee is supposed to work. |
1865 | ProfileMergeInlinee = false; |
1866 | } |
1867 | |
1868 | for (Function &F : M) |
1869 | if (!skipProfileForFunction(F)) |
1870 | FunctionOrderList.push_back(x: &F); |
1871 | return FunctionOrderList; |
1872 | } |
1873 | |
1874 | if (UseProfiledCallGraph || (FunctionSamples::ProfileIsCS && |
1875 | !UseProfiledCallGraph.getNumOccurrences())) { |
1876 | // Use profiled call edges to augment the top-down order. There are cases |
1877 | // that the top-down order computed based on the static call graph doesn't |
1878 | // reflect real execution order. For example |
1879 | // |
1880 | // 1. Incomplete static call graph due to unknown indirect call targets. |
1881 | // Adjusting the order by considering indirect call edges from the |
1882 | // profile can enable the inlining of indirect call targets by allowing |
1883 | // the caller processed before them. |
1884 | // 2. Mutual call edges in an SCC. The static processing order computed for |
1885 | // an SCC may not reflect the call contexts in the context-sensitive |
1886 | // profile, thus may cause potential inlining to be overlooked. The |
1887 | // function order in one SCC is being adjusted to a top-down order based |
1888 | // on the profile to favor more inlining. This is only a problem with CS |
1889 | // profile. |
1890 | // 3. Transitive indirect call edges due to inlining. When a callee function |
1891 | // (say B) is inlined into a caller function (say A) in LTO prelink, |
1892 | // every call edge originated from the callee B will be transferred to |
1893 | // the caller A. If any transferred edge (say A->C) is indirect, the |
1894 | // original profiled indirect edge B->C, even if considered, would not |
1895 | // enforce a top-down order from the caller A to the potential indirect |
1896 | // call target C in LTO postlink since the inlined callee B is gone from |
1897 | // the static call graph. |
1898 | // 4. #3 can happen even for direct call targets, due to functions defined |
1899 | // in header files. A header function (say A), when included into source |
1900 | // files, is defined multiple times but only one definition survives due |
1901 | // to ODR. Therefore, the LTO prelink inlining done on those dropped |
1902 | // definitions can be useless based on a local file scope. More |
1903 | // importantly, the inlinee (say B), once fully inlined to a |
1904 | // to-be-dropped A, will have no profile to consume when its outlined |
1905 | // version is compiled. This can lead to a profile-less prelink |
1906 | // compilation for the outlined version of B which may be called from |
1907 | // external modules. while this isn't easy to fix, we rely on the |
1908 | // postlink AutoFDO pipeline to optimize B. Since the survived copy of |
1909 | // the A can be inlined in its local scope in prelink, it may not exist |
1910 | // in the merged IR in postlink, and we'll need the profiled call edges |
1911 | // to enforce a top-down order for the rest of the functions. |
1912 | // |
1913 | // Considering those cases, a profiled call graph completely independent of |
1914 | // the static call graph is constructed based on profile data, where |
1915 | // function objects are not even needed to handle case #3 and case 4. |
1916 | // |
1917 | // Note that static callgraph edges are completely ignored since they |
1918 | // can be conflicting with profiled edges for cyclic SCCs and may result in |
1919 | // an SCC order incompatible with profile-defined one. Using strictly |
1920 | // profile order ensures a maximum inlining experience. On the other hand, |
1921 | // static call edges are not so important when they don't correspond to a |
1922 | // context in the profile. |
1923 | |
1924 | std::unique_ptr<ProfiledCallGraph> ProfiledCG = buildProfiledCallGraph(M); |
1925 | scc_iterator<ProfiledCallGraph *> CGI = scc_begin(G: ProfiledCG.get()); |
1926 | while (!CGI.isAtEnd()) { |
1927 | auto Range = *CGI; |
1928 | if (SortProfiledSCC) { |
1929 | // Sort nodes in one SCC based on callsite hotness. |
1930 | scc_member_iterator<ProfiledCallGraph *> SI(*CGI); |
1931 | Range = *SI; |
1932 | } |
1933 | for (auto *Node : Range) { |
1934 | Function *F = SymbolMap.lookup(Key: Node->Name); |
1935 | if (F && !skipProfileForFunction(F: *F)) |
1936 | FunctionOrderList.push_back(x: F); |
1937 | } |
1938 | ++CGI; |
1939 | } |
1940 | std::reverse(first: FunctionOrderList.begin(), last: FunctionOrderList.end()); |
1941 | } else |
1942 | buildTopDownFuncOrder(CG, FunctionOrderList); |
1943 | |
1944 | LLVM_DEBUG({ |
1945 | dbgs() << "Function processing order:\n" ; |
1946 | for (auto F : FunctionOrderList) { |
1947 | dbgs() << F->getName() << "\n" ; |
1948 | } |
1949 | }); |
1950 | |
1951 | return FunctionOrderList; |
1952 | } |
1953 | |
1954 | bool SampleProfileLoader::doInitialization(Module &M, |
1955 | FunctionAnalysisManager *FAM) { |
1956 | auto &Ctx = M.getContext(); |
1957 | |
1958 | auto ReaderOrErr = SampleProfileReader::create( |
1959 | Filename, C&: Ctx, FS&: *FS, P: FSDiscriminatorPass::Base, RemapFilename: RemappingFilename); |
1960 | if (std::error_code EC = ReaderOrErr.getError()) { |
1961 | std::string Msg = "Could not open profile: " + EC.message(); |
1962 | Ctx.diagnose(DI: DiagnosticInfoSampleProfile(Filename, Msg)); |
1963 | return false; |
1964 | } |
1965 | Reader = std::move(ReaderOrErr.get()); |
1966 | Reader->setSkipFlatProf(LTOPhase == ThinOrFullLTOPhase::ThinLTOPostLink); |
1967 | // set module before reading the profile so reader may be able to only |
1968 | // read the function profiles which are used by the current module. |
1969 | Reader->setModule(&M); |
1970 | if (std::error_code EC = Reader->read()) { |
1971 | std::string Msg = "profile reading failed: " + EC.message(); |
1972 | Ctx.diagnose(DI: DiagnosticInfoSampleProfile(Filename, Msg)); |
1973 | return false; |
1974 | } |
1975 | |
1976 | PSL = Reader->getProfileSymbolList(); |
1977 | |
1978 | // While profile-sample-accurate is on, ignore symbol list. |
1979 | ProfAccForSymsInList = |
1980 | ProfileAccurateForSymsInList && PSL && !ProfileSampleAccurate; |
1981 | if (ProfAccForSymsInList) { |
1982 | NamesInProfile.clear(); |
1983 | GUIDsInProfile.clear(); |
1984 | if (auto NameTable = Reader->getNameTable()) { |
1985 | if (FunctionSamples::UseMD5) { |
1986 | for (auto Name : *NameTable) |
1987 | GUIDsInProfile.insert(V: Name.getHashCode()); |
1988 | } else { |
1989 | for (auto Name : *NameTable) |
1990 | NamesInProfile.insert(key: Name.stringRef()); |
1991 | } |
1992 | } |
1993 | CoverageTracker.setProfAccForSymsInList(true); |
1994 | } |
1995 | |
1996 | if (FAM && !ProfileInlineReplayFile.empty()) { |
1997 | ExternalInlineAdvisor = getReplayInlineAdvisor( |
1998 | M, FAM&: *FAM, Context&: Ctx, /*OriginalAdvisor=*/nullptr, |
1999 | ReplaySettings: ReplayInlinerSettings{.ReplayFile: ProfileInlineReplayFile, |
2000 | .ReplayScope: ProfileInlineReplayScope, |
2001 | .ReplayFallback: ProfileInlineReplayFallback, |
2002 | .ReplayFormat: {.OutputFormat: ProfileInlineReplayFormat}}, |
2003 | /*EmitRemarks=*/false, IC: InlineContext{.LTOPhase: LTOPhase, .Pass: InlinePass::ReplaySampleProfileInliner}); |
2004 | } |
2005 | |
2006 | // Apply tweaks if context-sensitive or probe-based profile is available. |
2007 | if (Reader->profileIsCS() || Reader->profileIsPreInlined() || |
2008 | Reader->profileIsProbeBased()) { |
2009 | if (!UseIterativeBFIInference.getNumOccurrences()) |
2010 | UseIterativeBFIInference = true; |
2011 | if (!SampleProfileUseProfi.getNumOccurrences()) |
2012 | SampleProfileUseProfi = true; |
2013 | if (!EnableExtTspBlockPlacement.getNumOccurrences()) |
2014 | EnableExtTspBlockPlacement = true; |
2015 | // Enable priority-base inliner and size inline by default for CSSPGO. |
2016 | if (!ProfileSizeInline.getNumOccurrences()) |
2017 | ProfileSizeInline = true; |
2018 | if (!CallsitePrioritizedInline.getNumOccurrences()) |
2019 | CallsitePrioritizedInline = true; |
2020 | // For CSSPGO, we also allow recursive inline to best use context profile. |
2021 | if (!AllowRecursiveInline.getNumOccurrences()) |
2022 | AllowRecursiveInline = true; |
2023 | |
2024 | if (Reader->profileIsPreInlined()) { |
2025 | if (!UsePreInlinerDecision.getNumOccurrences()) |
2026 | UsePreInlinerDecision = true; |
2027 | } |
2028 | |
2029 | // Enable stale profile matching by default for probe-based profile. |
2030 | // Currently the matching relies on if the checksum mismatch is detected, |
2031 | // which is currently only available for pseudo-probe mode. Removing the |
2032 | // checksum check could cause regressions for some cases, so further tuning |
2033 | // might be needed if we want to enable it for all cases. |
2034 | if (Reader->profileIsProbeBased() && |
2035 | !SalvageStaleProfile.getNumOccurrences()) { |
2036 | SalvageStaleProfile = true; |
2037 | } |
2038 | |
2039 | if (!Reader->profileIsCS()) { |
2040 | // Non-CS profile should be fine without a function size budget for the |
2041 | // inliner since the contexts in the profile are either all from inlining |
2042 | // in the prevoius build or pre-computed by the preinliner with a size |
2043 | // cap, thus they are bounded. |
2044 | if (!ProfileInlineLimitMin.getNumOccurrences()) |
2045 | ProfileInlineLimitMin = std::numeric_limits<unsigned>::max(); |
2046 | if (!ProfileInlineLimitMax.getNumOccurrences()) |
2047 | ProfileInlineLimitMax = std::numeric_limits<unsigned>::max(); |
2048 | } |
2049 | } |
2050 | |
2051 | if (Reader->profileIsCS()) { |
2052 | // Tracker for profiles under different context |
2053 | ContextTracker = std::make_unique<SampleContextTracker>( |
2054 | args&: Reader->getProfiles(), args: &GUIDToFuncNameMap); |
2055 | } |
2056 | |
2057 | // Load pseudo probe descriptors for probe-based function samples. |
2058 | if (Reader->profileIsProbeBased()) { |
2059 | ProbeManager = std::make_unique<PseudoProbeManager>(args&: M); |
2060 | if (!ProbeManager->moduleIsProbed(M)) { |
2061 | const char *Msg = |
2062 | "Pseudo-probe-based profile requires SampleProfileProbePass" ; |
2063 | Ctx.diagnose(DI: DiagnosticInfoSampleProfile(M.getModuleIdentifier(), Msg, |
2064 | DS_Warning)); |
2065 | return false; |
2066 | } |
2067 | } |
2068 | |
2069 | if (ReportProfileStaleness || PersistProfileStaleness || |
2070 | SalvageStaleProfile) { |
2071 | MatchingManager = std::make_unique<SampleProfileMatcher>( |
2072 | args&: M, args&: *Reader, args&: CG, args: ProbeManager.get(), args: LTOPhase, args&: SymbolMap, args&: PSL, |
2073 | args&: FuncNameToProfNameMap); |
2074 | } |
2075 | |
2076 | return true; |
2077 | } |
2078 | |
2079 | // Note that this is a module-level check. Even if one module is errored out, |
2080 | // the entire build will be errored out. However, the user could make big |
2081 | // changes to functions in single module but those changes might not be |
2082 | // performance significant to the whole binary. Therefore, to avoid those false |
2083 | // positives, we select a reasonable big set of hot functions that are supposed |
2084 | // to be globally performance significant, only compute and check the mismatch |
2085 | // within those functions. The function selection is based on two criteria: |
2086 | // 1) The function is hot enough, which is tuned by a hotness-based |
2087 | // flag(HotFuncCutoffForStalenessError). 2) The num of function is large enough |
2088 | // which is tuned by the MinfuncsForStalenessError flag. |
2089 | bool SampleProfileLoader::rejectHighStalenessProfile( |
2090 | Module &M, ProfileSummaryInfo *PSI, const SampleProfileMap &Profiles) { |
2091 | assert(FunctionSamples::ProfileIsProbeBased && |
2092 | "Only support for probe-based profile" ); |
2093 | uint64_t TotalHotFunc = 0; |
2094 | uint64_t NumMismatchedFunc = 0; |
2095 | for (const auto &I : Profiles) { |
2096 | const auto &FS = I.second; |
2097 | const auto *FuncDesc = ProbeManager->getDesc(GUID: FS.getGUID()); |
2098 | if (!FuncDesc) |
2099 | continue; |
2100 | |
2101 | // Use a hotness-based threshold to control the function selection. |
2102 | if (!PSI->isHotCountNthPercentile(PercentileCutoff: HotFuncCutoffForStalenessError, |
2103 | C: FS.getTotalSamples())) |
2104 | continue; |
2105 | |
2106 | TotalHotFunc++; |
2107 | if (ProbeManager->profileIsHashMismatched(FuncDesc: *FuncDesc, Samples: FS)) |
2108 | NumMismatchedFunc++; |
2109 | } |
2110 | // Make sure that the num of selected function is not too small to distinguish |
2111 | // from the user's benign changes. |
2112 | if (TotalHotFunc < MinfuncsForStalenessError) |
2113 | return false; |
2114 | |
2115 | // Finally check the mismatch percentage against the threshold. |
2116 | if (NumMismatchedFunc * 100 >= |
2117 | TotalHotFunc * PrecentMismatchForStalenessError) { |
2118 | auto &Ctx = M.getContext(); |
2119 | const char *Msg = |
2120 | "The input profile significantly mismatches current source code. " |
2121 | "Please recollect profile to avoid performance regression." ; |
2122 | Ctx.diagnose(DI: DiagnosticInfoSampleProfile(M.getModuleIdentifier(), Msg)); |
2123 | return true; |
2124 | } |
2125 | return false; |
2126 | } |
2127 | |
2128 | void SampleProfileLoader::removePseudoProbeInsts(Module &M) { |
2129 | for (auto &F : M) { |
2130 | std::vector<Instruction *> InstsToDel; |
2131 | for (auto &BB : F) { |
2132 | for (auto &I : BB) { |
2133 | if (isa<PseudoProbeInst>(Val: &I)) |
2134 | InstsToDel.push_back(x: &I); |
2135 | } |
2136 | } |
2137 | for (auto *I : InstsToDel) |
2138 | I->eraseFromParent(); |
2139 | } |
2140 | } |
2141 | |
2142 | bool SampleProfileLoader::runOnModule(Module &M, ModuleAnalysisManager *AM, |
2143 | ProfileSummaryInfo *_PSI) { |
2144 | GUIDToFuncNameMapper Mapper(M, *Reader, GUIDToFuncNameMap); |
2145 | |
2146 | PSI = _PSI; |
2147 | if (M.getProfileSummary(/* IsCS */ false) == nullptr) { |
2148 | M.setProfileSummary(M: Reader->getSummary().getMD(Context&: M.getContext()), |
2149 | Kind: ProfileSummary::PSK_Sample); |
2150 | PSI->refresh(); |
2151 | } |
2152 | |
2153 | if (FunctionSamples::ProfileIsProbeBased && |
2154 | rejectHighStalenessProfile(M, PSI, Profiles: Reader->getProfiles())) |
2155 | return false; |
2156 | |
2157 | // Compute the total number of samples collected in this profile. |
2158 | for (const auto &I : Reader->getProfiles()) |
2159 | TotalCollectedSamples += I.second.getTotalSamples(); |
2160 | |
2161 | auto Remapper = Reader->getRemapper(); |
2162 | // Populate the symbol map. |
2163 | for (const auto &N_F : M.getValueSymbolTable()) { |
2164 | StringRef OrigName = N_F.getKey(); |
2165 | Function *F = dyn_cast<Function>(Val: N_F.getValue()); |
2166 | if (F == nullptr || OrigName.empty()) |
2167 | continue; |
2168 | SymbolMap[FunctionId(OrigName)] = F; |
2169 | StringRef NewName = FunctionSamples::getCanonicalFnName(F: *F); |
2170 | if (OrigName != NewName && !NewName.empty()) { |
2171 | auto r = SymbolMap.emplace(Args: FunctionId(NewName), Args&: F); |
2172 | // Failiing to insert means there is already an entry in SymbolMap, |
2173 | // thus there are multiple functions that are mapped to the same |
2174 | // stripped name. In this case of name conflicting, set the value |
2175 | // to nullptr to avoid confusion. |
2176 | if (!r.second) |
2177 | r.first->second = nullptr; |
2178 | OrigName = NewName; |
2179 | } |
2180 | // Insert the remapped names into SymbolMap. |
2181 | if (Remapper) { |
2182 | if (auto MapName = Remapper->lookUpNameInProfile(FunctionName: OrigName)) { |
2183 | if (*MapName != OrigName && !MapName->empty()) |
2184 | SymbolMap.emplace(Args: FunctionId(*MapName), Args&: F); |
2185 | } |
2186 | } |
2187 | } |
2188 | |
2189 | // Stale profile matching. |
2190 | if (ReportProfileStaleness || PersistProfileStaleness || |
2191 | SalvageStaleProfile) { |
2192 | MatchingManager->runOnModule(); |
2193 | MatchingManager->clearMatchingData(); |
2194 | } |
2195 | assert(SymbolMap.count(FunctionId()) == 0 && |
2196 | "No empty StringRef should be added in SymbolMap" ); |
2197 | assert((SalvageUnusedProfile || FuncNameToProfNameMap.empty()) && |
2198 | "FuncNameToProfNameMap is not empty when --salvage-unused-profile is " |
2199 | "not enabled" ); |
2200 | |
2201 | bool retval = false; |
2202 | for (auto *F : buildFunctionOrder(M, CG)) { |
2203 | assert(!F->isDeclaration()); |
2204 | clearFunctionData(); |
2205 | retval |= runOnFunction(F&: *F, AM); |
2206 | } |
2207 | |
2208 | // Account for cold calls not inlined.... |
2209 | if (!FunctionSamples::ProfileIsCS) |
2210 | for (const std::pair<Function *, NotInlinedProfileInfo> &pair : |
2211 | notInlinedCallInfo) |
2212 | updateProfileCallee(Callee: pair.first, EntryDelta: pair.second.entryCount); |
2213 | |
2214 | if (RemoveProbeAfterProfileAnnotation && FunctionSamples::ProfileIsProbeBased) |
2215 | removePseudoProbeInsts(M); |
2216 | |
2217 | return retval; |
2218 | } |
2219 | |
2220 | bool SampleProfileLoader::runOnFunction(Function &F, ModuleAnalysisManager *AM) { |
2221 | LLVM_DEBUG(dbgs() << "\n\nProcessing Function " << F.getName() << "\n" ); |
2222 | DILocation2SampleMap.clear(); |
2223 | // By default the entry count is initialized to -1, which will be treated |
2224 | // conservatively by getEntryCount as the same as unknown (None). This is |
2225 | // to avoid newly added code to be treated as cold. If we have samples |
2226 | // this will be overwritten in emitAnnotations. |
2227 | uint64_t initialEntryCount = -1; |
2228 | |
2229 | ProfAccForSymsInList = ProfileAccurateForSymsInList && PSL; |
2230 | if (ProfileSampleAccurate || F.hasFnAttribute(Kind: "profile-sample-accurate" )) { |
2231 | // initialize all the function entry counts to 0. It means all the |
2232 | // functions without profile will be regarded as cold. |
2233 | initialEntryCount = 0; |
2234 | // profile-sample-accurate is a user assertion which has a higher precedence |
2235 | // than symbol list. When profile-sample-accurate is on, ignore symbol list. |
2236 | ProfAccForSymsInList = false; |
2237 | } |
2238 | CoverageTracker.setProfAccForSymsInList(ProfAccForSymsInList); |
2239 | |
2240 | // PSL -- profile symbol list include all the symbols in sampled binary. |
2241 | // If ProfileAccurateForSymsInList is enabled, PSL is used to treat |
2242 | // old functions without samples being cold, without having to worry |
2243 | // about new and hot functions being mistakenly treated as cold. |
2244 | if (ProfAccForSymsInList) { |
2245 | // Initialize the entry count to 0 for functions in the list. |
2246 | if (PSL->contains(Name: F.getName())) |
2247 | initialEntryCount = 0; |
2248 | |
2249 | // Function in the symbol list but without sample will be regarded as |
2250 | // cold. To minimize the potential negative performance impact it could |
2251 | // have, we want to be a little conservative here saying if a function |
2252 | // shows up in the profile, no matter as outline function, inline instance |
2253 | // or call targets, treat the function as not being cold. This will handle |
2254 | // the cases such as most callsites of a function are inlined in sampled |
2255 | // binary but not inlined in current build (because of source code drift, |
2256 | // imprecise debug information, or the callsites are all cold individually |
2257 | // but not cold accumulatively...), so the outline function showing up as |
2258 | // cold in sampled binary will actually not be cold after current build. |
2259 | StringRef CanonName = FunctionSamples::getCanonicalFnName(F); |
2260 | if ((FunctionSamples::UseMD5 && |
2261 | GUIDsInProfile.count(V: Function::getGUID(GlobalName: CanonName))) || |
2262 | (!FunctionSamples::UseMD5 && NamesInProfile.count(Key: CanonName))) |
2263 | initialEntryCount = -1; |
2264 | } |
2265 | |
2266 | // Initialize entry count when the function has no existing entry |
2267 | // count value. |
2268 | if (!F.getEntryCount()) |
2269 | F.setEntryCount(Count: ProfileCount(initialEntryCount, Function::PCT_Real)); |
2270 | std::unique_ptr<OptimizationRemarkEmitter> OwnedORE; |
2271 | if (AM) { |
2272 | auto &FAM = |
2273 | AM->getResult<FunctionAnalysisManagerModuleProxy>(IR&: *F.getParent()) |
2274 | .getManager(); |
2275 | ORE = &FAM.getResult<OptimizationRemarkEmitterAnalysis>(IR&: F); |
2276 | } else { |
2277 | OwnedORE = std::make_unique<OptimizationRemarkEmitter>(args: &F); |
2278 | ORE = OwnedORE.get(); |
2279 | } |
2280 | |
2281 | if (FunctionSamples::ProfileIsCS) |
2282 | Samples = ContextTracker->getBaseSamplesFor(Func: F); |
2283 | else { |
2284 | Samples = Reader->getSamplesFor(F); |
2285 | // Try search in previously inlined functions that were split or duplicated |
2286 | // into base. |
2287 | if (!Samples) { |
2288 | StringRef CanonName = FunctionSamples::getCanonicalFnName(F); |
2289 | auto It = OutlineFunctionSamples.find(x: FunctionId(CanonName)); |
2290 | if (It != OutlineFunctionSamples.end()) { |
2291 | Samples = &It->second; |
2292 | } else if (auto Remapper = Reader->getRemapper()) { |
2293 | if (auto RemppedName = Remapper->lookUpNameInProfile(FunctionName: CanonName)) { |
2294 | It = OutlineFunctionSamples.find(x: FunctionId(*RemppedName)); |
2295 | if (It != OutlineFunctionSamples.end()) |
2296 | Samples = &It->second; |
2297 | } |
2298 | } |
2299 | } |
2300 | } |
2301 | |
2302 | if (Samples && !Samples->empty()) |
2303 | return emitAnnotations(F); |
2304 | return false; |
2305 | } |
2306 | SampleProfileLoaderPass::SampleProfileLoaderPass( |
2307 | std::string File, std::string RemappingFile, ThinOrFullLTOPhase LTOPhase, |
2308 | IntrusiveRefCntPtr<vfs::FileSystem> FS) |
2309 | : ProfileFileName(File), ProfileRemappingFileName(RemappingFile), |
2310 | LTOPhase(LTOPhase), FS(std::move(FS)) {} |
2311 | |
2312 | PreservedAnalyses SampleProfileLoaderPass::run(Module &M, |
2313 | ModuleAnalysisManager &AM) { |
2314 | FunctionAnalysisManager &FAM = |
2315 | AM.getResult<FunctionAnalysisManagerModuleProxy>(IR&: M).getManager(); |
2316 | |
2317 | auto GetAssumptionCache = [&](Function &F) -> AssumptionCache & { |
2318 | return FAM.getResult<AssumptionAnalysis>(IR&: F); |
2319 | }; |
2320 | auto GetTTI = [&](Function &F) -> TargetTransformInfo & { |
2321 | return FAM.getResult<TargetIRAnalysis>(IR&: F); |
2322 | }; |
2323 | auto GetTLI = [&](Function &F) -> const TargetLibraryInfo & { |
2324 | return FAM.getResult<TargetLibraryAnalysis>(IR&: F); |
2325 | }; |
2326 | |
2327 | if (!FS) |
2328 | FS = vfs::getRealFileSystem(); |
2329 | LazyCallGraph &CG = AM.getResult<LazyCallGraphAnalysis>(IR&: M); |
2330 | |
2331 | SampleProfileLoader SampleLoader( |
2332 | ProfileFileName.empty() ? SampleProfileFile : ProfileFileName, |
2333 | ProfileRemappingFileName.empty() ? SampleProfileRemappingFile |
2334 | : ProfileRemappingFileName, |
2335 | LTOPhase, FS, GetAssumptionCache, GetTTI, GetTLI, CG); |
2336 | if (!SampleLoader.doInitialization(M, FAM: &FAM)) |
2337 | return PreservedAnalyses::all(); |
2338 | |
2339 | ProfileSummaryInfo *PSI = &AM.getResult<ProfileSummaryAnalysis>(IR&: M); |
2340 | if (!SampleLoader.runOnModule(M, AM: &AM, PSI: PSI)) |
2341 | return PreservedAnalyses::all(); |
2342 | |
2343 | return PreservedAnalyses::none(); |
2344 | } |
2345 | |