//===-- CSPreInliner.cpp - Profile guided preinliner -------------- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "CSPreInliner.h"
#include "ProfiledBinary.h"
#include "llvm/ADT/SCCIterator.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/DebugInfo/Symbolize/SymbolizableModule.h"
#include "llvm/Transforms/IPO/SampleProfile.h"
#include <cstdint>
#include <queue>

#define DEBUG_TYPE "cs-preinliner"

using namespace llvm;
using namespace sampleprof;

STATISTIC(PreInlNumCSInlined,
          "Number of functions inlined with context sensitive profile");
STATISTIC(PreInlNumCSNotInlined,
          "Number of functions not inlined with context sensitive profile");
STATISTIC(PreInlNumCSInlinedHitMinLimit,
          "Number of functions with FDO inline stopped due to min size limit");
STATISTIC(PreInlNumCSInlinedHitMaxLimit,
          "Number of functions with FDO inline stopped due to max size limit");
STATISTIC(
    PreInlNumCSInlinedHitGrowthLimit,
    "Number of functions with FDO inline stopped due to growth size limit");

// These switches specify the inline thresholds used in SampleProfileLoader
// inlining.
// TODO: The actual thresholds need to be tuned here because the size here is
// based on machine code, not LLVM IR.
namespace llvm {
cl::opt<bool> EnableCSPreInliner(
    "csspgo-preinliner", cl::Hidden, cl::init(true),
    cl::desc("Run a global pre-inliner to merge context profile based on "
             "estimated global top-down inline decisions"));

cl::opt<bool> UseContextCostForPreInliner(
    "use-context-cost-for-preinliner", cl::Hidden, cl::init(true),
    cl::desc("Use context-sensitive byte size cost for preinliner decisions"));
} // namespace llvm

static cl::opt<bool> SamplePreInlineReplay(
    "csspgo-replay-preinline", cl::Hidden, cl::init(false),
    cl::desc(
        "Replay previous inlining and adjust context profile accordingly"));

static cl::opt<int> CSPreinlMultiplierForPrevInl(
    "csspgo-preinliner-multiplier-for-previous-inlining", cl::Hidden,
    cl::init(100),
    cl::desc(
        "Multiplier to bump up callsite threshold for previous inlining."));

CSPreInliner::CSPreInliner(SampleContextTracker &Tracker,
                           ProfiledBinary &Binary, ProfileSummary *Summary)
    : UseContextCost(UseContextCostForPreInliner),
      // TODO: Pass in a guid-to-name map in order for
      // ContextTracker.getFuncNameFor to work, if `Profiles` can have md5
      // codes as their profile context.
      ContextTracker(Tracker), Binary(Binary), Summary(Summary) {
  // Set the default preinliner hot/cold callsite thresholds, tuned with CSSPGO
  // for good performance with reasonable profile size.
  if (!SampleHotCallSiteThreshold.getNumOccurrences())
    SampleHotCallSiteThreshold = 1500;
  if (!SampleColdCallSiteThreshold.getNumOccurrences())
    SampleColdCallSiteThreshold = 0;
  if (!ProfileInlineLimitMax.getNumOccurrences())
    ProfileInlineLimitMax = 50000;
}

std::vector<FunctionId> CSPreInliner::buildTopDownOrder() {
  std::vector<FunctionId> Order;
  // Trim cold edges to get a more stable call graph. This allows for a more
  // stable top-down order, which in turn helps the stability of the generated
  // profile from run to run.
  uint64_t ColdCountThreshold = ProfileSummaryBuilder::getColdCountThreshold(
      (Summary->getDetailedSummary()));
  ProfiledCallGraph ProfiledCG(ContextTracker, ColdCountThreshold);

  // Now that we have a profiled call graph, construct the top-down order by
  // building up SCCs and reversing the SCC order. The SCC iterator visits
  // SCCs bottom-up (callees before callers), so reversing the collected order
  // at the end yields callers before callees.
  scc_iterator<ProfiledCallGraph *> I = scc_begin(&ProfiledCG);
  while (!I.isAtEnd()) {
    auto Range = *I;
    if (SortProfiledSCC) {
      // Sort nodes in one SCC based on callsite hotness.
      scc_member_iterator<ProfiledCallGraph *> SI(*I);
      Range = *SI;
    }
    for (auto *Node : Range) {
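      // Skip the artificial entry node that roots the profiled call graph; it
      // does not correspond to a real function.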
      if (Node != ProfiledCG.getEntryNode())
        Order.push_back(Node->Name);
    }
    ++I;
  }
  std::reverse(Order.begin(), Order.end());

  return Order;
}

bool CSPreInliner::getInlineCandidates(ProfiledCandidateQueue &CQueue,
                                       const FunctionSamples *CallerSamples) {
  assert(CallerSamples && "Expect non-null caller samples");

  // Ideally we want to consider everything a function calls, but as far as the
  // context profile is concerned, only those frames that are children of the
  // current one in the trie are relevant. So we walk the trie instead of the
  // call targets from the function profile.
  ContextTrieNode *CallerNode =
      ContextTracker.getContextNodeForProfile(CallerSamples);

  bool HasNewCandidate = false;
  for (auto &Child : CallerNode->getAllChildContext()) {
    ContextTrieNode *CalleeNode = &Child.second;
    FunctionSamples *CalleeSamples = CalleeNode->getFunctionSamples();
    if (!CalleeSamples)
      continue;

    // Call site count is more reliable, so we look up the corresponding call
    // target profile in caller's context profile to retrieve call site count.
    uint64_t CalleeEntryCount = CalleeSamples->getHeadSamplesEstimate();
    uint64_t CallsiteCount = 0;
    LineLocation Callsite = CalleeNode->getCallSiteLoc();
    if (auto CallTargets = CallerSamples->findCallTargetMapAt(Callsite)) {
      auto It = CallTargets->find(CalleeSamples->getFunction());
      if (It != CallTargets->end())
        CallsiteCount = It->second;
    }

    // TODO: call site and callee entry count should be mostly consistent, add
    // check for that.
    HasNewCandidate = true;
    uint32_t CalleeSize = getFuncSize(CalleeNode);
    CQueue.emplace(CalleeSamples, std::max(CallsiteCount, CalleeEntryCount),
                   CalleeSize);
  }

  return HasNewCandidate;
}

uint32_t CSPreInliner::getFuncSize(const ContextTrieNode *ContextNode) {
  if (UseContextCost)
    return Binary.getFuncSizeForContext(ContextNode);

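  // Without a context-sensitive byte size cost, fall back to the number of
  // profiled body sample locations as a rough proxy for function size.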
  return ContextNode->getFunctionSamples()->getBodySamples().size();
}

bool CSPreInliner::shouldInline(ProfiledInlineCandidate &Candidate) {
  bool WasInlined =
      Candidate.CalleeSamples->getContext().hasAttribute(ContextWasInlined);
  // If replay inline is requested, simply follow the inline decision of the
  // profiled binary.
  if (SamplePreInlineReplay)
    return WasInlined;

  unsigned int SampleThreshold = SampleColdCallSiteThreshold;
  uint64_t ColdCountThreshold = ProfileSummaryBuilder::getColdCountThreshold(
      (Summary->getDetailedSummary()));

  if (Candidate.CallsiteCount <= ColdCountThreshold)
    SampleThreshold = SampleColdCallSiteThreshold;
  else {
    // Linearly adjust the threshold based on normalized hotness, i.e., a value
    // in [0,1]. Use the 10% cutoff instead of the max count as the
    // normalization upper bound for stability.
    double NormalizationUpperBound =
        ProfileSummaryBuilder::getEntryForPercentile(
            Summary->getDetailedSummary(), 100000 /* 10% */)
            .MinCount;
    double NormalizationLowerBound = ColdCountThreshold;
    double NormalizedHotness =
        (Candidate.CallsiteCount - NormalizationLowerBound) /
        (NormalizationUpperBound - NormalizationLowerBound);
    if (NormalizedHotness > 1.0)
      NormalizedHotness = 1.0;
    // Add 1 to ensure hot callsites get a non-zero threshold, which could
    // happen when SampleColdCallSiteThreshold is 0. This is when we do not
    // want any inlining for cold callsites.
    SampleThreshold = SampleHotCallSiteThreshold * NormalizedHotness * 100 +
                      SampleColdCallSiteThreshold + 1;
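    // For example, with the defaults set in the constructor
    // (SampleHotCallSiteThreshold = 1500, SampleColdCallSiteThreshold = 0),
    // a callsite at or above the normalization upper bound gets a threshold
    // of 1500 * 1.0 * 100 + 0 + 1 = 150001, while a callsite just above the
    // cold count threshold gets a threshold close to 1.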
    // Bump up the threshold to favor the previous compiler inline decision.
    // The compiler has more insight and knowledge about functions based on
    // their IR and attributes and should be able to make a more reasonable
    // inline decision.
    if (WasInlined)
      SampleThreshold *= CSPreinlMultiplierForPrevInl;
  }

  return (Candidate.SizeCost < SampleThreshold);
}

void CSPreInliner::processFunction(const FunctionId Name) {
  FunctionSamples *FSamples = ContextTracker.getBaseSamplesFor(Name);
  if (!FSamples)
    return;

  unsigned FuncSize =
      getFuncSize(ContextTracker.getContextNodeForProfile(FSamples));
  unsigned FuncFinalSize = FuncSize;
  unsigned SizeLimit = FuncSize * ProfileInlineGrowthLimit;
  SizeLimit = std::min(SizeLimit, (unsigned)ProfileInlineLimitMax);
  SizeLimit = std::max(SizeLimit, (unsigned)ProfileInlineLimitMin);
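  // SizeLimit now holds the total post-inlining size budget for this function:
  // FuncSize scaled by ProfileInlineGrowthLimit, clamped into
  // [ProfileInlineLimitMin, ProfileInlineLimitMax].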

  LLVM_DEBUG(dbgs() << "Process " << Name
                    << " for context-sensitive pre-inlining (pre-inline size: "
                    << FuncSize << ", size limit: " << SizeLimit << ")\n");

  ProfiledCandidateQueue CQueue;
  getInlineCandidates(CQueue, FSamples);

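  // Pop candidates in hotness order; each successful inline exposes the
  // callee's own callsites as new candidates, so inlining proceeds
  // transitively until the size budget is exhausted or the queue is empty.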
  while (!CQueue.empty() && FuncFinalSize < SizeLimit) {
    ProfiledInlineCandidate Candidate = CQueue.top();
    CQueue.pop();
    bool ShouldInline = false;
    if ((ShouldInline = shouldInline(Candidate))) {
      // We mark the context as inlined since the corresponding context profile
      // won't be merged into that function's base profile.
      ++PreInlNumCSInlined;
      ContextTracker.markContextSamplesInlined(Candidate.CalleeSamples);
      Candidate.CalleeSamples->getContext().setAttribute(
          ContextShouldBeInlined);
      FuncFinalSize += Candidate.SizeCost;
      getInlineCandidates(CQueue, Candidate.CalleeSamples);
    } else {
      ++PreInlNumCSNotInlined;
    }
    LLVM_DEBUG(
        dbgs() << (ShouldInline ? " Inlined" : " Outlined")
               << " context profile for: "
               << ContextTracker.getContextString(*Candidate.CalleeSamples)
               << " (callee size: " << Candidate.SizeCost
               << ", call count:" << Candidate.CallsiteCount << ")\n");
  }

  if (!CQueue.empty()) {
    if (SizeLimit == (unsigned)ProfileInlineLimitMax)
      ++PreInlNumCSInlinedHitMaxLimit;
    else if (SizeLimit == (unsigned)ProfileInlineLimitMin)
      ++PreInlNumCSInlinedHitMinLimit;
    else
      ++PreInlNumCSInlinedHitGrowthLimit;
  }

  LLVM_DEBUG({
    if (!CQueue.empty())
      dbgs() << " Inline candidates ignored due to size limit (inliner "
                "original size: "
             << FuncSize << ", inliner final size: " << FuncFinalSize
             << ", size limit: " << SizeLimit << ")\n";

    while (!CQueue.empty()) {
      ProfiledInlineCandidate Candidate = CQueue.top();
      CQueue.pop();
      bool WasInlined =
          Candidate.CalleeSamples->getContext().hasAttribute(ContextWasInlined);
      dbgs() << " "
             << ContextTracker.getContextString(*Candidate.CalleeSamples)
             << " (candidate size:" << Candidate.SizeCost
             << ", call count: " << Candidate.CallsiteCount << ", previously "
             << (WasInlined ? "inlined)\n" : "not inlined)\n");
    }
  });
}

void CSPreInliner::run() {
#ifndef NDEBUG
  auto printProfileNames = [](SampleContextTracker &ContextTracker,
                              bool IsInput) {
    uint32_t Size = 0;
    for (auto *Node : ContextTracker) {
      FunctionSamples *FSamples = Node->getFunctionSamples();
      if (FSamples) {
        Size++;
        dbgs() << " [" << ContextTracker.getContextString(Node) << "] "
               << FSamples->getTotalSamples() << ":"
               << FSamples->getHeadSamples() << "\n";
      }
    }
    dbgs() << (IsInput ? "Input" : "Output") << " context-sensitive profiles ("
           << Size << " total):\n";
  };
#endif

  LLVM_DEBUG(printProfileNames(ContextTracker, true));

  // Execute the global pre-inliner to estimate a global top-down inline
  // decision and merge profiles accordingly. This helps with profile merging
  // for ThinLTO, where we otherwise would not be able to merge profiles back
  // into the base profile across module/thin-backend boundaries. It also
  // helps better compress the context profile to control profile size, as we
  // now only need context profiles for functions that are going to be inlined.
  for (FunctionId FuncName : buildTopDownOrder()) {
    processFunction(FuncName);
  }

  // Context profiles that are not inlined are merged into their base
  // profiles, so we can trim such profiles from the output.
  for (auto *Node : ContextTracker) {
    FunctionSamples *FProfile = Node->getFunctionSamples();
    if (FProfile &&
        (Node->getParentContext() != &ContextTracker.getRootContext() &&
         !FProfile->getContext().hasState(InlinedContext))) {
      Node->setFunctionSamples(nullptr);
    }
  }
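  // Mark the profile as pre-inlined: the remaining context profiles now
  // reflect the estimated inline decisions made above.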
  FunctionSamples::ProfileIsPreInlined = true;

  LLVM_DEBUG(printProfileNames(ContextTracker, false));
}