//===-- CSPreInliner.cpp - Profile guided preinliner -------------- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "CSPreInliner.h"
#include "ProfiledBinary.h"
#include "llvm/ADT/SCCIterator.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/DebugInfo/Symbolize/SymbolizableModule.h"
#include "llvm/Transforms/IPO/SampleProfile.h"
#include <cstdint>
#include <queue>

#define DEBUG_TYPE "cs-preinliner"

using namespace llvm;
using namespace sampleprof;

STATISTIC(PreInlNumCSInlined,
          "Number of functions inlined with context sensitive profile");
STATISTIC(PreInlNumCSNotInlined,
          "Number of functions not inlined with context sensitive profile");
STATISTIC(PreInlNumCSInlinedHitMinLimit,
          "Number of functions with FDO inline stopped due to min size limit");
STATISTIC(PreInlNumCSInlinedHitMaxLimit,
          "Number of functions with FDO inline stopped due to max size limit");
STATISTIC(
    PreInlNumCSInlinedHitGrowthLimit,
    "Number of functions with FDO inline stopped due to growth size limit");
// The switches specify inline thresholds used in SampleProfileLoader inlining.
// TODO: The actual thresholds still need to be tuned, because the size used
// here is based on machine code, not LLVM IR.
namespace llvm {
cl::opt<bool> EnableCSPreInliner(
    "csspgo-preinliner", cl::Hidden, cl::init(true),
    cl::desc("Run a global pre-inliner to merge context profile based on "
             "estimated global top-down inline decisions"));

cl::opt<bool> UseContextCostForPreInliner(
    "use-context-cost-for-preinliner", cl::Hidden, cl::init(true),
    cl::desc("Use context-sensitive byte size cost for preinliner decisions"));
} // namespace llvm

static cl::opt<bool> SamplePreInlineReplay(
    "csspgo-replay-preinline", cl::Hidden, cl::init(false),
    cl::desc(
        "Replay previous inlining and adjust context profile accordingly"));

static cl::opt<int> CSPreinlMultiplierForPrevInl(
    "csspgo-preinliner-multiplier-for-previous-inlining", cl::Hidden,
    cl::init(100),
    cl::desc(
        "Multiplier to bump up callsite threshold for previous inlining."));

CSPreInliner::CSPreInliner(SampleContextTracker &Tracker,
                           ProfiledBinary &Binary, ProfileSummary *Summary)
    : UseContextCost(UseContextCostForPreInliner),
      // TODO: Pass in a GUID-to-name map in order for
      // ContextTracker.getFuncNameFor to work, if `Profiles` can have MD5
      // codes as their profile context.
      ContextTracker(Tracker), Binary(Binary), Summary(Summary) {
  // Set default preinliner hot/cold call site thresholds tuned with CSSPGO
  // for good performance with reasonable profile size.
  if (!SampleHotCallSiteThreshold.getNumOccurrences())
    SampleHotCallSiteThreshold = 1500;
  if (!SampleColdCallSiteThreshold.getNumOccurrences())
    SampleColdCallSiteThreshold = 0;
  if (!ProfileInlineLimitMax.getNumOccurrences())
    ProfileInlineLimitMax = 50000;
}

std::vector<FunctionId> CSPreInliner::buildTopDownOrder() {
  std::vector<FunctionId> Order;
  // Trim cold edges to get a more stable call graph. This allows for a more
  // stable top-down order, which in turn helps the stability of the generated
  // profile from run to run.
  uint64_t ColdCountThreshold = ProfileSummaryBuilder::getColdCountThreshold(
      (Summary->getDetailedSummary()));
  ProfiledCallGraph ProfiledCG(ContextTracker, ColdCountThreshold);

  // Now that we have a profiled call graph, construct top-down order
  // by building up SCCs and reversing the SCC order.
  scc_iterator<ProfiledCallGraph *> I = scc_begin(&ProfiledCG);
  while (!I.isAtEnd()) {
    auto Range = *I;
    if (SortProfiledSCC) {
      // Sort nodes in one SCC based on callsite hotness.
      scc_member_iterator<ProfiledCallGraph *> SI(*I);
      Range = *SI;
    }
    for (auto *Node : Range) {
      if (Node != ProfiledCG.getEntryNode())
        Order.push_back(Node->Name);
    }
    ++I;
  }
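  // scc_iterator visits SCCs in reverse topological order of the SCC DAG,
  // i.e. callees before callers, so reversing the collected order yields the
  // desired top-down (caller-before-callee) traversal.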
  std::reverse(Order.begin(), Order.end());

  return Order;
}

bool CSPreInliner::getInlineCandidates(ProfiledCandidateQueue &CQueue,
                                       const FunctionSamples *CallerSamples) {
  assert(CallerSamples && "Expect non-null caller samples");

  // Ideally we want to consider everything a function calls, but as far as
  // context profile is concerned, only those frames that are children of the
  // current one in the trie are relevant. So we walk the trie instead of the
  // call targets from the function profile.
  ContextTrieNode *CallerNode =
      ContextTracker.getContextNodeForProfile(CallerSamples);

  bool HasNewCandidate = false;
  for (auto &Child : CallerNode->getAllChildContext()) {
    ContextTrieNode *CalleeNode = &Child.second;
    FunctionSamples *CalleeSamples = CalleeNode->getFunctionSamples();
    if (!CalleeSamples)
      continue;

    // Call site count is more reliable, so we look up the corresponding call
    // target profile in the caller's context profile to retrieve the call
    // site count.
    uint64_t CalleeEntryCount = CalleeSamples->getHeadSamplesEstimate();
    uint64_t CallsiteCount = 0;
    LineLocation Callsite = CalleeNode->getCallSiteLoc();
    if (auto CallTargets = CallerSamples->findCallTargetMapAt(Callsite)) {
      auto It = CallTargets->find(CalleeSamples->getFunction());
      if (It != CallTargets->end())
        CallsiteCount = It->second;
    }

    // TODO: Call site count and callee entry count should be mostly
    // consistent; add a check for that.
    HasNewCandidate = true;
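    // Use the larger of the call site count and the callee entry count as the
    // candidate's hotness estimate.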
    uint32_t CalleeSize = getFuncSize(CalleeNode);
    CQueue.emplace(CalleeSamples, std::max(CallsiteCount, CalleeEntryCount),
                   CalleeSize);
  }

  return HasNewCandidate;
}

uint32_t CSPreInliner::getFuncSize(const ContextTrieNode *ContextNode) {
  if (UseContextCost)
    return Binary.getFuncSizeForContext(ContextNode);

  return ContextNode->getFunctionSamples()->getBodySamples().size();
}

bool CSPreInliner::shouldInline(ProfiledInlineCandidate &Candidate) {
  bool WasInlined =
      Candidate.CalleeSamples->getContext().hasAttribute(ContextWasInlined);
  // If replay inline is requested, simply follow the inline decision of the
  // profiled binary.
  if (SamplePreInlineReplay)
    return WasInlined;

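  // The size threshold starts at the cold call site threshold; hotter call
  // sites get a larger threshold, scaled linearly with their normalized
  // hotness below.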
  unsigned int SampleThreshold = SampleColdCallSiteThreshold;
  uint64_t ColdCountThreshold = ProfileSummaryBuilder::getColdCountThreshold(
      (Summary->getDetailedSummary()));

  if (Candidate.CallsiteCount <= ColdCountThreshold)
    SampleThreshold = SampleColdCallSiteThreshold;
  else {
    // Linearly adjust threshold based on normalized hotness, i.e., a value in
    // [0,1]. Use the 10% cutoff instead of the max count as the normalization
    // upper bound for stability.
    double NormalizationUpperBound =
        ProfileSummaryBuilder::getEntryForPercentile(
            Summary->getDetailedSummary(), 100000 /* 10% */)
            .MinCount;
    double NormalizationLowerBound = ColdCountThreshold;
    double NormalizedHotness =
        (Candidate.CallsiteCount - NormalizationLowerBound) /
        (NormalizationUpperBound - NormalizationLowerBound);
    if (NormalizedHotness > 1.0)
      NormalizedHotness = 1.0;
    // Add 1 to ensure hot callsites get a non-zero threshold, which could
    // otherwise happen when SampleColdCallSiteThreshold is 0. That is the
    // setting used when we do not want any inlining for cold callsites.
    SampleThreshold = SampleHotCallSiteThreshold * NormalizedHotness * 100 +
                      SampleColdCallSiteThreshold + 1;
    // Bump up the threshold to favor the previous compiler inline decision.
    // The compiler has more insight and knowledge about functions based on
    // their IR and attributes and should be able to make a more reasonable
    // inline decision.
    if (WasInlined)
      SampleThreshold *= CSPreinlMultiplierForPrevInl;
  }

  return (Candidate.SizeCost < SampleThreshold);
}

void CSPreInliner::processFunction(const FunctionId Name) {
  FunctionSamples *FSamples = ContextTracker.getBaseSamplesFor(Name);
  if (!FSamples)
    return;

  unsigned FuncSize =
      getFuncSize(ContextTracker.getContextNodeForProfile(FSamples));
  unsigned FuncFinalSize = FuncSize;
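  // The inlining budget: the function's pre-inline size scaled by the growth
  // limit, then clamped into [ProfileInlineLimitMin, ProfileInlineLimitMax].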
  unsigned SizeLimit = FuncSize * ProfileInlineGrowthLimit;
  SizeLimit = std::min(SizeLimit, (unsigned)ProfileInlineLimitMax);
  SizeLimit = std::max(SizeLimit, (unsigned)ProfileInlineLimitMin);

  LLVM_DEBUG(dbgs() << "Process " << Name
                    << " for context-sensitive pre-inlining (pre-inline size: "
                    << FuncSize << ", size limit: " << SizeLimit << ")\n");

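  // Greedily evaluate candidates from the priority queue until it is drained
  // or the accumulated post-inline size reaches the limit.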
  ProfiledCandidateQueue CQueue;
  getInlineCandidates(CQueue, FSamples);

  while (!CQueue.empty() && FuncFinalSize < SizeLimit) {
    ProfiledInlineCandidate Candidate = CQueue.top();
    CQueue.pop();
    bool ShouldInline = false;
    if ((ShouldInline = shouldInline(Candidate))) {
      // Mark the context as inlined since the corresponding context profile
      // won't be merged into that function's base profile.
      ++PreInlNumCSInlined;
      ContextTracker.markContextSamplesInlined(Candidate.CalleeSamples);
      Candidate.CalleeSamples->getContext().setAttribute(
          ContextShouldBeInlined);
      FuncFinalSize += Candidate.SizeCost;
      getInlineCandidates(CQueue, Candidate.CalleeSamples);
    } else {
      ++PreInlNumCSNotInlined;
    }
    LLVM_DEBUG(
        dbgs() << (ShouldInline ? " Inlined" : " Outlined")
               << " context profile for: "
               << ContextTracker.getContextString(*Candidate.CalleeSamples)
               << " (callee size: " << Candidate.SizeCost
               << ", call count:" << Candidate.CallsiteCount << ")\n");
  }

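  // If the queue is not empty, inlining stopped early; attribute the stop to
  // whichever size limit was binding.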
  if (!CQueue.empty()) {
    if (SizeLimit == (unsigned)ProfileInlineLimitMax)
      ++PreInlNumCSInlinedHitMaxLimit;
    else if (SizeLimit == (unsigned)ProfileInlineLimitMin)
      ++PreInlNumCSInlinedHitMinLimit;
    else
      ++PreInlNumCSInlinedHitGrowthLimit;
  }

  LLVM_DEBUG({
    if (!CQueue.empty())
      dbgs() << " Inline candidates ignored due to size limit (inliner "
                "original size: "
             << FuncSize << ", inliner final size: " << FuncFinalSize
             << ", size limit: " << SizeLimit << ")\n";

    while (!CQueue.empty()) {
      ProfiledInlineCandidate Candidate = CQueue.top();
      CQueue.pop();
      bool WasInlined =
          Candidate.CalleeSamples->getContext().hasAttribute(ContextWasInlined);
      dbgs() << " "
             << ContextTracker.getContextString(*Candidate.CalleeSamples)
             << " (candidate size:" << Candidate.SizeCost
             << ", call count: " << Candidate.CallsiteCount << ", previously "
             << (WasInlined ? "inlined)\n" : "not inlined)\n");
    }
  });
}

void CSPreInliner::run() {
#ifndef NDEBUG
  auto printProfileNames = [](SampleContextTracker &ContextTracker,
                              bool IsInput) {
    uint32_t Size = 0;
    for (auto *Node : ContextTracker) {
      FunctionSamples *FSamples = Node->getFunctionSamples();
      if (FSamples) {
        Size++;
        dbgs() << " [" << ContextTracker.getContextString(Node) << "] "
               << FSamples->getTotalSamples() << ":"
               << FSamples->getHeadSamples() << "\n";
      }
    }
    dbgs() << (IsInput ? "Input" : "Output") << " context-sensitive profiles ("
           << Size << " total):\n";
  };
#endif

  LLVM_DEBUG(printProfileNames(ContextTracker, true));

  // Execute the global pre-inliner to estimate a global top-down inline
  // decision and merge profiles accordingly. This helps with profile merging
  // for ThinLTO; otherwise we won't be able to merge profiles back into the
  // base profile across module/thin-backend boundaries. It also helps
  // compress the context profile to control profile size, as we now only
  // need context profiles for functions that are going to be inlined.
  for (FunctionId FuncName : buildTopDownOrder()) {
    processFunction(FuncName);
  }

  // Context profiles that were not inlined are merged into their base
  // profiles, so we can trim such context profiles from the output.
  for (auto *Node : ContextTracker) {
    FunctionSamples *FProfile = Node->getFunctionSamples();
    if (FProfile &&
        (Node->getParentContext() != &ContextTracker.getRootContext() &&
         !FProfile->getContext().hasState(InlinedContext))) {
      Node->setFunctionSamples(nullptr);
    }
  }
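  // Record that the preinliner has been applied to this profile.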
  FunctionSamples::ProfileIsPreInlined = true;

  LLVM_DEBUG(printProfileNames(ContextTracker, false));
}