1 | //===- InlineCost.cpp - Cost analysis for inliner -------------------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This file implements inline cost analysis. |
10 | // |
11 | //===----------------------------------------------------------------------===// |
12 | |
13 | #include "llvm/Analysis/InlineCost.h" |
14 | #include "llvm/ADT/STLExtras.h" |
15 | #include "llvm/ADT/SetVector.h" |
16 | #include "llvm/ADT/SmallPtrSet.h" |
17 | #include "llvm/ADT/SmallVector.h" |
18 | #include "llvm/ADT/Statistic.h" |
19 | #include "llvm/Analysis/AssumptionCache.h" |
20 | #include "llvm/Analysis/BlockFrequencyInfo.h" |
21 | #include "llvm/Analysis/CodeMetrics.h" |
22 | #include "llvm/Analysis/ConstantFolding.h" |
23 | #include "llvm/Analysis/InstructionSimplify.h" |
24 | #include "llvm/Analysis/LoopInfo.h" |
25 | #include "llvm/Analysis/MemoryBuiltins.h" |
26 | #include "llvm/Analysis/OptimizationRemarkEmitter.h" |
27 | #include "llvm/Analysis/ProfileSummaryInfo.h" |
28 | #include "llvm/Analysis/TargetLibraryInfo.h" |
29 | #include "llvm/Analysis/TargetTransformInfo.h" |
30 | #include "llvm/Analysis/ValueTracking.h" |
31 | #include "llvm/Config/llvm-config.h" |
32 | #include "llvm/IR/AssemblyAnnotationWriter.h" |
33 | #include "llvm/IR/CallingConv.h" |
34 | #include "llvm/IR/DataLayout.h" |
35 | #include "llvm/IR/Dominators.h" |
36 | #include "llvm/IR/GetElementPtrTypeIterator.h" |
37 | #include "llvm/IR/GlobalAlias.h" |
38 | #include "llvm/IR/InstVisitor.h" |
39 | #include "llvm/IR/IntrinsicInst.h" |
40 | #include "llvm/IR/Operator.h" |
41 | #include "llvm/IR/PatternMatch.h" |
42 | #include "llvm/Support/CommandLine.h" |
43 | #include "llvm/Support/Debug.h" |
44 | #include "llvm/Support/FormattedStream.h" |
45 | #include "llvm/Support/raw_ostream.h" |
46 | #include <climits> |
47 | #include <limits> |
48 | #include <optional> |
49 | |
50 | using namespace llvm; |
51 | |
52 | #define DEBUG_TYPE "inline-cost" |
53 | |
STATISTIC(NumCallsAnalyzed, "Number of call sites analyzed");
55 | |
static cl::opt<int>
    DefaultThreshold("inlinedefault-threshold", cl::Hidden, cl::init(225),
                     cl::desc("Default amount of inlining to perform"));
59 | |
60 | // We introduce this option since there is a minor compile-time win by avoiding |
61 | // addition of TTI attributes (target-features in particular) to inline |
62 | // candidates when they are guaranteed to be the same as top level methods in |
63 | // some use cases. If we avoid adding the attribute, we need an option to avoid |
64 | // checking these attributes. |
static cl::opt<bool> IgnoreTTIInlineCompatible(
    "ignore-tti-inline-compatible", cl::Hidden, cl::init(false),
    cl::desc("Ignore TTI attributes compatibility check between callee/caller "
             "during inline cost calculation"));
69 | |
static cl::opt<bool> PrintInstructionComments(
    "print-instruction-comments", cl::Hidden, cl::init(false),
    cl::desc("Prints comments for instruction based on inline cost analysis"));
73 | |
static cl::opt<int> InlineThreshold(
    "inline-threshold", cl::Hidden, cl::init(225),
    cl::desc("Control the amount of inlining to perform (default = 225)"));
77 | |
static cl::opt<int> HintThreshold(
    "inlinehint-threshold", cl::Hidden, cl::init(325),
    cl::desc("Threshold for inlining functions with inline hint"));
81 | |
static cl::opt<int>
    ColdCallSiteThreshold("inline-cold-callsite-threshold", cl::Hidden,
                          cl::init(45),
                          cl::desc("Threshold for inlining cold callsites"));
86 | |
static cl::opt<bool> InlineEnableCostBenefitAnalysis(
    "inline-enable-cost-benefit-analysis", cl::Hidden, cl::init(false),
    cl::desc("Enable the cost-benefit analysis for the inliner"));
90 | |
91 | // InlineSavingsMultiplier overrides per TTI multipliers iff it is |
92 | // specified explicitly in command line options. This option is exposed |
93 | // for tuning and testing. |
static cl::opt<int> InlineSavingsMultiplier(
    "inline-savings-multiplier", cl::Hidden, cl::init(8),
    cl::desc("Multiplier to multiply cycle savings by during inlining"));
97 | |
98 | // InlineSavingsProfitableMultiplier overrides per TTI multipliers iff it is |
99 | // specified explicitly in command line options. This option is exposed |
100 | // for tuning and testing. |
static cl::opt<int> InlineSavingsProfitableMultiplier(
    "inline-savings-profitable-multiplier", cl::Hidden, cl::init(4),
    cl::desc("A multiplier on top of cycle savings to decide whether the "
             "savings won't justify the cost"));
105 | |
static cl::opt<int>
    InlineSizeAllowance("inline-size-allowance", cl::Hidden, cl::init(100),
                        cl::desc("The maximum size of a callee that gets "
                                 "inlined without sufficient cycle savings"));
110 | |
111 | // We introduce this threshold to help performance of instrumentation based |
112 | // PGO before we actually hook up inliner with analysis passes such as BPI and |
113 | // BFI. |
static cl::opt<int> ColdThreshold(
    "inlinecold-threshold", cl::Hidden, cl::init(45),
    cl::desc("Threshold for inlining functions with cold attribute"));
117 | |
static cl::opt<int>
    HotCallSiteThreshold("hot-callsite-threshold", cl::Hidden, cl::init(3000),
                         cl::desc("Threshold for hot callsites"));
121 | |
static cl::opt<int> LocallyHotCallSiteThreshold(
    "locally-hot-callsite-threshold", cl::Hidden, cl::init(525),
    cl::desc("Threshold for locally hot callsites"));
125 | |
static cl::opt<int> ColdCallSiteRelFreq(
    "cold-callsite-rel-freq", cl::Hidden, cl::init(2),
    cl::desc("Maximum block frequency, expressed as a percentage of caller's "
             "entry frequency, for a callsite to be cold in the absence of "
             "profile information."));
131 | |
static cl::opt<uint64_t> HotCallSiteRelFreq(
    "hot-callsite-rel-freq", cl::Hidden, cl::init(60),
    cl::desc("Minimum block frequency, expressed as a multiple of caller's "
             "entry frequency, for a callsite to be hot in the absence of "
             "profile information."));
137 | |
static cl::opt<int>
    InstrCost("inline-instr-cost", cl::Hidden, cl::init(5),
              cl::desc("Cost of a single instruction when inlining"));
141 | |
static cl::opt<int>
    MemAccessCost("inline-memaccess-cost", cl::Hidden, cl::init(0),
                  cl::desc("Cost of load/store instruction when inlining"));
145 | |
static cl::opt<int> CallPenalty(
    "inline-call-penalty", cl::Hidden, cl::init(25),
    cl::desc("Call penalty that is applied per callsite when inlining"));
149 | |
static cl::opt<size_t>
    StackSizeThreshold("inline-max-stacksize", cl::Hidden,
                       cl::init(std::numeric_limits<size_t>::max()),
                       cl::desc("Do not inline functions with a stack size "
                                "that exceeds the specified limit"));
155 | |
static cl::opt<size_t> RecurStackSizeThreshold(
    "recursive-inline-max-stacksize", cl::Hidden,
    cl::init(InlineConstants::TotalAllocaSizeRecursiveCaller),
    cl::desc("Do not inline recursive functions with a stack "
             "size that exceeds the specified limit"));
161 | |
static cl::opt<bool> OptComputeFullInlineCost(
    "inline-cost-full", cl::Hidden,
    cl::desc("Compute the full inline cost of a call site even when the cost "
             "exceeds the threshold."));
166 | |
static cl::opt<bool> InlineCallerSupersetNoBuiltin(
    "inline-caller-superset-nobuiltin", cl::Hidden, cl::init(true),
    cl::desc("Allow inlining when caller has a superset of callee's nobuiltin "
             "attributes."));
171 | |
static cl::opt<bool> DisableGEPConstOperand(
    "disable-gep-const-evaluation", cl::Hidden, cl::init(false),
    cl::desc("Disables evaluation of GetElementPtr with constant operands"));
175 | |
176 | namespace llvm { |
177 | std::optional<int> getStringFnAttrAsInt(const Attribute &Attr) { |
178 | if (Attr.isValid()) { |
179 | int AttrValue = 0; |
    if (!Attr.getValueAsString().getAsInteger(10, AttrValue))
181 | return AttrValue; |
182 | } |
183 | return std::nullopt; |
184 | } |
185 | |
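// Convenience overloads: look the string attribute up on a call site or a
// function, then parse it as an integer as above.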
186 | std::optional<int> getStringFnAttrAsInt(CallBase &CB, StringRef AttrKind) { |
  return getStringFnAttrAsInt(CB.getFnAttr(AttrKind));
188 | } |
189 | |
190 | std::optional<int> getStringFnAttrAsInt(Function *F, StringRef AttrKind) { |
  return getStringFnAttrAsInt(F->getFnAttribute(AttrKind));
192 | } |
193 | |
194 | namespace InlineConstants { |
195 | int getInstrCost() { return InstrCost; } |
196 | |
197 | } // namespace InlineConstants |
198 | |
199 | } // namespace llvm |
200 | |
201 | namespace { |
202 | class InlineCostCallAnalyzer; |
203 | |
// This struct is used to store information about the inline cost of a
// particular instruction.
206 | struct InstructionCostDetail { |
207 | int CostBefore = 0; |
208 | int CostAfter = 0; |
209 | int ThresholdBefore = 0; |
210 | int ThresholdAfter = 0; |
211 | |
212 | int getThresholdDelta() const { return ThresholdAfter - ThresholdBefore; } |
213 | |
214 | int getCostDelta() const { return CostAfter - CostBefore; } |
215 | |
216 | bool hasThresholdChanged() const { return ThresholdAfter != ThresholdBefore; } |
217 | }; |
218 | |
219 | class InlineCostAnnotationWriter : public AssemblyAnnotationWriter { |
220 | private: |
221 | InlineCostCallAnalyzer *const ICCA; |
222 | |
223 | public: |
224 | InlineCostAnnotationWriter(InlineCostCallAnalyzer *ICCA) : ICCA(ICCA) {} |
225 | void emitInstructionAnnot(const Instruction *I, |
226 | formatted_raw_ostream &OS) override; |
227 | }; |
228 | |
229 | /// Carry out call site analysis, in order to evaluate inlinability. |
230 | /// NOTE: the type is currently used as implementation detail of functions such |
231 | /// as llvm::getInlineCost. Note the function_ref constructor parameters - the |
232 | /// expectation is that they come from the outer scope, from the wrapper |
233 | /// functions. If we want to support constructing CallAnalyzer objects where |
234 | /// lambdas are provided inline at construction, or where the object needs to |
235 | /// otherwise survive past the scope of the provided functions, we need to |
236 | /// revisit the argument types. |
237 | class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> { |
238 | typedef InstVisitor<CallAnalyzer, bool> Base; |
239 | friend class InstVisitor<CallAnalyzer, bool>; |
240 | |
241 | protected: |
242 | virtual ~CallAnalyzer() = default; |
243 | /// The TargetTransformInfo available for this compilation. |
244 | const TargetTransformInfo &TTI; |
245 | |
246 | /// Getter for the cache of @llvm.assume intrinsics. |
247 | function_ref<AssumptionCache &(Function &)> GetAssumptionCache; |
248 | |
249 | /// Getter for BlockFrequencyInfo |
250 | function_ref<BlockFrequencyInfo &(Function &)> GetBFI; |
251 | |
252 | /// Profile summary information. |
253 | ProfileSummaryInfo *PSI; |
254 | |
255 | /// The called function. |
256 | Function &F; |
257 | |
258 | // Cache the DataLayout since we use it a lot. |
259 | const DataLayout &DL; |
260 | |
261 | /// The OptimizationRemarkEmitter available for this compilation. |
262 | OptimizationRemarkEmitter *ORE; |
263 | |
264 | /// The candidate callsite being analyzed. Please do not use this to do |
265 | /// analysis in the caller function; we want the inline cost query to be |
266 | /// easily cacheable. Instead, use the cover function paramHasAttr. |
267 | CallBase &CandidateCall; |
268 | |
269 | /// Extension points for handling callsite features. |
270 | // Called before a basic block was analyzed. |
271 | virtual void onBlockStart(const BasicBlock *BB) {} |
272 | |
273 | /// Called after a basic block was analyzed. |
274 | virtual void onBlockAnalyzed(const BasicBlock *BB) {} |
275 | |
276 | /// Called before an instruction was analyzed |
277 | virtual void onInstructionAnalysisStart(const Instruction *I) {} |
278 | |
279 | /// Called after an instruction was analyzed |
280 | virtual void onInstructionAnalysisFinish(const Instruction *I) {} |
281 | |
282 | /// Called at the end of the analysis of the callsite. Return the outcome of |
283 | /// the analysis, i.e. 'InlineResult(true)' if the inlining may happen, or |
284 | /// the reason it can't. |
285 | virtual InlineResult finalizeAnalysis() { return InlineResult::success(); } |
286 | /// Called when we're about to start processing a basic block, and every time |
287 | /// we are done processing an instruction. Return true if there is no point in |
288 | /// continuing the analysis (e.g. we've determined already the call site is |
289 | /// too expensive to inline) |
290 | virtual bool shouldStop() { return false; } |
291 | |
292 | /// Called before the analysis of the callee body starts (with callsite |
293 | /// contexts propagated). It checks callsite-specific information. Return a |
294 | /// reason analysis can't continue if that's the case, or 'true' if it may |
295 | /// continue. |
296 | virtual InlineResult onAnalysisStart() { return InlineResult::success(); } |
297 | /// Called if the analysis engine decides SROA cannot be done for the given |
298 | /// alloca. |
299 | virtual void onDisableSROA(AllocaInst *Arg) {} |
300 | |
  /// Called when the analysis engine determines load elimination won't happen.
302 | virtual void onDisableLoadElimination() {} |
303 | |
304 | /// Called when we visit a CallBase, before the analysis starts. Return false |
305 | /// to stop further processing of the instruction. |
306 | virtual bool onCallBaseVisitStart(CallBase &Call) { return true; } |
307 | |
308 | /// Called to account for a call. |
309 | virtual void onCallPenalty() {} |
310 | |
311 | /// Called to account for a load or store. |
  virtual void onMemAccess() {}
313 | |
314 | /// Called to account for the expectation the inlining would result in a load |
315 | /// elimination. |
316 | virtual void onLoadEliminationOpportunity() {} |
317 | |
318 | /// Called to account for the cost of argument setup for the Call in the |
319 | /// callee's body (not the callsite currently under analysis). |
320 | virtual void onCallArgumentSetup(const CallBase &Call) {} |
321 | |
322 | /// Called to account for a load relative intrinsic. |
323 | virtual void onLoadRelativeIntrinsic() {} |
324 | |
325 | /// Called to account for a lowered call. |
326 | virtual void onLoweredCall(Function *F, CallBase &Call, bool IsIndirectCall) { |
327 | } |
328 | |
329 | /// Account for a jump table of given size. Return false to stop further |
330 | /// processing the switch instruction |
331 | virtual bool onJumpTable(unsigned JumpTableSize) { return true; } |
332 | |
333 | /// Account for a case cluster of given size. Return false to stop further |
334 | /// processing of the instruction. |
335 | virtual bool onCaseCluster(unsigned NumCaseCluster) { return true; } |
336 | |
337 | /// Called at the end of processing a switch instruction, with the given |
338 | /// number of case clusters. |
339 | virtual void onFinalizeSwitch(unsigned JumpTableSize, unsigned NumCaseCluster, |
340 | bool DefaultDestUndefined) {} |
341 | |
342 | /// Called to account for any other instruction not specifically accounted |
343 | /// for. |
344 | virtual void onMissedSimplification() {} |
345 | |
346 | /// Start accounting potential benefits due to SROA for the given alloca. |
347 | virtual void onInitializeSROAArg(AllocaInst *Arg) {} |
348 | |
349 | /// Account SROA savings for the AllocaInst value. |
350 | virtual void onAggregateSROAUse(AllocaInst *V) {} |
351 | |
352 | bool handleSROA(Value *V, bool DoNotDisable) { |
353 | // Check for SROA candidates in comparisons. |
354 | if (auto *SROAArg = getSROAArgForValueOrNull(V)) { |
355 | if (DoNotDisable) { |
        onAggregateSROAUse(SROAArg);
357 | return true; |
358 | } |
359 | disableSROAForArg(SROAArg); |
360 | } |
361 | return false; |
362 | } |
363 | |
364 | bool IsCallerRecursive = false; |
365 | bool IsRecursiveCall = false; |
366 | bool ExposesReturnsTwice = false; |
367 | bool HasDynamicAlloca = false; |
368 | bool ContainsNoDuplicateCall = false; |
369 | bool HasReturn = false; |
370 | bool HasIndirectBr = false; |
371 | bool HasUninlineableIntrinsic = false; |
372 | bool InitsVargArgs = false; |
373 | |
374 | /// Number of bytes allocated statically by the callee. |
375 | uint64_t AllocatedSize = 0; |
376 | unsigned NumInstructions = 0; |
377 | unsigned NumVectorInstructions = 0; |
378 | |
379 | /// While we walk the potentially-inlined instructions, we build up and |
380 | /// maintain a mapping of simplified values specific to this callsite. The |
381 | /// idea is to propagate any special information we have about arguments to |
382 | /// this call through the inlinable section of the function, and account for |
383 | /// likely simplifications post-inlining. The most important aspect we track |
384 | /// is CFG altering simplifications -- when we prove a basic block dead, that |
385 | /// can cause dramatic shifts in the cost of inlining a function. |
386 | DenseMap<Value *, Constant *> SimplifiedValues; |
387 | |
388 | /// Keep track of the values which map back (through function arguments) to |
389 | /// allocas on the caller stack which could be simplified through SROA. |
390 | DenseMap<Value *, AllocaInst *> SROAArgValues; |
391 | |
392 | /// Keep track of Allocas for which we believe we may get SROA optimization. |
393 | DenseSet<AllocaInst *> EnabledSROAAllocas; |
394 | |
395 | /// Keep track of values which map to a pointer base and constant offset. |
396 | DenseMap<Value *, std::pair<Value *, APInt>> ConstantOffsetPtrs; |
397 | |
398 | /// Keep track of dead blocks due to the constant arguments. |
399 | SmallPtrSet<BasicBlock *, 16> DeadBlocks; |
400 | |
401 | /// The mapping of the blocks to their known unique successors due to the |
402 | /// constant arguments. |
403 | DenseMap<BasicBlock *, BasicBlock *> KnownSuccessors; |
404 | |
405 | /// Model the elimination of repeated loads that is expected to happen |
406 | /// whenever we simplify away the stores that would otherwise cause them to be |
407 | /// loads. |
408 | bool EnableLoadElimination = true; |
409 | |
410 | /// Whether we allow inlining for recursive call. |
411 | bool AllowRecursiveCall = false; |
412 | |
413 | SmallPtrSet<Value *, 16> LoadAddrSet; |
414 | |
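  /// Return the SROA candidate alloca that \p V maps back to, or null if SROA
  /// has been disabled for that alloca.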
415 | AllocaInst *getSROAArgForValueOrNull(Value *V) const { |
    auto It = SROAArgValues.find(V);
    if (It == SROAArgValues.end() || EnabledSROAAllocas.count(It->second) == 0)
418 | return nullptr; |
419 | return It->second; |
420 | } |
421 | |
422 | // Custom simplification helper routines. |
423 | bool isAllocaDerivedArg(Value *V); |
424 | void disableSROAForArg(AllocaInst *SROAArg); |
425 | void disableSROA(Value *V); |
426 | void findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB); |
427 | void disableLoadElimination(); |
428 | bool isGEPFree(GetElementPtrInst &GEP); |
429 | bool canFoldInboundsGEP(GetElementPtrInst &I); |
430 | bool accumulateGEPOffset(GEPOperator &GEP, APInt &Offset); |
431 | bool simplifyCallSite(Function *F, CallBase &Call); |
432 | bool simplifyInstruction(Instruction &I); |
433 | bool simplifyIntrinsicCallIsConstant(CallBase &CB); |
434 | bool simplifyIntrinsicCallObjectSize(CallBase &CB); |
435 | ConstantInt *stripAndComputeInBoundsConstantOffsets(Value *&V); |
436 | |
437 | /// Return true if the given argument to the function being considered for |
438 | /// inlining has the given attribute set either at the call site or the |
439 | /// function declaration. Primarily used to inspect call site specific |
440 | /// attributes since these can be more precise than the ones on the callee |
441 | /// itself. |
442 | bool paramHasAttr(Argument *A, Attribute::AttrKind Attr); |
443 | |
444 | /// Return true if the given value is known non null within the callee if |
445 | /// inlined through this particular callsite. |
446 | bool isKnownNonNullInCallee(Value *V); |
447 | |
448 | /// Return true if size growth is allowed when inlining the callee at \p Call. |
449 | bool allowSizeGrowth(CallBase &Call); |
450 | |
451 | // Custom analysis routines. |
452 | InlineResult analyzeBlock(BasicBlock *BB, |
453 | SmallPtrSetImpl<const Value *> &EphValues); |
454 | |
455 | // Disable several entry points to the visitor so we don't accidentally use |
456 | // them by declaring but not defining them here. |
457 | void visit(Module *); |
458 | void visit(Module &); |
459 | void visit(Function *); |
460 | void visit(Function &); |
461 | void visit(BasicBlock *); |
462 | void visit(BasicBlock &); |
463 | |
464 | // Provide base case for our instruction visit. |
465 | bool visitInstruction(Instruction &I); |
466 | |
467 | // Our visit overrides. |
468 | bool visitAlloca(AllocaInst &I); |
469 | bool visitPHI(PHINode &I); |
470 | bool visitGetElementPtr(GetElementPtrInst &I); |
471 | bool visitBitCast(BitCastInst &I); |
472 | bool visitPtrToInt(PtrToIntInst &I); |
473 | bool visitIntToPtr(IntToPtrInst &I); |
474 | bool visitCastInst(CastInst &I); |
475 | bool visitCmpInst(CmpInst &I); |
476 | bool visitSub(BinaryOperator &I); |
477 | bool visitBinaryOperator(BinaryOperator &I); |
478 | bool visitFNeg(UnaryOperator &I); |
479 | bool visitLoad(LoadInst &I); |
480 | bool visitStore(StoreInst &I); |
481 | bool visitExtractValue(ExtractValueInst &I); |
482 | bool visitInsertValue(InsertValueInst &I); |
483 | bool visitCallBase(CallBase &Call); |
484 | bool visitReturnInst(ReturnInst &RI); |
485 | bool visitBranchInst(BranchInst &BI); |
486 | bool visitSelectInst(SelectInst &SI); |
487 | bool visitSwitchInst(SwitchInst &SI); |
488 | bool visitIndirectBrInst(IndirectBrInst &IBI); |
489 | bool visitResumeInst(ResumeInst &RI); |
490 | bool visitCleanupReturnInst(CleanupReturnInst &RI); |
491 | bool visitCatchReturnInst(CatchReturnInst &RI); |
492 | bool visitUnreachableInst(UnreachableInst &I); |
493 | |
494 | public: |
495 | CallAnalyzer(Function &Callee, CallBase &Call, const TargetTransformInfo &TTI, |
496 | function_ref<AssumptionCache &(Function &)> GetAssumptionCache, |
497 | function_ref<BlockFrequencyInfo &(Function &)> GetBFI = nullptr, |
498 | ProfileSummaryInfo *PSI = nullptr, |
499 | OptimizationRemarkEmitter *ORE = nullptr) |
500 | : TTI(TTI), GetAssumptionCache(GetAssumptionCache), GetBFI(GetBFI), |
501 | PSI(PSI), F(Callee), DL(F.getDataLayout()), ORE(ORE), |
502 | CandidateCall(Call) {} |
503 | |
504 | InlineResult analyze(); |
505 | |
506 | std::optional<Constant *> getSimplifiedValue(Instruction *I) { |
    if (SimplifiedValues.contains(I))
508 | return SimplifiedValues[I]; |
509 | return std::nullopt; |
510 | } |
511 | |
512 | // Keep a bunch of stats about the cost savings found so we can print them |
513 | // out when debugging. |
514 | unsigned NumConstantArgs = 0; |
515 | unsigned NumConstantOffsetPtrArgs = 0; |
516 | unsigned NumAllocaArgs = 0; |
517 | unsigned NumConstantPtrCmps = 0; |
518 | unsigned NumConstantPtrDiffs = 0; |
519 | unsigned NumInstructionsSimplified = 0; |
520 | |
521 | void dump(); |
522 | }; |
523 | |
// When considering forming a binary search tree, we should find the number of
// nodes, which is the same as the number of comparisons when lowered. For a
// given number of clusters, n, we can define a recursive function, f(n), to
// find the number of nodes in the tree. The recursion is:
//   f(n) = 1 + f(n/2) + f(n - n/2), when n > 3,
//   and f(n) = n, when n <= 3.
// This leads to a binary tree where each leaf is either f(2) or f(3) when
// n > 3. So the number of comparisons from leaves should be n, while the
// number of non-leaf nodes should be:
//   2^(log2(n) - 1) - 1
//   = 2^log2(n) * 2^-1 - 1
//   = n / 2 - 1.
// Considering comparisons from both leaf and non-leaf nodes, we can estimate
// the number of comparisons in a simple closed form:
//   n + n / 2 - 1 = n * 3 / 2 - 1
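// For example, n = 8 clusters gives an estimated 8 * 3 / 2 - 1 = 11
// comparisons.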
539 | int64_t getExpectedNumberOfCompare(int NumCaseCluster) { |
540 | return 3 * static_cast<int64_t>(NumCaseCluster) / 2 - 1; |
541 | } |
542 | |
543 | /// FIXME: if it is necessary to derive from InlineCostCallAnalyzer, note |
544 | /// the FIXME in onLoweredCall, when instantiating an InlineCostCallAnalyzer |
545 | class InlineCostCallAnalyzer final : public CallAnalyzer { |
546 | const bool ComputeFullInlineCost; |
547 | int LoadEliminationCost = 0; |
548 | /// Bonus to be applied when percentage of vector instructions in callee is |
549 | /// high (see more details in updateThreshold). |
550 | int VectorBonus = 0; |
551 | /// Bonus to be applied when the callee has only one reachable basic block. |
552 | int SingleBBBonus = 0; |
553 | |
554 | /// Tunable parameters that control the analysis. |
555 | const InlineParams &Params; |
556 | |
557 | // This DenseMap stores the delta change in cost and threshold after |
558 | // accounting for the given instruction. The map is filled only with the |
559 | // flag PrintInstructionComments on. |
560 | DenseMap<const Instruction *, InstructionCostDetail> InstructionCostDetailMap; |
561 | |
562 | /// Upper bound for the inlining cost. Bonuses are being applied to account |
563 | /// for speculative "expected profit" of the inlining decision. |
564 | int Threshold = 0; |
565 | |
566 | /// The amount of StaticBonus applied. |
567 | int StaticBonusApplied = 0; |
568 | |
  /// Attempt to evaluate indirect calls to boost their inline cost.
570 | const bool BoostIndirectCalls; |
571 | |
572 | /// Ignore the threshold when finalizing analysis. |
573 | const bool IgnoreThreshold; |
574 | |
575 | // True if the cost-benefit-analysis-based inliner is enabled. |
576 | const bool CostBenefitAnalysisEnabled; |
577 | |
578 | /// Inlining cost measured in abstract units, accounts for all the |
579 | /// instructions expected to be executed for a given function invocation. |
580 | /// Instructions that are statically proven to be dead based on call-site |
581 | /// arguments are not counted here. |
582 | int Cost = 0; |
583 | |
584 | // The cumulative cost at the beginning of the basic block being analyzed. At |
585 | // the end of analyzing each basic block, "Cost - CostAtBBStart" represents |
586 | // the size of that basic block. |
587 | int CostAtBBStart = 0; |
588 | |
589 | // The static size of live but cold basic blocks. This is "static" in the |
590 | // sense that it's not weighted by profile counts at all. |
591 | int ColdSize = 0; |
592 | |
593 | // Whether inlining is decided by cost-threshold analysis. |
594 | bool DecidedByCostThreshold = false; |
595 | |
596 | // Whether inlining is decided by cost-benefit analysis. |
597 | bool DecidedByCostBenefit = false; |
598 | |
599 | // The cost-benefit pair computed by cost-benefit analysis. |
600 | std::optional<CostBenefitPair> CostBenefit; |
601 | |
602 | bool SingleBB = true; |
603 | |
604 | unsigned SROACostSavings = 0; |
605 | unsigned SROACostSavingsLost = 0; |
606 | |
607 | /// The mapping of caller Alloca values to their accumulated cost savings. If |
608 | /// we have to disable SROA for one of the allocas, this tells us how much |
609 | /// cost must be added. |
610 | DenseMap<AllocaInst *, int> SROAArgCosts; |
611 | |
612 | /// Return true if \p Call is a cold callsite. |
613 | bool isColdCallSite(CallBase &Call, BlockFrequencyInfo *CallerBFI); |
614 | |
615 | /// Update Threshold based on callsite properties such as callee |
616 | /// attributes and callee hotness for PGO builds. The Callee is explicitly |
617 | /// passed to support analyzing indirect calls whose target is inferred by |
618 | /// analysis. |
619 | void updateThreshold(CallBase &Call, Function &Callee); |
620 | /// Return a higher threshold if \p Call is a hot callsite. |
621 | std::optional<int> getHotCallSiteThreshold(CallBase &Call, |
622 | BlockFrequencyInfo *CallerBFI); |
623 | |
624 | /// Handle a capped 'int' increment for Cost. |
625 | void addCost(int64_t Inc) { |
    Inc = std::clamp<int64_t>(Inc, INT_MIN, INT_MAX);
    Cost = std::clamp<int64_t>(Inc + Cost, INT_MIN, INT_MAX);
628 | } |
629 | |
630 | void onDisableSROA(AllocaInst *Arg) override { |
    auto CostIt = SROAArgCosts.find(Arg);
    if (CostIt == SROAArgCosts.end())
      return;
    addCost(CostIt->second);
    SROACostSavings -= CostIt->second;
    SROACostSavingsLost += CostIt->second;
    SROAArgCosts.erase(CostIt);
638 | } |
639 | |
640 | void onDisableLoadElimination() override { |
    addCost(LoadEliminationCost);
642 | LoadEliminationCost = 0; |
643 | } |
644 | |
645 | bool onCallBaseVisitStart(CallBase &Call) override { |
646 | if (std::optional<int> AttrCallThresholdBonus = |
647 | getStringFnAttrAsInt(CB&: Call, AttrKind: "call-threshold-bonus" )) |
648 | Threshold += *AttrCallThresholdBonus; |
649 | |
650 | if (std::optional<int> AttrCallCost = |
651 | getStringFnAttrAsInt(CB&: Call, AttrKind: "call-inline-cost" )) { |
652 | addCost(Inc: *AttrCallCost); |
653 | // Prevent further processing of the call since we want to override its |
654 | // inline cost, not just add to it. |
655 | return false; |
656 | } |
657 | return true; |
658 | } |
659 | |
  void onCallPenalty() override { addCost(CallPenalty); }
661 | |
  void onMemAccess() override { addCost(MemAccessCost); }
663 | |
664 | void onCallArgumentSetup(const CallBase &Call) override { |
665 | // Pay the price of the argument setup. We account for the average 1 |
666 | // instruction per call argument setup here. |
    addCost(Call.arg_size() * InstrCost);
668 | } |
669 | void onLoadRelativeIntrinsic() override { |
670 | // This is normally lowered to 4 LLVM instructions. |
    addCost(3 * InstrCost);
672 | } |
673 | void onLoweredCall(Function *F, CallBase &Call, |
674 | bool IsIndirectCall) override { |
675 | // We account for the average 1 instruction per call argument setup here. |
    addCost(Call.arg_size() * InstrCost);
677 | |
678 | // If we have a constant that we are calling as a function, we can peer |
679 | // through it and see the function target. This happens not infrequently |
680 | // during devirtualization and so we want to give it a hefty bonus for |
681 | // inlining, but cap that bonus in the event that inlining wouldn't pan out. |
682 | // Pretend to inline the function, with a custom threshold. |
683 | if (IsIndirectCall && BoostIndirectCalls) { |
684 | auto IndirectCallParams = Params; |
685 | IndirectCallParams.DefaultThreshold = |
686 | InlineConstants::IndirectCallThreshold; |
687 | /// FIXME: if InlineCostCallAnalyzer is derived from, this may need |
688 | /// to instantiate the derived class. |
689 | InlineCostCallAnalyzer CA(*F, Call, IndirectCallParams, TTI, |
690 | GetAssumptionCache, GetBFI, PSI, ORE, false); |
691 | if (CA.analyze().isSuccess()) { |
692 | // We were able to inline the indirect call! Subtract the cost from the |
693 | // threshold to get the bonus we want to apply, but don't go below zero. |
        Cost -= std::max(0, CA.getThreshold() - CA.getCost());
695 | } |
696 | } else |
697 | // Otherwise simply add the cost for merely making the call. |
      addCost(TTI.getInlineCallPenalty(CandidateCall.getCaller(), Call,
                                       CallPenalty));
700 | } |
701 | |
702 | void onFinalizeSwitch(unsigned JumpTableSize, unsigned NumCaseCluster, |
703 | bool DefaultDestUndefined) override { |
704 | // If suitable for a jump table, consider the cost for the table size and |
705 | // branch to destination. |
706 | // Maximum valid cost increased in this function. |
707 | if (JumpTableSize) { |
708 | // Suppose a default branch includes one compare and one conditional |
709 | // branch if it's reachable. |
710 | if (!DefaultDestUndefined) |
        addCost(2 * InstrCost);
712 | // Suppose a jump table requires one load and one jump instruction. |
713 | int64_t JTCost = |
714 | static_cast<int64_t>(JumpTableSize) * InstrCost + 2 * InstrCost; |
      addCost(JTCost);
716 | return; |
717 | } |
718 | |
719 | if (NumCaseCluster <= 3) { |
720 | // Suppose a comparison includes one compare and one conditional branch. |
721 | // We can reduce a set of instructions if the default branch is |
722 | // undefined. |
      addCost((NumCaseCluster - DefaultDestUndefined) * 2 * InstrCost);
724 | return; |
725 | } |
726 | |
727 | int64_t ExpectedNumberOfCompare = |
728 | getExpectedNumberOfCompare(NumCaseCluster); |
729 | int64_t SwitchCost = ExpectedNumberOfCompare * 2 * InstrCost; |
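    // E.g. 8 case clusters give an expected 11 compares, i.e. a cost of
    // 22 * InstrCost (110 with the default InstrCost of 5).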
730 | |
    addCost(SwitchCost);
732 | } |
  void onMissedSimplification() override { addCost(InstrCost); }
734 | |
735 | void onInitializeSROAArg(AllocaInst *Arg) override { |
    assert(Arg != nullptr &&
           "Should not initialize SROA costs for null value.");
    auto SROAArgCost = TTI.getCallerAllocaCost(&CandidateCall, Arg);
739 | SROACostSavings += SROAArgCost; |
740 | SROAArgCosts[Arg] = SROAArgCost; |
741 | } |
742 | |
743 | void onAggregateSROAUse(AllocaInst *SROAArg) override { |
    auto CostIt = SROAArgCosts.find(SROAArg);
    assert(CostIt != SROAArgCosts.end() &&
           "expected this argument to have a cost");
747 | CostIt->second += InstrCost; |
748 | SROACostSavings += InstrCost; |
749 | } |
750 | |
751 | void onBlockStart(const BasicBlock *BB) override { CostAtBBStart = Cost; } |
752 | |
753 | void onBlockAnalyzed(const BasicBlock *BB) override { |
754 | if (CostBenefitAnalysisEnabled) { |
755 | // Keep track of the static size of live but cold basic blocks. For now, |
756 | // we define a cold basic block to be one that's never executed. |
757 | assert(GetBFI && "GetBFI must be available" ); |
758 | BlockFrequencyInfo *BFI = &(GetBFI(F)); |
759 | assert(BFI && "BFI must be available" ); |
760 | auto ProfileCount = BFI->getBlockProfileCount(BB); |
761 | if (*ProfileCount == 0) |
762 | ColdSize += Cost - CostAtBBStart; |
763 | } |
764 | |
765 | auto *TI = BB->getTerminator(); |
    // If we had any successors at this point, then post-inlining is likely to
767 | // have them as well. Note that we assume any basic blocks which existed |
768 | // due to branches or switches which folded above will also fold after |
769 | // inlining. |
770 | if (SingleBB && TI->getNumSuccessors() > 1) { |
771 | // Take off the bonus we applied to the threshold. |
772 | Threshold -= SingleBBBonus; |
773 | SingleBB = false; |
774 | } |
775 | } |
776 | |
777 | void onInstructionAnalysisStart(const Instruction *I) override { |
778 | // This function is called to store the initial cost of inlining before |
779 | // the given instruction was assessed. |
780 | if (!PrintInstructionComments) |
781 | return; |
782 | InstructionCostDetailMap[I].CostBefore = Cost; |
783 | InstructionCostDetailMap[I].ThresholdBefore = Threshold; |
784 | } |
785 | |
786 | void onInstructionAnalysisFinish(const Instruction *I) override { |
787 | // This function is called to find new values of cost and threshold after |
788 | // the instruction has been assessed. |
789 | if (!PrintInstructionComments) |
790 | return; |
791 | InstructionCostDetailMap[I].CostAfter = Cost; |
792 | InstructionCostDetailMap[I].ThresholdAfter = Threshold; |
793 | } |
794 | |
795 | bool isCostBenefitAnalysisEnabled() { |
796 | if (!PSI || !PSI->hasProfileSummary()) |
797 | return false; |
798 | |
799 | if (!GetBFI) |
800 | return false; |
801 | |
802 | if (InlineEnableCostBenefitAnalysis.getNumOccurrences()) { |
803 | // Honor the explicit request from the user. |
804 | if (!InlineEnableCostBenefitAnalysis) |
805 | return false; |
806 | } else { |
807 | // Otherwise, require instrumentation profile. |
808 | if (!PSI->hasInstrumentationProfile()) |
809 | return false; |
810 | } |
811 | |
812 | auto *Caller = CandidateCall.getParent()->getParent(); |
813 | if (!Caller->getEntryCount()) |
814 | return false; |
815 | |
816 | BlockFrequencyInfo *CallerBFI = &(GetBFI(*Caller)); |
817 | if (!CallerBFI) |
818 | return false; |
819 | |
820 | // For now, limit to hot call site. |
    if (!PSI->isHotCallSite(CandidateCall, CallerBFI))
822 | return false; |
823 | |
824 | // Make sure we have a nonzero entry count. |
825 | auto EntryCount = F.getEntryCount(); |
826 | if (!EntryCount || !EntryCount->getCount()) |
827 | return false; |
828 | |
829 | BlockFrequencyInfo *CalleeBFI = &(GetBFI(F)); |
830 | if (!CalleeBFI) |
831 | return false; |
832 | |
833 | return true; |
834 | } |
835 | |
836 | // A helper function to choose between command line override and default. |
837 | unsigned getInliningCostBenefitAnalysisSavingsMultiplier() const { |
838 | if (InlineSavingsMultiplier.getNumOccurrences()) |
839 | return InlineSavingsMultiplier; |
840 | return TTI.getInliningCostBenefitAnalysisSavingsMultiplier(); |
841 | } |
842 | |
843 | // A helper function to choose between command line override and default. |
844 | unsigned getInliningCostBenefitAnalysisProfitableMultiplier() const { |
845 | if (InlineSavingsProfitableMultiplier.getNumOccurrences()) |
846 | return InlineSavingsProfitableMultiplier; |
847 | return TTI.getInliningCostBenefitAnalysisProfitableMultiplier(); |
848 | } |
849 | |
850 | void OverrideCycleSavingsAndSizeForTesting(APInt &CycleSavings, int &Size) { |
851 | if (std::optional<int> AttrCycleSavings = getStringFnAttrAsInt( |
852 | CB&: CandidateCall, AttrKind: "inline-cycle-savings-for-test" )) { |
853 | CycleSavings = *AttrCycleSavings; |
854 | } |
855 | |
856 | if (std::optional<int> AttrRuntimeCost = getStringFnAttrAsInt( |
857 | CB&: CandidateCall, AttrKind: "inline-runtime-cost-for-test" )) { |
858 | Size = *AttrRuntimeCost; |
859 | } |
860 | } |
861 | |
862 | // Determine whether we should inline the given call site, taking into account |
863 | // both the size cost and the cycle savings. Return std::nullopt if we don't |
864 | // have sufficient profiling information to determine. |
865 | std::optional<bool> costBenefitAnalysis() { |
866 | if (!CostBenefitAnalysisEnabled) |
867 | return std::nullopt; |
868 | |
869 | // buildInlinerPipeline in the pass builder sets HotCallSiteThreshold to 0 |
870 | // for the prelink phase of the AutoFDO + ThinLTO build. Honor the logic by |
871 | // falling back to the cost-based metric. |
872 | // TODO: Improve this hacky condition. |
873 | if (Threshold == 0) |
874 | return std::nullopt; |
875 | |
876 | assert(GetBFI); |
877 | BlockFrequencyInfo *CalleeBFI = &(GetBFI(F)); |
878 | assert(CalleeBFI); |
879 | |
880 | // The cycle savings expressed as the sum of InstrCost |
881 | // multiplied by the estimated dynamic count of each instruction we can |
882 | // avoid. Savings come from the call site cost, such as argument setup and |
883 | // the call instruction, as well as the instructions that are folded. |
884 | // |
885 | // We use 128-bit APInt here to avoid potential overflow. This variable |
886 | // should stay well below 10^^24 (or 2^^80) in practice. This "worst" case |
887 | // assumes that we can avoid or fold a billion instructions, each with a |
888 | // profile count of 10^^15 -- roughly the number of cycles for a 24-hour |
889 | // period on a 4GHz machine. |
890 | APInt CycleSavings(128, 0); |
891 | |
892 | for (auto &BB : F) { |
893 | APInt CurrentSavings(128, 0); |
894 | for (auto &I : BB) { |
        if (BranchInst *BI = dyn_cast<BranchInst>(&I)) {
          // Count a conditional branch as savings if it becomes unconditional.
          if (BI->isConditional() &&
              isa_and_nonnull<ConstantInt>(
                  SimplifiedValues.lookup(BI->getCondition()))) {
            CurrentSavings += InstrCost;
          }
        } else if (SwitchInst *SI = dyn_cast<SwitchInst>(&I)) {
          if (isa_and_present<ConstantInt>(
                  SimplifiedValues.lookup(SI->getCondition())))
            CurrentSavings += InstrCost;
        } else if (Value *V = dyn_cast<Value>(&I)) {
          // Count an instruction as savings if we can fold it.
          if (SimplifiedValues.count(V)) {
908 | CurrentSavings += InstrCost; |
909 | } |
910 | } |
911 | } |
912 | |
      auto ProfileCount = CalleeBFI->getBlockProfileCount(&BB);
914 | CurrentSavings *= *ProfileCount; |
915 | CycleSavings += CurrentSavings; |
916 | } |
917 | |
918 | // Compute the cycle savings per call. |
919 | auto EntryProfileCount = F.getEntryCount(); |
920 | assert(EntryProfileCount && EntryProfileCount->getCount()); |
921 | auto EntryCount = EntryProfileCount->getCount(); |
922 | CycleSavings += EntryCount / 2; |
    CycleSavings = CycleSavings.udiv(EntryCount);
924 | |
925 | // Compute the total savings for the call site. |
926 | auto *CallerBB = CandidateCall.getParent(); |
927 | BlockFrequencyInfo *CallerBFI = &(GetBFI(*(CallerBB->getParent()))); |
    CycleSavings += getCallsiteCost(TTI, this->CandidateCall, DL);
    CycleSavings *= *CallerBFI->getBlockProfileCount(CallerBB);
930 | |
931 | // Remove the cost of the cold basic blocks to model the runtime cost more |
932 | // accurately. Both machine block placement and function splitting could |
933 | // place cold blocks further from hot blocks. |
934 | int Size = Cost - ColdSize; |
935 | |
936 | // Allow tiny callees to be inlined regardless of whether they meet the |
937 | // savings threshold. |
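    // E.g. with the default allowance of 100, a Size of 140 becomes 40, while
    // a Size of 80 becomes 1.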
938 | Size = Size > InlineSizeAllowance ? Size - InlineSizeAllowance : 1; |
939 | |
940 | OverrideCycleSavingsAndSizeForTesting(CycleSavings, Size); |
    CostBenefit.emplace(APInt(128, Size), CycleSavings);
942 | |
943 | // Let R be the ratio of CycleSavings to Size. We accept the inlining |
944 | // opportunity if R is really high and reject if R is really low. If R is |
945 | // somewhere in the middle, we fall back to the cost-based analysis. |
946 | // |
947 | // Specifically, let R = CycleSavings / Size, we accept the inlining |
948 | // opportunity if: |
949 | // |
950 | // PSI->getOrCompHotCountThreshold() |
951 | // R > ------------------------------------------------- |
952 | // getInliningCostBenefitAnalysisSavingsMultiplier() |
953 | // |
954 | // and reject the inlining opportunity if: |
955 | // |
956 | // PSI->getOrCompHotCountThreshold() |
957 | // R <= ---------------------------------------------------- |
958 | // getInliningCostBenefitAnalysisProfitableMultiplier() |
959 | // |
960 | // Otherwise, we fall back to the cost-based analysis. |
961 | // |
962 | // Implementation-wise, use multiplication (CycleSavings * Multiplier, |
963 | // HotCountThreshold * Size) rather than division to avoid precision loss. |
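    // Illustrative (hypothetical) numbers: with CycleSavings = 6000, Size = 40,
    // a hot count threshold of 1000 and a savings multiplier of 8, we compare
    // 6000 * 8 = 48000 against 1000 * 40 = 40000 and accept the inlining.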
964 | APInt Threshold(128, PSI->getOrCompHotCountThreshold()); |
965 | Threshold *= Size; |
966 | |
967 | APInt UpperBoundCycleSavings = CycleSavings; |
968 | UpperBoundCycleSavings *= getInliningCostBenefitAnalysisSavingsMultiplier(); |
    if (UpperBoundCycleSavings.uge(Threshold))
970 | return true; |
971 | |
972 | APInt LowerBoundCycleSavings = CycleSavings; |
973 | LowerBoundCycleSavings *= |
974 | getInliningCostBenefitAnalysisProfitableMultiplier(); |
    if (LowerBoundCycleSavings.ult(Threshold))
976 | return false; |
977 | |
978 | // Otherwise, fall back to the cost-based analysis. |
979 | return std::nullopt; |
980 | } |
981 | |
982 | InlineResult finalizeAnalysis() override { |
983 | // Loops generally act a lot like calls in that they act like barriers to |
984 | // movement, require a certain amount of setup, etc. So when optimising for |
985 | // size, we penalise any call sites that perform loops. We do this after all |
986 | // other costs here, so will likely only be dealing with relatively small |
987 | // functions (and hence DT and LI will hopefully be cheap). |
988 | auto *Caller = CandidateCall.getFunction(); |
989 | if (Caller->hasMinSize()) { |
990 | DominatorTree DT(F); |
991 | LoopInfo LI(DT); |
992 | int NumLoops = 0; |
993 | for (Loop *L : LI) { |
994 | // Ignore loops that will not be executed |
        if (DeadBlocks.count(L->getHeader()))
996 | continue; |
997 | NumLoops++; |
998 | } |
      addCost(NumLoops * InlineConstants::LoopPenalty);
1000 | } |
1001 | |
1002 | // We applied the maximum possible vector bonus at the beginning. Now, |
1003 | // subtract the excess bonus, if any, from the Threshold before |
1004 | // comparing against Cost. |
1005 | if (NumVectorInstructions <= NumInstructions / 10) |
1006 | Threshold -= VectorBonus; |
1007 | else if (NumVectorInstructions <= NumInstructions / 2) |
1008 | Threshold -= VectorBonus / 2; |
1009 | |
1010 | if (std::optional<int> AttrCost = |
1011 | getStringFnAttrAsInt(CB&: CandidateCall, AttrKind: "function-inline-cost" )) |
1012 | Cost = *AttrCost; |
1013 | |
1014 | if (std::optional<int> AttrCostMult = getStringFnAttrAsInt( |
            CandidateCall,
            InlineConstants::FunctionInlineCostMultiplierAttributeName))
1017 | Cost *= *AttrCostMult; |
1018 | |
1019 | if (std::optional<int> AttrThreshold = |
1020 | getStringFnAttrAsInt(CB&: CandidateCall, AttrKind: "function-inline-threshold" )) |
1021 | Threshold = *AttrThreshold; |
1022 | |
1023 | if (auto Result = costBenefitAnalysis()) { |
1024 | DecidedByCostBenefit = true; |
1025 | if (*Result) |
1026 | return InlineResult::success(); |
1027 | else |
1028 | return InlineResult::failure(Reason: "Cost over threshold." ); |
1029 | } |
1030 | |
1031 | if (IgnoreThreshold) |
1032 | return InlineResult::success(); |
1033 | |
1034 | DecidedByCostThreshold = true; |
    return Cost < std::max(1, Threshold)
               ? InlineResult::success()
               : InlineResult::failure("Cost over threshold.");
1038 | } |
1039 | |
1040 | bool shouldStop() override { |
1041 | if (IgnoreThreshold || ComputeFullInlineCost) |
1042 | return false; |
1043 | // Bail out the moment we cross the threshold. This means we'll under-count |
1044 | // the cost, but only when undercounting doesn't matter. |
1045 | if (Cost < Threshold) |
1046 | return false; |
1047 | DecidedByCostThreshold = true; |
1048 | return true; |
1049 | } |
1050 | |
1051 | void onLoadEliminationOpportunity() override { |
1052 | LoadEliminationCost += InstrCost; |
1053 | } |
1054 | |
1055 | InlineResult onAnalysisStart() override { |
1056 | // Perform some tweaks to the cost and threshold based on the direct |
1057 | // callsite information. |
1058 | |
1059 | // We want to more aggressively inline vector-dense kernels, so up the |
1060 | // threshold, and we'll lower it if the % of vector instructions gets too |
    // low. Note that these bonuses are somewhat arbitrary and evolved over
1062 | // time by accident as much as because they are principled bonuses. |
1063 | // |
1064 | // FIXME: It would be nice to remove all such bonuses. At least it would be |
1065 | // nice to base the bonus values on something more scientific. |
1066 | assert(NumInstructions == 0); |
1067 | assert(NumVectorInstructions == 0); |
1068 | |
1069 | // Update the threshold based on callsite properties |
    updateThreshold(CandidateCall, F);
1071 | |
1072 | // While Threshold depends on commandline options that can take negative |
1073 | // values, we want to enforce the invariant that the computed threshold and |
1074 | // bonuses are non-negative. |
1075 | assert(Threshold >= 0); |
1076 | assert(SingleBBBonus >= 0); |
1077 | assert(VectorBonus >= 0); |
1078 | |
1079 | // Speculatively apply all possible bonuses to Threshold. If cost exceeds |
1080 | // this Threshold any time, and cost cannot decrease, we can stop processing |
1081 | // the rest of the function body. |
1082 | Threshold += (SingleBBBonus + VectorBonus); |
1083 | |
1084 | // Give out bonuses for the callsite, as the instructions setting them up |
1085 | // will be gone after inlining. |
    addCost(-getCallsiteCost(TTI, this->CandidateCall, DL));
1087 | |
1088 | // If this function uses the coldcc calling convention, prefer not to inline |
1089 | // it. |
1090 | if (F.getCallingConv() == CallingConv::Cold) |
1091 | Cost += InlineConstants::ColdccPenalty; |
1092 | |
1093 | LLVM_DEBUG(dbgs() << " Initial cost: " << Cost << "\n" ); |
1094 | |
1095 | // Check if we're done. This can happen due to bonuses and penalties. |
1096 | if (Cost >= Threshold && !ComputeFullInlineCost) |
1097 | return InlineResult::failure(Reason: "high cost" ); |
1098 | |
1099 | return InlineResult::success(); |
1100 | } |
1101 | |
1102 | public: |
1103 | InlineCostCallAnalyzer( |
1104 | Function &Callee, CallBase &Call, const InlineParams &Params, |
1105 | const TargetTransformInfo &TTI, |
1106 | function_ref<AssumptionCache &(Function &)> GetAssumptionCache, |
1107 | function_ref<BlockFrequencyInfo &(Function &)> GetBFI = nullptr, |
1108 | ProfileSummaryInfo *PSI = nullptr, |
1109 | OptimizationRemarkEmitter *ORE = nullptr, bool BoostIndirect = true, |
1110 | bool IgnoreThreshold = false) |
1111 | : CallAnalyzer(Callee, Call, TTI, GetAssumptionCache, GetBFI, PSI, ORE), |
1112 | ComputeFullInlineCost(OptComputeFullInlineCost || |
1113 | Params.ComputeFullInlineCost || ORE || |
1114 | isCostBenefitAnalysisEnabled()), |
1115 | Params(Params), Threshold(Params.DefaultThreshold), |
1116 | BoostIndirectCalls(BoostIndirect), IgnoreThreshold(IgnoreThreshold), |
1117 | CostBenefitAnalysisEnabled(isCostBenefitAnalysisEnabled()), |
1118 | Writer(this) { |
1119 | AllowRecursiveCall = *Params.AllowRecursiveCall; |
1120 | } |
1121 | |
1122 | /// Annotation Writer for instruction details |
1123 | InlineCostAnnotationWriter Writer; |
1124 | |
1125 | void dump(); |
1126 | |
1127 | // Prints the same analysis as dump(), but its definition is not dependent |
1128 | // on the build. |
1129 | void print(raw_ostream &OS); |
1130 | |
1131 | std::optional<InstructionCostDetail> getCostDetails(const Instruction *I) { |
    if (InstructionCostDetailMap.contains(I))
1133 | return InstructionCostDetailMap[I]; |
1134 | return std::nullopt; |
1135 | } |
1136 | |
1137 | virtual ~InlineCostCallAnalyzer() = default; |
1138 | int getThreshold() const { return Threshold; } |
1139 | int getCost() const { return Cost; } |
1140 | int getStaticBonusApplied() const { return StaticBonusApplied; } |
1141 | std::optional<CostBenefitPair> getCostBenefitPair() { return CostBenefit; } |
1142 | bool wasDecidedByCostBenefit() const { return DecidedByCostBenefit; } |
1143 | bool wasDecidedByCostThreshold() const { return DecidedByCostThreshold; } |
1144 | }; |
1145 | |
1146 | // Return true if CB is the sole call to local function Callee. |
1147 | static bool isSoleCallToLocalFunction(const CallBase &CB, |
1148 | const Function &Callee) { |
1149 | return Callee.hasLocalLinkage() && Callee.hasOneLiveUse() && |
1150 | &Callee == CB.getCalledFunction(); |
1151 | } |
1152 | |
1153 | class InlineCostFeaturesAnalyzer final : public CallAnalyzer { |
1154 | private: |
1155 | InlineCostFeatures Cost = {}; |
1156 | |
1157 | // FIXME: These constants are taken from the heuristic-based cost visitor. |
1158 | // These should be removed entirely in a later revision to avoid reliance on |
1159 | // heuristics in the ML inliner. |
1160 | static constexpr int JTCostMultiplier = 2; |
1161 | static constexpr int CaseClusterCostMultiplier = 2; |
1162 | static constexpr int SwitchDefaultDestCostMultiplier = 2; |
1163 | static constexpr int SwitchCostMultiplier = 2; |
1164 | |
1165 | // FIXME: These are taken from the heuristic-based cost visitor: we should |
1166 | // eventually abstract these to the CallAnalyzer to avoid duplication. |
1167 | unsigned SROACostSavingOpportunities = 0; |
1168 | int VectorBonus = 0; |
1169 | int SingleBBBonus = 0; |
1170 | int Threshold = 5; |
1171 | |
1172 | DenseMap<AllocaInst *, unsigned> SROACosts; |
1173 | |
1174 | void increment(InlineCostFeatureIndex Feature, int64_t Delta = 1) { |
1175 | Cost[static_cast<size_t>(Feature)] += Delta; |
1176 | } |
1177 | |
1178 | void set(InlineCostFeatureIndex Feature, int64_t Value) { |
1179 | Cost[static_cast<size_t>(Feature)] = Value; |
1180 | } |
1181 | |
1182 | void onDisableSROA(AllocaInst *Arg) override { |
    auto CostIt = SROACosts.find(Arg);
1184 | if (CostIt == SROACosts.end()) |
1185 | return; |
1186 | |
    increment(InlineCostFeatureIndex::sroa_losses, CostIt->second);
    SROACostSavingOpportunities -= CostIt->second;
    SROACosts.erase(CostIt);
1190 | } |
1191 | |
1192 | void onDisableLoadElimination() override { |
    set(InlineCostFeatureIndex::load_elimination, 1);
1194 | } |
1195 | |
1196 | void onCallPenalty() override { |
    increment(InlineCostFeatureIndex::call_penalty, CallPenalty);
1198 | } |
1199 | |
1200 | void onCallArgumentSetup(const CallBase &Call) override { |
    increment(InlineCostFeatureIndex::call_argument_setup,
              Call.arg_size() * InstrCost);
1203 | } |
1204 | |
1205 | void onLoadRelativeIntrinsic() override { |
    increment(InlineCostFeatureIndex::load_relative_intrinsic, 3 * InstrCost);
1207 | } |
1208 | |
1209 | void onLoweredCall(Function *F, CallBase &Call, |
1210 | bool IsIndirectCall) override { |
    increment(InlineCostFeatureIndex::lowered_call_arg_setup,
              Call.arg_size() * InstrCost);
1213 | |
1214 | if (IsIndirectCall) { |
1215 | InlineParams IndirectCallParams = {/* DefaultThreshold*/ 0, |
1216 | /*HintThreshold*/ {}, |
1217 | /*ColdThreshold*/ {}, |
1218 | /*OptSizeThreshold*/ {}, |
1219 | /*OptMinSizeThreshold*/ {}, |
1220 | /*HotCallSiteThreshold*/ {}, |
1221 | /*LocallyHotCallSiteThreshold*/ {}, |
1222 | /*ColdCallSiteThreshold*/ {}, |
1223 | /*ComputeFullInlineCost*/ true, |
1224 | /*EnableDeferral*/ true}; |
1225 | IndirectCallParams.DefaultThreshold = |
1226 | InlineConstants::IndirectCallThreshold; |
1227 | |
1228 | InlineCostCallAnalyzer CA(*F, Call, IndirectCallParams, TTI, |
1229 | GetAssumptionCache, GetBFI, PSI, ORE, false, |
1230 | true); |
1231 | if (CA.analyze().isSuccess()) { |
        increment(InlineCostFeatureIndex::nested_inline_cost_estimate,
                  CA.getCost());
        increment(InlineCostFeatureIndex::nested_inlines, 1);
1235 | } |
1236 | } else { |
1237 | onCallPenalty(); |
1238 | } |
1239 | } |
1240 | |
1241 | void onFinalizeSwitch(unsigned JumpTableSize, unsigned NumCaseCluster, |
1242 | bool DefaultDestUndefined) override { |
1243 | if (JumpTableSize) { |
1244 | if (!DefaultDestUndefined) |
        increment(InlineCostFeatureIndex::switch_default_dest_penalty,
                  SwitchDefaultDestCostMultiplier * InstrCost);
      int64_t JTCost = static_cast<int64_t>(JumpTableSize) * InstrCost +
                       JTCostMultiplier * InstrCost;
      increment(InlineCostFeatureIndex::jump_table_penalty, JTCost);
1250 | return; |
1251 | } |
1252 | |
1253 | if (NumCaseCluster <= 3) { |
      increment(InlineCostFeatureIndex::case_cluster_penalty,
                (NumCaseCluster - DefaultDestUndefined) *
                    CaseClusterCostMultiplier * InstrCost);
1257 | return; |
1258 | } |
1259 | |
1260 | int64_t ExpectedNumberOfCompare = |
1261 | getExpectedNumberOfCompare(NumCaseCluster); |
1262 | |
1263 | int64_t SwitchCost = |
1264 | ExpectedNumberOfCompare * SwitchCostMultiplier * InstrCost; |
    increment(InlineCostFeatureIndex::switch_penalty, SwitchCost);
1266 | } |
1267 | |
1268 | void onMissedSimplification() override { |
    increment(InlineCostFeatureIndex::unsimplified_common_instructions,
              InstrCost);
1271 | } |
1272 | |
1273 | void onInitializeSROAArg(AllocaInst *Arg) override { |
    auto SROAArgCost = TTI.getCallerAllocaCost(&CandidateCall, Arg);
1275 | SROACosts[Arg] = SROAArgCost; |
1276 | SROACostSavingOpportunities += SROAArgCost; |
1277 | } |
1278 | |
1279 | void onAggregateSROAUse(AllocaInst *Arg) override { |
    SROACosts.find(Arg)->second += InstrCost;
1281 | SROACostSavingOpportunities += InstrCost; |
1282 | } |
1283 | |
1284 | void onBlockAnalyzed(const BasicBlock *BB) override { |
1285 | if (BB->getTerminator()->getNumSuccessors() > 1) |
1286 | set(Feature: InlineCostFeatureIndex::is_multiple_blocks, Value: 1); |
1287 | Threshold -= SingleBBBonus; |
1288 | } |
1289 | |
1290 | InlineResult finalizeAnalysis() override { |
1291 | auto *Caller = CandidateCall.getFunction(); |
1292 | if (Caller->hasMinSize()) { |
1293 | DominatorTree DT(F); |
1294 | LoopInfo LI(DT); |
1295 | for (Loop *L : LI) { |
1296 | // Ignore loops that will not be executed |
1297 | if (DeadBlocks.count(Ptr: L->getHeader())) |
1298 | continue; |
1299 | increment(Feature: InlineCostFeatureIndex::num_loops, |
1300 | Delta: InlineConstants::LoopPenalty); |
1301 | } |
1302 | } |
1303 | set(Feature: InlineCostFeatureIndex::dead_blocks, Value: DeadBlocks.size()); |
1304 | set(Feature: InlineCostFeatureIndex::simplified_instructions, |
1305 | Value: NumInstructionsSimplified); |
1306 | set(Feature: InlineCostFeatureIndex::constant_args, Value: NumConstantArgs); |
1307 | set(Feature: InlineCostFeatureIndex::constant_offset_ptr_args, |
1308 | Value: NumConstantOffsetPtrArgs); |
1309 | set(Feature: InlineCostFeatureIndex::sroa_savings, Value: SROACostSavingOpportunities); |
1310 | |
1311 | if (NumVectorInstructions <= NumInstructions / 10) |
1312 | Threshold -= VectorBonus; |
1313 | else if (NumVectorInstructions <= NumInstructions / 2) |
1314 | Threshold -= VectorBonus / 2; |
1315 | |
1316 | set(Feature: InlineCostFeatureIndex::threshold, Value: Threshold); |
1317 | |
1318 | return InlineResult::success(); |
1319 | } |
1320 | |
1321 | bool shouldStop() override { return false; } |
1322 | |
1323 | void onLoadEliminationOpportunity() override { |
1324 | increment(Feature: InlineCostFeatureIndex::load_elimination, Delta: 1); |
1325 | } |
1326 | |
1327 | InlineResult onAnalysisStart() override { |
1328 | increment(Feature: InlineCostFeatureIndex::callsite_cost, |
1329 | Delta: -1 * getCallsiteCost(TTI, Call: this->CandidateCall, DL)); |
1330 | |
1331 | set(Feature: InlineCostFeatureIndex::cold_cc_penalty, |
1332 | Value: (F.getCallingConv() == CallingConv::Cold)); |
1333 | |
1334 | set(Feature: InlineCostFeatureIndex::last_call_to_static_bonus, |
1335 | Value: isSoleCallToLocalFunction(CB: CandidateCall, Callee: F)); |
1336 | |
1337 | // FIXME: we shouldn't repeat this logic in both the Features and Cost |
1338 | // analyzer - instead, we should abstract it to a common method in the |
1339 | // CallAnalyzer |
1340 | int SingleBBBonusPercent = 50; |
1341 | int VectorBonusPercent = TTI.getInlinerVectorBonusPercent(); |
1342 | Threshold += TTI.adjustInliningThreshold(CB: &CandidateCall); |
1343 | Threshold *= TTI.getInliningThresholdMultiplier(); |
1344 | SingleBBBonus = Threshold * SingleBBBonusPercent / 100; |
1345 | VectorBonus = Threshold * VectorBonusPercent / 100; |
1346 | Threshold += (SingleBBBonus + VectorBonus); |
1347 | |
1348 | return InlineResult::success(); |
1349 | } |
1350 | |
1351 | public: |
1352 | InlineCostFeaturesAnalyzer( |
1353 | const TargetTransformInfo &TTI, |
1354 | function_ref<AssumptionCache &(Function &)> &GetAssumptionCache, |
1355 | function_ref<BlockFrequencyInfo &(Function &)> GetBFI, |
1356 | ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE, Function &Callee, |
1357 | CallBase &Call) |
1358 | : CallAnalyzer(Callee, Call, TTI, GetAssumptionCache, GetBFI, PSI) {} |
1359 | |
1360 | const InlineCostFeatures &features() const { return Cost; } |
1361 | }; |
1362 | |
1363 | } // namespace |
1364 | |
1365 | /// Test whether the given value is an Alloca-derived function argument. |
1366 | bool CallAnalyzer::isAllocaDerivedArg(Value *V) { |
1367 | return SROAArgValues.count(Val: V); |
1368 | } |
1369 | |
1370 | void CallAnalyzer::disableSROAForArg(AllocaInst *SROAArg) { |
1371 | onDisableSROA(Arg: SROAArg); |
1372 | EnabledSROAAllocas.erase(V: SROAArg); |
1373 | disableLoadElimination(); |
1374 | } |
1375 | |
1376 | void InlineCostAnnotationWriter::emitInstructionAnnot( |
1377 | const Instruction *I, formatted_raw_ostream &OS) { |
// The cost of inlining the given instruction is always printed.
// The threshold delta is printed only when it is non-zero, which happens
// when we decided to give a bonus at a particular instruction.
1381 | std::optional<InstructionCostDetail> Record = ICCA->getCostDetails(I); |
1382 | if (!Record) |
1383 | OS << "; No analysis for the instruction" ; |
1384 | else { |
1385 | OS << "; cost before = " << Record->CostBefore |
1386 | << ", cost after = " << Record->CostAfter |
1387 | << ", threshold before = " << Record->ThresholdBefore |
1388 | << ", threshold after = " << Record->ThresholdAfter << ", " ; |
1389 | OS << "cost delta = " << Record->getCostDelta(); |
1390 | if (Record->hasThresholdChanged()) |
1391 | OS << ", threshold delta = " << Record->getThresholdDelta(); |
1392 | } |
1393 | auto C = ICCA->getSimplifiedValue(I: const_cast<Instruction *>(I)); |
1394 | if (C) { |
1395 | OS << ", simplified to " ; |
1396 | (*C)->print(O&: OS, IsForDebug: true); |
1397 | } |
1398 | OS << "\n" ; |
1399 | } |
1400 | |
1401 | /// If 'V' maps to a SROA candidate, disable SROA for it. |
1402 | void CallAnalyzer::disableSROA(Value *V) { |
1403 | if (auto *SROAArg = getSROAArgForValueOrNull(V)) { |
1404 | disableSROAForArg(SROAArg); |
1405 | } |
1406 | } |
1407 | |
1408 | void CallAnalyzer::disableLoadElimination() { |
1409 | if (EnableLoadElimination) { |
1410 | onDisableLoadElimination(); |
1411 | EnableLoadElimination = false; |
1412 | } |
1413 | } |
1414 | |
1415 | /// Accumulate a constant GEP offset into an APInt if possible. |
1416 | /// |
1417 | /// Returns false if unable to compute the offset for any reason. Respects any |
1418 | /// simplified values known during the analysis of this callsite. |
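///
/// For illustration (actual values depend on the DataLayout): for a GEP such
/// as
///   getelementptr inbounds {i32, i64}, ptr %p, i64 1, i32 1
/// the array index contributes the allocation size of the whole struct and
/// the field index contributes the byte offset of element 1 within it.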
1419 | bool CallAnalyzer::accumulateGEPOffset(GEPOperator &GEP, APInt &Offset) { |
1420 | unsigned IntPtrWidth = DL.getIndexTypeSizeInBits(Ty: GEP.getType()); |
1421 | assert(IntPtrWidth == Offset.getBitWidth()); |
1422 | |
1423 | for (gep_type_iterator GTI = gep_type_begin(GEP), GTE = gep_type_end(GEP); |
1424 | GTI != GTE; ++GTI) { |
1425 | ConstantInt *OpC = dyn_cast<ConstantInt>(Val: GTI.getOperand()); |
1426 | if (!OpC) |
1427 | if (Constant *SimpleOp = SimplifiedValues.lookup(Val: GTI.getOperand())) |
1428 | OpC = dyn_cast<ConstantInt>(Val: SimpleOp); |
1429 | if (!OpC) |
1430 | return false; |
1431 | if (OpC->isZero()) |
1432 | continue; |
1433 | |
1434 | // Handle a struct index, which adds its field offset to the pointer. |
1435 | if (StructType *STy = GTI.getStructTypeOrNull()) { |
1436 | unsigned ElementIdx = OpC->getZExtValue(); |
1437 | const StructLayout *SL = DL.getStructLayout(Ty: STy); |
1438 | Offset += APInt(IntPtrWidth, SL->getElementOffset(Idx: ElementIdx)); |
1439 | continue; |
1440 | } |
1441 | |
1442 | APInt TypeSize(IntPtrWidth, GTI.getSequentialElementStride(DL)); |
1443 | Offset += OpC->getValue().sextOrTrunc(width: IntPtrWidth) * TypeSize; |
1444 | } |
1445 | return true; |
1446 | } |
1447 | |
1448 | /// Use TTI to check whether a GEP is free. |
1449 | /// |
1450 | /// Respects any simplified values known during the analysis of this callsite. |
1451 | bool CallAnalyzer::isGEPFree(GetElementPtrInst &GEP) { |
1452 | SmallVector<Value *, 4> Operands; |
1453 | Operands.push_back(Elt: GEP.getOperand(i_nocapture: 0)); |
1454 | for (const Use &Op : GEP.indices()) |
1455 | if (Constant *SimpleOp = SimplifiedValues.lookup(Val: Op)) |
1456 | Operands.push_back(Elt: SimpleOp); |
1457 | else |
1458 | Operands.push_back(Elt: Op); |
1459 | return TTI.getInstructionCost(U: &GEP, Operands, |
1460 | CostKind: TargetTransformInfo::TCK_SizeAndLatency) == |
1461 | TargetTransformInfo::TCC_Free; |
1462 | } |
1463 | |
1464 | bool CallAnalyzer::visitAlloca(AllocaInst &I) { |
1465 | disableSROA(V: I.getOperand(i_nocapture: 0)); |
1466 | |
1467 | // Check whether inlining will turn a dynamic alloca into a static |
1468 | // alloca and handle that case. |
1469 | if (I.isArrayAllocation()) { |
1470 | Constant *Size = SimplifiedValues.lookup(Val: I.getArraySize()); |
1471 | if (auto *AllocSize = dyn_cast_or_null<ConstantInt>(Val: Size)) { |
// Sometimes a dynamic alloca could be converted into a static alloca
// after this constant propagation and become a huge static alloca on an
// unconditional CFG path. Avoid inlining if the resulting static alloca
// would exceed a threshold.
1476 | // FIXME: If the threshold is removed or lowered too much, we could end up |
1477 | // being too pessimistic and prevent inlining non-problematic code. This |
1478 | // could result in unintended perf regressions. A better overall strategy |
1479 | // is needed to track stack usage during inlining. |
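// For illustration: if the array size simplifies to the constant 1000 and
// the allocated type is i32 (4 bytes under a typical DataLayout), this adds
// 4000 bytes to AllocatedSize.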
1480 | Type *Ty = I.getAllocatedType(); |
1481 | AllocatedSize = SaturatingMultiplyAdd( |
1482 | X: AllocSize->getLimitedValue(), |
1483 | Y: DL.getTypeAllocSize(Ty).getKnownMinValue(), A: AllocatedSize); |
1484 | if (AllocatedSize > InlineConstants::MaxSimplifiedDynamicAllocaToInline) |
1485 | HasDynamicAlloca = true; |
1486 | return false; |
1487 | } |
1488 | } |
1489 | |
1490 | // Accumulate the allocated size. |
1491 | if (I.isStaticAlloca()) { |
1492 | Type *Ty = I.getAllocatedType(); |
1493 | AllocatedSize = SaturatingAdd(X: DL.getTypeAllocSize(Ty).getKnownMinValue(), |
1494 | Y: AllocatedSize); |
1495 | } |
1496 | |
1497 | // FIXME: This is overly conservative. Dynamic allocas are inefficient for |
1498 | // a variety of reasons, and so we would like to not inline them into |
1499 | // functions which don't currently have a dynamic alloca. This simply |
1500 | // disables inlining altogether in the presence of a dynamic alloca. |
1501 | if (!I.isStaticAlloca()) |
1502 | HasDynamicAlloca = true; |
1503 | |
1504 | return false; |
1505 | } |
1506 | |
1507 | bool CallAnalyzer::visitPHI(PHINode &I) { |
1508 | // FIXME: We need to propagate SROA *disabling* through phi nodes, even |
// though we don't want to propagate its bonuses. The idea is to disable
1510 | // SROA if it *might* be used in an inappropriate manner. |
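// A phi can still be mapped to a constant when every live incoming value is
// the same constant, e.g. (illustrative IR)
//   %x = phi i32 [ 4, %bb0 ], [ 4, %bb1 ]
// simplifies to i32 4 below; likewise, incoming pointers that all share one
// base and constant offset are recorded in ConstantOffsetPtrs.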
1511 | |
1512 | // Phi nodes are always zero-cost. |
// FIXME: Pointer sizes may differ between different address spaces, so do we
// need to use the correct address space in the call to getPointerSizeInBits
// here? Or could we skip the getPointerSizeInBits call completely? As far as
// I can see, ZeroOffset is used as a dummy value, so we can probably use any
// bit width for it.
1518 | APInt ZeroOffset = APInt::getZero(numBits: DL.getPointerSizeInBits(AS: 0)); |
1519 | bool CheckSROA = I.getType()->isPointerTy(); |
1520 | |
1521 | // Track the constant or pointer with constant offset we've seen so far. |
1522 | Constant *FirstC = nullptr; |
1523 | std::pair<Value *, APInt> FirstBaseAndOffset = {nullptr, ZeroOffset}; |
1524 | Value *FirstV = nullptr; |
1525 | |
1526 | for (unsigned i = 0, e = I.getNumIncomingValues(); i != e; ++i) { |
1527 | BasicBlock *Pred = I.getIncomingBlock(i); |
1528 | // If the incoming block is dead, skip the incoming block. |
1529 | if (DeadBlocks.count(Ptr: Pred)) |
1530 | continue; |
// If the parent block of the phi is not the known successor of the incoming
1532 | // block, skip the incoming block. |
1533 | BasicBlock *KnownSuccessor = KnownSuccessors[Pred]; |
1534 | if (KnownSuccessor && KnownSuccessor != I.getParent()) |
1535 | continue; |
1536 | |
1537 | Value *V = I.getIncomingValue(i); |
1538 | // If the incoming value is this phi itself, skip the incoming value. |
1539 | if (&I == V) |
1540 | continue; |
1541 | |
1542 | Constant *C = dyn_cast<Constant>(Val: V); |
1543 | if (!C) |
1544 | C = SimplifiedValues.lookup(Val: V); |
1545 | |
1546 | std::pair<Value *, APInt> BaseAndOffset = {nullptr, ZeroOffset}; |
1547 | if (!C && CheckSROA) |
1548 | BaseAndOffset = ConstantOffsetPtrs.lookup(Val: V); |
1549 | |
1550 | if (!C && !BaseAndOffset.first) |
1551 | // The incoming value is neither a constant nor a pointer with constant |
1552 | // offset, exit early. |
1553 | return true; |
1554 | |
1555 | if (FirstC) { |
1556 | if (FirstC == C) |
1557 | // If we've seen a constant incoming value before and it is the same |
1558 | // constant we see this time, continue checking the next incoming value. |
1559 | continue; |
1560 | // Otherwise early exit because we either see a different constant or saw |
1561 | // a constant before but we have a pointer with constant offset this time. |
1562 | return true; |
1563 | } |
1564 | |
1565 | if (FirstV) { |
1566 | // The same logic as above, but check pointer with constant offset here. |
1567 | if (FirstBaseAndOffset == BaseAndOffset) |
1568 | continue; |
1569 | return true; |
1570 | } |
1571 | |
1572 | if (C) { |
1573 | // This is the 1st time we've seen a constant, record it. |
1574 | FirstC = C; |
1575 | continue; |
1576 | } |
1577 | |
1578 | // The remaining case is that this is the 1st time we've seen a pointer with |
1579 | // constant offset, record it. |
1580 | FirstV = V; |
1581 | FirstBaseAndOffset = BaseAndOffset; |
1582 | } |
1583 | |
1584 | // Check if we can map phi to a constant. |
1585 | if (FirstC) { |
1586 | SimplifiedValues[&I] = FirstC; |
1587 | return true; |
1588 | } |
1589 | |
1590 | // Check if we can map phi to a pointer with constant offset. |
1591 | if (FirstBaseAndOffset.first) { |
1592 | ConstantOffsetPtrs[&I] = FirstBaseAndOffset; |
1593 | |
1594 | if (auto *SROAArg = getSROAArgForValueOrNull(V: FirstV)) |
1595 | SROAArgValues[&I] = SROAArg; |
1596 | } |
1597 | |
1598 | return true; |
1599 | } |
1600 | |
1601 | /// Check we can fold GEPs of constant-offset call site argument pointers. |
1602 | /// This requires target data and inbounds GEPs. |
1603 | /// |
1604 | /// \return true if the specified GEP can be folded. |
1605 | bool CallAnalyzer::canFoldInboundsGEP(GetElementPtrInst &I) { |
1606 | // Check if we have a base + offset for the pointer. |
1607 | std::pair<Value *, APInt> BaseAndOffset = |
1608 | ConstantOffsetPtrs.lookup(Val: I.getPointerOperand()); |
1609 | if (!BaseAndOffset.first) |
1610 | return false; |
1611 | |
1612 | // Check if the offset of this GEP is constant, and if so accumulate it |
1613 | // into Offset. |
1614 | if (!accumulateGEPOffset(GEP&: cast<GEPOperator>(Val&: I), Offset&: BaseAndOffset.second)) |
1615 | return false; |
1616 | |
1617 | // Add the result as a new mapping to Base + Offset. |
1618 | ConstantOffsetPtrs[&I] = BaseAndOffset; |
1619 | |
1620 | return true; |
1621 | } |
1622 | |
1623 | bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) { |
1624 | auto *SROAArg = getSROAArgForValueOrNull(V: I.getPointerOperand()); |
1625 | |
1626 | // Lambda to check whether a GEP's indices are all constant. |
1627 | auto IsGEPOffsetConstant = [&](GetElementPtrInst &GEP) { |
1628 | for (const Use &Op : GEP.indices()) |
1629 | if (!isa<Constant>(Val: Op) && !SimplifiedValues.lookup(Val: Op)) |
1630 | return false; |
1631 | return true; |
1632 | }; |
1633 | |
1634 | if (!DisableGEPConstOperand) |
1635 | if (simplifyInstruction(I)) |
1636 | return true; |
1637 | |
1638 | if ((I.isInBounds() && canFoldInboundsGEP(I)) || IsGEPOffsetConstant(I)) { |
1639 | if (SROAArg) |
1640 | SROAArgValues[&I] = SROAArg; |
1641 | |
1642 | // Constant GEPs are modeled as free. |
1643 | return true; |
1644 | } |
1645 | |
1646 | // Variable GEPs will require math and will disable SROA. |
1647 | if (SROAArg) |
1648 | disableSROAForArg(SROAArg); |
1649 | return isGEPFree(GEP&: I); |
1650 | } |
1651 | |
1652 | /// Simplify \p I if its operands are constants and update SimplifiedValues. |
1653 | bool CallAnalyzer::simplifyInstruction(Instruction &I) { |
1654 | SmallVector<Constant *> COps; |
1655 | for (Value *Op : I.operands()) { |
1656 | Constant *COp = dyn_cast<Constant>(Val: Op); |
1657 | if (!COp) |
1658 | COp = SimplifiedValues.lookup(Val: Op); |
1659 | if (!COp) |
1660 | return false; |
1661 | COps.push_back(Elt: COp); |
1662 | } |
1663 | auto *C = ConstantFoldInstOperands(I: &I, Ops: COps, DL); |
1664 | if (!C) |
1665 | return false; |
1666 | SimplifiedValues[&I] = C; |
1667 | return true; |
1668 | } |
1669 | |
1670 | /// Try to simplify a call to llvm.is.constant. |
1671 | /// |
1672 | /// Duplicate the argument checking from CallAnalyzer::simplifyCallSite since |
1673 | /// we expect calls of this specific intrinsic to be infrequent. |
1674 | /// |
1675 | /// FIXME: Given that we know CB's parent (F) caller |
1676 | /// (CandidateCall->getParent()->getParent()), we might be able to determine |
1677 | /// whether inlining F into F's caller would change how the call to |
1678 | /// llvm.is.constant would evaluate. |
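///
/// Front ends typically emit this intrinsic for __builtin_constant_p, so a
/// constant (or simplified-to-constant) argument folds the call to 1 and
/// anything else folds it to 0, mirroring the logic below.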
1679 | bool CallAnalyzer::simplifyIntrinsicCallIsConstant(CallBase &CB) { |
1680 | Value *Arg = CB.getArgOperand(i: 0); |
1681 | auto *C = dyn_cast<Constant>(Val: Arg); |
1682 | |
1683 | if (!C) |
1684 | C = dyn_cast_or_null<Constant>(Val: SimplifiedValues.lookup(Val: Arg)); |
1685 | |
1686 | Type *RT = CB.getFunctionType()->getReturnType(); |
1687 | SimplifiedValues[&CB] = ConstantInt::get(Ty: RT, V: C ? 1 : 0); |
1688 | return true; |
1689 | } |
1690 | |
1691 | bool CallAnalyzer::simplifyIntrinsicCallObjectSize(CallBase &CB) { |
1692 | // As per the langref, "The fourth argument to llvm.objectsize determines if |
1693 | // the value should be evaluated at runtime." |
1694 | if (cast<ConstantInt>(Val: CB.getArgOperand(i: 3))->isOne()) |
1695 | return false; |
1696 | |
1697 | Value *V = lowerObjectSizeCall(ObjectSize: &cast<IntrinsicInst>(Val&: CB), DL, TLI: nullptr, |
1698 | /*MustSucceed=*/true); |
1699 | Constant *C = dyn_cast_or_null<Constant>(Val: V); |
1700 | if (C) |
1701 | SimplifiedValues[&CB] = C; |
1702 | return C; |
1703 | } |
1704 | |
1705 | bool CallAnalyzer::visitBitCast(BitCastInst &I) { |
1706 | // Propagate constants through bitcasts. |
1707 | if (simplifyInstruction(I)) |
1708 | return true; |
1709 | |
1710 | // Track base/offsets through casts |
1711 | std::pair<Value *, APInt> BaseAndOffset = |
1712 | ConstantOffsetPtrs.lookup(Val: I.getOperand(i_nocapture: 0)); |
1713 | // Casts don't change the offset, just wrap it up. |
1714 | if (BaseAndOffset.first) |
1715 | ConstantOffsetPtrs[&I] = BaseAndOffset; |
1716 | |
1717 | // Also look for SROA candidates here. |
1718 | if (auto *SROAArg = getSROAArgForValueOrNull(V: I.getOperand(i_nocapture: 0))) |
1719 | SROAArgValues[&I] = SROAArg; |
1720 | |
1721 | // Bitcasts are always zero cost. |
1722 | return true; |
1723 | } |
1724 | |
1725 | bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) { |
1726 | // Propagate constants through ptrtoint. |
1727 | if (simplifyInstruction(I)) |
1728 | return true; |
1729 | |
1730 | // Track base/offset pairs when converted to a plain integer provided the |
1731 | // integer is large enough to represent the pointer. |
1732 | unsigned IntegerSize = I.getType()->getScalarSizeInBits(); |
1733 | unsigned AS = I.getOperand(i_nocapture: 0)->getType()->getPointerAddressSpace(); |
1734 | if (IntegerSize == DL.getPointerSizeInBits(AS)) { |
1735 | std::pair<Value *, APInt> BaseAndOffset = |
1736 | ConstantOffsetPtrs.lookup(Val: I.getOperand(i_nocapture: 0)); |
1737 | if (BaseAndOffset.first) |
1738 | ConstantOffsetPtrs[&I] = BaseAndOffset; |
1739 | } |
1740 | |
1741 | // This is really weird. Technically, ptrtoint will disable SROA. However, |
1742 | // unless that ptrtoint is *used* somewhere in the live basic blocks after |
1743 | // inlining, it will be nuked, and SROA should proceed. All of the uses which |
1744 | // would block SROA would also block SROA if applied directly to a pointer, |
1745 | // and so we can just add the integer in here. The only places where SROA is |
1746 | // preserved either cannot fire on an integer, or won't in-and-of themselves |
1747 | // disable SROA (ext) w/o some later use that we would see and disable. |
1748 | if (auto *SROAArg = getSROAArgForValueOrNull(V: I.getOperand(i_nocapture: 0))) |
1749 | SROAArgValues[&I] = SROAArg; |
1750 | |
1751 | return TTI.getInstructionCost(U: &I, CostKind: TargetTransformInfo::TCK_SizeAndLatency) == |
1752 | TargetTransformInfo::TCC_Free; |
1753 | } |
1754 | |
1755 | bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) { |
// Propagate constants through inttoptr.
1757 | if (simplifyInstruction(I)) |
1758 | return true; |
1759 | |
1760 | // Track base/offset pairs when round-tripped through a pointer without |
1761 | // modifications provided the integer is not too large. |
1762 | Value *Op = I.getOperand(i_nocapture: 0); |
1763 | unsigned IntegerSize = Op->getType()->getScalarSizeInBits(); |
1764 | if (IntegerSize <= DL.getPointerTypeSizeInBits(I.getType())) { |
1765 | std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Val: Op); |
1766 | if (BaseAndOffset.first) |
1767 | ConstantOffsetPtrs[&I] = BaseAndOffset; |
1768 | } |
1769 | |
1770 | // "Propagate" SROA here in the same manner as we do for ptrtoint above. |
1771 | if (auto *SROAArg = getSROAArgForValueOrNull(V: Op)) |
1772 | SROAArgValues[&I] = SROAArg; |
1773 | |
1774 | return TTI.getInstructionCost(U: &I, CostKind: TargetTransformInfo::TCK_SizeAndLatency) == |
1775 | TargetTransformInfo::TCC_Free; |
1776 | } |
1777 | |
1778 | bool CallAnalyzer::visitCastInst(CastInst &I) { |
1779 | // Propagate constants through casts. |
1780 | if (simplifyInstruction(I)) |
1781 | return true; |
1782 | |
1783 | // Disable SROA in the face of arbitrary casts we don't explicitly list |
1784 | // elsewhere. |
1785 | disableSROA(V: I.getOperand(i_nocapture: 0)); |
1786 | |
1787 | // If this is a floating-point cast, and the target says this operation |
1788 | // is expensive, this may eventually become a library call. Treat the cost |
1789 | // as such. |
1790 | switch (I.getOpcode()) { |
1791 | case Instruction::FPTrunc: |
1792 | case Instruction::FPExt: |
1793 | case Instruction::UIToFP: |
1794 | case Instruction::SIToFP: |
1795 | case Instruction::FPToUI: |
1796 | case Instruction::FPToSI: |
1797 | if (TTI.getFPOpCost(Ty: I.getType()) == TargetTransformInfo::TCC_Expensive) |
1798 | onCallPenalty(); |
1799 | break; |
1800 | default: |
1801 | break; |
1802 | } |
1803 | |
1804 | return TTI.getInstructionCost(U: &I, CostKind: TargetTransformInfo::TCK_SizeAndLatency) == |
1805 | TargetTransformInfo::TCC_Free; |
1806 | } |
1807 | |
1808 | bool CallAnalyzer::paramHasAttr(Argument *A, Attribute::AttrKind Attr) { |
1809 | return CandidateCall.paramHasAttr(ArgNo: A->getArgNo(), Kind: Attr); |
1810 | } |
1811 | |
1812 | bool CallAnalyzer::isKnownNonNullInCallee(Value *V) { |
1813 | // Does the *call site* have the NonNull attribute set on an argument? We |
1814 | // use the attribute on the call site to memoize any analysis done in the |
1815 | // caller. This will also trip if the callee function has a non-null |
1816 | // parameter attribute, but that's a less interesting case because hopefully |
1817 | // the callee would already have been simplified based on that. |
1818 | if (Argument *A = dyn_cast<Argument>(Val: V)) |
1819 | if (paramHasAttr(A, Attr: Attribute::NonNull)) |
1820 | return true; |
1821 | |
1822 | // Is this an alloca in the caller? This is distinct from the attribute case |
1823 | // above because attributes aren't updated within the inliner itself and we |
1824 | // always want to catch the alloca derived case. |
1825 | if (isAllocaDerivedArg(V)) |
1826 | // We can actually predict the result of comparisons between an |
1827 | // alloca-derived value and null. Note that this fires regardless of |
1828 | // SROA firing. |
1829 | return true; |
1830 | |
1831 | return false; |
1832 | } |
1833 | |
1834 | bool CallAnalyzer::allowSizeGrowth(CallBase &Call) { |
1835 | // If the normal destination of the invoke or the parent block of the call |
1836 | // site is unreachable-terminated, there is little point in inlining this |
1837 | // unless there is literally zero cost. |
1838 | // FIXME: Note that it is possible that an unreachable-terminated block has a |
1839 | // hot entry. For example, in below scenario inlining hot_call_X() may be |
1840 | // beneficial : |
1841 | // main() { |
1842 | // hot_call_1(); |
1843 | // ... |
1844 | // hot_call_N() |
1845 | // exit(0); |
1846 | // } |
// For now, we are not handling this corner case here as it is rare in real
// code. In the future, we should elaborate on this based on BPI and BFI in
// more general threshold-adjusting heuristics in updateThreshold().
1850 | if (InvokeInst *II = dyn_cast<InvokeInst>(Val: &Call)) { |
1851 | if (isa<UnreachableInst>(Val: II->getNormalDest()->getTerminator())) |
1852 | return false; |
1853 | } else if (isa<UnreachableInst>(Val: Call.getParent()->getTerminator())) |
1854 | return false; |
1855 | |
1856 | return true; |
1857 | } |
1858 | |
1859 | bool InlineCostCallAnalyzer::isColdCallSite(CallBase &Call, |
1860 | BlockFrequencyInfo *CallerBFI) { |
1861 | // If global profile summary is available, then callsite's coldness is |
1862 | // determined based on that. |
1863 | if (PSI && PSI->hasProfileSummary()) |
1864 | return PSI->isColdCallSite(CB: Call, BFI: CallerBFI); |
1865 | |
1866 | // Otherwise we need BFI to be available. |
1867 | if (!CallerBFI) |
1868 | return false; |
1869 | |
1870 | // Determine if the callsite is cold relative to caller's entry. We could |
1871 | // potentially cache the computation of scaled entry frequency, but the added |
1872 | // complexity is not worth it unless this scaling shows up high in the |
1873 | // profiles. |
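// For illustration only: if ColdCallSiteRelFreq is 2, a callsite whose block
// frequency is below 2% of the caller's entry frequency is treated as cold;
// the actual value comes from the ColdCallSiteRelFreq option.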
1874 | const BranchProbability ColdProb(ColdCallSiteRelFreq, 100); |
1875 | auto CallSiteBB = Call.getParent(); |
1876 | auto CallSiteFreq = CallerBFI->getBlockFreq(BB: CallSiteBB); |
1877 | auto CallerEntryFreq = |
1878 | CallerBFI->getBlockFreq(BB: &(Call.getCaller()->getEntryBlock())); |
1879 | return CallSiteFreq < CallerEntryFreq * ColdProb; |
1880 | } |
1881 | |
1882 | std::optional<int> |
1883 | InlineCostCallAnalyzer::getHotCallSiteThreshold(CallBase &Call, |
1884 | BlockFrequencyInfo *CallerBFI) { |
1885 | |
1886 | // If global profile summary is available, then callsite's hotness is |
1887 | // determined based on that. |
1888 | if (PSI && PSI->hasProfileSummary() && PSI->isHotCallSite(CB: Call, BFI: CallerBFI)) |
1889 | return Params.HotCallSiteThreshold; |
1890 | |
1891 | // Otherwise we need BFI to be available and to have a locally hot callsite |
1892 | // threshold. |
1893 | if (!CallerBFI || !Params.LocallyHotCallSiteThreshold) |
1894 | return std::nullopt; |
1895 | |
1896 | // Determine if the callsite is hot relative to caller's entry. We could |
1897 | // potentially cache the computation of scaled entry frequency, but the added |
1898 | // complexity is not worth it unless this scaling shows up high in the |
1899 | // profiles. |
1900 | const BasicBlock *CallSiteBB = Call.getParent(); |
1901 | BlockFrequency CallSiteFreq = CallerBFI->getBlockFreq(BB: CallSiteBB); |
1902 | BlockFrequency CallerEntryFreq = CallerBFI->getEntryFreq(); |
1903 | std::optional<BlockFrequency> Limit = CallerEntryFreq.mul(Factor: HotCallSiteRelFreq); |
1904 | if (Limit && CallSiteFreq >= *Limit) |
1905 | return Params.LocallyHotCallSiteThreshold; |
1906 | |
1907 | // Otherwise treat it normally. |
1908 | return std::nullopt; |
1909 | } |
1910 | |
1911 | void InlineCostCallAnalyzer::updateThreshold(CallBase &Call, Function &Callee) { |
1912 | // If no size growth is allowed for this inlining, set Threshold to 0. |
1913 | if (!allowSizeGrowth(Call)) { |
1914 | Threshold = 0; |
1915 | return; |
1916 | } |
1917 | |
1918 | Function *Caller = Call.getCaller(); |
1919 | |
1920 | // return min(A, B) if B is valid. |
1921 | auto MinIfValid = [](int A, std::optional<int> B) { |
1922 | return B ? std::min(a: A, b: *B) : A; |
1923 | }; |
1924 | |
1925 | // return max(A, B) if B is valid. |
1926 | auto MaxIfValid = [](int A, std::optional<int> B) { |
1927 | return B ? std::max(a: A, b: *B) : A; |
1928 | }; |
1929 | |
1930 | // Various bonus percentages. These are multiplied by Threshold to get the |
1931 | // bonus values. |
1932 | // SingleBBBonus: This bonus is applied if the callee has a single reachable |
1933 | // basic block at the given callsite context. This is speculatively applied |
1934 | // and withdrawn if more than one basic block is seen. |
1935 | // |
// LastCallToStaticBonus: This large bonus is applied to ensure the inlining
1937 | // of the last call to a static function as inlining such functions is |
1938 | // guaranteed to reduce code size. |
1939 | // |
1940 | // These bonus percentages may be set to 0 based on properties of the caller |
1941 | // and the callsite. |
1942 | int SingleBBBonusPercent = 50; |
1943 | int VectorBonusPercent = TTI.getInlinerVectorBonusPercent(); |
1944 | int LastCallToStaticBonus = InlineConstants::LastCallToStaticBonus; |
1945 | |
1946 | // Lambda to set all the above bonus and bonus percentages to 0. |
1947 | auto DisallowAllBonuses = [&]() { |
1948 | SingleBBBonusPercent = 0; |
1949 | VectorBonusPercent = 0; |
1950 | LastCallToStaticBonus = 0; |
1951 | }; |
1952 | |
1953 | // Use the OptMinSizeThreshold or OptSizeThreshold knob if they are available |
1954 | // and reduce the threshold if the caller has the necessary attribute. |
1955 | if (Caller->hasMinSize()) { |
1956 | Threshold = MinIfValid(Threshold, Params.OptMinSizeThreshold); |
1957 | // For minsize, we want to disable the single BB bonus and the vector |
1958 | // bonuses, but not the last-call-to-static bonus. Inlining the last call to |
1959 | // a static function will, at the minimum, eliminate the parameter setup and |
1960 | // call/return instructions. |
1961 | SingleBBBonusPercent = 0; |
1962 | VectorBonusPercent = 0; |
1963 | } else if (Caller->hasOptSize()) |
1964 | Threshold = MinIfValid(Threshold, Params.OptSizeThreshold); |
1965 | |
1966 | // Adjust the threshold based on inlinehint attribute and profile based |
1967 | // hotness information if the caller does not have MinSize attribute. |
1968 | if (!Caller->hasMinSize()) { |
1969 | if (Callee.hasFnAttribute(Kind: Attribute::InlineHint)) |
1970 | Threshold = MaxIfValid(Threshold, Params.HintThreshold); |
1971 | |
1972 | // FIXME: After switching to the new passmanager, simplify the logic below |
1973 | // by checking only the callsite hotness/coldness as we will reliably |
1974 | // have local profile information. |
1975 | // |
1976 | // Callsite hotness and coldness can be determined if sample profile is |
1977 | // used (which adds hotness metadata to calls) or if caller's |
1978 | // BlockFrequencyInfo is available. |
1979 | BlockFrequencyInfo *CallerBFI = GetBFI ? &(GetBFI(*Caller)) : nullptr; |
1980 | auto HotCallSiteThreshold = getHotCallSiteThreshold(Call, CallerBFI); |
1981 | if (!Caller->hasOptSize() && HotCallSiteThreshold) { |
1982 | LLVM_DEBUG(dbgs() << "Hot callsite.\n" ); |
1983 | // FIXME: This should update the threshold only if it exceeds the |
1984 | // current threshold, but AutoFDO + ThinLTO currently relies on this |
1985 | // behavior to prevent inlining of hot callsites during ThinLTO |
1986 | // compile phase. |
1987 | Threshold = *HotCallSiteThreshold; |
1988 | } else if (isColdCallSite(Call, CallerBFI)) { |
1989 | LLVM_DEBUG(dbgs() << "Cold callsite.\n" ); |
1990 | // Do not apply bonuses for a cold callsite including the |
1991 | // LastCallToStatic bonus. While this bonus might result in code size |
1992 | // reduction, it can cause the size of a non-cold caller to increase |
1993 | // preventing it from being inlined. |
1994 | DisallowAllBonuses(); |
1995 | Threshold = MinIfValid(Threshold, Params.ColdCallSiteThreshold); |
1996 | } else if (PSI) { |
1997 | // Use callee's global profile information only if we have no way of |
1998 | // determining this via callsite information. |
1999 | if (PSI->isFunctionEntryHot(F: &Callee)) { |
2000 | LLVM_DEBUG(dbgs() << "Hot callee.\n" ); |
2001 | // If callsite hotness can not be determined, we may still know |
2002 | // that the callee is hot and treat it as a weaker hint for threshold |
2003 | // increase. |
2004 | Threshold = MaxIfValid(Threshold, Params.HintThreshold); |
2005 | } else if (PSI->isFunctionEntryCold(F: &Callee)) { |
2006 | LLVM_DEBUG(dbgs() << "Cold callee.\n" ); |
2007 | // Do not apply bonuses for a cold callee including the |
2008 | // LastCallToStatic bonus. While this bonus might result in code size |
2009 | // reduction, it can cause the size of a non-cold caller to increase |
2010 | // preventing it from being inlined. |
2011 | DisallowAllBonuses(); |
2012 | Threshold = MinIfValid(Threshold, Params.ColdThreshold); |
2013 | } |
2014 | } |
2015 | } |
2016 | |
2017 | Threshold += TTI.adjustInliningThreshold(CB: &Call); |
2018 | |
2019 | // Finally, take the target-specific inlining threshold multiplier into |
2020 | // account. |
2021 | Threshold *= TTI.getInliningThresholdMultiplier(); |
2022 | |
2023 | SingleBBBonus = Threshold * SingleBBBonusPercent / 100; |
2024 | VectorBonus = Threshold * VectorBonusPercent / 100; |
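// For illustration only: with a threshold of 225 (the documented default)
// and SingleBBBonusPercent of 50, SingleBBBonus comes out to 112;
// VectorBonusPercent is target dependent.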
2025 | |
2026 | // If there is only one call of the function, and it has internal linkage, |
2027 | // the cost of inlining it drops dramatically. It may seem odd to update |
2028 | // Cost in updateThreshold, but the bonus depends on the logic in this method. |
2029 | if (isSoleCallToLocalFunction(CB: Call, Callee: F)) { |
2030 | Cost -= LastCallToStaticBonus; |
2031 | StaticBonusApplied = LastCallToStaticBonus; |
2032 | } |
2033 | } |
2034 | |
2035 | bool CallAnalyzer::visitCmpInst(CmpInst &I) { |
2036 | Value *LHS = I.getOperand(i_nocapture: 0), *RHS = I.getOperand(i_nocapture: 1); |
2037 | // First try to handle simplified comparisons. |
2038 | if (simplifyInstruction(I)) |
2039 | return true; |
2040 | |
2041 | if (I.getOpcode() == Instruction::FCmp) |
2042 | return false; |
2043 | |
2044 | // Otherwise look for a comparison between constant offset pointers with |
2045 | // a common base. |
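// For example (illustrative), an equality compare of two pointers derived
// from the same alloca at constant offsets 4 and 8 folds to false here,
// independent of the alloca's runtime address.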
2046 | Value *LHSBase, *RHSBase; |
2047 | APInt LHSOffset, RHSOffset; |
2048 | std::tie(args&: LHSBase, args&: LHSOffset) = ConstantOffsetPtrs.lookup(Val: LHS); |
2049 | if (LHSBase) { |
2050 | std::tie(args&: RHSBase, args&: RHSOffset) = ConstantOffsetPtrs.lookup(Val: RHS); |
2051 | if (RHSBase && LHSBase == RHSBase) { |
2052 | // We have common bases, fold the icmp to a constant based on the |
2053 | // offsets. |
2054 | SimplifiedValues[&I] = ConstantInt::getBool( |
2055 | Ty: I.getType(), |
2056 | V: ICmpInst::compare(LHS: LHSOffset, RHS: RHSOffset, Pred: I.getPredicate())); |
2057 | ++NumConstantPtrCmps; |
2058 | return true; |
2059 | } |
2060 | } |
2061 | |
2062 | auto isImplicitNullCheckCmp = [](const CmpInst &I) { |
2063 | for (auto *User : I.users()) |
2064 | if (auto *Instr = dyn_cast<Instruction>(Val: User)) |
2065 | if (!Instr->getMetadata(KindID: LLVMContext::MD_make_implicit)) |
2066 | return false; |
2067 | return true; |
2068 | }; |
2069 | |
2070 | // If the comparison is an equality comparison with null, we can simplify it |
2071 | // if we know the value (argument) can't be null |
2072 | if (I.isEquality() && isa<ConstantPointerNull>(Val: I.getOperand(i_nocapture: 1))) { |
2073 | if (isKnownNonNullInCallee(V: I.getOperand(i_nocapture: 0))) { |
2074 | bool IsNotEqual = I.getPredicate() == CmpInst::ICMP_NE; |
2075 | SimplifiedValues[&I] = IsNotEqual ? ConstantInt::getTrue(Ty: I.getType()) |
2076 | : ConstantInt::getFalse(Ty: I.getType()); |
2077 | return true; |
2078 | } |
2079 | // Implicit null checks act as unconditional branches and their comparisons |
2080 | // should be treated as simplified and free of cost. |
2081 | if (isImplicitNullCheckCmp(I)) |
2082 | return true; |
2083 | } |
2084 | return handleSROA(V: I.getOperand(i_nocapture: 0), DoNotDisable: isa<ConstantPointerNull>(Val: I.getOperand(i_nocapture: 1))); |
2085 | } |
2086 | |
2087 | bool CallAnalyzer::visitSub(BinaryOperator &I) { |
2088 | // Try to handle a special case: we can fold computing the difference of two |
2089 | // constant-related pointers. |
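// For example (illustrative IR), with
//   %pi = ptrtoint ptr %gep12 to i64   ; base %p, constant offset 12
//   %qi = ptrtoint ptr %gep4 to i64    ; base %p, constant offset 4
// a sub of %pi and %qi folds to the constant 8 via the ConstantOffsetPtrs
// entries that visitPtrToInt propagated for the two operands.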
2090 | Value *LHS = I.getOperand(i_nocapture: 0), *RHS = I.getOperand(i_nocapture: 1); |
2091 | Value *LHSBase, *RHSBase; |
2092 | APInt LHSOffset, RHSOffset; |
2093 | std::tie(args&: LHSBase, args&: LHSOffset) = ConstantOffsetPtrs.lookup(Val: LHS); |
2094 | if (LHSBase) { |
2095 | std::tie(args&: RHSBase, args&: RHSOffset) = ConstantOffsetPtrs.lookup(Val: RHS); |
2096 | if (RHSBase && LHSBase == RHSBase) { |
2097 | // We have common bases, fold the subtract to a constant based on the |
2098 | // offsets. |
2099 | Constant *CLHS = ConstantInt::get(Context&: LHS->getContext(), V: LHSOffset); |
2100 | Constant *CRHS = ConstantInt::get(Context&: RHS->getContext(), V: RHSOffset); |
2101 | if (Constant *C = ConstantExpr::getSub(C1: CLHS, C2: CRHS)) { |
2102 | SimplifiedValues[&I] = C; |
2103 | ++NumConstantPtrDiffs; |
2104 | return true; |
2105 | } |
2106 | } |
2107 | } |
2108 | |
2109 | // Otherwise, fall back to the generic logic for simplifying and handling |
2110 | // instructions. |
2111 | return Base::visitSub(I); |
2112 | } |
2113 | |
2114 | bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) { |
2115 | Value *LHS = I.getOperand(i_nocapture: 0), *RHS = I.getOperand(i_nocapture: 1); |
2116 | Constant *CLHS = dyn_cast<Constant>(Val: LHS); |
2117 | if (!CLHS) |
2118 | CLHS = SimplifiedValues.lookup(Val: LHS); |
2119 | Constant *CRHS = dyn_cast<Constant>(Val: RHS); |
2120 | if (!CRHS) |
2121 | CRHS = SimplifiedValues.lookup(Val: RHS); |
2122 | |
2123 | Value *SimpleV = nullptr; |
2124 | if (auto FI = dyn_cast<FPMathOperator>(Val: &I)) |
2125 | SimpleV = simplifyBinOp(Opcode: I.getOpcode(), LHS: CLHS ? CLHS : LHS, RHS: CRHS ? CRHS : RHS, |
2126 | FMF: FI->getFastMathFlags(), Q: DL); |
2127 | else |
2128 | SimpleV = |
2129 | simplifyBinOp(Opcode: I.getOpcode(), LHS: CLHS ? CLHS : LHS, RHS: CRHS ? CRHS : RHS, Q: DL); |
2130 | |
2131 | if (Constant *C = dyn_cast_or_null<Constant>(Val: SimpleV)) |
2132 | SimplifiedValues[&I] = C; |
2133 | |
2134 | if (SimpleV) |
2135 | return true; |
2136 | |
2137 | // Disable any SROA on arguments to arbitrary, unsimplified binary operators. |
2138 | disableSROA(V: LHS); |
2139 | disableSROA(V: RHS); |
2140 | |
2141 | // If the instruction is floating point, and the target says this operation |
2142 | // is expensive, this may eventually become a library call. Treat the cost |
// as such, unless it's an fneg, which can be implemented with an xor.
2144 | using namespace llvm::PatternMatch; |
2145 | if (I.getType()->isFloatingPointTy() && |
2146 | TTI.getFPOpCost(Ty: I.getType()) == TargetTransformInfo::TCC_Expensive && |
2147 | !match(V: &I, P: m_FNeg(X: m_Value()))) |
2148 | onCallPenalty(); |
2149 | |
2150 | return false; |
2151 | } |
2152 | |
2153 | bool CallAnalyzer::visitFNeg(UnaryOperator &I) { |
2154 | Value *Op = I.getOperand(i_nocapture: 0); |
2155 | Constant *COp = dyn_cast<Constant>(Val: Op); |
2156 | if (!COp) |
2157 | COp = SimplifiedValues.lookup(Val: Op); |
2158 | |
2159 | Value *SimpleV = simplifyFNegInst( |
2160 | Op: COp ? COp : Op, FMF: cast<FPMathOperator>(Val&: I).getFastMathFlags(), Q: DL); |
2161 | |
2162 | if (Constant *C = dyn_cast_or_null<Constant>(Val: SimpleV)) |
2163 | SimplifiedValues[&I] = C; |
2164 | |
2165 | if (SimpleV) |
2166 | return true; |
2167 | |
2168 | // Disable any SROA on arguments to arbitrary, unsimplified fneg. |
2169 | disableSROA(V: Op); |
2170 | |
2171 | return false; |
2172 | } |
2173 | |
2174 | bool CallAnalyzer::visitLoad(LoadInst &I) { |
2175 | if (handleSROA(V: I.getPointerOperand(), DoNotDisable: I.isSimple())) |
2176 | return true; |
2177 | |
2178 | // If the data is already loaded from this address and hasn't been clobbered |
2179 | // by any stores or calls, this load is likely to be redundant and can be |
2180 | // eliminated. |
2181 | if (EnableLoadElimination && |
2182 | !LoadAddrSet.insert(Ptr: I.getPointerOperand()).second && I.isUnordered()) { |
2183 | onLoadEliminationOpportunity(); |
2184 | return true; |
2185 | } |
2186 | |
2187 | onMemAccess(); |
2188 | return false; |
2189 | } |
2190 | |
2191 | bool CallAnalyzer::visitStore(StoreInst &I) { |
2192 | if (handleSROA(V: I.getPointerOperand(), DoNotDisable: I.isSimple())) |
2193 | return true; |
2194 | |
2195 | // The store can potentially clobber loads and prevent repeated loads from |
2196 | // being eliminated. |
2197 | // FIXME: |
// 1. We can probably keep an initial set of eliminatable loads subtracted
2199 | // from the cost even when we finally see a store. We just need to disable |
2200 | // *further* accumulation of elimination savings. |
2201 | // 2. We should probably at some point thread MemorySSA for the callee into |
2202 | // this and then use that to actually compute *really* precise savings. |
2203 | disableLoadElimination(); |
2204 | |
2205 | onMemAccess(); |
2206 | return false; |
2207 | } |
2208 | |
bool CallAnalyzer::visitExtractValue(ExtractValueInst &I) {
2210 | // Constant folding for extract value is trivial. |
2211 | if (simplifyInstruction(I)) |
2212 | return true; |
2213 | |
2214 | // SROA can't look through these, but they may be free. |
2215 | return Base::visitExtractValue(I); |
2216 | } |
2217 | |
2218 | bool CallAnalyzer::visitInsertValue(InsertValueInst &I) { |
2219 | // Constant folding for insert value is trivial. |
2220 | if (simplifyInstruction(I)) |
2221 | return true; |
2222 | |
2223 | // SROA can't look through these, but they may be free. |
2224 | return Base::visitInsertValue(I); |
2225 | } |
2226 | |
2227 | /// Try to simplify a call site. |
2228 | /// |
2229 | /// Takes a concrete function and callsite and tries to actually simplify it by |
2230 | /// analyzing the arguments and call itself with instsimplify. Returns true if |
2231 | /// it has simplified the callsite to some other entity (a constant), making it |
2232 | /// free. |
2233 | bool CallAnalyzer::simplifyCallSite(Function *F, CallBase &Call) { |
2234 | // FIXME: Using the instsimplify logic directly for this is inefficient |
2235 | // because we have to continually rebuild the argument list even when no |
2236 | // simplifications can be performed. Until that is fixed with remapping |
2237 | // inside of instsimplify, directly constant fold calls here. |
2238 | if (!canConstantFoldCallTo(Call: &Call, F)) |
2239 | return false; |
2240 | |
2241 | // Try to re-map the arguments to constants. |
2242 | SmallVector<Constant *, 4> ConstantArgs; |
2243 | ConstantArgs.reserve(N: Call.arg_size()); |
2244 | for (Value *I : Call.args()) { |
2245 | Constant *C = dyn_cast<Constant>(Val: I); |
2246 | if (!C) |
2247 | C = dyn_cast_or_null<Constant>(Val: SimplifiedValues.lookup(Val: I)); |
2248 | if (!C) |
2249 | return false; // This argument doesn't map to a constant. |
2250 | |
2251 | ConstantArgs.push_back(Elt: C); |
2252 | } |
2253 | if (Constant *C = ConstantFoldCall(Call: &Call, F, Operands: ConstantArgs)) { |
2254 | SimplifiedValues[&Call] = C; |
2255 | return true; |
2256 | } |
2257 | |
2258 | return false; |
2259 | } |
2260 | |
2261 | bool CallAnalyzer::visitCallBase(CallBase &Call) { |
2262 | if (!onCallBaseVisitStart(Call)) |
2263 | return true; |
2264 | |
2265 | if (Call.hasFnAttr(Kind: Attribute::ReturnsTwice) && |
2266 | !F.hasFnAttribute(Kind: Attribute::ReturnsTwice)) { |
2267 | // This aborts the entire analysis. |
2268 | ExposesReturnsTwice = true; |
2269 | return false; |
2270 | } |
2271 | if (isa<CallInst>(Val: Call) && cast<CallInst>(Val&: Call).cannotDuplicate()) |
2272 | ContainsNoDuplicateCall = true; |
2273 | |
2274 | Function *F = Call.getCalledFunction(); |
2275 | bool IsIndirectCall = !F; |
2276 | if (IsIndirectCall) { |
2277 | // Check if this happens to be an indirect function call to a known function |
2278 | // in this inline context. If not, we've done all we can. |
2279 | Value *Callee = Call.getCalledOperand(); |
2280 | F = dyn_cast_or_null<Function>(Val: SimplifiedValues.lookup(Val: Callee)); |
2281 | if (!F || F->getFunctionType() != Call.getFunctionType()) { |
2282 | onCallArgumentSetup(Call); |
2283 | |
2284 | if (!Call.onlyReadsMemory()) |
2285 | disableLoadElimination(); |
2286 | return Base::visitCallBase(I&: Call); |
2287 | } |
2288 | } |
2289 | |
2290 | assert(F && "Expected a call to a known function" ); |
2291 | |
2292 | // When we have a concrete function, first try to simplify it directly. |
2293 | if (simplifyCallSite(F, Call)) |
2294 | return true; |
2295 | |
2296 | // Next check if it is an intrinsic we know about. |
2297 | // FIXME: Lift this into part of the InstVisitor. |
2298 | if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Val: &Call)) { |
2299 | switch (II->getIntrinsicID()) { |
2300 | default: |
2301 | if (!Call.onlyReadsMemory() && !isAssumeLikeIntrinsic(I: II)) |
2302 | disableLoadElimination(); |
2303 | return Base::visitCallBase(I&: Call); |
2304 | |
2305 | case Intrinsic::load_relative: |
2306 | onLoadRelativeIntrinsic(); |
2307 | return false; |
2308 | |
2309 | case Intrinsic::memset: |
2310 | case Intrinsic::memcpy: |
2311 | case Intrinsic::memmove: |
2312 | disableLoadElimination(); |
2313 | // SROA can usually chew through these intrinsics, but they aren't free. |
2314 | return false; |
2315 | case Intrinsic::icall_branch_funnel: |
2316 | case Intrinsic::localescape: |
2317 | HasUninlineableIntrinsic = true; |
2318 | return false; |
2319 | case Intrinsic::vastart: |
2320 | InitsVargArgs = true; |
2321 | return false; |
2322 | case Intrinsic::launder_invariant_group: |
2323 | case Intrinsic::strip_invariant_group: |
2324 | if (auto *SROAArg = getSROAArgForValueOrNull(V: II->getOperand(i_nocapture: 0))) |
2325 | SROAArgValues[II] = SROAArg; |
2326 | return true; |
2327 | case Intrinsic::is_constant: |
2328 | return simplifyIntrinsicCallIsConstant(CB&: Call); |
2329 | case Intrinsic::objectsize: |
2330 | return simplifyIntrinsicCallObjectSize(CB&: Call); |
2331 | } |
2332 | } |
2333 | |
2334 | if (F == Call.getFunction()) { |
2335 | // This flag will fully abort the analysis, so don't bother with anything |
2336 | // else. |
2337 | IsRecursiveCall = true; |
2338 | if (!AllowRecursiveCall) |
2339 | return false; |
2340 | } |
2341 | |
2342 | if (TTI.isLoweredToCall(F)) { |
2343 | onLoweredCall(F, Call, IsIndirectCall); |
2344 | } |
2345 | |
2346 | if (!(Call.onlyReadsMemory() || (IsIndirectCall && F->onlyReadsMemory()))) |
2347 | disableLoadElimination(); |
2348 | return Base::visitCallBase(I&: Call); |
2349 | } |
2350 | |
2351 | bool CallAnalyzer::visitReturnInst(ReturnInst &RI) { |
2352 | // At least one return instruction will be free after inlining. |
2353 | bool Free = !HasReturn; |
2354 | HasReturn = true; |
2355 | return Free; |
2356 | } |
2357 | |
2358 | bool CallAnalyzer::visitBranchInst(BranchInst &BI) { |
2359 | // We model unconditional branches as essentially free -- they really |
2360 | // shouldn't exist at all, but handling them makes the behavior of the |
2361 | // inliner more regular and predictable. Interestingly, conditional branches |
2362 | // which will fold away are also free. |
2363 | return BI.isUnconditional() || isa<ConstantInt>(Val: BI.getCondition()) || |
2364 | BI.getMetadata(KindID: LLVMContext::MD_make_implicit) || |
2365 | isa_and_nonnull<ConstantInt>( |
2366 | Val: SimplifiedValues.lookup(Val: BI.getCondition())); |
2367 | } |
2368 | |
2369 | bool CallAnalyzer::visitSelectInst(SelectInst &SI) { |
2370 | bool CheckSROA = SI.getType()->isPointerTy(); |
2371 | Value *TrueVal = SI.getTrueValue(); |
2372 | Value *FalseVal = SI.getFalseValue(); |
2373 | |
2374 | Constant *TrueC = dyn_cast<Constant>(Val: TrueVal); |
2375 | if (!TrueC) |
2376 | TrueC = SimplifiedValues.lookup(Val: TrueVal); |
2377 | Constant *FalseC = dyn_cast<Constant>(Val: FalseVal); |
2378 | if (!FalseC) |
2379 | FalseC = SimplifiedValues.lookup(Val: FalseVal); |
2380 | Constant *CondC = |
2381 | dyn_cast_or_null<Constant>(Val: SimplifiedValues.lookup(Val: SI.getCondition())); |
2382 | |
2383 | if (!CondC) { |
2384 | // Select C, X, X => X |
2385 | if (TrueC == FalseC && TrueC) { |
2386 | SimplifiedValues[&SI] = TrueC; |
2387 | return true; |
2388 | } |
2389 | |
2390 | if (!CheckSROA) |
2391 | return Base::visitSelectInst(I&: SI); |
2392 | |
2393 | std::pair<Value *, APInt> TrueBaseAndOffset = |
2394 | ConstantOffsetPtrs.lookup(Val: TrueVal); |
2395 | std::pair<Value *, APInt> FalseBaseAndOffset = |
2396 | ConstantOffsetPtrs.lookup(Val: FalseVal); |
2397 | if (TrueBaseAndOffset == FalseBaseAndOffset && TrueBaseAndOffset.first) { |
2398 | ConstantOffsetPtrs[&SI] = TrueBaseAndOffset; |
2399 | |
2400 | if (auto *SROAArg = getSROAArgForValueOrNull(V: TrueVal)) |
2401 | SROAArgValues[&SI] = SROAArg; |
2402 | return true; |
2403 | } |
2404 | |
2405 | return Base::visitSelectInst(I&: SI); |
2406 | } |
2407 | |
2408 | // Select condition is a constant. |
2409 | Value *SelectedV = CondC->isAllOnesValue() ? TrueVal |
2410 | : (CondC->isNullValue()) ? FalseVal |
2411 | : nullptr; |
2412 | if (!SelectedV) { |
// Condition is a vector constant that is not all 1s or all 0s. If all
// operands are constants, ConstantFoldSelectInstruction() can still handle
// cases such as vector selects.
2416 | if (TrueC && FalseC) { |
2417 | if (auto *C = ConstantFoldSelectInstruction(Cond: CondC, V1: TrueC, V2: FalseC)) { |
2418 | SimplifiedValues[&SI] = C; |
2419 | return true; |
2420 | } |
2421 | } |
2422 | return Base::visitSelectInst(I&: SI); |
2423 | } |
2424 | |
2425 | // Condition is either all 1s or all 0s. SI can be simplified. |
2426 | if (Constant *SelectedC = dyn_cast<Constant>(Val: SelectedV)) { |
2427 | SimplifiedValues[&SI] = SelectedC; |
2428 | return true; |
2429 | } |
2430 | |
2431 | if (!CheckSROA) |
2432 | return true; |
2433 | |
2434 | std::pair<Value *, APInt> BaseAndOffset = |
2435 | ConstantOffsetPtrs.lookup(Val: SelectedV); |
2436 | if (BaseAndOffset.first) { |
2437 | ConstantOffsetPtrs[&SI] = BaseAndOffset; |
2438 | |
2439 | if (auto *SROAArg = getSROAArgForValueOrNull(V: SelectedV)) |
2440 | SROAArgValues[&SI] = SROAArg; |
2441 | } |
2442 | |
2443 | return true; |
2444 | } |
2445 | |
2446 | bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) { |
2447 | // We model unconditional switches as free, see the comments on handling |
2448 | // branches. |
2449 | if (isa<ConstantInt>(Val: SI.getCondition())) |
2450 | return true; |
2451 | if (Value *V = SimplifiedValues.lookup(Val: SI.getCondition())) |
2452 | if (isa<ConstantInt>(Val: V)) |
2453 | return true; |
2454 | |
2455 | // Assume the most general case where the switch is lowered into |
2456 | // either a jump table, bit test, or a balanced binary tree consisting of |
2457 | // case clusters without merging adjacent clusters with the same |
// destination. We do not consider switches that are lowered with a mix of
// jump table/bit test/binary search tree. The cost of the switch is
2460 | // proportional to the size of the tree or the size of jump table range. |
2461 | // |
2462 | // NB: We convert large switches which are just used to initialize large phi |
2463 | // nodes to lookup tables instead in simplifycfg, so this shouldn't prevent |
2464 | // inlining those. It will prevent inlining in cases where the optimization |
2465 | // does not (yet) fire. |
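// Roughly speaking (the precise formulas live in the onFinalizeSwitch
// callbacks and getExpectedNumberOfCompare): a jump-table lowering is charged
// proportionally to the table size, while a compare-tree lowering is charged
// a number of compares that grows logarithmically with the number of case
// clusters.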
2466 | |
2467 | unsigned JumpTableSize = 0; |
2468 | BlockFrequencyInfo *BFI = GetBFI ? &(GetBFI(F)) : nullptr; |
2469 | unsigned NumCaseCluster = |
2470 | TTI.getEstimatedNumberOfCaseClusters(SI, JTSize&: JumpTableSize, PSI, BFI); |
2471 | |
2472 | onFinalizeSwitch(JumpTableSize, NumCaseCluster, DefaultDestUndefined: SI.defaultDestUndefined()); |
2473 | return false; |
2474 | } |
2475 | |
2476 | bool CallAnalyzer::visitIndirectBrInst(IndirectBrInst &IBI) { |
2477 | // We never want to inline functions that contain an indirectbr. This is |
2478 | // incorrect because all the blockaddress's (in static global initializers |
2479 | // for example) would be referring to the original function, and this |
2480 | // indirect jump would jump from the inlined copy of the function into the |
2481 | // original function which is extremely undefined behavior. |
2482 | // FIXME: This logic isn't really right; we can safely inline functions with |
2483 | // indirectbr's as long as no other function or global references the |
2484 | // blockaddress of a block within the current function. |
2485 | HasIndirectBr = true; |
2486 | return false; |
2487 | } |
2488 | |
2489 | bool CallAnalyzer::visitResumeInst(ResumeInst &RI) { |
2490 | // FIXME: It's not clear that a single instruction is an accurate model for |
2491 | // the inline cost of a resume instruction. |
2492 | return false; |
2493 | } |
2494 | |
2495 | bool CallAnalyzer::visitCleanupReturnInst(CleanupReturnInst &CRI) { |
2496 | // FIXME: It's not clear that a single instruction is an accurate model for |
2497 | // the inline cost of a cleanupret instruction. |
2498 | return false; |
2499 | } |
2500 | |
2501 | bool CallAnalyzer::visitCatchReturnInst(CatchReturnInst &CRI) { |
2502 | // FIXME: It's not clear that a single instruction is an accurate model for |
2503 | // the inline cost of a catchret instruction. |
2504 | return false; |
2505 | } |
2506 | |
2507 | bool CallAnalyzer::visitUnreachableInst(UnreachableInst &I) { |
// FIXME: It might be reasonable to discount the cost of instructions leading
2509 | // to unreachable as they have the lowest possible impact on both runtime and |
2510 | // code size. |
2511 | return true; // No actual code is needed for unreachable. |
2512 | } |
2513 | |
2514 | bool CallAnalyzer::visitInstruction(Instruction &I) { |
2515 | // Some instructions are free. All of the free intrinsics can also be |
2516 | // handled by SROA, etc. |
2517 | if (TTI.getInstructionCost(U: &I, CostKind: TargetTransformInfo::TCK_SizeAndLatency) == |
2518 | TargetTransformInfo::TCC_Free) |
2519 | return true; |
2520 | |
2521 | // We found something we don't understand or can't handle. Mark any SROA-able |
2522 | // values in the operand list as no longer viable. |
2523 | for (const Use &Op : I.operands()) |
2524 | disableSROA(V: Op); |
2525 | |
2526 | return false; |
2527 | } |
2528 | |
2529 | /// Analyze a basic block for its contribution to the inline cost. |
2530 | /// |
2531 | /// This method walks the analyzer over every instruction in the given basic |
2532 | /// block and accounts for their cost during inlining at this callsite. It |
2533 | /// aborts early if the threshold has been exceeded or an impossible to inline |
2534 | /// construct has been detected. It returns false if inlining is no longer |
2535 | /// viable, and true if inlining remains viable. |
2536 | InlineResult |
2537 | CallAnalyzer::analyzeBlock(BasicBlock *BB, |
2538 | SmallPtrSetImpl<const Value *> &EphValues) { |
2539 | for (Instruction &I : *BB) { |
// FIXME: Currently, the number of instructions in a function, regardless of
// our ability to simplify them to constants or dead code during inlining,
// is used by the vector bonus heuristic. As long as that's true, we have to
// special-case debug intrinsics here to prevent differences in inlining due
// to debug symbols. Eventually, the number of unsimplified instructions
// shouldn't factor into the cost computation, but until then, hack around it
// here.
2547 | // Similarly, skip pseudo-probes. |
2548 | if (I.isDebugOrPseudoInst()) |
2549 | continue; |
2550 | |
2551 | // Skip ephemeral values. |
2552 | if (EphValues.count(Ptr: &I)) |
2553 | continue; |
2554 | |
2555 | ++NumInstructions; |
2556 | if (isa<ExtractElementInst>(Val: I) || I.getType()->isVectorTy()) |
2557 | ++NumVectorInstructions; |
2558 | |
2559 | // If the instruction simplified to a constant, there is no cost to this |
2560 | // instruction. Visit the instructions using our InstVisitor to account for |
2561 | // all of the per-instruction logic. The visit tree returns true if we |
2562 | // consumed the instruction in any way, and false if the instruction's base |
2563 | // cost should count against inlining. |
2564 | onInstructionAnalysisStart(I: &I); |
2565 | |
2566 | if (Base::visit(I: &I)) |
2567 | ++NumInstructionsSimplified; |
2568 | else |
2569 | onMissedSimplification(); |
2570 | |
2571 | onInstructionAnalysisFinish(I: &I); |
2572 | using namespace ore; |
// If visiting this instruction detected an uninlinable pattern, abort.
2574 | InlineResult IR = InlineResult::success(); |
2575 | if (IsRecursiveCall && !AllowRecursiveCall) |
2576 | IR = InlineResult::failure(Reason: "recursive" ); |
2577 | else if (ExposesReturnsTwice) |
2578 | IR = InlineResult::failure(Reason: "exposes returns twice" ); |
2579 | else if (HasDynamicAlloca) |
2580 | IR = InlineResult::failure(Reason: "dynamic alloca" ); |
2581 | else if (HasIndirectBr) |
2582 | IR = InlineResult::failure(Reason: "indirect branch" ); |
2583 | else if (HasUninlineableIntrinsic) |
2584 | IR = InlineResult::failure(Reason: "uninlinable intrinsic" ); |
2585 | else if (InitsVargArgs) |
2586 | IR = InlineResult::failure(Reason: "varargs" ); |
2587 | if (!IR.isSuccess()) { |
2588 | if (ORE) |
2589 | ORE->emit(RemarkBuilder: [&]() { |
2590 | return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline" , |
2591 | &CandidateCall) |
2592 | << NV("Callee" , &F) << " has uninlinable pattern (" |
2593 | << NV("InlineResult" , IR.getFailureReason()) |
2594 | << ") and cost is not fully computed" ; |
2595 | }); |
2596 | return IR; |
2597 | } |
2598 | |
2599 | // If the caller is a recursive function then we don't want to inline |
2600 | // functions which allocate a lot of stack space because it would increase |
2601 | // the caller stack usage dramatically. |
2602 | if (IsCallerRecursive && AllocatedSize > RecurStackSizeThreshold) { |
2603 | auto IR = |
2604 | InlineResult::failure(Reason: "recursive and allocates too much stack space" ); |
2605 | if (ORE) |
2606 | ORE->emit(RemarkBuilder: [&]() { |
2607 | return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline" , |
2608 | &CandidateCall) |
2609 | << NV("Callee" , &F) << " is " |
2610 | << NV("InlineResult" , IR.getFailureReason()) |
2611 | << ". Cost is not fully computed" ; |
2612 | }); |
2613 | return IR; |
2614 | } |
2615 | |
2616 | if (shouldStop()) |
2617 | return InlineResult::failure( |
2618 | Reason: "Call site analysis is not favorable to inlining." ); |
2619 | } |
2620 | |
2621 | return InlineResult::success(); |
2622 | } |
2623 | |
2624 | /// Compute the base pointer and cumulative constant offsets for V. |
2625 | /// |
2626 | /// This strips all constant offsets off of V, leaving it the base pointer, and |
2627 | /// accumulates the total constant offset applied in the returned constant. It |
2628 | /// returns null if V is not a pointer, and returns the constant '0' if there are
2629 | /// no constant offsets applied. |
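///
/// For example, assuming a 64-bit index type, given
///   %p = getelementptr inbounds i8, ptr %base, i64 4
///   %q = getelementptr inbounds i8, ptr %p, i64 8
/// calling this on %q rewrites V to %base and returns an i64 constant 12.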
2630 | ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) { |
2631 | if (!V->getType()->isPointerTy()) |
2632 | return nullptr; |
2633 | |
2634 | unsigned AS = V->getType()->getPointerAddressSpace(); |
2635 | unsigned IntPtrWidth = DL.getIndexSizeInBits(AS); |
2636 | APInt Offset = APInt::getZero(numBits: IntPtrWidth); |
2637 | |
2638 | // Even though we don't look through PHI nodes, we could be called on an |
2639 | // instruction in an unreachable block, which may be on a cycle. |
2640 | SmallPtrSet<Value *, 4> Visited; |
2641 | Visited.insert(Ptr: V); |
2642 | do { |
2643 | if (GEPOperator *GEP = dyn_cast<GEPOperator>(Val: V)) { |
2644 | if (!GEP->isInBounds() || !accumulateGEPOffset(GEP&: *GEP, Offset)) |
2645 | return nullptr; |
2646 | V = GEP->getPointerOperand(); |
2647 | } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(Val: V)) { |
2648 | if (GA->isInterposable()) |
2649 | break; |
2650 | V = GA->getAliasee(); |
2651 | } else { |
2652 | break; |
2653 | } |
2654 | assert(V->getType()->isPointerTy() && "Unexpected operand type!" ); |
2655 | } while (Visited.insert(Ptr: V).second); |
2656 | |
2657 | Type *IdxPtrTy = DL.getIndexType(PtrTy: V->getType()); |
2658 | return cast<ConstantInt>(Val: ConstantInt::get(Ty: IdxPtrTy, V: Offset)); |
2659 | } |
2660 | |
2661 | /// Find dead blocks due to deleted CFG edges during inlining. |
2662 | /// |
2663 | /// If we know the successor of the current block, \p CurrBB, has to be \p |
2664 | /// NextBB, the other successors of \p CurrBB are dead if these successors have |
2665 | /// no live incoming CFG edges. If one block is found to be dead, we can |
2666 | /// continue growing the dead block list by checking the successors of the dead |
2667 | /// blocks to see if all their incoming edges are dead or not. |
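///
/// For example, if \p CurrBB ends in a conditional branch whose condition was
/// simplified so that \p NextBB is its only live successor, the other
/// successor becomes dead provided all of its remaining incoming edges are
/// dead, and its own successors are then examined in the same way.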
2668 | void CallAnalyzer::findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB) { |
2669 | auto IsEdgeDead = [&](BasicBlock *Pred, BasicBlock *Succ) { |
2670 | // A CFG edge is dead if the predecessor is dead or the predecessor has a |
2671 | // known successor which is not the one under examination.
2672 | return (DeadBlocks.count(Ptr: Pred) || |
2673 | (KnownSuccessors[Pred] && KnownSuccessors[Pred] != Succ)); |
2674 | }; |
2675 | |
2676 | auto IsNewlyDead = [&](BasicBlock *BB) { |
2677 | // If all the edges to a block are dead, the block is also dead. |
2678 | return (!DeadBlocks.count(Ptr: BB) && |
2679 | llvm::all_of(Range: predecessors(BB), |
2680 | P: [&](BasicBlock *P) { return IsEdgeDead(P, BB); })); |
2681 | }; |
2682 | |
2683 | for (BasicBlock *Succ : successors(BB: CurrBB)) { |
2684 | if (Succ == NextBB || !IsNewlyDead(Succ)) |
2685 | continue; |
2686 | SmallVector<BasicBlock *, 4> NewDead; |
2687 | NewDead.push_back(Elt: Succ); |
2688 | while (!NewDead.empty()) { |
2689 | BasicBlock *Dead = NewDead.pop_back_val(); |
2690 | if (DeadBlocks.insert(Ptr: Dead).second) |
2691 | // Continue growing the dead block lists. |
2692 | for (BasicBlock *S : successors(BB: Dead)) |
2693 | if (IsNewlyDead(S)) |
2694 | NewDead.push_back(Elt: S); |
2695 | } |
2696 | } |
2697 | } |
2698 | |
2699 | /// Analyze a call site for potential inlining. |
2700 | /// |
2701 | /// Returns true if inlining this call is viable, and false if it is not |
2702 | /// viable. It computes the cost and adjusts the threshold based on numerous |
2703 | /// factors and heuristics. If this method returns false but the computed cost |
2704 | /// is below the computed threshold, then inlining was forcibly disabled by |
2705 | /// some artifact of the routine. |
2706 | InlineResult CallAnalyzer::analyze() { |
2707 | ++NumCallsAnalyzed; |
2708 | |
2709 | auto Result = onAnalysisStart(); |
2710 | if (!Result.isSuccess()) |
2711 | return Result; |
2712 | |
2713 | if (F.empty()) |
2714 | return InlineResult::success(); |
2715 | |
2716 | Function *Caller = CandidateCall.getFunction(); |
2717 | // Check if the caller function is recursive itself. |
2718 | for (User *U : Caller->users()) { |
2719 | CallBase *Call = dyn_cast<CallBase>(Val: U); |
2720 | if (Call && Call->getFunction() == Caller) { |
2721 | IsCallerRecursive = true; |
2722 | break; |
2723 | } |
2724 | } |
2725 | |
2726 | // Populate our simplified values by mapping from function arguments to call |
2727 | // arguments with known important simplifications. |
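// For example (a hypothetical call site): if the caller passes 'i32 42' for
// the first argument and '%p = getelementptr inbounds i8, ptr %buf, i64 8'
// (with %buf an alloca) for the second, the first formal argument maps to the
// constant 42 in SimplifiedValues, while the second is recorded in
// ConstantOffsetPtrs as %buf plus 8 bytes and marked as SROA-viable.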
2728 | auto CAI = CandidateCall.arg_begin(); |
2729 | for (Argument &FAI : F.args()) { |
2730 | assert(CAI != CandidateCall.arg_end()); |
2731 | if (Constant *C = dyn_cast<Constant>(Val: CAI)) |
2732 | SimplifiedValues[&FAI] = C; |
2733 | |
2734 | Value *PtrArg = *CAI; |
2735 | if (ConstantInt *C = stripAndComputeInBoundsConstantOffsets(V&: PtrArg)) { |
2736 | ConstantOffsetPtrs[&FAI] = std::make_pair(x&: PtrArg, y: C->getValue()); |
2737 | |
2738 | // We can SROA any pointer arguments derived from alloca instructions. |
2739 | if (auto *SROAArg = dyn_cast<AllocaInst>(Val: PtrArg)) { |
2740 | SROAArgValues[&FAI] = SROAArg; |
2741 | onInitializeSROAArg(Arg: SROAArg); |
2742 | EnabledSROAAllocas.insert(V: SROAArg); |
2743 | } |
2744 | } |
2745 | ++CAI; |
2746 | } |
2747 | NumConstantArgs = SimplifiedValues.size(); |
2748 | NumConstantOffsetPtrArgs = ConstantOffsetPtrs.size(); |
2749 | NumAllocaArgs = SROAArgValues.size(); |
2750 | |
2751 | // FIXME: If a caller has multiple calls to a callee, we end up recomputing |
2752 | // the ephemeral values multiple times (and they're completely determined by |
2753 | // the callee, so this is purely duplicate work). |
2754 | SmallPtrSet<const Value *, 32> EphValues; |
2755 | CodeMetrics::collectEphemeralValues(L: &F, AC: &GetAssumptionCache(F), EphValues); |
2756 | |
2757 | // The worklist of live basic blocks in the callee *after* inlining. We avoid |
2758 | // adding basic blocks of the callee which can be proven to be dead for this |
2759 | // particular call site in order to get more accurate cost estimates. This |
2760 | // requires a somewhat heavyweight iteration pattern: we need to walk the |
2761 | // basic blocks in a breadth-first order as we insert live successors. To |
2762 | // accomplish this, and because we typically exit early after crossing our
2763 | // threshold (keeping the worklist small), we use a small-size optimized SetVector.
2764 | typedef SmallSetVector<BasicBlock *, 16> BBSetVector; |
2765 | BBSetVector BBWorklist; |
2766 | BBWorklist.insert(X: &F.getEntryBlock()); |
2767 | |
2768 | // Note that we *must not* cache the size, as this loop grows the worklist.
2769 | for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) { |
2770 | if (shouldStop()) |
2771 | break; |
2772 | |
2773 | BasicBlock *BB = BBWorklist[Idx]; |
2774 | if (BB->empty()) |
2775 | continue; |
2776 | |
2777 | onBlockStart(BB); |
2778 | |
2779 | // Disallow inlining a blockaddress with uses other than strictly callbr. |
2780 | // A blockaddress only has defined behavior for an indirect branch in the |
2781 | // same function, and we do not currently support inlining indirect |
2782 | // branches. But, the inliner may not see an indirect branch that ends up |
2783 | // being dead code at a particular call site. If the blockaddress escapes |
2784 | // the function, e.g., via a global variable, inlining may lead to an |
2785 | // invalid cross-function reference. |
2786 | // FIXME: pr/39560: continue relaxing this overt restriction. |
2787 | if (BB->hasAddressTaken()) |
2788 | for (User *U : BlockAddress::get(BB: &*BB)->users()) |
2789 | if (!isa<CallBrInst>(Val: *U)) |
2790 | return InlineResult::failure(Reason: "blockaddress used outside of callbr" ); |
2791 | |
2792 | // Analyze the cost of this block. If we blow through the threshold, this |
2793 | // returns a failure result, and we can bail out.
2794 | InlineResult IR = analyzeBlock(BB, EphValues); |
2795 | if (!IR.isSuccess()) |
2796 | return IR; |
2797 | |
2798 | Instruction *TI = BB->getTerminator(); |
2799 | |
2800 | // Add in the live successors by first checking whether we have a terminator
2801 | // that may be simplified based on the values simplified by this call. |
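// For example, if the block ends in 'br i1 %cond, label %then, label %else'
// and %cond was simplified to a constant from this call's arguments, only the
// taken successor is enqueued and the untaken edge is treated as dead.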
2802 | if (BranchInst *BI = dyn_cast<BranchInst>(Val: TI)) { |
2803 | if (BI->isConditional()) { |
2804 | Value *Cond = BI->getCondition(); |
2805 | if (ConstantInt *SimpleCond = |
2806 | dyn_cast_or_null<ConstantInt>(Val: SimplifiedValues.lookup(Val: Cond))) { |
2807 | BasicBlock *NextBB = BI->getSuccessor(i: SimpleCond->isZero() ? 1 : 0); |
2808 | BBWorklist.insert(X: NextBB); |
2809 | KnownSuccessors[BB] = NextBB; |
2810 | findDeadBlocks(CurrBB: BB, NextBB); |
2811 | continue; |
2812 | } |
2813 | } |
2814 | } else if (SwitchInst *SI = dyn_cast<SwitchInst>(Val: TI)) { |
2815 | Value *Cond = SI->getCondition(); |
2816 | if (ConstantInt *SimpleCond = |
2817 | dyn_cast_or_null<ConstantInt>(Val: SimplifiedValues.lookup(Val: Cond))) { |
2818 | BasicBlock *NextBB = SI->findCaseValue(C: SimpleCond)->getCaseSuccessor(); |
2819 | BBWorklist.insert(X: NextBB); |
2820 | KnownSuccessors[BB] = NextBB; |
2821 | findDeadBlocks(CurrBB: BB, NextBB); |
2822 | continue; |
2823 | } |
2824 | } |
2825 | |
2826 | // If we're unable to select a particular successor, just count all of |
2827 | // them. |
2828 | for (BasicBlock *Succ : successors(BB)) |
2829 | BBWorklist.insert(X: Succ); |
2830 | |
2831 | onBlockAnalyzed(BB); |
2832 | } |
2833 | |
2834 | // If this is a noduplicate call, we can still inline as long as |
2835 | // inlining this would cause the removal of the callee (so the instruction is
2836 | // is not actually duplicated, just moved). |
2837 | if (!isSoleCallToLocalFunction(CB: CandidateCall, Callee: F) && ContainsNoDuplicateCall) |
2838 | return InlineResult::failure(Reason: "noduplicate" ); |
2839 | |
2840 | // If the callee's stack size exceeds the user-specified threshold, |
2841 | // do not let it be inlined. |
2842 | // The command line option overrides a limit set in the function attributes. |
2843 | size_t FinalStackSizeThreshold = StackSizeThreshold; |
2844 | if (!StackSizeThreshold.getNumOccurrences()) |
2845 | if (std::optional<int> AttrMaxStackSize = getStringFnAttrAsInt( |
2846 | F: Caller, AttrKind: InlineConstants::MaxInlineStackSizeAttributeName)) |
2847 | FinalStackSizeThreshold = *AttrMaxStackSize; |
2848 | if (AllocatedSize > FinalStackSizeThreshold) |
2849 | return InlineResult::failure(Reason: "stacksize" ); |
2850 | |
2851 | return finalizeAnalysis(); |
2852 | } |
2853 | |
2854 | void InlineCostCallAnalyzer::print(raw_ostream &OS) { |
2855 | #define DEBUG_PRINT_STAT(x) OS << " " #x ": " << x << "\n" |
2856 | if (PrintInstructionComments) |
2857 | F.print(OS, AAW: &Writer); |
2858 | DEBUG_PRINT_STAT(NumConstantArgs); |
2859 | DEBUG_PRINT_STAT(NumConstantOffsetPtrArgs); |
2860 | DEBUG_PRINT_STAT(NumAllocaArgs); |
2861 | DEBUG_PRINT_STAT(NumConstantPtrCmps); |
2862 | DEBUG_PRINT_STAT(NumConstantPtrDiffs); |
2863 | DEBUG_PRINT_STAT(NumInstructionsSimplified); |
2864 | DEBUG_PRINT_STAT(NumInstructions); |
2865 | DEBUG_PRINT_STAT(SROACostSavings); |
2866 | DEBUG_PRINT_STAT(SROACostSavingsLost); |
2867 | DEBUG_PRINT_STAT(LoadEliminationCost); |
2868 | DEBUG_PRINT_STAT(ContainsNoDuplicateCall); |
2869 | DEBUG_PRINT_STAT(Cost); |
2870 | DEBUG_PRINT_STAT(Threshold); |
2871 | #undef DEBUG_PRINT_STAT |
2872 | } |
2873 | |
2874 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
2875 | /// Dump stats about this call's analysis. |
2876 | LLVM_DUMP_METHOD void InlineCostCallAnalyzer::dump() { print(dbgs()); } |
2877 | #endif |
2878 | |
2879 | /// Test that there are no attribute conflicts between Caller and Callee |
2880 | /// that prevent inlining. |
2881 | static bool functionsHaveCompatibleAttributes( |
2882 | Function *Caller, Function *Callee, TargetTransformInfo &TTI, |
2883 | function_ref<const TargetLibraryInfo &(Function &)> &GetTLI) { |
2884 | // Note that CalleeTLI must be a copy not a reference. The legacy pass manager |
2885 | // caches the most recently created TLI in the TargetLibraryInfoWrapperPass |
2886 | // object, and always returns the same object (which is overwritten on each |
2887 | // GetTLI call). Therefore we copy the first result. |
2888 | auto CalleeTLI = GetTLI(*Callee); |
2889 | return (IgnoreTTIInlineCompatible || |
2890 | TTI.areInlineCompatible(Caller, Callee)) && |
2891 | GetTLI(*Caller).areInlineCompatible(CalleeTLI, |
2892 | AllowCallerSuperset: InlineCallerSupersetNoBuiltin) && |
2893 | AttributeFuncs::areInlineCompatible(Caller: *Caller, Callee: *Callee); |
2894 | } |
2895 | |
2896 | int llvm::getCallsiteCost(const TargetTransformInfo &TTI, const CallBase &Call, |
2897 | const DataLayout &DL) { |
2898 | int64_t Cost = 0; |
2899 | for (unsigned I = 0, E = Call.arg_size(); I != E; ++I) { |
2900 | if (Call.isByValArgument(ArgNo: I)) { |
2901 | // We approximate the number of loads and stores needed by dividing the |
2902 | // size of the byval type by the target's pointer size. |
2903 | PointerType *PTy = cast<PointerType>(Val: Call.getArgOperand(i: I)->getType()); |
2904 | unsigned TypeSize = DL.getTypeSizeInBits(Ty: Call.getParamByValType(ArgNo: I)); |
2905 | unsigned AS = PTy->getAddressSpace(); |
2906 | unsigned PointerSize = DL.getPointerSizeInBits(AS); |
2907 | // Ceiling division. |
2908 | unsigned NumStores = (TypeSize + PointerSize - 1) / PointerSize; |
2909 | |
2910 | // If it generates more than 8 stores it is likely to be expanded as an |
2911 | // inline memcpy so we take that as an upper bound. Otherwise we assume |
2912 | // one load and one store per word copied. |
2913 | // FIXME: The maxStoresPerMemcpy setting from the target should be used |
2914 | // here instead of a magic number of 8, but it's not available via |
2915 | // DataLayout. |
2916 | NumStores = std::min(a: NumStores, b: 8U); |
2917 | |
2918 | Cost += 2 * NumStores * InstrCost; |
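// For example, with 8-byte pointers a 64-byte byval argument yields
// NumStores = 8 and a cost of 2 * 8 * InstrCost.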
2919 | } else { |
2920 | // For non-byval arguments subtract off one instruction per call |
2921 | // argument. |
2922 | Cost += InstrCost; |
2923 | } |
2924 | } |
2925 | // The call instruction also disappears after inlining. |
2926 | Cost += InstrCost; |
2927 | Cost += TTI.getInlineCallPenalty(F: Call.getCaller(), Call, DefaultCallPenalty: CallPenalty); |
2928 | |
2929 | return std::min<int64_t>(a: Cost, INT_MAX); |
2930 | } |
2931 | |
2932 | InlineCost llvm::getInlineCost( |
2933 | CallBase &Call, const InlineParams &Params, TargetTransformInfo &CalleeTTI, |
2934 | function_ref<AssumptionCache &(Function &)> GetAssumptionCache, |
2935 | function_ref<const TargetLibraryInfo &(Function &)> GetTLI, |
2936 | function_ref<BlockFrequencyInfo &(Function &)> GetBFI, |
2937 | ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) { |
2938 | return getInlineCost(Call, Callee: Call.getCalledFunction(), Params, CalleeTTI, |
2939 | GetAssumptionCache, GetTLI, GetBFI, PSI, ORE); |
2940 | } |
2941 | |
2942 | std::optional<int> llvm::getInliningCostEstimate( |
2943 | CallBase &Call, TargetTransformInfo &CalleeTTI, |
2944 | function_ref<AssumptionCache &(Function &)> GetAssumptionCache, |
2945 | function_ref<BlockFrequencyInfo &(Function &)> GetBFI, |
2946 | ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) { |
2947 | const InlineParams Params = {/* DefaultThreshold*/ 0, |
2948 | /*HintThreshold*/ {}, |
2949 | /*ColdThreshold*/ {}, |
2950 | /*OptSizeThreshold*/ {}, |
2951 | /*OptMinSizeThreshold*/ {}, |
2952 | /*HotCallSiteThreshold*/ {}, |
2953 | /*LocallyHotCallSiteThreshold*/ {}, |
2954 | /*ColdCallSiteThreshold*/ {}, |
2955 | /*ComputeFullInlineCost*/ true, |
2956 | /*EnableDeferral*/ true}; |
2957 | |
2958 | InlineCostCallAnalyzer CA(*Call.getCalledFunction(), Call, Params, CalleeTTI, |
2959 | GetAssumptionCache, GetBFI, PSI, ORE, true, |
2960 | /*IgnoreThreshold*/ true); |
2961 | auto R = CA.analyze(); |
2962 | if (!R.isSuccess()) |
2963 | return std::nullopt; |
2964 | return CA.getCost(); |
2965 | } |
2966 | |
2967 | std::optional<InlineCostFeatures> llvm::getInliningCostFeatures( |
2968 | CallBase &Call, TargetTransformInfo &CalleeTTI, |
2969 | function_ref<AssumptionCache &(Function &)> GetAssumptionCache, |
2970 | function_ref<BlockFrequencyInfo &(Function &)> GetBFI, |
2971 | ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) { |
2972 | InlineCostFeaturesAnalyzer CFA(CalleeTTI, GetAssumptionCache, GetBFI, PSI, |
2973 | ORE, *Call.getCalledFunction(), Call); |
2974 | auto R = CFA.analyze(); |
2975 | if (!R.isSuccess()) |
2976 | return std::nullopt; |
2977 | return CFA.features(); |
2978 | } |
2979 | |
2980 | std::optional<InlineResult> llvm::getAttributeBasedInliningDecision( |
2981 | CallBase &Call, Function *Callee, TargetTransformInfo &CalleeTTI, |
2982 | function_ref<const TargetLibraryInfo &(Function &)> GetTLI) { |
2983 | |
2984 | // Cannot inline indirect calls. |
2985 | if (!Callee) |
2986 | return InlineResult::failure(Reason: "indirect call" ); |
2987 | |
2988 | // When a callee coroutine function is inlined into a caller coroutine
2989 | // function before the coro-split pass, the coro-early pass cannot handle
2990 | // it well. So we do not inline a coroutine function that has not yet been
2991 | // split.
2992 | if (Callee->isPresplitCoroutine()) |
2993 | return InlineResult::failure(Reason: "unsplited coroutine call" ); |
2994 | |
2995 | // Never inline calls with byval arguments that do not have the alloca
2996 | // address space. Since byval arguments can be replaced with a copy to an |
2997 | // alloca, the inlined code would need to be adjusted to handle the fact that
2998 | // the argument is in the alloca address space (which is a little complicated
2999 | // to solve).
3000 | unsigned AllocaAS = Callee->getDataLayout().getAllocaAddrSpace(); |
3001 | for (unsigned I = 0, E = Call.arg_size(); I != E; ++I) |
3002 | if (Call.isByValArgument(ArgNo: I)) { |
3003 | PointerType *PTy = cast<PointerType>(Val: Call.getArgOperand(i: I)->getType()); |
3004 | if (PTy->getAddressSpace() != AllocaAS) |
3005 | return InlineResult::failure(Reason: "byval arguments without alloca" |
3006 | " address space" ); |
3007 | } |
3008 | |
3009 | // Calls to functions with always-inline attributes should be inlined |
3010 | // whenever possible. |
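// For example, a callee declared with __attribute__((always_inline)) in C or
// C++ typically carries this attribute.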
3011 | if (Call.hasFnAttr(Kind: Attribute::AlwaysInline)) { |
3012 | if (Call.getAttributes().hasFnAttr(Kind: Attribute::NoInline)) |
3013 | return InlineResult::failure(Reason: "noinline call site attribute" ); |
3014 | |
3015 | auto IsViable = isInlineViable(Callee&: *Callee); |
3016 | if (IsViable.isSuccess()) |
3017 | return InlineResult::success(); |
3018 | return InlineResult::failure(Reason: IsViable.getFailureReason()); |
3019 | } |
3020 | |
3021 | // Never inline functions with conflicting attributes (unless callee has |
3022 | // always-inline attribute). |
3023 | Function *Caller = Call.getCaller(); |
3024 | if (!functionsHaveCompatibleAttributes(Caller, Callee, TTI&: CalleeTTI, GetTLI)) |
3025 | return InlineResult::failure(Reason: "conflicting attributes" ); |
3026 | |
3027 | // Don't inline this call if the caller has the optnone attribute. |
3028 | if (Caller->hasOptNone()) |
3029 | return InlineResult::failure(Reason: "optnone attribute" ); |
3030 | |
3031 | // Don't inline a function that treats null pointer as valid into a caller |
3032 | // that does not have this attribute. |
3033 | if (!Caller->nullPointerIsDefined() && Callee->nullPointerIsDefined()) |
3034 | return InlineResult::failure(Reason: "nullptr definitions incompatible" ); |
3035 | |
3036 | // Don't inline functions which can be interposed at link-time. |
3037 | if (Callee->isInterposable()) |
3038 | return InlineResult::failure(Reason: "interposable" ); |
3039 | |
3040 | // Don't inline functions marked noinline. |
3041 | if (Callee->hasFnAttribute(Kind: Attribute::NoInline)) |
3042 | return InlineResult::failure(Reason: "noinline function attribute" ); |
3043 | |
3044 | // Don't inline call sites marked noinline. |
3045 | if (Call.isNoInline()) |
3046 | return InlineResult::failure(Reason: "noinline call site attribute" ); |
3047 | |
3048 | return std::nullopt; |
3049 | } |
3050 | |
3051 | InlineCost llvm::getInlineCost( |
3052 | CallBase &Call, Function *Callee, const InlineParams &Params, |
3053 | TargetTransformInfo &CalleeTTI, |
3054 | function_ref<AssumptionCache &(Function &)> GetAssumptionCache, |
3055 | function_ref<const TargetLibraryInfo &(Function &)> GetTLI, |
3056 | function_ref<BlockFrequencyInfo &(Function &)> GetBFI, |
3057 | ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) { |
3058 | |
3059 | auto UserDecision = |
3060 | llvm::getAttributeBasedInliningDecision(Call, Callee, CalleeTTI, GetTLI); |
3061 | |
3062 | if (UserDecision) { |
3063 | if (UserDecision->isSuccess()) |
3064 | return llvm::InlineCost::getAlways(Reason: "always inline attribute" ); |
3065 | return llvm::InlineCost::getNever(Reason: UserDecision->getFailureReason()); |
3066 | } |
3067 | |
3068 | LLVM_DEBUG(llvm::dbgs() << " Analyzing call of " << Callee->getName() |
3069 | << "... (caller:" << Call.getCaller()->getName() |
3070 | << ")\n" ); |
3071 | |
3072 | InlineCostCallAnalyzer CA(*Callee, Call, Params, CalleeTTI, |
3073 | GetAssumptionCache, GetBFI, PSI, ORE); |
3074 | InlineResult ShouldInline = CA.analyze(); |
3075 | |
3076 | LLVM_DEBUG(CA.dump()); |
3077 | |
3078 | // Always make the cost-benefit-based decision explicit.
3079 | // We use always/never here since threshold is not meaningful, |
3080 | // as it's not what drives cost-benefit analysis. |
3081 | if (CA.wasDecidedByCostBenefit()) { |
3082 | if (ShouldInline.isSuccess()) |
3083 | return InlineCost::getAlways(Reason: "benefit over cost" , |
3084 | CostBenefit: CA.getCostBenefitPair()); |
3085 | else |
3086 | return InlineCost::getNever(Reason: "cost over benefit" , CostBenefit: CA.getCostBenefitPair()); |
3087 | } |
3088 | |
3089 | if (CA.wasDecidedByCostThreshold()) |
3090 | return InlineCost::get(Cost: CA.getCost(), Threshold: CA.getThreshold(), |
3091 | StaticBonus: CA.getStaticBonusApplied()); |
3092 | |
3093 | // No details on how the decision was made; simply return always or never.
3094 | return ShouldInline.isSuccess() |
3095 | ? InlineCost::getAlways(Reason: "empty function" ) |
3096 | : InlineCost::getNever(Reason: ShouldInline.getFailureReason()); |
3097 | } |
3098 | |
3099 | InlineResult llvm::isInlineViable(Function &F) { |
3100 | bool ReturnsTwice = F.hasFnAttribute(Kind: Attribute::ReturnsTwice); |
3101 | for (BasicBlock &BB : F) { |
3102 | // Disallow inlining of functions which contain indirect branches. |
3103 | if (isa<IndirectBrInst>(Val: BB.getTerminator())) |
3104 | return InlineResult::failure(Reason: "contains indirect branches" ); |
3105 | |
3106 | // Disallow inlining of blockaddresses which are used by non-callbr |
3107 | // instructions. |
3108 | if (BB.hasAddressTaken()) |
3109 | for (User *U : BlockAddress::get(BB: &BB)->users()) |
3110 | if (!isa<CallBrInst>(Val: *U)) |
3111 | return InlineResult::failure(Reason: "blockaddress used outside of callbr" ); |
3112 | |
3113 | for (auto &II : BB) { |
3114 | CallBase *Call = dyn_cast<CallBase>(Val: &II); |
3115 | if (!Call) |
3116 | continue; |
3117 | |
3118 | // Disallow recursive calls. |
3119 | Function *Callee = Call->getCalledFunction(); |
3120 | if (&F == Callee) |
3121 | return InlineResult::failure(Reason: "recursive call" ); |
3122 | |
3123 | // Disallow calls which expose returns-twice to a function not previously |
3124 | // attributed as such. |
3125 | if (!ReturnsTwice && isa<CallInst>(Val: Call) && |
3126 | cast<CallInst>(Val: Call)->canReturnTwice()) |
3127 | return InlineResult::failure(Reason: "exposes returns-twice attribute" ); |
3128 | |
3129 | if (Callee) |
3130 | switch (Callee->getIntrinsicID()) { |
3131 | default: |
3132 | break; |
3133 | case llvm::Intrinsic::icall_branch_funnel: |
3134 | // Disallow inlining of @llvm.icall.branch.funnel because the current
3135 | // backend can't separate call targets from call arguments. |
3136 | return InlineResult::failure( |
3137 | Reason: "disallowed inlining of @llvm.icall.branch.funnel" ); |
3138 | case llvm::Intrinsic::localescape: |
3139 | // Disallow inlining functions that call @llvm.localescape. Doing this |
3140 | // correctly would require major changes to the inliner. |
3141 | return InlineResult::failure( |
3142 | Reason: "disallowed inlining of @llvm.localescape" ); |
3143 | case llvm::Intrinsic::vastart: |
3144 | // Disallow inlining of functions that initialize VarArgs with |
3145 | // va_start. |
3146 | return InlineResult::failure( |
3147 | Reason: "contains VarArgs initialized with va_start" ); |
3148 | } |
3149 | } |
3150 | } |
3151 | |
3152 | return InlineResult::success(); |
3153 | } |
3154 | |
3155 | // APIs to create InlineParams based on command line flags and/or other |
3156 | // parameters. |
3157 | |
3158 | InlineParams llvm::getInlineParams(int Threshold) { |
3159 | InlineParams Params; |
3160 | |
3161 | // This field is the threshold to use for a callee by default. This is |
3162 | // derived from one or more of: |
3163 | // * optimization or size-optimization levels, |
3164 | // * a value passed to createFunctionInliningPass function, or |
3165 | // * the -inline-threshold flag. |
3166 | // If the -inline-threshold flag is explicitly specified, that is used |
3167 | // irrespective of anything else. |
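// For example, running opt with '-inline-threshold=500' makes 500 the default
// threshold regardless of the Threshold argument passed in here.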
3168 | if (InlineThreshold.getNumOccurrences() > 0) |
3169 | Params.DefaultThreshold = InlineThreshold; |
3170 | else |
3171 | Params.DefaultThreshold = Threshold; |
3172 | |
3173 | // Set the HintThreshold knob from the -inlinehint-threshold. |
3174 | Params.HintThreshold = HintThreshold; |
3175 | |
3176 | // Set the HotCallSiteThreshold knob from the -hot-callsite-threshold. |
3177 | Params.HotCallSiteThreshold = HotCallSiteThreshold; |
3178 | |
3179 | // If the -locally-hot-callsite-threshold is explicitly specified, use it to |
3180 | // populate LocallyHotCallSiteThreshold. Later, we populate |
3181 | // Params.LocallyHotCallSiteThreshold from -locally-hot-callsite-threshold if |
3182 | // we know that optimization level is O3 (in the getInlineParams variant that |
3183 | // takes the opt and size levels). |
3184 | // FIXME: Remove this check (and make the assignment unconditional) after |
3185 | // addressing size regression issues at O2. |
3186 | if (LocallyHotCallSiteThreshold.getNumOccurrences() > 0) |
3187 | Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold; |
3188 | |
3189 | // Set the ColdCallSiteThreshold knob from the |
3190 | // -inline-cold-callsite-threshold. |
3191 | Params.ColdCallSiteThreshold = ColdCallSiteThreshold; |
3192 | |
3193 | // Set the OptMinSizeThreshold and OptSizeThreshold params only if the |
3194 | // -inline-threshold command-line option is not explicitly given. If that
3195 | // option is present, then its value applies even for callees with size and |
3196 | // minsize attributes. |
3197 | // If the -inline-threshold is not specified, set the ColdThreshold from the |
3198 | // -inlinecold-threshold even if it is not explicitly passed. If |
3199 | // -inline-threshold is specified, then -inlinecold-threshold needs to be |
3200 | // explicitly specified to set the ColdThreshold knob.
3201 | if (InlineThreshold.getNumOccurrences() == 0) { |
3202 | Params.OptMinSizeThreshold = InlineConstants::OptMinSizeThreshold; |
3203 | Params.OptSizeThreshold = InlineConstants::OptSizeThreshold; |
3204 | Params.ColdThreshold = ColdThreshold; |
3205 | } else if (ColdThreshold.getNumOccurrences() > 0) { |
3206 | Params.ColdThreshold = ColdThreshold; |
3207 | } |
3208 | return Params; |
3209 | } |
3210 | |
3211 | InlineParams llvm::getInlineParams() { |
3212 | return getInlineParams(Threshold: DefaultThreshold); |
3213 | } |
3214 | |
3215 | // Compute the default threshold for inlining based on the opt level and the |
3216 | // size opt level. |
3217 | static int computeThresholdFromOptLevels(unsigned OptLevel, |
3218 | unsigned SizeOptLevel) { |
3219 | if (OptLevel > 2) |
3220 | return InlineConstants::OptAggressiveThreshold; |
3221 | if (SizeOptLevel == 1) // -Os |
3222 | return InlineConstants::OptSizeThreshold; |
3223 | if (SizeOptLevel == 2) // -Oz |
3224 | return InlineConstants::OptMinSizeThreshold; |
3225 | return DefaultThreshold; |
3226 | } |
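// For example, getInlineParams(3, 0) starts from OptAggressiveThreshold,
// while getInlineParams(2, 2) starts from OptMinSizeThreshold (both subject
// to the -inline-threshold override above).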
3227 | |
3228 | InlineParams llvm::getInlineParams(unsigned OptLevel, unsigned SizeOptLevel) { |
3229 | auto Params = |
3230 | getInlineParams(Threshold: computeThresholdFromOptLevels(OptLevel, SizeOptLevel)); |
3231 | // At O3, use the value of -locally-hot-callsite-threshold option to populate |
3232 | // Params.LocallyHotCallSiteThreshold. Below O3, this flag has effect only |
3233 | // when it is specified explicitly. |
3234 | if (OptLevel > 2) |
3235 | Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold; |
3236 | return Params; |
3237 | } |
3238 | |
3239 | PreservedAnalyses |
3240 | InlineCostAnnotationPrinterPass::run(Function &F, |
3241 | FunctionAnalysisManager &FAM) { |
3242 | PrintInstructionComments = true; |
3243 | std::function<AssumptionCache &(Function &)> GetAssumptionCache = |
3244 | [&](Function &F) -> AssumptionCache & { |
3245 | return FAM.getResult<AssumptionAnalysis>(IR&: F); |
3246 | }; |
3247 | Module *M = F.getParent(); |
3248 | ProfileSummaryInfo PSI(*M); |
3249 | DataLayout DL(M); |
3250 | TargetTransformInfo TTI(DL); |
3251 | // FIXME: Redesign the usage of InlineParams to expand the scope of this pass. |
3252 | // In the current implementation, the type of InlineParams doesn't matter as |
3253 | // the pass serves only for verification of inliner's decisions. |
3254 | // We can add a flag which determines InlineParams for this run. Right now, |
3255 | // the default InlineParams are used. |
3256 | const InlineParams Params = llvm::getInlineParams(); |
3257 | for (BasicBlock &BB : F) { |
3258 | for (Instruction &I : BB) { |
3259 | if (CallInst *CI = dyn_cast<CallInst>(Val: &I)) { |
3260 | Function *CalledFunction = CI->getCalledFunction(); |
3261 | if (!CalledFunction || CalledFunction->isDeclaration()) |
3262 | continue; |
3263 | OptimizationRemarkEmitter ORE(CalledFunction); |
3264 | InlineCostCallAnalyzer ICCA(*CalledFunction, *CI, Params, TTI, |
3265 | GetAssumptionCache, nullptr, &PSI, &ORE); |
3266 | ICCA.analyze(); |
3267 | OS << " Analyzing call of " << CalledFunction->getName() |
3268 | << "... (caller:" << CI->getCaller()->getName() << ")\n" ; |
3269 | ICCA.print(OS); |
3270 | OS << "\n" ; |
3271 | } |
3272 | } |
3273 | } |
3274 | return PreservedAnalyses::all(); |
3275 | } |
3276 | |