1 | //===- InlineCost.cpp - Cost analysis for inliner -------------------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This file implements inline cost analysis. |
10 | // |
11 | //===----------------------------------------------------------------------===// |
12 | |
13 | #include "llvm/Analysis/InlineCost.h" |
14 | #include "llvm/ADT/STLExtras.h" |
15 | #include "llvm/ADT/SetVector.h" |
16 | #include "llvm/ADT/SmallPtrSet.h" |
17 | #include "llvm/ADT/SmallVector.h" |
18 | #include "llvm/ADT/Statistic.h" |
19 | #include "llvm/Analysis/AssumptionCache.h" |
20 | #include "llvm/Analysis/BlockFrequencyInfo.h" |
21 | #include "llvm/Analysis/CodeMetrics.h" |
22 | #include "llvm/Analysis/ConstantFolding.h" |
23 | #include "llvm/Analysis/DomConditionCache.h" |
24 | #include "llvm/Analysis/EphemeralValuesCache.h" |
25 | #include "llvm/Analysis/InstructionSimplify.h" |
26 | #include "llvm/Analysis/LoopInfo.h" |
27 | #include "llvm/Analysis/MemoryBuiltins.h" |
28 | #include "llvm/Analysis/OptimizationRemarkEmitter.h" |
29 | #include "llvm/Analysis/ProfileSummaryInfo.h" |
30 | #include "llvm/Analysis/TargetLibraryInfo.h" |
31 | #include "llvm/Analysis/TargetTransformInfo.h" |
32 | #include "llvm/Analysis/ValueTracking.h" |
33 | #include "llvm/Config/llvm-config.h" |
34 | #include "llvm/IR/AssemblyAnnotationWriter.h" |
35 | #include "llvm/IR/CallingConv.h" |
36 | #include "llvm/IR/DataLayout.h" |
37 | #include "llvm/IR/Dominators.h" |
38 | #include "llvm/IR/GetElementPtrTypeIterator.h" |
39 | #include "llvm/IR/GlobalAlias.h" |
40 | #include "llvm/IR/InstVisitor.h" |
41 | #include "llvm/IR/IntrinsicInst.h" |
42 | #include "llvm/IR/Operator.h" |
43 | #include "llvm/IR/PatternMatch.h" |
44 | #include "llvm/Support/CommandLine.h" |
45 | #include "llvm/Support/Debug.h" |
46 | #include "llvm/Support/FormattedStream.h" |
47 | #include "llvm/Support/raw_ostream.h" |
48 | #include <climits> |
49 | #include <limits> |
50 | #include <optional> |
51 | |
52 | using namespace llvm; |
53 | |
54 | #define DEBUG_TYPE "inline-cost" |
55 | |
56 | STATISTIC(NumCallsAnalyzed, "Number of call sites analyzed" ); |
57 | |
58 | static cl::opt<int> |
59 | DefaultThreshold("inlinedefault-threshold" , cl::Hidden, cl::init(Val: 225), |
60 | cl::desc("Default amount of inlining to perform" )); |
61 | |
62 | // We introduce this option since there is a minor compile-time win by avoiding |
63 | // addition of TTI attributes (target-features in particular) to inline |
64 | // candidates when they are guaranteed to be the same as top level methods in |
65 | // some use cases. If we avoid adding the attribute, we need an option to avoid |
66 | // checking these attributes. |
67 | static cl::opt<bool> IgnoreTTIInlineCompatible( |
68 | "ignore-tti-inline-compatible" , cl::Hidden, cl::init(Val: false), |
69 | cl::desc("Ignore TTI attributes compatibility check between callee/caller " |
70 | "during inline cost calculation" )); |
71 | |
72 | static cl::opt<bool> PrintInstructionComments( |
73 | "print-instruction-comments" , cl::Hidden, cl::init(Val: false), |
74 | cl::desc("Prints comments for instruction based on inline cost analysis" )); |
75 | |
76 | static cl::opt<int> InlineThreshold( |
77 | "inline-threshold" , cl::Hidden, cl::init(Val: 225), |
78 | cl::desc("Control the amount of inlining to perform (default = 225)" )); |
79 | |
80 | static cl::opt<int> HintThreshold( |
81 | "inlinehint-threshold" , cl::Hidden, cl::init(Val: 325), |
82 | cl::desc("Threshold for inlining functions with inline hint" )); |
83 | |
84 | static cl::opt<int> |
85 | ColdCallSiteThreshold("inline-cold-callsite-threshold" , cl::Hidden, |
86 | cl::init(Val: 45), |
87 | cl::desc("Threshold for inlining cold callsites" )); |
88 | |
89 | static cl::opt<bool> InlineEnableCostBenefitAnalysis( |
90 | "inline-enable-cost-benefit-analysis" , cl::Hidden, cl::init(Val: false), |
91 | cl::desc("Enable the cost-benefit analysis for the inliner" )); |
92 | |
93 | // InlineSavingsMultiplier overrides per TTI multipliers iff it is |
94 | // specified explicitly in command line options. This option is exposed |
95 | // for tuning and testing. |
96 | static cl::opt<int> InlineSavingsMultiplier( |
97 | "inline-savings-multiplier" , cl::Hidden, cl::init(Val: 8), |
98 | cl::desc("Multiplier to multiply cycle savings by during inlining" )); |
99 | |
100 | // InlineSavingsProfitableMultiplier overrides per TTI multipliers iff it is |
101 | // specified explicitly in command line options. This option is exposed |
102 | // for tuning and testing. |
103 | static cl::opt<int> InlineSavingsProfitableMultiplier( |
104 | "inline-savings-profitable-multiplier" , cl::Hidden, cl::init(Val: 4), |
105 | cl::desc("A multiplier on top of cycle savings to decide whether the " |
106 | "savings won't justify the cost" )); |
107 | |
108 | static cl::opt<int> |
109 | InlineSizeAllowance("inline-size-allowance" , cl::Hidden, cl::init(Val: 100), |
110 | cl::desc("The maximum size of a callee that get's " |
111 | "inlined without sufficient cycle savings" )); |
112 | |
113 | // We introduce this threshold to help performance of instrumentation based |
114 | // PGO before we actually hook up inliner with analysis passes such as BPI and |
115 | // BFI. |
116 | static cl::opt<int> ColdThreshold( |
117 | "inlinecold-threshold" , cl::Hidden, cl::init(Val: 45), |
118 | cl::desc("Threshold for inlining functions with cold attribute" )); |
119 | |
120 | static cl::opt<int> |
121 | HotCallSiteThreshold("hot-callsite-threshold" , cl::Hidden, cl::init(Val: 3000), |
122 | cl::desc("Threshold for hot callsites " )); |
123 | |
124 | static cl::opt<int> LocallyHotCallSiteThreshold( |
125 | "locally-hot-callsite-threshold" , cl::Hidden, cl::init(Val: 525), |
126 | cl::desc("Threshold for locally hot callsites " )); |
127 | |
128 | static cl::opt<int> ColdCallSiteRelFreq( |
129 | "cold-callsite-rel-freq" , cl::Hidden, cl::init(Val: 2), |
130 | cl::desc("Maximum block frequency, expressed as a percentage of caller's " |
131 | "entry frequency, for a callsite to be cold in the absence of " |
132 | "profile information." )); |
133 | |
134 | static cl::opt<uint64_t> HotCallSiteRelFreq( |
135 | "hot-callsite-rel-freq" , cl::Hidden, cl::init(Val: 60), |
136 | cl::desc("Minimum block frequency, expressed as a multiple of caller's " |
137 | "entry frequency, for a callsite to be hot in the absence of " |
138 | "profile information." )); |
139 | |
140 | static cl::opt<int> |
141 | InstrCost("inline-instr-cost" , cl::Hidden, cl::init(Val: 5), |
142 | cl::desc("Cost of a single instruction when inlining" )); |
143 | |
144 | static cl::opt<int> |
145 | MemAccessCost("inline-memaccess-cost" , cl::Hidden, cl::init(Val: 0), |
146 | cl::desc("Cost of load/store instruction when inlining" )); |
147 | |
148 | static cl::opt<int> CallPenalty( |
149 | "inline-call-penalty" , cl::Hidden, cl::init(Val: 25), |
150 | cl::desc("Call penalty that is applied per callsite when inlining" )); |
151 | |
152 | static cl::opt<size_t> |
153 | StackSizeThreshold("inline-max-stacksize" , cl::Hidden, |
154 | cl::init(Val: std::numeric_limits<size_t>::max()), |
155 | cl::desc("Do not inline functions with a stack size " |
156 | "that exceeds the specified limit" )); |
157 | |
158 | static cl::opt<size_t> RecurStackSizeThreshold( |
159 | "recursive-inline-max-stacksize" , cl::Hidden, |
160 | cl::init(Val: InlineConstants::TotalAllocaSizeRecursiveCaller), |
161 | cl::desc("Do not inline recursive functions with a stack " |
162 | "size that exceeds the specified limit" )); |
163 | |
164 | static cl::opt<bool> OptComputeFullInlineCost( |
165 | "inline-cost-full" , cl::Hidden, |
166 | cl::desc("Compute the full inline cost of a call site even when the cost " |
167 | "exceeds the threshold." )); |
168 | |
169 | static cl::opt<bool> InlineCallerSupersetNoBuiltin( |
170 | "inline-caller-superset-nobuiltin" , cl::Hidden, cl::init(Val: true), |
171 | cl::desc("Allow inlining when caller has a superset of callee's nobuiltin " |
172 | "attributes." )); |
173 | |
174 | static cl::opt<bool> DisableGEPConstOperand( |
175 | "disable-gep-const-evaluation" , cl::Hidden, cl::init(Val: false), |
176 | cl::desc("Disables evaluation of GetElementPtr with constant operands" )); |
177 | |
178 | namespace llvm { |
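// Read an integer out of a string function attribute, e.g. a call-site
// attribute such as "function-inline-cost"="20". Returns std::nullopt if the
// attribute is missing or its value does not parse as a base-10 integer.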
179 | std::optional<int> getStringFnAttrAsInt(const Attribute &Attr) { |
180 | if (Attr.isValid()) { |
181 | int AttrValue = 0; |
182 | if (!Attr.getValueAsString().getAsInteger(Radix: 10, Result&: AttrValue)) |
183 | return AttrValue; |
184 | } |
185 | return std::nullopt; |
186 | } |
187 | |
188 | std::optional<int> getStringFnAttrAsInt(CallBase &CB, StringRef AttrKind) { |
189 | return getStringFnAttrAsInt(Attr: CB.getFnAttr(Kind: AttrKind)); |
190 | } |
191 | |
192 | std::optional<int> getStringFnAttrAsInt(Function *F, StringRef AttrKind) { |
193 | return getStringFnAttrAsInt(Attr: F->getFnAttribute(Kind: AttrKind)); |
194 | } |
195 | |
196 | namespace InlineConstants { |
197 | int getInstrCost() { return InstrCost; } |
198 | |
199 | } // namespace InlineConstants |
200 | |
201 | } // namespace llvm |
202 | |
203 | namespace { |
204 | class InlineCostCallAnalyzer; |
205 | |
206 | // This struct is used to store information about the inline cost of a |
207 | // particular instruction. |
208 | struct InstructionCostDetail { |
209 | int CostBefore = 0; |
210 | int CostAfter = 0; |
211 | int ThresholdBefore = 0; |
212 | int ThresholdAfter = 0; |
213 | |
214 | int getThresholdDelta() const { return ThresholdAfter - ThresholdBefore; } |
215 | |
216 | int getCostDelta() const { return CostAfter - CostBefore; } |
217 | |
218 | bool hasThresholdChanged() const { return ThresholdAfter != ThresholdBefore; } |
219 | }; |
220 | |
221 | class InlineCostAnnotationWriter : public AssemblyAnnotationWriter { |
222 | private: |
223 | InlineCostCallAnalyzer *const ICCA; |
224 | |
225 | public: |
226 | InlineCostAnnotationWriter(InlineCostCallAnalyzer *ICCA) : ICCA(ICCA) {} |
227 | void emitInstructionAnnot(const Instruction *I, |
228 | formatted_raw_ostream &OS) override; |
229 | }; |
230 | |
231 | /// Carry out call site analysis, in order to evaluate inlinability. |
232 | /// NOTE: the type is currently used as implementation detail of functions such |
233 | /// as llvm::getInlineCost. Note the function_ref constructor parameters - the |
234 | /// expectation is that they come from the outer scope, from the wrapper |
235 | /// functions. If we want to support constructing CallAnalyzer objects where |
236 | /// lambdas are provided inline at construction, or where the object needs to |
237 | /// otherwise survive past the scope of the provided functions, we need to |
238 | /// revisit the argument types. |
239 | class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> { |
240 | typedef InstVisitor<CallAnalyzer, bool> Base; |
241 | friend class InstVisitor<CallAnalyzer, bool>; |
242 | |
243 | protected: |
244 | virtual ~CallAnalyzer() = default; |
245 | /// The TargetTransformInfo available for this compilation. |
246 | const TargetTransformInfo &TTI; |
247 | |
248 | /// Getter for the cache of @llvm.assume intrinsics. |
249 | function_ref<AssumptionCache &(Function &)> GetAssumptionCache; |
250 | |
251 | /// Getter for BlockFrequencyInfo |
252 | function_ref<BlockFrequencyInfo &(Function &)> GetBFI; |
253 | |
254 | /// Getter for TargetLibraryInfo |
255 | function_ref<const TargetLibraryInfo &(Function &)> GetTLI; |
256 | |
257 | /// Profile summary information. |
258 | ProfileSummaryInfo *PSI; |
259 | |
260 | /// The called function. |
261 | Function &F; |
262 | |
263 | // Cache the DataLayout since we use it a lot. |
264 | const DataLayout &DL; |
265 | |
266 | /// The OptimizationRemarkEmitter available for this compilation. |
267 | OptimizationRemarkEmitter *ORE; |
268 | |
269 | /// The candidate callsite being analyzed. Please do not use this to do |
270 | /// analysis in the caller function; we want the inline cost query to be |
271 | /// easily cacheable. Instead, use the cover function paramHasAttr. |
272 | CallBase &CandidateCall; |
273 | |
274 | /// Getter for the cache of ephemeral values. |
275 | function_ref<EphemeralValuesCache &(Function &)> GetEphValuesCache = nullptr; |
276 | |
277 | /// Extension points for handling callsite features. |
278 | // Called before a basic block was analyzed. |
279 | virtual void onBlockStart(const BasicBlock *BB) {} |
280 | |
281 | /// Called after a basic block was analyzed. |
282 | virtual void onBlockAnalyzed(const BasicBlock *BB) {} |
283 | |
284 | /// Called before an instruction was analyzed |
285 | virtual void onInstructionAnalysisStart(const Instruction *I) {} |
286 | |
287 | /// Called after an instruction was analyzed |
288 | virtual void onInstructionAnalysisFinish(const Instruction *I) {} |
289 | |
290 | /// Called at the end of the analysis of the callsite. Return the outcome of |
291 | /// the analysis, i.e. 'InlineResult(true)' if the inlining may happen, or |
292 | /// the reason it can't. |
293 | virtual InlineResult finalizeAnalysis() { return InlineResult::success(); } |
294 | /// Called when we're about to start processing a basic block, and every time |
295 | /// we are done processing an instruction. Return true if there is no point in |
296 | /// continuing the analysis (e.g. we've determined already the call site is |
297 | /// too expensive to inline) |
298 | virtual bool shouldStop() { return false; } |
299 | |
300 | /// Called before the analysis of the callee body starts (with callsite |
301 | /// contexts propagated). It checks callsite-specific information. Return a |
302 | /// reason analysis can't continue if that's the case, or 'true' if it may |
303 | /// continue. |
304 | virtual InlineResult onAnalysisStart() { return InlineResult::success(); } |
305 | /// Called if the analysis engine decides SROA cannot be done for the given |
306 | /// alloca. |
307 | virtual void onDisableSROA(AllocaInst *Arg) {} |
308 | |
309 | /// Called when the analysis engine determines load elimination won't happen. |
310 | virtual void onDisableLoadElimination() {} |
311 | |
312 | /// Called when we visit a CallBase, before the analysis starts. Return false |
313 | /// to stop further processing of the instruction. |
314 | virtual bool onCallBaseVisitStart(CallBase &Call) { return true; } |
315 | |
316 | /// Called to account for a call. |
317 | virtual void onCallPenalty() {} |
318 | |
319 | /// Called to account for a load or store. |
320 | virtual void onMemAccess() {} |
321 | |
322 | /// Called to account for the expectation the inlining would result in a load |
323 | /// elimination. |
324 | virtual void onLoadEliminationOpportunity() {} |
325 | |
326 | /// Called to account for the cost of argument setup for the Call in the |
327 | /// callee's body (not the callsite currently under analysis). |
328 | virtual void onCallArgumentSetup(const CallBase &Call) {} |
329 | |
330 | /// Called to account for a load relative intrinsic. |
331 | virtual void onLoadRelativeIntrinsic() {} |
332 | |
333 | /// Called to account for a lowered call. |
334 | virtual void onLoweredCall(Function *F, CallBase &Call, bool IsIndirectCall) { |
335 | } |
336 | |
337 | /// Account for a jump table of given size. Return false to stop further |
338 | /// processing of the switch instruction. |
339 | virtual bool onJumpTable(unsigned JumpTableSize) { return true; } |
340 | |
341 | /// Account for a case cluster of given size. Return false to stop further |
342 | /// processing of the instruction. |
343 | virtual bool onCaseCluster(unsigned NumCaseCluster) { return true; } |
344 | |
345 | /// Called at the end of processing a switch instruction, with the given |
346 | /// number of case clusters. |
347 | virtual void onFinalizeSwitch(unsigned JumpTableSize, unsigned NumCaseCluster, |
348 | bool DefaultDestUnreachable) {} |
349 | |
350 | /// Called to account for any other instruction not specifically accounted |
351 | /// for. |
352 | virtual void onMissedSimplification() {} |
353 | |
354 | /// Start accounting potential benefits due to SROA for the given alloca. |
355 | virtual void onInitializeSROAArg(AllocaInst *Arg) {} |
356 | |
357 | /// Account SROA savings for the AllocaInst value. |
358 | virtual void onAggregateSROAUse(AllocaInst *V) {} |
359 | |
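/// If \p V maps back to an SROA-candidate alloca, either credit another SROA
/// use for it (when \p DoNotDisable is true) or give up on SROA for that
/// alloca. Returns true only when a use was credited.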
360 | bool handleSROA(Value *V, bool DoNotDisable) { |
361 | // Check for SROA candidates in comparisons. |
362 | if (auto *SROAArg = getSROAArgForValueOrNull(V)) { |
363 | if (DoNotDisable) { |
364 | onAggregateSROAUse(V: SROAArg); |
365 | return true; |
366 | } |
367 | disableSROAForArg(SROAArg); |
368 | } |
369 | return false; |
370 | } |
371 | |
372 | bool IsCallerRecursive = false; |
373 | bool IsRecursiveCall = false; |
374 | bool ExposesReturnsTwice = false; |
375 | bool HasDynamicAlloca = false; |
376 | bool ContainsNoDuplicateCall = false; |
377 | bool HasReturn = false; |
378 | bool HasIndirectBr = false; |
379 | bool HasUninlineableIntrinsic = false; |
380 | bool InitsVargArgs = false; |
381 | |
382 | /// Number of bytes allocated statically by the callee. |
383 | uint64_t AllocatedSize = 0; |
384 | unsigned NumInstructions = 0; |
385 | unsigned NumVectorInstructions = 0; |
386 | |
387 | /// While we walk the potentially-inlined instructions, we build up and |
388 | /// maintain a mapping of simplified values specific to this callsite. The |
389 | /// idea is to propagate any special information we have about arguments to |
390 | /// this call through the inlinable section of the function, and account for |
391 | /// likely simplifications post-inlining. The most important aspect we track |
392 | /// is CFG altering simplifications -- when we prove a basic block dead, that |
393 | /// can cause dramatic shifts in the cost of inlining a function. |
394 | /// Note: The simplified Value may be owned by the caller function. |
395 | DenseMap<Value *, Value *> SimplifiedValues; |
396 | |
397 | /// Keep track of the values which map back (through function arguments) to |
398 | /// allocas on the caller stack which could be simplified through SROA. |
399 | DenseMap<Value *, AllocaInst *> SROAArgValues; |
400 | |
401 | /// Keep track of Allocas for which we believe we may get SROA optimization. |
402 | DenseSet<AllocaInst *> EnabledSROAAllocas; |
403 | |
404 | /// Keep track of values which map to a pointer base and constant offset. |
405 | DenseMap<Value *, std::pair<Value *, APInt>> ConstantOffsetPtrs; |
406 | |
407 | /// Keep track of dead blocks due to the constant arguments. |
408 | SmallPtrSet<BasicBlock *, 16> DeadBlocks; |
409 | |
410 | /// The mapping of the blocks to their known unique successors due to the |
411 | /// constant arguments. |
412 | DenseMap<BasicBlock *, BasicBlock *> KnownSuccessors; |
413 | |
414 | /// Model the elimination of repeated loads that is expected to happen |
415 | /// whenever we simplify away the stores that would otherwise cause them to be |
416 | /// loads. |
417 | bool EnableLoadElimination = true; |
418 | |
419 | /// Whether we allow inlining of recursive calls. |
420 | bool AllowRecursiveCall = false; |
421 | |
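/// Addresses the callee loads from; a second load from the same address may
/// be eliminated after inlining (see onLoadEliminationOpportunity).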
422 | SmallPtrSet<Value *, 16> LoadAddrSet; |
423 | |
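/// Map \p V back to the caller alloca it is derived from, or return nullptr
/// if there is no such alloca or SROA has already been disabled for it.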
424 | AllocaInst *getSROAArgForValueOrNull(Value *V) const { |
425 | auto It = SROAArgValues.find(Val: V); |
426 | if (It == SROAArgValues.end() || EnabledSROAAllocas.count(V: It->second) == 0) |
427 | return nullptr; |
428 | return It->second; |
429 | } |
430 | |
431 | /// Use a value in its given form directly if possible, otherwise try looking |
432 | /// for it in SimplifiedValues. |
433 | template <typename T> T *getDirectOrSimplifiedValue(Value *V) const { |
434 | if (auto *Direct = dyn_cast<T>(V)) |
435 | return Direct; |
436 | return getSimplifiedValue<T>(V); |
437 | } |
438 | |
439 | // Custom simplification helper routines. |
440 | bool isAllocaDerivedArg(Value *V); |
441 | void disableSROAForArg(AllocaInst *SROAArg); |
442 | void disableSROA(Value *V); |
443 | void findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB); |
444 | void disableLoadElimination(); |
445 | bool isGEPFree(GetElementPtrInst &GEP); |
446 | bool canFoldInboundsGEP(GetElementPtrInst &I); |
447 | bool accumulateGEPOffset(GEPOperator &GEP, APInt &Offset); |
448 | bool simplifyCallSite(Function *F, CallBase &Call); |
449 | bool simplifyCmpInstForRecCall(CmpInst &Cmp); |
450 | bool simplifyInstruction(Instruction &I); |
451 | bool simplifyIntrinsicCallIsConstant(CallBase &CB); |
452 | bool simplifyIntrinsicCallObjectSize(CallBase &CB); |
453 | ConstantInt *stripAndComputeInBoundsConstantOffsets(Value *&V); |
454 | bool isLoweredToCall(Function *F, CallBase &Call); |
455 | |
456 | /// Return true if the given argument to the function being considered for |
457 | /// inlining has the given attribute set either at the call site or the |
458 | /// function declaration. Primarily used to inspect call site specific |
459 | /// attributes since these can be more precise than the ones on the callee |
460 | /// itself. |
461 | bool paramHasAttr(Argument *A, Attribute::AttrKind Attr); |
462 | |
463 | /// Return true if the given value is known non null within the callee if |
464 | /// inlined through this particular callsite. |
465 | bool isKnownNonNullInCallee(Value *V); |
466 | |
467 | /// Return true if size growth is allowed when inlining the callee at \p Call. |
468 | bool allowSizeGrowth(CallBase &Call); |
469 | |
470 | // Custom analysis routines. |
471 | InlineResult analyzeBlock(BasicBlock *BB, |
472 | const SmallPtrSetImpl<const Value *> &EphValues); |
473 | |
474 | // Disable several entry points to the visitor so we don't accidentally use |
475 | // them by declaring but not defining them here. |
476 | void visit(Module *); |
477 | void visit(Module &); |
478 | void visit(Function *); |
479 | void visit(Function &); |
480 | void visit(BasicBlock *); |
481 | void visit(BasicBlock &); |
482 | |
483 | // Provide base case for our instruction visit. |
484 | bool visitInstruction(Instruction &I); |
485 | |
486 | // Our visit overrides. |
487 | bool visitAlloca(AllocaInst &I); |
488 | bool visitPHI(PHINode &I); |
489 | bool visitGetElementPtr(GetElementPtrInst &I); |
490 | bool visitBitCast(BitCastInst &I); |
491 | bool visitPtrToInt(PtrToIntInst &I); |
492 | bool visitIntToPtr(IntToPtrInst &I); |
493 | bool visitCastInst(CastInst &I); |
494 | bool visitCmpInst(CmpInst &I); |
495 | bool visitSub(BinaryOperator &I); |
496 | bool visitBinaryOperator(BinaryOperator &I); |
497 | bool visitFNeg(UnaryOperator &I); |
498 | bool visitLoad(LoadInst &I); |
499 | bool visitStore(StoreInst &I); |
500 | bool visitExtractValue(ExtractValueInst &I); |
501 | bool visitInsertValue(InsertValueInst &I); |
502 | bool visitCallBase(CallBase &Call); |
503 | bool visitReturnInst(ReturnInst &RI); |
504 | bool visitBranchInst(BranchInst &BI); |
505 | bool visitSelectInst(SelectInst &SI); |
506 | bool visitSwitchInst(SwitchInst &SI); |
507 | bool visitIndirectBrInst(IndirectBrInst &IBI); |
508 | bool visitResumeInst(ResumeInst &RI); |
509 | bool visitCleanupReturnInst(CleanupReturnInst &RI); |
510 | bool visitCatchReturnInst(CatchReturnInst &RI); |
511 | bool visitUnreachableInst(UnreachableInst &I); |
512 | |
513 | public: |
514 | CallAnalyzer( |
515 | Function &Callee, CallBase &Call, const TargetTransformInfo &TTI, |
516 | function_ref<AssumptionCache &(Function &)> GetAssumptionCache, |
517 | function_ref<BlockFrequencyInfo &(Function &)> GetBFI = nullptr, |
518 | function_ref<const TargetLibraryInfo &(Function &)> GetTLI = nullptr, |
519 | ProfileSummaryInfo *PSI = nullptr, |
520 | OptimizationRemarkEmitter *ORE = nullptr, |
521 | function_ref<EphemeralValuesCache &(Function &)> GetEphValuesCache = |
522 | nullptr) |
523 | : TTI(TTI), GetAssumptionCache(GetAssumptionCache), GetBFI(GetBFI), |
524 | GetTLI(GetTLI), PSI(PSI), F(Callee), DL(F.getDataLayout()), ORE(ORE), |
525 | CandidateCall(Call), GetEphValuesCache(GetEphValuesCache) {} |
526 | |
527 | InlineResult analyze(); |
528 | |
529 | /// Lookup simplified Value. May return a value owned by the caller. |
530 | Value *getSimplifiedValueUnchecked(Value *V) const { |
531 | return SimplifiedValues.lookup(Val: V); |
532 | } |
533 | |
534 | /// Lookup simplified Value, but return nullptr if the simplified value is |
535 | /// owned by the caller. |
536 | template <typename T> T *getSimplifiedValue(Value *V) const { |
537 | Value *SimpleV = SimplifiedValues.lookup(Val: V); |
538 | if (!SimpleV) |
539 | return nullptr; |
540 | |
541 | // Skip checks if we know T is a global. This has a small, but measurable |
542 | // impact on compile-time. |
543 | if constexpr (std::is_base_of_v<Constant, T>) |
544 | return dyn_cast<T>(SimpleV); |
545 | |
546 | // Make sure the simplified Value is owned by this function |
547 | if (auto *I = dyn_cast<Instruction>(Val: SimpleV)) { |
548 | if (I->getFunction() != &F) |
549 | return nullptr; |
550 | } else if (auto *Arg = dyn_cast<Argument>(Val: SimpleV)) { |
551 | if (Arg->getParent() != &F) |
552 | return nullptr; |
553 | } else if (!isa<Constant>(Val: SimpleV)) |
554 | return nullptr; |
555 | return dyn_cast<T>(SimpleV); |
556 | } |
557 | |
558 | // Keep a bunch of stats about the cost savings found so we can print them |
559 | // out when debugging. |
560 | unsigned NumConstantArgs = 0; |
561 | unsigned NumConstantOffsetPtrArgs = 0; |
562 | unsigned NumAllocaArgs = 0; |
563 | unsigned NumConstantPtrCmps = 0; |
564 | unsigned NumConstantPtrDiffs = 0; |
565 | unsigned NumInstructionsSimplified = 0; |
566 | |
567 | void dump(); |
568 | }; |
569 | |
570 | // Considering forming a binary search, we should find the number of nodes |
571 | // which is same as the number of comparisons when lowered. For a given |
572 | // number of clusters, n, we can define a recursive function, f(n), to find |
573 | // the number of nodes in the tree. The recursion is : |
574 | // f(n) = 1 + f(n/2) + f (n - n/2), when n > 3, |
575 | // and f(n) = n, when n <= 3. |
576 | // This will lead to a binary tree where the leaf should be either f(2) or f(3) |
577 | // when n > 3. So, the number of comparisons from leaves should be n, while |
578 | // the number of non-leaf should be : |
579 | // 2^(log2(n) - 1) - 1 |
580 | // = 2^log2(n) * 2^-1 - 1 |
581 | // = n / 2 - 1. |
582 | // Considering comparisons from leaf and non-leaf nodes, we can estimate the |
583 | // number of comparisons in a simple closed form : |
584 | // n + n / 2 - 1 = n * 3 / 2 - 1 |
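// For example, n = 8 case clusters gives an estimate of 8 * 3 / 2 - 1 = 11
// comparisons.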
585 | int64_t getExpectedNumberOfCompare(int NumCaseCluster) { |
586 | return 3 * static_cast<int64_t>(NumCaseCluster) / 2 - 1; |
587 | } |
588 | |
589 | /// FIXME: if it is necessary to derive from InlineCostCallAnalyzer, note |
590 | /// the FIXME in onLoweredCall, when instantiating an InlineCostCallAnalyzer |
591 | class InlineCostCallAnalyzer final : public CallAnalyzer { |
592 | const bool ComputeFullInlineCost; |
593 | int LoadEliminationCost = 0; |
594 | /// Bonus to be applied when percentage of vector instructions in callee is |
595 | /// high (see more details in updateThreshold). |
596 | int VectorBonus = 0; |
597 | /// Bonus to be applied when the callee has only one reachable basic block. |
598 | int SingleBBBonus = 0; |
599 | |
600 | /// Tunable parameters that control the analysis. |
601 | const InlineParams &Params; |
602 | |
603 | // This DenseMap stores the delta change in cost and threshold after |
604 | // accounting for the given instruction. The map is filled only with the |
605 | // flag PrintInstructionComments on. |
606 | DenseMap<const Instruction *, InstructionCostDetail> InstructionCostDetailMap; |
607 | |
608 | /// Upper bound for the inlining cost. Bonuses are being applied to account |
609 | /// for speculative "expected profit" of the inlining decision. |
610 | int Threshold = 0; |
611 | |
612 | /// The amount of StaticBonus applied. |
613 | int StaticBonusApplied = 0; |
614 | |
615 | /// Attempt to evaluate indirect calls to boost their inline cost. |
616 | const bool BoostIndirectCalls; |
617 | |
618 | /// Ignore the threshold when finalizing analysis. |
619 | const bool IgnoreThreshold; |
620 | |
621 | // True if the cost-benefit-analysis-based inliner is enabled. |
622 | const bool CostBenefitAnalysisEnabled; |
623 | |
624 | /// Inlining cost measured in abstract units, accounts for all the |
625 | /// instructions expected to be executed for a given function invocation. |
626 | /// Instructions that are statically proven to be dead based on call-site |
627 | /// arguments are not counted here. |
628 | int Cost = 0; |
629 | |
630 | // The cumulative cost at the beginning of the basic block being analyzed. At |
631 | // the end of analyzing each basic block, "Cost - CostAtBBStart" represents |
632 | // the size of that basic block. |
633 | int CostAtBBStart = 0; |
634 | |
635 | // The static size of live but cold basic blocks. This is "static" in the |
636 | // sense that it's not weighted by profile counts at all. |
637 | int ColdSize = 0; |
638 | |
639 | // Whether inlining is decided by cost-threshold analysis. |
640 | bool DecidedByCostThreshold = false; |
641 | |
642 | // Whether inlining is decided by cost-benefit analysis. |
643 | bool DecidedByCostBenefit = false; |
644 | |
645 | // The cost-benefit pair computed by cost-benefit analysis. |
646 | std::optional<CostBenefitPair> CostBenefit; |
647 | |
648 | bool SingleBB = true; |
649 | |
650 | unsigned SROACostSavings = 0; |
651 | unsigned SROACostSavingsLost = 0; |
652 | |
653 | /// The mapping of caller Alloca values to their accumulated cost savings. If |
654 | /// we have to disable SROA for one of the allocas, this tells us how much |
655 | /// cost must be added. |
656 | DenseMap<AllocaInst *, int> SROAArgCosts; |
657 | |
658 | /// Return true if \p Call is a cold callsite. |
659 | bool isColdCallSite(CallBase &Call, BlockFrequencyInfo *CallerBFI); |
660 | |
661 | /// Update Threshold based on callsite properties such as callee |
662 | /// attributes and callee hotness for PGO builds. The Callee is explicitly |
663 | /// passed to support analyzing indirect calls whose target is inferred by |
664 | /// analysis. |
665 | void updateThreshold(CallBase &Call, Function &Callee); |
666 | /// Return a higher threshold if \p Call is a hot callsite. |
667 | std::optional<int> getHotCallSiteThreshold(CallBase &Call, |
668 | BlockFrequencyInfo *CallerBFI); |
669 | |
670 | /// Handle a capped 'int' increment for Cost. |
671 | void addCost(int64_t Inc) { |
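// Saturate both the increment and the running total so Cost stays within the
// range of int and never wraps around.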
672 | Inc = std::clamp<int64_t>(val: Inc, INT_MIN, INT_MAX); |
673 | Cost = std::clamp<int64_t>(val: Inc + Cost, INT_MIN, INT_MAX); |
674 | } |
675 | |
676 | void onDisableSROA(AllocaInst *Arg) override { |
677 | auto CostIt = SROAArgCosts.find(Val: Arg); |
678 | if (CostIt == SROAArgCosts.end()) |
679 | return; |
680 | addCost(Inc: CostIt->second); |
681 | SROACostSavings -= CostIt->second; |
682 | SROACostSavingsLost += CostIt->second; |
683 | SROAArgCosts.erase(I: CostIt); |
684 | } |
685 | |
686 | void onDisableLoadElimination() override { |
687 | addCost(Inc: LoadEliminationCost); |
688 | LoadEliminationCost = 0; |
689 | } |
690 | |
691 | bool onCallBaseVisitStart(CallBase &Call) override { |
692 | if (std::optional<int> AttrCallThresholdBonus = |
693 | getStringFnAttrAsInt(CB&: Call, AttrKind: "call-threshold-bonus" )) |
694 | Threshold += *AttrCallThresholdBonus; |
695 | |
696 | if (std::optional<int> AttrCallCost = |
697 | getStringFnAttrAsInt(CB&: Call, AttrKind: "call-inline-cost" )) { |
698 | addCost(Inc: *AttrCallCost); |
699 | // Prevent further processing of the call since we want to override its |
700 | // inline cost, not just add to it. |
701 | return false; |
702 | } |
703 | return true; |
704 | } |
705 | |
706 | void onCallPenalty() override { addCost(Inc: CallPenalty); } |
707 | |
708 | void onMemAccess() override { addCost(Inc: MemAccessCost); } |
709 | |
710 | void onCallArgumentSetup(const CallBase &Call) override { |
711 | // Pay the price of the argument setup. We account for the average 1 |
712 | // instruction per call argument setup here. |
713 | addCost(Inc: Call.arg_size() * InstrCost); |
714 | } |
715 | void onLoadRelativeIntrinsic() override { |
716 | // This is normally lowered to 4 LLVM instructions. |
717 | addCost(Inc: 3 * InstrCost); |
718 | } |
719 | void onLoweredCall(Function *F, CallBase &Call, |
720 | bool IsIndirectCall) override { |
721 | // We account for the average 1 instruction per call argument setup here. |
722 | addCost(Inc: Call.arg_size() * InstrCost); |
723 | |
724 | // If we have a constant that we are calling as a function, we can peer |
725 | // through it and see the function target. This happens not infrequently |
726 | // during devirtualization and so we want to give it a hefty bonus for |
727 | // inlining, but cap that bonus in the event that inlining wouldn't pan out. |
728 | // Pretend to inline the function, with a custom threshold. |
729 | if (IsIndirectCall && BoostIndirectCalls) { |
730 | auto IndirectCallParams = Params; |
731 | IndirectCallParams.DefaultThreshold = |
732 | InlineConstants::IndirectCallThreshold; |
733 | /// FIXME: if InlineCostCallAnalyzer is derived from, this may need |
734 | /// to instantiate the derived class. |
735 | InlineCostCallAnalyzer CA(*F, Call, IndirectCallParams, TTI, |
736 | GetAssumptionCache, GetBFI, GetTLI, PSI, ORE, |
737 | false); |
738 | if (CA.analyze().isSuccess()) { |
739 | // We were able to inline the indirect call! Subtract the cost from the |
740 | // threshold to get the bonus we want to apply, but don't go below zero. |
741 | Cost -= std::max(a: 0, b: CA.getThreshold() - CA.getCost()); |
742 | } |
743 | } else |
744 | // Otherwise simply add the cost for merely making the call. |
745 | addCost(Inc: TTI.getInlineCallPenalty(F: CandidateCall.getCaller(), Call, |
746 | DefaultCallPenalty: CallPenalty)); |
747 | } |
748 | |
749 | void onFinalizeSwitch(unsigned JumpTableSize, unsigned NumCaseCluster, |
750 | bool DefaultDestUnreachable) override { |
751 | // If suitable for a jump table, consider the cost for the table size and |
752 | // branch to destination. |
753 | // Maximum valid cost increased in this function. |
754 | if (JumpTableSize) { |
755 | // Suppose a default branch includes one compare and one conditional |
756 | // branch if it's reachable. |
757 | if (!DefaultDestUnreachable) |
758 | addCost(Inc: 2 * InstrCost); |
759 | // Suppose a jump table requires one load and one jump instruction. |
760 | int64_t JTCost = |
761 | static_cast<int64_t>(JumpTableSize) * InstrCost + 2 * InstrCost; |
762 | addCost(Inc: JTCost); |
763 | return; |
764 | } |
765 | |
766 | if (NumCaseCluster <= 3) { |
767 | // Suppose a comparison includes one compare and one conditional branch. |
768 | // We can reduce a set of instructions if the default branch is |
769 | // unreachable. |
770 | addCost(Inc: (NumCaseCluster - DefaultDestUnreachable) * 2 * InstrCost); |
771 | return; |
772 | } |
773 | |
774 | int64_t ExpectedNumberOfCompare = |
775 | getExpectedNumberOfCompare(NumCaseCluster); |
776 | int64_t SwitchCost = ExpectedNumberOfCompare * 2 * InstrCost; |
777 | |
778 | addCost(Inc: SwitchCost); |
779 | } |
780 | void onMissedSimplification() override { addCost(Inc: InstrCost); } |
781 | |
782 | void onInitializeSROAArg(AllocaInst *Arg) override { |
783 | assert(Arg != nullptr && |
784 | "Should not initialize SROA costs for null value." ); |
785 | auto SROAArgCost = TTI.getCallerAllocaCost(CB: &CandidateCall, AI: Arg); |
786 | SROACostSavings += SROAArgCost; |
787 | SROAArgCosts[Arg] = SROAArgCost; |
788 | } |
789 | |
790 | void onAggregateSROAUse(AllocaInst *SROAArg) override { |
791 | auto CostIt = SROAArgCosts.find(Val: SROAArg); |
792 | assert(CostIt != SROAArgCosts.end() && |
793 | "expected this argument to have a cost" ); |
794 | CostIt->second += InstrCost; |
795 | SROACostSavings += InstrCost; |
796 | } |
797 | |
798 | void onBlockStart(const BasicBlock *BB) override { CostAtBBStart = Cost; } |
799 | |
800 | void onBlockAnalyzed(const BasicBlock *BB) override { |
801 | if (CostBenefitAnalysisEnabled) { |
802 | // Keep track of the static size of live but cold basic blocks. For now, |
803 | // we define a cold basic block to be one that's never executed. |
804 | assert(GetBFI && "GetBFI must be available" ); |
805 | BlockFrequencyInfo *BFI = &(GetBFI(F)); |
806 | assert(BFI && "BFI must be available" ); |
807 | auto ProfileCount = BFI->getBlockProfileCount(BB); |
808 | if (*ProfileCount == 0) |
809 | ColdSize += Cost - CostAtBBStart; |
810 | } |
811 | |
812 | auto *TI = BB->getTerminator(); |
813 | // If we had any successors at this point, then post-inlining is likely to |
814 | // have them as well. Note that we assume any basic blocks which existed |
815 | // due to branches or switches which folded above will also fold after |
816 | // inlining. |
817 | if (SingleBB && TI->getNumSuccessors() > 1) { |
818 | // Take off the bonus we applied to the threshold. |
819 | Threshold -= SingleBBBonus; |
820 | SingleBB = false; |
821 | } |
822 | } |
823 | |
824 | void onInstructionAnalysisStart(const Instruction *I) override { |
825 | // This function is called to store the initial cost of inlining before |
826 | // the given instruction was assessed. |
827 | if (!PrintInstructionComments) |
828 | return; |
829 | auto &CostDetail = InstructionCostDetailMap[I]; |
830 | CostDetail.CostBefore = Cost; |
831 | CostDetail.ThresholdBefore = Threshold; |
832 | } |
833 | |
834 | void onInstructionAnalysisFinish(const Instruction *I) override { |
835 | // This function is called to find new values of cost and threshold after |
836 | // the instruction has been assessed. |
837 | if (!PrintInstructionComments) |
838 | return; |
839 | auto &CostDetail = InstructionCostDetailMap[I]; |
840 | CostDetail.CostAfter = Cost; |
841 | CostDetail.ThresholdAfter = Threshold; |
842 | } |
843 | |
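// Cost-benefit analysis is only usable when we have an instrumentation-based
// profile (or an explicit override), BFI, entry counts for both caller and
// callee, and a hot call site.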
844 | bool isCostBenefitAnalysisEnabled() { |
845 | if (!PSI || !PSI->hasProfileSummary()) |
846 | return false; |
847 | |
848 | if (!GetBFI) |
849 | return false; |
850 | |
851 | if (InlineEnableCostBenefitAnalysis.getNumOccurrences()) { |
852 | // Honor the explicit request from the user. |
853 | if (!InlineEnableCostBenefitAnalysis) |
854 | return false; |
855 | } else { |
856 | // Otherwise, require instrumentation profile. |
857 | if (!PSI->hasInstrumentationProfile()) |
858 | return false; |
859 | } |
860 | |
861 | auto *Caller = CandidateCall.getParent()->getParent(); |
862 | if (!Caller->getEntryCount()) |
863 | return false; |
864 | |
865 | BlockFrequencyInfo *CallerBFI = &(GetBFI(*Caller)); |
866 | if (!CallerBFI) |
867 | return false; |
868 | |
869 | // For now, limit to hot call site. |
870 | if (!PSI->isHotCallSite(CB: CandidateCall, BFI: CallerBFI)) |
871 | return false; |
872 | |
873 | // Make sure we have a nonzero entry count. |
874 | auto EntryCount = F.getEntryCount(); |
875 | if (!EntryCount || !EntryCount->getCount()) |
876 | return false; |
877 | |
878 | BlockFrequencyInfo *CalleeBFI = &(GetBFI(F)); |
879 | if (!CalleeBFI) |
880 | return false; |
881 | |
882 | return true; |
883 | } |
884 | |
885 | // A helper function to choose between command line override and default. |
886 | unsigned getInliningCostBenefitAnalysisSavingsMultiplier() const { |
887 | if (InlineSavingsMultiplier.getNumOccurrences()) |
888 | return InlineSavingsMultiplier; |
889 | return TTI.getInliningCostBenefitAnalysisSavingsMultiplier(); |
890 | } |
891 | |
892 | // A helper function to choose between command line override and default. |
893 | unsigned getInliningCostBenefitAnalysisProfitableMultiplier() const { |
894 | if (InlineSavingsProfitableMultiplier.getNumOccurrences()) |
895 | return InlineSavingsProfitableMultiplier; |
896 | return TTI.getInliningCostBenefitAnalysisProfitableMultiplier(); |
897 | } |
898 | |
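// Let tests override the computed cycle savings and runtime size through the
// string call-site attributes "inline-cycle-savings-for-test" and
// "inline-runtime-cost-for-test".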
899 | void OverrideCycleSavingsAndSizeForTesting(APInt &CycleSavings, int &Size) { |
900 | if (std::optional<int> AttrCycleSavings = getStringFnAttrAsInt( |
901 | CB&: CandidateCall, AttrKind: "inline-cycle-savings-for-test" )) { |
902 | CycleSavings = *AttrCycleSavings; |
903 | } |
904 | |
905 | if (std::optional<int> AttrRuntimeCost = getStringFnAttrAsInt( |
906 | CB&: CandidateCall, AttrKind: "inline-runtime-cost-for-test" )) { |
907 | Size = *AttrRuntimeCost; |
908 | } |
909 | } |
910 | |
911 | // Determine whether we should inline the given call site, taking into account |
912 | // both the size cost and the cycle savings. Return std::nullopt if we don't |
913 | // have sufficient profiling information to determine. |
914 | std::optional<bool> costBenefitAnalysis() { |
915 | if (!CostBenefitAnalysisEnabled) |
916 | return std::nullopt; |
917 | |
918 | // buildInlinerPipeline in the pass builder sets HotCallSiteThreshold to 0 |
919 | // for the prelink phase of the AutoFDO + ThinLTO build. Honor the logic by |
920 | // falling back to the cost-based metric. |
921 | // TODO: Improve this hacky condition. |
922 | if (Threshold == 0) |
923 | return std::nullopt; |
924 | |
925 | assert(GetBFI); |
926 | BlockFrequencyInfo *CalleeBFI = &(GetBFI(F)); |
927 | assert(CalleeBFI); |
928 | |
929 | // The cycle savings expressed as the sum of InstrCost |
930 | // multiplied by the estimated dynamic count of each instruction we can |
931 | // avoid. Savings come from the call site cost, such as argument setup and |
932 | // the call instruction, as well as the instructions that are folded. |
933 | // |
934 | // We use 128-bit APInt here to avoid potential overflow. This variable |
935 | // should stay well below 10^^24 (or 2^^80) in practice. This "worst" case |
936 | // assumes that we can avoid or fold a billion instructions, each with a |
937 | // profile count of 10^^15 -- roughly the number of cycles for a 24-hour |
938 | // period on a 4GHz machine. |
939 | APInt CycleSavings(128, 0); |
940 | |
941 | for (auto &BB : F) { |
942 | APInt CurrentSavings(128, 0); |
943 | for (auto &I : BB) { |
944 | if (BranchInst *BI = dyn_cast<BranchInst>(Val: &I)) { |
945 | // Count a conditional branch as savings if it becomes unconditional. |
946 | if (BI->isConditional() && |
947 | getSimplifiedValue<ConstantInt>(V: BI->getCondition())) { |
948 | CurrentSavings += InstrCost; |
949 | } |
950 | } else if (SwitchInst *SI = dyn_cast<SwitchInst>(Val: &I)) { |
951 | if (getSimplifiedValue<ConstantInt>(V: SI->getCondition())) |
952 | CurrentSavings += InstrCost; |
953 | } else if (Value *V = dyn_cast<Value>(Val: &I)) { |
954 | // Count an instruction as savings if we can fold it. |
955 | if (SimplifiedValues.count(Val: V)) { |
956 | CurrentSavings += InstrCost; |
957 | } |
958 | } |
959 | } |
960 | |
961 | auto ProfileCount = CalleeBFI->getBlockProfileCount(BB: &BB); |
962 | CurrentSavings *= *ProfileCount; |
963 | CycleSavings += CurrentSavings; |
964 | } |
965 | |
966 | // Compute the cycle savings per call. |
967 | auto EntryProfileCount = F.getEntryCount(); |
968 | assert(EntryProfileCount && EntryProfileCount->getCount()); |
969 | auto EntryCount = EntryProfileCount->getCount(); |
970 | CycleSavings += EntryCount / 2; |
971 | CycleSavings = CycleSavings.udiv(RHS: EntryCount); |
972 | |
973 | // Compute the total savings for the call site. |
974 | auto *CallerBB = CandidateCall.getParent(); |
975 | BlockFrequencyInfo *CallerBFI = &(GetBFI(*(CallerBB->getParent()))); |
976 | CycleSavings += getCallsiteCost(TTI, Call: this->CandidateCall, DL); |
977 | CycleSavings *= *CallerBFI->getBlockProfileCount(BB: CallerBB); |
978 | |
979 | // Remove the cost of the cold basic blocks to model the runtime cost more |
980 | // accurately. Both machine block placement and function splitting could |
981 | // place cold blocks further from hot blocks. |
982 | int Size = Cost - ColdSize; |
983 | |
984 | // Allow tiny callees to be inlined regardless of whether they meet the |
985 | // savings threshold. |
986 | Size = Size > InlineSizeAllowance ? Size - InlineSizeAllowance : 1; |
987 | |
988 | OverrideCycleSavingsAndSizeForTesting(CycleSavings, Size); |
989 | CostBenefit.emplace(args: APInt(128, Size), args&: CycleSavings); |
990 | |
991 | // Let R be the ratio of CycleSavings to Size. We accept the inlining |
992 | // opportunity if R is really high and reject if R is really low. If R is |
993 | // somewhere in the middle, we fall back to the cost-based analysis. |
994 | // |
995 | // Specifically, let R = CycleSavings / Size, we accept the inlining |
996 | // opportunity if: |
997 | // |
998 | // PSI->getOrCompHotCountThreshold() |
999 | // R > ------------------------------------------------- |
1000 | // getInliningCostBenefitAnalysisSavingsMultiplier() |
1001 | // |
1002 | // and reject the inlining opportunity if: |
1003 | // |
1004 | // PSI->getOrCompHotCountThreshold() |
1005 | // R <= ---------------------------------------------------- |
1006 | // getInliningCostBenefitAnalysisProfitableMultiplier() |
1007 | // |
1008 | // Otherwise, we fall back to the cost-based analysis. |
1009 | // |
1010 | // Implementation-wise, use multiplication (CycleSavings * Multiplier, |
1011 | // HotCountThreshold * Size) rather than division to avoid precision loss. |
1012 | APInt Threshold(128, PSI->getOrCompHotCountThreshold()); |
1013 | Threshold *= Size; |
1014 | |
1015 | APInt UpperBoundCycleSavings = CycleSavings; |
1016 | UpperBoundCycleSavings *= getInliningCostBenefitAnalysisSavingsMultiplier(); |
1017 | if (UpperBoundCycleSavings.uge(RHS: Threshold)) |
1018 | return true; |
1019 | |
1020 | APInt LowerBoundCycleSavings = CycleSavings; |
1021 | LowerBoundCycleSavings *= |
1022 | getInliningCostBenefitAnalysisProfitableMultiplier(); |
1023 | if (LowerBoundCycleSavings.ult(RHS: Threshold)) |
1024 | return false; |
1025 | |
1026 | // Otherwise, fall back to the cost-based analysis. |
1027 | return std::nullopt; |
1028 | } |
1029 | |
1030 | InlineResult finalizeAnalysis() override { |
1031 | // Loops generally act a lot like calls in that they act like barriers to |
1032 | // movement, require a certain amount of setup, etc. So when optimising for |
1033 | // size, we penalise any call sites that perform loops. We do this after all |
1034 | // other costs here, so will likely only be dealing with relatively small |
1035 | // functions (and hence DT and LI will hopefully be cheap). |
1036 | auto *Caller = CandidateCall.getFunction(); |
1037 | if (Caller->hasMinSize()) { |
1038 | DominatorTree DT(F); |
1039 | LoopInfo LI(DT); |
1040 | int NumLoops = 0; |
1041 | for (Loop *L : LI) { |
1042 | // Ignore loops that will not be executed |
1043 | if (DeadBlocks.count(Ptr: L->getHeader())) |
1044 | continue; |
1045 | NumLoops++; |
1046 | } |
1047 | addCost(Inc: NumLoops * InlineConstants::LoopPenalty); |
1048 | } |
1049 | |
1050 | // We applied the maximum possible vector bonus at the beginning. Now, |
1051 | // subtract the excess bonus, if any, from the Threshold before |
1052 | // comparing against Cost. |
1053 | if (NumVectorInstructions <= NumInstructions / 10) |
1054 | Threshold -= VectorBonus; |
1055 | else if (NumVectorInstructions <= NumInstructions / 2) |
1056 | Threshold -= VectorBonus / 2; |
1057 | |
1058 | if (std::optional<int> AttrCost = |
1059 | getStringFnAttrAsInt(CB&: CandidateCall, AttrKind: "function-inline-cost" )) |
1060 | Cost = *AttrCost; |
1061 | |
1062 | if (std::optional<int> AttrCostMult = getStringFnAttrAsInt( |
1063 | CB&: CandidateCall, |
1064 | AttrKind: InlineConstants::FunctionInlineCostMultiplierAttributeName)) |
1065 | Cost *= *AttrCostMult; |
1066 | |
1067 | if (std::optional<int> AttrThreshold = |
1068 | getStringFnAttrAsInt(CB&: CandidateCall, AttrKind: "function-inline-threshold" )) |
1069 | Threshold = *AttrThreshold; |
1070 | |
1071 | if (auto Result = costBenefitAnalysis()) { |
1072 | DecidedByCostBenefit = true; |
1073 | if (*Result) |
1074 | return InlineResult::success(); |
1075 | else |
1076 | return InlineResult::failure(Reason: "Cost over threshold." ); |
1077 | } |
1078 | |
1079 | if (IgnoreThreshold) |
1080 | return InlineResult::success(); |
1081 | |
1082 | DecidedByCostThreshold = true; |
1083 | return Cost < std::max(a: 1, b: Threshold) |
1084 | ? InlineResult::success() |
1085 | : InlineResult::failure(Reason: "Cost over threshold." ); |
1086 | } |
1087 | |
1088 | bool shouldStop() override { |
1089 | if (IgnoreThreshold || ComputeFullInlineCost) |
1090 | return false; |
1091 | // Bail out the moment we cross the threshold. This means we'll under-count |
1092 | // the cost, but only when undercounting doesn't matter. |
1093 | if (Cost < Threshold) |
1094 | return false; |
1095 | DecidedByCostThreshold = true; |
1096 | return true; |
1097 | } |
1098 | |
1099 | void onLoadEliminationOpportunity() override { |
1100 | LoadEliminationCost += InstrCost; |
1101 | } |
1102 | |
1103 | InlineResult onAnalysisStart() override { |
1104 | // Perform some tweaks to the cost and threshold based on the direct |
1105 | // callsite information. |
1106 | |
1107 | // We want to more aggressively inline vector-dense kernels, so up the |
1108 | // threshold, and we'll lower it if the % of vector instructions gets too |
1109 | // low. Note that these bonuses are somewhat arbitrary and evolved over |
1110 | // time by accident as much as because they are principled bonuses. |
1111 | // |
1112 | // FIXME: It would be nice to remove all such bonuses. At least it would be |
1113 | // nice to base the bonus values on something more scientific. |
1114 | assert(NumInstructions == 0); |
1115 | assert(NumVectorInstructions == 0); |
1116 | |
1117 | // Update the threshold based on callsite properties |
1118 | updateThreshold(Call&: CandidateCall, Callee&: F); |
1119 | |
1120 | // While Threshold depends on commandline options that can take negative |
1121 | // values, we want to enforce the invariant that the computed threshold and |
1122 | // bonuses are non-negative. |
1123 | assert(Threshold >= 0); |
1124 | assert(SingleBBBonus >= 0); |
1125 | assert(VectorBonus >= 0); |
1126 | |
1127 | // Speculatively apply all possible bonuses to Threshold. If cost exceeds |
1128 | // this Threshold any time, and cost cannot decrease, we can stop processing |
1129 | // the rest of the function body. |
1130 | Threshold += (SingleBBBonus + VectorBonus); |
1131 | |
1132 | // Give out bonuses for the callsite, as the instructions setting them up |
1133 | // will be gone after inlining. |
1134 | addCost(Inc: -getCallsiteCost(TTI, Call: this->CandidateCall, DL)); |
1135 | |
1136 | // If this function uses the coldcc calling convention, prefer not to inline |
1137 | // it. |
1138 | if (F.getCallingConv() == CallingConv::Cold) |
1139 | Cost += InlineConstants::ColdccPenalty; |
1140 | |
1141 | LLVM_DEBUG(dbgs() << " Initial cost: " << Cost << "\n" ); |
1142 | |
1143 | // Check if we're done. This can happen due to bonuses and penalties. |
1144 | if (Cost >= Threshold && !ComputeFullInlineCost) |
1145 | return InlineResult::failure(Reason: "high cost" ); |
1146 | |
1147 | return InlineResult::success(); |
1148 | } |
1149 | |
1150 | public: |
1151 | InlineCostCallAnalyzer( |
1152 | Function &Callee, CallBase &Call, const InlineParams &Params, |
1153 | const TargetTransformInfo &TTI, |
1154 | function_ref<AssumptionCache &(Function &)> GetAssumptionCache, |
1155 | function_ref<BlockFrequencyInfo &(Function &)> GetBFI = nullptr, |
1156 | function_ref<const TargetLibraryInfo &(Function &)> GetTLI = nullptr, |
1157 | ProfileSummaryInfo *PSI = nullptr, |
1158 | OptimizationRemarkEmitter *ORE = nullptr, bool BoostIndirect = true, |
1159 | bool IgnoreThreshold = false, |
1160 | function_ref<EphemeralValuesCache &(Function &)> GetEphValuesCache = |
1161 | nullptr) |
1162 | : CallAnalyzer(Callee, Call, TTI, GetAssumptionCache, GetBFI, GetTLI, PSI, |
1163 | ORE, GetEphValuesCache), |
1164 | ComputeFullInlineCost(OptComputeFullInlineCost || |
1165 | Params.ComputeFullInlineCost || ORE || |
1166 | isCostBenefitAnalysisEnabled()), |
1167 | Params(Params), Threshold(Params.DefaultThreshold), |
1168 | BoostIndirectCalls(BoostIndirect), IgnoreThreshold(IgnoreThreshold), |
1169 | CostBenefitAnalysisEnabled(isCostBenefitAnalysisEnabled()), |
1170 | Writer(this) { |
1171 | AllowRecursiveCall = *Params.AllowRecursiveCall; |
1172 | } |
1173 | |
1174 | /// Annotation Writer for instruction details |
1175 | InlineCostAnnotationWriter Writer; |
1176 | |
1177 | void dump(); |
1178 | |
1179 | // Prints the same analysis as dump(), but its definition is not dependent |
1180 | // on the build. |
1181 | void print(raw_ostream &OS); |
1182 | |
1183 | std::optional<InstructionCostDetail> getCostDetails(const Instruction *I) { |
1184 | auto It = InstructionCostDetailMap.find(Val: I); |
1185 | if (It != InstructionCostDetailMap.end()) |
1186 | return It->second; |
1187 | return std::nullopt; |
1188 | } |
1189 | |
1190 | virtual ~InlineCostCallAnalyzer() = default; |
1191 | int getThreshold() const { return Threshold; } |
1192 | int getCost() const { return Cost; } |
1193 | int getStaticBonusApplied() const { return StaticBonusApplied; } |
1194 | std::optional<CostBenefitPair> getCostBenefitPair() { return CostBenefit; } |
1195 | bool wasDecidedByCostBenefit() const { return DecidedByCostBenefit; } |
1196 | bool wasDecidedByCostThreshold() const { return DecidedByCostThreshold; } |
1197 | }; |
1198 | |
1199 | // Return true if CB is the sole call to local function Callee. |
1200 | static bool isSoleCallToLocalFunction(const CallBase &CB, |
1201 | const Function &Callee) { |
1202 | return Callee.hasLocalLinkage() && Callee.hasOneLiveUse() && |
1203 | &Callee == CB.getCalledFunction(); |
1204 | } |
1205 | |
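// A CallAnalyzer that, rather than a single scalar cost, accumulates the
// per-feature cost vector (InlineCostFeatures) used by the ML inliner.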
1206 | class InlineCostFeaturesAnalyzer final : public CallAnalyzer { |
1207 | private: |
1208 | InlineCostFeatures Cost = {}; |
1209 | |
1210 | // FIXME: These constants are taken from the heuristic-based cost visitor. |
1211 | // These should be removed entirely in a later revision to avoid reliance on |
1212 | // heuristics in the ML inliner. |
1213 | static constexpr int JTCostMultiplier = 2; |
1214 | static constexpr int CaseClusterCostMultiplier = 2; |
1215 | static constexpr int SwitchDefaultDestCostMultiplier = 2; |
1216 | static constexpr int SwitchCostMultiplier = 2; |
1217 | |
1218 | // FIXME: These are taken from the heuristic-based cost visitor: we should |
1219 | // eventually abstract these to the CallAnalyzer to avoid duplication. |
1220 | unsigned SROACostSavingOpportunities = 0; |
1221 | int VectorBonus = 0; |
1222 | int SingleBBBonus = 0; |
1223 | int Threshold = 5; |
1224 | |
1225 | DenseMap<AllocaInst *, unsigned> SROACosts; |
1226 | |
1227 | void increment(InlineCostFeatureIndex Feature, int64_t Delta = 1) { |
1228 | Cost[static_cast<size_t>(Feature)] += Delta; |
1229 | } |
1230 | |
1231 | void set(InlineCostFeatureIndex Feature, int64_t Value) { |
1232 | Cost[static_cast<size_t>(Feature)] = Value; |
1233 | } |
1234 | |
1235 | void onDisableSROA(AllocaInst *Arg) override { |
1236 | auto CostIt = SROACosts.find(Val: Arg); |
1237 | if (CostIt == SROACosts.end()) |
1238 | return; |
1239 | |
1240 | increment(Feature: InlineCostFeatureIndex::sroa_losses, Delta: CostIt->second); |
1241 | SROACostSavingOpportunities -= CostIt->second; |
1242 | SROACosts.erase(I: CostIt); |
1243 | } |
1244 | |
1245 | void onDisableLoadElimination() override { |
1246 | set(Feature: InlineCostFeatureIndex::load_elimination, Value: 1); |
1247 | } |
1248 | |
1249 | void onCallPenalty() override { |
1250 | increment(Feature: InlineCostFeatureIndex::call_penalty, Delta: CallPenalty); |
1251 | } |
1252 | |
1253 | void onCallArgumentSetup(const CallBase &Call) override { |
1254 | increment(Feature: InlineCostFeatureIndex::call_argument_setup, |
1255 | Delta: Call.arg_size() * InstrCost); |
1256 | } |
1257 | |
1258 | void onLoadRelativeIntrinsic() override { |
1259 | increment(Feature: InlineCostFeatureIndex::load_relative_intrinsic, Delta: 3 * InstrCost); |
1260 | } |
1261 | |
1262 | void onLoweredCall(Function *F, CallBase &Call, |
1263 | bool IsIndirectCall) override { |
1264 | increment(Feature: InlineCostFeatureIndex::lowered_call_arg_setup, |
1265 | Delta: Call.arg_size() * InstrCost); |
1266 | |
1267 | if (IsIndirectCall) { |
1268 | InlineParams IndirectCallParams = {/* DefaultThreshold*/ 0, |
1269 | /*HintThreshold*/ {}, |
1270 | /*ColdThreshold*/ {}, |
1271 | /*OptSizeThreshold*/ {}, |
1272 | /*OptMinSizeThreshold*/ {}, |
1273 | /*HotCallSiteThreshold*/ {}, |
1274 | /*LocallyHotCallSiteThreshold*/ {}, |
1275 | /*ColdCallSiteThreshold*/ {}, |
1276 | /*ComputeFullInlineCost*/ true, |
1277 | /*EnableDeferral*/ true}; |
1278 | IndirectCallParams.DefaultThreshold = |
1279 | InlineConstants::IndirectCallThreshold; |
1280 | |
1281 | InlineCostCallAnalyzer CA(*F, Call, IndirectCallParams, TTI, |
1282 | GetAssumptionCache, GetBFI, GetTLI, PSI, ORE, |
1283 | false, true); |
1284 | if (CA.analyze().isSuccess()) { |
1285 | increment(Feature: InlineCostFeatureIndex::nested_inline_cost_estimate, |
1286 | Delta: CA.getCost()); |
1287 | increment(Feature: InlineCostFeatureIndex::nested_inlines, Delta: 1); |
1288 | } |
1289 | } else { |
1290 | onCallPenalty(); |
1291 | } |
1292 | } |
1293 | |
1294 | void onFinalizeSwitch(unsigned JumpTableSize, unsigned NumCaseCluster, |
1295 | bool DefaultDestUnreachable) override { |
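// A rough sketch of how these features accumulate (illustrative numbers,
// assuming the default InstrCost): a switch lowered to a jump table of 8
// entries with a reachable default contributes 8 * InstrCost +
// JTCostMultiplier * InstrCost to jump_table_penalty plus
// SwitchDefaultDestCostMultiplier * InstrCost to switch_default_dest_penalty;
// a small switch with at most 3 clusters instead feeds case_cluster_penalty.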
1296 | if (JumpTableSize) { |
1297 | if (!DefaultDestUnreachable) |
1298 | increment(Feature: InlineCostFeatureIndex::switch_default_dest_penalty, |
1299 | Delta: SwitchDefaultDestCostMultiplier * InstrCost); |
1300 | int64_t JTCost = static_cast<int64_t>(JumpTableSize) * InstrCost + |
1301 | JTCostMultiplier * InstrCost; |
1302 | increment(Feature: InlineCostFeatureIndex::jump_table_penalty, Delta: JTCost); |
1303 | return; |
1304 | } |
1305 | |
1306 | if (NumCaseCluster <= 3) { |
1307 | increment(Feature: InlineCostFeatureIndex::case_cluster_penalty, |
1308 | Delta: (NumCaseCluster - DefaultDestUnreachable) * |
1309 | CaseClusterCostMultiplier * InstrCost); |
1310 | return; |
1311 | } |
1312 | |
1313 | int64_t ExpectedNumberOfCompare = |
1314 | getExpectedNumberOfCompare(NumCaseCluster); |
1315 | |
1316 | int64_t SwitchCost = |
1317 | ExpectedNumberOfCompare * SwitchCostMultiplier * InstrCost; |
1318 | increment(Feature: InlineCostFeatureIndex::switch_penalty, Delta: SwitchCost); |
1319 | } |
1320 | |
1321 | void onMissedSimplification() override { |
1322 | increment(Feature: InlineCostFeatureIndex::unsimplified_common_instructions, |
1323 | Delta: InstrCost); |
1324 | } |
1325 | |
1326 | void onInitializeSROAArg(AllocaInst *Arg) override { |
1327 | auto SROAArgCost = TTI.getCallerAllocaCost(CB: &CandidateCall, AI: Arg); |
1328 | SROACosts[Arg] = SROAArgCost; |
1329 | SROACostSavingOpportunities += SROAArgCost; |
1330 | } |
1331 | |
1332 | void onAggregateSROAUse(AllocaInst *Arg) override { |
1333 | SROACosts.find(Val: Arg)->second += InstrCost; |
1334 | SROACostSavingOpportunities += InstrCost; |
1335 | } |
1336 | |
1337 | void onBlockAnalyzed(const BasicBlock *BB) override { |
1338 | if (BB->getTerminator()->getNumSuccessors() > 1) |
1339 | set(Feature: InlineCostFeatureIndex::is_multiple_blocks, Value: 1); |
1340 | Threshold -= SingleBBBonus; |
1341 | } |
1342 | |
1343 | InlineResult finalizeAnalysis() override { |
1344 | auto *Caller = CandidateCall.getFunction(); |
1345 | if (Caller->hasMinSize()) { |
1346 | DominatorTree DT(F); |
1347 | LoopInfo LI(DT); |
1348 | for (Loop *L : LI) { |
1349 | // Ignore loops that will not be executed |
1350 | if (DeadBlocks.count(Ptr: L->getHeader())) |
1351 | continue; |
1352 | increment(Feature: InlineCostFeatureIndex::num_loops, |
1353 | Delta: InlineConstants::LoopPenalty); |
1354 | } |
1355 | } |
1356 | set(Feature: InlineCostFeatureIndex::dead_blocks, Value: DeadBlocks.size()); |
1357 | set(Feature: InlineCostFeatureIndex::simplified_instructions, |
1358 | Value: NumInstructionsSimplified); |
1359 | set(Feature: InlineCostFeatureIndex::constant_args, Value: NumConstantArgs); |
1360 | set(Feature: InlineCostFeatureIndex::constant_offset_ptr_args, |
1361 | Value: NumConstantOffsetPtrArgs); |
1362 | set(Feature: InlineCostFeatureIndex::sroa_savings, Value: SROACostSavingOpportunities); |
1363 | |
1364 | if (NumVectorInstructions <= NumInstructions / 10) |
1365 | Threshold -= VectorBonus; |
1366 | else if (NumVectorInstructions <= NumInstructions / 2) |
1367 | Threshold -= VectorBonus / 2; |
1368 | |
1369 | set(Feature: InlineCostFeatureIndex::threshold, Value: Threshold); |
1370 | |
1371 | return InlineResult::success(); |
1372 | } |
1373 | |
1374 | bool shouldStop() override { return false; } |
1375 | |
1376 | void onLoadEliminationOpportunity() override { |
1377 | increment(Feature: InlineCostFeatureIndex::load_elimination, Delta: 1); |
1378 | } |
1379 | |
1380 | InlineResult onAnalysisStart() override { |
1381 | increment(Feature: InlineCostFeatureIndex::callsite_cost, |
1382 | Delta: -1 * getCallsiteCost(TTI, Call: this->CandidateCall, DL)); |
1383 | |
1384 | set(Feature: InlineCostFeatureIndex::cold_cc_penalty, |
1385 | Value: (F.getCallingConv() == CallingConv::Cold)); |
1386 | |
1387 | set(Feature: InlineCostFeatureIndex::last_call_to_static_bonus, |
1388 | Value: isSoleCallToLocalFunction(CB: CandidateCall, Callee: F)); |
1389 | |
1390 | // FIXME: we shouldn't repeat this logic in both the Features and Cost |
1391 | // analyzer - instead, we should abstract it to a common method in the |
1392 | // CallAnalyzer |
1393 | int SingleBBBonusPercent = 50; |
1394 | int VectorBonusPercent = TTI.getInlinerVectorBonusPercent(); |
1395 | Threshold += TTI.adjustInliningThreshold(CB: &CandidateCall); |
1396 | Threshold *= TTI.getInliningThresholdMultiplier(); |
1397 | SingleBBBonus = Threshold * SingleBBBonusPercent / 100; |
1398 | VectorBonus = Threshold * VectorBonusPercent / 100; |
1399 | Threshold += (SingleBBBonus + VectorBonus); |
1400 | |
1401 | return InlineResult::success(); |
1402 | } |
1403 | |
1404 | public: |
1405 | InlineCostFeaturesAnalyzer( |
1406 | const TargetTransformInfo &TTI, |
1407 | function_ref<AssumptionCache &(Function &)> &GetAssumptionCache, |
1408 | function_ref<BlockFrequencyInfo &(Function &)> GetBFI, |
1409 | function_ref<const TargetLibraryInfo &(Function &)> GetTLI, |
1410 | ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE, Function &Callee, |
1411 | CallBase &Call) |
1412 | : CallAnalyzer(Callee, Call, TTI, GetAssumptionCache, GetBFI, GetTLI, |
1413 | PSI) {} |
1414 | |
1415 | const InlineCostFeatures &features() const { return Cost; } |
1416 | }; |
1417 | |
1418 | } // namespace |
1419 | |
1420 | /// Test whether the given value is an Alloca-derived function argument. |
1421 | bool CallAnalyzer::isAllocaDerivedArg(Value *V) { |
1422 | return SROAArgValues.count(Val: V); |
1423 | } |
1424 | |
1425 | void CallAnalyzer::disableSROAForArg(AllocaInst *SROAArg) { |
1426 | onDisableSROA(Arg: SROAArg); |
1427 | EnabledSROAAllocas.erase(V: SROAArg); |
1428 | disableLoadElimination(); |
1429 | } |
1430 | |
1431 | void InlineCostAnnotationWriter::emitInstructionAnnot( |
1432 | const Instruction *I, formatted_raw_ostream &OS) { |
1433 | // The cost of inlining the given instruction is always printed.
1434 | // The threshold delta is printed only when it is non-zero, which happens
1435 | // when we decide to give a bonus at a particular instruction.
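// For reference, a typical emitted line looks roughly like (values purely
// illustrative):
//   ; cost before = 30, cost after = 35, threshold before = 225,
//   threshold after = 225, cost delta = 5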
1436 | std::optional<InstructionCostDetail> Record = ICCA->getCostDetails(I); |
1437 | if (!Record) |
1438 | OS << "; No analysis for the instruction" ; |
1439 | else { |
1440 | OS << "; cost before = " << Record->CostBefore |
1441 | << ", cost after = " << Record->CostAfter |
1442 | << ", threshold before = " << Record->ThresholdBefore |
1443 | << ", threshold after = " << Record->ThresholdAfter << ", " ; |
1444 | OS << "cost delta = " << Record->getCostDelta(); |
1445 | if (Record->hasThresholdChanged()) |
1446 | OS << ", threshold delta = " << Record->getThresholdDelta(); |
1447 | } |
1448 | auto *V = ICCA->getSimplifiedValueUnchecked(V: const_cast<Instruction *>(I)); |
1449 | if (V) { |
1450 | OS << ", simplified to " ; |
1451 | V->print(O&: OS, IsForDebug: true); |
1452 | if (auto *VI = dyn_cast<Instruction>(Val: V)) { |
1453 | if (VI->getFunction() != I->getFunction()) |
1454 | OS << " (caller instruction)" ; |
1455 | } else if (auto *VArg = dyn_cast<Argument>(Val: V)) { |
1456 | if (VArg->getParent() != I->getFunction()) |
1457 | OS << " (caller argument)" ; |
1458 | } |
1459 | } |
1460 | OS << "\n" ; |
1461 | } |
1462 | |
1463 | /// If 'V' maps to a SROA candidate, disable SROA for it. |
1464 | void CallAnalyzer::disableSROA(Value *V) { |
1465 | if (auto *SROAArg = getSROAArgForValueOrNull(V)) { |
1466 | disableSROAForArg(SROAArg); |
1467 | } |
1468 | } |
1469 | |
1470 | void CallAnalyzer::disableLoadElimination() { |
1471 | if (EnableLoadElimination) { |
1472 | onDisableLoadElimination(); |
1473 | EnableLoadElimination = false; |
1474 | } |
1475 | } |
1476 | |
1477 | /// Accumulate a constant GEP offset into an APInt if possible. |
1478 | /// |
1479 | /// Returns false if unable to compute the offset for any reason. Respects any |
1480 | /// simplified values known during the analysis of this callsite. |
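/// As an illustration (not from this file, assuming a 64-bit index type and
/// natural struct layout): `getelementptr inbounds {i32, i64}, ptr %p, i32 0,
/// i32 1` accumulates the field offset of element 1 (8 bytes), while
/// `getelementptr i32, ptr %p, i64 3` accumulates 3 * 4 = 12 bytes.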
1481 | bool CallAnalyzer::accumulateGEPOffset(GEPOperator &GEP, APInt &Offset) { |
1482 | unsigned IntPtrWidth = DL.getIndexTypeSizeInBits(Ty: GEP.getType()); |
1483 | assert(IntPtrWidth == Offset.getBitWidth()); |
1484 | |
1485 | for (gep_type_iterator GTI = gep_type_begin(GEP), GTE = gep_type_end(GEP); |
1486 | GTI != GTE; ++GTI) { |
1487 | ConstantInt *OpC = |
1488 | getDirectOrSimplifiedValue<ConstantInt>(V: GTI.getOperand()); |
1489 | if (!OpC) |
1490 | return false; |
1491 | if (OpC->isZero()) |
1492 | continue; |
1493 | |
1494 | // Handle a struct index, which adds its field offset to the pointer. |
1495 | if (StructType *STy = GTI.getStructTypeOrNull()) { |
1496 | unsigned ElementIdx = OpC->getZExtValue(); |
1497 | const StructLayout *SL = DL.getStructLayout(Ty: STy); |
1498 | Offset += APInt(IntPtrWidth, SL->getElementOffset(Idx: ElementIdx)); |
1499 | continue; |
1500 | } |
1501 | |
1502 | APInt TypeSize(IntPtrWidth, GTI.getSequentialElementStride(DL)); |
1503 | Offset += OpC->getValue().sextOrTrunc(width: IntPtrWidth) * TypeSize; |
1504 | } |
1505 | return true; |
1506 | } |
1507 | |
1508 | /// Use TTI to check whether a GEP is free. |
1509 | /// |
1510 | /// Respects any simplified values known during the analysis of this callsite. |
1511 | bool CallAnalyzer::isGEPFree(GetElementPtrInst &GEP) { |
1512 | SmallVector<Value *, 4> Operands; |
1513 | Operands.push_back(Elt: GEP.getOperand(i_nocapture: 0)); |
1514 | for (const Use &Op : GEP.indices()) |
1515 | if (Constant *SimpleOp = getSimplifiedValue<Constant>(V: Op)) |
1516 | Operands.push_back(Elt: SimpleOp); |
1517 | else |
1518 | Operands.push_back(Elt: Op); |
1519 | return TTI.getInstructionCost(U: &GEP, Operands, |
1520 | CostKind: TargetTransformInfo::TCK_SizeAndLatency) == |
1521 | TargetTransformInfo::TCC_Free; |
1522 | } |
1523 | |
1524 | bool CallAnalyzer::visitAlloca(AllocaInst &I) { |
1525 | disableSROA(V: I.getOperand(i_nocapture: 0)); |
1526 | |
1527 | // Check whether inlining will turn a dynamic alloca into a static |
1528 | // alloca and handle that case. |
1529 | if (I.isArrayAllocation()) { |
1530 | Constant *Size = getSimplifiedValue<Constant>(V: I.getArraySize()); |
1531 | if (auto *AllocSize = dyn_cast_or_null<ConstantInt>(Val: Size)) { |
1532 | // Sometimes a dynamic alloca could be converted into a static alloca |
1533 | // after this constant prop, and become a huge static alloca on an |
1534 | // unconditional CFG path. Avoid inlining if this is going to happen above |
1535 | // a threshold. |
1536 | // FIXME: If the threshold is removed or lowered too much, we could end up |
1537 | // being too pessimistic and prevent inlining non-problematic code. This |
1538 | // could result in unintended perf regressions. A better overall strategy |
1539 | // is needed to track stack usage during inlining. |
1540 | Type *Ty = I.getAllocatedType(); |
1541 | AllocatedSize = SaturatingMultiplyAdd( |
1542 | X: AllocSize->getLimitedValue(), |
1543 | Y: DL.getTypeAllocSize(Ty).getKnownMinValue(), A: AllocatedSize); |
1544 | if (AllocatedSize > InlineConstants::MaxSimplifiedDynamicAllocaToInline) |
1545 | HasDynamicAlloca = true; |
1546 | return false; |
1547 | } |
1548 | } |
1549 | |
1550 | // Accumulate the allocated size. |
1551 | if (I.isStaticAlloca()) { |
1552 | Type *Ty = I.getAllocatedType(); |
1553 | AllocatedSize = SaturatingAdd(X: DL.getTypeAllocSize(Ty).getKnownMinValue(), |
1554 | Y: AllocatedSize); |
1555 | } |
1556 | |
1557 | // FIXME: This is overly conservative. Dynamic allocas are inefficient for |
1558 | // a variety of reasons, and so we would like to not inline them into |
1559 | // functions which don't currently have a dynamic alloca. This simply |
1560 | // disables inlining altogether in the presence of a dynamic alloca. |
1561 | if (!I.isStaticAlloca()) |
1562 | HasDynamicAlloca = true; |
1563 | |
1564 | return false; |
1565 | } |
1566 | |
1567 | bool CallAnalyzer::visitPHI(PHINode &I) { |
1568 | // FIXME: We need to propagate SROA *disabling* through phi nodes, even |
1569 | // though we don't want to propagate its bonuses. The idea is to disable
1570 | // SROA if it *might* be used in an inappropriate manner. |
1571 | |
1572 | // Phi nodes are always zero-cost. |
1573 | // FIXME: Pointer sizes may differ between different address spaces, so do we |
1574 | // need to use correct address space in the call to getPointerSizeInBits here? |
1575 | // Or could we skip the getPointerSizeInBits call completely? As far as I can |
1576 | // see the ZeroOffset is used as a dummy value, so we can probably use any |
1577 | // bit width for the ZeroOffset? |
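// As an illustration (IR not from this file): if every live incoming value
// of the phi simplifies to the same constant, the phi is recorded in
// SimplifiedValues as that constant; if instead all of them map to the same
// (base pointer, constant offset) pair, the phi inherits that pair in
// ConstantOffsetPtrs and remains a SROA candidate.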
1578 | APInt ZeroOffset = APInt::getZero(numBits: DL.getPointerSizeInBits(AS: 0)); |
1579 | bool CheckSROA = I.getType()->isPointerTy(); |
1580 | |
1581 | // Track the constant or pointer with constant offset we've seen so far. |
1582 | Constant *FirstC = nullptr; |
1583 | std::pair<Value *, APInt> FirstBaseAndOffset = {nullptr, ZeroOffset}; |
1584 | Value *FirstV = nullptr; |
1585 | |
1586 | for (unsigned i = 0, e = I.getNumIncomingValues(); i != e; ++i) { |
1587 | BasicBlock *Pred = I.getIncomingBlock(i); |
1588 | // If the incoming block is dead, skip the incoming block. |
1589 | if (DeadBlocks.count(Ptr: Pred)) |
1590 | continue; |
1591 | // If the parent block of phi is not the known successor of the incoming |
1592 | // block, skip the incoming block. |
1593 | BasicBlock *KnownSuccessor = KnownSuccessors[Pred]; |
1594 | if (KnownSuccessor && KnownSuccessor != I.getParent()) |
1595 | continue; |
1596 | |
1597 | Value *V = I.getIncomingValue(i); |
1598 | // If the incoming value is this phi itself, skip the incoming value. |
1599 | if (&I == V) |
1600 | continue; |
1601 | |
1602 | Constant *C = getDirectOrSimplifiedValue<Constant>(V); |
1603 | |
1604 | std::pair<Value *, APInt> BaseAndOffset = {nullptr, ZeroOffset}; |
1605 | if (!C && CheckSROA) |
1606 | BaseAndOffset = ConstantOffsetPtrs.lookup(Val: V); |
1607 | |
1608 | if (!C && !BaseAndOffset.first) |
1609 | // The incoming value is neither a constant nor a pointer with constant |
1610 | // offset, exit early. |
1611 | return true; |
1612 | |
1613 | if (FirstC) { |
1614 | if (FirstC == C) |
1615 | // If we've seen a constant incoming value before and it is the same |
1616 | // constant we see this time, continue checking the next incoming value. |
1617 | continue; |
1618 | // Otherwise early exit because we either see a different constant or saw |
1619 | // a constant before but we have a pointer with constant offset this time. |
1620 | return true; |
1621 | } |
1622 | |
1623 | if (FirstV) { |
1624 | // The same logic as above, but check pointer with constant offset here. |
1625 | if (FirstBaseAndOffset == BaseAndOffset) |
1626 | continue; |
1627 | return true; |
1628 | } |
1629 | |
1630 | if (C) { |
1631 | // This is the 1st time we've seen a constant, record it. |
1632 | FirstC = C; |
1633 | continue; |
1634 | } |
1635 | |
1636 | // The remaining case is that this is the 1st time we've seen a pointer with |
1637 | // constant offset, record it. |
1638 | FirstV = V; |
1639 | FirstBaseAndOffset = BaseAndOffset; |
1640 | } |
1641 | |
1642 | // Check if we can map phi to a constant. |
1643 | if (FirstC) { |
1644 | SimplifiedValues[&I] = FirstC; |
1645 | return true; |
1646 | } |
1647 | |
1648 | // Check if we can map phi to a pointer with constant offset. |
1649 | if (FirstBaseAndOffset.first) { |
1650 | ConstantOffsetPtrs[&I] = FirstBaseAndOffset; |
1651 | |
1652 | if (auto *SROAArg = getSROAArgForValueOrNull(V: FirstV)) |
1653 | SROAArgValues[&I] = SROAArg; |
1654 | } |
1655 | |
1656 | return true; |
1657 | } |
1658 | |
1659 | /// Check we can fold GEPs of constant-offset call site argument pointers. |
1660 | /// This requires target data and inbounds GEPs. |
1661 | /// |
1662 | /// \return true if the specified GEP can be folded. |
1663 | bool CallAnalyzer::canFoldInboundsGEP(GetElementPtrInst &I) { |
1664 | // Check if we have a base + offset for the pointer. |
1665 | std::pair<Value *, APInt> BaseAndOffset = |
1666 | ConstantOffsetPtrs.lookup(Val: I.getPointerOperand()); |
1667 | if (!BaseAndOffset.first) |
1668 | return false; |
1669 | |
1670 | // Check if the offset of this GEP is constant, and if so accumulate it |
1671 | // into Offset. |
1672 | if (!accumulateGEPOffset(GEP&: cast<GEPOperator>(Val&: I), Offset&: BaseAndOffset.second)) |
1673 | return false; |
1674 | |
1675 | // Add the result as a new mapping to Base + Offset. |
1676 | ConstantOffsetPtrs[&I] = BaseAndOffset; |
1677 | |
1678 | return true; |
1679 | } |
1680 | |
1681 | bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) { |
1682 | auto *SROAArg = getSROAArgForValueOrNull(V: I.getPointerOperand()); |
1683 | |
1684 | // Lambda to check whether a GEP's indices are all constant. |
1685 | auto IsGEPOffsetConstant = [&](GetElementPtrInst &GEP) { |
1686 | for (const Use &Op : GEP.indices()) |
1687 | if (!getDirectOrSimplifiedValue<Constant>(V: Op)) |
1688 | return false; |
1689 | return true; |
1690 | }; |
1691 | |
1692 | if (!DisableGEPConstOperand) |
1693 | if (simplifyInstruction(I)) |
1694 | return true; |
1695 | |
1696 | if ((I.isInBounds() && canFoldInboundsGEP(I)) || IsGEPOffsetConstant(I)) { |
1697 | if (SROAArg) |
1698 | SROAArgValues[&I] = SROAArg; |
1699 | |
1700 | // Constant GEPs are modeled as free. |
1701 | return true; |
1702 | } |
1703 | |
1704 | // Variable GEPs will require math and will disable SROA. |
1705 | if (SROAArg) |
1706 | disableSROAForArg(SROAArg); |
1707 | return isGEPFree(GEP&: I); |
1708 | } |
1709 | |
1710 | // Simplify \p Cmp if RHS is const and we can ValueTrack LHS. |
1711 | // This handles the case only when the Cmp instruction is guarding a recursive |
1712 | // call that will cause the Cmp to fail/succeed for the recursive call. |
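// A sketch of the guarded-recursion shape this targets, in C-like pseudocode
// (illustrative only):
//   int f(int n) {
//     if (n > 0)     // `Cmp` on the function argument guarding the call
//       return f(0); // recursive callsite passing a different argument
//     return 1;
//   }
// When the compare, re-evaluated with the callsite's argument, folds such
// that the recursive block would not be taken again, it is recorded as
// simplified for this callsite.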
1713 | bool CallAnalyzer::simplifyCmpInstForRecCall(CmpInst &Cmp) { |
1714 | // Bail out if LHS is not a function argument or RHS is NOT const: |
1715 | if (!isa<Argument>(Val: Cmp.getOperand(i_nocapture: 0)) || !isa<Constant>(Val: Cmp.getOperand(i_nocapture: 1))) |
1716 | return false; |
1717 | auto *CmpOp = Cmp.getOperand(i_nocapture: 0); |
1718 | // Make sure that the callsite is recursive: |
1719 | if (CandidateCall.getCaller() != &F) |
1720 | return false; |
1721 | // Only handle the case when the callsite has a single predecessor: |
1722 | auto *CallBB = CandidateCall.getParent(); |
1723 | auto *Predecessor = CallBB->getSinglePredecessor(); |
1724 | if (!Predecessor) |
1725 | return false; |
1726 | // Check if the callsite is guarded by the same Cmp instruction: |
1727 | auto *Br = dyn_cast<BranchInst>(Val: Predecessor->getTerminator()); |
1728 | if (!Br || Br->isUnconditional() || Br->getCondition() != &Cmp) |
1729 | return false; |
1730 | |
1731 | // Check if any argument of the recursive callsite affects the cmp
1732 | // instruction:
1733 | bool ArgFound = false; |
1734 | Value *FuncArg = nullptr, *CallArg = nullptr; |
1735 | for (unsigned ArgNum = 0; |
1736 | ArgNum < F.arg_size() && ArgNum < CandidateCall.arg_size(); ArgNum++) { |
1737 | FuncArg = F.getArg(i: ArgNum); |
1738 | CallArg = CandidateCall.getArgOperand(i: ArgNum); |
1739 | if (FuncArg == CmpOp && CallArg != CmpOp) { |
1740 | ArgFound = true; |
1741 | break; |
1742 | } |
1743 | } |
1744 | if (!ArgFound) |
1745 | return false; |
1746 | |
1747 | // Now we have a recursive call that is guarded by a cmp instruction. |
1748 | // Check if this cmp can be simplified: |
1749 | SimplifyQuery SQ(DL, dyn_cast<Instruction>(Val: CallArg)); |
1750 | CondContext CC(&Cmp); |
1751 | CC.Invert = (CallBB != Br->getSuccessor(i: 0)); |
1752 | SQ.CC = &CC; |
1753 | CC.AffectedValues.insert(Ptr: FuncArg); |
1754 | Value *SimplifiedInstruction = llvm::simplifyInstructionWithOperands( |
1755 | I: cast<CmpInst>(Val: &Cmp), NewOps: {CallArg, Cmp.getOperand(i_nocapture: 1)}, Q: SQ); |
1756 | if (auto *ConstVal = dyn_cast_or_null<ConstantInt>(Val: SimplifiedInstruction)) { |
1757 | // Make sure that the BB of the recursive call is NOT the true successor |
1758 | // of the icmp. In other words, make sure that the recursion depth is 1. |
1759 | if ((ConstVal->isOne() && CC.Invert) || |
1760 | (ConstVal->isZero() && !CC.Invert)) { |
1761 | SimplifiedValues[&Cmp] = ConstVal; |
1762 | return true; |
1763 | } |
1764 | } |
1765 | return false; |
1766 | } |
1767 | |
1768 | /// Simplify \p I if its operands are constants and update SimplifiedValues. |
1769 | bool CallAnalyzer::simplifyInstruction(Instruction &I) { |
1770 | SmallVector<Constant *> COps; |
1771 | for (Value *Op : I.operands()) { |
1772 | Constant *COp = getDirectOrSimplifiedValue<Constant>(V: Op); |
1773 | if (!COp) |
1774 | return false; |
1775 | COps.push_back(Elt: COp); |
1776 | } |
1777 | auto *C = ConstantFoldInstOperands(I: &I, Ops: COps, DL); |
1778 | if (!C) |
1779 | return false; |
1780 | SimplifiedValues[&I] = C; |
1781 | return true; |
1782 | } |
1783 | |
1784 | /// Try to simplify a call to llvm.is.constant. |
1785 | /// |
1786 | /// Duplicate the argument checking from CallAnalyzer::simplifyCallSite since |
1787 | /// we expect calls of this specific intrinsic to be infrequent. |
1788 | /// |
1789 | /// FIXME: Given that we know CB's parent (F) caller |
1790 | /// (CandidateCall->getParent()->getParent()), we might be able to determine |
1791 | /// whether inlining F into F's caller would change how the call to |
1792 | /// llvm.is.constant would evaluate. |
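/// As an example (illustrative only): `llvm.is.constant.i32(i32 %x)` is
/// folded to 1 here when %x simplifies to a constant at this callsite and to
/// 0 otherwise, making the call itself free for costing purposes.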
1793 | bool CallAnalyzer::simplifyIntrinsicCallIsConstant(CallBase &CB) { |
1794 | Value *Arg = CB.getArgOperand(i: 0); |
1795 | auto *C = getDirectOrSimplifiedValue<Constant>(V: Arg); |
1796 | |
1797 | Type *RT = CB.getFunctionType()->getReturnType(); |
1798 | SimplifiedValues[&CB] = ConstantInt::get(Ty: RT, V: C ? 1 : 0); |
1799 | return true; |
1800 | } |
1801 | |
1802 | bool CallAnalyzer::simplifyIntrinsicCallObjectSize(CallBase &CB) { |
1803 | // As per the langref, "The fourth argument to llvm.objectsize determines if |
1804 | // the value should be evaluated at runtime." |
1805 | if (cast<ConstantInt>(Val: CB.getArgOperand(i: 3))->isOne()) |
1806 | return false; |
1807 | |
1808 | Value *V = lowerObjectSizeCall(ObjectSize: &cast<IntrinsicInst>(Val&: CB), DL, TLI: nullptr, |
1809 | /*MustSucceed=*/true); |
1810 | Constant *C = dyn_cast_or_null<Constant>(Val: V); |
1811 | if (C) |
1812 | SimplifiedValues[&CB] = C; |
1813 | return C; |
1814 | } |
1815 | |
1816 | bool CallAnalyzer::visitBitCast(BitCastInst &I) { |
1817 | // Propagate constants through bitcasts. |
1818 | if (simplifyInstruction(I)) |
1819 | return true; |
1820 | |
1821 | // Track base/offsets through casts |
1822 | std::pair<Value *, APInt> BaseAndOffset = |
1823 | ConstantOffsetPtrs.lookup(Val: I.getOperand(i_nocapture: 0)); |
1824 | // Casts don't change the offset, just wrap it up. |
1825 | if (BaseAndOffset.first) |
1826 | ConstantOffsetPtrs[&I] = BaseAndOffset; |
1827 | |
1828 | // Also look for SROA candidates here. |
1829 | if (auto *SROAArg = getSROAArgForValueOrNull(V: I.getOperand(i_nocapture: 0))) |
1830 | SROAArgValues[&I] = SROAArg; |
1831 | |
1832 | // Bitcasts are always zero cost. |
1833 | return true; |
1834 | } |
1835 | |
1836 | bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) { |
1837 | // Propagate constants through ptrtoint. |
1838 | if (simplifyInstruction(I)) |
1839 | return true; |
1840 | |
1841 | // Track base/offset pairs when converted to a plain integer provided the |
1842 | // integer is large enough to represent the pointer. |
1843 | unsigned IntegerSize = I.getType()->getScalarSizeInBits(); |
1844 | unsigned AS = I.getOperand(i_nocapture: 0)->getType()->getPointerAddressSpace(); |
1845 | if (IntegerSize == DL.getPointerSizeInBits(AS)) { |
1846 | std::pair<Value *, APInt> BaseAndOffset = |
1847 | ConstantOffsetPtrs.lookup(Val: I.getOperand(i_nocapture: 0)); |
1848 | if (BaseAndOffset.first) |
1849 | ConstantOffsetPtrs[&I] = BaseAndOffset; |
1850 | } |
1851 | |
1852 | // This is really weird. Technically, ptrtoint will disable SROA. However, |
1853 | // unless that ptrtoint is *used* somewhere in the live basic blocks after |
1854 | // inlining, it will be nuked, and SROA should proceed. All of the uses which |
1855 | // would block SROA would also block SROA if applied directly to a pointer, |
1856 | // and so we can just add the integer in here. The only places where SROA is |
1857 | // preserved either cannot fire on an integer, or won't in-and-of themselves |
1858 | // disable SROA (ext) w/o some later use that we would see and disable. |
1859 | if (auto *SROAArg = getSROAArgForValueOrNull(V: I.getOperand(i_nocapture: 0))) |
1860 | SROAArgValues[&I] = SROAArg; |
1861 | |
1862 | return TTI.getInstructionCost(U: &I, CostKind: TargetTransformInfo::TCK_SizeAndLatency) == |
1863 | TargetTransformInfo::TCC_Free; |
1864 | } |
1865 | |
1866 | bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) { |
1867 | // Propagate constants through inttoptr.
1868 | if (simplifyInstruction(I)) |
1869 | return true; |
1870 | |
1871 | // Track base/offset pairs when round-tripped through a pointer without |
1872 | // modifications provided the integer is not too large. |
1873 | Value *Op = I.getOperand(i_nocapture: 0); |
1874 | unsigned IntegerSize = Op->getType()->getScalarSizeInBits(); |
1875 | if (IntegerSize <= DL.getPointerTypeSizeInBits(I.getType())) { |
1876 | std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Val: Op); |
1877 | if (BaseAndOffset.first) |
1878 | ConstantOffsetPtrs[&I] = BaseAndOffset; |
1879 | } |
1880 | |
1881 | // "Propagate" SROA here in the same manner as we do for ptrtoint above. |
1882 | if (auto *SROAArg = getSROAArgForValueOrNull(V: Op)) |
1883 | SROAArgValues[&I] = SROAArg; |
1884 | |
1885 | return TTI.getInstructionCost(U: &I, CostKind: TargetTransformInfo::TCK_SizeAndLatency) == |
1886 | TargetTransformInfo::TCC_Free; |
1887 | } |
1888 | |
1889 | bool CallAnalyzer::visitCastInst(CastInst &I) { |
1890 | // Propagate constants through casts. |
1891 | if (simplifyInstruction(I)) |
1892 | return true; |
1893 | |
1894 | // Disable SROA in the face of arbitrary casts we don't explicitly list |
1895 | // elsewhere. |
1896 | disableSROA(V: I.getOperand(i_nocapture: 0)); |
1897 | |
1898 | // If this is a floating-point cast, and the target says this operation |
1899 | // is expensive, this may eventually become a library call. Treat the cost |
1900 | // as such. |
1901 | switch (I.getOpcode()) { |
1902 | case Instruction::FPTrunc: |
1903 | case Instruction::FPExt: |
1904 | case Instruction::UIToFP: |
1905 | case Instruction::SIToFP: |
1906 | case Instruction::FPToUI: |
1907 | case Instruction::FPToSI: |
1908 | if (TTI.getFPOpCost(Ty: I.getType()) == TargetTransformInfo::TCC_Expensive) |
1909 | onCallPenalty(); |
1910 | break; |
1911 | default: |
1912 | break; |
1913 | } |
1914 | |
1915 | return TTI.getInstructionCost(U: &I, CostKind: TargetTransformInfo::TCK_SizeAndLatency) == |
1916 | TargetTransformInfo::TCC_Free; |
1917 | } |
1918 | |
1919 | bool CallAnalyzer::paramHasAttr(Argument *A, Attribute::AttrKind Attr) { |
1920 | return CandidateCall.paramHasAttr(ArgNo: A->getArgNo(), Kind: Attr); |
1921 | } |
1922 | |
1923 | bool CallAnalyzer::isKnownNonNullInCallee(Value *V) { |
1924 | // Does the *call site* have the NonNull attribute set on an argument? We |
1925 | // use the attribute on the call site to memoize any analysis done in the |
1926 | // caller. This will also trip if the callee function has a non-null |
1927 | // parameter attribute, but that's a less interesting case because hopefully |
1928 | // the callee would already have been simplified based on that. |
1929 | if (Argument *A = dyn_cast<Argument>(Val: V)) |
1930 | if (paramHasAttr(A, Attr: Attribute::NonNull)) |
1931 | return true; |
1932 | |
1933 | // Is this an alloca in the caller? This is distinct from the attribute case |
1934 | // above because attributes aren't updated within the inliner itself and we |
1935 | // always want to catch the alloca derived case. |
1936 | if (isAllocaDerivedArg(V)) |
1937 | // We can actually predict the result of comparisons between an |
1938 | // alloca-derived value and null. Note that this fires regardless of |
1939 | // SROA firing. |
1940 | return true; |
1941 | |
1942 | return false; |
1943 | } |
1944 | |
1945 | bool CallAnalyzer::allowSizeGrowth(CallBase &Call) { |
1946 | // If the normal destination of the invoke or the parent block of the call |
1947 | // site is unreachable-terminated, there is little point in inlining this |
1948 | // unless there is literally zero cost. |
1949 | // FIXME: Note that it is possible that an unreachable-terminated block has a |
1950 | // hot entry. For example, in the scenario below, inlining hot_call_X() may
1951 | // be beneficial:
1952 | // main() { |
1953 | // hot_call_1(); |
1954 | // ... |
1955 | // hot_call_N() |
1956 | // exit(0); |
1957 | // } |
1958 | // For now, we are not handling this corner case here as it is rare in real |
1959 | // code. In future, we should elaborate this based on BPI and BFI in more |
1960 | // general threshold adjusting heuristics in updateThreshold(). |
1961 | if (InvokeInst *II = dyn_cast<InvokeInst>(Val: &Call)) { |
1962 | if (isa<UnreachableInst>(Val: II->getNormalDest()->getTerminator())) |
1963 | return false; |
1964 | } else if (isa<UnreachableInst>(Val: Call.getParent()->getTerminator())) |
1965 | return false; |
1966 | |
1967 | return true; |
1968 | } |
1969 | |
1970 | bool InlineCostCallAnalyzer::isColdCallSite(CallBase &Call, |
1971 | BlockFrequencyInfo *CallerBFI) { |
1972 | // If a global profile summary is available, the callsite's coldness is
1973 | // determined from it.
1974 | if (PSI && PSI->hasProfileSummary()) |
1975 | return PSI->isColdCallSite(CB: Call, BFI: CallerBFI); |
1976 | |
1977 | // Otherwise we need BFI to be available. |
1978 | if (!CallerBFI) |
1979 | return false; |
1980 | |
1981 | // Determine if the callsite is cold relative to caller's entry. We could |
1982 | // potentially cache the computation of scaled entry frequency, but the added |
1983 | // complexity is not worth it unless this scaling shows up high in the |
1984 | // profiles. |
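// As a rough illustration (assuming ColdCallSiteRelFreq keeps its default of
// 2): a callsite block whose frequency is below 2% of the caller's entry
// frequency is treated as cold here.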
1985 | const BranchProbability ColdProb(ColdCallSiteRelFreq, 100); |
1986 | auto CallSiteBB = Call.getParent(); |
1987 | auto CallSiteFreq = CallerBFI->getBlockFreq(BB: CallSiteBB); |
1988 | auto CallerEntryFreq = |
1989 | CallerBFI->getBlockFreq(BB: &(Call.getCaller()->getEntryBlock())); |
1990 | return CallSiteFreq < CallerEntryFreq * ColdProb; |
1991 | } |
1992 | |
1993 | std::optional<int> |
1994 | InlineCostCallAnalyzer::getHotCallSiteThreshold(CallBase &Call, |
1995 | BlockFrequencyInfo *CallerBFI) { |
1996 | |
1997 | // If a global profile summary is available, the callsite's hotness is
1998 | // determined from it.
1999 | if (PSI && PSI->hasProfileSummary() && PSI->isHotCallSite(CB: Call, BFI: CallerBFI)) |
2000 | return Params.HotCallSiteThreshold; |
2001 | |
2002 | // Otherwise we need BFI to be available and to have a locally hot callsite |
2003 | // threshold. |
2004 | if (!CallerBFI || !Params.LocallyHotCallSiteThreshold) |
2005 | return std::nullopt; |
2006 | |
2007 | // Determine if the callsite is hot relative to caller's entry. We could |
2008 | // potentially cache the computation of scaled entry frequency, but the added |
2009 | // complexity is not worth it unless this scaling shows up high in the |
2010 | // profiles. |
2011 | const BasicBlock *CallSiteBB = Call.getParent(); |
2012 | BlockFrequency CallSiteFreq = CallerBFI->getBlockFreq(BB: CallSiteBB); |
2013 | BlockFrequency CallerEntryFreq = CallerBFI->getEntryFreq(); |
2014 | std::optional<BlockFrequency> Limit = CallerEntryFreq.mul(Factor: HotCallSiteRelFreq); |
2015 | if (Limit && CallSiteFreq >= *Limit) |
2016 | return Params.LocallyHotCallSiteThreshold; |
2017 | |
2018 | // Otherwise treat it normally. |
2019 | return std::nullopt; |
2020 | } |
2021 | |
2022 | void InlineCostCallAnalyzer::updateThreshold(CallBase &Call, Function &Callee) { |
2023 | // If no size growth is allowed for this inlining, set Threshold to 0. |
2024 | if (!allowSizeGrowth(Call)) { |
2025 | Threshold = 0; |
2026 | return; |
2027 | } |
2028 | |
2029 | Function *Caller = Call.getCaller(); |
2030 | |
2031 | // return min(A, B) if B is valid. |
2032 | auto MinIfValid = [](int A, std::optional<int> B) { |
2033 | return B ? std::min(a: A, b: *B) : A; |
2034 | }; |
2035 | |
2036 | // return max(A, B) if B is valid. |
2037 | auto MaxIfValid = [](int A, std::optional<int> B) { |
2038 | return B ? std::max(a: A, b: *B) : A; |
2039 | }; |
2040 | |
2041 | // Various bonus percentages. These are multiplied by Threshold to get the |
2042 | // bonus values. |
2043 | // SingleBBBonus: This bonus is applied if the callee has a single reachable |
2044 | // basic block at the given callsite context. This is speculatively applied |
2045 | // and withdrawn if more than one basic block is seen. |
2046 | // |
2047 | // LastCallToStaticBonus: This large bonus is applied to ensure the inlining
2048 | // of the last call to a static function as inlining such functions is |
2049 | // guaranteed to reduce code size. |
2050 | // |
2051 | // These bonus percentages may be set to 0 based on properties of the caller |
2052 | // and the callsite. |
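// As a rough worked example (numbers illustrative): with a threshold of 225
// and SingleBBBonusPercent = 50, the single-BB bonus computed below is 112,
// so a callee that ends up with a single reachable block is effectively
// measured against 225 + 112 (plus any vector bonus) until the bonus is
// withdrawn.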
2053 | int SingleBBBonusPercent = 50; |
2054 | int VectorBonusPercent = TTI.getInlinerVectorBonusPercent(); |
2055 | int LastCallToStaticBonus = TTI.getInliningLastCallToStaticBonus(); |
2056 | |
2057 | // Lambda to set all the above bonus and bonus percentages to 0. |
2058 | auto DisallowAllBonuses = [&]() { |
2059 | SingleBBBonusPercent = 0; |
2060 | VectorBonusPercent = 0; |
2061 | LastCallToStaticBonus = 0; |
2062 | }; |
2063 | |
2064 | // Use the OptMinSizeThreshold or OptSizeThreshold knob if they are available |
2065 | // and reduce the threshold if the caller has the necessary attribute. |
2066 | if (Caller->hasMinSize()) { |
2067 | Threshold = MinIfValid(Threshold, Params.OptMinSizeThreshold); |
2068 | // For minsize, we want to disable the single BB bonus and the vector |
2069 | // bonuses, but not the last-call-to-static bonus. Inlining the last call to |
2070 | // a static function will, at the minimum, eliminate the parameter setup and |
2071 | // call/return instructions. |
2072 | SingleBBBonusPercent = 0; |
2073 | VectorBonusPercent = 0; |
2074 | } else if (Caller->hasOptSize()) |
2075 | Threshold = MinIfValid(Threshold, Params.OptSizeThreshold); |
2076 | |
2077 | // Adjust the threshold based on inlinehint attribute and profile based |
2078 | // hotness information if the caller does not have MinSize attribute. |
2079 | if (!Caller->hasMinSize()) { |
2080 | if (Callee.hasFnAttribute(Kind: Attribute::InlineHint)) |
2081 | Threshold = MaxIfValid(Threshold, Params.HintThreshold); |
2082 | |
2083 | // FIXME: After switching to the new passmanager, simplify the logic below |
2084 | // by checking only the callsite hotness/coldness as we will reliably |
2085 | // have local profile information. |
2086 | // |
2087 | // Callsite hotness and coldness can be determined if sample profile is |
2088 | // used (which adds hotness metadata to calls) or if caller's |
2089 | // BlockFrequencyInfo is available. |
2090 | BlockFrequencyInfo *CallerBFI = GetBFI ? &(GetBFI(*Caller)) : nullptr; |
2091 | auto HotCallSiteThreshold = getHotCallSiteThreshold(Call, CallerBFI); |
2092 | if (!Caller->hasOptSize() && HotCallSiteThreshold) { |
2093 | LLVM_DEBUG(dbgs() << "Hot callsite.\n" ); |
2094 | // FIXME: This should update the threshold only if it exceeds the |
2095 | // current threshold, but AutoFDO + ThinLTO currently relies on this |
2096 | // behavior to prevent inlining of hot callsites during ThinLTO |
2097 | // compile phase. |
2098 | Threshold = *HotCallSiteThreshold; |
2099 | } else if (isColdCallSite(Call, CallerBFI)) { |
2100 | LLVM_DEBUG(dbgs() << "Cold callsite.\n" ); |
2101 | // Do not apply bonuses for a cold callsite including the |
2102 | // LastCallToStatic bonus. While this bonus might result in code size |
2103 | // reduction, it can cause the size of a non-cold caller to increase |
2104 | // preventing it from being inlined. |
2105 | DisallowAllBonuses(); |
2106 | Threshold = MinIfValid(Threshold, Params.ColdCallSiteThreshold); |
2107 | } else if (PSI) { |
2108 | // Use callee's global profile information only if we have no way of |
2109 | // determining this via callsite information. |
2110 | if (PSI->isFunctionEntryHot(F: &Callee)) { |
2111 | LLVM_DEBUG(dbgs() << "Hot callee.\n" ); |
2112 | // If callsite hotness cannot be determined, we may still know
2113 | // that the callee is hot and treat it as a weaker hint for threshold |
2114 | // increase. |
2115 | Threshold = MaxIfValid(Threshold, Params.HintThreshold); |
2116 | } else if (PSI->isFunctionEntryCold(F: &Callee)) { |
2117 | LLVM_DEBUG(dbgs() << "Cold callee.\n" ); |
2118 | // Do not apply bonuses for a cold callee including the |
2119 | // LastCallToStatic bonus. While this bonus might result in code size |
2120 | // reduction, it can cause the size of a non-cold caller to increase |
2121 | // preventing it from being inlined. |
2122 | DisallowAllBonuses(); |
2123 | Threshold = MinIfValid(Threshold, Params.ColdThreshold); |
2124 | } |
2125 | } |
2126 | } |
2127 | |
2128 | Threshold += TTI.adjustInliningThreshold(CB: &Call); |
2129 | |
2130 | // Finally, take the target-specific inlining threshold multiplier into |
2131 | // account. |
2132 | Threshold *= TTI.getInliningThresholdMultiplier(); |
2133 | |
2134 | SingleBBBonus = Threshold * SingleBBBonusPercent / 100; |
2135 | VectorBonus = Threshold * VectorBonusPercent / 100; |
2136 | |
2137 | // If there is only one call of the function, and it has internal linkage, |
2138 | // the cost of inlining it drops dramatically. It may seem odd to update |
2139 | // Cost in updateThreshold, but the bonus depends on the logic in this method. |
2140 | if (isSoleCallToLocalFunction(CB: Call, Callee: F)) { |
2141 | Cost -= LastCallToStaticBonus; |
2142 | StaticBonusApplied = LastCallToStaticBonus; |
2143 | } |
2144 | } |
2145 | |
2146 | bool CallAnalyzer::visitCmpInst(CmpInst &I) { |
2147 | Value *LHS = I.getOperand(i_nocapture: 0), *RHS = I.getOperand(i_nocapture: 1); |
2148 | // First try to handle simplified comparisons. |
2149 | if (simplifyInstruction(I)) |
2150 | return true; |
2151 | |
2152 | // Try to handle comparison that can be simplified using ValueTracking. |
2153 | if (simplifyCmpInstForRecCall(Cmp&: I)) |
2154 | return true; |
2155 | |
2156 | if (I.getOpcode() == Instruction::FCmp) |
2157 | return false; |
2158 | |
2159 | // Otherwise look for a comparison between constant offset pointers with |
2160 | // a common base. |
2161 | Value *LHSBase, *RHSBase; |
2162 | APInt LHSOffset, RHSOffset; |
2163 | std::tie(args&: LHSBase, args&: LHSOffset) = ConstantOffsetPtrs.lookup(Val: LHS); |
2164 | if (LHSBase) { |
2165 | std::tie(args&: RHSBase, args&: RHSOffset) = ConstantOffsetPtrs.lookup(Val: RHS); |
2166 | if (RHSBase && LHSBase == RHSBase) { |
2167 | // We have common bases, fold the icmp to a constant based on the |
2168 | // offsets. |
2169 | SimplifiedValues[&I] = ConstantInt::getBool( |
2170 | Ty: I.getType(), |
2171 | V: ICmpInst::compare(LHS: LHSOffset, RHS: RHSOffset, Pred: I.getPredicate())); |
2172 | ++NumConstantPtrCmps; |
2173 | return true; |
2174 | } |
2175 | } |
2176 | |
2177 | auto isImplicitNullCheckCmp = [](const CmpInst &I) { |
2178 | for (auto *User : I.users()) |
2179 | if (auto *Instr = dyn_cast<Instruction>(Val: User)) |
2180 | if (!Instr->getMetadata(KindID: LLVMContext::MD_make_implicit)) |
2181 | return false; |
2182 | return true; |
2183 | }; |
2184 | |
2185 | // If the comparison is an equality comparison with null, we can simplify it |
2186 | // if we know the value (argument) can't be null |
2187 | if (I.isEquality() && isa<ConstantPointerNull>(Val: I.getOperand(i_nocapture: 1))) { |
2188 | if (isKnownNonNullInCallee(V: I.getOperand(i_nocapture: 0))) { |
2189 | bool IsNotEqual = I.getPredicate() == CmpInst::ICMP_NE; |
2190 | SimplifiedValues[&I] = IsNotEqual ? ConstantInt::getTrue(Ty: I.getType()) |
2191 | : ConstantInt::getFalse(Ty: I.getType()); |
2192 | return true; |
2193 | } |
2194 | // Implicit null checks act as unconditional branches and their comparisons |
2195 | // should be treated as simplified and free of cost. |
2196 | if (isImplicitNullCheckCmp(I)) |
2197 | return true; |
2198 | } |
2199 | return handleSROA(V: I.getOperand(i_nocapture: 0), DoNotDisable: isa<ConstantPointerNull>(Val: I.getOperand(i_nocapture: 1))); |
2200 | } |
2201 | |
2202 | bool CallAnalyzer::visitSub(BinaryOperator &I) { |
2203 | // Try to handle a special case: we can fold computing the difference of two |
2204 | // constant-related pointers. |
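// For instance (illustrative): if %a and %b are both known to be %base plus
// constant offsets 16 and 4 respectively, the subtraction folds to the
// constant 12 and becomes free after inlining.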
2205 | Value *LHS = I.getOperand(i_nocapture: 0), *RHS = I.getOperand(i_nocapture: 1); |
2206 | Value *LHSBase, *RHSBase; |
2207 | APInt LHSOffset, RHSOffset; |
2208 | std::tie(args&: LHSBase, args&: LHSOffset) = ConstantOffsetPtrs.lookup(Val: LHS); |
2209 | if (LHSBase) { |
2210 | std::tie(args&: RHSBase, args&: RHSOffset) = ConstantOffsetPtrs.lookup(Val: RHS); |
2211 | if (RHSBase && LHSBase == RHSBase) { |
2212 | // We have common bases, fold the subtract to a constant based on the |
2213 | // offsets. |
2214 | Constant *CLHS = ConstantInt::get(Context&: LHS->getContext(), V: LHSOffset); |
2215 | Constant *CRHS = ConstantInt::get(Context&: RHS->getContext(), V: RHSOffset); |
2216 | if (Constant *C = ConstantExpr::getSub(C1: CLHS, C2: CRHS)) { |
2217 | SimplifiedValues[&I] = C; |
2218 | ++NumConstantPtrDiffs; |
2219 | return true; |
2220 | } |
2221 | } |
2222 | } |
2223 | |
2224 | // Otherwise, fall back to the generic logic for simplifying and handling |
2225 | // instructions. |
2226 | return Base::visitSub(I); |
2227 | } |
2228 | |
2229 | bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) { |
2230 | Value *LHS = I.getOperand(i_nocapture: 0), *RHS = I.getOperand(i_nocapture: 1); |
2231 | Constant *CLHS = getDirectOrSimplifiedValue<Constant>(V: LHS); |
2232 | Constant *CRHS = getDirectOrSimplifiedValue<Constant>(V: RHS); |
2233 | |
2234 | Value *SimpleV = nullptr; |
2235 | if (auto FI = dyn_cast<FPMathOperator>(Val: &I)) |
2236 | SimpleV = simplifyBinOp(Opcode: I.getOpcode(), LHS: CLHS ? CLHS : LHS, RHS: CRHS ? CRHS : RHS, |
2237 | FMF: FI->getFastMathFlags(), Q: DL); |
2238 | else |
2239 | SimpleV = |
2240 | simplifyBinOp(Opcode: I.getOpcode(), LHS: CLHS ? CLHS : LHS, RHS: CRHS ? CRHS : RHS, Q: DL); |
2241 | |
2242 | if (Constant *C = dyn_cast_or_null<Constant>(Val: SimpleV)) |
2243 | SimplifiedValues[&I] = C; |
2244 | |
2245 | if (SimpleV) |
2246 | return true; |
2247 | |
2248 | // Disable any SROA on arguments to arbitrary, unsimplified binary operators. |
2249 | disableSROA(V: LHS); |
2250 | disableSROA(V: RHS); |
2251 | |
2252 | // If the instruction is floating point, and the target says this operation |
2253 | // is expensive, this may eventually become a library call. Treat the cost |
2254 | // as such. Unless it's fneg which can be implemented with an xor. |
2255 | using namespace llvm::PatternMatch; |
2256 | if (I.getType()->isFloatingPointTy() && |
2257 | TTI.getFPOpCost(Ty: I.getType()) == TargetTransformInfo::TCC_Expensive && |
2258 | !match(V: &I, P: m_FNeg(X: m_Value()))) |
2259 | onCallPenalty(); |
2260 | |
2261 | return false; |
2262 | } |
2263 | |
2264 | bool CallAnalyzer::visitFNeg(UnaryOperator &I) { |
2265 | Value *Op = I.getOperand(i_nocapture: 0); |
2266 | Constant *COp = getDirectOrSimplifiedValue<Constant>(V: Op); |
2267 | |
2268 | Value *SimpleV = simplifyFNegInst( |
2269 | Op: COp ? COp : Op, FMF: cast<FPMathOperator>(Val&: I).getFastMathFlags(), Q: DL); |
2270 | |
2271 | if (Constant *C = dyn_cast_or_null<Constant>(Val: SimpleV)) |
2272 | SimplifiedValues[&I] = C; |
2273 | |
2274 | if (SimpleV) |
2275 | return true; |
2276 | |
2277 | // Disable any SROA on arguments to arbitrary, unsimplified fneg. |
2278 | disableSROA(V: Op); |
2279 | |
2280 | return false; |
2281 | } |
2282 | |
2283 | bool CallAnalyzer::visitLoad(LoadInst &I) { |
2284 | if (handleSROA(V: I.getPointerOperand(), DoNotDisable: I.isSimple())) |
2285 | return true; |
2286 | |
2287 | // If the data is already loaded from this address and hasn't been clobbered |
2288 | // by any stores or calls, this load is likely to be redundant and can be |
2289 | // eliminated. |
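// For example (illustrative): two simple, unordered loads from the same
// pointer with no intervening store or clobbering call; the second load is
// counted below as a likely-eliminated, free instruction.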
2290 | if (EnableLoadElimination && |
2291 | !LoadAddrSet.insert(Ptr: I.getPointerOperand()).second && I.isUnordered()) { |
2292 | onLoadEliminationOpportunity(); |
2293 | return true; |
2294 | } |
2295 | |
2296 | onMemAccess(); |
2297 | return false; |
2298 | } |
2299 | |
2300 | bool CallAnalyzer::visitStore(StoreInst &I) { |
2301 | if (handleSROA(V: I.getPointerOperand(), DoNotDisable: I.isSimple())) |
2302 | return true; |
2303 | |
2304 | // The store can potentially clobber loads and prevent repeated loads from |
2305 | // being eliminated. |
2306 | // FIXME: |
2307 | // 1. We can probably keep an initial set of eliminatable loads subtracted
2308 | // from the cost even when we finally see a store. We just need to disable |
2309 | // *further* accumulation of elimination savings. |
2310 | // 2. We should probably at some point thread MemorySSA for the callee into |
2311 | // this and then use that to actually compute *really* precise savings. |
2312 | disableLoadElimination(); |
2313 | |
2314 | onMemAccess(); |
2315 | return false; |
2316 | } |
2317 | |
2318 | bool CallAnalyzer::visitExtractValue(ExtractValueInst &I) {
2319 | Value *Op = I.getAggregateOperand(); |
2320 | |
2321 | // Special handling, because we want to simplify extractvalue with a |
2322 | // potential insertvalue from the caller. |
2323 | if (Value *SimpleOp = getSimplifiedValueUnchecked(V: Op)) { |
2324 | SimplifyQuery SQ(DL); |
2325 | Value *SimpleV = simplifyExtractValueInst(Agg: SimpleOp, Idxs: I.getIndices(), Q: SQ); |
2326 | if (SimpleV) { |
2327 | SimplifiedValues[&I] = SimpleV; |
2328 | return true; |
2329 | } |
2330 | } |
2331 | |
2332 | // SROA can't look through these, but they may be free. |
2333 | return Base::visitExtractValue(I); |
2334 | } |
2335 | |
2336 | bool CallAnalyzer::visitInsertValue(InsertValueInst &I) { |
2337 | // Constant folding for insert value is trivial. |
2338 | if (simplifyInstruction(I)) |
2339 | return true; |
2340 | |
2341 | // SROA can't look through these, but they may be free. |
2342 | return Base::visitInsertValue(I); |
2343 | } |
2344 | |
2345 | /// Try to simplify a call site. |
2346 | /// |
2347 | /// Takes a concrete function and callsite and tries to actually simplify it by |
2348 | /// analyzing the arguments and call itself with instsimplify. Returns true if |
2349 | /// it has simplified the callsite to some other entity (a constant), making it |
2350 | /// free. |
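/// For example (illustrative): a call such as `llvm.fabs.f64(double -3.0)`,
/// or one whose arguments all simplify to constants at this callsite, is
/// constant-folded here, recorded in SimplifiedValues, and treated as free.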
2351 | bool CallAnalyzer::simplifyCallSite(Function *F, CallBase &Call) { |
2352 | // FIXME: Using the instsimplify logic directly for this is inefficient |
2353 | // because we have to continually rebuild the argument list even when no |
2354 | // simplifications can be performed. Until that is fixed with remapping |
2355 | // inside of instsimplify, directly constant fold calls here. |
2356 | if (!canConstantFoldCallTo(Call: &Call, F)) |
2357 | return false; |
2358 | |
2359 | // Try to re-map the arguments to constants. |
2360 | SmallVector<Constant *, 4> ConstantArgs; |
2361 | ConstantArgs.reserve(N: Call.arg_size()); |
2362 | for (Value *I : Call.args()) { |
2363 | Constant *C = getDirectOrSimplifiedValue<Constant>(V: I); |
2364 | if (!C) |
2365 | return false; // This argument doesn't map to a constant. |
2366 | |
2367 | ConstantArgs.push_back(Elt: C); |
2368 | } |
2369 | if (Constant *C = ConstantFoldCall(Call: &Call, F, Operands: ConstantArgs)) { |
2370 | SimplifiedValues[&Call] = C; |
2371 | return true; |
2372 | } |
2373 | |
2374 | return false; |
2375 | } |
2376 | |
2377 | bool CallAnalyzer::isLoweredToCall(Function *F, CallBase &Call) { |
2378 | const TargetLibraryInfo *TLI = GetTLI ? &GetTLI(*F) : nullptr; |
2379 | LibFunc LF; |
2380 | if (!TLI || !TLI->getLibFunc(FDecl: *F, F&: LF) || !TLI->has(F: LF)) |
2381 | return TTI.isLoweredToCall(F); |
2382 | |
2383 | switch (LF) { |
2384 | case LibFunc_memcpy_chk: |
2385 | case LibFunc_memmove_chk: |
2386 | case LibFunc_mempcpy_chk: |
2387 | case LibFunc_memset_chk: { |
2388 | // Calls to __memcpy_chk whose length is known to fit within the object |
2389 | // size will eventually be replaced by inline stores. Therefore, these |
2390 | // should not incur a call penalty. This is only really relevant on |
2391 | // platforms whose headers redirect memcpy to __memcpy_chk (e.g. Darwin), as |
2392 | // other platforms use memcpy intrinsics, which are already exempt from the |
2393 | // call penalty. |
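// For example (illustrative): `__memcpy_chk(dst, src, /*len=*/16,
// /*objsize=*/32)` has len <= objsize, so it is expected to lower to inline
// stores and is not charged a call penalty below.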
2394 | auto *LenOp = getDirectOrSimplifiedValue<ConstantInt>(V: Call.getOperand(i_nocapture: 2)); |
2395 | auto *ObjSizeOp = |
2396 | getDirectOrSimplifiedValue<ConstantInt>(V: Call.getOperand(i_nocapture: 3)); |
2397 | if (LenOp && ObjSizeOp && |
2398 | LenOp->getLimitedValue() <= ObjSizeOp->getLimitedValue()) { |
2399 | return false; |
2400 | } |
2401 | break; |
2402 | } |
2403 | default: |
2404 | break; |
2405 | } |
2406 | |
2407 | return TTI.isLoweredToCall(F); |
2408 | } |
2409 | |
2410 | bool CallAnalyzer::visitCallBase(CallBase &Call) { |
2411 | if (!onCallBaseVisitStart(Call)) |
2412 | return true; |
2413 | |
2414 | if (Call.hasFnAttr(Kind: Attribute::ReturnsTwice) && |
2415 | !F.hasFnAttribute(Kind: Attribute::ReturnsTwice)) { |
2416 | // This aborts the entire analysis. |
2417 | ExposesReturnsTwice = true; |
2418 | return false; |
2419 | } |
2420 | if (isa<CallInst>(Val: Call) && cast<CallInst>(Val&: Call).cannotDuplicate()) |
2421 | ContainsNoDuplicateCall = true; |
2422 | |
2423 | Function *F = Call.getCalledFunction(); |
2424 | bool IsIndirectCall = !F; |
2425 | if (IsIndirectCall) { |
2426 | // Check if this happens to be an indirect function call to a known function |
2427 | // in this inline context. If not, we've done all we can. |
2428 | Value *Callee = Call.getCalledOperand(); |
2429 | F = getSimplifiedValue<Function>(V: Callee); |
2430 | if (!F || F->getFunctionType() != Call.getFunctionType()) { |
2431 | onCallArgumentSetup(Call); |
2432 | |
2433 | if (!Call.onlyReadsMemory()) |
2434 | disableLoadElimination(); |
2435 | return Base::visitCallBase(I&: Call); |
2436 | } |
2437 | } |
2438 | |
2439 | assert(F && "Expected a call to a known function" ); |
2440 | |
2441 | // When we have a concrete function, first try to simplify it directly. |
2442 | if (simplifyCallSite(F, Call)) |
2443 | return true; |
2444 | |
2445 | // Next check if it is an intrinsic we know about. |
2446 | // FIXME: Lift this into part of the InstVisitor. |
2447 | if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Val: &Call)) { |
2448 | switch (II->getIntrinsicID()) { |
2449 | default: |
2450 | if (!Call.onlyReadsMemory() && !isAssumeLikeIntrinsic(I: II)) |
2451 | disableLoadElimination(); |
2452 | return Base::visitCallBase(I&: Call); |
2453 | |
2454 | case Intrinsic::load_relative: |
2455 | onLoadRelativeIntrinsic(); |
2456 | return false; |
2457 | |
2458 | case Intrinsic::memset: |
2459 | case Intrinsic::memcpy: |
2460 | case Intrinsic::memmove: |
2461 | disableLoadElimination(); |
2462 | // SROA can usually chew through these intrinsics, but they aren't free. |
2463 | return false; |
2464 | case Intrinsic::icall_branch_funnel: |
2465 | case Intrinsic::localescape: |
2466 | HasUninlineableIntrinsic = true; |
2467 | return false; |
2468 | case Intrinsic::vastart: |
2469 | InitsVargArgs = true; |
2470 | return false; |
2471 | case Intrinsic::launder_invariant_group: |
2472 | case Intrinsic::strip_invariant_group: |
2473 | if (auto *SROAArg = getSROAArgForValueOrNull(V: II->getOperand(i_nocapture: 0))) |
2474 | SROAArgValues[II] = SROAArg; |
2475 | return true; |
2476 | case Intrinsic::is_constant: |
2477 | return simplifyIntrinsicCallIsConstant(CB&: Call); |
2478 | case Intrinsic::objectsize: |
2479 | return simplifyIntrinsicCallObjectSize(CB&: Call); |
2480 | } |
2481 | } |
2482 | |
2483 | if (F == Call.getFunction()) { |
2484 | // This flag will fully abort the analysis, so don't bother with anything |
2485 | // else. |
2486 | IsRecursiveCall = true; |
2487 | if (!AllowRecursiveCall) |
2488 | return false; |
2489 | } |
2490 | |
2491 | if (isLoweredToCall(F, Call)) { |
2492 | onLoweredCall(F, Call, IsIndirectCall); |
2493 | } |
2494 | |
2495 | if (!(Call.onlyReadsMemory() || (IsIndirectCall && F->onlyReadsMemory()))) |
2496 | disableLoadElimination(); |
2497 | return Base::visitCallBase(I&: Call); |
2498 | } |
2499 | |
2500 | bool CallAnalyzer::visitReturnInst(ReturnInst &RI) { |
2501 | // At least one return instruction will be free after inlining. |
2502 | bool Free = !HasReturn; |
2503 | HasReturn = true; |
2504 | return Free; |
2505 | } |
2506 | |
2507 | bool CallAnalyzer::visitBranchInst(BranchInst &BI) { |
2508 | // We model unconditional branches as essentially free -- they really |
2509 | // shouldn't exist at all, but handling them makes the behavior of the |
2510 | // inliner more regular and predictable. Interestingly, conditional branches |
2511 | // which will fold away are also free. |
2512 | return BI.isUnconditional() || |
2513 | getDirectOrSimplifiedValue<ConstantInt>(V: BI.getCondition()) || |
2514 | BI.getMetadata(KindID: LLVMContext::MD_make_implicit); |
2515 | } |
2516 | |
2517 | bool CallAnalyzer::visitSelectInst(SelectInst &SI) { |
2518 | bool CheckSROA = SI.getType()->isPointerTy(); |
2519 | Value *TrueVal = SI.getTrueValue(); |
2520 | Value *FalseVal = SI.getFalseValue(); |
2521 | |
2522 | Constant *TrueC = getDirectOrSimplifiedValue<Constant>(V: TrueVal); |
2523 | Constant *FalseC = getDirectOrSimplifiedValue<Constant>(V: FalseVal); |
2524 | Constant *CondC = getSimplifiedValue<Constant>(V: SI.getCondition()); |
2525 | |
2526 | if (!CondC) { |
2527 | // Select C, X, X => X |
2528 | if (TrueC == FalseC && TrueC) { |
2529 | SimplifiedValues[&SI] = TrueC; |
2530 | return true; |
2531 | } |
2532 | |
2533 | if (!CheckSROA) |
2534 | return Base::visitSelectInst(I&: SI); |
2535 | |
2536 | std::pair<Value *, APInt> TrueBaseAndOffset = |
2537 | ConstantOffsetPtrs.lookup(Val: TrueVal); |
2538 | std::pair<Value *, APInt> FalseBaseAndOffset = |
2539 | ConstantOffsetPtrs.lookup(Val: FalseVal); |
2540 | if (TrueBaseAndOffset == FalseBaseAndOffset && TrueBaseAndOffset.first) { |
2541 | ConstantOffsetPtrs[&SI] = TrueBaseAndOffset; |
2542 | |
2543 | if (auto *SROAArg = getSROAArgForValueOrNull(V: TrueVal)) |
2544 | SROAArgValues[&SI] = SROAArg; |
2545 | return true; |
2546 | } |
2547 | |
2548 | return Base::visitSelectInst(I&: SI); |
2549 | } |
2550 | |
2551 | // Select condition is a constant. |
2552 | Value *SelectedV = CondC->isAllOnesValue() ? TrueVal |
2553 | : (CondC->isNullValue()) ? FalseVal |
2554 | : nullptr; |
2555 | if (!SelectedV) { |
2556 | // Condition is a vector constant that is not all 1s or all 0s. If all |
2557 | // operands are constants, ConstantFoldSelectInstruction() can handle cases
2558 | // such as vector selects.
2559 | if (TrueC && FalseC) { |
2560 | if (auto *C = ConstantFoldSelectInstruction(Cond: CondC, V1: TrueC, V2: FalseC)) { |
2561 | SimplifiedValues[&SI] = C; |
2562 | return true; |
2563 | } |
2564 | } |
2565 | return Base::visitSelectInst(I&: SI); |
2566 | } |
2567 | |
2568 | // Condition is either all 1s or all 0s. SI can be simplified. |
2569 | if (Constant *SelectedC = dyn_cast<Constant>(Val: SelectedV)) { |
2570 | SimplifiedValues[&SI] = SelectedC; |
2571 | return true; |
2572 | } |
2573 | |
2574 | if (!CheckSROA) |
2575 | return true; |
2576 | |
2577 | std::pair<Value *, APInt> BaseAndOffset = |
2578 | ConstantOffsetPtrs.lookup(Val: SelectedV); |
2579 | if (BaseAndOffset.first) { |
2580 | ConstantOffsetPtrs[&SI] = BaseAndOffset; |
2581 | |
2582 | if (auto *SROAArg = getSROAArgForValueOrNull(V: SelectedV)) |
2583 | SROAArgValues[&SI] = SROAArg; |
2584 | } |
2585 | |
2586 | return true; |
2587 | } |
2588 | |
2589 | bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) { |
2590 | // We model switches with a known constant condition as free, since they
2591 | // fold to an unconditional branch; see the comments on handling branches.
2592 | if (getDirectOrSimplifiedValue<ConstantInt>(V: SI.getCondition())) |
2593 | return true; |
2594 | |
2595 | // Assume the most general case where the switch is lowered into |
2596 | // either a jump table, bit test, or a balanced binary tree consisting of |
2597 | // case clusters without merging adjacent clusters with the same |
2598 | // destination. We do not consider the switches that are lowered with a mix |
2599 | // of jump table/bit test/binary search tree. The cost of the switch is |
2600 | // proportional to the size of the tree or the size of jump table range. |
2601 | // |
2602 | // NB: SimplifyCFG converts large switches that are only used to initialize
2603 | // large phi nodes into lookup tables, so this shouldn't prevent inlining
2604 | // those. It will prevent inlining in cases where that optimization does not
2605 | // (yet) fire.
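//
// Illustrative sketch (case values are invented): a dense switch such as
//   switch i32 %x, label %d [ i32 0 ... i32 15 ]   ; 16 contiguous cases
// is assumed to lower to a jump table whose cost tracks the table range,
// while a sparse switch over {1, 1000, 100000} is assumed to lower to a
// balanced binary search tree over its case clusters. onFinalizeSwitch
// receives JumpTableSize and NumCaseCluster and charges accordingly.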
2606 | |
2607 | unsigned JumpTableSize = 0; |
2608 | BlockFrequencyInfo *BFI = GetBFI ? &(GetBFI(F)) : nullptr; |
2609 | unsigned NumCaseCluster = |
2610 | TTI.getEstimatedNumberOfCaseClusters(SI, JTSize&: JumpTableSize, PSI, BFI); |
2611 | |
2612 | onFinalizeSwitch(JumpTableSize, NumCaseCluster, DefaultDestUnreachable: SI.defaultDestUnreachable()); |
2613 | return false; |
2614 | } |
2615 | |
2616 | bool CallAnalyzer::visitIndirectBrInst(IndirectBrInst &IBI) { |
2617 | // We never want to inline functions that contain an indirectbr. Inlining
2618 | // such a function would be incorrect because any blockaddresses (in static
2619 | // global initializers, for example) would still refer to the original
2620 | // function, so the indirect jump would branch from the inlined copy of the
2621 | // function into the original function, which is undefined behavior.
2622 | // FIXME: This logic isn't really right; we can safely inline functions with
2623 | // indirectbr's as long as no other function or global references the
2624 | // blockaddress of a block within the current function.
2625 | HasIndirectBr = true; |
2626 | return false; |
2627 | } |
2628 | |
2629 | bool CallAnalyzer::visitResumeInst(ResumeInst &RI) { |
2630 | // FIXME: It's not clear that a single instruction is an accurate model for |
2631 | // the inline cost of a resume instruction. |
2632 | return false; |
2633 | } |
2634 | |
2635 | bool CallAnalyzer::visitCleanupReturnInst(CleanupReturnInst &CRI) { |
2636 | // FIXME: It's not clear that a single instruction is an accurate model for |
2637 | // the inline cost of a cleanupret instruction. |
2638 | return false; |
2639 | } |
2640 | |
2641 | bool CallAnalyzer::visitCatchReturnInst(CatchReturnInst &CRI) { |
2642 | // FIXME: It's not clear that a single instruction is an accurate model for |
2643 | // the inline cost of a catchret instruction. |
2644 | return false; |
2645 | } |
2646 | |
2647 | bool CallAnalyzer::visitUnreachableInst(UnreachableInst &I) { |
2648 | // FIXME: It might be reasonable to discount the cost of instructions leading
2649 | // to unreachable, as they have the lowest possible impact on both runtime
2650 | // and code size.
2651 | return true; // No actual code is needed for unreachable. |
2652 | } |
2653 | |
2654 | bool CallAnalyzer::visitInstruction(Instruction &I) { |
2655 | // Some instructions are free. All of the free intrinsics can also be |
2656 | // handled by SROA, etc. |
2657 | if (TTI.getInstructionCost(U: &I, CostKind: TargetTransformInfo::TCK_SizeAndLatency) == |
2658 | TargetTransformInfo::TCC_Free) |
2659 | return true; |
2660 | |
2661 | // We found something we don't understand or can't handle. Mark any SROA-able |
2662 | // values in the operand list as no longer viable. |
2663 | for (const Use &Op : I.operands()) |
2664 | disableSROA(V: Op); |
2665 | |
2666 | return false; |
2667 | } |
2668 | |
2669 | /// Analyze a basic block for its contribution to the inline cost. |
2670 | /// |
2671 | /// This method walks the analyzer over every instruction in the given basic |
2672 | /// block and accounts for their cost during inlining at this callsite. It |
2673 | /// aborts early if the threshold has been exceeded or an impossible to inline |
2674 | /// construct has been detected. It returns false if inlining is no longer |
2675 | /// viable, and true if inlining remains viable. |
2676 | InlineResult |
2677 | CallAnalyzer::analyzeBlock(BasicBlock *BB, |
2678 | const SmallPtrSetImpl<const Value *> &EphValues) { |
2679 | for (Instruction &I : *BB) { |
2680 | // FIXME: Currently, the number of instructions in a function, regardless of
2681 | // our ability to simplify them to constants or dead code during inlining,
2682 | // is used by the vector bonus heuristic. As long as that's true, we have to
2683 | // special-case debug intrinsics here to prevent differences in inlining due
2684 | // to debug symbols. Eventually, the number of unsimplified instructions
2685 | // shouldn't factor into the cost computation, but until then, hack around
2686 | // it here.
2687 | // Similarly, skip pseudo-probes.
2688 | if (I.isDebugOrPseudoInst()) |
2689 | continue; |
2690 | |
2691 | // Skip ephemeral values. |
2692 | if (EphValues.count(Ptr: &I)) |
2693 | continue; |
2694 | |
2695 | ++NumInstructions; |
2696 | if (isa<ExtractElementInst>(Val: I) || I.getType()->isVectorTy()) |
2697 | ++NumVectorInstructions; |
2698 | |
2699 | // If the instruction simplified to a constant, there is no cost to this |
2700 | // instruction. Visit the instructions using our InstVisitor to account for |
2701 | // all of the per-instruction logic. The visit tree returns true if we |
2702 | // consumed the instruction in any way, and false if the instruction's base |
2703 | // cost should count against inlining. |
2704 | onInstructionAnalysisStart(I: &I); |
2705 | |
2706 | if (Base::visit(I: &I)) |
2707 | ++NumInstructionsSimplified; |
2708 | else |
2709 | onMissedSimplification(); |
2710 | |
2711 | onInstructionAnalysisFinish(I: &I); |
2712 | using namespace ore; |
2713 | // If visiting this instruction detected an uninlinable pattern, abort.
2714 | InlineResult IR = InlineResult::success(); |
2715 | if (IsRecursiveCall && !AllowRecursiveCall) |
2716 | IR = InlineResult::failure(Reason: "recursive" ); |
2717 | else if (ExposesReturnsTwice) |
2718 | IR = InlineResult::failure(Reason: "exposes returns twice" ); |
2719 | else if (HasDynamicAlloca) |
2720 | IR = InlineResult::failure(Reason: "dynamic alloca" ); |
2721 | else if (HasIndirectBr) |
2722 | IR = InlineResult::failure(Reason: "indirect branch" ); |
2723 | else if (HasUninlineableIntrinsic) |
2724 | IR = InlineResult::failure(Reason: "uninlinable intrinsic" ); |
2725 | else if (InitsVargArgs) |
2726 | IR = InlineResult::failure(Reason: "varargs" ); |
2727 | if (!IR.isSuccess()) { |
2728 | if (ORE) |
2729 | ORE->emit(RemarkBuilder: [&]() { |
2730 | return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline" , |
2731 | &CandidateCall) |
2732 | << NV("Callee" , &F) << " has uninlinable pattern (" |
2733 | << NV("InlineResult" , IR.getFailureReason()) |
2734 | << ") and cost is not fully computed" ; |
2735 | }); |
2736 | return IR; |
2737 | } |
2738 | |
2739 | // If the caller is a recursive function then we don't want to inline |
2740 | // functions which allocate a lot of stack space because it would increase |
2741 | // the caller stack usage dramatically. |
2742 | if (IsCallerRecursive && AllocatedSize > RecurStackSizeThreshold) { |
2743 | auto IR = |
2744 | InlineResult::failure(Reason: "recursive and allocates too much stack space" ); |
2745 | if (ORE) |
2746 | ORE->emit(RemarkBuilder: [&]() { |
2747 | return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline" , |
2748 | &CandidateCall) |
2749 | << NV("Callee" , &F) << " is " |
2750 | << NV("InlineResult" , IR.getFailureReason()) |
2751 | << ". Cost is not fully computed" ; |
2752 | }); |
2753 | return IR; |
2754 | } |
2755 | |
2756 | if (shouldStop()) |
2757 | return InlineResult::failure( |
2758 | Reason: "Call site analysis is not favorable to inlining." ); |
2759 | } |
2760 | |
2761 | return InlineResult::success(); |
2762 | } |
2763 | |
2764 | /// Compute the base pointer and cumulative constant offsets for V. |
2765 | /// |
2766 | /// This strips all constant offsets off of V, leaving it the base pointer, and |
2767 | /// accumulates the total constant offset applied in the returned constant. It |
2768 | /// returns null if V is not a pointer, and returns the constant '0' if there
2769 | /// are no constant offsets applied.
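///
/// A hypothetical example: given
///   %p = getelementptr inbounds [4 x i32], ptr %base, i64 0, i64 3
/// this rewrites V from %p to %base and returns the index-typed constant 12
/// (3 * 4 bytes); a non-inbounds GEP or a non-constant index ends the walk
/// with a null result.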
2770 | ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) { |
2771 | if (!V->getType()->isPointerTy()) |
2772 | return nullptr; |
2773 | |
2774 | unsigned AS = V->getType()->getPointerAddressSpace(); |
2775 | unsigned IntPtrWidth = DL.getIndexSizeInBits(AS); |
2776 | APInt Offset = APInt::getZero(numBits: IntPtrWidth); |
2777 | |
2778 | // Even though we don't look through PHI nodes, we could be called on an |
2779 | // instruction in an unreachable block, which may be on a cycle. |
2780 | SmallPtrSet<Value *, 4> Visited; |
2781 | Visited.insert(Ptr: V); |
2782 | do { |
2783 | if (GEPOperator *GEP = dyn_cast<GEPOperator>(Val: V)) { |
2784 | if (!GEP->isInBounds() || !accumulateGEPOffset(GEP&: *GEP, Offset)) |
2785 | return nullptr; |
2786 | V = GEP->getPointerOperand(); |
2787 | } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(Val: V)) { |
2788 | if (GA->isInterposable()) |
2789 | break; |
2790 | V = GA->getAliasee(); |
2791 | } else { |
2792 | break; |
2793 | } |
2794 | assert(V->getType()->isPointerTy() && "Unexpected operand type!" ); |
2795 | } while (Visited.insert(Ptr: V).second); |
2796 | |
2797 | Type *IdxPtrTy = DL.getIndexType(PtrTy: V->getType()); |
2798 | return cast<ConstantInt>(Val: ConstantInt::get(Ty: IdxPtrTy, V: Offset)); |
2799 | } |
2800 | |
2801 | /// Find dead blocks due to deleted CFG edges during inlining. |
2802 | /// |
2803 | /// If we know the successor of the current block, \p CurrBB, has to be \p |
2804 | /// NextBB, the other successors of \p CurrBB are dead if these successors have |
2805 | /// no live incoming CFG edges. If one block is found to be dead, we can |
2806 | /// continue growing the dead block list by checking the successors of the dead |
2807 | /// blocks to see if all their incoming edges are dead or not. |
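///
/// A small hypothetical CFG sketch:
///
///   CurrBB --(known)--> NextBB
///      \----> S -------> T
///
/// Once CurrBB is known to branch to NextBB, S is dead if CurrBB was its only
/// live predecessor; and if S was in turn T's only live predecessor, T is
/// added to the dead set as the worklist grows.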
2808 | void CallAnalyzer::findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB) { |
2809 | auto IsEdgeDead = [&](BasicBlock *Pred, BasicBlock *Succ) { |
2810 | // A CFG edge is dead if the predecessor is dead or the predecessor has a
2811 | // known successor which is not the one being examined.
2812 | if (DeadBlocks.count(Ptr: Pred)) |
2813 | return true; |
2814 | BasicBlock *KnownSucc = KnownSuccessors[Pred]; |
2815 | return KnownSucc && KnownSucc != Succ; |
2816 | }; |
2817 | |
2818 | auto IsNewlyDead = [&](BasicBlock *BB) { |
2819 | // If all the edges to a block are dead, the block is also dead. |
2820 | return (!DeadBlocks.count(Ptr: BB) && |
2821 | llvm::all_of(Range: predecessors(BB), |
2822 | P: [&](BasicBlock *P) { return IsEdgeDead(P, BB); })); |
2823 | }; |
2824 | |
2825 | for (BasicBlock *Succ : successors(BB: CurrBB)) { |
2826 | if (Succ == NextBB || !IsNewlyDead(Succ)) |
2827 | continue; |
2828 | SmallVector<BasicBlock *, 4> NewDead; |
2829 | NewDead.push_back(Elt: Succ); |
2830 | while (!NewDead.empty()) { |
2831 | BasicBlock *Dead = NewDead.pop_back_val(); |
2832 | if (DeadBlocks.insert(Ptr: Dead).second) |
2833 | // Continue growing the dead block list.
2834 | for (BasicBlock *S : successors(BB: Dead)) |
2835 | if (IsNewlyDead(S)) |
2836 | NewDead.push_back(Elt: S); |
2837 | } |
2838 | } |
2839 | } |
2840 | |
2841 | /// Analyze a call site for potential inlining. |
2842 | /// |
2843 | /// Returns true if inlining this call is viable, and false if it is not |
2844 | /// viable. It computes the cost and adjusts the threshold based on numerous |
2845 | /// factors and heuristics. If this method returns false but the computed cost |
2846 | /// is below the computed threshold, then inlining was forcibly disabled by |
2847 | /// some artifact of the routine. |
2848 | InlineResult CallAnalyzer::analyze() { |
2849 | ++NumCallsAnalyzed; |
2850 | |
2851 | auto Result = onAnalysisStart(); |
2852 | if (!Result.isSuccess()) |
2853 | return Result; |
2854 | |
2855 | if (F.empty()) |
2856 | return InlineResult::success(); |
2857 | |
2858 | Function *Caller = CandidateCall.getFunction(); |
2859 | // Check if the caller function is recursive itself. |
2860 | for (User *U : Caller->users()) { |
2861 | CallBase *Call = dyn_cast<CallBase>(Val: U); |
2862 | if (Call && Call->getFunction() == Caller) { |
2863 | IsCallerRecursive = true; |
2864 | break; |
2865 | } |
2866 | } |
2867 | |
2868 | // Populate our simplified values by mapping from function arguments to call |
2869 | // arguments with known important simplifications. |
2870 | auto CAI = CandidateCall.arg_begin(); |
2871 | for (Argument &FAI : F.args()) { |
2872 | assert(CAI != CandidateCall.arg_end()); |
2873 | SimplifiedValues[&FAI] = *CAI; |
2874 | if (isa<Constant>(Val: *CAI)) |
2875 | ++NumConstantArgs; |
2876 | |
2877 | Value *PtrArg = *CAI; |
2878 | if (ConstantInt *C = stripAndComputeInBoundsConstantOffsets(V&: PtrArg)) { |
2879 | ConstantOffsetPtrs[&FAI] = std::make_pair(x&: PtrArg, y: C->getValue()); |
2880 | |
2881 | // We can SROA any pointer arguments derived from alloca instructions. |
2882 | if (auto *SROAArg = dyn_cast<AllocaInst>(Val: PtrArg)) { |
2883 | SROAArgValues[&FAI] = SROAArg; |
2884 | onInitializeSROAArg(Arg: SROAArg); |
2885 | EnabledSROAAllocas.insert(V: SROAArg); |
2886 | } |
2887 | } |
2888 | ++CAI; |
2889 | } |
2890 | NumConstantOffsetPtrArgs = ConstantOffsetPtrs.size(); |
2891 | NumAllocaArgs = SROAArgValues.size(); |
2892 | |
2893 | // Collecting the ephemeral values of `F` can be expensive, so use the |
2894 | // ephemeral values cache if available. |
2895 | SmallPtrSet<const Value *, 32> EphValuesStorage; |
2896 | const SmallPtrSetImpl<const Value *> *EphValues = &EphValuesStorage; |
2897 | if (GetEphValuesCache) |
2898 | EphValues = &GetEphValuesCache(F).ephValues(); |
2899 | else |
2900 | CodeMetrics::collectEphemeralValues(L: &F, AC: &GetAssumptionCache(F), |
2901 | EphValues&: EphValuesStorage); |
2902 | |
2903 | // The worklist of live basic blocks in the callee *after* inlining. We avoid |
2904 | // adding basic blocks of the callee which can be proven to be dead for this |
2905 | // particular call site in order to get more accurate cost estimates. This |
2906 | // requires a somewhat heavyweight iteration pattern: we need to walk the |
2907 | // basic blocks in a breadth-first order as we insert live successors. To |
2908 | // accomplish this, and because we expect only a small number of iterations
2909 | // (we exit once we cross the threshold), we use a small-size-optimized SetVector.
2910 | typedef SmallSetVector<BasicBlock *, 16> BBSetVector; |
2911 | BBSetVector BBWorklist; |
2912 | BBWorklist.insert(X: &F.getEntryBlock()); |
2913 | |
2914 | // Note that we *must not* cache the size, this loop grows the worklist. |
2915 | for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) { |
2916 | if (shouldStop()) |
2917 | break; |
2918 | |
2919 | BasicBlock *BB = BBWorklist[Idx]; |
2920 | if (BB->empty()) |
2921 | continue; |
2922 | |
2923 | onBlockStart(BB); |
2924 | |
2925 | // Disallow inlining a blockaddress with uses other than strictly callbr. |
2926 | // A blockaddress only has defined behavior for an indirect branch in the |
2927 | // same function, and we do not currently support inlining indirect |
2928 | // branches. But, the inliner may not see an indirect branch that ends up |
2929 | // being dead code at a particular call site. If the blockaddress escapes |
2930 | // the function, e.g., via a global variable, inlining may lead to an |
2931 | // invalid cross-function reference. |
2932 | // FIXME: pr/39560: continue relaxing this overt restriction. |
2933 | if (BB->hasAddressTaken()) |
2934 | for (User *U : BlockAddress::get(BB: &*BB)->users()) |
2935 | if (!isa<CallBrInst>(Val: *U)) |
2936 | return InlineResult::failure(Reason: "blockaddress used outside of callbr" ); |
2937 | |
2938 | // Analyze the cost of this block. If we blow through the threshold, this
2939 | // returns a failure, and we can bail out.
2940 | InlineResult IR = analyzeBlock(BB, EphValues: *EphValues); |
2941 | if (!IR.isSuccess()) |
2942 | return IR; |
2943 | |
2944 | Instruction *TI = BB->getTerminator(); |
2945 | |
2946 | // Add in the live successors by first checking whether we have a terminator
2947 | // that may be simplified based on the values simplified by this call.
2948 | if (BranchInst *BI = dyn_cast<BranchInst>(Val: TI)) { |
2949 | if (BI->isConditional()) { |
2950 | Value *Cond = BI->getCondition(); |
2951 | if (ConstantInt *SimpleCond = getSimplifiedValue<ConstantInt>(V: Cond)) { |
2952 | BasicBlock *NextBB = BI->getSuccessor(i: SimpleCond->isZero() ? 1 : 0); |
2953 | BBWorklist.insert(X: NextBB); |
2954 | KnownSuccessors[BB] = NextBB; |
2955 | findDeadBlocks(CurrBB: BB, NextBB); |
2956 | continue; |
2957 | } |
2958 | } |
2959 | } else if (SwitchInst *SI = dyn_cast<SwitchInst>(Val: TI)) { |
2960 | Value *Cond = SI->getCondition(); |
2961 | if (ConstantInt *SimpleCond = getSimplifiedValue<ConstantInt>(V: Cond)) { |
2962 | BasicBlock *NextBB = SI->findCaseValue(C: SimpleCond)->getCaseSuccessor(); |
2963 | BBWorklist.insert(X: NextBB); |
2964 | KnownSuccessors[BB] = NextBB; |
2965 | findDeadBlocks(CurrBB: BB, NextBB); |
2966 | continue; |
2967 | } |
2968 | } |
2969 | |
2970 | // If we're unable to select a particular successor, just count all of |
2971 | // them. |
2972 | BBWorklist.insert_range(R: successors(BB)); |
2973 | |
2974 | onBlockAnalyzed(BB); |
2975 | } |
2976 | |
2977 | // If this is a noduplicate call, we can still inline as long as |
2978 | // inlining this would cause the removal of the caller (so the instruction |
2979 | // is not actually duplicated, just moved). |
2980 | if (!isSoleCallToLocalFunction(CB: CandidateCall, Callee: F) && ContainsNoDuplicateCall) |
2981 | return InlineResult::failure(Reason: "noduplicate" ); |
2982 | |
2983 | // If the callee's stack size exceeds the user-specified threshold, |
2984 | // do not let it be inlined. |
2985 | // The command line option overrides a limit set in the function attributes. |
2986 | size_t FinalStackSizeThreshold = StackSizeThreshold; |
2987 | if (!StackSizeThreshold.getNumOccurrences()) |
2988 | if (std::optional<int> AttrMaxStackSize = getStringFnAttrAsInt( |
2989 | F: Caller, AttrKind: InlineConstants::MaxInlineStackSizeAttributeName)) |
2990 | FinalStackSizeThreshold = *AttrMaxStackSize; |
2991 | if (AllocatedSize > FinalStackSizeThreshold) |
2992 | return InlineResult::failure(Reason: "stacksize" ); |
2993 | |
2994 | return finalizeAnalysis(); |
2995 | } |
2996 | |
2997 | void InlineCostCallAnalyzer::print(raw_ostream &OS) { |
2998 | #define DEBUG_PRINT_STAT(x) OS << " " #x ": " << x << "\n" |
2999 | if (PrintInstructionComments) |
3000 | F.print(OS, AAW: &Writer); |
3001 | DEBUG_PRINT_STAT(NumConstantArgs); |
3002 | DEBUG_PRINT_STAT(NumConstantOffsetPtrArgs); |
3003 | DEBUG_PRINT_STAT(NumAllocaArgs); |
3004 | DEBUG_PRINT_STAT(NumConstantPtrCmps); |
3005 | DEBUG_PRINT_STAT(NumConstantPtrDiffs); |
3006 | DEBUG_PRINT_STAT(NumInstructionsSimplified); |
3007 | DEBUG_PRINT_STAT(NumInstructions); |
3008 | DEBUG_PRINT_STAT(SROACostSavings); |
3009 | DEBUG_PRINT_STAT(SROACostSavingsLost); |
3010 | DEBUG_PRINT_STAT(LoadEliminationCost); |
3011 | DEBUG_PRINT_STAT(ContainsNoDuplicateCall); |
3012 | DEBUG_PRINT_STAT(Cost); |
3013 | DEBUG_PRINT_STAT(Threshold); |
3014 | #undef DEBUG_PRINT_STAT |
3015 | } |
3016 | |
3017 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
3018 | /// Dump stats about this call's analysis. |
3019 | LLVM_DUMP_METHOD void InlineCostCallAnalyzer::dump() { print(dbgs()); } |
3020 | #endif |
3021 | |
3022 | /// Test that there are no attribute conflicts between Caller and Callee |
3023 | /// that prevent inlining. |
3024 | static bool functionsHaveCompatibleAttributes( |
3025 | Function *Caller, Function *Callee, TargetTransformInfo &TTI, |
3026 | function_ref<const TargetLibraryInfo &(Function &)> &GetTLI) { |
3027 | // Note that CalleeTLI must be a copy not a reference. The legacy pass manager |
3028 | // caches the most recently created TLI in the TargetLibraryInfoWrapperPass |
3029 | // object, and always returns the same object (which is overwritten on each |
3030 | // GetTLI call). Therefore we copy the first result. |
3031 | auto CalleeTLI = GetTLI(*Callee); |
3032 | return (IgnoreTTIInlineCompatible || |
3033 | TTI.areInlineCompatible(Caller, Callee)) && |
3034 | GetTLI(*Caller).areInlineCompatible(CalleeTLI, |
3035 | AllowCallerSuperset: InlineCallerSupersetNoBuiltin) && |
3036 | AttributeFuncs::areInlineCompatible(Caller: *Caller, Callee: *Callee); |
3037 | } |
3038 | |
3039 | int llvm::getCallsiteCost(const TargetTransformInfo &TTI, const CallBase &Call, |
3040 | const DataLayout &DL) { |
3041 | int64_t Cost = 0; |
3042 | for (unsigned I = 0, E = Call.arg_size(); I != E; ++I) { |
3043 | if (Call.isByValArgument(ArgNo: I)) { |
3044 | // We approximate the number of loads and stores needed by dividing the |
3045 | // size of the byval type by the target's pointer size. |
3046 | PointerType *PTy = cast<PointerType>(Val: Call.getArgOperand(i: I)->getType()); |
3047 | unsigned TypeSize = DL.getTypeSizeInBits(Ty: Call.getParamByValType(ArgNo: I)); |
3048 | unsigned AS = PTy->getAddressSpace(); |
3049 | unsigned PointerSize = DL.getPointerSizeInBits(AS); |
3050 | // Ceiling division. |
3051 | unsigned NumStores = (TypeSize + PointerSize - 1) / PointerSize; |
3052 | |
3053 | // If it generates more than 8 stores it is likely to be expanded as an |
3054 | // inline memcpy so we take that as an upper bound. Otherwise we assume |
3055 | // one load and one store per word copied. |
3056 | // FIXME: The maxStoresPerMemcpy setting from the target should be used |
3057 | // here instead of a magic number of 8, but it's not available via |
3058 | // DataLayout. |
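// Worked example (illustrative numbers only): a 256-bit byval struct on a
// target with 64-bit pointers gives NumStores = ceil(256 / 64) = 4, which
// is under the cap of 8, so this argument contributes 2 * 4 * InstrCost
// (four loads plus four stores).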
3059 | NumStores = std::min(a: NumStores, b: 8U); |
3060 | |
3061 | Cost += 2 * NumStores * InstrCost; |
3062 | } else { |
3063 | // For non-byval arguments subtract off one instruction per call |
3064 | // argument. |
3065 | Cost += InstrCost; |
3066 | } |
3067 | } |
3068 | // The call instruction also disappears after inlining. |
3069 | Cost += InstrCost; |
3070 | Cost += TTI.getInlineCallPenalty(F: Call.getCaller(), Call, DefaultCallPenalty: CallPenalty); |
3071 | |
3072 | return std::min<int64_t>(a: Cost, INT_MAX); |
3073 | } |
3074 | |
3075 | InlineCost llvm::getInlineCost( |
3076 | CallBase &Call, const InlineParams &Params, TargetTransformInfo &CalleeTTI, |
3077 | function_ref<AssumptionCache &(Function &)> GetAssumptionCache, |
3078 | function_ref<const TargetLibraryInfo &(Function &)> GetTLI, |
3079 | function_ref<BlockFrequencyInfo &(Function &)> GetBFI, |
3080 | ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE, |
3081 | function_ref<EphemeralValuesCache &(Function &)> GetEphValuesCache) { |
3082 | return getInlineCost(Call, Callee: Call.getCalledFunction(), Params, CalleeTTI, |
3083 | GetAssumptionCache, GetTLI, GetBFI, PSI, ORE, |
3084 | GetEphValuesCache); |
3085 | } |
3086 | |
3087 | std::optional<int> llvm::getInliningCostEstimate( |
3088 | CallBase &Call, TargetTransformInfo &CalleeTTI, |
3089 | function_ref<AssumptionCache &(Function &)> GetAssumptionCache, |
3090 | function_ref<BlockFrequencyInfo &(Function &)> GetBFI, |
3091 | function_ref<const TargetLibraryInfo &(Function &)> GetTLI, |
3092 | ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) { |
3093 | const InlineParams Params = {/* DefaultThreshold*/ 0, |
3094 | /*HintThreshold*/ {}, |
3095 | /*ColdThreshold*/ {}, |
3096 | /*OptSizeThreshold*/ {}, |
3097 | /*OptMinSizeThreshold*/ {}, |
3098 | /*HotCallSiteThreshold*/ {}, |
3099 | /*LocallyHotCallSiteThreshold*/ {}, |
3100 | /*ColdCallSiteThreshold*/ {}, |
3101 | /*ComputeFullInlineCost*/ true, |
3102 | /*EnableDeferral*/ true}; |
3103 | |
3104 | InlineCostCallAnalyzer CA(*Call.getCalledFunction(), Call, Params, CalleeTTI, |
3105 | GetAssumptionCache, GetBFI, GetTLI, PSI, ORE, true, |
3106 | /*IgnoreThreshold*/ true); |
3107 | auto R = CA.analyze(); |
3108 | if (!R.isSuccess()) |
3109 | return std::nullopt; |
3110 | return CA.getCost(); |
3111 | } |
3112 | |
3113 | std::optional<InlineCostFeatures> llvm::getInliningCostFeatures( |
3114 | CallBase &Call, TargetTransformInfo &CalleeTTI, |
3115 | function_ref<AssumptionCache &(Function &)> GetAssumptionCache, |
3116 | function_ref<BlockFrequencyInfo &(Function &)> GetBFI, |
3117 | function_ref<const TargetLibraryInfo &(Function &)> GetTLI, |
3118 | ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) { |
3119 | InlineCostFeaturesAnalyzer CFA(CalleeTTI, GetAssumptionCache, GetBFI, GetTLI, |
3120 | PSI, ORE, *Call.getCalledFunction(), Call); |
3121 | auto R = CFA.analyze(); |
3122 | if (!R.isSuccess()) |
3123 | return std::nullopt; |
3124 | return CFA.features(); |
3125 | } |
3126 | |
3127 | std::optional<InlineResult> llvm::getAttributeBasedInliningDecision( |
3128 | CallBase &Call, Function *Callee, TargetTransformInfo &CalleeTTI, |
3129 | function_ref<const TargetLibraryInfo &(Function &)> GetTLI) { |
3130 | |
3131 | // Cannot inline indirect calls. |
3132 | if (!Callee) |
3133 | return InlineResult::failure(Reason: "indirect call" ); |
3134 | |
3135 | // If a callee coroutine function is inlined into a caller coroutine function
3136 | // before the coro-split pass,
3137 | // the coro-early pass cannot handle it well.
3138 | // So we don't inline a coroutine function that has not yet been split.
3139 | if (Callee->isPresplitCoroutine()) |
3140 | return InlineResult::failure(Reason: "unsplited coroutine call" ); |
3141 | |
3142 | // Never inline calls with byval arguments that do not have the alloca
3143 | // address space. Since byval arguments can be replaced with a copy to an |
3144 | // alloca, the inlined code would need to be adjusted to handle that the |
3145 | // argument is in the alloca address space (so it is a little bit complicated |
3146 | // to solve). |
3147 | unsigned AllocaAS = Callee->getDataLayout().getAllocaAddrSpace(); |
3148 | for (unsigned I = 0, E = Call.arg_size(); I != E; ++I) |
3149 | if (Call.isByValArgument(ArgNo: I)) { |
3150 | PointerType *PTy = cast<PointerType>(Val: Call.getArgOperand(i: I)->getType()); |
3151 | if (PTy->getAddressSpace() != AllocaAS) |
3152 | return InlineResult::failure(Reason: "byval arguments without alloca" |
3153 | " address space" ); |
3154 | } |
3155 | |
3156 | // Calls to functions with always-inline attributes should be inlined |
3157 | // whenever possible. |
3158 | if (Call.hasFnAttr(Kind: Attribute::AlwaysInline)) { |
3159 | if (Call.getAttributes().hasFnAttr(Kind: Attribute::NoInline)) |
3160 | return InlineResult::failure(Reason: "noinline call site attribute" ); |
3161 | |
3162 | auto IsViable = isInlineViable(Callee&: *Callee); |
3163 | if (IsViable.isSuccess()) |
3164 | return InlineResult::success(); |
3165 | return InlineResult::failure(Reason: IsViable.getFailureReason()); |
3166 | } |
3167 | |
3168 | // Never inline functions with conflicting attributes (unless callee has |
3169 | // always-inline attribute). |
3170 | Function *Caller = Call.getCaller(); |
3171 | if (!functionsHaveCompatibleAttributes(Caller, Callee, TTI&: CalleeTTI, GetTLI)) |
3172 | return InlineResult::failure(Reason: "conflicting attributes" ); |
3173 | |
3174 | // Don't inline this call if the caller has the optnone attribute. |
3175 | if (Caller->hasOptNone()) |
3176 | return InlineResult::failure(Reason: "optnone attribute" ); |
3177 | |
3178 | // Don't inline a function that treats null pointer as valid into a caller |
3179 | // that does not have this attribute. |
3180 | if (!Caller->nullPointerIsDefined() && Callee->nullPointerIsDefined()) |
3181 | return InlineResult::failure(Reason: "nullptr definitions incompatible" ); |
3182 | |
3183 | // Don't inline functions which can be interposed at link-time. |
3184 | if (Callee->isInterposable()) |
3185 | return InlineResult::failure(Reason: "interposable" ); |
3186 | |
3187 | // Don't inline functions marked noinline. |
3188 | if (Callee->hasFnAttribute(Kind: Attribute::NoInline)) |
3189 | return InlineResult::failure(Reason: "noinline function attribute" ); |
3190 | |
3191 | // Don't inline call sites marked noinline. |
3192 | if (Call.isNoInline()) |
3193 | return InlineResult::failure(Reason: "noinline call site attribute" ); |
3194 | |
3195 | // Don't inline functions that are loader replaceable. |
3196 | if (Callee->hasFnAttribute(Kind: "loader-replaceable" )) |
3197 | return InlineResult::failure(Reason: "loader replaceable function attribute" ); |
3198 | |
3199 | return std::nullopt; |
3200 | } |
3201 | |
3202 | InlineCost llvm::getInlineCost( |
3203 | CallBase &Call, Function *Callee, const InlineParams &Params, |
3204 | TargetTransformInfo &CalleeTTI, |
3205 | function_ref<AssumptionCache &(Function &)> GetAssumptionCache, |
3206 | function_ref<const TargetLibraryInfo &(Function &)> GetTLI, |
3207 | function_ref<BlockFrequencyInfo &(Function &)> GetBFI, |
3208 | ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE, |
3209 | function_ref<EphemeralValuesCache &(Function &)> GetEphValuesCache) { |
3210 | |
3211 | auto UserDecision = |
3212 | llvm::getAttributeBasedInliningDecision(Call, Callee, CalleeTTI, GetTLI); |
3213 | |
3214 | if (UserDecision) { |
3215 | if (UserDecision->isSuccess()) |
3216 | return llvm::InlineCost::getAlways(Reason: "always inline attribute" ); |
3217 | return llvm::InlineCost::getNever(Reason: UserDecision->getFailureReason()); |
3218 | } |
3219 | |
3220 | LLVM_DEBUG(llvm::dbgs() << " Analyzing call of " << Callee->getName() |
3221 | << "... (caller:" << Call.getCaller()->getName() |
3222 | << ")\n" ); |
3223 | |
3224 | InlineCostCallAnalyzer CA(*Callee, Call, Params, CalleeTTI, |
3225 | GetAssumptionCache, GetBFI, GetTLI, PSI, ORE, |
3226 | /*BoostIndirect=*/true, /*IgnoreThreshold=*/false, |
3227 | GetEphValuesCache); |
3228 | InlineResult ShouldInline = CA.analyze(); |
3229 | |
3230 | LLVM_DEBUG(CA.dump()); |
3231 | |
3232 | // Always make the cost-benefit-based decision explicit.
3233 | // We use always/never here since the threshold is not meaningful,
3234 | // as it is not what drives the cost-benefit analysis.
3235 | if (CA.wasDecidedByCostBenefit()) { |
3236 | if (ShouldInline.isSuccess()) |
3237 | return InlineCost::getAlways(Reason: "benefit over cost" , |
3238 | CostBenefit: CA.getCostBenefitPair()); |
3239 | else |
3240 | return InlineCost::getNever(Reason: "cost over benefit" , CostBenefit: CA.getCostBenefitPair()); |
3241 | } |
3242 | |
3243 | if (CA.wasDecidedByCostThreshold()) |
3244 | return InlineCost::get(Cost: CA.getCost(), Threshold: CA.getThreshold(), |
3245 | StaticBonus: CA.getStaticBonusApplied()); |
3246 | |
3247 | // No details on how the decision was made, simply return always or never. |
3248 | return ShouldInline.isSuccess() |
3249 | ? InlineCost::getAlways(Reason: "empty function" ) |
3250 | : InlineCost::getNever(Reason: ShouldInline.getFailureReason()); |
3251 | } |
3252 | |
3253 | InlineResult llvm::isInlineViable(Function &F) { |
3254 | bool ReturnsTwice = F.hasFnAttribute(Kind: Attribute::ReturnsTwice); |
3255 | for (BasicBlock &BB : F) { |
3256 | // Disallow inlining of functions which contain indirect branches. |
3257 | if (isa<IndirectBrInst>(Val: BB.getTerminator())) |
3258 | return InlineResult::failure(Reason: "contains indirect branches" ); |
3259 | |
3260 | // Disallow inlining of blockaddresses which are used by non-callbr |
3261 | // instructions. |
3262 | if (BB.hasAddressTaken()) |
3263 | for (User *U : BlockAddress::get(BB: &BB)->users()) |
3264 | if (!isa<CallBrInst>(Val: *U)) |
3265 | return InlineResult::failure(Reason: "blockaddress used outside of callbr" ); |
3266 | |
3267 | for (auto &II : BB) { |
3268 | CallBase *Call = dyn_cast<CallBase>(Val: &II); |
3269 | if (!Call) |
3270 | continue; |
3271 | |
3272 | // Disallow recursive calls. |
3273 | Function *Callee = Call->getCalledFunction(); |
3274 | if (&F == Callee) |
3275 | return InlineResult::failure(Reason: "recursive call" ); |
3276 | |
3277 | // Disallow calls which expose returns-twice to a function not previously |
3278 | // attributed as such. |
3279 | if (!ReturnsTwice && isa<CallInst>(Val: Call) && |
3280 | cast<CallInst>(Val: Call)->canReturnTwice()) |
3281 | return InlineResult::failure(Reason: "exposes returns-twice attribute" ); |
3282 | |
3283 | if (Callee) |
3284 | switch (Callee->getIntrinsicID()) { |
3285 | default: |
3286 | break; |
3287 | case llvm::Intrinsic::icall_branch_funnel: |
3288 | // Disallow inlining of @llvm.icall.branch.funnel because the current
3289 | // backend can't separate call targets from call arguments.
3290 | return InlineResult::failure( |
3291 | Reason: "disallowed inlining of @llvm.icall.branch.funnel" ); |
3292 | case llvm::Intrinsic::localescape: |
3293 | // Disallow inlining functions that call @llvm.localescape. Doing this |
3294 | // correctly would require major changes to the inliner. |
3295 | return InlineResult::failure( |
3296 | Reason: "disallowed inlining of @llvm.localescape" ); |
3297 | case llvm::Intrinsic::vastart: |
3298 | // Disallow inlining of functions that initialize VarArgs with |
3299 | // va_start. |
3300 | return InlineResult::failure( |
3301 | Reason: "contains VarArgs initialized with va_start" ); |
3302 | } |
3303 | } |
3304 | } |
3305 | |
3306 | return InlineResult::success(); |
3307 | } |
3308 | |
3309 | // APIs to create InlineParams based on command line flags and/or other |
3310 | // parameters. |
3311 | |
3312 | InlineParams llvm::getInlineParams(int Threshold) { |
3313 | InlineParams Params; |
3314 | |
3315 | // This field is the threshold to use for a callee by default. This is |
3316 | // derived from one or more of: |
3317 | // * optimization or size-optimization levels, |
3318 | // * a value passed to createFunctionInliningPass function, or |
3319 | // * the -inline-threshold flag. |
3320 | // If the -inline-threshold flag is explicitly specified, that is used |
3321 | // irrespective of anything else. |
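// For example (hypothetical invocation):
//   opt -passes='default<O2>' -inline-threshold=500 in.ll -S
// would make 500 the default threshold here regardless of the value computed
// from the optimization level.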
3322 | if (InlineThreshold.getNumOccurrences() > 0) |
3323 | Params.DefaultThreshold = InlineThreshold; |
3324 | else |
3325 | Params.DefaultThreshold = Threshold; |
3326 | |
3327 | // Set the HintThreshold knob from the -inlinehint-threshold. |
3328 | Params.HintThreshold = HintThreshold; |
3329 | |
3330 | // Set the HotCallSiteThreshold knob from the -hot-callsite-threshold. |
3331 | Params.HotCallSiteThreshold = HotCallSiteThreshold; |
3332 | |
3333 | // If the -locally-hot-callsite-threshold is explicitly specified, use it to |
3334 | // populate LocallyHotCallSiteThreshold. Later, we populate |
3335 | // Params.LocallyHotCallSiteThreshold from -locally-hot-callsite-threshold if |
3336 | // we know that optimization level is O3 (in the getInlineParams variant that |
3337 | // takes the opt and size levels). |
3338 | // FIXME: Remove this check (and make the assignment unconditional) after |
3339 | // addressing size regression issues at O2. |
3340 | if (LocallyHotCallSiteThreshold.getNumOccurrences() > 0) |
3341 | Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold; |
3342 | |
3343 | // Set the ColdCallSiteThreshold knob from the |
3344 | // -inline-cold-callsite-threshold. |
3345 | Params.ColdCallSiteThreshold = ColdCallSiteThreshold; |
3346 | |
3347 | // Set the OptMinSizeThreshold and OptSizeThreshold params only if the |
3348 | // -inline-threshold commandline option is not explicitly given. If that
3349 | // option is present, then its value applies even for callees with size and |
3350 | // minsize attributes. |
3351 | // If the -inline-threshold is not specified, set the ColdThreshold from the |
3352 | // -inlinecold-threshold even if it is not explicitly passed. If |
3353 | // -inline-threshold is specified, then -inlinecold-threshold needs to be |
3354 | // explicitly specified to set the ColdThreshold knob.
3355 | if (InlineThreshold.getNumOccurrences() == 0) { |
3356 | Params.OptMinSizeThreshold = InlineConstants::OptMinSizeThreshold; |
3357 | Params.OptSizeThreshold = InlineConstants::OptSizeThreshold; |
3358 | Params.ColdThreshold = ColdThreshold; |
3359 | } else if (ColdThreshold.getNumOccurrences() > 0) { |
3360 | Params.ColdThreshold = ColdThreshold; |
3361 | } |
3362 | return Params; |
3363 | } |
3364 | |
3365 | InlineParams llvm::getInlineParams() { |
3366 | return getInlineParams(Threshold: DefaultThreshold); |
3367 | } |
3368 | |
3369 | // Compute the default threshold for inlining based on the opt level and the |
3370 | // size opt level. |
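// For example, computeThresholdFromOptLevels(/*OptLevel=*/2, /*SizeOptLevel=*/1)
// (-Os) yields InlineConstants::OptSizeThreshold, while any OptLevel above 2
// returns OptAggressiveThreshold regardless of the size level.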
3371 | static int computeThresholdFromOptLevels(unsigned OptLevel, |
3372 | unsigned SizeOptLevel) { |
3373 | if (OptLevel > 2) |
3374 | return InlineConstants::OptAggressiveThreshold; |
3375 | if (SizeOptLevel == 1) // -Os |
3376 | return InlineConstants::OptSizeThreshold; |
3377 | if (SizeOptLevel == 2) // -Oz |
3378 | return InlineConstants::OptMinSizeThreshold; |
3379 | return DefaultThreshold; |
3380 | } |
3381 | |
3382 | InlineParams llvm::getInlineParams(unsigned OptLevel, unsigned SizeOptLevel) { |
3383 | auto Params = |
3384 | getInlineParams(Threshold: computeThresholdFromOptLevels(OptLevel, SizeOptLevel)); |
3385 | // At O3, use the value of -locally-hot-callsite-threshold option to populate |
3386 | // Params.LocallyHotCallSiteThreshold. Below O3, this flag has effect only |
3387 | // when it is specified explicitly. |
3388 | if (OptLevel > 2) |
3389 | Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold; |
3390 | return Params; |
3391 | } |
3392 | |
3393 | PreservedAnalyses |
3394 | InlineCostAnnotationPrinterPass::run(Function &F, |
3395 | FunctionAnalysisManager &FAM) { |
3396 | PrintInstructionComments = true; |
3397 | std::function<AssumptionCache &(Function &)> GetAssumptionCache = |
3398 | [&](Function &F) -> AssumptionCache & { |
3399 | return FAM.getResult<AssumptionAnalysis>(IR&: F); |
3400 | }; |
3401 | |
3402 | auto &MAMProxy = FAM.getResult<ModuleAnalysisManagerFunctionProxy>(IR&: F); |
3403 | ProfileSummaryInfo *PSI = |
3404 | MAMProxy.getCachedResult<ProfileSummaryAnalysis>(IR&: *F.getParent()); |
3405 | const TargetTransformInfo &TTI = FAM.getResult<TargetIRAnalysis>(IR&: F); |
3406 | |
3407 | // FIXME: Redesign the usage of InlineParams to expand the scope of this pass. |
3408 | // In the current implementation, the type of InlineParams doesn't matter as |
3409 | // the pass serves only for verification of inliner's decisions. |
3410 | // We can add a flag which determines InlineParams for this run. Right now, |
3411 | // the default InlineParams are used. |
3412 | const InlineParams Params = llvm::getInlineParams(); |
3413 | for (BasicBlock &BB : F) { |
3414 | for (Instruction &I : BB) { |
3415 | if (auto *CB = dyn_cast<CallBase>(Val: &I)) { |
3416 | Function *CalledFunction = CB->getCalledFunction(); |
3417 | if (!CalledFunction || CalledFunction->isDeclaration()) |
3418 | continue; |
3419 | OptimizationRemarkEmitter ORE(CalledFunction); |
3420 | InlineCostCallAnalyzer ICCA(*CalledFunction, *CB, Params, TTI, |
3421 | GetAssumptionCache, nullptr, nullptr, PSI, |
3422 | &ORE); |
3423 | ICCA.analyze(); |
3424 | OS << " Analyzing call of " << CalledFunction->getName() |
3425 | << "... (caller:" << CB->getCaller()->getName() << ")\n" ; |
3426 | ICCA.print(OS); |
3427 | OS << "\n" ; |
3428 | } |
3429 | } |
3430 | } |
3431 | return PreservedAnalyses::all(); |
3432 | } |
3433 | |