//===- InlineCost.cpp - Cost analysis for inliner -------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements inline cost analysis.
//
//===----------------------------------------------------------------------===//

#include "llvm/Analysis/InlineCost.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/DomConditionCache.h"
#include "llvm/Analysis/EphemeralValuesCache.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/IR/AssemblyAnnotationWriter.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/FormattedStream.h"
#include "llvm/Support/raw_ostream.h"
#include <climits>
#include <limits>
#include <optional>

using namespace llvm;

#define DEBUG_TYPE "inline-cost"

STATISTIC(NumCallsAnalyzed, "Number of call sites analyzed");

static cl::opt<int>
    DefaultThreshold("inlinedefault-threshold", cl::Hidden, cl::init(225),
                     cl::desc("Default amount of inlining to perform"));

// We introduce this option since there is a minor compile-time win by avoiding
// addition of TTI attributes (target-features in particular) to inline
// candidates when they are guaranteed to be the same as top level methods in
// some use cases. If we avoid adding the attribute, we need an option to avoid
// checking these attributes.
static cl::opt<bool> IgnoreTTIInlineCompatible(
    "ignore-tti-inline-compatible", cl::Hidden, cl::init(false),
    cl::desc("Ignore TTI attributes compatibility check between callee/caller "
             "during inline cost calculation"));

static cl::opt<bool> PrintInstructionComments(
    "print-instruction-comments", cl::Hidden, cl::init(false),
    cl::desc("Prints comments for instruction based on inline cost analysis"));

static cl::opt<int> InlineThreshold(
    "inline-threshold", cl::Hidden, cl::init(225),
    cl::desc("Control the amount of inlining to perform (default = 225)"));

static cl::opt<int> HintThreshold(
    "inlinehint-threshold", cl::Hidden, cl::init(325),
    cl::desc("Threshold for inlining functions with inline hint"));

static cl::opt<int>
    ColdCallSiteThreshold("inline-cold-callsite-threshold", cl::Hidden,
                          cl::init(45),
                          cl::desc("Threshold for inlining cold callsites"));

static cl::opt<bool> InlineEnableCostBenefitAnalysis(
    "inline-enable-cost-benefit-analysis", cl::Hidden, cl::init(false),
    cl::desc("Enable the cost-benefit analysis for the inliner"));

// InlineSavingsMultiplier overrides per TTI multipliers iff it is
// specified explicitly in command line options. This option is exposed
// for tuning and testing.
static cl::opt<int> InlineSavingsMultiplier(
    "inline-savings-multiplier", cl::Hidden, cl::init(8),
    cl::desc("Multiplier to multiply cycle savings by during inlining"));

// InlineSavingsProfitableMultiplier overrides per TTI multipliers iff it is
// specified explicitly in command line options. This option is exposed
// for tuning and testing.
static cl::opt<int> InlineSavingsProfitableMultiplier(
    "inline-savings-profitable-multiplier", cl::Hidden, cl::init(4),
    cl::desc("A multiplier on top of cycle savings to decide whether the "
             "savings won't justify the cost"));

static cl::opt<int>
    InlineSizeAllowance("inline-size-allowance", cl::Hidden, cl::init(100),
                        cl::desc("The maximum size of a callee that gets "
                                 "inlined without sufficient cycle savings"));

// We introduce this threshold to help performance of instrumentation based
// PGO before we actually hook up inliner with analysis passes such as BPI and
// BFI.
static cl::opt<int> ColdThreshold(
    "inlinecold-threshold", cl::Hidden, cl::init(45),
    cl::desc("Threshold for inlining functions with cold attribute"));

static cl::opt<int>
    HotCallSiteThreshold("hot-callsite-threshold", cl::Hidden, cl::init(3000),
                         cl::desc("Threshold for hot callsites"));

static cl::opt<int> LocallyHotCallSiteThreshold(
    "locally-hot-callsite-threshold", cl::Hidden, cl::init(525),
    cl::desc("Threshold for locally hot callsites"));

static cl::opt<int> ColdCallSiteRelFreq(
    "cold-callsite-rel-freq", cl::Hidden, cl::init(2),
    cl::desc("Maximum block frequency, expressed as a percentage of caller's "
             "entry frequency, for a callsite to be cold in the absence of "
             "profile information."));

static cl::opt<uint64_t> HotCallSiteRelFreq(
    "hot-callsite-rel-freq", cl::Hidden, cl::init(60),
    cl::desc("Minimum block frequency, expressed as a multiple of caller's "
             "entry frequency, for a callsite to be hot in the absence of "
             "profile information."));

static cl::opt<int>
    InstrCost("inline-instr-cost", cl::Hidden, cl::init(5),
              cl::desc("Cost of a single instruction when inlining"));

static cl::opt<int> InlineAsmInstrCost(
    "inline-asm-instr-cost", cl::Hidden, cl::init(0),
    cl::desc("Cost of a single inline asm instruction when inlining"));

static cl::opt<int>
    MemAccessCost("inline-memaccess-cost", cl::Hidden, cl::init(0),
                  cl::desc("Cost of load/store instruction when inlining"));

static cl::opt<int> CallPenalty(
    "inline-call-penalty", cl::Hidden, cl::init(25),
    cl::desc("Call penalty that is applied per callsite when inlining"));

static cl::opt<size_t>
    StackSizeThreshold("inline-max-stacksize", cl::Hidden,
                       cl::init(std::numeric_limits<size_t>::max()),
                       cl::desc("Do not inline functions with a stack size "
                                "that exceeds the specified limit"));

static cl::opt<size_t> RecurStackSizeThreshold(
    "recursive-inline-max-stacksize", cl::Hidden,
    cl::init(InlineConstants::TotalAllocaSizeRecursiveCaller),
    cl::desc("Do not inline recursive functions with a stack "
             "size that exceeds the specified limit"));

static cl::opt<bool> OptComputeFullInlineCost(
    "inline-cost-full", cl::Hidden,
    cl::desc("Compute the full inline cost of a call site even when the cost "
             "exceeds the threshold."));

static cl::opt<bool> InlineCallerSupersetNoBuiltin(
    "inline-caller-superset-nobuiltin", cl::Hidden, cl::init(true),
    cl::desc("Allow inlining when caller has a superset of callee's nobuiltin "
             "attributes."));

static cl::opt<bool> DisableGEPConstOperand(
    "disable-gep-const-evaluation", cl::Hidden, cl::init(false),
    cl::desc("Disables evaluation of GetElementPtr with constant operands"));

static cl::opt<bool> InlineAllViableCalls(
    "inline-all-viable-calls", cl::Hidden, cl::init(false),
    cl::desc("Inline all viable calls, even if they exceed the inlining "
             "threshold"));

namespace llvm {
std::optional<int> getStringFnAttrAsInt(const Attribute &Attr) {
  if (Attr.isValid()) {
    int AttrValue = 0;
    if (!Attr.getValueAsString().getAsInteger(10, AttrValue))
      return AttrValue;
  }
  return std::nullopt;
}

std::optional<int> getStringFnAttrAsInt(CallBase &CB, StringRef AttrKind) {
  return getStringFnAttrAsInt(CB.getFnAttr(AttrKind));
}

std::optional<int> getStringFnAttrAsInt(Function *F, StringRef AttrKind) {
  return getStringFnAttrAsInt(F->getFnAttribute(AttrKind));
}
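
// For example, a call site carrying the string attribute
// "function-inline-cost"="42" (queried later in this file) yields
// std::optional<int>(42) here, while a missing or non-integer attribute
// yields std::nullopt, since StringRef::getAsInteger returns true on failure.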

namespace InlineConstants {
int getInstrCost() { return InstrCost; }

} // namespace InlineConstants

} // namespace llvm

namespace {
class InlineCostCallAnalyzer;

// This struct is used to store information about the inline cost of a
// particular instruction.
struct InstructionCostDetail {
  int CostBefore = 0;
  int CostAfter = 0;
  int ThresholdBefore = 0;
  int ThresholdAfter = 0;

  int getThresholdDelta() const { return ThresholdAfter - ThresholdBefore; }

  int getCostDelta() const { return CostAfter - CostBefore; }

  bool hasThresholdChanged() const { return ThresholdAfter != ThresholdBefore; }
};
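
// For example, if analyzing an instruction adds the default CallPenalty of 25
// to Cost and leaves Threshold untouched, getCostDelta() returns 25 and
// hasThresholdChanged() returns false for that instruction's record.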

class InlineCostAnnotationWriter : public AssemblyAnnotationWriter {
private:
  InlineCostCallAnalyzer *const ICCA;

public:
  InlineCostAnnotationWriter(InlineCostCallAnalyzer *ICCA) : ICCA(ICCA) {}
  void emitInstructionAnnot(const Instruction *I,
                            formatted_raw_ostream &OS) override;
};

/// Carry out call site analysis, in order to evaluate inlinability.
/// NOTE: the type is currently used as implementation detail of functions such
/// as llvm::getInlineCost. Note the function_ref constructor parameters - the
/// expectation is that they come from the outer scope, from the wrapper
/// functions. If we want to support constructing CallAnalyzer objects where
/// lambdas are provided inline at construction, or where the object needs to
/// otherwise survive past the scope of the provided functions, we need to
/// revisit the argument types.
class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
  typedef InstVisitor<CallAnalyzer, bool> Base;
  friend class InstVisitor<CallAnalyzer, bool>;

protected:
  virtual ~CallAnalyzer() = default;
  /// The TargetTransformInfo available for this compilation.
  const TargetTransformInfo &TTI;

  /// Getter for the cache of @llvm.assume intrinsics.
  function_ref<AssumptionCache &(Function &)> GetAssumptionCache;

  /// Getter for BlockFrequencyInfo.
  function_ref<BlockFrequencyInfo &(Function &)> GetBFI;

  /// Getter for TargetLibraryInfo.
  function_ref<const TargetLibraryInfo &(Function &)> GetTLI;

  /// Profile summary information.
  ProfileSummaryInfo *PSI;

  /// The called function.
  Function &F;

  /// Cache the DataLayout since we use it a lot.
  const DataLayout &DL;

  /// The OptimizationRemarkEmitter available for this compilation.
  OptimizationRemarkEmitter *ORE;

  /// The candidate callsite being analyzed. Please do not use this to do
  /// analysis in the caller function; we want the inline cost query to be
  /// easily cacheable. Instead, use the cover function paramHasAttr.
  CallBase &CandidateCall;

  /// Getter for the cache of ephemeral values.
  function_ref<EphemeralValuesCache &(Function &)> GetEphValuesCache = nullptr;

  /// Extension points for handling callsite features.
  /// Called before a basic block is analyzed.
  virtual void onBlockStart(const BasicBlock *BB) {}

  /// Called after a basic block was analyzed.
  virtual void onBlockAnalyzed(const BasicBlock *BB) {}

  /// Called before an instruction is analyzed.
  virtual void onInstructionAnalysisStart(const Instruction *I) {}

  /// Called after an instruction was analyzed.
  virtual void onInstructionAnalysisFinish(const Instruction *I) {}

  /// Called at the end of the analysis of the callsite. Return the outcome of
  /// the analysis, i.e. 'InlineResult(true)' if the inlining may happen, or
  /// the reason it can't.
  virtual InlineResult finalizeAnalysis() { return InlineResult::success(); }

  /// Called when we're about to start processing a basic block, and every time
  /// we are done processing an instruction. Return true if there is no point in
  /// continuing the analysis (e.g. we've already determined the call site is
  /// too expensive to inline).
  virtual bool shouldStop() { return false; }

  /// Called before the analysis of the callee body starts (with callsite
  /// contexts propagated). It checks callsite-specific information. Return a
  /// reason analysis can't continue if that's the case, or 'true' if it may
  /// continue.
  virtual InlineResult onAnalysisStart() { return InlineResult::success(); }

  /// Called if the analysis engine decides SROA cannot be done for the given
  /// alloca.
  virtual void onDisableSROA(AllocaInst *Arg) {}

  /// Called when the analysis engine determines load elimination won't happen.
  virtual void onDisableLoadElimination() {}

  /// Called when we visit a CallBase, before the analysis starts. Return false
  /// to stop further processing of the instruction.
  virtual bool onCallBaseVisitStart(CallBase &Call) { return true; }

  /// Called to account for a call.
  virtual void onCallPenalty() {}

  /// Called to account for a load or store.
  virtual void onMemAccess() {}

  /// Called to account for the expectation the inlining would result in a load
  /// elimination.
  virtual void onLoadEliminationOpportunity() {}

  /// Called to account for the cost of argument setup for the Call in the
  /// callee's body (not the callsite currently under analysis).
  virtual void onCallArgumentSetup(const CallBase &Call) {}

  /// Called to account for a load relative intrinsic.
  virtual void onLoadRelativeIntrinsic() {}

  /// Called to account for a lowered call.
  virtual void onLoweredCall(Function *F, CallBase &Call,
                             bool IsIndirectCall) {}

  /// Account for a jump table of the given size. Return false to stop further
  /// processing of the switch instruction.
  virtual bool onJumpTable(unsigned JumpTableSize) { return true; }

  /// Account for a case cluster of the given size. Return false to stop
  /// further processing of the instruction.
  virtual bool onCaseCluster(unsigned NumCaseCluster) { return true; }

  /// Called at the end of processing a switch instruction, with the given
  /// number of case clusters.
  virtual void onFinalizeSwitch(unsigned JumpTableSize, unsigned NumCaseCluster,
                                bool DefaultDestUnreachable) {}

  /// Called to account for any other instruction not specifically accounted
  /// for.
  virtual void onMissedSimplification() {}

  /// Account for inline assembly instructions.
  virtual void onInlineAsm(const InlineAsm &Arg) {}

  /// Start accounting potential benefits due to SROA for the given alloca.
  virtual void onInitializeSROAArg(AllocaInst *Arg) {}

  /// Account SROA savings for the AllocaInst value.
  virtual void onAggregateSROAUse(AllocaInst *V) {}

  bool handleSROA(Value *V, bool DoNotDisable) {
    // Check for SROA candidates in comparisons.
    if (auto *SROAArg = getSROAArgForValueOrNull(V)) {
      if (DoNotDisable) {
        onAggregateSROAUse(SROAArg);
        return true;
      }
      disableSROAForArg(SROAArg);
    }
    return false;
  }
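
  // Note: handleSROA returns true only when V maps to an SROA-viable alloca
  // and DoNotDisable is set; if the alloca exists but this use cannot be
  // tolerated, SROA is disabled for it and its accumulated savings are
  // forfeited through onDisableSROA.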

  bool IsCallerRecursive = false;
  bool IsRecursiveCall = false;
  bool ExposesReturnsTwice = false;
  bool HasDynamicAlloca = false;
  bool ContainsNoDuplicateCall = false;
  bool HasReturn = false;
  bool HasIndirectBr = false;
  bool HasUninlineableIntrinsic = false;
  bool InitsVargArgs = false;

  /// Number of bytes allocated statically by the callee.
  uint64_t AllocatedSize = 0;
  unsigned NumInstructions = 0;
  unsigned NumInlineAsmInstructions = 0;
  unsigned NumVectorInstructions = 0;

  /// While we walk the potentially-inlined instructions, we build up and
  /// maintain a mapping of simplified values specific to this callsite. The
  /// idea is to propagate any special information we have about arguments to
  /// this call through the inlinable section of the function, and account for
  /// likely simplifications post-inlining. The most important aspect we track
  /// is CFG altering simplifications -- when we prove a basic block dead, that
  /// can cause dramatic shifts in the cost of inlining a function.
  /// Note: The simplified Value may be owned by the caller function.
  DenseMap<Value *, Value *> SimplifiedValues;

  /// Keep track of the values which map back (through function arguments) to
  /// allocas on the caller stack which could be simplified through SROA.
  DenseMap<Value *, AllocaInst *> SROAArgValues;

  /// Keep track of Allocas for which we believe we may get SROA optimization.
  DenseSet<AllocaInst *> EnabledSROAAllocas;

  /// Keep track of values which map to a pointer base and constant offset.
  DenseMap<Value *, std::pair<Value *, APInt>> ConstantOffsetPtrs;

  /// Keep track of dead blocks due to the constant arguments.
  SmallPtrSet<BasicBlock *, 16> DeadBlocks;

  /// The mapping of the blocks to their known unique successors due to the
  /// constant arguments.
  DenseMap<BasicBlock *, BasicBlock *> KnownSuccessors;

  /// Model the elimination of repeated loads that is expected to happen
  /// whenever we simplify away the stores that would otherwise cause them to be
  /// loads.
  bool EnableLoadElimination = true;

  /// Whether we allow inlining for recursive calls.
  bool AllowRecursiveCall = false;

  SmallPtrSet<Value *, 16> LoadAddrSet;

  AllocaInst *getSROAArgForValueOrNull(Value *V) const {
    auto It = SROAArgValues.find(V);
    if (It == SROAArgValues.end() || EnabledSROAAllocas.count(It->second) == 0)
      return nullptr;
    return It->second;
  }

  /// Use a value in its given form directly if possible, otherwise try looking
  /// for it in SimplifiedValues.
  template <typename T> T *getDirectOrSimplifiedValue(Value *V) const {
    if (auto *Direct = dyn_cast<T>(V))
      return Direct;
    return getSimplifiedValue<T>(V);
  }
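
  // For example, getDirectOrSimplifiedValue<ConstantInt>(Op) returns Op itself
  // when it is already a ConstantInt, and otherwise returns the ConstantInt
  // that Op is known to simplify to at this call site, if any.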

  // Custom simplification helper routines.
  bool isAllocaDerivedArg(Value *V);
  void disableSROAForArg(AllocaInst *SROAArg);
  void disableSROA(Value *V);
  void findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB);
  void disableLoadElimination();
  bool isGEPFree(GetElementPtrInst &GEP);
  bool canFoldInboundsGEP(GetElementPtrInst &I);
  bool accumulateGEPOffset(GEPOperator &GEP, APInt &Offset);
  bool simplifyCallSite(Function *F, CallBase &Call);
  bool simplifyCmpInstForRecCall(CmpInst &Cmp);
  bool simplifyInstruction(Instruction &I);
  bool simplifyIntrinsicCallIsConstant(CallBase &CB);
  bool simplifyIntrinsicCallObjectSize(CallBase &CB);
  ConstantInt *stripAndComputeInBoundsConstantOffsets(Value *&V);
  bool isLoweredToCall(Function *F, CallBase &Call);

  /// Return true if the given argument to the function being considered for
  /// inlining has the given attribute set either at the call site or the
  /// function declaration. Primarily used to inspect call site specific
  /// attributes since these can be more precise than the ones on the callee
  /// itself.
  bool paramHasAttr(Argument *A, Attribute::AttrKind Attr);

  /// Return true if the given value is known non-null within the callee if
  /// inlined through this particular callsite.
  bool isKnownNonNullInCallee(Value *V);

  /// Return true if size growth is allowed when inlining the callee at \p Call.
  bool allowSizeGrowth(CallBase &Call);

  // Custom analysis routines.
  InlineResult analyzeBlock(BasicBlock *BB,
                            const SmallPtrSetImpl<const Value *> &EphValues);

  // Disable several entry points to the visitor so we don't accidentally use
  // them by declaring but not defining them here.
  void visit(Module *);
  void visit(Module &);
  void visit(Function *);
  void visit(Function &);
  void visit(BasicBlock *);
  void visit(BasicBlock &);

  // Provide base case for our instruction visit.
  bool visitInstruction(Instruction &I);

  // Our visit overrides.
  bool visitAlloca(AllocaInst &I);
  bool visitPHI(PHINode &I);
  bool visitGetElementPtr(GetElementPtrInst &I);
  bool visitBitCast(BitCastInst &I);
  bool visitPtrToInt(PtrToIntInst &I);
  bool visitIntToPtr(IntToPtrInst &I);
  bool visitCastInst(CastInst &I);
  bool visitCmpInst(CmpInst &I);
  bool visitSub(BinaryOperator &I);
  bool visitBinaryOperator(BinaryOperator &I);
  bool visitFNeg(UnaryOperator &I);
  bool visitLoad(LoadInst &I);
  bool visitStore(StoreInst &I);
  bool visitExtractValue(ExtractValueInst &I);
  bool visitInsertValue(InsertValueInst &I);
  bool visitCallBase(CallBase &Call);
  bool visitReturnInst(ReturnInst &RI);
  bool visitUncondBrInst(UncondBrInst &BI);
  bool visitCondBrInst(CondBrInst &BI);
  bool visitSelectInst(SelectInst &SI);
  bool visitSwitchInst(SwitchInst &SI);
  bool visitIndirectBrInst(IndirectBrInst &IBI);
  bool visitResumeInst(ResumeInst &RI);
  bool visitCleanupReturnInst(CleanupReturnInst &RI);
  bool visitCatchReturnInst(CatchReturnInst &RI);
  bool visitUnreachableInst(UnreachableInst &I);

public:
  CallAnalyzer(
      Function &Callee, CallBase &Call, const TargetTransformInfo &TTI,
      function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
      function_ref<BlockFrequencyInfo &(Function &)> GetBFI = nullptr,
      function_ref<const TargetLibraryInfo &(Function &)> GetTLI = nullptr,
      ProfileSummaryInfo *PSI = nullptr,
      OptimizationRemarkEmitter *ORE = nullptr,
      function_ref<EphemeralValuesCache &(Function &)> GetEphValuesCache =
          nullptr)
      : TTI(TTI), GetAssumptionCache(GetAssumptionCache), GetBFI(GetBFI),
        GetTLI(GetTLI), PSI(PSI), F(Callee), DL(F.getDataLayout()), ORE(ORE),
        CandidateCall(Call), GetEphValuesCache(GetEphValuesCache) {}

  InlineResult analyze();

  /// Lookup a simplified Value. May return a value owned by the caller.
  Value *getSimplifiedValueUnchecked(Value *V) const {
    return SimplifiedValues.lookup(V);
  }

  /// Lookup a simplified Value, but return nullptr if the simplified value is
  /// owned by the caller.
  template <typename T> T *getSimplifiedValue(Value *V) const {
    Value *SimpleV = SimplifiedValues.lookup(V);
    if (!SimpleV)
      return nullptr;

    // Skip checks if we know T is a global. This has a small, but measurable
    // impact on compile-time.
    if constexpr (std::is_base_of_v<Constant, T>)
      return dyn_cast<T>(SimpleV);

    // Make sure the simplified Value is owned by this function.
    if (auto *I = dyn_cast<Instruction>(SimpleV)) {
      if (I->getFunction() != &F)
        return nullptr;
    } else if (auto *Arg = dyn_cast<Argument>(SimpleV)) {
      if (Arg->getParent() != &F)
        return nullptr;
    } else if (!isa<Constant>(SimpleV))
      return nullptr;
    return dyn_cast<T>(SimpleV);
  }
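
  // Typical use, as in costBenefitAnalysis() below:
  //   getSimplifiedValue<ConstantInt>(BI->getCondition())
  // checks whether a conditional branch will fold to a constant once the
  // call-site arguments are substituted in.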

  // Keep a bunch of stats about the cost savings found so we can print them
  // out when debugging.
  unsigned NumConstantArgs = 0;
  unsigned NumConstantOffsetPtrArgs = 0;
  unsigned NumAllocaArgs = 0;
  unsigned NumConstantPtrCmps = 0;
  unsigned NumConstantPtrDiffs = 0;
  unsigned NumInstructionsSimplified = 0;

  void dump();
};

// Considering forming a binary search, we should find the number of nodes,
// which is the same as the number of comparisons when lowered. For a given
// number of clusters, n, we can define a recursive function, f(n), to find
// the number of nodes in the tree. The recursion is:
//   f(n) = 1 + f(n/2) + f(n - n/2), when n > 3,
//   and f(n) = n, when n <= 3.
// This will lead to a binary tree where each leaf is either f(2) or f(3)
// when n > 3. So, the number of comparisons from leaves should be n, while
// the number of comparisons from non-leaf nodes should be:
//   2^(log2(n) - 1) - 1
//   = 2^log2(n) * 2^-1 - 1
//   = n / 2 - 1.
// Considering comparisons from leaf and non-leaf nodes, we can estimate the
// total number of comparisons in a simple closed form:
//   n + n / 2 - 1 = n * 3 / 2 - 1
int64_t getExpectedNumberOfCompare(int NumCaseCluster) {
  return 3 * static_cast<int64_t>(NumCaseCluster) / 2 - 1;
}
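
// For example, for NumCaseCluster = 8 the recursion gives
//   f(8) = 1 + f(4) + f(4) = 1 + 2 * (1 + f(2) + f(2)) = 1 + 2 * 5 = 11,
// which matches the closed form 3 * 8 / 2 - 1 = 11.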

/// FIXME: if it is necessary to derive from InlineCostCallAnalyzer, note
/// the FIXME in onLoweredCall, when instantiating an InlineCostCallAnalyzer
class InlineCostCallAnalyzer final : public CallAnalyzer {
  const bool ComputeFullInlineCost;
  int LoadEliminationCost = 0;
  /// Bonus to be applied when the percentage of vector instructions in the
  /// callee is high (see more details in updateThreshold).
  int VectorBonus = 0;
  /// Bonus to be applied when the callee has only one reachable basic block.
  int SingleBBBonus = 0;

  /// Tunable parameters that control the analysis.
  const InlineParams &Params;

  // This DenseMap stores the delta change in cost and threshold after
  // accounting for the given instruction. The map is filled only with the
  // flag PrintInstructionComments on.
  DenseMap<const Instruction *, InstructionCostDetail> InstructionCostDetailMap;

  /// Upper bound for the inlining cost. Bonuses are being applied to account
  /// for speculative "expected profit" of the inlining decision.
  int Threshold = 0;

  /// The amount of StaticBonus applied.
  int StaticBonusApplied = 0;

  /// Attempt to evaluate indirect calls to boost their inline cost.
  const bool BoostIndirectCalls;

  /// Ignore the threshold when finalizing analysis.
  const bool IgnoreThreshold;

  // True if the cost-benefit-analysis-based inliner is enabled.
  const bool CostBenefitAnalysisEnabled;

  /// Inlining cost measured in abstract units, accounts for all the
  /// instructions expected to be executed for a given function invocation.
  /// Instructions that are statically proven to be dead based on call-site
  /// arguments are not counted here.
  int Cost = 0;

  // The cumulative cost at the beginning of the basic block being analyzed. At
  // the end of analyzing each basic block, "Cost - CostAtBBStart" represents
  // the size of that basic block.
  int CostAtBBStart = 0;

  // The static size of live but cold basic blocks. This is "static" in the
  // sense that it's not weighted by profile counts at all.
  int ColdSize = 0;

  // Whether inlining is decided by cost-threshold analysis.
  bool DecidedByCostThreshold = false;

  // Whether inlining is decided by cost-benefit analysis.
  bool DecidedByCostBenefit = false;

  // The cost-benefit pair computed by cost-benefit analysis.
  std::optional<CostBenefitPair> CostBenefit;

  bool SingleBB = true;

  unsigned SROACostSavings = 0;
  unsigned SROACostSavingsLost = 0;

  /// The mapping of caller Alloca values to their accumulated cost savings. If
  /// we have to disable SROA for one of the allocas, this tells us how much
  /// cost must be added.
  DenseMap<AllocaInst *, int> SROAArgCosts;

  /// Return true if \p Call is a cold callsite.
  bool isColdCallSite(CallBase &Call, BlockFrequencyInfo *CallerBFI);

  /// Update Threshold based on callsite properties such as callee
  /// attributes and callee hotness for PGO builds. The Callee is explicitly
  /// passed to support analyzing indirect calls whose target is inferred by
  /// analysis.
  void updateThreshold(CallBase &Call, Function &Callee);
  /// Return a higher threshold if \p Call is a hot callsite.
  std::optional<int> getHotCallSiteThreshold(CallBase &Call,
                                             BlockFrequencyInfo *CallerBFI);

  /// Handle a capped 'int' increment for Cost.
  void addCost(int64_t Inc) {
    Inc = std::clamp<int64_t>(Inc, INT_MIN, INT_MAX);
    Cost = std::clamp<int64_t>(Inc + Cost, INT_MIN, INT_MAX);
  }
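
  // For example, addCost(INT_MAX) on an already positive Cost saturates Cost
  // at INT_MAX rather than overflowing: the sum is formed in 64 bits and then
  // clamped back into 'int' range.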

  void onDisableSROA(AllocaInst *Arg) override {
    auto CostIt = SROAArgCosts.find(Arg);
    if (CostIt == SROAArgCosts.end())
      return;
    addCost(CostIt->second);
    SROACostSavings -= CostIt->second;
    SROACostSavingsLost += CostIt->second;
    SROAArgCosts.erase(CostIt);
  }

  void onDisableLoadElimination() override {
    addCost(LoadEliminationCost);
    LoadEliminationCost = 0;
  }

  bool onCallBaseVisitStart(CallBase &Call) override {
    if (std::optional<int> AttrCallThresholdBonus =
            getStringFnAttrAsInt(Call, "call-threshold-bonus"))
      Threshold += *AttrCallThresholdBonus;

    if (std::optional<int> AttrCallCost =
            getStringFnAttrAsInt(Call, "call-inline-cost")) {
      addCost(*AttrCallCost);
      // Prevent further processing of the call since we want to override its
      // inline cost, not just add to it.
      return false;
    }
    return true;
  }

  void onCallPenalty() override { addCost(CallPenalty); }

  void onMemAccess() override { addCost(MemAccessCost); }

  void onCallArgumentSetup(const CallBase &Call) override {
    // Pay the price of the argument setup. We account for the average 1
    // instruction per call argument setup here.
    addCost(Call.arg_size() * InstrCost);
  }

  void onLoadRelativeIntrinsic() override {
    // This is normally lowered to 4 LLVM instructions.
    addCost(3 * InstrCost);
  }

  void onLoweredCall(Function *F, CallBase &Call,
                     bool IsIndirectCall) override {
    // We account for the average 1 instruction per call argument setup here.
    addCost(Call.arg_size() * InstrCost);

    // If we have a constant that we are calling as a function, we can peer
    // through it and see the function target. This happens not infrequently
    // during devirtualization and so we want to give it a hefty bonus for
    // inlining, but cap that bonus in the event that inlining wouldn't pan out.
    // Pretend to inline the function, with a custom threshold.
    if (IsIndirectCall && BoostIndirectCalls) {
      auto IndirectCallParams = Params;
      IndirectCallParams.DefaultThreshold =
          InlineConstants::IndirectCallThreshold;
      /// FIXME: if InlineCostCallAnalyzer is derived from, this may need
      /// to instantiate the derived class.
      InlineCostCallAnalyzer CA(*F, Call, IndirectCallParams, TTI,
                                GetAssumptionCache, GetBFI, GetTLI, PSI, ORE,
                                false);
      if (CA.analyze().isSuccess()) {
        // We were able to inline the indirect call! Subtract the cost from the
        // threshold to get the bonus we want to apply, but don't go below zero.
        addCost(-std::max(0, CA.getThreshold() - CA.getCost()));
      }
    } else
      // Otherwise simply add the cost for merely making the call.
      addCost(TTI.getInlineCallPenalty(CandidateCall.getCaller(), Call,
                                       CallPenalty));
  }

  void onFinalizeSwitch(unsigned JumpTableSize, unsigned NumCaseCluster,
                        bool DefaultDestUnreachable) override {
    // If suitable for a jump table, consider the cost for the table size and
    // branch to destination.
    // Maximum valid cost increased in this function.
    if (JumpTableSize) {
      // Suppose a default branch includes one compare and one conditional
      // branch if it's reachable.
      if (!DefaultDestUnreachable)
        addCost(2 * InstrCost);
      // Suppose a jump table requires one load and one jump instruction.
      int64_t JTCost =
          static_cast<int64_t>(JumpTableSize) * InstrCost + 2 * InstrCost;
      addCost(JTCost);
      return;
    }

    if (NumCaseCluster <= 3) {
      // Suppose a comparison includes one compare and one conditional branch.
      // We can reduce a set of instructions if the default branch is
      // undefined.
      addCost((NumCaseCluster - DefaultDestUnreachable) * 2 * InstrCost);
      return;
    }

    int64_t ExpectedNumberOfCompare =
        getExpectedNumberOfCompare(NumCaseCluster);
    int64_t SwitchCost = ExpectedNumberOfCompare * 2 * InstrCost;

    addCost(SwitchCost);
  }
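
  // For example, with the default InstrCost of 5, a 16-entry jump table with
  // a reachable default branch costs 2 * 5 + (16 + 2) * 5 = 100 units, while
  // a 3-cluster switch with an unreachable default costs (3 - 1) * 2 * 5 = 20.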

  // Parses the inline assembly argument to account for its cost. Inline
  // assembly instructions incur higher costs for inlining since they cannot be
  // analyzed and optimized.
  void onInlineAsm(const InlineAsm &Arg) override {
    if (!InlineAsmInstrCost)
      return;
    SmallVector<StringRef, 4> AsmStrs;
    Arg.collectAsmStrs(AsmStrs);
    int SectionLevel = 0;
    int InlineAsmInstrCount = 0;
    for (StringRef AsmStr : AsmStrs) {
      // Trim whitespace and comments.
      StringRef Trimmed = AsmStr.trim();
      size_t hashPos = Trimmed.find('#');
      if (hashPos != StringRef::npos)
        Trimmed = Trimmed.substr(0, hashPos);
      // Ignore lines that were only comments or whitespace.
      if (Trimmed.empty())
        continue;
      // Filter out the outlined assembly instructions from the cost by keeping
      // track of the section level and only accounting for instructions at
      // section level zero. Note there will be duplication in outlined
      // sections too, but it is not accounted for in the inlining cost model.
      if (Trimmed.starts_with(".pushsection")) {
        ++SectionLevel;
        continue;
      }
      if (Trimmed.starts_with(".popsection")) {
        --SectionLevel;
        continue;
      }
      // Ignore directives and labels.
      if (Trimmed.starts_with(".") || Trimmed.contains(":"))
        continue;
      if (SectionLevel == 0)
        ++InlineAsmInstrCount;
    }
    NumInlineAsmInstructions += InlineAsmInstrCount;
    addCost(InlineAsmInstrCount * InlineAsmInstrCost);
  }
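
  // For example, an asm template such as
  // "nop\n.pushsection .data\nnop\n.popsection" counts exactly one
  // instruction: the second "nop" sits at section level 1 and is treated as
  // outlined, and the section directives themselves are skipped.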

  void onMissedSimplification() override { addCost(InstrCost); }

  void onInitializeSROAArg(AllocaInst *Arg) override {
    assert(Arg != nullptr &&
           "Should not initialize SROA costs for null value.");
    auto SROAArgCost = TTI.getCallerAllocaCost(&CandidateCall, Arg);
    SROACostSavings += SROAArgCost;
    SROAArgCosts[Arg] = SROAArgCost;
  }

  void onAggregateSROAUse(AllocaInst *SROAArg) override {
    auto CostIt = SROAArgCosts.find(SROAArg);
    assert(CostIt != SROAArgCosts.end() &&
           "expected this argument to have a cost");
    CostIt->second += InstrCost;
    SROACostSavings += InstrCost;
  }

  void onBlockStart(const BasicBlock *BB) override { CostAtBBStart = Cost; }

  void onBlockAnalyzed(const BasicBlock *BB) override {
    if (CostBenefitAnalysisEnabled) {
      // Keep track of the static size of live but cold basic blocks. For now,
      // we define a cold basic block to be one that's never executed.
      assert(GetBFI && "GetBFI must be available");
      BlockFrequencyInfo *BFI = &(GetBFI(F));
      assert(BFI && "BFI must be available");
      auto ProfileCount = BFI->getBlockProfileCount(BB);
      if (*ProfileCount == 0)
        ColdSize += Cost - CostAtBBStart;
    }

    auto *TI = BB->getTerminator();
    // If we had any successors at this point, then post-inlining is likely to
    // have them as well. Note that we assume any basic blocks which existed
    // due to branches or switches which folded above will also fold after
    // inlining.
    if (SingleBB && TI->getNumSuccessors() > 1) {
      // Take off the bonus we applied to the threshold.
      Threshold -= SingleBBBonus;
      SingleBB = false;
    }
  }

  void onInstructionAnalysisStart(const Instruction *I) override {
    // This function is called to store the initial cost of inlining before
    // the given instruction was assessed.
    if (!PrintInstructionComments)
      return;
    auto &CostDetail = InstructionCostDetailMap[I];
    CostDetail.CostBefore = Cost;
    CostDetail.ThresholdBefore = Threshold;
  }

  void onInstructionAnalysisFinish(const Instruction *I) override {
    // This function is called to find new values of cost and threshold after
    // the instruction has been assessed.
    if (!PrintInstructionComments)
      return;
    auto &CostDetail = InstructionCostDetailMap[I];
    CostDetail.CostAfter = Cost;
    CostDetail.ThresholdAfter = Threshold;
  }

  bool isCostBenefitAnalysisEnabled() {
    if (!PSI || !PSI->hasProfileSummary())
      return false;

    if (!GetBFI)
      return false;

    if (InlineEnableCostBenefitAnalysis.getNumOccurrences()) {
      // Honor the explicit request from the user.
      if (!InlineEnableCostBenefitAnalysis)
        return false;
    } else {
      // Otherwise, require instrumentation profile.
      if (!PSI->hasInstrumentationProfile())
        return false;
    }

    auto *Caller = CandidateCall.getParent()->getParent();
    if (!Caller->getEntryCount())
      return false;

    BlockFrequencyInfo *CallerBFI = &(GetBFI(*Caller));
    if (!CallerBFI)
      return false;

    // For now, limit to hot call site.
    if (!PSI->isHotCallSite(CandidateCall, CallerBFI))
      return false;

    // Make sure we have a nonzero entry count.
    auto EntryCount = F.getEntryCount();
    if (!EntryCount || !EntryCount->getCount())
      return false;

    BlockFrequencyInfo *CalleeBFI = &(GetBFI(F));
    if (!CalleeBFI)
      return false;

    return true;
  }

  // A helper function to choose between command line override and default.
  unsigned getInliningCostBenefitAnalysisSavingsMultiplier() const {
    if (InlineSavingsMultiplier.getNumOccurrences())
      return InlineSavingsMultiplier;
    return TTI.getInliningCostBenefitAnalysisSavingsMultiplier();
  }

  // A helper function to choose between command line override and default.
  unsigned getInliningCostBenefitAnalysisProfitableMultiplier() const {
    if (InlineSavingsProfitableMultiplier.getNumOccurrences())
      return InlineSavingsProfitableMultiplier;
    return TTI.getInliningCostBenefitAnalysisProfitableMultiplier();
  }

  void OverrideCycleSavingsAndSizeForTesting(APInt &CycleSavings, int &Size) {
    if (std::optional<int> AttrCycleSavings = getStringFnAttrAsInt(
            CandidateCall, "inline-cycle-savings-for-test")) {
      CycleSavings = *AttrCycleSavings;
    }

    if (std::optional<int> AttrRuntimeCost = getStringFnAttrAsInt(
            CandidateCall, "inline-runtime-cost-for-test")) {
      Size = *AttrRuntimeCost;
    }
  }

  // Determine whether we should inline the given call site, taking into
  // account both the size cost and the cycle savings. Return std::nullopt if
  // we don't have sufficient profiling information to make the determination.
  std::optional<bool> costBenefitAnalysis() {
    if (!CostBenefitAnalysisEnabled)
      return std::nullopt;

    // buildInlinerPipeline in the pass builder sets HotCallSiteThreshold to 0
    // for the prelink phase of the AutoFDO + ThinLTO build. Honor the logic by
    // falling back to the cost-based metric.
    // TODO: Improve this hacky condition.
    if (Threshold == 0)
      return std::nullopt;

    assert(GetBFI);
    BlockFrequencyInfo *CalleeBFI = &(GetBFI(F));
    assert(CalleeBFI);

    // The cycle savings expressed as the sum of InstrCost
    // multiplied by the estimated dynamic count of each instruction we can
    // avoid. Savings come from the call site cost, such as argument setup and
    // the call instruction, as well as the instructions that are folded.
    //
    // We use 128-bit APInt here to avoid potential overflow. This variable
    // should stay well below 10^24 (or 2^80) in practice. This "worst" case
    // assumes that we can avoid or fold a billion instructions, each with a
    // profile count of 10^15 -- roughly the number of cycles for a 24-hour
    // period on a 4GHz machine.
    APInt CycleSavings(128, 0);

    for (auto &BB : F) {
      APInt CurrentSavings(128, 0);
      for (auto &I : BB) {
        if (CondBrInst *BI = dyn_cast<CondBrInst>(&I)) {
          // Count a conditional branch as savings if it becomes unconditional.
          if (getSimplifiedValue<ConstantInt>(BI->getCondition()))
            CurrentSavings += InstrCost;
        } else if (SwitchInst *SI = dyn_cast<SwitchInst>(&I)) {
          if (getSimplifiedValue<ConstantInt>(SI->getCondition()))
            CurrentSavings += InstrCost;
        } else if (SimplifiedValues.count(&I)) {
          // Count an instruction as savings if we can fold it.
          CurrentSavings += InstrCost;
        }
      }

      auto ProfileCount = CalleeBFI->getBlockProfileCount(&BB);
      CurrentSavings *= *ProfileCount;
      CycleSavings += CurrentSavings;
    }

    // Compute the cycle savings per call.
    auto EntryProfileCount = F.getEntryCount();
    assert(EntryProfileCount && EntryProfileCount->getCount());
    auto EntryCount = EntryProfileCount->getCount();
    CycleSavings += EntryCount / 2;
    CycleSavings = CycleSavings.udiv(EntryCount);

    // Compute the total savings for the call site.
    auto *CallerBB = CandidateCall.getParent();
    BlockFrequencyInfo *CallerBFI = &(GetBFI(*(CallerBB->getParent())));
    CycleSavings += getCallsiteCost(TTI, this->CandidateCall, DL);
    CycleSavings *= *CallerBFI->getBlockProfileCount(CallerBB);

    // Remove the cost of the cold basic blocks to model the runtime cost more
    // accurately. Both machine block placement and function splitting could
    // place cold blocks further from hot blocks.
    int Size = Cost - ColdSize;

    // Allow tiny callees to be inlined regardless of whether they meet the
    // savings threshold.
    Size = Size > InlineSizeAllowance ? Size - InlineSizeAllowance : 1;

    OverrideCycleSavingsAndSizeForTesting(CycleSavings, Size);
    CostBenefit.emplace(APInt(128, Size), CycleSavings);

    // Let R be the ratio of CycleSavings to Size. We accept the inlining
    // opportunity if R is really high and reject if R is really low. If R is
    // somewhere in the middle, we fall back to the cost-based analysis.
    //
    // Specifically, let R = CycleSavings / Size, we accept the inlining
    // opportunity if:
    //
    //            PSI->getOrCompHotCountThreshold()
    //   R > -------------------------------------------------
    //       getInliningCostBenefitAnalysisSavingsMultiplier()
    //
    // and reject the inlining opportunity if:
    //
    //              PSI->getOrCompHotCountThreshold()
    //   R <= ----------------------------------------------------
    //        getInliningCostBenefitAnalysisProfitableMultiplier()
    //
    // Otherwise, we fall back to the cost-based analysis.
    //
    // Implementation-wise, use multiplication (CycleSavings * Multiplier,
    // HotCountThreshold * Size) rather than division to avoid precision loss.
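    //
    // For example, with the default multipliers (8 for acceptance, 4 for
    // rejection) and a hot-count threshold of 4000, a candidate with Size = 2
    // needs CycleSavings * 8 >= 4000 * 2, i.e. at least 1000 cycles saved per
    // call, to be accepted outright; at 999 cycles the rejection test
    // (999 * 4 < 8000) fires instead and the call site is declined.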
    APInt Threshold(128, PSI->getOrCompHotCountThreshold());
    Threshold *= Size;

    APInt UpperBoundCycleSavings = CycleSavings;
    UpperBoundCycleSavings *= getInliningCostBenefitAnalysisSavingsMultiplier();
    if (UpperBoundCycleSavings.uge(Threshold))
      return true;

    APInt LowerBoundCycleSavings = CycleSavings;
    LowerBoundCycleSavings *=
        getInliningCostBenefitAnalysisProfitableMultiplier();
    if (LowerBoundCycleSavings.ult(Threshold))
      return false;

    // Otherwise, fall back to the cost-based analysis.
    return std::nullopt;
  }

  InlineResult finalizeAnalysis() override {
    // Loops generally act a lot like calls in that they act like barriers to
    // movement, require a certain amount of setup, etc. So when optimising for
    // size, we penalise any call sites that perform loops. We do this after all
    // other costs here, so will likely only be dealing with relatively small
    // functions (and hence DT and LI will hopefully be cheap).
    auto *Caller = CandidateCall.getFunction();
    if (Caller->hasMinSize()) {
      DominatorTree DT(F);
      LoopInfo LI(DT);
      int NumLoops = 0;
      for (Loop *L : LI) {
        // Ignore loops that will not be executed
        if (DeadBlocks.count(L->getHeader()))
          continue;
        NumLoops++;
      }
      addCost(NumLoops * InlineConstants::LoopPenalty);
    }

    // We applied the maximum possible vector bonus at the beginning. Now,
    // subtract the excess bonus, if any, from the Threshold before
    // comparing against Cost.
    if (NumVectorInstructions <= NumInstructions / 10)
      Threshold -= VectorBonus;
    else if (NumVectorInstructions <= NumInstructions / 2)
      Threshold -= VectorBonus / 2;

    if (std::optional<int> AttrCost =
            getStringFnAttrAsInt(CandidateCall, "function-inline-cost"))
      Cost = *AttrCost;

    if (std::optional<int> AttrCostMult = getStringFnAttrAsInt(
            CandidateCall,
            InlineConstants::FunctionInlineCostMultiplierAttributeName))
      Cost *= *AttrCostMult;

    if (std::optional<int> AttrThreshold =
            getStringFnAttrAsInt(CandidateCall, "function-inline-threshold"))
      Threshold = *AttrThreshold;

    if (auto Result = costBenefitAnalysis()) {
      DecidedByCostBenefit = true;
      if (*Result)
        return InlineResult::success();
      else
        return InlineResult::failure("Cost over threshold.");
    }

    if (IgnoreThreshold)
      return InlineResult::success();

    DecidedByCostThreshold = true;
    return Cost < std::max(1, Threshold)
               ? InlineResult::success()
               : InlineResult::failure("Cost over threshold.");
  }

  bool shouldStop() override {
    if (IgnoreThreshold || ComputeFullInlineCost)
      return false;
    // Bail out the moment we cross the threshold. This means we'll under-count
    // the cost, but only when undercounting doesn't matter.
    if (Cost < Threshold)
      return false;
    DecidedByCostThreshold = true;
    return true;
  }

  void onLoadEliminationOpportunity() override {
    LoadEliminationCost += InstrCost;
  }

  InlineResult onAnalysisStart() override {
    // Perform some tweaks to the cost and threshold based on the direct
    // callsite information.

    // We want to more aggressively inline vector-dense kernels, so up the
    // threshold, and we'll lower it if the % of vector instructions gets too
    // low. Note that these bonuses are somewhat arbitrary and evolved over
    // time by accident as much as because they are principled bonuses.
    //
    // FIXME: It would be nice to remove all such bonuses. At least it would be
    // nice to base the bonus values on something more scientific.
    assert(NumInstructions == 0);
    assert(NumVectorInstructions == 0);

    // Update the threshold based on callsite properties.
    updateThreshold(CandidateCall, F);

    // While Threshold depends on commandline options that can take negative
    // values, we want to enforce the invariant that the computed threshold and
    // bonuses are non-negative.
    assert(Threshold >= 0);
    assert(SingleBBBonus >= 0);
    assert(VectorBonus >= 0);

    // Speculatively apply all possible bonuses to Threshold. If cost exceeds
    // this Threshold any time, and cost cannot decrease, we can stop processing
    // the rest of the function body.
    Threshold += (SingleBBBonus + VectorBonus);

    // Give out bonuses for the callsite, as the instructions setting them up
    // will be gone after inlining.
    addCost(-getCallsiteCost(TTI, this->CandidateCall, DL));

    // If this function uses the coldcc calling convention, prefer not to
    // inline it.
    if (F.getCallingConv() == CallingConv::Cold)
      addCost(InlineConstants::ColdccPenalty);

    LLVM_DEBUG(dbgs() << "      Initial cost: " << Cost << "\n");

    // Check if we're done. This can happen due to bonuses and penalties.
    if (Cost >= Threshold && !ComputeFullInlineCost)
      return InlineResult::failure("high cost");

    return InlineResult::success();
  }

public:
  InlineCostCallAnalyzer(
      Function &Callee, CallBase &Call, const InlineParams &Params,
      const TargetTransformInfo &TTI,
      function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
      function_ref<BlockFrequencyInfo &(Function &)> GetBFI = nullptr,
      function_ref<const TargetLibraryInfo &(Function &)> GetTLI = nullptr,
      ProfileSummaryInfo *PSI = nullptr,
      OptimizationRemarkEmitter *ORE = nullptr, bool BoostIndirect = true,
      bool IgnoreThreshold = false,
      function_ref<EphemeralValuesCache &(Function &)> GetEphValuesCache =
          nullptr)
      : CallAnalyzer(Callee, Call, TTI, GetAssumptionCache, GetBFI, GetTLI, PSI,
                     ORE, GetEphValuesCache),
        ComputeFullInlineCost(OptComputeFullInlineCost ||
                              Params.ComputeFullInlineCost || ORE ||
                              isCostBenefitAnalysisEnabled()),
        Params(Params), Threshold(Params.DefaultThreshold),
        BoostIndirectCalls(BoostIndirect), IgnoreThreshold(IgnoreThreshold),
        CostBenefitAnalysisEnabled(isCostBenefitAnalysisEnabled()),
        Writer(this) {
    AllowRecursiveCall = *Params.AllowRecursiveCall;
  }

  /// Annotation Writer for instruction details
  InlineCostAnnotationWriter Writer;

  void dump();

  // Prints the same analysis as dump(), but its definition is not dependent
  // on the build.
  void print(raw_ostream &OS);

  std::optional<InstructionCostDetail> getCostDetails(const Instruction *I) {
    auto It = InstructionCostDetailMap.find(I);
    if (It != InstructionCostDetailMap.end())
      return It->second;
    return std::nullopt;
  }

  ~InlineCostCallAnalyzer() override = default;
  int getThreshold() const { return Threshold; }
  int getCost() const { return Cost; }
  int getStaticBonusApplied() const { return StaticBonusApplied; }
  std::optional<CostBenefitPair> getCostBenefitPair() { return CostBenefit; }
  bool wasDecidedByCostBenefit() const { return DecidedByCostBenefit; }
  bool wasDecidedByCostThreshold() const { return DecidedByCostThreshold; }
};

// Return true if CB is the sole call to local function Callee.
static bool isSoleCallToLocalFunction(const CallBase &CB,
                                      const Function &Callee) {
  return Callee.hasLocalLinkage() && Callee.hasOneLiveUse() &&
         &Callee == CB.getCalledFunction();
}
1257
1258class InlineCostFeaturesAnalyzer final : public CallAnalyzer {
1259private:
1260 InlineCostFeatures Cost = {};
1261
1262 // FIXME: These constants are taken from the heuristic-based cost visitor.
1263 // These should be removed entirely in a later revision to avoid reliance on
1264 // heuristics in the ML inliner.
1265 static constexpr int JTCostMultiplier = 2;
1266 static constexpr int CaseClusterCostMultiplier = 2;
1267 static constexpr int SwitchDefaultDestCostMultiplier = 2;
1268 static constexpr int SwitchCostMultiplier = 2;
1269
1270 // FIXME: These are taken from the heuristic-based cost visitor: we should
1271 // eventually abstract these to the CallAnalyzer to avoid duplication.
1272 unsigned SROACostSavingOpportunities = 0;
1273 int VectorBonus = 0;
1274 int SingleBBBonus = 0;
1275 int Threshold = 5;
1276
1277 DenseMap<AllocaInst *, unsigned> SROACosts;
1278
1279 void increment(InlineCostFeatureIndex Feature, int64_t Delta = 1) {
1280 Cost[static_cast<size_t>(Feature)] += Delta;
1281 }
1282
1283 void set(InlineCostFeatureIndex Feature, int64_t Value) {
1284 Cost[static_cast<size_t>(Feature)] = Value;
1285 }
1286
1287 void onDisableSROA(AllocaInst *Arg) override {
1288 auto CostIt = SROACosts.find(Val: Arg);
1289 if (CostIt == SROACosts.end())
1290 return;
1291
1292 increment(Feature: InlineCostFeatureIndex::sroa_losses, Delta: CostIt->second);
1293 SROACostSavingOpportunities -= CostIt->second;
1294 SROACosts.erase(I: CostIt);
1295 }
1296
1297 void onDisableLoadElimination() override {
1298 set(Feature: InlineCostFeatureIndex::load_elimination, Value: 1);
1299 }
1300
1301 void onCallPenalty() override {
1302 increment(Feature: InlineCostFeatureIndex::call_penalty, Delta: CallPenalty);
1303 }
1304
1305 void onCallArgumentSetup(const CallBase &Call) override {
1306 increment(Feature: InlineCostFeatureIndex::call_argument_setup,
1307 Delta: Call.arg_size() * InstrCost);
1308 }
1309
1310 void onLoadRelativeIntrinsic() override {
1311 increment(Feature: InlineCostFeatureIndex::load_relative_intrinsic, Delta: 3 * InstrCost);
1312 }
1313
1314 void onLoweredCall(Function *F, CallBase &Call,
1315 bool IsIndirectCall) override {
1316 increment(Feature: InlineCostFeatureIndex::lowered_call_arg_setup,
1317 Delta: Call.arg_size() * InstrCost);
1318
1319 if (IsIndirectCall) {
1320 InlineParams IndirectCallParams = {/* DefaultThreshold*/ 0,
1321 /*HintThreshold*/ {},
1322 /*ColdThreshold*/ {},
1323 /*OptSizeThreshold*/ {},
1324 /*OptMinSizeThreshold*/ {},
1325 /*HotCallSiteThreshold*/ {},
1326 /*LocallyHotCallSiteThreshold*/ {},
1327 /*ColdCallSiteThreshold*/ {},
1328 /*ComputeFullInlineCost*/ true,
1329 /*EnableDeferral*/ true};
1330 IndirectCallParams.DefaultThreshold =
1331 InlineConstants::IndirectCallThreshold;
1332
1333 InlineCostCallAnalyzer CA(*F, Call, IndirectCallParams, TTI,
1334 GetAssumptionCache, GetBFI, GetTLI, PSI, ORE,
1335 false, true);
1336 if (CA.analyze().isSuccess()) {
1337 increment(Feature: InlineCostFeatureIndex::nested_inline_cost_estimate,
1338 Delta: CA.getCost());
1339 increment(Feature: InlineCostFeatureIndex::nested_inlines, Delta: 1);
1340 }
1341 } else {
1342 onCallPenalty();
1343 }
1344 }
1345
1346 void onFinalizeSwitch(unsigned JumpTableSize, unsigned NumCaseCluster,
1347 bool DefaultDestUnreachable) override {
1348 if (JumpTableSize) {
1349 if (!DefaultDestUnreachable)
1350 increment(Feature: InlineCostFeatureIndex::switch_default_dest_penalty,
1351 Delta: SwitchDefaultDestCostMultiplier * InstrCost);
1352 int64_t JTCost = static_cast<int64_t>(JumpTableSize) * InstrCost +
1353 JTCostMultiplier * InstrCost;
1354 increment(Feature: InlineCostFeatureIndex::jump_table_penalty, Delta: JTCost);
1355 return;
1356 }
1357
1358 if (NumCaseCluster <= 3) {
1359 increment(Feature: InlineCostFeatureIndex::case_cluster_penalty,
1360 Delta: (NumCaseCluster - DefaultDestUnreachable) *
1361 CaseClusterCostMultiplier * InstrCost);
1362 return;
1363 }
1364
1365 int64_t ExpectedNumberOfCompare =
1366 getExpectedNumberOfCompare(NumCaseCluster);
1367
1368 int64_t SwitchCost =
1369 ExpectedNumberOfCompare * SwitchCostMultiplier * InstrCost;
1370 increment(Feature: InlineCostFeatureIndex::switch_penalty, Delta: SwitchCost);
1371 }
1372
1373 void onMissedSimplification() override {
1374 increment(Feature: InlineCostFeatureIndex::unsimplified_common_instructions,
1375 Delta: InstrCost);
1376 }
1377
1378 void onInitializeSROAArg(AllocaInst *Arg) override {
1379 auto SROAArgCost = TTI.getCallerAllocaCost(CB: &CandidateCall, AI: Arg);
1380 SROACosts[Arg] = SROAArgCost;
1381 SROACostSavingOpportunities += SROAArgCost;
1382 }
1383
1384 void onAggregateSROAUse(AllocaInst *Arg) override {
1385 SROACosts.find(Val: Arg)->second += InstrCost;
1386 SROACostSavingOpportunities += InstrCost;
1387 }
1388
1389 void onBlockAnalyzed(const BasicBlock *BB) override {
1390 if (BB->getTerminator()->getNumSuccessors() > 1)
1391 set(Feature: InlineCostFeatureIndex::is_multiple_blocks, Value: 1);
1392 Threshold -= SingleBBBonus;
1393 }
1394
1395 InlineResult finalizeAnalysis() override {
1396 auto *Caller = CandidateCall.getFunction();
1397 if (Caller->hasMinSize()) {
1398 DominatorTree DT(F);
1399 LoopInfo LI(DT);
1400 for (Loop *L : LI) {
1401 // Ignore loops that will not be executed
1402 if (DeadBlocks.count(Ptr: L->getHeader()))
1403 continue;
1404 increment(Feature: InlineCostFeatureIndex::num_loops,
1405 Delta: InlineConstants::LoopPenalty);
1406 }
1407 }
1408 set(Feature: InlineCostFeatureIndex::dead_blocks, Value: DeadBlocks.size());
1409 set(Feature: InlineCostFeatureIndex::simplified_instructions,
1410 Value: NumInstructionsSimplified);
1411 set(Feature: InlineCostFeatureIndex::constant_args, Value: NumConstantArgs);
1412 set(Feature: InlineCostFeatureIndex::constant_offset_ptr_args,
1413 Value: NumConstantOffsetPtrArgs);
1414 set(Feature: InlineCostFeatureIndex::sroa_savings, Value: SROACostSavingOpportunities);
1415
1416 if (NumVectorInstructions <= NumInstructions / 10)
1417 Threshold -= VectorBonus;
1418 else if (NumVectorInstructions <= NumInstructions / 2)
1419 Threshold -= VectorBonus / 2;
1420
1421 set(Feature: InlineCostFeatureIndex::threshold, Value: Threshold);
1422
1423 return InlineResult::success();
1424 }
1425
1426 bool shouldStop() override { return false; }
1427
1428 void onLoadEliminationOpportunity() override {
1429 increment(Feature: InlineCostFeatureIndex::load_elimination, Delta: 1);
1430 }
1431
1432 InlineResult onAnalysisStart() override {
1433 increment(Feature: InlineCostFeatureIndex::callsite_cost,
1434 Delta: -1 * getCallsiteCost(TTI, Call: this->CandidateCall, DL));
1435
1436 set(Feature: InlineCostFeatureIndex::cold_cc_penalty,
1437 Value: (F.getCallingConv() == CallingConv::Cold));
1438
1439 set(Feature: InlineCostFeatureIndex::last_call_to_static_bonus,
1440 Value: isSoleCallToLocalFunction(CB: CandidateCall, Callee: F));
1441
// FIXME: we shouldn't repeat this logic in both the Features and Cost
// analyzers; instead, we should abstract it into a common method in
// CallAnalyzer.
1445 int SingleBBBonusPercent = 50;
1446 int VectorBonusPercent = TTI.getInlinerVectorBonusPercent();
1447 Threshold += TTI.adjustInliningThreshold(CB: &CandidateCall);
1448 Threshold *= TTI.getInliningThresholdMultiplier();
1449 SingleBBBonus = Threshold * SingleBBBonusPercent / 100;
1450 VectorBonus = Threshold * VectorBonusPercent / 100;
1451 Threshold += (SingleBBBonus + VectorBonus);
1452
1453 return InlineResult::success();
1454 }
1455
1456public:
1457 InlineCostFeaturesAnalyzer(
1458 const TargetTransformInfo &TTI,
1459 function_ref<AssumptionCache &(Function &)> &GetAssumptionCache,
1460 function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
1461 function_ref<const TargetLibraryInfo &(Function &)> GetTLI,
1462 ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE, Function &Callee,
1463 CallBase &Call)
1464 : CallAnalyzer(Callee, Call, TTI, GetAssumptionCache, GetBFI, GetTLI,
1465 PSI) {}
1466
1467 const InlineCostFeatures &features() const { return Cost; }
1468};
1469
1470} // namespace
1471
1472/// Test whether the given value is an Alloca-derived function argument.
1473bool CallAnalyzer::isAllocaDerivedArg(Value *V) {
1474 return SROAArgValues.count(Val: V);
1475}
1476
1477void CallAnalyzer::disableSROAForArg(AllocaInst *SROAArg) {
1478 onDisableSROA(Arg: SROAArg);
1479 EnabledSROAAllocas.erase(V: SROAArg);
1480 disableLoadElimination();
1481}
1482
1483void InlineCostAnnotationWriter::emitInstructionAnnot(
1484 const Instruction *I, formatted_raw_ostream &OS) {
// The cost of inlining the given instruction is always printed.
// The threshold delta is printed only when it is non-zero, which happens
// when we decided to give a bonus at a particular instruction.
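// For illustration only (the numbers are hypothetical), an emitted
// annotation looks like:
//   ; cost before = 20, cost after = 25, threshold before = 225,
//   threshold after = 225, cost delta = 5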
1488 std::optional<InstructionCostDetail> Record = ICCA->getCostDetails(I);
1489 if (!Record)
1490 OS << "; No analysis for the instruction";
1491 else {
1492 OS << "; cost before = " << Record->CostBefore
1493 << ", cost after = " << Record->CostAfter
1494 << ", threshold before = " << Record->ThresholdBefore
1495 << ", threshold after = " << Record->ThresholdAfter << ", ";
1496 OS << "cost delta = " << Record->getCostDelta();
1497 if (Record->hasThresholdChanged())
1498 OS << ", threshold delta = " << Record->getThresholdDelta();
1499 }
1500 auto *V = ICCA->getSimplifiedValueUnchecked(V: const_cast<Instruction *>(I));
1501 if (V) {
1502 OS << ", simplified to ";
1503 V->print(O&: OS, IsForDebug: true);
1504 if (auto *VI = dyn_cast<Instruction>(Val: V)) {
1505 if (VI->getFunction() != I->getFunction())
1506 OS << " (caller instruction)";
1507 } else if (auto *VArg = dyn_cast<Argument>(Val: V)) {
1508 if (VArg->getParent() != I->getFunction())
1509 OS << " (caller argument)";
1510 }
1511 }
1512 OS << "\n";
1513}
1514
1515/// If 'V' maps to a SROA candidate, disable SROA for it.
1516void CallAnalyzer::disableSROA(Value *V) {
1517 if (auto *SROAArg = getSROAArgForValueOrNull(V)) {
1518 disableSROAForArg(SROAArg);
1519 }
1520}
1521
1522void CallAnalyzer::disableLoadElimination() {
1523 if (EnableLoadElimination) {
1524 onDisableLoadElimination();
1525 EnableLoadElimination = false;
1526 }
1527}
1528
1529/// Accumulate a constant GEP offset into an APInt if possible.
1530///
1531/// Returns false if unable to compute the offset for any reason. Respects any
1532/// simplified values known during the analysis of this callsite.
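/// For illustration only (hypothetical IR): with 64-bit indices, a GEP such
/// as
///   %p = getelementptr inbounds {i32, i32}, ptr %base, i64 1, i32 1
/// accumulates 8 (one {i32, i32} element) + 4 (field offset of member 1),
/// i.e. 12, into Offset.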
1533bool CallAnalyzer::accumulateGEPOffset(GEPOperator &GEP, APInt &Offset) {
1534 unsigned IntPtrWidth = DL.getIndexTypeSizeInBits(Ty: GEP.getType());
1535 assert(IntPtrWidth == Offset.getBitWidth());
1536
1537 for (gep_type_iterator GTI = gep_type_begin(GEP), GTE = gep_type_end(GEP);
1538 GTI != GTE; ++GTI) {
1539 ConstantInt *OpC =
1540 getDirectOrSimplifiedValue<ConstantInt>(V: GTI.getOperand());
1541 if (!OpC)
1542 return false;
1543 if (OpC->isZero())
1544 continue;
1545
1546 // Handle a struct index, which adds its field offset to the pointer.
1547 if (StructType *STy = GTI.getStructTypeOrNull()) {
1548 unsigned ElementIdx = OpC->getZExtValue();
1549 const StructLayout *SL = DL.getStructLayout(Ty: STy);
1550 Offset += APInt(IntPtrWidth, SL->getElementOffset(Idx: ElementIdx));
1551 continue;
1552 }
1553
1554 APInt TypeSize(IntPtrWidth, GTI.getSequentialElementStride(DL));
1555 Offset += OpC->getValue().sextOrTrunc(width: IntPtrWidth) * TypeSize;
1556 }
1557 return true;
1558}
1559
1560/// Use TTI to check whether a GEP is free.
1561///
1562/// Respects any simplified values known during the analysis of this callsite.
1563bool CallAnalyzer::isGEPFree(GetElementPtrInst &GEP) {
1564 SmallVector<Value *, 4> Operands;
1565 Operands.push_back(Elt: GEP.getOperand(i_nocapture: 0));
1566 for (const Use &Op : GEP.indices())
1567 if (Constant *SimpleOp = getSimplifiedValue<Constant>(V: Op))
1568 Operands.push_back(Elt: SimpleOp);
1569 else
1570 Operands.push_back(Elt: Op);
1571 return TTI.getInstructionCost(U: &GEP, Operands,
1572 CostKind: TargetTransformInfo::TCK_SizeAndLatency) ==
1573 TargetTransformInfo::TCC_Free;
1574}
1575
1576bool CallAnalyzer::visitAlloca(AllocaInst &I) {
1577 disableSROA(V: I.getOperand(i_nocapture: 0));
1578
1579 // Check whether inlining will turn a dynamic alloca into a static
1580 // alloca and handle that case.
1581 if (I.isArrayAllocation()) {
1582 Constant *Size = getSimplifiedValue<Constant>(V: I.getArraySize());
1583 if (auto *AllocSize = dyn_cast_or_null<ConstantInt>(Val: Size)) {
// Sometimes a dynamic alloca is converted into a static alloca by this
// constant propagation and becomes a huge static alloca on an
// unconditional CFG path. Avoid inlining if the accumulated size would
// exceed a threshold.
// FIXME: If the threshold is removed or lowered too much, we could end up
// being too pessimistic and prevent inlining non-problematic code. This
// could result in unintended perf regressions. A better overall strategy
// is needed to track stack usage during inlining.
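// For illustration: an `alloca i32, i32 %n` whose %n was simplified to a
// constant during this analysis is such a case; its size is accumulated
// below and checked against
// InlineConstants::MaxSimplifiedDynamicAllocaToInline.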
1592 Type *Ty = I.getAllocatedType();
1593 AllocatedSize = SaturatingMultiplyAdd(
1594 X: AllocSize->getLimitedValue(),
1595 Y: DL.getTypeAllocSize(Ty).getKnownMinValue(), A: AllocatedSize);
1596 if (AllocatedSize > InlineConstants::MaxSimplifiedDynamicAllocaToInline)
1597 HasDynamicAlloca = true;
1598 return false;
1599 }
1600 }
1601
1602 if (I.isStaticAlloca()) {
// Accumulate the allocated size when it is constant and executed once.
// Note: if AllocSize is a vscale quantity, this is an underestimate of the
// allocated size, and such an alloca also incurs some of the cost of a
// dynamic alloca, but it is recorded here as a constant-size alloca.
1607 TypeSize AllocSize = I.getAllocationSize(DL).value_or(u: TypeSize::getZero());
1608 AllocatedSize = SaturatingAdd(X: AllocSize.getKnownMinValue(), Y: AllocatedSize);
1609 } else {
1610 // FIXME: This is overly conservative. Dynamic allocas are inefficient for
1611 // a variety of reasons, and so we would like to not inline them into
1612 // functions which don't currently have a dynamic alloca. This simply
1613 // disables inlining altogether in the presence of a dynamic alloca.
1614 HasDynamicAlloca = true;
1615 }
1616
1617 return false;
1618}
1619
1620bool CallAnalyzer::visitPHI(PHINode &I) {
// FIXME: We need to propagate SROA *disabling* through phi nodes, even
// though we don't want to propagate its bonuses. The idea is to disable
// SROA if it *might* be used in an inappropriate manner.
1624
1625 // Phi nodes are always zero-cost.
// FIXME: Pointer sizes may differ between address spaces, so do we need to
// use the correct address space in the call to getPointerSizeInBits here?
// Or could we skip the getPointerSizeInBits call completely? As far as I
// can see, ZeroOffset is used as a dummy value, so we can probably use any
// bit width for it.
1631 APInt ZeroOffset = APInt::getZero(numBits: DL.getPointerSizeInBits(AS: 0));
1632 bool CheckSROA = I.getType()->isPointerTy();
1633
1634 // Track the constant or pointer with constant offset we've seen so far.
1635 Constant *FirstC = nullptr;
1636 std::pair<Value *, APInt> FirstBaseAndOffset = {nullptr, ZeroOffset};
1637 Value *FirstV = nullptr;
1638
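// For illustration only (hypothetical IR): a phi such as
//   %p = phi ptr [ %q, %bb1 ], [ %q, %bb2 ]
// where every live incoming value maps to the same (base, offset) pair is
// recorded as that pair below, while mixed or conflicting incoming values
// cause an early exit.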
1639 for (unsigned i = 0, e = I.getNumIncomingValues(); i != e; ++i) {
1640 BasicBlock *Pred = I.getIncomingBlock(i);
1641 // If the incoming block is dead, skip the incoming block.
1642 if (DeadBlocks.count(Ptr: Pred))
1643 continue;
1644 // If the parent block of phi is not the known successor of the incoming
1645 // block, skip the incoming block.
1646 BasicBlock *KnownSuccessor = KnownSuccessors[Pred];
1647 if (KnownSuccessor && KnownSuccessor != I.getParent())
1648 continue;
1649
1650 Value *V = I.getIncomingValue(i);
1651 // If the incoming value is this phi itself, skip the incoming value.
1652 if (&I == V)
1653 continue;
1654
1655 Constant *C = getDirectOrSimplifiedValue<Constant>(V);
1656
1657 std::pair<Value *, APInt> BaseAndOffset = {nullptr, ZeroOffset};
1658 if (!C && CheckSROA)
1659 BaseAndOffset = ConstantOffsetPtrs.lookup(Val: V);
1660
1661 if (!C && !BaseAndOffset.first)
1662 // The incoming value is neither a constant nor a pointer with constant
1663 // offset, exit early.
1664 return true;
1665
1666 if (FirstC) {
1667 if (FirstC == C)
1668 // If we've seen a constant incoming value before and it is the same
1669 // constant we see this time, continue checking the next incoming value.
1670 continue;
1671 // Otherwise early exit because we either see a different constant or saw
1672 // a constant before but we have a pointer with constant offset this time.
1673 return true;
1674 }
1675
1676 if (FirstV) {
1677 // The same logic as above, but check pointer with constant offset here.
1678 if (FirstBaseAndOffset == BaseAndOffset)
1679 continue;
1680 return true;
1681 }
1682
1683 if (C) {
1684 // This is the 1st time we've seen a constant, record it.
1685 FirstC = C;
1686 continue;
1687 }
1688
1689 // The remaining case is that this is the 1st time we've seen a pointer with
1690 // constant offset, record it.
1691 FirstV = V;
1692 FirstBaseAndOffset = BaseAndOffset;
1693 }
1694
1695 // Check if we can map phi to a constant.
1696 if (FirstC) {
1697 SimplifiedValues[&I] = FirstC;
1698 return true;
1699 }
1700
1701 // Check if we can map phi to a pointer with constant offset.
1702 if (FirstBaseAndOffset.first) {
1703 ConstantOffsetPtrs[&I] = std::move(FirstBaseAndOffset);
1704
1705 if (auto *SROAArg = getSROAArgForValueOrNull(V: FirstV))
1706 SROAArgValues[&I] = SROAArg;
1707 }
1708
1709 return true;
1710}
1711
/// Check whether we can fold GEPs of constant-offset call site argument
/// pointers. This requires target data and inbounds GEPs.
1714///
1715/// \return true if the specified GEP can be folded.
1716bool CallAnalyzer::canFoldInboundsGEP(GetElementPtrInst &I) {
1717 // Check if we have a base + offset for the pointer.
1718 std::pair<Value *, APInt> BaseAndOffset =
1719 ConstantOffsetPtrs.lookup(Val: I.getPointerOperand());
1720 if (!BaseAndOffset.first)
1721 return false;
1722
1723 // Check if the offset of this GEP is constant, and if so accumulate it
1724 // into Offset.
1725 if (!accumulateGEPOffset(GEP&: cast<GEPOperator>(Val&: I), Offset&: BaseAndOffset.second))
1726 return false;
1727
1728 // Add the result as a new mapping to Base + Offset.
1729 ConstantOffsetPtrs[&I] = std::move(BaseAndOffset);
1730
1731 return true;
1732}
1733
1734bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) {
1735 auto *SROAArg = getSROAArgForValueOrNull(V: I.getPointerOperand());
1736
1737 // Lambda to check whether a GEP's indices are all constant.
1738 auto IsGEPOffsetConstant = [&](GetElementPtrInst &GEP) {
1739 for (const Use &Op : GEP.indices())
1740 if (!getDirectOrSimplifiedValue<Constant>(V: Op))
1741 return false;
1742 return true;
1743 };
1744
1745 if (!DisableGEPConstOperand)
1746 if (simplifyInstruction(I))
1747 return true;
1748
1749 if ((I.isInBounds() && canFoldInboundsGEP(I)) || IsGEPOffsetConstant(I)) {
1750 if (SROAArg)
1751 SROAArgValues[&I] = SROAArg;
1752
1753 // Constant GEPs are modeled as free.
1754 return true;
1755 }
1756
1757 // Variable GEPs will require math and will disable SROA.
1758 if (SROAArg)
1759 disableSROAForArg(SROAArg);
1760 return isGEPFree(GEP&: I);
1761}
1762
// Simplify \p Cmp if its RHS is constant and its LHS can be value-tracked.
// This handles only the case where the Cmp guards a recursive call and the
// recursive call's argument forces the Cmp to a known result.
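// For illustration only (hypothetical IR): inside @f, given
//   %cmp = icmp sgt i32 %n, 0
//   br i1 %cmp, label %rec, label %exit
// where block %rec contains `call void @f(i32 0)`, re-evaluating %cmp with
// the call's argument folds it to false, proving the recursion stops after
// one level.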
1766bool CallAnalyzer::simplifyCmpInstForRecCall(CmpInst &Cmp) {
// Bail out if the LHS is not a function argument or the RHS is not a
// constant:
1768 if (!isa<Argument>(Val: Cmp.getOperand(i_nocapture: 0)) || !isa<Constant>(Val: Cmp.getOperand(i_nocapture: 1)))
1769 return false;
1770 auto *CmpOp = Cmp.getOperand(i_nocapture: 0);
1771 // Make sure that the callsite is recursive:
1772 if (CandidateCall.getCaller() != &F)
1773 return false;
1774 // Only handle the case when the callsite has a single predecessor:
1775 auto *CallBB = CandidateCall.getParent();
1776 auto *Predecessor = CallBB->getSinglePredecessor();
1777 if (!Predecessor)
1778 return false;
1779 // Check if the callsite is guarded by the same Cmp instruction:
1780 auto *Br = dyn_cast<CondBrInst>(Val: Predecessor->getTerminator());
1781 if (!Br || Br->getCondition() != &Cmp)
1782 return false;
1783
// Check whether any argument of the recursive callsite affects the cmp
// instruction:
1786 bool ArgFound = false;
1787 Value *FuncArg = nullptr, *CallArg = nullptr;
1788 for (unsigned ArgNum = 0;
1789 ArgNum < F.arg_size() && ArgNum < CandidateCall.arg_size(); ArgNum++) {
1790 FuncArg = F.getArg(i: ArgNum);
1791 CallArg = CandidateCall.getArgOperand(i: ArgNum);
1792 if (FuncArg == CmpOp && CallArg != CmpOp) {
1793 ArgFound = true;
1794 break;
1795 }
1796 }
1797 if (!ArgFound)
1798 return false;
1799
1800 // Now we have a recursive call that is guarded by a cmp instruction.
1801 // Check if this cmp can be simplified:
1802 SimplifyQuery SQ(DL, dyn_cast<Instruction>(Val: CallArg));
1803 CondContext CC(&Cmp);
1804 CC.Invert = (CallBB != Br->getSuccessor(i: 0));
1805 SQ.CC = &CC;
1806 CC.AffectedValues.insert(Ptr: FuncArg);
1807 Value *SimplifiedInstruction = llvm::simplifyInstructionWithOperands(
1808 I: cast<CmpInst>(Val: &Cmp), NewOps: {CallArg, Cmp.getOperand(i_nocapture: 1)}, Q: SQ);
1809 if (auto *ConstVal = dyn_cast_or_null<ConstantInt>(Val: SimplifiedInstruction)) {
1810 // Make sure that the BB of the recursive call is NOT the true successor
1811 // of the icmp. In other words, make sure that the recursion depth is 1.
1812 if ((ConstVal->isOne() && CC.Invert) ||
1813 (ConstVal->isZero() && !CC.Invert)) {
1814 SimplifiedValues[&Cmp] = ConstVal;
1815 return true;
1816 }
1817 }
1818 return false;
1819}
1820
1821/// Simplify \p I if its operands are constants and update SimplifiedValues.
1822bool CallAnalyzer::simplifyInstruction(Instruction &I) {
1823 SmallVector<Constant *> COps;
1824 for (Value *Op : I.operands()) {
1825 Constant *COp = getDirectOrSimplifiedValue<Constant>(V: Op);
1826 if (!COp)
1827 return false;
1828 COps.push_back(Elt: COp);
1829 }
1830 auto *C = ConstantFoldInstOperands(I: &I, Ops: COps, DL);
1831 if (!C)
1832 return false;
1833 SimplifiedValues[&I] = C;
1834 return true;
1835}
1836
1837/// Try to simplify a call to llvm.is.constant.
1838///
1839/// Duplicate the argument checking from CallAnalyzer::simplifyCallSite since
1840/// we expect calls of this specific intrinsic to be infrequent.
1841///
1842/// FIXME: Given that we know CB's parent (F) caller
1843/// (CandidateCall->getParent()->getParent()), we might be able to determine
1844/// whether inlining F into F's caller would change how the call to
1845/// llvm.is.constant would evaluate.
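/// For illustration only (hypothetical IR): if %x was mapped to a constant
/// during this analysis, `%r = call i1 @llvm.is.constant.i32(i32 %x)` is
/// folded to true below; otherwise it is folded to false, matching the
/// intrinsic's default lowering.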
1846bool CallAnalyzer::simplifyIntrinsicCallIsConstant(CallBase &CB) {
1847 Value *Arg = CB.getArgOperand(i: 0);
1848 auto *C = getDirectOrSimplifiedValue<Constant>(V: Arg);
1849
1850 Type *RT = CB.getFunctionType()->getReturnType();
1851 SimplifiedValues[&CB] = ConstantInt::get(Ty: RT, V: C ? 1 : 0);
1852 return true;
1853}
1854
1855bool CallAnalyzer::simplifyIntrinsicCallObjectSize(CallBase &CB) {
1856 // As per the langref, "The fourth argument to llvm.objectsize determines if
1857 // the value should be evaluated at runtime."
1858 if (cast<ConstantInt>(Val: CB.getArgOperand(i: 3))->isOne())
1859 return false;
1860
1861 Value *V = lowerObjectSizeCall(ObjectSize: &cast<IntrinsicInst>(Val&: CB), DL, TLI: nullptr,
1862 /*MustSucceed=*/true);
1863 Constant *C = dyn_cast_or_null<Constant>(Val: V);
1864 if (C)
1865 SimplifiedValues[&CB] = C;
1866 return C;
1867}
1868
1869bool CallAnalyzer::visitBitCast(BitCastInst &I) {
1870 // Propagate constants through bitcasts.
1871 if (simplifyInstruction(I))
1872 return true;
1873
1874 // Track base/offsets through casts
1875 std::pair<Value *, APInt> BaseAndOffset =
1876 ConstantOffsetPtrs.lookup(Val: I.getOperand(i_nocapture: 0));
1877 // Casts don't change the offset, just wrap it up.
1878 if (BaseAndOffset.first)
1879 ConstantOffsetPtrs[&I] = std::move(BaseAndOffset);
1880
1881 // Also look for SROA candidates here.
1882 if (auto *SROAArg = getSROAArgForValueOrNull(V: I.getOperand(i_nocapture: 0)))
1883 SROAArgValues[&I] = SROAArg;
1884
1885 // Bitcasts are always zero cost.
1886 return true;
1887}
1888
1889bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) {
1890 // Propagate constants through ptrtoint.
1891 if (simplifyInstruction(I))
1892 return true;
1893
1894 // Track base/offset pairs when converted to a plain integer provided the
1895 // integer is large enough to represent the pointer.
1896 unsigned IntegerSize = I.getType()->getScalarSizeInBits();
1897 unsigned AS = I.getOperand(i_nocapture: 0)->getType()->getPointerAddressSpace();
1898 if (IntegerSize == DL.getPointerSizeInBits(AS)) {
1899 std::pair<Value *, APInt> BaseAndOffset =
1900 ConstantOffsetPtrs.lookup(Val: I.getOperand(i_nocapture: 0));
1901 if (BaseAndOffset.first)
1902 ConstantOffsetPtrs[&I] = std::move(BaseAndOffset);
1903 }
1904
1905 // This is really weird. Technically, ptrtoint will disable SROA. However,
1906 // unless that ptrtoint is *used* somewhere in the live basic blocks after
1907 // inlining, it will be nuked, and SROA should proceed. All of the uses which
1908 // would block SROA would also block SROA if applied directly to a pointer,
1909 // and so we can just add the integer in here. The only places where SROA is
1910 // preserved either cannot fire on an integer, or won't in-and-of themselves
1911 // disable SROA (ext) w/o some later use that we would see and disable.
1912 if (auto *SROAArg = getSROAArgForValueOrNull(V: I.getOperand(i_nocapture: 0)))
1913 SROAArgValues[&I] = SROAArg;
1914
1915 return TTI.getInstructionCost(U: &I, CostKind: TargetTransformInfo::TCK_SizeAndLatency) ==
1916 TargetTransformInfo::TCC_Free;
1917}
1918
1919bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) {
// Propagate constants through inttoptr.
1921 if (simplifyInstruction(I))
1922 return true;
1923
1924 // Track base/offset pairs when round-tripped through a pointer without
1925 // modifications provided the integer is not too large.
1926 Value *Op = I.getOperand(i_nocapture: 0);
1927 unsigned IntegerSize = Op->getType()->getScalarSizeInBits();
if (IntegerSize <= DL.getPointerTypeSizeInBits(Ty: I.getType())) {
1929 std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Val: Op);
1930 if (BaseAndOffset.first)
1931 ConstantOffsetPtrs[&I] = std::move(BaseAndOffset);
1932 }
1933
1934 // "Propagate" SROA here in the same manner as we do for ptrtoint above.
1935 if (auto *SROAArg = getSROAArgForValueOrNull(V: Op))
1936 SROAArgValues[&I] = SROAArg;
1937
1938 return TTI.getInstructionCost(U: &I, CostKind: TargetTransformInfo::TCK_SizeAndLatency) ==
1939 TargetTransformInfo::TCC_Free;
1940}
1941
1942bool CallAnalyzer::visitCastInst(CastInst &I) {
1943 // Propagate constants through casts.
1944 if (simplifyInstruction(I))
1945 return true;
1946
1947 // Disable SROA in the face of arbitrary casts we don't explicitly list
1948 // elsewhere.
1949 disableSROA(V: I.getOperand(i_nocapture: 0));
1950
1951 // If this is a floating-point cast, and the target says this operation
1952 // is expensive, this may eventually become a library call. Treat the cost
1953 // as such.
1954 switch (I.getOpcode()) {
1955 case Instruction::FPTrunc:
1956 case Instruction::FPExt:
1957 case Instruction::UIToFP:
1958 case Instruction::SIToFP:
1959 case Instruction::FPToUI:
1960 case Instruction::FPToSI:
1961 if (TTI.getFPOpCost(Ty: I.getType()) == TargetTransformInfo::TCC_Expensive)
1962 onCallPenalty();
1963 break;
1964 default:
1965 break;
1966 }
1967
1968 return TTI.getInstructionCost(U: &I, CostKind: TargetTransformInfo::TCK_SizeAndLatency) ==
1969 TargetTransformInfo::TCC_Free;
1970}
1971
1972bool CallAnalyzer::paramHasAttr(Argument *A, Attribute::AttrKind Attr) {
1973 return CandidateCall.paramHasAttr(ArgNo: A->getArgNo(), Kind: Attr);
1974}
1975
1976bool CallAnalyzer::isKnownNonNullInCallee(Value *V) {
1977 // Does the *call site* have the NonNull attribute set on an argument? We
1978 // use the attribute on the call site to memoize any analysis done in the
1979 // caller. This will also trip if the callee function has a non-null
1980 // parameter attribute, but that's a less interesting case because hopefully
1981 // the callee would already have been simplified based on that.
1982 if (Argument *A = dyn_cast<Argument>(Val: V))
1983 if (paramHasAttr(A, Attr: Attribute::NonNull))
1984 return true;
1985
1986 // Is this an alloca in the caller? This is distinct from the attribute case
1987 // above because attributes aren't updated within the inliner itself and we
1988 // always want to catch the alloca derived case.
1989 if (isAllocaDerivedArg(V))
1990 // We can actually predict the result of comparisons between an
1991 // alloca-derived value and null. Note that this fires regardless of
1992 // SROA firing.
1993 return true;
1994
1995 return false;
1996}
1997
1998bool CallAnalyzer::allowSizeGrowth(CallBase &Call) {
1999 // If the normal destination of the invoke or the parent block of the call
2000 // site is unreachable-terminated, there is little point in inlining this
2001 // unless there is literally zero cost.
2002 // FIXME: Note that it is possible that an unreachable-terminated block has a
// hot entry. For example, in the scenario below, inlining hot_call_X() may
// be beneficial:
2005 // main() {
2006 // hot_call_1();
2007 // ...
2008 // hot_call_N()
2009 // exit(0);
2010 // }
// For now, we are not handling this corner case here as it is rare in real
// code. In the future, we should elaborate on this based on BPI and BFI in
// more general threshold-adjusting heuristics in updateThreshold().
2014 if (InvokeInst *II = dyn_cast<InvokeInst>(Val: &Call)) {
2015 if (isa<UnreachableInst>(Val: II->getNormalDest()->getTerminator()))
2016 return false;
2017 } else if (isa<UnreachableInst>(Val: Call.getParent()->getTerminator()))
2018 return false;
2019
2020 return true;
2021}
2022
2023bool InlineCostCallAnalyzer::isColdCallSite(CallBase &Call,
2024 BlockFrequencyInfo *CallerBFI) {
2025 // If global profile summary is available, then callsite's coldness is
2026 // determined based on that.
2027 if (PSI && PSI->hasProfileSummary())
2028 return PSI->isColdCallSite(CB: Call, BFI: CallerBFI);
2029
2030 // Otherwise we need BFI to be available.
2031 if (!CallerBFI)
2032 return false;
2033
// Determine if the callsite is cold relative to the caller's entry. We
// could potentially cache the computation of the scaled entry frequency,
// but the added complexity is not worth it unless this scaling shows up
// high in the profiles.
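// For illustration: if ColdCallSiteRelFreq is 2, a callsite whose block
// frequency is below 2% of the caller's entry frequency is treated as cold.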
2038 const BranchProbability ColdProb(ColdCallSiteRelFreq, 100);
2039 auto CallSiteBB = Call.getParent();
2040 auto CallSiteFreq = CallerBFI->getBlockFreq(BB: CallSiteBB);
2041 auto CallerEntryFreq =
2042 CallerBFI->getBlockFreq(BB: &(Call.getCaller()->getEntryBlock()));
2043 return CallSiteFreq < CallerEntryFreq * ColdProb;
2044}
2045
2046std::optional<int>
2047InlineCostCallAnalyzer::getHotCallSiteThreshold(CallBase &Call,
2048 BlockFrequencyInfo *CallerBFI) {
2049
2050 // If global profile summary is available, then callsite's hotness is
2051 // determined based on that.
2052 if (PSI && PSI->hasProfileSummary() && PSI->isHotCallSite(CB: Call, BFI: CallerBFI))
2053 return Params.HotCallSiteThreshold;
2054
2055 // Otherwise we need BFI to be available and to have a locally hot callsite
2056 // threshold.
2057 if (!CallerBFI || !Params.LocallyHotCallSiteThreshold)
2058 return std::nullopt;
2059
// Determine if the callsite is hot relative to the caller's entry. We
// could potentially cache the computation of the scaled entry frequency,
// but the added complexity is not worth it unless this scaling shows up
// high in the profiles.
2064 const BasicBlock *CallSiteBB = Call.getParent();
2065 BlockFrequency CallSiteFreq = CallerBFI->getBlockFreq(BB: CallSiteBB);
2066 BlockFrequency CallerEntryFreq = CallerBFI->getEntryFreq();
2067 std::optional<BlockFrequency> Limit = CallerEntryFreq.mul(Factor: HotCallSiteRelFreq);
2068 if (Limit && CallSiteFreq >= *Limit)
2069 return Params.LocallyHotCallSiteThreshold;
2070
2071 // Otherwise treat it normally.
2072 return std::nullopt;
2073}
2074
2075void InlineCostCallAnalyzer::updateThreshold(CallBase &Call, Function &Callee) {
2076 // If no size growth is allowed for this inlining, set Threshold to 0.
2077 if (!allowSizeGrowth(Call)) {
2078 Threshold = 0;
2079 return;
2080 }
2081
2082 Function *Caller = Call.getCaller();
2083
2084 // return min(A, B) if B is valid.
2085 auto MinIfValid = [](int A, std::optional<int> B) {
2086 return B ? std::min(a: A, b: *B) : A;
2087 };
2088
2089 // return max(A, B) if B is valid.
2090 auto MaxIfValid = [](int A, std::optional<int> B) {
2091 return B ? std::max(a: A, b: *B) : A;
2092 };
2093
2094 // Various bonus percentages. These are multiplied by Threshold to get the
2095 // bonus values.
2096 // SingleBBBonus: This bonus is applied if the callee has a single reachable
2097 // basic block at the given callsite context. This is speculatively applied
2098 // and withdrawn if more than one basic block is seen.
2099 //
// LastCallToStaticBonus: This large bonus is applied to ensure the
// inlining of the last call to a static function, as inlining such
// functions is guaranteed to reduce code size.
2103 //
2104 // These bonus percentages may be set to 0 based on properties of the caller
2105 // and the callsite.
2106 int SingleBBBonusPercent = 50;
2107 int VectorBonusPercent = TTI.getInlinerVectorBonusPercent();
2108 int LastCallToStaticBonus = TTI.getInliningLastCallToStaticBonus();
2109
2110 // Lambda to set all the above bonus and bonus percentages to 0.
2111 auto DisallowAllBonuses = [&]() {
2112 SingleBBBonusPercent = 0;
2113 VectorBonusPercent = 0;
2114 LastCallToStaticBonus = 0;
2115 };
2116
2117 // Use the OptMinSizeThreshold or OptSizeThreshold knob if they are available
2118 // and reduce the threshold if the caller has the necessary attribute.
2119 if (Caller->hasMinSize()) {
2120 Threshold = MinIfValid(Threshold, Params.OptMinSizeThreshold);
2121 // For minsize, we want to disable the single BB bonus and the vector
2122 // bonuses, but not the last-call-to-static bonus. Inlining the last call to
2123 // a static function will, at the minimum, eliminate the parameter setup and
2124 // call/return instructions.
2125 SingleBBBonusPercent = 0;
2126 VectorBonusPercent = 0;
2127 } else if (Caller->hasOptSize())
2128 Threshold = MinIfValid(Threshold, Params.OptSizeThreshold);
2129
2130 // Adjust the threshold based on inlinehint attribute and profile based
2131 // hotness information if the caller does not have MinSize attribute.
2132 if (!Caller->hasMinSize()) {
2133 if (Callee.hasFnAttribute(Kind: Attribute::InlineHint))
2134 Threshold = MaxIfValid(Threshold, Params.HintThreshold);
2135
2136 // FIXME: After switching to the new passmanager, simplify the logic below
2137 // by checking only the callsite hotness/coldness as we will reliably
2138 // have local profile information.
2139 //
2140 // Callsite hotness and coldness can be determined if sample profile is
2141 // used (which adds hotness metadata to calls) or if caller's
2142 // BlockFrequencyInfo is available.
2143 BlockFrequencyInfo *CallerBFI = GetBFI ? &(GetBFI(*Caller)) : nullptr;
2144 auto HotCallSiteThreshold = getHotCallSiteThreshold(Call, CallerBFI);
2145 if (!Caller->hasOptSize() && HotCallSiteThreshold) {
2146 LLVM_DEBUG(dbgs() << "Hot callsite.\n");
2147 // FIXME: This should update the threshold only if it exceeds the
2148 // current threshold, but AutoFDO + ThinLTO currently relies on this
2149 // behavior to prevent inlining of hot callsites during ThinLTO
2150 // compile phase.
2151 Threshold = *HotCallSiteThreshold;
2152 } else if (isColdCallSite(Call, CallerBFI)) {
2153 LLVM_DEBUG(dbgs() << "Cold callsite.\n");
2154 // Do not apply bonuses for a cold callsite including the
2155 // LastCallToStatic bonus. While this bonus might result in code size
2156 // reduction, it can cause the size of a non-cold caller to increase
2157 // preventing it from being inlined.
2158 DisallowAllBonuses();
2159 Threshold = MinIfValid(Threshold, Params.ColdCallSiteThreshold);
2160 } else if (PSI) {
2161 // Use callee's global profile information only if we have no way of
2162 // determining this via callsite information.
2163 if (PSI->isFunctionEntryHot(F: &Callee)) {
2164 LLVM_DEBUG(dbgs() << "Hot callee.\n");
2165 // If callsite hotness can not be determined, we may still know
2166 // that the callee is hot and treat it as a weaker hint for threshold
2167 // increase.
2168 Threshold = MaxIfValid(Threshold, Params.HintThreshold);
2169 } else if (PSI->isFunctionEntryCold(F: &Callee)) {
2170 LLVM_DEBUG(dbgs() << "Cold callee.\n");
2171 // Do not apply bonuses for a cold callee including the
2172 // LastCallToStatic bonus. While this bonus might result in code size
2173 // reduction, it can cause the size of a non-cold caller to increase
2174 // preventing it from being inlined.
2175 DisallowAllBonuses();
2176 Threshold = MinIfValid(Threshold, Params.ColdThreshold);
2177 }
2178 }
2179 }
2180
2181 Threshold += TTI.adjustInliningThreshold(CB: &Call);
2182
2183 // Finally, take the target-specific inlining threshold multiplier into
2184 // account.
2185 Threshold *= TTI.getInliningThresholdMultiplier();
2186
2187 SingleBBBonus = Threshold * SingleBBBonusPercent / 100;
2188 VectorBonus = Threshold * VectorBonusPercent / 100;
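// For illustration: with the default Threshold of 225, a multiplier of 1,
// SingleBBBonusPercent = 50, and a hypothetical VectorBonusPercent of 150,
// the bonuses come to 112 and 337 respectively; they are withdrawn later
// if the callee turns out not to qualify for them.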
2189
2190 // If there is only one call of the function, and it has internal linkage,
2191 // the cost of inlining it drops dramatically. It may seem odd to update
2192 // Cost in updateThreshold, but the bonus depends on the logic in this method.
2193 if (isSoleCallToLocalFunction(CB: Call, Callee: F)) {
2194 addCost(Inc: -LastCallToStaticBonus);
2195 StaticBonusApplied = LastCallToStaticBonus;
2196 }
2197}
2198
2199bool CallAnalyzer::visitCmpInst(CmpInst &I) {
2200 Value *LHS = I.getOperand(i_nocapture: 0), *RHS = I.getOperand(i_nocapture: 1);
2201 // First try to handle simplified comparisons.
2202 if (simplifyInstruction(I))
2203 return true;
2204
// Try to handle comparisons that can be simplified using ValueTracking.
2206 if (simplifyCmpInstForRecCall(Cmp&: I))
2207 return true;
2208
2209 if (I.getOpcode() == Instruction::FCmp)
2210 return false;
2211
2212 // Otherwise look for a comparison between constant offset pointers with
2213 // a common base.
2214 Value *LHSBase, *RHSBase;
2215 APInt LHSOffset, RHSOffset;
2216 std::tie(args&: LHSBase, args&: LHSOffset) = ConstantOffsetPtrs.lookup(Val: LHS);
2217 if (LHSBase) {
2218 std::tie(args&: RHSBase, args&: RHSOffset) = ConstantOffsetPtrs.lookup(Val: RHS);
2219 if (RHSBase && LHSBase == RHSBase) {
2220 // We have common bases, fold the icmp to a constant based on the
2221 // offsets.
2222 SimplifiedValues[&I] = ConstantInt::getBool(
2223 Ty: I.getType(),
2224 V: ICmpInst::compare(LHS: LHSOffset, RHS: RHSOffset, Pred: I.getPredicate()));
2225 ++NumConstantPtrCmps;
2226 return true;
2227 }
2228 }
2229
2230 auto isImplicitNullCheckCmp = [](const CmpInst &I) {
2231 for (auto *User : I.users())
2232 if (auto *Instr = dyn_cast<Instruction>(Val: User))
2233 if (!Instr->getMetadata(KindID: LLVMContext::MD_make_implicit))
2234 return false;
2235 return true;
2236 };
2237
// If the comparison is an equality comparison with null, we can simplify
// it if we know the value (argument) can't be null.
2240 if (I.isEquality() && isa<ConstantPointerNull>(Val: I.getOperand(i_nocapture: 1))) {
2241 if (isKnownNonNullInCallee(V: I.getOperand(i_nocapture: 0))) {
2242 bool IsNotEqual = I.getPredicate() == CmpInst::ICMP_NE;
2243 SimplifiedValues[&I] = IsNotEqual ? ConstantInt::getTrue(Ty: I.getType())
2244 : ConstantInt::getFalse(Ty: I.getType());
2245 return true;
2246 }
2247 // Implicit null checks act as unconditional branches and their comparisons
2248 // should be treated as simplified and free of cost.
2249 if (isImplicitNullCheckCmp(I))
2250 return true;
2251 }
2252 return handleSROA(V: I.getOperand(i_nocapture: 0), DoNotDisable: isa<ConstantPointerNull>(Val: I.getOperand(i_nocapture: 1)));
2253}
2254
2255bool CallAnalyzer::visitSub(BinaryOperator &I) {
2256 // Try to handle a special case: we can fold computing the difference of two
2257 // constant-related pointers.
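// For illustration only (hypothetical IR): if %a maps to (%base, 16) and
// %b maps to (%base, 4), subtracting their ptrtoint values folds to the
// constant 12 below.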
2258 Value *LHS = I.getOperand(i_nocapture: 0), *RHS = I.getOperand(i_nocapture: 1);
2259 Value *LHSBase, *RHSBase;
2260 APInt LHSOffset, RHSOffset;
2261 std::tie(args&: LHSBase, args&: LHSOffset) = ConstantOffsetPtrs.lookup(Val: LHS);
2262 if (LHSBase) {
2263 std::tie(args&: RHSBase, args&: RHSOffset) = ConstantOffsetPtrs.lookup(Val: RHS);
2264 if (RHSBase && LHSBase == RHSBase) {
2265 // We have common bases, fold the subtract to a constant based on the
2266 // offsets.
2267 Constant *CLHS = ConstantInt::get(Context&: LHS->getContext(), V: LHSOffset);
2268 Constant *CRHS = ConstantInt::get(Context&: RHS->getContext(), V: RHSOffset);
2269 if (Constant *C = ConstantExpr::getSub(C1: CLHS, C2: CRHS)) {
2270 SimplifiedValues[&I] = C;
2271 ++NumConstantPtrDiffs;
2272 return true;
2273 }
2274 }
2275 }
2276
2277 // Otherwise, fall back to the generic logic for simplifying and handling
2278 // instructions.
2279 return Base::visitSub(I);
2280}
2281
2282bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) {
2283 Value *LHS = I.getOperand(i_nocapture: 0), *RHS = I.getOperand(i_nocapture: 1);
2284 Constant *CLHS = getDirectOrSimplifiedValue<Constant>(V: LHS);
2285 Constant *CRHS = getDirectOrSimplifiedValue<Constant>(V: RHS);
2286
2287 Value *SimpleV = nullptr;
2288 if (auto FI = dyn_cast<FPMathOperator>(Val: &I))
2289 SimpleV = simplifyBinOp(Opcode: I.getOpcode(), LHS: CLHS ? CLHS : LHS, RHS: CRHS ? CRHS : RHS,
2290 FMF: FI->getFastMathFlags(), Q: DL);
2291 else
2292 SimpleV =
2293 simplifyBinOp(Opcode: I.getOpcode(), LHS: CLHS ? CLHS : LHS, RHS: CRHS ? CRHS : RHS, Q: DL);
2294
2295 if (Constant *C = dyn_cast_or_null<Constant>(Val: SimpleV))
2296 SimplifiedValues[&I] = C;
2297
2298 if (SimpleV)
2299 return true;
2300
2301 // Disable any SROA on arguments to arbitrary, unsimplified binary operators.
2302 disableSROA(V: LHS);
2303 disableSROA(V: RHS);
2304
// If the instruction is floating point, and the target says this operation
// is expensive, this may eventually become a library call. Treat the cost
// as such, unless it's an fneg, which can be implemented with an xor.
2308 using namespace llvm::PatternMatch;
2309 if (I.getType()->isFloatingPointTy() &&
2310 TTI.getFPOpCost(Ty: I.getType()) == TargetTransformInfo::TCC_Expensive &&
2311 !match(V: &I, P: m_FNeg(X: m_Value())))
2312 onCallPenalty();
2313
2314 return false;
2315}
2316
2317bool CallAnalyzer::visitFNeg(UnaryOperator &I) {
2318 Value *Op = I.getOperand(i_nocapture: 0);
2319 Constant *COp = getDirectOrSimplifiedValue<Constant>(V: Op);
2320
2321 Value *SimpleV = simplifyFNegInst(
2322 Op: COp ? COp : Op, FMF: cast<FPMathOperator>(Val&: I).getFastMathFlags(), Q: DL);
2323
2324 if (Constant *C = dyn_cast_or_null<Constant>(Val: SimpleV))
2325 SimplifiedValues[&I] = C;
2326
2327 if (SimpleV)
2328 return true;
2329
2330 // Disable any SROA on arguments to arbitrary, unsimplified fneg.
2331 disableSROA(V: Op);
2332
2333 return false;
2334}
2335
2336bool CallAnalyzer::visitLoad(LoadInst &I) {
2337 if (handleSROA(V: I.getPointerOperand(), DoNotDisable: I.isSimple()))
2338 return true;
2339
2340 // If the data is already loaded from this address and hasn't been clobbered
2341 // by any stores or calls, this load is likely to be redundant and can be
2342 // eliminated.
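// For illustration: a second simple load of the same pointer with no
// intervening store or call is modeled as free via
// onLoadEliminationOpportunity() below.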
2343 if (EnableLoadElimination &&
2344 !LoadAddrSet.insert(Ptr: I.getPointerOperand()).second && I.isUnordered()) {
2345 onLoadEliminationOpportunity();
2346 return true;
2347 }
2348
2349 onMemAccess();
2350 return false;
2351}
2352
2353bool CallAnalyzer::visitStore(StoreInst &I) {
2354 if (handleSROA(V: I.getPointerOperand(), DoNotDisable: I.isSimple()))
2355 return true;
2356
2357 // The store can potentially clobber loads and prevent repeated loads from
2358 // being eliminated.
2359 // FIXME:
// 1. We can probably keep an initial set of eliminatable loads subtracted
2361 // from the cost even when we finally see a store. We just need to disable
2362 // *further* accumulation of elimination savings.
2363 // 2. We should probably at some point thread MemorySSA for the callee into
2364 // this and then use that to actually compute *really* precise savings.
2365 disableLoadElimination();
2366
2367 onMemAccess();
2368 return false;
2369}
2370
2371bool CallAnalyzer::visitExtractValue(ExtractValueInst &I) {
2372 Value *Op = I.getAggregateOperand();
2373
2374 // Special handling, because we want to simplify extractvalue with a
2375 // potential insertvalue from the caller.
2376 if (Value *SimpleOp = getSimplifiedValueUnchecked(V: Op)) {
2377 SimplifyQuery SQ(DL);
2378 Value *SimpleV = simplifyExtractValueInst(Agg: SimpleOp, Idxs: I.getIndices(), Q: SQ);
2379 if (SimpleV) {
2380 SimplifiedValues[&I] = SimpleV;
2381 return true;
2382 }
2383 }
2384
2385 // SROA can't look through these, but they may be free.
2386 return Base::visitExtractValue(I);
2387}
2388
2389bool CallAnalyzer::visitInsertValue(InsertValueInst &I) {
2390 // Constant folding for insert value is trivial.
2391 if (simplifyInstruction(I))
2392 return true;
2393
2394 // SROA can't look through these, but they may be free.
2395 return Base::visitInsertValue(I);
2396}
2397
2398/// Try to simplify a call site.
2399///
2400/// Takes a concrete function and callsite and tries to actually simplify it by
2401/// analyzing the arguments and call itself with instsimplify. Returns true if
2402/// it has simplified the callsite to some other entity (a constant), making it
2403/// free.
2404bool CallAnalyzer::simplifyCallSite(Function *F, CallBase &Call) {
2405 // FIXME: Using the instsimplify logic directly for this is inefficient
2406 // because we have to continually rebuild the argument list even when no
2407 // simplifications can be performed. Until that is fixed with remapping
2408 // inside of instsimplify, directly constant fold calls here.
2409 if (!canConstantFoldCallTo(Call: &Call, F))
2410 return false;
2411
2412 // Try to re-map the arguments to constants.
2413 SmallVector<Constant *, 4> ConstantArgs;
2414 ConstantArgs.reserve(N: Call.arg_size());
2415 for (Value *I : Call.args()) {
2416 Constant *C = getDirectOrSimplifiedValue<Constant>(V: I);
2417 if (!C)
2418 return false; // This argument doesn't map to a constant.
2419
2420 ConstantArgs.push_back(Elt: C);
2421 }
2422 if (Constant *C = ConstantFoldCall(Call: &Call, F, Operands: ConstantArgs)) {
2423 SimplifiedValues[&Call] = C;
2424 return true;
2425 }
2426
2427 return false;
2428}
2429
2430bool CallAnalyzer::isLoweredToCall(Function *F, CallBase &Call) {
2431 const TargetLibraryInfo *TLI = GetTLI ? &GetTLI(*F) : nullptr;
2432 LibFunc LF;
2433 if (!TLI || !TLI->getLibFunc(FDecl: *F, F&: LF) || !TLI->has(F: LF))
2434 return TTI.isLoweredToCall(F);
2435
2436 switch (LF) {
2437 case LibFunc_memcpy_chk:
2438 case LibFunc_memmove_chk:
2439 case LibFunc_mempcpy_chk:
2440 case LibFunc_memset_chk: {
2441 // Calls to __memcpy_chk whose length is known to fit within the object
2442 // size will eventually be replaced by inline stores. Therefore, these
2443 // should not incur a call penalty. This is only really relevant on
2444 // platforms whose headers redirect memcpy to __memcpy_chk (e.g. Darwin), as
2445 // other platforms use memcpy intrinsics, which are already exempt from the
2446 // call penalty.
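// For illustration: a call like `__memcpy_chk(dst, src, /*len=*/16,
// /*objsize=*/32)` whose operands are known constants here fits within the
// object size, so no call penalty is charged.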
2447 auto *LenOp = getDirectOrSimplifiedValue<ConstantInt>(V: Call.getOperand(i_nocapture: 2));
2448 auto *ObjSizeOp =
2449 getDirectOrSimplifiedValue<ConstantInt>(V: Call.getOperand(i_nocapture: 3));
2450 if (LenOp && ObjSizeOp &&
2451 LenOp->getLimitedValue() <= ObjSizeOp->getLimitedValue()) {
2452 return false;
2453 }
2454 break;
2455 }
2456 default:
2457 break;
2458 }
2459
2460 return TTI.isLoweredToCall(F);
2461}
2462
2463bool CallAnalyzer::visitCallBase(CallBase &Call) {
2464 if (!onCallBaseVisitStart(Call))
2465 return true;
2466
2467 if (Call.hasFnAttr(Kind: Attribute::ReturnsTwice) &&
2468 !F.hasFnAttribute(Kind: Attribute::ReturnsTwice)) {
2469 // This aborts the entire analysis.
2470 ExposesReturnsTwice = true;
2471 return false;
2472 }
2473 if (isa<CallInst>(Val: Call) && cast<CallInst>(Val&: Call).cannotDuplicate())
2474 ContainsNoDuplicateCall = true;
2475
2476 if (InlineAsm *InlineAsmOp = dyn_cast<InlineAsm>(Val: Call.getCalledOperand()))
2477 onInlineAsm(Arg: *InlineAsmOp);
2478
2479 Function *F = Call.getCalledFunction();
2480 bool IsIndirectCall = !F;
2481 if (IsIndirectCall) {
2482 // Check if this happens to be an indirect function call to a known function
2483 // in this inline context. If not, we've done all we can.
2484 Value *Callee = Call.getCalledOperand();
2485 F = getSimplifiedValue<Function>(V: Callee);
2486 if (!F || F->getFunctionType() != Call.getFunctionType()) {
2487 onCallArgumentSetup(Call);
2488
2489 if (!Call.onlyReadsMemory())
2490 disableLoadElimination();
2491 return Base::visitCallBase(I&: Call);
2492 }
2493 }
2494
2495 assert(F && "Expected a call to a known function");
2496
2497 // When we have a concrete function, first try to simplify it directly.
2498 if (simplifyCallSite(F, Call))
2499 return true;
2500
2501 // Next check if it is an intrinsic we know about.
2502 // FIXME: Lift this into part of the InstVisitor.
2503 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Val: &Call)) {
2504 switch (II->getIntrinsicID()) {
2505 default:
2506 if (!Call.onlyReadsMemory() && !isAssumeLikeIntrinsic(I: II))
2507 disableLoadElimination();
2508 return Base::visitCallBase(I&: Call);
2509
2510 case Intrinsic::load_relative:
2511 onLoadRelativeIntrinsic();
2512 return false;
2513
2514 case Intrinsic::memset:
2515 case Intrinsic::memcpy:
2516 case Intrinsic::memmove:
2517 disableLoadElimination();
2518 // SROA can usually chew through these intrinsics, but they aren't free.
2519 return false;
2520 case Intrinsic::icall_branch_funnel:
2521 case Intrinsic::localescape:
2522 HasUninlineableIntrinsic = true;
2523 return false;
2524 case Intrinsic::vastart:
2525 InitsVargArgs = true;
2526 return false;
2527 case Intrinsic::launder_invariant_group:
2528 case Intrinsic::strip_invariant_group:
2529 if (auto *SROAArg = getSROAArgForValueOrNull(V: II->getOperand(i_nocapture: 0)))
2530 SROAArgValues[II] = SROAArg;
2531 return true;
2532 case Intrinsic::is_constant:
2533 return simplifyIntrinsicCallIsConstant(CB&: Call);
2534 case Intrinsic::objectsize:
2535 return simplifyIntrinsicCallObjectSize(CB&: Call);
2536 }
2537 }
2538
2539 if (F == Call.getFunction()) {
2540 // This flag will fully abort the analysis, so don't bother with anything
2541 // else.
2542 IsRecursiveCall = true;
2543 if (!AllowRecursiveCall)
2544 return false;
2545 }
2546
2547 if (isLoweredToCall(F, Call)) {
2548 onLoweredCall(F, Call, IsIndirectCall);
2549 }
2550
2551 if (!(Call.onlyReadsMemory() || (IsIndirectCall && F->onlyReadsMemory())))
2552 disableLoadElimination();
2553 return Base::visitCallBase(I&: Call);
2554}
2555
2556bool CallAnalyzer::visitReturnInst(ReturnInst &RI) {
2557 // At least one return instruction will be free after inlining.
2558 bool Free = !HasReturn;
2559 HasReturn = true;
2560 return Free;
2561}
2562
2563bool CallAnalyzer::visitUncondBrInst(UncondBrInst &BI) {
2564 // We model unconditional branches as essentially free -- they really
2565 // shouldn't exist at all, but handling them makes the behavior of the
2566 // inliner more regular and predictable.
2567 return true;
2568}
2569
2570bool CallAnalyzer::visitCondBrInst(CondBrInst &BI) {
2571 // Conditional branches which will fold away are free.
2572 return getDirectOrSimplifiedValue<ConstantInt>(V: BI.getCondition()) ||
2573 BI.getMetadata(KindID: LLVMContext::MD_make_implicit);
2574}
2575
2576bool CallAnalyzer::visitSelectInst(SelectInst &SI) {
2577 bool CheckSROA = SI.getType()->isPointerTy();
2578 Value *TrueVal = SI.getTrueValue();
2579 Value *FalseVal = SI.getFalseValue();
2580
2581 Constant *TrueC = getDirectOrSimplifiedValue<Constant>(V: TrueVal);
2582 Constant *FalseC = getDirectOrSimplifiedValue<Constant>(V: FalseVal);
2583 Constant *CondC = getSimplifiedValue<Constant>(V: SI.getCondition());
2584
2585 if (!CondC) {
2586 // Select C, X, X => X
2587 if (TrueC == FalseC && TrueC) {
2588 SimplifiedValues[&SI] = TrueC;
2589 return true;
2590 }
2591
2592 if (!CheckSROA)
2593 return Base::visitSelectInst(I&: SI);
2594
2595 std::pair<Value *, APInt> TrueBaseAndOffset =
2596 ConstantOffsetPtrs.lookup(Val: TrueVal);
2597 std::pair<Value *, APInt> FalseBaseAndOffset =
2598 ConstantOffsetPtrs.lookup(Val: FalseVal);
2599 if (TrueBaseAndOffset == FalseBaseAndOffset && TrueBaseAndOffset.first) {
2600 ConstantOffsetPtrs[&SI] = std::move(TrueBaseAndOffset);
2601
2602 if (auto *SROAArg = getSROAArgForValueOrNull(V: TrueVal))
2603 SROAArgValues[&SI] = SROAArg;
2604 return true;
2605 }
2606
2607 return Base::visitSelectInst(I&: SI);
2608 }
2609
2610 // Select condition is a constant.
2611 Value *SelectedV = CondC->isAllOnesValue() ? TrueVal
2612 : (CondC->isNullValue()) ? FalseVal
2613 : nullptr;
2614 if (!SelectedV) {
// Condition is a vector constant that is not all 1s or all 0s. If all
// operands are constants, ConstantFoldSelectInstruction() can handle
// cases such as vector selects.
2618 if (TrueC && FalseC) {
2619 if (auto *C = ConstantFoldSelectInstruction(Cond: CondC, V1: TrueC, V2: FalseC)) {
2620 SimplifiedValues[&SI] = C;
2621 return true;
2622 }
2623 }
2624 return Base::visitSelectInst(I&: SI);
2625 }
2626
2627 // Condition is either all 1s or all 0s. SI can be simplified.
2628 if (Constant *SelectedC = dyn_cast<Constant>(Val: SelectedV)) {
2629 SimplifiedValues[&SI] = SelectedC;
2630 return true;
2631 }
2632
2633 if (!CheckSROA)
2634 return true;
2635
2636 std::pair<Value *, APInt> BaseAndOffset =
2637 ConstantOffsetPtrs.lookup(Val: SelectedV);
2638 if (BaseAndOffset.first) {
2639 ConstantOffsetPtrs[&SI] = std::move(BaseAndOffset);
2640
2641 if (auto *SROAArg = getSROAArgForValueOrNull(V: SelectedV))
2642 SROAArgValues[&SI] = SROAArg;
2643 }
2644
2645 return true;
2646}
2647
2648bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) {
2649 // We model unconditional switches as free, see the comments on handling
2650 // branches.
2651 if (getDirectOrSimplifiedValue<ConstantInt>(V: SI.getCondition()))
2652 return true;
2653
2654 // Assume the most general case where the switch is lowered into
2655 // either a jump table, bit test, or a balanced binary tree consisting of
2656 // case clusters without merging adjacent clusters with the same
2657 // destination. We do not consider the switches that are lowered with a mix
2658 // of jump table/bit test/binary search tree. The cost of the switch is
2659 // proportional to the size of the tree or the size of jump table range.
2660 //
2661 // NB: We convert large switches which are just used to initialize large phi
2662 // nodes to lookup tables instead in simplifycfg, so this shouldn't prevent
2663 // inlining those. It will prevent inlining in cases where the optimization
2664 // does not (yet) fire.
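// For illustration: a jump table pays a cost proportional to the table
// size plus a fixed setup cost, while, e.g., 16 case clusters searched as
// a balanced binary tree need on the order of log2(16) = 4 comparisons per
// lookup; getExpectedNumberOfCompare() approximates the number of compares
// materialized for such a tree.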
2665
2666 unsigned JumpTableSize = 0;
2667 BlockFrequencyInfo *BFI = GetBFI ? &(GetBFI(F)) : nullptr;
2668 unsigned NumCaseCluster =
2669 TTI.getEstimatedNumberOfCaseClusters(SI, JTSize&: JumpTableSize, PSI, BFI);
2670
2671 onFinalizeSwitch(JumpTableSize, NumCaseCluster, DefaultDestUnreachable: SI.defaultDestUnreachable());
2672 return false;
2673}
2674
2675bool CallAnalyzer::visitIndirectBrInst(IndirectBrInst &IBI) {
// We never want to inline functions that contain an indirectbr. This is
// incorrect because all the blockaddresses (in static global initializers
// for example) would be referring to the original function, and this
// indirect jump would jump from the inlined copy of the function into the
// original function, which is extremely undefined behavior.
2681 // FIXME: This logic isn't really right; we can safely inline functions with
2682 // indirectbr's as long as no other function or global references the
2683 // blockaddress of a block within the current function.
2684 HasIndirectBr = true;
2685 return false;
2686}
2687
2688bool CallAnalyzer::visitResumeInst(ResumeInst &RI) {
2689 // FIXME: It's not clear that a single instruction is an accurate model for
2690 // the inline cost of a resume instruction.
2691 return false;
2692}
2693
2694bool CallAnalyzer::visitCleanupReturnInst(CleanupReturnInst &CRI) {
2695 // FIXME: It's not clear that a single instruction is an accurate model for
2696 // the inline cost of a cleanupret instruction.
2697 return false;
2698}
2699
2700bool CallAnalyzer::visitCatchReturnInst(CatchReturnInst &CRI) {
2701 // FIXME: It's not clear that a single instruction is an accurate model for
2702 // the inline cost of a catchret instruction.
2703 return false;
2704}
2705
2706bool CallAnalyzer::visitUnreachableInst(UnreachableInst &I) {
// FIXME: It might be reasonable to discount the cost of instructions
// leading to unreachable as they have the lowest possible impact on both
// runtime and code size.
2710 return true; // No actual code is needed for unreachable.
2711}
2712
2713bool CallAnalyzer::visitInstruction(Instruction &I) {
2714 // Some instructions are free. All of the free intrinsics can also be
2715 // handled by SROA, etc.
2716 if (TTI.getInstructionCost(U: &I, CostKind: TargetTransformInfo::TCK_SizeAndLatency) ==
2717 TargetTransformInfo::TCC_Free)
2718 return true;
2719
2720 // We found something we don't understand or can't handle. Mark any SROA-able
2721 // values in the operand list as no longer viable.
2722 for (const Use &Op : I.operands())
2723 disableSROA(V: Op);
2724
2725 return false;
2726}
2727
2728/// Analyze a basic block for its contribution to the inline cost.
2729///
2730/// This method walks the analyzer over every instruction in the given basic
2731/// block and accounts for their cost during inlining at this callsite. It
2732/// aborts early if the threshold has been exceeded or an impossible to inline
2733/// construct has been detected. It returns false if inlining is no longer
2734/// viable, and true if inlining remains viable.
2735InlineResult
2736CallAnalyzer::analyzeBlock(BasicBlock *BB,
2737 const SmallPtrSetImpl<const Value *> &EphValues) {
2738 for (Instruction &I : *BB) {
// FIXME: Currently, the number of instructions in a function, regardless
// of our ability to simplify them during inlining to constants or dead
// code, is actually used by the vector bonus heuristic. As long as that's
// true, we have to special-case debug intrinsics here to prevent
// differences in inlining due to debug symbols. Eventually, the number of
// unsimplified instructions shouldn't factor into the cost computation,
// but until then, hack around it here.
2746 // Similarly, skip pseudo-probes.
2747 if (I.isDebugOrPseudoInst())
2748 continue;
2749
2750 // Skip ephemeral values.
2751 if (EphValues.count(Ptr: &I))
2752 continue;
2753
2754 ++NumInstructions;
2755 if (isa<ExtractElementInst>(Val: I) || I.getType()->isVectorTy())
2756 ++NumVectorInstructions;
2757
2758 // If the instruction simplified to a constant, there is no cost to this
2759 // instruction. Visit the instructions using our InstVisitor to account for
2760 // all of the per-instruction logic. The visit tree returns true if we
2761 // consumed the instruction in any way, and false if the instruction's base
2762 // cost should count against inlining.
2763 onInstructionAnalysisStart(I: &I);
2764
2765 if (Base::visit(I: &I))
2766 ++NumInstructionsSimplified;
2767 else
2768 onMissedSimplification();
2769
2770 onInstructionAnalysisFinish(I: &I);
2771 using namespace ore;
// If visiting this instruction detected an uninlinable pattern, abort.
2773 InlineResult IR = InlineResult::success();
2774 if (IsRecursiveCall && !AllowRecursiveCall)
2775 IR = InlineResult::failure(Reason: "recursive");
2776 else if (ExposesReturnsTwice)
2777 IR = InlineResult::failure(Reason: "exposes returns twice");
2778 else if (HasDynamicAlloca)
2779 IR = InlineResult::failure(Reason: "dynamic alloca");
2780 else if (HasIndirectBr)
2781 IR = InlineResult::failure(Reason: "indirect branch");
2782 else if (HasUninlineableIntrinsic)
2783 IR = InlineResult::failure(Reason: "uninlinable intrinsic");
2784 else if (InitsVargArgs)
2785 IR = InlineResult::failure(Reason: "varargs");
2786 if (!IR.isSuccess()) {
2787 if (ORE)
2788 ORE->emit(RemarkBuilder: [&]() {
2789 return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
2790 &CandidateCall)
2791 << NV("Callee", &F) << " has uninlinable pattern ("
2792 << NV("InlineResult", IR.getFailureReason())
2793 << ") and cost is not fully computed";
2794 });
2795 return IR;
2796 }
2797
2798 // If the caller is a recursive function then we don't want to inline
2799 // functions which allocate a lot of stack space because it would increase
2800 // the caller stack usage dramatically.
2801 if (IsCallerRecursive && AllocatedSize > RecurStackSizeThreshold) {
2802 auto IR =
2803 InlineResult::failure(Reason: "recursive and allocates too much stack space");
2804 if (ORE)
2805 ORE->emit(RemarkBuilder: [&]() {
2806 return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
2807 &CandidateCall)
2808 << NV("Callee", &F) << " is "
2809 << NV("InlineResult", IR.getFailureReason())
2810 << ". Cost is not fully computed";
2811 });
2812 return IR;
2813 }
2814
2815 if (shouldStop())
2816 return InlineResult::failure(
2817 Reason: "Call site analysis is not favorable to inlining.");
2818 }
2819
2820 return InlineResult::success();
2821}
2822
2823/// Compute the base pointer and cumulative constant offsets for V.
2824///
2825/// This strips all constant offsets off of V, leaving it the base pointer, and
/// accumulates the total constant offset applied in the returned constant.
/// It returns nullptr if V is not a pointer, and returns the constant '0'
/// if there are no constant offsets applied.
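/// For illustration only (hypothetical IR): for
///   %p = getelementptr inbounds i32, ptr @g, i64 3
/// V is stripped down to @g and the returned constant holds the byte
/// offset 12.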
2829ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) {
2830 if (!V->getType()->isPointerTy())
2831 return nullptr;
2832
2833 unsigned AS = V->getType()->getPointerAddressSpace();
2834 unsigned IntPtrWidth = DL.getIndexSizeInBits(AS);
2835 APInt Offset = APInt::getZero(numBits: IntPtrWidth);
2836
2837 // Even though we don't look through PHI nodes, we could be called on an
2838 // instruction in an unreachable block, which may be on a cycle.
2839 SmallPtrSet<Value *, 4> Visited;
2840 Visited.insert(Ptr: V);
2841 do {
2842 if (GEPOperator *GEP = dyn_cast<GEPOperator>(Val: V)) {
2843 if (!GEP->isInBounds() || !accumulateGEPOffset(GEP&: *GEP, Offset))
2844 return nullptr;
2845 V = GEP->getPointerOperand();
2846 } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(Val: V)) {
2847 if (GA->isInterposable())
2848 break;
2849 V = GA->getAliasee();
2850 } else {
2851 break;
2852 }
2853 assert(V->getType()->isPointerTy() && "Unexpected operand type!");
2854 } while (Visited.insert(Ptr: V).second);
2855
2856 Type *IdxPtrTy = DL.getIndexType(PtrTy: V->getType());
2857 return cast<ConstantInt>(Val: ConstantInt::get(Ty: IdxPtrTy, V: Offset));
2858}
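// For illustration (a hypothetical example, not taken from a test): given
//
//   %a = alloca [4 x i32]
//   %p = getelementptr inbounds [4 x i32], ptr %a, i32 0, i32 2
//
// calling stripAndComputeInBoundsConstantOffsets on %p rewrites V to the base
// pointer %a and returns a ConstantInt holding the accumulated byte offset 8
// (two 4-byte elements), assuming a typical DataLayout where i32 is 32 bits.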
2859
2860/// Find dead blocks due to deleted CFG edges during inlining.
2861///
2862/// If we know the successor of the current block, \p CurrBB, has to be \p
2863/// NextBB, the other successors of \p CurrBB are dead if these successors have
2864/// no live incoming CFG edges. If one block is found to be dead, we can
2865/// continue growing the dead block list by checking the successors of the dead
2866/// blocks to see if all their incoming edges are dead or not.
2867void CallAnalyzer::findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB) {
2868 auto IsEdgeDead = [&](BasicBlock *Pred, BasicBlock *Succ) {
2869 // A CFG edge is dead if the predecessor is dead or the predecessor has a
2870 // known successor which is not the one under examination.
2871 if (DeadBlocks.count(Ptr: Pred))
2872 return true;
2873 BasicBlock *KnownSucc = KnownSuccessors[Pred];
2874 return KnownSucc && KnownSucc != Succ;
2875 };
2876
2877 auto IsNewlyDead = [&](BasicBlock *BB) {
2878 // If all the edges to a block are dead, the block is also dead.
2879 return (!DeadBlocks.count(Ptr: BB) &&
2880 llvm::all_of(Range: predecessors(BB),
2881 P: [&](BasicBlock *P) { return IsEdgeDead(P, BB); }));
2882 };
2883
2884 for (BasicBlock *Succ : successors(BB: CurrBB)) {
2885 if (Succ == NextBB || !IsNewlyDead(Succ))
2886 continue;
2887 SmallVector<BasicBlock *, 4> NewDead;
2888 NewDead.push_back(Elt: Succ);
2889 while (!NewDead.empty()) {
2890 BasicBlock *Dead = NewDead.pop_back_val();
2891 if (DeadBlocks.insert(Ptr: Dead).second)
2892 // Continue growing the dead block lists.
2893 for (BasicBlock *S : successors(BB: Dead))
2894 if (IsNewlyDead(S))
2895 NewDead.push_back(Elt: S);
2896 }
2897 }
2898}
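// A small worked example (hypothetical CFG) of the propagation above: suppose
// argument simplification proves the branch condition in 'entry' false:
//
//   entry:  br i1 %cond, label %then, label %else
//   then:   br label %merge
//   else:   br label %merge
//
// Recording KnownSuccessors[entry] = %else kills the edge entry->then; %then
// has no other live incoming edges, so it becomes dead. %merge remains live
// because the edge else->merge is still live.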
2899
2900/// Analyze a call site for potential inlining.
2901///
2902 /// Returns a successful InlineResult if inlining this call is viable, and a
2903 /// failure result if it is not. It computes the cost and adjusts the
2904 /// threshold based on numerous factors and heuristics. If this method fails
2905 /// but the computed cost is below the computed threshold, then inlining was
2906 /// forcibly disabled by some artifact of the routine.
2907InlineResult CallAnalyzer::analyze() {
2908 ++NumCallsAnalyzed;
2909
2910 auto Result = onAnalysisStart();
2911 if (!Result.isSuccess())
2912 return Result;
2913
2914 if (F.empty())
2915 return InlineResult::success();
2916
2917 Function *Caller = CandidateCall.getFunction();
2918 // Check if the caller function is recursive itself.
2919 for (User *U : Caller->users()) {
2920 CallBase *Call = dyn_cast<CallBase>(Val: U);
2921 if (Call && Call->getFunction() == Caller) {
2922 IsCallerRecursive = true;
2923 break;
2924 }
2925 }
2926
2927 // Populate our simplified values by mapping from function arguments to call
2928 // arguments with known important simplifications.
2929 auto CAI = CandidateCall.arg_begin();
2930 for (Argument &FAI : F.args()) {
2931 assert(CAI != CandidateCall.arg_end());
2932 SimplifiedValues[&FAI] = *CAI;
2933 if (isa<Constant>(Val: *CAI))
2934 ++NumConstantArgs;
2935
2936 Value *PtrArg = *CAI;
2937 if (ConstantInt *C = stripAndComputeInBoundsConstantOffsets(V&: PtrArg)) {
2938 ConstantOffsetPtrs[&FAI] = std::make_pair(x&: PtrArg, y: C->getValue());
2939
2940 // We can SROA any pointer arguments derived from alloca instructions.
2941 if (auto *SROAArg = dyn_cast<AllocaInst>(Val: PtrArg)) {
2942 SROAArgValues[&FAI] = SROAArg;
2943 onInitializeSROAArg(Arg: SROAArg);
2944 EnabledSROAAllocas.insert(V: SROAArg);
2945 }
2946 }
2947 ++CAI;
2948 }
2949 NumConstantOffsetPtrArgs = ConstantOffsetPtrs.size();
2950 NumAllocaArgs = SROAArgValues.size();
2951
2952 // Collecting the ephemeral values of `F` can be expensive, so use the
2953 // ephemeral values cache if available.
2954 SmallPtrSet<const Value *, 32> EphValuesStorage;
2955 const SmallPtrSetImpl<const Value *> *EphValues = &EphValuesStorage;
2956 if (GetEphValuesCache)
2957 EphValues = &GetEphValuesCache(F).ephValues();
2958 else
2959 CodeMetrics::collectEphemeralValues(L: &F, AC: &GetAssumptionCache(F),
2960 EphValues&: EphValuesStorage);
2961
2962 // The worklist of live basic blocks in the callee *after* inlining. We avoid
2963 // adding basic blocks of the callee which can be proven to be dead for this
2964 // particular call site in order to get more accurate cost estimates. This
2965 // requires a somewhat heavyweight iteration pattern: we need to walk the
2966 // basic blocks in a breadth-first order as we insert live successors. To
2967 // accomplish this we use a small-size optimized SetVector, which favors
2968 // small iteration counts because we exit once we cross our threshold.
2969 typedef SmallSetVector<BasicBlock *, 16> BBSetVector;
2970 BBSetVector BBWorklist;
2971 BBWorklist.insert(X: &F.getEntryBlock());
2972
2973 // Note that we *must not* cache the size, this loop grows the worklist.
2974 for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
2975 if (shouldStop())
2976 break;
2977
2978 BasicBlock *BB = BBWorklist[Idx];
2979 if (BB->empty())
2980 continue;
2981
2982 onBlockStart(BB);
2983
2984 // Disallow inlining a blockaddress.
2985 // A blockaddress only has defined behavior for an indirect branch in the
2986 // same function, and we do not currently support inlining indirect
2987 // branches. But, the inliner may not see an indirect branch that ends up
2988 // being dead code at a particular call site. If the blockaddress escapes
2989 // the function, e.g., via a global variable, inlining may lead to an
2990 // invalid cross-function reference.
2991 // FIXME: pr/39560: continue relaxing this overt restriction.
2992 if (BB->hasAddressTaken())
2993 return InlineResult::failure(Reason: "blockaddress used");
2994
2995 // Analyze the cost of this block. If we blow through the threshold, this
2996 // returns a failure result, and we can bail out.
2997 InlineResult IR = analyzeBlock(BB, EphValues: *EphValues);
2998 if (!IR.isSuccess())
2999 return IR;
3000
3001 Instruction *TI = BB->getTerminator();
3002
3003 // Add in the live successors by first checking whether we have a terminator
3004 // that may be simplified based on the values simplified by this call.
3005 if (auto *BI = dyn_cast<BranchInst>(Val: TI); BI && BI->isConditional()) {
3006 Value *Cond = BI->getCondition();
3007 if (ConstantInt *SimpleCond = getSimplifiedValue<ConstantInt>(V: Cond)) {
3008 BasicBlock *NextBB = BI->getSuccessor(i: SimpleCond->isZero() ? 1 : 0);
3009 BBWorklist.insert(X: NextBB);
3010 KnownSuccessors[BB] = NextBB;
3011 findDeadBlocks(CurrBB: BB, NextBB);
3012 continue;
3013 }
3014 } else if (SwitchInst *SI = dyn_cast<SwitchInst>(Val: TI)) {
3015 Value *Cond = SI->getCondition();
3016 if (ConstantInt *SimpleCond = getSimplifiedValue<ConstantInt>(V: Cond)) {
3017 BasicBlock *NextBB = SI->findCaseValue(C: SimpleCond)->getCaseSuccessor();
3018 BBWorklist.insert(X: NextBB);
3019 KnownSuccessors[BB] = NextBB;
3020 findDeadBlocks(CurrBB: BB, NextBB);
3021 continue;
3022 }
3023 }
3024
3025 // If we're unable to select a particular successor, just count all of
3026 // them.
3027 BBWorklist.insert_range(R: successors(BB));
3028
3029 onBlockAnalyzed(BB);
3030 }
3031
3032 // If this is a noduplicate call, we can still inline as long as
3033 // inlining this would cause the removal of the caller (so the instruction
3034 // is not actually duplicated, just moved).
3035 if (!isSoleCallToLocalFunction(CB: CandidateCall, Callee: F) && ContainsNoDuplicateCall)
3036 return InlineResult::failure(Reason: "noduplicate");
3037
3038 // If the callee's stack size exceeds the user-specified threshold,
3039 // do not let it be inlined.
3040 // The command line option overrides a limit set in the function attributes.
3041 size_t FinalStackSizeThreshold = StackSizeThreshold;
3042 if (!StackSizeThreshold.getNumOccurrences())
3043 if (std::optional<int> AttrMaxStackSize = getStringFnAttrAsInt(
3044 F: Caller, AttrKind: InlineConstants::MaxInlineStackSizeAttributeName))
3045 FinalStackSizeThreshold = *AttrMaxStackSize;
3046 if (AllocatedSize > FinalStackSizeThreshold)
3047 return InlineResult::failure(Reason: "stacksize");
3048
3049 return finalizeAnalysis();
3050}
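// For reference, the stack-size guard at the end of analyze() can be driven
// per caller through a string function attribute; e.g. (illustrative IR,
// assuming the attribute spelling "inline-max-stacksize"):
//
//   define void @caller() "inline-max-stacksize"="512" { ... }
//
// limits inlined callees to 512 bytes of estimated static alloca space unless
// the command-line threshold is explicitly set, which takes precedence.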
3051
3052void InlineCostCallAnalyzer::print(raw_ostream &OS) {
3053#define DEBUG_PRINT_STAT(x) OS << " " #x ": " << x << "\n"
3054 if (PrintInstructionComments)
3055 F.print(OS, AAW: &Writer);
3056 DEBUG_PRINT_STAT(NumConstantArgs);
3057 DEBUG_PRINT_STAT(NumConstantOffsetPtrArgs);
3058 DEBUG_PRINT_STAT(NumAllocaArgs);
3059 DEBUG_PRINT_STAT(NumConstantPtrCmps);
3060 DEBUG_PRINT_STAT(NumConstantPtrDiffs);
3061 DEBUG_PRINT_STAT(NumInstructionsSimplified);
3062 DEBUG_PRINT_STAT(NumInstructions);
3063 DEBUG_PRINT_STAT(NumInlineAsmInstructions);
3064 DEBUG_PRINT_STAT(SROACostSavings);
3065 DEBUG_PRINT_STAT(SROACostSavingsLost);
3066 DEBUG_PRINT_STAT(LoadEliminationCost);
3067 DEBUG_PRINT_STAT(ContainsNoDuplicateCall);
3068 DEBUG_PRINT_STAT(Cost);
3069 DEBUG_PRINT_STAT(Threshold);
3070#undef DEBUG_PRINT_STAT
3071}
3072
3073#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3074/// Dump stats about this call's analysis.
3075LLVM_DUMP_METHOD void InlineCostCallAnalyzer::dump() { print(dbgs()); }
3076#endif
3077
3078/// Test that there are no attribute conflicts between Caller and Callee
3079/// that prevent inlining.
3080static bool functionsHaveCompatibleAttributes(
3081 Function *Caller, Function *Callee, TargetTransformInfo &TTI,
3082 function_ref<const TargetLibraryInfo &(Function &)> &GetTLI) {
3083 // Note that CalleeTLI must be a copy, not a reference. The legacy pass manager
3084 // caches the most recently created TLI in the TargetLibraryInfoWrapperPass
3085 // object, and always returns the same object (which is overwritten on each
3086 // GetTLI call). Therefore we copy the first result.
3087 auto CalleeTLI = GetTLI(*Callee);
3088 return (IgnoreTTIInlineCompatible ||
3089 TTI.areInlineCompatible(Caller, Callee)) &&
3090 GetTLI(*Caller).areInlineCompatible(CalleeTLI,
3091 AllowCallerSuperset: InlineCallerSupersetNoBuiltin) &&
3092 AttributeFuncs::areInlineCompatible(Caller: *Caller, Callee: *Callee);
3093}
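// A concrete (hypothetical) case this check rejects on x86: a callee carrying
// "target-features"="+avx2" whose caller lacks +avx2. TTI.areInlineCompatible
// generally requires the caller's feature set to subsume the callee's, since
// inlined AVX2 code could otherwise execute in a non-AVX2 context.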
3094
3095int llvm::getCallsiteCost(const TargetTransformInfo &TTI, const CallBase &Call,
3096 const DataLayout &DL) {
3097 int64_t Cost = 0;
3098 for (unsigned I = 0, E = Call.arg_size(); I != E; ++I) {
3099 if (Call.isByValArgument(ArgNo: I)) {
3100 // We approximate the number of loads and stores needed by dividing the
3101 // size of the byval type by the target's pointer size.
3102 PointerType *PTy = cast<PointerType>(Val: Call.getArgOperand(i: I)->getType());
3103 unsigned TypeSize = DL.getTypeSizeInBits(Ty: Call.getParamByValType(ArgNo: I));
3104 unsigned AS = PTy->getAddressSpace();
3105 unsigned PointerSize = DL.getPointerSizeInBits(AS);
3106 // Ceiling division.
3107 unsigned NumStores = (TypeSize + PointerSize - 1) / PointerSize;
3108
3109 // If it generates more than 8 stores it is likely to be expanded as an
3110 // inline memcpy so we take that as an upper bound. Otherwise we assume
3111 // one load and one store per word copied.
3112 // FIXME: The maxStoresPerMemcpy setting from the target should be used
3113 // here instead of a magic number of 8, but it's not available via
3114 // DataLayout.
3115 NumStores = std::min(a: NumStores, b: 8U);
3116
3117 Cost += 2 * NumStores * InstrCost;
3118 } else {
3119 // For non-byval arguments subtract off one instruction per call
3120 // argument.
3121 Cost += InstrCost;
3122 }
3123 }
3124 // The call instruction also disappears after inlining.
3125 Cost += InstrCost;
3126 Cost += TTI.getInlineCallPenalty(F: Call.getCaller(), Call, DefaultCallPenalty: CallPenalty);
3127
3128 return std::min<int64_t>(a: Cost, INT_MAX);
3129}
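// A worked example of the byval accounting above, assuming a 64-bit target
// (PointerSize = 64): a byval struct of 256 bits needs ceil(256/64) = 4 word
// copies, contributing 2 * 4 * InstrCost (a load and a store per word). A
// 4096-bit struct would need 64 copies but is capped at 8, modeling a memcpy.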
3130
3131InlineCost llvm::getInlineCost(
3132 CallBase &Call, const InlineParams &Params, TargetTransformInfo &CalleeTTI,
3133 function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
3134 function_ref<const TargetLibraryInfo &(Function &)> GetTLI,
3135 function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
3136 ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE,
3137 function_ref<EphemeralValuesCache &(Function &)> GetEphValuesCache) {
3138 return getInlineCost(Call, Callee: Call.getCalledFunction(), Params, CalleeTTI,
3139 GetAssumptionCache, GetTLI, GetBFI, PSI, ORE,
3140 GetEphValuesCache);
3141}
3142
3143std::optional<int> llvm::getInliningCostEstimate(
3144 CallBase &Call, TargetTransformInfo &CalleeTTI,
3145 function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
3146 function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
3147 function_ref<const TargetLibraryInfo &(Function &)> GetTLI,
3148 ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
3149 const InlineParams Params = {/* DefaultThreshold*/ 0,
3150 /*HintThreshold*/ {},
3151 /*ColdThreshold*/ {},
3152 /*OptSizeThreshold*/ {},
3153 /*OptMinSizeThreshold*/ {},
3154 /*HotCallSiteThreshold*/ {},
3155 /*LocallyHotCallSiteThreshold*/ {},
3156 /*ColdCallSiteThreshold*/ {},
3157 /*ComputeFullInlineCost*/ true,
3158 /*EnableDeferral*/ true};
3159
3160 InlineCostCallAnalyzer CA(*Call.getCalledFunction(), Call, Params, CalleeTTI,
3161 GetAssumptionCache, GetBFI, GetTLI, PSI, ORE, true,
3162 /*IgnoreThreshold*/ true);
3163 auto R = CA.analyze();
3164 if (!R.isSuccess())
3165 return std::nullopt;
3166 return CA.getCost();
3167}
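// A minimal usage sketch (the analysis providers GetAC, GetBFI, and GetTLI are
// hypothetical placeholders); because all thresholds above are zeroed and
// IgnoreThreshold is set, the result is a pure cost estimate, not a decision:
//
//   if (std::optional<int> Cost = getInliningCostEstimate(
//           Call, CalleeTTI, GetAC, GetBFI, GetTLI, PSI, /*ORE=*/nullptr))
//     LLVM_DEBUG(dbgs() << "estimated cost: " << *Cost << "\n");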
3168
3169std::optional<InlineCostFeatures> llvm::getInliningCostFeatures(
3170 CallBase &Call, TargetTransformInfo &CalleeTTI,
3171 function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
3172 function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
3173 function_ref<const TargetLibraryInfo &(Function &)> GetTLI,
3174 ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
3175 InlineCostFeaturesAnalyzer CFA(CalleeTTI, GetAssumptionCache, GetBFI, GetTLI,
3176 PSI, ORE, *Call.getCalledFunction(), Call);
3177 auto R = CFA.analyze();
3178 if (!R.isSuccess())
3179 return std::nullopt;
3180 return CFA.features();
3181}
3182
3183std::optional<InlineResult> llvm::getAttributeBasedInliningDecision(
3184 CallBase &Call, Function *Callee, TargetTransformInfo &CalleeTTI,
3185 function_ref<const TargetLibraryInfo &(Function &)> GetTLI) {
3186
3187 // Cannot inline indirect calls.
3188 if (!Callee)
3189 return InlineResult::failure(Reason: "indirect call");
3190
3191 // When a callee coroutine function is inlined into a caller coroutine
3192 // function before the coro-split pass, the coro-early pass cannot handle
3193 // the result well. So we don't inline a coroutine function that has not
3194 // yet been split.
3195 if (Callee->isPresplitCoroutine())
3196 return InlineResult::failure(Reason: "unsplit coroutine call");
3197
3198 // Never inline calls with byval arguments that do not have the alloca
3199 // address space. Since byval arguments can be replaced with a copy to an
3200 // alloca, the inlined code would need to be adjusted to handle that the
3201 // argument is in the alloca address space (so it is a little bit complicated
3202 // to solve).
3203 unsigned AllocaAS = Callee->getDataLayout().getAllocaAddrSpace();
3204 for (unsigned I = 0, E = Call.arg_size(); I != E; ++I)
3205 if (Call.isByValArgument(ArgNo: I)) {
3206 PointerType *PTy = cast<PointerType>(Val: Call.getArgOperand(i: I)->getType());
3207 if (PTy->getAddressSpace() != AllocaAS)
3208 return InlineResult::failure(Reason: "byval arguments without alloca"
3209 " address space");
3210 }
3211
3212 // Calls to functions with always-inline attributes should be inlined
3213 // whenever possible.
3214 if (Call.hasFnAttr(Kind: Attribute::AlwaysInline)) {
3215 if (Call.getAttributes().hasFnAttr(Kind: Attribute::NoInline))
3216 return InlineResult::failure(Reason: "noinline call site attribute");
3217
3218 auto IsViable = isInlineViable(Callee&: *Callee);
3219 if (IsViable.isSuccess())
3220 return InlineResult::success();
3221 return InlineResult::failure(Reason: IsViable.getFailureReason());
3222 }
3223
3224 // Never inline functions with conflicting attributes (unless callee has
3225 // always-inline attribute).
3226 Function *Caller = Call.getCaller();
3227 if (!functionsHaveCompatibleAttributes(Caller, Callee, TTI&: CalleeTTI, GetTLI))
3228 return InlineResult::failure(Reason: "conflicting attributes");
3229
3230 // Flatten: inline all viable calls from functions marked 'flatten',
3231 // regardless of cost. Checked before optnone so that flatten takes priority.
3232 if (Caller->hasFnAttribute(Kind: Attribute::Flatten)) {
3233 auto IsViable = isInlineViable(Callee&: *Callee);
3234 if (IsViable.isSuccess())
3235 return InlineResult::success();
3236 return InlineResult::failure(Reason: IsViable.getFailureReason());
3237 }
3238
3239 // Don't inline this call if the caller has the optnone attribute.
3240 if (Caller->hasOptNone())
3241 return InlineResult::failure(Reason: "optnone attribute");
3242
3243 // Don't inline a function that treats null pointer as valid into a caller
3244 // that does not have this attribute.
3245 if (!Caller->nullPointerIsDefined() && Callee->nullPointerIsDefined())
3246 return InlineResult::failure(Reason: "nullptr definitions incompatible");
3247
3248 // Don't inline functions which can be interposed at link-time.
3249 if (Callee->isInterposable())
3250 return InlineResult::failure(Reason: "interposable");
3251
3252 // Don't inline functions marked noinline.
3253 if (Callee->hasFnAttribute(Kind: Attribute::NoInline))
3254 return InlineResult::failure(Reason: "noinline function attribute");
3255
3256 // Don't inline call sites marked noinline.
3257 if (Call.isNoInline())
3258 return InlineResult::failure(Reason: "noinline call site attribute");
3259
3260 // Don't inline functions that are loader replaceable.
3261 if (Callee->hasFnAttribute(Kind: "loader-replaceable"))
3262 return InlineResult::failure(Reason: "loader replaceable function attribute");
3263
3264 return std::nullopt;
3265}
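// To illustrate the precedence encoded above (hypothetical IR): a callee
// marked alwaysinline whose particular call site carries noinline, e.g.
//
//   define void @f() alwaysinline { ... }
//   ...
//   call void @f() #0    ; attributes #0 = { noinline }
//
// yields a failure with reason "noinline call site attribute", because the
// call-site attribute is checked before the always-inline fast path succeeds.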
3266
3267InlineCost llvm::getInlineCost(
3268 CallBase &Call, Function *Callee, const InlineParams &Params,
3269 TargetTransformInfo &CalleeTTI,
3270 function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
3271 function_ref<const TargetLibraryInfo &(Function &)> GetTLI,
3272 function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
3273 ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE,
3274 function_ref<EphemeralValuesCache &(Function &)> GetEphValuesCache) {
3275
3276 auto UserDecision =
3277 llvm::getAttributeBasedInliningDecision(Call, Callee, CalleeTTI, GetTLI);
3278
3279 if (UserDecision) {
3280 if (UserDecision->isSuccess())
3281 return llvm::InlineCost::getAlways(Reason: "always inline attribute");
3282 return llvm::InlineCost::getNever(Reason: UserDecision->getFailureReason());
3283 }
3284
3285 if (InlineAllViableCalls && isInlineViable(Callee&: *Callee).isSuccess())
3286 return llvm::InlineCost::getAlways(
3287 Reason: "Inlining forced by -inline-all-viable-calls");
3288
3289 LLVM_DEBUG(llvm::dbgs() << " Analyzing call of " << Callee->getName()
3290 << "... (caller:" << Call.getCaller()->getName()
3291 << ")\n");
3292
3293 InlineCostCallAnalyzer CA(*Callee, Call, Params, CalleeTTI,
3294 GetAssumptionCache, GetBFI, GetTLI, PSI, ORE,
3295 /*BoostIndirect=*/true, /*IgnoreThreshold=*/false,
3296 GetEphValuesCache);
3297 InlineResult ShouldInline = CA.analyze();
3298
3299 LLVM_DEBUG(CA.dump());
3300
3301 // Always make a cost-benefit-based decision explicit.
3302 // We use always/never here since the threshold is not meaningful,
3303 // as it is not what drives the cost-benefit analysis.
3304 if (CA.wasDecidedByCostBenefit()) {
3305 if (ShouldInline.isSuccess())
3306 return InlineCost::getAlways(Reason: "benefit over cost",
3307 CostBenefit: CA.getCostBenefitPair());
3308 else
3309 return InlineCost::getNever(Reason: "cost over benefit", CostBenefit: CA.getCostBenefitPair());
3310 }
3311
3312 if (CA.wasDecidedByCostThreshold())
3313 return InlineCost::get(Cost: CA.getCost(), Threshold: CA.getThreshold(),
3314 StaticBonus: CA.getStaticBonusApplied());
3315
3316 // No details on how the decision was made, simply return always or never.
3317 return ShouldInline.isSuccess()
3318 ? InlineCost::getAlways(Reason: "empty function")
3319 : InlineCost::getNever(Reason: ShouldInline.getFailureReason());
3320}
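// Interpretation note for threshold-based results (a sketch of how callers of
// getInlineCost typically consume them): an InlineCost built via
// InlineCost::get(/*Cost=*/120, /*Threshold=*/225) converts to true because
// Cost < Threshold; a cost at or above the threshold converts to false.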
3321
3322InlineResult llvm::isInlineViable(Function &F) {
3323 bool ReturnsTwice = F.hasFnAttribute(Kind: Attribute::ReturnsTwice);
3324 for (BasicBlock &BB : F) {
3325 // Disallow inlining of functions which contain indirect branches.
3326 if (isa<IndirectBrInst>(Val: BB.getTerminator()))
3327 return InlineResult::failure(Reason: "contains indirect branches");
3328
3329 // Disallow inlining of blockaddresses.
3330 if (BB.hasAddressTaken())
3331 return InlineResult::failure(Reason: "blockaddress used");
3332
3333 for (auto &II : BB) {
3334 CallBase *Call = dyn_cast<CallBase>(Val: &II);
3335 if (!Call)
3336 continue;
3337
3338 // Disallow recursive calls.
3339 Function *Callee = Call->getCalledFunction();
3340 if (&F == Callee)
3341 return InlineResult::failure(Reason: "recursive call");
3342
3343 // Disallow calls which expose returns-twice to a function not previously
3344 // attributed as such.
3345 if (!ReturnsTwice && isa<CallInst>(Val: Call) &&
3346 cast<CallInst>(Val: Call)->canReturnTwice())
3347 return InlineResult::failure(Reason: "exposes returns-twice attribute");
3348
3349 if (Callee)
3350 switch (Callee->getIntrinsicID()) {
3351 default:
3352 break;
3353 case llvm::Intrinsic::icall_branch_funnel:
3354 // Disallow inlining of @llvm.icall.branch.funnel because current
3355 // backend can't separate call targets from call arguments.
3356 return InlineResult::failure(
3357 Reason: "disallowed inlining of @llvm.icall.branch.funnel");
3358 case llvm::Intrinsic::localescape:
3359 // Disallow inlining functions that call @llvm.localescape. Doing this
3360 // correctly would require major changes to the inliner.
3361 return InlineResult::failure(
3362 Reason: "disallowed inlining of @llvm.localescape");
3363 case llvm::Intrinsic::vastart:
3364 // Disallow inlining of functions that initialize VarArgs with
3365 // va_start.
3366 return InlineResult::failure(
3367 Reason: "contains VarArgs initialized with va_start");
3368 }
3369 }
3370 }
3371
3372 return InlineResult::success();
3373}
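// For example (illustrative IR), isInlineViable rejects a function containing
// an indirect branch before any cost modeling happens:
//
//   define void @g(ptr %dest) {
//     indirectbr ptr %dest, [label %bb1, label %bb2]
//     ...
//   }
//
// returns InlineResult::failure("contains indirect branches").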
3374
3375// APIs to create InlineParams based on command line flags and/or other
3376// parameters.
3377
3378InlineParams llvm::getInlineParams(int Threshold) {
3379 InlineParams Params;
3380
3381 // This field is the threshold to use for a callee by default. This is
3382 // derived from one or more of:
3383 // * optimization or size-optimization levels,
3384 // * a value passed to createFunctionInliningPass function, or
3385 // * the -inline-threshold flag.
3386 // If the -inline-threshold flag is explicitly specified, that is used
3387 // irrespective of anything else.
3388 if (InlineThreshold.getNumOccurrences() > 0)
3389 Params.DefaultThreshold = InlineThreshold;
3390 else
3391 Params.DefaultThreshold = Threshold;
3392
3393 // Set the HintThreshold knob from the -inlinehint-threshold.
3394 Params.HintThreshold = HintThreshold;
3395
3396 // Set the HotCallSiteThreshold knob from the -hot-callsite-threshold.
3397 Params.HotCallSiteThreshold = HotCallSiteThreshold;
3398
3399 // If the -locally-hot-callsite-threshold is explicitly specified, use it to
3400 // populate LocallyHotCallSiteThreshold. Later, we populate
3401 // Params.LocallyHotCallSiteThreshold from -locally-hot-callsite-threshold if
3402 // we know that optimization level is O3 (in the getInlineParams variant that
3403 // takes the opt and size levels).
3404 // FIXME: Remove this check (and make the assignment unconditional) after
3405 // addressing size regression issues at O2.
3406 if (LocallyHotCallSiteThreshold.getNumOccurrences() > 0)
3407 Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold;
3408
3409 // Set the ColdCallSiteThreshold knob from the
3410 // -inline-cold-callsite-threshold.
3411 Params.ColdCallSiteThreshold = ColdCallSiteThreshold;
3412
3413 // Set the OptMinSizeThreshold and OptSizeThreshold params only if the
3414 // -inline-threshold command-line option is not explicitly given. If that
3415 // option is present, then its value applies even for callees with size and
3416 // minsize attributes.
3417 // If the -inline-threshold is not specified, set the ColdThreshold from the
3418 // -inlinecold-threshold even if it is not explicitly passed. If
3419 // -inline-threshold is specified, then -inlinecold-threshold needs to be
3420 // explicitly specified to set the ColdThreshold knob.
3421 if (InlineThreshold.getNumOccurrences() == 0) {
3422 Params.OptMinSizeThreshold = InlineConstants::OptMinSizeThreshold;
3423 Params.OptSizeThreshold = InlineConstants::OptSizeThreshold;
3424 Params.ColdThreshold = ColdThreshold;
3425 } else if (ColdThreshold.getNumOccurrences() > 0) {
3426 Params.ColdThreshold = ColdThreshold;
3427 }
3428 return Params;
3429}
3430
3431InlineParams llvm::getInlineParams() {
3432 return getInlineParams(Threshold: DefaultThreshold);
3433}
3434
3435// Compute the default threshold for inlining based on the opt level and the
3436// size opt level.
3437static int computeThresholdFromOptLevels(unsigned OptLevel,
3438 unsigned SizeOptLevel) {
3439 if (OptLevel > 2)
3440 return InlineConstants::OptAggressiveThreshold;
3441 if (SizeOptLevel == 1) // -Os
3442 return InlineConstants::OptSizeThreshold;
3443 if (SizeOptLevel == 2) // -Oz
3444 return InlineConstants::OptMinSizeThreshold;
3445 return DefaultThreshold;
3446}
3447
3448InlineParams llvm::getInlineParams(unsigned OptLevel, unsigned SizeOptLevel) {
3449 auto Params =
3450 getInlineParams(Threshold: computeThresholdFromOptLevels(OptLevel, SizeOptLevel));
3451 // At O3, use the value of -locally-hot-callsite-threshold option to populate
3452 // Params.LocallyHotCallSiteThreshold. Below O3, this flag has effect only
3453 // when it is specified explicitly.
3454 if (OptLevel > 2)
3455 Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold;
3456 return Params;
3457}
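// Illustrative mappings, assuming no explicit -inline-threshold on the
// command line:
//   getInlineParams(3, 0): DefaultThreshold = OptAggressiveThreshold, and
//                          LocallyHotCallSiteThreshold is populated (O3);
//   getInlineParams(2, 1): DefaultThreshold = OptSizeThreshold (-Os);
//   getInlineParams(2, 2): DefaultThreshold = OptMinSizeThreshold (-Oz).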
3458
3459PreservedAnalyses
3460InlineCostAnnotationPrinterPass::run(Function &F,
3461 FunctionAnalysisManager &FAM) {
3462 PrintInstructionComments = true;
3463 std::function<AssumptionCache &(Function &)> GetAssumptionCache =
3464 [&](Function &F) -> AssumptionCache & {
3465 return FAM.getResult<AssumptionAnalysis>(IR&: F);
3466 };
3467
3468 auto &MAMProxy = FAM.getResult<ModuleAnalysisManagerFunctionProxy>(IR&: F);
3469 ProfileSummaryInfo *PSI =
3470 MAMProxy.getCachedResult<ProfileSummaryAnalysis>(IR&: *F.getParent());
3471 const TargetTransformInfo &TTI = FAM.getResult<TargetIRAnalysis>(IR&: F);
3472
3473 // FIXME: Redesign the usage of InlineParams to expand the scope of this pass.
3474 // In the current implementation, the type of InlineParams doesn't matter as
3475 // the pass serves only for verification of the inliner's decisions.
3476 // We can add a flag which determines InlineParams for this run. Right now,
3477 // the default InlineParams are used.
3478 const InlineParams Params = llvm::getInlineParams();
3479 for (BasicBlock &BB : F) {
3480 for (Instruction &I : BB) {
3481 if (auto *CB = dyn_cast<CallBase>(Val: &I)) {
3482 Function *CalledFunction = CB->getCalledFunction();
3483 if (!CalledFunction || CalledFunction->isDeclaration())
3484 continue;
3485 OptimizationRemarkEmitter ORE(CalledFunction);
3486 InlineCostCallAnalyzer ICCA(*CalledFunction, *CB, Params, TTI,
3487 GetAssumptionCache, nullptr, nullptr, PSI,
3488 &ORE);
3489 ICCA.analyze();
3490 OS << " Analyzing call of " << CalledFunction->getName()
3491 << "... (caller:" << CB->getCaller()->getName() << ")\n";
3492 ICCA.print(OS);
3493 OS << "\n";
3494 }
3495 }
3496 }
3497 return PreservedAnalyses::all();
3498}
3499