//===- InlineCost.cpp - Cost analysis for inliner -------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements inline cost analysis.
//
//===----------------------------------------------------------------------===//

#include "llvm/Analysis/InlineCost.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/DomConditionCache.h"
#include "llvm/Analysis/EphemeralValuesCache.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/IR/AssemblyAnnotationWriter.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/FormattedStream.h"
#include "llvm/Support/raw_ostream.h"
#include <climits>
#include <limits>
#include <optional>

using namespace llvm;

#define DEBUG_TYPE "inline-cost"

STATISTIC(NumCallsAnalyzed, "Number of call sites analyzed");

static cl::opt<int>
    DefaultThreshold("inlinedefault-threshold", cl::Hidden, cl::init(225),
                     cl::desc("Default amount of inlining to perform"));

// We introduce this option since there is a minor compile-time win by avoiding
// addition of TTI attributes (target-features in particular) to inline
// candidates when they are guaranteed to be the same as top level methods in
// some use cases. If we avoid adding the attribute, we need an option to avoid
// checking these attributes.
static cl::opt<bool> IgnoreTTIInlineCompatible(
    "ignore-tti-inline-compatible", cl::Hidden, cl::init(false),
    cl::desc("Ignore TTI attributes compatibility check between callee/caller "
             "during inline cost calculation"));

static cl::opt<bool> PrintInstructionComments(
    "print-instruction-comments", cl::Hidden, cl::init(false),
    cl::desc("Prints comments for instruction based on inline cost analysis"));

static cl::opt<int> InlineThreshold(
    "inline-threshold", cl::Hidden, cl::init(225),
    cl::desc("Control the amount of inlining to perform (default = 225)"));

static cl::opt<int> HintThreshold(
    "inlinehint-threshold", cl::Hidden, cl::init(325),
    cl::desc("Threshold for inlining functions with inline hint"));

static cl::opt<int>
    ColdCallSiteThreshold("inline-cold-callsite-threshold", cl::Hidden,
                          cl::init(45),
                          cl::desc("Threshold for inlining cold callsites"));

static cl::opt<bool> InlineEnableCostBenefitAnalysis(
    "inline-enable-cost-benefit-analysis", cl::Hidden, cl::init(false),
    cl::desc("Enable the cost-benefit analysis for the inliner"));

// InlineSavingsMultiplier overrides per TTI multipliers iff it is
// specified explicitly in command line options. This option is exposed
// for tuning and testing.
static cl::opt<int> InlineSavingsMultiplier(
    "inline-savings-multiplier", cl::Hidden, cl::init(8),
    cl::desc("Multiplier to multiply cycle savings by during inlining"));

// InlineSavingsProfitableMultiplier overrides per TTI multipliers iff it is
// specified explicitly in command line options. This option is exposed
// for tuning and testing.
static cl::opt<int> InlineSavingsProfitableMultiplier(
    "inline-savings-profitable-multiplier", cl::Hidden, cl::init(4),
    cl::desc("A multiplier on top of cycle savings to decide whether the "
             "savings won't justify the cost"));

static cl::opt<int>
    InlineSizeAllowance("inline-size-allowance", cl::Hidden, cl::init(100),
                        cl::desc("The maximum size of a callee that gets "
                                 "inlined without sufficient cycle savings"));

// We introduce this threshold to help performance of instrumentation based
// PGO before we actually hook up inliner with analysis passes such as BPI and
// BFI.
static cl::opt<int> ColdThreshold(
    "inlinecold-threshold", cl::Hidden, cl::init(45),
    cl::desc("Threshold for inlining functions with cold attribute"));

static cl::opt<int>
    HotCallSiteThreshold("hot-callsite-threshold", cl::Hidden, cl::init(3000),
                         cl::desc("Threshold for hot callsites"));

static cl::opt<int> LocallyHotCallSiteThreshold(
    "locally-hot-callsite-threshold", cl::Hidden, cl::init(525),
    cl::desc("Threshold for locally hot callsites"));

static cl::opt<int> ColdCallSiteRelFreq(
    "cold-callsite-rel-freq", cl::Hidden, cl::init(2),
    cl::desc("Maximum block frequency, expressed as a percentage of caller's "
             "entry frequency, for a callsite to be cold in the absence of "
             "profile information."));

static cl::opt<uint64_t> HotCallSiteRelFreq(
    "hot-callsite-rel-freq", cl::Hidden, cl::init(60),
    cl::desc("Minimum block frequency, expressed as a multiple of caller's "
             "entry frequency, for a callsite to be hot in the absence of "
             "profile information."));

static cl::opt<int>
    InstrCost("inline-instr-cost", cl::Hidden, cl::init(5),
              cl::desc("Cost of a single instruction when inlining"));

static cl::opt<int> InlineAsmInstrCost(
    "inline-asm-instr-cost", cl::Hidden, cl::init(0),
    cl::desc("Cost of a single inline asm instruction when inlining"));

static cl::opt<int>
    MemAccessCost("inline-memaccess-cost", cl::Hidden, cl::init(0),
                  cl::desc("Cost of load/store instruction when inlining"));

static cl::opt<int> CallPenalty(
    "inline-call-penalty", cl::Hidden, cl::init(25),
    cl::desc("Call penalty that is applied per callsite when inlining"));

static cl::opt<size_t>
    StackSizeThreshold("inline-max-stacksize", cl::Hidden,
                       cl::init(std::numeric_limits<size_t>::max()),
                       cl::desc("Do not inline functions with a stack size "
                                "that exceeds the specified limit"));

static cl::opt<size_t> RecurStackSizeThreshold(
    "recursive-inline-max-stacksize", cl::Hidden,
    cl::init(InlineConstants::TotalAllocaSizeRecursiveCaller),
    cl::desc("Do not inline recursive functions with a stack "
             "size that exceeds the specified limit"));

static cl::opt<bool> OptComputeFullInlineCost(
    "inline-cost-full", cl::Hidden,
    cl::desc("Compute the full inline cost of a call site even when the cost "
             "exceeds the threshold."));

static cl::opt<bool> InlineCallerSupersetNoBuiltin(
    "inline-caller-superset-nobuiltin", cl::Hidden, cl::init(true),
    cl::desc("Allow inlining when caller has a superset of callee's nobuiltin "
             "attributes."));

static cl::opt<bool> DisableGEPConstOperand(
    "disable-gep-const-evaluation", cl::Hidden, cl::init(false),
    cl::desc("Disables evaluation of GetElementPtr with constant operands"));

static cl::opt<bool> InlineAllViableCalls(
    "inline-all-viable-calls", cl::Hidden, cl::init(false),
    cl::desc("Inline all viable calls, even if they exceed the inlining "
             "threshold"));

namespace llvm {
std::optional<int> getStringFnAttrAsInt(const Attribute &Attr) {
  if (Attr.isValid()) {
    int AttrValue = 0;
    if (!Attr.getValueAsString().getAsInteger(10, AttrValue))
      return AttrValue;
  }
  return std::nullopt;
}

std::optional<int> getStringFnAttrAsInt(CallBase &CB, StringRef AttrKind) {
  return getStringFnAttrAsInt(CB.getFnAttr(AttrKind));
}

std::optional<int> getStringFnAttrAsInt(Function *F, StringRef AttrKind) {
  return getStringFnAttrAsInt(F->getFnAttribute(AttrKind));
}
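
// For example, a call site carrying the string attribute
// "call-inline-cost"="42" (as queried in onCallBaseVisitStart below) makes
// getStringFnAttrAsInt(CB, "call-inline-cost") return 42; a missing attribute
// or a non-decimal value yields std::nullopt, since getAsInteger(10, ...)
// fails on such input.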

namespace InlineConstants {
int getInstrCost() { return InstrCost; }

} // namespace InlineConstants

} // namespace llvm

namespace {
class InlineCostCallAnalyzer;

// This struct is used to store information about inline cost of a
// particular instruction
struct InstructionCostDetail {
  int CostBefore = 0;
  int CostAfter = 0;
  int ThresholdBefore = 0;
  int ThresholdAfter = 0;

  int getThresholdDelta() const { return ThresholdAfter - ThresholdBefore; }

  int getCostDelta() const { return CostAfter - CostBefore; }

  bool hasThresholdChanged() const { return ThresholdAfter != ThresholdBefore; }
};

class InlineCostAnnotationWriter : public AssemblyAnnotationWriter {
private:
  InlineCostCallAnalyzer *const ICCA;

public:
  InlineCostAnnotationWriter(InlineCostCallAnalyzer *ICCA) : ICCA(ICCA) {}
  void emitInstructionAnnot(const Instruction *I,
                            formatted_raw_ostream &OS) override;
};

/// Carry out call site analysis, in order to evaluate inlinability.
/// NOTE: the type is currently used as implementation detail of functions such
/// as llvm::getInlineCost. Note the function_ref constructor parameters - the
/// expectation is that they come from the outer scope, from the wrapper
/// functions. If we want to support constructing CallAnalyzer objects where
/// lambdas are provided inline at construction, or where the object needs to
/// otherwise survive past the scope of the provided functions, we need to
/// revisit the argument types.
class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
  typedef InstVisitor<CallAnalyzer, bool> Base;
  friend class InstVisitor<CallAnalyzer, bool>;

protected:
  virtual ~CallAnalyzer() = default;
  /// The TargetTransformInfo available for this compilation.
  const TargetTransformInfo &TTI;

  /// Getter for the cache of @llvm.assume intrinsics.
  function_ref<AssumptionCache &(Function &)> GetAssumptionCache;

  /// Getter for BlockFrequencyInfo.
  function_ref<BlockFrequencyInfo &(Function &)> GetBFI;

  /// Getter for TargetLibraryInfo.
  function_ref<const TargetLibraryInfo &(Function &)> GetTLI;

  /// Profile summary information.
  ProfileSummaryInfo *PSI;

  /// The called function.
  Function &F;

  // Cache the DataLayout since we use it a lot.
  const DataLayout &DL;

  /// The OptimizationRemarkEmitter available for this compilation.
  OptimizationRemarkEmitter *ORE;

  /// The candidate callsite being analyzed. Please do not use this to do
  /// analysis in the caller function; we want the inline cost query to be
  /// easily cacheable. Instead, use the cover function paramHasAttr.
  CallBase &CandidateCall;

  /// Getter for the cache of ephemeral values.
  function_ref<EphemeralValuesCache &(Function &)> GetEphValuesCache = nullptr;

  /// Extension points for handling callsite features.
  /// Called before a basic block was analyzed.
  virtual void onBlockStart(const BasicBlock *BB) {}

  /// Called after a basic block was analyzed.
  virtual void onBlockAnalyzed(const BasicBlock *BB) {}

  /// Called before an instruction was analyzed.
  virtual void onInstructionAnalysisStart(const Instruction *I) {}

  /// Called after an instruction was analyzed.
  virtual void onInstructionAnalysisFinish(const Instruction *I) {}

  /// Called at the end of the analysis of the callsite. Return the outcome of
  /// the analysis, i.e. 'InlineResult(true)' if the inlining may happen, or
  /// the reason it can't.
  virtual InlineResult finalizeAnalysis() { return InlineResult::success(); }
  /// Called when we're about to start processing a basic block, and every time
  /// we are done processing an instruction. Return true if there is no point in
  /// continuing the analysis (e.g. we've already determined the call site is
  /// too expensive to inline).
  virtual bool shouldStop() { return false; }

  /// Called before the analysis of the callee body starts (with callsite
  /// contexts propagated). It checks callsite-specific information. Return a
  /// reason analysis can't continue if that's the case, or 'true' if it may
  /// continue.
  virtual InlineResult onAnalysisStart() { return InlineResult::success(); }
  /// Called if the analysis engine decides SROA cannot be done for the given
  /// alloca.
  virtual void onDisableSROA(AllocaInst *Arg) {}

  /// Called when the analysis engine determines that load elimination won't
  /// happen.
  virtual void onDisableLoadElimination() {}

  /// Called when we visit a CallBase, before the analysis starts. Return false
  /// to stop further processing of the instruction.
  virtual bool onCallBaseVisitStart(CallBase &Call) { return true; }

  /// Called to account for a call.
  virtual void onCallPenalty() {}

  /// Called to account for a load or store.
  virtual void onMemAccess() {}

  /// Called to account for the expectation the inlining would result in a load
  /// elimination.
  virtual void onLoadEliminationOpportunity() {}

  /// Called to account for the cost of argument setup for the Call in the
  /// callee's body (not the callsite currently under analysis).
  virtual void onCallArgumentSetup(const CallBase &Call) {}

  /// Called to account for a load relative intrinsic.
  virtual void onLoadRelativeIntrinsic() {}

  /// Called to account for a lowered call.
  virtual void onLoweredCall(Function *F, CallBase &Call, bool IsIndirectCall) {
  }

  /// Account for a jump table of given size. Return false to stop further
  /// processing of the switch instruction.
  virtual bool onJumpTable(unsigned JumpTableSize) { return true; }

  /// Account for a case cluster of given size. Return false to stop further
  /// processing of the instruction.
  virtual bool onCaseCluster(unsigned NumCaseCluster) { return true; }

  /// Called at the end of processing a switch instruction, with the given
  /// number of case clusters.
  virtual void onFinalizeSwitch(unsigned JumpTableSize, unsigned NumCaseCluster,
                                bool DefaultDestUnreachable) {}

  /// Called to account for any other instruction not specifically accounted
  /// for.
  virtual void onMissedSimplification() {}

  /// Account for inline assembly instructions.
  virtual void onInlineAsm(const InlineAsm &Arg) {}

  /// Start accounting potential benefits due to SROA for the given alloca.
  virtual void onInitializeSROAArg(AllocaInst *Arg) {}

  /// Account SROA savings for the AllocaInst value.
  virtual void onAggregateSROAUse(AllocaInst *V) {}

  bool handleSROA(Value *V, bool DoNotDisable) {
    // Check for SROA candidates in comparisons.
    if (auto *SROAArg = getSROAArgForValueOrNull(V)) {
      if (DoNotDisable) {
        onAggregateSROAUse(SROAArg);
        return true;
      }
      disableSROAForArg(SROAArg);
    }
    return false;
  }

  bool IsCallerRecursive = false;
  bool IsRecursiveCall = false;
  bool ExposesReturnsTwice = false;
  bool HasDynamicAlloca = false;
  bool ContainsNoDuplicateCall = false;
  bool HasReturn = false;
  bool HasIndirectBr = false;
  bool HasUninlineableIntrinsic = false;
  bool InitsVargArgs = false;

  /// Number of bytes allocated statically by the callee.
  uint64_t AllocatedSize = 0;
  unsigned NumInstructions = 0;
  unsigned NumInlineAsmInstructions = 0;
  unsigned NumVectorInstructions = 0;

  /// While we walk the potentially-inlined instructions, we build up and
  /// maintain a mapping of simplified values specific to this callsite. The
  /// idea is to propagate any special information we have about arguments to
  /// this call through the inlinable section of the function, and account for
  /// likely simplifications post-inlining. The most important aspect we track
  /// is CFG altering simplifications -- when we prove a basic block dead, that
  /// can cause dramatic shifts in the cost of inlining a function.
  /// Note: The simplified Value may be owned by the caller function.
  DenseMap<Value *, Value *> SimplifiedValues;

  /// Keep track of the values which map back (through function arguments) to
  /// allocas on the caller stack which could be simplified through SROA.
  DenseMap<Value *, AllocaInst *> SROAArgValues;

  /// Keep track of Allocas for which we believe we may get SROA optimization.
  DenseSet<AllocaInst *> EnabledSROAAllocas;

  /// Keep track of values which map to a pointer base and constant offset.
  DenseMap<Value *, std::pair<Value *, APInt>> ConstantOffsetPtrs;

  /// Keep track of dead blocks due to the constant arguments.
  SmallPtrSet<BasicBlock *, 16> DeadBlocks;

  /// The mapping of the blocks to their known unique successors due to the
  /// constant arguments.
  DenseMap<BasicBlock *, BasicBlock *> KnownSuccessors;

  /// Model the elimination of repeated loads that is expected to happen
  /// whenever we simplify away the stores that would otherwise cause them to be
  /// loads.
  bool EnableLoadElimination = true;

  /// Whether we allow inlining for recursive call.
  bool AllowRecursiveCall = false;

  SmallPtrSet<Value *, 16> LoadAddrSet;

  AllocaInst *getSROAArgForValueOrNull(Value *V) const {
    auto It = SROAArgValues.find(V);
    if (It == SROAArgValues.end() || EnabledSROAAllocas.count(It->second) == 0)
      return nullptr;
    return It->second;
  }

  /// Use a value in its given form directly if possible, otherwise try looking
  /// for it in SimplifiedValues.
  template <typename T> T *getDirectOrSimplifiedValue(Value *V) const {
    if (auto *Direct = dyn_cast<T>(V))
      return Direct;
    return getSimplifiedValue<T>(V);
  }

  // Custom simplification helper routines.
  bool isAllocaDerivedArg(Value *V);
  void disableSROAForArg(AllocaInst *SROAArg);
  void disableSROA(Value *V);
  void findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB);
  void disableLoadElimination();
  bool isGEPFree(GetElementPtrInst &GEP);
  bool canFoldInboundsGEP(GetElementPtrInst &I);
  bool accumulateGEPOffset(GEPOperator &GEP, APInt &Offset);
  bool simplifyCallSite(Function *F, CallBase &Call);
  bool simplifyCmpInstForRecCall(CmpInst &Cmp);
  bool simplifyInstruction(Instruction &I);
  bool simplifyIntrinsicCallIsConstant(CallBase &CB);
  bool simplifyIntrinsicCallObjectSize(CallBase &CB);
  ConstantInt *stripAndComputeInBoundsConstantOffsets(Value *&V);
  bool isLoweredToCall(Function *F, CallBase &Call);

  /// Return true if the given argument to the function being considered for
  /// inlining has the given attribute set either at the call site or the
  /// function declaration. Primarily used to inspect call site specific
  /// attributes since these can be more precise than the ones on the callee
  /// itself.
  bool paramHasAttr(Argument *A, Attribute::AttrKind Attr);

  /// Return true if the given value is known non null within the callee if
  /// inlined through this particular callsite.
  bool isKnownNonNullInCallee(Value *V);

  /// Return true if size growth is allowed when inlining the callee at \p Call.
  bool allowSizeGrowth(CallBase &Call);

  // Custom analysis routines.
  InlineResult analyzeBlock(BasicBlock *BB,
                            const SmallPtrSetImpl<const Value *> &EphValues);

  // Disable several entry points to the visitor so we don't accidentally use
  // them by declaring but not defining them here.
  void visit(Module *);
  void visit(Module &);
  void visit(Function *);
  void visit(Function &);
  void visit(BasicBlock *);
  void visit(BasicBlock &);

  // Provide base case for our instruction visit.
  bool visitInstruction(Instruction &I);

  // Our visit overrides.
  bool visitAlloca(AllocaInst &I);
  bool visitPHI(PHINode &I);
  bool visitGetElementPtr(GetElementPtrInst &I);
  bool visitBitCast(BitCastInst &I);
  bool visitPtrToInt(PtrToIntInst &I);
  bool visitIntToPtr(IntToPtrInst &I);
  bool visitCastInst(CastInst &I);
  bool visitCmpInst(CmpInst &I);
  bool visitSub(BinaryOperator &I);
  bool visitBinaryOperator(BinaryOperator &I);
  bool visitFNeg(UnaryOperator &I);
  bool visitLoad(LoadInst &I);
  bool visitStore(StoreInst &I);
  bool visitExtractValue(ExtractValueInst &I);
  bool visitInsertValue(InsertValueInst &I);
  bool visitCallBase(CallBase &Call);
  bool visitReturnInst(ReturnInst &RI);
  bool visitBranchInst(BranchInst &BI);
  bool visitSelectInst(SelectInst &SI);
  bool visitSwitchInst(SwitchInst &SI);
  bool visitIndirectBrInst(IndirectBrInst &IBI);
  bool visitResumeInst(ResumeInst &RI);
  bool visitCleanupReturnInst(CleanupReturnInst &RI);
  bool visitCatchReturnInst(CatchReturnInst &RI);
  bool visitUnreachableInst(UnreachableInst &I);

public:
  CallAnalyzer(
      Function &Callee, CallBase &Call, const TargetTransformInfo &TTI,
      function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
      function_ref<BlockFrequencyInfo &(Function &)> GetBFI = nullptr,
      function_ref<const TargetLibraryInfo &(Function &)> GetTLI = nullptr,
      ProfileSummaryInfo *PSI = nullptr,
      OptimizationRemarkEmitter *ORE = nullptr,
      function_ref<EphemeralValuesCache &(Function &)> GetEphValuesCache =
          nullptr)
      : TTI(TTI), GetAssumptionCache(GetAssumptionCache), GetBFI(GetBFI),
        GetTLI(GetTLI), PSI(PSI), F(Callee), DL(F.getDataLayout()), ORE(ORE),
        CandidateCall(Call), GetEphValuesCache(GetEphValuesCache) {}

  InlineResult analyze();

  /// Lookup simplified Value. May return a value owned by the caller.
  Value *getSimplifiedValueUnchecked(Value *V) const {
    return SimplifiedValues.lookup(V);
  }

  /// Lookup simplified Value, but return nullptr if the simplified value is
  /// owned by the caller.
  template <typename T> T *getSimplifiedValue(Value *V) const {
    Value *SimpleV = SimplifiedValues.lookup(V);
    if (!SimpleV)
      return nullptr;

    // Skip checks if we know T is a global. This has a small, but measurable
    // impact on compile-time.
    if constexpr (std::is_base_of_v<Constant, T>)
      return dyn_cast<T>(SimpleV);

    // Make sure the simplified Value is owned by this function.
    if (auto *I = dyn_cast<Instruction>(SimpleV)) {
      if (I->getFunction() != &F)
        return nullptr;
    } else if (auto *Arg = dyn_cast<Argument>(SimpleV)) {
      if (Arg->getParent() != &F)
        return nullptr;
    } else if (!isa<Constant>(SimpleV))
      return nullptr;
    return dyn_cast<T>(SimpleV);
  }

  // Keep a bunch of stats about the cost savings found so we can print them
  // out when debugging.
  unsigned NumConstantArgs = 0;
  unsigned NumConstantOffsetPtrArgs = 0;
  unsigned NumAllocaArgs = 0;
  unsigned NumConstantPtrCmps = 0;
  unsigned NumConstantPtrDiffs = 0;
  unsigned NumInstructionsSimplified = 0;

  void dump();
};

// When forming a binary search for switch lowering, we want the number of
// nodes in the tree, which equals the number of comparisons once lowered.
// For a given number of clusters, n, we can define a recursive function,
// f(n), to find the number of nodes in the tree. The recursion is:
// f(n) = 1 + f(n/2) + f(n - n/2), when n > 3,
// and f(n) = n, when n <= 3.
// This leads to a binary tree where, when n > 3, each leaf covers either
// f(2) or f(3) clusters. So, the number of comparisons from leaves is n,
// while the number from non-leaf nodes is:
//   2^(log2(n) - 1) - 1
//   = 2^log2(n) * 2^-1 - 1
//   = n / 2 - 1.
// Combining the comparisons from leaf and non-leaf nodes, we can estimate the
// total number of comparisons in a simple closed form:
//   n + n / 2 - 1 = n * 3 / 2 - 1
int64_t getExpectedNumberOfCompare(int NumCaseCluster) {
  return 3 * static_cast<int64_t>(NumCaseCluster) / 2 - 1;
}
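
// Worked example for the closed form above: with NumCaseCluster = 8, the
// recursion gives f(8) = 1 + f(4) + f(4) = 1 + 2 * (1 + f(2) + f(2))
// = 1 + 2 * (1 + 2 + 2) = 11, which matches 3 * 8 / 2 - 1 = 11.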

/// FIXME: if it is necessary to derive from InlineCostCallAnalyzer, note
/// the FIXME in onLoweredCall, when instantiating an InlineCostCallAnalyzer
class InlineCostCallAnalyzer final : public CallAnalyzer {
  const bool ComputeFullInlineCost;
  int LoadEliminationCost = 0;
  /// Bonus to be applied when percentage of vector instructions in callee is
  /// high (see more details in updateThreshold).
  int VectorBonus = 0;
  /// Bonus to be applied when the callee has only one reachable basic block.
  int SingleBBBonus = 0;

  /// Tunable parameters that control the analysis.
  const InlineParams &Params;

  // This DenseMap stores the delta change in cost and threshold after
  // accounting for the given instruction. The map is filled only with the
  // flag PrintInstructionComments on.
  DenseMap<const Instruction *, InstructionCostDetail> InstructionCostDetailMap;

  /// Upper bound for the inlining cost. Bonuses are being applied to account
  /// for speculative "expected profit" of the inlining decision.
  int Threshold = 0;

  /// The amount of StaticBonus applied.
  int StaticBonusApplied = 0;

  /// Attempt to evaluate indirect calls to boost its inline cost.
  const bool BoostIndirectCalls;

  /// Ignore the threshold when finalizing analysis.
  const bool IgnoreThreshold;

  // True if the cost-benefit-analysis-based inliner is enabled.
  const bool CostBenefitAnalysisEnabled;

  /// Inlining cost measured in abstract units, accounts for all the
  /// instructions expected to be executed for a given function invocation.
  /// Instructions that are statically proven to be dead based on call-site
  /// arguments are not counted here.
  int Cost = 0;

  // The cumulative cost at the beginning of the basic block being analyzed. At
  // the end of analyzing each basic block, "Cost - CostAtBBStart" represents
  // the size of that basic block.
  int CostAtBBStart = 0;

  // The static size of live but cold basic blocks. This is "static" in the
  // sense that it's not weighted by profile counts at all.
  int ColdSize = 0;

  // Whether inlining is decided by cost-threshold analysis.
  bool DecidedByCostThreshold = false;

  // Whether inlining is decided by cost-benefit analysis.
  bool DecidedByCostBenefit = false;

  // The cost-benefit pair computed by cost-benefit analysis.
  std::optional<CostBenefitPair> CostBenefit;

  bool SingleBB = true;

  unsigned SROACostSavings = 0;
  unsigned SROACostSavingsLost = 0;

  /// The mapping of caller Alloca values to their accumulated cost savings. If
  /// we have to disable SROA for one of the allocas, this tells us how much
  /// cost must be added.
  DenseMap<AllocaInst *, int> SROAArgCosts;

  /// Return true if \p Call is a cold callsite.
  bool isColdCallSite(CallBase &Call, BlockFrequencyInfo *CallerBFI);

  /// Update Threshold based on callsite properties such as callee
  /// attributes and callee hotness for PGO builds. The Callee is explicitly
  /// passed to support analyzing indirect calls whose target is inferred by
  /// analysis.
  void updateThreshold(CallBase &Call, Function &Callee);
  /// Return a higher threshold if \p Call is a hot callsite.
  std::optional<int> getHotCallSiteThreshold(CallBase &Call,
                                             BlockFrequencyInfo *CallerBFI);

  /// Handle a capped 'int' increment for Cost.
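  /// For example, if Cost is already at INT_MAX, any further positive
  /// increment saturates there instead of wrapping around.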
  void addCost(int64_t Inc) {
    Inc = std::clamp<int64_t>(Inc, INT_MIN, INT_MAX);
    Cost = std::clamp<int64_t>(Inc + Cost, INT_MIN, INT_MAX);
  }

  void onDisableSROA(AllocaInst *Arg) override {
    auto CostIt = SROAArgCosts.find(Arg);
    if (CostIt == SROAArgCosts.end())
      return;
    addCost(CostIt->second);
    SROACostSavings -= CostIt->second;
    SROACostSavingsLost += CostIt->second;
    SROAArgCosts.erase(CostIt);
  }

  void onDisableLoadElimination() override {
    addCost(LoadEliminationCost);
    LoadEliminationCost = 0;
  }

  bool onCallBaseVisitStart(CallBase &Call) override {
    if (std::optional<int> AttrCallThresholdBonus =
            getStringFnAttrAsInt(Call, "call-threshold-bonus"))
      Threshold += *AttrCallThresholdBonus;

    if (std::optional<int> AttrCallCost =
            getStringFnAttrAsInt(Call, "call-inline-cost")) {
      addCost(*AttrCallCost);
      // Prevent further processing of the call since we want to override its
      // inline cost, not just add to it.
      return false;
    }
    return true;
  }

  void onCallPenalty() override { addCost(CallPenalty); }

  void onMemAccess() override { addCost(MemAccessCost); }

  void onCallArgumentSetup(const CallBase &Call) override {
    // Pay the price of the argument setup. We account for the average 1
    // instruction per call argument setup here.
    addCost(Call.arg_size() * InstrCost);
  }
  void onLoadRelativeIntrinsic() override {
    // This is normally lowered to 4 LLVM instructions.
    addCost(3 * InstrCost);
  }
  void onLoweredCall(Function *F, CallBase &Call,
                     bool IsIndirectCall) override {
    // We account for the average 1 instruction per call argument setup here.
    addCost(Call.arg_size() * InstrCost);

    // If we have a constant that we are calling as a function, we can peer
    // through it and see the function target. This happens not infrequently
    // during devirtualization and so we want to give it a hefty bonus for
    // inlining, but cap that bonus in the event that inlining wouldn't pan out.
    // Pretend to inline the function, with a custom threshold.
    if (IsIndirectCall && BoostIndirectCalls) {
      auto IndirectCallParams = Params;
      IndirectCallParams.DefaultThreshold =
          InlineConstants::IndirectCallThreshold;
      /// FIXME: if InlineCostCallAnalyzer is derived from, this may need
      /// to instantiate the derived class.
      InlineCostCallAnalyzer CA(*F, Call, IndirectCallParams, TTI,
                                GetAssumptionCache, GetBFI, GetTLI, PSI, ORE,
                                false);
      if (CA.analyze().isSuccess()) {
        // We were able to inline the indirect call! Subtract the cost from the
        // threshold to get the bonus we want to apply, but don't go below zero.
        addCost(-std::max(0, CA.getThreshold() - CA.getCost()));
      }
    } else
      // Otherwise simply add the cost for merely making the call.
      addCost(TTI.getInlineCallPenalty(CandidateCall.getCaller(), Call,
                                       CallPenalty));
  }

  void onFinalizeSwitch(unsigned JumpTableSize, unsigned NumCaseCluster,
                        bool DefaultDestUnreachable) override {
    // If suitable for a jump table, consider the cost for the table size and
    // the branch to the destination. Note the cost increase in this function
    // is capped at the maximum valid cost by addCost.
    if (JumpTableSize) {
      // Suppose a default branch includes one compare and one conditional
      // branch if it's reachable.
      if (!DefaultDestUnreachable)
        addCost(2 * InstrCost);
      // Suppose a jump table requires one load and one jump instruction.
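      // For example, a 16-entry jump table with the default InstrCost of 5
      // adds 16 * 5 + 2 * 5 = 90 to the cost.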
      int64_t JTCost =
          static_cast<int64_t>(JumpTableSize) * InstrCost + 2 * InstrCost;
      addCost(JTCost);
      return;
    }

    if (NumCaseCluster <= 3) {
      // Suppose a comparison includes one compare and one conditional branch.
      // We can reduce a set of instructions if the default branch is
      // undefined.
      addCost((NumCaseCluster - DefaultDestUnreachable) * 2 * InstrCost);
      return;
    }

    int64_t ExpectedNumberOfCompare =
        getExpectedNumberOfCompare(NumCaseCluster);
    int64_t SwitchCost = ExpectedNumberOfCompare * 2 * InstrCost;

    addCost(SwitchCost);
  }

  // Parses the inline assembly argument to account for its cost. Inline
  // assembly instructions incur higher costs for inlining since they cannot be
  // analyzed and optimized.
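  //
  // Illustrative example (hypothetical assembly): given
  //   mov x0, x1        # set up argument
  //   .pushsection .text.unlikely
  //   brk 0
  //   .popsection
  //   1:
  //   add x0, x0, x2
  // only "mov x0, x1" and "add x0, x0, x2" are counted: the '#' comment is
  // trimmed, the .pushsection body sits at a nonzero section level, and the
  // directive and label lines are skipped.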
  void onInlineAsm(const InlineAsm &Arg) override {
    if (!InlineAsmInstrCost)
      return;
    SmallVector<StringRef, 4> AsmStrs;
    Arg.collectAsmStrs(AsmStrs);
    int SectionLevel = 0;
    int InlineAsmInstrCount = 0;
    for (StringRef AsmStr : AsmStrs) {
      // Trim whitespace and comments.
      StringRef Trimmed = AsmStr.trim();
      size_t hashPos = Trimmed.find('#');
      if (hashPos != StringRef::npos)
        Trimmed = Trimmed.substr(0, hashPos);
      // Ignore comment-only and empty lines.
      if (Trimmed.empty())
        continue;
      // Filter the outlined assembly instructions out of the cost by keeping
      // track of the section level and only accounting for instructions at
      // section level zero. Note there will be duplication in outlined
      // sections too, but it is not accounted for in the inlining cost model.
      if (Trimmed.starts_with(".pushsection")) {
        ++SectionLevel;
        continue;
      }
      if (Trimmed.starts_with(".popsection")) {
        --SectionLevel;
        continue;
      }
      // Ignore directives and labels.
      if (Trimmed.starts_with(".") || Trimmed.contains(":"))
        continue;
      if (SectionLevel == 0)
        ++InlineAsmInstrCount;
    }
    NumInlineAsmInstructions += InlineAsmInstrCount;
    addCost(InlineAsmInstrCount * InlineAsmInstrCost);
  }

  void onMissedSimplification() override { addCost(InstrCost); }

  void onInitializeSROAArg(AllocaInst *Arg) override {
    assert(Arg != nullptr &&
           "Should not initialize SROA costs for null value.");
    auto SROAArgCost = TTI.getCallerAllocaCost(&CandidateCall, Arg);
    SROACostSavings += SROAArgCost;
    SROAArgCosts[Arg] = SROAArgCost;
  }

  void onAggregateSROAUse(AllocaInst *SROAArg) override {
    auto CostIt = SROAArgCosts.find(SROAArg);
    assert(CostIt != SROAArgCosts.end() &&
           "expected this argument to have a cost");
    CostIt->second += InstrCost;
    SROACostSavings += InstrCost;
  }

  void onBlockStart(const BasicBlock *BB) override { CostAtBBStart = Cost; }

  void onBlockAnalyzed(const BasicBlock *BB) override {
    if (CostBenefitAnalysisEnabled) {
      // Keep track of the static size of live but cold basic blocks. For now,
      // we define a cold basic block to be one that's never executed.
      assert(GetBFI && "GetBFI must be available");
      BlockFrequencyInfo *BFI = &(GetBFI(F));
      assert(BFI && "BFI must be available");
      auto ProfileCount = BFI->getBlockProfileCount(BB);
      if (*ProfileCount == 0)
        ColdSize += Cost - CostAtBBStart;
    }

    auto *TI = BB->getTerminator();
    // If we had any successors at this point, then post-inlining is likely to
    // have them as well. Note that we assume any basic blocks which existed
    // due to branches or switches which folded above will also fold after
    // inlining.
    if (SingleBB && TI->getNumSuccessors() > 1) {
      // Take off the bonus we applied to the threshold.
      Threshold -= SingleBBBonus;
      SingleBB = false;
    }
  }

  void onInstructionAnalysisStart(const Instruction *I) override {
    // This function is called to store the initial cost of inlining before
    // the given instruction was assessed.
    if (!PrintInstructionComments)
      return;
    auto &CostDetail = InstructionCostDetailMap[I];
    CostDetail.CostBefore = Cost;
    CostDetail.ThresholdBefore = Threshold;
  }

  void onInstructionAnalysisFinish(const Instruction *I) override {
    // This function is called to find new values of cost and threshold after
    // the instruction has been assessed.
    if (!PrintInstructionComments)
      return;
    auto &CostDetail = InstructionCostDetailMap[I];
    CostDetail.CostAfter = Cost;
    CostDetail.ThresholdAfter = Threshold;
  }

  bool isCostBenefitAnalysisEnabled() {
    if (!PSI || !PSI->hasProfileSummary())
      return false;

    if (!GetBFI)
      return false;

    if (InlineEnableCostBenefitAnalysis.getNumOccurrences()) {
      // Honor the explicit request from the user.
      if (!InlineEnableCostBenefitAnalysis)
        return false;
    } else {
      // Otherwise, require instrumentation profile.
      if (!PSI->hasInstrumentationProfile())
        return false;
    }

    auto *Caller = CandidateCall.getParent()->getParent();
    if (!Caller->getEntryCount())
      return false;

    BlockFrequencyInfo *CallerBFI = &(GetBFI(*Caller));
    if (!CallerBFI)
      return false;

    // For now, limit to hot call site.
    if (!PSI->isHotCallSite(CandidateCall, CallerBFI))
      return false;

    // Make sure we have a nonzero entry count.
    auto EntryCount = F.getEntryCount();
    if (!EntryCount || !EntryCount->getCount())
      return false;

    BlockFrequencyInfo *CalleeBFI = &(GetBFI(F));
    if (!CalleeBFI)
      return false;

    return true;
  }

  // A helper function to choose between command line override and default.
  unsigned getInliningCostBenefitAnalysisSavingsMultiplier() const {
    if (InlineSavingsMultiplier.getNumOccurrences())
      return InlineSavingsMultiplier;
    return TTI.getInliningCostBenefitAnalysisSavingsMultiplier();
  }

  // A helper function to choose between command line override and default.
  unsigned getInliningCostBenefitAnalysisProfitableMultiplier() const {
    if (InlineSavingsProfitableMultiplier.getNumOccurrences())
      return InlineSavingsProfitableMultiplier;
    return TTI.getInliningCostBenefitAnalysisProfitableMultiplier();
  }

  void OverrideCycleSavingsAndSizeForTesting(APInt &CycleSavings, int &Size) {
    if (std::optional<int> AttrCycleSavings = getStringFnAttrAsInt(
            CandidateCall, "inline-cycle-savings-for-test")) {
      CycleSavings = *AttrCycleSavings;
    }

    if (std::optional<int> AttrRuntimeCost = getStringFnAttrAsInt(
            CandidateCall, "inline-runtime-cost-for-test")) {
      Size = *AttrRuntimeCost;
    }
  }

  // Determine whether we should inline the given call site, taking into
  // account both the size cost and the cycle savings. Return std::nullopt if
  // we don't have sufficient profiling information to make a determination.
  std::optional<bool> costBenefitAnalysis() {
    if (!CostBenefitAnalysisEnabled)
      return std::nullopt;

    // buildInlinerPipeline in the pass builder sets HotCallSiteThreshold to 0
    // for the prelink phase of the AutoFDO + ThinLTO build. Honor the logic by
    // falling back to the cost-based metric.
    // TODO: Improve this hacky condition.
    if (Threshold == 0)
      return std::nullopt;

    assert(GetBFI);
    BlockFrequencyInfo *CalleeBFI = &(GetBFI(F));
    assert(CalleeBFI);

    // The cycle savings expressed as the sum of InstrCost
    // multiplied by the estimated dynamic count of each instruction we can
    // avoid. Savings come from the call site cost, such as argument setup and
    // the call instruction, as well as the instructions that are folded.
    //
    // We use 128-bit APInt here to avoid potential overflow. This variable
    // should stay well below 10^^24 (or 2^^80) in practice. This "worst" case
    // assumes that we can avoid or fold a billion instructions, each with a
    // profile count of 10^^15 -- roughly the number of cycles for a 24-hour
    // period on a 4GHz machine.
    APInt CycleSavings(128, 0);

    for (auto &BB : F) {
      APInt CurrentSavings(128, 0);
      for (auto &I : BB) {
        if (BranchInst *BI = dyn_cast<BranchInst>(&I)) {
          // Count a conditional branch as savings if it becomes unconditional.
          if (BI->isConditional() &&
              getSimplifiedValue<ConstantInt>(BI->getCondition())) {
            CurrentSavings += InstrCost;
          }
        } else if (SwitchInst *SI = dyn_cast<SwitchInst>(&I)) {
          if (getSimplifiedValue<ConstantInt>(SI->getCondition()))
            CurrentSavings += InstrCost;
        } else if (Value *V = dyn_cast<Value>(&I)) {
          // Count an instruction as savings if we can fold it.
          if (SimplifiedValues.count(V)) {
            CurrentSavings += InstrCost;
          }
        }
      }

      auto ProfileCount = CalleeBFI->getBlockProfileCount(&BB);
      CurrentSavings *= *ProfileCount;
      CycleSavings += CurrentSavings;
    }

    // Compute the cycle savings per call.
    auto EntryProfileCount = F.getEntryCount();
    assert(EntryProfileCount && EntryProfileCount->getCount());
    auto EntryCount = EntryProfileCount->getCount();
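    // Adding EntryCount / 2 before the unsigned division rounds the per-call
    // savings to the nearest integer rather than truncating.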
    CycleSavings += EntryCount / 2;
    CycleSavings = CycleSavings.udiv(EntryCount);

    // Compute the total savings for the call site.
    auto *CallerBB = CandidateCall.getParent();
    BlockFrequencyInfo *CallerBFI = &(GetBFI(*(CallerBB->getParent())));
    CycleSavings += getCallsiteCost(TTI, this->CandidateCall, DL);
    CycleSavings *= *CallerBFI->getBlockProfileCount(CallerBB);

    // Remove the cost of the cold basic blocks to model the runtime cost more
    // accurately. Both machine block placement and function splitting could
    // place cold blocks further from hot blocks.
    int Size = Cost - ColdSize;

    // Allow tiny callees to be inlined regardless of whether they meet the
    // savings threshold.
    Size = Size > InlineSizeAllowance ? Size - InlineSizeAllowance : 1;

    OverrideCycleSavingsAndSizeForTesting(CycleSavings, Size);
    CostBenefit.emplace(APInt(128, Size), CycleSavings);

    // Let R be the ratio of CycleSavings to Size. We accept the inlining
    // opportunity if R is really high and reject if R is really low. If R is
    // somewhere in the middle, we fall back to the cost-based analysis.
    //
    // Specifically, let R = CycleSavings / Size, we accept the inlining
    // opportunity if:
    //
    //              PSI->getOrCompHotCountThreshold()
    // R > -------------------------------------------------
    //     getInliningCostBenefitAnalysisSavingsMultiplier()
    //
    // and reject the inlining opportunity if:
    //
    //                 PSI->getOrCompHotCountThreshold()
    // R <= ----------------------------------------------------
    //      getInliningCostBenefitAnalysisProfitableMultiplier()
    //
    // Otherwise, we fall back to the cost-based analysis.
    //
    // Implementation-wise, use multiplication (CycleSavings * Multiplier,
    // HotCountThreshold * Size) rather than division to avoid precision loss.
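    //
    // Worked example (hypothetical numbers): with Size = 100 and a hot count
    // threshold of 6000, the comparison threshold below is 600000. Cycle
    // savings of 80000 are accepted under the default savings multiplier of 8
    // (80000 * 8 = 640000 >= 600000), while cycle savings of 50000 are
    // rejected under the default profitable multiplier of 4
    // (50000 * 4 = 200000 < 600000).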
    APInt Threshold(128, PSI->getOrCompHotCountThreshold());
    Threshold *= Size;

    APInt UpperBoundCycleSavings = CycleSavings;
    UpperBoundCycleSavings *= getInliningCostBenefitAnalysisSavingsMultiplier();
    if (UpperBoundCycleSavings.uge(Threshold))
      return true;

    APInt LowerBoundCycleSavings = CycleSavings;
    LowerBoundCycleSavings *=
        getInliningCostBenefitAnalysisProfitableMultiplier();
    if (LowerBoundCycleSavings.ult(Threshold))
      return false;

    // Otherwise, fall back to the cost-based analysis.
    return std::nullopt;
  }

  InlineResult finalizeAnalysis() override {
    // Loops generally act a lot like calls in that they act like barriers to
    // movement, require a certain amount of setup, etc. So when optimising for
    // size, we penalise any call sites that perform loops. We do this after all
    // other costs here, so will likely only be dealing with relatively small
    // functions (and hence DT and LI will hopefully be cheap).
    auto *Caller = CandidateCall.getFunction();
    if (Caller->hasMinSize()) {
      DominatorTree DT(F);
      LoopInfo LI(DT);
      int NumLoops = 0;
      for (Loop *L : LI) {
        // Ignore loops that will not be executed.
        if (DeadBlocks.count(L->getHeader()))
          continue;
        NumLoops++;
      }
      addCost(NumLoops * InlineConstants::LoopPenalty);
    }

    // We applied the maximum possible vector bonus at the beginning. Now,
    // subtract the excess bonus, if any, from the Threshold before
    // comparing against Cost.
    if (NumVectorInstructions <= NumInstructions / 10)
      Threshold -= VectorBonus;
    else if (NumVectorInstructions <= NumInstructions / 2)
      Threshold -= VectorBonus / 2;

    if (std::optional<int> AttrCost =
            getStringFnAttrAsInt(CandidateCall, "function-inline-cost"))
      Cost = *AttrCost;

    if (std::optional<int> AttrCostMult = getStringFnAttrAsInt(
            CandidateCall,
            InlineConstants::FunctionInlineCostMultiplierAttributeName))
      Cost *= *AttrCostMult;

    if (std::optional<int> AttrThreshold =
            getStringFnAttrAsInt(CandidateCall, "function-inline-threshold"))
      Threshold = *AttrThreshold;

    if (auto Result = costBenefitAnalysis()) {
      DecidedByCostBenefit = true;
      if (*Result)
        return InlineResult::success();
      else
        return InlineResult::failure("Cost over threshold.");
    }

    if (IgnoreThreshold)
      return InlineResult::success();

    DecidedByCostThreshold = true;
    return Cost < std::max(1, Threshold)
               ? InlineResult::success()
               : InlineResult::failure("Cost over threshold.");
  }

  bool shouldStop() override {
    if (IgnoreThreshold || ComputeFullInlineCost)
      return false;
    // Bail out the moment we cross the threshold. This means we'll under-count
    // the cost, but only when undercounting doesn't matter.
    if (Cost < Threshold)
      return false;
    DecidedByCostThreshold = true;
    return true;
  }

  void onLoadEliminationOpportunity() override {
    LoadEliminationCost += InstrCost;
  }

  InlineResult onAnalysisStart() override {
    // Perform some tweaks to the cost and threshold based on the direct
    // callsite information.

    // We want to more aggressively inline vector-dense kernels, so up the
    // threshold, and we'll lower it if the % of vector instructions gets too
    // low. Note that these bonuses are somewhat arbitrary and evolved over
    // time by accident as much as because they are principled bonuses.
    //
    // FIXME: It would be nice to remove all such bonuses. At least it would be
    // nice to base the bonus values on something more scientific.
    assert(NumInstructions == 0);
    assert(NumVectorInstructions == 0);

    // Update the threshold based on callsite properties.
    updateThreshold(CandidateCall, F);

    // While Threshold depends on commandline options that can take negative
    // values, we want to enforce the invariant that the computed threshold and
    // bonuses are non-negative.
    assert(Threshold >= 0);
    assert(SingleBBBonus >= 0);
    assert(VectorBonus >= 0);

    // Speculatively apply all possible bonuses to Threshold. If cost exceeds
    // this Threshold any time, and cost cannot decrease, we can stop processing
    // the rest of the function body.
    Threshold += (SingleBBBonus + VectorBonus);

    // Give out bonuses for the callsite, as the instructions setting them up
    // will be gone after inlining.
    addCost(-getCallsiteCost(TTI, this->CandidateCall, DL));

    // If this function uses the coldcc calling convention, prefer not to inline
    // it.
    if (F.getCallingConv() == CallingConv::Cold)
      addCost(InlineConstants::ColdccPenalty);

    LLVM_DEBUG(dbgs() << "      Initial cost: " << Cost << "\n");

    // Check if we're done. This can happen due to bonuses and penalties.
    if (Cost >= Threshold && !ComputeFullInlineCost)
      return InlineResult::failure("high cost");

    return InlineResult::success();
  }

public:
  InlineCostCallAnalyzer(
      Function &Callee, CallBase &Call, const InlineParams &Params,
      const TargetTransformInfo &TTI,
      function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
      function_ref<BlockFrequencyInfo &(Function &)> GetBFI = nullptr,
      function_ref<const TargetLibraryInfo &(Function &)> GetTLI = nullptr,
      ProfileSummaryInfo *PSI = nullptr,
      OptimizationRemarkEmitter *ORE = nullptr, bool BoostIndirect = true,
      bool IgnoreThreshold = false,
      function_ref<EphemeralValuesCache &(Function &)> GetEphValuesCache =
          nullptr)
      : CallAnalyzer(Callee, Call, TTI, GetAssumptionCache, GetBFI, GetTLI, PSI,
                     ORE, GetEphValuesCache),
        ComputeFullInlineCost(OptComputeFullInlineCost ||
                              Params.ComputeFullInlineCost || ORE ||
                              isCostBenefitAnalysisEnabled()),
        Params(Params), Threshold(Params.DefaultThreshold),
        BoostIndirectCalls(BoostIndirect), IgnoreThreshold(IgnoreThreshold),
        CostBenefitAnalysisEnabled(isCostBenefitAnalysisEnabled()),
        Writer(this) {
    AllowRecursiveCall = *Params.AllowRecursiveCall;
  }

  /// Annotation Writer for instruction details.
  InlineCostAnnotationWriter Writer;

  void dump();

  // Prints the same analysis as dump(), but its definition is not dependent
  // on the build.
  void print(raw_ostream &OS);

  std::optional<InstructionCostDetail> getCostDetails(const Instruction *I) {
    auto It = InstructionCostDetailMap.find(I);
    if (It != InstructionCostDetailMap.end())
      return It->second;
    return std::nullopt;
  }

  ~InlineCostCallAnalyzer() override = default;
  int getThreshold() const { return Threshold; }
  int getCost() const { return Cost; }
  int getStaticBonusApplied() const { return StaticBonusApplied; }
  std::optional<CostBenefitPair> getCostBenefitPair() { return CostBenefit; }
  bool wasDecidedByCostBenefit() const { return DecidedByCostBenefit; }
  bool wasDecidedByCostThreshold() const { return DecidedByCostThreshold; }
};

// Return true if CB is the sole call to local function Callee.
static bool isSoleCallToLocalFunction(const CallBase &CB,
                                      const Function &Callee) {
  return Callee.hasLocalLinkage() && Callee.hasOneLiveUse() &&
         &Callee == CB.getCalledFunction();
}

class InlineCostFeaturesAnalyzer final : public CallAnalyzer {
private:
  InlineCostFeatures Cost = {};

  // FIXME: These constants are taken from the heuristic-based cost visitor.
  // These should be removed entirely in a later revision to avoid reliance on
  // heuristics in the ML inliner.
  static constexpr int JTCostMultiplier = 2;
  static constexpr int CaseClusterCostMultiplier = 2;
  static constexpr int SwitchDefaultDestCostMultiplier = 2;
  static constexpr int SwitchCostMultiplier = 2;

  // FIXME: These are taken from the heuristic-based cost visitor: we should
  // eventually abstract these to the CallAnalyzer to avoid duplication.
  unsigned SROACostSavingOpportunities = 0;
  int VectorBonus = 0;
  int SingleBBBonus = 0;
  int Threshold = 5;

  DenseMap<AllocaInst *, unsigned> SROACosts;

  void increment(InlineCostFeatureIndex Feature, int64_t Delta = 1) {
    Cost[static_cast<size_t>(Feature)] += Delta;
  }

  void set(InlineCostFeatureIndex Feature, int64_t Value) {
    Cost[static_cast<size_t>(Feature)] = Value;
  }

  void onDisableSROA(AllocaInst *Arg) override {
    auto CostIt = SROACosts.find(Arg);
    if (CostIt == SROACosts.end())
      return;

    increment(InlineCostFeatureIndex::sroa_losses, CostIt->second);
    SROACostSavingOpportunities -= CostIt->second;
    SROACosts.erase(CostIt);
  }

  void onDisableLoadElimination() override {
    set(InlineCostFeatureIndex::load_elimination, 1);
  }

  void onCallPenalty() override {
    increment(InlineCostFeatureIndex::call_penalty, CallPenalty);
  }

  void onCallArgumentSetup(const CallBase &Call) override {
    increment(InlineCostFeatureIndex::call_argument_setup,
              Call.arg_size() * InstrCost);
  }

  void onLoadRelativeIntrinsic() override {
    increment(InlineCostFeatureIndex::load_relative_intrinsic, 3 * InstrCost);
  }

  void onLoweredCall(Function *F, CallBase &Call,
                     bool IsIndirectCall) override {
    increment(InlineCostFeatureIndex::lowered_call_arg_setup,
              Call.arg_size() * InstrCost);

    if (IsIndirectCall) {
      InlineParams IndirectCallParams = {/* DefaultThreshold*/ 0,
                                         /*HintThreshold*/ {},
                                         /*ColdThreshold*/ {},
                                         /*OptSizeThreshold*/ {},
                                         /*OptMinSizeThreshold*/ {},
                                         /*HotCallSiteThreshold*/ {},
                                         /*LocallyHotCallSiteThreshold*/ {},
                                         /*ColdCallSiteThreshold*/ {},
                                         /*ComputeFullInlineCost*/ true,
                                         /*EnableDeferral*/ true};
      IndirectCallParams.DefaultThreshold =
          InlineConstants::IndirectCallThreshold;

      InlineCostCallAnalyzer CA(*F, Call, IndirectCallParams, TTI,
                                GetAssumptionCache, GetBFI, GetTLI, PSI, ORE,
                                false, true);
      if (CA.analyze().isSuccess()) {
        increment(InlineCostFeatureIndex::nested_inline_cost_estimate,
                  CA.getCost());
        increment(InlineCostFeatureIndex::nested_inlines, 1);
      }
    } else {
      onCallPenalty();
    }
  }

  void onFinalizeSwitch(unsigned JumpTableSize, unsigned NumCaseCluster,
                        bool DefaultDestUnreachable) override {
    if (JumpTableSize) {
      if (!DefaultDestUnreachable)
        increment(InlineCostFeatureIndex::switch_default_dest_penalty,
                  SwitchDefaultDestCostMultiplier * InstrCost);
      int64_t JTCost = static_cast<int64_t>(JumpTableSize) * InstrCost +
                       JTCostMultiplier * InstrCost;
      increment(InlineCostFeatureIndex::jump_table_penalty, JTCost);
      return;
    }

    if (NumCaseCluster <= 3) {
      increment(InlineCostFeatureIndex::case_cluster_penalty,
                (NumCaseCluster - DefaultDestUnreachable) *
                    CaseClusterCostMultiplier * InstrCost);
      return;
    }

    int64_t ExpectedNumberOfCompare =
        getExpectedNumberOfCompare(NumCaseCluster);

    int64_t SwitchCost =
        ExpectedNumberOfCompare * SwitchCostMultiplier * InstrCost;
    increment(InlineCostFeatureIndex::switch_penalty, SwitchCost);
  }

  void onMissedSimplification() override {
    increment(InlineCostFeatureIndex::unsimplified_common_instructions,
              InstrCost);
  }

  void onInitializeSROAArg(AllocaInst *Arg) override {
    auto SROAArgCost = TTI.getCallerAllocaCost(&CandidateCall, Arg);
    SROACosts[Arg] = SROAArgCost;
    SROACostSavingOpportunities += SROAArgCost;
  }

  void onAggregateSROAUse(AllocaInst *Arg) override {
    SROACosts.find(Arg)->second += InstrCost;
    SROACostSavingOpportunities += InstrCost;
  }

  void onBlockAnalyzed(const BasicBlock *BB) override {
    if (BB->getTerminator()->getNumSuccessors() > 1)
      set(InlineCostFeatureIndex::is_multiple_blocks, 1);
    Threshold -= SingleBBBonus;
  }

  InlineResult finalizeAnalysis() override {
    auto *Caller = CandidateCall.getFunction();
    if (Caller->hasMinSize()) {
      DominatorTree DT(F);
      LoopInfo LI(DT);
      for (Loop *L : LI) {
        // Ignore loops that will not be executed.
        if (DeadBlocks.count(L->getHeader()))
          continue;
        increment(InlineCostFeatureIndex::num_loops,
                  InlineConstants::LoopPenalty);
      }
    }
    set(InlineCostFeatureIndex::dead_blocks, DeadBlocks.size());
    set(InlineCostFeatureIndex::simplified_instructions,
        NumInstructionsSimplified);
    set(InlineCostFeatureIndex::constant_args, NumConstantArgs);
    set(InlineCostFeatureIndex::constant_offset_ptr_args,
        NumConstantOffsetPtrArgs);
    set(InlineCostFeatureIndex::sroa_savings, SROACostSavingOpportunities);

    if (NumVectorInstructions <= NumInstructions / 10)
      Threshold -= VectorBonus;
    else if (NumVectorInstructions <= NumInstructions / 2)
      Threshold -= VectorBonus / 2;

    set(InlineCostFeatureIndex::threshold, Threshold);

    return InlineResult::success();
  }

  bool shouldStop() override { return false; }

  void onLoadEliminationOpportunity() override {
    increment(InlineCostFeatureIndex::load_elimination, 1);
  }

  InlineResult onAnalysisStart() override {
    increment(InlineCostFeatureIndex::callsite_cost,
              -1 * getCallsiteCost(TTI, this->CandidateCall, DL));

    set(InlineCostFeatureIndex::cold_cc_penalty,
        (F.getCallingConv() == CallingConv::Cold));

    set(InlineCostFeatureIndex::last_call_to_static_bonus,
        isSoleCallToLocalFunction(CandidateCall, F));

    // FIXME: we shouldn't repeat this logic in both the Features and Cost
    // analyzer - instead, we should abstract it to a common method in the
    // CallAnalyzer.
    int SingleBBBonusPercent = 50;
    int VectorBonusPercent = TTI.getInlinerVectorBonusPercent();
    Threshold += TTI.adjustInliningThreshold(&CandidateCall);
    Threshold *= TTI.getInliningThresholdMultiplier();
    SingleBBBonus = Threshold * SingleBBBonusPercent / 100;
    VectorBonus = Threshold * VectorBonusPercent / 100;
    Threshold += (SingleBBBonus + VectorBonus);

    return InlineResult::success();
  }

public:
  InlineCostFeaturesAnalyzer(
      const TargetTransformInfo &TTI,
      function_ref<AssumptionCache &(Function &)> &GetAssumptionCache,
      function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
      function_ref<const TargetLibraryInfo &(Function &)> GetTLI,
      ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE, Function &Callee,
      CallBase &Call)
      : CallAnalyzer(Callee, Call, TTI, GetAssumptionCache, GetBFI, GetTLI,
                     PSI) {}

  const InlineCostFeatures &features() const { return Cost; }
};

} // namespace

/// Test whether the given value is an Alloca-derived function argument.
bool CallAnalyzer::isAllocaDerivedArg(Value *V) {
  return SROAArgValues.count(V);
}

void CallAnalyzer::disableSROAForArg(AllocaInst *SROAArg) {
  onDisableSROA(SROAArg);
  EnabledSROAAllocas.erase(SROAArg);
  disableLoadElimination();
}

void InlineCostAnnotationWriter::emitInstructionAnnot(
    const Instruction *I, formatted_raw_ostream &OS) {
  // The cost of inlining the given instruction is always printed.
  // The threshold delta is printed only when it is non-zero, which happens
  // when we decided to give a bonus at a particular instruction.
1491 std::optional<InstructionCostDetail> Record = ICCA->getCostDetails(I);
1492 if (!Record)
1493 OS << "; No analysis for the instruction";
1494 else {
1495 OS << "; cost before = " << Record->CostBefore
1496 << ", cost after = " << Record->CostAfter
1497 << ", threshold before = " << Record->ThresholdBefore
1498 << ", threshold after = " << Record->ThresholdAfter << ", ";
1499 OS << "cost delta = " << Record->getCostDelta();
1500 if (Record->hasThresholdChanged())
1501 OS << ", threshold delta = " << Record->getThresholdDelta();
1502 }
1503 auto *V = ICCA->getSimplifiedValueUnchecked(V: const_cast<Instruction *>(I));
1504 if (V) {
1505 OS << ", simplified to ";
1506 V->print(O&: OS, IsForDebug: true);
1507 if (auto *VI = dyn_cast<Instruction>(Val: V)) {
1508 if (VI->getFunction() != I->getFunction())
1509 OS << " (caller instruction)";
1510 } else if (auto *VArg = dyn_cast<Argument>(Val: V)) {
1511 if (VArg->getParent() != I->getFunction())
1512 OS << " (caller argument)";
1513 }
1514 }
1515 OS << "\n";
1516}
1517
1518/// If 'V' maps to a SROA candidate, disable SROA for it.
1519void CallAnalyzer::disableSROA(Value *V) {
1520 if (auto *SROAArg = getSROAArgForValueOrNull(V)) {
1521 disableSROAForArg(SROAArg);
1522 }
1523}
1524
1525void CallAnalyzer::disableLoadElimination() {
1526 if (EnableLoadElimination) {
1527 onDisableLoadElimination();
1528 EnableLoadElimination = false;
1529 }
1530}
1531
1532/// Accumulate a constant GEP offset into an APInt if possible.
1533///
1534/// Returns false if unable to compute the offset for any reason. Respects any
1535/// simplified values known during the analysis of this callsite.
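///
/// For example, for a pointer %p with
///   %g = getelementptr inbounds { i32, i32 }, ptr %p, i64 1, i32 1
/// this accumulates 12 into Offset: 8 for the leading index over the 8-byte
/// struct, plus 4 for field 1.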
1536bool CallAnalyzer::accumulateGEPOffset(GEPOperator &GEP, APInt &Offset) {
1537 unsigned IntPtrWidth = DL.getIndexTypeSizeInBits(Ty: GEP.getType());
1538 assert(IntPtrWidth == Offset.getBitWidth());
1539
1540 for (gep_type_iterator GTI = gep_type_begin(GEP), GTE = gep_type_end(GEP);
1541 GTI != GTE; ++GTI) {
1542 ConstantInt *OpC =
1543 getDirectOrSimplifiedValue<ConstantInt>(V: GTI.getOperand());
1544 if (!OpC)
1545 return false;
1546 if (OpC->isZero())
1547 continue;
1548
1549 // Handle a struct index, which adds its field offset to the pointer.
1550 if (StructType *STy = GTI.getStructTypeOrNull()) {
1551 unsigned ElementIdx = OpC->getZExtValue();
1552 const StructLayout *SL = DL.getStructLayout(Ty: STy);
1553 Offset += APInt(IntPtrWidth, SL->getElementOffset(Idx: ElementIdx));
1554 continue;
1555 }
1556
1557 APInt TypeSize(IntPtrWidth, GTI.getSequentialElementStride(DL));
1558 Offset += OpC->getValue().sextOrTrunc(width: IntPtrWidth) * TypeSize;
1559 }
1560 return true;
1561}
1562
1563/// Use TTI to check whether a GEP is free.
1564///
1565/// Respects any simplified values known during the analysis of this callsite.
1566bool CallAnalyzer::isGEPFree(GetElementPtrInst &GEP) {
1567 SmallVector<Value *, 4> Operands;
1568 Operands.push_back(Elt: GEP.getOperand(i_nocapture: 0));
1569 for (const Use &Op : GEP.indices())
1570 if (Constant *SimpleOp = getSimplifiedValue<Constant>(V: Op))
1571 Operands.push_back(Elt: SimpleOp);
1572 else
1573 Operands.push_back(Elt: Op);
1574 return TTI.getInstructionCost(U: &GEP, Operands,
1575 CostKind: TargetTransformInfo::TCK_SizeAndLatency) ==
1576 TargetTransformInfo::TCC_Free;
1577}
1578
1579bool CallAnalyzer::visitAlloca(AllocaInst &I) {
1580 disableSROA(V: I.getOperand(i_nocapture: 0));
1581
1582 // Check whether inlining will turn a dynamic alloca into a static
1583 // alloca and handle that case.
1584 if (I.isArrayAllocation()) {
1585 Constant *Size = getSimplifiedValue<Constant>(V: I.getArraySize());
1586 if (auto *AllocSize = dyn_cast_or_null<ConstantInt>(Val: Size)) {
      // Sometimes a dynamic alloca can be converted into a static alloca
      // after this constant propagation and become a huge static alloca on an
      // unconditional CFG path. Avoid inlining if this would push the
      // allocated size above a threshold.
1591 // FIXME: If the threshold is removed or lowered too much, we could end up
1592 // being too pessimistic and prevent inlining non-problematic code. This
1593 // could result in unintended perf regressions. A better overall strategy
1594 // is needed to track stack usage during inlining.
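      //
      // For example, 'alloca i8, i64 %n' where %n simplifies to a large
      // constant here becomes a huge static alloca after inlining; once the
      // accumulated size crosses the cap below, we conservatively flag the
      // function as containing a dynamic alloca to block inlining.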
1595 Type *Ty = I.getAllocatedType();
1596 AllocatedSize = SaturatingMultiplyAdd(
1597 X: AllocSize->getLimitedValue(),
1598 Y: DL.getTypeAllocSize(Ty).getKnownMinValue(), A: AllocatedSize);
1599 if (AllocatedSize > InlineConstants::MaxSimplifiedDynamicAllocaToInline)
1600 HasDynamicAlloca = true;
1601 return false;
1602 }
1603 }
1604
1605 if (I.isStaticAlloca()) {
    // Accumulate the allocated size if constant and executed once.
    // Note: if AllocSize is a vscale quantity, this underestimates the
    // allocated size; such an alloca also carries some of the cost of a
    // dynamic alloca, but it is recorded here as a constant-size alloca.
1610 TypeSize AllocSize = I.getAllocationSize(DL).value_or(u: TypeSize::getZero());
1611 AllocatedSize = SaturatingAdd(X: AllocSize.getKnownMinValue(), Y: AllocatedSize);
1612 } else {
1613 // FIXME: This is overly conservative. Dynamic allocas are inefficient for
1614 // a variety of reasons, and so we would like to not inline them into
1615 // functions which don't currently have a dynamic alloca. This simply
1616 // disables inlining altogether in the presence of a dynamic alloca.
1617 HasDynamicAlloca = true;
1618 }
1619
1620 return false;
1621}
1622
1623bool CallAnalyzer::visitPHI(PHINode &I) {
  // FIXME: We need to propagate SROA *disabling* through phi nodes, even
  // though we don't want to propagate its bonuses. The idea is to disable
  // SROA if it *might* be used in an inappropriate manner.
1627
1628 // Phi nodes are always zero-cost.
  // FIXME: Pointer sizes may differ between different address spaces, so do
  // we need to use the correct address space in the call to
  // getPointerSizeInBits here? Or could we skip that call completely? As far
  // as I can see, ZeroOffset is used as a dummy value, so we could probably
  // use any bit width for it.
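  //
  // For illustration: a phi whose live incoming values all simplify to the
  // same constant folds away entirely, e.g.
  //   %p = phi ptr [ @g, %bb1 ], [ @g, %bb2 ]
  // maps %p to @g in SimplifiedValues, while incoming values that agree on a
  // (base, offset) pair instead record %p in ConstantOffsetPtrs.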
1634 APInt ZeroOffset = APInt::getZero(numBits: DL.getPointerSizeInBits(AS: 0));
1635 bool CheckSROA = I.getType()->isPointerTy();
1636
1637 // Track the constant or pointer with constant offset we've seen so far.
1638 Constant *FirstC = nullptr;
1639 std::pair<Value *, APInt> FirstBaseAndOffset = {nullptr, ZeroOffset};
1640 Value *FirstV = nullptr;
1641
1642 for (unsigned i = 0, e = I.getNumIncomingValues(); i != e; ++i) {
1643 BasicBlock *Pred = I.getIncomingBlock(i);
1644 // If the incoming block is dead, skip the incoming block.
1645 if (DeadBlocks.count(Ptr: Pred))
1646 continue;
    // If the parent block of the phi is not the known successor of the
    // incoming block, skip the incoming block.
1649 BasicBlock *KnownSuccessor = KnownSuccessors[Pred];
1650 if (KnownSuccessor && KnownSuccessor != I.getParent())
1651 continue;
1652
1653 Value *V = I.getIncomingValue(i);
1654 // If the incoming value is this phi itself, skip the incoming value.
1655 if (&I == V)
1656 continue;
1657
1658 Constant *C = getDirectOrSimplifiedValue<Constant>(V);
1659
1660 std::pair<Value *, APInt> BaseAndOffset = {nullptr, ZeroOffset};
1661 if (!C && CheckSROA)
1662 BaseAndOffset = ConstantOffsetPtrs.lookup(Val: V);
1663
1664 if (!C && !BaseAndOffset.first)
      // The incoming value is neither a constant nor a pointer with constant
      // offset; exit early.
1667 return true;
1668
1669 if (FirstC) {
1670 if (FirstC == C)
1671 // If we've seen a constant incoming value before and it is the same
1672 // constant we see this time, continue checking the next incoming value.
1673 continue;
      // Otherwise exit early: either we see a different constant this time,
      // or we saw a constant before but now have a pointer with a constant
      // offset.
1676 return true;
1677 }
1678
1679 if (FirstV) {
1680 // The same logic as above, but check pointer with constant offset here.
1681 if (FirstBaseAndOffset == BaseAndOffset)
1682 continue;
1683 return true;
1684 }
1685
1686 if (C) {
      // This is the first time we've seen a constant; record it.
1688 FirstC = C;
1689 continue;
1690 }
1691
    // The remaining case is that this is the first time we've seen a pointer
    // with a constant offset; record it.
1694 FirstV = V;
1695 FirstBaseAndOffset = BaseAndOffset;
1696 }
1697
1698 // Check if we can map phi to a constant.
1699 if (FirstC) {
1700 SimplifiedValues[&I] = FirstC;
1701 return true;
1702 }
1703
1704 // Check if we can map phi to a pointer with constant offset.
1705 if (FirstBaseAndOffset.first) {
1706 ConstantOffsetPtrs[&I] = FirstBaseAndOffset;
1707
1708 if (auto *SROAArg = getSROAArgForValueOrNull(V: FirstV))
1709 SROAArgValues[&I] = SROAArg;
1710 }
1711
1712 return true;
1713}
1714
/// Check whether we can fold GEPs of constant-offset call site argument
/// pointers. This requires target data and inbounds GEPs.
1717///
1718/// \return true if the specified GEP can be folded.
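///
/// For example, if %p is already known to be (%arg, 16), then
///   %g = getelementptr inbounds i32, ptr %p, i64 2
/// folds to (%arg, 24) and is recorded in ConstantOffsetPtrs.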
1719bool CallAnalyzer::canFoldInboundsGEP(GetElementPtrInst &I) {
1720 // Check if we have a base + offset for the pointer.
1721 std::pair<Value *, APInt> BaseAndOffset =
1722 ConstantOffsetPtrs.lookup(Val: I.getPointerOperand());
1723 if (!BaseAndOffset.first)
1724 return false;
1725
1726 // Check if the offset of this GEP is constant, and if so accumulate it
1727 // into Offset.
1728 if (!accumulateGEPOffset(GEP&: cast<GEPOperator>(Val&: I), Offset&: BaseAndOffset.second))
1729 return false;
1730
1731 // Add the result as a new mapping to Base + Offset.
1732 ConstantOffsetPtrs[&I] = std::move(BaseAndOffset);
1733
1734 return true;
1735}
1736
1737bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) {
1738 auto *SROAArg = getSROAArgForValueOrNull(V: I.getPointerOperand());
1739
1740 // Lambda to check whether a GEP's indices are all constant.
1741 auto IsGEPOffsetConstant = [&](GetElementPtrInst &GEP) {
1742 for (const Use &Op : GEP.indices())
1743 if (!getDirectOrSimplifiedValue<Constant>(V: Op))
1744 return false;
1745 return true;
1746 };
1747
1748 if (!DisableGEPConstOperand)
1749 if (simplifyInstruction(I))
1750 return true;
1751
1752 if ((I.isInBounds() && canFoldInboundsGEP(I)) || IsGEPOffsetConstant(I)) {
1753 if (SROAArg)
1754 SROAArgValues[&I] = SROAArg;
1755
1756 // Constant GEPs are modeled as free.
1757 return true;
1758 }
1759
1760 // Variable GEPs will require math and will disable SROA.
1761 if (SROAArg)
1762 disableSROAForArg(SROAArg);
1763 return isGEPFree(GEP&: I);
1764}
1765
// Simplify \p Cmp if the RHS is constant and we can value-track the LHS.
// This only handles the case where the Cmp guards a recursive call and
// re-evaluating the Cmp with the call's argument shows that the recursive
// call will not be taken again.
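//
// A sketch of the pattern, in C:
//   int f(int n) { if (n > 0) return f(0); return 1; }
// When analyzing the recursive callsite f(0) inside f, re-evaluating the
// guard 'n > 0' with the call argument 0 folds it to false, so the inlined
// body is known not to recurse further.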
1769bool CallAnalyzer::simplifyCmpInstForRecCall(CmpInst &Cmp) {
  // Bail out if LHS is not a function argument or RHS is not constant:
1771 if (!isa<Argument>(Val: Cmp.getOperand(i_nocapture: 0)) || !isa<Constant>(Val: Cmp.getOperand(i_nocapture: 1)))
1772 return false;
1773 auto *CmpOp = Cmp.getOperand(i_nocapture: 0);
1774 // Make sure that the callsite is recursive:
1775 if (CandidateCall.getCaller() != &F)
1776 return false;
1777 // Only handle the case when the callsite has a single predecessor:
1778 auto *CallBB = CandidateCall.getParent();
1779 auto *Predecessor = CallBB->getSinglePredecessor();
1780 if (!Predecessor)
1781 return false;
1782 // Check if the callsite is guarded by the same Cmp instruction:
1783 auto *Br = dyn_cast<BranchInst>(Val: Predecessor->getTerminator());
1784 if (!Br || Br->isUnconditional() || Br->getCondition() != &Cmp)
1785 return false;
1786
  // Check if any argument of the recursive callsite affects the cmp
  // instruction:
1789 bool ArgFound = false;
1790 Value *FuncArg = nullptr, *CallArg = nullptr;
1791 for (unsigned ArgNum = 0;
1792 ArgNum < F.arg_size() && ArgNum < CandidateCall.arg_size(); ArgNum++) {
1793 FuncArg = F.getArg(i: ArgNum);
1794 CallArg = CandidateCall.getArgOperand(i: ArgNum);
1795 if (FuncArg == CmpOp && CallArg != CmpOp) {
1796 ArgFound = true;
1797 break;
1798 }
1799 }
1800 if (!ArgFound)
1801 return false;
1802
1803 // Now we have a recursive call that is guarded by a cmp instruction.
1804 // Check if this cmp can be simplified:
1805 SimplifyQuery SQ(DL, dyn_cast<Instruction>(Val: CallArg));
1806 CondContext CC(&Cmp);
1807 CC.Invert = (CallBB != Br->getSuccessor(i: 0));
1808 SQ.CC = &CC;
1809 CC.AffectedValues.insert(Ptr: FuncArg);
1810 Value *SimplifiedInstruction = llvm::simplifyInstructionWithOperands(
1811 I: cast<CmpInst>(Val: &Cmp), NewOps: {CallArg, Cmp.getOperand(i_nocapture: 1)}, Q: SQ);
1812 if (auto *ConstVal = dyn_cast_or_null<ConstantInt>(Val: SimplifiedInstruction)) {
1813 // Make sure that the BB of the recursive call is NOT the true successor
1814 // of the icmp. In other words, make sure that the recursion depth is 1.
1815 if ((ConstVal->isOne() && CC.Invert) ||
1816 (ConstVal->isZero() && !CC.Invert)) {
1817 SimplifiedValues[&Cmp] = ConstVal;
1818 return true;
1819 }
1820 }
1821 return false;
1822}
1823
1824/// Simplify \p I if its operands are constants and update SimplifiedValues.
1825bool CallAnalyzer::simplifyInstruction(Instruction &I) {
1826 SmallVector<Constant *> COps;
1827 for (Value *Op : I.operands()) {
1828 Constant *COp = getDirectOrSimplifiedValue<Constant>(V: Op);
1829 if (!COp)
1830 return false;
1831 COps.push_back(Elt: COp);
1832 }
1833 auto *C = ConstantFoldInstOperands(I: &I, Ops: COps, DL);
1834 if (!C)
1835 return false;
1836 SimplifiedValues[&I] = C;
1837 return true;
1838}
1839
1840/// Try to simplify a call to llvm.is.constant.
1841///
1842/// Duplicate the argument checking from CallAnalyzer::simplifyCallSite since
1843/// we expect calls of this specific intrinsic to be infrequent.
1844///
1845/// FIXME: Given that we know CB's parent (F) caller
1846/// (CandidateCall->getParent()->getParent()), we might be able to determine
1847/// whether inlining F into F's caller would change how the call to
1848/// llvm.is.constant would evaluate.
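///
/// For example, given
///   %r = call i1 @llvm.is.constant.i32(i32 %x)
/// where %x simplifies to a constant at this callsite, the call folds to
/// 'true'; otherwise it folds to 'false', matching the intrinsic's eventual
/// lowering.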
1849bool CallAnalyzer::simplifyIntrinsicCallIsConstant(CallBase &CB) {
1850 Value *Arg = CB.getArgOperand(i: 0);
1851 auto *C = getDirectOrSimplifiedValue<Constant>(V: Arg);
1852
1853 Type *RT = CB.getFunctionType()->getReturnType();
1854 SimplifiedValues[&CB] = ConstantInt::get(Ty: RT, V: C ? 1 : 0);
1855 return true;
1856}
1857
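// For example, 'call i64 @llvm.objectsize.i64.p0(ptr %p, i1 false, i1 true,
// i1 false)' on a known 8-byte object folds to the constant 8 below, while a
// true fourth ("dynamic") argument requests runtime evaluation and we bail.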
1858bool CallAnalyzer::simplifyIntrinsicCallObjectSize(CallBase &CB) {
1859 // As per the langref, "The fourth argument to llvm.objectsize determines if
1860 // the value should be evaluated at runtime."
1861 if (cast<ConstantInt>(Val: CB.getArgOperand(i: 3))->isOne())
1862 return false;
1863
1864 Value *V = lowerObjectSizeCall(ObjectSize: &cast<IntrinsicInst>(Val&: CB), DL, TLI: nullptr,
1865 /*MustSucceed=*/true);
1866 Constant *C = dyn_cast_or_null<Constant>(Val: V);
1867 if (C)
1868 SimplifiedValues[&CB] = C;
1869 return C;
1870}
1871
1872bool CallAnalyzer::visitBitCast(BitCastInst &I) {
1873 // Propagate constants through bitcasts.
1874 if (simplifyInstruction(I))
1875 return true;
1876
  // Track base/offsets through casts.
1878 std::pair<Value *, APInt> BaseAndOffset =
1879 ConstantOffsetPtrs.lookup(Val: I.getOperand(i_nocapture: 0));
1880 // Casts don't change the offset, just wrap it up.
1881 if (BaseAndOffset.first)
1882 ConstantOffsetPtrs[&I] = BaseAndOffset;
1883
1884 // Also look for SROA candidates here.
1885 if (auto *SROAArg = getSROAArgForValueOrNull(V: I.getOperand(i_nocapture: 0)))
1886 SROAArgValues[&I] = SROAArg;
1887
1888 // Bitcasts are always zero cost.
1889 return true;
1890}
1891
1892bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) {
1893 // Propagate constants through ptrtoint.
1894 if (simplifyInstruction(I))
1895 return true;
1896
1897 // Track base/offset pairs when converted to a plain integer provided the
1898 // integer is large enough to represent the pointer.
1899 unsigned IntegerSize = I.getType()->getScalarSizeInBits();
1900 unsigned AS = I.getOperand(i_nocapture: 0)->getType()->getPointerAddressSpace();
1901 if (IntegerSize == DL.getPointerSizeInBits(AS)) {
1902 std::pair<Value *, APInt> BaseAndOffset =
1903 ConstantOffsetPtrs.lookup(Val: I.getOperand(i_nocapture: 0));
1904 if (BaseAndOffset.first)
1905 ConstantOffsetPtrs[&I] = BaseAndOffset;
1906 }
1907
1908 // This is really weird. Technically, ptrtoint will disable SROA. However,
1909 // unless that ptrtoint is *used* somewhere in the live basic blocks after
1910 // inlining, it will be nuked, and SROA should proceed. All of the uses which
1911 // would block SROA would also block SROA if applied directly to a pointer,
1912 // and so we can just add the integer in here. The only places where SROA is
1913 // preserved either cannot fire on an integer, or won't in-and-of themselves
1914 // disable SROA (ext) w/o some later use that we would see and disable.
1915 if (auto *SROAArg = getSROAArgForValueOrNull(V: I.getOperand(i_nocapture: 0)))
1916 SROAArgValues[&I] = SROAArg;
1917
1918 return TTI.getInstructionCost(U: &I, CostKind: TargetTransformInfo::TCK_SizeAndLatency) ==
1919 TargetTransformInfo::TCC_Free;
1920}
1921
1922bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) {
  // Propagate constants through inttoptr.
1924 if (simplifyInstruction(I))
1925 return true;
1926
  // Track base/offset pairs when round-tripped through a pointer without
  // modifications, provided the integer is not too large.
1929 Value *Op = I.getOperand(i_nocapture: 0);
1930 unsigned IntegerSize = Op->getType()->getScalarSizeInBits();
  if (IntegerSize <= DL.getPointerTypeSizeInBits(Ty: I.getType())) {
1932 std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Val: Op);
1933 if (BaseAndOffset.first)
1934 ConstantOffsetPtrs[&I] = BaseAndOffset;
1935 }
1936
1937 // "Propagate" SROA here in the same manner as we do for ptrtoint above.
1938 if (auto *SROAArg = getSROAArgForValueOrNull(V: Op))
1939 SROAArgValues[&I] = SROAArg;
1940
1941 return TTI.getInstructionCost(U: &I, CostKind: TargetTransformInfo::TCK_SizeAndLatency) ==
1942 TargetTransformInfo::TCC_Free;
1943}
1944
1945bool CallAnalyzer::visitCastInst(CastInst &I) {
1946 // Propagate constants through casts.
1947 if (simplifyInstruction(I))
1948 return true;
1949
1950 // Disable SROA in the face of arbitrary casts we don't explicitly list
1951 // elsewhere.
1952 disableSROA(V: I.getOperand(i_nocapture: 0));
1953
1954 // If this is a floating-point cast, and the target says this operation
1955 // is expensive, this may eventually become a library call. Treat the cost
1956 // as such.
1957 switch (I.getOpcode()) {
1958 case Instruction::FPTrunc:
1959 case Instruction::FPExt:
1960 case Instruction::UIToFP:
1961 case Instruction::SIToFP:
1962 case Instruction::FPToUI:
1963 case Instruction::FPToSI:
1964 if (TTI.getFPOpCost(Ty: I.getType()) == TargetTransformInfo::TCC_Expensive)
1965 onCallPenalty();
1966 break;
1967 default:
1968 break;
1969 }
1970
1971 return TTI.getInstructionCost(U: &I, CostKind: TargetTransformInfo::TCK_SizeAndLatency) ==
1972 TargetTransformInfo::TCC_Free;
1973}
1974
1975bool CallAnalyzer::paramHasAttr(Argument *A, Attribute::AttrKind Attr) {
1976 return CandidateCall.paramHasAttr(ArgNo: A->getArgNo(), Kind: Attr);
1977}
1978
1979bool CallAnalyzer::isKnownNonNullInCallee(Value *V) {
1980 // Does the *call site* have the NonNull attribute set on an argument? We
1981 // use the attribute on the call site to memoize any analysis done in the
1982 // caller. This will also trip if the callee function has a non-null
1983 // parameter attribute, but that's a less interesting case because hopefully
1984 // the callee would already have been simplified based on that.
1985 if (Argument *A = dyn_cast<Argument>(Val: V))
1986 if (paramHasAttr(A, Attr: Attribute::NonNull))
1987 return true;
1988
1989 // Is this an alloca in the caller? This is distinct from the attribute case
1990 // above because attributes aren't updated within the inliner itself and we
1991 // always want to catch the alloca derived case.
1992 if (isAllocaDerivedArg(V))
1993 // We can actually predict the result of comparisons between an
1994 // alloca-derived value and null. Note that this fires regardless of
1995 // SROA firing.
1996 return true;
1997
1998 return false;
1999}
2000
2001bool CallAnalyzer::allowSizeGrowth(CallBase &Call) {
2002 // If the normal destination of the invoke or the parent block of the call
2003 // site is unreachable-terminated, there is little point in inlining this
2004 // unless there is literally zero cost.
  // FIXME: Note that it is possible that an unreachable-terminated block has a
  // hot entry. For example, in the scenario below, inlining hot_call_X() may
  // be beneficial:
2008 // main() {
2009 // hot_call_1();
2010 // ...
2011 // hot_call_N()
2012 // exit(0);
2013 // }
  // For now, we are not handling this corner case here as it is rare in real
  // code. In the future, we should elaborate on this based on BPI and BFI in
  // more general threshold adjusting heuristics in updateThreshold().
2017 if (InvokeInst *II = dyn_cast<InvokeInst>(Val: &Call)) {
2018 if (isa<UnreachableInst>(Val: II->getNormalDest()->getTerminator()))
2019 return false;
2020 } else if (isa<UnreachableInst>(Val: Call.getParent()->getTerminator()))
2021 return false;
2022
2023 return true;
2024}
2025
2026bool InlineCostCallAnalyzer::isColdCallSite(CallBase &Call,
2027 BlockFrequencyInfo *CallerBFI) {
  // If a global profile summary is available, the callsite's coldness is
  // determined based on that.
2030 if (PSI && PSI->hasProfileSummary())
2031 return PSI->isColdCallSite(CB: Call, BFI: CallerBFI);
2032
2033 // Otherwise we need BFI to be available.
2034 if (!CallerBFI)
2035 return false;
2036
2037 // Determine if the callsite is cold relative to caller's entry. We could
2038 // potentially cache the computation of scaled entry frequency, but the added
2039 // complexity is not worth it unless this scaling shows up high in the
2040 // profiles.
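  //
  // For example, with ColdCallSiteRelFreq set to 2, a callsite executed less
  // than 2% as often as the caller's entry block is treated as cold.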
2041 const BranchProbability ColdProb(ColdCallSiteRelFreq, 100);
2042 auto CallSiteBB = Call.getParent();
2043 auto CallSiteFreq = CallerBFI->getBlockFreq(BB: CallSiteBB);
2044 auto CallerEntryFreq =
2045 CallerBFI->getBlockFreq(BB: &(Call.getCaller()->getEntryBlock()));
2046 return CallSiteFreq < CallerEntryFreq * ColdProb;
2047}
2048
2049std::optional<int>
2050InlineCostCallAnalyzer::getHotCallSiteThreshold(CallBase &Call,
2051 BlockFrequencyInfo *CallerBFI) {
2052
  // If a global profile summary is available, the callsite's hotness is
  // determined based on that.
2055 if (PSI && PSI->hasProfileSummary() && PSI->isHotCallSite(CB: Call, BFI: CallerBFI))
2056 return Params.HotCallSiteThreshold;
2057
2058 // Otherwise we need BFI to be available and to have a locally hot callsite
2059 // threshold.
2060 if (!CallerBFI || !Params.LocallyHotCallSiteThreshold)
2061 return std::nullopt;
2062
2063 // Determine if the callsite is hot relative to caller's entry. We could
2064 // potentially cache the computation of scaled entry frequency, but the added
2065 // complexity is not worth it unless this scaling shows up high in the
2066 // profiles.
2067 const BasicBlock *CallSiteBB = Call.getParent();
2068 BlockFrequency CallSiteFreq = CallerBFI->getBlockFreq(BB: CallSiteBB);
2069 BlockFrequency CallerEntryFreq = CallerBFI->getEntryFreq();
2070 std::optional<BlockFrequency> Limit = CallerEntryFreq.mul(Factor: HotCallSiteRelFreq);
2071 if (Limit && CallSiteFreq >= *Limit)
2072 return Params.LocallyHotCallSiteThreshold;
2073
2074 // Otherwise treat it normally.
2075 return std::nullopt;
2076}
2077
2078void InlineCostCallAnalyzer::updateThreshold(CallBase &Call, Function &Callee) {
2079 // If no size growth is allowed for this inlining, set Threshold to 0.
2080 if (!allowSizeGrowth(Call)) {
2081 Threshold = 0;
2082 return;
2083 }
2084
2085 Function *Caller = Call.getCaller();
2086
2087 // return min(A, B) if B is valid.
2088 auto MinIfValid = [](int A, std::optional<int> B) {
2089 return B ? std::min(a: A, b: *B) : A;
2090 };
2091
2092 // return max(A, B) if B is valid.
2093 auto MaxIfValid = [](int A, std::optional<int> B) {
2094 return B ? std::max(a: A, b: *B) : A;
2095 };
2096
2097 // Various bonus percentages. These are multiplied by Threshold to get the
2098 // bonus values.
2099 // SingleBBBonus: This bonus is applied if the callee has a single reachable
2100 // basic block at the given callsite context. This is speculatively applied
2101 // and withdrawn if more than one basic block is seen.
2102 //
  // LastCallToStaticBonus: This large bonus is applied to ensure the inlining
  // of the last call to a static function, as inlining such functions is
  // guaranteed to reduce code size.
2106 //
2107 // These bonus percentages may be set to 0 based on properties of the caller
2108 // and the callsite.
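  //
  // As a worked example: with a threshold of 225 and a SingleBBBonusPercent
  // of 50, the speculative single-BB bonus computed below is
  // 225 * 50 / 100 = 112; it is withdrawn if more than one reachable basic
  // block ends up being analyzed.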
2109 int SingleBBBonusPercent = 50;
2110 int VectorBonusPercent = TTI.getInlinerVectorBonusPercent();
2111 int LastCallToStaticBonus = TTI.getInliningLastCallToStaticBonus();
2112
2113 // Lambda to set all the above bonus and bonus percentages to 0.
2114 auto DisallowAllBonuses = [&]() {
2115 SingleBBBonusPercent = 0;
2116 VectorBonusPercent = 0;
2117 LastCallToStaticBonus = 0;
2118 };
2119
2120 // Use the OptMinSizeThreshold or OptSizeThreshold knob if they are available
2121 // and reduce the threshold if the caller has the necessary attribute.
2122 if (Caller->hasMinSize()) {
2123 Threshold = MinIfValid(Threshold, Params.OptMinSizeThreshold);
2124 // For minsize, we want to disable the single BB bonus and the vector
2125 // bonuses, but not the last-call-to-static bonus. Inlining the last call to
2126 // a static function will, at the minimum, eliminate the parameter setup and
2127 // call/return instructions.
2128 SingleBBBonusPercent = 0;
2129 VectorBonusPercent = 0;
2130 } else if (Caller->hasOptSize())
2131 Threshold = MinIfValid(Threshold, Params.OptSizeThreshold);
2132
2133 // Adjust the threshold based on inlinehint attribute and profile based
2134 // hotness information if the caller does not have MinSize attribute.
2135 if (!Caller->hasMinSize()) {
2136 if (Callee.hasFnAttribute(Kind: Attribute::InlineHint))
2137 Threshold = MaxIfValid(Threshold, Params.HintThreshold);
2138
2139 // FIXME: After switching to the new passmanager, simplify the logic below
2140 // by checking only the callsite hotness/coldness as we will reliably
2141 // have local profile information.
2142 //
2143 // Callsite hotness and coldness can be determined if sample profile is
2144 // used (which adds hotness metadata to calls) or if caller's
2145 // BlockFrequencyInfo is available.
2146 BlockFrequencyInfo *CallerBFI = GetBFI ? &(GetBFI(*Caller)) : nullptr;
2147 auto HotCallSiteThreshold = getHotCallSiteThreshold(Call, CallerBFI);
2148 if (!Caller->hasOptSize() && HotCallSiteThreshold) {
2149 LLVM_DEBUG(dbgs() << "Hot callsite.\n");
2150 // FIXME: This should update the threshold only if it exceeds the
2151 // current threshold, but AutoFDO + ThinLTO currently relies on this
2152 // behavior to prevent inlining of hot callsites during ThinLTO
2153 // compile phase.
2154 Threshold = *HotCallSiteThreshold;
2155 } else if (isColdCallSite(Call, CallerBFI)) {
2156 LLVM_DEBUG(dbgs() << "Cold callsite.\n");
2157 // Do not apply bonuses for a cold callsite including the
2158 // LastCallToStatic bonus. While this bonus might result in code size
2159 // reduction, it can cause the size of a non-cold caller to increase
2160 // preventing it from being inlined.
2161 DisallowAllBonuses();
2162 Threshold = MinIfValid(Threshold, Params.ColdCallSiteThreshold);
2163 } else if (PSI) {
2164 // Use callee's global profile information only if we have no way of
2165 // determining this via callsite information.
2166 if (PSI->isFunctionEntryHot(F: &Callee)) {
2167 LLVM_DEBUG(dbgs() << "Hot callee.\n");
        // If callsite hotness cannot be determined, we may still know
        // that the callee is hot and treat it as a weaker hint for threshold
        // increase.
2171 Threshold = MaxIfValid(Threshold, Params.HintThreshold);
2172 } else if (PSI->isFunctionEntryCold(F: &Callee)) {
2173 LLVM_DEBUG(dbgs() << "Cold callee.\n");
2174 // Do not apply bonuses for a cold callee including the
2175 // LastCallToStatic bonus. While this bonus might result in code size
2176 // reduction, it can cause the size of a non-cold caller to increase
2177 // preventing it from being inlined.
2178 DisallowAllBonuses();
2179 Threshold = MinIfValid(Threshold, Params.ColdThreshold);
2180 }
2181 }
2182 }
2183
2184 Threshold += TTI.adjustInliningThreshold(CB: &Call);
2185
2186 // Finally, take the target-specific inlining threshold multiplier into
2187 // account.
2188 Threshold *= TTI.getInliningThresholdMultiplier();
2189
2190 SingleBBBonus = Threshold * SingleBBBonusPercent / 100;
2191 VectorBonus = Threshold * VectorBonusPercent / 100;
2192
2193 // If there is only one call of the function, and it has internal linkage,
2194 // the cost of inlining it drops dramatically. It may seem odd to update
2195 // Cost in updateThreshold, but the bonus depends on the logic in this method.
2196 if (isSoleCallToLocalFunction(CB: Call, Callee: F)) {
2197 addCost(Inc: -LastCallToStaticBonus);
2198 StaticBonusApplied = LastCallToStaticBonus;
2199 }
2200}
2201
2202bool CallAnalyzer::visitCmpInst(CmpInst &I) {
2203 Value *LHS = I.getOperand(i_nocapture: 0), *RHS = I.getOperand(i_nocapture: 1);
2204 // First try to handle simplified comparisons.
2205 if (simplifyInstruction(I))
2206 return true;
2207
  // Try to handle comparisons that can be simplified using ValueTracking.
2209 if (simplifyCmpInstForRecCall(Cmp&: I))
2210 return true;
2211
2212 if (I.getOpcode() == Instruction::FCmp)
2213 return false;
2214
2215 // Otherwise look for a comparison between constant offset pointers with
2216 // a common base.
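  //
  // For example, with two pointers derived from the same base:
  //   %a = getelementptr inbounds i8, ptr %buf, i64 4   ; (%buf, 4)
  //   %b = getelementptr inbounds i8, ptr %buf, i64 8   ; (%buf, 8)
  // 'icmp ult ptr %a, %b' folds to true from the offsets alone.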
2217 Value *LHSBase, *RHSBase;
2218 APInt LHSOffset, RHSOffset;
2219 std::tie(args&: LHSBase, args&: LHSOffset) = ConstantOffsetPtrs.lookup(Val: LHS);
2220 if (LHSBase) {
2221 std::tie(args&: RHSBase, args&: RHSOffset) = ConstantOffsetPtrs.lookup(Val: RHS);
2222 if (RHSBase && LHSBase == RHSBase) {
2223 // We have common bases, fold the icmp to a constant based on the
2224 // offsets.
2225 SimplifiedValues[&I] = ConstantInt::getBool(
2226 Ty: I.getType(),
2227 V: ICmpInst::compare(LHS: LHSOffset, RHS: RHSOffset, Pred: I.getPredicate()));
2228 ++NumConstantPtrCmps;
2229 return true;
2230 }
2231 }
2232
2233 auto isImplicitNullCheckCmp = [](const CmpInst &I) {
2234 for (auto *User : I.users())
2235 if (auto *Instr = dyn_cast<Instruction>(Val: User))
2236 if (!Instr->getMetadata(KindID: LLVMContext::MD_make_implicit))
2237 return false;
2238 return true;
2239 };
2240
  // If the comparison is an equality comparison with null, we can simplify it
  // if we know the value (argument) can't be null.
2243 if (I.isEquality() && isa<ConstantPointerNull>(Val: I.getOperand(i_nocapture: 1))) {
2244 if (isKnownNonNullInCallee(V: I.getOperand(i_nocapture: 0))) {
2245 bool IsNotEqual = I.getPredicate() == CmpInst::ICMP_NE;
2246 SimplifiedValues[&I] = IsNotEqual ? ConstantInt::getTrue(Ty: I.getType())
2247 : ConstantInt::getFalse(Ty: I.getType());
2248 return true;
2249 }
2250 // Implicit null checks act as unconditional branches and their comparisons
2251 // should be treated as simplified and free of cost.
2252 if (isImplicitNullCheckCmp(I))
2253 return true;
2254 }
2255 return handleSROA(V: I.getOperand(i_nocapture: 0), DoNotDisable: isa<ConstantPointerNull>(Val: I.getOperand(i_nocapture: 1)));
2256}
2257
2258bool CallAnalyzer::visitSub(BinaryOperator &I) {
2259 // Try to handle a special case: we can fold computing the difference of two
2260 // constant-related pointers.
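  //
  // For example, if %ai and %bi are integer values tracked as (%buf, 4) and
  // (%buf, 8) in ConstantOffsetPtrs, then 'sub i64 %bi, %ai' folds to the
  // constant 4.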
2261 Value *LHS = I.getOperand(i_nocapture: 0), *RHS = I.getOperand(i_nocapture: 1);
2262 Value *LHSBase, *RHSBase;
2263 APInt LHSOffset, RHSOffset;
2264 std::tie(args&: LHSBase, args&: LHSOffset) = ConstantOffsetPtrs.lookup(Val: LHS);
2265 if (LHSBase) {
2266 std::tie(args&: RHSBase, args&: RHSOffset) = ConstantOffsetPtrs.lookup(Val: RHS);
2267 if (RHSBase && LHSBase == RHSBase) {
2268 // We have common bases, fold the subtract to a constant based on the
2269 // offsets.
2270 Constant *CLHS = ConstantInt::get(Context&: LHS->getContext(), V: LHSOffset);
2271 Constant *CRHS = ConstantInt::get(Context&: RHS->getContext(), V: RHSOffset);
2272 if (Constant *C = ConstantExpr::getSub(C1: CLHS, C2: CRHS)) {
2273 SimplifiedValues[&I] = C;
2274 ++NumConstantPtrDiffs;
2275 return true;
2276 }
2277 }
2278 }
2279
2280 // Otherwise, fall back to the generic logic for simplifying and handling
2281 // instructions.
2282 return Base::visitSub(I);
2283}
2284
2285bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) {
2286 Value *LHS = I.getOperand(i_nocapture: 0), *RHS = I.getOperand(i_nocapture: 1);
2287 Constant *CLHS = getDirectOrSimplifiedValue<Constant>(V: LHS);
2288 Constant *CRHS = getDirectOrSimplifiedValue<Constant>(V: RHS);
2289
2290 Value *SimpleV = nullptr;
2291 if (auto FI = dyn_cast<FPMathOperator>(Val: &I))
2292 SimpleV = simplifyBinOp(Opcode: I.getOpcode(), LHS: CLHS ? CLHS : LHS, RHS: CRHS ? CRHS : RHS,
2293 FMF: FI->getFastMathFlags(), Q: DL);
2294 else
2295 SimpleV =
2296 simplifyBinOp(Opcode: I.getOpcode(), LHS: CLHS ? CLHS : LHS, RHS: CRHS ? CRHS : RHS, Q: DL);
2297
2298 if (Constant *C = dyn_cast_or_null<Constant>(Val: SimpleV))
2299 SimplifiedValues[&I] = C;
2300
2301 if (SimpleV)
2302 return true;
2303
2304 // Disable any SROA on arguments to arbitrary, unsimplified binary operators.
2305 disableSROA(V: LHS);
2306 disableSROA(V: RHS);
2307
  // If the instruction is floating point, and the target says this operation
  // is expensive, this may eventually become a library call. Treat the cost
  // as such, unless it's an fneg, which can be implemented with an xor.
2311 using namespace llvm::PatternMatch;
2312 if (I.getType()->isFloatingPointTy() &&
2313 TTI.getFPOpCost(Ty: I.getType()) == TargetTransformInfo::TCC_Expensive &&
2314 !match(V: &I, P: m_FNeg(X: m_Value())))
2315 onCallPenalty();
2316
2317 return false;
2318}
2319
2320bool CallAnalyzer::visitFNeg(UnaryOperator &I) {
2321 Value *Op = I.getOperand(i_nocapture: 0);
2322 Constant *COp = getDirectOrSimplifiedValue<Constant>(V: Op);
2323
2324 Value *SimpleV = simplifyFNegInst(
2325 Op: COp ? COp : Op, FMF: cast<FPMathOperator>(Val&: I).getFastMathFlags(), Q: DL);
2326
2327 if (Constant *C = dyn_cast_or_null<Constant>(Val: SimpleV))
2328 SimplifiedValues[&I] = C;
2329
2330 if (SimpleV)
2331 return true;
2332
2333 // Disable any SROA on arguments to arbitrary, unsimplified fneg.
2334 disableSROA(V: Op);
2335
2336 return false;
2337}
2338
2339bool CallAnalyzer::visitLoad(LoadInst &I) {
2340 if (handleSROA(V: I.getPointerOperand(), DoNotDisable: I.isSimple()))
2341 return true;
2342
2343 // If the data is already loaded from this address and hasn't been clobbered
2344 // by any stores or calls, this load is likely to be redundant and can be
2345 // eliminated.
2346 if (EnableLoadElimination &&
2347 !LoadAddrSet.insert(Ptr: I.getPointerOperand()).second && I.isUnordered()) {
2348 onLoadEliminationOpportunity();
2349 return true;
2350 }
2351
2352 onMemAccess();
2353 return false;
2354}
2355
2356bool CallAnalyzer::visitStore(StoreInst &I) {
2357 if (handleSROA(V: I.getPointerOperand(), DoNotDisable: I.isSimple()))
2358 return true;
2359
2360 // The store can potentially clobber loads and prevent repeated loads from
2361 // being eliminated.
2362 // FIXME:
  // 1. We can probably keep an initial set of eliminatable loads subtracted
2364 // from the cost even when we finally see a store. We just need to disable
2365 // *further* accumulation of elimination savings.
2366 // 2. We should probably at some point thread MemorySSA for the callee into
2367 // this and then use that to actually compute *really* precise savings.
2368 disableLoadElimination();
2369
2370 onMemAccess();
2371 return false;
2372}
2373
2374bool CallAnalyzer::visitExtractValue(ExtractValueInst &I) {
2375 Value *Op = I.getAggregateOperand();
2376
2377 // Special handling, because we want to simplify extractvalue with a
2378 // potential insertvalue from the caller.
2379 if (Value *SimpleOp = getSimplifiedValueUnchecked(V: Op)) {
2380 SimplifyQuery SQ(DL);
2381 Value *SimpleV = simplifyExtractValueInst(Agg: SimpleOp, Idxs: I.getIndices(), Q: SQ);
2382 if (SimpleV) {
2383 SimplifiedValues[&I] = SimpleV;
2384 return true;
2385 }
2386 }
2387
2388 // SROA can't look through these, but they may be free.
2389 return Base::visitExtractValue(I);
2390}
2391
2392bool CallAnalyzer::visitInsertValue(InsertValueInst &I) {
2393 // Constant folding for insert value is trivial.
2394 if (simplifyInstruction(I))
2395 return true;
2396
2397 // SROA can't look through these, but they may be free.
2398 return Base::visitInsertValue(I);
2399}
2400
2401/// Try to simplify a call site.
2402///
2403/// Takes a concrete function and callsite and tries to actually simplify it by
2404/// analyzing the arguments and call itself with instsimplify. Returns true if
2405/// it has simplified the callsite to some other entity (a constant), making it
2406/// free.
2407bool CallAnalyzer::simplifyCallSite(Function *F, CallBase &Call) {
2408 // FIXME: Using the instsimplify logic directly for this is inefficient
2409 // because we have to continually rebuild the argument list even when no
2410 // simplifications can be performed. Until that is fixed with remapping
2411 // inside of instsimplify, directly constant fold calls here.
2412 if (!canConstantFoldCallTo(Call: &Call, F))
2413 return false;
2414
2415 // Try to re-map the arguments to constants.
2416 SmallVector<Constant *, 4> ConstantArgs;
2417 ConstantArgs.reserve(N: Call.arg_size());
2418 for (Value *I : Call.args()) {
2419 Constant *C = getDirectOrSimplifiedValue<Constant>(V: I);
2420 if (!C)
2421 return false; // This argument doesn't map to a constant.
2422
2423 ConstantArgs.push_back(Elt: C);
2424 }
2425 if (Constant *C = ConstantFoldCall(Call: &Call, F, Operands: ConstantArgs)) {
2426 SimplifiedValues[&Call] = C;
2427 return true;
2428 }
2429
2430 return false;
2431}
2432
2433bool CallAnalyzer::isLoweredToCall(Function *F, CallBase &Call) {
2434 const TargetLibraryInfo *TLI = GetTLI ? &GetTLI(*F) : nullptr;
2435 LibFunc LF;
2436 if (!TLI || !TLI->getLibFunc(FDecl: *F, F&: LF) || !TLI->has(F: LF))
2437 return TTI.isLoweredToCall(F);
2438
2439 switch (LF) {
2440 case LibFunc_memcpy_chk:
2441 case LibFunc_memmove_chk:
2442 case LibFunc_mempcpy_chk:
2443 case LibFunc_memset_chk: {
2444 // Calls to __memcpy_chk whose length is known to fit within the object
2445 // size will eventually be replaced by inline stores. Therefore, these
2446 // should not incur a call penalty. This is only really relevant on
2447 // platforms whose headers redirect memcpy to __memcpy_chk (e.g. Darwin), as
2448 // other platforms use memcpy intrinsics, which are already exempt from the
2449 // call penalty.
2450 auto *LenOp = getDirectOrSimplifiedValue<ConstantInt>(V: Call.getOperand(i_nocapture: 2));
2451 auto *ObjSizeOp =
2452 getDirectOrSimplifiedValue<ConstantInt>(V: Call.getOperand(i_nocapture: 3));
2453 if (LenOp && ObjSizeOp &&
2454 LenOp->getLimitedValue() <= ObjSizeOp->getLimitedValue()) {
2455 return false;
2456 }
2457 break;
2458 }
2459 default:
2460 break;
2461 }
2462
2463 return TTI.isLoweredToCall(F);
2464}
2465
2466bool CallAnalyzer::visitCallBase(CallBase &Call) {
2467 if (!onCallBaseVisitStart(Call))
2468 return true;
2469
2470 if (Call.hasFnAttr(Kind: Attribute::ReturnsTwice) &&
2471 !F.hasFnAttribute(Kind: Attribute::ReturnsTwice)) {
2472 // This aborts the entire analysis.
2473 ExposesReturnsTwice = true;
2474 return false;
2475 }
2476 if (isa<CallInst>(Val: Call) && cast<CallInst>(Val&: Call).cannotDuplicate())
2477 ContainsNoDuplicateCall = true;
2478
2479 if (InlineAsm *InlineAsmOp = dyn_cast<InlineAsm>(Val: Call.getCalledOperand()))
2480 onInlineAsm(Arg: *InlineAsmOp);
2481
2482 Function *F = Call.getCalledFunction();
2483 bool IsIndirectCall = !F;
2484 if (IsIndirectCall) {
2485 // Check if this happens to be an indirect function call to a known function
2486 // in this inline context. If not, we've done all we can.
2487 Value *Callee = Call.getCalledOperand();
2488 F = getSimplifiedValue<Function>(V: Callee);
2489 if (!F || F->getFunctionType() != Call.getFunctionType()) {
2490 onCallArgumentSetup(Call);
2491
2492 if (!Call.onlyReadsMemory())
2493 disableLoadElimination();
2494 return Base::visitCallBase(I&: Call);
2495 }
2496 }
2497
2498 assert(F && "Expected a call to a known function");
2499
2500 // When we have a concrete function, first try to simplify it directly.
2501 if (simplifyCallSite(F, Call))
2502 return true;
2503
2504 // Next check if it is an intrinsic we know about.
2505 // FIXME: Lift this into part of the InstVisitor.
2506 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Val: &Call)) {
2507 switch (II->getIntrinsicID()) {
2508 default:
2509 if (!Call.onlyReadsMemory() && !isAssumeLikeIntrinsic(I: II))
2510 disableLoadElimination();
2511 return Base::visitCallBase(I&: Call);
2512
2513 case Intrinsic::load_relative:
2514 onLoadRelativeIntrinsic();
2515 return false;
2516
2517 case Intrinsic::memset:
2518 case Intrinsic::memcpy:
2519 case Intrinsic::memmove:
2520 disableLoadElimination();
2521 // SROA can usually chew through these intrinsics, but they aren't free.
2522 return false;
2523 case Intrinsic::icall_branch_funnel:
2524 case Intrinsic::localescape:
2525 HasUninlineableIntrinsic = true;
2526 return false;
2527 case Intrinsic::vastart:
2528 InitsVargArgs = true;
2529 return false;
2530 case Intrinsic::launder_invariant_group:
2531 case Intrinsic::strip_invariant_group:
2532 if (auto *SROAArg = getSROAArgForValueOrNull(V: II->getOperand(i_nocapture: 0)))
2533 SROAArgValues[II] = SROAArg;
2534 return true;
2535 case Intrinsic::is_constant:
2536 return simplifyIntrinsicCallIsConstant(CB&: Call);
2537 case Intrinsic::objectsize:
2538 return simplifyIntrinsicCallObjectSize(CB&: Call);
2539 }
2540 }
2541
2542 if (F == Call.getFunction()) {
2543 // This flag will fully abort the analysis, so don't bother with anything
2544 // else.
2545 IsRecursiveCall = true;
2546 if (!AllowRecursiveCall)
2547 return false;
2548 }
2549
2550 if (isLoweredToCall(F, Call)) {
2551 onLoweredCall(F, Call, IsIndirectCall);
2552 }
2553
2554 if (!(Call.onlyReadsMemory() || (IsIndirectCall && F->onlyReadsMemory())))
2555 disableLoadElimination();
2556 return Base::visitCallBase(I&: Call);
2557}
2558
2559bool CallAnalyzer::visitReturnInst(ReturnInst &RI) {
2560 // At least one return instruction will be free after inlining.
2561 bool Free = !HasReturn;
2562 HasReturn = true;
2563 return Free;
2564}
2565
2566bool CallAnalyzer::visitBranchInst(BranchInst &BI) {
2567 // We model unconditional branches as essentially free -- they really
2568 // shouldn't exist at all, but handling them makes the behavior of the
2569 // inliner more regular and predictable. Interestingly, conditional branches
2570 // which will fold away are also free.
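  //
  // For example, 'br i1 %c, label %t, label %f' where %c has simplified to a
  // constant is free, since the branch folds away after inlining.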
2571 return BI.isUnconditional() ||
2572 getDirectOrSimplifiedValue<ConstantInt>(V: BI.getCondition()) ||
2573 BI.getMetadata(KindID: LLVMContext::MD_make_implicit);
2574}
2575
2576bool CallAnalyzer::visitSelectInst(SelectInst &SI) {
2577 bool CheckSROA = SI.getType()->isPointerTy();
2578 Value *TrueVal = SI.getTrueValue();
2579 Value *FalseVal = SI.getFalseValue();
2580
2581 Constant *TrueC = getDirectOrSimplifiedValue<Constant>(V: TrueVal);
2582 Constant *FalseC = getDirectOrSimplifiedValue<Constant>(V: FalseVal);
2583 Constant *CondC = getSimplifiedValue<Constant>(V: SI.getCondition());
2584
2585 if (!CondC) {
2586 // Select C, X, X => X
2587 if (TrueC == FalseC && TrueC) {
2588 SimplifiedValues[&SI] = TrueC;
2589 return true;
2590 }
2591
2592 if (!CheckSROA)
2593 return Base::visitSelectInst(I&: SI);
2594
2595 std::pair<Value *, APInt> TrueBaseAndOffset =
2596 ConstantOffsetPtrs.lookup(Val: TrueVal);
2597 std::pair<Value *, APInt> FalseBaseAndOffset =
2598 ConstantOffsetPtrs.lookup(Val: FalseVal);
2599 if (TrueBaseAndOffset == FalseBaseAndOffset && TrueBaseAndOffset.first) {
2600 ConstantOffsetPtrs[&SI] = TrueBaseAndOffset;
2601
2602 if (auto *SROAArg = getSROAArgForValueOrNull(V: TrueVal))
2603 SROAArgValues[&SI] = SROAArg;
2604 return true;
2605 }
2606
2607 return Base::visitSelectInst(I&: SI);
2608 }
2609
2610 // Select condition is a constant.
2611 Value *SelectedV = CondC->isAllOnesValue() ? TrueVal
2612 : (CondC->isNullValue()) ? FalseVal
2613 : nullptr;
2614 if (!SelectedV) {
    // Condition is a vector constant that is not all 1s or all 0s. If all
    // operands are constants, ConstantFoldSelectInstruction() can handle
    // cases such as vector selects.
2618 if (TrueC && FalseC) {
2619 if (auto *C = ConstantFoldSelectInstruction(Cond: CondC, V1: TrueC, V2: FalseC)) {
2620 SimplifiedValues[&SI] = C;
2621 return true;
2622 }
2623 }
2624 return Base::visitSelectInst(I&: SI);
2625 }
2626
2627 // Condition is either all 1s or all 0s. SI can be simplified.
2628 if (Constant *SelectedC = dyn_cast<Constant>(Val: SelectedV)) {
2629 SimplifiedValues[&SI] = SelectedC;
2630 return true;
2631 }
2632
2633 if (!CheckSROA)
2634 return true;
2635
2636 std::pair<Value *, APInt> BaseAndOffset =
2637 ConstantOffsetPtrs.lookup(Val: SelectedV);
2638 if (BaseAndOffset.first) {
2639 ConstantOffsetPtrs[&SI] = BaseAndOffset;
2640
2641 if (auto *SROAArg = getSROAArgForValueOrNull(V: SelectedV))
2642 SROAArgValues[&SI] = SROAArg;
2643 }
2644
2645 return true;
2646}
2647
2648bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) {
2649 // We model unconditional switches as free, see the comments on handling
2650 // branches.
2651 if (getDirectOrSimplifiedValue<ConstantInt>(V: SI.getCondition()))
2652 return true;
2653
2654 // Assume the most general case where the switch is lowered into
2655 // either a jump table, bit test, or a balanced binary tree consisting of
2656 // case clusters without merging adjacent clusters with the same
  // destination. We do not consider switches that are lowered with a mix of
  // jump table/bit test/binary search tree. The cost of the switch is
  // proportional to the size of the tree or the size of the jump table range.
2660 //
2661 // NB: We convert large switches which are just used to initialize large phi
2662 // nodes to lookup tables instead in simplifycfg, so this shouldn't prevent
2663 // inlining those. It will prevent inlining in cases where the optimization
2664 // does not (yet) fire.
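  //
  // For example, a dense switch over cases 0..63 typically becomes a jump
  // table whose cost scales with the table size, while a sparse 40-case
  // switch lowered as a balanced tree costs roughly one compare-and-branch
  // per node visited on a root-to-leaf path.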
2665
2666 unsigned JumpTableSize = 0;
2667 BlockFrequencyInfo *BFI = GetBFI ? &(GetBFI(F)) : nullptr;
2668 unsigned NumCaseCluster =
2669 TTI.getEstimatedNumberOfCaseClusters(SI, JTSize&: JumpTableSize, PSI, BFI);
2670
2671 onFinalizeSwitch(JumpTableSize, NumCaseCluster, DefaultDestUnreachable: SI.defaultDestUnreachable());
2672 return false;
2673}
2674
2675bool CallAnalyzer::visitIndirectBrInst(IndirectBrInst &IBI) {
  // We never want to inline functions that contain an indirectbr. This is
  // incorrect because all the blockaddresses (in static global initializers
  // for example) would be referring to the original function, and this
  // indirect jump would jump from the inlined copy of the function into the
  // original function, which is extremely undefined behavior.
2681 // FIXME: This logic isn't really right; we can safely inline functions with
2682 // indirectbr's as long as no other function or global references the
2683 // blockaddress of a block within the current function.
2684 HasIndirectBr = true;
2685 return false;
2686}
2687
2688bool CallAnalyzer::visitResumeInst(ResumeInst &RI) {
2689 // FIXME: It's not clear that a single instruction is an accurate model for
2690 // the inline cost of a resume instruction.
2691 return false;
2692}
2693
2694bool CallAnalyzer::visitCleanupReturnInst(CleanupReturnInst &CRI) {
2695 // FIXME: It's not clear that a single instruction is an accurate model for
2696 // the inline cost of a cleanupret instruction.
2697 return false;
2698}
2699
2700bool CallAnalyzer::visitCatchReturnInst(CatchReturnInst &CRI) {
2701 // FIXME: It's not clear that a single instruction is an accurate model for
2702 // the inline cost of a catchret instruction.
2703 return false;
2704}
2705
2706bool CallAnalyzer::visitUnreachableInst(UnreachableInst &I) {
  // FIXME: It might be reasonable to discount the cost of instructions
  // leading to unreachable, as they have the lowest possible impact on both
  // runtime and code size.
2710 return true; // No actual code is needed for unreachable.
2711}
2712
2713bool CallAnalyzer::visitInstruction(Instruction &I) {
2714 // Some instructions are free. All of the free intrinsics can also be
2715 // handled by SROA, etc.
2716 if (TTI.getInstructionCost(U: &I, CostKind: TargetTransformInfo::TCK_SizeAndLatency) ==
2717 TargetTransformInfo::TCC_Free)
2718 return true;
2719
2720 // We found something we don't understand or can't handle. Mark any SROA-able
2721 // values in the operand list as no longer viable.
2722 for (const Use &Op : I.operands())
2723 disableSROA(V: Op);
2724
2725 return false;
2726}
2727
2728/// Analyze a basic block for its contribution to the inline cost.
2729///
2730/// This method walks the analyzer over every instruction in the given basic
2731/// block and accounts for their cost during inlining at this callsite. It
2732/// aborts early if the threshold has been exceeded or an impossible to inline
2733/// construct has been detected. It returns false if inlining is no longer
2734/// viable, and true if inlining remains viable.
2735InlineResult
2736CallAnalyzer::analyzeBlock(BasicBlock *BB,
2737 const SmallPtrSetImpl<const Value *> &EphValues) {
2738 for (Instruction &I : *BB) {
    // FIXME: Currently, the number of instructions in a function, regardless
    // of our ability to simplify them during inlining to constants or dead
    // code, is actually used by the vector bonus heuristic. As long as that's
    // true, we have to special case debug intrinsics here to prevent
    // differences in inlining due to debug symbols. Eventually, the number of
    // unsimplified instructions shouldn't factor into the cost computation,
    // but until then, hack around it here.
    // Similarly, skip pseudo-probes.
2747 if (I.isDebugOrPseudoInst())
2748 continue;
2749
2750 // Skip ephemeral values.
2751 if (EphValues.count(Ptr: &I))
2752 continue;
2753
2754 ++NumInstructions;
2755 if (isa<ExtractElementInst>(Val: I) || I.getType()->isVectorTy())
2756 ++NumVectorInstructions;
2757
2758 // If the instruction simplified to a constant, there is no cost to this
2759 // instruction. Visit the instructions using our InstVisitor to account for
2760 // all of the per-instruction logic. The visit tree returns true if we
2761 // consumed the instruction in any way, and false if the instruction's base
2762 // cost should count against inlining.
2763 onInstructionAnalysisStart(I: &I);
2764
2765 if (Base::visit(I: &I))
2766 ++NumInstructionsSimplified;
2767 else
2768 onMissedSimplification();
2769
2770 onInstructionAnalysisFinish(I: &I);
2771 using namespace ore;
    // If visiting this instruction detected an uninlinable pattern, abort.
2773 InlineResult IR = InlineResult::success();
2774 if (IsRecursiveCall && !AllowRecursiveCall)
2775 IR = InlineResult::failure(Reason: "recursive");
2776 else if (ExposesReturnsTwice)
2777 IR = InlineResult::failure(Reason: "exposes returns twice");
2778 else if (HasDynamicAlloca)
2779 IR = InlineResult::failure(Reason: "dynamic alloca");
2780 else if (HasIndirectBr)
2781 IR = InlineResult::failure(Reason: "indirect branch");
2782 else if (HasUninlineableIntrinsic)
2783 IR = InlineResult::failure(Reason: "uninlinable intrinsic");
2784 else if (InitsVargArgs)
2785 IR = InlineResult::failure(Reason: "varargs");
2786 if (!IR.isSuccess()) {
2787 if (ORE)
2788 ORE->emit(RemarkBuilder: [&]() {
2789 return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
2790 &CandidateCall)
2791 << NV("Callee", &F) << " has uninlinable pattern ("
2792 << NV("InlineResult", IR.getFailureReason())
2793 << ") and cost is not fully computed";
2794 });
2795 return IR;
2796 }
2797
2798 // If the caller is a recursive function then we don't want to inline
2799 // functions which allocate a lot of stack space because it would increase
2800 // the caller stack usage dramatically.
2801 if (IsCallerRecursive && AllocatedSize > RecurStackSizeThreshold) {
2802 auto IR =
2803 InlineResult::failure(Reason: "recursive and allocates too much stack space");
2804 if (ORE)
2805 ORE->emit(RemarkBuilder: [&]() {
2806 return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
2807 &CandidateCall)
2808 << NV("Callee", &F) << " is "
2809 << NV("InlineResult", IR.getFailureReason())
2810 << ". Cost is not fully computed";
2811 });
2812 return IR;
2813 }
2814
2815 if (shouldStop())
2816 return InlineResult::failure(
2817 Reason: "Call site analysis is not favorable to inlining.");
2818 }
2819
2820 return InlineResult::success();
2821}
2822
2823/// Compute the base pointer and cumulative constant offsets for V.
2824///
2825/// This strips all constant offsets off of V, leaving it the base pointer, and
/// accumulates the total constant offset applied in the returned constant. It
/// returns null if V is not a pointer, and returns the constant '0' if there
/// are no constant offsets applied.
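///
/// For example, starting from
///   %g = getelementptr inbounds [4 x i32], ptr @arr, i64 0, i64 2
/// V is rewritten to @arr and the returned constant is 8 (two i32 elements).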
2829ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) {
2830 if (!V->getType()->isPointerTy())
2831 return nullptr;
2832
2833 unsigned AS = V->getType()->getPointerAddressSpace();
2834 unsigned IntPtrWidth = DL.getIndexSizeInBits(AS);
2835 APInt Offset = APInt::getZero(numBits: IntPtrWidth);
2836
2837 // Even though we don't look through PHI nodes, we could be called on an
2838 // instruction in an unreachable block, which may be on a cycle.
2839 SmallPtrSet<Value *, 4> Visited;
2840 Visited.insert(Ptr: V);
2841 do {
2842 if (GEPOperator *GEP = dyn_cast<GEPOperator>(Val: V)) {
2843 if (!GEP->isInBounds() || !accumulateGEPOffset(GEP&: *GEP, Offset))
2844 return nullptr;
2845 V = GEP->getPointerOperand();
2846 } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(Val: V)) {
2847 if (GA->isInterposable())
2848 break;
2849 V = GA->getAliasee();
2850 } else {
2851 break;
2852 }
2853 assert(V->getType()->isPointerTy() && "Unexpected operand type!");
2854 } while (Visited.insert(Ptr: V).second);
2855
2856 Type *IdxPtrTy = DL.getIndexType(PtrTy: V->getType());
2857 return cast<ConstantInt>(Val: ConstantInt::get(Ty: IdxPtrTy, V: Offset));
2858}
2859
2860/// Find dead blocks due to deleted CFG edges during inlining.
2861///
2862/// If we know the successor of the current block, \p CurrBB, has to be \p
2863/// NextBB, the other successors of \p CurrBB are dead if these successors have
2864/// no live incoming CFG edges. If one block is found to be dead, we can
2865/// continue growing the dead block list by checking the successors of the dead
2866/// blocks to see if all their incoming edges are dead or not.
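/// For instance, in a hypothetical CFG where CurrBB conditionally branches to
/// NextBB or to Else, and CurrBB is Else's only predecessor, proving that the
/// branch always goes to NextBB makes Else dead, and any block reachable only
/// through Else becomes dead transitively.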
2867void CallAnalyzer::findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB) {
2868 auto IsEdgeDead = [&](BasicBlock *Pred, BasicBlock *Succ) {
2869 // A CFG edge is dead if the predecessor is dead or the predecessor has a
    // known successor which is not the one under examination.
2871 if (DeadBlocks.count(Ptr: Pred))
2872 return true;
2873 BasicBlock *KnownSucc = KnownSuccessors[Pred];
2874 return KnownSucc && KnownSucc != Succ;
2875 };
2876
2877 auto IsNewlyDead = [&](BasicBlock *BB) {
2878 // If all the edges to a block are dead, the block is also dead.
2879 return (!DeadBlocks.count(Ptr: BB) &&
2880 llvm::all_of(Range: predecessors(BB),
2881 P: [&](BasicBlock *P) { return IsEdgeDead(P, BB); }));
2882 };
2883
2884 for (BasicBlock *Succ : successors(BB: CurrBB)) {
2885 if (Succ == NextBB || !IsNewlyDead(Succ))
2886 continue;
2887 SmallVector<BasicBlock *, 4> NewDead;
2888 NewDead.push_back(Elt: Succ);
2889 while (!NewDead.empty()) {
2890 BasicBlock *Dead = NewDead.pop_back_val();
2891 if (DeadBlocks.insert(Ptr: Dead).second)
        // Continue growing the dead block list.
2893 for (BasicBlock *S : successors(BB: Dead))
2894 if (IsNewlyDead(S))
2895 NewDead.push_back(Elt: S);
2896 }
2897 }
2898}
2899
2900/// Analyze a call site for potential inlining.
2901///
/// Returns a success result if inlining this call is viable, and a failure
/// result if it is not. It computes the cost and adjusts the threshold based
/// on numerous factors and heuristics. If this method reports failure but the
/// computed cost is below the computed threshold, then inlining was forcibly
/// disabled by some artifact of the routine.
2907InlineResult CallAnalyzer::analyze() {
2908 ++NumCallsAnalyzed;
2909
2910 auto Result = onAnalysisStart();
2911 if (!Result.isSuccess())
2912 return Result;
2913
2914 if (F.empty())
2915 return InlineResult::success();
2916
2917 Function *Caller = CandidateCall.getFunction();
2918 // Check if the caller function is recursive itself.
2919 for (User *U : Caller->users()) {
2920 CallBase *Call = dyn_cast<CallBase>(Val: U);
2921 if (Call && Call->getFunction() == Caller) {
2922 IsCallerRecursive = true;
2923 break;
2924 }
2925 }
2926
2927 // Populate our simplified values by mapping from function arguments to call
2928 // arguments with known important simplifications.
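  // For example, a call site passing the constant 42 for an i32 parameter
  // lets comparisons and branches on that parameter fold to constants while
  // the callee's blocks are analyzed below.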
2929 auto CAI = CandidateCall.arg_begin();
2930 for (Argument &FAI : F.args()) {
2931 assert(CAI != CandidateCall.arg_end());
2932 SimplifiedValues[&FAI] = *CAI;
2933 if (isa<Constant>(Val: *CAI))
2934 ++NumConstantArgs;
2935
2936 Value *PtrArg = *CAI;
2937 if (ConstantInt *C = stripAndComputeInBoundsConstantOffsets(V&: PtrArg)) {
2938 ConstantOffsetPtrs[&FAI] = std::make_pair(x&: PtrArg, y: C->getValue());
2939
2940 // We can SROA any pointer arguments derived from alloca instructions.
2941 if (auto *SROAArg = dyn_cast<AllocaInst>(Val: PtrArg)) {
2942 SROAArgValues[&FAI] = SROAArg;
2943 onInitializeSROAArg(Arg: SROAArg);
2944 EnabledSROAAllocas.insert(V: SROAArg);
2945 }
2946 }
2947 ++CAI;
2948 }
2949 NumConstantOffsetPtrArgs = ConstantOffsetPtrs.size();
2950 NumAllocaArgs = SROAArgValues.size();
2951
2952 // Collecting the ephemeral values of `F` can be expensive, so use the
2953 // ephemeral values cache if available.
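  // Ephemeral values (e.g., values feeding only llvm.assume) generate no real
  // code after inlining, so they are skipped when accounting cost.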
2954 SmallPtrSet<const Value *, 32> EphValuesStorage;
2955 const SmallPtrSetImpl<const Value *> *EphValues = &EphValuesStorage;
2956 if (GetEphValuesCache)
2957 EphValues = &GetEphValuesCache(F).ephValues();
2958 else
2959 CodeMetrics::collectEphemeralValues(L: &F, AC: &GetAssumptionCache(F),
2960 EphValues&: EphValuesStorage);
2961
2962 // The worklist of live basic blocks in the callee *after* inlining. We avoid
2963 // adding basic blocks of the callee which can be proven to be dead for this
2964 // particular call site in order to get more accurate cost estimates. This
2965 // requires a somewhat heavyweight iteration pattern: we need to walk the
  // basic blocks in a breadth-first order as we insert live successors. To
  // accomplish this, and because we usually exit early after crossing our
  // threshold, we use a small-size-optimized SetVector.
2969 typedef SmallSetVector<BasicBlock *, 16> BBSetVector;
2970 BBSetVector BBWorklist;
2971 BBWorklist.insert(X: &F.getEntryBlock());
2972
2973 // Note that we *must not* cache the size, this loop grows the worklist.
2974 for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
2975 if (shouldStop())
2976 break;
2977
2978 BasicBlock *BB = BBWorklist[Idx];
2979 if (BB->empty())
2980 continue;
2981
2982 onBlockStart(BB);
2983
2984 // Disallow inlining a blockaddress.
2985 // A blockaddress only has defined behavior for an indirect branch in the
2986 // same function, and we do not currently support inlining indirect
2987 // branches. But, the inliner may not see an indirect branch that ends up
2988 // being dead code at a particular call site. If the blockaddress escapes
2989 // the function, e.g., via a global variable, inlining may lead to an
2990 // invalid cross-function reference.
2991 // FIXME: pr/39560: continue relaxing this overt restriction.
2992 if (BB->hasAddressTaken())
2993 return InlineResult::failure(Reason: "blockaddress used");
2994
2995 // Analyze the cost of this block. If we blow through the threshold, this
    // returns a failure, and we can bail out.
2997 InlineResult IR = analyzeBlock(BB, EphValues: *EphValues);
2998 if (!IR.isSuccess())
2999 return IR;
3000
3001 Instruction *TI = BB->getTerminator();
3002
    // Add in the live successors by first checking whether we have a
    // terminator that may be simplified based on the values simplified by
    // this call.
3005 if (BranchInst *BI = dyn_cast<BranchInst>(Val: TI)) {
3006 if (BI->isConditional()) {
3007 Value *Cond = BI->getCondition();
3008 if (ConstantInt *SimpleCond = getSimplifiedValue<ConstantInt>(V: Cond)) {
3009 BasicBlock *NextBB = BI->getSuccessor(i: SimpleCond->isZero() ? 1 : 0);
3010 BBWorklist.insert(X: NextBB);
3011 KnownSuccessors[BB] = NextBB;
3012 findDeadBlocks(CurrBB: BB, NextBB);
3013 continue;
3014 }
3015 }
3016 } else if (SwitchInst *SI = dyn_cast<SwitchInst>(Val: TI)) {
3017 Value *Cond = SI->getCondition();
3018 if (ConstantInt *SimpleCond = getSimplifiedValue<ConstantInt>(V: Cond)) {
3019 BasicBlock *NextBB = SI->findCaseValue(C: SimpleCond)->getCaseSuccessor();
3020 BBWorklist.insert(X: NextBB);
3021 KnownSuccessors[BB] = NextBB;
3022 findDeadBlocks(CurrBB: BB, NextBB);
3023 continue;
3024 }
3025 }
3026
3027 // If we're unable to select a particular successor, just count all of
3028 // them.
3029 BBWorklist.insert_range(R: successors(BB));
3030
3031 onBlockAnalyzed(BB);
3032 }
3033
3034 // If this is a noduplicate call, we can still inline as long as
  // inlining this would cause the removal of the callee (so the instruction
3036 // is not actually duplicated, just moved).
3037 if (!isSoleCallToLocalFunction(CB: CandidateCall, Callee: F) && ContainsNoDuplicateCall)
3038 return InlineResult::failure(Reason: "noduplicate");
3039
3040 // If the callee's stack size exceeds the user-specified threshold,
3041 // do not let it be inlined.
3042 // The command line option overrides a limit set in the function attributes.
3043 size_t FinalStackSizeThreshold = StackSizeThreshold;
3044 if (!StackSizeThreshold.getNumOccurrences())
3045 if (std::optional<int> AttrMaxStackSize = getStringFnAttrAsInt(
3046 F: Caller, AttrKind: InlineConstants::MaxInlineStackSizeAttributeName))
3047 FinalStackSizeThreshold = *AttrMaxStackSize;
3048 if (AllocatedSize > FinalStackSizeThreshold)
3049 return InlineResult::failure(Reason: "stacksize");
3050
3051 return finalizeAnalysis();
3052}
3053
3054void InlineCostCallAnalyzer::print(raw_ostream &OS) {
3055#define DEBUG_PRINT_STAT(x) OS << " " #x ": " << x << "\n"
3056 if (PrintInstructionComments)
3057 F.print(OS, AAW: &Writer);
3058 DEBUG_PRINT_STAT(NumConstantArgs);
3059 DEBUG_PRINT_STAT(NumConstantOffsetPtrArgs);
3060 DEBUG_PRINT_STAT(NumAllocaArgs);
3061 DEBUG_PRINT_STAT(NumConstantPtrCmps);
3062 DEBUG_PRINT_STAT(NumConstantPtrDiffs);
3063 DEBUG_PRINT_STAT(NumInstructionsSimplified);
3064 DEBUG_PRINT_STAT(NumInstructions);
3065 DEBUG_PRINT_STAT(NumInlineAsmInstructions);
3066 DEBUG_PRINT_STAT(SROACostSavings);
3067 DEBUG_PRINT_STAT(SROACostSavingsLost);
3068 DEBUG_PRINT_STAT(LoadEliminationCost);
3069 DEBUG_PRINT_STAT(ContainsNoDuplicateCall);
3070 DEBUG_PRINT_STAT(Cost);
3071 DEBUG_PRINT_STAT(Threshold);
3072#undef DEBUG_PRINT_STAT
3073}
3074
3075#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3076/// Dump stats about this call's analysis.
3077LLVM_DUMP_METHOD void InlineCostCallAnalyzer::dump() { print(dbgs()); }
3078#endif
3079
3080/// Test that there are no attribute conflicts between Caller and Callee
3081/// that prevent inlining.
3082static bool functionsHaveCompatibleAttributes(
3083 Function *Caller, Function *Callee, TargetTransformInfo &TTI,
3084 function_ref<const TargetLibraryInfo &(Function &)> &GetTLI) {
  // Note that CalleeTLI must be a copy, not a reference. The legacy pass manager
3086 // caches the most recently created TLI in the TargetLibraryInfoWrapperPass
3087 // object, and always returns the same object (which is overwritten on each
3088 // GetTLI call). Therefore we copy the first result.
3089 auto CalleeTLI = GetTLI(*Callee);
3090 return (IgnoreTTIInlineCompatible ||
3091 TTI.areInlineCompatible(Caller, Callee)) &&
3092 GetTLI(*Caller).areInlineCompatible(CalleeTLI,
3093 AllowCallerSuperset: InlineCallerSupersetNoBuiltin) &&
3094 AttributeFuncs::areInlineCompatible(Caller: *Caller, Callee: *Callee);
3095}
3096
3097int llvm::getCallsiteCost(const TargetTransformInfo &TTI, const CallBase &Call,
3098 const DataLayout &DL) {
3099 int64_t Cost = 0;
3100 for (unsigned I = 0, E = Call.arg_size(); I != E; ++I) {
3101 if (Call.isByValArgument(ArgNo: I)) {
3102 // We approximate the number of loads and stores needed by dividing the
3103 // size of the byval type by the target's pointer size.
3104 PointerType *PTy = cast<PointerType>(Val: Call.getArgOperand(i: I)->getType());
3105 unsigned TypeSize = DL.getTypeSizeInBits(Ty: Call.getParamByValType(ArgNo: I));
3106 unsigned AS = PTy->getAddressSpace();
3107 unsigned PointerSize = DL.getPointerSizeInBits(AS);
3108 // Ceiling division.
3109 unsigned NumStores = (TypeSize + PointerSize - 1) / PointerSize;
3110
      // If it generates more than 8 stores, it is likely to be expanded as an
3112 // inline memcpy so we take that as an upper bound. Otherwise we assume
3113 // one load and one store per word copied.
3114 // FIXME: The maxStoresPerMemcpy setting from the target should be used
3115 // here instead of a magic number of 8, but it's not available via
3116 // DataLayout.
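      // For example, for a hypothetical 32-byte (256-bit) byval struct with
      // 64-bit pointers, NumStores = ceil(256 / 64) = 4, so the byval copy is
      // charged 2 * 4 * InstrCost (four loads plus four stores).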
3117 NumStores = std::min(a: NumStores, b: 8U);
3118
3119 Cost += 2 * NumStores * InstrCost;
3120 } else {
3121 // For non-byval arguments subtract off one instruction per call
3122 // argument.
3123 Cost += InstrCost;
3124 }
3125 }
3126 // The call instruction also disappears after inlining.
3127 Cost += InstrCost;
3128 Cost += TTI.getInlineCallPenalty(F: Call.getCaller(), Call, DefaultCallPenalty: CallPenalty);
3129
3130 return std::min<int64_t>(a: Cost, INT_MAX);
3131}
3132
3133InlineCost llvm::getInlineCost(
3134 CallBase &Call, const InlineParams &Params, TargetTransformInfo &CalleeTTI,
3135 function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
3136 function_ref<const TargetLibraryInfo &(Function &)> GetTLI,
3137 function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
3138 ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE,
3139 function_ref<EphemeralValuesCache &(Function &)> GetEphValuesCache) {
3140 return getInlineCost(Call, Callee: Call.getCalledFunction(), Params, CalleeTTI,
3141 GetAssumptionCache, GetTLI, GetBFI, PSI, ORE,
3142 GetEphValuesCache);
3143}
3144
3145std::optional<int> llvm::getInliningCostEstimate(
3146 CallBase &Call, TargetTransformInfo &CalleeTTI,
3147 function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
3148 function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
3149 function_ref<const TargetLibraryInfo &(Function &)> GetTLI,
3150 ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
  const InlineParams Params = {/*DefaultThreshold*/ 0,
3152 /*HintThreshold*/ {},
3153 /*ColdThreshold*/ {},
3154 /*OptSizeThreshold*/ {},
3155 /*OptMinSizeThreshold*/ {},
3156 /*HotCallSiteThreshold*/ {},
3157 /*LocallyHotCallSiteThreshold*/ {},
3158 /*ColdCallSiteThreshold*/ {},
3159 /*ComputeFullInlineCost*/ true,
3160 /*EnableDeferral*/ true};
3161
3162 InlineCostCallAnalyzer CA(*Call.getCalledFunction(), Call, Params, CalleeTTI,
3163 GetAssumptionCache, GetBFI, GetTLI, PSI, ORE, true,
3164 /*IgnoreThreshold*/ true);
3165 auto R = CA.analyze();
3166 if (!R.isSuccess())
3167 return std::nullopt;
3168 return CA.getCost();
3169}
3170
3171std::optional<InlineCostFeatures> llvm::getInliningCostFeatures(
3172 CallBase &Call, TargetTransformInfo &CalleeTTI,
3173 function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
3174 function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
3175 function_ref<const TargetLibraryInfo &(Function &)> GetTLI,
3176 ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
3177 InlineCostFeaturesAnalyzer CFA(CalleeTTI, GetAssumptionCache, GetBFI, GetTLI,
3178 PSI, ORE, *Call.getCalledFunction(), Call);
3179 auto R = CFA.analyze();
3180 if (!R.isSuccess())
3181 return std::nullopt;
3182 return CFA.features();
3183}
3184
3185std::optional<InlineResult> llvm::getAttributeBasedInliningDecision(
3186 CallBase &Call, Function *Callee, TargetTransformInfo &CalleeTTI,
3187 function_ref<const TargetLibraryInfo &(Function &)> GetTLI) {
3188
3189 // Cannot inline indirect calls.
3190 if (!Callee)
3191 return InlineResult::failure(Reason: "indirect call");
3192
  // If a callee coroutine is inlined into a caller coroutine before the
  // coro-split pass, the coro-early pass cannot handle it well, so we do not
  // inline coroutine functions that have not yet been split.
3197 if (Callee->isPresplitCoroutine())
3198 return InlineResult::failure(Reason: "unsplited coroutine call");
3199
  // Never inline calls with byval arguments that do not have the alloca
3201 // address space. Since byval arguments can be replaced with a copy to an
3202 // alloca, the inlined code would need to be adjusted to handle that the
3203 // argument is in the alloca address space (so it is a little bit complicated
3204 // to solve).
3205 unsigned AllocaAS = Callee->getDataLayout().getAllocaAddrSpace();
3206 for (unsigned I = 0, E = Call.arg_size(); I != E; ++I)
3207 if (Call.isByValArgument(ArgNo: I)) {
3208 PointerType *PTy = cast<PointerType>(Val: Call.getArgOperand(i: I)->getType());
3209 if (PTy->getAddressSpace() != AllocaAS)
3210 return InlineResult::failure(Reason: "byval arguments without alloca"
3211 " address space");
3212 }
3213
3214 // Calls to functions with always-inline attributes should be inlined
3215 // whenever possible.
3216 if (Call.hasFnAttr(Kind: Attribute::AlwaysInline)) {
3217 if (Call.getAttributes().hasFnAttr(Kind: Attribute::NoInline))
3218 return InlineResult::failure(Reason: "noinline call site attribute");
3219
3220 auto IsViable = isInlineViable(Callee&: *Callee);
3221 if (IsViable.isSuccess())
3222 return InlineResult::success();
3223 return InlineResult::failure(Reason: IsViable.getFailureReason());
3224 }
3225
3226 // Never inline functions with conflicting attributes (unless callee has
3227 // always-inline attribute).
3228 Function *Caller = Call.getCaller();
3229 if (!functionsHaveCompatibleAttributes(Caller, Callee, TTI&: CalleeTTI, GetTLI))
3230 return InlineResult::failure(Reason: "conflicting attributes");
3231
3232 // Don't inline this call if the caller has the optnone attribute.
3233 if (Caller->hasOptNone())
3234 return InlineResult::failure(Reason: "optnone attribute");
3235
3236 // Don't inline a function that treats null pointer as valid into a caller
3237 // that does not have this attribute.
3238 if (!Caller->nullPointerIsDefined() && Callee->nullPointerIsDefined())
3239 return InlineResult::failure(Reason: "nullptr definitions incompatible");
3240
3241 // Don't inline functions which can be interposed at link-time.
3242 if (Callee->isInterposable())
3243 return InlineResult::failure(Reason: "interposable");
3244
3245 // Don't inline functions marked noinline.
3246 if (Callee->hasFnAttribute(Kind: Attribute::NoInline))
3247 return InlineResult::failure(Reason: "noinline function attribute");
3248
3249 // Don't inline call sites marked noinline.
3250 if (Call.isNoInline())
3251 return InlineResult::failure(Reason: "noinline call site attribute");
3252
3253 // Don't inline functions that are loader replaceable.
3254 if (Callee->hasFnAttribute(Kind: "loader-replaceable"))
3255 return InlineResult::failure(Reason: "loader replaceable function attribute");
3256
3257 return std::nullopt;
3258}
3259
3260InlineCost llvm::getInlineCost(
3261 CallBase &Call, Function *Callee, const InlineParams &Params,
3262 TargetTransformInfo &CalleeTTI,
3263 function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
3264 function_ref<const TargetLibraryInfo &(Function &)> GetTLI,
3265 function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
3266 ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE,
3267 function_ref<EphemeralValuesCache &(Function &)> GetEphValuesCache) {
3268
3269 auto UserDecision =
3270 llvm::getAttributeBasedInliningDecision(Call, Callee, CalleeTTI, GetTLI);
3271
3272 if (UserDecision) {
3273 if (UserDecision->isSuccess())
3274 return llvm::InlineCost::getAlways(Reason: "always inline attribute");
3275 return llvm::InlineCost::getNever(Reason: UserDecision->getFailureReason());
3276 }
3277
3278 if (InlineAllViableCalls && isInlineViable(Callee&: *Callee).isSuccess())
3279 return llvm::InlineCost::getAlways(
3280 Reason: "Inlining forced by -inline-all-viable-calls");
3281
3282 LLVM_DEBUG(llvm::dbgs() << " Analyzing call of " << Callee->getName()
3283 << "... (caller:" << Call.getCaller()->getName()
3284 << ")\n");
3285
3286 InlineCostCallAnalyzer CA(*Callee, Call, Params, CalleeTTI,
3287 GetAssumptionCache, GetBFI, GetTLI, PSI, ORE,
3288 /*BoostIndirect=*/true, /*IgnoreThreshold=*/false,
3289 GetEphValuesCache);
3290 InlineResult ShouldInline = CA.analyze();
3291
3292 LLVM_DEBUG(CA.dump());
3293
  // Always make a cost-benefit-based decision explicit.
  // We use always/never here since the threshold is not meaningful:
  // it is not what drives the cost-benefit analysis.
3297 if (CA.wasDecidedByCostBenefit()) {
3298 if (ShouldInline.isSuccess())
3299 return InlineCost::getAlways(Reason: "benefit over cost",
3300 CostBenefit: CA.getCostBenefitPair());
3301 else
3302 return InlineCost::getNever(Reason: "cost over benefit", CostBenefit: CA.getCostBenefitPair());
3303 }
3304
3305 if (CA.wasDecidedByCostThreshold())
3306 return InlineCost::get(Cost: CA.getCost(), Threshold: CA.getThreshold(),
3307 StaticBonus: CA.getStaticBonusApplied());
3308
  // No details on how the decision was made; simply return always or never.
3310 return ShouldInline.isSuccess()
3311 ? InlineCost::getAlways(Reason: "empty function")
3312 : InlineCost::getNever(Reason: ShouldInline.getFailureReason());
3313}
3314
3315InlineResult llvm::isInlineViable(Function &F) {
3316 bool ReturnsTwice = F.hasFnAttribute(Kind: Attribute::ReturnsTwice);
3317 for (BasicBlock &BB : F) {
3318 // Disallow inlining of functions which contain indirect branches.
3319 if (isa<IndirectBrInst>(Val: BB.getTerminator()))
3320 return InlineResult::failure(Reason: "contains indirect branches");
3321
3322 // Disallow inlining of blockaddresses.
3323 if (BB.hasAddressTaken())
3324 return InlineResult::failure(Reason: "blockaddress used");
3325
3326 for (auto &II : BB) {
3327 CallBase *Call = dyn_cast<CallBase>(Val: &II);
3328 if (!Call)
3329 continue;
3330
3331 // Disallow recursive calls.
3332 Function *Callee = Call->getCalledFunction();
3333 if (&F == Callee)
3334 return InlineResult::failure(Reason: "recursive call");
3335
3336 // Disallow calls which expose returns-twice to a function not previously
3337 // attributed as such.
3338 if (!ReturnsTwice && isa<CallInst>(Val: Call) &&
3339 cast<CallInst>(Val: Call)->canReturnTwice())
3340 return InlineResult::failure(Reason: "exposes returns-twice attribute");
3341
3342 if (Callee)
3343 switch (Callee->getIntrinsicID()) {
3344 default:
3345 break;
3346 case llvm::Intrinsic::icall_branch_funnel:
3347 // Disallow inlining of @llvm.icall.branch.funnel because current
3348 // backend can't separate call targets from call arguments.
3349 return InlineResult::failure(
3350 Reason: "disallowed inlining of @llvm.icall.branch.funnel");
3351 case llvm::Intrinsic::localescape:
3352 // Disallow inlining functions that call @llvm.localescape. Doing this
3353 // correctly would require major changes to the inliner.
3354 return InlineResult::failure(
3355 Reason: "disallowed inlining of @llvm.localescape");
3356 case llvm::Intrinsic::vastart:
3357 // Disallow inlining of functions that initialize VarArgs with
3358 // va_start.
3359 return InlineResult::failure(
3360 Reason: "contains VarArgs initialized with va_start");
3361 }
3362 }
3363 }
3364
3365 return InlineResult::success();
3366}
3367
3368// APIs to create InlineParams based on command line flags and/or other
3369// parameters.
3370
3371InlineParams llvm::getInlineParams(int Threshold) {
3372 InlineParams Params;
3373
3374 // This field is the threshold to use for a callee by default. This is
3375 // derived from one or more of:
3376 // * optimization or size-optimization levels,
3377 // * a value passed to createFunctionInliningPass function, or
3378 // * the -inline-threshold flag.
3379 // If the -inline-threshold flag is explicitly specified, that is used
3380 // irrespective of anything else.
3381 if (InlineThreshold.getNumOccurrences() > 0)
3382 Params.DefaultThreshold = InlineThreshold;
3383 else
3384 Params.DefaultThreshold = Threshold;
3385
3386 // Set the HintThreshold knob from the -inlinehint-threshold.
3387 Params.HintThreshold = HintThreshold;
3388
3389 // Set the HotCallSiteThreshold knob from the -hot-callsite-threshold.
3390 Params.HotCallSiteThreshold = HotCallSiteThreshold;
3391
3392 // If the -locally-hot-callsite-threshold is explicitly specified, use it to
3393 // populate LocallyHotCallSiteThreshold. Later, we populate
3394 // Params.LocallyHotCallSiteThreshold from -locally-hot-callsite-threshold if
3395 // we know that optimization level is O3 (in the getInlineParams variant that
3396 // takes the opt and size levels).
3397 // FIXME: Remove this check (and make the assignment unconditional) after
3398 // addressing size regression issues at O2.
3399 if (LocallyHotCallSiteThreshold.getNumOccurrences() > 0)
3400 Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold;
3401
3402 // Set the ColdCallSiteThreshold knob from the
3403 // -inline-cold-callsite-threshold.
3404 Params.ColdCallSiteThreshold = ColdCallSiteThreshold;
3405
  // Set the OptMinSizeThreshold and OptSizeThreshold params only if the
  // -inline-threshold command-line option is not explicitly given. If that
  // option is present, then its value applies even for callees with size and
  // minsize attributes.
  // If -inline-threshold is not specified, set the ColdThreshold from
  // -inlinecold-threshold even if it is not explicitly passed. If
  // -inline-threshold is specified, then -inlinecold-threshold needs to be
  // explicitly specified to set the ColdThreshold knob.
3414 if (InlineThreshold.getNumOccurrences() == 0) {
3415 Params.OptMinSizeThreshold = InlineConstants::OptMinSizeThreshold;
3416 Params.OptSizeThreshold = InlineConstants::OptSizeThreshold;
3417 Params.ColdThreshold = ColdThreshold;
3418 } else if (ColdThreshold.getNumOccurrences() > 0) {
3419 Params.ColdThreshold = ColdThreshold;
3420 }
3421 return Params;
3422}
3423
3424InlineParams llvm::getInlineParams() {
3425 return getInlineParams(Threshold: DefaultThreshold);
3426}
3427
3428// Compute the default threshold for inlining based on the opt level and the
3429// size opt level.
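// For example, -O3 (OptLevel = 3) selects OptAggressiveThreshold, -Os
// (SizeOptLevel = 1) selects OptSizeThreshold, and -Oz (SizeOptLevel = 2)
// selects OptMinSizeThreshold; otherwise the -inlinedefault-threshold value
// is used.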
3430static int computeThresholdFromOptLevels(unsigned OptLevel,
3431 unsigned SizeOptLevel) {
3432 if (OptLevel > 2)
3433 return InlineConstants::OptAggressiveThreshold;
3434 if (SizeOptLevel == 1) // -Os
3435 return InlineConstants::OptSizeThreshold;
3436 if (SizeOptLevel == 2) // -Oz
3437 return InlineConstants::OptMinSizeThreshold;
3438 return DefaultThreshold;
3439}
3440
3441InlineParams llvm::getInlineParams(unsigned OptLevel, unsigned SizeOptLevel) {
3442 auto Params =
3443 getInlineParams(Threshold: computeThresholdFromOptLevels(OptLevel, SizeOptLevel));
3444 // At O3, use the value of -locally-hot-callsite-threshold option to populate
3445 // Params.LocallyHotCallSiteThreshold. Below O3, this flag has effect only
3446 // when it is specified explicitly.
3447 if (OptLevel > 2)
3448 Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold;
3449 return Params;
3450}
3451
3452PreservedAnalyses
3453InlineCostAnnotationPrinterPass::run(Function &F,
3454 FunctionAnalysisManager &FAM) {
3455 PrintInstructionComments = true;
3456 std::function<AssumptionCache &(Function &)> GetAssumptionCache =
3457 [&](Function &F) -> AssumptionCache & {
3458 return FAM.getResult<AssumptionAnalysis>(IR&: F);
3459 };
3460
3461 auto &MAMProxy = FAM.getResult<ModuleAnalysisManagerFunctionProxy>(IR&: F);
3462 ProfileSummaryInfo *PSI =
3463 MAMProxy.getCachedResult<ProfileSummaryAnalysis>(IR&: *F.getParent());
3464 const TargetTransformInfo &TTI = FAM.getResult<TargetIRAnalysis>(IR&: F);
3465
3466 // FIXME: Redesign the usage of InlineParams to expand the scope of this pass.
3467 // In the current implementation, the type of InlineParams doesn't matter as
3468 // the pass serves only for verification of inliner's decisions.
3469 // We can add a flag which determines InlineParams for this run. Right now,
3470 // the default InlineParams are used.
3471 const InlineParams Params = llvm::getInlineParams();
3472 for (BasicBlock &BB : F) {
3473 for (Instruction &I : BB) {
3474 if (auto *CB = dyn_cast<CallBase>(Val: &I)) {
3475 Function *CalledFunction = CB->getCalledFunction();
3476 if (!CalledFunction || CalledFunction->isDeclaration())
3477 continue;
3478 OptimizationRemarkEmitter ORE(CalledFunction);
3479 InlineCostCallAnalyzer ICCA(*CalledFunction, *CB, Params, TTI,
3480 GetAssumptionCache, nullptr, nullptr, PSI,
3481 &ORE);
3482 ICCA.analyze();
3483 OS << " Analyzing call of " << CalledFunction->getName()
3484 << "... (caller:" << CB->getCaller()->getName() << ")\n";
3485 ICCA.print(OS);
3486 OS << "\n";
3487 }
3488 }
3489 }
3490 return PreservedAnalyses::all();
3491}
3492