1//===- InlineCost.cpp - Cost analysis for inliner -------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements inline cost analysis.
10//
11//===----------------------------------------------------------------------===//
12
13#include "llvm/Analysis/InlineCost.h"
14#include "llvm/ADT/STLExtras.h"
15#include "llvm/ADT/SetVector.h"
16#include "llvm/ADT/SmallPtrSet.h"
17#include "llvm/ADT/SmallVector.h"
18#include "llvm/ADT/Statistic.h"
19#include "llvm/Analysis/AssumptionCache.h"
20#include "llvm/Analysis/BlockFrequencyInfo.h"
21#include "llvm/Analysis/CodeMetrics.h"
22#include "llvm/Analysis/ConstantFolding.h"
23#include "llvm/Analysis/DomConditionCache.h"
24#include "llvm/Analysis/EphemeralValuesCache.h"
25#include "llvm/Analysis/InstructionSimplify.h"
26#include "llvm/Analysis/LoopInfo.h"
27#include "llvm/Analysis/MemoryBuiltins.h"
28#include "llvm/Analysis/OptimizationRemarkEmitter.h"
29#include "llvm/Analysis/ProfileSummaryInfo.h"
30#include "llvm/Analysis/TargetLibraryInfo.h"
31#include "llvm/Analysis/TargetTransformInfo.h"
32#include "llvm/Analysis/ValueTracking.h"
33#include "llvm/Config/llvm-config.h"
34#include "llvm/IR/AssemblyAnnotationWriter.h"
35#include "llvm/IR/CallingConv.h"
36#include "llvm/IR/DataLayout.h"
37#include "llvm/IR/Dominators.h"
38#include "llvm/IR/GetElementPtrTypeIterator.h"
39#include "llvm/IR/GlobalAlias.h"
40#include "llvm/IR/InlineAsm.h"
41#include "llvm/IR/InstVisitor.h"
42#include "llvm/IR/IntrinsicInst.h"
43#include "llvm/IR/Operator.h"
44#include "llvm/IR/PatternMatch.h"
45#include "llvm/Support/CommandLine.h"
46#include "llvm/Support/Debug.h"
47#include "llvm/Support/FormattedStream.h"
48#include "llvm/Support/raw_ostream.h"
49#include <climits>
50#include <limits>
51#include <optional>
52
53using namespace llvm;
54
55#define DEBUG_TYPE "inline-cost"
56
57STATISTIC(NumCallsAnalyzed, "Number of call sites analyzed");
58
59static cl::opt<int>
60 DefaultThreshold("inlinedefault-threshold", cl::Hidden, cl::init(Val: 225),
61 cl::desc("Default amount of inlining to perform"));
62
63// We introduce this option since there is a minor compile-time win by avoiding
64// addition of TTI attributes (target-features in particular) to inline
65// candidates when they are guaranteed to be the same as top level methods in
66// some use cases. If we avoid adding the attribute, we need an option to avoid
67// checking these attributes.
68static cl::opt<bool> IgnoreTTIInlineCompatible(
69 "ignore-tti-inline-compatible", cl::Hidden, cl::init(Val: false),
70 cl::desc("Ignore TTI attributes compatibility check between callee/caller "
71 "during inline cost calculation"));
72
73static cl::opt<bool> PrintInstructionComments(
74 "print-instruction-comments", cl::Hidden, cl::init(Val: false),
75 cl::desc("Prints comments for instruction based on inline cost analysis"));
76
77static cl::opt<int> InlineThreshold(
78 "inline-threshold", cl::Hidden, cl::init(Val: 225),
79 cl::desc("Control the amount of inlining to perform (default = 225)"));
80
81static cl::opt<int> HintThreshold(
82 "inlinehint-threshold", cl::Hidden, cl::init(Val: 325),
83 cl::desc("Threshold for inlining functions with inline hint"));
84
85static cl::opt<int>
86 ColdCallSiteThreshold("inline-cold-callsite-threshold", cl::Hidden,
87 cl::init(Val: 45),
88 cl::desc("Threshold for inlining cold callsites"));
89
90static cl::opt<bool> InlineEnableCostBenefitAnalysis(
91 "inline-enable-cost-benefit-analysis", cl::Hidden, cl::init(Val: false),
92 cl::desc("Enable the cost-benefit analysis for the inliner"));
93
94// InlineSavingsMultiplier overrides per TTI multipliers iff it is
95// specified explicitly in command line options. This option is exposed
96// for tuning and testing.
97static cl::opt<int> InlineSavingsMultiplier(
98 "inline-savings-multiplier", cl::Hidden, cl::init(Val: 8),
99 cl::desc("Multiplier to multiply cycle savings by during inlining"));
100
101// InlineSavingsProfitableMultiplier overrides per TTI multipliers iff it is
102// specified explicitly in command line options. This option is exposed
103// for tuning and testing.
104static cl::opt<int> InlineSavingsProfitableMultiplier(
105 "inline-savings-profitable-multiplier", cl::Hidden, cl::init(Val: 4),
106 cl::desc("A multiplier on top of cycle savings to decide whether the "
107 "savings won't justify the cost"));
108
109static cl::opt<int>
110 InlineSizeAllowance("inline-size-allowance", cl::Hidden, cl::init(Val: 100),
111 cl::desc("The maximum size of a callee that get's "
112 "inlined without sufficient cycle savings"));
113
114// We introduce this threshold to help performance of instrumentation based
115// PGO before we actually hook up inliner with analysis passes such as BPI and
116// BFI.
117static cl::opt<int> ColdThreshold(
118 "inlinecold-threshold", cl::Hidden, cl::init(Val: 45),
119 cl::desc("Threshold for inlining functions with cold attribute"));
120
121static cl::opt<int>
122 HotCallSiteThreshold("hot-callsite-threshold", cl::Hidden, cl::init(Val: 3000),
123 cl::desc("Threshold for hot callsites "));
124
125static cl::opt<int> LocallyHotCallSiteThreshold(
126 "locally-hot-callsite-threshold", cl::Hidden, cl::init(Val: 525),
127 cl::desc("Threshold for locally hot callsites "));
128
129static cl::opt<int> ColdCallSiteRelFreq(
130 "cold-callsite-rel-freq", cl::Hidden, cl::init(Val: 2),
131 cl::desc("Maximum block frequency, expressed as a percentage of caller's "
132 "entry frequency, for a callsite to be cold in the absence of "
133 "profile information."));
134
135static cl::opt<uint64_t> HotCallSiteRelFreq(
136 "hot-callsite-rel-freq", cl::Hidden, cl::init(Val: 60),
137 cl::desc("Minimum block frequency, expressed as a multiple of caller's "
138 "entry frequency, for a callsite to be hot in the absence of "
139 "profile information."));
140
141static cl::opt<int>
142 InstrCost("inline-instr-cost", cl::Hidden, cl::init(Val: 5),
143 cl::desc("Cost of a single instruction when inlining"));
144
145static cl::opt<int> InlineAsmInstrCost(
146 "inline-asm-instr-cost", cl::Hidden, cl::init(Val: 0),
147 cl::desc("Cost of a single inline asm instruction when inlining"));
148
149static cl::opt<int>
150 MemAccessCost("inline-memaccess-cost", cl::Hidden, cl::init(Val: 0),
151 cl::desc("Cost of load/store instruction when inlining"));
152
153static cl::opt<int> CallPenalty(
154 "inline-call-penalty", cl::Hidden, cl::init(Val: 25),
155 cl::desc("Call penalty that is applied per callsite when inlining"));
156
157static cl::opt<size_t>
158 StackSizeThreshold("inline-max-stacksize", cl::Hidden,
159 cl::init(Val: std::numeric_limits<size_t>::max()),
160 cl::desc("Do not inline functions with a stack size "
161 "that exceeds the specified limit"));
162
163static cl::opt<size_t> RecurStackSizeThreshold(
164 "recursive-inline-max-stacksize", cl::Hidden,
165 cl::init(Val: InlineConstants::TotalAllocaSizeRecursiveCaller),
166 cl::desc("Do not inline recursive functions with a stack "
167 "size that exceeds the specified limit"));
168
169static cl::opt<bool> OptComputeFullInlineCost(
170 "inline-cost-full", cl::Hidden,
171 cl::desc("Compute the full inline cost of a call site even when the cost "
172 "exceeds the threshold."));
173
174static cl::opt<bool> InlineCallerSupersetNoBuiltin(
175 "inline-caller-superset-nobuiltin", cl::Hidden, cl::init(Val: true),
176 cl::desc("Allow inlining when caller has a superset of callee's nobuiltin "
177 "attributes."));
178
179static cl::opt<bool> DisableGEPConstOperand(
180 "disable-gep-const-evaluation", cl::Hidden, cl::init(Val: false),
181 cl::desc("Disables evaluation of GetElementPtr with constant operands"));
182
183static cl::opt<bool> InlineAllViableCalls(
184 "inline-all-viable-calls", cl::Hidden, cl::init(Val: false),
185 cl::desc("Inline all viable calls, even if they exceed the inlining "
186 "threshold"));
187namespace llvm {
188std::optional<int> getStringFnAttrAsInt(const Attribute &Attr) {
189 if (Attr.isValid()) {
190 int AttrValue = 0;
191 if (!Attr.getValueAsString().getAsInteger(Radix: 10, Result&: AttrValue))
192 return AttrValue;
193 }
194 return std::nullopt;
195}
196
197std::optional<int> getStringFnAttrAsInt(CallBase &CB, StringRef AttrKind) {
198 return getStringFnAttrAsInt(Attr: CB.getFnAttr(Kind: AttrKind));
199}
200
201std::optional<int> getStringFnAttrAsInt(Function *F, StringRef AttrKind) {
202 return getStringFnAttrAsInt(Attr: F->getFnAttribute(Kind: AttrKind));
203}
204
205namespace InlineConstants {
206int getInstrCost() { return InstrCost; }
207
208} // namespace InlineConstants
209
210} // namespace llvm
211
212namespace {
213class InlineCostCallAnalyzer;
214
215// This struct is used to store information about inline cost of a
216// particular instruction
217struct InstructionCostDetail {
218 int CostBefore = 0;
219 int CostAfter = 0;
220 int ThresholdBefore = 0;
221 int ThresholdAfter = 0;
222
223 int getThresholdDelta() const { return ThresholdAfter - ThresholdBefore; }
224
225 int getCostDelta() const { return CostAfter - CostBefore; }
226
227 bool hasThresholdChanged() const { return ThresholdAfter != ThresholdBefore; }
228};
229
230class InlineCostAnnotationWriter : public AssemblyAnnotationWriter {
231private:
232 InlineCostCallAnalyzer *const ICCA;
233
234public:
235 InlineCostAnnotationWriter(InlineCostCallAnalyzer *ICCA) : ICCA(ICCA) {}
236 void emitInstructionAnnot(const Instruction *I,
237 formatted_raw_ostream &OS) override;
238};
239
240/// Carry out call site analysis, in order to evaluate inlinability.
241/// NOTE: the type is currently used as implementation detail of functions such
242/// as llvm::getInlineCost. Note the function_ref constructor parameters - the
243/// expectation is that they come from the outer scope, from the wrapper
244/// functions. If we want to support constructing CallAnalyzer objects where
245/// lambdas are provided inline at construction, or where the object needs to
246/// otherwise survive past the scope of the provided functions, we need to
247/// revisit the argument types.
248class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
249 typedef InstVisitor<CallAnalyzer, bool> Base;
250 friend class InstVisitor<CallAnalyzer, bool>;
251
252protected:
253 virtual ~CallAnalyzer() = default;
254 /// The TargetTransformInfo available for this compilation.
255 const TargetTransformInfo &TTI;
256
257 /// Getter for the cache of @llvm.assume intrinsics.
258 function_ref<AssumptionCache &(Function &)> GetAssumptionCache;
259
260 /// Getter for BlockFrequencyInfo
261 function_ref<BlockFrequencyInfo &(Function &)> GetBFI;
262
263 /// Getter for TargetLibraryInfo
264 function_ref<const TargetLibraryInfo &(Function &)> GetTLI;
265
266 /// Profile summary information.
267 ProfileSummaryInfo *PSI;
268
269 /// The called function.
270 Function &F;
271
272 // Cache the DataLayout since we use it a lot.
273 const DataLayout &DL;
274
275 /// The OptimizationRemarkEmitter available for this compilation.
276 OptimizationRemarkEmitter *ORE;
277
278 /// The candidate callsite being analyzed. Please do not use this to do
279 /// analysis in the caller function; we want the inline cost query to be
280 /// easily cacheable. Instead, use the cover function paramHasAttr.
281 CallBase &CandidateCall;
282
283 /// Getter for the cache of ephemeral values.
284 function_ref<EphemeralValuesCache &(Function &)> GetEphValuesCache = nullptr;
285
286 /// Extension points for handling callsite features.
287 // Called before a basic block was analyzed.
288 virtual void onBlockStart(const BasicBlock *BB) {}
289
290 /// Called after a basic block was analyzed.
291 virtual void onBlockAnalyzed(const BasicBlock *BB) {}
292
293 /// Called before an instruction was analyzed
294 virtual void onInstructionAnalysisStart(const Instruction *I) {}
295
296 /// Called after an instruction was analyzed
297 virtual void onInstructionAnalysisFinish(const Instruction *I) {}
298
299 /// Called at the end of the analysis of the callsite. Return the outcome of
300 /// the analysis, i.e. 'InlineResult(true)' if the inlining may happen, or
301 /// the reason it can't.
302 virtual InlineResult finalizeAnalysis() { return InlineResult::success(); }
303 /// Called when we're about to start processing a basic block, and every time
304 /// we are done processing an instruction. Return true if there is no point in
305 /// continuing the analysis (e.g. we've determined already the call site is
306 /// too expensive to inline)
307 virtual bool shouldStop() { return false; }
308
309 /// Called before the analysis of the callee body starts (with callsite
310 /// contexts propagated). It checks callsite-specific information. Return a
311 /// reason analysis can't continue if that's the case, or 'true' if it may
312 /// continue.
313 virtual InlineResult onAnalysisStart() { return InlineResult::success(); }
314 /// Called if the analysis engine decides SROA cannot be done for the given
315 /// alloca.
316 virtual void onDisableSROA(AllocaInst *Arg) {}
317
318 /// Called the analysis engine determines load elimination won't happen.
319 virtual void onDisableLoadElimination() {}
320
321 /// Called when we visit a CallBase, before the analysis starts. Return false
322 /// to stop further processing of the instruction.
323 virtual bool onCallBaseVisitStart(CallBase &Call) { return true; }
324
325 /// Called to account for a call.
326 virtual void onCallPenalty() {}
327
328 /// Called to account for a load or store.
329 virtual void onMemAccess(){};
330
331 /// Called to account for the expectation the inlining would result in a load
332 /// elimination.
333 virtual void onLoadEliminationOpportunity() {}
334
335 /// Called to account for the cost of argument setup for the Call in the
336 /// callee's body (not the callsite currently under analysis).
337 virtual void onCallArgumentSetup(const CallBase &Call) {}
338
339 /// Called to account for a load relative intrinsic.
340 virtual void onLoadRelativeIntrinsic() {}
341
342 /// Called to account for a lowered call.
343 virtual void onLoweredCall(Function *F, CallBase &Call, bool IsIndirectCall) {
344 }
345
346 /// Account for a jump table of given size. Return false to stop further
347 /// processing the switch instruction
348 virtual bool onJumpTable(unsigned JumpTableSize) { return true; }
349
350 /// Account for a case cluster of given size. Return false to stop further
351 /// processing of the instruction.
352 virtual bool onCaseCluster(unsigned NumCaseCluster) { return true; }
353
354 /// Called at the end of processing a switch instruction, with the given
355 /// number of case clusters.
356 virtual void onFinalizeSwitch(unsigned JumpTableSize, unsigned NumCaseCluster,
357 bool DefaultDestUnreachable) {}
358
359 /// Called to account for any other instruction not specifically accounted
360 /// for.
361 virtual void onMissedSimplification() {}
362
363 /// Account for inline assembly instructions.
364 virtual void onInlineAsm(const InlineAsm &Arg) {}
365
366 /// Start accounting potential benefits due to SROA for the given alloca.
367 virtual void onInitializeSROAArg(AllocaInst *Arg) {}
368
369 /// Account SROA savings for the AllocaInst value.
370 virtual void onAggregateSROAUse(AllocaInst *V) {}
371
372 bool handleSROA(Value *V, bool DoNotDisable) {
373 // Check for SROA candidates in comparisons.
374 if (auto *SROAArg = getSROAArgForValueOrNull(V)) {
375 if (DoNotDisable) {
376 onAggregateSROAUse(V: SROAArg);
377 return true;
378 }
379 disableSROAForArg(SROAArg);
380 }
381 return false;
382 }
383
384 bool IsCallerRecursive = false;
385 bool IsRecursiveCall = false;
386 bool ExposesReturnsTwice = false;
387 bool HasDynamicAlloca = false;
388 bool ContainsNoDuplicateCall = false;
389 bool HasReturn = false;
390 bool HasIndirectBr = false;
391 bool HasUninlineableIntrinsic = false;
392 bool InitsVargArgs = false;
393
394 /// Number of bytes allocated statically by the callee.
395 uint64_t AllocatedSize = 0;
396 unsigned NumInstructions = 0;
397 unsigned NumInlineAsmInstructions = 0;
398 unsigned NumVectorInstructions = 0;
399
400 /// While we walk the potentially-inlined instructions, we build up and
401 /// maintain a mapping of simplified values specific to this callsite. The
402 /// idea is to propagate any special information we have about arguments to
403 /// this call through the inlinable section of the function, and account for
404 /// likely simplifications post-inlining. The most important aspect we track
405 /// is CFG altering simplifications -- when we prove a basic block dead, that
406 /// can cause dramatic shifts in the cost of inlining a function.
407 /// Note: The simplified Value may be owned by the caller function.
408 DenseMap<Value *, Value *> SimplifiedValues;
409
410 /// Keep track of the values which map back (through function arguments) to
411 /// allocas on the caller stack which could be simplified through SROA.
412 DenseMap<Value *, AllocaInst *> SROAArgValues;
413
414 /// Keep track of Allocas for which we believe we may get SROA optimization.
415 DenseSet<AllocaInst *> EnabledSROAAllocas;
416
417 /// Keep track of values which map to a pointer base and constant offset.
418 DenseMap<Value *, std::pair<Value *, APInt>> ConstantOffsetPtrs;
419
420 /// Keep track of dead blocks due to the constant arguments.
421 SmallPtrSet<BasicBlock *, 16> DeadBlocks;
422
423 /// The mapping of the blocks to their known unique successors due to the
424 /// constant arguments.
425 DenseMap<BasicBlock *, BasicBlock *> KnownSuccessors;
426
427 /// Model the elimination of repeated loads that is expected to happen
428 /// whenever we simplify away the stores that would otherwise cause them to be
429 /// loads.
430 bool EnableLoadElimination = true;
431
432 /// Whether we allow inlining for recursive call.
433 bool AllowRecursiveCall = false;
434
435 SmallPtrSet<Value *, 16> LoadAddrSet;
436
437 AllocaInst *getSROAArgForValueOrNull(Value *V) const {
438 auto It = SROAArgValues.find(Val: V);
439 if (It == SROAArgValues.end() || EnabledSROAAllocas.count(V: It->second) == 0)
440 return nullptr;
441 return It->second;
442 }
443
444 /// Use a value in its given form directly if possible, otherwise try looking
445 /// for it in SimplifiedValues.
446 template <typename T> T *getDirectOrSimplifiedValue(Value *V) const {
447 if (auto *Direct = dyn_cast<T>(V))
448 return Direct;
449 return getSimplifiedValue<T>(V);
450 }
451
452 // Custom simplification helper routines.
453 bool isAllocaDerivedArg(Value *V);
454 void disableSROAForArg(AllocaInst *SROAArg);
455 void disableSROA(Value *V);
456 void findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB);
457 void disableLoadElimination();
458 bool isGEPFree(GetElementPtrInst &GEP);
459 bool canFoldInboundsGEP(GetElementPtrInst &I);
460 bool accumulateGEPOffset(GEPOperator &GEP, APInt &Offset);
461 bool simplifyCallSite(Function *F, CallBase &Call);
462 bool simplifyCmpInstForRecCall(CmpInst &Cmp);
463 bool simplifyInstruction(Instruction &I);
464 bool simplifyIntrinsicCallIsConstant(CallBase &CB);
465 bool simplifyIntrinsicCallObjectSize(CallBase &CB);
466 ConstantInt *stripAndComputeInBoundsConstantOffsets(Value *&V);
467 bool isLoweredToCall(Function *F, CallBase &Call);
468
469 /// Return true if the given argument to the function being considered for
470 /// inlining has the given attribute set either at the call site or the
471 /// function declaration. Primarily used to inspect call site specific
472 /// attributes since these can be more precise than the ones on the callee
473 /// itself.
474 bool paramHasAttr(Argument *A, Attribute::AttrKind Attr);
475
476 /// Return true if the given value is known non null within the callee if
477 /// inlined through this particular callsite.
478 bool isKnownNonNullInCallee(Value *V);
479
480 /// Return true if size growth is allowed when inlining the callee at \p Call.
481 bool allowSizeGrowth(CallBase &Call);
482
483 // Custom analysis routines.
484 InlineResult analyzeBlock(BasicBlock *BB,
485 const SmallPtrSetImpl<const Value *> &EphValues);
486
487 // Disable several entry points to the visitor so we don't accidentally use
488 // them by declaring but not defining them here.
489 void visit(Module *);
490 void visit(Module &);
491 void visit(Function *);
492 void visit(Function &);
493 void visit(BasicBlock *);
494 void visit(BasicBlock &);
495
496 // Provide base case for our instruction visit.
497 bool visitInstruction(Instruction &I);
498
499 // Our visit overrides.
500 bool visitAlloca(AllocaInst &I);
501 bool visitPHI(PHINode &I);
502 bool visitGetElementPtr(GetElementPtrInst &I);
503 bool visitBitCast(BitCastInst &I);
504 bool visitPtrToInt(PtrToIntInst &I);
505 bool visitIntToPtr(IntToPtrInst &I);
506 bool visitCastInst(CastInst &I);
507 bool visitCmpInst(CmpInst &I);
508 bool visitSub(BinaryOperator &I);
509 bool visitBinaryOperator(BinaryOperator &I);
510 bool visitFNeg(UnaryOperator &I);
511 bool visitLoad(LoadInst &I);
512 bool visitStore(StoreInst &I);
513 bool visitExtractValue(ExtractValueInst &I);
514 bool visitInsertValue(InsertValueInst &I);
515 bool visitCallBase(CallBase &Call);
516 bool visitReturnInst(ReturnInst &RI);
517 bool visitUncondBrInst(UncondBrInst &BI);
518 bool visitCondBrInst(CondBrInst &BI);
519 bool visitSelectInst(SelectInst &SI);
520 bool visitSwitchInst(SwitchInst &SI);
521 bool visitIndirectBrInst(IndirectBrInst &IBI);
522 bool visitResumeInst(ResumeInst &RI);
523 bool visitCleanupReturnInst(CleanupReturnInst &RI);
524 bool visitCatchReturnInst(CatchReturnInst &RI);
525 bool visitUnreachableInst(UnreachableInst &I);
526
527public:
528 CallAnalyzer(
529 Function &Callee, CallBase &Call, const TargetTransformInfo &TTI,
530 function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
531 function_ref<BlockFrequencyInfo &(Function &)> GetBFI = nullptr,
532 function_ref<const TargetLibraryInfo &(Function &)> GetTLI = nullptr,
533 ProfileSummaryInfo *PSI = nullptr,
534 OptimizationRemarkEmitter *ORE = nullptr,
535 function_ref<EphemeralValuesCache &(Function &)> GetEphValuesCache =
536 nullptr)
537 : TTI(TTI), GetAssumptionCache(GetAssumptionCache), GetBFI(GetBFI),
538 GetTLI(GetTLI), PSI(PSI), F(Callee), DL(F.getDataLayout()), ORE(ORE),
539 CandidateCall(Call), GetEphValuesCache(GetEphValuesCache) {}
540
541 InlineResult analyze();
542
543 /// Lookup simplified Value. May return a value owned by the caller.
544 Value *getSimplifiedValueUnchecked(Value *V) const {
545 return SimplifiedValues.lookup(Val: V);
546 }
547
548 /// Lookup simplified Value, but return nullptr if the simplified value is
549 /// owned by the caller.
550 template <typename T> T *getSimplifiedValue(Value *V) const {
551 Value *SimpleV = SimplifiedValues.lookup(Val: V);
552 if (!SimpleV)
553 return nullptr;
554
555 // Skip checks if we know T is a global. This has a small, but measurable
556 // impact on compile-time.
557 if constexpr (std::is_base_of_v<Constant, T>)
558 return dyn_cast<T>(SimpleV);
559
560 // Make sure the simplified Value is owned by this function
561 if (auto *I = dyn_cast<Instruction>(Val: SimpleV)) {
562 if (I->getFunction() != &F)
563 return nullptr;
564 } else if (auto *Arg = dyn_cast<Argument>(Val: SimpleV)) {
565 if (Arg->getParent() != &F)
566 return nullptr;
567 } else if (!isa<Constant>(Val: SimpleV))
568 return nullptr;
569 return dyn_cast<T>(SimpleV);
570 }
571
572 // Keep a bunch of stats about the cost savings found so we can print them
573 // out when debugging.
574 unsigned NumConstantArgs = 0;
575 unsigned NumConstantOffsetPtrArgs = 0;
576 unsigned NumAllocaArgs = 0;
577 unsigned NumConstantPtrCmps = 0;
578 unsigned NumConstantPtrDiffs = 0;
579 unsigned NumInstructionsSimplified = 0;
580
581 void dump();
582};
583
584// Considering forming a binary search, we should find the number of nodes
585// which is same as the number of comparisons when lowered. For a given
586// number of clusters, n, we can define a recursive function, f(n), to find
587// the number of nodes in the tree. The recursion is :
588// f(n) = 1 + f(n/2) + f (n - n/2), when n > 3,
589// and f(n) = n, when n <= 3.
590// This will lead a binary tree where the leaf should be either f(2) or f(3)
591// when n > 3. So, the number of comparisons from leaves should be n, while
592// the number of non-leaf should be :
593// 2^(log2(n) - 1) - 1
594// = 2^log2(n) * 2^-1 - 1
595// = n / 2 - 1.
596// Considering comparisons from leaf and non-leaf nodes, we can estimate the
597// number of comparisons in a simple closed form :
598// n + n / 2 - 1 = n * 3 / 2 - 1
599int64_t getExpectedNumberOfCompare(int NumCaseCluster) {
600 return 3 * static_cast<int64_t>(NumCaseCluster) / 2 - 1;
601}
602
603/// FIXME: if it is necessary to derive from InlineCostCallAnalyzer, note
604/// the FIXME in onLoweredCall, when instantiating an InlineCostCallAnalyzer
605class InlineCostCallAnalyzer final : public CallAnalyzer {
606 const bool ComputeFullInlineCost;
607 int LoadEliminationCost = 0;
608 /// Bonus to be applied when percentage of vector instructions in callee is
609 /// high (see more details in updateThreshold).
610 int VectorBonus = 0;
611 /// Bonus to be applied when the callee has only one reachable basic block.
612 int SingleBBBonus = 0;
613
614 /// Tunable parameters that control the analysis.
615 const InlineParams &Params;
616
617 // This DenseMap stores the delta change in cost and threshold after
618 // accounting for the given instruction. The map is filled only with the
619 // flag PrintInstructionComments on.
620 DenseMap<const Instruction *, InstructionCostDetail> InstructionCostDetailMap;
621
622 /// Upper bound for the inlining cost. Bonuses are being applied to account
623 /// for speculative "expected profit" of the inlining decision.
624 int Threshold = 0;
625
626 /// The amount of StaticBonus applied.
627 int StaticBonusApplied = 0;
628
629 /// Attempt to evaluate indirect calls to boost its inline cost.
630 const bool BoostIndirectCalls;
631
632 /// Ignore the threshold when finalizing analysis.
633 const bool IgnoreThreshold;
634
635 // True if the cost-benefit-analysis-based inliner is enabled.
636 const bool CostBenefitAnalysisEnabled;
637
638 /// Inlining cost measured in abstract units, accounts for all the
639 /// instructions expected to be executed for a given function invocation.
640 /// Instructions that are statically proven to be dead based on call-site
641 /// arguments are not counted here.
642 int Cost = 0;
643
644 // The cumulative cost at the beginning of the basic block being analyzed. At
645 // the end of analyzing each basic block, "Cost - CostAtBBStart" represents
646 // the size of that basic block.
647 int CostAtBBStart = 0;
648
649 // The static size of live but cold basic blocks. This is "static" in the
650 // sense that it's not weighted by profile counts at all.
651 int ColdSize = 0;
652
653 // Whether inlining is decided by cost-threshold analysis.
654 bool DecidedByCostThreshold = false;
655
656 // Whether inlining is decided by cost-benefit analysis.
657 bool DecidedByCostBenefit = false;
658
659 // The cost-benefit pair computed by cost-benefit analysis.
660 std::optional<CostBenefitPair> CostBenefit;
661
662 bool SingleBB = true;
663
664 unsigned SROACostSavings = 0;
665 unsigned SROACostSavingsLost = 0;
666
667 /// The mapping of caller Alloca values to their accumulated cost savings. If
668 /// we have to disable SROA for one of the allocas, this tells us how much
669 /// cost must be added.
670 DenseMap<AllocaInst *, int> SROAArgCosts;
671
672 /// Return true if \p Call is a cold callsite.
673 bool isColdCallSite(CallBase &Call, BlockFrequencyInfo *CallerBFI);
674
675 /// Update Threshold based on callsite properties such as callee
676 /// attributes and callee hotness for PGO builds. The Callee is explicitly
677 /// passed to support analyzing indirect calls whose target is inferred by
678 /// analysis.
679 void updateThreshold(CallBase &Call, Function &Callee);
680 /// Return a higher threshold if \p Call is a hot callsite.
681 std::optional<int> getHotCallSiteThreshold(CallBase &Call,
682 BlockFrequencyInfo *CallerBFI);
683
684 /// Handle a capped 'int' increment for Cost.
685 void addCost(int64_t Inc) {
686 Inc = std::clamp<int64_t>(val: Inc, INT_MIN, INT_MAX);
687 Cost = std::clamp<int64_t>(val: Inc + Cost, INT_MIN, INT_MAX);
688 }
689
690 void onDisableSROA(AllocaInst *Arg) override {
691 auto CostIt = SROAArgCosts.find(Val: Arg);
692 if (CostIt == SROAArgCosts.end())
693 return;
694 addCost(Inc: CostIt->second);
695 SROACostSavings -= CostIt->second;
696 SROACostSavingsLost += CostIt->second;
697 SROAArgCosts.erase(I: CostIt);
698 }
699
700 void onDisableLoadElimination() override {
701 addCost(Inc: LoadEliminationCost);
702 LoadEliminationCost = 0;
703 }
704
705 bool onCallBaseVisitStart(CallBase &Call) override {
706 if (std::optional<int> AttrCallThresholdBonus =
707 getStringFnAttrAsInt(CB&: Call, AttrKind: "call-threshold-bonus"))
708 Threshold += *AttrCallThresholdBonus;
709
710 if (std::optional<int> AttrCallCost =
711 getStringFnAttrAsInt(CB&: Call, AttrKind: "call-inline-cost")) {
712 addCost(Inc: *AttrCallCost);
713 // Prevent further processing of the call since we want to override its
714 // inline cost, not just add to it.
715 return false;
716 }
717 return true;
718 }
719
720 void onCallPenalty() override { addCost(Inc: CallPenalty); }
721
722 void onMemAccess() override { addCost(Inc: MemAccessCost); }
723
724 void onCallArgumentSetup(const CallBase &Call) override {
725 // Pay the price of the argument setup. We account for the average 1
726 // instruction per call argument setup here.
727 addCost(Inc: Call.arg_size() * InstrCost);
728 }
729 void onLoadRelativeIntrinsic() override {
730 // This is normally lowered to 4 LLVM instructions.
731 addCost(Inc: 3 * InstrCost);
732 }
733 void onLoweredCall(Function *F, CallBase &Call,
734 bool IsIndirectCall) override {
735 // We account for the average 1 instruction per call argument setup here.
736 addCost(Inc: Call.arg_size() * InstrCost);
737
738 // If we have a constant that we are calling as a function, we can peer
739 // through it and see the function target. This happens not infrequently
740 // during devirtualization and so we want to give it a hefty bonus for
741 // inlining, but cap that bonus in the event that inlining wouldn't pan out.
742 // Pretend to inline the function, with a custom threshold.
743 if (IsIndirectCall && BoostIndirectCalls) {
744 auto IndirectCallParams = Params;
745 IndirectCallParams.DefaultThreshold =
746 InlineConstants::IndirectCallThreshold;
747 /// FIXME: if InlineCostCallAnalyzer is derived from, this may need
748 /// to instantiate the derived class.
749 InlineCostCallAnalyzer CA(*F, Call, IndirectCallParams, TTI,
750 GetAssumptionCache, GetBFI, GetTLI, PSI, ORE,
751 false);
752 if (CA.analyze().isSuccess()) {
753 // We were able to inline the indirect call! Subtract the cost from the
754 // threshold to get the bonus we want to apply, but don't go below zero.
755 addCost(Inc: -std::max(a: 0, b: CA.getThreshold() - CA.getCost()));
756 }
757 } else
758 // Otherwise simply add the cost for merely making the call.
759 addCost(Inc: TTI.getInlineCallPenalty(F: CandidateCall.getCaller(), Call,
760 DefaultCallPenalty: CallPenalty));
761 }
762
763 void onFinalizeSwitch(unsigned JumpTableSize, unsigned NumCaseCluster,
764 bool DefaultDestUnreachable) override {
765 // If suitable for a jump table, consider the cost for the table size and
766 // branch to destination.
767 // Maximum valid cost increased in this function.
768 if (JumpTableSize) {
769 // Suppose a default branch includes one compare and one conditional
770 // branch if it's reachable.
771 if (!DefaultDestUnreachable)
772 addCost(Inc: 2 * InstrCost);
773 // Suppose a jump table requires one load and one jump instruction.
774 int64_t JTCost =
775 static_cast<int64_t>(JumpTableSize) * InstrCost + 2 * InstrCost;
776 addCost(Inc: JTCost);
777 return;
778 }
779
780 if (NumCaseCluster <= 3) {
781 // Suppose a comparison includes one compare and one conditional branch.
782 // We can reduce a set of instructions if the default branch is
783 // undefined.
784 addCost(Inc: (NumCaseCluster - DefaultDestUnreachable) * 2 * InstrCost);
785 return;
786 }
787
788 int64_t ExpectedNumberOfCompare =
789 getExpectedNumberOfCompare(NumCaseCluster);
790 int64_t SwitchCost = ExpectedNumberOfCompare * 2 * InstrCost;
791
792 addCost(Inc: SwitchCost);
793 }
794
795 // Parses the inline assembly argument to account for its cost. Inline
796 // assembly instructions incur higher costs for inlining since they cannot be
797 // analyzed and optimized.
798 void onInlineAsm(const InlineAsm &Arg) override {
799 if (!InlineAsmInstrCost)
800 return;
801 SmallVector<StringRef, 4> AsmStrs;
802 Arg.collectAsmStrs(AsmStrs);
803 int SectionLevel = 0;
804 int InlineAsmInstrCount = 0;
805 for (StringRef AsmStr : AsmStrs) {
806 // Trim whitespaces and comments.
807 StringRef Trimmed = AsmStr.trim();
808 size_t hashPos = Trimmed.find(C: '#');
809 if (hashPos != StringRef::npos)
810 Trimmed = Trimmed.substr(Start: 0, N: hashPos);
811 // Ignore comments.
812 if (Trimmed.empty())
813 continue;
814 // Filter out the outlined assembly instructions from the cost by keeping
815 // track of the section level and only accounting for instrutions at
816 // section level of zero. Note there will be duplication in outlined
817 // sections too, but is not accounted in the inlining cost model.
818 if (Trimmed.starts_with(Prefix: ".pushsection")) {
819 ++SectionLevel;
820 continue;
821 }
822 if (Trimmed.starts_with(Prefix: ".popsection")) {
823 --SectionLevel;
824 continue;
825 }
826 // Ignore directives and labels.
827 if (Trimmed.starts_with(Prefix: ".") || Trimmed.contains(Other: ":"))
828 continue;
829 if (SectionLevel == 0)
830 ++InlineAsmInstrCount;
831 }
832 NumInlineAsmInstructions += InlineAsmInstrCount;
833 addCost(Inc: InlineAsmInstrCount * InlineAsmInstrCost);
834 }
835
836 void onMissedSimplification() override { addCost(Inc: InstrCost); }
837
838 void onInitializeSROAArg(AllocaInst *Arg) override {
839 assert(Arg != nullptr &&
840 "Should not initialize SROA costs for null value.");
841 auto SROAArgCost = TTI.getCallerAllocaCost(CB: &CandidateCall, AI: Arg);
842 SROACostSavings += SROAArgCost;
843 SROAArgCosts[Arg] = SROAArgCost;
844 }
845
846 void onAggregateSROAUse(AllocaInst *SROAArg) override {
847 auto CostIt = SROAArgCosts.find(Val: SROAArg);
848 assert(CostIt != SROAArgCosts.end() &&
849 "expected this argument to have a cost");
850 CostIt->second += InstrCost;
851 SROACostSavings += InstrCost;
852 }
853
854 void onBlockStart(const BasicBlock *BB) override { CostAtBBStart = Cost; }
855
856 void onBlockAnalyzed(const BasicBlock *BB) override {
857 if (CostBenefitAnalysisEnabled) {
858 // Keep track of the static size of live but cold basic blocks. For now,
859 // we define a cold basic block to be one that's never executed.
860 assert(GetBFI && "GetBFI must be available");
861 BlockFrequencyInfo *BFI = &(GetBFI(F));
862 assert(BFI && "BFI must be available");
863 auto ProfileCount = BFI->getBlockProfileCount(BB);
864 if (*ProfileCount == 0)
865 ColdSize += Cost - CostAtBBStart;
866 }
867
868 auto *TI = BB->getTerminator();
869 // If we had any successors at this point, than post-inlining is likely to
870 // have them as well. Note that we assume any basic blocks which existed
871 // due to branches or switches which folded above will also fold after
872 // inlining.
873 if (SingleBB && TI->getNumSuccessors() > 1) {
874 // Take off the bonus we applied to the threshold.
875 Threshold -= SingleBBBonus;
876 SingleBB = false;
877 }
878 }
879
880 void onInstructionAnalysisStart(const Instruction *I) override {
881 // This function is called to store the initial cost of inlining before
882 // the given instruction was assessed.
883 if (!PrintInstructionComments)
884 return;
885 auto &CostDetail = InstructionCostDetailMap[I];
886 CostDetail.CostBefore = Cost;
887 CostDetail.ThresholdBefore = Threshold;
888 }
889
890 void onInstructionAnalysisFinish(const Instruction *I) override {
891 // This function is called to find new values of cost and threshold after
892 // the instruction has been assessed.
893 if (!PrintInstructionComments)
894 return;
895 auto &CostDetail = InstructionCostDetailMap[I];
896 CostDetail.CostAfter = Cost;
897 CostDetail.ThresholdAfter = Threshold;
898 }
899
900 bool isCostBenefitAnalysisEnabled() {
901 if (!PSI || !PSI->hasProfileSummary())
902 return false;
903
904 if (!GetBFI)
905 return false;
906
907 if (InlineEnableCostBenefitAnalysis.getNumOccurrences()) {
908 // Honor the explicit request from the user.
909 if (!InlineEnableCostBenefitAnalysis)
910 return false;
911 } else {
912 // Otherwise, require instrumentation profile.
913 if (!PSI->hasInstrumentationProfile())
914 return false;
915 }
916
917 auto *Caller = CandidateCall.getParent()->getParent();
918 if (!Caller->getEntryCount())
919 return false;
920
921 BlockFrequencyInfo *CallerBFI = &(GetBFI(*Caller));
922 if (!CallerBFI)
923 return false;
924
925 // For now, limit to hot call site.
926 if (!PSI->isHotCallSite(CB: CandidateCall, BFI: CallerBFI))
927 return false;
928
929 // Make sure we have a nonzero entry count.
930 auto EntryCount = F.getEntryCount();
931 if (!EntryCount || *EntryCount == 0)
932 return false;
933
934 BlockFrequencyInfo *CalleeBFI = &(GetBFI(F));
935 if (!CalleeBFI)
936 return false;
937
938 return true;
939 }
940
941 // A helper function to choose between command line override and default.
942 unsigned getInliningCostBenefitAnalysisSavingsMultiplier() const {
943 if (InlineSavingsMultiplier.getNumOccurrences())
944 return InlineSavingsMultiplier;
945 return TTI.getInliningCostBenefitAnalysisSavingsMultiplier();
946 }
947
948 // A helper function to choose between command line override and default.
949 unsigned getInliningCostBenefitAnalysisProfitableMultiplier() const {
950 if (InlineSavingsProfitableMultiplier.getNumOccurrences())
951 return InlineSavingsProfitableMultiplier;
952 return TTI.getInliningCostBenefitAnalysisProfitableMultiplier();
953 }
954
955 void OverrideCycleSavingsAndSizeForTesting(APInt &CycleSavings, int &Size) {
956 if (std::optional<int> AttrCycleSavings = getStringFnAttrAsInt(
957 CB&: CandidateCall, AttrKind: "inline-cycle-savings-for-test")) {
958 CycleSavings = *AttrCycleSavings;
959 }
960
961 if (std::optional<int> AttrRuntimeCost = getStringFnAttrAsInt(
962 CB&: CandidateCall, AttrKind: "inline-runtime-cost-for-test")) {
963 Size = *AttrRuntimeCost;
964 }
965 }
966
967 // Determine whether we should inline the given call site, taking into account
968 // both the size cost and the cycle savings. Return std::nullopt if we don't
969 // have sufficient profiling information to determine.
970 std::optional<bool> costBenefitAnalysis() {
971 if (!CostBenefitAnalysisEnabled)
972 return std::nullopt;
973
974 // buildInlinerPipeline in the pass builder sets HotCallSiteThreshold to 0
975 // for the prelink phase of the AutoFDO + ThinLTO build. Honor the logic by
976 // falling back to the cost-based metric.
977 // TODO: Improve this hacky condition.
978 if (Threshold == 0)
979 return std::nullopt;
980
981 assert(GetBFI);
982 BlockFrequencyInfo *CalleeBFI = &(GetBFI(F));
983 assert(CalleeBFI);
984
985 // The cycle savings expressed as the sum of InstrCost
986 // multiplied by the estimated dynamic count of each instruction we can
987 // avoid. Savings come from the call site cost, such as argument setup and
988 // the call instruction, as well as the instructions that are folded.
989 //
990 // We use 128-bit APInt here to avoid potential overflow. This variable
991 // should stay well below 10^^24 (or 2^^80) in practice. This "worst" case
992 // assumes that we can avoid or fold a billion instructions, each with a
993 // profile count of 10^^15 -- roughly the number of cycles for a 24-hour
994 // period on a 4GHz machine.
995 APInt CycleSavings(128, 0);
996
997 for (auto &BB : F) {
998 APInt CurrentSavings(128, 0);
999 for (auto &I : BB) {
1000 if (CondBrInst *BI = dyn_cast<CondBrInst>(Val: &I)) {
1001 // Count a conditional branch as savings if it becomes unconditional.
1002 if (getSimplifiedValue<ConstantInt>(V: BI->getCondition()))
1003 CurrentSavings += InstrCost;
1004 } else if (SwitchInst *SI = dyn_cast<SwitchInst>(Val: &I)) {
1005 if (getSimplifiedValue<ConstantInt>(V: SI->getCondition()))
1006 CurrentSavings += InstrCost;
1007 } else if (SimplifiedValues.count(Val: &I)) {
1008 // Count an instruction as savings if we can fold it.
1009 CurrentSavings += InstrCost;
1010 }
1011 }
1012
1013 auto ProfileCount = CalleeBFI->getBlockProfileCount(BB: &BB);
1014 CurrentSavings *= *ProfileCount;
1015 CycleSavings += CurrentSavings;
1016 }
1017
1018 // Compute the cycle savings per call.
1019 auto EntryProfileCount = F.getEntryCount();
1020 assert(EntryProfileCount && *EntryProfileCount);
1021 CycleSavings += *EntryProfileCount / 2;
1022 CycleSavings = CycleSavings.udiv(RHS: *EntryProfileCount);
1023
1024 // Compute the total savings for the call site.
1025 auto *CallerBB = CandidateCall.getParent();
1026 BlockFrequencyInfo *CallerBFI = &(GetBFI(*(CallerBB->getParent())));
1027 CycleSavings += getCallsiteCost(TTI, Call: this->CandidateCall, DL);
1028 CycleSavings *= *CallerBFI->getBlockProfileCount(BB: CallerBB);
1029
1030 // Remove the cost of the cold basic blocks to model the runtime cost more
1031 // accurately. Both machine block placement and function splitting could
1032 // place cold blocks further from hot blocks.
1033 int Size = Cost - ColdSize;
1034
1035 // Allow tiny callees to be inlined regardless of whether they meet the
1036 // savings threshold.
1037 Size = Size > InlineSizeAllowance ? Size - InlineSizeAllowance : 1;
1038
1039 OverrideCycleSavingsAndSizeForTesting(CycleSavings, Size);
1040 CostBenefit.emplace(args: APInt(128, Size), args&: CycleSavings);
1041
1042 // Let R be the ratio of CycleSavings to Size. We accept the inlining
1043 // opportunity if R is really high and reject if R is really low. If R is
1044 // somewhere in the middle, we fall back to the cost-based analysis.
1045 //
1046 // Specifically, let R = CycleSavings / Size, we accept the inlining
1047 // opportunity if:
1048 //
1049 // PSI->getOrCompHotCountThreshold()
1050 // R > -------------------------------------------------
1051 // getInliningCostBenefitAnalysisSavingsMultiplier()
1052 //
1053 // and reject the inlining opportunity if:
1054 //
1055 // PSI->getOrCompHotCountThreshold()
1056 // R <= ----------------------------------------------------
1057 // getInliningCostBenefitAnalysisProfitableMultiplier()
1058 //
1059 // Otherwise, we fall back to the cost-based analysis.
1060 //
1061 // Implementation-wise, use multiplication (CycleSavings * Multiplier,
1062 // HotCountThreshold * Size) rather than division to avoid precision loss.
1063 APInt Threshold(128, PSI->getOrCompHotCountThreshold());
1064 Threshold *= Size;
1065
1066 APInt UpperBoundCycleSavings = CycleSavings;
1067 UpperBoundCycleSavings *= getInliningCostBenefitAnalysisSavingsMultiplier();
1068 if (UpperBoundCycleSavings.uge(RHS: Threshold))
1069 return true;
1070
1071 APInt LowerBoundCycleSavings = CycleSavings;
1072 LowerBoundCycleSavings *=
1073 getInliningCostBenefitAnalysisProfitableMultiplier();
1074 if (LowerBoundCycleSavings.ult(RHS: Threshold))
1075 return false;
1076
1077 // Otherwise, fall back to the cost-based analysis.
1078 return std::nullopt;
1079 }
1080
1081 InlineResult finalizeAnalysis() override {
1082 // Loops generally act a lot like calls in that they act like barriers to
1083 // movement, require a certain amount of setup, etc. So when optimising for
1084 // size, we penalise any call sites that perform loops. We do this after all
1085 // other costs here, so will likely only be dealing with relatively small
1086 // functions (and hence DT and LI will hopefully be cheap).
1087 auto *Caller = CandidateCall.getFunction();
1088 if (Caller->hasMinSize()) {
1089 DominatorTree DT(F);
1090 LoopInfo LI(DT);
1091 int NumLoops = 0;
1092 for (Loop *L : LI) {
1093 // Ignore loops that will not be executed
1094 if (DeadBlocks.count(Ptr: L->getHeader()))
1095 continue;
1096 NumLoops++;
1097 }
1098 addCost(Inc: NumLoops * InlineConstants::LoopPenalty);
1099 }
1100
1101 // We applied the maximum possible vector bonus at the beginning. Now,
1102 // subtract the excess bonus, if any, from the Threshold before
1103 // comparing against Cost.
1104 if (NumVectorInstructions <= NumInstructions / 10)
1105 Threshold -= VectorBonus;
1106 else if (NumVectorInstructions <= NumInstructions / 2)
1107 Threshold -= VectorBonus / 2;
1108
1109 if (std::optional<int> AttrCost =
1110 getStringFnAttrAsInt(CB&: CandidateCall, AttrKind: "function-inline-cost"))
1111 Cost = *AttrCost;
1112
1113 if (std::optional<int> AttrCostMult = getStringFnAttrAsInt(
1114 CB&: CandidateCall,
1115 AttrKind: InlineConstants::FunctionInlineCostMultiplierAttributeName))
1116 Cost *= *AttrCostMult;
1117
1118 if (std::optional<int> AttrThreshold =
1119 getStringFnAttrAsInt(CB&: CandidateCall, AttrKind: "function-inline-threshold"))
1120 Threshold = *AttrThreshold;
1121
1122 if (auto Result = costBenefitAnalysis()) {
1123 DecidedByCostBenefit = true;
1124 if (*Result)
1125 return InlineResult::success();
1126 else
1127 return InlineResult::failure(Reason: "Cost over threshold.");
1128 }
1129
1130 if (IgnoreThreshold)
1131 return InlineResult::success();
1132
1133 DecidedByCostThreshold = true;
1134 return Cost < std::max(a: 1, b: Threshold)
1135 ? InlineResult::success()
1136 : InlineResult::failure(Reason: "Cost over threshold.");
1137 }
1138
1139 bool shouldStop() override {
1140 if (IgnoreThreshold || ComputeFullInlineCost)
1141 return false;
1142 // Bail out the moment we cross the threshold. This means we'll under-count
1143 // the cost, but only when undercounting doesn't matter.
1144 if (Cost < Threshold)
1145 return false;
1146 DecidedByCostThreshold = true;
1147 return true;
1148 }
1149
1150 void onLoadEliminationOpportunity() override {
1151 LoadEliminationCost += InstrCost;
1152 }
1153
1154 InlineResult onAnalysisStart() override {
1155 // Perform some tweaks to the cost and threshold based on the direct
1156 // callsite information.
1157
1158 // We want to more aggressively inline vector-dense kernels, so up the
1159 // threshold, and we'll lower it if the % of vector instructions gets too
1160 // low. Note that these bonuses are some what arbitrary and evolved over
1161 // time by accident as much as because they are principled bonuses.
1162 //
1163 // FIXME: It would be nice to remove all such bonuses. At least it would be
1164 // nice to base the bonus values on something more scientific.
1165 assert(NumInstructions == 0);
1166 assert(NumVectorInstructions == 0);
1167
1168 // Update the threshold based on callsite properties
1169 updateThreshold(Call&: CandidateCall, Callee&: F);
1170
1171 // While Threshold depends on commandline options that can take negative
1172 // values, we want to enforce the invariant that the computed threshold and
1173 // bonuses are non-negative.
1174 assert(Threshold >= 0);
1175 assert(SingleBBBonus >= 0);
1176 assert(VectorBonus >= 0);
1177
1178 // Speculatively apply all possible bonuses to Threshold. If cost exceeds
1179 // this Threshold any time, and cost cannot decrease, we can stop processing
1180 // the rest of the function body.
1181 Threshold += (SingleBBBonus + VectorBonus);
1182
1183 // Give out bonuses for the callsite, as the instructions setting them up
1184 // will be gone after inlining.
1185 addCost(Inc: -getCallsiteCost(TTI, Call: this->CandidateCall, DL));
1186
1187 // If this function uses the coldcc calling convention, prefer not to inline
1188 // it.
1189 if (F.getCallingConv() == CallingConv::Cold)
1190 addCost(Inc: InlineConstants::ColdccPenalty);
1191
1192 LLVM_DEBUG(dbgs() << " Initial cost: " << Cost << "\n");
1193
1194 // Check if we're done. This can happen due to bonuses and penalties.
1195 if (Cost >= Threshold && !ComputeFullInlineCost)
1196 return InlineResult::failure(Reason: "high cost");
1197
1198 return InlineResult::success();
1199 }
1200
1201public:
1202 InlineCostCallAnalyzer(
1203 Function &Callee, CallBase &Call, const InlineParams &Params,
1204 const TargetTransformInfo &TTI,
1205 function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
1206 function_ref<BlockFrequencyInfo &(Function &)> GetBFI = nullptr,
1207 function_ref<const TargetLibraryInfo &(Function &)> GetTLI = nullptr,
1208 ProfileSummaryInfo *PSI = nullptr,
1209 OptimizationRemarkEmitter *ORE = nullptr, bool BoostIndirect = true,
1210 bool IgnoreThreshold = false,
1211 function_ref<EphemeralValuesCache &(Function &)> GetEphValuesCache =
1212 nullptr)
1213 : CallAnalyzer(Callee, Call, TTI, GetAssumptionCache, GetBFI, GetTLI, PSI,
1214 ORE, GetEphValuesCache),
1215 ComputeFullInlineCost(OptComputeFullInlineCost ||
1216 Params.ComputeFullInlineCost || ORE ||
1217 isCostBenefitAnalysisEnabled()),
1218 Params(Params), Threshold(Params.DefaultThreshold),
1219 BoostIndirectCalls(BoostIndirect), IgnoreThreshold(IgnoreThreshold),
1220 CostBenefitAnalysisEnabled(isCostBenefitAnalysisEnabled()),
1221 Writer(this) {
1222 AllowRecursiveCall = *Params.AllowRecursiveCall;
1223 }
1224
1225 /// Annotation Writer for instruction details
1226 InlineCostAnnotationWriter Writer;
1227
1228 void dump();
1229
1230 // Prints the same analysis as dump(), but its definition is not dependent
1231 // on the build.
1232 void print(raw_ostream &OS);
1233
1234 std::optional<InstructionCostDetail> getCostDetails(const Instruction *I) {
1235 auto It = InstructionCostDetailMap.find(Val: I);
1236 if (It != InstructionCostDetailMap.end())
1237 return It->second;
1238 return std::nullopt;
1239 }
1240
1241 ~InlineCostCallAnalyzer() override = default;
1242 int getThreshold() const { return Threshold; }
1243 int getCost() const { return Cost; }
1244 int getStaticBonusApplied() const { return StaticBonusApplied; }
1245 std::optional<CostBenefitPair> getCostBenefitPair() { return CostBenefit; }
1246 bool wasDecidedByCostBenefit() const { return DecidedByCostBenefit; }
1247 bool wasDecidedByCostThreshold() const { return DecidedByCostThreshold; }
1248};
1249
1250// Return true if CB is the sole call to local function Callee.
1251static bool isSoleCallToLocalFunction(const CallBase &CB,
1252 const Function &Callee) {
1253 return Callee.hasLocalLinkage() && Callee.hasOneLiveUse() &&
1254 &Callee == CB.getCalledFunction();
1255}
1256
1257class InlineCostFeaturesAnalyzer final : public CallAnalyzer {
1258private:
1259 InlineCostFeatures Cost = {};
1260
1261 // FIXME: These constants are taken from the heuristic-based cost visitor.
1262 // These should be removed entirely in a later revision to avoid reliance on
1263 // heuristics in the ML inliner.
1264 static constexpr int JTCostMultiplier = 2;
1265 static constexpr int CaseClusterCostMultiplier = 2;
1266 static constexpr int SwitchDefaultDestCostMultiplier = 2;
1267 static constexpr int SwitchCostMultiplier = 2;
1268
1269 // FIXME: These are taken from the heuristic-based cost visitor: we should
1270 // eventually abstract these to the CallAnalyzer to avoid duplication.
1271 unsigned SROACostSavingOpportunities = 0;
1272 int VectorBonus = 0;
1273 int SingleBBBonus = 0;
1274 int Threshold = 5;
1275
1276 DenseMap<AllocaInst *, unsigned> SROACosts;
1277
1278 void increment(InlineCostFeatureIndex Feature, int64_t Delta = 1) {
1279 Cost[static_cast<size_t>(Feature)] += Delta;
1280 }
1281
1282 void set(InlineCostFeatureIndex Feature, int64_t Value) {
1283 Cost[static_cast<size_t>(Feature)] = Value;
1284 }
1285
1286 void onDisableSROA(AllocaInst *Arg) override {
1287 auto CostIt = SROACosts.find(Val: Arg);
1288 if (CostIt == SROACosts.end())
1289 return;
1290
1291 increment(Feature: InlineCostFeatureIndex::sroa_losses, Delta: CostIt->second);
1292 SROACostSavingOpportunities -= CostIt->second;
1293 SROACosts.erase(I: CostIt);
1294 }
1295
1296 void onDisableLoadElimination() override {
1297 set(Feature: InlineCostFeatureIndex::load_elimination, Value: 1);
1298 }
1299
1300 void onCallPenalty() override {
1301 increment(Feature: InlineCostFeatureIndex::call_penalty, Delta: CallPenalty);
1302 }
1303
1304 void onCallArgumentSetup(const CallBase &Call) override {
1305 increment(Feature: InlineCostFeatureIndex::call_argument_setup,
1306 Delta: Call.arg_size() * InstrCost);
1307 }
1308
1309 void onLoadRelativeIntrinsic() override {
1310 increment(Feature: InlineCostFeatureIndex::load_relative_intrinsic, Delta: 3 * InstrCost);
1311 }
1312
1313 void onLoweredCall(Function *F, CallBase &Call,
1314 bool IsIndirectCall) override {
1315 increment(Feature: InlineCostFeatureIndex::lowered_call_arg_setup,
1316 Delta: Call.arg_size() * InstrCost);
1317
1318 if (IsIndirectCall) {
1319 InlineParams IndirectCallParams = {/* DefaultThreshold*/ 0,
1320 /*HintThreshold*/ {},
1321 /*OptSizeHintThreshold*/ {},
1322 /*ColdThreshold*/ {},
1323 /*OptSizeThreshold*/ {},
1324 /*OptMinSizeThreshold*/ {},
1325 /*HotCallSiteThreshold*/ {},
1326 /*LocallyHotCallSiteThreshold*/ {},
1327 /*ColdCallSiteThreshold*/ {},
1328 /*ComputeFullInlineCost*/ true,
1329 /*EnableDeferral*/ true};
1330 IndirectCallParams.DefaultThreshold =
1331 InlineConstants::IndirectCallThreshold;
1332
1333 InlineCostCallAnalyzer CA(*F, Call, IndirectCallParams, TTI,
1334 GetAssumptionCache, GetBFI, GetTLI, PSI, ORE,
1335 false, true);
1336 if (CA.analyze().isSuccess()) {
1337 increment(Feature: InlineCostFeatureIndex::nested_inline_cost_estimate,
1338 Delta: CA.getCost());
1339 increment(Feature: InlineCostFeatureIndex::nested_inlines, Delta: 1);
1340 }
1341 } else {
1342 onCallPenalty();
1343 }
1344 }
1345
1346 void onFinalizeSwitch(unsigned JumpTableSize, unsigned NumCaseCluster,
1347 bool DefaultDestUnreachable) override {
1348 if (JumpTableSize) {
1349 if (!DefaultDestUnreachable)
1350 increment(Feature: InlineCostFeatureIndex::switch_default_dest_penalty,
1351 Delta: SwitchDefaultDestCostMultiplier * InstrCost);
1352 int64_t JTCost = static_cast<int64_t>(JumpTableSize) * InstrCost +
1353 JTCostMultiplier * InstrCost;
1354 increment(Feature: InlineCostFeatureIndex::jump_table_penalty, Delta: JTCost);
1355 return;
1356 }
1357
1358 if (NumCaseCluster <= 3) {
1359 increment(Feature: InlineCostFeatureIndex::case_cluster_penalty,
1360 Delta: (NumCaseCluster - DefaultDestUnreachable) *
1361 CaseClusterCostMultiplier * InstrCost);
1362 return;
1363 }
1364
1365 int64_t ExpectedNumberOfCompare =
1366 getExpectedNumberOfCompare(NumCaseCluster);
1367
1368 int64_t SwitchCost =
1369 ExpectedNumberOfCompare * SwitchCostMultiplier * InstrCost;
1370 increment(Feature: InlineCostFeatureIndex::switch_penalty, Delta: SwitchCost);
1371 }
1372
1373 void onMissedSimplification() override {
1374 increment(Feature: InlineCostFeatureIndex::unsimplified_common_instructions,
1375 Delta: InstrCost);
1376 }
1377
1378 void onInitializeSROAArg(AllocaInst *Arg) override {
1379 auto SROAArgCost = TTI.getCallerAllocaCost(CB: &CandidateCall, AI: Arg);
1380 SROACosts[Arg] = SROAArgCost;
1381 SROACostSavingOpportunities += SROAArgCost;
1382 }
1383
1384 void onAggregateSROAUse(AllocaInst *Arg) override {
1385 SROACosts.find(Val: Arg)->second += InstrCost;
1386 SROACostSavingOpportunities += InstrCost;
1387 }
1388
1389 void onBlockAnalyzed(const BasicBlock *BB) override {
1390 if (BB->getTerminator()->getNumSuccessors() > 1)
1391 set(Feature: InlineCostFeatureIndex::is_multiple_blocks, Value: 1);
1392 Threshold -= SingleBBBonus;
1393 }
1394
1395 InlineResult finalizeAnalysis() override {
1396 auto *Caller = CandidateCall.getFunction();
1397 if (Caller->hasMinSize()) {
1398 DominatorTree DT(F);
1399 LoopInfo LI(DT);
1400 for (Loop *L : LI) {
1401 // Ignore loops that will not be executed
1402 if (DeadBlocks.count(Ptr: L->getHeader()))
1403 continue;
1404 increment(Feature: InlineCostFeatureIndex::num_loops,
1405 Delta: InlineConstants::LoopPenalty);
1406 }
1407 }
1408 set(Feature: InlineCostFeatureIndex::dead_blocks, Value: DeadBlocks.size());
1409 set(Feature: InlineCostFeatureIndex::simplified_instructions,
1410 Value: NumInstructionsSimplified);
1411 set(Feature: InlineCostFeatureIndex::constant_args, Value: NumConstantArgs);
1412 set(Feature: InlineCostFeatureIndex::constant_offset_ptr_args,
1413 Value: NumConstantOffsetPtrArgs);
1414 set(Feature: InlineCostFeatureIndex::sroa_savings, Value: SROACostSavingOpportunities);
1415
1416 if (NumVectorInstructions <= NumInstructions / 10)
1417 Threshold -= VectorBonus;
1418 else if (NumVectorInstructions <= NumInstructions / 2)
1419 Threshold -= VectorBonus / 2;
1420
1421 set(Feature: InlineCostFeatureIndex::threshold, Value: Threshold);
1422
1423 return InlineResult::success();
1424 }
1425
1426 bool shouldStop() override { return false; }
1427
1428 void onLoadEliminationOpportunity() override {
1429 increment(Feature: InlineCostFeatureIndex::load_elimination, Delta: 1);
1430 }
1431
1432 InlineResult onAnalysisStart() override {
1433 increment(Feature: InlineCostFeatureIndex::callsite_cost,
1434 Delta: -1 * getCallsiteCost(TTI, Call: this->CandidateCall, DL));
1435
1436 set(Feature: InlineCostFeatureIndex::cold_cc_penalty,
1437 Value: (F.getCallingConv() == CallingConv::Cold));
1438
1439 set(Feature: InlineCostFeatureIndex::last_call_to_static_bonus,
1440 Value: isSoleCallToLocalFunction(CB: CandidateCall, Callee: F));
1441
1442 // FIXME: we shouldn't repeat this logic in both the Features and Cost
1443 // analyzer - instead, we should abstract it to a common method in the
1444 // CallAnalyzer
1445 int SingleBBBonusPercent = 50;
1446 int VectorBonusPercent = TTI.getInlinerVectorBonusPercent();
1447 Threshold += TTI.adjustInliningThreshold(CB: &CandidateCall);
1448 Threshold *= TTI.getInliningThresholdMultiplier();
1449 SingleBBBonus = Threshold * SingleBBBonusPercent / 100;
1450 VectorBonus = Threshold * VectorBonusPercent / 100;
1451 Threshold += (SingleBBBonus + VectorBonus);
1452
1453 return InlineResult::success();
1454 }
1455
1456public:
1457 InlineCostFeaturesAnalyzer(
1458 const TargetTransformInfo &TTI,
1459 function_ref<AssumptionCache &(Function &)> &GetAssumptionCache,
1460 function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
1461 function_ref<const TargetLibraryInfo &(Function &)> GetTLI,
1462 ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE, Function &Callee,
1463 CallBase &Call)
1464 : CallAnalyzer(Callee, Call, TTI, GetAssumptionCache, GetBFI, GetTLI,
1465 PSI) {}
1466
1467 const InlineCostFeatures &features() const { return Cost; }
1468};
1469
1470} // namespace
1471
1472/// Test whether the given value is an Alloca-derived function argument.
1473bool CallAnalyzer::isAllocaDerivedArg(Value *V) {
1474 return SROAArgValues.count(Val: V);
1475}
1476
1477void CallAnalyzer::disableSROAForArg(AllocaInst *SROAArg) {
1478 onDisableSROA(Arg: SROAArg);
1479 EnabledSROAAllocas.erase(V: SROAArg);
1480 disableLoadElimination();
1481}
1482
1483void InlineCostAnnotationWriter::emitInstructionAnnot(
1484 const Instruction *I, formatted_raw_ostream &OS) {
1485 // The cost of inlining of the given instruction is printed always.
1486 // The threshold delta is printed only when it is non-zero. It happens
1487 // when we decided to give a bonus at a particular instruction.
1488 std::optional<InstructionCostDetail> Record = ICCA->getCostDetails(I);
1489 if (!Record)
1490 OS << "; No analysis for the instruction";
1491 else {
1492 OS << "; cost before = " << Record->CostBefore
1493 << ", cost after = " << Record->CostAfter
1494 << ", threshold before = " << Record->ThresholdBefore
1495 << ", threshold after = " << Record->ThresholdAfter << ", ";
1496 OS << "cost delta = " << Record->getCostDelta();
1497 if (Record->hasThresholdChanged())
1498 OS << ", threshold delta = " << Record->getThresholdDelta();
1499 }
1500 auto *V = ICCA->getSimplifiedValueUnchecked(V: const_cast<Instruction *>(I));
1501 if (V) {
1502 OS << ", simplified to ";
1503 V->print(O&: OS, IsForDebug: true);
1504 if (auto *VI = dyn_cast<Instruction>(Val: V)) {
1505 if (VI->getFunction() != I->getFunction())
1506 OS << " (caller instruction)";
1507 } else if (auto *VArg = dyn_cast<Argument>(Val: V)) {
1508 if (VArg->getParent() != I->getFunction())
1509 OS << " (caller argument)";
1510 }
1511 }
1512 OS << "\n";
1513}
1514
1515/// If 'V' maps to a SROA candidate, disable SROA for it.
1516void CallAnalyzer::disableSROA(Value *V) {
1517 if (auto *SROAArg = getSROAArgForValueOrNull(V)) {
1518 disableSROAForArg(SROAArg);
1519 }
1520}
1521
1522void CallAnalyzer::disableLoadElimination() {
1523 if (EnableLoadElimination) {
1524 onDisableLoadElimination();
1525 EnableLoadElimination = false;
1526 }
1527}
1528
1529/// Accumulate a constant GEP offset into an APInt if possible.
1530///
1531/// Returns false if unable to compute the offset for any reason. Respects any
1532/// simplified values known during the analysis of this callsite.
1533bool CallAnalyzer::accumulateGEPOffset(GEPOperator &GEP, APInt &Offset) {
1534 unsigned IntPtrWidth = DL.getIndexTypeSizeInBits(Ty: GEP.getType());
1535 assert(IntPtrWidth == Offset.getBitWidth());
1536
1537 for (gep_type_iterator GTI = gep_type_begin(GEP), GTE = gep_type_end(GEP);
1538 GTI != GTE; ++GTI) {
1539 ConstantInt *OpC =
1540 getDirectOrSimplifiedValue<ConstantInt>(V: GTI.getOperand());
1541 if (!OpC)
1542 return false;
1543 if (OpC->isZero())
1544 continue;
1545
1546 // Handle a struct index, which adds its field offset to the pointer.
1547 if (StructType *STy = GTI.getStructTypeOrNull()) {
1548 unsigned ElementIdx = OpC->getZExtValue();
1549 const StructLayout *SL = DL.getStructLayout(Ty: STy);
1550 Offset += APInt(IntPtrWidth, SL->getElementOffset(Idx: ElementIdx));
1551 continue;
1552 }
1553
1554 APInt TypeSize(IntPtrWidth, GTI.getSequentialElementStride(DL));
1555 Offset += OpC->getValue().sextOrTrunc(width: IntPtrWidth) * TypeSize;
1556 }
1557 return true;
1558}
1559
1560/// Use TTI to check whether a GEP is free.
1561///
1562/// Respects any simplified values known during the analysis of this callsite.
1563bool CallAnalyzer::isGEPFree(GetElementPtrInst &GEP) {
1564 SmallVector<Value *, 4> Operands;
1565 Operands.push_back(Elt: GEP.getOperand(i_nocapture: 0));
1566 for (const Use &Op : GEP.indices())
1567 if (Constant *SimpleOp = getSimplifiedValue<Constant>(V: Op))
1568 Operands.push_back(Elt: SimpleOp);
1569 else
1570 Operands.push_back(Elt: Op);
1571 return TTI.getInstructionCost(U: &GEP, Operands,
1572 CostKind: TargetTransformInfo::TCK_SizeAndLatency) ==
1573 TargetTransformInfo::TCC_Free;
1574}
1575
1576bool CallAnalyzer::visitAlloca(AllocaInst &I) {
1577 disableSROA(V: I.getOperand(i_nocapture: 0));
1578
1579 // Check whether inlining will turn a dynamic alloca into a static
1580 // alloca and handle that case.
1581 if (I.isArrayAllocation()) {
1582 Constant *Size = getSimplifiedValue<Constant>(V: I.getArraySize());
1583 if (auto *AllocSize = dyn_cast_or_null<ConstantInt>(Val: Size)) {
1584 // Sometimes a dynamic alloca could be converted into a static alloca
1585 // after this constant prop, and become a huge static alloca on an
1586 // unconditional CFG path. Avoid inlining if this is going to happen above
1587 // a threshold.
1588 // FIXME: If the threshold is removed or lowered too much, we could end up
1589 // being too pessimistic and prevent inlining non-problematic code. This
1590 // could result in unintended perf regressions. A better overall strategy
1591 // is needed to track stack usage during inlining.
1592 Type *Ty = I.getAllocatedType();
1593 AllocatedSize = SaturatingMultiplyAdd(
1594 X: AllocSize->getLimitedValue(),
1595 Y: DL.getTypeAllocSize(Ty).getKnownMinValue(), A: AllocatedSize);
1596 if (AllocatedSize > InlineConstants::MaxSimplifiedDynamicAllocaToInline)
1597 HasDynamicAlloca = true;
1598 return false;
1599 }
1600 }
1601
1602 if (I.isStaticAlloca()) {
1603 // Accumulate the allocated size if constant and executed once.
1604 // Note: if AllocSize is a vscale value, this is an underestimate of the
1605 // allocated size, and it also requires some of the cost of a dynamic
1606 // alloca, but is recorded here as a constant size alloca.
1607 TypeSize AllocSize = I.getAllocationSize(DL).value_or(u: TypeSize::getZero());
1608 AllocatedSize = SaturatingAdd(X: AllocSize.getKnownMinValue(), Y: AllocatedSize);
1609 } else {
1610 // FIXME: This is overly conservative. Dynamic allocas are inefficient for
1611 // a variety of reasons, and so we would like to not inline them into
1612 // functions which don't currently have a dynamic alloca. This simply
1613 // disables inlining altogether in the presence of a dynamic alloca.
1614 HasDynamicAlloca = true;
1615 }
1616
1617 return false;
1618}
1619
1620bool CallAnalyzer::visitPHI(PHINode &I) {
1621 // FIXME: We need to propagate SROA *disabling* through phi nodes, even
1622 // though we don't want to propagate it's bonuses. The idea is to disable
1623 // SROA if it *might* be used in an inappropriate manner.
1624
1625 // Phi nodes are always zero-cost.
1626 // FIXME: Pointer sizes may differ between different address spaces, so do we
1627 // need to use correct address space in the call to getPointerSizeInBits here?
1628 // Or could we skip the getPointerSizeInBits call completely? As far as I can
1629 // see the ZeroOffset is used as a dummy value, so we can probably use any
1630 // bit width for the ZeroOffset?
1631 APInt ZeroOffset = APInt::getZero(numBits: DL.getPointerSizeInBits(AS: 0));
1632 bool CheckSROA = I.getType()->isPointerTy();
1633
1634 // Track the constant or pointer with constant offset we've seen so far.
1635 Constant *FirstC = nullptr;
1636 std::pair<Value *, APInt> FirstBaseAndOffset = {nullptr, ZeroOffset};
1637 Value *FirstV = nullptr;
1638
1639 for (unsigned i = 0, e = I.getNumIncomingValues(); i != e; ++i) {
1640 BasicBlock *Pred = I.getIncomingBlock(i);
1641 // If the incoming block is dead, skip the incoming block.
1642 if (DeadBlocks.count(Ptr: Pred))
1643 continue;
1644 // If the parent block of phi is not the known successor of the incoming
1645 // block, skip the incoming block.
1646 BasicBlock *KnownSuccessor = KnownSuccessors[Pred];
1647 if (KnownSuccessor && KnownSuccessor != I.getParent())
1648 continue;
1649
1650 Value *V = I.getIncomingValue(i);
1651 // If the incoming value is this phi itself, skip the incoming value.
1652 if (&I == V)
1653 continue;
1654
1655 Constant *C = getDirectOrSimplifiedValue<Constant>(V);
1656
1657 std::pair<Value *, APInt> BaseAndOffset = {nullptr, ZeroOffset};
1658 if (!C && CheckSROA)
1659 BaseAndOffset = ConstantOffsetPtrs.lookup(Val: V);
1660
1661 if (!C && !BaseAndOffset.first)
1662 // The incoming value is neither a constant nor a pointer with constant
1663 // offset, exit early.
1664 return true;
1665
1666 if (FirstC) {
1667 if (FirstC == C)
1668 // If we've seen a constant incoming value before and it is the same
1669 // constant we see this time, continue checking the next incoming value.
1670 continue;
1671 // Otherwise early exit because we either see a different constant or saw
1672 // a constant before but we have a pointer with constant offset this time.
1673 return true;
1674 }
1675
1676 if (FirstV) {
1677 // The same logic as above, but check pointer with constant offset here.
1678 if (FirstBaseAndOffset == BaseAndOffset)
1679 continue;
1680 return true;
1681 }
1682
1683 if (C) {
1684 // This is the 1st time we've seen a constant, record it.
1685 FirstC = C;
1686 continue;
1687 }
1688
1689 // The remaining case is that this is the 1st time we've seen a pointer with
1690 // constant offset, record it.
1691 FirstV = V;
1692 FirstBaseAndOffset = BaseAndOffset;
1693 }
1694
1695 // Check if we can map phi to a constant.
1696 if (FirstC) {
1697 SimplifiedValues[&I] = FirstC;
1698 return true;
1699 }
1700
1701 // Check if we can map phi to a pointer with constant offset.
1702 if (FirstBaseAndOffset.first) {
1703 ConstantOffsetPtrs[&I] = std::move(FirstBaseAndOffset);
1704
1705 if (auto *SROAArg = getSROAArgForValueOrNull(V: FirstV))
1706 SROAArgValues[&I] = SROAArg;
1707 }
1708
1709 return true;
1710}
1711
1712/// Check we can fold GEPs of constant-offset call site argument pointers.
1713/// This requires target data and inbounds GEPs.
1714///
1715/// \return true if the specified GEP can be folded.
1716bool CallAnalyzer::canFoldInboundsGEP(GetElementPtrInst &I) {
1717 // Check if we have a base + offset for the pointer.
1718 std::pair<Value *, APInt> BaseAndOffset =
1719 ConstantOffsetPtrs.lookup(Val: I.getPointerOperand());
1720 if (!BaseAndOffset.first)
1721 return false;
1722
1723 // Check if the offset of this GEP is constant, and if so accumulate it
1724 // into Offset.
1725 if (!accumulateGEPOffset(GEP&: cast<GEPOperator>(Val&: I), Offset&: BaseAndOffset.second))
1726 return false;
1727
1728 // Add the result as a new mapping to Base + Offset.
1729 ConstantOffsetPtrs[&I] = std::move(BaseAndOffset);
1730
1731 return true;
1732}
1733
1734bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) {
1735 auto *SROAArg = getSROAArgForValueOrNull(V: I.getPointerOperand());
1736
1737 // Lambda to check whether a GEP's indices are all constant.
1738 auto IsGEPOffsetConstant = [&](GetElementPtrInst &GEP) {
1739 for (const Use &Op : GEP.indices())
1740 if (!getDirectOrSimplifiedValue<Constant>(V: Op))
1741 return false;
1742 return true;
1743 };
1744
1745 if (!DisableGEPConstOperand)
1746 if (simplifyInstruction(I))
1747 return true;
1748
1749 if ((I.isInBounds() && canFoldInboundsGEP(I)) || IsGEPOffsetConstant(I)) {
1750 if (SROAArg)
1751 SROAArgValues[&I] = SROAArg;
1752
1753 // Constant GEPs are modeled as free.
1754 return true;
1755 }
1756
1757 // Variable GEPs will require math and will disable SROA.
1758 if (SROAArg)
1759 disableSROAForArg(SROAArg);
1760 return isGEPFree(GEP&: I);
1761}
1762
1763// Simplify \p Cmp if RHS is const and we can ValueTrack LHS.
1764// This handles the case only when the Cmp instruction is guarding a recursive
1765// call that will cause the Cmp to fail/succeed for the recursive call.
1766bool CallAnalyzer::simplifyCmpInstForRecCall(CmpInst &Cmp) {
1767 // Bail out if LHS is not a function argument or RHS is NOT const:
1768 if (!isa<Argument>(Val: Cmp.getOperand(i_nocapture: 0)) || !isa<Constant>(Val: Cmp.getOperand(i_nocapture: 1)))
1769 return false;
1770 auto *CmpOp = Cmp.getOperand(i_nocapture: 0);
1771 // Make sure that the callsite is recursive:
1772 if (CandidateCall.getCaller() != &F)
1773 return false;
1774 // Only handle the case when the callsite has a single predecessor:
1775 auto *CallBB = CandidateCall.getParent();
1776 auto *Predecessor = CallBB->getSinglePredecessor();
1777 if (!Predecessor)
1778 return false;
1779 // Check if the callsite is guarded by the same Cmp instruction:
1780 auto *Br = dyn_cast<CondBrInst>(Val: Predecessor->getTerminator());
1781 if (!Br || Br->getCondition() != &Cmp)
1782 return false;
1783
1784 // Check if there is any arg of the recursive callsite is affecting the cmp
1785 // instr:
1786 bool ArgFound = false;
1787 Value *FuncArg = nullptr, *CallArg = nullptr;
1788 for (unsigned ArgNum = 0;
1789 ArgNum < F.arg_size() && ArgNum < CandidateCall.arg_size(); ArgNum++) {
1790 FuncArg = F.getArg(i: ArgNum);
1791 CallArg = CandidateCall.getArgOperand(i: ArgNum);
1792 if (FuncArg == CmpOp && CallArg != CmpOp) {
1793 ArgFound = true;
1794 break;
1795 }
1796 }
1797 if (!ArgFound)
1798 return false;
1799
1800 // Now we have a recursive call that is guarded by a cmp instruction.
1801 // Check if this cmp can be simplified:
1802 SimplifyQuery SQ(DL, dyn_cast<Instruction>(Val: CallArg));
1803 CondContext CC(&Cmp);
1804 CC.Invert = (CallBB != Br->getSuccessor(i: 0));
1805 SQ.CC = &CC;
1806 CC.AffectedValues.insert(Ptr: FuncArg);
1807 Value *SimplifiedInstruction = llvm::simplifyInstructionWithOperands(
1808 I: cast<CmpInst>(Val: &Cmp), NewOps: {CallArg, Cmp.getOperand(i_nocapture: 1)}, Q: SQ);
1809 if (auto *ConstVal = dyn_cast_or_null<ConstantInt>(Val: SimplifiedInstruction)) {
1810 // Make sure that the BB of the recursive call is NOT the true successor
1811 // of the icmp. In other words, make sure that the recursion depth is 1.
1812 if ((ConstVal->isOne() && CC.Invert) ||
1813 (ConstVal->isZero() && !CC.Invert)) {
1814 SimplifiedValues[&Cmp] = ConstVal;
1815 return true;
1816 }
1817 }
1818 return false;
1819}
1820
1821/// Simplify \p I if its operands are constants and update SimplifiedValues.
1822bool CallAnalyzer::simplifyInstruction(Instruction &I) {
1823 SmallVector<Constant *> COps;
1824 for (Value *Op : I.operands()) {
1825 Constant *COp = getDirectOrSimplifiedValue<Constant>(V: Op);
1826 if (!COp)
1827 return false;
1828 COps.push_back(Elt: COp);
1829 }
1830 auto *C = ConstantFoldInstOperands(I: &I, Ops: COps, DL);
1831 if (!C)
1832 return false;
1833 SimplifiedValues[&I] = C;
1834 return true;
1835}
1836
1837/// Try to simplify a call to llvm.is.constant.
1838///
1839/// Duplicate the argument checking from CallAnalyzer::simplifyCallSite since
1840/// we expect calls of this specific intrinsic to be infrequent.
1841///
1842/// FIXME: Given that we know CB's parent (F) caller
1843/// (CandidateCall->getParent()->getParent()), we might be able to determine
1844/// whether inlining F into F's caller would change how the call to
1845/// llvm.is.constant would evaluate.
1846bool CallAnalyzer::simplifyIntrinsicCallIsConstant(CallBase &CB) {
1847 Value *Arg = CB.getArgOperand(i: 0);
1848 auto *C = getDirectOrSimplifiedValue<Constant>(V: Arg);
1849
1850 Type *RT = CB.getFunctionType()->getReturnType();
1851 SimplifiedValues[&CB] = ConstantInt::get(Ty: RT, V: C ? 1 : 0);
1852 return true;
1853}
1854
1855bool CallAnalyzer::simplifyIntrinsicCallObjectSize(CallBase &CB) {
1856 // As per the langref, "The fourth argument to llvm.objectsize determines if
1857 // the value should be evaluated at runtime."
1858 if (cast<ConstantInt>(Val: CB.getArgOperand(i: 3))->isOne())
1859 return false;
1860
1861 Value *V = lowerObjectSizeCall(ObjectSize: &cast<IntrinsicInst>(Val&: CB), DL, TLI: nullptr,
1862 /*MustSucceed=*/true);
1863 Constant *C = dyn_cast_or_null<Constant>(Val: V);
1864 if (C)
1865 SimplifiedValues[&CB] = C;
1866 return C;
1867}
1868
1869bool CallAnalyzer::visitBitCast(BitCastInst &I) {
1870 // Propagate constants through bitcasts.
1871 if (simplifyInstruction(I))
1872 return true;
1873
1874 // Track base/offsets through casts
1875 std::pair<Value *, APInt> BaseAndOffset =
1876 ConstantOffsetPtrs.lookup(Val: I.getOperand(i_nocapture: 0));
1877 // Casts don't change the offset, just wrap it up.
1878 if (BaseAndOffset.first)
1879 ConstantOffsetPtrs[&I] = std::move(BaseAndOffset);
1880
1881 // Also look for SROA candidates here.
1882 if (auto *SROAArg = getSROAArgForValueOrNull(V: I.getOperand(i_nocapture: 0)))
1883 SROAArgValues[&I] = SROAArg;
1884
1885 // Bitcasts are always zero cost.
1886 return true;
1887}
1888
1889bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) {
1890 // Propagate constants through ptrtoint.
1891 if (simplifyInstruction(I))
1892 return true;
1893
1894 // Track base/offset pairs when converted to a plain integer provided the
1895 // integer is large enough to represent the pointer.
1896 unsigned IntegerSize = I.getType()->getScalarSizeInBits();
1897 unsigned AS = I.getOperand(i_nocapture: 0)->getType()->getPointerAddressSpace();
1898 if (IntegerSize == DL.getPointerSizeInBits(AS)) {
1899 std::pair<Value *, APInt> BaseAndOffset =
1900 ConstantOffsetPtrs.lookup(Val: I.getOperand(i_nocapture: 0));
1901 if (BaseAndOffset.first)
1902 ConstantOffsetPtrs[&I] = std::move(BaseAndOffset);
1903 }
1904
1905 // This is really weird. Technically, ptrtoint will disable SROA. However,
1906 // unless that ptrtoint is *used* somewhere in the live basic blocks after
1907 // inlining, it will be nuked, and SROA should proceed. All of the uses which
1908 // would block SROA would also block SROA if applied directly to a pointer,
1909 // and so we can just add the integer in here. The only places where SROA is
1910 // preserved either cannot fire on an integer, or won't in-and-of themselves
1911 // disable SROA (ext) w/o some later use that we would see and disable.
1912 if (auto *SROAArg = getSROAArgForValueOrNull(V: I.getOperand(i_nocapture: 0)))
1913 SROAArgValues[&I] = SROAArg;
1914
1915 return TTI.getInstructionCost(U: &I, CostKind: TargetTransformInfo::TCK_SizeAndLatency) ==
1916 TargetTransformInfo::TCC_Free;
1917}
1918
1919bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) {
1920 // Propagate constants through ptrtoint.
1921 if (simplifyInstruction(I))
1922 return true;
1923
1924 // Track base/offset pairs when round-tripped through a pointer without
1925 // modifications provided the integer is not too large.
1926 Value *Op = I.getOperand(i_nocapture: 0);
1927 unsigned IntegerSize = Op->getType()->getScalarSizeInBits();
1928 if (IntegerSize <= DL.getPointerTypeSizeInBits(I.getType())) {
1929 std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Val: Op);
1930 if (BaseAndOffset.first)
1931 ConstantOffsetPtrs[&I] = std::move(BaseAndOffset);
1932 }
1933
1934 // "Propagate" SROA here in the same manner as we do for ptrtoint above.
1935 if (auto *SROAArg = getSROAArgForValueOrNull(V: Op))
1936 SROAArgValues[&I] = SROAArg;
1937
1938 return TTI.getInstructionCost(U: &I, CostKind: TargetTransformInfo::TCK_SizeAndLatency) ==
1939 TargetTransformInfo::TCC_Free;
1940}
1941
1942bool CallAnalyzer::visitCastInst(CastInst &I) {
1943 // Propagate constants through casts.
1944 if (simplifyInstruction(I))
1945 return true;
1946
1947 // Disable SROA in the face of arbitrary casts we don't explicitly list
1948 // elsewhere.
1949 disableSROA(V: I.getOperand(i_nocapture: 0));
1950
1951 // If this is a floating-point cast, and the target says this operation
1952 // is expensive, this may eventually become a library call. Treat the cost
1953 // as such.
1954 switch (I.getOpcode()) {
1955 case Instruction::FPTrunc:
1956 case Instruction::FPExt:
1957 case Instruction::UIToFP:
1958 case Instruction::SIToFP:
1959 case Instruction::FPToUI:
1960 case Instruction::FPToSI:
1961 if (TTI.getFPOpCost(Ty: I.getType()) == TargetTransformInfo::TCC_Expensive)
1962 onCallPenalty();
1963 break;
1964 default:
1965 break;
1966 }
1967
1968 return TTI.getInstructionCost(U: &I, CostKind: TargetTransformInfo::TCK_SizeAndLatency) ==
1969 TargetTransformInfo::TCC_Free;
1970}
1971
1972bool CallAnalyzer::paramHasAttr(Argument *A, Attribute::AttrKind Attr) {
1973 return CandidateCall.paramHasAttr(ArgNo: A->getArgNo(), Kind: Attr);
1974}
1975
1976bool CallAnalyzer::isKnownNonNullInCallee(Value *V) {
1977 // Does the *call site* have the NonNull attribute set on an argument? We
1978 // use the attribute on the call site to memoize any analysis done in the
1979 // caller. This will also trip if the callee function has a non-null
1980 // parameter attribute, but that's a less interesting case because hopefully
1981 // the callee would already have been simplified based on that.
1982 if (Argument *A = dyn_cast<Argument>(Val: V))
1983 if (paramHasAttr(A, Attr: Attribute::NonNull))
1984 return true;
1985
1986 // Is this an alloca in the caller? This is distinct from the attribute case
1987 // above because attributes aren't updated within the inliner itself and we
1988 // always want to catch the alloca derived case.
1989 if (isAllocaDerivedArg(V))
1990 // We can actually predict the result of comparisons between an
1991 // alloca-derived value and null. Note that this fires regardless of
1992 // SROA firing.
1993 return true;
1994
1995 return false;
1996}
1997
1998bool CallAnalyzer::allowSizeGrowth(CallBase &Call) {
1999 // If the normal destination of the invoke or the parent block of the call
2000 // site is unreachable-terminated, there is little point in inlining this
2001 // unless there is literally zero cost.
2002 // FIXME: Note that it is possible that an unreachable-terminated block has a
2003 // hot entry. For example, in below scenario inlining hot_call_X() may be
2004 // beneficial :
2005 // main() {
2006 // hot_call_1();
2007 // ...
2008 // hot_call_N()
2009 // exit(0);
2010 // }
2011 // For now, we are not handling this corner case here as it is rare in real
2012 // code. In future, we should elaborate this based on BPI and BFI in more
2013 // general threshold adjusting heuristics in updateThreshold().
2014 if (InvokeInst *II = dyn_cast<InvokeInst>(Val: &Call)) {
2015 if (isa<UnreachableInst>(Val: II->getNormalDest()->getTerminator()))
2016 return false;
2017 } else if (isa<UnreachableInst>(Val: Call.getParent()->getTerminator()))
2018 return false;
2019
2020 return true;
2021}
2022
2023bool InlineCostCallAnalyzer::isColdCallSite(CallBase &Call,
2024 BlockFrequencyInfo *CallerBFI) {
2025 // If global profile summary is available, then callsite's coldness is
2026 // determined based on that.
2027 if (PSI && PSI->hasProfileSummary())
2028 return PSI->isColdCallSite(CB: Call, BFI: CallerBFI);
2029
2030 // Otherwise we need BFI to be available.
2031 if (!CallerBFI)
2032 return false;
2033
2034 // Determine if the callsite is cold relative to caller's entry. We could
2035 // potentially cache the computation of scaled entry frequency, but the added
2036 // complexity is not worth it unless this scaling shows up high in the
2037 // profiles.
2038 const BranchProbability ColdProb(ColdCallSiteRelFreq, 100);
2039 auto CallSiteBB = Call.getParent();
2040 auto CallSiteFreq = CallerBFI->getBlockFreq(BB: CallSiteBB);
2041 auto CallerEntryFreq =
2042 CallerBFI->getBlockFreq(BB: &(Call.getCaller()->getEntryBlock()));
2043 return CallSiteFreq < CallerEntryFreq * ColdProb;
2044}
2045
2046std::optional<int>
2047InlineCostCallAnalyzer::getHotCallSiteThreshold(CallBase &Call,
2048 BlockFrequencyInfo *CallerBFI) {
2049
2050 // If global profile summary is available, then callsite's hotness is
2051 // determined based on that.
2052 if (PSI && PSI->hasProfileSummary() && PSI->isHotCallSite(CB: Call, BFI: CallerBFI))
2053 return Params.HotCallSiteThreshold;
2054
2055 // Otherwise we need BFI to be available and to have a locally hot callsite
2056 // threshold.
2057 if (!CallerBFI || !Params.LocallyHotCallSiteThreshold)
2058 return std::nullopt;
2059
2060 // Determine if the callsite is hot relative to caller's entry. We could
2061 // potentially cache the computation of scaled entry frequency, but the added
2062 // complexity is not worth it unless this scaling shows up high in the
2063 // profiles.
2064 const BasicBlock *CallSiteBB = Call.getParent();
2065 BlockFrequency CallSiteFreq = CallerBFI->getBlockFreq(BB: CallSiteBB);
2066 BlockFrequency CallerEntryFreq = CallerBFI->getEntryFreq();
2067 std::optional<BlockFrequency> Limit = CallerEntryFreq.mul(Factor: HotCallSiteRelFreq);
2068 if (Limit && CallSiteFreq >= *Limit)
2069 return Params.LocallyHotCallSiteThreshold;
2070
2071 // Otherwise treat it normally.
2072 return std::nullopt;
2073}
2074
2075void InlineCostCallAnalyzer::updateThreshold(CallBase &Call, Function &Callee) {
2076 // If no size growth is allowed for this inlining, set Threshold to 0.
2077 if (!allowSizeGrowth(Call)) {
2078 Threshold = 0;
2079 return;
2080 }
2081
2082 Function *Caller = Call.getCaller();
2083
2084 // return min(A, B) if B is valid.
2085 auto MinIfValid = [](int A, std::optional<int> B) {
2086 return B ? std::min(a: A, b: *B) : A;
2087 };
2088
2089 // return max(A, B) if B is valid.
2090 auto MaxIfValid = [](int A, std::optional<int> B) {
2091 return B ? std::max(a: A, b: *B) : A;
2092 };
2093
2094 // Various bonus percentages. These are multiplied by Threshold to get the
2095 // bonus values.
2096 // SingleBBBonus: This bonus is applied if the callee has a single reachable
2097 // basic block at the given callsite context. This is speculatively applied
2098 // and withdrawn if more than one basic block is seen.
2099 //
2100 // LstCallToStaticBonus: This large bonus is applied to ensure the inlining
2101 // of the last call to a static function as inlining such functions is
2102 // guaranteed to reduce code size.
2103 //
2104 // These bonus percentages may be set to 0 based on properties of the caller
2105 // and the callsite.
2106 int SingleBBBonusPercent = 50;
2107 int VectorBonusPercent = TTI.getInlinerVectorBonusPercent();
2108 int LastCallToStaticBonus = TTI.getInliningLastCallToStaticBonus();
2109
2110 // Lambda to set all the above bonus and bonus percentages to 0.
2111 auto DisallowAllBonuses = [&]() {
2112 SingleBBBonusPercent = 0;
2113 VectorBonusPercent = 0;
2114 LastCallToStaticBonus = 0;
2115 };
2116
2117 // Use the OptMinSizeThreshold or OptSizeThreshold knob if they are available
2118 // and reduce the threshold if the caller has the necessary attribute.
2119 if (Caller->hasMinSize()) {
2120 Threshold = MinIfValid(Threshold, Params.OptMinSizeThreshold);
2121 // For minsize, we want to disable the single BB bonus and the vector
2122 // bonuses, but not the last-call-to-static bonus. Inlining the last call to
2123 // a static function will, at the minimum, eliminate the parameter setup and
2124 // call/return instructions.
2125 SingleBBBonusPercent = 0;
2126 VectorBonusPercent = 0;
2127 } else if (Caller->hasOptSize())
2128 Threshold = MinIfValid(Threshold, Params.OptSizeThreshold);
2129
2130 // Adjust the threshold based on inlinehint attribute and profile based
2131 // hotness information if the caller does not have MinSize attribute.
2132 if (!Caller->hasMinSize()) {
2133 std::optional<int> HintThreshold = Caller->hasOptSize()
2134 ? Params.OptSizeHintThreshold
2135 : Params.HintThreshold;
2136 if (Callee.hasFnAttribute(Kind: Attribute::InlineHint))
2137 Threshold = MaxIfValid(Threshold, HintThreshold);
2138
2139 // FIXME: After switching to the new passmanager, simplify the logic below
2140 // by checking only the callsite hotness/coldness as we will reliably
2141 // have local profile information.
2142 //
2143 // Callsite hotness and coldness can be determined if sample profile is
2144 // used (which adds hotness metadata to calls) or if caller's
2145 // BlockFrequencyInfo is available.
2146 BlockFrequencyInfo *CallerBFI = GetBFI ? &(GetBFI(*Caller)) : nullptr;
2147 auto HotCallSiteThreshold = getHotCallSiteThreshold(Call, CallerBFI);
2148 if (!Caller->hasOptSize() && HotCallSiteThreshold) {
2149 LLVM_DEBUG(dbgs() << "Hot callsite.\n");
2150 // FIXME: This should update the threshold only if it exceeds the
2151 // current threshold, but AutoFDO + ThinLTO currently relies on this
2152 // behavior to prevent inlining of hot callsites during ThinLTO
2153 // compile phase.
2154 Threshold = *HotCallSiteThreshold;
2155 } else if (isColdCallSite(Call, CallerBFI)) {
2156 LLVM_DEBUG(dbgs() << "Cold callsite.\n");
2157 // Do not apply bonuses for a cold callsite including the
2158 // LastCallToStatic bonus. While this bonus might result in code size
2159 // reduction, it can cause the size of a non-cold caller to increase
2160 // preventing it from being inlined.
2161 DisallowAllBonuses();
2162 Threshold = MinIfValid(Threshold, Params.ColdCallSiteThreshold);
2163 } else if (PSI) {
2164 // Use callee's global profile information only if we have no way of
2165 // determining this via callsite information.
2166 if (PSI->isFunctionEntryHot(F: &Callee)) {
2167 LLVM_DEBUG(dbgs() << "Hot callee.\n");
2168 // If callsite hotness can not be determined, we may still know
2169 // that the callee is hot and treat it as a weaker hint for threshold
2170 // increase.
2171 Threshold = MaxIfValid(Threshold, HintThreshold);
2172 } else if (PSI->isFunctionEntryCold(F: &Callee)) {
2173 LLVM_DEBUG(dbgs() << "Cold callee.\n");
2174 // Do not apply bonuses for a cold callee including the
2175 // LastCallToStatic bonus. While this bonus might result in code size
2176 // reduction, it can cause the size of a non-cold caller to increase
2177 // preventing it from being inlined.
2178 DisallowAllBonuses();
2179 Threshold = MinIfValid(Threshold, Params.ColdThreshold);
2180 }
2181 }
2182 }
2183
2184 Threshold += TTI.adjustInliningThreshold(CB: &Call);
2185
2186 // Finally, take the target-specific inlining threshold multiplier into
2187 // account.
2188 Threshold *= TTI.getInliningThresholdMultiplier();
2189
2190 SingleBBBonus = Threshold * SingleBBBonusPercent / 100;
2191 VectorBonus = Threshold * VectorBonusPercent / 100;
2192
2193 // If there is only one call of the function, and it has internal linkage,
2194 // the cost of inlining it drops dramatically. It may seem odd to update
2195 // Cost in updateThreshold, but the bonus depends on the logic in this method.
2196 if (isSoleCallToLocalFunction(CB: Call, Callee: F)) {
2197 addCost(Inc: -LastCallToStaticBonus);
2198 StaticBonusApplied = LastCallToStaticBonus;
2199 }
2200}
2201
2202bool CallAnalyzer::visitCmpInst(CmpInst &I) {
2203 Value *LHS = I.getOperand(i_nocapture: 0), *RHS = I.getOperand(i_nocapture: 1);
2204 // First try to handle simplified comparisons.
2205 if (simplifyInstruction(I))
2206 return true;
2207
2208 // Try to handle comparison that can be simplified using ValueTracking.
2209 if (simplifyCmpInstForRecCall(Cmp&: I))
2210 return true;
2211
2212 if (I.getOpcode() == Instruction::FCmp)
2213 return false;
2214
2215 // Otherwise look for a comparison between constant offset pointers with
2216 // a common base.
2217 Value *LHSBase, *RHSBase;
2218 APInt LHSOffset, RHSOffset;
2219 std::tie(args&: LHSBase, args&: LHSOffset) = ConstantOffsetPtrs.lookup(Val: LHS);
2220 if (LHSBase) {
2221 std::tie(args&: RHSBase, args&: RHSOffset) = ConstantOffsetPtrs.lookup(Val: RHS);
2222 if (RHSBase && LHSBase == RHSBase) {
2223 // We have common bases, fold the icmp to a constant based on the
2224 // offsets.
2225 SimplifiedValues[&I] = ConstantInt::getBool(
2226 Ty: I.getType(),
2227 V: ICmpInst::compare(LHS: LHSOffset, RHS: RHSOffset, Pred: I.getPredicate()));
2228 ++NumConstantPtrCmps;
2229 return true;
2230 }
2231 }
2232
2233 auto isImplicitNullCheckCmp = [](const CmpInst &I) {
2234 for (auto *User : I.users())
2235 if (auto *Instr = dyn_cast<Instruction>(Val: User))
2236 if (!Instr->getMetadata(KindID: LLVMContext::MD_make_implicit))
2237 return false;
2238 return true;
2239 };
2240
2241 // If the comparison is an equality comparison with null, we can simplify it
2242 // if we know the value (argument) can't be null
2243 if (I.isEquality() && isa<ConstantPointerNull>(Val: I.getOperand(i_nocapture: 1))) {
2244 if (isKnownNonNullInCallee(V: I.getOperand(i_nocapture: 0))) {
2245 bool IsNotEqual = I.getPredicate() == CmpInst::ICMP_NE;
2246 SimplifiedValues[&I] = IsNotEqual ? ConstantInt::getTrue(Ty: I.getType())
2247 : ConstantInt::getFalse(Ty: I.getType());
2248 return true;
2249 }
2250 // Implicit null checks act as unconditional branches and their comparisons
2251 // should be treated as simplified and free of cost.
2252 if (isImplicitNullCheckCmp(I))
2253 return true;
2254 }
2255 return handleSROA(V: I.getOperand(i_nocapture: 0), DoNotDisable: isa<ConstantPointerNull>(Val: I.getOperand(i_nocapture: 1)));
2256}
2257
2258bool CallAnalyzer::visitSub(BinaryOperator &I) {
2259 // Try to handle a special case: we can fold computing the difference of two
2260 // constant-related pointers.
2261 Value *LHS = I.getOperand(i_nocapture: 0), *RHS = I.getOperand(i_nocapture: 1);
2262 Value *LHSBase, *RHSBase;
2263 APInt LHSOffset, RHSOffset;
2264 std::tie(args&: LHSBase, args&: LHSOffset) = ConstantOffsetPtrs.lookup(Val: LHS);
2265 if (LHSBase) {
2266 std::tie(args&: RHSBase, args&: RHSOffset) = ConstantOffsetPtrs.lookup(Val: RHS);
2267 if (RHSBase && LHSBase == RHSBase) {
2268 // We have common bases, fold the subtract to a constant based on the
2269 // offsets.
2270 Constant *CLHS = ConstantInt::get(Context&: LHS->getContext(), V: LHSOffset);
2271 Constant *CRHS = ConstantInt::get(Context&: RHS->getContext(), V: RHSOffset);
2272 if (Constant *C = ConstantExpr::getSub(C1: CLHS, C2: CRHS)) {
2273 SimplifiedValues[&I] = C;
2274 ++NumConstantPtrDiffs;
2275 return true;
2276 }
2277 }
2278 }
2279
2280 // Otherwise, fall back to the generic logic for simplifying and handling
2281 // instructions.
2282 return Base::visitSub(I);
2283}
2284
2285bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) {
2286 Value *LHS = I.getOperand(i_nocapture: 0), *RHS = I.getOperand(i_nocapture: 1);
2287 Constant *CLHS = getDirectOrSimplifiedValue<Constant>(V: LHS);
2288 Constant *CRHS = getDirectOrSimplifiedValue<Constant>(V: RHS);
2289
2290 Value *SimpleV = nullptr;
2291 if (auto FI = dyn_cast<FPMathOperator>(Val: &I))
2292 SimpleV = simplifyBinOp(Opcode: I.getOpcode(), LHS: CLHS ? CLHS : LHS, RHS: CRHS ? CRHS : RHS,
2293 FMF: FI->getFastMathFlags(), Q: DL);
2294 else
2295 SimpleV =
2296 simplifyBinOp(Opcode: I.getOpcode(), LHS: CLHS ? CLHS : LHS, RHS: CRHS ? CRHS : RHS, Q: DL);
2297
2298 if (Constant *C = dyn_cast_or_null<Constant>(Val: SimpleV))
2299 SimplifiedValues[&I] = C;
2300
2301 if (SimpleV)
2302 return true;
2303
2304 // Disable any SROA on arguments to arbitrary, unsimplified binary operators.
2305 disableSROA(V: LHS);
2306 disableSROA(V: RHS);
2307
2308 // If the instruction is floating point, and the target says this operation
2309 // is expensive, this may eventually become a library call. Treat the cost
2310 // as such. Unless it's fneg which can be implemented with an xor.
2311 using namespace llvm::PatternMatch;
2312 if (I.getType()->isFloatingPointTy() &&
2313 TTI.getFPOpCost(Ty: I.getType()) == TargetTransformInfo::TCC_Expensive &&
2314 !match(V: &I, P: m_FNeg(X: m_Value())))
2315 onCallPenalty();
2316
2317 return false;
2318}
2319
2320bool CallAnalyzer::visitFNeg(UnaryOperator &I) {
2321 Value *Op = I.getOperand(i_nocapture: 0);
2322 Constant *COp = getDirectOrSimplifiedValue<Constant>(V: Op);
2323
2324 Value *SimpleV = simplifyFNegInst(
2325 Op: COp ? COp : Op, FMF: cast<FPMathOperator>(Val&: I).getFastMathFlags(), Q: DL);
2326
2327 if (Constant *C = dyn_cast_or_null<Constant>(Val: SimpleV))
2328 SimplifiedValues[&I] = C;
2329
2330 if (SimpleV)
2331 return true;
2332
2333 // Disable any SROA on arguments to arbitrary, unsimplified fneg.
2334 disableSROA(V: Op);
2335
2336 return false;
2337}
2338
2339bool CallAnalyzer::visitLoad(LoadInst &I) {
2340 if (handleSROA(V: I.getPointerOperand(), DoNotDisable: I.isSimple()))
2341 return true;
2342
2343 // If the data is already loaded from this address and hasn't been clobbered
2344 // by any stores or calls, this load is likely to be redundant and can be
2345 // eliminated.
2346 if (EnableLoadElimination &&
2347 !LoadAddrSet.insert(Ptr: I.getPointerOperand()).second && I.isUnordered()) {
2348 onLoadEliminationOpportunity();
2349 return true;
2350 }
2351
2352 onMemAccess();
2353 return false;
2354}
2355
2356bool CallAnalyzer::visitStore(StoreInst &I) {
2357 if (handleSROA(V: I.getPointerOperand(), DoNotDisable: I.isSimple()))
2358 return true;
2359
2360 // The store can potentially clobber loads and prevent repeated loads from
2361 // being eliminated.
2362 // FIXME:
2363 // 1. We can probably keep an initial set of eliminatable loads substracted
2364 // from the cost even when we finally see a store. We just need to disable
2365 // *further* accumulation of elimination savings.
2366 // 2. We should probably at some point thread MemorySSA for the callee into
2367 // this and then use that to actually compute *really* precise savings.
2368 disableLoadElimination();
2369
2370 onMemAccess();
2371 return false;
2372}
2373
2374bool CallAnalyzer::visitExtractValue(ExtractValueInst &I) {
2375 Value *Op = I.getAggregateOperand();
2376
2377 // Special handling, because we want to simplify extractvalue with a
2378 // potential insertvalue from the caller.
2379 if (Value *SimpleOp = getSimplifiedValueUnchecked(V: Op)) {
2380 SimplifyQuery SQ(DL);
2381 Value *SimpleV = simplifyExtractValueInst(Agg: SimpleOp, Idxs: I.getIndices(), Q: SQ);
2382 if (SimpleV) {
2383 SimplifiedValues[&I] = SimpleV;
2384 return true;
2385 }
2386 }
2387
2388 // SROA can't look through these, but they may be free.
2389 return Base::visitExtractValue(I);
2390}
2391
2392bool CallAnalyzer::visitInsertValue(InsertValueInst &I) {
2393 // Constant folding for insert value is trivial.
2394 if (simplifyInstruction(I))
2395 return true;
2396
2397 // SROA can't look through these, but they may be free.
2398 return Base::visitInsertValue(I);
2399}
2400
2401/// Try to simplify a call site.
2402///
2403/// Takes a concrete function and callsite and tries to actually simplify it by
2404/// analyzing the arguments and call itself with instsimplify. Returns true if
2405/// it has simplified the callsite to some other entity (a constant), making it
2406/// free.
2407bool CallAnalyzer::simplifyCallSite(Function *F, CallBase &Call) {
2408 // FIXME: Using the instsimplify logic directly for this is inefficient
2409 // because we have to continually rebuild the argument list even when no
2410 // simplifications can be performed. Until that is fixed with remapping
2411 // inside of instsimplify, directly constant fold calls here.
2412 if (!canConstantFoldCallTo(Call: &Call, F))
2413 return false;
2414
2415 // Try to re-map the arguments to constants.
2416 SmallVector<Constant *, 4> ConstantArgs;
2417 ConstantArgs.reserve(N: Call.arg_size());
2418 for (Value *I : Call.args()) {
2419 Constant *C = getDirectOrSimplifiedValue<Constant>(V: I);
2420 if (!C)
2421 return false; // This argument doesn't map to a constant.
2422
2423 ConstantArgs.push_back(Elt: C);
2424 }
2425 if (Constant *C = ConstantFoldCall(Call: &Call, F, Operands: ConstantArgs)) {
2426 SimplifiedValues[&Call] = C;
2427 return true;
2428 }
2429
2430 return false;
2431}
2432
2433bool CallAnalyzer::isLoweredToCall(Function *F, CallBase &Call) {
2434 const TargetLibraryInfo *TLI = GetTLI ? &GetTLI(*F) : nullptr;
2435 LibFunc LF;
2436 if (!TLI || !TLI->getLibFunc(FDecl: *F, F&: LF) || !TLI->has(F: LF))
2437 return TTI.isLoweredToCall(F);
2438
2439 switch (LF) {
2440 case LibFunc_memcpy_chk:
2441 case LibFunc_memmove_chk:
2442 case LibFunc_mempcpy_chk:
2443 case LibFunc_memset_chk: {
2444 // Calls to __memcpy_chk whose length is known to fit within the object
2445 // size will eventually be replaced by inline stores. Therefore, these
2446 // should not incur a call penalty. This is only really relevant on
2447 // platforms whose headers redirect memcpy to __memcpy_chk (e.g. Darwin), as
2448 // other platforms use memcpy intrinsics, which are already exempt from the
2449 // call penalty.
2450 auto *LenOp = getDirectOrSimplifiedValue<ConstantInt>(V: Call.getOperand(i_nocapture: 2));
2451 auto *ObjSizeOp =
2452 getDirectOrSimplifiedValue<ConstantInt>(V: Call.getOperand(i_nocapture: 3));
2453 if (LenOp && ObjSizeOp &&
2454 LenOp->getLimitedValue() <= ObjSizeOp->getLimitedValue()) {
2455 return false;
2456 }
2457 break;
2458 }
2459 default:
2460 break;
2461 }
2462
2463 return TTI.isLoweredToCall(F);
2464}
2465
2466bool CallAnalyzer::visitCallBase(CallBase &Call) {
2467 if (!onCallBaseVisitStart(Call))
2468 return true;
2469
2470 if (Call.hasFnAttr(Kind: Attribute::ReturnsTwice) &&
2471 !F.hasFnAttribute(Kind: Attribute::ReturnsTwice)) {
2472 // This aborts the entire analysis.
2473 ExposesReturnsTwice = true;
2474 return false;
2475 }
2476 if (isa<CallInst>(Val: Call) && cast<CallInst>(Val&: Call).cannotDuplicate())
2477 ContainsNoDuplicateCall = true;
2478
2479 if (InlineAsm *InlineAsmOp = dyn_cast<InlineAsm>(Val: Call.getCalledOperand()))
2480 onInlineAsm(Arg: *InlineAsmOp);
2481
2482 Function *F = Call.getCalledFunction();
2483 bool IsIndirectCall = !F;
2484 if (IsIndirectCall) {
2485 // Check if this happens to be an indirect function call to a known function
2486 // in this inline context. If not, we've done all we can.
2487 Value *Callee = Call.getCalledOperand();
2488 F = getSimplifiedValue<Function>(V: Callee);
2489 if (!F || F->getFunctionType() != Call.getFunctionType()) {
2490 onCallArgumentSetup(Call);
2491
2492 if (!Call.onlyReadsMemory())
2493 disableLoadElimination();
2494 return Base::visitCallBase(I&: Call);
2495 }
2496 }
2497
2498 assert(F && "Expected a call to a known function");
2499
2500 // When we have a concrete function, first try to simplify it directly.
2501 if (simplifyCallSite(F, Call))
2502 return true;
2503
2504 // Next check if it is an intrinsic we know about.
2505 // FIXME: Lift this into part of the InstVisitor.
2506 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Val: &Call)) {
2507 switch (II->getIntrinsicID()) {
2508 default:
2509 if (!Call.onlyReadsMemory() && !isAssumeLikeIntrinsic(I: II))
2510 disableLoadElimination();
2511 return Base::visitCallBase(I&: Call);
2512
2513 case Intrinsic::load_relative:
2514 onLoadRelativeIntrinsic();
2515 return false;
2516
2517 case Intrinsic::memset:
2518 case Intrinsic::memcpy:
2519 case Intrinsic::memmove:
2520 disableLoadElimination();
2521 // SROA can usually chew through these intrinsics, but they aren't free.
2522 return false;
2523 case Intrinsic::icall_branch_funnel:
2524 case Intrinsic::localescape:
2525 HasUninlineableIntrinsic = true;
2526 return false;
2527 case Intrinsic::vastart:
2528 InitsVargArgs = true;
2529 return false;
2530 case Intrinsic::launder_invariant_group:
2531 case Intrinsic::strip_invariant_group:
2532 if (auto *SROAArg = getSROAArgForValueOrNull(V: II->getOperand(i_nocapture: 0)))
2533 SROAArgValues[II] = SROAArg;
2534 return true;
2535 case Intrinsic::is_constant:
2536 return simplifyIntrinsicCallIsConstant(CB&: Call);
2537 case Intrinsic::objectsize:
2538 return simplifyIntrinsicCallObjectSize(CB&: Call);
2539 }
2540 }
2541
2542 if (F == Call.getFunction()) {
2543 // This flag will fully abort the analysis, so don't bother with anything
2544 // else.
2545 IsRecursiveCall = true;
2546 if (!AllowRecursiveCall)
2547 return false;
2548 }
2549
2550 if (isLoweredToCall(F, Call)) {
2551 onLoweredCall(F, Call, IsIndirectCall);
2552 }
2553
2554 if (!(Call.onlyReadsMemory() || (IsIndirectCall && F->onlyReadsMemory())))
2555 disableLoadElimination();
2556 return Base::visitCallBase(I&: Call);
2557}
2558
2559bool CallAnalyzer::visitReturnInst(ReturnInst &RI) {
2560 // At least one return instruction will be free after inlining.
2561 bool Free = !HasReturn;
2562 HasReturn = true;
2563 return Free;
2564}
2565
2566bool CallAnalyzer::visitUncondBrInst(UncondBrInst &BI) {
2567 // We model unconditional branches as essentially free -- they really
2568 // shouldn't exist at all, but handling them makes the behavior of the
2569 // inliner more regular and predictable.
2570 return true;
2571}
2572
2573bool CallAnalyzer::visitCondBrInst(CondBrInst &BI) {
2574 // Conditional branches which will fold away are free.
2575 return getDirectOrSimplifiedValue<ConstantInt>(V: BI.getCondition()) ||
2576 BI.getMetadata(KindID: LLVMContext::MD_make_implicit);
2577}
2578
2579bool CallAnalyzer::visitSelectInst(SelectInst &SI) {
2580 bool CheckSROA = SI.getType()->isPointerTy();
2581 Value *TrueVal = SI.getTrueValue();
2582 Value *FalseVal = SI.getFalseValue();
2583
2584 Constant *TrueC = getDirectOrSimplifiedValue<Constant>(V: TrueVal);
2585 Constant *FalseC = getDirectOrSimplifiedValue<Constant>(V: FalseVal);
2586 Constant *CondC = getSimplifiedValue<Constant>(V: SI.getCondition());
2587
2588 if (!CondC) {
2589 // Select C, X, X => X
2590 if (TrueC == FalseC && TrueC) {
2591 SimplifiedValues[&SI] = TrueC;
2592 return true;
2593 }
2594
2595 if (!CheckSROA)
2596 return Base::visitSelectInst(I&: SI);
2597
2598 std::pair<Value *, APInt> TrueBaseAndOffset =
2599 ConstantOffsetPtrs.lookup(Val: TrueVal);
2600 std::pair<Value *, APInt> FalseBaseAndOffset =
2601 ConstantOffsetPtrs.lookup(Val: FalseVal);
2602 if (TrueBaseAndOffset == FalseBaseAndOffset && TrueBaseAndOffset.first) {
2603 ConstantOffsetPtrs[&SI] = std::move(TrueBaseAndOffset);
2604
2605 if (auto *SROAArg = getSROAArgForValueOrNull(V: TrueVal))
2606 SROAArgValues[&SI] = SROAArg;
2607 return true;
2608 }
2609
2610 return Base::visitSelectInst(I&: SI);
2611 }
2612
2613 // Select condition is a constant.
2614 Value *SelectedV = CondC->isAllOnesValue() ? TrueVal
2615 : (CondC->isNullValue()) ? FalseVal
2616 : nullptr;
2617 if (!SelectedV) {
2618 // Condition is a vector constant that is not all 1s or all 0s. If all
2619 // operands are constants, ConstantFoldSelectInstruction() can handle the
2620 // cases such as select vectors.
2621 if (TrueC && FalseC) {
2622 if (auto *C = ConstantFoldSelectInstruction(Cond: CondC, V1: TrueC, V2: FalseC)) {
2623 SimplifiedValues[&SI] = C;
2624 return true;
2625 }
2626 }
2627 return Base::visitSelectInst(I&: SI);
2628 }
2629
2630 // Condition is either all 1s or all 0s. SI can be simplified.
2631 if (Constant *SelectedC = dyn_cast<Constant>(Val: SelectedV)) {
2632 SimplifiedValues[&SI] = SelectedC;
2633 return true;
2634 }
2635
2636 if (!CheckSROA)
2637 return true;
2638
2639 std::pair<Value *, APInt> BaseAndOffset =
2640 ConstantOffsetPtrs.lookup(Val: SelectedV);
2641 if (BaseAndOffset.first) {
2642 ConstantOffsetPtrs[&SI] = std::move(BaseAndOffset);
2643
2644 if (auto *SROAArg = getSROAArgForValueOrNull(V: SelectedV))
2645 SROAArgValues[&SI] = SROAArg;
2646 }
2647
2648 return true;
2649}
2650
2651bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) {
2652 // We model unconditional switches as free, see the comments on handling
2653 // branches.
2654 if (getDirectOrSimplifiedValue<ConstantInt>(V: SI.getCondition()))
2655 return true;
2656
2657 // Assume the most general case where the switch is lowered into
2658 // either a jump table, bit test, or a balanced binary tree consisting of
2659 // case clusters without merging adjacent clusters with the same
2660 // destination. We do not consider the switches that are lowered with a mix
2661 // of jump table/bit test/binary search tree. The cost of the switch is
2662 // proportional to the size of the tree or the size of jump table range.
2663 //
2664 // NB: We convert large switches which are just used to initialize large phi
2665 // nodes to lookup tables instead in simplifycfg, so this shouldn't prevent
2666 // inlining those. It will prevent inlining in cases where the optimization
2667 // does not (yet) fire.
2668
2669 unsigned JumpTableSize = 0;
2670 BlockFrequencyInfo *BFI = GetBFI ? &(GetBFI(F)) : nullptr;
2671 unsigned NumCaseCluster =
2672 TTI.getEstimatedNumberOfCaseClusters(SI, JTSize&: JumpTableSize, PSI, BFI);
2673
2674 onFinalizeSwitch(JumpTableSize, NumCaseCluster, DefaultDestUnreachable: SI.defaultDestUnreachable());
2675 return false;
2676}
2677
2678bool CallAnalyzer::visitIndirectBrInst(IndirectBrInst &IBI) {
2679 // We never want to inline functions that contain an indirectbr. This is
2680 // incorrect because all the blockaddress's (in static global initializers
2681 // for example) would be referring to the original function, and this
2682 // indirect jump would jump from the inlined copy of the function into the
2683 // original function which is extremely undefined behavior.
2684 // FIXME: This logic isn't really right; we can safely inline functions with
2685 // indirectbr's as long as no other function or global references the
2686 // blockaddress of a block within the current function.
2687 HasIndirectBr = true;
2688 return false;
2689}
2690
2691bool CallAnalyzer::visitResumeInst(ResumeInst &RI) {
2692 // FIXME: It's not clear that a single instruction is an accurate model for
2693 // the inline cost of a resume instruction.
2694 return false;
2695}
2696
2697bool CallAnalyzer::visitCleanupReturnInst(CleanupReturnInst &CRI) {
2698 // FIXME: It's not clear that a single instruction is an accurate model for
2699 // the inline cost of a cleanupret instruction.
2700 return false;
2701}
2702
2703bool CallAnalyzer::visitCatchReturnInst(CatchReturnInst &CRI) {
2704 // FIXME: It's not clear that a single instruction is an accurate model for
2705 // the inline cost of a catchret instruction.
2706 return false;
2707}
2708
2709bool CallAnalyzer::visitUnreachableInst(UnreachableInst &I) {
2710 // FIXME: It might be reasonably to discount the cost of instructions leading
2711 // to unreachable as they have the lowest possible impact on both runtime and
2712 // code size.
2713 return true; // No actual code is needed for unreachable.
2714}
2715
2716bool CallAnalyzer::visitInstruction(Instruction &I) {
2717 // Some instructions are free. All of the free intrinsics can also be
2718 // handled by SROA, etc.
2719 if (TTI.getInstructionCost(U: &I, CostKind: TargetTransformInfo::TCK_SizeAndLatency) ==
2720 TargetTransformInfo::TCC_Free)
2721 return true;
2722
2723 // We found something we don't understand or can't handle. Mark any SROA-able
2724 // values in the operand list as no longer viable.
2725 for (const Use &Op : I.operands())
2726 disableSROA(V: Op);
2727
2728 return false;
2729}
2730
2731/// Analyze a basic block for its contribution to the inline cost.
2732///
2733/// This method walks the analyzer over every instruction in the given basic
2734/// block and accounts for their cost during inlining at this callsite. It
2735/// aborts early if the threshold has been exceeded or an impossible to inline
2736/// construct has been detected. It returns false if inlining is no longer
2737/// viable, and true if inlining remains viable.
2738InlineResult
2739CallAnalyzer::analyzeBlock(BasicBlock *BB,
2740 const SmallPtrSetImpl<const Value *> &EphValues) {
2741 for (Instruction &I : *BB) {
2742 // FIXME: Currently, the number of instructions in a function regardless of
2743 // our ability to simplify them during inline to constants or dead code,
2744 // are actually used by the vector bonus heuristic. As long as that's true,
2745 // we have to special case debug intrinsics here to prevent differences in
2746 // inlining due to debug symbols. Eventually, the number of unsimplified
2747 // instructions shouldn't factor into the cost computation, but until then,
2748 // hack around it here.
2749 // Similarly, skip pseudo-probes.
2750 if (I.isDebugOrPseudoInst())
2751 continue;
2752
2753 // Skip ephemeral values.
2754 if (EphValues.count(Ptr: &I))
2755 continue;
2756
2757 ++NumInstructions;
2758 if (isa<ExtractElementInst>(Val: I) || I.getType()->isVectorTy())
2759 ++NumVectorInstructions;
2760
2761 // If the instruction simplified to a constant, there is no cost to this
2762 // instruction. Visit the instructions using our InstVisitor to account for
2763 // all of the per-instruction logic. The visit tree returns true if we
2764 // consumed the instruction in any way, and false if the instruction's base
2765 // cost should count against inlining.
2766 onInstructionAnalysisStart(I: &I);
2767
2768 if (Base::visit(I: &I))
2769 ++NumInstructionsSimplified;
2770 else
2771 onMissedSimplification();
2772
2773 onInstructionAnalysisFinish(I: &I);
2774 using namespace ore;
2775 // If the visit this instruction detected an uninlinable pattern, abort.
2776 InlineResult IR = InlineResult::success();
2777 if (IsRecursiveCall && !AllowRecursiveCall)
2778 IR = InlineResult::failure(Reason: "recursive");
2779 else if (ExposesReturnsTwice)
2780 IR = InlineResult::failure(Reason: "exposes returns twice");
2781 else if (HasDynamicAlloca)
2782 IR = InlineResult::failure(Reason: "dynamic alloca");
2783 else if (HasIndirectBr)
2784 IR = InlineResult::failure(Reason: "indirect branch");
2785 else if (HasUninlineableIntrinsic)
2786 IR = InlineResult::failure(Reason: "uninlinable intrinsic");
2787 else if (InitsVargArgs)
2788 IR = InlineResult::failure(Reason: "varargs");
2789 if (!IR.isSuccess()) {
2790 if (ORE)
2791 ORE->emit(RemarkBuilder: [&]() {
2792 return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
2793 &CandidateCall)
2794 << NV("Callee", &F) << " has uninlinable pattern ("
2795 << NV("InlineResult", IR.getFailureReason())
2796 << ") and cost is not fully computed";
2797 });
2798 return IR;
2799 }
2800
2801 // If the caller is a recursive function then we don't want to inline
2802 // functions which allocate a lot of stack space because it would increase
2803 // the caller stack usage dramatically.
2804 if (IsCallerRecursive && AllocatedSize > RecurStackSizeThreshold) {
2805 auto IR =
2806 InlineResult::failure(Reason: "recursive and allocates too much stack space");
2807 if (ORE)
2808 ORE->emit(RemarkBuilder: [&]() {
2809 return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
2810 &CandidateCall)
2811 << NV("Callee", &F) << " is "
2812 << NV("InlineResult", IR.getFailureReason())
2813 << ". Cost is not fully computed";
2814 });
2815 return IR;
2816 }
2817
2818 if (shouldStop())
2819 return InlineResult::failure(
2820 Reason: "Call site analysis is not favorable to inlining.");
2821 }
2822
2823 return InlineResult::success();
2824}
2825
2826/// Compute the base pointer and cumulative constant offsets for V.
2827///
2828/// This strips all constant offsets off of V, leaving it the base pointer, and
2829/// accumulates the total constant offset applied in the returned constant. It
2830/// returns 0 if V is not a pointer, and returns the constant '0' if there are
2831/// no constant offsets applied.
2832ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) {
2833 if (!V->getType()->isPointerTy())
2834 return nullptr;
2835
2836 unsigned AS = V->getType()->getPointerAddressSpace();
2837 unsigned IntPtrWidth = DL.getIndexSizeInBits(AS);
2838 APInt Offset = APInt::getZero(numBits: IntPtrWidth);
2839
2840 // Even though we don't look through PHI nodes, we could be called on an
2841 // instruction in an unreachable block, which may be on a cycle.
2842 SmallPtrSet<Value *, 4> Visited;
2843 Visited.insert(Ptr: V);
2844 do {
2845 if (GEPOperator *GEP = dyn_cast<GEPOperator>(Val: V)) {
2846 if (!GEP->isInBounds() || !accumulateGEPOffset(GEP&: *GEP, Offset))
2847 return nullptr;
2848 V = GEP->getPointerOperand();
2849 } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(Val: V)) {
2850 if (GA->isInterposable())
2851 break;
2852 V = GA->getAliasee();
2853 } else {
2854 break;
2855 }
2856 assert(V->getType()->isPointerTy() && "Unexpected operand type!");
2857 } while (Visited.insert(Ptr: V).second);
2858
2859 Type *IdxPtrTy = DL.getIndexType(PtrTy: V->getType());
2860 return cast<ConstantInt>(Val: ConstantInt::get(Ty: IdxPtrTy, V: Offset));
2861}
2862
2863/// Find dead blocks due to deleted CFG edges during inlining.
2864///
2865/// If we know the successor of the current block, \p CurrBB, has to be \p
2866/// NextBB, the other successors of \p CurrBB are dead if these successors have
2867/// no live incoming CFG edges. If one block is found to be dead, we can
2868/// continue growing the dead block list by checking the successors of the dead
2869/// blocks to see if all their incoming edges are dead or not.
2870void CallAnalyzer::findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB) {
2871 auto IsEdgeDead = [&](BasicBlock *Pred, BasicBlock *Succ) {
2872 // A CFG edge is dead if the predecessor is dead or the predecessor has a
2873 // known successor which is not the one under exam.
2874 if (DeadBlocks.count(Ptr: Pred))
2875 return true;
2876 BasicBlock *KnownSucc = KnownSuccessors[Pred];
2877 return KnownSucc && KnownSucc != Succ;
2878 };
2879
2880 auto IsNewlyDead = [&](BasicBlock *BB) {
2881 // If all the edges to a block are dead, the block is also dead.
2882 return (!DeadBlocks.count(Ptr: BB) &&
2883 llvm::all_of(Range: predecessors(BB),
2884 P: [&](BasicBlock *P) { return IsEdgeDead(P, BB); }));
2885 };
2886
2887 for (BasicBlock *Succ : successors(BB: CurrBB)) {
2888 if (Succ == NextBB || !IsNewlyDead(Succ))
2889 continue;
2890 SmallVector<BasicBlock *, 4> NewDead;
2891 NewDead.push_back(Elt: Succ);
2892 while (!NewDead.empty()) {
2893 BasicBlock *Dead = NewDead.pop_back_val();
2894 if (DeadBlocks.insert(Ptr: Dead).second)
2895 // Continue growing the dead block lists.
2896 for (BasicBlock *S : successors(BB: Dead))
2897 if (IsNewlyDead(S))
2898 NewDead.push_back(Elt: S);
2899 }
2900 }
2901}
2902
2903/// Analyze a call site for potential inlining.
2904///
2905/// Returns true if inlining this call is viable, and false if it is not
2906/// viable. It computes the cost and adjusts the threshold based on numerous
2907/// factors and heuristics. If this method returns false but the computed cost
2908/// is below the computed threshold, then inlining was forcibly disabled by
2909/// some artifact of the routine.
2910InlineResult CallAnalyzer::analyze() {
2911 ++NumCallsAnalyzed;
2912
2913 auto Result = onAnalysisStart();
2914 if (!Result.isSuccess())
2915 return Result;
2916
2917 if (F.empty())
2918 return InlineResult::success();
2919
2920 Function *Caller = CandidateCall.getFunction();
2921 // Check if the caller function is recursive itself.
2922 for (User *U : Caller->users()) {
2923 CallBase *Call = dyn_cast<CallBase>(Val: U);
2924 if (Call && Call->getFunction() == Caller) {
2925 IsCallerRecursive = true;
2926 break;
2927 }
2928 }
2929
2930 // Populate our simplified values by mapping from function arguments to call
2931 // arguments with known important simplifications.
2932 auto CAI = CandidateCall.arg_begin();
2933 for (Argument &FAI : F.args()) {
2934 assert(CAI != CandidateCall.arg_end());
2935 SimplifiedValues[&FAI] = *CAI;
2936 if (isa<Constant>(Val: *CAI))
2937 ++NumConstantArgs;
2938
2939 Value *PtrArg = *CAI;
2940 if (ConstantInt *C = stripAndComputeInBoundsConstantOffsets(V&: PtrArg)) {
2941 ConstantOffsetPtrs[&FAI] = std::make_pair(x&: PtrArg, y: C->getValue());
2942
2943 // We can SROA any pointer arguments derived from alloca instructions.
2944 if (auto *SROAArg = dyn_cast<AllocaInst>(Val: PtrArg)) {
2945 SROAArgValues[&FAI] = SROAArg;
2946 onInitializeSROAArg(Arg: SROAArg);
2947 EnabledSROAAllocas.insert(V: SROAArg);
2948 }
2949 }
2950 ++CAI;
2951 }
2952 NumConstantOffsetPtrArgs = ConstantOffsetPtrs.size();
2953 NumAllocaArgs = SROAArgValues.size();
2954
2955 // Collecting the ephemeral values of `F` can be expensive, so use the
2956 // ephemeral values cache if available.
2957 SmallPtrSet<const Value *, 32> EphValuesStorage;
2958 const SmallPtrSetImpl<const Value *> *EphValues = &EphValuesStorage;
2959 if (GetEphValuesCache)
2960 EphValues = &GetEphValuesCache(F).ephValues();
2961 else
2962 CodeMetrics::collectEphemeralValues(L: &F, AC: &GetAssumptionCache(F),
2963 EphValues&: EphValuesStorage);
2964
2965 // The worklist of live basic blocks in the callee *after* inlining. We avoid
2966 // adding basic blocks of the callee which can be proven to be dead for this
2967 // particular call site in order to get more accurate cost estimates. This
2968 // requires a somewhat heavyweight iteration pattern: we need to walk the
2969 // basic blocks in a breadth-first order as we insert live successors. To
2970 // accomplish this, prioritizing for small iterations because we exit after
2971 // crossing our threshold, we use a small-size optimized SetVector.
2972 typedef SmallSetVector<BasicBlock *, 16> BBSetVector;
2973 BBSetVector BBWorklist;
2974 BBWorklist.insert(X: &F.getEntryBlock());
2975
2976 // Note that we *must not* cache the size, this loop grows the worklist.
2977 for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
2978 if (shouldStop())
2979 break;
2980
2981 BasicBlock *BB = BBWorklist[Idx];
2982 if (BB->empty())
2983 continue;
2984
2985 onBlockStart(BB);
2986
2987 // Disallow inlining a blockaddress.
2988 // A blockaddress only has defined behavior for an indirect branch in the
2989 // same function, and we do not currently support inlining indirect
2990 // branches. But, the inliner may not see an indirect branch that ends up
2991 // being dead code at a particular call site. If the blockaddress escapes
2992 // the function, e.g., via a global variable, inlining may lead to an
2993 // invalid cross-function reference.
2994 // FIXME: pr/39560: continue relaxing this overt restriction.
2995 if (BB->hasAddressTaken())
2996 return InlineResult::failure(Reason: "blockaddress used");
2997
2998 // Analyze the cost of this block. If we blow through the threshold, this
2999 // returns false, and we can bail on out.
3000 InlineResult IR = analyzeBlock(BB, EphValues: *EphValues);
3001 if (!IR.isSuccess())
3002 return IR;
3003
3004 Instruction *TI = BB->getTerminator();
3005
3006 // Add in the live successors by first checking whether we have terminator
3007 // that may be simplified based on the values simplified by this call.
3008 if (CondBrInst *BI = dyn_cast<CondBrInst>(Val: TI)) {
3009 Value *Cond = BI->getCondition();
3010 if (ConstantInt *SimpleCond = getSimplifiedValue<ConstantInt>(V: Cond)) {
3011 BasicBlock *NextBB = BI->getSuccessor(i: SimpleCond->isZero() ? 1 : 0);
3012 BBWorklist.insert(X: NextBB);
3013 KnownSuccessors[BB] = NextBB;
3014 findDeadBlocks(CurrBB: BB, NextBB);
3015 continue;
3016 }
3017 } else if (SwitchInst *SI = dyn_cast<SwitchInst>(Val: TI)) {
3018 Value *Cond = SI->getCondition();
3019 if (ConstantInt *SimpleCond = getSimplifiedValue<ConstantInt>(V: Cond)) {
3020 BasicBlock *NextBB = SI->findCaseValue(C: SimpleCond)->getCaseSuccessor();
3021 BBWorklist.insert(X: NextBB);
3022 KnownSuccessors[BB] = NextBB;
3023 findDeadBlocks(CurrBB: BB, NextBB);
3024 continue;
3025 }
3026 }
3027
3028 // If we're unable to select a particular successor, just count all of
3029 // them.
3030 BBWorklist.insert_range(R: successors(BB));
3031
3032 onBlockAnalyzed(BB);
3033 }
3034
3035 // If this is a noduplicate call, we can still inline as long as
3036 // inlining this would cause the removal of the caller (so the instruction
3037 // is not actually duplicated, just moved).
3038 if (!isSoleCallToLocalFunction(CB: CandidateCall, Callee: F) && ContainsNoDuplicateCall)
3039 return InlineResult::failure(Reason: "noduplicate");
3040
3041 // If the callee's stack size exceeds the user-specified threshold,
3042 // do not let it be inlined.
3043 // The command line option overrides a limit set in the function attributes.
3044 size_t FinalStackSizeThreshold = StackSizeThreshold;
3045 if (!StackSizeThreshold.getNumOccurrences())
3046 if (std::optional<int> AttrMaxStackSize = getStringFnAttrAsInt(
3047 F: Caller, AttrKind: InlineConstants::MaxInlineStackSizeAttributeName))
3048 FinalStackSizeThreshold = *AttrMaxStackSize;
3049 if (AllocatedSize > FinalStackSizeThreshold)
3050 return InlineResult::failure(Reason: "stacksize");
3051
3052 return finalizeAnalysis();
3053}
3054
3055void InlineCostCallAnalyzer::print(raw_ostream &OS) {
3056#define DEBUG_PRINT_STAT(x) OS << " " #x ": " << x << "\n"
3057 if (PrintInstructionComments)
3058 F.print(OS, AAW: &Writer);
3059 DEBUG_PRINT_STAT(NumConstantArgs);
3060 DEBUG_PRINT_STAT(NumConstantOffsetPtrArgs);
3061 DEBUG_PRINT_STAT(NumAllocaArgs);
3062 DEBUG_PRINT_STAT(NumConstantPtrCmps);
3063 DEBUG_PRINT_STAT(NumConstantPtrDiffs);
3064 DEBUG_PRINT_STAT(NumInstructionsSimplified);
3065 DEBUG_PRINT_STAT(NumInstructions);
3066 DEBUG_PRINT_STAT(NumInlineAsmInstructions);
3067 DEBUG_PRINT_STAT(SROACostSavings);
3068 DEBUG_PRINT_STAT(SROACostSavingsLost);
3069 DEBUG_PRINT_STAT(LoadEliminationCost);
3070 DEBUG_PRINT_STAT(ContainsNoDuplicateCall);
3071 DEBUG_PRINT_STAT(Cost);
3072 DEBUG_PRINT_STAT(Threshold);
3073#undef DEBUG_PRINT_STAT
3074}
3075
3076#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3077/// Dump stats about this call's analysis.
3078LLVM_DUMP_METHOD void InlineCostCallAnalyzer::dump() { print(dbgs()); }
3079#endif
3080
3081/// Test that there are no attribute conflicts between Caller and Callee
3082/// that prevent inlining.
3083static bool functionsHaveCompatibleAttributes(
3084 Function *Caller, Function *Callee, TargetTransformInfo &TTI,
3085 function_ref<const TargetLibraryInfo &(Function &)> &GetTLI) {
3086 // Note that CalleeTLI must be a copy not a reference. The legacy pass manager
3087 // caches the most recently created TLI in the TargetLibraryInfoWrapperPass
3088 // object, and always returns the same object (which is overwritten on each
3089 // GetTLI call). Therefore we copy the first result.
3090 auto CalleeTLI = GetTLI(*Callee);
3091 return (IgnoreTTIInlineCompatible ||
3092 TTI.areInlineCompatible(Caller, Callee)) &&
3093 GetTLI(*Caller).areInlineCompatible(CalleeTLI,
3094 AllowCallerSuperset: InlineCallerSupersetNoBuiltin) &&
3095 AttributeFuncs::areInlineCompatible(Caller: *Caller, Callee: *Callee);
3096}
3097
3098int llvm::getCallsiteCost(const TargetTransformInfo &TTI, const CallBase &Call,
3099 const DataLayout &DL) {
3100 int64_t Cost = 0;
3101 for (unsigned I = 0, E = Call.arg_size(); I != E; ++I) {
3102 if (Call.isByValArgument(ArgNo: I)) {
3103 // We approximate the number of loads and stores needed by dividing the
3104 // size of the byval type by the target's pointer size.
3105 PointerType *PTy = cast<PointerType>(Val: Call.getArgOperand(i: I)->getType());
3106 unsigned TypeSize = DL.getTypeSizeInBits(Ty: Call.getParamByValType(ArgNo: I));
3107 unsigned AS = PTy->getAddressSpace();
3108 unsigned PointerSize = DL.getPointerSizeInBits(AS);
3109 // Ceiling division.
3110 unsigned NumStores = (TypeSize + PointerSize - 1) / PointerSize;
3111
3112 // If it generates more than 8 stores it is likely to be expanded as an
3113 // inline memcpy so we take that as an upper bound. Otherwise we assume
3114 // one load and one store per word copied.
3115 // FIXME: The maxStoresPerMemcpy setting from the target should be used
3116 // here instead of a magic number of 8, but it's not available via
3117 // DataLayout.
3118 NumStores = std::min(a: NumStores, b: 8U);
3119
3120 Cost += 2 * NumStores * InstrCost;
3121 } else {
3122 // For non-byval arguments subtract off one instruction per call
3123 // argument.
3124 Cost += InstrCost;
3125 }
3126 }
3127 // The call instruction also disappears after inlining.
3128 Cost += InstrCost;
3129 Cost += TTI.getInlineCallPenalty(F: Call.getCaller(), Call, DefaultCallPenalty: CallPenalty);
3130
3131 return std::min<int64_t>(a: Cost, INT_MAX);
3132}
3133
3134InlineCost llvm::getInlineCost(
3135 CallBase &Call, const InlineParams &Params, TargetTransformInfo &CalleeTTI,
3136 function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
3137 function_ref<const TargetLibraryInfo &(Function &)> GetTLI,
3138 function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
3139 ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE,
3140 function_ref<EphemeralValuesCache &(Function &)> GetEphValuesCache) {
3141 return getInlineCost(Call, Callee: Call.getCalledFunction(), Params, CalleeTTI,
3142 GetAssumptionCache, GetTLI, GetBFI, PSI, ORE,
3143 GetEphValuesCache);
3144}
3145
3146std::optional<int> llvm::getInliningCostEstimate(
3147 CallBase &Call, TargetTransformInfo &CalleeTTI,
3148 function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
3149 function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
3150 function_ref<const TargetLibraryInfo &(Function &)> GetTLI,
3151 ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
3152 const InlineParams Params = {/* DefaultThreshold*/ 0,
3153 /*HintThreshold*/ {},
3154 /*OptSizeHintThreshold*/ {},
3155 /*ColdThreshold*/ {},
3156 /*OptSizeThreshold*/ {},
3157 /*OptMinSizeThreshold*/ {},
3158 /*HotCallSiteThreshold*/ {},
3159 /*LocallyHotCallSiteThreshold*/ {},
3160 /*ColdCallSiteThreshold*/ {},
3161 /*ComputeFullInlineCost*/ true,
3162 /*EnableDeferral*/ true};
3163
3164 InlineCostCallAnalyzer CA(*Call.getCalledFunction(), Call, Params, CalleeTTI,
3165 GetAssumptionCache, GetBFI, GetTLI, PSI, ORE, true,
3166 /*IgnoreThreshold*/ true);
3167 auto R = CA.analyze();
3168 if (!R.isSuccess())
3169 return std::nullopt;
3170 return CA.getCost();
3171}
3172
3173std::optional<InlineCostFeatures> llvm::getInliningCostFeatures(
3174 CallBase &Call, TargetTransformInfo &CalleeTTI,
3175 function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
3176 function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
3177 function_ref<const TargetLibraryInfo &(Function &)> GetTLI,
3178 ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
3179 InlineCostFeaturesAnalyzer CFA(CalleeTTI, GetAssumptionCache, GetBFI, GetTLI,
3180 PSI, ORE, *Call.getCalledFunction(), Call);
3181 auto R = CFA.analyze();
3182 if (!R.isSuccess())
3183 return std::nullopt;
3184 return CFA.features();
3185}
3186
3187std::optional<InlineResult> llvm::getAttributeBasedInliningDecision(
3188 CallBase &Call, Function *Callee, TargetTransformInfo &CalleeTTI,
3189 function_ref<const TargetLibraryInfo &(Function &)> GetTLI) {
3190
3191 // Cannot inline indirect calls.
3192 if (!Callee)
3193 return InlineResult::failure(Reason: "indirect call");
3194
3195 // When callee coroutine function is inlined into caller coroutine function
3196 // before coro-split pass,
3197 // coro-early pass can not handle this quiet well.
3198 // So we won't inline the coroutine function if it have not been unsplited
3199 if (Callee->isPresplitCoroutine())
3200 return InlineResult::failure(Reason: "unsplited coroutine call");
3201
3202 // Never inline calls with byval arguments that does not have the alloca
3203 // address space. Since byval arguments can be replaced with a copy to an
3204 // alloca, the inlined code would need to be adjusted to handle that the
3205 // argument is in the alloca address space (so it is a little bit complicated
3206 // to solve).
3207 unsigned AllocaAS = Callee->getDataLayout().getAllocaAddrSpace();
3208 for (unsigned I = 0, E = Call.arg_size(); I != E; ++I)
3209 if (Call.isByValArgument(ArgNo: I)) {
3210 PointerType *PTy = cast<PointerType>(Val: Call.getArgOperand(i: I)->getType());
3211 if (PTy->getAddressSpace() != AllocaAS)
3212 return InlineResult::failure(Reason: "byval arguments without alloca"
3213 " address space");
3214 }
3215
3216 // Calls to functions with always-inline attributes should be inlined
3217 // whenever possible.
3218 if (Call.hasFnAttr(Kind: Attribute::AlwaysInline)) {
3219 if (Call.getAttributes().hasFnAttr(Kind: Attribute::NoInline))
3220 return InlineResult::failure(Reason: "noinline call site attribute");
3221
3222 auto IsViable = isInlineViable(Callee&: *Callee);
3223 if (IsViable.isSuccess())
3224 return InlineResult::success();
3225 return InlineResult::failure(Reason: IsViable.getFailureReason());
3226 }
3227
3228 // Never inline functions with conflicting attributes (unless callee has
3229 // always-inline attribute).
3230 Function *Caller = Call.getCaller();
3231 if (!functionsHaveCompatibleAttributes(Caller, Callee, TTI&: CalleeTTI, GetTLI))
3232 return InlineResult::failure(Reason: "conflicting attributes");
3233
3234 // Flatten: inline all viable calls from flatten functions regardless of cost.
3235 // Checked before optnone so that flatten takes priority.
3236 if (Caller->hasFnAttribute(Kind: Attribute::Flatten)) {
3237 auto IsViable = isInlineViable(Callee&: *Callee);
3238 if (IsViable.isSuccess())
3239 return InlineResult::success();
3240 return InlineResult::failure(Reason: IsViable.getFailureReason());
3241 }
3242
3243 // Don't inline this call if the caller has the optnone attribute.
3244 if (Caller->hasOptNone())
3245 return InlineResult::failure(Reason: "optnone attribute");
3246
3247 // Don't inline functions which can be interposed at link-time.
3248 if (Callee->isInterposable(/*CheckNoIPA=*/false))
3249 return InlineResult::failure(Reason: "interposable");
3250
3251 // Don't inline functions marked noinline.
3252 if (Callee->hasFnAttribute(Kind: Attribute::NoInline))
3253 return InlineResult::failure(Reason: "noinline function attribute");
3254
3255 // Don't inline call sites marked noinline.
3256 if (Call.isNoInline())
3257 return InlineResult::failure(Reason: "noinline call site attribute");
3258
3259 // Don't inline functions that are loader replaceable.
3260 if (Callee->hasFnAttribute(Kind: "loader-replaceable"))
3261 return InlineResult::failure(Reason: "loader replaceable function attribute");
3262
3263 return std::nullopt;
3264}
3265
3266InlineCost llvm::getInlineCost(
3267 CallBase &Call, Function *Callee, const InlineParams &Params,
3268 TargetTransformInfo &CalleeTTI,
3269 function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
3270 function_ref<const TargetLibraryInfo &(Function &)> GetTLI,
3271 function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
3272 ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE,
3273 function_ref<EphemeralValuesCache &(Function &)> GetEphValuesCache) {
3274
3275 auto UserDecision =
3276 llvm::getAttributeBasedInliningDecision(Call, Callee, CalleeTTI, GetTLI);
3277
3278 if (UserDecision) {
3279 if (UserDecision->isSuccess())
3280 return llvm::InlineCost::getAlways(Reason: "always inline attribute");
3281 return llvm::InlineCost::getNever(Reason: UserDecision->getFailureReason());
3282 }
3283
3284 if (InlineAllViableCalls && isInlineViable(Callee&: *Callee).isSuccess())
3285 return llvm::InlineCost::getAlways(
3286 Reason: "Inlining forced by -inline-all-viable-calls");
3287
3288 LLVM_DEBUG(llvm::dbgs() << " Analyzing call of " << Callee->getName()
3289 << "... (caller:" << Call.getCaller()->getName()
3290 << ")\n");
3291
3292 InlineCostCallAnalyzer CA(*Callee, Call, Params, CalleeTTI,
3293 GetAssumptionCache, GetBFI, GetTLI, PSI, ORE,
3294 /*BoostIndirect=*/true, /*IgnoreThreshold=*/false,
3295 GetEphValuesCache);
3296 InlineResult ShouldInline = CA.analyze();
3297
3298 LLVM_DEBUG(CA.dump());
3299
3300 // Always make cost benefit based decision explicit.
3301 // We use always/never here since threshold is not meaningful,
3302 // as it's not what drives cost-benefit analysis.
3303 if (CA.wasDecidedByCostBenefit()) {
3304 if (ShouldInline.isSuccess())
3305 return InlineCost::getAlways(Reason: "benefit over cost",
3306 CostBenefit: CA.getCostBenefitPair());
3307 else
3308 return InlineCost::getNever(Reason: "cost over benefit", CostBenefit: CA.getCostBenefitPair());
3309 }
3310
3311 if (CA.wasDecidedByCostThreshold())
3312 return InlineCost::get(Cost: CA.getCost(), Threshold: CA.getThreshold(),
3313 StaticBonus: CA.getStaticBonusApplied());
3314
3315 // No details on how the decision was made, simply return always or never.
3316 return ShouldInline.isSuccess()
3317 ? InlineCost::getAlways(Reason: "empty function")
3318 : InlineCost::getNever(Reason: ShouldInline.getFailureReason());
3319}
3320
3321InlineResult llvm::isInlineViable(Function &F) {
3322 bool ReturnsTwice = F.hasFnAttribute(Kind: Attribute::ReturnsTwice);
3323 for (BasicBlock &BB : F) {
3324 // Disallow inlining of functions which contain indirect branches.
3325 if (isa<IndirectBrInst>(Val: BB.getTerminator()))
3326 return InlineResult::failure(Reason: "contains indirect branches");
3327
3328 // Disallow inlining of blockaddresses.
3329 if (BB.hasAddressTaken())
3330 return InlineResult::failure(Reason: "blockaddress used");
3331
3332 for (auto &II : BB) {
3333 CallBase *Call = dyn_cast<CallBase>(Val: &II);
3334 if (!Call)
3335 continue;
3336
3337 // Disallow recursive calls.
3338 Function *Callee = Call->getCalledFunction();
3339 if (&F == Callee)
3340 return InlineResult::failure(Reason: "recursive call");
3341
3342 // Disallow calls which expose returns-twice to a function not previously
3343 // attributed as such.
3344 if (!ReturnsTwice && isa<CallInst>(Val: Call) &&
3345 cast<CallInst>(Val: Call)->canReturnTwice())
3346 return InlineResult::failure(Reason: "exposes returns-twice attribute");
3347
3348 if (Callee)
3349 switch (Callee->getIntrinsicID()) {
3350 default:
3351 break;
3352 case llvm::Intrinsic::icall_branch_funnel:
3353 // Disallow inlining of @llvm.icall.branch.funnel because current
3354 // backend can't separate call targets from call arguments.
3355 return InlineResult::failure(
3356 Reason: "disallowed inlining of @llvm.icall.branch.funnel");
3357 case llvm::Intrinsic::localescape:
3358 // Disallow inlining functions that call @llvm.localescape. Doing this
3359 // correctly would require major changes to the inliner.
3360 return InlineResult::failure(
3361 Reason: "disallowed inlining of @llvm.localescape");
3362 case llvm::Intrinsic::vastart:
3363 // Disallow inlining of functions that initialize VarArgs with
3364 // va_start.
3365 return InlineResult::failure(
3366 Reason: "contains VarArgs initialized with va_start");
3367 }
3368 }
3369 }
3370
3371 return InlineResult::success();
3372}
3373
3374// APIs to create InlineParams based on command line flags and/or other
3375// parameters.
3376
3377InlineParams llvm::getInlineParams(int Threshold) {
3378 InlineParams Params;
3379
3380 // This field is the threshold to use for a callee by default. This is
3381 // derived from one or more of:
3382 // * optimization or size-optimization levels,
3383 // * a value passed to createFunctionInliningPass function, or
3384 // * the -inline-threshold flag.
3385 // If the -inline-threshold flag is explicitly specified, that is used
3386 // irrespective of anything else.
3387 if (InlineThreshold.getNumOccurrences() > 0)
3388 Params.DefaultThreshold = InlineThreshold;
3389 else
3390 Params.DefaultThreshold = Threshold;
3391
3392 // Set the HintThreshold knob from the -inlinehint-threshold.
3393 Params.HintThreshold = HintThreshold;
3394 // Use same threshold for optsize by default.
3395 Params.OptSizeHintThreshold = HintThreshold;
3396
3397 // Set the HotCallSiteThreshold knob from the -hot-callsite-threshold.
3398 Params.HotCallSiteThreshold = HotCallSiteThreshold;
3399
3400 // If the -locally-hot-callsite-threshold is explicitly specified, use it to
3401 // populate LocallyHotCallSiteThreshold. Later, we populate
3402 // Params.LocallyHotCallSiteThreshold from -locally-hot-callsite-threshold if
3403 // we know that optimization level is O3 (in the getInlineParams variant that
3404 // takes the opt and size levels).
3405 // FIXME: Remove this check (and make the assignment unconditional) after
3406 // addressing size regression issues at O2.
3407 if (LocallyHotCallSiteThreshold.getNumOccurrences() > 0)
3408 Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold;
3409
3410 // Set the ColdCallSiteThreshold knob from the
3411 // -inline-cold-callsite-threshold.
3412 Params.ColdCallSiteThreshold = ColdCallSiteThreshold;
3413
3414 // Set the OptMinSizeThreshold and OptSizeThreshold params only if the
3415 // -inlinehint-threshold commandline option is not explicitly given. If that
3416 // option is present, then its value applies even for callees with size and
3417 // minsize attributes.
3418 // If the -inline-threshold is not specified, set the ColdThreshold from the
3419 // -inlinecold-threshold even if it is not explicitly passed. If
3420 // -inline-threshold is specified, then -inlinecold-threshold needs to be
3421 // explicitly specified to set the ColdThreshold knob
3422 if (InlineThreshold.getNumOccurrences() == 0) {
3423 Params.OptMinSizeThreshold = InlineConstants::OptMinSizeThreshold;
3424 Params.OptSizeThreshold = InlineConstants::OptSizeThreshold;
3425 Params.ColdThreshold = ColdThreshold;
3426 } else if (ColdThreshold.getNumOccurrences() > 0) {
3427 Params.ColdThreshold = ColdThreshold;
3428 }
3429 return Params;
3430}
3431
3432InlineParams llvm::getInlineParams() {
3433 return getInlineParams(Threshold: DefaultThreshold);
3434}
3435
3436InlineParams llvm::getInlineParamsFromOptLevel(unsigned OptLevel) {
3437 auto Params =
3438 getInlineParams(Threshold: OptLevel > 2 ? InlineConstants::OptAggressiveThreshold
3439 : DefaultThreshold);
3440 // At O3, use the value of -locally-hot-callsite-threshold option to populate
3441 // Params.LocallyHotCallSiteThreshold. Below O3, this flag has effect only
3442 // when it is specified explicitly.
3443 if (OptLevel > 2)
3444 Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold;
3445 return Params;
3446}
3447
3448PreservedAnalyses
3449InlineCostAnnotationPrinterPass::run(Function &F,
3450 FunctionAnalysisManager &FAM) {
3451 PrintInstructionComments = true;
3452 std::function<AssumptionCache &(Function &)> GetAssumptionCache =
3453 [&](Function &F) -> AssumptionCache & {
3454 return FAM.getResult<AssumptionAnalysis>(IR&: F);
3455 };
3456
3457 auto &MAMProxy = FAM.getResult<ModuleAnalysisManagerFunctionProxy>(IR&: F);
3458 ProfileSummaryInfo *PSI =
3459 MAMProxy.getCachedResult<ProfileSummaryAnalysis>(IR&: *F.getParent());
3460 const TargetTransformInfo &TTI = FAM.getResult<TargetIRAnalysis>(IR&: F);
3461
3462 // FIXME: Redesign the usage of InlineParams to expand the scope of this pass.
3463 // In the current implementation, the type of InlineParams doesn't matter as
3464 // the pass serves only for verification of inliner's decisions.
3465 // We can add a flag which determines InlineParams for this run. Right now,
3466 // the default InlineParams are used.
3467 const InlineParams Params = llvm::getInlineParams();
3468 for (BasicBlock &BB : F) {
3469 for (Instruction &I : BB) {
3470 if (auto *CB = dyn_cast<CallBase>(Val: &I)) {
3471 Function *CalledFunction = CB->getCalledFunction();
3472 if (!CalledFunction || CalledFunction->isDeclaration())
3473 continue;
3474 OptimizationRemarkEmitter ORE(CalledFunction);
3475 InlineCostCallAnalyzer ICCA(*CalledFunction, *CB, Params, TTI,
3476 GetAssumptionCache, nullptr, nullptr, PSI,
3477 &ORE);
3478 ICCA.analyze();
3479 OS << " Analyzing call of " << CalledFunction->getName()
3480 << "... (caller:" << CB->getCaller()->getName() << ")\n";
3481 ICCA.print(OS);
3482 OS << "\n";
3483 }
3484 }
3485 }
3486 return PreservedAnalyses::all();
3487}
3488