1//===- InlineCost.cpp - Cost analysis for inliner -------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements inline cost analysis.
10//
11//===----------------------------------------------------------------------===//
12
13#include "llvm/Analysis/InlineCost.h"
14#include "llvm/ADT/STLExtras.h"
15#include "llvm/ADT/SetVector.h"
16#include "llvm/ADT/SmallPtrSet.h"
17#include "llvm/ADT/SmallVector.h"
18#include "llvm/ADT/Statistic.h"
19#include "llvm/Analysis/AssumptionCache.h"
20#include "llvm/Analysis/BlockFrequencyInfo.h"
21#include "llvm/Analysis/CodeMetrics.h"
22#include "llvm/Analysis/ConstantFolding.h"
23#include "llvm/Analysis/DomConditionCache.h"
24#include "llvm/Analysis/EphemeralValuesCache.h"
25#include "llvm/Analysis/InstructionSimplify.h"
26#include "llvm/Analysis/LoopInfo.h"
27#include "llvm/Analysis/MemoryBuiltins.h"
28#include "llvm/Analysis/OptimizationRemarkEmitter.h"
29#include "llvm/Analysis/ProfileSummaryInfo.h"
30#include "llvm/Analysis/TargetLibraryInfo.h"
31#include "llvm/Analysis/TargetTransformInfo.h"
32#include "llvm/Analysis/ValueTracking.h"
33#include "llvm/Config/llvm-config.h"
34#include "llvm/IR/AssemblyAnnotationWriter.h"
35#include "llvm/IR/CallingConv.h"
36#include "llvm/IR/DataLayout.h"
37#include "llvm/IR/Dominators.h"
38#include "llvm/IR/GetElementPtrTypeIterator.h"
39#include "llvm/IR/GlobalAlias.h"
40#include "llvm/IR/InlineAsm.h"
41#include "llvm/IR/InstVisitor.h"
42#include "llvm/IR/IntrinsicInst.h"
43#include "llvm/IR/Operator.h"
44#include "llvm/IR/PatternMatch.h"
45#include "llvm/Support/CommandLine.h"
46#include "llvm/Support/Debug.h"
47#include "llvm/Support/FormattedStream.h"
48#include "llvm/Support/raw_ostream.h"
49#include <climits>
50#include <limits>
51#include <optional>
52
53using namespace llvm;
54
55#define DEBUG_TYPE "inline-cost"
56
57STATISTIC(NumCallsAnalyzed, "Number of call sites analyzed");
58
59static cl::opt<int>
60 DefaultThreshold("inlinedefault-threshold", cl::Hidden, cl::init(Val: 225),
61 cl::desc("Default amount of inlining to perform"));
62
63// We introduce this option since there is a minor compile-time win by avoiding
64// addition of TTI attributes (target-features in particular) to inline
65// candidates when they are guaranteed to be the same as top level methods in
66// some use cases. If we avoid adding the attribute, we need an option to avoid
67// checking these attributes.
68static cl::opt<bool> IgnoreTTIInlineCompatible(
69 "ignore-tti-inline-compatible", cl::Hidden, cl::init(Val: false),
70 cl::desc("Ignore TTI attributes compatibility check between callee/caller "
71 "during inline cost calculation"));
72
73static cl::opt<bool> PrintInstructionComments(
74 "print-instruction-comments", cl::Hidden, cl::init(Val: false),
75 cl::desc("Prints comments for instruction based on inline cost analysis"));
76
77static cl::opt<int> InlineThreshold(
78 "inline-threshold", cl::Hidden, cl::init(Val: 225),
79 cl::desc("Control the amount of inlining to perform (default = 225)"));
80
81static cl::opt<int> HintThreshold(
82 "inlinehint-threshold", cl::Hidden, cl::init(Val: 325),
83 cl::desc("Threshold for inlining functions with inline hint"));
84
85static cl::opt<int>
86 ColdCallSiteThreshold("inline-cold-callsite-threshold", cl::Hidden,
87 cl::init(Val: 45),
88 cl::desc("Threshold for inlining cold callsites"));
89
90static cl::opt<bool> InlineEnableCostBenefitAnalysis(
91 "inline-enable-cost-benefit-analysis", cl::Hidden, cl::init(Val: false),
92 cl::desc("Enable the cost-benefit analysis for the inliner"));
93
94// InlineSavingsMultiplier overrides per TTI multipliers iff it is
95// specified explicitly in command line options. This option is exposed
96// for tuning and testing.
97static cl::opt<int> InlineSavingsMultiplier(
98 "inline-savings-multiplier", cl::Hidden, cl::init(Val: 8),
99 cl::desc("Multiplier to multiply cycle savings by during inlining"));
100
101// InlineSavingsProfitableMultiplier overrides per TTI multipliers iff it is
102// specified explicitly in command line options. This option is exposed
103// for tuning and testing.
104static cl::opt<int> InlineSavingsProfitableMultiplier(
105 "inline-savings-profitable-multiplier", cl::Hidden, cl::init(Val: 4),
106 cl::desc("A multiplier on top of cycle savings to decide whether the "
107 "savings won't justify the cost"));
108
109static cl::opt<int>
110 InlineSizeAllowance("inline-size-allowance", cl::Hidden, cl::init(Val: 100),
111 cl::desc("The maximum size of a callee that get's "
112 "inlined without sufficient cycle savings"));
113
114// We introduce this threshold to help performance of instrumentation based
115// PGO before we actually hook up inliner with analysis passes such as BPI and
116// BFI.
117static cl::opt<int> ColdThreshold(
118 "inlinecold-threshold", cl::Hidden, cl::init(Val: 45),
119 cl::desc("Threshold for inlining functions with cold attribute"));
120
121static cl::opt<int>
122 HotCallSiteThreshold("hot-callsite-threshold", cl::Hidden, cl::init(Val: 3000),
123 cl::desc("Threshold for hot callsites "));
124
125static cl::opt<int> LocallyHotCallSiteThreshold(
126 "locally-hot-callsite-threshold", cl::Hidden, cl::init(Val: 525),
127 cl::desc("Threshold for locally hot callsites "));
128
129static cl::opt<int> ColdCallSiteRelFreq(
130 "cold-callsite-rel-freq", cl::Hidden, cl::init(Val: 2),
131 cl::desc("Maximum block frequency, expressed as a percentage of caller's "
132 "entry frequency, for a callsite to be cold in the absence of "
133 "profile information."));
134
135static cl::opt<uint64_t> HotCallSiteRelFreq(
136 "hot-callsite-rel-freq", cl::Hidden, cl::init(Val: 60),
137 cl::desc("Minimum block frequency, expressed as a multiple of caller's "
138 "entry frequency, for a callsite to be hot in the absence of "
139 "profile information."));
140
141static cl::opt<int>
142 InstrCost("inline-instr-cost", cl::Hidden, cl::init(Val: 5),
143 cl::desc("Cost of a single instruction when inlining"));
144
145static cl::opt<int> InlineAsmInstrCost(
146 "inline-asm-instr-cost", cl::Hidden, cl::init(Val: 0),
147 cl::desc("Cost of a single inline asm instruction when inlining"));
148
149static cl::opt<int>
150 MemAccessCost("inline-memaccess-cost", cl::Hidden, cl::init(Val: 0),
151 cl::desc("Cost of load/store instruction when inlining"));
152
153static cl::opt<int> CallPenalty(
154 "inline-call-penalty", cl::Hidden, cl::init(Val: 25),
155 cl::desc("Call penalty that is applied per callsite when inlining"));
156
157static cl::opt<size_t>
158 StackSizeThreshold("inline-max-stacksize", cl::Hidden,
159 cl::init(Val: std::numeric_limits<size_t>::max()),
160 cl::desc("Do not inline functions with a stack size "
161 "that exceeds the specified limit"));
162
163static cl::opt<size_t> RecurStackSizeThreshold(
164 "recursive-inline-max-stacksize", cl::Hidden,
165 cl::init(Val: InlineConstants::TotalAllocaSizeRecursiveCaller),
166 cl::desc("Do not inline recursive functions with a stack "
167 "size that exceeds the specified limit"));
168
169static cl::opt<bool> OptComputeFullInlineCost(
170 "inline-cost-full", cl::Hidden,
171 cl::desc("Compute the full inline cost of a call site even when the cost "
172 "exceeds the threshold."));
173
174static cl::opt<bool> InlineCallerSupersetNoBuiltin(
175 "inline-caller-superset-nobuiltin", cl::Hidden, cl::init(Val: true),
176 cl::desc("Allow inlining when caller has a superset of callee's nobuiltin "
177 "attributes."));
178
179static cl::opt<bool> DisableGEPConstOperand(
180 "disable-gep-const-evaluation", cl::Hidden, cl::init(Val: false),
181 cl::desc("Disables evaluation of GetElementPtr with constant operands"));
182
183static cl::opt<bool> InlineAllViableCalls(
184 "inline-all-viable-calls", cl::Hidden, cl::init(Val: false),
185 cl::desc("Inline all viable calls, even if they exceed the inlining "
186 "threshold"));
187namespace llvm {
188std::optional<int> getStringFnAttrAsInt(const Attribute &Attr) {
189 if (Attr.isValid()) {
190 int AttrValue = 0;
191 if (!Attr.getValueAsString().getAsInteger(Radix: 10, Result&: AttrValue))
192 return AttrValue;
193 }
194 return std::nullopt;
195}
196
197std::optional<int> getStringFnAttrAsInt(CallBase &CB, StringRef AttrKind) {
198 return getStringFnAttrAsInt(Attr: CB.getFnAttr(Kind: AttrKind));
199}
200
201std::optional<int> getStringFnAttrAsInt(Function *F, StringRef AttrKind) {
202 return getStringFnAttrAsInt(Attr: F->getFnAttribute(Kind: AttrKind));
203}
204
205namespace InlineConstants {
206int getInstrCost() { return InstrCost; }
207
208} // namespace InlineConstants
209
210} // namespace llvm
211
212namespace {
213class InlineCostCallAnalyzer;
214
215// This struct is used to store information about inline cost of a
216// particular instruction
217struct InstructionCostDetail {
218 int CostBefore = 0;
219 int CostAfter = 0;
220 int ThresholdBefore = 0;
221 int ThresholdAfter = 0;
222
223 int getThresholdDelta() const { return ThresholdAfter - ThresholdBefore; }
224
225 int getCostDelta() const { return CostAfter - CostBefore; }
226
227 bool hasThresholdChanged() const { return ThresholdAfter != ThresholdBefore; }
228};
229
230class InlineCostAnnotationWriter : public AssemblyAnnotationWriter {
231private:
232 InlineCostCallAnalyzer *const ICCA;
233
234public:
235 InlineCostAnnotationWriter(InlineCostCallAnalyzer *ICCA) : ICCA(ICCA) {}
236 void emitInstructionAnnot(const Instruction *I,
237 formatted_raw_ostream &OS) override;
238};
239
240/// Carry out call site analysis, in order to evaluate inlinability.
241/// NOTE: the type is currently used as implementation detail of functions such
242/// as llvm::getInlineCost. Note the function_ref constructor parameters - the
243/// expectation is that they come from the outer scope, from the wrapper
244/// functions. If we want to support constructing CallAnalyzer objects where
245/// lambdas are provided inline at construction, or where the object needs to
246/// otherwise survive past the scope of the provided functions, we need to
247/// revisit the argument types.
248class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
249 typedef InstVisitor<CallAnalyzer, bool> Base;
250 friend class InstVisitor<CallAnalyzer, bool>;
251
252protected:
253 virtual ~CallAnalyzer() = default;
254 /// The TargetTransformInfo available for this compilation.
255 const TargetTransformInfo &TTI;
256
257 /// Getter for the cache of @llvm.assume intrinsics.
258 function_ref<AssumptionCache &(Function &)> GetAssumptionCache;
259
260 /// Getter for BlockFrequencyInfo
261 function_ref<BlockFrequencyInfo &(Function &)> GetBFI;
262
263 /// Getter for TargetLibraryInfo
264 function_ref<const TargetLibraryInfo &(Function &)> GetTLI;
265
266 /// Profile summary information.
267 ProfileSummaryInfo *PSI;
268
269 /// The called function.
270 Function &F;
271
272 // Cache the DataLayout since we use it a lot.
273 const DataLayout &DL;
274
275 /// The OptimizationRemarkEmitter available for this compilation.
276 OptimizationRemarkEmitter *ORE;
277
278 /// The candidate callsite being analyzed. Please do not use this to do
279 /// analysis in the caller function; we want the inline cost query to be
280 /// easily cacheable. Instead, use the cover function paramHasAttr.
281 CallBase &CandidateCall;
282
283 /// Getter for the cache of ephemeral values.
284 function_ref<EphemeralValuesCache &(Function &)> GetEphValuesCache = nullptr;
285
286 /// Extension points for handling callsite features.
287 // Called before a basic block was analyzed.
288 virtual void onBlockStart(const BasicBlock *BB) {}
289
290 /// Called after a basic block was analyzed.
291 virtual void onBlockAnalyzed(const BasicBlock *BB) {}
292
293 /// Called before an instruction was analyzed
294 virtual void onInstructionAnalysisStart(const Instruction *I) {}
295
296 /// Called after an instruction was analyzed
297 virtual void onInstructionAnalysisFinish(const Instruction *I) {}
298
299 /// Called at the end of the analysis of the callsite. Return the outcome of
300 /// the analysis, i.e. 'InlineResult(true)' if the inlining may happen, or
301 /// the reason it can't.
302 virtual InlineResult finalizeAnalysis() { return InlineResult::success(); }
303 /// Called when we're about to start processing a basic block, and every time
304 /// we are done processing an instruction. Return true if there is no point in
305 /// continuing the analysis (e.g. we've determined already the call site is
306 /// too expensive to inline)
307 virtual bool shouldStop() { return false; }
308
309 /// Called before the analysis of the callee body starts (with callsite
310 /// contexts propagated). It checks callsite-specific information. Return a
311 /// reason analysis can't continue if that's the case, or 'true' if it may
312 /// continue.
313 virtual InlineResult onAnalysisStart() { return InlineResult::success(); }
314 /// Called if the analysis engine decides SROA cannot be done for the given
315 /// alloca.
316 virtual void onDisableSROA(AllocaInst *Arg) {}
317
318 /// Called the analysis engine determines load elimination won't happen.
319 virtual void onDisableLoadElimination() {}
320
321 /// Called when we visit a CallBase, before the analysis starts. Return false
322 /// to stop further processing of the instruction.
323 virtual bool onCallBaseVisitStart(CallBase &Call) { return true; }
324
325 /// Called to account for a call.
326 virtual void onCallPenalty() {}
327
328 /// Called to account for a load or store.
329 virtual void onMemAccess(){};
330
331 /// Called to account for the expectation the inlining would result in a load
332 /// elimination.
333 virtual void onLoadEliminationOpportunity() {}
334
335 /// Called to account for the cost of argument setup for the Call in the
336 /// callee's body (not the callsite currently under analysis).
337 virtual void onCallArgumentSetup(const CallBase &Call) {}
338
339 /// Called to account for a load relative intrinsic.
340 virtual void onLoadRelativeIntrinsic() {}
341
342 /// Called to account for a lowered call.
343 virtual void onLoweredCall(Function *F, CallBase &Call, bool IsIndirectCall) {
344 }
345
346 /// Account for a jump table of given size. Return false to stop further
347 /// processing the switch instruction
348 virtual bool onJumpTable(unsigned JumpTableSize) { return true; }
349
350 /// Account for a case cluster of given size. Return false to stop further
351 /// processing of the instruction.
352 virtual bool onCaseCluster(unsigned NumCaseCluster) { return true; }
353
354 /// Called at the end of processing a switch instruction, with the given
355 /// number of case clusters.
356 virtual void onFinalizeSwitch(unsigned JumpTableSize, unsigned NumCaseCluster,
357 bool DefaultDestUnreachable) {}
358
359 /// Called to account for any other instruction not specifically accounted
360 /// for.
361 virtual void onMissedSimplification() {}
362
363 /// Account for inline assembly instructions.
364 virtual void onInlineAsm(const InlineAsm &Arg) {}
365
366 /// Start accounting potential benefits due to SROA for the given alloca.
367 virtual void onInitializeSROAArg(AllocaInst *Arg) {}
368
369 /// Account SROA savings for the AllocaInst value.
370 virtual void onAggregateSROAUse(AllocaInst *V) {}
371
372 bool handleSROA(Value *V, bool DoNotDisable) {
373 // Check for SROA candidates in comparisons.
374 if (auto *SROAArg = getSROAArgForValueOrNull(V)) {
375 if (DoNotDisable) {
376 onAggregateSROAUse(V: SROAArg);
377 return true;
378 }
379 disableSROAForArg(SROAArg);
380 }
381 return false;
382 }
383
384 bool IsCallerRecursive = false;
385 bool IsRecursiveCall = false;
386 bool ExposesReturnsTwice = false;
387 bool HasDynamicAlloca = false;
388 bool ContainsNoDuplicateCall = false;
389 bool HasReturn = false;
390 bool HasIndirectBr = false;
391 bool HasUninlineableIntrinsic = false;
392 bool InitsVargArgs = false;
393
394 /// Number of bytes allocated statically by the callee.
395 uint64_t AllocatedSize = 0;
396 unsigned NumInstructions = 0;
397 unsigned NumInlineAsmInstructions = 0;
398 unsigned NumVectorInstructions = 0;
399
400 /// While we walk the potentially-inlined instructions, we build up and
401 /// maintain a mapping of simplified values specific to this callsite. The
402 /// idea is to propagate any special information we have about arguments to
403 /// this call through the inlinable section of the function, and account for
404 /// likely simplifications post-inlining. The most important aspect we track
405 /// is CFG altering simplifications -- when we prove a basic block dead, that
406 /// can cause dramatic shifts in the cost of inlining a function.
407 /// Note: The simplified Value may be owned by the caller function.
408 DenseMap<Value *, Value *> SimplifiedValues;
409
410 /// Keep track of the values which map back (through function arguments) to
411 /// allocas on the caller stack which could be simplified through SROA.
412 DenseMap<Value *, AllocaInst *> SROAArgValues;
413
414 /// Keep track of Allocas for which we believe we may get SROA optimization.
415 DenseSet<AllocaInst *> EnabledSROAAllocas;
416
417 /// Keep track of values which map to a pointer base and constant offset.
418 DenseMap<Value *, std::pair<Value *, APInt>> ConstantOffsetPtrs;
419
420 /// Keep track of dead blocks due to the constant arguments.
421 SmallPtrSet<BasicBlock *, 16> DeadBlocks;
422
423 /// The mapping of the blocks to their known unique successors due to the
424 /// constant arguments.
425 DenseMap<BasicBlock *, BasicBlock *> KnownSuccessors;
426
427 /// Model the elimination of repeated loads that is expected to happen
428 /// whenever we simplify away the stores that would otherwise cause them to be
429 /// loads.
430 bool EnableLoadElimination = true;
431
432 /// Whether we allow inlining for recursive call.
433 bool AllowRecursiveCall = false;
434
435 SmallPtrSet<Value *, 16> LoadAddrSet;
436
437 AllocaInst *getSROAArgForValueOrNull(Value *V) const {
438 auto It = SROAArgValues.find(Val: V);
439 if (It == SROAArgValues.end() || EnabledSROAAllocas.count(V: It->second) == 0)
440 return nullptr;
441 return It->second;
442 }
443
444 /// Use a value in its given form directly if possible, otherwise try looking
445 /// for it in SimplifiedValues.
446 template <typename T> T *getDirectOrSimplifiedValue(Value *V) const {
447 if (auto *Direct = dyn_cast<T>(V))
448 return Direct;
449 return getSimplifiedValue<T>(V);
450 }
451
452 // Custom simplification helper routines.
453 bool isAllocaDerivedArg(Value *V);
454 void disableSROAForArg(AllocaInst *SROAArg);
455 void disableSROA(Value *V);
456 void findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB);
457 void disableLoadElimination();
458 bool isGEPFree(GetElementPtrInst &GEP);
459 bool canFoldInboundsGEP(GetElementPtrInst &I);
460 bool accumulateGEPOffset(GEPOperator &GEP, APInt &Offset);
461 bool simplifyCallSite(Function *F, CallBase &Call);
462 bool simplifyCmpInstForRecCall(CmpInst &Cmp);
463 bool simplifyInstruction(Instruction &I);
464 bool simplifyIntrinsicCallIsConstant(CallBase &CB);
465 bool simplifyIntrinsicCallObjectSize(CallBase &CB);
466 ConstantInt *stripAndComputeInBoundsConstantOffsets(Value *&V);
467 bool isLoweredToCall(Function *F, CallBase &Call);
468
469 /// Return true if the given argument to the function being considered for
470 /// inlining has the given attribute set either at the call site or the
471 /// function declaration. Primarily used to inspect call site specific
472 /// attributes since these can be more precise than the ones on the callee
473 /// itself.
474 bool paramHasAttr(Argument *A, Attribute::AttrKind Attr);
475
476 /// Return true if the given value is known non null within the callee if
477 /// inlined through this particular callsite.
478 bool isKnownNonNullInCallee(Value *V);
479
480 /// Return true if size growth is allowed when inlining the callee at \p Call.
481 bool allowSizeGrowth(CallBase &Call);
482
483 // Custom analysis routines.
484 InlineResult analyzeBlock(BasicBlock *BB,
485 const SmallPtrSetImpl<const Value *> &EphValues);
486
487 // Disable several entry points to the visitor so we don't accidentally use
488 // them by declaring but not defining them here.
489 void visit(Module *);
490 void visit(Module &);
491 void visit(Function *);
492 void visit(Function &);
493 void visit(BasicBlock *);
494 void visit(BasicBlock &);
495
496 // Provide base case for our instruction visit.
497 bool visitInstruction(Instruction &I);
498
499 // Our visit overrides.
500 bool visitAlloca(AllocaInst &I);
501 bool visitPHI(PHINode &I);
502 bool visitGetElementPtr(GetElementPtrInst &I);
503 bool visitBitCast(BitCastInst &I);
504 bool visitPtrToInt(PtrToIntInst &I);
505 bool visitIntToPtr(IntToPtrInst &I);
506 bool visitCastInst(CastInst &I);
507 bool visitCmpInst(CmpInst &I);
508 bool visitSub(BinaryOperator &I);
509 bool visitBinaryOperator(BinaryOperator &I);
510 bool visitFNeg(UnaryOperator &I);
511 bool visitLoad(LoadInst &I);
512 bool visitStore(StoreInst &I);
513 bool visitExtractValue(ExtractValueInst &I);
514 bool visitInsertValue(InsertValueInst &I);
515 bool visitCallBase(CallBase &Call);
516 bool visitReturnInst(ReturnInst &RI);
517 bool visitUncondBrInst(UncondBrInst &BI);
518 bool visitCondBrInst(CondBrInst &BI);
519 bool visitSelectInst(SelectInst &SI);
520 bool visitSwitchInst(SwitchInst &SI);
521 bool visitIndirectBrInst(IndirectBrInst &IBI);
522 bool visitResumeInst(ResumeInst &RI);
523 bool visitCleanupReturnInst(CleanupReturnInst &RI);
524 bool visitCatchReturnInst(CatchReturnInst &RI);
525 bool visitUnreachableInst(UnreachableInst &I);
526
527public:
528 CallAnalyzer(
529 Function &Callee, CallBase &Call, const TargetTransformInfo &TTI,
530 function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
531 function_ref<BlockFrequencyInfo &(Function &)> GetBFI = nullptr,
532 function_ref<const TargetLibraryInfo &(Function &)> GetTLI = nullptr,
533 ProfileSummaryInfo *PSI = nullptr,
534 OptimizationRemarkEmitter *ORE = nullptr,
535 function_ref<EphemeralValuesCache &(Function &)> GetEphValuesCache =
536 nullptr)
537 : TTI(TTI), GetAssumptionCache(GetAssumptionCache), GetBFI(GetBFI),
538 GetTLI(GetTLI), PSI(PSI), F(Callee), DL(F.getDataLayout()), ORE(ORE),
539 CandidateCall(Call), GetEphValuesCache(GetEphValuesCache) {}
540
541 InlineResult analyze();
542
543 /// Lookup simplified Value. May return a value owned by the caller.
544 Value *getSimplifiedValueUnchecked(Value *V) const {
545 return SimplifiedValues.lookup(Val: V);
546 }
547
548 /// Lookup simplified Value, but return nullptr if the simplified value is
549 /// owned by the caller.
550 template <typename T> T *getSimplifiedValue(Value *V) const {
551 Value *SimpleV = SimplifiedValues.lookup(Val: V);
552 if (!SimpleV)
553 return nullptr;
554
555 // Skip checks if we know T is a global. This has a small, but measurable
556 // impact on compile-time.
557 if constexpr (std::is_base_of_v<Constant, T>)
558 return dyn_cast<T>(SimpleV);
559
560 // Make sure the simplified Value is owned by this function
561 if (auto *I = dyn_cast<Instruction>(Val: SimpleV)) {
562 if (I->getFunction() != &F)
563 return nullptr;
564 } else if (auto *Arg = dyn_cast<Argument>(Val: SimpleV)) {
565 if (Arg->getParent() != &F)
566 return nullptr;
567 } else if (!isa<Constant>(Val: SimpleV))
568 return nullptr;
569 return dyn_cast<T>(SimpleV);
570 }
571
572 // Keep a bunch of stats about the cost savings found so we can print them
573 // out when debugging.
574 unsigned NumConstantArgs = 0;
575 unsigned NumConstantOffsetPtrArgs = 0;
576 unsigned NumAllocaArgs = 0;
577 unsigned NumConstantPtrCmps = 0;
578 unsigned NumConstantPtrDiffs = 0;
579 unsigned NumInstructionsSimplified = 0;
580
581 void dump();
582};
583
584// Considering forming a binary search, we should find the number of nodes
585// which is same as the number of comparisons when lowered. For a given
586// number of clusters, n, we can define a recursive function, f(n), to find
587// the number of nodes in the tree. The recursion is :
588// f(n) = 1 + f(n/2) + f (n - n/2), when n > 3,
589// and f(n) = n, when n <= 3.
590// This will lead a binary tree where the leaf should be either f(2) or f(3)
591// when n > 3. So, the number of comparisons from leaves should be n, while
592// the number of non-leaf should be :
593// 2^(log2(n) - 1) - 1
594// = 2^log2(n) * 2^-1 - 1
595// = n / 2 - 1.
596// Considering comparisons from leaf and non-leaf nodes, we can estimate the
597// number of comparisons in a simple closed form :
598// n + n / 2 - 1 = n * 3 / 2 - 1
599int64_t getExpectedNumberOfCompare(int NumCaseCluster) {
600 return 3 * static_cast<int64_t>(NumCaseCluster) / 2 - 1;
601}
602
603/// FIXME: if it is necessary to derive from InlineCostCallAnalyzer, note
604/// the FIXME in onLoweredCall, when instantiating an InlineCostCallAnalyzer
605class InlineCostCallAnalyzer final : public CallAnalyzer {
606 const bool ComputeFullInlineCost;
607 int LoadEliminationCost = 0;
608 /// Bonus to be applied when percentage of vector instructions in callee is
609 /// high (see more details in updateThreshold).
610 int VectorBonus = 0;
611 /// Bonus to be applied when the callee has only one reachable basic block.
612 int SingleBBBonus = 0;
613
614 /// Tunable parameters that control the analysis.
615 const InlineParams &Params;
616
617 // This DenseMap stores the delta change in cost and threshold after
618 // accounting for the given instruction. The map is filled only with the
619 // flag PrintInstructionComments on.
620 DenseMap<const Instruction *, InstructionCostDetail> InstructionCostDetailMap;
621
622 /// Upper bound for the inlining cost. Bonuses are being applied to account
623 /// for speculative "expected profit" of the inlining decision.
624 int Threshold = 0;
625
626 /// The amount of StaticBonus applied.
627 int StaticBonusApplied = 0;
628
629 /// Attempt to evaluate indirect calls to boost its inline cost.
630 const bool BoostIndirectCalls;
631
632 /// Ignore the threshold when finalizing analysis.
633 const bool IgnoreThreshold;
634
635 // True if the cost-benefit-analysis-based inliner is enabled.
636 const bool CostBenefitAnalysisEnabled;
637
638 /// Inlining cost measured in abstract units, accounts for all the
639 /// instructions expected to be executed for a given function invocation.
640 /// Instructions that are statically proven to be dead based on call-site
641 /// arguments are not counted here.
642 int Cost = 0;
643
644 // The cumulative cost at the beginning of the basic block being analyzed. At
645 // the end of analyzing each basic block, "Cost - CostAtBBStart" represents
646 // the size of that basic block.
647 int CostAtBBStart = 0;
648
649 // The static size of live but cold basic blocks. This is "static" in the
650 // sense that it's not weighted by profile counts at all.
651 int ColdSize = 0;
652
653 // Whether inlining is decided by cost-threshold analysis.
654 bool DecidedByCostThreshold = false;
655
656 // Whether inlining is decided by cost-benefit analysis.
657 bool DecidedByCostBenefit = false;
658
659 // The cost-benefit pair computed by cost-benefit analysis.
660 std::optional<CostBenefitPair> CostBenefit;
661
662 bool SingleBB = true;
663
664 unsigned SROACostSavings = 0;
665 unsigned SROACostSavingsLost = 0;
666
667 /// The mapping of caller Alloca values to their accumulated cost savings. If
668 /// we have to disable SROA for one of the allocas, this tells us how much
669 /// cost must be added.
670 DenseMap<AllocaInst *, int> SROAArgCosts;
671
672 /// Return true if \p Call is a cold callsite.
673 bool isColdCallSite(CallBase &Call, BlockFrequencyInfo *CallerBFI);
674
675 /// Update Threshold based on callsite properties such as callee
676 /// attributes and callee hotness for PGO builds. The Callee is explicitly
677 /// passed to support analyzing indirect calls whose target is inferred by
678 /// analysis.
679 void updateThreshold(CallBase &Call, Function &Callee);
680 /// Return a higher threshold if \p Call is a hot callsite.
681 std::optional<int> getHotCallSiteThreshold(CallBase &Call,
682 BlockFrequencyInfo *CallerBFI);
683
684 /// Handle a capped 'int' increment for Cost.
685 void addCost(int64_t Inc) {
686 Inc = std::clamp<int64_t>(val: Inc, INT_MIN, INT_MAX);
687 Cost = std::clamp<int64_t>(val: Inc + Cost, INT_MIN, INT_MAX);
688 }
689
690 void onDisableSROA(AllocaInst *Arg) override {
691 auto CostIt = SROAArgCosts.find(Val: Arg);
692 if (CostIt == SROAArgCosts.end())
693 return;
694 addCost(Inc: CostIt->second);
695 SROACostSavings -= CostIt->second;
696 SROACostSavingsLost += CostIt->second;
697 SROAArgCosts.erase(I: CostIt);
698 }
699
700 void onDisableLoadElimination() override {
701 addCost(Inc: LoadEliminationCost);
702 LoadEliminationCost = 0;
703 }
704
705 bool onCallBaseVisitStart(CallBase &Call) override {
706 if (std::optional<int> AttrCallThresholdBonus =
707 getStringFnAttrAsInt(CB&: Call, AttrKind: "call-threshold-bonus"))
708 Threshold += *AttrCallThresholdBonus;
709
710 if (std::optional<int> AttrCallCost =
711 getStringFnAttrAsInt(CB&: Call, AttrKind: "call-inline-cost")) {
712 addCost(Inc: *AttrCallCost);
713 // Prevent further processing of the call since we want to override its
714 // inline cost, not just add to it.
715 return false;
716 }
717 return true;
718 }
719
720 void onCallPenalty() override { addCost(Inc: CallPenalty); }
721
722 void onMemAccess() override { addCost(Inc: MemAccessCost); }
723
724 void onCallArgumentSetup(const CallBase &Call) override {
725 // Pay the price of the argument setup. We account for the average 1
726 // instruction per call argument setup here.
727 addCost(Inc: Call.arg_size() * InstrCost);
728 }
729 void onLoadRelativeIntrinsic() override {
730 // This is normally lowered to 4 LLVM instructions.
731 addCost(Inc: 3 * InstrCost);
732 }
733 void onLoweredCall(Function *F, CallBase &Call,
734 bool IsIndirectCall) override {
735 // We account for the average 1 instruction per call argument setup here.
736 addCost(Inc: Call.arg_size() * InstrCost);
737
738 // If we have a constant that we are calling as a function, we can peer
739 // through it and see the function target. This happens not infrequently
740 // during devirtualization and so we want to give it a hefty bonus for
741 // inlining, but cap that bonus in the event that inlining wouldn't pan out.
742 // Pretend to inline the function, with a custom threshold.
743 if (IsIndirectCall && BoostIndirectCalls) {
744 auto IndirectCallParams = Params;
745 IndirectCallParams.DefaultThreshold =
746 InlineConstants::IndirectCallThreshold;
747 /// FIXME: if InlineCostCallAnalyzer is derived from, this may need
748 /// to instantiate the derived class.
749 InlineCostCallAnalyzer CA(*F, Call, IndirectCallParams, TTI,
750 GetAssumptionCache, GetBFI, GetTLI, PSI, ORE,
751 false);
752 if (CA.analyze().isSuccess()) {
753 // We were able to inline the indirect call! Subtract the cost from the
754 // threshold to get the bonus we want to apply, but don't go below zero.
755 addCost(Inc: -std::max(a: 0, b: CA.getThreshold() - CA.getCost()));
756 }
757 } else
758 // Otherwise simply add the cost for merely making the call.
759 addCost(Inc: TTI.getInlineCallPenalty(F: CandidateCall.getCaller(), Call,
760 DefaultCallPenalty: CallPenalty));
761 }
762
763 void onFinalizeSwitch(unsigned JumpTableSize, unsigned NumCaseCluster,
764 bool DefaultDestUnreachable) override {
765 // If suitable for a jump table, consider the cost for the table size and
766 // branch to destination.
767 // Maximum valid cost increased in this function.
768 if (JumpTableSize) {
769 // Suppose a default branch includes one compare and one conditional
770 // branch if it's reachable.
771 if (!DefaultDestUnreachable)
772 addCost(Inc: 2 * InstrCost);
773 // Suppose a jump table requires one load and one jump instruction.
774 int64_t JTCost =
775 static_cast<int64_t>(JumpTableSize) * InstrCost + 2 * InstrCost;
776 addCost(Inc: JTCost);
777 return;
778 }
779
780 if (NumCaseCluster <= 3) {
781 // Suppose a comparison includes one compare and one conditional branch.
782 // We can reduce a set of instructions if the default branch is
783 // undefined.
784 addCost(Inc: (NumCaseCluster - DefaultDestUnreachable) * 2 * InstrCost);
785 return;
786 }
787
788 int64_t ExpectedNumberOfCompare =
789 getExpectedNumberOfCompare(NumCaseCluster);
790 int64_t SwitchCost = ExpectedNumberOfCompare * 2 * InstrCost;
791
792 addCost(Inc: SwitchCost);
793 }
794
795 // Parses the inline assembly argument to account for its cost. Inline
796 // assembly instructions incur higher costs for inlining since they cannot be
797 // analyzed and optimized.
798 void onInlineAsm(const InlineAsm &Arg) override {
799 if (!InlineAsmInstrCost)
800 return;
801 SmallVector<StringRef, 4> AsmStrs;
802 Arg.collectAsmStrs(AsmStrs);
803 int SectionLevel = 0;
804 int InlineAsmInstrCount = 0;
805 for (StringRef AsmStr : AsmStrs) {
806 // Trim whitespaces and comments.
807 StringRef Trimmed = AsmStr.trim();
808 size_t hashPos = Trimmed.find(C: '#');
809 if (hashPos != StringRef::npos)
810 Trimmed = Trimmed.substr(Start: 0, N: hashPos);
811 // Ignore comments.
812 if (Trimmed.empty())
813 continue;
814 // Filter out the outlined assembly instructions from the cost by keeping
815 // track of the section level and only accounting for instrutions at
816 // section level of zero. Note there will be duplication in outlined
817 // sections too, but is not accounted in the inlining cost model.
818 if (Trimmed.starts_with(Prefix: ".pushsection")) {
819 ++SectionLevel;
820 continue;
821 }
822 if (Trimmed.starts_with(Prefix: ".popsection")) {
823 --SectionLevel;
824 continue;
825 }
826 // Ignore directives and labels.
827 if (Trimmed.starts_with(Prefix: ".") || Trimmed.contains(Other: ":"))
828 continue;
829 if (SectionLevel == 0)
830 ++InlineAsmInstrCount;
831 }
832 NumInlineAsmInstructions += InlineAsmInstrCount;
833 addCost(Inc: InlineAsmInstrCount * InlineAsmInstrCost);
834 }
835
836 void onMissedSimplification() override { addCost(Inc: InstrCost); }
837
838 void onInitializeSROAArg(AllocaInst *Arg) override {
839 assert(Arg != nullptr &&
840 "Should not initialize SROA costs for null value.");
841 auto SROAArgCost = TTI.getCallerAllocaCost(CB: &CandidateCall, AI: Arg);
842 SROACostSavings += SROAArgCost;
843 SROAArgCosts[Arg] = SROAArgCost;
844 }
845
846 void onAggregateSROAUse(AllocaInst *SROAArg) override {
847 auto CostIt = SROAArgCosts.find(Val: SROAArg);
848 assert(CostIt != SROAArgCosts.end() &&
849 "expected this argument to have a cost");
850 CostIt->second += InstrCost;
851 SROACostSavings += InstrCost;
852 }
853
854 void onBlockStart(const BasicBlock *BB) override { CostAtBBStart = Cost; }
855
856 void onBlockAnalyzed(const BasicBlock *BB) override {
857 if (CostBenefitAnalysisEnabled) {
858 // Keep track of the static size of live but cold basic blocks. For now,
859 // we define a cold basic block to be one that's never executed.
860 assert(GetBFI && "GetBFI must be available");
861 BlockFrequencyInfo *BFI = &(GetBFI(F));
862 assert(BFI && "BFI must be available");
863 auto ProfileCount = BFI->getBlockProfileCount(BB);
864 if (*ProfileCount == 0)
865 ColdSize += Cost - CostAtBBStart;
866 }
867
868 auto *TI = BB->getTerminator();
869 // If we had any successors at this point, than post-inlining is likely to
870 // have them as well. Note that we assume any basic blocks which existed
871 // due to branches or switches which folded above will also fold after
872 // inlining.
873 if (SingleBB && TI->getNumSuccessors() > 1) {
874 // Take off the bonus we applied to the threshold.
875 Threshold -= SingleBBBonus;
876 SingleBB = false;
877 }
878 }
879
880 void onInstructionAnalysisStart(const Instruction *I) override {
881 // This function is called to store the initial cost of inlining before
882 // the given instruction was assessed.
883 if (!PrintInstructionComments)
884 return;
885 auto &CostDetail = InstructionCostDetailMap[I];
886 CostDetail.CostBefore = Cost;
887 CostDetail.ThresholdBefore = Threshold;
888 }
889
890 void onInstructionAnalysisFinish(const Instruction *I) override {
891 // This function is called to find new values of cost and threshold after
892 // the instruction has been assessed.
893 if (!PrintInstructionComments)
894 return;
895 auto &CostDetail = InstructionCostDetailMap[I];
896 CostDetail.CostAfter = Cost;
897 CostDetail.ThresholdAfter = Threshold;
898 }
899
900 bool isCostBenefitAnalysisEnabled() {
901 if (!PSI || !PSI->hasProfileSummary())
902 return false;
903
904 if (!GetBFI)
905 return false;
906
907 if (InlineEnableCostBenefitAnalysis.getNumOccurrences()) {
908 // Honor the explicit request from the user.
909 if (!InlineEnableCostBenefitAnalysis)
910 return false;
911 } else {
912 // Otherwise, require instrumentation profile.
913 if (!PSI->hasInstrumentationProfile())
914 return false;
915 }
916
917 auto *Caller = CandidateCall.getParent()->getParent();
918 if (!Caller->getEntryCount())
919 return false;
920
921 BlockFrequencyInfo *CallerBFI = &(GetBFI(*Caller));
922 if (!CallerBFI)
923 return false;
924
925 // For now, limit to hot call site.
926 if (!PSI->isHotCallSite(CB: CandidateCall, BFI: CallerBFI))
927 return false;
928
929 // Make sure we have a nonzero entry count.
930 auto EntryCount = F.getEntryCount();
931 if (!EntryCount || !EntryCount->getCount())
932 return false;
933
934 BlockFrequencyInfo *CalleeBFI = &(GetBFI(F));
935 if (!CalleeBFI)
936 return false;
937
938 return true;
939 }
940
941 // A helper function to choose between command line override and default.
942 unsigned getInliningCostBenefitAnalysisSavingsMultiplier() const {
943 if (InlineSavingsMultiplier.getNumOccurrences())
944 return InlineSavingsMultiplier;
945 return TTI.getInliningCostBenefitAnalysisSavingsMultiplier();
946 }
947
948 // A helper function to choose between command line override and default.
949 unsigned getInliningCostBenefitAnalysisProfitableMultiplier() const {
950 if (InlineSavingsProfitableMultiplier.getNumOccurrences())
951 return InlineSavingsProfitableMultiplier;
952 return TTI.getInliningCostBenefitAnalysisProfitableMultiplier();
953 }
954
955 void OverrideCycleSavingsAndSizeForTesting(APInt &CycleSavings, int &Size) {
956 if (std::optional<int> AttrCycleSavings = getStringFnAttrAsInt(
957 CB&: CandidateCall, AttrKind: "inline-cycle-savings-for-test")) {
958 CycleSavings = *AttrCycleSavings;
959 }
960
961 if (std::optional<int> AttrRuntimeCost = getStringFnAttrAsInt(
962 CB&: CandidateCall, AttrKind: "inline-runtime-cost-for-test")) {
963 Size = *AttrRuntimeCost;
964 }
965 }
966
967 // Determine whether we should inline the given call site, taking into account
968 // both the size cost and the cycle savings. Return std::nullopt if we don't
969 // have sufficient profiling information to determine.
970 std::optional<bool> costBenefitAnalysis() {
971 if (!CostBenefitAnalysisEnabled)
972 return std::nullopt;
973
974 // buildInlinerPipeline in the pass builder sets HotCallSiteThreshold to 0
975 // for the prelink phase of the AutoFDO + ThinLTO build. Honor the logic by
976 // falling back to the cost-based metric.
977 // TODO: Improve this hacky condition.
978 if (Threshold == 0)
979 return std::nullopt;
980
981 assert(GetBFI);
982 BlockFrequencyInfo *CalleeBFI = &(GetBFI(F));
983 assert(CalleeBFI);
984
985 // The cycle savings expressed as the sum of InstrCost
986 // multiplied by the estimated dynamic count of each instruction we can
987 // avoid. Savings come from the call site cost, such as argument setup and
988 // the call instruction, as well as the instructions that are folded.
989 //
990 // We use 128-bit APInt here to avoid potential overflow. This variable
991 // should stay well below 10^^24 (or 2^^80) in practice. This "worst" case
992 // assumes that we can avoid or fold a billion instructions, each with a
993 // profile count of 10^^15 -- roughly the number of cycles for a 24-hour
994 // period on a 4GHz machine.
995 APInt CycleSavings(128, 0);
996
997 for (auto &BB : F) {
998 APInt CurrentSavings(128, 0);
999 for (auto &I : BB) {
1000 if (CondBrInst *BI = dyn_cast<CondBrInst>(Val: &I)) {
1001 // Count a conditional branch as savings if it becomes unconditional.
1002 if (getSimplifiedValue<ConstantInt>(V: BI->getCondition()))
1003 CurrentSavings += InstrCost;
1004 } else if (SwitchInst *SI = dyn_cast<SwitchInst>(Val: &I)) {
1005 if (getSimplifiedValue<ConstantInt>(V: SI->getCondition()))
1006 CurrentSavings += InstrCost;
1007 } else if (Value *V = dyn_cast<Value>(Val: &I)) {
1008 // Count an instruction as savings if we can fold it.
1009 if (SimplifiedValues.count(Val: V)) {
1010 CurrentSavings += InstrCost;
1011 }
1012 }
1013 }
1014
1015 auto ProfileCount = CalleeBFI->getBlockProfileCount(BB: &BB);
1016 CurrentSavings *= *ProfileCount;
1017 CycleSavings += CurrentSavings;
1018 }
1019
1020 // Compute the cycle savings per call.
1021 auto EntryProfileCount = F.getEntryCount();
1022 assert(EntryProfileCount && EntryProfileCount->getCount());
1023 auto EntryCount = EntryProfileCount->getCount();
1024 CycleSavings += EntryCount / 2;
1025 CycleSavings = CycleSavings.udiv(RHS: EntryCount);
1026
1027 // Compute the total savings for the call site.
1028 auto *CallerBB = CandidateCall.getParent();
1029 BlockFrequencyInfo *CallerBFI = &(GetBFI(*(CallerBB->getParent())));
1030 CycleSavings += getCallsiteCost(TTI, Call: this->CandidateCall, DL);
1031 CycleSavings *= *CallerBFI->getBlockProfileCount(BB: CallerBB);
1032
1033 // Remove the cost of the cold basic blocks to model the runtime cost more
1034 // accurately. Both machine block placement and function splitting could
1035 // place cold blocks further from hot blocks.
1036 int Size = Cost - ColdSize;
1037
1038 // Allow tiny callees to be inlined regardless of whether they meet the
1039 // savings threshold.
1040 Size = Size > InlineSizeAllowance ? Size - InlineSizeAllowance : 1;
1041
1042 OverrideCycleSavingsAndSizeForTesting(CycleSavings, Size);
1043 CostBenefit.emplace(args: APInt(128, Size), args&: CycleSavings);
1044
1045 // Let R be the ratio of CycleSavings to Size. We accept the inlining
1046 // opportunity if R is really high and reject if R is really low. If R is
1047 // somewhere in the middle, we fall back to the cost-based analysis.
1048 //
1049 // Specifically, let R = CycleSavings / Size, we accept the inlining
1050 // opportunity if:
1051 //
1052 // PSI->getOrCompHotCountThreshold()
1053 // R > -------------------------------------------------
1054 // getInliningCostBenefitAnalysisSavingsMultiplier()
1055 //
1056 // and reject the inlining opportunity if:
1057 //
1058 // PSI->getOrCompHotCountThreshold()
1059 // R <= ----------------------------------------------------
1060 // getInliningCostBenefitAnalysisProfitableMultiplier()
1061 //
1062 // Otherwise, we fall back to the cost-based analysis.
1063 //
1064 // Implementation-wise, use multiplication (CycleSavings * Multiplier,
1065 // HotCountThreshold * Size) rather than division to avoid precision loss.
1066 APInt Threshold(128, PSI->getOrCompHotCountThreshold());
1067 Threshold *= Size;
1068
1069 APInt UpperBoundCycleSavings = CycleSavings;
1070 UpperBoundCycleSavings *= getInliningCostBenefitAnalysisSavingsMultiplier();
1071 if (UpperBoundCycleSavings.uge(RHS: Threshold))
1072 return true;
1073
1074 APInt LowerBoundCycleSavings = CycleSavings;
1075 LowerBoundCycleSavings *=
1076 getInliningCostBenefitAnalysisProfitableMultiplier();
1077 if (LowerBoundCycleSavings.ult(RHS: Threshold))
1078 return false;
1079
1080 // Otherwise, fall back to the cost-based analysis.
1081 return std::nullopt;
1082 }
1083
1084 InlineResult finalizeAnalysis() override {
1085 // Loops generally act a lot like calls in that they act like barriers to
1086 // movement, require a certain amount of setup, etc. So when optimising for
1087 // size, we penalise any call sites that perform loops. We do this after all
1088 // other costs here, so will likely only be dealing with relatively small
1089 // functions (and hence DT and LI will hopefully be cheap).
1090 auto *Caller = CandidateCall.getFunction();
1091 if (Caller->hasMinSize()) {
1092 DominatorTree DT(F);
1093 LoopInfo LI(DT);
1094 int NumLoops = 0;
1095 for (Loop *L : LI) {
1096 // Ignore loops that will not be executed
1097 if (DeadBlocks.count(Ptr: L->getHeader()))
1098 continue;
1099 NumLoops++;
1100 }
1101 addCost(Inc: NumLoops * InlineConstants::LoopPenalty);
1102 }
1103
1104 // We applied the maximum possible vector bonus at the beginning. Now,
1105 // subtract the excess bonus, if any, from the Threshold before
1106 // comparing against Cost.
1107 if (NumVectorInstructions <= NumInstructions / 10)
1108 Threshold -= VectorBonus;
1109 else if (NumVectorInstructions <= NumInstructions / 2)
1110 Threshold -= VectorBonus / 2;
1111
1112 if (std::optional<int> AttrCost =
1113 getStringFnAttrAsInt(CB&: CandidateCall, AttrKind: "function-inline-cost"))
1114 Cost = *AttrCost;
1115
1116 if (std::optional<int> AttrCostMult = getStringFnAttrAsInt(
1117 CB&: CandidateCall,
1118 AttrKind: InlineConstants::FunctionInlineCostMultiplierAttributeName))
1119 Cost *= *AttrCostMult;
1120
1121 if (std::optional<int> AttrThreshold =
1122 getStringFnAttrAsInt(CB&: CandidateCall, AttrKind: "function-inline-threshold"))
1123 Threshold = *AttrThreshold;
1124
1125 if (auto Result = costBenefitAnalysis()) {
1126 DecidedByCostBenefit = true;
1127 if (*Result)
1128 return InlineResult::success();
1129 else
1130 return InlineResult::failure(Reason: "Cost over threshold.");
1131 }
1132
1133 if (IgnoreThreshold)
1134 return InlineResult::success();
1135
1136 DecidedByCostThreshold = true;
1137 return Cost < std::max(a: 1, b: Threshold)
1138 ? InlineResult::success()
1139 : InlineResult::failure(Reason: "Cost over threshold.");
1140 }
1141
1142 bool shouldStop() override {
1143 if (IgnoreThreshold || ComputeFullInlineCost)
1144 return false;
1145 // Bail out the moment we cross the threshold. This means we'll under-count
1146 // the cost, but only when undercounting doesn't matter.
1147 if (Cost < Threshold)
1148 return false;
1149 DecidedByCostThreshold = true;
1150 return true;
1151 }
1152
1153 void onLoadEliminationOpportunity() override {
1154 LoadEliminationCost += InstrCost;
1155 }
1156
1157 InlineResult onAnalysisStart() override {
1158 // Perform some tweaks to the cost and threshold based on the direct
1159 // callsite information.
1160
1161 // We want to more aggressively inline vector-dense kernels, so up the
1162 // threshold, and we'll lower it if the % of vector instructions gets too
1163 // low. Note that these bonuses are some what arbitrary and evolved over
1164 // time by accident as much as because they are principled bonuses.
1165 //
1166 // FIXME: It would be nice to remove all such bonuses. At least it would be
1167 // nice to base the bonus values on something more scientific.
1168 assert(NumInstructions == 0);
1169 assert(NumVectorInstructions == 0);
1170
1171 // Update the threshold based on callsite properties
1172 updateThreshold(Call&: CandidateCall, Callee&: F);
1173
1174 // While Threshold depends on commandline options that can take negative
1175 // values, we want to enforce the invariant that the computed threshold and
1176 // bonuses are non-negative.
1177 assert(Threshold >= 0);
1178 assert(SingleBBBonus >= 0);
1179 assert(VectorBonus >= 0);
1180
1181 // Speculatively apply all possible bonuses to Threshold. If cost exceeds
1182 // this Threshold any time, and cost cannot decrease, we can stop processing
1183 // the rest of the function body.
1184 Threshold += (SingleBBBonus + VectorBonus);
1185
1186 // Give out bonuses for the callsite, as the instructions setting them up
1187 // will be gone after inlining.
1188 addCost(Inc: -getCallsiteCost(TTI, Call: this->CandidateCall, DL));
1189
1190 // If this function uses the coldcc calling convention, prefer not to inline
1191 // it.
1192 if (F.getCallingConv() == CallingConv::Cold)
1193 addCost(Inc: InlineConstants::ColdccPenalty);
1194
1195 LLVM_DEBUG(dbgs() << " Initial cost: " << Cost << "\n");
1196
1197 // Check if we're done. This can happen due to bonuses and penalties.
1198 if (Cost >= Threshold && !ComputeFullInlineCost)
1199 return InlineResult::failure(Reason: "high cost");
1200
1201 return InlineResult::success();
1202 }
1203
1204public:
1205 InlineCostCallAnalyzer(
1206 Function &Callee, CallBase &Call, const InlineParams &Params,
1207 const TargetTransformInfo &TTI,
1208 function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
1209 function_ref<BlockFrequencyInfo &(Function &)> GetBFI = nullptr,
1210 function_ref<const TargetLibraryInfo &(Function &)> GetTLI = nullptr,
1211 ProfileSummaryInfo *PSI = nullptr,
1212 OptimizationRemarkEmitter *ORE = nullptr, bool BoostIndirect = true,
1213 bool IgnoreThreshold = false,
1214 function_ref<EphemeralValuesCache &(Function &)> GetEphValuesCache =
1215 nullptr)
1216 : CallAnalyzer(Callee, Call, TTI, GetAssumptionCache, GetBFI, GetTLI, PSI,
1217 ORE, GetEphValuesCache),
1218 ComputeFullInlineCost(OptComputeFullInlineCost ||
1219 Params.ComputeFullInlineCost || ORE ||
1220 isCostBenefitAnalysisEnabled()),
1221 Params(Params), Threshold(Params.DefaultThreshold),
1222 BoostIndirectCalls(BoostIndirect), IgnoreThreshold(IgnoreThreshold),
1223 CostBenefitAnalysisEnabled(isCostBenefitAnalysisEnabled()),
1224 Writer(this) {
1225 AllowRecursiveCall = *Params.AllowRecursiveCall;
1226 }
1227
1228 /// Annotation Writer for instruction details
1229 InlineCostAnnotationWriter Writer;
1230
1231 void dump();
1232
1233 // Prints the same analysis as dump(), but its definition is not dependent
1234 // on the build.
1235 void print(raw_ostream &OS);
1236
1237 std::optional<InstructionCostDetail> getCostDetails(const Instruction *I) {
1238 auto It = InstructionCostDetailMap.find(Val: I);
1239 if (It != InstructionCostDetailMap.end())
1240 return It->second;
1241 return std::nullopt;
1242 }
1243
1244 ~InlineCostCallAnalyzer() override = default;
1245 int getThreshold() const { return Threshold; }
1246 int getCost() const { return Cost; }
1247 int getStaticBonusApplied() const { return StaticBonusApplied; }
1248 std::optional<CostBenefitPair> getCostBenefitPair() { return CostBenefit; }
1249 bool wasDecidedByCostBenefit() const { return DecidedByCostBenefit; }
1250 bool wasDecidedByCostThreshold() const { return DecidedByCostThreshold; }
1251};
1252
1253// Return true if CB is the sole call to local function Callee.
1254static bool isSoleCallToLocalFunction(const CallBase &CB,
1255 const Function &Callee) {
1256 return Callee.hasLocalLinkage() && Callee.hasOneLiveUse() &&
1257 &Callee == CB.getCalledFunction();
1258}
1259
1260class InlineCostFeaturesAnalyzer final : public CallAnalyzer {
1261private:
1262 InlineCostFeatures Cost = {};
1263
1264 // FIXME: These constants are taken from the heuristic-based cost visitor.
1265 // These should be removed entirely in a later revision to avoid reliance on
1266 // heuristics in the ML inliner.
1267 static constexpr int JTCostMultiplier = 2;
1268 static constexpr int CaseClusterCostMultiplier = 2;
1269 static constexpr int SwitchDefaultDestCostMultiplier = 2;
1270 static constexpr int SwitchCostMultiplier = 2;
1271
1272 // FIXME: These are taken from the heuristic-based cost visitor: we should
1273 // eventually abstract these to the CallAnalyzer to avoid duplication.
1274 unsigned SROACostSavingOpportunities = 0;
1275 int VectorBonus = 0;
1276 int SingleBBBonus = 0;
1277 int Threshold = 5;
1278
1279 DenseMap<AllocaInst *, unsigned> SROACosts;
1280
1281 void increment(InlineCostFeatureIndex Feature, int64_t Delta = 1) {
1282 Cost[static_cast<size_t>(Feature)] += Delta;
1283 }
1284
1285 void set(InlineCostFeatureIndex Feature, int64_t Value) {
1286 Cost[static_cast<size_t>(Feature)] = Value;
1287 }
1288
1289 void onDisableSROA(AllocaInst *Arg) override {
1290 auto CostIt = SROACosts.find(Val: Arg);
1291 if (CostIt == SROACosts.end())
1292 return;
1293
1294 increment(Feature: InlineCostFeatureIndex::sroa_losses, Delta: CostIt->second);
1295 SROACostSavingOpportunities -= CostIt->second;
1296 SROACosts.erase(I: CostIt);
1297 }
1298
1299 void onDisableLoadElimination() override {
1300 set(Feature: InlineCostFeatureIndex::load_elimination, Value: 1);
1301 }
1302
1303 void onCallPenalty() override {
1304 increment(Feature: InlineCostFeatureIndex::call_penalty, Delta: CallPenalty);
1305 }
1306
1307 void onCallArgumentSetup(const CallBase &Call) override {
1308 increment(Feature: InlineCostFeatureIndex::call_argument_setup,
1309 Delta: Call.arg_size() * InstrCost);
1310 }
1311
1312 void onLoadRelativeIntrinsic() override {
1313 increment(Feature: InlineCostFeatureIndex::load_relative_intrinsic, Delta: 3 * InstrCost);
1314 }
1315
1316 void onLoweredCall(Function *F, CallBase &Call,
1317 bool IsIndirectCall) override {
1318 increment(Feature: InlineCostFeatureIndex::lowered_call_arg_setup,
1319 Delta: Call.arg_size() * InstrCost);
1320
1321 if (IsIndirectCall) {
1322 InlineParams IndirectCallParams = {/* DefaultThreshold*/ 0,
1323 /*HintThreshold*/ {},
1324 /*ColdThreshold*/ {},
1325 /*OptSizeThreshold*/ {},
1326 /*OptMinSizeThreshold*/ {},
1327 /*HotCallSiteThreshold*/ {},
1328 /*LocallyHotCallSiteThreshold*/ {},
1329 /*ColdCallSiteThreshold*/ {},
1330 /*ComputeFullInlineCost*/ true,
1331 /*EnableDeferral*/ true};
1332 IndirectCallParams.DefaultThreshold =
1333 InlineConstants::IndirectCallThreshold;
1334
1335 InlineCostCallAnalyzer CA(*F, Call, IndirectCallParams, TTI,
1336 GetAssumptionCache, GetBFI, GetTLI, PSI, ORE,
1337 false, true);
1338 if (CA.analyze().isSuccess()) {
1339 increment(Feature: InlineCostFeatureIndex::nested_inline_cost_estimate,
1340 Delta: CA.getCost());
1341 increment(Feature: InlineCostFeatureIndex::nested_inlines, Delta: 1);
1342 }
1343 } else {
1344 onCallPenalty();
1345 }
1346 }
1347
1348 void onFinalizeSwitch(unsigned JumpTableSize, unsigned NumCaseCluster,
1349 bool DefaultDestUnreachable) override {
1350 if (JumpTableSize) {
1351 if (!DefaultDestUnreachable)
1352 increment(Feature: InlineCostFeatureIndex::switch_default_dest_penalty,
1353 Delta: SwitchDefaultDestCostMultiplier * InstrCost);
1354 int64_t JTCost = static_cast<int64_t>(JumpTableSize) * InstrCost +
1355 JTCostMultiplier * InstrCost;
1356 increment(Feature: InlineCostFeatureIndex::jump_table_penalty, Delta: JTCost);
1357 return;
1358 }
1359
1360 if (NumCaseCluster <= 3) {
1361 increment(Feature: InlineCostFeatureIndex::case_cluster_penalty,
1362 Delta: (NumCaseCluster - DefaultDestUnreachable) *
1363 CaseClusterCostMultiplier * InstrCost);
1364 return;
1365 }
1366
1367 int64_t ExpectedNumberOfCompare =
1368 getExpectedNumberOfCompare(NumCaseCluster);
1369
1370 int64_t SwitchCost =
1371 ExpectedNumberOfCompare * SwitchCostMultiplier * InstrCost;
1372 increment(Feature: InlineCostFeatureIndex::switch_penalty, Delta: SwitchCost);
1373 }
1374
1375 void onMissedSimplification() override {
1376 increment(Feature: InlineCostFeatureIndex::unsimplified_common_instructions,
1377 Delta: InstrCost);
1378 }
1379
1380 void onInitializeSROAArg(AllocaInst *Arg) override {
1381 auto SROAArgCost = TTI.getCallerAllocaCost(CB: &CandidateCall, AI: Arg);
1382 SROACosts[Arg] = SROAArgCost;
1383 SROACostSavingOpportunities += SROAArgCost;
1384 }
1385
1386 void onAggregateSROAUse(AllocaInst *Arg) override {
1387 SROACosts.find(Val: Arg)->second += InstrCost;
1388 SROACostSavingOpportunities += InstrCost;
1389 }
1390
1391 void onBlockAnalyzed(const BasicBlock *BB) override {
1392 if (BB->getTerminator()->getNumSuccessors() > 1)
1393 set(Feature: InlineCostFeatureIndex::is_multiple_blocks, Value: 1);
1394 Threshold -= SingleBBBonus;
1395 }
1396
1397 InlineResult finalizeAnalysis() override {
1398 auto *Caller = CandidateCall.getFunction();
1399 if (Caller->hasMinSize()) {
1400 DominatorTree DT(F);
1401 LoopInfo LI(DT);
1402 for (Loop *L : LI) {
1403 // Ignore loops that will not be executed
1404 if (DeadBlocks.count(Ptr: L->getHeader()))
1405 continue;
1406 increment(Feature: InlineCostFeatureIndex::num_loops,
1407 Delta: InlineConstants::LoopPenalty);
1408 }
1409 }
1410 set(Feature: InlineCostFeatureIndex::dead_blocks, Value: DeadBlocks.size());
1411 set(Feature: InlineCostFeatureIndex::simplified_instructions,
1412 Value: NumInstructionsSimplified);
1413 set(Feature: InlineCostFeatureIndex::constant_args, Value: NumConstantArgs);
1414 set(Feature: InlineCostFeatureIndex::constant_offset_ptr_args,
1415 Value: NumConstantOffsetPtrArgs);
1416 set(Feature: InlineCostFeatureIndex::sroa_savings, Value: SROACostSavingOpportunities);
1417
1418 if (NumVectorInstructions <= NumInstructions / 10)
1419 Threshold -= VectorBonus;
1420 else if (NumVectorInstructions <= NumInstructions / 2)
1421 Threshold -= VectorBonus / 2;
1422
1423 set(Feature: InlineCostFeatureIndex::threshold, Value: Threshold);
1424
1425 return InlineResult::success();
1426 }
1427
1428 bool shouldStop() override { return false; }
1429
1430 void onLoadEliminationOpportunity() override {
1431 increment(Feature: InlineCostFeatureIndex::load_elimination, Delta: 1);
1432 }
1433
1434 InlineResult onAnalysisStart() override {
1435 increment(Feature: InlineCostFeatureIndex::callsite_cost,
1436 Delta: -1 * getCallsiteCost(TTI, Call: this->CandidateCall, DL));
1437
1438 set(Feature: InlineCostFeatureIndex::cold_cc_penalty,
1439 Value: (F.getCallingConv() == CallingConv::Cold));
1440
1441 set(Feature: InlineCostFeatureIndex::last_call_to_static_bonus,
1442 Value: isSoleCallToLocalFunction(CB: CandidateCall, Callee: F));
1443
1444 // FIXME: we shouldn't repeat this logic in both the Features and Cost
1445 // analyzer - instead, we should abstract it to a common method in the
1446 // CallAnalyzer
1447 int SingleBBBonusPercent = 50;
1448 int VectorBonusPercent = TTI.getInlinerVectorBonusPercent();
1449 Threshold += TTI.adjustInliningThreshold(CB: &CandidateCall);
1450 Threshold *= TTI.getInliningThresholdMultiplier();
1451 SingleBBBonus = Threshold * SingleBBBonusPercent / 100;
1452 VectorBonus = Threshold * VectorBonusPercent / 100;
1453 Threshold += (SingleBBBonus + VectorBonus);
1454
1455 return InlineResult::success();
1456 }
1457
1458public:
1459 InlineCostFeaturesAnalyzer(
1460 const TargetTransformInfo &TTI,
1461 function_ref<AssumptionCache &(Function &)> &GetAssumptionCache,
1462 function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
1463 function_ref<const TargetLibraryInfo &(Function &)> GetTLI,
1464 ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE, Function &Callee,
1465 CallBase &Call)
1466 : CallAnalyzer(Callee, Call, TTI, GetAssumptionCache, GetBFI, GetTLI,
1467 PSI) {}
1468
1469 const InlineCostFeatures &features() const { return Cost; }
1470};
1471
1472} // namespace
1473
1474/// Test whether the given value is an Alloca-derived function argument.
1475bool CallAnalyzer::isAllocaDerivedArg(Value *V) {
1476 return SROAArgValues.count(Val: V);
1477}
1478
1479void CallAnalyzer::disableSROAForArg(AllocaInst *SROAArg) {
1480 onDisableSROA(Arg: SROAArg);
1481 EnabledSROAAllocas.erase(V: SROAArg);
1482 disableLoadElimination();
1483}
1484
1485void InlineCostAnnotationWriter::emitInstructionAnnot(
1486 const Instruction *I, formatted_raw_ostream &OS) {
1487 // The cost of inlining of the given instruction is printed always.
1488 // The threshold delta is printed only when it is non-zero. It happens
1489 // when we decided to give a bonus at a particular instruction.
1490 std::optional<InstructionCostDetail> Record = ICCA->getCostDetails(I);
1491 if (!Record)
1492 OS << "; No analysis for the instruction";
1493 else {
1494 OS << "; cost before = " << Record->CostBefore
1495 << ", cost after = " << Record->CostAfter
1496 << ", threshold before = " << Record->ThresholdBefore
1497 << ", threshold after = " << Record->ThresholdAfter << ", ";
1498 OS << "cost delta = " << Record->getCostDelta();
1499 if (Record->hasThresholdChanged())
1500 OS << ", threshold delta = " << Record->getThresholdDelta();
1501 }
1502 auto *V = ICCA->getSimplifiedValueUnchecked(V: const_cast<Instruction *>(I));
1503 if (V) {
1504 OS << ", simplified to ";
1505 V->print(O&: OS, IsForDebug: true);
1506 if (auto *VI = dyn_cast<Instruction>(Val: V)) {
1507 if (VI->getFunction() != I->getFunction())
1508 OS << " (caller instruction)";
1509 } else if (auto *VArg = dyn_cast<Argument>(Val: V)) {
1510 if (VArg->getParent() != I->getFunction())
1511 OS << " (caller argument)";
1512 }
1513 }
1514 OS << "\n";
1515}
1516
1517/// If 'V' maps to a SROA candidate, disable SROA for it.
1518void CallAnalyzer::disableSROA(Value *V) {
1519 if (auto *SROAArg = getSROAArgForValueOrNull(V)) {
1520 disableSROAForArg(SROAArg);
1521 }
1522}
1523
1524void CallAnalyzer::disableLoadElimination() {
1525 if (EnableLoadElimination) {
1526 onDisableLoadElimination();
1527 EnableLoadElimination = false;
1528 }
1529}
1530
1531/// Accumulate a constant GEP offset into an APInt if possible.
1532///
1533/// Returns false if unable to compute the offset for any reason. Respects any
1534/// simplified values known during the analysis of this callsite.
1535bool CallAnalyzer::accumulateGEPOffset(GEPOperator &GEP, APInt &Offset) {
1536 unsigned IntPtrWidth = DL.getIndexTypeSizeInBits(Ty: GEP.getType());
1537 assert(IntPtrWidth == Offset.getBitWidth());
1538
1539 for (gep_type_iterator GTI = gep_type_begin(GEP), GTE = gep_type_end(GEP);
1540 GTI != GTE; ++GTI) {
1541 ConstantInt *OpC =
1542 getDirectOrSimplifiedValue<ConstantInt>(V: GTI.getOperand());
1543 if (!OpC)
1544 return false;
1545 if (OpC->isZero())
1546 continue;
1547
1548 // Handle a struct index, which adds its field offset to the pointer.
1549 if (StructType *STy = GTI.getStructTypeOrNull()) {
1550 unsigned ElementIdx = OpC->getZExtValue();
1551 const StructLayout *SL = DL.getStructLayout(Ty: STy);
1552 Offset += APInt(IntPtrWidth, SL->getElementOffset(Idx: ElementIdx));
1553 continue;
1554 }
1555
1556 APInt TypeSize(IntPtrWidth, GTI.getSequentialElementStride(DL));
1557 Offset += OpC->getValue().sextOrTrunc(width: IntPtrWidth) * TypeSize;
1558 }
1559 return true;
1560}
1561
1562/// Use TTI to check whether a GEP is free.
1563///
1564/// Respects any simplified values known during the analysis of this callsite.
1565bool CallAnalyzer::isGEPFree(GetElementPtrInst &GEP) {
1566 SmallVector<Value *, 4> Operands;
1567 Operands.push_back(Elt: GEP.getOperand(i_nocapture: 0));
1568 for (const Use &Op : GEP.indices())
1569 if (Constant *SimpleOp = getSimplifiedValue<Constant>(V: Op))
1570 Operands.push_back(Elt: SimpleOp);
1571 else
1572 Operands.push_back(Elt: Op);
1573 return TTI.getInstructionCost(U: &GEP, Operands,
1574 CostKind: TargetTransformInfo::TCK_SizeAndLatency) ==
1575 TargetTransformInfo::TCC_Free;
1576}
1577
1578bool CallAnalyzer::visitAlloca(AllocaInst &I) {
1579 disableSROA(V: I.getOperand(i_nocapture: 0));
1580
1581 // Check whether inlining will turn a dynamic alloca into a static
1582 // alloca and handle that case.
1583 if (I.isArrayAllocation()) {
1584 Constant *Size = getSimplifiedValue<Constant>(V: I.getArraySize());
1585 if (auto *AllocSize = dyn_cast_or_null<ConstantInt>(Val: Size)) {
1586 // Sometimes a dynamic alloca could be converted into a static alloca
1587 // after this constant prop, and become a huge static alloca on an
1588 // unconditional CFG path. Avoid inlining if this is going to happen above
1589 // a threshold.
1590 // FIXME: If the threshold is removed or lowered too much, we could end up
1591 // being too pessimistic and prevent inlining non-problematic code. This
1592 // could result in unintended perf regressions. A better overall strategy
1593 // is needed to track stack usage during inlining.
1594 Type *Ty = I.getAllocatedType();
1595 AllocatedSize = SaturatingMultiplyAdd(
1596 X: AllocSize->getLimitedValue(),
1597 Y: DL.getTypeAllocSize(Ty).getKnownMinValue(), A: AllocatedSize);
1598 if (AllocatedSize > InlineConstants::MaxSimplifiedDynamicAllocaToInline)
1599 HasDynamicAlloca = true;
1600 return false;
1601 }
1602 }
1603
1604 if (I.isStaticAlloca()) {
1605 // Accumulate the allocated size if constant and executed once.
1606 // Note: if AllocSize is a vscale value, this is an underestimate of the
1607 // allocated size, and it also requires some of the cost of a dynamic
1608 // alloca, but is recorded here as a constant size alloca.
1609 TypeSize AllocSize = I.getAllocationSize(DL).value_or(u: TypeSize::getZero());
1610 AllocatedSize = SaturatingAdd(X: AllocSize.getKnownMinValue(), Y: AllocatedSize);
1611 } else {
1612 // FIXME: This is overly conservative. Dynamic allocas are inefficient for
1613 // a variety of reasons, and so we would like to not inline them into
1614 // functions which don't currently have a dynamic alloca. This simply
1615 // disables inlining altogether in the presence of a dynamic alloca.
1616 HasDynamicAlloca = true;
1617 }
1618
1619 return false;
1620}
1621
1622bool CallAnalyzer::visitPHI(PHINode &I) {
1623 // FIXME: We need to propagate SROA *disabling* through phi nodes, even
1624 // though we don't want to propagate it's bonuses. The idea is to disable
1625 // SROA if it *might* be used in an inappropriate manner.
1626
1627 // Phi nodes are always zero-cost.
1628 // FIXME: Pointer sizes may differ between different address spaces, so do we
1629 // need to use correct address space in the call to getPointerSizeInBits here?
1630 // Or could we skip the getPointerSizeInBits call completely? As far as I can
1631 // see the ZeroOffset is used as a dummy value, so we can probably use any
1632 // bit width for the ZeroOffset?
1633 APInt ZeroOffset = APInt::getZero(numBits: DL.getPointerSizeInBits(AS: 0));
1634 bool CheckSROA = I.getType()->isPointerTy();
1635
1636 // Track the constant or pointer with constant offset we've seen so far.
1637 Constant *FirstC = nullptr;
1638 std::pair<Value *, APInt> FirstBaseAndOffset = {nullptr, ZeroOffset};
1639 Value *FirstV = nullptr;
1640
1641 for (unsigned i = 0, e = I.getNumIncomingValues(); i != e; ++i) {
1642 BasicBlock *Pred = I.getIncomingBlock(i);
1643 // If the incoming block is dead, skip the incoming block.
1644 if (DeadBlocks.count(Ptr: Pred))
1645 continue;
1646 // If the parent block of phi is not the known successor of the incoming
1647 // block, skip the incoming block.
1648 BasicBlock *KnownSuccessor = KnownSuccessors[Pred];
1649 if (KnownSuccessor && KnownSuccessor != I.getParent())
1650 continue;
1651
1652 Value *V = I.getIncomingValue(i);
1653 // If the incoming value is this phi itself, skip the incoming value.
1654 if (&I == V)
1655 continue;
1656
1657 Constant *C = getDirectOrSimplifiedValue<Constant>(V);
1658
1659 std::pair<Value *, APInt> BaseAndOffset = {nullptr, ZeroOffset};
1660 if (!C && CheckSROA)
1661 BaseAndOffset = ConstantOffsetPtrs.lookup(Val: V);
1662
1663 if (!C && !BaseAndOffset.first)
1664 // The incoming value is neither a constant nor a pointer with constant
1665 // offset, exit early.
1666 return true;
1667
1668 if (FirstC) {
1669 if (FirstC == C)
1670 // If we've seen a constant incoming value before and it is the same
1671 // constant we see this time, continue checking the next incoming value.
1672 continue;
1673 // Otherwise early exit because we either see a different constant or saw
1674 // a constant before but we have a pointer with constant offset this time.
1675 return true;
1676 }
1677
1678 if (FirstV) {
1679 // The same logic as above, but check pointer with constant offset here.
1680 if (FirstBaseAndOffset == BaseAndOffset)
1681 continue;
1682 return true;
1683 }
1684
1685 if (C) {
1686 // This is the 1st time we've seen a constant, record it.
1687 FirstC = C;
1688 continue;
1689 }
1690
1691 // The remaining case is that this is the 1st time we've seen a pointer with
1692 // constant offset, record it.
1693 FirstV = V;
1694 FirstBaseAndOffset = BaseAndOffset;
1695 }
1696
1697 // Check if we can map phi to a constant.
1698 if (FirstC) {
1699 SimplifiedValues[&I] = FirstC;
1700 return true;
1701 }
1702
1703 // Check if we can map phi to a pointer with constant offset.
1704 if (FirstBaseAndOffset.first) {
1705 ConstantOffsetPtrs[&I] = std::move(FirstBaseAndOffset);
1706
1707 if (auto *SROAArg = getSROAArgForValueOrNull(V: FirstV))
1708 SROAArgValues[&I] = SROAArg;
1709 }
1710
1711 return true;
1712}
1713
1714/// Check we can fold GEPs of constant-offset call site argument pointers.
1715/// This requires target data and inbounds GEPs.
1716///
1717/// \return true if the specified GEP can be folded.
1718bool CallAnalyzer::canFoldInboundsGEP(GetElementPtrInst &I) {
1719 // Check if we have a base + offset for the pointer.
1720 std::pair<Value *, APInt> BaseAndOffset =
1721 ConstantOffsetPtrs.lookup(Val: I.getPointerOperand());
1722 if (!BaseAndOffset.first)
1723 return false;
1724
1725 // Check if the offset of this GEP is constant, and if so accumulate it
1726 // into Offset.
1727 if (!accumulateGEPOffset(GEP&: cast<GEPOperator>(Val&: I), Offset&: BaseAndOffset.second))
1728 return false;
1729
1730 // Add the result as a new mapping to Base + Offset.
1731 ConstantOffsetPtrs[&I] = std::move(BaseAndOffset);
1732
1733 return true;
1734}
1735
1736bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) {
1737 auto *SROAArg = getSROAArgForValueOrNull(V: I.getPointerOperand());
1738
1739 // Lambda to check whether a GEP's indices are all constant.
1740 auto IsGEPOffsetConstant = [&](GetElementPtrInst &GEP) {
1741 for (const Use &Op : GEP.indices())
1742 if (!getDirectOrSimplifiedValue<Constant>(V: Op))
1743 return false;
1744 return true;
1745 };
1746
1747 if (!DisableGEPConstOperand)
1748 if (simplifyInstruction(I))
1749 return true;
1750
1751 if ((I.isInBounds() && canFoldInboundsGEP(I)) || IsGEPOffsetConstant(I)) {
1752 if (SROAArg)
1753 SROAArgValues[&I] = SROAArg;
1754
1755 // Constant GEPs are modeled as free.
1756 return true;
1757 }
1758
1759 // Variable GEPs will require math and will disable SROA.
1760 if (SROAArg)
1761 disableSROAForArg(SROAArg);
1762 return isGEPFree(GEP&: I);
1763}
1764
1765// Simplify \p Cmp if RHS is const and we can ValueTrack LHS.
1766// This handles the case only when the Cmp instruction is guarding a recursive
1767// call that will cause the Cmp to fail/succeed for the recursive call.
1768bool CallAnalyzer::simplifyCmpInstForRecCall(CmpInst &Cmp) {
1769 // Bail out if LHS is not a function argument or RHS is NOT const:
1770 if (!isa<Argument>(Val: Cmp.getOperand(i_nocapture: 0)) || !isa<Constant>(Val: Cmp.getOperand(i_nocapture: 1)))
1771 return false;
1772 auto *CmpOp = Cmp.getOperand(i_nocapture: 0);
1773 // Make sure that the callsite is recursive:
1774 if (CandidateCall.getCaller() != &F)
1775 return false;
1776 // Only handle the case when the callsite has a single predecessor:
1777 auto *CallBB = CandidateCall.getParent();
1778 auto *Predecessor = CallBB->getSinglePredecessor();
1779 if (!Predecessor)
1780 return false;
1781 // Check if the callsite is guarded by the same Cmp instruction:
1782 auto *Br = dyn_cast<CondBrInst>(Val: Predecessor->getTerminator());
1783 if (!Br || Br->getCondition() != &Cmp)
1784 return false;
1785
1786 // Check if there is any arg of the recursive callsite is affecting the cmp
1787 // instr:
1788 bool ArgFound = false;
1789 Value *FuncArg = nullptr, *CallArg = nullptr;
1790 for (unsigned ArgNum = 0;
1791 ArgNum < F.arg_size() && ArgNum < CandidateCall.arg_size(); ArgNum++) {
1792 FuncArg = F.getArg(i: ArgNum);
1793 CallArg = CandidateCall.getArgOperand(i: ArgNum);
1794 if (FuncArg == CmpOp && CallArg != CmpOp) {
1795 ArgFound = true;
1796 break;
1797 }
1798 }
1799 if (!ArgFound)
1800 return false;
1801
1802 // Now we have a recursive call that is guarded by a cmp instruction.
1803 // Check if this cmp can be simplified:
1804 SimplifyQuery SQ(DL, dyn_cast<Instruction>(Val: CallArg));
1805 CondContext CC(&Cmp);
1806 CC.Invert = (CallBB != Br->getSuccessor(i: 0));
1807 SQ.CC = &CC;
1808 CC.AffectedValues.insert(Ptr: FuncArg);
1809 Value *SimplifiedInstruction = llvm::simplifyInstructionWithOperands(
1810 I: cast<CmpInst>(Val: &Cmp), NewOps: {CallArg, Cmp.getOperand(i_nocapture: 1)}, Q: SQ);
1811 if (auto *ConstVal = dyn_cast_or_null<ConstantInt>(Val: SimplifiedInstruction)) {
1812 // Make sure that the BB of the recursive call is NOT the true successor
1813 // of the icmp. In other words, make sure that the recursion depth is 1.
1814 if ((ConstVal->isOne() && CC.Invert) ||
1815 (ConstVal->isZero() && !CC.Invert)) {
1816 SimplifiedValues[&Cmp] = ConstVal;
1817 return true;
1818 }
1819 }
1820 return false;
1821}
1822
1823/// Simplify \p I if its operands are constants and update SimplifiedValues.
1824bool CallAnalyzer::simplifyInstruction(Instruction &I) {
1825 SmallVector<Constant *> COps;
1826 for (Value *Op : I.operands()) {
1827 Constant *COp = getDirectOrSimplifiedValue<Constant>(V: Op);
1828 if (!COp)
1829 return false;
1830 COps.push_back(Elt: COp);
1831 }
1832 auto *C = ConstantFoldInstOperands(I: &I, Ops: COps, DL);
1833 if (!C)
1834 return false;
1835 SimplifiedValues[&I] = C;
1836 return true;
1837}
1838
1839/// Try to simplify a call to llvm.is.constant.
1840///
1841/// Duplicate the argument checking from CallAnalyzer::simplifyCallSite since
1842/// we expect calls of this specific intrinsic to be infrequent.
1843///
1844/// FIXME: Given that we know CB's parent (F) caller
1845/// (CandidateCall->getParent()->getParent()), we might be able to determine
1846/// whether inlining F into F's caller would change how the call to
1847/// llvm.is.constant would evaluate.
1848bool CallAnalyzer::simplifyIntrinsicCallIsConstant(CallBase &CB) {
1849 Value *Arg = CB.getArgOperand(i: 0);
1850 auto *C = getDirectOrSimplifiedValue<Constant>(V: Arg);
1851
1852 Type *RT = CB.getFunctionType()->getReturnType();
1853 SimplifiedValues[&CB] = ConstantInt::get(Ty: RT, V: C ? 1 : 0);
1854 return true;
1855}
1856
1857bool CallAnalyzer::simplifyIntrinsicCallObjectSize(CallBase &CB) {
1858 // As per the langref, "The fourth argument to llvm.objectsize determines if
1859 // the value should be evaluated at runtime."
1860 if (cast<ConstantInt>(Val: CB.getArgOperand(i: 3))->isOne())
1861 return false;
1862
1863 Value *V = lowerObjectSizeCall(ObjectSize: &cast<IntrinsicInst>(Val&: CB), DL, TLI: nullptr,
1864 /*MustSucceed=*/true);
1865 Constant *C = dyn_cast_or_null<Constant>(Val: V);
1866 if (C)
1867 SimplifiedValues[&CB] = C;
1868 return C;
1869}
1870
1871bool CallAnalyzer::visitBitCast(BitCastInst &I) {
1872 // Propagate constants through bitcasts.
1873 if (simplifyInstruction(I))
1874 return true;
1875
1876 // Track base/offsets through casts
1877 std::pair<Value *, APInt> BaseAndOffset =
1878 ConstantOffsetPtrs.lookup(Val: I.getOperand(i_nocapture: 0));
1879 // Casts don't change the offset, just wrap it up.
1880 if (BaseAndOffset.first)
1881 ConstantOffsetPtrs[&I] = std::move(BaseAndOffset);
1882
1883 // Also look for SROA candidates here.
1884 if (auto *SROAArg = getSROAArgForValueOrNull(V: I.getOperand(i_nocapture: 0)))
1885 SROAArgValues[&I] = SROAArg;
1886
1887 // Bitcasts are always zero cost.
1888 return true;
1889}
1890
1891bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) {
1892 // Propagate constants through ptrtoint.
1893 if (simplifyInstruction(I))
1894 return true;
1895
1896 // Track base/offset pairs when converted to a plain integer provided the
1897 // integer is large enough to represent the pointer.
1898 unsigned IntegerSize = I.getType()->getScalarSizeInBits();
1899 unsigned AS = I.getOperand(i_nocapture: 0)->getType()->getPointerAddressSpace();
1900 if (IntegerSize == DL.getPointerSizeInBits(AS)) {
1901 std::pair<Value *, APInt> BaseAndOffset =
1902 ConstantOffsetPtrs.lookup(Val: I.getOperand(i_nocapture: 0));
1903 if (BaseAndOffset.first)
1904 ConstantOffsetPtrs[&I] = std::move(BaseAndOffset);
1905 }
1906
1907 // This is really weird. Technically, ptrtoint will disable SROA. However,
1908 // unless that ptrtoint is *used* somewhere in the live basic blocks after
1909 // inlining, it will be nuked, and SROA should proceed. All of the uses which
1910 // would block SROA would also block SROA if applied directly to a pointer,
1911 // and so we can just add the integer in here. The only places where SROA is
1912 // preserved either cannot fire on an integer, or won't in-and-of themselves
1913 // disable SROA (ext) w/o some later use that we would see and disable.
1914 if (auto *SROAArg = getSROAArgForValueOrNull(V: I.getOperand(i_nocapture: 0)))
1915 SROAArgValues[&I] = SROAArg;
1916
1917 return TTI.getInstructionCost(U: &I, CostKind: TargetTransformInfo::TCK_SizeAndLatency) ==
1918 TargetTransformInfo::TCC_Free;
1919}
1920
1921bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) {
1922 // Propagate constants through ptrtoint.
1923 if (simplifyInstruction(I))
1924 return true;
1925
1926 // Track base/offset pairs when round-tripped through a pointer without
1927 // modifications provided the integer is not too large.
1928 Value *Op = I.getOperand(i_nocapture: 0);
1929 unsigned IntegerSize = Op->getType()->getScalarSizeInBits();
1930 if (IntegerSize <= DL.getPointerTypeSizeInBits(I.getType())) {
1931 std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Val: Op);
1932 if (BaseAndOffset.first)
1933 ConstantOffsetPtrs[&I] = std::move(BaseAndOffset);
1934 }
1935
1936 // "Propagate" SROA here in the same manner as we do for ptrtoint above.
1937 if (auto *SROAArg = getSROAArgForValueOrNull(V: Op))
1938 SROAArgValues[&I] = SROAArg;
1939
1940 return TTI.getInstructionCost(U: &I, CostKind: TargetTransformInfo::TCK_SizeAndLatency) ==
1941 TargetTransformInfo::TCC_Free;
1942}
1943
1944bool CallAnalyzer::visitCastInst(CastInst &I) {
1945 // Propagate constants through casts.
1946 if (simplifyInstruction(I))
1947 return true;
1948
1949 // Disable SROA in the face of arbitrary casts we don't explicitly list
1950 // elsewhere.
1951 disableSROA(V: I.getOperand(i_nocapture: 0));
1952
1953 // If this is a floating-point cast, and the target says this operation
1954 // is expensive, this may eventually become a library call. Treat the cost
1955 // as such.
1956 switch (I.getOpcode()) {
1957 case Instruction::FPTrunc:
1958 case Instruction::FPExt:
1959 case Instruction::UIToFP:
1960 case Instruction::SIToFP:
1961 case Instruction::FPToUI:
1962 case Instruction::FPToSI:
1963 if (TTI.getFPOpCost(Ty: I.getType()) == TargetTransformInfo::TCC_Expensive)
1964 onCallPenalty();
1965 break;
1966 default:
1967 break;
1968 }
1969
1970 return TTI.getInstructionCost(U: &I, CostKind: TargetTransformInfo::TCK_SizeAndLatency) ==
1971 TargetTransformInfo::TCC_Free;
1972}
1973
1974bool CallAnalyzer::paramHasAttr(Argument *A, Attribute::AttrKind Attr) {
1975 return CandidateCall.paramHasAttr(ArgNo: A->getArgNo(), Kind: Attr);
1976}
1977
1978bool CallAnalyzer::isKnownNonNullInCallee(Value *V) {
1979 // Does the *call site* have the NonNull attribute set on an argument? We
1980 // use the attribute on the call site to memoize any analysis done in the
1981 // caller. This will also trip if the callee function has a non-null
1982 // parameter attribute, but that's a less interesting case because hopefully
1983 // the callee would already have been simplified based on that.
1984 if (Argument *A = dyn_cast<Argument>(Val: V))
1985 if (paramHasAttr(A, Attr: Attribute::NonNull))
1986 return true;
1987
1988 // Is this an alloca in the caller? This is distinct from the attribute case
1989 // above because attributes aren't updated within the inliner itself and we
1990 // always want to catch the alloca derived case.
1991 if (isAllocaDerivedArg(V))
1992 // We can actually predict the result of comparisons between an
1993 // alloca-derived value and null. Note that this fires regardless of
1994 // SROA firing.
1995 return true;
1996
1997 return false;
1998}
1999
2000bool CallAnalyzer::allowSizeGrowth(CallBase &Call) {
2001 // If the normal destination of the invoke or the parent block of the call
2002 // site is unreachable-terminated, there is little point in inlining this
2003 // unless there is literally zero cost.
2004 // FIXME: Note that it is possible that an unreachable-terminated block has a
2005 // hot entry. For example, in below scenario inlining hot_call_X() may be
2006 // beneficial :
2007 // main() {
2008 // hot_call_1();
2009 // ...
2010 // hot_call_N()
2011 // exit(0);
2012 // }
2013 // For now, we are not handling this corner case here as it is rare in real
2014 // code. In future, we should elaborate this based on BPI and BFI in more
2015 // general threshold adjusting heuristics in updateThreshold().
2016 if (InvokeInst *II = dyn_cast<InvokeInst>(Val: &Call)) {
2017 if (isa<UnreachableInst>(Val: II->getNormalDest()->getTerminator()))
2018 return false;
2019 } else if (isa<UnreachableInst>(Val: Call.getParent()->getTerminator()))
2020 return false;
2021
2022 return true;
2023}
2024
2025bool InlineCostCallAnalyzer::isColdCallSite(CallBase &Call,
2026 BlockFrequencyInfo *CallerBFI) {
2027 // If global profile summary is available, then callsite's coldness is
2028 // determined based on that.
2029 if (PSI && PSI->hasProfileSummary())
2030 return PSI->isColdCallSite(CB: Call, BFI: CallerBFI);
2031
2032 // Otherwise we need BFI to be available.
2033 if (!CallerBFI)
2034 return false;
2035
2036 // Determine if the callsite is cold relative to caller's entry. We could
2037 // potentially cache the computation of scaled entry frequency, but the added
2038 // complexity is not worth it unless this scaling shows up high in the
2039 // profiles.
2040 const BranchProbability ColdProb(ColdCallSiteRelFreq, 100);
2041 auto CallSiteBB = Call.getParent();
2042 auto CallSiteFreq = CallerBFI->getBlockFreq(BB: CallSiteBB);
2043 auto CallerEntryFreq =
2044 CallerBFI->getBlockFreq(BB: &(Call.getCaller()->getEntryBlock()));
2045 return CallSiteFreq < CallerEntryFreq * ColdProb;
2046}
2047
2048std::optional<int>
2049InlineCostCallAnalyzer::getHotCallSiteThreshold(CallBase &Call,
2050 BlockFrequencyInfo *CallerBFI) {
2051
2052 // If global profile summary is available, then callsite's hotness is
2053 // determined based on that.
2054 if (PSI && PSI->hasProfileSummary() && PSI->isHotCallSite(CB: Call, BFI: CallerBFI))
2055 return Params.HotCallSiteThreshold;
2056
2057 // Otherwise we need BFI to be available and to have a locally hot callsite
2058 // threshold.
2059 if (!CallerBFI || !Params.LocallyHotCallSiteThreshold)
2060 return std::nullopt;
2061
2062 // Determine if the callsite is hot relative to caller's entry. We could
2063 // potentially cache the computation of scaled entry frequency, but the added
2064 // complexity is not worth it unless this scaling shows up high in the
2065 // profiles.
2066 const BasicBlock *CallSiteBB = Call.getParent();
2067 BlockFrequency CallSiteFreq = CallerBFI->getBlockFreq(BB: CallSiteBB);
2068 BlockFrequency CallerEntryFreq = CallerBFI->getEntryFreq();
2069 std::optional<BlockFrequency> Limit = CallerEntryFreq.mul(Factor: HotCallSiteRelFreq);
2070 if (Limit && CallSiteFreq >= *Limit)
2071 return Params.LocallyHotCallSiteThreshold;
2072
2073 // Otherwise treat it normally.
2074 return std::nullopt;
2075}
2076
2077void InlineCostCallAnalyzer::updateThreshold(CallBase &Call, Function &Callee) {
2078 // If no size growth is allowed for this inlining, set Threshold to 0.
2079 if (!allowSizeGrowth(Call)) {
2080 Threshold = 0;
2081 return;
2082 }
2083
2084 Function *Caller = Call.getCaller();
2085
2086 // return min(A, B) if B is valid.
2087 auto MinIfValid = [](int A, std::optional<int> B) {
2088 return B ? std::min(a: A, b: *B) : A;
2089 };
2090
2091 // return max(A, B) if B is valid.
2092 auto MaxIfValid = [](int A, std::optional<int> B) {
2093 return B ? std::max(a: A, b: *B) : A;
2094 };
2095
2096 // Various bonus percentages. These are multiplied by Threshold to get the
2097 // bonus values.
2098 // SingleBBBonus: This bonus is applied if the callee has a single reachable
2099 // basic block at the given callsite context. This is speculatively applied
2100 // and withdrawn if more than one basic block is seen.
2101 //
2102 // LstCallToStaticBonus: This large bonus is applied to ensure the inlining
2103 // of the last call to a static function as inlining such functions is
2104 // guaranteed to reduce code size.
2105 //
2106 // These bonus percentages may be set to 0 based on properties of the caller
2107 // and the callsite.
2108 int SingleBBBonusPercent = 50;
2109 int VectorBonusPercent = TTI.getInlinerVectorBonusPercent();
2110 int LastCallToStaticBonus = TTI.getInliningLastCallToStaticBonus();
2111
2112 // Lambda to set all the above bonus and bonus percentages to 0.
2113 auto DisallowAllBonuses = [&]() {
2114 SingleBBBonusPercent = 0;
2115 VectorBonusPercent = 0;
2116 LastCallToStaticBonus = 0;
2117 };
2118
2119 // Use the OptMinSizeThreshold or OptSizeThreshold knob if they are available
2120 // and reduce the threshold if the caller has the necessary attribute.
2121 if (Caller->hasMinSize()) {
2122 Threshold = MinIfValid(Threshold, Params.OptMinSizeThreshold);
2123 // For minsize, we want to disable the single BB bonus and the vector
2124 // bonuses, but not the last-call-to-static bonus. Inlining the last call to
2125 // a static function will, at the minimum, eliminate the parameter setup and
2126 // call/return instructions.
2127 SingleBBBonusPercent = 0;
2128 VectorBonusPercent = 0;
2129 } else if (Caller->hasOptSize())
2130 Threshold = MinIfValid(Threshold, Params.OptSizeThreshold);
2131
2132 // Adjust the threshold based on inlinehint attribute and profile based
2133 // hotness information if the caller does not have MinSize attribute.
2134 if (!Caller->hasMinSize()) {
2135 if (Callee.hasFnAttribute(Kind: Attribute::InlineHint))
2136 Threshold = MaxIfValid(Threshold, Params.HintThreshold);
2137
2138 // FIXME: After switching to the new passmanager, simplify the logic below
2139 // by checking only the callsite hotness/coldness as we will reliably
2140 // have local profile information.
2141 //
2142 // Callsite hotness and coldness can be determined if sample profile is
2143 // used (which adds hotness metadata to calls) or if caller's
2144 // BlockFrequencyInfo is available.
2145 BlockFrequencyInfo *CallerBFI = GetBFI ? &(GetBFI(*Caller)) : nullptr;
2146 auto HotCallSiteThreshold = getHotCallSiteThreshold(Call, CallerBFI);
2147 if (!Caller->hasOptSize() && HotCallSiteThreshold) {
2148 LLVM_DEBUG(dbgs() << "Hot callsite.\n");
2149 // FIXME: This should update the threshold only if it exceeds the
2150 // current threshold, but AutoFDO + ThinLTO currently relies on this
2151 // behavior to prevent inlining of hot callsites during ThinLTO
2152 // compile phase.
2153 Threshold = *HotCallSiteThreshold;
2154 } else if (isColdCallSite(Call, CallerBFI)) {
2155 LLVM_DEBUG(dbgs() << "Cold callsite.\n");
2156 // Do not apply bonuses for a cold callsite including the
2157 // LastCallToStatic bonus. While this bonus might result in code size
2158 // reduction, it can cause the size of a non-cold caller to increase
2159 // preventing it from being inlined.
2160 DisallowAllBonuses();
2161 Threshold = MinIfValid(Threshold, Params.ColdCallSiteThreshold);
2162 } else if (PSI) {
2163 // Use callee's global profile information only if we have no way of
2164 // determining this via callsite information.
2165 if (PSI->isFunctionEntryHot(F: &Callee)) {
2166 LLVM_DEBUG(dbgs() << "Hot callee.\n");
2167 // If callsite hotness can not be determined, we may still know
2168 // that the callee is hot and treat it as a weaker hint for threshold
2169 // increase.
2170 Threshold = MaxIfValid(Threshold, Params.HintThreshold);
2171 } else if (PSI->isFunctionEntryCold(F: &Callee)) {
2172 LLVM_DEBUG(dbgs() << "Cold callee.\n");
2173 // Do not apply bonuses for a cold callee including the
2174 // LastCallToStatic bonus. While this bonus might result in code size
2175 // reduction, it can cause the size of a non-cold caller to increase
2176 // preventing it from being inlined.
2177 DisallowAllBonuses();
2178 Threshold = MinIfValid(Threshold, Params.ColdThreshold);
2179 }
2180 }
2181 }
2182
2183 Threshold += TTI.adjustInliningThreshold(CB: &Call);
2184
2185 // Finally, take the target-specific inlining threshold multiplier into
2186 // account.
2187 Threshold *= TTI.getInliningThresholdMultiplier();
2188
2189 SingleBBBonus = Threshold * SingleBBBonusPercent / 100;
2190 VectorBonus = Threshold * VectorBonusPercent / 100;
2191
2192 // If there is only one call of the function, and it has internal linkage,
2193 // the cost of inlining it drops dramatically. It may seem odd to update
2194 // Cost in updateThreshold, but the bonus depends on the logic in this method.
2195 if (isSoleCallToLocalFunction(CB: Call, Callee: F)) {
2196 addCost(Inc: -LastCallToStaticBonus);
2197 StaticBonusApplied = LastCallToStaticBonus;
2198 }
2199}
2200
2201bool CallAnalyzer::visitCmpInst(CmpInst &I) {
2202 Value *LHS = I.getOperand(i_nocapture: 0), *RHS = I.getOperand(i_nocapture: 1);
2203 // First try to handle simplified comparisons.
2204 if (simplifyInstruction(I))
2205 return true;
2206
2207 // Try to handle comparison that can be simplified using ValueTracking.
2208 if (simplifyCmpInstForRecCall(Cmp&: I))
2209 return true;
2210
2211 if (I.getOpcode() == Instruction::FCmp)
2212 return false;
2213
2214 // Otherwise look for a comparison between constant offset pointers with
2215 // a common base.
2216 Value *LHSBase, *RHSBase;
2217 APInt LHSOffset, RHSOffset;
2218 std::tie(args&: LHSBase, args&: LHSOffset) = ConstantOffsetPtrs.lookup(Val: LHS);
2219 if (LHSBase) {
2220 std::tie(args&: RHSBase, args&: RHSOffset) = ConstantOffsetPtrs.lookup(Val: RHS);
2221 if (RHSBase && LHSBase == RHSBase) {
2222 // We have common bases, fold the icmp to a constant based on the
2223 // offsets.
2224 SimplifiedValues[&I] = ConstantInt::getBool(
2225 Ty: I.getType(),
2226 V: ICmpInst::compare(LHS: LHSOffset, RHS: RHSOffset, Pred: I.getPredicate()));
2227 ++NumConstantPtrCmps;
2228 return true;
2229 }
2230 }
2231
2232 auto isImplicitNullCheckCmp = [](const CmpInst &I) {
2233 for (auto *User : I.users())
2234 if (auto *Instr = dyn_cast<Instruction>(Val: User))
2235 if (!Instr->getMetadata(KindID: LLVMContext::MD_make_implicit))
2236 return false;
2237 return true;
2238 };
2239
2240 // If the comparison is an equality comparison with null, we can simplify it
2241 // if we know the value (argument) can't be null
2242 if (I.isEquality() && isa<ConstantPointerNull>(Val: I.getOperand(i_nocapture: 1))) {
2243 if (isKnownNonNullInCallee(V: I.getOperand(i_nocapture: 0))) {
2244 bool IsNotEqual = I.getPredicate() == CmpInst::ICMP_NE;
2245 SimplifiedValues[&I] = IsNotEqual ? ConstantInt::getTrue(Ty: I.getType())
2246 : ConstantInt::getFalse(Ty: I.getType());
2247 return true;
2248 }
2249 // Implicit null checks act as unconditional branches and their comparisons
2250 // should be treated as simplified and free of cost.
2251 if (isImplicitNullCheckCmp(I))
2252 return true;
2253 }
2254 return handleSROA(V: I.getOperand(i_nocapture: 0), DoNotDisable: isa<ConstantPointerNull>(Val: I.getOperand(i_nocapture: 1)));
2255}
2256
2257bool CallAnalyzer::visitSub(BinaryOperator &I) {
2258 // Try to handle a special case: we can fold computing the difference of two
2259 // constant-related pointers.
2260 Value *LHS = I.getOperand(i_nocapture: 0), *RHS = I.getOperand(i_nocapture: 1);
2261 Value *LHSBase, *RHSBase;
2262 APInt LHSOffset, RHSOffset;
2263 std::tie(args&: LHSBase, args&: LHSOffset) = ConstantOffsetPtrs.lookup(Val: LHS);
2264 if (LHSBase) {
2265 std::tie(args&: RHSBase, args&: RHSOffset) = ConstantOffsetPtrs.lookup(Val: RHS);
2266 if (RHSBase && LHSBase == RHSBase) {
2267 // We have common bases, fold the subtract to a constant based on the
2268 // offsets.
2269 Constant *CLHS = ConstantInt::get(Context&: LHS->getContext(), V: LHSOffset);
2270 Constant *CRHS = ConstantInt::get(Context&: RHS->getContext(), V: RHSOffset);
2271 if (Constant *C = ConstantExpr::getSub(C1: CLHS, C2: CRHS)) {
2272 SimplifiedValues[&I] = C;
2273 ++NumConstantPtrDiffs;
2274 return true;
2275 }
2276 }
2277 }
2278
2279 // Otherwise, fall back to the generic logic for simplifying and handling
2280 // instructions.
2281 return Base::visitSub(I);
2282}
2283
2284bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) {
2285 Value *LHS = I.getOperand(i_nocapture: 0), *RHS = I.getOperand(i_nocapture: 1);
2286 Constant *CLHS = getDirectOrSimplifiedValue<Constant>(V: LHS);
2287 Constant *CRHS = getDirectOrSimplifiedValue<Constant>(V: RHS);
2288
2289 Value *SimpleV = nullptr;
2290 if (auto FI = dyn_cast<FPMathOperator>(Val: &I))
2291 SimpleV = simplifyBinOp(Opcode: I.getOpcode(), LHS: CLHS ? CLHS : LHS, RHS: CRHS ? CRHS : RHS,
2292 FMF: FI->getFastMathFlags(), Q: DL);
2293 else
2294 SimpleV =
2295 simplifyBinOp(Opcode: I.getOpcode(), LHS: CLHS ? CLHS : LHS, RHS: CRHS ? CRHS : RHS, Q: DL);
2296
2297 if (Constant *C = dyn_cast_or_null<Constant>(Val: SimpleV))
2298 SimplifiedValues[&I] = C;
2299
2300 if (SimpleV)
2301 return true;
2302
2303 // Disable any SROA on arguments to arbitrary, unsimplified binary operators.
2304 disableSROA(V: LHS);
2305 disableSROA(V: RHS);
2306
2307 // If the instruction is floating point, and the target says this operation
2308 // is expensive, this may eventually become a library call. Treat the cost
2309 // as such. Unless it's fneg which can be implemented with an xor.
2310 using namespace llvm::PatternMatch;
2311 if (I.getType()->isFloatingPointTy() &&
2312 TTI.getFPOpCost(Ty: I.getType()) == TargetTransformInfo::TCC_Expensive &&
2313 !match(V: &I, P: m_FNeg(X: m_Value())))
2314 onCallPenalty();
2315
2316 return false;
2317}
2318
2319bool CallAnalyzer::visitFNeg(UnaryOperator &I) {
2320 Value *Op = I.getOperand(i_nocapture: 0);
2321 Constant *COp = getDirectOrSimplifiedValue<Constant>(V: Op);
2322
2323 Value *SimpleV = simplifyFNegInst(
2324 Op: COp ? COp : Op, FMF: cast<FPMathOperator>(Val&: I).getFastMathFlags(), Q: DL);
2325
2326 if (Constant *C = dyn_cast_or_null<Constant>(Val: SimpleV))
2327 SimplifiedValues[&I] = C;
2328
2329 if (SimpleV)
2330 return true;
2331
2332 // Disable any SROA on arguments to arbitrary, unsimplified fneg.
2333 disableSROA(V: Op);
2334
2335 return false;
2336}
2337
2338bool CallAnalyzer::visitLoad(LoadInst &I) {
2339 if (handleSROA(V: I.getPointerOperand(), DoNotDisable: I.isSimple()))
2340 return true;
2341
2342 // If the data is already loaded from this address and hasn't been clobbered
2343 // by any stores or calls, this load is likely to be redundant and can be
2344 // eliminated.
2345 if (EnableLoadElimination &&
2346 !LoadAddrSet.insert(Ptr: I.getPointerOperand()).second && I.isUnordered()) {
2347 onLoadEliminationOpportunity();
2348 return true;
2349 }
2350
2351 onMemAccess();
2352 return false;
2353}
2354
2355bool CallAnalyzer::visitStore(StoreInst &I) {
2356 if (handleSROA(V: I.getPointerOperand(), DoNotDisable: I.isSimple()))
2357 return true;
2358
2359 // The store can potentially clobber loads and prevent repeated loads from
2360 // being eliminated.
2361 // FIXME:
2362 // 1. We can probably keep an initial set of eliminatable loads substracted
2363 // from the cost even when we finally see a store. We just need to disable
2364 // *further* accumulation of elimination savings.
2365 // 2. We should probably at some point thread MemorySSA for the callee into
2366 // this and then use that to actually compute *really* precise savings.
2367 disableLoadElimination();
2368
2369 onMemAccess();
2370 return false;
2371}
2372
2373bool CallAnalyzer::visitExtractValue(ExtractValueInst &I) {
2374 Value *Op = I.getAggregateOperand();
2375
2376 // Special handling, because we want to simplify extractvalue with a
2377 // potential insertvalue from the caller.
2378 if (Value *SimpleOp = getSimplifiedValueUnchecked(V: Op)) {
2379 SimplifyQuery SQ(DL);
2380 Value *SimpleV = simplifyExtractValueInst(Agg: SimpleOp, Idxs: I.getIndices(), Q: SQ);
2381 if (SimpleV) {
2382 SimplifiedValues[&I] = SimpleV;
2383 return true;
2384 }
2385 }
2386
2387 // SROA can't look through these, but they may be free.
2388 return Base::visitExtractValue(I);
2389}
2390
2391bool CallAnalyzer::visitInsertValue(InsertValueInst &I) {
2392 // Constant folding for insert value is trivial.
2393 if (simplifyInstruction(I))
2394 return true;
2395
2396 // SROA can't look through these, but they may be free.
2397 return Base::visitInsertValue(I);
2398}
2399
2400/// Try to simplify a call site.
2401///
2402/// Takes a concrete function and callsite and tries to actually simplify it by
2403/// analyzing the arguments and call itself with instsimplify. Returns true if
2404/// it has simplified the callsite to some other entity (a constant), making it
2405/// free.
2406bool CallAnalyzer::simplifyCallSite(Function *F, CallBase &Call) {
2407 // FIXME: Using the instsimplify logic directly for this is inefficient
2408 // because we have to continually rebuild the argument list even when no
2409 // simplifications can be performed. Until that is fixed with remapping
2410 // inside of instsimplify, directly constant fold calls here.
2411 if (!canConstantFoldCallTo(Call: &Call, F))
2412 return false;
2413
2414 // Try to re-map the arguments to constants.
2415 SmallVector<Constant *, 4> ConstantArgs;
2416 ConstantArgs.reserve(N: Call.arg_size());
2417 for (Value *I : Call.args()) {
2418 Constant *C = getDirectOrSimplifiedValue<Constant>(V: I);
2419 if (!C)
2420 return false; // This argument doesn't map to a constant.
2421
2422 ConstantArgs.push_back(Elt: C);
2423 }
2424 if (Constant *C = ConstantFoldCall(Call: &Call, F, Operands: ConstantArgs)) {
2425 SimplifiedValues[&Call] = C;
2426 return true;
2427 }
2428
2429 return false;
2430}
2431
2432bool CallAnalyzer::isLoweredToCall(Function *F, CallBase &Call) {
2433 const TargetLibraryInfo *TLI = GetTLI ? &GetTLI(*F) : nullptr;
2434 LibFunc LF;
2435 if (!TLI || !TLI->getLibFunc(FDecl: *F, F&: LF) || !TLI->has(F: LF))
2436 return TTI.isLoweredToCall(F);
2437
2438 switch (LF) {
2439 case LibFunc_memcpy_chk:
2440 case LibFunc_memmove_chk:
2441 case LibFunc_mempcpy_chk:
2442 case LibFunc_memset_chk: {
2443 // Calls to __memcpy_chk whose length is known to fit within the object
2444 // size will eventually be replaced by inline stores. Therefore, these
2445 // should not incur a call penalty. This is only really relevant on
2446 // platforms whose headers redirect memcpy to __memcpy_chk (e.g. Darwin), as
2447 // other platforms use memcpy intrinsics, which are already exempt from the
2448 // call penalty.
2449 auto *LenOp = getDirectOrSimplifiedValue<ConstantInt>(V: Call.getOperand(i_nocapture: 2));
2450 auto *ObjSizeOp =
2451 getDirectOrSimplifiedValue<ConstantInt>(V: Call.getOperand(i_nocapture: 3));
2452 if (LenOp && ObjSizeOp &&
2453 LenOp->getLimitedValue() <= ObjSizeOp->getLimitedValue()) {
2454 return false;
2455 }
2456 break;
2457 }
2458 default:
2459 break;
2460 }
2461
2462 return TTI.isLoweredToCall(F);
2463}
2464
2465bool CallAnalyzer::visitCallBase(CallBase &Call) {
2466 if (!onCallBaseVisitStart(Call))
2467 return true;
2468
2469 if (Call.hasFnAttr(Kind: Attribute::ReturnsTwice) &&
2470 !F.hasFnAttribute(Kind: Attribute::ReturnsTwice)) {
2471 // This aborts the entire analysis.
2472 ExposesReturnsTwice = true;
2473 return false;
2474 }
2475 if (isa<CallInst>(Val: Call) && cast<CallInst>(Val&: Call).cannotDuplicate())
2476 ContainsNoDuplicateCall = true;
2477
2478 if (InlineAsm *InlineAsmOp = dyn_cast<InlineAsm>(Val: Call.getCalledOperand()))
2479 onInlineAsm(Arg: *InlineAsmOp);
2480
2481 Function *F = Call.getCalledFunction();
2482 bool IsIndirectCall = !F;
2483 if (IsIndirectCall) {
2484 // Check if this happens to be an indirect function call to a known function
2485 // in this inline context. If not, we've done all we can.
2486 Value *Callee = Call.getCalledOperand();
2487 F = getSimplifiedValue<Function>(V: Callee);
2488 if (!F || F->getFunctionType() != Call.getFunctionType()) {
2489 onCallArgumentSetup(Call);
2490
2491 if (!Call.onlyReadsMemory())
2492 disableLoadElimination();
2493 return Base::visitCallBase(I&: Call);
2494 }
2495 }
2496
2497 assert(F && "Expected a call to a known function");
2498
2499 // When we have a concrete function, first try to simplify it directly.
2500 if (simplifyCallSite(F, Call))
2501 return true;
2502
2503 // Next check if it is an intrinsic we know about.
2504 // FIXME: Lift this into part of the InstVisitor.
2505 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Val: &Call)) {
2506 switch (II->getIntrinsicID()) {
2507 default:
2508 if (!Call.onlyReadsMemory() && !isAssumeLikeIntrinsic(I: II))
2509 disableLoadElimination();
2510 return Base::visitCallBase(I&: Call);
2511
2512 case Intrinsic::load_relative:
2513 onLoadRelativeIntrinsic();
2514 return false;
2515
2516 case Intrinsic::memset:
2517 case Intrinsic::memcpy:
2518 case Intrinsic::memmove:
2519 disableLoadElimination();
2520 // SROA can usually chew through these intrinsics, but they aren't free.
2521 return false;
2522 case Intrinsic::icall_branch_funnel:
2523 case Intrinsic::localescape:
2524 HasUninlineableIntrinsic = true;
2525 return false;
2526 case Intrinsic::vastart:
2527 InitsVargArgs = true;
2528 return false;
2529 case Intrinsic::launder_invariant_group:
2530 case Intrinsic::strip_invariant_group:
2531 if (auto *SROAArg = getSROAArgForValueOrNull(V: II->getOperand(i_nocapture: 0)))
2532 SROAArgValues[II] = SROAArg;
2533 return true;
2534 case Intrinsic::is_constant:
2535 return simplifyIntrinsicCallIsConstant(CB&: Call);
2536 case Intrinsic::objectsize:
2537 return simplifyIntrinsicCallObjectSize(CB&: Call);
2538 }
2539 }
2540
2541 if (F == Call.getFunction()) {
2542 // This flag will fully abort the analysis, so don't bother with anything
2543 // else.
2544 IsRecursiveCall = true;
2545 if (!AllowRecursiveCall)
2546 return false;
2547 }
2548
2549 if (isLoweredToCall(F, Call)) {
2550 onLoweredCall(F, Call, IsIndirectCall);
2551 }
2552
2553 if (!(Call.onlyReadsMemory() || (IsIndirectCall && F->onlyReadsMemory())))
2554 disableLoadElimination();
2555 return Base::visitCallBase(I&: Call);
2556}
2557
2558bool CallAnalyzer::visitReturnInst(ReturnInst &RI) {
2559 // At least one return instruction will be free after inlining.
2560 bool Free = !HasReturn;
2561 HasReturn = true;
2562 return Free;
2563}
2564
2565bool CallAnalyzer::visitUncondBrInst(UncondBrInst &BI) {
2566 // We model unconditional branches as essentially free -- they really
2567 // shouldn't exist at all, but handling them makes the behavior of the
2568 // inliner more regular and predictable.
2569 return true;
2570}
2571
2572bool CallAnalyzer::visitCondBrInst(CondBrInst &BI) {
2573 // Conditional branches which will fold away are free.
2574 return getDirectOrSimplifiedValue<ConstantInt>(V: BI.getCondition()) ||
2575 BI.getMetadata(KindID: LLVMContext::MD_make_implicit);
2576}
2577
2578bool CallAnalyzer::visitSelectInst(SelectInst &SI) {
2579 bool CheckSROA = SI.getType()->isPointerTy();
2580 Value *TrueVal = SI.getTrueValue();
2581 Value *FalseVal = SI.getFalseValue();
2582
2583 Constant *TrueC = getDirectOrSimplifiedValue<Constant>(V: TrueVal);
2584 Constant *FalseC = getDirectOrSimplifiedValue<Constant>(V: FalseVal);
2585 Constant *CondC = getSimplifiedValue<Constant>(V: SI.getCondition());
2586
2587 if (!CondC) {
2588 // Select C, X, X => X
2589 if (TrueC == FalseC && TrueC) {
2590 SimplifiedValues[&SI] = TrueC;
2591 return true;
2592 }
2593
2594 if (!CheckSROA)
2595 return Base::visitSelectInst(I&: SI);
2596
2597 std::pair<Value *, APInt> TrueBaseAndOffset =
2598 ConstantOffsetPtrs.lookup(Val: TrueVal);
2599 std::pair<Value *, APInt> FalseBaseAndOffset =
2600 ConstantOffsetPtrs.lookup(Val: FalseVal);
2601 if (TrueBaseAndOffset == FalseBaseAndOffset && TrueBaseAndOffset.first) {
2602 ConstantOffsetPtrs[&SI] = std::move(TrueBaseAndOffset);
2603
2604 if (auto *SROAArg = getSROAArgForValueOrNull(V: TrueVal))
2605 SROAArgValues[&SI] = SROAArg;
2606 return true;
2607 }
2608
2609 return Base::visitSelectInst(I&: SI);
2610 }
2611
2612 // Select condition is a constant.
2613 Value *SelectedV = CondC->isAllOnesValue() ? TrueVal
2614 : (CondC->isNullValue()) ? FalseVal
2615 : nullptr;
2616 if (!SelectedV) {
2617 // Condition is a vector constant that is not all 1s or all 0s. If all
2618 // operands are constants, ConstantFoldSelectInstruction() can handle the
2619 // cases such as select vectors.
2620 if (TrueC && FalseC) {
2621 if (auto *C = ConstantFoldSelectInstruction(Cond: CondC, V1: TrueC, V2: FalseC)) {
2622 SimplifiedValues[&SI] = C;
2623 return true;
2624 }
2625 }
2626 return Base::visitSelectInst(I&: SI);
2627 }
2628
2629 // Condition is either all 1s or all 0s. SI can be simplified.
2630 if (Constant *SelectedC = dyn_cast<Constant>(Val: SelectedV)) {
2631 SimplifiedValues[&SI] = SelectedC;
2632 return true;
2633 }
2634
2635 if (!CheckSROA)
2636 return true;
2637
2638 std::pair<Value *, APInt> BaseAndOffset =
2639 ConstantOffsetPtrs.lookup(Val: SelectedV);
2640 if (BaseAndOffset.first) {
2641 ConstantOffsetPtrs[&SI] = std::move(BaseAndOffset);
2642
2643 if (auto *SROAArg = getSROAArgForValueOrNull(V: SelectedV))
2644 SROAArgValues[&SI] = SROAArg;
2645 }
2646
2647 return true;
2648}
2649
2650bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) {
2651 // We model unconditional switches as free, see the comments on handling
2652 // branches.
2653 if (getDirectOrSimplifiedValue<ConstantInt>(V: SI.getCondition()))
2654 return true;
2655
2656 // Assume the most general case where the switch is lowered into
2657 // either a jump table, bit test, or a balanced binary tree consisting of
2658 // case clusters without merging adjacent clusters with the same
2659 // destination. We do not consider the switches that are lowered with a mix
2660 // of jump table/bit test/binary search tree. The cost of the switch is
2661 // proportional to the size of the tree or the size of jump table range.
2662 //
2663 // NB: We convert large switches which are just used to initialize large phi
2664 // nodes to lookup tables instead in simplifycfg, so this shouldn't prevent
2665 // inlining those. It will prevent inlining in cases where the optimization
2666 // does not (yet) fire.
2667
2668 unsigned JumpTableSize = 0;
2669 BlockFrequencyInfo *BFI = GetBFI ? &(GetBFI(F)) : nullptr;
2670 unsigned NumCaseCluster =
2671 TTI.getEstimatedNumberOfCaseClusters(SI, JTSize&: JumpTableSize, PSI, BFI);
2672
2673 onFinalizeSwitch(JumpTableSize, NumCaseCluster, DefaultDestUnreachable: SI.defaultDestUnreachable());
2674 return false;
2675}
2676
2677bool CallAnalyzer::visitIndirectBrInst(IndirectBrInst &IBI) {
2678 // We never want to inline functions that contain an indirectbr. This is
2679 // incorrect because all the blockaddress's (in static global initializers
2680 // for example) would be referring to the original function, and this
2681 // indirect jump would jump from the inlined copy of the function into the
2682 // original function which is extremely undefined behavior.
2683 // FIXME: This logic isn't really right; we can safely inline functions with
2684 // indirectbr's as long as no other function or global references the
2685 // blockaddress of a block within the current function.
2686 HasIndirectBr = true;
2687 return false;
2688}
2689
2690bool CallAnalyzer::visitResumeInst(ResumeInst &RI) {
2691 // FIXME: It's not clear that a single instruction is an accurate model for
2692 // the inline cost of a resume instruction.
2693 return false;
2694}
2695
2696bool CallAnalyzer::visitCleanupReturnInst(CleanupReturnInst &CRI) {
2697 // FIXME: It's not clear that a single instruction is an accurate model for
2698 // the inline cost of a cleanupret instruction.
2699 return false;
2700}
2701
2702bool CallAnalyzer::visitCatchReturnInst(CatchReturnInst &CRI) {
2703 // FIXME: It's not clear that a single instruction is an accurate model for
2704 // the inline cost of a catchret instruction.
2705 return false;
2706}
2707
2708bool CallAnalyzer::visitUnreachableInst(UnreachableInst &I) {
2709 // FIXME: It might be reasonably to discount the cost of instructions leading
2710 // to unreachable as they have the lowest possible impact on both runtime and
2711 // code size.
2712 return true; // No actual code is needed for unreachable.
2713}
2714
2715bool CallAnalyzer::visitInstruction(Instruction &I) {
2716 // Some instructions are free. All of the free intrinsics can also be
2717 // handled by SROA, etc.
2718 if (TTI.getInstructionCost(U: &I, CostKind: TargetTransformInfo::TCK_SizeAndLatency) ==
2719 TargetTransformInfo::TCC_Free)
2720 return true;
2721
2722 // We found something we don't understand or can't handle. Mark any SROA-able
2723 // values in the operand list as no longer viable.
2724 for (const Use &Op : I.operands())
2725 disableSROA(V: Op);
2726
2727 return false;
2728}
2729
2730/// Analyze a basic block for its contribution to the inline cost.
2731///
2732/// This method walks the analyzer over every instruction in the given basic
2733/// block and accounts for their cost during inlining at this callsite. It
2734/// aborts early if the threshold has been exceeded or an impossible to inline
2735/// construct has been detected. It returns false if inlining is no longer
2736/// viable, and true if inlining remains viable.
2737InlineResult
2738CallAnalyzer::analyzeBlock(BasicBlock *BB,
2739 const SmallPtrSetImpl<const Value *> &EphValues) {
2740 for (Instruction &I : *BB) {
2741 // FIXME: Currently, the number of instructions in a function regardless of
2742 // our ability to simplify them during inline to constants or dead code,
2743 // are actually used by the vector bonus heuristic. As long as that's true,
2744 // we have to special case debug intrinsics here to prevent differences in
2745 // inlining due to debug symbols. Eventually, the number of unsimplified
2746 // instructions shouldn't factor into the cost computation, but until then,
2747 // hack around it here.
2748 // Similarly, skip pseudo-probes.
2749 if (I.isDebugOrPseudoInst())
2750 continue;
2751
2752 // Skip ephemeral values.
2753 if (EphValues.count(Ptr: &I))
2754 continue;
2755
2756 ++NumInstructions;
2757 if (isa<ExtractElementInst>(Val: I) || I.getType()->isVectorTy())
2758 ++NumVectorInstructions;
2759
2760 // If the instruction simplified to a constant, there is no cost to this
2761 // instruction. Visit the instructions using our InstVisitor to account for
2762 // all of the per-instruction logic. The visit tree returns true if we
2763 // consumed the instruction in any way, and false if the instruction's base
2764 // cost should count against inlining.
2765 onInstructionAnalysisStart(I: &I);
2766
2767 if (Base::visit(I: &I))
2768 ++NumInstructionsSimplified;
2769 else
2770 onMissedSimplification();
2771
2772 onInstructionAnalysisFinish(I: &I);
2773 using namespace ore;
2774 // If the visit this instruction detected an uninlinable pattern, abort.
2775 InlineResult IR = InlineResult::success();
2776 if (IsRecursiveCall && !AllowRecursiveCall)
2777 IR = InlineResult::failure(Reason: "recursive");
2778 else if (ExposesReturnsTwice)
2779 IR = InlineResult::failure(Reason: "exposes returns twice");
2780 else if (HasDynamicAlloca)
2781 IR = InlineResult::failure(Reason: "dynamic alloca");
2782 else if (HasIndirectBr)
2783 IR = InlineResult::failure(Reason: "indirect branch");
2784 else if (HasUninlineableIntrinsic)
2785 IR = InlineResult::failure(Reason: "uninlinable intrinsic");
2786 else if (InitsVargArgs)
2787 IR = InlineResult::failure(Reason: "varargs");
2788 if (!IR.isSuccess()) {
2789 if (ORE)
2790 ORE->emit(RemarkBuilder: [&]() {
2791 return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
2792 &CandidateCall)
2793 << NV("Callee", &F) << " has uninlinable pattern ("
2794 << NV("InlineResult", IR.getFailureReason())
2795 << ") and cost is not fully computed";
2796 });
2797 return IR;
2798 }
2799
2800 // If the caller is a recursive function then we don't want to inline
2801 // functions which allocate a lot of stack space because it would increase
2802 // the caller stack usage dramatically.
2803 if (IsCallerRecursive && AllocatedSize > RecurStackSizeThreshold) {
2804 auto IR =
2805 InlineResult::failure(Reason: "recursive and allocates too much stack space");
2806 if (ORE)
2807 ORE->emit(RemarkBuilder: [&]() {
2808 return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
2809 &CandidateCall)
2810 << NV("Callee", &F) << " is "
2811 << NV("InlineResult", IR.getFailureReason())
2812 << ". Cost is not fully computed";
2813 });
2814 return IR;
2815 }
2816
2817 if (shouldStop())
2818 return InlineResult::failure(
2819 Reason: "Call site analysis is not favorable to inlining.");
2820 }
2821
2822 return InlineResult::success();
2823}
2824
2825/// Compute the base pointer and cumulative constant offsets for V.
2826///
2827/// This strips all constant offsets off of V, leaving it the base pointer, and
2828/// accumulates the total constant offset applied in the returned constant. It
2829/// returns 0 if V is not a pointer, and returns the constant '0' if there are
2830/// no constant offsets applied.
2831ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) {
2832 if (!V->getType()->isPointerTy())
2833 return nullptr;
2834
2835 unsigned AS = V->getType()->getPointerAddressSpace();
2836 unsigned IntPtrWidth = DL.getIndexSizeInBits(AS);
2837 APInt Offset = APInt::getZero(numBits: IntPtrWidth);
2838
2839 // Even though we don't look through PHI nodes, we could be called on an
2840 // instruction in an unreachable block, which may be on a cycle.
2841 SmallPtrSet<Value *, 4> Visited;
2842 Visited.insert(Ptr: V);
2843 do {
2844 if (GEPOperator *GEP = dyn_cast<GEPOperator>(Val: V)) {
2845 if (!GEP->isInBounds() || !accumulateGEPOffset(GEP&: *GEP, Offset))
2846 return nullptr;
2847 V = GEP->getPointerOperand();
2848 } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(Val: V)) {
2849 if (GA->isInterposable())
2850 break;
2851 V = GA->getAliasee();
2852 } else {
2853 break;
2854 }
2855 assert(V->getType()->isPointerTy() && "Unexpected operand type!");
2856 } while (Visited.insert(Ptr: V).second);
2857
2858 Type *IdxPtrTy = DL.getIndexType(PtrTy: V->getType());
2859 return cast<ConstantInt>(Val: ConstantInt::get(Ty: IdxPtrTy, V: Offset));
2860}
2861
2862/// Find dead blocks due to deleted CFG edges during inlining.
2863///
2864/// If we know the successor of the current block, \p CurrBB, has to be \p
2865/// NextBB, the other successors of \p CurrBB are dead if these successors have
2866/// no live incoming CFG edges. If one block is found to be dead, we can
2867/// continue growing the dead block list by checking the successors of the dead
2868/// blocks to see if all their incoming edges are dead or not.
2869void CallAnalyzer::findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB) {
2870 auto IsEdgeDead = [&](BasicBlock *Pred, BasicBlock *Succ) {
2871 // A CFG edge is dead if the predecessor is dead or the predecessor has a
2872 // known successor which is not the one under exam.
2873 if (DeadBlocks.count(Ptr: Pred))
2874 return true;
2875 BasicBlock *KnownSucc = KnownSuccessors[Pred];
2876 return KnownSucc && KnownSucc != Succ;
2877 };
2878
2879 auto IsNewlyDead = [&](BasicBlock *BB) {
2880 // If all the edges to a block are dead, the block is also dead.
2881 return (!DeadBlocks.count(Ptr: BB) &&
2882 llvm::all_of(Range: predecessors(BB),
2883 P: [&](BasicBlock *P) { return IsEdgeDead(P, BB); }));
2884 };
2885
2886 for (BasicBlock *Succ : successors(BB: CurrBB)) {
2887 if (Succ == NextBB || !IsNewlyDead(Succ))
2888 continue;
2889 SmallVector<BasicBlock *, 4> NewDead;
2890 NewDead.push_back(Elt: Succ);
2891 while (!NewDead.empty()) {
2892 BasicBlock *Dead = NewDead.pop_back_val();
2893 if (DeadBlocks.insert(Ptr: Dead).second)
2894 // Continue growing the dead block lists.
2895 for (BasicBlock *S : successors(BB: Dead))
2896 if (IsNewlyDead(S))
2897 NewDead.push_back(Elt: S);
2898 }
2899 }
2900}
2901
2902/// Analyze a call site for potential inlining.
2903///
2904/// Returns true if inlining this call is viable, and false if it is not
2905/// viable. It computes the cost and adjusts the threshold based on numerous
2906/// factors and heuristics. If this method returns false but the computed cost
2907/// is below the computed threshold, then inlining was forcibly disabled by
2908/// some artifact of the routine.
2909InlineResult CallAnalyzer::analyze() {
2910 ++NumCallsAnalyzed;
2911
2912 auto Result = onAnalysisStart();
2913 if (!Result.isSuccess())
2914 return Result;
2915
2916 if (F.empty())
2917 return InlineResult::success();
2918
2919 Function *Caller = CandidateCall.getFunction();
2920 // Check if the caller function is recursive itself.
2921 for (User *U : Caller->users()) {
2922 CallBase *Call = dyn_cast<CallBase>(Val: U);
2923 if (Call && Call->getFunction() == Caller) {
2924 IsCallerRecursive = true;
2925 break;
2926 }
2927 }
2928
2929 // Populate our simplified values by mapping from function arguments to call
2930 // arguments with known important simplifications.
2931 auto CAI = CandidateCall.arg_begin();
2932 for (Argument &FAI : F.args()) {
2933 assert(CAI != CandidateCall.arg_end());
2934 SimplifiedValues[&FAI] = *CAI;
2935 if (isa<Constant>(Val: *CAI))
2936 ++NumConstantArgs;
2937
2938 Value *PtrArg = *CAI;
2939 if (ConstantInt *C = stripAndComputeInBoundsConstantOffsets(V&: PtrArg)) {
2940 ConstantOffsetPtrs[&FAI] = std::make_pair(x&: PtrArg, y: C->getValue());
2941
2942 // We can SROA any pointer arguments derived from alloca instructions.
2943 if (auto *SROAArg = dyn_cast<AllocaInst>(Val: PtrArg)) {
2944 SROAArgValues[&FAI] = SROAArg;
2945 onInitializeSROAArg(Arg: SROAArg);
2946 EnabledSROAAllocas.insert(V: SROAArg);
2947 }
2948 }
2949 ++CAI;
2950 }
2951 NumConstantOffsetPtrArgs = ConstantOffsetPtrs.size();
2952 NumAllocaArgs = SROAArgValues.size();
2953
2954 // Collecting the ephemeral values of `F` can be expensive, so use the
2955 // ephemeral values cache if available.
2956 SmallPtrSet<const Value *, 32> EphValuesStorage;
2957 const SmallPtrSetImpl<const Value *> *EphValues = &EphValuesStorage;
2958 if (GetEphValuesCache)
2959 EphValues = &GetEphValuesCache(F).ephValues();
2960 else
2961 CodeMetrics::collectEphemeralValues(L: &F, AC: &GetAssumptionCache(F),
2962 EphValues&: EphValuesStorage);
2963
2964 // The worklist of live basic blocks in the callee *after* inlining. We avoid
2965 // adding basic blocks of the callee which can be proven to be dead for this
2966 // particular call site in order to get more accurate cost estimates. This
2967 // requires a somewhat heavyweight iteration pattern: we need to walk the
2968 // basic blocks in a breadth-first order as we insert live successors. To
2969 // accomplish this, prioritizing for small iterations because we exit after
2970 // crossing our threshold, we use a small-size optimized SetVector.
2971 typedef SmallSetVector<BasicBlock *, 16> BBSetVector;
2972 BBSetVector BBWorklist;
2973 BBWorklist.insert(X: &F.getEntryBlock());
2974
2975 // Note that we *must not* cache the size, this loop grows the worklist.
2976 for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
2977 if (shouldStop())
2978 break;
2979
2980 BasicBlock *BB = BBWorklist[Idx];
2981 if (BB->empty())
2982 continue;
2983
2984 onBlockStart(BB);
2985
2986 // Disallow inlining a blockaddress.
2987 // A blockaddress only has defined behavior for an indirect branch in the
2988 // same function, and we do not currently support inlining indirect
2989 // branches. But, the inliner may not see an indirect branch that ends up
2990 // being dead code at a particular call site. If the blockaddress escapes
2991 // the function, e.g., via a global variable, inlining may lead to an
2992 // invalid cross-function reference.
2993 // FIXME: pr/39560: continue relaxing this overt restriction.
2994 if (BB->hasAddressTaken())
2995 return InlineResult::failure(Reason: "blockaddress used");
2996
2997 // Analyze the cost of this block. If we blow through the threshold, this
2998 // returns false, and we can bail on out.
2999 InlineResult IR = analyzeBlock(BB, EphValues: *EphValues);
3000 if (!IR.isSuccess())
3001 return IR;
3002
3003 Instruction *TI = BB->getTerminator();
3004
3005 // Add in the live successors by first checking whether we have terminator
3006 // that may be simplified based on the values simplified by this call.
3007 if (CondBrInst *BI = dyn_cast<CondBrInst>(Val: TI)) {
3008 Value *Cond = BI->getCondition();
3009 if (ConstantInt *SimpleCond = getSimplifiedValue<ConstantInt>(V: Cond)) {
3010 BasicBlock *NextBB = BI->getSuccessor(i: SimpleCond->isZero() ? 1 : 0);
3011 BBWorklist.insert(X: NextBB);
3012 KnownSuccessors[BB] = NextBB;
3013 findDeadBlocks(CurrBB: BB, NextBB);
3014 continue;
3015 }
3016 } else if (SwitchInst *SI = dyn_cast<SwitchInst>(Val: TI)) {
3017 Value *Cond = SI->getCondition();
3018 if (ConstantInt *SimpleCond = getSimplifiedValue<ConstantInt>(V: Cond)) {
3019 BasicBlock *NextBB = SI->findCaseValue(C: SimpleCond)->getCaseSuccessor();
3020 BBWorklist.insert(X: NextBB);
3021 KnownSuccessors[BB] = NextBB;
3022 findDeadBlocks(CurrBB: BB, NextBB);
3023 continue;
3024 }
3025 }
3026
3027 // If we're unable to select a particular successor, just count all of
3028 // them.
3029 BBWorklist.insert_range(R: successors(BB));
3030
3031 onBlockAnalyzed(BB);
3032 }
3033
3034 // If this is a noduplicate call, we can still inline as long as
3035 // inlining this would cause the removal of the caller (so the instruction
3036 // is not actually duplicated, just moved).
3037 if (!isSoleCallToLocalFunction(CB: CandidateCall, Callee: F) && ContainsNoDuplicateCall)
3038 return InlineResult::failure(Reason: "noduplicate");
3039
3040 // If the callee's stack size exceeds the user-specified threshold,
3041 // do not let it be inlined.
3042 // The command line option overrides a limit set in the function attributes.
3043 size_t FinalStackSizeThreshold = StackSizeThreshold;
3044 if (!StackSizeThreshold.getNumOccurrences())
3045 if (std::optional<int> AttrMaxStackSize = getStringFnAttrAsInt(
3046 F: Caller, AttrKind: InlineConstants::MaxInlineStackSizeAttributeName))
3047 FinalStackSizeThreshold = *AttrMaxStackSize;
3048 if (AllocatedSize > FinalStackSizeThreshold)
3049 return InlineResult::failure(Reason: "stacksize");
3050
3051 return finalizeAnalysis();
3052}
3053
3054void InlineCostCallAnalyzer::print(raw_ostream &OS) {
3055#define DEBUG_PRINT_STAT(x) OS << " " #x ": " << x << "\n"
3056 if (PrintInstructionComments)
3057 F.print(OS, AAW: &Writer);
3058 DEBUG_PRINT_STAT(NumConstantArgs);
3059 DEBUG_PRINT_STAT(NumConstantOffsetPtrArgs);
3060 DEBUG_PRINT_STAT(NumAllocaArgs);
3061 DEBUG_PRINT_STAT(NumConstantPtrCmps);
3062 DEBUG_PRINT_STAT(NumConstantPtrDiffs);
3063 DEBUG_PRINT_STAT(NumInstructionsSimplified);
3064 DEBUG_PRINT_STAT(NumInstructions);
3065 DEBUG_PRINT_STAT(NumInlineAsmInstructions);
3066 DEBUG_PRINT_STAT(SROACostSavings);
3067 DEBUG_PRINT_STAT(SROACostSavingsLost);
3068 DEBUG_PRINT_STAT(LoadEliminationCost);
3069 DEBUG_PRINT_STAT(ContainsNoDuplicateCall);
3070 DEBUG_PRINT_STAT(Cost);
3071 DEBUG_PRINT_STAT(Threshold);
3072#undef DEBUG_PRINT_STAT
3073}
3074
3075#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3076/// Dump stats about this call's analysis.
3077LLVM_DUMP_METHOD void InlineCostCallAnalyzer::dump() { print(dbgs()); }
3078#endif
3079
3080/// Test that there are no attribute conflicts between Caller and Callee
3081/// that prevent inlining.
3082static bool functionsHaveCompatibleAttributes(
3083 Function *Caller, Function *Callee, TargetTransformInfo &TTI,
3084 function_ref<const TargetLibraryInfo &(Function &)> &GetTLI) {
3085 // Note that CalleeTLI must be a copy not a reference. The legacy pass manager
3086 // caches the most recently created TLI in the TargetLibraryInfoWrapperPass
3087 // object, and always returns the same object (which is overwritten on each
3088 // GetTLI call). Therefore we copy the first result.
3089 auto CalleeTLI = GetTLI(*Callee);
3090 return (IgnoreTTIInlineCompatible ||
3091 TTI.areInlineCompatible(Caller, Callee)) &&
3092 GetTLI(*Caller).areInlineCompatible(CalleeTLI,
3093 AllowCallerSuperset: InlineCallerSupersetNoBuiltin) &&
3094 AttributeFuncs::areInlineCompatible(Caller: *Caller, Callee: *Callee);
3095}
3096
3097int llvm::getCallsiteCost(const TargetTransformInfo &TTI, const CallBase &Call,
3098 const DataLayout &DL) {
3099 int64_t Cost = 0;
3100 for (unsigned I = 0, E = Call.arg_size(); I != E; ++I) {
3101 if (Call.isByValArgument(ArgNo: I)) {
3102 // We approximate the number of loads and stores needed by dividing the
3103 // size of the byval type by the target's pointer size.
3104 PointerType *PTy = cast<PointerType>(Val: Call.getArgOperand(i: I)->getType());
3105 unsigned TypeSize = DL.getTypeSizeInBits(Ty: Call.getParamByValType(ArgNo: I));
3106 unsigned AS = PTy->getAddressSpace();
3107 unsigned PointerSize = DL.getPointerSizeInBits(AS);
3108 // Ceiling division.
3109 unsigned NumStores = (TypeSize + PointerSize - 1) / PointerSize;
3110
3111 // If it generates more than 8 stores it is likely to be expanded as an
3112 // inline memcpy so we take that as an upper bound. Otherwise we assume
3113 // one load and one store per word copied.
3114 // FIXME: The maxStoresPerMemcpy setting from the target should be used
3115 // here instead of a magic number of 8, but it's not available via
3116 // DataLayout.
3117 NumStores = std::min(a: NumStores, b: 8U);
3118
3119 Cost += 2 * NumStores * InstrCost;
3120 } else {
3121 // For non-byval arguments subtract off one instruction per call
3122 // argument.
3123 Cost += InstrCost;
3124 }
3125 }
3126 // The call instruction also disappears after inlining.
3127 Cost += InstrCost;
3128 Cost += TTI.getInlineCallPenalty(F: Call.getCaller(), Call, DefaultCallPenalty: CallPenalty);
3129
3130 return std::min<int64_t>(a: Cost, INT_MAX);
3131}
3132
3133InlineCost llvm::getInlineCost(
3134 CallBase &Call, const InlineParams &Params, TargetTransformInfo &CalleeTTI,
3135 function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
3136 function_ref<const TargetLibraryInfo &(Function &)> GetTLI,
3137 function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
3138 ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE,
3139 function_ref<EphemeralValuesCache &(Function &)> GetEphValuesCache) {
3140 return getInlineCost(Call, Callee: Call.getCalledFunction(), Params, CalleeTTI,
3141 GetAssumptionCache, GetTLI, GetBFI, PSI, ORE,
3142 GetEphValuesCache);
3143}
3144
3145std::optional<int> llvm::getInliningCostEstimate(
3146 CallBase &Call, TargetTransformInfo &CalleeTTI,
3147 function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
3148 function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
3149 function_ref<const TargetLibraryInfo &(Function &)> GetTLI,
3150 ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
3151 const InlineParams Params = {/* DefaultThreshold*/ 0,
3152 /*HintThreshold*/ {},
3153 /*ColdThreshold*/ {},
3154 /*OptSizeThreshold*/ {},
3155 /*OptMinSizeThreshold*/ {},
3156 /*HotCallSiteThreshold*/ {},
3157 /*LocallyHotCallSiteThreshold*/ {},
3158 /*ColdCallSiteThreshold*/ {},
3159 /*ComputeFullInlineCost*/ true,
3160 /*EnableDeferral*/ true};
3161
3162 InlineCostCallAnalyzer CA(*Call.getCalledFunction(), Call, Params, CalleeTTI,
3163 GetAssumptionCache, GetBFI, GetTLI, PSI, ORE, true,
3164 /*IgnoreThreshold*/ true);
3165 auto R = CA.analyze();
3166 if (!R.isSuccess())
3167 return std::nullopt;
3168 return CA.getCost();
3169}
3170
3171std::optional<InlineCostFeatures> llvm::getInliningCostFeatures(
3172 CallBase &Call, TargetTransformInfo &CalleeTTI,
3173 function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
3174 function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
3175 function_ref<const TargetLibraryInfo &(Function &)> GetTLI,
3176 ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
3177 InlineCostFeaturesAnalyzer CFA(CalleeTTI, GetAssumptionCache, GetBFI, GetTLI,
3178 PSI, ORE, *Call.getCalledFunction(), Call);
3179 auto R = CFA.analyze();
3180 if (!R.isSuccess())
3181 return std::nullopt;
3182 return CFA.features();
3183}
3184
3185std::optional<InlineResult> llvm::getAttributeBasedInliningDecision(
3186 CallBase &Call, Function *Callee, TargetTransformInfo &CalleeTTI,
3187 function_ref<const TargetLibraryInfo &(Function &)> GetTLI) {
3188
3189 // Cannot inline indirect calls.
3190 if (!Callee)
3191 return InlineResult::failure(Reason: "indirect call");
3192
3193 // When callee coroutine function is inlined into caller coroutine function
3194 // before coro-split pass,
3195 // coro-early pass can not handle this quiet well.
3196 // So we won't inline the coroutine function if it have not been unsplited
3197 if (Callee->isPresplitCoroutine())
3198 return InlineResult::failure(Reason: "unsplited coroutine call");
3199
3200 // Never inline calls with byval arguments that does not have the alloca
3201 // address space. Since byval arguments can be replaced with a copy to an
3202 // alloca, the inlined code would need to be adjusted to handle that the
3203 // argument is in the alloca address space (so it is a little bit complicated
3204 // to solve).
3205 unsigned AllocaAS = Callee->getDataLayout().getAllocaAddrSpace();
3206 for (unsigned I = 0, E = Call.arg_size(); I != E; ++I)
3207 if (Call.isByValArgument(ArgNo: I)) {
3208 PointerType *PTy = cast<PointerType>(Val: Call.getArgOperand(i: I)->getType());
3209 if (PTy->getAddressSpace() != AllocaAS)
3210 return InlineResult::failure(Reason: "byval arguments without alloca"
3211 " address space");
3212 }
3213
3214 // Calls to functions with always-inline attributes should be inlined
3215 // whenever possible.
3216 if (Call.hasFnAttr(Kind: Attribute::AlwaysInline)) {
3217 if (Call.getAttributes().hasFnAttr(Kind: Attribute::NoInline))
3218 return InlineResult::failure(Reason: "noinline call site attribute");
3219
3220 auto IsViable = isInlineViable(Callee&: *Callee);
3221 if (IsViable.isSuccess())
3222 return InlineResult::success();
3223 return InlineResult::failure(Reason: IsViable.getFailureReason());
3224 }
3225
3226 // Never inline functions with conflicting attributes (unless callee has
3227 // always-inline attribute).
3228 Function *Caller = Call.getCaller();
3229 if (!functionsHaveCompatibleAttributes(Caller, Callee, TTI&: CalleeTTI, GetTLI))
3230 return InlineResult::failure(Reason: "conflicting attributes");
3231
3232 // Don't inline this call if the caller has the optnone attribute.
3233 if (Caller->hasOptNone())
3234 return InlineResult::failure(Reason: "optnone attribute");
3235
3236 // Don't inline a function that treats null pointer as valid into a caller
3237 // that does not have this attribute.
3238 if (!Caller->nullPointerIsDefined() && Callee->nullPointerIsDefined())
3239 return InlineResult::failure(Reason: "nullptr definitions incompatible");
3240
3241 // Don't inline functions which can be interposed at link-time.
3242 if (Callee->isInterposable())
3243 return InlineResult::failure(Reason: "interposable");
3244
3245 // Don't inline functions marked noinline.
3246 if (Callee->hasFnAttribute(Kind: Attribute::NoInline))
3247 return InlineResult::failure(Reason: "noinline function attribute");
3248
3249 // Don't inline call sites marked noinline.
3250 if (Call.isNoInline())
3251 return InlineResult::failure(Reason: "noinline call site attribute");
3252
3253 // Don't inline functions that are loader replaceable.
3254 if (Callee->hasFnAttribute(Kind: "loader-replaceable"))
3255 return InlineResult::failure(Reason: "loader replaceable function attribute");
3256
3257 return std::nullopt;
3258}
3259
3260InlineCost llvm::getInlineCost(
3261 CallBase &Call, Function *Callee, const InlineParams &Params,
3262 TargetTransformInfo &CalleeTTI,
3263 function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
3264 function_ref<const TargetLibraryInfo &(Function &)> GetTLI,
3265 function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
3266 ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE,
3267 function_ref<EphemeralValuesCache &(Function &)> GetEphValuesCache) {
3268
3269 auto UserDecision =
3270 llvm::getAttributeBasedInliningDecision(Call, Callee, CalleeTTI, GetTLI);
3271
3272 if (UserDecision) {
3273 if (UserDecision->isSuccess())
3274 return llvm::InlineCost::getAlways(Reason: "always inline attribute");
3275 return llvm::InlineCost::getNever(Reason: UserDecision->getFailureReason());
3276 }
3277
3278 if (InlineAllViableCalls && isInlineViable(Callee&: *Callee).isSuccess())
3279 return llvm::InlineCost::getAlways(
3280 Reason: "Inlining forced by -inline-all-viable-calls");
3281
3282 LLVM_DEBUG(llvm::dbgs() << " Analyzing call of " << Callee->getName()
3283 << "... (caller:" << Call.getCaller()->getName()
3284 << ")\n");
3285
3286 InlineCostCallAnalyzer CA(*Callee, Call, Params, CalleeTTI,
3287 GetAssumptionCache, GetBFI, GetTLI, PSI, ORE,
3288 /*BoostIndirect=*/true, /*IgnoreThreshold=*/false,
3289 GetEphValuesCache);
3290 InlineResult ShouldInline = CA.analyze();
3291
3292 LLVM_DEBUG(CA.dump());
3293
3294 // Always make cost benefit based decision explicit.
3295 // We use always/never here since threshold is not meaningful,
3296 // as it's not what drives cost-benefit analysis.
3297 if (CA.wasDecidedByCostBenefit()) {
3298 if (ShouldInline.isSuccess())
3299 return InlineCost::getAlways(Reason: "benefit over cost",
3300 CostBenefit: CA.getCostBenefitPair());
3301 else
3302 return InlineCost::getNever(Reason: "cost over benefit", CostBenefit: CA.getCostBenefitPair());
3303 }
3304
3305 if (CA.wasDecidedByCostThreshold())
3306 return InlineCost::get(Cost: CA.getCost(), Threshold: CA.getThreshold(),
3307 StaticBonus: CA.getStaticBonusApplied());
3308
3309 // No details on how the decision was made, simply return always or never.
3310 return ShouldInline.isSuccess()
3311 ? InlineCost::getAlways(Reason: "empty function")
3312 : InlineCost::getNever(Reason: ShouldInline.getFailureReason());
3313}
3314
3315InlineResult llvm::isInlineViable(Function &F) {
3316 bool ReturnsTwice = F.hasFnAttribute(Kind: Attribute::ReturnsTwice);
3317 for (BasicBlock &BB : F) {
3318 // Disallow inlining of functions which contain indirect branches.
3319 if (isa<IndirectBrInst>(Val: BB.getTerminator()))
3320 return InlineResult::failure(Reason: "contains indirect branches");
3321
3322 // Disallow inlining of blockaddresses.
3323 if (BB.hasAddressTaken())
3324 return InlineResult::failure(Reason: "blockaddress used");
3325
3326 for (auto &II : BB) {
3327 CallBase *Call = dyn_cast<CallBase>(Val: &II);
3328 if (!Call)
3329 continue;
3330
3331 // Disallow recursive calls.
3332 Function *Callee = Call->getCalledFunction();
3333 if (&F == Callee)
3334 return InlineResult::failure(Reason: "recursive call");
3335
3336 // Disallow calls which expose returns-twice to a function not previously
3337 // attributed as such.
3338 if (!ReturnsTwice && isa<CallInst>(Val: Call) &&
3339 cast<CallInst>(Val: Call)->canReturnTwice())
3340 return InlineResult::failure(Reason: "exposes returns-twice attribute");
3341
3342 if (Callee)
3343 switch (Callee->getIntrinsicID()) {
3344 default:
3345 break;
3346 case llvm::Intrinsic::icall_branch_funnel:
3347 // Disallow inlining of @llvm.icall.branch.funnel because current
3348 // backend can't separate call targets from call arguments.
3349 return InlineResult::failure(
3350 Reason: "disallowed inlining of @llvm.icall.branch.funnel");
3351 case llvm::Intrinsic::localescape:
3352 // Disallow inlining functions that call @llvm.localescape. Doing this
3353 // correctly would require major changes to the inliner.
3354 return InlineResult::failure(
3355 Reason: "disallowed inlining of @llvm.localescape");
3356 case llvm::Intrinsic::vastart:
3357 // Disallow inlining of functions that initialize VarArgs with
3358 // va_start.
3359 return InlineResult::failure(
3360 Reason: "contains VarArgs initialized with va_start");
3361 }
3362 }
3363 }
3364
3365 return InlineResult::success();
3366}
3367
3368// APIs to create InlineParams based on command line flags and/or other
3369// parameters.
3370
3371InlineParams llvm::getInlineParams(int Threshold) {
3372 InlineParams Params;
3373
3374 // This field is the threshold to use for a callee by default. This is
3375 // derived from one or more of:
3376 // * optimization or size-optimization levels,
3377 // * a value passed to createFunctionInliningPass function, or
3378 // * the -inline-threshold flag.
3379 // If the -inline-threshold flag is explicitly specified, that is used
3380 // irrespective of anything else.
3381 if (InlineThreshold.getNumOccurrences() > 0)
3382 Params.DefaultThreshold = InlineThreshold;
3383 else
3384 Params.DefaultThreshold = Threshold;
3385
3386 // Set the HintThreshold knob from the -inlinehint-threshold.
3387 Params.HintThreshold = HintThreshold;
3388
3389 // Set the HotCallSiteThreshold knob from the -hot-callsite-threshold.
3390 Params.HotCallSiteThreshold = HotCallSiteThreshold;
3391
3392 // If the -locally-hot-callsite-threshold is explicitly specified, use it to
3393 // populate LocallyHotCallSiteThreshold. Later, we populate
3394 // Params.LocallyHotCallSiteThreshold from -locally-hot-callsite-threshold if
3395 // we know that optimization level is O3 (in the getInlineParams variant that
3396 // takes the opt and size levels).
3397 // FIXME: Remove this check (and make the assignment unconditional) after
3398 // addressing size regression issues at O2.
3399 if (LocallyHotCallSiteThreshold.getNumOccurrences() > 0)
3400 Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold;
3401
3402 // Set the ColdCallSiteThreshold knob from the
3403 // -inline-cold-callsite-threshold.
3404 Params.ColdCallSiteThreshold = ColdCallSiteThreshold;
3405
3406 // Set the OptMinSizeThreshold and OptSizeThreshold params only if the
3407 // -inlinehint-threshold commandline option is not explicitly given. If that
3408 // option is present, then its value applies even for callees with size and
3409 // minsize attributes.
3410 // If the -inline-threshold is not specified, set the ColdThreshold from the
3411 // -inlinecold-threshold even if it is not explicitly passed. If
3412 // -inline-threshold is specified, then -inlinecold-threshold needs to be
3413 // explicitly specified to set the ColdThreshold knob
3414 if (InlineThreshold.getNumOccurrences() == 0) {
3415 Params.OptMinSizeThreshold = InlineConstants::OptMinSizeThreshold;
3416 Params.OptSizeThreshold = InlineConstants::OptSizeThreshold;
3417 Params.ColdThreshold = ColdThreshold;
3418 } else if (ColdThreshold.getNumOccurrences() > 0) {
3419 Params.ColdThreshold = ColdThreshold;
3420 }
3421 return Params;
3422}
3423
3424InlineParams llvm::getInlineParams() {
3425 return getInlineParams(Threshold: DefaultThreshold);
3426}
3427
3428// Compute the default threshold for inlining based on the opt level and the
3429// size opt level.
3430static int computeThresholdFromOptLevels(unsigned OptLevel,
3431 unsigned SizeOptLevel) {
3432 if (OptLevel > 2)
3433 return InlineConstants::OptAggressiveThreshold;
3434 if (SizeOptLevel == 1) // -Os
3435 return InlineConstants::OptSizeThreshold;
3436 if (SizeOptLevel == 2) // -Oz
3437 return InlineConstants::OptMinSizeThreshold;
3438 return DefaultThreshold;
3439}
3440
3441InlineParams llvm::getInlineParams(unsigned OptLevel, unsigned SizeOptLevel) {
3442 auto Params =
3443 getInlineParams(Threshold: computeThresholdFromOptLevels(OptLevel, SizeOptLevel));
3444 // At O3, use the value of -locally-hot-callsite-threshold option to populate
3445 // Params.LocallyHotCallSiteThreshold. Below O3, this flag has effect only
3446 // when it is specified explicitly.
3447 if (OptLevel > 2)
3448 Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold;
3449 return Params;
3450}
3451
3452PreservedAnalyses
3453InlineCostAnnotationPrinterPass::run(Function &F,
3454 FunctionAnalysisManager &FAM) {
3455 PrintInstructionComments = true;
3456 std::function<AssumptionCache &(Function &)> GetAssumptionCache =
3457 [&](Function &F) -> AssumptionCache & {
3458 return FAM.getResult<AssumptionAnalysis>(IR&: F);
3459 };
3460
3461 auto &MAMProxy = FAM.getResult<ModuleAnalysisManagerFunctionProxy>(IR&: F);
3462 ProfileSummaryInfo *PSI =
3463 MAMProxy.getCachedResult<ProfileSummaryAnalysis>(IR&: *F.getParent());
3464 const TargetTransformInfo &TTI = FAM.getResult<TargetIRAnalysis>(IR&: F);
3465
3466 // FIXME: Redesign the usage of InlineParams to expand the scope of this pass.
3467 // In the current implementation, the type of InlineParams doesn't matter as
3468 // the pass serves only for verification of inliner's decisions.
3469 // We can add a flag which determines InlineParams for this run. Right now,
3470 // the default InlineParams are used.
3471 const InlineParams Params = llvm::getInlineParams();
3472 for (BasicBlock &BB : F) {
3473 for (Instruction &I : BB) {
3474 if (auto *CB = dyn_cast<CallBase>(Val: &I)) {
3475 Function *CalledFunction = CB->getCalledFunction();
3476 if (!CalledFunction || CalledFunction->isDeclaration())
3477 continue;
3478 OptimizationRemarkEmitter ORE(CalledFunction);
3479 InlineCostCallAnalyzer ICCA(*CalledFunction, *CB, Params, TTI,
3480 GetAssumptionCache, nullptr, nullptr, PSI,
3481 &ORE);
3482 ICCA.analyze();
3483 OS << " Analyzing call of " << CalledFunction->getName()
3484 << "... (caller:" << CB->getCaller()->getName() << ")\n";
3485 ICCA.print(OS);
3486 OS << "\n";
3487 }
3488 }
3489 }
3490 return PreservedAnalyses::all();
3491}
3492