//===- InlineCost.cpp - Cost analysis for inliner -------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements inline cost analysis.
//
//===----------------------------------------------------------------------===//

#include "llvm/Analysis/InlineCost.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/DomConditionCache.h"
#include "llvm/Analysis/EphemeralValuesCache.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/IR/AssemblyAnnotationWriter.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/FormattedStream.h"
#include "llvm/Support/raw_ostream.h"
#include <climits>
#include <limits>
#include <optional>

using namespace llvm;

#define DEBUG_TYPE "inline-cost"

STATISTIC(NumCallsAnalyzed, "Number of call sites analyzed");

static cl::opt<int>
    DefaultThreshold("inlinedefault-threshold", cl::Hidden, cl::init(225),
                     cl::desc("Default amount of inlining to perform"));

// We introduce this option since there is a minor compile-time win by avoiding
// addition of TTI attributes (target-features in particular) to inline
// candidates when they are guaranteed to be the same as top level methods in
// some use cases. If we avoid adding the attribute, we need an option to avoid
// checking these attributes.
static cl::opt<bool> IgnoreTTIInlineCompatible(
    "ignore-tti-inline-compatible", cl::Hidden, cl::init(false),
    cl::desc("Ignore TTI attributes compatibility check between callee/caller "
             "during inline cost calculation"));

static cl::opt<bool> PrintInstructionComments(
    "print-instruction-comments", cl::Hidden, cl::init(false),
    cl::desc("Prints comments for instruction based on inline cost analysis"));

static cl::opt<int> InlineThreshold(
    "inline-threshold", cl::Hidden, cl::init(225),
    cl::desc("Control the amount of inlining to perform (default = 225)"));

static cl::opt<int> HintThreshold(
    "inlinehint-threshold", cl::Hidden, cl::init(325),
    cl::desc("Threshold for inlining functions with inline hint"));

static cl::opt<int>
    ColdCallSiteThreshold("inline-cold-callsite-threshold", cl::Hidden,
                          cl::init(45),
                          cl::desc("Threshold for inlining cold callsites"));

static cl::opt<bool> InlineEnableCostBenefitAnalysis(
    "inline-enable-cost-benefit-analysis", cl::Hidden, cl::init(false),
    cl::desc("Enable the cost-benefit analysis for the inliner"));

// InlineSavingsMultiplier overrides the per-TTI multiplier iff it is specified
// explicitly on the command line. This option is exposed for tuning and
// testing.
static cl::opt<int> InlineSavingsMultiplier(
    "inline-savings-multiplier", cl::Hidden, cl::init(8),
    cl::desc("Multiplier to multiply cycle savings by during inlining"));

// InlineSavingsProfitableMultiplier overrides the per-TTI multiplier iff it is
// specified explicitly on the command line. This option is exposed for tuning
// and testing.
static cl::opt<int> InlineSavingsProfitableMultiplier(
    "inline-savings-profitable-multiplier", cl::Hidden, cl::init(4),
    cl::desc("A multiplier on top of cycle savings to decide whether the "
             "savings won't justify the cost"));

static cl::opt<int>
    InlineSizeAllowance("inline-size-allowance", cl::Hidden, cl::init(100),
                        cl::desc("The maximum size of a callee that gets "
                                 "inlined without sufficient cycle savings"));

// We introduce this threshold to help performance of instrumentation based
// PGO before we actually hook up inliner with analysis passes such as BPI and
// BFI.
static cl::opt<int> ColdThreshold(
    "inlinecold-threshold", cl::Hidden, cl::init(45),
    cl::desc("Threshold for inlining functions with cold attribute"));

static cl::opt<int>
    HotCallSiteThreshold("hot-callsite-threshold", cl::Hidden, cl::init(3000),
                         cl::desc("Threshold for hot callsites"));

static cl::opt<int> LocallyHotCallSiteThreshold(
    "locally-hot-callsite-threshold", cl::Hidden, cl::init(525),
    cl::desc("Threshold for locally hot callsites"));

static cl::opt<int> ColdCallSiteRelFreq(
    "cold-callsite-rel-freq", cl::Hidden, cl::init(2),
    cl::desc("Maximum block frequency, expressed as a percentage of caller's "
             "entry frequency, for a callsite to be cold in the absence of "
             "profile information."));

static cl::opt<uint64_t> HotCallSiteRelFreq(
    "hot-callsite-rel-freq", cl::Hidden, cl::init(60),
    cl::desc("Minimum block frequency, expressed as a multiple of caller's "
             "entry frequency, for a callsite to be hot in the absence of "
             "profile information."));

static cl::opt<int>
    InstrCost("inline-instr-cost", cl::Hidden, cl::init(5),
              cl::desc("Cost of a single instruction when inlining"));

static cl::opt<int>
    MemAccessCost("inline-memaccess-cost", cl::Hidden, cl::init(0),
                  cl::desc("Cost of load/store instruction when inlining"));

static cl::opt<int> CallPenalty(
    "inline-call-penalty", cl::Hidden, cl::init(25),
    cl::desc("Call penalty that is applied per callsite when inlining"));

static cl::opt<size_t>
    StackSizeThreshold("inline-max-stacksize", cl::Hidden,
                       cl::init(std::numeric_limits<size_t>::max()),
                       cl::desc("Do not inline functions with a stack size "
                                "that exceeds the specified limit"));

static cl::opt<size_t> RecurStackSizeThreshold(
    "recursive-inline-max-stacksize", cl::Hidden,
    cl::init(InlineConstants::TotalAllocaSizeRecursiveCaller),
    cl::desc("Do not inline recursive functions with a stack "
             "size that exceeds the specified limit"));

static cl::opt<bool> OptComputeFullInlineCost(
    "inline-cost-full", cl::Hidden,
    cl::desc("Compute the full inline cost of a call site even when the cost "
             "exceeds the threshold."));

static cl::opt<bool> InlineCallerSupersetNoBuiltin(
    "inline-caller-superset-nobuiltin", cl::Hidden, cl::init(true),
    cl::desc("Allow inlining when caller has a superset of callee's nobuiltin "
             "attributes."));

static cl::opt<bool> DisableGEPConstOperand(
    "disable-gep-const-evaluation", cl::Hidden, cl::init(false),
    cl::desc("Disables evaluation of GetElementPtr with constant operands"));

namespace llvm {
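// Helpers that read an integer out of a string function attribute. For
// example, an attribute such as "function-inline-cost"="42" yields 42; a
// missing or non-integer value yields std::nullopt.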
std::optional<int> getStringFnAttrAsInt(const Attribute &Attr) {
  if (Attr.isValid()) {
    int AttrValue = 0;
    if (!Attr.getValueAsString().getAsInteger(10, AttrValue))
      return AttrValue;
  }
  return std::nullopt;
}

std::optional<int> getStringFnAttrAsInt(CallBase &CB, StringRef AttrKind) {
  return getStringFnAttrAsInt(CB.getFnAttr(AttrKind));
}

std::optional<int> getStringFnAttrAsInt(Function *F, StringRef AttrKind) {
  return getStringFnAttrAsInt(F->getFnAttribute(AttrKind));
}

namespace InlineConstants {
int getInstrCost() { return InstrCost; }

} // namespace InlineConstants

} // namespace llvm

namespace {
class InlineCostCallAnalyzer;

// This struct is used to store information about the inline cost of a
// particular instruction.
struct InstructionCostDetail {
  int CostBefore = 0;
  int CostAfter = 0;
  int ThresholdBefore = 0;
  int ThresholdAfter = 0;

  int getThresholdDelta() const { return ThresholdAfter - ThresholdBefore; }

  int getCostDelta() const { return CostAfter - CostBefore; }

  bool hasThresholdChanged() const { return ThresholdAfter != ThresholdBefore; }
};

class InlineCostAnnotationWriter : public AssemblyAnnotationWriter {
private:
  InlineCostCallAnalyzer *const ICCA;

public:
  InlineCostAnnotationWriter(InlineCostCallAnalyzer *ICCA) : ICCA(ICCA) {}
  void emitInstructionAnnot(const Instruction *I,
                            formatted_raw_ostream &OS) override;
};

/// Carry out call site analysis, in order to evaluate inlinability.
/// NOTE: the type is currently used as an implementation detail of functions
/// such as llvm::getInlineCost. Note the function_ref constructor parameters -
/// the expectation is that they come from the outer scope, from the wrapper
/// functions. If we want to support constructing CallAnalyzer objects where
/// lambdas are provided inline at construction, or where the object needs to
/// otherwise survive past the scope of the provided functions, we need to
/// revisit the argument types.
class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
  typedef InstVisitor<CallAnalyzer, bool> Base;
  friend class InstVisitor<CallAnalyzer, bool>;

protected:
  virtual ~CallAnalyzer() = default;
  /// The TargetTransformInfo available for this compilation.
  const TargetTransformInfo &TTI;

  /// Getter for the cache of @llvm.assume intrinsics.
  function_ref<AssumptionCache &(Function &)> GetAssumptionCache;

  /// Getter for BlockFrequencyInfo.
  function_ref<BlockFrequencyInfo &(Function &)> GetBFI;

  /// Getter for TargetLibraryInfo.
  function_ref<const TargetLibraryInfo &(Function &)> GetTLI;

  /// Profile summary information.
  ProfileSummaryInfo *PSI;

  /// The called function.
  Function &F;

  // Cache the DataLayout since we use it a lot.
  const DataLayout &DL;

  /// The OptimizationRemarkEmitter available for this compilation.
  OptimizationRemarkEmitter *ORE;

  /// The candidate callsite being analyzed. Please do not use this to do
  /// analysis in the caller function; we want the inline cost query to be
  /// easily cacheable. Instead, use the cover function paramHasAttr.
  CallBase &CandidateCall;

  /// Getter for the cache of ephemeral values.
  function_ref<EphemeralValuesCache &(Function &)> GetEphValuesCache = nullptr;

  /// Extension points for handling callsite features.
  // Called before a basic block was analyzed.
  virtual void onBlockStart(const BasicBlock *BB) {}

  /// Called after a basic block was analyzed.
  virtual void onBlockAnalyzed(const BasicBlock *BB) {}

  /// Called before an instruction was analyzed.
  virtual void onInstructionAnalysisStart(const Instruction *I) {}

  /// Called after an instruction was analyzed.
  virtual void onInstructionAnalysisFinish(const Instruction *I) {}

  /// Called at the end of the analysis of the callsite. Return the outcome of
  /// the analysis, i.e. 'InlineResult(true)' if the inlining may happen, or
  /// the reason it can't.
  virtual InlineResult finalizeAnalysis() { return InlineResult::success(); }
  /// Called when we're about to start processing a basic block, and every time
  /// we are done processing an instruction. Return true if there is no point
  /// in continuing the analysis (e.g. we've already determined the call site
  /// is too expensive to inline).
  virtual bool shouldStop() { return false; }

  /// Called before the analysis of the callee body starts (with callsite
  /// contexts propagated). It checks callsite-specific information. Return a
  /// reason analysis can't continue if that's the case, or 'true' if it may
  /// continue.
  virtual InlineResult onAnalysisStart() { return InlineResult::success(); }
  /// Called if the analysis engine decides SROA cannot be done for the given
  /// alloca.
  virtual void onDisableSROA(AllocaInst *Arg) {}

  /// Called when the analysis engine determines load elimination won't happen.
  virtual void onDisableLoadElimination() {}

  /// Called when we visit a CallBase, before the analysis starts. Return false
  /// to stop further processing of the instruction.
  virtual bool onCallBaseVisitStart(CallBase &Call) { return true; }

  /// Called to account for a call.
  virtual void onCallPenalty() {}

  /// Called to account for a load or store.
  virtual void onMemAccess() {}

  /// Called to account for the expectation the inlining would result in a load
  /// elimination.
  virtual void onLoadEliminationOpportunity() {}

  /// Called to account for the cost of argument setup for the Call in the
  /// callee's body (not the callsite currently under analysis).
  virtual void onCallArgumentSetup(const CallBase &Call) {}

  /// Called to account for a load relative intrinsic.
  virtual void onLoadRelativeIntrinsic() {}

  /// Called to account for a lowered call.
  virtual void onLoweredCall(Function *F, CallBase &Call, bool IsIndirectCall) {
  }

  /// Account for a jump table of given size. Return false to stop further
  /// processing of the switch instruction.
  virtual bool onJumpTable(unsigned JumpTableSize) { return true; }

  /// Account for a case cluster of given size. Return false to stop further
  /// processing of the instruction.
  virtual bool onCaseCluster(unsigned NumCaseCluster) { return true; }

  /// Called at the end of processing a switch instruction, with the given
  /// number of case clusters.
  virtual void onFinalizeSwitch(unsigned JumpTableSize, unsigned NumCaseCluster,
                                bool DefaultDestUnreachable) {}

  /// Called to account for any other instruction not specifically accounted
  /// for.
  virtual void onMissedSimplification() {}

  /// Start accounting potential benefits due to SROA for the given alloca.
  virtual void onInitializeSROAArg(AllocaInst *Arg) {}

  /// Account SROA savings for the AllocaInst value.
  virtual void onAggregateSROAUse(AllocaInst *V) {}

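  /// Account for a use of a value that maps back to an SROA-candidate alloca.
  /// If the use is compatible with SROA (\p DoNotDisable), record the expected
  /// savings and return true; otherwise disable SROA for that alloca and
  /// return false.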
  bool handleSROA(Value *V, bool DoNotDisable) {
    // Check for SROA candidates in comparisons.
    if (auto *SROAArg = getSROAArgForValueOrNull(V)) {
      if (DoNotDisable) {
        onAggregateSROAUse(SROAArg);
        return true;
      }
      disableSROAForArg(SROAArg);
    }
    return false;
  }

  bool IsCallerRecursive = false;
  bool IsRecursiveCall = false;
  bool ExposesReturnsTwice = false;
  bool HasDynamicAlloca = false;
  bool ContainsNoDuplicateCall = false;
  bool HasReturn = false;
  bool HasIndirectBr = false;
  bool HasUninlineableIntrinsic = false;
  bool InitsVargArgs = false;

  /// Number of bytes allocated statically by the callee.
  uint64_t AllocatedSize = 0;
  unsigned NumInstructions = 0;
  unsigned NumVectorInstructions = 0;

  /// While we walk the potentially-inlined instructions, we build up and
  /// maintain a mapping of simplified values specific to this callsite. The
  /// idea is to propagate any special information we have about arguments to
  /// this call through the inlinable section of the function, and account for
  /// likely simplifications post-inlining. The most important aspect we track
  /// is CFG altering simplifications -- when we prove a basic block dead, that
  /// can cause dramatic shifts in the cost of inlining a function.
  /// Note: The simplified Value may be owned by the caller function.
  DenseMap<Value *, Value *> SimplifiedValues;

  /// Keep track of the values which map back (through function arguments) to
  /// allocas on the caller stack which could be simplified through SROA.
  DenseMap<Value *, AllocaInst *> SROAArgValues;

  /// Keep track of Allocas for which we believe we may get SROA optimization.
  DenseSet<AllocaInst *> EnabledSROAAllocas;

  /// Keep track of values which map to a pointer base and constant offset.
  DenseMap<Value *, std::pair<Value *, APInt>> ConstantOffsetPtrs;

  /// Keep track of dead blocks due to the constant arguments.
  SmallPtrSet<BasicBlock *, 16> DeadBlocks;

  /// The mapping of the blocks to their known unique successors due to the
  /// constant arguments.
  DenseMap<BasicBlock *, BasicBlock *> KnownSuccessors;

  /// Model the elimination of repeated loads that is expected to happen
  /// whenever we simplify away the stores that would otherwise cause them to
  /// be reloaded.
  bool EnableLoadElimination = true;

  /// Whether we allow inlining for recursive calls.
  bool AllowRecursiveCall = false;

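  /// Load addresses seen so far, used to recognize repeated loads that load
  /// elimination could remove after inlining.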
  SmallPtrSet<Value *, 16> LoadAddrSet;

  AllocaInst *getSROAArgForValueOrNull(Value *V) const {
    auto It = SROAArgValues.find(V);
    if (It == SROAArgValues.end() || EnabledSROAAllocas.count(It->second) == 0)
      return nullptr;
    return It->second;
  }

  /// Use a value in its given form directly if possible, otherwise try looking
  /// for it in SimplifiedValues.
  template <typename T> T *getDirectOrSimplifiedValue(Value *V) const {
    if (auto *Direct = dyn_cast<T>(V))
      return Direct;
    return getSimplifiedValue<T>(V);
  }

  // Custom simplification helper routines.
  bool isAllocaDerivedArg(Value *V);
  void disableSROAForArg(AllocaInst *SROAArg);
  void disableSROA(Value *V);
  void findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB);
  void disableLoadElimination();
  bool isGEPFree(GetElementPtrInst &GEP);
  bool canFoldInboundsGEP(GetElementPtrInst &I);
  bool accumulateGEPOffset(GEPOperator &GEP, APInt &Offset);
  bool simplifyCallSite(Function *F, CallBase &Call);
  bool simplifyCmpInstForRecCall(CmpInst &Cmp);
  bool simplifyInstruction(Instruction &I);
  bool simplifyIntrinsicCallIsConstant(CallBase &CB);
  bool simplifyIntrinsicCallObjectSize(CallBase &CB);
  ConstantInt *stripAndComputeInBoundsConstantOffsets(Value *&V);
  bool isLoweredToCall(Function *F, CallBase &Call);

  /// Return true if the given argument to the function being considered for
  /// inlining has the given attribute set either at the call site or the
  /// function declaration. Primarily used to inspect call site specific
  /// attributes since these can be more precise than the ones on the callee
  /// itself.
  bool paramHasAttr(Argument *A, Attribute::AttrKind Attr);

  /// Return true if the given value is known non null within the callee if
  /// inlined through this particular callsite.
  bool isKnownNonNullInCallee(Value *V);

  /// Return true if size growth is allowed when inlining the callee at \p Call.
  bool allowSizeGrowth(CallBase &Call);

  // Custom analysis routines.
  InlineResult analyzeBlock(BasicBlock *BB,
                            const SmallPtrSetImpl<const Value *> &EphValues);

  // Disable several entry points to the visitor so we don't accidentally use
  // them by declaring but not defining them here.
  void visit(Module *);
  void visit(Module &);
  void visit(Function *);
  void visit(Function &);
  void visit(BasicBlock *);
  void visit(BasicBlock &);

  // Provide base case for our instruction visit.
  bool visitInstruction(Instruction &I);

  // Our visit overrides.
  bool visitAlloca(AllocaInst &I);
  bool visitPHI(PHINode &I);
  bool visitGetElementPtr(GetElementPtrInst &I);
  bool visitBitCast(BitCastInst &I);
  bool visitPtrToInt(PtrToIntInst &I);
  bool visitIntToPtr(IntToPtrInst &I);
  bool visitCastInst(CastInst &I);
  bool visitCmpInst(CmpInst &I);
  bool visitSub(BinaryOperator &I);
  bool visitBinaryOperator(BinaryOperator &I);
  bool visitFNeg(UnaryOperator &I);
  bool visitLoad(LoadInst &I);
  bool visitStore(StoreInst &I);
  bool visitExtractValue(ExtractValueInst &I);
  bool visitInsertValue(InsertValueInst &I);
  bool visitCallBase(CallBase &Call);
  bool visitReturnInst(ReturnInst &RI);
  bool visitBranchInst(BranchInst &BI);
  bool visitSelectInst(SelectInst &SI);
  bool visitSwitchInst(SwitchInst &SI);
  bool visitIndirectBrInst(IndirectBrInst &IBI);
  bool visitResumeInst(ResumeInst &RI);
  bool visitCleanupReturnInst(CleanupReturnInst &RI);
  bool visitCatchReturnInst(CatchReturnInst &RI);
  bool visitUnreachableInst(UnreachableInst &I);

public:
  CallAnalyzer(
      Function &Callee, CallBase &Call, const TargetTransformInfo &TTI,
      function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
      function_ref<BlockFrequencyInfo &(Function &)> GetBFI = nullptr,
      function_ref<const TargetLibraryInfo &(Function &)> GetTLI = nullptr,
      ProfileSummaryInfo *PSI = nullptr,
      OptimizationRemarkEmitter *ORE = nullptr,
      function_ref<EphemeralValuesCache &(Function &)> GetEphValuesCache =
          nullptr)
      : TTI(TTI), GetAssumptionCache(GetAssumptionCache), GetBFI(GetBFI),
        GetTLI(GetTLI), PSI(PSI), F(Callee), DL(F.getDataLayout()), ORE(ORE),
        CandidateCall(Call), GetEphValuesCache(GetEphValuesCache) {}

  InlineResult analyze();

  /// Lookup simplified Value. May return a value owned by the caller.
  Value *getSimplifiedValueUnchecked(Value *V) const {
    return SimplifiedValues.lookup(V);
  }

  /// Lookup simplified Value, but return nullptr if the simplified value is
  /// owned by the caller.
  template <typename T> T *getSimplifiedValue(Value *V) const {
    Value *SimpleV = SimplifiedValues.lookup(V);
    if (!SimpleV)
      return nullptr;

    // Skip checks if we know T is a global. This has a small, but measurable
    // impact on compile-time.
    if constexpr (std::is_base_of_v<Constant, T>)
      return dyn_cast<T>(SimpleV);

    // Make sure the simplified Value is owned by this function.
    if (auto *I = dyn_cast<Instruction>(SimpleV)) {
      if (I->getFunction() != &F)
        return nullptr;
    } else if (auto *Arg = dyn_cast<Argument>(SimpleV)) {
      if (Arg->getParent() != &F)
        return nullptr;
    } else if (!isa<Constant>(SimpleV))
      return nullptr;
    return dyn_cast<T>(SimpleV);
  }

  // Keep a bunch of stats about the cost savings found so we can print them
  // out when debugging.
  unsigned NumConstantArgs = 0;
  unsigned NumConstantOffsetPtrArgs = 0;
  unsigned NumAllocaArgs = 0;
  unsigned NumConstantPtrCmps = 0;
  unsigned NumConstantPtrDiffs = 0;
  unsigned NumInstructionsSimplified = 0;

  void dump();
};

// Considering forming a binary search, we should find the number of nodes
// which is the same as the number of comparisons when lowered. For a given
// number of clusters, n, we can define a recursive function, f(n), to find
// the number of nodes in the tree. The recursion is:
//    f(n) = 1 + f(n/2) + f (n - n/2), when n > 3,
//    and f(n) = n, when n <= 3.
// This will lead to a binary tree where the leaf should be either f(2) or f(3)
// when n > 3. So, the number of comparisons from leaves should be n, while
// the number of non-leaf nodes should be:
//    2^(log2(n) - 1) - 1
//    = 2^log2(n) * 2^-1 - 1
//    = n / 2 - 1.
// Considering comparisons from leaf and non-leaf nodes, we can estimate the
// number of comparisons in a simple closed form:
//    n + n / 2 - 1 = n * 3 / 2 - 1
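//
// For example, n = 8 case clusters gives an estimate of 8 * 3 / 2 - 1 = 11
// comparisons, matching the closed form computed below.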
int64_t getExpectedNumberOfCompare(int NumCaseCluster) {
  return 3 * static_cast<int64_t>(NumCaseCluster) / 2 - 1;
}

/// FIXME: if it is necessary to derive from InlineCostCallAnalyzer, note
/// the FIXME in onLoweredCall, when instantiating an InlineCostCallAnalyzer
class InlineCostCallAnalyzer final : public CallAnalyzer {
  const bool ComputeFullInlineCost;
  int LoadEliminationCost = 0;
  /// Bonus to be applied when percentage of vector instructions in callee is
  /// high (see more details in updateThreshold).
  int VectorBonus = 0;
  /// Bonus to be applied when the callee has only one reachable basic block.
  int SingleBBBonus = 0;

  /// Tunable parameters that control the analysis.
  const InlineParams &Params;

  // This DenseMap stores the delta change in cost and threshold after
  // accounting for the given instruction. The map is filled only with the
  // flag PrintInstructionComments on.
  DenseMap<const Instruction *, InstructionCostDetail> InstructionCostDetailMap;

  /// Upper bound for the inlining cost. Bonuses are being applied to account
  /// for speculative "expected profit" of the inlining decision.
  int Threshold = 0;

  /// The amount of StaticBonus applied.
  int StaticBonusApplied = 0;

  /// Attempt to evaluate indirect calls to boost their inline cost.
  const bool BoostIndirectCalls;

  /// Ignore the threshold when finalizing analysis.
  const bool IgnoreThreshold;

  // True if the cost-benefit-analysis-based inliner is enabled.
  const bool CostBenefitAnalysisEnabled;

  /// Inlining cost measured in abstract units, accounts for all the
  /// instructions expected to be executed for a given function invocation.
  /// Instructions that are statically proven to be dead based on call-site
  /// arguments are not counted here.
  int Cost = 0;

  // The cumulative cost at the beginning of the basic block being analyzed. At
  // the end of analyzing each basic block, "Cost - CostAtBBStart" represents
  // the size of that basic block.
  int CostAtBBStart = 0;

  // The static size of live but cold basic blocks. This is "static" in the
  // sense that it's not weighted by profile counts at all.
  int ColdSize = 0;

  // Whether inlining is decided by cost-threshold analysis.
  bool DecidedByCostThreshold = false;

  // Whether inlining is decided by cost-benefit analysis.
  bool DecidedByCostBenefit = false;

  // The cost-benefit pair computed by cost-benefit analysis.
  std::optional<CostBenefitPair> CostBenefit;

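  // True while the callee body still looks like a single reachable basic
  // block; cleared in onBlockAnalyzed once a block with multiple successors is
  // seen, at which point the SingleBBBonus is retracted from Threshold.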
  bool SingleBB = true;

  unsigned SROACostSavings = 0;
  unsigned SROACostSavingsLost = 0;

  /// The mapping of caller Alloca values to their accumulated cost savings. If
  /// we have to disable SROA for one of the allocas, this tells us how much
  /// cost must be added.
  DenseMap<AllocaInst *, int> SROAArgCosts;

  /// Return true if \p Call is a cold callsite.
  bool isColdCallSite(CallBase &Call, BlockFrequencyInfo *CallerBFI);

  /// Update Threshold based on callsite properties such as callee
  /// attributes and callee hotness for PGO builds. The Callee is explicitly
  /// passed to support analyzing indirect calls whose target is inferred by
  /// analysis.
  void updateThreshold(CallBase &Call, Function &Callee);
  /// Return a higher threshold if \p Call is a hot callsite.
  std::optional<int> getHotCallSiteThreshold(CallBase &Call,
                                             BlockFrequencyInfo *CallerBFI);

  /// Handle a capped 'int' increment for Cost.
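  /// Both the increment and the running total are clamped to the 'int' range,
  /// so repeated large bonuses or penalties saturate instead of overflowing.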
  void addCost(int64_t Inc) {
    Inc = std::clamp<int64_t>(Inc, INT_MIN, INT_MAX);
    Cost = std::clamp<int64_t>(Inc + Cost, INT_MIN, INT_MAX);
  }

  void onDisableSROA(AllocaInst *Arg) override {
    auto CostIt = SROAArgCosts.find(Arg);
    if (CostIt == SROAArgCosts.end())
      return;
    addCost(CostIt->second);
    SROACostSavings -= CostIt->second;
    SROACostSavingsLost += CostIt->second;
    SROAArgCosts.erase(CostIt);
  }

  void onDisableLoadElimination() override {
    addCost(LoadEliminationCost);
    LoadEliminationCost = 0;
  }

  bool onCallBaseVisitStart(CallBase &Call) override {
    if (std::optional<int> AttrCallThresholdBonus =
            getStringFnAttrAsInt(Call, "call-threshold-bonus"))
      Threshold += *AttrCallThresholdBonus;

    if (std::optional<int> AttrCallCost =
            getStringFnAttrAsInt(Call, "call-inline-cost")) {
      addCost(*AttrCallCost);
      // Prevent further processing of the call since we want to override its
      // inline cost, not just add to it.
      return false;
    }
    return true;
  }

  void onCallPenalty() override { addCost(CallPenalty); }

  void onMemAccess() override { addCost(MemAccessCost); }

  void onCallArgumentSetup(const CallBase &Call) override {
    // Pay the price of the argument setup. We account for the average 1
    // instruction per call argument setup here.
    addCost(Call.arg_size() * InstrCost);
  }
  void onLoadRelativeIntrinsic() override {
    // This is normally lowered to 4 LLVM instructions.
    addCost(3 * InstrCost);
  }
  void onLoweredCall(Function *F, CallBase &Call,
                     bool IsIndirectCall) override {
    // We account for the average 1 instruction per call argument setup here.
    addCost(Call.arg_size() * InstrCost);

    // If we have a constant that we are calling as a function, we can peer
    // through it and see the function target. This happens not infrequently
    // during devirtualization and so we want to give it a hefty bonus for
    // inlining, but cap that bonus in the event that inlining wouldn't pan out.
    // Pretend to inline the function, with a custom threshold.
    if (IsIndirectCall && BoostIndirectCalls) {
      auto IndirectCallParams = Params;
      IndirectCallParams.DefaultThreshold =
          InlineConstants::IndirectCallThreshold;
      /// FIXME: if InlineCostCallAnalyzer is derived from, this may need
      /// to instantiate the derived class.
      InlineCostCallAnalyzer CA(*F, Call, IndirectCallParams, TTI,
                                GetAssumptionCache, GetBFI, GetTLI, PSI, ORE,
                                false);
      if (CA.analyze().isSuccess()) {
        // We were able to inline the indirect call! Subtract the cost from the
        // threshold to get the bonus we want to apply, but don't go below zero.
        Cost -= std::max(0, CA.getThreshold() - CA.getCost());
      }
    } else
      // Otherwise simply add the cost for merely making the call.
      addCost(TTI.getInlineCallPenalty(CandidateCall.getCaller(), Call,
                                       CallPenalty));
  }

  void onFinalizeSwitch(unsigned JumpTableSize, unsigned NumCaseCluster,
                        bool DefaultDestUnreachable) override {
    // If suitable for a jump table, consider the cost for the table size and
    // branch to destination.
    // Maximum valid cost increased in this function.
    if (JumpTableSize) {
      // Suppose a default branch includes one compare and one conditional
      // branch if it's reachable.
      if (!DefaultDestUnreachable)
        addCost(2 * InstrCost);
      // Suppose a jump table requires one load and one jump instruction.
      int64_t JTCost =
          static_cast<int64_t>(JumpTableSize) * InstrCost + 2 * InstrCost;
      addCost(JTCost);
      return;
    }

    if (NumCaseCluster <= 3) {
      // Suppose a comparison includes one compare and one conditional branch.
      // We can reduce a set of instructions if the default branch is
      // unreachable.
      addCost((NumCaseCluster - DefaultDestUnreachable) * 2 * InstrCost);
      return;
    }

    int64_t ExpectedNumberOfCompare =
        getExpectedNumberOfCompare(NumCaseCluster);
    int64_t SwitchCost = ExpectedNumberOfCompare * 2 * InstrCost;

    addCost(SwitchCost);
  }
  void onMissedSimplification() override { addCost(InstrCost); }

  void onInitializeSROAArg(AllocaInst *Arg) override {
    assert(Arg != nullptr &&
           "Should not initialize SROA costs for null value.");
    auto SROAArgCost = TTI.getCallerAllocaCost(&CandidateCall, Arg);
    SROACostSavings += SROAArgCost;
    SROAArgCosts[Arg] = SROAArgCost;
  }

  void onAggregateSROAUse(AllocaInst *SROAArg) override {
    auto CostIt = SROAArgCosts.find(SROAArg);
    assert(CostIt != SROAArgCosts.end() &&
           "expected this argument to have a cost");
    CostIt->second += InstrCost;
    SROACostSavings += InstrCost;
  }

  void onBlockStart(const BasicBlock *BB) override { CostAtBBStart = Cost; }

  void onBlockAnalyzed(const BasicBlock *BB) override {
    if (CostBenefitAnalysisEnabled) {
      // Keep track of the static size of live but cold basic blocks. For now,
      // we define a cold basic block to be one that's never executed.
      assert(GetBFI && "GetBFI must be available");
      BlockFrequencyInfo *BFI = &(GetBFI(F));
      assert(BFI && "BFI must be available");
      auto ProfileCount = BFI->getBlockProfileCount(BB);
      if (*ProfileCount == 0)
        ColdSize += Cost - CostAtBBStart;
    }

    auto *TI = BB->getTerminator();
    // If we had any successors at this point, then post-inlining is likely to
    // have them as well. Note that we assume any basic blocks which existed
    // due to branches or switches which folded above will also fold after
    // inlining.
    if (SingleBB && TI->getNumSuccessors() > 1) {
      // Take off the bonus we applied to the threshold.
      Threshold -= SingleBBBonus;
      SingleBB = false;
    }
  }

  void onInstructionAnalysisStart(const Instruction *I) override {
    // This function is called to store the initial cost of inlining before
    // the given instruction was assessed.
    if (!PrintInstructionComments)
      return;
    auto &CostDetail = InstructionCostDetailMap[I];
    CostDetail.CostBefore = Cost;
    CostDetail.ThresholdBefore = Threshold;
  }

  void onInstructionAnalysisFinish(const Instruction *I) override {
    // This function is called to find new values of cost and threshold after
    // the instruction has been assessed.
    if (!PrintInstructionComments)
      return;
    auto &CostDetail = InstructionCostDetailMap[I];
    CostDetail.CostAfter = Cost;
    CostDetail.ThresholdAfter = Threshold;
  }

  bool isCostBenefitAnalysisEnabled() {
    if (!PSI || !PSI->hasProfileSummary())
      return false;

    if (!GetBFI)
      return false;

    if (InlineEnableCostBenefitAnalysis.getNumOccurrences()) {
      // Honor the explicit request from the user.
      if (!InlineEnableCostBenefitAnalysis)
        return false;
    } else {
      // Otherwise, require instrumentation profile.
      if (!PSI->hasInstrumentationProfile())
        return false;
    }

    auto *Caller = CandidateCall.getParent()->getParent();
    if (!Caller->getEntryCount())
      return false;

    BlockFrequencyInfo *CallerBFI = &(GetBFI(*Caller));
    if (!CallerBFI)
      return false;

    // For now, limit to hot call site.
    if (!PSI->isHotCallSite(CandidateCall, CallerBFI))
      return false;

    // Make sure we have a nonzero entry count.
    auto EntryCount = F.getEntryCount();
    if (!EntryCount || !EntryCount->getCount())
      return false;

    BlockFrequencyInfo *CalleeBFI = &(GetBFI(F));
    if (!CalleeBFI)
      return false;

    return true;
  }

  // A helper function to choose between command line override and default.
  unsigned getInliningCostBenefitAnalysisSavingsMultiplier() const {
    if (InlineSavingsMultiplier.getNumOccurrences())
      return InlineSavingsMultiplier;
    return TTI.getInliningCostBenefitAnalysisSavingsMultiplier();
  }

  // A helper function to choose between command line override and default.
  unsigned getInliningCostBenefitAnalysisProfitableMultiplier() const {
    if (InlineSavingsProfitableMultiplier.getNumOccurrences())
      return InlineSavingsProfitableMultiplier;
    return TTI.getInliningCostBenefitAnalysisProfitableMultiplier();
  }

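  // For testing only: the "inline-cycle-savings-for-test" and
  // "inline-runtime-cost-for-test" call-site attributes force the cycle
  // savings and size used by the cost-benefit analysis.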
  void OverrideCycleSavingsAndSizeForTesting(APInt &CycleSavings, int &Size) {
    if (std::optional<int> AttrCycleSavings = getStringFnAttrAsInt(
            CandidateCall, "inline-cycle-savings-for-test")) {
      CycleSavings = *AttrCycleSavings;
    }

    if (std::optional<int> AttrRuntimeCost = getStringFnAttrAsInt(
            CandidateCall, "inline-runtime-cost-for-test")) {
      Size = *AttrRuntimeCost;
    }
  }

  // Determine whether we should inline the given call site, taking into
  // account both the size cost and the cycle savings. Return std::nullopt if
  // we don't have sufficient profiling information to make a determination.
  std::optional<bool> costBenefitAnalysis() {
    if (!CostBenefitAnalysisEnabled)
      return std::nullopt;

    // buildInlinerPipeline in the pass builder sets HotCallSiteThreshold to 0
    // for the prelink phase of the AutoFDO + ThinLTO build. Honor the logic by
    // falling back to the cost-based metric.
    // TODO: Improve this hacky condition.
    if (Threshold == 0)
      return std::nullopt;

    assert(GetBFI);
    BlockFrequencyInfo *CalleeBFI = &(GetBFI(F));
    assert(CalleeBFI);

    // The cycle savings expressed as the sum of InstrCost
    // multiplied by the estimated dynamic count of each instruction we can
    // avoid. Savings come from the call site cost, such as argument setup and
    // the call instruction, as well as the instructions that are folded.
    //
    // We use 128-bit APInt here to avoid potential overflow. This variable
    // should stay well below 10^^24 (or 2^^80) in practice. This "worst" case
    // assumes that we can avoid or fold a billion instructions, each with a
    // profile count of 10^^15 -- roughly the number of cycles for a 24-hour
    // period on a 4GHz machine.
    APInt CycleSavings(128, 0);

    for (auto &BB : F) {
      APInt CurrentSavings(128, 0);
      for (auto &I : BB) {
        if (BranchInst *BI = dyn_cast<BranchInst>(&I)) {
          // Count a conditional branch as savings if it becomes unconditional.
          if (BI->isConditional() &&
              getSimplifiedValue<ConstantInt>(BI->getCondition())) {
            CurrentSavings += InstrCost;
          }
        } else if (SwitchInst *SI = dyn_cast<SwitchInst>(&I)) {
          if (getSimplifiedValue<ConstantInt>(SI->getCondition()))
            CurrentSavings += InstrCost;
        } else if (Value *V = dyn_cast<Value>(&I)) {
          // Count an instruction as savings if we can fold it.
          if (SimplifiedValues.count(V)) {
            CurrentSavings += InstrCost;
          }
        }
      }

      auto ProfileCount = CalleeBFI->getBlockProfileCount(&BB);
      CurrentSavings *= *ProfileCount;
      CycleSavings += CurrentSavings;
    }

    // Compute the cycle savings per call.
    auto EntryProfileCount = F.getEntryCount();
    assert(EntryProfileCount && EntryProfileCount->getCount());
    auto EntryCount = EntryProfileCount->getCount();
    CycleSavings += EntryCount / 2;
    CycleSavings = CycleSavings.udiv(EntryCount);

    // Compute the total savings for the call site.
    auto *CallerBB = CandidateCall.getParent();
    BlockFrequencyInfo *CallerBFI = &(GetBFI(*(CallerBB->getParent())));
    CycleSavings += getCallsiteCost(TTI, this->CandidateCall, DL);
    CycleSavings *= *CallerBFI->getBlockProfileCount(CallerBB);

    // Remove the cost of the cold basic blocks to model the runtime cost more
    // accurately. Both machine block placement and function splitting could
    // place cold blocks further from hot blocks.
    int Size = Cost - ColdSize;

    // Allow tiny callees to be inlined regardless of whether they meet the
    // savings threshold.
    Size = Size > InlineSizeAllowance ? Size - InlineSizeAllowance : 1;

    OverrideCycleSavingsAndSizeForTesting(CycleSavings, Size);
    CostBenefit.emplace(APInt(128, Size), CycleSavings);

    // Let R be the ratio of CycleSavings to Size. We accept the inlining
    // opportunity if R is really high and reject if R is really low. If R is
    // somewhere in the middle, we fall back to the cost-based analysis.
    //
    // Specifically, let R = CycleSavings / Size, we accept the inlining
    // opportunity if:
    //
    //             PSI->getOrCompHotCountThreshold()
    // R > -------------------------------------------------
    //     getInliningCostBenefitAnalysisSavingsMultiplier()
    //
    // and reject the inlining opportunity if:
    //
    //                PSI->getOrCompHotCountThreshold()
    // R <= ----------------------------------------------------
    //      getInliningCostBenefitAnalysisProfitableMultiplier()
    //
    // Otherwise, we fall back to the cost-based analysis.
    //
    // Implementation-wise, use multiplication (CycleSavings * Multiplier,
    // HotCountThreshold * Size) rather than division to avoid precision loss.
    APInt Threshold(128, PSI->getOrCompHotCountThreshold());
    Threshold *= Size;

    APInt UpperBoundCycleSavings = CycleSavings;
    UpperBoundCycleSavings *= getInliningCostBenefitAnalysisSavingsMultiplier();
    if (UpperBoundCycleSavings.uge(Threshold))
      return true;

    APInt LowerBoundCycleSavings = CycleSavings;
    LowerBoundCycleSavings *=
        getInliningCostBenefitAnalysisProfitableMultiplier();
    if (LowerBoundCycleSavings.ult(Threshold))
      return false;

    // Otherwise, fall back to the cost-based analysis.
    return std::nullopt;
  }

  InlineResult finalizeAnalysis() override {
    // Loops generally act a lot like calls in that they act like barriers to
    // movement, require a certain amount of setup, etc. So when optimising for
    // size, we penalise any call sites that perform loops. We do this after all
    // other costs here, so will likely only be dealing with relatively small
    // functions (and hence DT and LI will hopefully be cheap).
    auto *Caller = CandidateCall.getFunction();
    if (Caller->hasMinSize()) {
      DominatorTree DT(F);
      LoopInfo LI(DT);
      int NumLoops = 0;
      for (Loop *L : LI) {
        // Ignore loops that will not be executed
        if (DeadBlocks.count(L->getHeader()))
          continue;
        NumLoops++;
      }
      addCost(NumLoops * InlineConstants::LoopPenalty);
    }

    // We applied the maximum possible vector bonus at the beginning. Now,
    // subtract the excess bonus, if any, from the Threshold before
    // comparing against Cost.
    if (NumVectorInstructions <= NumInstructions / 10)
      Threshold -= VectorBonus;
    else if (NumVectorInstructions <= NumInstructions / 2)
      Threshold -= VectorBonus / 2;

    if (std::optional<int> AttrCost =
            getStringFnAttrAsInt(CandidateCall, "function-inline-cost"))
      Cost = *AttrCost;

    if (std::optional<int> AttrCostMult = getStringFnAttrAsInt(
            CandidateCall,
            InlineConstants::FunctionInlineCostMultiplierAttributeName))
      Cost *= *AttrCostMult;

    if (std::optional<int> AttrThreshold =
            getStringFnAttrAsInt(CandidateCall, "function-inline-threshold"))
      Threshold = *AttrThreshold;

    if (auto Result = costBenefitAnalysis()) {
      DecidedByCostBenefit = true;
      if (*Result)
        return InlineResult::success();
      else
        return InlineResult::failure("Cost over threshold.");
    }

    if (IgnoreThreshold)
      return InlineResult::success();

    DecidedByCostThreshold = true;
    return Cost < std::max(1, Threshold)
               ? InlineResult::success()
               : InlineResult::failure("Cost over threshold.");
  }

  bool shouldStop() override {
    if (IgnoreThreshold || ComputeFullInlineCost)
      return false;
    // Bail out the moment we cross the threshold. This means we'll under-count
    // the cost, but only when undercounting doesn't matter.
    if (Cost < Threshold)
      return false;
    DecidedByCostThreshold = true;
    return true;
  }

  void onLoadEliminationOpportunity() override {
    LoadEliminationCost += InstrCost;
  }

  InlineResult onAnalysisStart() override {
    // Perform some tweaks to the cost and threshold based on the direct
    // callsite information.

    // We want to more aggressively inline vector-dense kernels, so up the
    // threshold, and we'll lower it if the % of vector instructions gets too
    // low. Note that these bonuses are somewhat arbitrary and evolved over
    // time by accident as much as because they are principled bonuses.
    //
    // FIXME: It would be nice to remove all such bonuses. At least it would be
    // nice to base the bonus values on something more scientific.
    assert(NumInstructions == 0);
    assert(NumVectorInstructions == 0);

    // Update the threshold based on callsite properties.
    updateThreshold(CandidateCall, F);

    // While Threshold depends on commandline options that can take negative
    // values, we want to enforce the invariant that the computed threshold and
    // bonuses are non-negative.
    assert(Threshold >= 0);
    assert(SingleBBBonus >= 0);
    assert(VectorBonus >= 0);

    // Speculatively apply all possible bonuses to Threshold. If cost exceeds
    // this Threshold any time, and cost cannot decrease, we can stop processing
    // the rest of the function body.
    Threshold += (SingleBBBonus + VectorBonus);

    // Give out bonuses for the callsite, as the instructions setting them up
    // will be gone after inlining.
    addCost(-getCallsiteCost(TTI, this->CandidateCall, DL));

    // If this function uses the coldcc calling convention, prefer not to inline
    // it.
    if (F.getCallingConv() == CallingConv::Cold)
      Cost += InlineConstants::ColdccPenalty;

    LLVM_DEBUG(dbgs() << " Initial cost: " << Cost << "\n");

    // Check if we're done. This can happen due to bonuses and penalties.
    if (Cost >= Threshold && !ComputeFullInlineCost)
      return InlineResult::failure("high cost");

    return InlineResult::success();
  }

public:
  InlineCostCallAnalyzer(
      Function &Callee, CallBase &Call, const InlineParams &Params,
      const TargetTransformInfo &TTI,
      function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
      function_ref<BlockFrequencyInfo &(Function &)> GetBFI = nullptr,
      function_ref<const TargetLibraryInfo &(Function &)> GetTLI = nullptr,
      ProfileSummaryInfo *PSI = nullptr,
      OptimizationRemarkEmitter *ORE = nullptr, bool BoostIndirect = true,
      bool IgnoreThreshold = false,
      function_ref<EphemeralValuesCache &(Function &)> GetEphValuesCache =
          nullptr)
      : CallAnalyzer(Callee, Call, TTI, GetAssumptionCache, GetBFI, GetTLI, PSI,
                     ORE, GetEphValuesCache),
        ComputeFullInlineCost(OptComputeFullInlineCost ||
                              Params.ComputeFullInlineCost || ORE ||
                              isCostBenefitAnalysisEnabled()),
        Params(Params), Threshold(Params.DefaultThreshold),
        BoostIndirectCalls(BoostIndirect), IgnoreThreshold(IgnoreThreshold),
        CostBenefitAnalysisEnabled(isCostBenefitAnalysisEnabled()),
        Writer(this) {
    AllowRecursiveCall = *Params.AllowRecursiveCall;
  }

  /// Annotation Writer for instruction details.
  InlineCostAnnotationWriter Writer;

  void dump();

  // Prints the same analysis as dump(), but its definition is not dependent
  // on the build.
  void print(raw_ostream &OS);

  std::optional<InstructionCostDetail> getCostDetails(const Instruction *I) {
    auto It = InstructionCostDetailMap.find(I);
    if (It != InstructionCostDetailMap.end())
      return It->second;
    return std::nullopt;
  }

  virtual ~InlineCostCallAnalyzer() = default;
  int getThreshold() const { return Threshold; }
  int getCost() const { return Cost; }
  int getStaticBonusApplied() const { return StaticBonusApplied; }
  std::optional<CostBenefitPair> getCostBenefitPair() { return CostBenefit; }
  bool wasDecidedByCostBenefit() const { return DecidedByCostBenefit; }
  bool wasDecidedByCostThreshold() const { return DecidedByCostThreshold; }
};

// Return true if CB is the sole call to local function Callee.
static bool isSoleCallToLocalFunction(const CallBase &CB,
                                      const Function &Callee) {
  return Callee.hasLocalLinkage() && Callee.hasOneLiveUse() &&
         &Callee == CB.getCalledFunction();
}

class InlineCostFeaturesAnalyzer final : public CallAnalyzer {
private:
  InlineCostFeatures Cost = {};

  // FIXME: These constants are taken from the heuristic-based cost visitor.
  // These should be removed entirely in a later revision to avoid reliance on
  // heuristics in the ML inliner.
  static constexpr int JTCostMultiplier = 2;
  static constexpr int CaseClusterCostMultiplier = 2;
  static constexpr int SwitchDefaultDestCostMultiplier = 2;
  static constexpr int SwitchCostMultiplier = 2;

  // FIXME: These are taken from the heuristic-based cost visitor: we should
  // eventually abstract these to the CallAnalyzer to avoid duplication.
  unsigned SROACostSavingOpportunities = 0;
  int VectorBonus = 0;
  int SingleBBBonus = 0;
  int Threshold = 5;

  DenseMap<AllocaInst *, unsigned> SROACosts;

  void increment(InlineCostFeatureIndex Feature, int64_t Delta = 1) {
    Cost[static_cast<size_t>(Feature)] += Delta;
  }

  void set(InlineCostFeatureIndex Feature, int64_t Value) {
    Cost[static_cast<size_t>(Feature)] = Value;
  }

  void onDisableSROA(AllocaInst *Arg) override {
    auto CostIt = SROACosts.find(Arg);
    if (CostIt == SROACosts.end())
      return;

    increment(InlineCostFeatureIndex::sroa_losses, CostIt->second);
    SROACostSavingOpportunities -= CostIt->second;
    SROACosts.erase(CostIt);
  }

  void onDisableLoadElimination() override {
    set(InlineCostFeatureIndex::load_elimination, 1);
  }

  void onCallPenalty() override {
    increment(InlineCostFeatureIndex::call_penalty, CallPenalty);
  }

  void onCallArgumentSetup(const CallBase &Call) override {
    increment(InlineCostFeatureIndex::call_argument_setup,
              Call.arg_size() * InstrCost);
  }

  void onLoadRelativeIntrinsic() override {
    increment(InlineCostFeatureIndex::load_relative_intrinsic, 3 * InstrCost);
  }

  void onLoweredCall(Function *F, CallBase &Call,
                     bool IsIndirectCall) override {
    increment(InlineCostFeatureIndex::lowered_call_arg_setup,
              Call.arg_size() * InstrCost);

    if (IsIndirectCall) {
      InlineParams IndirectCallParams = {/* DefaultThreshold*/ 0,
                                         /*HintThreshold*/ {},
                                         /*ColdThreshold*/ {},
                                         /*OptSizeThreshold*/ {},
                                         /*OptMinSizeThreshold*/ {},
                                         /*HotCallSiteThreshold*/ {},
                                         /*LocallyHotCallSiteThreshold*/ {},
                                         /*ColdCallSiteThreshold*/ {},
                                         /*ComputeFullInlineCost*/ true,
                                         /*EnableDeferral*/ true};
      IndirectCallParams.DefaultThreshold =
          InlineConstants::IndirectCallThreshold;

      InlineCostCallAnalyzer CA(*F, Call, IndirectCallParams, TTI,
                                GetAssumptionCache, GetBFI, GetTLI, PSI, ORE,
                                false, true);
      if (CA.analyze().isSuccess()) {
        increment(InlineCostFeatureIndex::nested_inline_cost_estimate,
                  CA.getCost());
        increment(InlineCostFeatureIndex::nested_inlines, 1);
      }
    } else {
      onCallPenalty();
    }
  }

  void onFinalizeSwitch(unsigned JumpTableSize, unsigned NumCaseCluster,
                        bool DefaultDestUnreachable) override {
    if (JumpTableSize) {
      if (!DefaultDestUnreachable)
        increment(InlineCostFeatureIndex::switch_default_dest_penalty,
                  SwitchDefaultDestCostMultiplier * InstrCost);
      int64_t JTCost = static_cast<int64_t>(JumpTableSize) * InstrCost +
                       JTCostMultiplier * InstrCost;
      increment(InlineCostFeatureIndex::jump_table_penalty, JTCost);
      return;
    }

    if (NumCaseCluster <= 3) {
      increment(InlineCostFeatureIndex::case_cluster_penalty,
                (NumCaseCluster - DefaultDestUnreachable) *
                    CaseClusterCostMultiplier * InstrCost);
      return;
    }

    int64_t ExpectedNumberOfCompare =
        getExpectedNumberOfCompare(NumCaseCluster);

    int64_t SwitchCost =
        ExpectedNumberOfCompare * SwitchCostMultiplier * InstrCost;
    increment(InlineCostFeatureIndex::switch_penalty, SwitchCost);
  }

  void onMissedSimplification() override {
    increment(InlineCostFeatureIndex::unsimplified_common_instructions,
              InstrCost);
  }

  void onInitializeSROAArg(AllocaInst *Arg) override {
    auto SROAArgCost = TTI.getCallerAllocaCost(&CandidateCall, Arg);
    SROACosts[Arg] = SROAArgCost;
    SROACostSavingOpportunities += SROAArgCost;
  }

  void onAggregateSROAUse(AllocaInst *Arg) override {
    SROACosts.find(Arg)->second += InstrCost;
    SROACostSavingOpportunities += InstrCost;
  }

  void onBlockAnalyzed(const BasicBlock *BB) override {
    if (BB->getTerminator()->getNumSuccessors() > 1)
      set(InlineCostFeatureIndex::is_multiple_blocks, 1);
    Threshold -= SingleBBBonus;
  }

  InlineResult finalizeAnalysis() override {
    auto *Caller = CandidateCall.getFunction();
    if (Caller->hasMinSize()) {
      DominatorTree DT(F);
      LoopInfo LI(DT);
      for (Loop *L : LI) {
        // Ignore loops that will not be executed
        if (DeadBlocks.count(L->getHeader()))
          continue;
        increment(InlineCostFeatureIndex::num_loops,
                  InlineConstants::LoopPenalty);
      }
    }
    set(InlineCostFeatureIndex::dead_blocks, DeadBlocks.size());
    set(InlineCostFeatureIndex::simplified_instructions,
        NumInstructionsSimplified);
    set(InlineCostFeatureIndex::constant_args, NumConstantArgs);
    set(InlineCostFeatureIndex::constant_offset_ptr_args,
        NumConstantOffsetPtrArgs);
    set(InlineCostFeatureIndex::sroa_savings, SROACostSavingOpportunities);

    if (NumVectorInstructions <= NumInstructions / 10)
      Threshold -= VectorBonus;
    else if (NumVectorInstructions <= NumInstructions / 2)
      Threshold -= VectorBonus / 2;

    set(InlineCostFeatureIndex::threshold, Threshold);

    return InlineResult::success();
  }

  bool shouldStop() override { return false; }

  void onLoadEliminationOpportunity() override {
    increment(InlineCostFeatureIndex::load_elimination, 1);
  }

  InlineResult onAnalysisStart() override {
    increment(InlineCostFeatureIndex::callsite_cost,
              -1 * getCallsiteCost(TTI, this->CandidateCall, DL));

    set(InlineCostFeatureIndex::cold_cc_penalty,
        (F.getCallingConv() == CallingConv::Cold));

    set(InlineCostFeatureIndex::last_call_to_static_bonus,
        isSoleCallToLocalFunction(CandidateCall, F));

    // FIXME: we shouldn't repeat this logic in both the Features and Cost
    // analyzer - instead, we should abstract it to a common method in the
    // CallAnalyzer
    int SingleBBBonusPercent = 50;
    int VectorBonusPercent = TTI.getInlinerVectorBonusPercent();
    Threshold += TTI.adjustInliningThreshold(&CandidateCall);
    Threshold *= TTI.getInliningThresholdMultiplier();
    SingleBBBonus = Threshold * SingleBBBonusPercent / 100;
    VectorBonus = Threshold * VectorBonusPercent / 100;
    Threshold += (SingleBBBonus + VectorBonus);

    return InlineResult::success();
  }

public:
  InlineCostFeaturesAnalyzer(
      const TargetTransformInfo &TTI,
      function_ref<AssumptionCache &(Function &)> &GetAssumptionCache,
      function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
      function_ref<const TargetLibraryInfo &(Function &)> GetTLI,
      ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE, Function &Callee,
      CallBase &Call)
      : CallAnalyzer(Callee, Call, TTI, GetAssumptionCache, GetBFI, GetTLI,
                     PSI) {}

  const InlineCostFeatures &features() const { return Cost; }
};

} // namespace

1420/// Test whether the given value is an Alloca-derived function argument.
1421bool CallAnalyzer::isAllocaDerivedArg(Value *V) {
1422 return SROAArgValues.count(Val: V);
1423}
1424
1425void CallAnalyzer::disableSROAForArg(AllocaInst *SROAArg) {
1426 onDisableSROA(Arg: SROAArg);
1427 EnabledSROAAllocas.erase(V: SROAArg);
1428 disableLoadElimination();
1429}
1430
1431void InlineCostAnnotationWriter::emitInstructionAnnot(
1432 const Instruction *I, formatted_raw_ostream &OS) {
1433 // The cost of inlining the given instruction is always printed.
1434 // The threshold delta is printed only when it is non-zero, which happens
1435 // when we decide to give a bonus at a particular instruction.
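  // For illustration only (numbers are invented), an emitted annotation looks
  // roughly like:
  //   ; cost before = 10, cost after = 15, threshold before = 225, threshold after = 225, cost delta = 5
  // with ", threshold delta = ..." and ", simplified to ..." appended only when
  // they apply.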
1436 std::optional<InstructionCostDetail> Record = ICCA->getCostDetails(I);
1437 if (!Record)
1438 OS << "; No analysis for the instruction";
1439 else {
1440 OS << "; cost before = " << Record->CostBefore
1441 << ", cost after = " << Record->CostAfter
1442 << ", threshold before = " << Record->ThresholdBefore
1443 << ", threshold after = " << Record->ThresholdAfter << ", ";
1444 OS << "cost delta = " << Record->getCostDelta();
1445 if (Record->hasThresholdChanged())
1446 OS << ", threshold delta = " << Record->getThresholdDelta();
1447 }
1448 auto *V = ICCA->getSimplifiedValueUnchecked(V: const_cast<Instruction *>(I));
1449 if (V) {
1450 OS << ", simplified to ";
1451 V->print(O&: OS, IsForDebug: true);
1452 if (auto *VI = dyn_cast<Instruction>(Val: V)) {
1453 if (VI->getFunction() != I->getFunction())
1454 OS << " (caller instruction)";
1455 } else if (auto *VArg = dyn_cast<Argument>(Val: V)) {
1456 if (VArg->getParent() != I->getFunction())
1457 OS << " (caller argument)";
1458 }
1459 }
1460 OS << "\n";
1461}
1462
1463/// If 'V' maps to a SROA candidate, disable SROA for it.
1464void CallAnalyzer::disableSROA(Value *V) {
1465 if (auto *SROAArg = getSROAArgForValueOrNull(V)) {
1466 disableSROAForArg(SROAArg);
1467 }
1468}
1469
1470void CallAnalyzer::disableLoadElimination() {
1471 if (EnableLoadElimination) {
1472 onDisableLoadElimination();
1473 EnableLoadElimination = false;
1474 }
1475}
1476
1477/// Accumulate a constant GEP offset into an APInt if possible.
1478///
1479/// Returns false if unable to compute the offset for any reason. Respects any
1480/// simplified values known during the analysis of this callsite.
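///
/// For example (illustrative, assuming a typical 64-bit DataLayout), for
///   %p = getelementptr inbounds { i32, i64 }, ptr %base, i64 0, i32 1
/// the struct-field index contributes a field offset of 8, so Offset grows by
/// 8; any index that is not a (possibly simplified) ConstantInt makes this
/// return false instead.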
1481bool CallAnalyzer::accumulateGEPOffset(GEPOperator &GEP, APInt &Offset) {
1482 unsigned IntPtrWidth = DL.getIndexTypeSizeInBits(Ty: GEP.getType());
1483 assert(IntPtrWidth == Offset.getBitWidth());
1484
1485 for (gep_type_iterator GTI = gep_type_begin(GEP), GTE = gep_type_end(GEP);
1486 GTI != GTE; ++GTI) {
1487 ConstantInt *OpC =
1488 getDirectOrSimplifiedValue<ConstantInt>(V: GTI.getOperand());
1489 if (!OpC)
1490 return false;
1491 if (OpC->isZero())
1492 continue;
1493
1494 // Handle a struct index, which adds its field offset to the pointer.
1495 if (StructType *STy = GTI.getStructTypeOrNull()) {
1496 unsigned ElementIdx = OpC->getZExtValue();
1497 const StructLayout *SL = DL.getStructLayout(Ty: STy);
1498 Offset += APInt(IntPtrWidth, SL->getElementOffset(Idx: ElementIdx));
1499 continue;
1500 }
1501
1502 APInt TypeSize(IntPtrWidth, GTI.getSequentialElementStride(DL));
1503 Offset += OpC->getValue().sextOrTrunc(width: IntPtrWidth) * TypeSize;
1504 }
1505 return true;
1506}
1507
1508/// Use TTI to check whether a GEP is free.
1509///
1510/// Respects any simplified values known during the analysis of this callsite.
1511bool CallAnalyzer::isGEPFree(GetElementPtrInst &GEP) {
1512 SmallVector<Value *, 4> Operands;
1513 Operands.push_back(Elt: GEP.getOperand(i_nocapture: 0));
1514 for (const Use &Op : GEP.indices())
1515 if (Constant *SimpleOp = getSimplifiedValue<Constant>(V: Op))
1516 Operands.push_back(Elt: SimpleOp);
1517 else
1518 Operands.push_back(Elt: Op);
1519 return TTI.getInstructionCost(U: &GEP, Operands,
1520 CostKind: TargetTransformInfo::TCK_SizeAndLatency) ==
1521 TargetTransformInfo::TCC_Free;
1522}
1523
1524bool CallAnalyzer::visitAlloca(AllocaInst &I) {
1525 disableSROA(V: I.getOperand(i_nocapture: 0));
1526
1527 // Check whether inlining will turn a dynamic alloca into a static
1528 // alloca and handle that case.
1529 if (I.isArrayAllocation()) {
1530 Constant *Size = getSimplifiedValue<Constant>(V: I.getArraySize());
1531 if (auto *AllocSize = dyn_cast_or_null<ConstantInt>(Val: Size)) {
1532 // Sometimes a dynamic alloca could be converted into a static alloca
1533 // after this constant prop, and become a huge static alloca on an
1534 // unconditional CFG path. Avoid inlining if this is going to happen above
1535 // a threshold.
1536 // FIXME: If the threshold is removed or lowered too much, we could end up
1537 // being too pessimistic and prevent inlining non-problematic code. This
1538 // could result in unintended perf regressions. A better overall strategy
1539 // is needed to track stack usage during inlining.
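      // For example (illustrative IR), for
      //   %buf = alloca i8, i32 %n
      // where %n simplifies to a constant once call arguments are mapped in,
      // the alloca would become static after inlining; AllocatedSize tracks the
      // total and HasDynamicAlloca is set once it exceeds the limit below.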
1540 Type *Ty = I.getAllocatedType();
1541 AllocatedSize = SaturatingMultiplyAdd(
1542 X: AllocSize->getLimitedValue(),
1543 Y: DL.getTypeAllocSize(Ty).getKnownMinValue(), A: AllocatedSize);
1544 if (AllocatedSize > InlineConstants::MaxSimplifiedDynamicAllocaToInline)
1545 HasDynamicAlloca = true;
1546 return false;
1547 }
1548 }
1549
1550 // Accumulate the allocated size.
1551 if (I.isStaticAlloca()) {
1552 Type *Ty = I.getAllocatedType();
1553 AllocatedSize = SaturatingAdd(X: DL.getTypeAllocSize(Ty).getKnownMinValue(),
1554 Y: AllocatedSize);
1555 }
1556
1557 // FIXME: This is overly conservative. Dynamic allocas are inefficient for
1558 // a variety of reasons, and so we would like to not inline them into
1559 // functions which don't currently have a dynamic alloca. This simply
1560 // disables inlining altogether in the presence of a dynamic alloca.
1561 if (!I.isStaticAlloca())
1562 HasDynamicAlloca = true;
1563
1564 return false;
1565}
1566
1567bool CallAnalyzer::visitPHI(PHINode &I) {
1568 // FIXME: We need to propagate SROA *disabling* through phi nodes, even
1569 // though we don't want to propagate its bonuses. The idea is to disable
1570 // SROA if it *might* be used in an inappropriate manner.
1571
1572 // Phi nodes are always zero-cost.
1573 // FIXME: Pointer sizes may differ between address spaces, so should we use
1574 // the correct address space in the call to getPointerSizeInBits here? Or
1575 // could we skip the getPointerSizeInBits call completely? As far as we can
1576 // see, ZeroOffset is only used as a dummy value, so any bit width would
1577 // probably do.
1578 APInt ZeroOffset = APInt::getZero(numBits: DL.getPointerSizeInBits(AS: 0));
1579 bool CheckSROA = I.getType()->isPointerTy();
1580
1581 // Track the constant or pointer with constant offset we've seen so far.
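  // For example (illustrative IR), a phi whose live incoming values are all
  // the same constant, such as
  //   %p = phi i32 [ 7, %bb1 ], [ 7, %bb2 ]
  // is recorded in SimplifiedValues as that constant below, while a phi of
  // pointers that all share one base and constant offset is recorded in
  // ConstantOffsetPtrs instead.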
1582 Constant *FirstC = nullptr;
1583 std::pair<Value *, APInt> FirstBaseAndOffset = {nullptr, ZeroOffset};
1584 Value *FirstV = nullptr;
1585
1586 for (unsigned i = 0, e = I.getNumIncomingValues(); i != e; ++i) {
1587 BasicBlock *Pred = I.getIncomingBlock(i);
1588 // If the incoming block is dead, skip the incoming block.
1589 if (DeadBlocks.count(Ptr: Pred))
1590 continue;
1591 // If the parent block of the phi is not the known successor of the incoming
1592 // block, skip the incoming block.
1593 BasicBlock *KnownSuccessor = KnownSuccessors[Pred];
1594 if (KnownSuccessor && KnownSuccessor != I.getParent())
1595 continue;
1596
1597 Value *V = I.getIncomingValue(i);
1598 // If the incoming value is this phi itself, skip the incoming value.
1599 if (&I == V)
1600 continue;
1601
1602 Constant *C = getDirectOrSimplifiedValue<Constant>(V);
1603
1604 std::pair<Value *, APInt> BaseAndOffset = {nullptr, ZeroOffset};
1605 if (!C && CheckSROA)
1606 BaseAndOffset = ConstantOffsetPtrs.lookup(Val: V);
1607
1608 if (!C && !BaseAndOffset.first)
1609 // The incoming value is neither a constant nor a pointer with constant
1610 // offset, exit early.
1611 return true;
1612
1613 if (FirstC) {
1614 if (FirstC == C)
1615 // If we've seen a constant incoming value before and it is the same
1616 // constant we see this time, continue checking the next incoming value.
1617 continue;
1618 // Otherwise exit early: either we see a different constant this time, or we
1619 // saw a constant before but now have a pointer with a constant offset.
1620 return true;
1621 }
1622
1623 if (FirstV) {
1624 // The same logic as above, but check pointer with constant offset here.
1625 if (FirstBaseAndOffset == BaseAndOffset)
1626 continue;
1627 return true;
1628 }
1629
1630 if (C) {
1631 // This is the first time we've seen a constant; record it.
1632 FirstC = C;
1633 continue;
1634 }
1635
1636 // The remaining case is that this is the first time we've seen a pointer
1637 // with a constant offset; record it.
1638 FirstV = V;
1639 FirstBaseAndOffset = BaseAndOffset;
1640 }
1641
1642 // Check if we can map phi to a constant.
1643 if (FirstC) {
1644 SimplifiedValues[&I] = FirstC;
1645 return true;
1646 }
1647
1648 // Check if we can map phi to a pointer with constant offset.
1649 if (FirstBaseAndOffset.first) {
1650 ConstantOffsetPtrs[&I] = FirstBaseAndOffset;
1651
1652 if (auto *SROAArg = getSROAArgForValueOrNull(V: FirstV))
1653 SROAArgValues[&I] = SROAArg;
1654 }
1655
1656 return true;
1657}
1658
1659/// Check whether we can fold GEPs of constant-offset call site argument pointers.
1660/// This requires target data and inbounds GEPs.
1661///
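/// For example (illustrative), if the pointer operand is already known to be
/// some base 'Arg' plus 8 and this GEP adds a further constant 4, the GEP is
/// recorded in ConstantOffsetPtrs as 'Arg' plus 12.
///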
1662/// \return true if the specified GEP can be folded.
1663bool CallAnalyzer::canFoldInboundsGEP(GetElementPtrInst &I) {
1664 // Check if we have a base + offset for the pointer.
1665 std::pair<Value *, APInt> BaseAndOffset =
1666 ConstantOffsetPtrs.lookup(Val: I.getPointerOperand());
1667 if (!BaseAndOffset.first)
1668 return false;
1669
1670 // Check if the offset of this GEP is constant, and if so accumulate it
1671 // into Offset.
1672 if (!accumulateGEPOffset(GEP&: cast<GEPOperator>(Val&: I), Offset&: BaseAndOffset.second))
1673 return false;
1674
1675 // Add the result as a new mapping to Base + Offset.
1676 ConstantOffsetPtrs[&I] = BaseAndOffset;
1677
1678 return true;
1679}
1680
1681bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) {
1682 auto *SROAArg = getSROAArgForValueOrNull(V: I.getPointerOperand());
1683
1684 // Lambda to check whether a GEP's indices are all constant.
1685 auto IsGEPOffsetConstant = [&](GetElementPtrInst &GEP) {
1686 for (const Use &Op : GEP.indices())
1687 if (!getDirectOrSimplifiedValue<Constant>(V: Op))
1688 return false;
1689 return true;
1690 };
1691
1692 if (!DisableGEPConstOperand)
1693 if (simplifyInstruction(I))
1694 return true;
1695
1696 if ((I.isInBounds() && canFoldInboundsGEP(I)) || IsGEPOffsetConstant(I)) {
1697 if (SROAArg)
1698 SROAArgValues[&I] = SROAArg;
1699
1700 // Constant GEPs are modeled as free.
1701 return true;
1702 }
1703
1704 // Variable GEPs will require math and will disable SROA.
1705 if (SROAArg)
1706 disableSROAForArg(SROAArg);
1707 return isGEPFree(GEP&: I);
1708}
1709
1710// Simplify \p Cmp if its RHS is constant and its LHS can be value-tracked.
1711// This handles only the case where the Cmp instruction guards a recursive
1712// call whose argument makes the Cmp succeed or fail for that recursive call.
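// For example (illustrative, C-like pseudocode):
//   int f(int x) { if (x > 0) return f(0) + 1; return 0; }
// When analyzing the recursive callsite f(0), evaluating the guarding compare
// 'x > 0' with the call argument 0 shows the branch into the recursive call
// would not be taken again, i.e. the effective recursion depth is 1.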
1713bool CallAnalyzer::simplifyCmpInstForRecCall(CmpInst &Cmp) {
1714 // Bail out if LHS is not a function argument or RHS is not a constant:
1715 if (!isa<Argument>(Val: Cmp.getOperand(i_nocapture: 0)) || !isa<Constant>(Val: Cmp.getOperand(i_nocapture: 1)))
1716 return false;
1717 auto *CmpOp = Cmp.getOperand(i_nocapture: 0);
1718 // Make sure that the callsite is recursive:
1719 if (CandidateCall.getCaller() != &F)
1720 return false;
1721 // Only handle the case when the callsite has a single predecessor:
1722 auto *CallBB = CandidateCall.getParent();
1723 auto *Predecessor = CallBB->getSinglePredecessor();
1724 if (!Predecessor)
1725 return false;
1726 // Check if the callsite is guarded by the same Cmp instruction:
1727 auto *Br = dyn_cast<BranchInst>(Val: Predecessor->getTerminator());
1728 if (!Br || Br->isUnconditional() || Br->getCondition() != &Cmp)
1729 return false;
1730
1731 // Check whether any argument of the recursive callsite affects the cmp
1732 // instruction:
1733 bool ArgFound = false;
1734 Value *FuncArg = nullptr, *CallArg = nullptr;
1735 for (unsigned ArgNum = 0;
1736 ArgNum < F.arg_size() && ArgNum < CandidateCall.arg_size(); ArgNum++) {
1737 FuncArg = F.getArg(i: ArgNum);
1738 CallArg = CandidateCall.getArgOperand(i: ArgNum);
1739 if (FuncArg == CmpOp && CallArg != CmpOp) {
1740 ArgFound = true;
1741 break;
1742 }
1743 }
1744 if (!ArgFound)
1745 return false;
1746
1747 // Now we have a recursive call that is guarded by a cmp instruction.
1748 // Check if this cmp can be simplified:
1749 SimplifyQuery SQ(DL, dyn_cast<Instruction>(Val: CallArg));
1750 CondContext CC(&Cmp);
1751 CC.Invert = (CallBB != Br->getSuccessor(i: 0));
1752 SQ.CC = &CC;
1753 CC.AffectedValues.insert(Ptr: FuncArg);
1754 Value *SimplifiedInstruction = llvm::simplifyInstructionWithOperands(
1755 I: cast<CmpInst>(Val: &Cmp), NewOps: {CallArg, Cmp.getOperand(i_nocapture: 1)}, Q: SQ);
1756 if (auto *ConstVal = dyn_cast_or_null<ConstantInt>(Val: SimplifiedInstruction)) {
1757 // Make sure that the BB of the recursive call is NOT the true successor
1758 // of the icmp. In other words, make sure that the recursion depth is 1.
1759 if ((ConstVal->isOne() && CC.Invert) ||
1760 (ConstVal->isZero() && !CC.Invert)) {
1761 SimplifiedValues[&Cmp] = ConstVal;
1762 return true;
1763 }
1764 }
1765 return false;
1766}
1767
1768/// Simplify \p I if its operands are constants and update SimplifiedValues.
1769bool CallAnalyzer::simplifyInstruction(Instruction &I) {
1770 SmallVector<Constant *> COps;
1771 for (Value *Op : I.operands()) {
1772 Constant *COp = getDirectOrSimplifiedValue<Constant>(V: Op);
1773 if (!COp)
1774 return false;
1775 COps.push_back(Elt: COp);
1776 }
1777 auto *C = ConstantFoldInstOperands(I: &I, Ops: COps, DL);
1778 if (!C)
1779 return false;
1780 SimplifiedValues[&I] = C;
1781 return true;
1782}
1783
1784/// Try to simplify a call to llvm.is.constant.
1785///
1786/// Duplicate the argument checking from CallAnalyzer::simplifyCallSite since
1787/// we expect calls of this specific intrinsic to be infrequent.
1788///
1789/// FIXME: Given that we know CB's parent (F) caller
1790/// (CandidateCall->getParent()->getParent()), we might be able to determine
1791/// whether inlining F into F's caller would change how the call to
1792/// llvm.is.constant would evaluate.
1793bool CallAnalyzer::simplifyIntrinsicCallIsConstant(CallBase &CB) {
1794 Value *Arg = CB.getArgOperand(i: 0);
1795 auto *C = getDirectOrSimplifiedValue<Constant>(V: Arg);
1796
1797 Type *RT = CB.getFunctionType()->getReturnType();
1798 SimplifiedValues[&CB] = ConstantInt::get(Ty: RT, V: C ? 1 : 0);
1799 return true;
1800}
1801
1802bool CallAnalyzer::simplifyIntrinsicCallObjectSize(CallBase &CB) {
1803 // As per the langref, "The fourth argument to llvm.objectsize determines if
1804 // the value should be evaluated at runtime."
1805 if (cast<ConstantInt>(Val: CB.getArgOperand(i: 3))->isOne())
1806 return false;
1807
1808 Value *V = lowerObjectSizeCall(ObjectSize: &cast<IntrinsicInst>(Val&: CB), DL, TLI: nullptr,
1809 /*MustSucceed=*/true);
1810 Constant *C = dyn_cast_or_null<Constant>(Val: V);
1811 if (C)
1812 SimplifiedValues[&CB] = C;
1813 return C;
1814}
1815
1816bool CallAnalyzer::visitBitCast(BitCastInst &I) {
1817 // Propagate constants through bitcasts.
1818 if (simplifyInstruction(I))
1819 return true;
1820
1821 // Track base/offsets through casts
1822 std::pair<Value *, APInt> BaseAndOffset =
1823 ConstantOffsetPtrs.lookup(Val: I.getOperand(i_nocapture: 0));
1824 // Casts don't change the offset, just wrap it up.
1825 if (BaseAndOffset.first)
1826 ConstantOffsetPtrs[&I] = BaseAndOffset;
1827
1828 // Also look for SROA candidates here.
1829 if (auto *SROAArg = getSROAArgForValueOrNull(V: I.getOperand(i_nocapture: 0)))
1830 SROAArgValues[&I] = SROAArg;
1831
1832 // Bitcasts are always zero cost.
1833 return true;
1834}
1835
1836bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) {
1837 // Propagate constants through ptrtoint.
1838 if (simplifyInstruction(I))
1839 return true;
1840
1841 // Track base/offset pairs when converted to a plain integer provided the
1842 // integer is large enough to represent the pointer.
1843 unsigned IntegerSize = I.getType()->getScalarSizeInBits();
1844 unsigned AS = I.getOperand(i_nocapture: 0)->getType()->getPointerAddressSpace();
1845 if (IntegerSize == DL.getPointerSizeInBits(AS)) {
1846 std::pair<Value *, APInt> BaseAndOffset =
1847 ConstantOffsetPtrs.lookup(Val: I.getOperand(i_nocapture: 0));
1848 if (BaseAndOffset.first)
1849 ConstantOffsetPtrs[&I] = BaseAndOffset;
1850 }
1851
1852 // This is really weird. Technically, ptrtoint will disable SROA. However,
1853 // unless that ptrtoint is *used* somewhere in the live basic blocks after
1854 // inlining, it will be nuked, and SROA should proceed. All of the uses which
1855 // would block SROA would also block SROA if applied directly to a pointer,
1856 // and so we can just add the integer in here. The only places where SROA is
1857 // preserved either cannot fire on an integer, or won't in-and-of themselves
1858 // disable SROA (ext) w/o some later use that we would see and disable.
1859 if (auto *SROAArg = getSROAArgForValueOrNull(V: I.getOperand(i_nocapture: 0)))
1860 SROAArgValues[&I] = SROAArg;
1861
1862 return TTI.getInstructionCost(U: &I, CostKind: TargetTransformInfo::TCK_SizeAndLatency) ==
1863 TargetTransformInfo::TCC_Free;
1864}
1865
1866bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) {
1867 // Propagate constants through inttoptr.
1868 if (simplifyInstruction(I))
1869 return true;
1870
1871 // Track base/offset pairs when round-tripped through a pointer without
1872 // modifications provided the integer is not too large.
1873 Value *Op = I.getOperand(i_nocapture: 0);
1874 unsigned IntegerSize = Op->getType()->getScalarSizeInBits();
1875 if (IntegerSize <= DL.getPointerTypeSizeInBits(I.getType())) {
1876 std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Val: Op);
1877 if (BaseAndOffset.first)
1878 ConstantOffsetPtrs[&I] = BaseAndOffset;
1879 }
1880
1881 // "Propagate" SROA here in the same manner as we do for ptrtoint above.
1882 if (auto *SROAArg = getSROAArgForValueOrNull(V: Op))
1883 SROAArgValues[&I] = SROAArg;
1884
1885 return TTI.getInstructionCost(U: &I, CostKind: TargetTransformInfo::TCK_SizeAndLatency) ==
1886 TargetTransformInfo::TCC_Free;
1887}
1888
1889bool CallAnalyzer::visitCastInst(CastInst &I) {
1890 // Propagate constants through casts.
1891 if (simplifyInstruction(I))
1892 return true;
1893
1894 // Disable SROA in the face of arbitrary casts we don't explicitly list
1895 // elsewhere.
1896 disableSROA(V: I.getOperand(i_nocapture: 0));
1897
1898 // If this is a floating-point cast, and the target says this operation
1899 // is expensive, this may eventually become a library call. Treat the cost
1900 // as such.
1901 switch (I.getOpcode()) {
1902 case Instruction::FPTrunc:
1903 case Instruction::FPExt:
1904 case Instruction::UIToFP:
1905 case Instruction::SIToFP:
1906 case Instruction::FPToUI:
1907 case Instruction::FPToSI:
1908 if (TTI.getFPOpCost(Ty: I.getType()) == TargetTransformInfo::TCC_Expensive)
1909 onCallPenalty();
1910 break;
1911 default:
1912 break;
1913 }
1914
1915 return TTI.getInstructionCost(U: &I, CostKind: TargetTransformInfo::TCK_SizeAndLatency) ==
1916 TargetTransformInfo::TCC_Free;
1917}
1918
1919bool CallAnalyzer::paramHasAttr(Argument *A, Attribute::AttrKind Attr) {
1920 return CandidateCall.paramHasAttr(ArgNo: A->getArgNo(), Kind: Attr);
1921}
1922
1923bool CallAnalyzer::isKnownNonNullInCallee(Value *V) {
1924 // Does the *call site* have the NonNull attribute set on an argument? We
1925 // use the attribute on the call site to memoize any analysis done in the
1926 // caller. This will also trip if the callee function has a non-null
1927 // parameter attribute, but that's a less interesting case because hopefully
1928 // the callee would already have been simplified based on that.
1929 if (Argument *A = dyn_cast<Argument>(Val: V))
1930 if (paramHasAttr(A, Attr: Attribute::NonNull))
1931 return true;
1932
1933 // Is this an alloca in the caller? This is distinct from the attribute case
1934 // above because attributes aren't updated within the inliner itself and we
1935 // always want to catch the alloca derived case.
1936 if (isAllocaDerivedArg(V))
1937 // We can actually predict the result of comparisons between an
1938 // alloca-derived value and null. Note that this fires regardless of
1939 // SROA firing.
1940 return true;
1941
1942 return false;
1943}
1944
1945bool CallAnalyzer::allowSizeGrowth(CallBase &Call) {
1946 // If the normal destination of the invoke or the parent block of the call
1947 // site is unreachable-terminated, there is little point in inlining this
1948 // unless there is literally zero cost.
1949 // FIXME: Note that it is possible that an unreachable-terminated block has a
1950 // hot entry. For example, in the scenario below, inlining hot_call_X() may be
1951 // beneficial:
1952 // main() {
1953 // hot_call_1();
1954 // ...
1955 // hot_call_N()
1956 // exit(0);
1957 // }
1958 // For now, we are not handling this corner case here as it is rare in real
1959 // code. In future, we should elaborate this based on BPI and BFI in more
1960 // general threshold adjusting heuristics in updateThreshold().
1961 if (InvokeInst *II = dyn_cast<InvokeInst>(Val: &Call)) {
1962 if (isa<UnreachableInst>(Val: II->getNormalDest()->getTerminator()))
1963 return false;
1964 } else if (isa<UnreachableInst>(Val: Call.getParent()->getTerminator()))
1965 return false;
1966
1967 return true;
1968}
1969
1970bool InlineCostCallAnalyzer::isColdCallSite(CallBase &Call,
1971 BlockFrequencyInfo *CallerBFI) {
1972 // If global profile summary is available, then callsite's coldness is
1973 // determined based on that.
1974 if (PSI && PSI->hasProfileSummary())
1975 return PSI->isColdCallSite(CB: Call, BFI: CallerBFI);
1976
1977 // Otherwise we need BFI to be available.
1978 if (!CallerBFI)
1979 return false;
1980
1981 // Determine if the callsite is cold relative to caller's entry. We could
1982 // potentially cache the computation of scaled entry frequency, but the added
1983 // complexity is not worth it unless this scaling shows up high in the
1984 // profiles.
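  // For example (illustrative; the option's actual default may differ), with
  // ColdCallSiteRelFreq = 2 a callsite executing at less than 2% of the
  // caller's entry frequency is classified as cold.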
1985 const BranchProbability ColdProb(ColdCallSiteRelFreq, 100);
1986 auto CallSiteBB = Call.getParent();
1987 auto CallSiteFreq = CallerBFI->getBlockFreq(BB: CallSiteBB);
1988 auto CallerEntryFreq =
1989 CallerBFI->getBlockFreq(BB: &(Call.getCaller()->getEntryBlock()));
1990 return CallSiteFreq < CallerEntryFreq * ColdProb;
1991}
1992
1993std::optional<int>
1994InlineCostCallAnalyzer::getHotCallSiteThreshold(CallBase &Call,
1995 BlockFrequencyInfo *CallerBFI) {
1996
1997 // If global profile summary is available, then callsite's hotness is
1998 // determined based on that.
1999 if (PSI && PSI->hasProfileSummary() && PSI->isHotCallSite(CB: Call, BFI: CallerBFI))
2000 return Params.HotCallSiteThreshold;
2001
2002 // Otherwise we need BFI to be available and to have a locally hot callsite
2003 // threshold.
2004 if (!CallerBFI || !Params.LocallyHotCallSiteThreshold)
2005 return std::nullopt;
2006
2007 // Determine if the callsite is hot relative to caller's entry. We could
2008 // potentially cache the computation of scaled entry frequency, but the added
2009 // complexity is not worth it unless this scaling shows up high in the
2010 // profiles.
2011 const BasicBlock *CallSiteBB = Call.getParent();
2012 BlockFrequency CallSiteFreq = CallerBFI->getBlockFreq(BB: CallSiteBB);
2013 BlockFrequency CallerEntryFreq = CallerBFI->getEntryFreq();
2014 std::optional<BlockFrequency> Limit = CallerEntryFreq.mul(Factor: HotCallSiteRelFreq);
2015 if (Limit && CallSiteFreq >= *Limit)
2016 return Params.LocallyHotCallSiteThreshold;
2017
2018 // Otherwise treat it normally.
2019 return std::nullopt;
2020}
2021
2022void InlineCostCallAnalyzer::updateThreshold(CallBase &Call, Function &Callee) {
2023 // If no size growth is allowed for this inlining, set Threshold to 0.
2024 if (!allowSizeGrowth(Call)) {
2025 Threshold = 0;
2026 return;
2027 }
2028
2029 Function *Caller = Call.getCaller();
2030
2031 // return min(A, B) if B is valid.
2032 auto MinIfValid = [](int A, std::optional<int> B) {
2033 return B ? std::min(a: A, b: *B) : A;
2034 };
2035
2036 // return max(A, B) if B is valid.
2037 auto MaxIfValid = [](int A, std::optional<int> B) {
2038 return B ? std::max(a: A, b: *B) : A;
2039 };
2040
2041 // Various bonus percentages. These are multiplied by Threshold to get the
2042 // bonus values.
2043 // SingleBBBonus: This bonus is applied if the callee has a single reachable
2044 // basic block at the given callsite context. This is speculatively applied
2045 // and withdrawn if more than one basic block is seen.
2046 //
2047 // LastCallToStaticBonus: This large bonus is applied to ensure the inlining
2048 // of the last call to a static function as inlining such functions is
2049 // guaranteed to reduce code size.
2050 //
2051 // These bonus percentages may be set to 0 based on properties of the caller
2052 // and the callsite.
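  // As a rough illustration (ignoring the target adjustments applied below):
  // with Threshold = 225 and SingleBBBonusPercent = 50, SingleBBBonus becomes
  // 225 * 50 / 100 = 112, and that bonus is withdrawn if the callee turns out
  // to have more than one reachable basic block.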
2053 int SingleBBBonusPercent = 50;
2054 int VectorBonusPercent = TTI.getInlinerVectorBonusPercent();
2055 int LastCallToStaticBonus = TTI.getInliningLastCallToStaticBonus();
2056
2057 // Lambda to set all the above bonus and bonus percentages to 0.
2058 auto DisallowAllBonuses = [&]() {
2059 SingleBBBonusPercent = 0;
2060 VectorBonusPercent = 0;
2061 LastCallToStaticBonus = 0;
2062 };
2063
2064 // Use the OptMinSizeThreshold or OptSizeThreshold knob if they are available
2065 // and reduce the threshold if the caller has the necessary attribute.
2066 if (Caller->hasMinSize()) {
2067 Threshold = MinIfValid(Threshold, Params.OptMinSizeThreshold);
2068 // For minsize, we want to disable the single BB bonus and the vector
2069 // bonuses, but not the last-call-to-static bonus. Inlining the last call to
2070 // a static function will, at the minimum, eliminate the parameter setup and
2071 // call/return instructions.
2072 SingleBBBonusPercent = 0;
2073 VectorBonusPercent = 0;
2074 } else if (Caller->hasOptSize())
2075 Threshold = MinIfValid(Threshold, Params.OptSizeThreshold);
2076
2077 // Adjust the threshold based on inlinehint attribute and profile based
2078 // hotness information if the caller does not have MinSize attribute.
2079 if (!Caller->hasMinSize()) {
2080 if (Callee.hasFnAttribute(Kind: Attribute::InlineHint))
2081 Threshold = MaxIfValid(Threshold, Params.HintThreshold);
2082
2083 // FIXME: After switching to the new passmanager, simplify the logic below
2084 // by checking only the callsite hotness/coldness as we will reliably
2085 // have local profile information.
2086 //
2087 // Callsite hotness and coldness can be determined if sample profile is
2088 // used (which adds hotness metadata to calls) or if caller's
2089 // BlockFrequencyInfo is available.
2090 BlockFrequencyInfo *CallerBFI = GetBFI ? &(GetBFI(*Caller)) : nullptr;
2091 auto HotCallSiteThreshold = getHotCallSiteThreshold(Call, CallerBFI);
2092 if (!Caller->hasOptSize() && HotCallSiteThreshold) {
2093 LLVM_DEBUG(dbgs() << "Hot callsite.\n");
2094 // FIXME: This should update the threshold only if it exceeds the
2095 // current threshold, but AutoFDO + ThinLTO currently relies on this
2096 // behavior to prevent inlining of hot callsites during ThinLTO
2097 // compile phase.
2098 Threshold = *HotCallSiteThreshold;
2099 } else if (isColdCallSite(Call, CallerBFI)) {
2100 LLVM_DEBUG(dbgs() << "Cold callsite.\n");
2101 // Do not apply bonuses for a cold callsite including the
2102 // LastCallToStatic bonus. While this bonus might result in code size
2103 // reduction, it can cause the size of a non-cold caller to increase
2104 // preventing it from being inlined.
2105 DisallowAllBonuses();
2106 Threshold = MinIfValid(Threshold, Params.ColdCallSiteThreshold);
2107 } else if (PSI) {
2108 // Use callee's global profile information only if we have no way of
2109 // determining this via callsite information.
2110 if (PSI->isFunctionEntryHot(F: &Callee)) {
2111 LLVM_DEBUG(dbgs() << "Hot callee.\n");
2112 // If callsite hotness cannot be determined, we may still know
2113 // that the callee is hot and treat it as a weaker hint for threshold
2114 // increase.
2115 Threshold = MaxIfValid(Threshold, Params.HintThreshold);
2116 } else if (PSI->isFunctionEntryCold(F: &Callee)) {
2117 LLVM_DEBUG(dbgs() << "Cold callee.\n");
2118 // Do not apply bonuses for a cold callee including the
2119 // LastCallToStatic bonus. While this bonus might result in code size
2120 // reduction, it can cause the size of a non-cold caller to increase
2121 // preventing it from being inlined.
2122 DisallowAllBonuses();
2123 Threshold = MinIfValid(Threshold, Params.ColdThreshold);
2124 }
2125 }
2126 }
2127
2128 Threshold += TTI.adjustInliningThreshold(CB: &Call);
2129
2130 // Finally, take the target-specific inlining threshold multiplier into
2131 // account.
2132 Threshold *= TTI.getInliningThresholdMultiplier();
2133
2134 SingleBBBonus = Threshold * SingleBBBonusPercent / 100;
2135 VectorBonus = Threshold * VectorBonusPercent / 100;
2136
2137 // If there is only one call of the function, and it has internal linkage,
2138 // the cost of inlining it drops dramatically. It may seem odd to update
2139 // Cost in updateThreshold, but the bonus depends on the logic in this method.
2140 if (isSoleCallToLocalFunction(CB: Call, Callee: F)) {
2141 Cost -= LastCallToStaticBonus;
2142 StaticBonusApplied = LastCallToStaticBonus;
2143 }
2144}
2145
2146bool CallAnalyzer::visitCmpInst(CmpInst &I) {
2147 Value *LHS = I.getOperand(i_nocapture: 0), *RHS = I.getOperand(i_nocapture: 1);
2148 // First try to handle simplified comparisons.
2149 if (simplifyInstruction(I))
2150 return true;
2151
2152 // Try to handle comparison that can be simplified using ValueTracking.
2153 if (simplifyCmpInstForRecCall(Cmp&: I))
2154 return true;
2155
2156 if (I.getOpcode() == Instruction::FCmp)
2157 return false;
2158
2159 // Otherwise look for a comparison between constant offset pointers with
2160 // a common base.
2161 Value *LHSBase, *RHSBase;
2162 APInt LHSOffset, RHSOffset;
2163 std::tie(args&: LHSBase, args&: LHSOffset) = ConstantOffsetPtrs.lookup(Val: LHS);
2164 if (LHSBase) {
2165 std::tie(args&: RHSBase, args&: RHSOffset) = ConstantOffsetPtrs.lookup(Val: RHS);
2166 if (RHSBase && LHSBase == RHSBase) {
2167 // We have common bases, fold the icmp to a constant based on the
2168 // offsets.
2169 SimplifiedValues[&I] = ConstantInt::getBool(
2170 Ty: I.getType(),
2171 V: ICmpInst::compare(LHS: LHSOffset, RHS: RHSOffset, Pred: I.getPredicate()));
2172 ++NumConstantPtrCmps;
2173 return true;
2174 }
2175 }
2176
2177 auto isImplicitNullCheckCmp = [](const CmpInst &I) {
2178 for (auto *User : I.users())
2179 if (auto *Instr = dyn_cast<Instruction>(Val: User))
2180 if (!Instr->getMetadata(KindID: LLVMContext::MD_make_implicit))
2181 return false;
2182 return true;
2183 };
2184
2185 // If the comparison is an equality comparison with null, we can simplify it
2186 // if we know the value (argument) can't be null.
2187 if (I.isEquality() && isa<ConstantPointerNull>(Val: I.getOperand(i_nocapture: 1))) {
2188 if (isKnownNonNullInCallee(V: I.getOperand(i_nocapture: 0))) {
2189 bool IsNotEqual = I.getPredicate() == CmpInst::ICMP_NE;
2190 SimplifiedValues[&I] = IsNotEqual ? ConstantInt::getTrue(Ty: I.getType())
2191 : ConstantInt::getFalse(Ty: I.getType());
2192 return true;
2193 }
2194 // Implicit null checks act as unconditional branches and their comparisons
2195 // should be treated as simplified and free of cost.
2196 if (isImplicitNullCheckCmp(I))
2197 return true;
2198 }
2199 return handleSROA(V: I.getOperand(i_nocapture: 0), DoNotDisable: isa<ConstantPointerNull>(Val: I.getOperand(i_nocapture: 1)));
2200}
2201
2202bool CallAnalyzer::visitSub(BinaryOperator &I) {
2203 // Try to handle a special case: we can fold computing the difference of two
2204 // constant-related pointers.
2205 Value *LHS = I.getOperand(i_nocapture: 0), *RHS = I.getOperand(i_nocapture: 1);
2206 Value *LHSBase, *RHSBase;
2207 APInt LHSOffset, RHSOffset;
2208 std::tie(args&: LHSBase, args&: LHSOffset) = ConstantOffsetPtrs.lookup(Val: LHS);
2209 if (LHSBase) {
2210 std::tie(args&: RHSBase, args&: RHSOffset) = ConstantOffsetPtrs.lookup(Val: RHS);
2211 if (RHSBase && LHSBase == RHSBase) {
2212 // We have common bases, fold the subtract to a constant based on the
2213 // offsets.
2214 Constant *CLHS = ConstantInt::get(Context&: LHS->getContext(), V: LHSOffset);
2215 Constant *CRHS = ConstantInt::get(Context&: RHS->getContext(), V: RHSOffset);
2216 if (Constant *C = ConstantExpr::getSub(C1: CLHS, C2: CRHS)) {
2217 SimplifiedValues[&I] = C;
2218 ++NumConstantPtrDiffs;
2219 return true;
2220 }
2221 }
2222 }
2223
2224 // Otherwise, fall back to the generic logic for simplifying and handling
2225 // instructions.
2226 return Base::visitSub(I);
2227}
2228
2229bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) {
2230 Value *LHS = I.getOperand(i_nocapture: 0), *RHS = I.getOperand(i_nocapture: 1);
2231 Constant *CLHS = getDirectOrSimplifiedValue<Constant>(V: LHS);
2232 Constant *CRHS = getDirectOrSimplifiedValue<Constant>(V: RHS);
2233
2234 Value *SimpleV = nullptr;
2235 if (auto FI = dyn_cast<FPMathOperator>(Val: &I))
2236 SimpleV = simplifyBinOp(Opcode: I.getOpcode(), LHS: CLHS ? CLHS : LHS, RHS: CRHS ? CRHS : RHS,
2237 FMF: FI->getFastMathFlags(), Q: DL);
2238 else
2239 SimpleV =
2240 simplifyBinOp(Opcode: I.getOpcode(), LHS: CLHS ? CLHS : LHS, RHS: CRHS ? CRHS : RHS, Q: DL);
2241
2242 if (Constant *C = dyn_cast_or_null<Constant>(Val: SimpleV))
2243 SimplifiedValues[&I] = C;
2244
2245 if (SimpleV)
2246 return true;
2247
2248 // Disable any SROA on arguments to arbitrary, unsimplified binary operators.
2249 disableSROA(V: LHS);
2250 disableSROA(V: RHS);
2251
2252 // If the instruction is floating point, and the target says this operation
2253 // is expensive, this may eventually become a library call. Treat the cost
2254 // as such, unless it's fneg, which can be implemented with an xor.
2255 using namespace llvm::PatternMatch;
2256 if (I.getType()->isFloatingPointTy() &&
2257 TTI.getFPOpCost(Ty: I.getType()) == TargetTransformInfo::TCC_Expensive &&
2258 !match(V: &I, P: m_FNeg(X: m_Value())))
2259 onCallPenalty();
2260
2261 return false;
2262}
2263
2264bool CallAnalyzer::visitFNeg(UnaryOperator &I) {
2265 Value *Op = I.getOperand(i_nocapture: 0);
2266 Constant *COp = getDirectOrSimplifiedValue<Constant>(V: Op);
2267
2268 Value *SimpleV = simplifyFNegInst(
2269 Op: COp ? COp : Op, FMF: cast<FPMathOperator>(Val&: I).getFastMathFlags(), Q: DL);
2270
2271 if (Constant *C = dyn_cast_or_null<Constant>(Val: SimpleV))
2272 SimplifiedValues[&I] = C;
2273
2274 if (SimpleV)
2275 return true;
2276
2277 // Disable any SROA on arguments to arbitrary, unsimplified fneg.
2278 disableSROA(V: Op);
2279
2280 return false;
2281}
2282
2283bool CallAnalyzer::visitLoad(LoadInst &I) {
2284 if (handleSROA(V: I.getPointerOperand(), DoNotDisable: I.isSimple()))
2285 return true;
2286
2287 // If the data is already loaded from this address and hasn't been clobbered
2288 // by any stores or calls, this load is likely to be redundant and can be
2289 // eliminated.
2290 if (EnableLoadElimination &&
2291 !LoadAddrSet.insert(Ptr: I.getPointerOperand()).second && I.isUnordered()) {
2292 onLoadEliminationOpportunity();
2293 return true;
2294 }
2295
2296 onMemAccess();
2297 return false;
2298}
2299
2300bool CallAnalyzer::visitStore(StoreInst &I) {
2301 if (handleSROA(V: I.getPointerOperand(), DoNotDisable: I.isSimple()))
2302 return true;
2303
2304 // The store can potentially clobber loads and prevent repeated loads from
2305 // being eliminated.
2306 // FIXME:
2307 // 1. We can probably keep an initial set of eliminatable loads subtracted
2308 // from the cost even when we finally see a store. We just need to disable
2309 // *further* accumulation of elimination savings.
2310 // 2. We should probably at some point thread MemorySSA for the callee into
2311 // this and then use that to actually compute *really* precise savings.
2312 disableLoadElimination();
2313
2314 onMemAccess();
2315 return false;
2316}
2317
2318bool CallAnalyzer::visitExtractValue(ExtractValueInst &I) {
2319 Value *Op = I.getAggregateOperand();
2320
2321 // Special handling, because we want to simplify extractvalue with a
2322 // potential insertvalue from the caller.
2323 if (Value *SimpleOp = getSimplifiedValueUnchecked(V: Op)) {
2324 SimplifyQuery SQ(DL);
2325 Value *SimpleV = simplifyExtractValueInst(Agg: SimpleOp, Idxs: I.getIndices(), Q: SQ);
2326 if (SimpleV) {
2327 SimplifiedValues[&I] = SimpleV;
2328 return true;
2329 }
2330 }
2331
2332 // SROA can't look through these, but they may be free.
2333 return Base::visitExtractValue(I);
2334}
2335
2336bool CallAnalyzer::visitInsertValue(InsertValueInst &I) {
2337 // Constant folding for insert value is trivial.
2338 if (simplifyInstruction(I))
2339 return true;
2340
2341 // SROA can't look through these, but they may be free.
2342 return Base::visitInsertValue(I);
2343}
2344
2345/// Try to simplify a call site.
2346///
2347/// Takes a concrete function and callsite and tries to actually simplify it by
2348/// analyzing the arguments and call itself with instsimplify. Returns true if
2349/// it has simplified the callsite to some other entity (a constant), making it
2350/// free.
2351bool CallAnalyzer::simplifyCallSite(Function *F, CallBase &Call) {
2352 // FIXME: Using the instsimplify logic directly for this is inefficient
2353 // because we have to continually rebuild the argument list even when no
2354 // simplifications can be performed. Until that is fixed with remapping
2355 // inside of instsimplify, directly constant fold calls here.
2356 if (!canConstantFoldCallTo(Call: &Call, F))
2357 return false;
2358
2359 // Try to re-map the arguments to constants.
2360 SmallVector<Constant *, 4> ConstantArgs;
2361 ConstantArgs.reserve(N: Call.arg_size());
2362 for (Value *I : Call.args()) {
2363 Constant *C = getDirectOrSimplifiedValue<Constant>(V: I);
2364 if (!C)
2365 return false; // This argument doesn't map to a constant.
2366
2367 ConstantArgs.push_back(Elt: C);
2368 }
2369 if (Constant *C = ConstantFoldCall(Call: &Call, F, Operands: ConstantArgs)) {
2370 SimplifiedValues[&Call] = C;
2371 return true;
2372 }
2373
2374 return false;
2375}
2376
2377bool CallAnalyzer::isLoweredToCall(Function *F, CallBase &Call) {
2378 const TargetLibraryInfo *TLI = GetTLI ? &GetTLI(*F) : nullptr;
2379 LibFunc LF;
2380 if (!TLI || !TLI->getLibFunc(FDecl: *F, F&: LF) || !TLI->has(F: LF))
2381 return TTI.isLoweredToCall(F);
2382
2383 switch (LF) {
2384 case LibFunc_memcpy_chk:
2385 case LibFunc_memmove_chk:
2386 case LibFunc_mempcpy_chk:
2387 case LibFunc_memset_chk: {
2388 // Calls to __memcpy_chk whose length is known to fit within the object
2389 // size will eventually be replaced by inline stores. Therefore, these
2390 // should not incur a call penalty. This is only really relevant on
2391 // platforms whose headers redirect memcpy to __memcpy_chk (e.g. Darwin), as
2392 // other platforms use memcpy intrinsics, which are already exempt from the
2393 // call penalty.
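    // For example (illustrative), a call such as
    //   __memcpy_chk(dst, src, 8, 16)
    // has a known length (8) no larger than the known object size (16), so it
    // is expected to be lowered to inline stores rather than a real call.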
2394 auto *LenOp = getDirectOrSimplifiedValue<ConstantInt>(V: Call.getOperand(i_nocapture: 2));
2395 auto *ObjSizeOp =
2396 getDirectOrSimplifiedValue<ConstantInt>(V: Call.getOperand(i_nocapture: 3));
2397 if (LenOp && ObjSizeOp &&
2398 LenOp->getLimitedValue() <= ObjSizeOp->getLimitedValue()) {
2399 return false;
2400 }
2401 break;
2402 }
2403 default:
2404 break;
2405 }
2406
2407 return TTI.isLoweredToCall(F);
2408}
2409
2410bool CallAnalyzer::visitCallBase(CallBase &Call) {
2411 if (!onCallBaseVisitStart(Call))
2412 return true;
2413
2414 if (Call.hasFnAttr(Kind: Attribute::ReturnsTwice) &&
2415 !F.hasFnAttribute(Kind: Attribute::ReturnsTwice)) {
2416 // This aborts the entire analysis.
2417 ExposesReturnsTwice = true;
2418 return false;
2419 }
2420 if (isa<CallInst>(Val: Call) && cast<CallInst>(Val&: Call).cannotDuplicate())
2421 ContainsNoDuplicateCall = true;
2422
2423 Function *F = Call.getCalledFunction();
2424 bool IsIndirectCall = !F;
2425 if (IsIndirectCall) {
2426 // Check if this happens to be an indirect function call to a known function
2427 // in this inline context. If not, we've done all we can.
2428 Value *Callee = Call.getCalledOperand();
2429 F = getSimplifiedValue<Function>(V: Callee);
2430 if (!F || F->getFunctionType() != Call.getFunctionType()) {
2431 onCallArgumentSetup(Call);
2432
2433 if (!Call.onlyReadsMemory())
2434 disableLoadElimination();
2435 return Base::visitCallBase(I&: Call);
2436 }
2437 }
2438
2439 assert(F && "Expected a call to a known function");
2440
2441 // When we have a concrete function, first try to simplify it directly.
2442 if (simplifyCallSite(F, Call))
2443 return true;
2444
2445 // Next check if it is an intrinsic we know about.
2446 // FIXME: Lift this into part of the InstVisitor.
2447 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Val: &Call)) {
2448 switch (II->getIntrinsicID()) {
2449 default:
2450 if (!Call.onlyReadsMemory() && !isAssumeLikeIntrinsic(I: II))
2451 disableLoadElimination();
2452 return Base::visitCallBase(I&: Call);
2453
2454 case Intrinsic::load_relative:
2455 onLoadRelativeIntrinsic();
2456 return false;
2457
2458 case Intrinsic::memset:
2459 case Intrinsic::memcpy:
2460 case Intrinsic::memmove:
2461 disableLoadElimination();
2462 // SROA can usually chew through these intrinsics, but they aren't free.
2463 return false;
2464 case Intrinsic::icall_branch_funnel:
2465 case Intrinsic::localescape:
2466 HasUninlineableIntrinsic = true;
2467 return false;
2468 case Intrinsic::vastart:
2469 InitsVargArgs = true;
2470 return false;
2471 case Intrinsic::launder_invariant_group:
2472 case Intrinsic::strip_invariant_group:
2473 if (auto *SROAArg = getSROAArgForValueOrNull(V: II->getOperand(i_nocapture: 0)))
2474 SROAArgValues[II] = SROAArg;
2475 return true;
2476 case Intrinsic::is_constant:
2477 return simplifyIntrinsicCallIsConstant(CB&: Call);
2478 case Intrinsic::objectsize:
2479 return simplifyIntrinsicCallObjectSize(CB&: Call);
2480 }
2481 }
2482
2483 if (F == Call.getFunction()) {
2484 // This flag will fully abort the analysis, so don't bother with anything
2485 // else.
2486 IsRecursiveCall = true;
2487 if (!AllowRecursiveCall)
2488 return false;
2489 }
2490
2491 if (isLoweredToCall(F, Call)) {
2492 onLoweredCall(F, Call, IsIndirectCall);
2493 }
2494
2495 if (!(Call.onlyReadsMemory() || (IsIndirectCall && F->onlyReadsMemory())))
2496 disableLoadElimination();
2497 return Base::visitCallBase(I&: Call);
2498}
2499
2500bool CallAnalyzer::visitReturnInst(ReturnInst &RI) {
2501 // At least one return instruction will be free after inlining.
2502 bool Free = !HasReturn;
2503 HasReturn = true;
2504 return Free;
2505}
2506
2507bool CallAnalyzer::visitBranchInst(BranchInst &BI) {
2508 // We model unconditional branches as essentially free -- they really
2509 // shouldn't exist at all, but handling them makes the behavior of the
2510 // inliner more regular and predictable. Interestingly, conditional branches
2511 // which will fold away are also free.
2512 return BI.isUnconditional() ||
2513 getDirectOrSimplifiedValue<ConstantInt>(V: BI.getCondition()) ||
2514 BI.getMetadata(KindID: LLVMContext::MD_make_implicit);
2515}
2516
2517bool CallAnalyzer::visitSelectInst(SelectInst &SI) {
2518 bool CheckSROA = SI.getType()->isPointerTy();
2519 Value *TrueVal = SI.getTrueValue();
2520 Value *FalseVal = SI.getFalseValue();
2521
2522 Constant *TrueC = getDirectOrSimplifiedValue<Constant>(V: TrueVal);
2523 Constant *FalseC = getDirectOrSimplifiedValue<Constant>(V: FalseVal);
2524 Constant *CondC = getSimplifiedValue<Constant>(V: SI.getCondition());
2525
2526 if (!CondC) {
2527 // Select C, X, X => X
2528 if (TrueC == FalseC && TrueC) {
2529 SimplifiedValues[&SI] = TrueC;
2530 return true;
2531 }
2532
2533 if (!CheckSROA)
2534 return Base::visitSelectInst(I&: SI);
2535
2536 std::pair<Value *, APInt> TrueBaseAndOffset =
2537 ConstantOffsetPtrs.lookup(Val: TrueVal);
2538 std::pair<Value *, APInt> FalseBaseAndOffset =
2539 ConstantOffsetPtrs.lookup(Val: FalseVal);
2540 if (TrueBaseAndOffset == FalseBaseAndOffset && TrueBaseAndOffset.first) {
2541 ConstantOffsetPtrs[&SI] = TrueBaseAndOffset;
2542
2543 if (auto *SROAArg = getSROAArgForValueOrNull(V: TrueVal))
2544 SROAArgValues[&SI] = SROAArg;
2545 return true;
2546 }
2547
2548 return Base::visitSelectInst(I&: SI);
2549 }
2550
2551 // Select condition is a constant.
2552 Value *SelectedV = CondC->isAllOnesValue() ? TrueVal
2553 : (CondC->isNullValue()) ? FalseVal
2554 : nullptr;
2555 if (!SelectedV) {
2556 // Condition is a vector constant that is not all 1s or all 0s. If all
2557 // operands are constants, ConstantFoldSelectInstruction() can handle
2558 // cases such as vector selects.
2559 if (TrueC && FalseC) {
2560 if (auto *C = ConstantFoldSelectInstruction(Cond: CondC, V1: TrueC, V2: FalseC)) {
2561 SimplifiedValues[&SI] = C;
2562 return true;
2563 }
2564 }
2565 return Base::visitSelectInst(I&: SI);
2566 }
2567
2568 // Condition is either all 1s or all 0s. SI can be simplified.
2569 if (Constant *SelectedC = dyn_cast<Constant>(Val: SelectedV)) {
2570 SimplifiedValues[&SI] = SelectedC;
2571 return true;
2572 }
2573
2574 if (!CheckSROA)
2575 return true;
2576
2577 std::pair<Value *, APInt> BaseAndOffset =
2578 ConstantOffsetPtrs.lookup(Val: SelectedV);
2579 if (BaseAndOffset.first) {
2580 ConstantOffsetPtrs[&SI] = BaseAndOffset;
2581
2582 if (auto *SROAArg = getSROAArgForValueOrNull(V: SelectedV))
2583 SROAArgValues[&SI] = SROAArg;
2584 }
2585
2586 return true;
2587}
2588
2589bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) {
2590 // We model unconditional switches as free; see the comments on handling
2591 // branches.
2592 if (getDirectOrSimplifiedValue<ConstantInt>(V: SI.getCondition()))
2593 return true;
2594
2595 // Assume the most general case where the switch is lowered into
2596 // either a jump table, bit test, or a balanced binary tree consisting of
2597 // case clusters without merging adjacent clusters with the same
2598 // destination. We do not consider the switches that are lowered with a mix
2599 // of jump table/bit test/binary search tree. The cost of the switch is
2600 // proportional to the size of the tree or the size of jump table range.
2601 //
2602 // NB: We convert large switches which are just used to initialize large phi
2603 // nodes to lookup tables instead in simplifycfg, so this shouldn't prevent
2604 // inlining those. It will prevent inlining in cases where the optimization
2605 // does not (yet) fire.
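  // As a rough illustration: a switch over N sparse case clusters that is not
  // lowered to a jump table is modeled like a balanced binary search, i.e. on
  // the order of log2(N) compare-and-branch steps, which onFinalizeSwitch()
  // below turns into a cost.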
2606
2607 unsigned JumpTableSize = 0;
2608 BlockFrequencyInfo *BFI = GetBFI ? &(GetBFI(F)) : nullptr;
2609 unsigned NumCaseCluster =
2610 TTI.getEstimatedNumberOfCaseClusters(SI, JTSize&: JumpTableSize, PSI, BFI);
2611
2612 onFinalizeSwitch(JumpTableSize, NumCaseCluster, DefaultDestUnreachable: SI.defaultDestUnreachable());
2613 return false;
2614}
2615
2616bool CallAnalyzer::visitIndirectBrInst(IndirectBrInst &IBI) {
2617 // We never want to inline functions that contain an indirectbr. This is
2618 // incorrect because all the blockaddresses (in static global initializers
2619 // for example) would be referring to the original function, and this
2620 // indirect jump would jump from the inlined copy of the function into the
2621 // original function which is extremely undefined behavior.
2622 // FIXME: This logic isn't really right; we can safely inline functions with
2623 // indirectbr's as long as no other function or global references the
2624 // blockaddress of a block within the current function.
2625 HasIndirectBr = true;
2626 return false;
2627}
2628
2629bool CallAnalyzer::visitResumeInst(ResumeInst &RI) {
2630 // FIXME: It's not clear that a single instruction is an accurate model for
2631 // the inline cost of a resume instruction.
2632 return false;
2633}
2634
2635bool CallAnalyzer::visitCleanupReturnInst(CleanupReturnInst &CRI) {
2636 // FIXME: It's not clear that a single instruction is an accurate model for
2637 // the inline cost of a cleanupret instruction.
2638 return false;
2639}
2640
2641bool CallAnalyzer::visitCatchReturnInst(CatchReturnInst &CRI) {
2642 // FIXME: It's not clear that a single instruction is an accurate model for
2643 // the inline cost of a catchret instruction.
2644 return false;
2645}
2646
2647bool CallAnalyzer::visitUnreachableInst(UnreachableInst &I) {
2648 // FIXME: It might be reasonable to discount the cost of instructions leading
2649 // to unreachable as they have the lowest possible impact on both runtime and
2650 // code size.
2651 return true; // No actual code is needed for unreachable.
2652}
2653
2654bool CallAnalyzer::visitInstruction(Instruction &I) {
2655 // Some instructions are free. All of the free intrinsics can also be
2656 // handled by SROA, etc.
2657 if (TTI.getInstructionCost(U: &I, CostKind: TargetTransformInfo::TCK_SizeAndLatency) ==
2658 TargetTransformInfo::TCC_Free)
2659 return true;
2660
2661 // We found something we don't understand or can't handle. Mark any SROA-able
2662 // values in the operand list as no longer viable.
2663 for (const Use &Op : I.operands())
2664 disableSROA(V: Op);
2665
2666 return false;
2667}
2668
2669/// Analyze a basic block for its contribution to the inline cost.
2670///
2671/// This method walks the analyzer over every instruction in the given basic
2672/// block and accounts for their cost during inlining at this callsite. It
2673/// aborts early if the threshold has been exceeded or an impossible to inline
2674/// construct has been detected. It returns false if inlining is no longer
2675/// viable, and true if inlining remains viable.
2676InlineResult
2677CallAnalyzer::analyzeBlock(BasicBlock *BB,
2678 const SmallPtrSetImpl<const Value *> &EphValues) {
2679 for (Instruction &I : *BB) {
2680 // FIXME: Currently, the number of instructions in a function, regardless of
2681 // our ability to simplify them during inlining to constants or dead code, is
2682 // actually used by the vector bonus heuristic. As long as that's true,
2683 // we have to special case debug intrinsics here to prevent differences in
2684 // inlining due to debug symbols. Eventually, the number of unsimplified
2685 // instructions shouldn't factor into the cost computation, but until then,
2686 // hack around it here.
2687 // Similarly, skip pseudo-probes.
2688 if (I.isDebugOrPseudoInst())
2689 continue;
2690
2691 // Skip ephemeral values.
2692 if (EphValues.count(Ptr: &I))
2693 continue;
2694
2695 ++NumInstructions;
2696 if (isa<ExtractElementInst>(Val: I) || I.getType()->isVectorTy())
2697 ++NumVectorInstructions;
2698
2699 // If the instruction simplified to a constant, there is no cost to this
2700 // instruction. Visit the instructions using our InstVisitor to account for
2701 // all of the per-instruction logic. The visit tree returns true if we
2702 // consumed the instruction in any way, and false if the instruction's base
2703 // cost should count against inlining.
2704 onInstructionAnalysisStart(I: &I);
2705
2706 if (Base::visit(I: &I))
2707 ++NumInstructionsSimplified;
2708 else
2709 onMissedSimplification();
2710
2711 onInstructionAnalysisFinish(I: &I);
2712 using namespace ore;
2713 // If visiting this instruction detected an uninlinable pattern, abort.
2714 InlineResult IR = InlineResult::success();
2715 if (IsRecursiveCall && !AllowRecursiveCall)
2716 IR = InlineResult::failure(Reason: "recursive");
2717 else if (ExposesReturnsTwice)
2718 IR = InlineResult::failure(Reason: "exposes returns twice");
2719 else if (HasDynamicAlloca)
2720 IR = InlineResult::failure(Reason: "dynamic alloca");
2721 else if (HasIndirectBr)
2722 IR = InlineResult::failure(Reason: "indirect branch");
2723 else if (HasUninlineableIntrinsic)
2724 IR = InlineResult::failure(Reason: "uninlinable intrinsic");
2725 else if (InitsVargArgs)
2726 IR = InlineResult::failure(Reason: "varargs");
2727 if (!IR.isSuccess()) {
2728 if (ORE)
2729 ORE->emit(RemarkBuilder: [&]() {
2730 return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
2731 &CandidateCall)
2732 << NV("Callee", &F) << " has uninlinable pattern ("
2733 << NV("InlineResult", IR.getFailureReason())
2734 << ") and cost is not fully computed";
2735 });
2736 return IR;
2737 }
2738
2739 // If the caller is a recursive function then we don't want to inline
2740 // functions which allocate a lot of stack space because it would increase
2741 // the caller stack usage dramatically.
2742 if (IsCallerRecursive && AllocatedSize > RecurStackSizeThreshold) {
2743 auto IR =
2744 InlineResult::failure(Reason: "recursive and allocates too much stack space");
2745 if (ORE)
2746 ORE->emit(RemarkBuilder: [&]() {
2747 return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
2748 &CandidateCall)
2749 << NV("Callee", &F) << " is "
2750 << NV("InlineResult", IR.getFailureReason())
2751 << ". Cost is not fully computed";
2752 });
2753 return IR;
2754 }
2755
2756 if (shouldStop())
2757 return InlineResult::failure(
2758 Reason: "Call site analysis is not favorable to inlining.");
2759 }
2760
2761 return InlineResult::success();
2762}
2763
2764/// Compute the base pointer and cumulative constant offsets for V.
2765///
2766/// This strips all constant offsets off of V, leaving it the base pointer, and
2767/// accumulates the total constant offset applied in the returned constant. It
2768/// returns nullptr if V is not a pointer, and the constant '0' if there are
2769/// no constant offsets applied.
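///
/// For example (illustrative, assuming a 64-bit index type), given
///   %a = alloca [8 x i32]
///   %p = getelementptr inbounds [8 x i32], ptr %a, i64 0, i64 3
/// passing %p rewrites V to %a and returns the ConstantInt 12.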
2770ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) {
2771 if (!V->getType()->isPointerTy())
2772 return nullptr;
2773
2774 unsigned AS = V->getType()->getPointerAddressSpace();
2775 unsigned IntPtrWidth = DL.getIndexSizeInBits(AS);
2776 APInt Offset = APInt::getZero(numBits: IntPtrWidth);
2777
2778 // Even though we don't look through PHI nodes, we could be called on an
2779 // instruction in an unreachable block, which may be on a cycle.
2780 SmallPtrSet<Value *, 4> Visited;
2781 Visited.insert(Ptr: V);
2782 do {
2783 if (GEPOperator *GEP = dyn_cast<GEPOperator>(Val: V)) {
2784 if (!GEP->isInBounds() || !accumulateGEPOffset(GEP&: *GEP, Offset))
2785 return nullptr;
2786 V = GEP->getPointerOperand();
2787 } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(Val: V)) {
2788 if (GA->isInterposable())
2789 break;
2790 V = GA->getAliasee();
2791 } else {
2792 break;
2793 }
2794 assert(V->getType()->isPointerTy() && "Unexpected operand type!");
2795 } while (Visited.insert(Ptr: V).second);
2796
2797 Type *IdxPtrTy = DL.getIndexType(PtrTy: V->getType());
2798 return cast<ConstantInt>(Val: ConstantInt::get(Ty: IdxPtrTy, V: Offset));
2799}
2800
2801/// Find dead blocks due to deleted CFG edges during inlining.
2802///
2803/// If we know the successor of the current block, \p CurrBB, has to be \p
2804/// NextBB, the other successors of \p CurrBB are dead if these successors have
2805/// no live incoming CFG edges. If one block is found to be dead, we can
2806/// continue growing the dead block list by checking the successors of the dead
2807/// blocks to see if all their incoming edges are dead or not.
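///
/// For example (illustrative), if \p CurrBB ends in
///   br i1 %cond, label %NextBB, label %Other
/// and %NextBB is the known successor, then %Other is dead unless it has
/// another live predecessor, and any block reachable only through %Other is
/// added to the dead-block set transitively.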
2808void CallAnalyzer::findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB) {
2809 auto IsEdgeDead = [&](BasicBlock *Pred, BasicBlock *Succ) {
2810 // A CFG edge is dead if the predecessor is dead or the predecessor has a
2811 // known successor which is not the one under exam.
2812 if (DeadBlocks.count(Ptr: Pred))
2813 return true;
2814 BasicBlock *KnownSucc = KnownSuccessors[Pred];
2815 return KnownSucc && KnownSucc != Succ;
2816 };
2817
2818 auto IsNewlyDead = [&](BasicBlock *BB) {
2819 // If all the edges to a block are dead, the block is also dead.
2820 return (!DeadBlocks.count(Ptr: BB) &&
2821 llvm::all_of(Range: predecessors(BB),
2822 P: [&](BasicBlock *P) { return IsEdgeDead(P, BB); }));
2823 };
2824
2825 for (BasicBlock *Succ : successors(BB: CurrBB)) {
2826 if (Succ == NextBB || !IsNewlyDead(Succ))
2827 continue;
2828 SmallVector<BasicBlock *, 4> NewDead;
2829 NewDead.push_back(Elt: Succ);
2830 while (!NewDead.empty()) {
2831 BasicBlock *Dead = NewDead.pop_back_val();
2832 if (DeadBlocks.insert(Ptr: Dead).second)
2833 // Continue growing the dead block lists.
2834 for (BasicBlock *S : successors(BB: Dead))
2835 if (IsNewlyDead(S))
2836 NewDead.push_back(Elt: S);
2837 }
2838 }
2839}
2840
2841/// Analyze a call site for potential inlining.
2842///
2843/// Returns a successful InlineResult if inlining this call is viable, and a
2844/// failure otherwise. It computes the cost and adjusts the threshold based on
2845/// numerous factors and heuristics. If this method returns a failure but the
2846/// computed cost is below the computed threshold, then inlining was forcibly
2847/// disabled by some constraint other than the cost model.
2848InlineResult CallAnalyzer::analyze() {
2849 ++NumCallsAnalyzed;
2850
2851 auto Result = onAnalysisStart();
2852 if (!Result.isSuccess())
2853 return Result;
2854
2855 if (F.empty())
2856 return InlineResult::success();
2857
2858 Function *Caller = CandidateCall.getFunction();
2859 // Check if the caller function is recursive itself.
2860 for (User *U : Caller->users()) {
2861 CallBase *Call = dyn_cast<CallBase>(Val: U);
2862 if (Call && Call->getFunction() == Caller) {
2863 IsCallerRecursive = true;
2864 break;
2865 }
2866 }
2867
2868 // Populate our simplified values by mapping from function arguments to call
2869 // arguments with known important simplifications.
2870 auto CAI = CandidateCall.arg_begin();
2871 for (Argument &FAI : F.args()) {
2872 assert(CAI != CandidateCall.arg_end());
2873 SimplifiedValues[&FAI] = *CAI;
2874 if (isa<Constant>(Val: *CAI))
2875 ++NumConstantArgs;
2876
2877 Value *PtrArg = *CAI;
2878 if (ConstantInt *C = stripAndComputeInBoundsConstantOffsets(V&: PtrArg)) {
2879 ConstantOffsetPtrs[&FAI] = std::make_pair(x&: PtrArg, y: C->getValue());
2880
2881 // We can SROA any pointer arguments derived from alloca instructions.
2882 if (auto *SROAArg = dyn_cast<AllocaInst>(Val: PtrArg)) {
2883 SROAArgValues[&FAI] = SROAArg;
2884 onInitializeSROAArg(Arg: SROAArg);
2885 EnabledSROAAllocas.insert(V: SROAArg);
2886 }
2887 }
2888 ++CAI;
2889 }
2890 NumConstantOffsetPtrArgs = ConstantOffsetPtrs.size();
2891 NumAllocaArgs = SROAArgValues.size();
2892
2893 // Collecting the ephemeral values of `F` can be expensive, so use the
2894 // ephemeral values cache if available.
2895 SmallPtrSet<const Value *, 32> EphValuesStorage;
2896 const SmallPtrSetImpl<const Value *> *EphValues = &EphValuesStorage;
2897 if (GetEphValuesCache)
2898 EphValues = &GetEphValuesCache(F).ephValues();
2899 else
2900 CodeMetrics::collectEphemeralValues(L: &F, AC: &GetAssumptionCache(F),
2901 EphValues&: EphValuesStorage);
2902
2903 // The worklist of live basic blocks in the callee *after* inlining. We avoid
2904 // adding basic blocks of the callee which can be proven to be dead for this
2905 // particular call site in order to get more accurate cost estimates. This
2906 // requires a somewhat heavyweight iteration pattern: we need to walk the
2907 // basic blocks in a breadth-first order as we insert live successors. To
2908  // accomplish this, and because we expect to stop early once the threshold
2909  // is crossed, we use a small-size-optimized SetVector.
2910 typedef SmallSetVector<BasicBlock *, 16> BBSetVector;
2911 BBSetVector BBWorklist;
2912 BBWorklist.insert(X: &F.getEntryBlock());
2913
2914  // Note that we *must not* cache the size; this loop grows the worklist.
2915 for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
2916 if (shouldStop())
2917 break;
2918
2919 BasicBlock *BB = BBWorklist[Idx];
2920 if (BB->empty())
2921 continue;
2922
2923 onBlockStart(BB);
2924
2925 // Disallow inlining a blockaddress with uses other than strictly callbr.
2926 // A blockaddress only has defined behavior for an indirect branch in the
2927 // same function, and we do not currently support inlining indirect
2928 // branches. But, the inliner may not see an indirect branch that ends up
2929 // being dead code at a particular call site. If the blockaddress escapes
2930 // the function, e.g., via a global variable, inlining may lead to an
2931 // invalid cross-function reference.
2932 // FIXME: pr/39560: continue relaxing this overt restriction.
2933 if (BB->hasAddressTaken())
2934 for (User *U : BlockAddress::get(BB: &*BB)->users())
2935 if (!isa<CallBrInst>(Val: *U))
2936 return InlineResult::failure(Reason: "blockaddress used outside of callbr");
2937
2938 // Analyze the cost of this block. If we blow through the threshold, this
2939    // returns a failure, and we can bail out.
2940 InlineResult IR = analyzeBlock(BB, EphValues: *EphValues);
2941 if (!IR.isSuccess())
2942 return IR;
2943
2944 Instruction *TI = BB->getTerminator();
2945
2946    // Add in the live successors by first checking whether we have a terminator
2947    // that may be simplified based on the values simplified by this call.
2948 if (BranchInst *BI = dyn_cast<BranchInst>(Val: TI)) {
2949 if (BI->isConditional()) {
2950 Value *Cond = BI->getCondition();
2951 if (ConstantInt *SimpleCond = getSimplifiedValue<ConstantInt>(V: Cond)) {
2952 BasicBlock *NextBB = BI->getSuccessor(i: SimpleCond->isZero() ? 1 : 0);
2953 BBWorklist.insert(X: NextBB);
2954 KnownSuccessors[BB] = NextBB;
2955 findDeadBlocks(CurrBB: BB, NextBB);
2956 continue;
2957 }
2958 }
2959 } else if (SwitchInst *SI = dyn_cast<SwitchInst>(Val: TI)) {
2960 Value *Cond = SI->getCondition();
2961 if (ConstantInt *SimpleCond = getSimplifiedValue<ConstantInt>(V: Cond)) {
2962 BasicBlock *NextBB = SI->findCaseValue(C: SimpleCond)->getCaseSuccessor();
2963 BBWorklist.insert(X: NextBB);
2964 KnownSuccessors[BB] = NextBB;
2965 findDeadBlocks(CurrBB: BB, NextBB);
2966 continue;
2967 }
2968 }
2969
2970 // If we're unable to select a particular successor, just count all of
2971 // them.
2972 BBWorklist.insert_range(R: successors(BB));
2973
2974 onBlockAnalyzed(BB);
2975 }
2976
2977 // If this is a noduplicate call, we can still inline as long as
2978  // inlining this would cause the removal of the callee (so the instruction
2979 // is not actually duplicated, just moved).
2980 if (!isSoleCallToLocalFunction(CB: CandidateCall, Callee: F) && ContainsNoDuplicateCall)
2981 return InlineResult::failure(Reason: "noduplicate");
2982
2983 // If the callee's stack size exceeds the user-specified threshold,
2984 // do not let it be inlined.
2985 // The command line option overrides a limit set in the function attributes.
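  // For illustration (a hypothetical configuration, not a value taken from
  // this file): a caller carrying the function attribute named by
  // InlineConstants::MaxInlineStackSizeAttributeName with a value of "512"
  // blocks inlining of callees whose estimated static alloca usage exceeds
  // 512 bytes, unless the StackSizeThreshold option is passed explicitly, in
  // which case the command line value wins.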
2986 size_t FinalStackSizeThreshold = StackSizeThreshold;
2987 if (!StackSizeThreshold.getNumOccurrences())
2988 if (std::optional<int> AttrMaxStackSize = getStringFnAttrAsInt(
2989 F: Caller, AttrKind: InlineConstants::MaxInlineStackSizeAttributeName))
2990 FinalStackSizeThreshold = *AttrMaxStackSize;
2991 if (AllocatedSize > FinalStackSizeThreshold)
2992 return InlineResult::failure(Reason: "stacksize");
2993
2994 return finalizeAnalysis();
2995}
2996
2997void InlineCostCallAnalyzer::print(raw_ostream &OS) {
2998#define DEBUG_PRINT_STAT(x) OS << " " #x ": " << x << "\n"
2999 if (PrintInstructionComments)
3000 F.print(OS, AAW: &Writer);
3001 DEBUG_PRINT_STAT(NumConstantArgs);
3002 DEBUG_PRINT_STAT(NumConstantOffsetPtrArgs);
3003 DEBUG_PRINT_STAT(NumAllocaArgs);
3004 DEBUG_PRINT_STAT(NumConstantPtrCmps);
3005 DEBUG_PRINT_STAT(NumConstantPtrDiffs);
3006 DEBUG_PRINT_STAT(NumInstructionsSimplified);
3007 DEBUG_PRINT_STAT(NumInstructions);
3008 DEBUG_PRINT_STAT(SROACostSavings);
3009 DEBUG_PRINT_STAT(SROACostSavingsLost);
3010 DEBUG_PRINT_STAT(LoadEliminationCost);
3011 DEBUG_PRINT_STAT(ContainsNoDuplicateCall);
3012 DEBUG_PRINT_STAT(Cost);
3013 DEBUG_PRINT_STAT(Threshold);
3014#undef DEBUG_PRINT_STAT
3015}
3016
3017#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3018/// Dump stats about this call's analysis.
3019LLVM_DUMP_METHOD void InlineCostCallAnalyzer::dump() { print(dbgs()); }
3020#endif
3021
3022/// Test that there are no attribute conflicts between Caller and Callee
3023/// that prevent inlining.
3024static bool functionsHaveCompatibleAttributes(
3025 Function *Caller, Function *Callee, TargetTransformInfo &TTI,
3026 function_ref<const TargetLibraryInfo &(Function &)> &GetTLI) {
3027 // Note that CalleeTLI must be a copy not a reference. The legacy pass manager
3028 // caches the most recently created TLI in the TargetLibraryInfoWrapperPass
3029 // object, and always returns the same object (which is overwritten on each
3030 // GetTLI call). Therefore we copy the first result.
3031 auto CalleeTLI = GetTLI(*Callee);
3032 return (IgnoreTTIInlineCompatible ||
3033 TTI.areInlineCompatible(Caller, Callee)) &&
3034 GetTLI(*Caller).areInlineCompatible(CalleeTLI,
3035 AllowCallerSuperset: InlineCallerSupersetNoBuiltin) &&
3036 AttributeFuncs::areInlineCompatible(Caller: *Caller, Callee: *Callee);
3037}
3038
3039int llvm::getCallsiteCost(const TargetTransformInfo &TTI, const CallBase &Call,
3040 const DataLayout &DL) {
3041 int64_t Cost = 0;
3042 for (unsigned I = 0, E = Call.arg_size(); I != E; ++I) {
3043 if (Call.isByValArgument(ArgNo: I)) {
3044 // We approximate the number of loads and stores needed by dividing the
3045 // size of the byval type by the target's pointer size.
3046 PointerType *PTy = cast<PointerType>(Val: Call.getArgOperand(i: I)->getType());
3047 unsigned TypeSize = DL.getTypeSizeInBits(Ty: Call.getParamByValType(ArgNo: I));
3048 unsigned AS = PTy->getAddressSpace();
3049 unsigned PointerSize = DL.getPointerSizeInBits(AS);
3050 // Ceiling division.
3051 unsigned NumStores = (TypeSize + PointerSize - 1) / PointerSize;
3052
3053 // If it generates more than 8 stores it is likely to be expanded as an
3054 // inline memcpy so we take that as an upper bound. Otherwise we assume
3055 // one load and one store per word copied.
3056 // FIXME: The maxStoresPerMemcpy setting from the target should be used
3057 // here instead of a magic number of 8, but it's not available via
3058 // DataLayout.
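      // Worked example (illustrative, assuming 64-bit pointers): a 128-byte
      // byval aggregate is 1024 bits, so ceil(1024 / 64) = 16 word copies,
      // which the cap below reduces to 8, contributing 2 * 8 * InstrCost.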
3059 NumStores = std::min(a: NumStores, b: 8U);
3060
3061 Cost += 2 * NumStores * InstrCost;
3062 } else {
3063 // For non-byval arguments subtract off one instruction per call
3064 // argument.
3065 Cost += InstrCost;
3066 }
3067 }
3068 // The call instruction also disappears after inlining.
3069 Cost += InstrCost;
3070 Cost += TTI.getInlineCallPenalty(F: Call.getCaller(), Call, DefaultCallPenalty: CallPenalty);
3071
3072 return std::min<int64_t>(a: Cost, INT_MAX);
3073}
3074
3075InlineCost llvm::getInlineCost(
3076 CallBase &Call, const InlineParams &Params, TargetTransformInfo &CalleeTTI,
3077 function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
3078 function_ref<const TargetLibraryInfo &(Function &)> GetTLI,
3079 function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
3080 ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE,
3081 function_ref<EphemeralValuesCache &(Function &)> GetEphValuesCache) {
3082 return getInlineCost(Call, Callee: Call.getCalledFunction(), Params, CalleeTTI,
3083 GetAssumptionCache, GetTLI, GetBFI, PSI, ORE,
3084 GetEphValuesCache);
3085}
3086
3087std::optional<int> llvm::getInliningCostEstimate(
3088 CallBase &Call, TargetTransformInfo &CalleeTTI,
3089 function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
3090 function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
3091 function_ref<const TargetLibraryInfo &(Function &)> GetTLI,
3092 ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
3093 const InlineParams Params = {/* DefaultThreshold*/ 0,
3094 /*HintThreshold*/ {},
3095 /*ColdThreshold*/ {},
3096 /*OptSizeThreshold*/ {},
3097 /*OptMinSizeThreshold*/ {},
3098 /*HotCallSiteThreshold*/ {},
3099 /*LocallyHotCallSiteThreshold*/ {},
3100 /*ColdCallSiteThreshold*/ {},
3101 /*ComputeFullInlineCost*/ true,
3102 /*EnableDeferral*/ true};
3103
3104 InlineCostCallAnalyzer CA(*Call.getCalledFunction(), Call, Params, CalleeTTI,
3105 GetAssumptionCache, GetBFI, GetTLI, PSI, ORE, true,
3106 /*IgnoreThreshold*/ true);
3107 auto R = CA.analyze();
3108 if (!R.isSuccess())
3109 return std::nullopt;
3110 return CA.getCost();
3111}
3112
3113std::optional<InlineCostFeatures> llvm::getInliningCostFeatures(
3114 CallBase &Call, TargetTransformInfo &CalleeTTI,
3115 function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
3116 function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
3117 function_ref<const TargetLibraryInfo &(Function &)> GetTLI,
3118 ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
3119 InlineCostFeaturesAnalyzer CFA(CalleeTTI, GetAssumptionCache, GetBFI, GetTLI,
3120 PSI, ORE, *Call.getCalledFunction(), Call);
3121 auto R = CFA.analyze();
3122 if (!R.isSuccess())
3123 return std::nullopt;
3124 return CFA.features();
3125}
3126
3127std::optional<InlineResult> llvm::getAttributeBasedInliningDecision(
3128 CallBase &Call, Function *Callee, TargetTransformInfo &CalleeTTI,
3129 function_ref<const TargetLibraryInfo &(Function &)> GetTLI) {
3130
3131 // Cannot inline indirect calls.
3132 if (!Callee)
3133 return InlineResult::failure(Reason: "indirect call");
3134
3135  // If a callee coroutine function is inlined into a caller coroutine
3136  // function before the coro-split pass, the coro-early pass cannot handle
3137  // the result well.
3138  // So we do not inline a coroutine function that has not yet been split.
3139 if (Callee->isPresplitCoroutine())
3140    return InlineResult::failure(Reason: "unsplit coroutine call");
3141
3142  // Never inline calls with byval arguments that do not have the alloca
3143 // address space. Since byval arguments can be replaced with a copy to an
3144 // alloca, the inlined code would need to be adjusted to handle that the
3145 // argument is in the alloca address space (so it is a little bit complicated
3146 // to solve).
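  // For example, on a hypothetical target whose allocas live in a non-zero
  // address space, a byval argument typed as a pointer into address space 0
  // would trip this check and block inlining.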
3147 unsigned AllocaAS = Callee->getDataLayout().getAllocaAddrSpace();
3148 for (unsigned I = 0, E = Call.arg_size(); I != E; ++I)
3149 if (Call.isByValArgument(ArgNo: I)) {
3150 PointerType *PTy = cast<PointerType>(Val: Call.getArgOperand(i: I)->getType());
3151 if (PTy->getAddressSpace() != AllocaAS)
3152 return InlineResult::failure(Reason: "byval arguments without alloca"
3153 " address space");
3154 }
3155
3156 // Calls to functions with always-inline attributes should be inlined
3157 // whenever possible.
3158 if (Call.hasFnAttr(Kind: Attribute::AlwaysInline)) {
3159 if (Call.getAttributes().hasFnAttr(Kind: Attribute::NoInline))
3160 return InlineResult::failure(Reason: "noinline call site attribute");
3161
3162 auto IsViable = isInlineViable(Callee&: *Callee);
3163 if (IsViable.isSuccess())
3164 return InlineResult::success();
3165 return InlineResult::failure(Reason: IsViable.getFailureReason());
3166 }
3167
3168 // Never inline functions with conflicting attributes (unless callee has
3169 // always-inline attribute).
3170 Function *Caller = Call.getCaller();
3171 if (!functionsHaveCompatibleAttributes(Caller, Callee, TTI&: CalleeTTI, GetTLI))
3172 return InlineResult::failure(Reason: "conflicting attributes");
3173
3174 // Don't inline this call if the caller has the optnone attribute.
3175 if (Caller->hasOptNone())
3176 return InlineResult::failure(Reason: "optnone attribute");
3177
3178 // Don't inline a function that treats null pointer as valid into a caller
3179 // that does not have this attribute.
3180 if (!Caller->nullPointerIsDefined() && Callee->nullPointerIsDefined())
3181 return InlineResult::failure(Reason: "nullptr definitions incompatible");
3182
3183 // Don't inline functions which can be interposed at link-time.
3184 if (Callee->isInterposable())
3185 return InlineResult::failure(Reason: "interposable");
3186
3187 // Don't inline functions marked noinline.
3188 if (Callee->hasFnAttribute(Kind: Attribute::NoInline))
3189 return InlineResult::failure(Reason: "noinline function attribute");
3190
3191 // Don't inline call sites marked noinline.
3192 if (Call.isNoInline())
3193 return InlineResult::failure(Reason: "noinline call site attribute");
3194
3195 // Don't inline functions that are loader replaceable.
3196 if (Callee->hasFnAttribute(Kind: "loader-replaceable"))
3197 return InlineResult::failure(Reason: "loader replaceable function attribute");
3198
3199 return std::nullopt;
3200}
3201
3202InlineCost llvm::getInlineCost(
3203 CallBase &Call, Function *Callee, const InlineParams &Params,
3204 TargetTransformInfo &CalleeTTI,
3205 function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
3206 function_ref<const TargetLibraryInfo &(Function &)> GetTLI,
3207 function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
3208 ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE,
3209 function_ref<EphemeralValuesCache &(Function &)> GetEphValuesCache) {
3210
3211 auto UserDecision =
3212 llvm::getAttributeBasedInliningDecision(Call, Callee, CalleeTTI, GetTLI);
3213
3214 if (UserDecision) {
3215 if (UserDecision->isSuccess())
3216 return llvm::InlineCost::getAlways(Reason: "always inline attribute");
3217 return llvm::InlineCost::getNever(Reason: UserDecision->getFailureReason());
3218 }
3219
3220 LLVM_DEBUG(llvm::dbgs() << " Analyzing call of " << Callee->getName()
3221 << "... (caller:" << Call.getCaller()->getName()
3222 << ")\n");
3223
3224 InlineCostCallAnalyzer CA(*Callee, Call, Params, CalleeTTI,
3225 GetAssumptionCache, GetBFI, GetTLI, PSI, ORE,
3226 /*BoostIndirect=*/true, /*IgnoreThreshold=*/false,
3227 GetEphValuesCache);
3228 InlineResult ShouldInline = CA.analyze();
3229
3230 LLVM_DEBUG(CA.dump());
3231
3232  // Always make the cost-benefit-based decision explicit.
3233  // We use always/never here since the threshold is not meaningful,
3234  // as it is not what drives the cost-benefit analysis.
3235 if (CA.wasDecidedByCostBenefit()) {
3236 if (ShouldInline.isSuccess())
3237 return InlineCost::getAlways(Reason: "benefit over cost",
3238 CostBenefit: CA.getCostBenefitPair());
3239 else
3240 return InlineCost::getNever(Reason: "cost over benefit", CostBenefit: CA.getCostBenefitPair());
3241 }
3242
3243 if (CA.wasDecidedByCostThreshold())
3244 return InlineCost::get(Cost: CA.getCost(), Threshold: CA.getThreshold(),
3245 StaticBonus: CA.getStaticBonusApplied());
3246
3247 // No details on how the decision was made, simply return always or never.
3248 return ShouldInline.isSuccess()
3249 ? InlineCost::getAlways(Reason: "empty function")
3250 : InlineCost::getNever(Reason: ShouldInline.getFailureReason());
3251}
3252
3253InlineResult llvm::isInlineViable(Function &F) {
3254 bool ReturnsTwice = F.hasFnAttribute(Kind: Attribute::ReturnsTwice);
3255 for (BasicBlock &BB : F) {
3256 // Disallow inlining of functions which contain indirect branches.
3257 if (isa<IndirectBrInst>(Val: BB.getTerminator()))
3258 return InlineResult::failure(Reason: "contains indirect branches");
3259
3260    // Disallow inlining of functions containing blockaddresses that are used
3261    // by non-callbr instructions.
3262 if (BB.hasAddressTaken())
3263 for (User *U : BlockAddress::get(BB: &BB)->users())
3264 if (!isa<CallBrInst>(Val: *U))
3265 return InlineResult::failure(Reason: "blockaddress used outside of callbr");
3266
3267 for (auto &II : BB) {
3268 CallBase *Call = dyn_cast<CallBase>(Val: &II);
3269 if (!Call)
3270 continue;
3271
3272 // Disallow recursive calls.
3273 Function *Callee = Call->getCalledFunction();
3274 if (&F == Callee)
3275 return InlineResult::failure(Reason: "recursive call");
3276
3277 // Disallow calls which expose returns-twice to a function not previously
3278 // attributed as such.
3279 if (!ReturnsTwice && isa<CallInst>(Val: Call) &&
3280 cast<CallInst>(Val: Call)->canReturnTwice())
3281 return InlineResult::failure(Reason: "exposes returns-twice attribute");
3282
3283 if (Callee)
3284 switch (Callee->getIntrinsicID()) {
3285 default:
3286 break;
3287 case llvm::Intrinsic::icall_branch_funnel:
3288        // Disallow inlining of @llvm.icall.branch.funnel because the current
3289 // backend can't separate call targets from call arguments.
3290 return InlineResult::failure(
3291 Reason: "disallowed inlining of @llvm.icall.branch.funnel");
3292 case llvm::Intrinsic::localescape:
3293 // Disallow inlining functions that call @llvm.localescape. Doing this
3294 // correctly would require major changes to the inliner.
3295 return InlineResult::failure(
3296 Reason: "disallowed inlining of @llvm.localescape");
3297 case llvm::Intrinsic::vastart:
3298 // Disallow inlining of functions that initialize VarArgs with
3299 // va_start.
3300 return InlineResult::failure(
3301 Reason: "contains VarArgs initialized with va_start");
3302 }
3303 }
3304 }
3305
3306 return InlineResult::success();
3307}
3308
3309// APIs to create InlineParams based on command line flags and/or other
3310// parameters.
3311
3312InlineParams llvm::getInlineParams(int Threshold) {
3313 InlineParams Params;
3314
3315 // This field is the threshold to use for a callee by default. This is
3316 // derived from one or more of:
3317 // * optimization or size-optimization levels,
3318 // * a value passed to createFunctionInliningPass function, or
3319 // * the -inline-threshold flag.
3320 // If the -inline-threshold flag is explicitly specified, that is used
3321 // irrespective of anything else.
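  // For example, passing -inline-threshold=500 to opt makes 500 the default
  // threshold regardless of the Threshold argument; without the flag, the
  // Threshold argument (e.g. 225 in the default configuration) is used.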
3322 if (InlineThreshold.getNumOccurrences() > 0)
3323 Params.DefaultThreshold = InlineThreshold;
3324 else
3325 Params.DefaultThreshold = Threshold;
3326
3327 // Set the HintThreshold knob from the -inlinehint-threshold.
3328 Params.HintThreshold = HintThreshold;
3329
3330 // Set the HotCallSiteThreshold knob from the -hot-callsite-threshold.
3331 Params.HotCallSiteThreshold = HotCallSiteThreshold;
3332
3333 // If the -locally-hot-callsite-threshold is explicitly specified, use it to
3334 // populate LocallyHotCallSiteThreshold. Later, we populate
3335 // Params.LocallyHotCallSiteThreshold from -locally-hot-callsite-threshold if
3336 // we know that optimization level is O3 (in the getInlineParams variant that
3337 // takes the opt and size levels).
3338 // FIXME: Remove this check (and make the assignment unconditional) after
3339 // addressing size regression issues at O2.
3340 if (LocallyHotCallSiteThreshold.getNumOccurrences() > 0)
3341 Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold;
3342
3343 // Set the ColdCallSiteThreshold knob from the
3344 // -inline-cold-callsite-threshold.
3345 Params.ColdCallSiteThreshold = ColdCallSiteThreshold;
3346
3347 // Set the OptMinSizeThreshold and OptSizeThreshold params only if the
3348  // -inline-threshold command line option is not explicitly given. If that
3349 // option is present, then its value applies even for callees with size and
3350 // minsize attributes.
3351 // If the -inline-threshold is not specified, set the ColdThreshold from the
3352 // -inlinecold-threshold even if it is not explicitly passed. If
3353 // -inline-threshold is specified, then -inlinecold-threshold needs to be
3354 // explicitly specified to set the ColdThreshold knob
3355 if (InlineThreshold.getNumOccurrences() == 0) {
3356 Params.OptMinSizeThreshold = InlineConstants::OptMinSizeThreshold;
3357 Params.OptSizeThreshold = InlineConstants::OptSizeThreshold;
3358 Params.ColdThreshold = ColdThreshold;
3359 } else if (ColdThreshold.getNumOccurrences() > 0) {
3360 Params.ColdThreshold = ColdThreshold;
3361 }
3362 return Params;
3363}
3364
3365InlineParams llvm::getInlineParams() {
3366 return getInlineParams(Threshold: DefaultThreshold);
3367}
3368
3369// Compute the default threshold for inlining based on the opt level and the
3370// size opt level.
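// For instance, getInlineParams(3, 0) (roughly -O3) yields
// InlineConstants::OptAggressiveThreshold, getInlineParams(2, 2) (-Oz) yields
// InlineConstants::OptMinSizeThreshold, and getInlineParams(2, 0) falls back
// to DefaultThreshold.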
3371static int computeThresholdFromOptLevels(unsigned OptLevel,
3372 unsigned SizeOptLevel) {
3373 if (OptLevel > 2)
3374 return InlineConstants::OptAggressiveThreshold;
3375 if (SizeOptLevel == 1) // -Os
3376 return InlineConstants::OptSizeThreshold;
3377 if (SizeOptLevel == 2) // -Oz
3378 return InlineConstants::OptMinSizeThreshold;
3379 return DefaultThreshold;
3380}
3381
3382InlineParams llvm::getInlineParams(unsigned OptLevel, unsigned SizeOptLevel) {
3383 auto Params =
3384 getInlineParams(Threshold: computeThresholdFromOptLevels(OptLevel, SizeOptLevel));
3385 // At O3, use the value of -locally-hot-callsite-threshold option to populate
3386 // Params.LocallyHotCallSiteThreshold. Below O3, this flag has effect only
3387 // when it is specified explicitly.
3388 if (OptLevel > 2)
3389 Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold;
3390 return Params;
3391}
3392
3393PreservedAnalyses
3394InlineCostAnnotationPrinterPass::run(Function &F,
3395 FunctionAnalysisManager &FAM) {
3396 PrintInstructionComments = true;
3397 std::function<AssumptionCache &(Function &)> GetAssumptionCache =
3398 [&](Function &F) -> AssumptionCache & {
3399 return FAM.getResult<AssumptionAnalysis>(IR&: F);
3400 };
3401
3402 auto &MAMProxy = FAM.getResult<ModuleAnalysisManagerFunctionProxy>(IR&: F);
3403 ProfileSummaryInfo *PSI =
3404 MAMProxy.getCachedResult<ProfileSummaryAnalysis>(IR&: *F.getParent());
3405 const TargetTransformInfo &TTI = FAM.getResult<TargetIRAnalysis>(IR&: F);
3406
3407 // FIXME: Redesign the usage of InlineParams to expand the scope of this pass.
3408 // In the current implementation, the type of InlineParams doesn't matter as
3409 // the pass serves only for verification of inliner's decisions.
3410 // We can add a flag which determines InlineParams for this run. Right now,
3411 // the default InlineParams are used.
3412 const InlineParams Params = llvm::getInlineParams();
3413 for (BasicBlock &BB : F) {
3414 for (Instruction &I : BB) {
3415 if (auto *CB = dyn_cast<CallBase>(Val: &I)) {
3416 Function *CalledFunction = CB->getCalledFunction();
3417 if (!CalledFunction || CalledFunction->isDeclaration())
3418 continue;
3419 OptimizationRemarkEmitter ORE(CalledFunction);
3420 InlineCostCallAnalyzer ICCA(*CalledFunction, *CB, Params, TTI,
3421 GetAssumptionCache, nullptr, nullptr, PSI,
3422 &ORE);
3423 ICCA.analyze();
3424 OS << " Analyzing call of " << CalledFunction->getName()
3425 << "... (caller:" << CB->getCaller()->getName() << ")\n";
3426 ICCA.print(OS);
3427 OS << "\n";
3428 }
3429 }
3430 }
3431 return PreservedAnalyses::all();
3432}
3433