| 1 | //===- InlineCost.cpp - Cost analysis for inliner -------------------------===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | // |
| 9 | // This file implements inline cost analysis. |
| 10 | // |
| 11 | //===----------------------------------------------------------------------===// |
| 12 | |
| 13 | #include "llvm/Analysis/InlineCost.h" |
| 14 | #include "llvm/ADT/STLExtras.h" |
| 15 | #include "llvm/ADT/SetVector.h" |
| 16 | #include "llvm/ADT/SmallPtrSet.h" |
| 17 | #include "llvm/ADT/SmallVector.h" |
| 18 | #include "llvm/ADT/Statistic.h" |
| 19 | #include "llvm/Analysis/AssumptionCache.h" |
| 20 | #include "llvm/Analysis/BlockFrequencyInfo.h" |
| 21 | #include "llvm/Analysis/CodeMetrics.h" |
| 22 | #include "llvm/Analysis/ConstantFolding.h" |
| 23 | #include "llvm/Analysis/DomConditionCache.h" |
| 24 | #include "llvm/Analysis/EphemeralValuesCache.h" |
| 25 | #include "llvm/Analysis/InstructionSimplify.h" |
| 26 | #include "llvm/Analysis/LoopInfo.h" |
| 27 | #include "llvm/Analysis/MemoryBuiltins.h" |
| 28 | #include "llvm/Analysis/OptimizationRemarkEmitter.h" |
| 29 | #include "llvm/Analysis/ProfileSummaryInfo.h" |
| 30 | #include "llvm/Analysis/TargetLibraryInfo.h" |
| 31 | #include "llvm/Analysis/TargetTransformInfo.h" |
| 32 | #include "llvm/Analysis/ValueTracking.h" |
| 33 | #include "llvm/Config/llvm-config.h" |
| 34 | #include "llvm/IR/AssemblyAnnotationWriter.h" |
| 35 | #include "llvm/IR/CallingConv.h" |
| 36 | #include "llvm/IR/DataLayout.h" |
| 37 | #include "llvm/IR/Dominators.h" |
| 38 | #include "llvm/IR/GetElementPtrTypeIterator.h" |
| 39 | #include "llvm/IR/GlobalAlias.h" |
| 40 | #include "llvm/IR/InstVisitor.h" |
| 41 | #include "llvm/IR/IntrinsicInst.h" |
| 42 | #include "llvm/IR/Operator.h" |
| 43 | #include "llvm/IR/PatternMatch.h" |
| 44 | #include "llvm/Support/CommandLine.h" |
| 45 | #include "llvm/Support/Debug.h" |
| 46 | #include "llvm/Support/FormattedStream.h" |
| 47 | #include "llvm/Support/raw_ostream.h" |
| 48 | #include <climits> |
| 49 | #include <limits> |
| 50 | #include <optional> |
| 51 | |
| 52 | using namespace llvm; |
| 53 | |
| 54 | #define DEBUG_TYPE "inline-cost" |
| 55 | |
| 56 | STATISTIC(NumCallsAnalyzed, "Number of call sites analyzed" ); |
| 57 | |
| 58 | static cl::opt<int> |
| 59 | DefaultThreshold("inlinedefault-threshold" , cl::Hidden, cl::init(Val: 225), |
| 60 | cl::desc("Default amount of inlining to perform" )); |
| 61 | |
| 62 | // We introduce this option since there is a minor compile-time win by avoiding |
| 63 | // addition of TTI attributes (target-features in particular) to inline |
| 64 | // candidates when they are guaranteed to be the same as top level methods in |
| 65 | // some use cases. If we avoid adding the attribute, we need an option to avoid |
| 66 | // checking these attributes. |
| 67 | static cl::opt<bool> IgnoreTTIInlineCompatible( |
| 68 | "ignore-tti-inline-compatible" , cl::Hidden, cl::init(Val: false), |
| 69 | cl::desc("Ignore TTI attributes compatibility check between callee/caller " |
| 70 | "during inline cost calculation" )); |
| 71 | |
| 72 | static cl::opt<bool> PrintInstructionComments(
| 73 | "print-instruction-comments" , cl::Hidden, cl::init(Val: false), |
| 74 | cl::desc("Prints comments for instruction based on inline cost analysis" )); |
| 75 | |
| 76 | static cl::opt<int> InlineThreshold( |
| 77 | "inline-threshold" , cl::Hidden, cl::init(Val: 225), |
| 78 | cl::desc("Control the amount of inlining to perform (default = 225)" )); |
| 79 | |
| 80 | static cl::opt<int> HintThreshold( |
| 81 | "inlinehint-threshold" , cl::Hidden, cl::init(Val: 325), |
| 82 | cl::desc("Threshold for inlining functions with inline hint" )); |
| 83 | |
| 84 | static cl::opt<int> |
| 85 | ColdCallSiteThreshold("inline-cold-callsite-threshold" , cl::Hidden, |
| 86 | cl::init(Val: 45), |
| 87 | cl::desc("Threshold for inlining cold callsites" )); |
| 88 | |
| 89 | static cl::opt<bool> InlineEnableCostBenefitAnalysis( |
| 90 | "inline-enable-cost-benefit-analysis" , cl::Hidden, cl::init(Val: false), |
| 91 | cl::desc("Enable the cost-benefit analysis for the inliner" )); |
| 92 | |
| 93 | // InlineSavingsMultiplier overrides per TTI multipliers iff it is |
| 94 | // specified explicitly in command line options. This option is exposed |
| 95 | // for tuning and testing. |
| 96 | static cl::opt<int> InlineSavingsMultiplier( |
| 97 | "inline-savings-multiplier" , cl::Hidden, cl::init(Val: 8), |
| 98 | cl::desc("Multiplier to multiply cycle savings by during inlining" )); |
| 99 | |
| 100 | // InlineSavingsProfitableMultiplier overrides per TTI multipliers iff it is |
| 101 | // specified explicitly in command line options. This option is exposed |
| 102 | // for tuning and testing. |
| 103 | static cl::opt<int> InlineSavingsProfitableMultiplier( |
| 104 | "inline-savings-profitable-multiplier" , cl::Hidden, cl::init(Val: 4), |
| 105 | cl::desc("A multiplier on top of cycle savings to decide whether the " |
| 106 | "savings won't justify the cost" )); |
| 107 | |
| 108 | static cl::opt<int> |
| 109 | InlineSizeAllowance("inline-size-allowance" , cl::Hidden, cl::init(Val: 100), |
| 110 |                         cl::desc("The maximum size of a callee that gets "
| 111 | "inlined without sufficient cycle savings" )); |
| 112 | |
| 113 | // We introduce this threshold to help performance of instrumentation based |
| 114 | // PGO before we actually hook up inliner with analysis passes such as BPI and |
| 115 | // BFI. |
| 116 | static cl::opt<int> ColdThreshold( |
| 117 | "inlinecold-threshold" , cl::Hidden, cl::init(Val: 45), |
| 118 | cl::desc("Threshold for inlining functions with cold attribute" )); |
| 119 | |
| 120 | static cl::opt<int> |
| 121 | HotCallSiteThreshold("hot-callsite-threshold" , cl::Hidden, cl::init(Val: 3000), |
| 122 | cl::desc("Threshold for hot callsites " )); |
| 123 | |
| 124 | static cl::opt<int> LocallyHotCallSiteThreshold( |
| 125 | "locally-hot-callsite-threshold" , cl::Hidden, cl::init(Val: 525), |
| 126 | cl::desc("Threshold for locally hot callsites " )); |
| 127 | |
| 128 | static cl::opt<int> ColdCallSiteRelFreq( |
| 129 | "cold-callsite-rel-freq" , cl::Hidden, cl::init(Val: 2), |
| 130 | cl::desc("Maximum block frequency, expressed as a percentage of caller's " |
| 131 | "entry frequency, for a callsite to be cold in the absence of " |
| 132 | "profile information." )); |
| 133 | |
| 134 | static cl::opt<uint64_t> HotCallSiteRelFreq( |
| 135 | "hot-callsite-rel-freq" , cl::Hidden, cl::init(Val: 60), |
| 136 | cl::desc("Minimum block frequency, expressed as a multiple of caller's " |
| 137 | "entry frequency, for a callsite to be hot in the absence of " |
| 138 | "profile information." )); |
| 139 | |
| 140 | static cl::opt<int> |
| 141 | InstrCost("inline-instr-cost" , cl::Hidden, cl::init(Val: 5), |
| 142 | cl::desc("Cost of a single instruction when inlining" )); |
| 143 | |
| 144 | static cl::opt<int> |
| 145 | MemAccessCost("inline-memaccess-cost" , cl::Hidden, cl::init(Val: 0), |
| 146 | cl::desc("Cost of load/store instruction when inlining" )); |
| 147 | |
| 148 | static cl::opt<int> CallPenalty( |
| 149 | "inline-call-penalty" , cl::Hidden, cl::init(Val: 25), |
| 150 | cl::desc("Call penalty that is applied per callsite when inlining" )); |
| 151 | |
| 152 | static cl::opt<size_t> |
| 153 | StackSizeThreshold("inline-max-stacksize" , cl::Hidden, |
| 154 | cl::init(Val: std::numeric_limits<size_t>::max()), |
| 155 | cl::desc("Do not inline functions with a stack size " |
| 156 | "that exceeds the specified limit" )); |
| 157 | |
| 158 | static cl::opt<size_t> RecurStackSizeThreshold( |
| 159 | "recursive-inline-max-stacksize" , cl::Hidden, |
| 160 | cl::init(Val: InlineConstants::TotalAllocaSizeRecursiveCaller), |
| 161 | cl::desc("Do not inline recursive functions with a stack " |
| 162 | "size that exceeds the specified limit" )); |
| 163 | |
| 164 | static cl::opt<bool> OptComputeFullInlineCost( |
| 165 | "inline-cost-full" , cl::Hidden, |
| 166 | cl::desc("Compute the full inline cost of a call site even when the cost " |
| 167 | "exceeds the threshold." )); |
| 168 | |
| 169 | static cl::opt<bool> InlineCallerSupersetNoBuiltin( |
| 170 | "inline-caller-superset-nobuiltin" , cl::Hidden, cl::init(Val: true), |
| 171 | cl::desc("Allow inlining when caller has a superset of callee's nobuiltin " |
| 172 | "attributes." )); |
| 173 | |
| 174 | static cl::opt<bool> DisableGEPConstOperand( |
| 175 | "disable-gep-const-evaluation" , cl::Hidden, cl::init(Val: false), |
| 176 | cl::desc("Disables evaluation of GetElementPtr with constant operands" )); |
| 177 | |
| 178 | namespace llvm { |
| 179 | std::optional<int> getStringFnAttrAsInt(const Attribute &Attr) { |
| 180 | if (Attr.isValid()) { |
| 181 | int AttrValue = 0; |
| 182 | if (!Attr.getValueAsString().getAsInteger(Radix: 10, Result&: AttrValue)) |
| 183 | return AttrValue; |
| 184 | } |
| 185 | return std::nullopt; |
| 186 | } |
| 187 | |
| 188 | std::optional<int> getStringFnAttrAsInt(CallBase &CB, StringRef AttrKind) { |
| 189 | return getStringFnAttrAsInt(Attr: CB.getFnAttr(Kind: AttrKind)); |
| 190 | } |
| 191 | |
| 192 | std::optional<int> getStringFnAttrAsInt(Function *F, StringRef AttrKind) { |
| 193 | return getStringFnAttrAsInt(Attr: F->getFnAttribute(Kind: AttrKind)); |
| 194 | } |
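|  | 
|  | // For example, a call site carrying the string attribute
|  | // "function-inline-cost"="150" yields 150 from the helpers above, while a
|  | // missing or non-integer attribute yields std::nullopt.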
| 195 | |
| 196 | namespace InlineConstants { |
| 197 | int getInstrCost() { return InstrCost; } |
| 198 | |
| 199 | } // namespace InlineConstants |
| 200 | |
| 201 | } // namespace llvm |
| 202 | |
| 203 | namespace { |
| 204 | class InlineCostCallAnalyzer; |
| 205 | |
| 206 | // This struct is used to store information about inline cost of a |
| 207 | // particular instruction |
| 208 | struct InstructionCostDetail { |
| 209 | int CostBefore = 0; |
| 210 | int CostAfter = 0; |
| 211 | int ThresholdBefore = 0; |
| 212 | int ThresholdAfter = 0; |
| 213 | |
| 214 | int getThresholdDelta() const { return ThresholdAfter - ThresholdBefore; } |
| 215 | |
| 216 | int getCostDelta() const { return CostAfter - CostBefore; } |
| 217 | |
| 218 | bool hasThresholdChanged() const { return ThresholdAfter != ThresholdBefore; } |
| 219 | }; |
| 220 | |
| 221 | class InlineCostAnnotationWriter : public AssemblyAnnotationWriter { |
| 222 | private: |
| 223 | InlineCostCallAnalyzer *const ICCA; |
| 224 | |
| 225 | public: |
| 226 | InlineCostAnnotationWriter(InlineCostCallAnalyzer *ICCA) : ICCA(ICCA) {} |
| 227 | void emitInstructionAnnot(const Instruction *I, |
| 228 | formatted_raw_ostream &OS) override; |
| 229 | }; |
| 230 | |
| 231 | /// Carry out call site analysis, in order to evaluate inlinability. |
| 232 | /// NOTE: the type is currently used as an implementation detail of functions
| 233 | /// such as llvm::getInlineCost. Note the function_ref constructor parameters -
| 234 | /// the expectation is that they come from the outer scope, from the wrapper
| 235 | /// functions. If we want to support constructing CallAnalyzer objects where |
| 236 | /// lambdas are provided inline at construction, or where the object needs to |
| 237 | /// otherwise survive past the scope of the provided functions, we need to |
| 238 | /// revisit the argument types. |
| 239 | class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> { |
| 240 | typedef InstVisitor<CallAnalyzer, bool> Base; |
| 241 | friend class InstVisitor<CallAnalyzer, bool>; |
| 242 | |
| 243 | protected: |
| 244 | virtual ~CallAnalyzer() = default; |
| 245 | /// The TargetTransformInfo available for this compilation. |
| 246 | const TargetTransformInfo &TTI; |
| 247 | |
| 248 | /// Getter for the cache of @llvm.assume intrinsics. |
| 249 | function_ref<AssumptionCache &(Function &)> GetAssumptionCache; |
| 250 | |
| 251 | /// Getter for BlockFrequencyInfo |
| 252 | function_ref<BlockFrequencyInfo &(Function &)> GetBFI; |
| 253 | |
| 254 | /// Getter for TargetLibraryInfo |
| 255 | function_ref<const TargetLibraryInfo &(Function &)> GetTLI; |
| 256 | |
| 257 | /// Profile summary information. |
| 258 | ProfileSummaryInfo *PSI; |
| 259 | |
| 260 | /// The called function. |
| 261 | Function &F; |
| 262 | |
| 263 | // Cache the DataLayout since we use it a lot. |
| 264 | const DataLayout &DL; |
| 265 | |
| 266 | /// The OptimizationRemarkEmitter available for this compilation. |
| 267 | OptimizationRemarkEmitter *ORE; |
| 268 | |
| 269 | /// The candidate callsite being analyzed. Please do not use this to do |
| 270 | /// analysis in the caller function; we want the inline cost query to be |
| 271 | /// easily cacheable. Instead, use the cover function paramHasAttr. |
| 272 | CallBase &CandidateCall; |
| 273 | |
| 274 | /// Getter for the cache of ephemeral values. |
| 275 | function_ref<EphemeralValuesCache &(Function &)> GetEphValuesCache = nullptr; |
| 276 | |
| 277 | /// Extension points for handling callsite features. |
| 278 |   /// Called before a basic block is analyzed.
| 279 | virtual void onBlockStart(const BasicBlock *BB) {} |
| 280 | |
| 281 | /// Called after a basic block was analyzed. |
| 282 | virtual void onBlockAnalyzed(const BasicBlock *BB) {} |
| 283 | |
| 284 | /// Called before an instruction was analyzed |
| 285 | virtual void onInstructionAnalysisStart(const Instruction *I) {} |
| 286 | |
| 287 | /// Called after an instruction was analyzed |
| 288 | virtual void onInstructionAnalysisFinish(const Instruction *I) {} |
| 289 | |
| 290 | /// Called at the end of the analysis of the callsite. Return the outcome of |
| 291 | /// the analysis, i.e. 'InlineResult(true)' if the inlining may happen, or |
| 292 | /// the reason it can't. |
| 293 | virtual InlineResult finalizeAnalysis() { return InlineResult::success(); } |
| 294 |   /// Called when we're about to start processing a basic block, and every time
| 295 |   /// we are done processing an instruction. Return true if there is no point in
| 296 |   /// continuing the analysis (e.g. we've already determined the call site is
| 297 |   /// too expensive to inline).
| 298 | virtual bool shouldStop() { return false; } |
| 299 | |
| 300 |   /// Called before the analysis of the callee body starts (with callsite
| 301 |   /// contexts propagated). It checks callsite-specific information. Return a
| 302 |   /// reason the analysis can't continue if that's the case, or success if it
| 303 |   /// may continue.
| 304 | virtual InlineResult onAnalysisStart() { return InlineResult::success(); } |
| 305 | /// Called if the analysis engine decides SROA cannot be done for the given |
| 306 | /// alloca. |
| 307 | virtual void onDisableSROA(AllocaInst *Arg) {} |
| 308 | |
| 309 |   /// Called when the analysis engine determines load elimination won't happen.
| 310 | virtual void onDisableLoadElimination() {} |
| 311 | |
| 312 | /// Called when we visit a CallBase, before the analysis starts. Return false |
| 313 | /// to stop further processing of the instruction. |
| 314 | virtual bool onCallBaseVisitStart(CallBase &Call) { return true; } |
| 315 | |
| 316 | /// Called to account for a call. |
| 317 | virtual void onCallPenalty() {} |
| 318 | |
| 319 | /// Called to account for a load or store. |
| 320 |   virtual void onMemAccess() {}
| 321 | |
| 322 | /// Called to account for the expectation the inlining would result in a load |
| 323 | /// elimination. |
| 324 | virtual void onLoadEliminationOpportunity() {} |
| 325 | |
| 326 | /// Called to account for the cost of argument setup for the Call in the |
| 327 | /// callee's body (not the callsite currently under analysis). |
| 328 | virtual void onCallArgumentSetup(const CallBase &Call) {} |
| 329 | |
| 330 | /// Called to account for a load relative intrinsic. |
| 331 | virtual void onLoadRelativeIntrinsic() {} |
| 332 | |
| 333 | /// Called to account for a lowered call. |
| 334 | virtual void onLoweredCall(Function *F, CallBase &Call, bool IsIndirectCall) { |
| 335 | } |
| 336 | |
| 337 |   /// Account for a jump table of the given size. Return false to stop further
| 338 |   /// processing of the switch instruction.
| 339 | virtual bool onJumpTable(unsigned JumpTableSize) { return true; } |
| 340 | |
| 341 | /// Account for a case cluster of given size. Return false to stop further |
| 342 | /// processing of the instruction. |
| 343 | virtual bool onCaseCluster(unsigned NumCaseCluster) { return true; } |
| 344 | |
| 345 | /// Called at the end of processing a switch instruction, with the given |
| 346 | /// number of case clusters. |
| 347 | virtual void onFinalizeSwitch(unsigned JumpTableSize, unsigned NumCaseCluster, |
| 348 | bool DefaultDestUnreachable) {} |
| 349 | |
| 350 | /// Called to account for any other instruction not specifically accounted |
| 351 | /// for. |
| 352 | virtual void onMissedSimplification() {} |
| 353 | |
| 354 | /// Start accounting potential benefits due to SROA for the given alloca. |
| 355 | virtual void onInitializeSROAArg(AllocaInst *Arg) {} |
| 356 | |
| 357 | /// Account SROA savings for the AllocaInst value. |
| 358 | virtual void onAggregateSROAUse(AllocaInst *V) {} |
| 359 | |
| 360 | bool handleSROA(Value *V, bool DoNotDisable) { |
| 361 | // Check for SROA candidates in comparisons. |
| 362 | if (auto *SROAArg = getSROAArgForValueOrNull(V)) { |
| 363 | if (DoNotDisable) { |
| 364 | onAggregateSROAUse(V: SROAArg); |
| 365 | return true; |
| 366 | } |
| 367 | disableSROAForArg(SROAArg); |
| 368 | } |
| 369 | return false; |
| 370 | } |
| 371 | |
| 372 | bool IsCallerRecursive = false; |
| 373 | bool IsRecursiveCall = false; |
| 374 | bool ExposesReturnsTwice = false; |
| 375 | bool HasDynamicAlloca = false; |
| 376 | bool ContainsNoDuplicateCall = false; |
| 377 | bool HasReturn = false; |
| 378 | bool HasIndirectBr = false; |
| 379 | bool HasUninlineableIntrinsic = false; |
| 380 | bool InitsVargArgs = false; |
| 381 | |
| 382 | /// Number of bytes allocated statically by the callee. |
| 383 | uint64_t AllocatedSize = 0; |
| 384 | unsigned NumInstructions = 0; |
| 385 | unsigned NumVectorInstructions = 0; |
| 386 | |
| 387 | /// While we walk the potentially-inlined instructions, we build up and |
| 388 | /// maintain a mapping of simplified values specific to this callsite. The |
| 389 | /// idea is to propagate any special information we have about arguments to |
| 390 | /// this call through the inlinable section of the function, and account for |
| 391 | /// likely simplifications post-inlining. The most important aspect we track |
| 392 | /// is CFG altering simplifications -- when we prove a basic block dead, that |
| 393 | /// can cause dramatic shifts in the cost of inlining a function. |
| 394 | /// Note: The simplified Value may be owned by the caller function. |
| 395 | DenseMap<Value *, Value *> SimplifiedValues; |
| 396 | |
| 397 | /// Keep track of the values which map back (through function arguments) to |
| 398 | /// allocas on the caller stack which could be simplified through SROA. |
| 399 | DenseMap<Value *, AllocaInst *> SROAArgValues; |
| 400 | |
| 401 | /// Keep track of Allocas for which we believe we may get SROA optimization. |
| 402 | DenseSet<AllocaInst *> EnabledSROAAllocas; |
| 403 | |
| 404 | /// Keep track of values which map to a pointer base and constant offset. |
| 405 | DenseMap<Value *, std::pair<Value *, APInt>> ConstantOffsetPtrs; |
| 406 | |
| 407 | /// Keep track of dead blocks due to the constant arguments. |
| 408 | SmallPtrSet<BasicBlock *, 16> DeadBlocks; |
| 409 | |
| 410 | /// The mapping of the blocks to their known unique successors due to the |
| 411 | /// constant arguments. |
| 412 | DenseMap<BasicBlock *, BasicBlock *> KnownSuccessors; |
| 413 | |
| 414 | /// Model the elimination of repeated loads that is expected to happen |
| 415 | /// whenever we simplify away the stores that would otherwise cause them to be |
| 416 | /// loads. |
| 417 | bool EnableLoadElimination = true; |
| 418 | |
| 419 |   /// Whether we allow inlining of recursive calls.
| 420 | bool AllowRecursiveCall = false; |
| 421 | |
| 422 | SmallPtrSet<Value *, 16> LoadAddrSet; |
| 423 | |
| 424 | AllocaInst *getSROAArgForValueOrNull(Value *V) const { |
| 425 | auto It = SROAArgValues.find(Val: V); |
| 426 | if (It == SROAArgValues.end() || EnabledSROAAllocas.count(V: It->second) == 0) |
| 427 | return nullptr; |
| 428 | return It->second; |
| 429 | } |
| 430 | |
| 431 | /// Use a value in its given form directly if possible, otherwise try looking |
| 432 | /// for it in SimplifiedValues. |
| 433 | template <typename T> T *getDirectOrSimplifiedValue(Value *V) const { |
| 434 | if (auto *Direct = dyn_cast<T>(V)) |
| 435 | return Direct; |
| 436 | return getSimplifiedValue<T>(V); |
| 437 | } |
| 438 | |
| 439 | // Custom simplification helper routines. |
| 440 | bool isAllocaDerivedArg(Value *V); |
| 441 | void disableSROAForArg(AllocaInst *SROAArg); |
| 442 | void disableSROA(Value *V); |
| 443 | void findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB); |
| 444 | void disableLoadElimination(); |
| 445 | bool isGEPFree(GetElementPtrInst &GEP); |
| 446 | bool canFoldInboundsGEP(GetElementPtrInst &I); |
| 447 | bool accumulateGEPOffset(GEPOperator &GEP, APInt &Offset); |
| 448 | bool simplifyCallSite(Function *F, CallBase &Call); |
| 449 | bool simplifyCmpInstForRecCall(CmpInst &Cmp); |
| 450 | bool simplifyInstruction(Instruction &I); |
| 451 | bool simplifyIntrinsicCallIsConstant(CallBase &CB); |
| 452 | bool simplifyIntrinsicCallObjectSize(CallBase &CB); |
| 453 | ConstantInt *stripAndComputeInBoundsConstantOffsets(Value *&V); |
| 454 | bool isLoweredToCall(Function *F, CallBase &Call); |
| 455 | |
| 456 | /// Return true if the given argument to the function being considered for |
| 457 | /// inlining has the given attribute set either at the call site or the |
| 458 | /// function declaration. Primarily used to inspect call site specific |
| 459 | /// attributes since these can be more precise than the ones on the callee |
| 460 | /// itself. |
| 461 | bool paramHasAttr(Argument *A, Attribute::AttrKind Attr); |
| 462 | |
| 463 | /// Return true if the given value is known non null within the callee if |
| 464 | /// inlined through this particular callsite. |
| 465 | bool isKnownNonNullInCallee(Value *V); |
| 466 | |
| 467 | /// Return true if size growth is allowed when inlining the callee at \p Call. |
| 468 | bool allowSizeGrowth(CallBase &Call); |
| 469 | |
| 470 | // Custom analysis routines. |
| 471 | InlineResult analyzeBlock(BasicBlock *BB, |
| 472 | const SmallPtrSetImpl<const Value *> &EphValues); |
| 473 | |
| 474 | // Disable several entry points to the visitor so we don't accidentally use |
| 475 | // them by declaring but not defining them here. |
| 476 | void visit(Module *); |
| 477 | void visit(Module &); |
| 478 | void visit(Function *); |
| 479 | void visit(Function &); |
| 480 | void visit(BasicBlock *); |
| 481 | void visit(BasicBlock &); |
| 482 | |
| 483 | // Provide base case for our instruction visit. |
| 484 | bool visitInstruction(Instruction &I); |
| 485 | |
| 486 | // Our visit overrides. |
| 487 | bool visitAlloca(AllocaInst &I); |
| 488 | bool visitPHI(PHINode &I); |
| 489 | bool visitGetElementPtr(GetElementPtrInst &I); |
| 490 | bool visitBitCast(BitCastInst &I); |
| 491 | bool visitPtrToInt(PtrToIntInst &I); |
| 492 | bool visitIntToPtr(IntToPtrInst &I); |
| 493 | bool visitCastInst(CastInst &I); |
| 494 | bool visitCmpInst(CmpInst &I); |
| 495 | bool visitSub(BinaryOperator &I); |
| 496 | bool visitBinaryOperator(BinaryOperator &I); |
| 497 | bool visitFNeg(UnaryOperator &I); |
| 498 | bool visitLoad(LoadInst &I); |
| 499 | bool visitStore(StoreInst &I); |
| 500 | bool visitExtractValue(ExtractValueInst &I); |
| 501 | bool visitInsertValue(InsertValueInst &I); |
| 502 | bool visitCallBase(CallBase &Call); |
| 503 | bool visitReturnInst(ReturnInst &RI); |
| 504 | bool visitBranchInst(BranchInst &BI); |
| 505 | bool visitSelectInst(SelectInst &SI); |
| 506 | bool visitSwitchInst(SwitchInst &SI); |
| 507 | bool visitIndirectBrInst(IndirectBrInst &IBI); |
| 508 | bool visitResumeInst(ResumeInst &RI); |
| 509 | bool visitCleanupReturnInst(CleanupReturnInst &RI); |
| 510 | bool visitCatchReturnInst(CatchReturnInst &RI); |
| 511 | bool visitUnreachableInst(UnreachableInst &I); |
| 512 | |
| 513 | public: |
| 514 | CallAnalyzer( |
| 515 | Function &Callee, CallBase &Call, const TargetTransformInfo &TTI, |
| 516 | function_ref<AssumptionCache &(Function &)> GetAssumptionCache, |
| 517 | function_ref<BlockFrequencyInfo &(Function &)> GetBFI = nullptr, |
| 518 | function_ref<const TargetLibraryInfo &(Function &)> GetTLI = nullptr, |
| 519 | ProfileSummaryInfo *PSI = nullptr, |
| 520 | OptimizationRemarkEmitter *ORE = nullptr, |
| 521 | function_ref<EphemeralValuesCache &(Function &)> GetEphValuesCache = |
| 522 | nullptr) |
| 523 | : TTI(TTI), GetAssumptionCache(GetAssumptionCache), GetBFI(GetBFI), |
| 524 | GetTLI(GetTLI), PSI(PSI), F(Callee), DL(F.getDataLayout()), ORE(ORE), |
| 525 | CandidateCall(Call), GetEphValuesCache(GetEphValuesCache) {} |
| 526 | |
| 527 | InlineResult analyze(); |
| 528 | |
| 529 | /// Lookup simplified Value. May return a value owned by the caller. |
| 530 | Value *getSimplifiedValueUnchecked(Value *V) const { |
| 531 | return SimplifiedValues.lookup(Val: V); |
| 532 | } |
| 533 | |
| 534 | /// Lookup simplified Value, but return nullptr if the simplified value is |
| 535 | /// owned by the caller. |
| 536 | template <typename T> T *getSimplifiedValue(Value *V) const { |
| 537 | Value *SimpleV = SimplifiedValues.lookup(Val: V); |
| 538 | if (!SimpleV) |
| 539 | return nullptr; |
| 540 | |
| 541 |     // Skip the ownership checks below if we know T is a Constant. This has a
| 542 |     // small, but measurable impact on compile-time.
| 543 | if constexpr (std::is_base_of_v<Constant, T>) |
| 544 | return dyn_cast<T>(SimpleV); |
| 545 | |
| 546 | // Make sure the simplified Value is owned by this function |
| 547 | if (auto *I = dyn_cast<Instruction>(Val: SimpleV)) { |
| 548 | if (I->getFunction() != &F) |
| 549 | return nullptr; |
| 550 | } else if (auto *Arg = dyn_cast<Argument>(Val: SimpleV)) { |
| 551 | if (Arg->getParent() != &F) |
| 552 | return nullptr; |
| 553 | } else if (!isa<Constant>(Val: SimpleV)) |
| 554 | return nullptr; |
| 555 | return dyn_cast<T>(SimpleV); |
| 556 | } |
| 557 | |
| 558 | // Keep a bunch of stats about the cost savings found so we can print them |
| 559 | // out when debugging. |
| 560 | unsigned NumConstantArgs = 0; |
| 561 | unsigned NumConstantOffsetPtrArgs = 0; |
| 562 | unsigned NumAllocaArgs = 0; |
| 563 | unsigned NumConstantPtrCmps = 0; |
| 564 | unsigned NumConstantPtrDiffs = 0; |
| 565 | unsigned NumInstructionsSimplified = 0; |
| 566 | |
| 567 | void dump(); |
| 568 | }; |
| 569 | |
| 570 | // When forming a binary search tree for a switch, we need the number of nodes
| 571 | // in the tree, which equals the number of comparisons once the switch is
| 572 | // lowered. For a given number of clusters, n, we can define a recursive
| 573 | // function, f(n), to find the number of nodes in the tree. The recursion is:
| 574 | //    f(n) = 1 + f(n/2) + f(n - n/2), when n > 3,
| 575 | // and f(n) = n, when n <= 3.
| 576 | // This leads to a binary tree where each leaf is either f(2) or f(3) when
| 577 | // n > 3. So, the number of comparisons from the leaves should be n, while the
| 578 | // number of comparisons from the non-leaf nodes should be:
| 579 | //   2^(log2(n) - 1) - 1
| 580 | //   = 2^log2(n) * 2^-1 - 1
| 581 | //   = n / 2 - 1.
| 582 | // Considering comparisons from leaf and non-leaf nodes, we can estimate the
| 583 | // number of comparisons in a simple closed form:
| 584 | //   n + n / 2 - 1 = n * 3 / 2 - 1
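|  | // For example, with 8 case clusters: f(8) = 1 + f(4) + f(4) = 1 + 5 + 5 = 11,
|  | // which matches the closed form 8 * 3 / 2 - 1 = 11 expected comparisons.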
| 585 | int64_t getExpectedNumberOfCompare(int NumCaseCluster) { |
| 586 | return 3 * static_cast<int64_t>(NumCaseCluster) / 2 - 1; |
| 587 | } |
| 588 | |
| 589 | /// FIXME: if it is necessary to derive from InlineCostCallAnalyzer, note |
| 590 | /// the FIXME in onLoweredCall, when instantiating an InlineCostCallAnalyzer |
| 591 | class InlineCostCallAnalyzer final : public CallAnalyzer { |
| 592 | const bool ComputeFullInlineCost; |
| 593 | int LoadEliminationCost = 0; |
| 594 | /// Bonus to be applied when percentage of vector instructions in callee is |
| 595 | /// high (see more details in updateThreshold). |
| 596 | int VectorBonus = 0; |
| 597 | /// Bonus to be applied when the callee has only one reachable basic block. |
| 598 | int SingleBBBonus = 0; |
| 599 | |
| 600 | /// Tunable parameters that control the analysis. |
| 601 | const InlineParams &Params; |
| 602 | |
| 603 | // This DenseMap stores the delta change in cost and threshold after |
| 604 | // accounting for the given instruction. The map is filled only with the |
| 605 | // flag PrintInstructionComments on. |
| 606 | DenseMap<const Instruction *, InstructionCostDetail> InstructionCostDetailMap; |
| 607 | |
| 608 | /// Upper bound for the inlining cost. Bonuses are being applied to account |
| 609 | /// for speculative "expected profit" of the inlining decision. |
| 610 | int Threshold = 0; |
| 611 | |
| 612 | /// The amount of StaticBonus applied. |
| 613 | int StaticBonusApplied = 0; |
| 614 | |
| 615 |   /// Attempt to evaluate indirect calls in order to boost the inline cost.
| 616 | const bool BoostIndirectCalls; |
| 617 | |
| 618 | /// Ignore the threshold when finalizing analysis. |
| 619 | const bool IgnoreThreshold; |
| 620 | |
| 621 | // True if the cost-benefit-analysis-based inliner is enabled. |
| 622 | const bool CostBenefitAnalysisEnabled; |
| 623 | |
| 624 | /// Inlining cost measured in abstract units, accounts for all the |
| 625 | /// instructions expected to be executed for a given function invocation. |
| 626 | /// Instructions that are statically proven to be dead based on call-site |
| 627 | /// arguments are not counted here. |
| 628 | int Cost = 0; |
| 629 | |
| 630 | // The cumulative cost at the beginning of the basic block being analyzed. At |
| 631 | // the end of analyzing each basic block, "Cost - CostAtBBStart" represents |
| 632 | // the size of that basic block. |
| 633 | int CostAtBBStart = 0; |
| 634 | |
| 635 | // The static size of live but cold basic blocks. This is "static" in the |
| 636 | // sense that it's not weighted by profile counts at all. |
| 637 | int ColdSize = 0; |
| 638 | |
| 639 | // Whether inlining is decided by cost-threshold analysis. |
| 640 | bool DecidedByCostThreshold = false; |
| 641 | |
| 642 | // Whether inlining is decided by cost-benefit analysis. |
| 643 | bool DecidedByCostBenefit = false; |
| 644 | |
| 645 | // The cost-benefit pair computed by cost-benefit analysis. |
| 646 | std::optional<CostBenefitPair> CostBenefit; |
| 647 | |
| 648 | bool SingleBB = true; |
| 649 | |
| 650 | unsigned SROACostSavings = 0; |
| 651 | unsigned SROACostSavingsLost = 0; |
| 652 | |
| 653 | /// The mapping of caller Alloca values to their accumulated cost savings. If |
| 654 | /// we have to disable SROA for one of the allocas, this tells us how much |
| 655 | /// cost must be added. |
| 656 | DenseMap<AllocaInst *, int> SROAArgCosts; |
| 657 | |
| 658 | /// Return true if \p Call is a cold callsite. |
| 659 | bool isColdCallSite(CallBase &Call, BlockFrequencyInfo *CallerBFI); |
| 660 | |
| 661 | /// Update Threshold based on callsite properties such as callee |
| 662 | /// attributes and callee hotness for PGO builds. The Callee is explicitly |
| 663 | /// passed to support analyzing indirect calls whose target is inferred by |
| 664 | /// analysis. |
| 665 | void updateThreshold(CallBase &Call, Function &Callee); |
| 666 | /// Return a higher threshold if \p Call is a hot callsite. |
| 667 | std::optional<int> getHotCallSiteThreshold(CallBase &Call, |
| 668 | BlockFrequencyInfo *CallerBFI); |
| 669 | |
| 670 | /// Handle a capped 'int' increment for Cost. |
| 671 | void addCost(int64_t Inc) { |
| 672 | Inc = std::clamp<int64_t>(val: Inc, INT_MIN, INT_MAX); |
| 673 | Cost = std::clamp<int64_t>(val: Inc + Cost, INT_MIN, INT_MAX); |
| 674 | } |
| 675 | |
| 676 | void onDisableSROA(AllocaInst *Arg) override { |
| 677 | auto CostIt = SROAArgCosts.find(Val: Arg); |
| 678 | if (CostIt == SROAArgCosts.end()) |
| 679 | return; |
| 680 | addCost(Inc: CostIt->second); |
| 681 | SROACostSavings -= CostIt->second; |
| 682 | SROACostSavingsLost += CostIt->second; |
| 683 | SROAArgCosts.erase(I: CostIt); |
| 684 | } |
| 685 | |
| 686 | void onDisableLoadElimination() override { |
| 687 | addCost(Inc: LoadEliminationCost); |
| 688 | LoadEliminationCost = 0; |
| 689 | } |
| 690 | |
| 691 | bool onCallBaseVisitStart(CallBase &Call) override { |
| 692 | if (std::optional<int> AttrCallThresholdBonus = |
| 693 | getStringFnAttrAsInt(CB&: Call, AttrKind: "call-threshold-bonus" )) |
| 694 | Threshold += *AttrCallThresholdBonus; |
| 695 | |
| 696 | if (std::optional<int> AttrCallCost = |
| 697 | getStringFnAttrAsInt(CB&: Call, AttrKind: "call-inline-cost" )) { |
| 698 | addCost(Inc: *AttrCallCost); |
| 699 | // Prevent further processing of the call since we want to override its |
| 700 | // inline cost, not just add to it. |
| 701 | return false; |
| 702 | } |
| 703 | return true; |
| 704 | } |
| 705 | |
| 706 | void onCallPenalty() override { addCost(Inc: CallPenalty); } |
| 707 | |
| 708 | void onMemAccess() override { addCost(Inc: MemAccessCost); } |
| 709 | |
| 710 | void onCallArgumentSetup(const CallBase &Call) override { |
| 711 | // Pay the price of the argument setup. We account for the average 1 |
| 712 | // instruction per call argument setup here. |
| 713 | addCost(Inc: Call.arg_size() * InstrCost); |
| 714 | } |
| 715 | void onLoadRelativeIntrinsic() override { |
| 716 | // This is normally lowered to 4 LLVM instructions. |
| 717 | addCost(Inc: 3 * InstrCost); |
| 718 | } |
| 719 | void onLoweredCall(Function *F, CallBase &Call, |
| 720 | bool IsIndirectCall) override { |
| 721 | // We account for the average 1 instruction per call argument setup here. |
| 722 | addCost(Inc: Call.arg_size() * InstrCost); |
| 723 | |
| 724 | // If we have a constant that we are calling as a function, we can peer |
| 725 | // through it and see the function target. This happens not infrequently |
| 726 | // during devirtualization and so we want to give it a hefty bonus for |
| 727 | // inlining, but cap that bonus in the event that inlining wouldn't pan out. |
| 728 | // Pretend to inline the function, with a custom threshold. |
| 729 | if (IsIndirectCall && BoostIndirectCalls) { |
| 730 | auto IndirectCallParams = Params; |
| 731 | IndirectCallParams.DefaultThreshold = |
| 732 | InlineConstants::IndirectCallThreshold; |
| 733 | /// FIXME: if InlineCostCallAnalyzer is derived from, this may need |
| 734 | /// to instantiate the derived class. |
| 735 | InlineCostCallAnalyzer CA(*F, Call, IndirectCallParams, TTI, |
| 736 | GetAssumptionCache, GetBFI, GetTLI, PSI, ORE, |
| 737 | false); |
| 738 | if (CA.analyze().isSuccess()) { |
| 739 | // We were able to inline the indirect call! Subtract the cost from the |
| 740 | // threshold to get the bonus we want to apply, but don't go below zero. |
| 741 | Cost -= std::max(a: 0, b: CA.getThreshold() - CA.getCost()); |
| 742 | } |
| 743 | } else |
| 744 | // Otherwise simply add the cost for merely making the call. |
| 745 | addCost(Inc: TTI.getInlineCallPenalty(F: CandidateCall.getCaller(), Call, |
| 746 | DefaultCallPenalty: CallPenalty)); |
| 747 | } |
| 748 | |
| 749 | void onFinalizeSwitch(unsigned JumpTableSize, unsigned NumCaseCluster, |
| 750 | bool DefaultDestUnreachable) override { |
| 751 | // If suitable for a jump table, consider the cost for the table size and |
| 752 | // branch to destination. |
| 753 | // Maximum valid cost increased in this function. |
| 754 | if (JumpTableSize) { |
| 755 | // Suppose a default branch includes one compare and one conditional |
| 756 | // branch if it's reachable. |
| 757 | if (!DefaultDestUnreachable) |
| 758 | addCost(Inc: 2 * InstrCost); |
| 759 | // Suppose a jump table requires one load and one jump instruction. |
| 760 | int64_t JTCost = |
| 761 | static_cast<int64_t>(JumpTableSize) * InstrCost + 2 * InstrCost; |
| 762 | addCost(Inc: JTCost); |
| 763 | return; |
| 764 | } |
| 765 | |
| 766 | if (NumCaseCluster <= 3) { |
| 767 | // Suppose a comparison includes one compare and one conditional branch. |
| 768 |       // We can save one compare and one conditional branch if the default
| 769 |       // destination is unreachable.
| 770 | addCost(Inc: (NumCaseCluster - DefaultDestUnreachable) * 2 * InstrCost); |
| 771 | return; |
| 772 | } |
| 773 | |
| 774 | int64_t ExpectedNumberOfCompare = |
| 775 | getExpectedNumberOfCompare(NumCaseCluster); |
| 776 | int64_t SwitchCost = ExpectedNumberOfCompare * 2 * InstrCost; |
| 777 | |
| 778 | addCost(Inc: SwitchCost); |
| 779 | } |
| 780 | void onMissedSimplification() override { addCost(Inc: InstrCost); } |
| 781 | |
| 782 | void onInitializeSROAArg(AllocaInst *Arg) override { |
| 783 | assert(Arg != nullptr && |
| 784 | "Should not initialize SROA costs for null value." ); |
| 785 | auto SROAArgCost = TTI.getCallerAllocaCost(CB: &CandidateCall, AI: Arg); |
| 786 | SROACostSavings += SROAArgCost; |
| 787 | SROAArgCosts[Arg] = SROAArgCost; |
| 788 | } |
| 789 | |
| 790 | void onAggregateSROAUse(AllocaInst *SROAArg) override { |
| 791 | auto CostIt = SROAArgCosts.find(Val: SROAArg); |
| 792 | assert(CostIt != SROAArgCosts.end() && |
| 793 | "expected this argument to have a cost" ); |
| 794 | CostIt->second += InstrCost; |
| 795 | SROACostSavings += InstrCost; |
| 796 | } |
| 797 | |
| 798 | void onBlockStart(const BasicBlock *BB) override { CostAtBBStart = Cost; } |
| 799 | |
| 800 | void onBlockAnalyzed(const BasicBlock *BB) override { |
| 801 | if (CostBenefitAnalysisEnabled) { |
| 802 | // Keep track of the static size of live but cold basic blocks. For now, |
| 803 | // we define a cold basic block to be one that's never executed. |
| 804 | assert(GetBFI && "GetBFI must be available" ); |
| 805 | BlockFrequencyInfo *BFI = &(GetBFI(F)); |
| 806 | assert(BFI && "BFI must be available" ); |
| 807 | auto ProfileCount = BFI->getBlockProfileCount(BB); |
| 808 | if (*ProfileCount == 0) |
| 809 | ColdSize += Cost - CostAtBBStart; |
| 810 | } |
| 811 | |
| 812 | auto *TI = BB->getTerminator(); |
| 813 |     // If we had any successors at this point, then post-inlining is likely to
| 814 | // have them as well. Note that we assume any basic blocks which existed |
| 815 | // due to branches or switches which folded above will also fold after |
| 816 | // inlining. |
| 817 | if (SingleBB && TI->getNumSuccessors() > 1) { |
| 818 | // Take off the bonus we applied to the threshold. |
| 819 | Threshold -= SingleBBBonus; |
| 820 | SingleBB = false; |
| 821 | } |
| 822 | } |
| 823 | |
| 824 | void onInstructionAnalysisStart(const Instruction *I) override { |
| 825 | // This function is called to store the initial cost of inlining before |
| 826 | // the given instruction was assessed. |
| 827 | if (!PrintInstructionComments) |
| 828 | return; |
| 829 | auto &CostDetail = InstructionCostDetailMap[I]; |
| 830 | CostDetail.CostBefore = Cost; |
| 831 | CostDetail.ThresholdBefore = Threshold; |
| 832 | } |
| 833 | |
| 834 | void onInstructionAnalysisFinish(const Instruction *I) override { |
| 835 | // This function is called to find new values of cost and threshold after |
| 836 | // the instruction has been assessed. |
| 837 | if (!PrintInstructionComments) |
| 838 | return; |
| 839 | auto &CostDetail = InstructionCostDetailMap[I]; |
| 840 | CostDetail.CostAfter = Cost; |
| 841 | CostDetail.ThresholdAfter = Threshold; |
| 842 | } |
| 843 | |
| 844 | bool isCostBenefitAnalysisEnabled() { |
| 845 | if (!PSI || !PSI->hasProfileSummary()) |
| 846 | return false; |
| 847 | |
| 848 | if (!GetBFI) |
| 849 | return false; |
| 850 | |
| 851 | if (InlineEnableCostBenefitAnalysis.getNumOccurrences()) { |
| 852 | // Honor the explicit request from the user. |
| 853 | if (!InlineEnableCostBenefitAnalysis) |
| 854 | return false; |
| 855 | } else { |
| 856 | // Otherwise, require instrumentation profile. |
| 857 | if (!PSI->hasInstrumentationProfile()) |
| 858 | return false; |
| 859 | } |
| 860 | |
| 861 | auto *Caller = CandidateCall.getParent()->getParent(); |
| 862 | if (!Caller->getEntryCount()) |
| 863 | return false; |
| 864 | |
| 865 | BlockFrequencyInfo *CallerBFI = &(GetBFI(*Caller)); |
| 866 | if (!CallerBFI) |
| 867 | return false; |
| 868 | |
| 869 | // For now, limit to hot call site. |
| 870 | if (!PSI->isHotCallSite(CB: CandidateCall, BFI: CallerBFI)) |
| 871 | return false; |
| 872 | |
| 873 | // Make sure we have a nonzero entry count. |
| 874 | auto EntryCount = F.getEntryCount(); |
| 875 | if (!EntryCount || !EntryCount->getCount()) |
| 876 | return false; |
| 877 | |
| 878 | BlockFrequencyInfo *CalleeBFI = &(GetBFI(F)); |
| 879 | if (!CalleeBFI) |
| 880 | return false; |
| 881 | |
| 882 | return true; |
| 883 | } |
| 884 | |
| 885 | // A helper function to choose between command line override and default. |
| 886 | unsigned getInliningCostBenefitAnalysisSavingsMultiplier() const { |
| 887 | if (InlineSavingsMultiplier.getNumOccurrences()) |
| 888 | return InlineSavingsMultiplier; |
| 889 | return TTI.getInliningCostBenefitAnalysisSavingsMultiplier(); |
| 890 | } |
| 891 | |
| 892 | // A helper function to choose between command line override and default. |
| 893 | unsigned getInliningCostBenefitAnalysisProfitableMultiplier() const { |
| 894 | if (InlineSavingsProfitableMultiplier.getNumOccurrences()) |
| 895 | return InlineSavingsProfitableMultiplier; |
| 896 | return TTI.getInliningCostBenefitAnalysisProfitableMultiplier(); |
| 897 | } |
| 898 | |
| 899 | void OverrideCycleSavingsAndSizeForTesting(APInt &CycleSavings, int &Size) { |
| 900 | if (std::optional<int> AttrCycleSavings = getStringFnAttrAsInt( |
| 901 | CB&: CandidateCall, AttrKind: "inline-cycle-savings-for-test" )) { |
| 902 | CycleSavings = *AttrCycleSavings; |
| 903 | } |
| 904 | |
| 905 | if (std::optional<int> AttrRuntimeCost = getStringFnAttrAsInt( |
| 906 | CB&: CandidateCall, AttrKind: "inline-runtime-cost-for-test" )) { |
| 907 | Size = *AttrRuntimeCost; |
| 908 | } |
| 909 | } |
| 910 | |
| 911 | // Determine whether we should inline the given call site, taking into account |
| 912 | // both the size cost and the cycle savings. Return std::nullopt if we don't |
| 913 | // have sufficient profiling information to determine. |
| 914 | std::optional<bool> costBenefitAnalysis() { |
| 915 | if (!CostBenefitAnalysisEnabled) |
| 916 | return std::nullopt; |
| 917 | |
| 918 | // buildInlinerPipeline in the pass builder sets HotCallSiteThreshold to 0 |
| 919 | // for the prelink phase of the AutoFDO + ThinLTO build. Honor the logic by |
| 920 | // falling back to the cost-based metric. |
| 921 | // TODO: Improve this hacky condition. |
| 922 | if (Threshold == 0) |
| 923 | return std::nullopt; |
| 924 | |
| 925 | assert(GetBFI); |
| 926 | BlockFrequencyInfo *CalleeBFI = &(GetBFI(F)); |
| 927 | assert(CalleeBFI); |
| 928 | |
| 929 | // The cycle savings expressed as the sum of InstrCost |
| 930 | // multiplied by the estimated dynamic count of each instruction we can |
| 931 | // avoid. Savings come from the call site cost, such as argument setup and |
| 932 | // the call instruction, as well as the instructions that are folded. |
| 933 | // |
| 934 |     // We use 128-bit APInt here to avoid potential overflow. This variable
| 935 |     // should stay well below 10^24 (or 2^80) in practice. This "worst" case
| 936 |     // assumes that we can avoid or fold a billion instructions, each with a
| 937 |     // profile count of 10^15 -- roughly the number of cycles for a 24-hour
| 938 |     // period on a 4GHz machine.
| 939 | APInt CycleSavings(128, 0); |
| 940 | |
| 941 | for (auto &BB : F) { |
| 942 | APInt CurrentSavings(128, 0); |
| 943 | for (auto &I : BB) { |
| 944 | if (BranchInst *BI = dyn_cast<BranchInst>(Val: &I)) { |
| 945 | // Count a conditional branch as savings if it becomes unconditional. |
| 946 | if (BI->isConditional() && |
| 947 | getSimplifiedValue<ConstantInt>(V: BI->getCondition())) { |
| 948 | CurrentSavings += InstrCost; |
| 949 | } |
| 950 | } else if (SwitchInst *SI = dyn_cast<SwitchInst>(Val: &I)) { |
| 951 | if (getSimplifiedValue<ConstantInt>(V: SI->getCondition())) |
| 952 | CurrentSavings += InstrCost; |
| 953 | } else if (Value *V = dyn_cast<Value>(Val: &I)) { |
| 954 | // Count an instruction as savings if we can fold it. |
| 955 | if (SimplifiedValues.count(Val: V)) { |
| 956 | CurrentSavings += InstrCost; |
| 957 | } |
| 958 | } |
| 959 | } |
| 960 | |
| 961 | auto ProfileCount = CalleeBFI->getBlockProfileCount(BB: &BB); |
| 962 | CurrentSavings *= *ProfileCount; |
| 963 | CycleSavings += CurrentSavings; |
| 964 | } |
| 965 | |
| 966 | // Compute the cycle savings per call. |
| 967 | auto EntryProfileCount = F.getEntryCount(); |
| 968 | assert(EntryProfileCount && EntryProfileCount->getCount()); |
| 969 | auto EntryCount = EntryProfileCount->getCount(); |
| 970 | CycleSavings += EntryCount / 2; |
| 971 | CycleSavings = CycleSavings.udiv(RHS: EntryCount); |
| 972 | |
| 973 | // Compute the total savings for the call site. |
| 974 | auto *CallerBB = CandidateCall.getParent(); |
| 975 | BlockFrequencyInfo *CallerBFI = &(GetBFI(*(CallerBB->getParent()))); |
| 976 | CycleSavings += getCallsiteCost(TTI, Call: this->CandidateCall, DL); |
| 977 | CycleSavings *= *CallerBFI->getBlockProfileCount(BB: CallerBB); |
| 978 | |
| 979 | // Remove the cost of the cold basic blocks to model the runtime cost more |
| 980 | // accurately. Both machine block placement and function splitting could |
| 981 | // place cold blocks further from hot blocks. |
| 982 | int Size = Cost - ColdSize; |
| 983 | |
| 984 | // Allow tiny callees to be inlined regardless of whether they meet the |
| 985 | // savings threshold. |
| 986 | Size = Size > InlineSizeAllowance ? Size - InlineSizeAllowance : 1; |
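|  |     // For instance, with the default allowance of 100, a callee of size 120 is
|  |     // treated as size 20 here, and a callee of size 80 is treated as size 1.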
| 987 | |
| 988 | OverrideCycleSavingsAndSizeForTesting(CycleSavings, Size); |
| 989 | CostBenefit.emplace(args: APInt(128, Size), args&: CycleSavings); |
| 990 | |
| 991 | // Let R be the ratio of CycleSavings to Size. We accept the inlining |
| 992 | // opportunity if R is really high and reject if R is really low. If R is |
| 993 | // somewhere in the middle, we fall back to the cost-based analysis. |
| 994 | // |
| 995 | // Specifically, let R = CycleSavings / Size, we accept the inlining |
| 996 | // opportunity if: |
| 997 | // |
| 998 | // PSI->getOrCompHotCountThreshold() |
| 999 | // R > ------------------------------------------------- |
| 1000 | // getInliningCostBenefitAnalysisSavingsMultiplier() |
| 1001 | // |
| 1002 | // and reject the inlining opportunity if: |
| 1003 | // |
| 1004 | // PSI->getOrCompHotCountThreshold() |
| 1005 | // R <= ---------------------------------------------------- |
| 1006 | // getInliningCostBenefitAnalysisProfitableMultiplier() |
| 1007 | // |
| 1008 | // Otherwise, we fall back to the cost-based analysis. |
| 1009 | // |
| 1010 | // Implementation-wise, use multiplication (CycleSavings * Multiplier, |
| 1011 | // HotCountThreshold * Size) rather than division to avoid precision loss. |
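|  |     //
|  |     // As an illustration (hypothetical numbers): with Size = 100, CycleSavings =
|  |     // 2,000,000, a savings multiplier of 8 and a hot count threshold of 100,000,
|  |     // we compare 2,000,000 * 8 = 16,000,000 against 100,000 * 100 = 10,000,000;
|  |     // the savings side is larger, so the call site is accepted without falling
|  |     // back to the cost-based analysis.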
| 1012 | APInt Threshold(128, PSI->getOrCompHotCountThreshold()); |
| 1013 | Threshold *= Size; |
| 1014 | |
| 1015 | APInt UpperBoundCycleSavings = CycleSavings; |
| 1016 | UpperBoundCycleSavings *= getInliningCostBenefitAnalysisSavingsMultiplier(); |
| 1017 | if (UpperBoundCycleSavings.uge(RHS: Threshold)) |
| 1018 | return true; |
| 1019 | |
| 1020 | APInt LowerBoundCycleSavings = CycleSavings; |
| 1021 | LowerBoundCycleSavings *= |
| 1022 | getInliningCostBenefitAnalysisProfitableMultiplier(); |
| 1023 | if (LowerBoundCycleSavings.ult(RHS: Threshold)) |
| 1024 | return false; |
| 1025 | |
| 1026 | // Otherwise, fall back to the cost-based analysis. |
| 1027 | return std::nullopt; |
| 1028 | } |
| 1029 | |
| 1030 | InlineResult finalizeAnalysis() override { |
| 1031 | // Loops generally act a lot like calls in that they act like barriers to |
| 1032 | // movement, require a certain amount of setup, etc. So when optimising for |
| 1033 | // size, we penalise any call sites that perform loops. We do this after all |
| 1034 | // other costs here, so will likely only be dealing with relatively small |
| 1035 | // functions (and hence DT and LI will hopefully be cheap). |
| 1036 | auto *Caller = CandidateCall.getFunction(); |
| 1037 | if (Caller->hasMinSize()) { |
| 1038 | DominatorTree DT(F); |
| 1039 | LoopInfo LI(DT); |
| 1040 | int NumLoops = 0; |
| 1041 | for (Loop *L : LI) { |
| 1042 | // Ignore loops that will not be executed |
| 1043 | if (DeadBlocks.count(Ptr: L->getHeader())) |
| 1044 | continue; |
| 1045 | NumLoops++; |
| 1046 | } |
| 1047 | addCost(Inc: NumLoops * InlineConstants::LoopPenalty); |
| 1048 | } |
| 1049 | |
| 1050 | // We applied the maximum possible vector bonus at the beginning. Now, |
| 1051 | // subtract the excess bonus, if any, from the Threshold before |
| 1052 | // comparing against Cost. |
| 1053 | if (NumVectorInstructions <= NumInstructions / 10) |
| 1054 | Threshold -= VectorBonus; |
| 1055 | else if (NumVectorInstructions <= NumInstructions / 2) |
| 1056 | Threshold -= VectorBonus / 2; |
| 1057 | |
| 1058 | if (std::optional<int> AttrCost = |
| 1059 | getStringFnAttrAsInt(CB&: CandidateCall, AttrKind: "function-inline-cost" )) |
| 1060 | Cost = *AttrCost; |
| 1061 | |
| 1062 | if (std::optional<int> AttrCostMult = getStringFnAttrAsInt( |
| 1063 | CB&: CandidateCall, |
| 1064 | AttrKind: InlineConstants::FunctionInlineCostMultiplierAttributeName)) |
| 1065 | Cost *= *AttrCostMult; |
| 1066 | |
| 1067 | if (std::optional<int> AttrThreshold = |
| 1068 | getStringFnAttrAsInt(CB&: CandidateCall, AttrKind: "function-inline-threshold" )) |
| 1069 | Threshold = *AttrThreshold; |
| 1070 | |
| 1071 | if (auto Result = costBenefitAnalysis()) { |
| 1072 | DecidedByCostBenefit = true; |
| 1073 | if (*Result) |
| 1074 | return InlineResult::success(); |
| 1075 | else |
| 1076 | return InlineResult::failure(Reason: "Cost over threshold." ); |
| 1077 | } |
| 1078 | |
| 1079 | if (IgnoreThreshold) |
| 1080 | return InlineResult::success(); |
| 1081 | |
| 1082 | DecidedByCostThreshold = true; |
| 1083 | return Cost < std::max(a: 1, b: Threshold) |
| 1084 | ? InlineResult::success() |
| 1085 | : InlineResult::failure(Reason: "Cost over threshold." ); |
| 1086 | } |
| 1087 | |
| 1088 | bool shouldStop() override { |
| 1089 | if (IgnoreThreshold || ComputeFullInlineCost) |
| 1090 | return false; |
| 1091 | // Bail out the moment we cross the threshold. This means we'll under-count |
| 1092 | // the cost, but only when undercounting doesn't matter. |
| 1093 | if (Cost < Threshold) |
| 1094 | return false; |
| 1095 | DecidedByCostThreshold = true; |
| 1096 | return true; |
| 1097 | } |
| 1098 | |
| 1099 | void onLoadEliminationOpportunity() override { |
| 1100 | LoadEliminationCost += InstrCost; |
| 1101 | } |
| 1102 | |
| 1103 | InlineResult onAnalysisStart() override { |
| 1104 | // Perform some tweaks to the cost and threshold based on the direct |
| 1105 | // callsite information. |
| 1106 | |
| 1107 | // We want to more aggressively inline vector-dense kernels, so up the |
| 1108 | // threshold, and we'll lower it if the % of vector instructions gets too |
| 1109 |     // low. Note that these bonuses are somewhat arbitrary and evolved over
| 1110 | // time by accident as much as because they are principled bonuses. |
| 1111 | // |
| 1112 | // FIXME: It would be nice to remove all such bonuses. At least it would be |
| 1113 | // nice to base the bonus values on something more scientific. |
| 1114 | assert(NumInstructions == 0); |
| 1115 | assert(NumVectorInstructions == 0); |
| 1116 | |
| 1117 | // Update the threshold based on callsite properties |
| 1118 | updateThreshold(Call&: CandidateCall, Callee&: F); |
| 1119 | |
| 1120 | // While Threshold depends on commandline options that can take negative |
| 1121 | // values, we want to enforce the invariant that the computed threshold and |
| 1122 | // bonuses are non-negative. |
| 1123 | assert(Threshold >= 0); |
| 1124 | assert(SingleBBBonus >= 0); |
| 1125 | assert(VectorBonus >= 0); |
| 1126 | |
| 1127 | // Speculatively apply all possible bonuses to Threshold. If cost exceeds |
| 1128 | // this Threshold any time, and cost cannot decrease, we can stop processing |
| 1129 | // the rest of the function body. |
| 1130 | Threshold += (SingleBBBonus + VectorBonus); |
| 1131 | |
| 1132 | // Give out bonuses for the callsite, as the instructions setting them up |
| 1133 | // will be gone after inlining. |
| 1134 | addCost(Inc: -getCallsiteCost(TTI, Call: this->CandidateCall, DL)); |
| 1135 | |
| 1136 | // If this function uses the coldcc calling convention, prefer not to inline |
| 1137 | // it. |
| 1138 | if (F.getCallingConv() == CallingConv::Cold) |
| 1139 | Cost += InlineConstants::ColdccPenalty; |
| 1140 | |
| 1141 | LLVM_DEBUG(dbgs() << " Initial cost: " << Cost << "\n" ); |
| 1142 | |
| 1143 | // Check if we're done. This can happen due to bonuses and penalties. |
| 1144 | if (Cost >= Threshold && !ComputeFullInlineCost) |
| 1145 | return InlineResult::failure(Reason: "high cost" ); |
| 1146 | |
| 1147 | return InlineResult::success(); |
| 1148 | } |
| 1149 | |
| 1150 | public: |
| 1151 | InlineCostCallAnalyzer( |
| 1152 | Function &Callee, CallBase &Call, const InlineParams &Params, |
| 1153 | const TargetTransformInfo &TTI, |
| 1154 | function_ref<AssumptionCache &(Function &)> GetAssumptionCache, |
| 1155 | function_ref<BlockFrequencyInfo &(Function &)> GetBFI = nullptr, |
| 1156 | function_ref<const TargetLibraryInfo &(Function &)> GetTLI = nullptr, |
| 1157 | ProfileSummaryInfo *PSI = nullptr, |
| 1158 | OptimizationRemarkEmitter *ORE = nullptr, bool BoostIndirect = true, |
| 1159 | bool IgnoreThreshold = false, |
| 1160 | function_ref<EphemeralValuesCache &(Function &)> GetEphValuesCache = |
| 1161 | nullptr) |
| 1162 | : CallAnalyzer(Callee, Call, TTI, GetAssumptionCache, GetBFI, GetTLI, PSI, |
| 1163 | ORE, GetEphValuesCache), |
| 1164 | ComputeFullInlineCost(OptComputeFullInlineCost || |
| 1165 | Params.ComputeFullInlineCost || ORE || |
| 1166 | isCostBenefitAnalysisEnabled()), |
| 1167 | Params(Params), Threshold(Params.DefaultThreshold), |
| 1168 | BoostIndirectCalls(BoostIndirect), IgnoreThreshold(IgnoreThreshold), |
| 1169 | CostBenefitAnalysisEnabled(isCostBenefitAnalysisEnabled()), |
| 1170 | Writer(this) { |
| 1171 | AllowRecursiveCall = *Params.AllowRecursiveCall; |
| 1172 | } |
| 1173 | |
| 1174 | /// Annotation Writer for instruction details |
| 1175 | InlineCostAnnotationWriter Writer; |
| 1176 | |
| 1177 | void dump(); |
| 1178 | |
| 1179 | // Prints the same analysis as dump(), but its definition is not dependent |
| 1180 | // on the build. |
| 1181 | void print(raw_ostream &OS); |
| 1182 | |
| 1183 | std::optional<InstructionCostDetail> getCostDetails(const Instruction *I) { |
| 1184 | auto It = InstructionCostDetailMap.find(Val: I); |
| 1185 | if (It != InstructionCostDetailMap.end()) |
| 1186 | return It->second; |
| 1187 | return std::nullopt; |
| 1188 | } |
| 1189 | |
| 1190 | virtual ~InlineCostCallAnalyzer() = default; |
| 1191 | int getThreshold() const { return Threshold; } |
| 1192 | int getCost() const { return Cost; } |
| 1193 | int getStaticBonusApplied() const { return StaticBonusApplied; } |
| 1194 | std::optional<CostBenefitPair> getCostBenefitPair() { return CostBenefit; } |
| 1195 | bool wasDecidedByCostBenefit() const { return DecidedByCostBenefit; } |
| 1196 | bool wasDecidedByCostThreshold() const { return DecidedByCostThreshold; } |
| 1197 | }; |
| 1198 | |
| 1199 | // Return true if CB is the sole call to local function Callee. |
| 1200 | static bool isSoleCallToLocalFunction(const CallBase &CB, |
| 1201 | const Function &Callee) { |
| 1202 | return Callee.hasLocalLinkage() && Callee.hasOneLiveUse() && |
| 1203 | &Callee == CB.getCalledFunction(); |
| 1204 | } |
| 1205 | |
| 1206 | class InlineCostFeaturesAnalyzer final : public CallAnalyzer { |
| 1207 | private: |
| 1208 | InlineCostFeatures Cost = {}; |
| 1209 | |
| 1210 | // FIXME: These constants are taken from the heuristic-based cost visitor. |
| 1211 | // These should be removed entirely in a later revision to avoid reliance on |
| 1212 | // heuristics in the ML inliner. |
| 1213 | static constexpr int JTCostMultiplier = 2; |
| 1214 | static constexpr int CaseClusterCostMultiplier = 2; |
| 1215 | static constexpr int SwitchDefaultDestCostMultiplier = 2; |
| 1216 | static constexpr int SwitchCostMultiplier = 2; |
| 1217 | |
| 1218 | // FIXME: These are taken from the heuristic-based cost visitor: we should |
| 1219 | // eventually abstract these to the CallAnalyzer to avoid duplication. |
| 1220 | unsigned SROACostSavingOpportunities = 0; |
| 1221 | int VectorBonus = 0; |
| 1222 | int SingleBBBonus = 0; |
| 1223 | int Threshold = 5; |
| 1224 | |
| 1225 | DenseMap<AllocaInst *, unsigned> SROACosts; |
| 1226 | |
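|      | // Helpers to bump or overwrite a single entry of the feature vector.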
| 1227 | void increment(InlineCostFeatureIndex Feature, int64_t Delta = 1) { |
| 1228 | Cost[static_cast<size_t>(Feature)] += Delta; |
| 1229 | } |
| 1230 | |
| 1231 | void set(InlineCostFeatureIndex Feature, int64_t Value) { |
| 1232 | Cost[static_cast<size_t>(Feature)] = Value; |
| 1233 | } |
| 1234 | |
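|      | // When SROA is disabled for an alloca, move its accumulated savings into
|      | // the sroa_losses feature and drop them from the running savings total.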
| 1235 | void onDisableSROA(AllocaInst *Arg) override { |
| 1236 | auto CostIt = SROACosts.find(Val: Arg); |
| 1237 | if (CostIt == SROACosts.end()) |
| 1238 | return; |
| 1239 | |
| 1240 | increment(Feature: InlineCostFeatureIndex::sroa_losses, Delta: CostIt->second); |
| 1241 | SROACostSavingOpportunities -= CostIt->second; |
| 1242 | SROACosts.erase(I: CostIt); |
| 1243 | } |
| 1244 | |
| 1245 | void onDisableLoadElimination() override { |
| 1246 | set(Feature: InlineCostFeatureIndex::load_elimination, Value: 1); |
| 1247 | } |
| 1248 | |
| 1249 | void onCallPenalty() override { |
| 1250 | increment(Feature: InlineCostFeatureIndex::call_penalty, Delta: CallPenalty); |
| 1251 | } |
| 1252 | |
| 1253 | void onCallArgumentSetup(const CallBase &Call) override { |
| 1254 | increment(Feature: InlineCostFeatureIndex::call_argument_setup, |
| 1255 | Delta: Call.arg_size() * InstrCost); |
| 1256 | } |
| 1257 | |
| 1258 | void onLoadRelativeIntrinsic() override { |
| 1259 | increment(Feature: InlineCostFeatureIndex::load_relative_intrinsic, Delta: 3 * InstrCost); |
| 1260 | } |
| 1261 | |
| 1262 | void onLoweredCall(Function *F, CallBase &Call, |
| 1263 | bool IsIndirectCall) override { |
| 1264 | increment(Feature: InlineCostFeatureIndex::lowered_call_arg_setup, |
| 1265 | Delta: Call.arg_size() * InstrCost); |
| 1266 | |
| 1267 | if (IsIndirectCall) { |
| 1268 | InlineParams IndirectCallParams = {/* DefaultThreshold*/ 0, |
| 1269 | /*HintThreshold*/ {}, |
| 1270 | /*ColdThreshold*/ {}, |
| 1271 | /*OptSizeThreshold*/ {}, |
| 1272 | /*OptMinSizeThreshold*/ {}, |
| 1273 | /*HotCallSiteThreshold*/ {}, |
| 1274 | /*LocallyHotCallSiteThreshold*/ {}, |
| 1275 | /*ColdCallSiteThreshold*/ {}, |
| 1276 | /*ComputeFullInlineCost*/ true, |
| 1277 | /*EnableDeferral*/ true}; |
| 1278 | IndirectCallParams.DefaultThreshold = |
| 1279 | InlineConstants::IndirectCallThreshold; |
| 1280 | |
| 1281 | InlineCostCallAnalyzer CA(*F, Call, IndirectCallParams, TTI, |
| 1282 | GetAssumptionCache, GetBFI, GetTLI, PSI, ORE, |
| 1283 | /*BoostIndirect=*/false, /*IgnoreThreshold=*/true);
| 1284 | if (CA.analyze().isSuccess()) { |
| 1285 | increment(Feature: InlineCostFeatureIndex::nested_inline_cost_estimate, |
| 1286 | Delta: CA.getCost()); |
| 1287 | increment(Feature: InlineCostFeatureIndex::nested_inlines, Delta: 1); |
| 1288 | } |
| 1289 | } else { |
| 1290 | onCallPenalty(); |
| 1291 | } |
| 1292 | } |
| 1293 | |
| 1294 | void onFinalizeSwitch(unsigned JumpTableSize, unsigned NumCaseCluster, |
| 1295 | bool DefaultDestUnreachable) override { |
| 1296 | if (JumpTableSize) { |
| 1297 | if (!DefaultDestUnreachable) |
| 1298 | increment(Feature: InlineCostFeatureIndex::switch_default_dest_penalty, |
| 1299 | Delta: SwitchDefaultDestCostMultiplier * InstrCost); |
| 1300 | int64_t JTCost = static_cast<int64_t>(JumpTableSize) * InstrCost + |
| 1301 | JTCostMultiplier * InstrCost; |
| 1302 | increment(Feature: InlineCostFeatureIndex::jump_table_penalty, Delta: JTCost); |
| 1303 | return; |
| 1304 | } |
| 1305 | |
| 1306 | if (NumCaseCluster <= 3) { |
| 1307 | increment(Feature: InlineCostFeatureIndex::case_cluster_penalty, |
| 1308 | Delta: (NumCaseCluster - DefaultDestUnreachable) * |
| 1309 | CaseClusterCostMultiplier * InstrCost); |
| 1310 | return; |
| 1311 | } |
| 1312 | |
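|      | // Otherwise charge for the expected number of comparisons needed to lower
|      | // the switch.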
| 1313 | int64_t ExpectedNumberOfCompare = |
| 1314 | getExpectedNumberOfCompare(NumCaseCluster); |
| 1315 | |
| 1316 | int64_t SwitchCost = |
| 1317 | ExpectedNumberOfCompare * SwitchCostMultiplier * InstrCost; |
| 1318 | increment(Feature: InlineCostFeatureIndex::switch_penalty, Delta: SwitchCost); |
| 1319 | } |
| 1320 | |
| 1321 | void onMissedSimplification() override { |
| 1322 | increment(Feature: InlineCostFeatureIndex::unsimplified_common_instructions, |
| 1323 | Delta: InstrCost); |
| 1324 | } |
| 1325 | |
| 1326 | void onInitializeSROAArg(AllocaInst *Arg) override { |
| 1327 | auto SROAArgCost = TTI.getCallerAllocaCost(CB: &CandidateCall, AI: Arg); |
| 1328 | SROACosts[Arg] = SROAArgCost; |
| 1329 | SROACostSavingOpportunities += SROAArgCost; |
| 1330 | } |
| 1331 | |
| 1332 | void onAggregateSROAUse(AllocaInst *Arg) override { |
| 1333 | SROACosts.find(Val: Arg)->second += InstrCost; |
| 1334 | SROACostSavingOpportunities += InstrCost; |
| 1335 | } |
| 1336 | |
| 1337 | void onBlockAnalyzed(const BasicBlock *BB) override { |
| 1338 | if (BB->getTerminator()->getNumSuccessors() > 1) |
| 1339 | set(Feature: InlineCostFeatureIndex::is_multiple_blocks, Value: 1); |
| 1340 | Threshold -= SingleBBBonus; |
| 1341 | } |
| 1342 | |
| 1343 | InlineResult finalizeAnalysis() override { |
| 1344 | auto *Caller = CandidateCall.getFunction(); |
| 1345 | if (Caller->hasMinSize()) { |
| 1346 | DominatorTree DT(F); |
| 1347 | LoopInfo LI(DT); |
| 1348 | for (Loop *L : LI) { |
| 1349 | // Ignore loops that will not be executed |
| 1350 | if (DeadBlocks.count(Ptr: L->getHeader())) |
| 1351 | continue; |
| 1352 | increment(Feature: InlineCostFeatureIndex::num_loops, |
| 1353 | Delta: InlineConstants::LoopPenalty); |
| 1354 | } |
| 1355 | } |
| 1356 | set(Feature: InlineCostFeatureIndex::dead_blocks, Value: DeadBlocks.size()); |
| 1357 | set(Feature: InlineCostFeatureIndex::simplified_instructions, |
| 1358 | Value: NumInstructionsSimplified); |
| 1359 | set(Feature: InlineCostFeatureIndex::constant_args, Value: NumConstantArgs); |
| 1360 | set(Feature: InlineCostFeatureIndex::constant_offset_ptr_args, |
| 1361 | Value: NumConstantOffsetPtrArgs); |
| 1362 | set(Feature: InlineCostFeatureIndex::sroa_savings, Value: SROACostSavingOpportunities); |
| 1363 | |
| 1364 | if (NumVectorInstructions <= NumInstructions / 10) |
| 1365 | Threshold -= VectorBonus; |
| 1366 | else if (NumVectorInstructions <= NumInstructions / 2) |
| 1367 | Threshold -= VectorBonus / 2; |
| 1368 | |
| 1369 | set(Feature: InlineCostFeatureIndex::threshold, Value: Threshold); |
| 1370 | |
| 1371 | return InlineResult::success(); |
| 1372 | } |
| 1373 | |
| 1374 | bool shouldStop() override { return false; } |
| 1375 | |
| 1376 | void onLoadEliminationOpportunity() override { |
| 1377 | increment(Feature: InlineCostFeatureIndex::load_elimination, Delta: 1); |
| 1378 | } |
| 1379 | |
| 1380 | InlineResult onAnalysisStart() override { |
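|      | // Seed callsite_cost with the negated cost of the call site itself, since
|      | // inlining removes that overhead.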
| 1381 | increment(Feature: InlineCostFeatureIndex::callsite_cost, |
| 1382 | Delta: -1 * getCallsiteCost(TTI, Call: this->CandidateCall, DL)); |
| 1383 | |
| 1384 | set(Feature: InlineCostFeatureIndex::cold_cc_penalty, |
| 1385 | Value: (F.getCallingConv() == CallingConv::Cold)); |
| 1386 | |
| 1387 | set(Feature: InlineCostFeatureIndex::last_call_to_static_bonus, |
| 1388 | Value: isSoleCallToLocalFunction(CB: CandidateCall, Callee: F)); |
| 1389 | |
| 1390 | // FIXME: we shouldn't repeat this logic in both the Features and Cost |
| 1391 | // analyzer - instead, we should abstract it to a common method in the |
| 1392 | // CallAnalyzer |
| 1393 | int SingleBBBonusPercent = 50; |
| 1394 | int VectorBonusPercent = TTI.getInlinerVectorBonusPercent(); |
| 1395 | Threshold += TTI.adjustInliningThreshold(CB: &CandidateCall); |
| 1396 | Threshold *= TTI.getInliningThresholdMultiplier(); |
| 1397 | SingleBBBonus = Threshold * SingleBBBonusPercent / 100; |
| 1398 | VectorBonus = Threshold * VectorBonusPercent / 100; |
| 1399 | Threshold += (SingleBBBonus + VectorBonus); |
| 1400 | |
| 1401 | return InlineResult::success(); |
| 1402 | } |
| 1403 | |
| 1404 | public: |
| 1405 | InlineCostFeaturesAnalyzer( |
| 1406 | const TargetTransformInfo &TTI, |
| 1407 | function_ref<AssumptionCache &(Function &)> &GetAssumptionCache, |
| 1408 | function_ref<BlockFrequencyInfo &(Function &)> GetBFI, |
| 1409 | function_ref<const TargetLibraryInfo &(Function &)> GetTLI, |
| 1410 | ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE, Function &Callee, |
| 1411 | CallBase &Call) |
| 1412 | : CallAnalyzer(Callee, Call, TTI, GetAssumptionCache, GetBFI, GetTLI, |
| 1413 | PSI) {} |
| 1414 | |
| 1415 | const InlineCostFeatures &features() const { return Cost; } |
| 1416 | }; |
| 1417 | |
| 1418 | } // namespace |
| 1419 | |
| 1420 | /// Test whether the given value is an Alloca-derived function argument. |
| 1421 | bool CallAnalyzer::isAllocaDerivedArg(Value *V) { |
| 1422 | return SROAArgValues.count(Val: V); |
| 1423 | } |
| 1424 | |
| 1425 | void CallAnalyzer::disableSROAForArg(AllocaInst *SROAArg) { |
| 1426 | onDisableSROA(Arg: SROAArg); |
| 1427 | EnabledSROAAllocas.erase(V: SROAArg); |
| 1428 | disableLoadElimination(); |
| 1429 | } |
| 1430 | |
| 1431 | void InlineCostAnnotationWriter::emitInstructionAnnot( |
| 1432 | const Instruction *I, formatted_raw_ostream &OS) { |
| 1433 | // The cost of inlining the given instruction is always printed.
| 1434 | // The threshold delta is printed only when it is non-zero, which happens
| 1435 | // when we decide to give a bonus at a particular instruction.
| 1436 | std::optional<InstructionCostDetail> Record = ICCA->getCostDetails(I); |
| 1437 | if (!Record) |
| 1438 | OS << "; No analysis for the instruction" ; |
| 1439 | else { |
| 1440 | OS << "; cost before = " << Record->CostBefore |
| 1441 | << ", cost after = " << Record->CostAfter |
| 1442 | << ", threshold before = " << Record->ThresholdBefore |
| 1443 | << ", threshold after = " << Record->ThresholdAfter << ", " ; |
| 1444 | OS << "cost delta = " << Record->getCostDelta(); |
| 1445 | if (Record->hasThresholdChanged()) |
| 1446 | OS << ", threshold delta = " << Record->getThresholdDelta(); |
| 1447 | } |
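|      | // If the analysis simplified this instruction, also print the value it
|      | // simplified to.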
| 1448 | auto *V = ICCA->getSimplifiedValueUnchecked(V: const_cast<Instruction *>(I)); |
| 1449 | if (V) { |
| 1450 | OS << ", simplified to " ; |
| 1451 | V->print(O&: OS, IsForDebug: true); |
| 1452 | if (auto *VI = dyn_cast<Instruction>(Val: V)) { |
| 1453 | if (VI->getFunction() != I->getFunction()) |
| 1454 | OS << " (caller instruction)" ; |
| 1455 | } else if (auto *VArg = dyn_cast<Argument>(Val: V)) { |
| 1456 | if (VArg->getParent() != I->getFunction()) |
| 1457 | OS << " (caller argument)" ; |
| 1458 | } |
| 1459 | } |
| 1460 | OS << "\n" ; |
| 1461 | } |
| 1462 | |
| 1463 | /// If 'V' maps to a SROA candidate, disable SROA for it. |
| 1464 | void CallAnalyzer::disableSROA(Value *V) { |
| 1465 | if (auto *SROAArg = getSROAArgForValueOrNull(V)) { |
| 1466 | disableSROAForArg(SROAArg); |
| 1467 | } |
| 1468 | } |
| 1469 | |
| 1470 | void CallAnalyzer::disableLoadElimination() { |
| 1471 | if (EnableLoadElimination) { |
| 1472 | onDisableLoadElimination(); |
| 1473 | EnableLoadElimination = false; |
| 1474 | } |
| 1475 | } |
| 1476 | |
| 1477 | /// Accumulate a constant GEP offset into an APInt if possible. |
| 1478 | /// |
| 1479 | /// Returns false if unable to compute the offset for any reason. Respects any |
| 1480 | /// simplified values known during the analysis of this callsite. |
| 1481 | bool CallAnalyzer::accumulateGEPOffset(GEPOperator &GEP, APInt &Offset) { |
| 1482 | unsigned IntPtrWidth = DL.getIndexTypeSizeInBits(Ty: GEP.getType()); |
| 1483 | assert(IntPtrWidth == Offset.getBitWidth()); |
| 1484 | |
| 1485 | for (gep_type_iterator GTI = gep_type_begin(GEP), GTE = gep_type_end(GEP); |
| 1486 | GTI != GTE; ++GTI) { |
| 1487 | ConstantInt *OpC = |
| 1488 | getDirectOrSimplifiedValue<ConstantInt>(V: GTI.getOperand()); |
| 1489 | if (!OpC) |
| 1490 | return false; |
| 1491 | if (OpC->isZero()) |
| 1492 | continue; |
| 1493 | |
| 1494 | // Handle a struct index, which adds its field offset to the pointer. |
| 1495 | if (StructType *STy = GTI.getStructTypeOrNull()) { |
| 1496 | unsigned ElementIdx = OpC->getZExtValue(); |
| 1497 | const StructLayout *SL = DL.getStructLayout(Ty: STy); |
| 1498 | Offset += APInt(IntPtrWidth, SL->getElementOffset(Idx: ElementIdx)); |
| 1499 | continue; |
| 1500 | } |
| 1501 | |
| 1502 | APInt TypeSize(IntPtrWidth, GTI.getSequentialElementStride(DL)); |
| 1503 | Offset += OpC->getValue().sextOrTrunc(width: IntPtrWidth) * TypeSize; |
| 1504 | } |
| 1505 | return true; |
| 1506 | } |
| 1507 | |
| 1508 | /// Use TTI to check whether a GEP is free. |
| 1509 | /// |
| 1510 | /// Respects any simplified values known during the analysis of this callsite. |
| 1511 | bool CallAnalyzer::isGEPFree(GetElementPtrInst &GEP) { |
| 1512 | SmallVector<Value *, 4> Operands; |
| 1513 | Operands.push_back(Elt: GEP.getOperand(i_nocapture: 0)); |
| 1514 | for (const Use &Op : GEP.indices()) |
| 1515 | if (Constant *SimpleOp = getSimplifiedValue<Constant>(V: Op)) |
| 1516 | Operands.push_back(Elt: SimpleOp); |
| 1517 | else |
| 1518 | Operands.push_back(Elt: Op); |
| 1519 | return TTI.getInstructionCost(U: &GEP, Operands, |
| 1520 | CostKind: TargetTransformInfo::TCK_SizeAndLatency) == |
| 1521 | TargetTransformInfo::TCC_Free; |
| 1522 | } |
| 1523 | |
| 1524 | bool CallAnalyzer::visitAlloca(AllocaInst &I) { |
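|      | // Using an SROA candidate as the alloca's array size blocks SROA for it.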
| 1525 | disableSROA(V: I.getOperand(i_nocapture: 0)); |
| 1526 | |
| 1527 | // Check whether inlining will turn a dynamic alloca into a static |
| 1528 | // alloca and handle that case. |
| 1529 | if (I.isArrayAllocation()) { |
| 1530 | Constant *Size = getSimplifiedValue<Constant>(V: I.getArraySize()); |
| 1531 | if (auto *AllocSize = dyn_cast_or_null<ConstantInt>(Val: Size)) { |
| 1532 | // Sometimes a dynamic alloca could be converted into a static alloca |
| 1533 | // after this constant prop, and become a huge static alloca on an |
| 1534 | // unconditional CFG path. Avoid inlining if this is going to happen above |
| 1535 | // a threshold. |
| 1536 | // FIXME: If the threshold is removed or lowered too much, we could end up |
| 1537 | // being too pessimistic and prevent inlining non-problematic code. This |
| 1538 | // could result in unintended perf regressions. A better overall strategy |
| 1539 | // is needed to track stack usage during inlining. |
| 1540 | Type *Ty = I.getAllocatedType(); |
| 1541 | AllocatedSize = SaturatingMultiplyAdd( |
| 1542 | X: AllocSize->getLimitedValue(), |
| 1543 | Y: DL.getTypeAllocSize(Ty).getKnownMinValue(), A: AllocatedSize); |
| 1544 | if (AllocatedSize > InlineConstants::MaxSimplifiedDynamicAllocaToInline) |
| 1545 | HasDynamicAlloca = true; |
| 1546 | return false; |
| 1547 | } |
| 1548 | } |
| 1549 | |
| 1550 | // Accumulate the allocated size. |
| 1551 | if (I.isStaticAlloca()) { |
| 1552 | Type *Ty = I.getAllocatedType(); |
| 1553 | AllocatedSize = SaturatingAdd(X: DL.getTypeAllocSize(Ty).getKnownMinValue(), |
| 1554 | Y: AllocatedSize); |
| 1555 | } |
| 1556 | |
| 1557 | // FIXME: This is overly conservative. Dynamic allocas are inefficient for |
| 1558 | // a variety of reasons, and so we would like to not inline them into |
| 1559 | // functions which don't currently have a dynamic alloca. This simply |
| 1560 | // disables inlining altogether in the presence of a dynamic alloca. |
| 1561 | if (!I.isStaticAlloca()) |
| 1562 | HasDynamicAlloca = true; |
| 1563 | |
| 1564 | return false; |
| 1565 | } |
| 1566 | |
| 1567 | bool CallAnalyzer::visitPHI(PHINode &I) { |
| 1568 | // FIXME: We need to propagate SROA *disabling* through phi nodes, even |
| 1569 | // though we don't want to propagate its bonuses. The idea is to disable
| 1570 | // SROA if it *might* be used in an inappropriate manner. |
| 1571 | |
| 1572 | // Phi nodes are always zero-cost. |
| 1573 | // FIXME: Pointer sizes may differ between different address spaces, so do we |
| 1574 | // need to use correct address space in the call to getPointerSizeInBits here? |
| 1575 | // Or could we skip the getPointerSizeInBits call completely? As far as I can |
| 1576 | // see the ZeroOffset is used as a dummy value, so we can probably use any |
| 1577 | // bit width for the ZeroOffset? |
| 1578 | APInt ZeroOffset = APInt::getZero(numBits: DL.getPointerSizeInBits(AS: 0)); |
| 1579 | bool CheckSROA = I.getType()->isPointerTy(); |
| 1580 | |
| 1581 | // Track the constant or pointer with constant offset we've seen so far. |
| 1582 | Constant *FirstC = nullptr; |
| 1583 | std::pair<Value *, APInt> FirstBaseAndOffset = {nullptr, ZeroOffset}; |
| 1584 | Value *FirstV = nullptr; |
| 1585 | |
| 1586 | for (unsigned i = 0, e = I.getNumIncomingValues(); i != e; ++i) { |
| 1587 | BasicBlock *Pred = I.getIncomingBlock(i); |
| 1588 | // If the incoming block is dead, skip the incoming block. |
| 1589 | if (DeadBlocks.count(Ptr: Pred)) |
| 1590 | continue; |
| 1591 | // If the parent block of phi is not the known successor of the incoming |
| 1592 | // block, skip the incoming block. |
| 1593 | BasicBlock *KnownSuccessor = KnownSuccessors[Pred]; |
| 1594 | if (KnownSuccessor && KnownSuccessor != I.getParent()) |
| 1595 | continue; |
| 1596 | |
| 1597 | Value *V = I.getIncomingValue(i); |
| 1598 | // If the incoming value is this phi itself, skip the incoming value. |
| 1599 | if (&I == V) |
| 1600 | continue; |
| 1601 | |
| 1602 | Constant *C = getDirectOrSimplifiedValue<Constant>(V); |
| 1603 | |
| 1604 | std::pair<Value *, APInt> BaseAndOffset = {nullptr, ZeroOffset}; |
| 1605 | if (!C && CheckSROA) |
| 1606 | BaseAndOffset = ConstantOffsetPtrs.lookup(Val: V); |
| 1607 | |
| 1608 | if (!C && !BaseAndOffset.first) |
| 1609 | // The incoming value is neither a constant nor a pointer with constant |
| 1610 | // offset, exit early. |
| 1611 | return true; |
| 1612 | |
| 1613 | if (FirstC) { |
| 1614 | if (FirstC == C) |
| 1615 | // If we've seen a constant incoming value before and it is the same |
| 1616 | // constant we see this time, continue checking the next incoming value. |
| 1617 | continue; |
| 1618 | // Otherwise early exit because we either see a different constant or saw |
| 1619 | // a constant before but we have a pointer with constant offset this time. |
| 1620 | return true; |
| 1621 | } |
| 1622 | |
| 1623 | if (FirstV) { |
| 1624 | // The same logic as above, but check pointer with constant offset here. |
| 1625 | if (FirstBaseAndOffset == BaseAndOffset) |
| 1626 | continue; |
| 1627 | return true; |
| 1628 | } |
| 1629 | |
| 1630 | if (C) { |
| 1631 | // This is the 1st time we've seen a constant, record it. |
| 1632 | FirstC = C; |
| 1633 | continue; |
| 1634 | } |
| 1635 | |
| 1636 | // The remaining case is that this is the 1st time we've seen a pointer with |
| 1637 | // constant offset, record it. |
| 1638 | FirstV = V; |
| 1639 | FirstBaseAndOffset = BaseAndOffset; |
| 1640 | } |
| 1641 | |
| 1642 | // Check if we can map phi to a constant. |
| 1643 | if (FirstC) { |
| 1644 | SimplifiedValues[&I] = FirstC; |
| 1645 | return true; |
| 1646 | } |
| 1647 | |
| 1648 | // Check if we can map phi to a pointer with constant offset. |
| 1649 | if (FirstBaseAndOffset.first) { |
| 1650 | ConstantOffsetPtrs[&I] = FirstBaseAndOffset; |
| 1651 | |
| 1652 | if (auto *SROAArg = getSROAArgForValueOrNull(V: FirstV)) |
| 1653 | SROAArgValues[&I] = SROAArg; |
| 1654 | } |
| 1655 | |
| 1656 | return true; |
| 1657 | } |
| 1658 | |
| 1659 | /// Check we can fold GEPs of constant-offset call site argument pointers. |
| 1660 | /// This requires target data and inbounds GEPs. |
| 1661 | /// |
| 1662 | /// \return true if the specified GEP can be folded. |
| 1663 | bool CallAnalyzer::canFoldInboundsGEP(GetElementPtrInst &I) { |
| 1664 | // Check if we have a base + offset for the pointer. |
| 1665 | std::pair<Value *, APInt> BaseAndOffset = |
| 1666 | ConstantOffsetPtrs.lookup(Val: I.getPointerOperand()); |
| 1667 | if (!BaseAndOffset.first) |
| 1668 | return false; |
| 1669 | |
| 1670 | // Check if the offset of this GEP is constant, and if so accumulate it |
| 1671 | // into Offset. |
| 1672 | if (!accumulateGEPOffset(GEP&: cast<GEPOperator>(Val&: I), Offset&: BaseAndOffset.second)) |
| 1673 | return false; |
| 1674 | |
| 1675 | // Add the result as a new mapping to Base + Offset. |
| 1676 | ConstantOffsetPtrs[&I] = BaseAndOffset; |
| 1677 | |
| 1678 | return true; |
| 1679 | } |
| 1680 | |
| 1681 | bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) { |
| 1682 | auto *SROAArg = getSROAArgForValueOrNull(V: I.getPointerOperand()); |
| 1683 | |
| 1684 | // Lambda to check whether a GEP's indices are all constant. |
| 1685 | auto IsGEPOffsetConstant = [&](GetElementPtrInst &GEP) { |
| 1686 | for (const Use &Op : GEP.indices()) |
| 1687 | if (!getDirectOrSimplifiedValue<Constant>(V: Op)) |
| 1688 | return false; |
| 1689 | return true; |
| 1690 | }; |
| 1691 | |
| 1692 | if (!DisableGEPConstOperand) |
| 1693 | if (simplifyInstruction(I)) |
| 1694 | return true; |
| 1695 | |
| 1696 | if ((I.isInBounds() && canFoldInboundsGEP(I)) || IsGEPOffsetConstant(I)) { |
| 1697 | if (SROAArg) |
| 1698 | SROAArgValues[&I] = SROAArg; |
| 1699 | |
| 1700 | // Constant GEPs are modeled as free. |
| 1701 | return true; |
| 1702 | } |
| 1703 | |
| 1704 | // Variable GEPs will require math and will disable SROA. |
| 1705 | if (SROAArg) |
| 1706 | disableSROAForArg(SROAArg); |
| 1707 | return isGEPFree(GEP&: I); |
| 1708 | } |
| 1709 | |
| 1710 | // Simplify \p Cmp if RHS is constant and LHS can be resolved with value tracking.
| 1711 | // This only handles the case where the Cmp guards a recursive call whose
| 1712 | // arguments make the Cmp known to fail/succeed for that recursive call.
| 1713 | bool CallAnalyzer::simplifyCmpInstForRecCall(CmpInst &Cmp) { |
| 1714 | // Bail out if LHS is not a function argument or RHS is NOT const: |
| 1715 | if (!isa<Argument>(Val: Cmp.getOperand(i_nocapture: 0)) || !isa<Constant>(Val: Cmp.getOperand(i_nocapture: 1))) |
| 1716 | return false; |
| 1717 | auto *CmpOp = Cmp.getOperand(i_nocapture: 0); |
| 1718 | // Make sure that the callsite is recursive: |
| 1719 | if (CandidateCall.getCaller() != &F) |
| 1720 | return false; |
| 1721 | // Only handle the case when the callsite has a single predecessor: |
| 1722 | auto *CallBB = CandidateCall.getParent(); |
| 1723 | auto *Predecessor = CallBB->getSinglePredecessor(); |
| 1724 | if (!Predecessor) |
| 1725 | return false; |
| 1726 | // Check if the callsite is guarded by the same Cmp instruction: |
| 1727 | auto *Br = dyn_cast<BranchInst>(Val: Predecessor->getTerminator()); |
| 1728 | if (!Br || Br->isUnconditional() || Br->getCondition() != &Cmp) |
| 1729 | return false; |
| 1730 | |
| 1731 | // Check whether any argument of the recursive callsite affects the cmp
| 1732 | // instruction:
| 1733 | bool ArgFound = false; |
| 1734 | Value *FuncArg = nullptr, *CallArg = nullptr; |
| 1735 | for (unsigned ArgNum = 0; |
| 1736 | ArgNum < F.arg_size() && ArgNum < CandidateCall.arg_size(); ArgNum++) { |
| 1737 | FuncArg = F.getArg(i: ArgNum); |
| 1738 | CallArg = CandidateCall.getArgOperand(i: ArgNum); |
| 1739 | if (FuncArg == CmpOp && CallArg != CmpOp) { |
| 1740 | ArgFound = true; |
| 1741 | break; |
| 1742 | } |
| 1743 | } |
| 1744 | if (!ArgFound) |
| 1745 | return false; |
| 1746 | |
| 1747 | // Now we have a recursive call that is guarded by a cmp instruction. |
| 1748 | // Check if this cmp can be simplified: |
| 1749 | SimplifyQuery SQ(DL, dyn_cast<Instruction>(Val: CallArg)); |
| 1750 | CondContext CC(&Cmp); |
| 1751 | CC.Invert = (CallBB != Br->getSuccessor(i: 0)); |
| 1752 | SQ.CC = &CC; |
| 1753 | CC.AffectedValues.insert(Ptr: FuncArg); |
| 1754 | Value *SimplifiedInstruction = llvm::simplifyInstructionWithOperands( |
| 1755 | I: cast<CmpInst>(Val: &Cmp), NewOps: {CallArg, Cmp.getOperand(i_nocapture: 1)}, Q: SQ); |
| 1756 | if (auto *ConstVal = dyn_cast_or_null<ConstantInt>(Val: SimplifiedInstruction)) { |
| 1757 | // Make sure that the BB of the recursive call is NOT the true successor |
| 1758 | // of the icmp. In other words, make sure that the recursion depth is 1. |
| 1759 | if ((ConstVal->isOne() && CC.Invert) || |
| 1760 | (ConstVal->isZero() && !CC.Invert)) { |
| 1761 | SimplifiedValues[&Cmp] = ConstVal; |
| 1762 | return true; |
| 1763 | } |
| 1764 | } |
| 1765 | return false; |
| 1766 | } |
| 1767 | |
| 1768 | /// Simplify \p I if its operands are constants and update SimplifiedValues. |
| 1769 | bool CallAnalyzer::simplifyInstruction(Instruction &I) { |
| 1770 | SmallVector<Constant *> COps; |
| 1771 | for (Value *Op : I.operands()) { |
| 1772 | Constant *COp = getDirectOrSimplifiedValue<Constant>(V: Op); |
| 1773 | if (!COp) |
| 1774 | return false; |
| 1775 | COps.push_back(Elt: COp); |
| 1776 | } |
| 1777 | auto *C = ConstantFoldInstOperands(I: &I, Ops: COps, DL); |
| 1778 | if (!C) |
| 1779 | return false; |
| 1780 | SimplifiedValues[&I] = C; |
| 1781 | return true; |
| 1782 | } |
| 1783 | |
| 1784 | /// Try to simplify a call to llvm.is.constant. |
| 1785 | /// |
| 1786 | /// Duplicate the argument checking from CallAnalyzer::simplifyCallSite since |
| 1787 | /// we expect calls of this specific intrinsic to be infrequent. |
| 1788 | /// |
| 1789 | /// FIXME: Given that we know CB's parent (F) caller |
| 1790 | /// (CandidateCall->getParent()->getParent()), we might be able to determine |
| 1791 | /// whether inlining F into F's caller would change how the call to |
| 1792 | /// llvm.is.constant would evaluate. |
| 1793 | bool CallAnalyzer::simplifyIntrinsicCallIsConstant(CallBase &CB) { |
| 1794 | Value *Arg = CB.getArgOperand(i: 0); |
| 1795 | auto *C = getDirectOrSimplifiedValue<Constant>(V: Arg); |
| 1796 | |
| 1797 | Type *RT = CB.getFunctionType()->getReturnType(); |
| 1798 | SimplifiedValues[&CB] = ConstantInt::get(Ty: RT, V: C ? 1 : 0); |
| 1799 | return true; |
| 1800 | } |
| 1801 | |
| 1802 | bool CallAnalyzer::simplifyIntrinsicCallObjectSize(CallBase &CB) { |
| 1803 | // As per the langref, "The fourth argument to llvm.objectsize determines if |
| 1804 | // the value should be evaluated at runtime." |
| 1805 | if (cast<ConstantInt>(Val: CB.getArgOperand(i: 3))->isOne()) |
| 1806 | return false; |
| 1807 | |
| 1808 | Value *V = lowerObjectSizeCall(ObjectSize: &cast<IntrinsicInst>(Val&: CB), DL, TLI: nullptr, |
| 1809 | /*MustSucceed=*/true); |
| 1810 | Constant *C = dyn_cast_or_null<Constant>(Val: V); |
| 1811 | if (C) |
| 1812 | SimplifiedValues[&CB] = C; |
| 1813 | return C; |
| 1814 | } |
| 1815 | |
| 1816 | bool CallAnalyzer::visitBitCast(BitCastInst &I) { |
| 1817 | // Propagate constants through bitcasts. |
| 1818 | if (simplifyInstruction(I)) |
| 1819 | return true; |
| 1820 | |
| 1821 | // Track base/offsets through casts |
| 1822 | std::pair<Value *, APInt> BaseAndOffset = |
| 1823 | ConstantOffsetPtrs.lookup(Val: I.getOperand(i_nocapture: 0)); |
| 1824 | // Casts don't change the offset, just wrap it up. |
| 1825 | if (BaseAndOffset.first) |
| 1826 | ConstantOffsetPtrs[&I] = BaseAndOffset; |
| 1827 | |
| 1828 | // Also look for SROA candidates here. |
| 1829 | if (auto *SROAArg = getSROAArgForValueOrNull(V: I.getOperand(i_nocapture: 0))) |
| 1830 | SROAArgValues[&I] = SROAArg; |
| 1831 | |
| 1832 | // Bitcasts are always zero cost. |
| 1833 | return true; |
| 1834 | } |
| 1835 | |
| 1836 | bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) { |
| 1837 | // Propagate constants through ptrtoint. |
| 1838 | if (simplifyInstruction(I)) |
| 1839 | return true; |
| 1840 | |
| 1841 | // Track base/offset pairs when converted to a plain integer provided the |
| 1842 | // integer is large enough to represent the pointer. |
| 1843 | unsigned IntegerSize = I.getType()->getScalarSizeInBits(); |
| 1844 | unsigned AS = I.getOperand(i_nocapture: 0)->getType()->getPointerAddressSpace(); |
| 1845 | if (IntegerSize == DL.getPointerSizeInBits(AS)) { |
| 1846 | std::pair<Value *, APInt> BaseAndOffset = |
| 1847 | ConstantOffsetPtrs.lookup(Val: I.getOperand(i_nocapture: 0)); |
| 1848 | if (BaseAndOffset.first) |
| 1849 | ConstantOffsetPtrs[&I] = BaseAndOffset; |
| 1850 | } |
| 1851 | |
| 1852 | // This is really weird. Technically, ptrtoint will disable SROA. However, |
| 1853 | // unless that ptrtoint is *used* somewhere in the live basic blocks after |
| 1854 | // inlining, it will be nuked, and SROA should proceed. All of the uses which |
| 1855 | // would block SROA would also block SROA if applied directly to a pointer, |
| 1856 | // and so we can just add the integer in here. The only places where SROA is |
| 1857 | // preserved either cannot fire on an integer, or won't in-and-of themselves |
| 1858 | // disable SROA (ext) w/o some later use that we would see and disable. |
| 1859 | if (auto *SROAArg = getSROAArgForValueOrNull(V: I.getOperand(i_nocapture: 0))) |
| 1860 | SROAArgValues[&I] = SROAArg; |
| 1861 | |
| 1862 | return TTI.getInstructionCost(U: &I, CostKind: TargetTransformInfo::TCK_SizeAndLatency) == |
| 1863 | TargetTransformInfo::TCC_Free; |
| 1864 | } |
| 1865 | |
| 1866 | bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) { |
| 1867 | // Propagate constants through inttoptr.
| 1868 | if (simplifyInstruction(I)) |
| 1869 | return true; |
| 1870 | |
| 1871 | // Track base/offset pairs when round-tripped through a pointer without |
| 1872 | // modifications provided the integer is not too large. |
| 1873 | Value *Op = I.getOperand(i_nocapture: 0); |
| 1874 | unsigned IntegerSize = Op->getType()->getScalarSizeInBits(); |
| 1875 | if (IntegerSize <= DL.getPointerTypeSizeInBits(I.getType())) { |
| 1876 | std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Val: Op); |
| 1877 | if (BaseAndOffset.first) |
| 1878 | ConstantOffsetPtrs[&I] = BaseAndOffset; |
| 1879 | } |
| 1880 | |
| 1881 | // "Propagate" SROA here in the same manner as we do for ptrtoint above. |
| 1882 | if (auto *SROAArg = getSROAArgForValueOrNull(V: Op)) |
| 1883 | SROAArgValues[&I] = SROAArg; |
| 1884 | |
| 1885 | return TTI.getInstructionCost(U: &I, CostKind: TargetTransformInfo::TCK_SizeAndLatency) == |
| 1886 | TargetTransformInfo::TCC_Free; |
| 1887 | } |
| 1888 | |
| 1889 | bool CallAnalyzer::visitCastInst(CastInst &I) { |
| 1890 | // Propagate constants through casts. |
| 1891 | if (simplifyInstruction(I)) |
| 1892 | return true; |
| 1893 | |
| 1894 | // Disable SROA in the face of arbitrary casts we don't explicitly list |
| 1895 | // elsewhere. |
| 1896 | disableSROA(V: I.getOperand(i_nocapture: 0)); |
| 1897 | |
| 1898 | // If this is a floating-point cast, and the target says this operation |
| 1899 | // is expensive, this may eventually become a library call. Treat the cost |
| 1900 | // as such. |
| 1901 | switch (I.getOpcode()) { |
| 1902 | case Instruction::FPTrunc: |
| 1903 | case Instruction::FPExt: |
| 1904 | case Instruction::UIToFP: |
| 1905 | case Instruction::SIToFP: |
| 1906 | case Instruction::FPToUI: |
| 1907 | case Instruction::FPToSI: |
| 1908 | if (TTI.getFPOpCost(Ty: I.getType()) == TargetTransformInfo::TCC_Expensive) |
| 1909 | onCallPenalty(); |
| 1910 | break; |
| 1911 | default: |
| 1912 | break; |
| 1913 | } |
| 1914 | |
| 1915 | return TTI.getInstructionCost(U: &I, CostKind: TargetTransformInfo::TCK_SizeAndLatency) == |
| 1916 | TargetTransformInfo::TCC_Free; |
| 1917 | } |
| 1918 | |
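|      | /// Return true if the candidate call site carries attribute \p Attr on the
|      | /// parameter corresponding to callee argument \p A.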
| 1919 | bool CallAnalyzer::paramHasAttr(Argument *A, Attribute::AttrKind Attr) { |
| 1920 | return CandidateCall.paramHasAttr(ArgNo: A->getArgNo(), Kind: Attr); |
| 1921 | } |
| 1922 | |
| 1923 | bool CallAnalyzer::isKnownNonNullInCallee(Value *V) { |
| 1924 | // Does the *call site* have the NonNull attribute set on an argument? We |
| 1925 | // use the attribute on the call site to memoize any analysis done in the |
| 1926 | // caller. This will also trip if the callee function has a non-null |
| 1927 | // parameter attribute, but that's a less interesting case because hopefully |
| 1928 | // the callee would already have been simplified based on that. |
| 1929 | if (Argument *A = dyn_cast<Argument>(Val: V)) |
| 1930 | if (paramHasAttr(A, Attr: Attribute::NonNull)) |
| 1931 | return true; |
| 1932 | |
| 1933 | // Is this an alloca in the caller? This is distinct from the attribute case |
| 1934 | // above because attributes aren't updated within the inliner itself and we |
| 1935 | // always want to catch the alloca derived case. |
| 1936 | if (isAllocaDerivedArg(V)) |
| 1937 | // We can actually predict the result of comparisons between an |
| 1938 | // alloca-derived value and null. Note that this fires regardless of |
| 1939 | // SROA firing. |
| 1940 | return true; |
| 1941 | |
| 1942 | return false; |
| 1943 | } |
| 1944 | |
| 1945 | bool CallAnalyzer::allowSizeGrowth(CallBase &Call) { |
| 1946 | // If the normal destination of the invoke or the parent block of the call |
| 1947 | // site is unreachable-terminated, there is little point in inlining this |
| 1948 | // unless there is literally zero cost. |
| 1949 | // FIXME: Note that it is possible that an unreachable-terminated block has a |
| 1950 | // hot entry. For example, in the scenario below, inlining hot_call_X() may be
| 1951 | // beneficial:
| 1952 | // main() { |
| 1953 | // hot_call_1(); |
| 1954 | // ... |
| 1955 | // hot_call_N() |
| 1956 | // exit(0); |
| 1957 | // } |
| 1958 | // For now, we are not handling this corner case here as it is rare in real |
| 1959 | // code. In future, we should elaborate this based on BPI and BFI in more |
| 1960 | // general threshold adjusting heuristics in updateThreshold(). |
| 1961 | if (InvokeInst *II = dyn_cast<InvokeInst>(Val: &Call)) { |
| 1962 | if (isa<UnreachableInst>(Val: II->getNormalDest()->getTerminator())) |
| 1963 | return false; |
| 1964 | } else if (isa<UnreachableInst>(Val: Call.getParent()->getTerminator())) |
| 1965 | return false; |
| 1966 | |
| 1967 | return true; |
| 1968 | } |
| 1969 | |
| 1970 | bool InlineCostCallAnalyzer::isColdCallSite(CallBase &Call, |
| 1971 | BlockFrequencyInfo *CallerBFI) { |
| 1972 | // If global profile summary is available, then callsite's coldness is |
| 1973 | // determined based on that. |
| 1974 | if (PSI && PSI->hasProfileSummary()) |
| 1975 | return PSI->isColdCallSite(CB: Call, BFI: CallerBFI); |
| 1976 | |
| 1977 | // Otherwise we need BFI to be available. |
| 1978 | if (!CallerBFI) |
| 1979 | return false; |
| 1980 | |
| 1981 | // Determine if the callsite is cold relative to caller's entry. We could |
| 1982 | // potentially cache the computation of scaled entry frequency, but the added |
| 1983 | // complexity is not worth it unless this scaling shows up high in the |
| 1984 | // profiles. |
| 1985 | const BranchProbability ColdProb(ColdCallSiteRelFreq, 100); |
| 1986 | auto CallSiteBB = Call.getParent(); |
| 1987 | auto CallSiteFreq = CallerBFI->getBlockFreq(BB: CallSiteBB); |
| 1988 | auto CallerEntryFreq = |
| 1989 | CallerBFI->getBlockFreq(BB: &(Call.getCaller()->getEntryBlock())); |
| 1990 | return CallSiteFreq < CallerEntryFreq * ColdProb; |
| 1991 | } |
| 1992 | |
| 1993 | std::optional<int> |
| 1994 | InlineCostCallAnalyzer::getHotCallSiteThreshold(CallBase &Call, |
| 1995 | BlockFrequencyInfo *CallerBFI) { |
| 1996 | |
| 1997 | // If global profile summary is available, then callsite's hotness is |
| 1998 | // determined based on that. |
| 1999 | if (PSI && PSI->hasProfileSummary() && PSI->isHotCallSite(CB: Call, BFI: CallerBFI)) |
| 2000 | return Params.HotCallSiteThreshold; |
| 2001 | |
| 2002 | // Otherwise we need BFI to be available and to have a locally hot callsite |
| 2003 | // threshold. |
| 2004 | if (!CallerBFI || !Params.LocallyHotCallSiteThreshold) |
| 2005 | return std::nullopt; |
| 2006 | |
| 2007 | // Determine if the callsite is hot relative to caller's entry. We could |
| 2008 | // potentially cache the computation of scaled entry frequency, but the added |
| 2009 | // complexity is not worth it unless this scaling shows up high in the |
| 2010 | // profiles. |
| 2011 | const BasicBlock *CallSiteBB = Call.getParent(); |
| 2012 | BlockFrequency CallSiteFreq = CallerBFI->getBlockFreq(BB: CallSiteBB); |
| 2013 | BlockFrequency CallerEntryFreq = CallerBFI->getEntryFreq(); |
| 2014 | std::optional<BlockFrequency> Limit = CallerEntryFreq.mul(Factor: HotCallSiteRelFreq); |
| 2015 | if (Limit && CallSiteFreq >= *Limit) |
| 2016 | return Params.LocallyHotCallSiteThreshold; |
| 2017 | |
| 2018 | // Otherwise treat it normally. |
| 2019 | return std::nullopt; |
| 2020 | } |
| 2021 | |
| 2022 | void InlineCostCallAnalyzer::updateThreshold(CallBase &Call, Function &Callee) { |
| 2023 | // If no size growth is allowed for this inlining, set Threshold to 0. |
| 2024 | if (!allowSizeGrowth(Call)) { |
| 2025 | Threshold = 0; |
| 2026 | return; |
| 2027 | } |
| 2028 | |
| 2029 | Function *Caller = Call.getCaller(); |
| 2030 | |
| 2031 | // return min(A, B) if B is valid. |
| 2032 | auto MinIfValid = [](int A, std::optional<int> B) { |
| 2033 | return B ? std::min(a: A, b: *B) : A; |
| 2034 | }; |
| 2035 | |
| 2036 | // return max(A, B) if B is valid. |
| 2037 | auto MaxIfValid = [](int A, std::optional<int> B) { |
| 2038 | return B ? std::max(a: A, b: *B) : A; |
| 2039 | }; |
| 2040 | |
| 2041 | // Various bonus percentages. These are multiplied by Threshold to get the |
| 2042 | // bonus values. |
| 2043 | // SingleBBBonus: This bonus is applied if the callee has a single reachable |
| 2044 | // basic block at the given callsite context. This is speculatively applied |
| 2045 | // and withdrawn if more than one basic block is seen. |
| 2046 | // |
| 2047 | // LastCallToStaticBonus: This large bonus is applied to ensure the inlining
| 2048 | // of the last call to a static function as inlining such functions is |
| 2049 | // guaranteed to reduce code size. |
| 2050 | // |
| 2051 | // These bonus percentages may be set to 0 based on properties of the caller |
| 2052 | // and the callsite. |
| 2053 | int SingleBBBonusPercent = 50; |
| 2054 | int VectorBonusPercent = TTI.getInlinerVectorBonusPercent(); |
| 2055 | int LastCallToStaticBonus = TTI.getInliningLastCallToStaticBonus(); |
| 2056 | |
| 2057 | // Lambda to set all the above bonus and bonus percentages to 0. |
| 2058 | auto DisallowAllBonuses = [&]() { |
| 2059 | SingleBBBonusPercent = 0; |
| 2060 | VectorBonusPercent = 0; |
| 2061 | LastCallToStaticBonus = 0; |
| 2062 | }; |
| 2063 | |
| 2064 | // Use the OptMinSizeThreshold or OptSizeThreshold knob if they are available |
| 2065 | // and reduce the threshold if the caller has the necessary attribute. |
| 2066 | if (Caller->hasMinSize()) { |
| 2067 | Threshold = MinIfValid(Threshold, Params.OptMinSizeThreshold); |
| 2068 | // For minsize, we want to disable the single BB bonus and the vector |
| 2069 | // bonuses, but not the last-call-to-static bonus. Inlining the last call to |
| 2070 | // a static function will, at the minimum, eliminate the parameter setup and |
| 2071 | // call/return instructions. |
| 2072 | SingleBBBonusPercent = 0; |
| 2073 | VectorBonusPercent = 0; |
| 2074 | } else if (Caller->hasOptSize()) |
| 2075 | Threshold = MinIfValid(Threshold, Params.OptSizeThreshold); |
| 2076 | |
| 2077 | // Adjust the threshold based on inlinehint attribute and profile based |
| 2078 | // hotness information if the caller does not have MinSize attribute. |
| 2079 | if (!Caller->hasMinSize()) { |
| 2080 | if (Callee.hasFnAttribute(Kind: Attribute::InlineHint)) |
| 2081 | Threshold = MaxIfValid(Threshold, Params.HintThreshold); |
| 2082 | |
| 2083 | // FIXME: After switching to the new passmanager, simplify the logic below |
| 2084 | // by checking only the callsite hotness/coldness as we will reliably |
| 2085 | // have local profile information. |
| 2086 | // |
| 2087 | // Callsite hotness and coldness can be determined if sample profile is |
| 2088 | // used (which adds hotness metadata to calls) or if caller's |
| 2089 | // BlockFrequencyInfo is available. |
| 2090 | BlockFrequencyInfo *CallerBFI = GetBFI ? &(GetBFI(*Caller)) : nullptr; |
| 2091 | auto HotCallSiteThreshold = getHotCallSiteThreshold(Call, CallerBFI); |
| 2092 | if (!Caller->hasOptSize() && HotCallSiteThreshold) { |
| 2093 | LLVM_DEBUG(dbgs() << "Hot callsite.\n" ); |
| 2094 | // FIXME: This should update the threshold only if it exceeds the |
| 2095 | // current threshold, but AutoFDO + ThinLTO currently relies on this |
| 2096 | // behavior to prevent inlining of hot callsites during ThinLTO |
| 2097 | // compile phase. |
| 2098 | Threshold = *HotCallSiteThreshold; |
| 2099 | } else if (isColdCallSite(Call, CallerBFI)) { |
| 2100 | LLVM_DEBUG(dbgs() << "Cold callsite.\n" ); |
| 2101 | // Do not apply bonuses for a cold callsite including the |
| 2102 | // LastCallToStatic bonus. While this bonus might result in code size |
| 2103 | // reduction, it can cause the size of a non-cold caller to increase,
| 2104 | // preventing it from being inlined. |
| 2105 | DisallowAllBonuses(); |
| 2106 | Threshold = MinIfValid(Threshold, Params.ColdCallSiteThreshold); |
| 2107 | } else if (PSI) { |
| 2108 | // Use callee's global profile information only if we have no way of |
| 2109 | // determining this via callsite information. |
| 2110 | if (PSI->isFunctionEntryHot(F: &Callee)) { |
| 2111 | LLVM_DEBUG(dbgs() << "Hot callee.\n" ); |
| 2112 | // If callsite hotness can not be determined, we may still know |
| 2113 | // that the callee is hot and treat it as a weaker hint for threshold |
| 2114 | // increase. |
| 2115 | Threshold = MaxIfValid(Threshold, Params.HintThreshold); |
| 2116 | } else if (PSI->isFunctionEntryCold(F: &Callee)) { |
| 2117 | LLVM_DEBUG(dbgs() << "Cold callee.\n" ); |
| 2118 | // Do not apply bonuses for a cold callee including the |
| 2119 | // LastCallToStatic bonus. While this bonus might result in code size |
| 2120 | // reduction, it can cause the size of a non-cold caller to increase,
| 2121 | // preventing it from being inlined. |
| 2122 | DisallowAllBonuses(); |
| 2123 | Threshold = MinIfValid(Threshold, Params.ColdThreshold); |
| 2124 | } |
| 2125 | } |
| 2126 | } |
| 2127 | |
| 2128 | Threshold += TTI.adjustInliningThreshold(CB: &Call); |
| 2129 | |
| 2130 | // Finally, take the target-specific inlining threshold multiplier into |
| 2131 | // account. |
| 2132 | Threshold *= TTI.getInliningThresholdMultiplier(); |
| 2133 | |
| 2134 | SingleBBBonus = Threshold * SingleBBBonusPercent / 100; |
| 2135 | VectorBonus = Threshold * VectorBonusPercent / 100; |
| 2136 | |
| 2137 | // If there is only one call of the function, and it has internal linkage, |
| 2138 | // the cost of inlining it drops dramatically. It may seem odd to update |
| 2139 | // Cost in updateThreshold, but the bonus depends on the logic in this method. |
| 2140 | if (isSoleCallToLocalFunction(CB: Call, Callee: F)) { |
| 2141 | Cost -= LastCallToStaticBonus; |
| 2142 | StaticBonusApplied = LastCallToStaticBonus; |
| 2143 | } |
| 2144 | } |
| 2145 | |
| 2146 | bool CallAnalyzer::visitCmpInst(CmpInst &I) { |
| 2147 | Value *LHS = I.getOperand(i_nocapture: 0), *RHS = I.getOperand(i_nocapture: 1); |
| 2148 | // First try to handle simplified comparisons. |
| 2149 | if (simplifyInstruction(I)) |
| 2150 | return true; |
| 2151 | |
| 2152 | // Try to handle comparison that can be simplified using ValueTracking. |
| 2153 | if (simplifyCmpInstForRecCall(Cmp&: I)) |
| 2154 | return true; |
| 2155 | |
| 2156 | if (I.getOpcode() == Instruction::FCmp) |
| 2157 | return false; |
| 2158 | |
| 2159 | // Otherwise look for a comparison between constant offset pointers with |
| 2160 | // a common base. |
| 2161 | Value *LHSBase, *RHSBase; |
| 2162 | APInt LHSOffset, RHSOffset; |
| 2163 | std::tie(args&: LHSBase, args&: LHSOffset) = ConstantOffsetPtrs.lookup(Val: LHS); |
| 2164 | if (LHSBase) { |
| 2165 | std::tie(args&: RHSBase, args&: RHSOffset) = ConstantOffsetPtrs.lookup(Val: RHS); |
| 2166 | if (RHSBase && LHSBase == RHSBase) { |
| 2167 | // We have common bases, fold the icmp to a constant based on the |
| 2168 | // offsets. |
| 2169 | SimplifiedValues[&I] = ConstantInt::getBool( |
| 2170 | Ty: I.getType(), |
| 2171 | V: ICmpInst::compare(LHS: LHSOffset, RHS: RHSOffset, Pred: I.getPredicate())); |
| 2172 | ++NumConstantPtrCmps; |
| 2173 | return true; |
| 2174 | } |
| 2175 | } |
| 2176 | |
| 2177 | auto isImplicitNullCheckCmp = [](const CmpInst &I) { |
| 2178 | for (auto *User : I.users()) |
| 2179 | if (auto *Instr = dyn_cast<Instruction>(Val: User)) |
| 2180 | if (!Instr->getMetadata(KindID: LLVMContext::MD_make_implicit)) |
| 2181 | return false; |
| 2182 | return true; |
| 2183 | }; |
| 2184 | |
| 2185 | // If the comparison is an equality comparison with null, we can simplify it |
| 2186 | // if we know the value (argument) can't be null |
| 2187 | if (I.isEquality() && isa<ConstantPointerNull>(Val: I.getOperand(i_nocapture: 1))) { |
| 2188 | if (isKnownNonNullInCallee(V: I.getOperand(i_nocapture: 0))) { |
| 2189 | bool IsNotEqual = I.getPredicate() == CmpInst::ICMP_NE; |
| 2190 | SimplifiedValues[&I] = IsNotEqual ? ConstantInt::getTrue(Ty: I.getType()) |
| 2191 | : ConstantInt::getFalse(Ty: I.getType()); |
| 2192 | return true; |
| 2193 | } |
| 2194 | // Implicit null checks act as unconditional branches and their comparisons |
| 2195 | // should be treated as simplified and free of cost. |
| 2196 | if (isImplicitNullCheckCmp(I)) |
| 2197 | return true; |
| 2198 | } |
| 2199 | return handleSROA(V: I.getOperand(i_nocapture: 0), DoNotDisable: isa<ConstantPointerNull>(Val: I.getOperand(i_nocapture: 1))); |
| 2200 | } |
| 2201 | |
| 2202 | bool CallAnalyzer::visitSub(BinaryOperator &I) { |
| 2203 | // Try to handle a special case: we can fold computing the difference of two |
| 2204 | // constant-related pointers. |
| 2205 | Value *LHS = I.getOperand(i_nocapture: 0), *RHS = I.getOperand(i_nocapture: 1); |
| 2206 | Value *LHSBase, *RHSBase; |
| 2207 | APInt LHSOffset, RHSOffset; |
| 2208 | std::tie(args&: LHSBase, args&: LHSOffset) = ConstantOffsetPtrs.lookup(Val: LHS); |
| 2209 | if (LHSBase) { |
| 2210 | std::tie(args&: RHSBase, args&: RHSOffset) = ConstantOffsetPtrs.lookup(Val: RHS); |
| 2211 | if (RHSBase && LHSBase == RHSBase) { |
| 2212 | // We have common bases, fold the subtract to a constant based on the |
| 2213 | // offsets. |
| 2214 | Constant *CLHS = ConstantInt::get(Context&: LHS->getContext(), V: LHSOffset); |
| 2215 | Constant *CRHS = ConstantInt::get(Context&: RHS->getContext(), V: RHSOffset); |
| 2216 | if (Constant *C = ConstantExpr::getSub(C1: CLHS, C2: CRHS)) { |
| 2217 | SimplifiedValues[&I] = C; |
| 2218 | ++NumConstantPtrDiffs; |
| 2219 | return true; |
| 2220 | } |
| 2221 | } |
| 2222 | } |
| 2223 | |
| 2224 | // Otherwise, fall back to the generic logic for simplifying and handling |
| 2225 | // instructions. |
| 2226 | return Base::visitSub(I); |
| 2227 | } |
| 2228 | |
| 2229 | bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) { |
| 2230 | Value *LHS = I.getOperand(i_nocapture: 0), *RHS = I.getOperand(i_nocapture: 1); |
| 2231 | Constant *CLHS = getDirectOrSimplifiedValue<Constant>(V: LHS); |
| 2232 | Constant *CRHS = getDirectOrSimplifiedValue<Constant>(V: RHS); |
| 2233 | |
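|      | // Try to fold the operation, substituting any operands already simplified
|      | // to constants.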
| 2234 | Value *SimpleV = nullptr; |
| 2235 | if (auto FI = dyn_cast<FPMathOperator>(Val: &I)) |
| 2236 | SimpleV = simplifyBinOp(Opcode: I.getOpcode(), LHS: CLHS ? CLHS : LHS, RHS: CRHS ? CRHS : RHS, |
| 2237 | FMF: FI->getFastMathFlags(), Q: DL); |
| 2238 | else |
| 2239 | SimpleV = |
| 2240 | simplifyBinOp(Opcode: I.getOpcode(), LHS: CLHS ? CLHS : LHS, RHS: CRHS ? CRHS : RHS, Q: DL); |
| 2241 | |
| 2242 | if (Constant *C = dyn_cast_or_null<Constant>(Val: SimpleV)) |
| 2243 | SimplifiedValues[&I] = C; |
| 2244 | |
| 2245 | if (SimpleV) |
| 2246 | return true; |
| 2247 | |
| 2248 | // Disable any SROA on arguments to arbitrary, unsimplified binary operators. |
| 2249 | disableSROA(V: LHS); |
| 2250 | disableSROA(V: RHS); |
| 2251 | |
| 2252 | // If the instruction is floating point, and the target says this operation |
| 2253 | // is expensive, this may eventually become a library call. Treat the cost |
| 2254 | // as such. Unless it's fneg which can be implemented with an xor. |
| 2255 | using namespace llvm::PatternMatch; |
| 2256 | if (I.getType()->isFloatingPointTy() && |
| 2257 | TTI.getFPOpCost(Ty: I.getType()) == TargetTransformInfo::TCC_Expensive && |
| 2258 | !match(V: &I, P: m_FNeg(X: m_Value()))) |
| 2259 | onCallPenalty(); |
| 2260 | |
| 2261 | return false; |
| 2262 | } |
| 2263 | |
| 2264 | bool CallAnalyzer::visitFNeg(UnaryOperator &I) { |
| 2265 | Value *Op = I.getOperand(i_nocapture: 0); |
| 2266 | Constant *COp = getDirectOrSimplifiedValue<Constant>(V: Op); |
| 2267 | |
| 2268 | Value *SimpleV = simplifyFNegInst( |
| 2269 | Op: COp ? COp : Op, FMF: cast<FPMathOperator>(Val&: I).getFastMathFlags(), Q: DL); |
| 2270 | |
| 2271 | if (Constant *C = dyn_cast_or_null<Constant>(Val: SimpleV)) |
| 2272 | SimplifiedValues[&I] = C; |
| 2273 | |
| 2274 | if (SimpleV) |
| 2275 | return true; |
| 2276 | |
| 2277 | // Disable any SROA on arguments to arbitrary, unsimplified fneg. |
| 2278 | disableSROA(V: Op); |
| 2279 | |
| 2280 | return false; |
| 2281 | } |
| 2282 | |
| 2283 | bool CallAnalyzer::visitLoad(LoadInst &I) { |
| 2284 | if (handleSROA(V: I.getPointerOperand(), DoNotDisable: I.isSimple())) |
| 2285 | return true; |
| 2286 | |
| 2287 | // If the data is already loaded from this address and hasn't been clobbered |
| 2288 | // by any stores or calls, this load is likely to be redundant and can be |
| 2289 | // eliminated. |
| 2290 | if (EnableLoadElimination && |
| 2291 | !LoadAddrSet.insert(Ptr: I.getPointerOperand()).second && I.isUnordered()) { |
| 2292 | onLoadEliminationOpportunity(); |
| 2293 | return true; |
| 2294 | } |
| 2295 | |
| 2296 | onMemAccess(); |
| 2297 | return false; |
| 2298 | } |
| 2299 | |
| 2300 | bool CallAnalyzer::visitStore(StoreInst &I) { |
| 2301 | if (handleSROA(V: I.getPointerOperand(), DoNotDisable: I.isSimple())) |
| 2302 | return true; |
| 2303 | |
| 2304 | // The store can potentially clobber loads and prevent repeated loads from |
| 2305 | // being eliminated. |
| 2306 | // FIXME: |
| 2307 | // 1. We can probably keep an initial set of eliminatable loads subtracted
| 2308 | // from the cost even when we finally see a store. We just need to disable |
| 2309 | // *further* accumulation of elimination savings. |
| 2310 | // 2. We should probably at some point thread MemorySSA for the callee into |
| 2311 | // this and then use that to actually compute *really* precise savings. |
| 2312 | disableLoadElimination(); |
| 2313 | |
| 2314 | onMemAccess(); |
| 2315 | return false; |
| 2316 | } |
| 2317 | |
| 2318 | bool CallAnalyzer::visitExtractValue(ExtractValueInst &I) {
| 2319 | Value *Op = I.getAggregateOperand(); |
| 2320 | |
| 2321 | // Special handling, because we want to simplify extractvalue with a |
| 2322 | // potential insertvalue from the caller. |
| 2323 | if (Value *SimpleOp = getSimplifiedValueUnchecked(V: Op)) { |
| 2324 | SimplifyQuery SQ(DL); |
| 2325 | Value *SimpleV = simplifyExtractValueInst(Agg: SimpleOp, Idxs: I.getIndices(), Q: SQ); |
| 2326 | if (SimpleV) { |
| 2327 | SimplifiedValues[&I] = SimpleV; |
| 2328 | return true; |
| 2329 | } |
| 2330 | } |
| 2331 | |
| 2332 | // SROA can't look through these, but they may be free. |
| 2333 | return Base::visitExtractValue(I); |
| 2334 | } |
| 2335 | |
| 2336 | bool CallAnalyzer::visitInsertValue(InsertValueInst &I) { |
| 2337 | // Constant folding for insert value is trivial. |
| 2338 | if (simplifyInstruction(I)) |
| 2339 | return true; |
| 2340 | |
| 2341 | // SROA can't look through these, but they may be free. |
| 2342 | return Base::visitInsertValue(I); |
| 2343 | } |
| 2344 | |
| 2345 | /// Try to simplify a call site. |
| 2346 | /// |
| 2347 | /// Takes a concrete function and callsite and tries to actually simplify it by |
| 2348 | /// analyzing the arguments and call itself with instsimplify. Returns true if |
| 2349 | /// it has simplified the callsite to some other entity (a constant), making it |
| 2350 | /// free. |
| 2351 | bool CallAnalyzer::simplifyCallSite(Function *F, CallBase &Call) { |
| 2352 | // FIXME: Using the instsimplify logic directly for this is inefficient |
| 2353 | // because we have to continually rebuild the argument list even when no |
| 2354 | // simplifications can be performed. Until that is fixed with remapping |
| 2355 | // inside of instsimplify, directly constant fold calls here. |
| 2356 | if (!canConstantFoldCallTo(Call: &Call, F)) |
| 2357 | return false; |
| 2358 | |
| 2359 | // Try to re-map the arguments to constants. |
| 2360 | SmallVector<Constant *, 4> ConstantArgs; |
| 2361 | ConstantArgs.reserve(N: Call.arg_size()); |
| 2362 | for (Value *I : Call.args()) { |
| 2363 | Constant *C = getDirectOrSimplifiedValue<Constant>(V: I); |
| 2364 | if (!C) |
| 2365 | return false; // This argument doesn't map to a constant. |
| 2366 | |
| 2367 | ConstantArgs.push_back(Elt: C); |
| 2368 | } |
| 2369 | if (Constant *C = ConstantFoldCall(Call: &Call, F, Operands: ConstantArgs)) { |
| 2370 | SimplifiedValues[&Call] = C; |
| 2371 | return true; |
| 2372 | } |
| 2373 | |
| 2374 | return false; |
| 2375 | } |
| 2376 | |
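|      | /// Return true if a call to \p F is expected to be lowered to an actual
|      | /// call instruction. Fortified memory libcalls whose length is known to fit
|      | /// the destination will be expanded inline and are not treated as calls.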
| 2377 | bool CallAnalyzer::isLoweredToCall(Function *F, CallBase &Call) { |
| 2378 | const TargetLibraryInfo *TLI = GetTLI ? &GetTLI(*F) : nullptr; |
| 2379 | LibFunc LF; |
| 2380 | if (!TLI || !TLI->getLibFunc(FDecl: *F, F&: LF) || !TLI->has(F: LF)) |
| 2381 | return TTI.isLoweredToCall(F); |
| 2382 | |
| 2383 | switch (LF) { |
| 2384 | case LibFunc_memcpy_chk: |
| 2385 | case LibFunc_memmove_chk: |
| 2386 | case LibFunc_mempcpy_chk: |
| 2387 | case LibFunc_memset_chk: { |
| 2388 | // Calls to __memcpy_chk whose length is known to fit within the object |
| 2389 | // size will eventually be replaced by inline stores. Therefore, these |
| 2390 | // should not incur a call penalty. This is only really relevant on |
| 2391 | // platforms whose headers redirect memcpy to __memcpy_chk (e.g. Darwin), as |
| 2392 | // other platforms use memcpy intrinsics, which are already exempt from the |
| 2393 | // call penalty. |
| 2394 | auto *LenOp = getDirectOrSimplifiedValue<ConstantInt>(V: Call.getOperand(i_nocapture: 2)); |
| 2395 | auto *ObjSizeOp = |
| 2396 | getDirectOrSimplifiedValue<ConstantInt>(V: Call.getOperand(i_nocapture: 3)); |
| 2397 | if (LenOp && ObjSizeOp && |
| 2398 | LenOp->getLimitedValue() <= ObjSizeOp->getLimitedValue()) { |
| 2399 | return false; |
| 2400 | } |
| 2401 | break; |
| 2402 | } |
| 2403 | default: |
| 2404 | break; |
| 2405 | } |
| 2406 | |
| 2407 | return TTI.isLoweredToCall(F); |
| 2408 | } |
| 2409 | |
| 2410 | bool CallAnalyzer::visitCallBase(CallBase &Call) { |
| 2411 | if (!onCallBaseVisitStart(Call)) |
| 2412 | return true; |
| 2413 | |
| 2414 | if (Call.hasFnAttr(Kind: Attribute::ReturnsTwice) && |
| 2415 | !F.hasFnAttribute(Kind: Attribute::ReturnsTwice)) { |
| 2416 | // This aborts the entire analysis. |
| 2417 | ExposesReturnsTwice = true; |
| 2418 | return false; |
| 2419 | } |
| 2420 | if (isa<CallInst>(Val: Call) && cast<CallInst>(Val&: Call).cannotDuplicate()) |
| 2421 | ContainsNoDuplicateCall = true; |
| 2422 | |
| 2423 | Function *F = Call.getCalledFunction(); |
| 2424 | bool IsIndirectCall = !F; |
| 2425 | if (IsIndirectCall) { |
| 2426 | // Check if this happens to be an indirect function call to a known function |
| 2427 | // in this inline context. If not, we've done all we can. |
| 2428 | Value *Callee = Call.getCalledOperand(); |
| 2429 | F = getSimplifiedValue<Function>(V: Callee); |
| 2430 | if (!F || F->getFunctionType() != Call.getFunctionType()) { |
| 2431 | onCallArgumentSetup(Call); |
| 2432 | |
| 2433 | if (!Call.onlyReadsMemory()) |
| 2434 | disableLoadElimination(); |
| 2435 | return Base::visitCallBase(I&: Call); |
| 2436 | } |
| 2437 | } |
| 2438 | |
| 2439 | assert(F && "Expected a call to a known function" ); |
| 2440 | |
| 2441 | // When we have a concrete function, first try to simplify it directly. |
| 2442 | if (simplifyCallSite(F, Call)) |
| 2443 | return true; |
| 2444 | |
| 2445 | // Next check if it is an intrinsic we know about. |
| 2446 | // FIXME: Lift this into part of the InstVisitor. |
| 2447 | if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Val: &Call)) { |
| 2448 | switch (II->getIntrinsicID()) { |
| 2449 | default: |
| 2450 | if (!Call.onlyReadsMemory() && !isAssumeLikeIntrinsic(I: II)) |
| 2451 | disableLoadElimination(); |
| 2452 | return Base::visitCallBase(I&: Call); |
| 2453 | |
| 2454 | case Intrinsic::load_relative: |
| 2455 | onLoadRelativeIntrinsic(); |
| 2456 | return false; |
| 2457 | |
| 2458 | case Intrinsic::memset: |
| 2459 | case Intrinsic::memcpy: |
| 2460 | case Intrinsic::memmove: |
| 2461 | disableLoadElimination(); |
| 2462 | // SROA can usually chew through these intrinsics, but they aren't free. |
| 2463 | return false; |
| 2464 | case Intrinsic::icall_branch_funnel: |
| 2465 | case Intrinsic::localescape: |
| 2466 | HasUninlineableIntrinsic = true; |
| 2467 | return false; |
| 2468 | case Intrinsic::vastart: |
| 2469 | InitsVargArgs = true; |
| 2470 | return false; |
| 2471 | case Intrinsic::launder_invariant_group: |
| 2472 | case Intrinsic::strip_invariant_group: |
| 2473 | if (auto *SROAArg = getSROAArgForValueOrNull(V: II->getOperand(i_nocapture: 0))) |
| 2474 | SROAArgValues[II] = SROAArg; |
| 2475 | return true; |
| 2476 | case Intrinsic::is_constant: |
| 2477 | return simplifyIntrinsicCallIsConstant(CB&: Call); |
| 2478 | case Intrinsic::objectsize: |
| 2479 | return simplifyIntrinsicCallObjectSize(CB&: Call); |
| 2480 | } |
| 2481 | } |
| 2482 | |
| 2483 | if (F == Call.getFunction()) { |
| 2484 | // This flag will fully abort the analysis, so don't bother with anything |
| 2485 | // else. |
| 2486 | IsRecursiveCall = true; |
| 2487 | if (!AllowRecursiveCall) |
| 2488 | return false; |
| 2489 | } |
| 2490 | |
| 2491 | if (isLoweredToCall(F, Call)) { |
| 2492 | onLoweredCall(F, Call, IsIndirectCall); |
| 2493 | } |
| 2494 | |
| 2495 | if (!(Call.onlyReadsMemory() || (IsIndirectCall && F->onlyReadsMemory()))) |
| 2496 | disableLoadElimination(); |
| 2497 | return Base::visitCallBase(I&: Call); |
| 2498 | } |
| 2499 | |
| 2500 | bool CallAnalyzer::visitReturnInst(ReturnInst &RI) { |
| 2501 | // At least one return instruction will be free after inlining. |
| 2502 | bool Free = !HasReturn; |
| 2503 | HasReturn = true; |
| 2504 | return Free; |
| 2505 | } |
| 2506 | |
| 2507 | bool CallAnalyzer::visitBranchInst(BranchInst &BI) { |
| 2508 | // We model unconditional branches as essentially free -- they really |
| 2509 | // shouldn't exist at all, but handling them makes the behavior of the |
| 2510 | // inliner more regular and predictable. Interestingly, conditional branches |
| 2511 | // which will fold away are also free. |
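| | // For example (illustrative): if the branch condition is a callee parameter |
| | // that this call site passes as the constant true, the branch is treated as |
| | // unconditional and therefore free. |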
| 2512 | return BI.isUnconditional() || |
| 2513 | getDirectOrSimplifiedValue<ConstantInt>(V: BI.getCondition()) || |
| 2514 | BI.getMetadata(KindID: LLVMContext::MD_make_implicit); |
| 2515 | } |
| 2516 | |
| 2517 | bool CallAnalyzer::visitSelectInst(SelectInst &SI) { |
| 2518 | bool CheckSROA = SI.getType()->isPointerTy(); |
| 2519 | Value *TrueVal = SI.getTrueValue(); |
| 2520 | Value *FalseVal = SI.getFalseValue(); |
| 2521 | |
| 2522 | Constant *TrueC = getDirectOrSimplifiedValue<Constant>(V: TrueVal); |
| 2523 | Constant *FalseC = getDirectOrSimplifiedValue<Constant>(V: FalseVal); |
| 2524 | Constant *CondC = getSimplifiedValue<Constant>(V: SI.getCondition()); |
| 2525 | |
| 2526 | if (!CondC) { |
| 2527 | // Select C, X, X => X |
| 2528 | if (TrueC == FalseC && TrueC) { |
| 2529 | SimplifiedValues[&SI] = TrueC; |
| 2530 | return true; |
| 2531 | } |
| 2532 | |
| 2533 | if (!CheckSROA) |
| 2534 | return Base::visitSelectInst(I&: SI); |
| 2535 | |
| 2536 | std::pair<Value *, APInt> TrueBaseAndOffset = |
| 2537 | ConstantOffsetPtrs.lookup(Val: TrueVal); |
| 2538 | std::pair<Value *, APInt> FalseBaseAndOffset = |
| 2539 | ConstantOffsetPtrs.lookup(Val: FalseVal); |
| 2540 | if (TrueBaseAndOffset == FalseBaseAndOffset && TrueBaseAndOffset.first) { |
| 2541 | ConstantOffsetPtrs[&SI] = TrueBaseAndOffset; |
| 2542 | |
| 2543 | if (auto *SROAArg = getSROAArgForValueOrNull(V: TrueVal)) |
| 2544 | SROAArgValues[&SI] = SROAArg; |
| 2545 | return true; |
| 2546 | } |
| 2547 | |
| 2548 | return Base::visitSelectInst(I&: SI); |
| 2549 | } |
| 2550 | |
| 2551 | // Select condition is a constant. |
| 2552 | Value *SelectedV = CondC->isAllOnesValue() ? TrueVal |
| 2553 | : (CondC->isNullValue()) ? FalseVal |
| 2554 | : nullptr; |
| 2555 | if (!SelectedV) { |
| 2556 | // Condition is a vector constant that is not all 1s or all 0s. If all |
| 2557 | // operands are constants, ConstantFoldSelectInstruction() can handle |
| 2558 | // cases such as vector selects. |
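| | // For example (illustrative): |
| | //   select <2 x i1> <i1 true, i1 false>, <2 x i32> <i32 1, i32 1>, |
| | //                                        <2 x i32> <i32 2, i32 2> |
| | // folds to <2 x i32> <i32 1, i32 2>. |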
| 2559 | if (TrueC && FalseC) { |
| 2560 | if (auto *C = ConstantFoldSelectInstruction(Cond: CondC, V1: TrueC, V2: FalseC)) { |
| 2561 | SimplifiedValues[&SI] = C; |
| 2562 | return true; |
| 2563 | } |
| 2564 | } |
| 2565 | return Base::visitSelectInst(I&: SI); |
| 2566 | } |
| 2567 | |
| 2568 | // Condition is either all 1s or all 0s. SI can be simplified. |
| 2569 | if (Constant *SelectedC = dyn_cast<Constant>(Val: SelectedV)) { |
| 2570 | SimplifiedValues[&SI] = SelectedC; |
| 2571 | return true; |
| 2572 | } |
| 2573 | |
| 2574 | if (!CheckSROA) |
| 2575 | return true; |
| 2576 | |
| 2577 | std::pair<Value *, APInt> BaseAndOffset = |
| 2578 | ConstantOffsetPtrs.lookup(Val: SelectedV); |
| 2579 | if (BaseAndOffset.first) { |
| 2580 | ConstantOffsetPtrs[&SI] = BaseAndOffset; |
| 2581 | |
| 2582 | if (auto *SROAArg = getSROAArgForValueOrNull(V: SelectedV)) |
| 2583 | SROAArgValues[&SI] = SROAArg; |
| 2584 | } |
| 2585 | |
| 2586 | return true; |
| 2587 | } |
| 2588 | |
| 2589 | bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) { |
| 2590 | // We model switches with a constant (hence effectively unconditional) |
| 2591 | // condition as free; see the comments on handling branches. |
| 2592 | if (getDirectOrSimplifiedValue<ConstantInt>(V: SI.getCondition())) |
| 2593 | return true; |
| 2594 | |
| 2595 | // Assume the most general case where the switch is lowered into |
| 2596 | // either a jump table, bit test, or a balanced binary tree consisting of |
| 2597 | // case clusters without merging adjacent clusters with the same |
| 2598 | // destination. We do not consider switches that are lowered with a mix of |
| 2599 | // jump table/bit test/binary search tree. The cost of the switch is |
| 2600 | // proportional to the size of the tree or the size of the jump table range. |
| 2601 | // |
| 2602 | // NB: In simplifycfg we convert large switches that are just used to |
| 2603 | // initialize large phi nodes into lookup tables, so this shouldn't prevent |
| 2604 | // inlining those. It will prevent inlining in cases where that optimization |
| 2605 | // does not (yet) fire. |
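| | // For example (illustrative): a dense switch over a contiguous range of |
| | // values is likely lowered to a jump table and charged in proportion to that |
| | // range, while a sparse switch with a few widely spaced cases is charged for |
| | // a balanced binary tree over its case clusters. |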
| 2606 | |
| 2607 | unsigned JumpTableSize = 0; |
| 2608 | BlockFrequencyInfo *BFI = GetBFI ? &(GetBFI(F)) : nullptr; |
| 2609 | unsigned NumCaseCluster = |
| 2610 | TTI.getEstimatedNumberOfCaseClusters(SI, JTSize&: JumpTableSize, PSI, BFI); |
| 2611 | |
| 2612 | onFinalizeSwitch(JumpTableSize, NumCaseCluster, DefaultDestUnreachable: SI.defaultDestUnreachable()); |
| 2613 | return false; |
| 2614 | } |
| 2615 | |
| 2616 | bool CallAnalyzer::visitIndirectBrInst(IndirectBrInst &IBI) { |
| 2617 | // We never want to inline functions that contain an indirectbr. Inlining |
| 2618 | // them would be incorrect because all the blockaddresses (in static global |
| 2619 | // initializers for example) would still refer to the original function, and |
| 2620 | // the indirect jump would jump from the inlined copy of the function into |
| 2621 | // the original function, which is undefined behavior. |
| 2622 | // FIXME: This logic isn't really right; we can safely inline functions with |
| 2623 | // indirectbr's as long as no other function or global references the |
| 2624 | // blockaddress of a block within the current function. |
| 2625 | HasIndirectBr = true; |
| 2626 | return false; |
| 2627 | } |
| 2628 | |
| 2629 | bool CallAnalyzer::visitResumeInst(ResumeInst &RI) { |
| 2630 | // FIXME: It's not clear that a single instruction is an accurate model for |
| 2631 | // the inline cost of a resume instruction. |
| 2632 | return false; |
| 2633 | } |
| 2634 | |
| 2635 | bool CallAnalyzer::visitCleanupReturnInst(CleanupReturnInst &CRI) { |
| 2636 | // FIXME: It's not clear that a single instruction is an accurate model for |
| 2637 | // the inline cost of a cleanupret instruction. |
| 2638 | return false; |
| 2639 | } |
| 2640 | |
| 2641 | bool CallAnalyzer::visitCatchReturnInst(CatchReturnInst &CRI) { |
| 2642 | // FIXME: It's not clear that a single instruction is an accurate model for |
| 2643 | // the inline cost of a catchret instruction. |
| 2644 | return false; |
| 2645 | } |
| 2646 | |
| 2647 | bool CallAnalyzer::visitUnreachableInst(UnreachableInst &I) { |
| 2648 | // FIXME: It might be reasonable to discount the cost of instructions leading |
| 2649 | // to unreachable as they have the lowest possible impact on both runtime and |
| 2650 | // code size. |
| 2651 | return true; // No actual code is needed for unreachable. |
| 2652 | } |
| 2653 | |
| 2654 | bool CallAnalyzer::visitInstruction(Instruction &I) { |
| 2655 | // Some instructions are free. All of the free intrinsics can also be |
| 2656 | // handled by SROA, etc. |
| 2657 | if (TTI.getInstructionCost(U: &I, CostKind: TargetTransformInfo::TCK_SizeAndLatency) == |
| 2658 | TargetTransformInfo::TCC_Free) |
| 2659 | return true; |
| 2660 | |
| 2661 | // We found something we don't understand or can't handle. Mark any SROA-able |
| 2662 | // values in the operand list as no longer viable. |
| 2663 | for (const Use &Op : I.operands()) |
| 2664 | disableSROA(V: Op); |
| 2665 | |
| 2666 | return false; |
| 2667 | } |
| 2668 | |
| 2669 | /// Analyze a basic block for its contribution to the inline cost. |
| 2670 | /// |
| 2671 | /// This method walks the analyzer over every instruction in the given basic |
| 2672 | /// block and accounts for their cost during inlining at this callsite. It |
| 2673 | /// aborts early if the threshold has been exceeded or an impossible-to-inline |
| 2674 | /// construct has been detected. It returns a failure result if inlining is no |
| 2675 | /// longer viable, and a success result if inlining remains viable. |
| 2676 | InlineResult |
| 2677 | CallAnalyzer::analyzeBlock(BasicBlock *BB, |
| 2678 | const SmallPtrSetImpl<const Value *> &EphValues) { |
| 2679 | for (Instruction &I : *BB) { |
| 2680 | // FIXME: Currently, the number of instructions in a function, regardless of |
| 2681 | // our ability to simplify them during inlining to constants or dead code, |
| 2682 | // is actually used by the vector bonus heuristic. As long as that's true, |
| 2683 | // we have to special-case debug intrinsics here to prevent differences in |
| 2684 | // inlining due to debug symbols. Eventually, the number of unsimplified |
| 2685 | // instructions shouldn't factor into the cost computation, but until then, |
| 2686 | // hack around it here. |
| 2687 | // Similarly, skip pseudo-probes. |
| 2688 | if (I.isDebugOrPseudoInst()) |
| 2689 | continue; |
| 2690 | |
| 2691 | // Skip ephemeral values. |
| 2692 | if (EphValues.count(Ptr: &I)) |
| 2693 | continue; |
| 2694 | |
| 2695 | ++NumInstructions; |
| 2696 | if (isa<ExtractElementInst>(Val: I) || I.getType()->isVectorTy()) |
| 2697 | ++NumVectorInstructions; |
| 2698 | |
| 2699 | // If the instruction simplified to a constant, there is no cost to this |
| 2700 | // instruction. Visit the instructions using our InstVisitor to account for |
| 2701 | // all of the per-instruction logic. The visit tree returns true if we |
| 2702 | // consumed the instruction in any way, and false if the instruction's base |
| 2703 | // cost should count against inlining. |
| 2704 | onInstructionAnalysisStart(I: &I); |
| 2705 | |
| 2706 | if (Base::visit(I: &I)) |
| 2707 | ++NumInstructionsSimplified; |
| 2708 | else |
| 2709 | onMissedSimplification(); |
| 2710 | |
| 2711 | onInstructionAnalysisFinish(I: &I); |
| 2712 | using namespace ore; |
| 2713 | // If visiting this instruction detected an uninlinable pattern, abort. |
| 2714 | InlineResult IR = InlineResult::success(); |
| 2715 | if (IsRecursiveCall && !AllowRecursiveCall) |
| 2716 | IR = InlineResult::failure(Reason: "recursive" ); |
| 2717 | else if (ExposesReturnsTwice) |
| 2718 | IR = InlineResult::failure(Reason: "exposes returns twice" ); |
| 2719 | else if (HasDynamicAlloca) |
| 2720 | IR = InlineResult::failure(Reason: "dynamic alloca" ); |
| 2721 | else if (HasIndirectBr) |
| 2722 | IR = InlineResult::failure(Reason: "indirect branch" ); |
| 2723 | else if (HasUninlineableIntrinsic) |
| 2724 | IR = InlineResult::failure(Reason: "uninlinable intrinsic" ); |
| 2725 | else if (InitsVargArgs) |
| 2726 | IR = InlineResult::failure(Reason: "varargs" ); |
| 2727 | if (!IR.isSuccess()) { |
| 2728 | if (ORE) |
| 2729 | ORE->emit(RemarkBuilder: [&]() { |
| 2730 | return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline" , |
| 2731 | &CandidateCall) |
| 2732 | << NV("Callee" , &F) << " has uninlinable pattern (" |
| 2733 | << NV("InlineResult" , IR.getFailureReason()) |
| 2734 | << ") and cost is not fully computed" ; |
| 2735 | }); |
| 2736 | return IR; |
| 2737 | } |
| 2738 | |
| 2739 | // If the caller is a recursive function, then we don't want to inline |
| 2740 | // functions which allocate a lot of stack space, because it would increase |
| 2741 | // the caller's stack usage dramatically. |
| 2742 | if (IsCallerRecursive && AllocatedSize > RecurStackSizeThreshold) { |
| 2743 | auto IR = |
| 2744 | InlineResult::failure(Reason: "recursive and allocates too much stack space" ); |
| 2745 | if (ORE) |
| 2746 | ORE->emit(RemarkBuilder: [&]() { |
| 2747 | return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline" , |
| 2748 | &CandidateCall) |
| 2749 | << NV("Callee" , &F) << " is " |
| 2750 | << NV("InlineResult" , IR.getFailureReason()) |
| 2751 | << ". Cost is not fully computed" ; |
| 2752 | }); |
| 2753 | return IR; |
| 2754 | } |
| 2755 | |
| 2756 | if (shouldStop()) |
| 2757 | return InlineResult::failure( |
| 2758 | Reason: "Call site analysis is not favorable to inlining." ); |
| 2759 | } |
| 2760 | |
| 2761 | return InlineResult::success(); |
| 2762 | } |
| 2763 | |
| 2764 | /// Compute the base pointer and cumulative constant offsets for V. |
| 2765 | /// |
| 2766 | /// This strips all constant offsets off of V, leaving it as the base pointer, |
| 2767 | /// and accumulates the total constant offset applied in the returned constant. |
| 2768 | /// It returns nullptr if V is not a pointer, and returns the constant '0' if |
| 2769 | /// there are no constant offsets applied. |
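| | /// For example (illustrative): if V is an inbounds GEP adding a constant |
| | /// offset of 4 to another inbounds GEP adding 8 to a pointer %p, then V is |
| | /// rewritten to %p and the returned constant is 12. |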
| 2770 | ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) { |
| 2771 | if (!V->getType()->isPointerTy()) |
| 2772 | return nullptr; |
| 2773 | |
| 2774 | unsigned AS = V->getType()->getPointerAddressSpace(); |
| 2775 | unsigned IntPtrWidth = DL.getIndexSizeInBits(AS); |
| 2776 | APInt Offset = APInt::getZero(numBits: IntPtrWidth); |
| 2777 | |
| 2778 | // Even though we don't look through PHI nodes, we could be called on an |
| 2779 | // instruction in an unreachable block, which may be on a cycle. |
| 2780 | SmallPtrSet<Value *, 4> Visited; |
| 2781 | Visited.insert(Ptr: V); |
| 2782 | do { |
| 2783 | if (GEPOperator *GEP = dyn_cast<GEPOperator>(Val: V)) { |
| 2784 | if (!GEP->isInBounds() || !accumulateGEPOffset(GEP&: *GEP, Offset)) |
| 2785 | return nullptr; |
| 2786 | V = GEP->getPointerOperand(); |
| 2787 | } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(Val: V)) { |
| 2788 | if (GA->isInterposable()) |
| 2789 | break; |
| 2790 | V = GA->getAliasee(); |
| 2791 | } else { |
| 2792 | break; |
| 2793 | } |
| 2794 | assert(V->getType()->isPointerTy() && "Unexpected operand type!" ); |
| 2795 | } while (Visited.insert(Ptr: V).second); |
| 2796 | |
| 2797 | Type *IdxPtrTy = DL.getIndexType(PtrTy: V->getType()); |
| 2798 | return cast<ConstantInt>(Val: ConstantInt::get(Ty: IdxPtrTy, V: Offset)); |
| 2799 | } |
| 2800 | |
| 2801 | /// Find dead blocks due to deleted CFG edges during inlining. |
| 2802 | /// |
| 2803 | /// If we know the successor of the current block, \p CurrBB, has to be \p |
| 2804 | /// NextBB, the other successors of \p CurrBB are dead if these successors have |
| 2805 | /// no live incoming CFG edges. If one block is found to be dead, we can |
| 2806 | /// continue growing the dead block list by checking the successors of the dead |
| 2807 | /// blocks to see if all their incoming edges are dead or not. |
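| | /// For example (illustrative): if CurrBB's terminator is known to branch to |
| | /// NextBB, then a sibling successor B whose only live predecessor was CurrBB |
| | /// is dead, and so is any block reachable only through B. |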
| 2808 | void CallAnalyzer::findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB) { |
| 2809 | auto IsEdgeDead = [&](BasicBlock *Pred, BasicBlock *Succ) { |
| 2810 | // A CFG edge is dead if the predecessor is dead or the predecessor has a |
| 2811 | // known successor which is not the one under examination. |
| 2812 | if (DeadBlocks.count(Ptr: Pred)) |
| 2813 | return true; |
| 2814 | BasicBlock *KnownSucc = KnownSuccessors[Pred]; |
| 2815 | return KnownSucc && KnownSucc != Succ; |
| 2816 | }; |
| 2817 | |
| 2818 | auto IsNewlyDead = [&](BasicBlock *BB) { |
| 2819 | // If all the edges to a block are dead, the block is also dead. |
| 2820 | return (!DeadBlocks.count(Ptr: BB) && |
| 2821 | llvm::all_of(Range: predecessors(BB), |
| 2822 | P: [&](BasicBlock *P) { return IsEdgeDead(P, BB); })); |
| 2823 | }; |
| 2824 | |
| 2825 | for (BasicBlock *Succ : successors(BB: CurrBB)) { |
| 2826 | if (Succ == NextBB || !IsNewlyDead(Succ)) |
| 2827 | continue; |
| 2828 | SmallVector<BasicBlock *, 4> NewDead; |
| 2829 | NewDead.push_back(Elt: Succ); |
| 2830 | while (!NewDead.empty()) { |
| 2831 | BasicBlock *Dead = NewDead.pop_back_val(); |
| 2832 | if (DeadBlocks.insert(Ptr: Dead).second) |
| 2833 | // Continue growing the dead block lists. |
| 2834 | for (BasicBlock *S : successors(BB: Dead)) |
| 2835 | if (IsNewlyDead(S)) |
| 2836 | NewDead.push_back(Elt: S); |
| 2837 | } |
| 2838 | } |
| 2839 | } |
| 2840 | |
| 2841 | /// Analyze a call site for potential inlining. |
| 2842 | /// |
| 2843 | /// Returns true if inlining this call is viable, and false if it is not |
| 2844 | /// viable. It computes the cost and adjusts the threshold based on numerous |
| 2845 | /// factors and heuristics. If this method returns false but the computed cost |
| 2846 | /// is below the computed threshold, then inlining was forcibly disabled by |
| 2847 | /// some artifact of the routine. |
| 2848 | InlineResult CallAnalyzer::analyze() { |
| 2849 | ++NumCallsAnalyzed; |
| 2850 | |
| 2851 | auto Result = onAnalysisStart(); |
| 2852 | if (!Result.isSuccess()) |
| 2853 | return Result; |
| 2854 | |
| 2855 | if (F.empty()) |
| 2856 | return InlineResult::success(); |
| 2857 | |
| 2858 | Function *Caller = CandidateCall.getFunction(); |
| 2859 | // Check if the caller function is recursive itself. |
| 2860 | for (User *U : Caller->users()) { |
| 2861 | CallBase *Call = dyn_cast<CallBase>(Val: U); |
| 2862 | if (Call && Call->getFunction() == Caller) { |
| 2863 | IsCallerRecursive = true; |
| 2864 | break; |
| 2865 | } |
| 2866 | } |
| 2867 | |
| 2868 | // Populate our simplified values by mapping from function arguments to call |
| 2869 | // arguments with known important simplifications. |
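| | // For example (illustrative): if the call site passes the constant 42 for a |
| | // parameter %n, uses of %n in the callee are costed as if they were 42. |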
| 2870 | auto CAI = CandidateCall.arg_begin(); |
| 2871 | for (Argument &FAI : F.args()) { |
| 2872 | assert(CAI != CandidateCall.arg_end()); |
| 2873 | SimplifiedValues[&FAI] = *CAI; |
| 2874 | if (isa<Constant>(Val: *CAI)) |
| 2875 | ++NumConstantArgs; |
| 2876 | |
| 2877 | Value *PtrArg = *CAI; |
| 2878 | if (ConstantInt *C = stripAndComputeInBoundsConstantOffsets(V&: PtrArg)) { |
| 2879 | ConstantOffsetPtrs[&FAI] = std::make_pair(x&: PtrArg, y: C->getValue()); |
| 2880 | |
| 2881 | // We can SROA any pointer arguments derived from alloca instructions. |
| 2882 | if (auto *SROAArg = dyn_cast<AllocaInst>(Val: PtrArg)) { |
| 2883 | SROAArgValues[&FAI] = SROAArg; |
| 2884 | onInitializeSROAArg(Arg: SROAArg); |
| 2885 | EnabledSROAAllocas.insert(V: SROAArg); |
| 2886 | } |
| 2887 | } |
| 2888 | ++CAI; |
| 2889 | } |
| 2890 | NumConstantOffsetPtrArgs = ConstantOffsetPtrs.size(); |
| 2891 | NumAllocaArgs = SROAArgValues.size(); |
| 2892 | |
| 2893 | // Collecting the ephemeral values of `F` can be expensive, so use the |
| 2894 | // ephemeral values cache if available. |
| 2895 | SmallPtrSet<const Value *, 32> EphValuesStorage; |
| 2896 | const SmallPtrSetImpl<const Value *> *EphValues = &EphValuesStorage; |
| 2897 | if (GetEphValuesCache) |
| 2898 | EphValues = &GetEphValuesCache(F).ephValues(); |
| 2899 | else |
| 2900 | CodeMetrics::collectEphemeralValues(L: &F, AC: &GetAssumptionCache(F), |
| 2901 | EphValues&: EphValuesStorage); |
| 2902 | |
| 2903 | // The worklist of live basic blocks in the callee *after* inlining. We avoid |
| 2904 | // adding basic blocks of the callee which can be proven to be dead for this |
| 2905 | // particular call site in order to get more accurate cost estimates. This |
| 2906 | // requires a somewhat heavyweight iteration pattern: we need to walk the |
| 2907 | // basic blocks in a breadth-first order as we insert live successors. To |
| 2908 | // accomplish this, we use a small-size optimized SetVector; because we exit |
| 2909 | // once we cross our threshold, we expect only a small number of iterations. |
| 2910 | typedef SmallSetVector<BasicBlock *, 16> BBSetVector; |
| 2911 | BBSetVector BBWorklist; |
| 2912 | BBWorklist.insert(X: &F.getEntryBlock()); |
| 2913 | |
| 2914 | // Note that we *must not* cache the size; this loop grows the worklist. |
| 2915 | for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) { |
| 2916 | if (shouldStop()) |
| 2917 | break; |
| 2918 | |
| 2919 | BasicBlock *BB = BBWorklist[Idx]; |
| 2920 | if (BB->empty()) |
| 2921 | continue; |
| 2922 | |
| 2923 | onBlockStart(BB); |
| 2924 | |
| 2925 | // Disallow inlining a blockaddress with uses other than strictly callbr. |
| 2926 | // A blockaddress only has defined behavior for an indirect branch in the |
| 2927 | // same function, and we do not currently support inlining indirect |
| 2928 | // branches. But, the inliner may not see an indirect branch that ends up |
| 2929 | // being dead code at a particular call site. If the blockaddress escapes |
| 2930 | // the function, e.g., via a global variable, inlining may lead to an |
| 2931 | // invalid cross-function reference. |
| 2932 | // FIXME: pr/39560: continue relaxing this overt restriction. |
| 2933 | if (BB->hasAddressTaken()) |
| 2934 | for (User *U : BlockAddress::get(BB: &*BB)->users()) |
| 2935 | if (!isa<CallBrInst>(Val: *U)) |
| 2936 | return InlineResult::failure(Reason: "blockaddress used outside of callbr" ); |
| 2937 | |
| 2938 | // Analyze the cost of this block. If we blow through the threshold, this |
| 2939 | // returns a failure result, and we can bail out. |
| 2940 | InlineResult IR = analyzeBlock(BB, EphValues: *EphValues); |
| 2941 | if (!IR.isSuccess()) |
| 2942 | return IR; |
| 2943 | |
| 2944 | Instruction *TI = BB->getTerminator(); |
| 2945 | |
| 2946 | // Add in the live successors by first checking whether we have a terminator |
| 2947 | // that may be simplified based on the values simplified by this call. |
| 2948 | if (BranchInst *BI = dyn_cast<BranchInst>(Val: TI)) { |
| 2949 | if (BI->isConditional()) { |
| 2950 | Value *Cond = BI->getCondition(); |
| 2951 | if (ConstantInt *SimpleCond = getSimplifiedValue<ConstantInt>(V: Cond)) { |
| 2952 | BasicBlock *NextBB = BI->getSuccessor(i: SimpleCond->isZero() ? 1 : 0); |
| 2953 | BBWorklist.insert(X: NextBB); |
| 2954 | KnownSuccessors[BB] = NextBB; |
| 2955 | findDeadBlocks(CurrBB: BB, NextBB); |
| 2956 | continue; |
| 2957 | } |
| 2958 | } |
| 2959 | } else if (SwitchInst *SI = dyn_cast<SwitchInst>(Val: TI)) { |
| 2960 | Value *Cond = SI->getCondition(); |
| 2961 | if (ConstantInt *SimpleCond = getSimplifiedValue<ConstantInt>(V: Cond)) { |
| 2962 | BasicBlock *NextBB = SI->findCaseValue(C: SimpleCond)->getCaseSuccessor(); |
| 2963 | BBWorklist.insert(X: NextBB); |
| 2964 | KnownSuccessors[BB] = NextBB; |
| 2965 | findDeadBlocks(CurrBB: BB, NextBB); |
| 2966 | continue; |
| 2967 | } |
| 2968 | } |
| 2969 | |
| 2970 | // If we're unable to select a particular successor, just count all of |
| 2971 | // them. |
| 2972 | BBWorklist.insert_range(R: successors(BB)); |
| 2973 | |
| 2974 | onBlockAnalyzed(BB); |
| 2975 | } |
| 2976 | |
| 2977 | // If this is a noduplicate call, we can still inline as long as |
| 2978 | // inlining this would cause the removal of the callee (so the instruction |
| 2979 | // is not actually duplicated, just moved). |
| 2980 | if (!isSoleCallToLocalFunction(CB: CandidateCall, Callee: F) && ContainsNoDuplicateCall) |
| 2981 | return InlineResult::failure(Reason: "noduplicate" ); |
| 2982 | |
| 2983 | // If the callee's stack size exceeds the user-specified threshold, |
| 2984 | // do not let it be inlined. |
| 2985 | // The command line option overrides a limit set in the function attributes. |
| 2986 | size_t FinalStackSizeThreshold = StackSizeThreshold; |
| 2987 | if (!StackSizeThreshold.getNumOccurrences()) |
| 2988 | if (std::optional<int> AttrMaxStackSize = getStringFnAttrAsInt( |
| 2989 | F: Caller, AttrKind: InlineConstants::MaxInlineStackSizeAttributeName)) |
| 2990 | FinalStackSizeThreshold = *AttrMaxStackSize; |
| 2991 | if (AllocatedSize > FinalStackSizeThreshold) |
| 2992 | return InlineResult::failure(Reason: "stacksize" ); |
| 2993 | |
| 2994 | return finalizeAnalysis(); |
| 2995 | } |
| 2996 | |
| 2997 | void InlineCostCallAnalyzer::print(raw_ostream &OS) { |
| 2998 | #define DEBUG_PRINT_STAT(x) OS << " " #x ": " << x << "\n" |
| 2999 | if (PrintInstructionComments) |
| 3000 | F.print(OS, AAW: &Writer); |
| 3001 | DEBUG_PRINT_STAT(NumConstantArgs); |
| 3002 | DEBUG_PRINT_STAT(NumConstantOffsetPtrArgs); |
| 3003 | DEBUG_PRINT_STAT(NumAllocaArgs); |
| 3004 | DEBUG_PRINT_STAT(NumConstantPtrCmps); |
| 3005 | DEBUG_PRINT_STAT(NumConstantPtrDiffs); |
| 3006 | DEBUG_PRINT_STAT(NumInstructionsSimplified); |
| 3007 | DEBUG_PRINT_STAT(NumInstructions); |
| 3008 | DEBUG_PRINT_STAT(SROACostSavings); |
| 3009 | DEBUG_PRINT_STAT(SROACostSavingsLost); |
| 3010 | DEBUG_PRINT_STAT(LoadEliminationCost); |
| 3011 | DEBUG_PRINT_STAT(ContainsNoDuplicateCall); |
| 3012 | DEBUG_PRINT_STAT(Cost); |
| 3013 | DEBUG_PRINT_STAT(Threshold); |
| 3014 | #undef DEBUG_PRINT_STAT |
| 3015 | } |
| 3016 | |
| 3017 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 3018 | /// Dump stats about this call's analysis. |
| 3019 | LLVM_DUMP_METHOD void InlineCostCallAnalyzer::dump() { print(dbgs()); } |
| 3020 | #endif |
| 3021 | |
| 3022 | /// Test that there are no attribute conflicts between Caller and Callee |
| 3023 | /// that prevent inlining. |
| 3024 | static bool functionsHaveCompatibleAttributes( |
| 3025 | Function *Caller, Function *Callee, TargetTransformInfo &TTI, |
| 3026 | function_ref<const TargetLibraryInfo &(Function &)> &GetTLI) { |
| 3027 | // Note that CalleeTLI must be a copy not a reference. The legacy pass manager |
| 3028 | // caches the most recently created TLI in the TargetLibraryInfoWrapperPass |
| 3029 | // object, and always returns the same object (which is overwritten on each |
| 3030 | // GetTLI call). Therefore we copy the first result. |
| 3031 | auto CalleeTLI = GetTLI(*Callee); |
| 3032 | return (IgnoreTTIInlineCompatible || |
| 3033 | TTI.areInlineCompatible(Caller, Callee)) && |
| 3034 | GetTLI(*Caller).areInlineCompatible(CalleeTLI, |
| 3035 | AllowCallerSuperset: InlineCallerSupersetNoBuiltin) && |
| 3036 | AttributeFuncs::areInlineCompatible(Caller: *Caller, Callee: *Callee); |
| 3037 | } |
| 3038 | |
| 3039 | int llvm::getCallsiteCost(const TargetTransformInfo &TTI, const CallBase &Call, |
| 3040 | const DataLayout &DL) { |
| 3041 | int64_t Cost = 0; |
| 3042 | for (unsigned I = 0, E = Call.arg_size(); I != E; ++I) { |
| 3043 | if (Call.isByValArgument(ArgNo: I)) { |
| 3044 | // We approximate the number of loads and stores needed by dividing the |
| 3045 | // size of the byval type by the target's pointer size. |
| 3046 | PointerType *PTy = cast<PointerType>(Val: Call.getArgOperand(i: I)->getType()); |
| 3047 | unsigned TypeSize = DL.getTypeSizeInBits(Ty: Call.getParamByValType(ArgNo: I)); |
| 3048 | unsigned AS = PTy->getAddressSpace(); |
| 3049 | unsigned PointerSize = DL.getPointerSizeInBits(AS); |
| 3050 | // Ceiling division. |
| 3051 | unsigned NumStores = (TypeSize + PointerSize - 1) / PointerSize; |
| 3052 | |
| 3053 | // If it generates more than 8 stores it is likely to be expanded as an |
| 3054 | // inline memcpy so we take that as an upper bound. Otherwise we assume |
| 3055 | // one load and one store per word copied. |
| 3056 | // FIXME: The maxStoresPerMemcpy setting from the target should be used |
| 3057 | // here instead of a magic number of 8, but it's not available via |
| 3058 | // DataLayout. |
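| | // For example (illustrative): a 256-bit byval type with 64-bit pointers |
| | // gives NumStores = (256 + 63) / 64 = 4, below the cap of 8, so the cost |
| | // below adds 2 * 4 * InstrCost (four loads plus four stores). |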
| 3059 | NumStores = std::min(a: NumStores, b: 8U); |
| 3060 | |
| 3061 | Cost += 2 * NumStores * InstrCost; |
| 3062 | } else { |
| 3063 | // For non-byval arguments subtract off one instruction per call |
| 3064 | // argument. |
| 3065 | Cost += InstrCost; |
| 3066 | } |
| 3067 | } |
| 3068 | // The call instruction also disappears after inlining. |
| 3069 | Cost += InstrCost; |
| 3070 | Cost += TTI.getInlineCallPenalty(F: Call.getCaller(), Call, DefaultCallPenalty: CallPenalty); |
| 3071 | |
| 3072 | return std::min<int64_t>(a: Cost, INT_MAX); |
| 3073 | } |
| 3074 | |
| 3075 | InlineCost llvm::getInlineCost( |
| 3076 | CallBase &Call, const InlineParams &Params, TargetTransformInfo &CalleeTTI, |
| 3077 | function_ref<AssumptionCache &(Function &)> GetAssumptionCache, |
| 3078 | function_ref<const TargetLibraryInfo &(Function &)> GetTLI, |
| 3079 | function_ref<BlockFrequencyInfo &(Function &)> GetBFI, |
| 3080 | ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE, |
| 3081 | function_ref<EphemeralValuesCache &(Function &)> GetEphValuesCache) { |
| 3082 | return getInlineCost(Call, Callee: Call.getCalledFunction(), Params, CalleeTTI, |
| 3083 | GetAssumptionCache, GetTLI, GetBFI, PSI, ORE, |
| 3084 | GetEphValuesCache); |
| 3085 | } |
| 3086 | |
| 3087 | std::optional<int> llvm::getInliningCostEstimate( |
| 3088 | CallBase &Call, TargetTransformInfo &CalleeTTI, |
| 3089 | function_ref<AssumptionCache &(Function &)> GetAssumptionCache, |
| 3090 | function_ref<BlockFrequencyInfo &(Function &)> GetBFI, |
| 3091 | function_ref<const TargetLibraryInfo &(Function &)> GetTLI, |
| 3092 | ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) { |
| 3093 | const InlineParams Params = {/* DefaultThreshold*/ 0, |
| 3094 | /*HintThreshold*/ {}, |
| 3095 | /*ColdThreshold*/ {}, |
| 3096 | /*OptSizeThreshold*/ {}, |
| 3097 | /*OptMinSizeThreshold*/ {}, |
| 3098 | /*HotCallSiteThreshold*/ {}, |
| 3099 | /*LocallyHotCallSiteThreshold*/ {}, |
| 3100 | /*ColdCallSiteThreshold*/ {}, |
| 3101 | /*ComputeFullInlineCost*/ true, |
| 3102 | /*EnableDeferral*/ true}; |
| 3103 | |
| 3104 | InlineCostCallAnalyzer CA(*Call.getCalledFunction(), Call, Params, CalleeTTI, |
| 3105 | GetAssumptionCache, GetBFI, GetTLI, PSI, ORE, true, |
| 3106 | /*IgnoreThreshold*/ true); |
| 3107 | auto R = CA.analyze(); |
| 3108 | if (!R.isSuccess()) |
| 3109 | return std::nullopt; |
| 3110 | return CA.getCost(); |
| 3111 | } |
| 3112 | |
| 3113 | std::optional<InlineCostFeatures> llvm::getInliningCostFeatures( |
| 3114 | CallBase &Call, TargetTransformInfo &CalleeTTI, |
| 3115 | function_ref<AssumptionCache &(Function &)> GetAssumptionCache, |
| 3116 | function_ref<BlockFrequencyInfo &(Function &)> GetBFI, |
| 3117 | function_ref<const TargetLibraryInfo &(Function &)> GetTLI, |
| 3118 | ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) { |
| 3119 | InlineCostFeaturesAnalyzer CFA(CalleeTTI, GetAssumptionCache, GetBFI, GetTLI, |
| 3120 | PSI, ORE, *Call.getCalledFunction(), Call); |
| 3121 | auto R = CFA.analyze(); |
| 3122 | if (!R.isSuccess()) |
| 3123 | return std::nullopt; |
| 3124 | return CFA.features(); |
| 3125 | } |
| 3126 | |
| 3127 | std::optional<InlineResult> llvm::getAttributeBasedInliningDecision( |
| 3128 | CallBase &Call, Function *Callee, TargetTransformInfo &CalleeTTI, |
| 3129 | function_ref<const TargetLibraryInfo &(Function &)> GetTLI) { |
| 3130 | |
| 3131 | // Cannot inline indirect calls. |
| 3132 | if (!Callee) |
| 3133 | return InlineResult::failure(Reason: "indirect call" ); |
| 3134 | |
| 3135 | // When a callee coroutine function is inlined into a caller coroutine |
| 3136 | // function before the coro-split pass, the coro-early pass cannot handle |
| 3137 | // this quite well. So we won't inline a coroutine function that has not |
| 3138 | // been split yet. |
| 3139 | if (Callee->isPresplitCoroutine()) |
| 3140 | return InlineResult::failure(Reason: "unsplited coroutine call" ); |
| 3141 | |
| 3142 | // Never inline calls with byval arguments that do not have the alloca |
| 3143 | // address space. Since byval arguments can be replaced with a copy to an |
| 3144 | // alloca, the inlined code would need to be adjusted to handle the argument |
| 3145 | // being in the alloca address space, which is a little bit complicated to |
| 3146 | // solve. |
| 3147 | unsigned AllocaAS = Callee->getDataLayout().getAllocaAddrSpace(); |
| 3148 | for (unsigned I = 0, E = Call.arg_size(); I != E; ++I) |
| 3149 | if (Call.isByValArgument(ArgNo: I)) { |
| 3150 | PointerType *PTy = cast<PointerType>(Val: Call.getArgOperand(i: I)->getType()); |
| 3151 | if (PTy->getAddressSpace() != AllocaAS) |
| 3152 | return InlineResult::failure(Reason: "byval arguments without alloca" |
| 3153 | " address space" ); |
| 3154 | } |
| 3155 | |
| 3156 | // Calls to functions with always-inline attributes should be inlined |
| 3157 | // whenever possible. |
| 3158 | if (Call.hasFnAttr(Kind: Attribute::AlwaysInline)) { |
| 3159 | if (Call.getAttributes().hasFnAttr(Kind: Attribute::NoInline)) |
| 3160 | return InlineResult::failure(Reason: "noinline call site attribute" ); |
| 3161 | |
| 3162 | auto IsViable = isInlineViable(Callee&: *Callee); |
| 3163 | if (IsViable.isSuccess()) |
| 3164 | return InlineResult::success(); |
| 3165 | return InlineResult::failure(Reason: IsViable.getFailureReason()); |
| 3166 | } |
| 3167 | |
| 3168 | // Never inline functions with conflicting attributes (unless callee has |
| 3169 | // always-inline attribute). |
| 3170 | Function *Caller = Call.getCaller(); |
| 3171 | if (!functionsHaveCompatibleAttributes(Caller, Callee, TTI&: CalleeTTI, GetTLI)) |
| 3172 | return InlineResult::failure(Reason: "conflicting attributes" ); |
| 3173 | |
| 3174 | // Don't inline this call if the caller has the optnone attribute. |
| 3175 | if (Caller->hasOptNone()) |
| 3176 | return InlineResult::failure(Reason: "optnone attribute" ); |
| 3177 | |
| 3178 | // Don't inline a function that treats null pointer as valid into a caller |
| 3179 | // that does not have this attribute. |
| 3180 | if (!Caller->nullPointerIsDefined() && Callee->nullPointerIsDefined()) |
| 3181 | return InlineResult::failure(Reason: "nullptr definitions incompatible" ); |
| 3182 | |
| 3183 | // Don't inline functions which can be interposed at link-time. |
| 3184 | if (Callee->isInterposable()) |
| 3185 | return InlineResult::failure(Reason: "interposable" ); |
| 3186 | |
| 3187 | // Don't inline functions marked noinline. |
| 3188 | if (Callee->hasFnAttribute(Kind: Attribute::NoInline)) |
| 3189 | return InlineResult::failure(Reason: "noinline function attribute" ); |
| 3190 | |
| 3191 | // Don't inline call sites marked noinline. |
| 3192 | if (Call.isNoInline()) |
| 3193 | return InlineResult::failure(Reason: "noinline call site attribute" ); |
| 3194 | |
| 3195 | // Don't inline functions that are loader replaceable. |
| 3196 | if (Callee->hasFnAttribute(Kind: "loader-replaceable" )) |
| 3197 | return InlineResult::failure(Reason: "loader replaceable function attribute" ); |
| 3198 | |
| 3199 | return std::nullopt; |
| 3200 | } |
| 3201 | |
| 3202 | InlineCost llvm::getInlineCost( |
| 3203 | CallBase &Call, Function *Callee, const InlineParams &Params, |
| 3204 | TargetTransformInfo &CalleeTTI, |
| 3205 | function_ref<AssumptionCache &(Function &)> GetAssumptionCache, |
| 3206 | function_ref<const TargetLibraryInfo &(Function &)> GetTLI, |
| 3207 | function_ref<BlockFrequencyInfo &(Function &)> GetBFI, |
| 3208 | ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE, |
| 3209 | function_ref<EphemeralValuesCache &(Function &)> GetEphValuesCache) { |
| 3210 | |
| 3211 | auto UserDecision = |
| 3212 | llvm::getAttributeBasedInliningDecision(Call, Callee, CalleeTTI, GetTLI); |
| 3213 | |
| 3214 | if (UserDecision) { |
| 3215 | if (UserDecision->isSuccess()) |
| 3216 | return llvm::InlineCost::getAlways(Reason: "always inline attribute" ); |
| 3217 | return llvm::InlineCost::getNever(Reason: UserDecision->getFailureReason()); |
| 3218 | } |
| 3219 | |
| 3220 | LLVM_DEBUG(llvm::dbgs() << " Analyzing call of " << Callee->getName() |
| 3221 | << "... (caller:" << Call.getCaller()->getName() |
| 3222 | << ")\n" ); |
| 3223 | |
| 3224 | InlineCostCallAnalyzer CA(*Callee, Call, Params, CalleeTTI, |
| 3225 | GetAssumptionCache, GetBFI, GetTLI, PSI, ORE, |
| 3226 | /*BoostIndirect=*/true, /*IgnoreThreshold=*/false, |
| 3227 | GetEphValuesCache); |
| 3228 | InlineResult ShouldInline = CA.analyze(); |
| 3229 | |
| 3230 | LLVM_DEBUG(CA.dump()); |
| 3231 | |
| 3232 | // Always make the cost-benefit based decision explicit. |
| 3233 | // We use always/never here since the threshold is not meaningful, |
| 3234 | // as it's not what drives the cost-benefit analysis. |
| 3235 | if (CA.wasDecidedByCostBenefit()) { |
| 3236 | if (ShouldInline.isSuccess()) |
| 3237 | return InlineCost::getAlways(Reason: "benefit over cost" , |
| 3238 | CostBenefit: CA.getCostBenefitPair()); |
| 3239 | else |
| 3240 | return InlineCost::getNever(Reason: "cost over benefit" , CostBenefit: CA.getCostBenefitPair()); |
| 3241 | } |
| 3242 | |
| 3243 | if (CA.wasDecidedByCostThreshold()) |
| 3244 | return InlineCost::get(Cost: CA.getCost(), Threshold: CA.getThreshold(), |
| 3245 | StaticBonus: CA.getStaticBonusApplied()); |
| 3246 | |
| 3247 | // No details on how the decision was made, simply return always or never. |
| 3248 | return ShouldInline.isSuccess() |
| 3249 | ? InlineCost::getAlways(Reason: "empty function" ) |
| 3250 | : InlineCost::getNever(Reason: ShouldInline.getFailureReason()); |
| 3251 | } |
| 3252 | |
| 3253 | InlineResult llvm::isInlineViable(Function &F) { |
| 3254 | bool ReturnsTwice = F.hasFnAttribute(Kind: Attribute::ReturnsTwice); |
| 3255 | for (BasicBlock &BB : F) { |
| 3256 | // Disallow inlining of functions which contain indirect branches. |
| 3257 | if (isa<IndirectBrInst>(Val: BB.getTerminator())) |
| 3258 | return InlineResult::failure(Reason: "contains indirect branches" ); |
| 3259 | |
| 3260 | // Disallow inlining of blockaddresses which are used by non-callbr |
| 3261 | // instructions. |
| 3262 | if (BB.hasAddressTaken()) |
| 3263 | for (User *U : BlockAddress::get(BB: &BB)->users()) |
| 3264 | if (!isa<CallBrInst>(Val: *U)) |
| 3265 | return InlineResult::failure(Reason: "blockaddress used outside of callbr" ); |
| 3266 | |
| 3267 | for (auto &II : BB) { |
| 3268 | CallBase *Call = dyn_cast<CallBase>(Val: &II); |
| 3269 | if (!Call) |
| 3270 | continue; |
| 3271 | |
| 3272 | // Disallow recursive calls. |
| 3273 | Function *Callee = Call->getCalledFunction(); |
| 3274 | if (&F == Callee) |
| 3275 | return InlineResult::failure(Reason: "recursive call" ); |
| 3276 | |
| 3277 | // Disallow calls which expose returns-twice to a function not previously |
| 3278 | // attributed as such. |
| 3279 | if (!ReturnsTwice && isa<CallInst>(Val: Call) && |
| 3280 | cast<CallInst>(Val: Call)->canReturnTwice()) |
| 3281 | return InlineResult::failure(Reason: "exposes returns-twice attribute" ); |
| 3282 | |
| 3283 | if (Callee) |
| 3284 | switch (Callee->getIntrinsicID()) { |
| 3285 | default: |
| 3286 | break; |
| 3287 | case llvm::Intrinsic::icall_branch_funnel: |
| 3288 | // Disallow inlining of @llvm.icall.branch.funnel because current |
| 3289 | // backend can't separate call targets from call arguments. |
| 3290 | return InlineResult::failure( |
| 3291 | Reason: "disallowed inlining of @llvm.icall.branch.funnel" ); |
| 3292 | case llvm::Intrinsic::localescape: |
| 3293 | // Disallow inlining functions that call @llvm.localescape. Doing this |
| 3294 | // correctly would require major changes to the inliner. |
| 3295 | return InlineResult::failure( |
| 3296 | Reason: "disallowed inlining of @llvm.localescape" ); |
| 3297 | case llvm::Intrinsic::vastart: |
| 3298 | // Disallow inlining of functions that initialize VarArgs with |
| 3299 | // va_start. |
| 3300 | return InlineResult::failure( |
| 3301 | Reason: "contains VarArgs initialized with va_start" ); |
| 3302 | } |
| 3303 | } |
| 3304 | } |
| 3305 | |
| 3306 | return InlineResult::success(); |
| 3307 | } |
| 3308 | |
| 3309 | // APIs to create InlineParams based on command line flags and/or other |
| 3310 | // parameters. |
| 3311 | |
| 3312 | InlineParams llvm::getInlineParams(int Threshold) { |
| 3313 | InlineParams Params; |
| 3314 | |
| 3315 | // This field is the threshold to use for a callee by default. This is |
| 3316 | // derived from one or more of: |
| 3317 | // * optimization or size-optimization levels, |
| 3318 | // * a value passed to createFunctionInliningPass function, or |
| 3319 | // * the -inline-threshold flag. |
| 3320 | // If the -inline-threshold flag is explicitly specified, that is used |
| 3321 | // irrespective of anything else. |
| 3322 | if (InlineThreshold.getNumOccurrences() > 0) |
| 3323 | Params.DefaultThreshold = InlineThreshold; |
| 3324 | else |
| 3325 | Params.DefaultThreshold = Threshold; |
| 3326 | |
| 3327 | // Set the HintThreshold knob from the -inlinehint-threshold. |
| 3328 | Params.HintThreshold = HintThreshold; |
| 3329 | |
| 3330 | // Set the HotCallSiteThreshold knob from the -hot-callsite-threshold. |
| 3331 | Params.HotCallSiteThreshold = HotCallSiteThreshold; |
| 3332 | |
| 3333 | // If the -locally-hot-callsite-threshold is explicitly specified, use it to |
| 3334 | // populate LocallyHotCallSiteThreshold. Later, we populate |
| 3335 | // Params.LocallyHotCallSiteThreshold from -locally-hot-callsite-threshold if |
| 3336 | // we know that optimization level is O3 (in the getInlineParams variant that |
| 3337 | // takes the opt and size levels). |
| 3338 | // FIXME: Remove this check (and make the assignment unconditional) after |
| 3339 | // addressing size regression issues at O2. |
| 3340 | if (LocallyHotCallSiteThreshold.getNumOccurrences() > 0) |
| 3341 | Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold; |
| 3342 | |
| 3343 | // Set the ColdCallSiteThreshold knob from the |
| 3344 | // -inline-cold-callsite-threshold. |
| 3345 | Params.ColdCallSiteThreshold = ColdCallSiteThreshold; |
| 3346 | |
| 3347 | // Set the OptMinSizeThreshold and OptSizeThreshold params only if the |
| 3348 | // -inline-threshold command-line option is not explicitly given. If that |
| 3349 | // option is present, then its value applies even for callees with size and |
| 3350 | // minsize attributes. |
| 3351 | // If the -inline-threshold is not specified, set the ColdThreshold from the |
| 3352 | // -inlinecold-threshold even if it is not explicitly passed. If |
| 3353 | // -inline-threshold is specified, then -inlinecold-threshold needs to be |
| 3354 | // explicitly specified to set the ColdThreshold knob. |
| 3355 | if (InlineThreshold.getNumOccurrences() == 0) { |
| 3356 | Params.OptMinSizeThreshold = InlineConstants::OptMinSizeThreshold; |
| 3357 | Params.OptSizeThreshold = InlineConstants::OptSizeThreshold; |
| 3358 | Params.ColdThreshold = ColdThreshold; |
| 3359 | } else if (ColdThreshold.getNumOccurrences() > 0) { |
| 3360 | Params.ColdThreshold = ColdThreshold; |
| 3361 | } |
| 3362 | return Params; |
| 3363 | } |
| 3364 | |
| 3365 | InlineParams llvm::getInlineParams() { |
| 3366 | return getInlineParams(Threshold: DefaultThreshold); |
| 3367 | } |
| 3368 | |
| 3369 | // Compute the default threshold for inlining based on the opt level and the |
| 3370 | // size opt level. |
| 3371 | static int computeThresholdFromOptLevels(unsigned OptLevel, |
| 3372 | unsigned SizeOptLevel) { |
| 3373 | if (OptLevel > 2) |
| 3374 | return InlineConstants::OptAggressiveThreshold; |
| 3375 | if (SizeOptLevel == 1) // -Os |
| 3376 | return InlineConstants::OptSizeThreshold; |
| 3377 | if (SizeOptLevel == 2) // -Oz |
| 3378 | return InlineConstants::OptMinSizeThreshold; |
| 3379 | return DefaultThreshold; |
| 3380 | } |
| 3381 | |
| 3382 | InlineParams llvm::getInlineParams(unsigned OptLevel, unsigned SizeOptLevel) { |
| 3383 | auto Params = |
| 3384 | getInlineParams(Threshold: computeThresholdFromOptLevels(OptLevel, SizeOptLevel)); |
| 3385 | // At O3, use the value of -locally-hot-callsite-threshold option to populate |
| 3386 | // Params.LocallyHotCallSiteThreshold. Below O3, this flag has effect only |
| 3387 | // when it is specified explicitly. |
| 3388 | if (OptLevel > 2) |
| 3389 | Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold; |
| 3390 | return Params; |
| 3391 | } |
| 3392 | |
| 3393 | PreservedAnalyses |
| 3394 | InlineCostAnnotationPrinterPass::run(Function &F, |
| 3395 | FunctionAnalysisManager &FAM) { |
| 3396 | PrintInstructionComments = true; |
| 3397 | std::function<AssumptionCache &(Function &)> GetAssumptionCache = |
| 3398 | [&](Function &F) -> AssumptionCache & { |
| 3399 | return FAM.getResult<AssumptionAnalysis>(IR&: F); |
| 3400 | }; |
| 3401 | |
| 3402 | auto &MAMProxy = FAM.getResult<ModuleAnalysisManagerFunctionProxy>(IR&: F); |
| 3403 | ProfileSummaryInfo *PSI = |
| 3404 | MAMProxy.getCachedResult<ProfileSummaryAnalysis>(IR&: *F.getParent()); |
| 3405 | const TargetTransformInfo &TTI = FAM.getResult<TargetIRAnalysis>(IR&: F); |
| 3406 | |
| 3407 | // FIXME: Redesign the usage of InlineParams to expand the scope of this pass. |
| 3408 | // In the current implementation, the type of InlineParams doesn't matter as |
| 3409 | // the pass serves only for verification of the inliner's decisions. |
| 3410 | // We can add a flag which determines InlineParams for this run. Right now, |
| 3411 | // the default InlineParams are used. |
| 3412 | const InlineParams Params = llvm::getInlineParams(); |
| 3413 | for (BasicBlock &BB : F) { |
| 3414 | for (Instruction &I : BB) { |
| 3415 | if (auto *CB = dyn_cast<CallBase>(Val: &I)) { |
| 3416 | Function *CalledFunction = CB->getCalledFunction(); |
| 3417 | if (!CalledFunction || CalledFunction->isDeclaration()) |
| 3418 | continue; |
| 3419 | OptimizationRemarkEmitter ORE(CalledFunction); |
| 3420 | InlineCostCallAnalyzer ICCA(*CalledFunction, *CB, Params, TTI, |
| 3421 | GetAssumptionCache, nullptr, nullptr, PSI, |
| 3422 | &ORE); |
| 3423 | ICCA.analyze(); |
| 3424 | OS << " Analyzing call of " << CalledFunction->getName() |
| 3425 | << "... (caller:" << CB->getCaller()->getName() << ")\n" ; |
| 3426 | ICCA.print(OS); |
| 3427 | OS << "\n" ; |
| 3428 | } |
| 3429 | } |
| 3430 | } |
| 3431 | return PreservedAnalyses::all(); |
| 3432 | } |
| 3433 | |