| 1 | //===- InlineCost.cpp - Cost analysis for inliner -------------------------===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | // |
| 9 | // This file implements inline cost analysis. |
| 10 | // |
| 11 | //===----------------------------------------------------------------------===// |
| 12 | |
| 13 | #include "llvm/Analysis/InlineCost.h" |
| 14 | #include "llvm/ADT/STLExtras.h" |
| 15 | #include "llvm/ADT/SetVector.h" |
| 16 | #include "llvm/ADT/SmallPtrSet.h" |
| 17 | #include "llvm/ADT/SmallVector.h" |
| 18 | #include "llvm/ADT/Statistic.h" |
| 19 | #include "llvm/Analysis/AssumptionCache.h" |
| 20 | #include "llvm/Analysis/BlockFrequencyInfo.h" |
| 21 | #include "llvm/Analysis/CodeMetrics.h" |
| 22 | #include "llvm/Analysis/ConstantFolding.h" |
| 23 | #include "llvm/Analysis/DomConditionCache.h" |
| 24 | #include "llvm/Analysis/EphemeralValuesCache.h" |
| 25 | #include "llvm/Analysis/InstructionSimplify.h" |
| 26 | #include "llvm/Analysis/LoopInfo.h" |
| 27 | #include "llvm/Analysis/MemoryBuiltins.h" |
| 28 | #include "llvm/Analysis/OptimizationRemarkEmitter.h" |
| 29 | #include "llvm/Analysis/ProfileSummaryInfo.h" |
| 30 | #include "llvm/Analysis/TargetLibraryInfo.h" |
| 31 | #include "llvm/Analysis/TargetTransformInfo.h" |
| 32 | #include "llvm/Analysis/ValueTracking.h" |
| 33 | #include "llvm/Config/llvm-config.h" |
| 34 | #include "llvm/IR/AssemblyAnnotationWriter.h" |
| 35 | #include "llvm/IR/CallingConv.h" |
| 36 | #include "llvm/IR/DataLayout.h" |
| 37 | #include "llvm/IR/Dominators.h" |
| 38 | #include "llvm/IR/GetElementPtrTypeIterator.h" |
| 39 | #include "llvm/IR/GlobalAlias.h" |
| 40 | #include "llvm/IR/InlineAsm.h" |
| 41 | #include "llvm/IR/InstVisitor.h" |
| 42 | #include "llvm/IR/IntrinsicInst.h" |
| 43 | #include "llvm/IR/Operator.h" |
| 44 | #include "llvm/IR/PatternMatch.h" |
| 45 | #include "llvm/Support/CommandLine.h" |
| 46 | #include "llvm/Support/Debug.h" |
| 47 | #include "llvm/Support/FormattedStream.h" |
| 48 | #include "llvm/Support/raw_ostream.h" |
| 49 | #include <climits> |
| 50 | #include <limits> |
| 51 | #include <optional> |
| 52 | |
| 53 | using namespace llvm; |
| 54 | |
| 55 | #define DEBUG_TYPE "inline-cost" |
| 56 | |
| 57 | STATISTIC(NumCallsAnalyzed, "Number of call sites analyzed" ); |
| 58 | |
| 59 | static cl::opt<int> |
| 60 | DefaultThreshold("inlinedefault-threshold" , cl::Hidden, cl::init(Val: 225), |
| 61 | cl::desc("Default amount of inlining to perform" )); |
| 62 | |
| 63 | // We introduce this option since there is a minor compile-time win by avoiding |
| 64 | // addition of TTI attributes (target-features in particular) to inline |
| 65 | // candidates when they are guaranteed to be the same as top level methods in |
| 66 | // some use cases. If we avoid adding the attribute, we need an option to avoid |
| 67 | // checking these attributes. |
| 68 | static cl::opt<bool> IgnoreTTIInlineCompatible( |
| 69 | "ignore-tti-inline-compatible" , cl::Hidden, cl::init(Val: false), |
| 70 | cl::desc("Ignore TTI attributes compatibility check between callee/caller " |
| 71 | "during inline cost calculation" )); |
| 72 | |
| 73 | static cl::opt<bool> PrintInstructionComments( |
| 74 | "print-instruction-comments" , cl::Hidden, cl::init(Val: false), |
| 75 | cl::desc("Prints comments for instruction based on inline cost analysis" )); |
| 76 | |
| 77 | static cl::opt<int> InlineThreshold( |
| 78 | "inline-threshold" , cl::Hidden, cl::init(Val: 225), |
| 79 | cl::desc("Control the amount of inlining to perform (default = 225)" )); |
| 80 | |
| 81 | static cl::opt<int> HintThreshold( |
| 82 | "inlinehint-threshold" , cl::Hidden, cl::init(Val: 325), |
| 83 | cl::desc("Threshold for inlining functions with inline hint" )); |
| 84 | |
| 85 | static cl::opt<int> |
| 86 | ColdCallSiteThreshold("inline-cold-callsite-threshold" , cl::Hidden, |
| 87 | cl::init(Val: 45), |
| 88 | cl::desc("Threshold for inlining cold callsites" )); |
| 89 | |
| 90 | static cl::opt<bool> InlineEnableCostBenefitAnalysis( |
| 91 | "inline-enable-cost-benefit-analysis" , cl::Hidden, cl::init(Val: false), |
| 92 | cl::desc("Enable the cost-benefit analysis for the inliner" )); |
| 93 | |
| 94 | // InlineSavingsMultiplier overrides per TTI multipliers iff it is |
| 95 | // specified explicitly in command line options. This option is exposed |
| 96 | // for tuning and testing. |
| 97 | static cl::opt<int> InlineSavingsMultiplier( |
| 98 | "inline-savings-multiplier" , cl::Hidden, cl::init(Val: 8), |
| 99 | cl::desc("Multiplier to multiply cycle savings by during inlining" )); |
| 100 | |
| 101 | // InlineSavingsProfitableMultiplier overrides per TTI multipliers iff it is |
| 102 | // specified explicitly in command line options. This option is exposed |
| 103 | // for tuning and testing. |
| 104 | static cl::opt<int> InlineSavingsProfitableMultiplier( |
| 105 | "inline-savings-profitable-multiplier" , cl::Hidden, cl::init(Val: 4), |
| 106 | cl::desc("A multiplier on top of cycle savings to decide whether the " |
| 107 | "savings won't justify the cost" )); |
| 108 | |
| 109 | static cl::opt<int> |
| 110 | InlineSizeAllowance("inline-size-allowance" , cl::Hidden, cl::init(Val: 100), |
| 111 | cl::desc("The maximum size of a callee that get's " |
| 112 | "inlined without sufficient cycle savings" )); |
| 113 | |
| 114 | // We introduce this threshold to help performance of instrumentation based |
| 115 | // PGO before we actually hook up inliner with analysis passes such as BPI and |
| 116 | // BFI. |
| 117 | static cl::opt<int> ColdThreshold( |
| 118 | "inlinecold-threshold" , cl::Hidden, cl::init(Val: 45), |
| 119 | cl::desc("Threshold for inlining functions with cold attribute" )); |
| 120 | |
| 121 | static cl::opt<int> |
| 122 | HotCallSiteThreshold("hot-callsite-threshold" , cl::Hidden, cl::init(Val: 3000), |
| 123 | cl::desc("Threshold for hot callsites " )); |
| 124 | |
| 125 | static cl::opt<int> LocallyHotCallSiteThreshold( |
| 126 | "locally-hot-callsite-threshold" , cl::Hidden, cl::init(Val: 525), |
| 127 | cl::desc("Threshold for locally hot callsites " )); |
| 128 | |
| 129 | static cl::opt<int> ColdCallSiteRelFreq( |
| 130 | "cold-callsite-rel-freq" , cl::Hidden, cl::init(Val: 2), |
| 131 | cl::desc("Maximum block frequency, expressed as a percentage of caller's " |
| 132 | "entry frequency, for a callsite to be cold in the absence of " |
| 133 | "profile information." )); |
| 134 | |
| 135 | static cl::opt<uint64_t> HotCallSiteRelFreq( |
| 136 | "hot-callsite-rel-freq" , cl::Hidden, cl::init(Val: 60), |
| 137 | cl::desc("Minimum block frequency, expressed as a multiple of caller's " |
| 138 | "entry frequency, for a callsite to be hot in the absence of " |
| 139 | "profile information." )); |
| 140 | |
| 141 | static cl::opt<int> |
| 142 | InstrCost("inline-instr-cost" , cl::Hidden, cl::init(Val: 5), |
| 143 | cl::desc("Cost of a single instruction when inlining" )); |
| 144 | |
| 145 | static cl::opt<int> InlineAsmInstrCost( |
| 146 | "inline-asm-instr-cost" , cl::Hidden, cl::init(Val: 0), |
| 147 | cl::desc("Cost of a single inline asm instruction when inlining" )); |
| 148 | |
| 149 | static cl::opt<int> |
| 150 | MemAccessCost("inline-memaccess-cost" , cl::Hidden, cl::init(Val: 0), |
| 151 | cl::desc("Cost of load/store instruction when inlining" )); |
| 152 | |
| 153 | static cl::opt<int> CallPenalty( |
| 154 | "inline-call-penalty" , cl::Hidden, cl::init(Val: 25), |
| 155 | cl::desc("Call penalty that is applied per callsite when inlining" )); |
| 156 | |
| 157 | static cl::opt<size_t> |
| 158 | StackSizeThreshold("inline-max-stacksize" , cl::Hidden, |
| 159 | cl::init(Val: std::numeric_limits<size_t>::max()), |
| 160 | cl::desc("Do not inline functions with a stack size " |
| 161 | "that exceeds the specified limit" )); |
| 162 | |
| 163 | static cl::opt<size_t> RecurStackSizeThreshold( |
| 164 | "recursive-inline-max-stacksize" , cl::Hidden, |
| 165 | cl::init(Val: InlineConstants::TotalAllocaSizeRecursiveCaller), |
| 166 | cl::desc("Do not inline recursive functions with a stack " |
| 167 | "size that exceeds the specified limit" )); |
| 168 | |
| 169 | static cl::opt<bool> OptComputeFullInlineCost( |
| 170 | "inline-cost-full" , cl::Hidden, |
| 171 | cl::desc("Compute the full inline cost of a call site even when the cost " |
| 172 | "exceeds the threshold." )); |
| 173 | |
| 174 | static cl::opt<bool> InlineCallerSupersetNoBuiltin( |
| 175 | "inline-caller-superset-nobuiltin" , cl::Hidden, cl::init(Val: true), |
| 176 | cl::desc("Allow inlining when caller has a superset of callee's nobuiltin " |
| 177 | "attributes." )); |
| 178 | |
| 179 | static cl::opt<bool> DisableGEPConstOperand( |
| 180 | "disable-gep-const-evaluation" , cl::Hidden, cl::init(Val: false), |
| 181 | cl::desc("Disables evaluation of GetElementPtr with constant operands" )); |
| 182 | |
| 183 | static cl::opt<bool> InlineAllViableCalls( |
| 184 | "inline-all-viable-calls" , cl::Hidden, cl::init(Val: false), |
| 185 | cl::desc("Inline all viable calls, even if they exceed the inlining " |
| 186 | "threshold" )); |
| 187 | namespace llvm { |
| 188 | std::optional<int> getStringFnAttrAsInt(const Attribute &Attr) { |
| 189 | if (Attr.isValid()) { |
| 190 | int AttrValue = 0; |
| 191 | if (!Attr.getValueAsString().getAsInteger(Radix: 10, Result&: AttrValue)) |
| 192 | return AttrValue; |
| 193 | } |
| 194 | return std::nullopt; |
| 195 | } |
| 196 | |
| 197 | std::optional<int> getStringFnAttrAsInt(CallBase &CB, StringRef AttrKind) { |
| 198 | return getStringFnAttrAsInt(Attr: CB.getFnAttr(Kind: AttrKind)); |
| 199 | } |
| 200 | |
| 201 | std::optional<int> getStringFnAttrAsInt(Function *F, StringRef AttrKind) { |
| 202 | return getStringFnAttrAsInt(Attr: F->getFnAttribute(Kind: AttrKind)); |
| 203 | } |
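| | // Illustrative example (assumed IR, not taken from this file): these helpers |
| | // parse integer-valued string attributes that tests and tooling can attach |
| | // to a call site or function, e.g. in textual IR: |
| | //   call void @callee() "call-inline-cost"="100" "call-threshold-bonus"="50" |
| | // getStringFnAttrAsInt would then return 100 and 50 for the respective keys, |
| | // and std::nullopt when the attribute is absent or not a base-10 integer. |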
| 204 | |
| 205 | namespace InlineConstants { |
| 206 | int getInstrCost() { return InstrCost; } |
| 207 | |
| 208 | } // namespace InlineConstants |
| 209 | |
| 210 | } // namespace llvm |
| 211 | |
| 212 | namespace { |
| 213 | class InlineCostCallAnalyzer; |
| 214 | |
| 215 | // This struct is used to store information about the inline cost of a |
| 216 | // particular instruction. |
| 217 | struct InstructionCostDetail { |
| 218 | int CostBefore = 0; |
| 219 | int CostAfter = 0; |
| 220 | int ThresholdBefore = 0; |
| 221 | int ThresholdAfter = 0; |
| 222 | |
| 223 | int getThresholdDelta() const { return ThresholdAfter - ThresholdBefore; } |
| 224 | |
| 225 | int getCostDelta() const { return CostAfter - CostBefore; } |
| 226 | |
| 227 | bool hasThresholdChanged() const { return ThresholdAfter != ThresholdBefore; } |
| 228 | }; |
| 229 | |
| 230 | class InlineCostAnnotationWriter : public AssemblyAnnotationWriter { |
| 231 | private: |
| 232 | InlineCostCallAnalyzer *const ICCA; |
| 233 | |
| 234 | public: |
| 235 | InlineCostAnnotationWriter(InlineCostCallAnalyzer *ICCA) : ICCA(ICCA) {} |
| 236 | void emitInstructionAnnot(const Instruction *I, |
| 237 | formatted_raw_ostream &OS) override; |
| 238 | }; |
| 239 | |
| 240 | /// Carry out call site analysis, in order to evaluate inlinability. |
| 241 | /// NOTE: the type is currently used as implementation detail of functions such |
| 242 | /// as llvm::getInlineCost. Note the function_ref constructor parameters - the |
| 243 | /// expectation is that they come from the outer scope, from the wrapper |
| 244 | /// functions. If we want to support constructing CallAnalyzer objects where |
| 245 | /// lambdas are provided inline at construction, or where the object needs to |
| 246 | /// otherwise survive past the scope of the provided functions, we need to |
| 247 | /// revisit the argument types. |
| 248 | class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> { |
| 249 | typedef InstVisitor<CallAnalyzer, bool> Base; |
| 250 | friend class InstVisitor<CallAnalyzer, bool>; |
| 251 | |
| 252 | protected: |
| 253 | virtual ~CallAnalyzer() = default; |
| 254 | /// The TargetTransformInfo available for this compilation. |
| 255 | const TargetTransformInfo &TTI; |
| 256 | |
| 257 | /// Getter for the cache of @llvm.assume intrinsics. |
| 258 | function_ref<AssumptionCache &(Function &)> GetAssumptionCache; |
| 259 | |
| 260 | /// Getter for BlockFrequencyInfo |
| 261 | function_ref<BlockFrequencyInfo &(Function &)> GetBFI; |
| 262 | |
| 263 | /// Getter for TargetLibraryInfo |
| 264 | function_ref<const TargetLibraryInfo &(Function &)> GetTLI; |
| 265 | |
| 266 | /// Profile summary information. |
| 267 | ProfileSummaryInfo *PSI; |
| 268 | |
| 269 | /// The called function. |
| 270 | Function &F; |
| 271 | |
| 272 | // Cache the DataLayout since we use it a lot. |
| 273 | const DataLayout &DL; |
| 274 | |
| 275 | /// The OptimizationRemarkEmitter available for this compilation. |
| 276 | OptimizationRemarkEmitter *ORE; |
| 277 | |
| 278 | /// The candidate callsite being analyzed. Please do not use this to do |
| 279 | /// analysis in the caller function; we want the inline cost query to be |
| 280 | /// easily cacheable. Instead, use the cover function paramHasAttr. |
| 281 | CallBase &CandidateCall; |
| 282 | |
| 283 | /// Getter for the cache of ephemeral values. |
| 284 | function_ref<EphemeralValuesCache &(Function &)> GetEphValuesCache = nullptr; |
| 285 | |
| 286 | /// Extension points for handling callsite features. |
| 287 | /// Called before a basic block was analyzed. |
| 288 | virtual void onBlockStart(const BasicBlock *BB) {} |
| 289 | |
| 290 | /// Called after a basic block was analyzed. |
| 291 | virtual void onBlockAnalyzed(const BasicBlock *BB) {} |
| 292 | |
| 293 | /// Called before an instruction was analyzed |
| 294 | virtual void onInstructionAnalysisStart(const Instruction *I) {} |
| 295 | |
| 296 | /// Called after an instruction was analyzed |
| 297 | virtual void onInstructionAnalysisFinish(const Instruction *I) {} |
| 298 | |
| 299 | /// Called at the end of the analysis of the callsite. Return the outcome of |
| 300 | /// the analysis, i.e. 'InlineResult(true)' if the inlining may happen, or |
| 301 | /// the reason it can't. |
| 302 | virtual InlineResult finalizeAnalysis() { return InlineResult::success(); } |
| 303 | /// Called when we're about to start processing a basic block, and every time |
| 304 | /// we are done processing an instruction. Return true if there is no point in |
| 305 | /// continuing the analysis (e.g. we've already determined the call site is |
| 306 | /// too expensive to inline). |
| 307 | virtual bool shouldStop() { return false; } |
| 308 | |
| 309 | /// Called before the analysis of the callee body starts (with callsite |
| 310 | /// contexts propagated). It checks callsite-specific information. Return a |
| 311 | /// reason analysis can't continue if that's the case, or 'true' if it may |
| 312 | /// continue. |
| 313 | virtual InlineResult onAnalysisStart() { return InlineResult::success(); } |
| 314 | /// Called if the analysis engine decides SROA cannot be done for the given |
| 315 | /// alloca. |
| 316 | virtual void onDisableSROA(AllocaInst *Arg) {} |
| 317 | |
| 318 | /// Called when the analysis engine determines load elimination won't happen. |
| 319 | virtual void onDisableLoadElimination() {} |
| 320 | |
| 321 | /// Called when we visit a CallBase, before the analysis starts. Return false |
| 322 | /// to stop further processing of the instruction. |
| 323 | virtual bool onCallBaseVisitStart(CallBase &Call) { return true; } |
| 324 | |
| 325 | /// Called to account for a call. |
| 326 | virtual void onCallPenalty() {} |
| 327 | |
| 328 | /// Called to account for a load or store. |
| 329 | virtual void onMemAccess(){}; |
| 330 | |
| 331 | /// Called to account for the expectation the inlining would result in a load |
| 332 | /// elimination. |
| 333 | virtual void onLoadEliminationOpportunity() {} |
| 334 | |
| 335 | /// Called to account for the cost of argument setup for the Call in the |
| 336 | /// callee's body (not the callsite currently under analysis). |
| 337 | virtual void onCallArgumentSetup(const CallBase &Call) {} |
| 338 | |
| 339 | /// Called to account for a load relative intrinsic. |
| 340 | virtual void onLoadRelativeIntrinsic() {} |
| 341 | |
| 342 | /// Called to account for a lowered call. |
| 343 | virtual void onLoweredCall(Function *F, CallBase &Call, bool IsIndirectCall) { |
| 344 | } |
| 345 | |
| 346 | /// Account for a jump table of the given size. Return false to stop further |
| 347 | /// processing of the switch instruction. |
| 348 | virtual bool onJumpTable(unsigned JumpTableSize) { return true; } |
| 349 | |
| 350 | /// Account for a case cluster of given size. Return false to stop further |
| 351 | /// processing of the instruction. |
| 352 | virtual bool onCaseCluster(unsigned NumCaseCluster) { return true; } |
| 353 | |
| 354 | /// Called at the end of processing a switch instruction, with the given |
| 355 | /// number of case clusters. |
| 356 | virtual void onFinalizeSwitch(unsigned JumpTableSize, unsigned NumCaseCluster, |
| 357 | bool DefaultDestUnreachable) {} |
| 358 | |
| 359 | /// Called to account for any other instruction not specifically accounted |
| 360 | /// for. |
| 361 | virtual void onMissedSimplification() {} |
| 362 | |
| 363 | /// Account for inline assembly instructions. |
| 364 | virtual void onInlineAsm(const InlineAsm &Arg) {} |
| 365 | |
| 366 | /// Start accounting potential benefits due to SROA for the given alloca. |
| 367 | virtual void onInitializeSROAArg(AllocaInst *Arg) {} |
| 368 | |
| 369 | /// Account SROA savings for the AllocaInst value. |
| 370 | virtual void onAggregateSROAUse(AllocaInst *V) {} |
| 371 | |
| 372 | bool handleSROA(Value *V, bool DoNotDisable) { |
| 373 | // Check for SROA candidates in comparisons. |
| 374 | if (auto *SROAArg = getSROAArgForValueOrNull(V)) { |
| 375 | if (DoNotDisable) { |
| 376 | onAggregateSROAUse(V: SROAArg); |
| 377 | return true; |
| 378 | } |
| 379 | disableSROAForArg(SROAArg); |
| 380 | } |
| 381 | return false; |
| 382 | } |
| 383 | |
| 384 | bool IsCallerRecursive = false; |
| 385 | bool IsRecursiveCall = false; |
| 386 | bool ExposesReturnsTwice = false; |
| 387 | bool HasDynamicAlloca = false; |
| 388 | bool ContainsNoDuplicateCall = false; |
| 389 | bool HasReturn = false; |
| 390 | bool HasIndirectBr = false; |
| 391 | bool HasUninlineableIntrinsic = false; |
| 392 | bool InitsVargArgs = false; |
| 393 | |
| 394 | /// Number of bytes allocated statically by the callee. |
| 395 | uint64_t AllocatedSize = 0; |
| 396 | unsigned NumInstructions = 0; |
| 397 | unsigned NumInlineAsmInstructions = 0; |
| 398 | unsigned NumVectorInstructions = 0; |
| 399 | |
| 400 | /// While we walk the potentially-inlined instructions, we build up and |
| 401 | /// maintain a mapping of simplified values specific to this callsite. The |
| 402 | /// idea is to propagate any special information we have about arguments to |
| 403 | /// this call through the inlinable section of the function, and account for |
| 404 | /// likely simplifications post-inlining. The most important aspect we track |
| 405 | /// is CFG altering simplifications -- when we prove a basic block dead, that |
| 406 | /// can cause dramatic shifts in the cost of inlining a function. |
| 407 | /// Note: The simplified Value may be owned by the caller function. |
| 408 | DenseMap<Value *, Value *> SimplifiedValues; |
| 409 | |
| 410 | /// Keep track of the values which map back (through function arguments) to |
| 411 | /// allocas on the caller stack which could be simplified through SROA. |
| 412 | DenseMap<Value *, AllocaInst *> SROAArgValues; |
| 413 | |
| 414 | /// Keep track of Allocas for which we believe we may get SROA optimization. |
| 415 | DenseSet<AllocaInst *> EnabledSROAAllocas; |
| 416 | |
| 417 | /// Keep track of values which map to a pointer base and constant offset. |
| 418 | DenseMap<Value *, std::pair<Value *, APInt>> ConstantOffsetPtrs; |
| 419 | |
| 420 | /// Keep track of dead blocks due to the constant arguments. |
| 421 | SmallPtrSet<BasicBlock *, 16> DeadBlocks; |
| 422 | |
| 423 | /// The mapping of the blocks to their known unique successors due to the |
| 424 | /// constant arguments. |
| 425 | DenseMap<BasicBlock *, BasicBlock *> KnownSuccessors; |
| 426 | |
| 427 | /// Model the elimination of repeated loads that is expected to happen |
| 428 | /// whenever we simplify away the stores that would otherwise cause them to be |
| 429 | /// loads. |
| 430 | bool EnableLoadElimination = true; |
| 431 | |
| 432 | /// Whether we allow inlining of recursive calls. |
| 433 | bool AllowRecursiveCall = false; |
| 434 | |
| 435 | SmallPtrSet<Value *, 16> LoadAddrSet; |
| 436 | |
| 437 | AllocaInst *getSROAArgForValueOrNull(Value *V) const { |
| 438 | auto It = SROAArgValues.find(Val: V); |
| 439 | if (It == SROAArgValues.end() || EnabledSROAAllocas.count(V: It->second) == 0) |
| 440 | return nullptr; |
| 441 | return It->second; |
| 442 | } |
| 443 | |
| 444 | /// Use a value in its given form directly if possible, otherwise try looking |
| 445 | /// for it in SimplifiedValues. |
| 446 | template <typename T> T *getDirectOrSimplifiedValue(Value *V) const { |
| 447 | if (auto *Direct = dyn_cast<T>(V)) |
| 448 | return Direct; |
| 449 | return getSimplifiedValue<T>(V); |
| 450 | } |
| 451 | |
| 452 | // Custom simplification helper routines. |
| 453 | bool isAllocaDerivedArg(Value *V); |
| 454 | void disableSROAForArg(AllocaInst *SROAArg); |
| 455 | void disableSROA(Value *V); |
| 456 | void findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB); |
| 457 | void disableLoadElimination(); |
| 458 | bool isGEPFree(GetElementPtrInst &GEP); |
| 459 | bool canFoldInboundsGEP(GetElementPtrInst &I); |
| 460 | bool accumulateGEPOffset(GEPOperator &GEP, APInt &Offset); |
| 461 | bool simplifyCallSite(Function *F, CallBase &Call); |
| 462 | bool simplifyCmpInstForRecCall(CmpInst &Cmp); |
| 463 | bool simplifyInstruction(Instruction &I); |
| 464 | bool simplifyIntrinsicCallIsConstant(CallBase &CB); |
| 465 | bool simplifyIntrinsicCallObjectSize(CallBase &CB); |
| 466 | ConstantInt *stripAndComputeInBoundsConstantOffsets(Value *&V); |
| 467 | bool isLoweredToCall(Function *F, CallBase &Call); |
| 468 | |
| 469 | /// Return true if the given argument to the function being considered for |
| 470 | /// inlining has the given attribute set either at the call site or the |
| 471 | /// function declaration. Primarily used to inspect call site specific |
| 472 | /// attributes since these can be more precise than the ones on the callee |
| 473 | /// itself. |
| 474 | bool paramHasAttr(Argument *A, Attribute::AttrKind Attr); |
| 475 | |
| 476 | /// Return true if the given value is known non null within the callee if |
| 477 | /// inlined through this particular callsite. |
| 478 | bool isKnownNonNullInCallee(Value *V); |
| 479 | |
| 480 | /// Return true if size growth is allowed when inlining the callee at \p Call. |
| 481 | bool allowSizeGrowth(CallBase &Call); |
| 482 | |
| 483 | // Custom analysis routines. |
| 484 | InlineResult analyzeBlock(BasicBlock *BB, |
| 485 | const SmallPtrSetImpl<const Value *> &EphValues); |
| 486 | |
| 487 | // Disable several entry points to the visitor so we don't accidentally use |
| 488 | // them by declaring but not defining them here. |
| 489 | void visit(Module *); |
| 490 | void visit(Module &); |
| 491 | void visit(Function *); |
| 492 | void visit(Function &); |
| 493 | void visit(BasicBlock *); |
| 494 | void visit(BasicBlock &); |
| 495 | |
| 496 | // Provide base case for our instruction visit. |
| 497 | bool visitInstruction(Instruction &I); |
| 498 | |
| 499 | // Our visit overrides. |
| 500 | bool visitAlloca(AllocaInst &I); |
| 501 | bool visitPHI(PHINode &I); |
| 502 | bool visitGetElementPtr(GetElementPtrInst &I); |
| 503 | bool visitBitCast(BitCastInst &I); |
| 504 | bool visitPtrToInt(PtrToIntInst &I); |
| 505 | bool visitIntToPtr(IntToPtrInst &I); |
| 506 | bool visitCastInst(CastInst &I); |
| 507 | bool visitCmpInst(CmpInst &I); |
| 508 | bool visitSub(BinaryOperator &I); |
| 509 | bool visitBinaryOperator(BinaryOperator &I); |
| 510 | bool visitFNeg(UnaryOperator &I); |
| 511 | bool visitLoad(LoadInst &I); |
| 512 | bool visitStore(StoreInst &I); |
| 513 | bool visitExtractValue(ExtractValueInst &I); |
| 514 | bool visitInsertValue(InsertValueInst &I); |
| 515 | bool visitCallBase(CallBase &Call); |
| 516 | bool visitReturnInst(ReturnInst &RI); |
| 517 | bool visitBranchInst(BranchInst &BI); |
| 518 | bool visitSelectInst(SelectInst &SI); |
| 519 | bool visitSwitchInst(SwitchInst &SI); |
| 520 | bool visitIndirectBrInst(IndirectBrInst &IBI); |
| 521 | bool visitResumeInst(ResumeInst &RI); |
| 522 | bool visitCleanupReturnInst(CleanupReturnInst &RI); |
| 523 | bool visitCatchReturnInst(CatchReturnInst &RI); |
| 524 | bool visitUnreachableInst(UnreachableInst &I); |
| 525 | |
| 526 | public: |
| 527 | CallAnalyzer( |
| 528 | Function &Callee, CallBase &Call, const TargetTransformInfo &TTI, |
| 529 | function_ref<AssumptionCache &(Function &)> GetAssumptionCache, |
| 530 | function_ref<BlockFrequencyInfo &(Function &)> GetBFI = nullptr, |
| 531 | function_ref<const TargetLibraryInfo &(Function &)> GetTLI = nullptr, |
| 532 | ProfileSummaryInfo *PSI = nullptr, |
| 533 | OptimizationRemarkEmitter *ORE = nullptr, |
| 534 | function_ref<EphemeralValuesCache &(Function &)> GetEphValuesCache = |
| 535 | nullptr) |
| 536 | : TTI(TTI), GetAssumptionCache(GetAssumptionCache), GetBFI(GetBFI), |
| 537 | GetTLI(GetTLI), PSI(PSI), F(Callee), DL(F.getDataLayout()), ORE(ORE), |
| 538 | CandidateCall(Call), GetEphValuesCache(GetEphValuesCache) {} |
| 539 | |
| 540 | InlineResult analyze(); |
| 541 | |
| 542 | /// Lookup simplified Value. May return a value owned by the caller. |
| 543 | Value *getSimplifiedValueUnchecked(Value *V) const { |
| 544 | return SimplifiedValues.lookup(Val: V); |
| 545 | } |
| 546 | |
| 547 | /// Lookup simplified Value, but return nullptr if the simplified value is |
| 548 | /// owned by the caller. |
| 549 | template <typename T> T *getSimplifiedValue(Value *V) const { |
| 550 | Value *SimpleV = SimplifiedValues.lookup(Val: V); |
| 551 | if (!SimpleV) |
| 552 | return nullptr; |
| 553 | |
| 554 | // Skip checks if we know T is a global. This has a small but measurable |
| 555 | // impact on compile time. |
| 556 | if constexpr (std::is_base_of_v<Constant, T>) |
| 557 | return dyn_cast<T>(SimpleV); |
| 558 | |
| 559 | // Make sure the simplified Value is owned by this function |
| 560 | if (auto *I = dyn_cast<Instruction>(Val: SimpleV)) { |
| 561 | if (I->getFunction() != &F) |
| 562 | return nullptr; |
| 563 | } else if (auto *Arg = dyn_cast<Argument>(Val: SimpleV)) { |
| 564 | if (Arg->getParent() != &F) |
| 565 | return nullptr; |
| 566 | } else if (!isa<Constant>(Val: SimpleV)) |
| 567 | return nullptr; |
| 568 | return dyn_cast<T>(SimpleV); |
| 569 | } |
| 570 | |
| 571 | // Keep a bunch of stats about the cost savings found so we can print them |
| 572 | // out when debugging. |
| 573 | unsigned NumConstantArgs = 0; |
| 574 | unsigned NumConstantOffsetPtrArgs = 0; |
| 575 | unsigned NumAllocaArgs = 0; |
| 576 | unsigned NumConstantPtrCmps = 0; |
| 577 | unsigned NumConstantPtrDiffs = 0; |
| 578 | unsigned NumInstructionsSimplified = 0; |
| 579 | |
| 580 | void dump(); |
| 581 | }; |
| 582 | |
| 583 | // When forming a binary search, we should find the number of nodes, which is |
| 584 | // the same as the number of comparisons when lowered. For a given |
| 585 | // number of clusters, n, we can define a recursive function, f(n), to find |
| 586 | // the number of nodes in the tree. The recursion is: |
| 587 | // f(n) = 1 + f(n/2) + f(n - n/2), when n > 3, |
| 588 | // and f(n) = n, when n <= 3. |
| 589 | // This leads to a binary tree where each leaf should be either f(2) or f(3) |
| 590 | // when n > 3. So, the number of comparisons from leaves should be n, while |
| 591 | // the number of comparisons from non-leaf nodes should be: |
| 592 | // 2^(log2(n) - 1) - 1 |
| 593 | // = 2^log2(n) * 2^-1 - 1 |
| 594 | // = n / 2 - 1. |
| 595 | // Considering comparisons from leaf and non-leaf nodes, we can estimate the |
| 596 | // number of comparisons in a simple closed form : |
| 597 | // n + n / 2 - 1 = n * 3 / 2 - 1 |
| 598 | int64_t getExpectedNumberOfCompare(int NumCaseCluster) { |
| 599 | return 3 * static_cast<int64_t>(NumCaseCluster) / 2 - 1; |
| 600 | } |
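| | // Worked example (illustrative): for NumCaseCluster = 9, the recursion gives |
| | //   f(9) = 1 + f(4) + f(5) = 1 + (1 + f(2) + f(2)) + (1 + f(2) + f(3)) |
| | //        = 1 + 5 + 6 = 12, |
| | // and the closed form gives 3 * 9 / 2 - 1 = 13 - 1 = 12 (integer division), |
| | // so the two agree. |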
| 601 | |
| 602 | /// FIXME: if it is necessary to derive from InlineCostCallAnalyzer, note |
| 603 | /// the FIXME in onLoweredCall, when instantiating an InlineCostCallAnalyzer |
| 604 | class InlineCostCallAnalyzer final : public CallAnalyzer { |
| 605 | const bool ComputeFullInlineCost; |
| 606 | int LoadEliminationCost = 0; |
| 607 | /// Bonus to be applied when percentage of vector instructions in callee is |
| 608 | /// high (see more details in updateThreshold). |
| 609 | int VectorBonus = 0; |
| 610 | /// Bonus to be applied when the callee has only one reachable basic block. |
| 611 | int SingleBBBonus = 0; |
| 612 | |
| 613 | /// Tunable parameters that control the analysis. |
| 614 | const InlineParams &Params; |
| 615 | |
| 616 | // This DenseMap stores the delta change in cost and threshold after |
| 617 | // accounting for the given instruction. The map is filled only when the |
| 618 | // PrintInstructionComments flag is on. |
| 619 | DenseMap<const Instruction *, InstructionCostDetail> InstructionCostDetailMap; |
| 620 | |
| 621 | /// Upper bound for the inlining cost. Bonuses are being applied to account |
| 622 | /// for speculative "expected profit" of the inlining decision. |
| 623 | int Threshold = 0; |
| 624 | |
| 625 | /// The amount of StaticBonus applied. |
| 626 | int StaticBonusApplied = 0; |
| 627 | |
| 628 | /// Attempt to evaluate indirect calls to boost their inline cost. |
| 629 | const bool BoostIndirectCalls; |
| 630 | |
| 631 | /// Ignore the threshold when finalizing analysis. |
| 632 | const bool IgnoreThreshold; |
| 633 | |
| 634 | // True if the cost-benefit-analysis-based inliner is enabled. |
| 635 | const bool CostBenefitAnalysisEnabled; |
| 636 | |
| 637 | /// Inlining cost measured in abstract units, accounts for all the |
| 638 | /// instructions expected to be executed for a given function invocation. |
| 639 | /// Instructions that are statically proven to be dead based on call-site |
| 640 | /// arguments are not counted here. |
| 641 | int Cost = 0; |
| 642 | |
| 643 | // The cumulative cost at the beginning of the basic block being analyzed. At |
| 644 | // the end of analyzing each basic block, "Cost - CostAtBBStart" represents |
| 645 | // the size of that basic block. |
| 646 | int CostAtBBStart = 0; |
| 647 | |
| 648 | // The static size of live but cold basic blocks. This is "static" in the |
| 649 | // sense that it's not weighted by profile counts at all. |
| 650 | int ColdSize = 0; |
| 651 | |
| 652 | // Whether inlining is decided by cost-threshold analysis. |
| 653 | bool DecidedByCostThreshold = false; |
| 654 | |
| 655 | // Whether inlining is decided by cost-benefit analysis. |
| 656 | bool DecidedByCostBenefit = false; |
| 657 | |
| 658 | // The cost-benefit pair computed by cost-benefit analysis. |
| 659 | std::optional<CostBenefitPair> CostBenefit; |
| 660 | |
| 661 | bool SingleBB = true; |
| 662 | |
| 663 | unsigned SROACostSavings = 0; |
| 664 | unsigned SROACostSavingsLost = 0; |
| 665 | |
| 666 | /// The mapping of caller Alloca values to their accumulated cost savings. If |
| 667 | /// we have to disable SROA for one of the allocas, this tells us how much |
| 668 | /// cost must be added. |
| 669 | DenseMap<AllocaInst *, int> SROAArgCosts; |
| 670 | |
| 671 | /// Return true if \p Call is a cold callsite. |
| 672 | bool isColdCallSite(CallBase &Call, BlockFrequencyInfo *CallerBFI); |
| 673 | |
| 674 | /// Update Threshold based on callsite properties such as callee |
| 675 | /// attributes and callee hotness for PGO builds. The Callee is explicitly |
| 676 | /// passed to support analyzing indirect calls whose target is inferred by |
| 677 | /// analysis. |
| 678 | void updateThreshold(CallBase &Call, Function &Callee); |
| 679 | /// Return a higher threshold if \p Call is a hot callsite. |
| 680 | std::optional<int> getHotCallSiteThreshold(CallBase &Call, |
| 681 | BlockFrequencyInfo *CallerBFI); |
| 682 | |
| 683 | /// Handle a capped 'int' increment for Cost. |
| 684 | void addCost(int64_t Inc) { |
| 685 | Inc = std::clamp<int64_t>(val: Inc, INT_MIN, INT_MAX); |
| 686 | Cost = std::clamp<int64_t>(val: Inc + Cost, INT_MIN, INT_MAX); |
| 687 | } |
| 688 | |
| 689 | void onDisableSROA(AllocaInst *Arg) override { |
| 690 | auto CostIt = SROAArgCosts.find(Val: Arg); |
| 691 | if (CostIt == SROAArgCosts.end()) |
| 692 | return; |
| 693 | addCost(Inc: CostIt->second); |
| 694 | SROACostSavings -= CostIt->second; |
| 695 | SROACostSavingsLost += CostIt->second; |
| 696 | SROAArgCosts.erase(I: CostIt); |
| 697 | } |
| 698 | |
| 699 | void onDisableLoadElimination() override { |
| 700 | addCost(Inc: LoadEliminationCost); |
| 701 | LoadEliminationCost = 0; |
| 702 | } |
| 703 | |
| 704 | bool onCallBaseVisitStart(CallBase &Call) override { |
| 705 | if (std::optional<int> AttrCallThresholdBonus = |
| 706 | getStringFnAttrAsInt(CB&: Call, AttrKind: "call-threshold-bonus" )) |
| 707 | Threshold += *AttrCallThresholdBonus; |
| 708 | |
| 709 | if (std::optional<int> AttrCallCost = |
| 710 | getStringFnAttrAsInt(CB&: Call, AttrKind: "call-inline-cost" )) { |
| 711 | addCost(Inc: *AttrCallCost); |
| 712 | // Prevent further processing of the call since we want to override its |
| 713 | // inline cost, not just add to it. |
| 714 | return false; |
| 715 | } |
| 716 | return true; |
| 717 | } |
| 718 | |
| 719 | void onCallPenalty() override { addCost(Inc: CallPenalty); } |
| 720 | |
| 721 | void onMemAccess() override { addCost(Inc: MemAccessCost); } |
| 722 | |
| 723 | void onCallArgumentSetup(const CallBase &Call) override { |
| 724 | // Pay the price of the argument setup. We account for the average 1 |
| 725 | // instruction per call argument setup here. |
| 726 | addCost(Inc: Call.arg_size() * InstrCost); |
| 727 | } |
| 728 | void onLoadRelativeIntrinsic() override { |
| 729 | // This is normally lowered to 4 LLVM instructions. |
| 730 | addCost(Inc: 3 * InstrCost); |
| 731 | } |
| 732 | void onLoweredCall(Function *F, CallBase &Call, |
| 733 | bool IsIndirectCall) override { |
| 734 | // We account for the average 1 instruction per call argument setup here. |
| 735 | addCost(Inc: Call.arg_size() * InstrCost); |
| 736 | |
| 737 | // If we have a constant that we are calling as a function, we can peer |
| 738 | // through it and see the function target. This happens not infrequently |
| 739 | // during devirtualization and so we want to give it a hefty bonus for |
| 740 | // inlining, but cap that bonus in the event that inlining wouldn't pan out. |
| 741 | // Pretend to inline the function, with a custom threshold. |
| 742 | if (IsIndirectCall && BoostIndirectCalls) { |
| 743 | auto IndirectCallParams = Params; |
| 744 | IndirectCallParams.DefaultThreshold = |
| 745 | InlineConstants::IndirectCallThreshold; |
| 746 | /// FIXME: if InlineCostCallAnalyzer is derived from, this may need |
| 747 | /// to instantiate the derived class. |
| 748 | InlineCostCallAnalyzer CA(*F, Call, IndirectCallParams, TTI, |
| 749 | GetAssumptionCache, GetBFI, GetTLI, PSI, ORE, |
| 750 | false); |
| 751 | if (CA.analyze().isSuccess()) { |
| 752 | // We were able to inline the indirect call! Subtract the cost from the |
| 753 | // threshold to get the bonus we want to apply, but don't go below zero. |
| 754 | addCost(Inc: -std::max(a: 0, b: CA.getThreshold() - CA.getCost())); |
| 755 | } |
| 756 | } else |
| 757 | // Otherwise simply add the cost for merely making the call. |
| 758 | addCost(Inc: TTI.getInlineCallPenalty(F: CandidateCall.getCaller(), Call, |
| 759 | DefaultCallPenalty: CallPenalty)); |
| 760 | } |
| 761 | |
| 762 | void onFinalizeSwitch(unsigned JumpTableSize, unsigned NumCaseCluster, |
| 763 | bool DefaultDestUnreachable) override { |
| 764 | // If suitable for a jump table, consider the cost for the table size and |
| 765 | // branch to destination. |
| 766 | // Maximum valid cost increased in this function. |
| 767 | if (JumpTableSize) { |
| 768 | // Suppose a default branch includes one compare and one conditional |
| 769 | // branch if it's reachable. |
| 770 | if (!DefaultDestUnreachable) |
| 771 | addCost(Inc: 2 * InstrCost); |
| 772 | // Suppose a jump table requires one load and one jump instruction. |
| 773 | int64_t JTCost = |
| 774 | static_cast<int64_t>(JumpTableSize) * InstrCost + 2 * InstrCost; |
| 775 | addCost(Inc: JTCost); |
| 776 | return; |
| 777 | } |
| 778 | |
| 779 | if (NumCaseCluster <= 3) { |
| 780 | // Suppose a comparison includes one compare and one conditional branch. |
| 781 | // We can save one compare and one branch per cluster if the default |
| 782 | // destination is unreachable. |
| 783 | addCost(Inc: (NumCaseCluster - DefaultDestUnreachable) * 2 * InstrCost); |
| 784 | return; |
| 785 | } |
| 786 | |
| 787 | int64_t ExpectedNumberOfCompare = |
| 788 | getExpectedNumberOfCompare(NumCaseCluster); |
| 789 | int64_t SwitchCost = ExpectedNumberOfCompare * 2 * InstrCost; |
| 790 | |
| 791 | addCost(Inc: SwitchCost); |
| 792 | } |
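| | // Illustrative cost example (using the default InstrCost of 5): a switch |
| | // lowered to a jump table with JumpTableSize = 8 and a reachable default adds |
| | // 2*5 + (8*5 + 2*5) = 60, while a non-jump-table switch with 9 case clusters |
| | // adds getExpectedNumberOfCompare(9) * 2 * 5 = 12 * 2 * 5 = 120. |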
| 793 | |
| 794 | // Parses the inline assembly argument to account for its cost. Inline |
| 795 | // assembly instructions incur higher costs for inlining since they cannot be |
| 796 | // analyzed and optimized. |
| 797 | void onInlineAsm(const InlineAsm &Arg) override { |
| 798 | if (!InlineAsmInstrCost) |
| 799 | return; |
| 800 | SmallVector<StringRef, 4> AsmStrs; |
| 801 | Arg.collectAsmStrs(AsmStrs); |
| 802 | int SectionLevel = 0; |
| 803 | int InlineAsmInstrCount = 0; |
| 804 | for (StringRef AsmStr : AsmStrs) { |
| 805 | // Trim whitespace and strip comments. |
| 806 | StringRef Trimmed = AsmStr.trim(); |
| 807 | size_t hashPos = Trimmed.find(C: '#'); |
| 808 | if (hashPos != StringRef::npos) |
| 809 | Trimmed = Trimmed.substr(Start: 0, N: hashPos); |
| 810 | // Ignore comments. |
| 811 | if (Trimmed.empty()) |
| 812 | continue; |
| 813 | // Filter out the outlined assembly instructions from the cost by keeping |
| 814 | // track of the section level and only accounting for instructions at |
| 815 | // section level of zero. Note there will be duplication in outlined |
| 816 | // sections too, but it is not accounted for in the inlining cost model. |
| 817 | if (Trimmed.starts_with(Prefix: ".pushsection" )) { |
| 818 | ++SectionLevel; |
| 819 | continue; |
| 820 | } |
| 821 | if (Trimmed.starts_with(Prefix: ".popsection" )) { |
| 822 | --SectionLevel; |
| 823 | continue; |
| 824 | } |
| 825 | // Ignore directives and labels. |
| 826 | if (Trimmed.starts_with(Prefix: "." ) || Trimmed.contains(Other: ":" )) |
| 827 | continue; |
| 828 | if (SectionLevel == 0) |
| 829 | ++InlineAsmInstrCount; |
| 830 | } |
| 831 | NumInlineAsmInstructions += InlineAsmInstrCount; |
| 832 | addCost(Inc: InlineAsmInstrCount * InlineAsmInstrCost); |
| 833 | } |
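| | // Illustrative example of the accounting above (assumed asm text): given |
| | //   "mov x0, x1        # copy"  -> counted (section level 0) |
| | //   ".align 4"                  -> ignored (directive) |
| | //   "1:"                        -> ignored (label) |
| | //   ".pushsection .text.cold" / "nop" / ".popsection" |
| | //                               -> the nop is not counted (section level 1) |
| | // so with -inline-asm-instr-cost set to a nonzero value, InlineAsmInstrCount |
| | // would be 1 and the added cost 1 * InlineAsmInstrCost; with the default of 0 |
| | // the function returns early and counts nothing. |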
| 834 | |
| 835 | void onMissedSimplification() override { addCost(Inc: InstrCost); } |
| 836 | |
| 837 | void onInitializeSROAArg(AllocaInst *Arg) override { |
| 838 | assert(Arg != nullptr && |
| 839 | "Should not initialize SROA costs for null value." ); |
| 840 | auto SROAArgCost = TTI.getCallerAllocaCost(CB: &CandidateCall, AI: Arg); |
| 841 | SROACostSavings += SROAArgCost; |
| 842 | SROAArgCosts[Arg] = SROAArgCost; |
| 843 | } |
| 844 | |
| 845 | void onAggregateSROAUse(AllocaInst *SROAArg) override { |
| 846 | auto CostIt = SROAArgCosts.find(Val: SROAArg); |
| 847 | assert(CostIt != SROAArgCosts.end() && |
| 848 | "expected this argument to have a cost" ); |
| 849 | CostIt->second += InstrCost; |
| 850 | SROACostSavings += InstrCost; |
| 851 | } |
| 852 | |
| 853 | void onBlockStart(const BasicBlock *BB) override { CostAtBBStart = Cost; } |
| 854 | |
| 855 | void onBlockAnalyzed(const BasicBlock *BB) override { |
| 856 | if (CostBenefitAnalysisEnabled) { |
| 857 | // Keep track of the static size of live but cold basic blocks. For now, |
| 858 | // we define a cold basic block to be one that's never executed. |
| 859 | assert(GetBFI && "GetBFI must be available" ); |
| 860 | BlockFrequencyInfo *BFI = &(GetBFI(F)); |
| 861 | assert(BFI && "BFI must be available" ); |
| 862 | auto ProfileCount = BFI->getBlockProfileCount(BB); |
| 863 | if (*ProfileCount == 0) |
| 864 | ColdSize += Cost - CostAtBBStart; |
| 865 | } |
| 866 | |
| 867 | auto *TI = BB->getTerminator(); |
| 868 | // If we had any successors at this point, then post-inlining is likely to |
| 869 | // have them as well. Note that we assume any basic blocks which existed |
| 870 | // due to branches or switches which folded above will also fold after |
| 871 | // inlining. |
| 872 | if (SingleBB && TI->getNumSuccessors() > 1) { |
| 873 | // Take off the bonus we applied to the threshold. |
| 874 | Threshold -= SingleBBBonus; |
| 875 | SingleBB = false; |
| 876 | } |
| 877 | } |
| 878 | |
| 879 | void onInstructionAnalysisStart(const Instruction *I) override { |
| 880 | // This function is called to store the initial cost of inlining before |
| 881 | // the given instruction was assessed. |
| 882 | if (!PrintInstructionComments) |
| 883 | return; |
| 884 | auto &CostDetail = InstructionCostDetailMap[I]; |
| 885 | CostDetail.CostBefore = Cost; |
| 886 | CostDetail.ThresholdBefore = Threshold; |
| 887 | } |
| 888 | |
| 889 | void onInstructionAnalysisFinish(const Instruction *I) override { |
| 890 | // This function is called to find new values of cost and threshold after |
| 891 | // the instruction has been assessed. |
| 892 | if (!PrintInstructionComments) |
| 893 | return; |
| 894 | auto &CostDetail = InstructionCostDetailMap[I]; |
| 895 | CostDetail.CostAfter = Cost; |
| 896 | CostDetail.ThresholdAfter = Threshold; |
| 897 | } |
| 898 | |
| 899 | bool isCostBenefitAnalysisEnabled() { |
| 900 | if (!PSI || !PSI->hasProfileSummary()) |
| 901 | return false; |
| 902 | |
| 903 | if (!GetBFI) |
| 904 | return false; |
| 905 | |
| 906 | if (InlineEnableCostBenefitAnalysis.getNumOccurrences()) { |
| 907 | // Honor the explicit request from the user. |
| 908 | if (!InlineEnableCostBenefitAnalysis) |
| 909 | return false; |
| 910 | } else { |
| 911 | // Otherwise, require instrumentation profile. |
| 912 | if (!PSI->hasInstrumentationProfile()) |
| 913 | return false; |
| 914 | } |
| 915 | |
| 916 | auto *Caller = CandidateCall.getParent()->getParent(); |
| 917 | if (!Caller->getEntryCount()) |
| 918 | return false; |
| 919 | |
| 920 | BlockFrequencyInfo *CallerBFI = &(GetBFI(*Caller)); |
| 921 | if (!CallerBFI) |
| 922 | return false; |
| 923 | |
| 924 | // For now, limit to hot call site. |
| 925 | if (!PSI->isHotCallSite(CB: CandidateCall, BFI: CallerBFI)) |
| 926 | return false; |
| 927 | |
| 928 | // Make sure we have a nonzero entry count. |
| 929 | auto EntryCount = F.getEntryCount(); |
| 930 | if (!EntryCount || !EntryCount->getCount()) |
| 931 | return false; |
| 932 | |
| 933 | BlockFrequencyInfo *CalleeBFI = &(GetBFI(F)); |
| 934 | if (!CalleeBFI) |
| 935 | return false; |
| 936 | |
| 937 | return true; |
| 938 | } |
| 939 | |
| 940 | // A helper function to choose between command line override and default. |
| 941 | unsigned getInliningCostBenefitAnalysisSavingsMultiplier() const { |
| 942 | if (InlineSavingsMultiplier.getNumOccurrences()) |
| 943 | return InlineSavingsMultiplier; |
| 944 | return TTI.getInliningCostBenefitAnalysisSavingsMultiplier(); |
| 945 | } |
| 946 | |
| 947 | // A helper function to choose between command line override and default. |
| 948 | unsigned getInliningCostBenefitAnalysisProfitableMultiplier() const { |
| 949 | if (InlineSavingsProfitableMultiplier.getNumOccurrences()) |
| 950 | return InlineSavingsProfitableMultiplier; |
| 951 | return TTI.getInliningCostBenefitAnalysisProfitableMultiplier(); |
| 952 | } |
| 953 | |
| 954 | void OverrideCycleSavingsAndSizeForTesting(APInt &CycleSavings, int &Size) { |
| 955 | if (std::optional<int> AttrCycleSavings = getStringFnAttrAsInt( |
| 956 | CB&: CandidateCall, AttrKind: "inline-cycle-savings-for-test" )) { |
| 957 | CycleSavings = *AttrCycleSavings; |
| 958 | } |
| 959 | |
| 960 | if (std::optional<int> AttrRuntimeCost = getStringFnAttrAsInt( |
| 961 | CB&: CandidateCall, AttrKind: "inline-runtime-cost-for-test" )) { |
| 962 | Size = *AttrRuntimeCost; |
| 963 | } |
| 964 | } |
| 965 | |
| 966 | // Determine whether we should inline the given call site, taking into account |
| 967 | // both the size cost and the cycle savings. Return std::nullopt if we don't |
| 968 | // have sufficient profiling information to decide. |
| 969 | std::optional<bool> costBenefitAnalysis() { |
| 970 | if (!CostBenefitAnalysisEnabled) |
| 971 | return std::nullopt; |
| 972 | |
| 973 | // buildInlinerPipeline in the pass builder sets HotCallSiteThreshold to 0 |
| 974 | // for the prelink phase of the AutoFDO + ThinLTO build. Honor the logic by |
| 975 | // falling back to the cost-based metric. |
| 976 | // TODO: Improve this hacky condition. |
| 977 | if (Threshold == 0) |
| 978 | return std::nullopt; |
| 979 | |
| 980 | assert(GetBFI); |
| 981 | BlockFrequencyInfo *CalleeBFI = &(GetBFI(F)); |
| 982 | assert(CalleeBFI); |
| 983 | |
| 984 | // The cycle savings expressed as the sum of InstrCost |
| 985 | // multiplied by the estimated dynamic count of each instruction we can |
| 986 | // avoid. Savings come from the call site cost, such as argument setup and |
| 987 | // the call instruction, as well as the instructions that are folded. |
| 988 | // |
| 989 | // We use 128-bit APInt here to avoid potential overflow. This variable |
| 990 | // should stay well below 10^^24 (or 2^^80) in practice. This "worst" case |
| 991 | // assumes that we can avoid or fold a billion instructions, each with a |
| 992 | // profile count of 10^^15 -- roughly the number of cycles for a 24-hour |
| 993 | // period on a 4GHz machine. |
| 994 | APInt CycleSavings(128, 0); |
| 995 | |
| 996 | for (auto &BB : F) { |
| 997 | APInt CurrentSavings(128, 0); |
| 998 | for (auto &I : BB) { |
| 999 | if (BranchInst *BI = dyn_cast<BranchInst>(Val: &I)) { |
| 1000 | // Count a conditional branch as savings if it becomes unconditional. |
| 1001 | if (BI->isConditional() && |
| 1002 | getSimplifiedValue<ConstantInt>(V: BI->getCondition())) { |
| 1003 | CurrentSavings += InstrCost; |
| 1004 | } |
| 1005 | } else if (SwitchInst *SI = dyn_cast<SwitchInst>(Val: &I)) { |
| 1006 | if (getSimplifiedValue<ConstantInt>(V: SI->getCondition())) |
| 1007 | CurrentSavings += InstrCost; |
| 1008 | } else if (Value *V = dyn_cast<Value>(Val: &I)) { |
| 1009 | // Count an instruction as savings if we can fold it. |
| 1010 | if (SimplifiedValues.count(Val: V)) { |
| 1011 | CurrentSavings += InstrCost; |
| 1012 | } |
| 1013 | } |
| 1014 | } |
| 1015 | |
| 1016 | auto ProfileCount = CalleeBFI->getBlockProfileCount(BB: &BB); |
| 1017 | CurrentSavings *= *ProfileCount; |
| 1018 | CycleSavings += CurrentSavings; |
| 1019 | } |
| 1020 | |
| 1021 | // Compute the cycle savings per call. |
| 1022 | auto EntryProfileCount = F.getEntryCount(); |
| 1023 | assert(EntryProfileCount && EntryProfileCount->getCount()); |
| 1024 | auto EntryCount = EntryProfileCount->getCount(); |
| 1025 | CycleSavings += EntryCount / 2; |
| 1026 | CycleSavings = CycleSavings.udiv(RHS: EntryCount); |
| 1027 | |
| 1028 | // Compute the total savings for the call site. |
| 1029 | auto *CallerBB = CandidateCall.getParent(); |
| 1030 | BlockFrequencyInfo *CallerBFI = &(GetBFI(*(CallerBB->getParent()))); |
| 1031 | CycleSavings += getCallsiteCost(TTI, Call: this->CandidateCall, DL); |
| 1032 | CycleSavings *= *CallerBFI->getBlockProfileCount(BB: CallerBB); |
| 1033 | |
| 1034 | // Remove the cost of the cold basic blocks to model the runtime cost more |
| 1035 | // accurately. Both machine block placement and function splitting could |
| 1036 | // place cold blocks further from hot blocks. |
| 1037 | int Size = Cost - ColdSize; |
| 1038 | |
| 1039 | // Allow tiny callees to be inlined regardless of whether they meet the |
| 1040 | // savings threshold. |
| 1041 | Size = Size > InlineSizeAllowance ? Size - InlineSizeAllowance : 1; |
| 1042 | |
| 1043 | OverrideCycleSavingsAndSizeForTesting(CycleSavings, Size); |
| 1044 | CostBenefit.emplace(args: APInt(128, Size), args&: CycleSavings); |
| 1045 | |
| 1046 | // Let R be the ratio of CycleSavings to Size. We accept the inlining |
| 1047 | // opportunity if R is really high and reject if R is really low. If R is |
| 1048 | // somewhere in the middle, we fall back to the cost-based analysis. |
| 1049 | // |
| 1050 | // Specifically, let R = CycleSavings / Size, we accept the inlining |
| 1051 | // opportunity if: |
| 1052 | // |
| 1053 | // PSI->getOrCompHotCountThreshold() |
| 1054 | // R > ------------------------------------------------- |
| 1055 | // getInliningCostBenefitAnalysisSavingsMultiplier() |
| 1056 | // |
| 1057 | // and reject the inlining opportunity if: |
| 1058 | // |
| 1059 | // PSI->getOrCompHotCountThreshold() |
| 1060 | // R <= ---------------------------------------------------- |
| 1061 | // getInliningCostBenefitAnalysisProfitableMultiplier() |
| 1062 | // |
| 1063 | // Otherwise, we fall back to the cost-based analysis. |
| 1064 | // |
| 1065 | // Implementation-wise, use multiplication (CycleSavings * Multiplier, |
| 1066 | // HotCountThreshold * Size) rather than division to avoid precision loss. |
| 1067 | APInt Threshold(128, PSI->getOrCompHotCountThreshold()); |
| 1068 | Threshold *= Size; |
| 1069 | |
| 1070 | APInt UpperBoundCycleSavings = CycleSavings; |
| 1071 | UpperBoundCycleSavings *= getInliningCostBenefitAnalysisSavingsMultiplier(); |
| 1072 | if (UpperBoundCycleSavings.uge(RHS: Threshold)) |
| 1073 | return true; |
| 1074 | |
| 1075 | APInt LowerBoundCycleSavings = CycleSavings; |
| 1076 | LowerBoundCycleSavings *= |
| 1077 | getInliningCostBenefitAnalysisProfitableMultiplier(); |
| 1078 | if (LowerBoundCycleSavings.ult(RHS: Threshold)) |
| 1079 | return false; |
| 1080 | |
| 1081 | // Otherwise, fall back to the cost-based analysis. |
| 1082 | return std::nullopt; |
| 1083 | } |
| 1084 | |
| 1085 | InlineResult finalizeAnalysis() override { |
| 1086 | // Loops generally act a lot like calls in that they act like barriers to |
| 1087 | // movement, require a certain amount of setup, etc. So when optimising for |
| 1088 | // size, we penalise any call sites that perform loops. We do this after all |
| 1089 | // other costs here, so will likely only be dealing with relatively small |
| 1090 | // functions (and hence DT and LI will hopefully be cheap). |
| 1091 | auto *Caller = CandidateCall.getFunction(); |
| 1092 | if (Caller->hasMinSize()) { |
| 1093 | DominatorTree DT(F); |
| 1094 | LoopInfo LI(DT); |
| 1095 | int NumLoops = 0; |
| 1096 | for (Loop *L : LI) { |
| 1097 | // Ignore loops that will not be executed |
| 1098 | if (DeadBlocks.count(Ptr: L->getHeader())) |
| 1099 | continue; |
| 1100 | NumLoops++; |
| 1101 | } |
| 1102 | addCost(Inc: NumLoops * InlineConstants::LoopPenalty); |
| 1103 | } |
| 1104 | |
| 1105 | // We applied the maximum possible vector bonus at the beginning. Now, |
| 1106 | // subtract the excess bonus, if any, from the Threshold before |
| 1107 | // comparing against Cost. |
| 1108 | if (NumVectorInstructions <= NumInstructions / 10) |
| 1109 | Threshold -= VectorBonus; |
| 1110 | else if (NumVectorInstructions <= NumInstructions / 2) |
| 1111 | Threshold -= VectorBonus / 2; |
| 1112 | |
| 1113 | if (std::optional<int> AttrCost = |
| 1114 | getStringFnAttrAsInt(CB&: CandidateCall, AttrKind: "function-inline-cost" )) |
| 1115 | Cost = *AttrCost; |
| 1116 | |
| 1117 | if (std::optional<int> AttrCostMult = getStringFnAttrAsInt( |
| 1118 | CB&: CandidateCall, |
| 1119 | AttrKind: InlineConstants::FunctionInlineCostMultiplierAttributeName)) |
| 1120 | Cost *= *AttrCostMult; |
| 1121 | |
| 1122 | if (std::optional<int> AttrThreshold = |
| 1123 | getStringFnAttrAsInt(CB&: CandidateCall, AttrKind: "function-inline-threshold" )) |
| 1124 | Threshold = *AttrThreshold; |
| 1125 | |
| 1126 | if (auto Result = costBenefitAnalysis()) { |
| 1127 | DecidedByCostBenefit = true; |
| 1128 | if (*Result) |
| 1129 | return InlineResult::success(); |
| 1130 | else |
| 1131 | return InlineResult::failure(Reason: "Cost over threshold." ); |
| 1132 | } |
| 1133 | |
| 1134 | if (IgnoreThreshold) |
| 1135 | return InlineResult::success(); |
| 1136 | |
| 1137 | DecidedByCostThreshold = true; |
| 1138 | return Cost < std::max(a: 1, b: Threshold) |
| 1139 | ? InlineResult::success() |
| 1140 | : InlineResult::failure(Reason: "Cost over threshold." ); |
| 1141 | } |
| 1142 | |
| 1143 | bool shouldStop() override { |
| 1144 | if (IgnoreThreshold || ComputeFullInlineCost) |
| 1145 | return false; |
| 1146 | // Bail out the moment we cross the threshold. This means we'll under-count |
| 1147 | // the cost, but only when undercounting doesn't matter. |
| 1148 | if (Cost < Threshold) |
| 1149 | return false; |
| 1150 | DecidedByCostThreshold = true; |
| 1151 | return true; |
| 1152 | } |
| 1153 | |
| 1154 | void onLoadEliminationOpportunity() override { |
| 1155 | LoadEliminationCost += InstrCost; |
| 1156 | } |
| 1157 | |
| 1158 | InlineResult onAnalysisStart() override { |
| 1159 | // Perform some tweaks to the cost and threshold based on the direct |
| 1160 | // callsite information. |
| 1161 | |
| 1162 | // We want to more aggressively inline vector-dense kernels, so up the |
| 1163 | // threshold, and we'll lower it if the % of vector instructions gets too |
| 1164 | // low. Note that these bonuses are somewhat arbitrary and evolved over |
| 1165 | // time by accident as much as because they are principled bonuses. |
| 1166 | // |
| 1167 | // FIXME: It would be nice to remove all such bonuses. At least it would be |
| 1168 | // nice to base the bonus values on something more scientific. |
| 1169 | assert(NumInstructions == 0); |
| 1170 | assert(NumVectorInstructions == 0); |
| 1171 | |
| 1172 | // Update the threshold based on callsite properties |
| 1173 | updateThreshold(Call&: CandidateCall, Callee&: F); |
| 1174 | |
| 1175 | // While Threshold depends on commandline options that can take negative |
| 1176 | // values, we want to enforce the invariant that the computed threshold and |
| 1177 | // bonuses are non-negative. |
| 1178 | assert(Threshold >= 0); |
| 1179 | assert(SingleBBBonus >= 0); |
| 1180 | assert(VectorBonus >= 0); |
| 1181 | |
| 1182 | // Speculatively apply all possible bonuses to Threshold. If cost exceeds |
| 1183 | // this Threshold any time, and cost cannot decrease, we can stop processing |
| 1184 | // the rest of the function body. |
| 1185 | Threshold += (SingleBBBonus + VectorBonus); |
| 1186 | |
| 1187 | // Give out bonuses for the callsite, as the instructions setting them up |
| 1188 | // will be gone after inlining. |
| 1189 | addCost(Inc: -getCallsiteCost(TTI, Call: this->CandidateCall, DL)); |
| 1190 | |
| 1191 | // If this function uses the coldcc calling convention, prefer not to inline |
| 1192 | // it. |
| 1193 | if (F.getCallingConv() == CallingConv::Cold) |
| 1194 | addCost(Inc: InlineConstants::ColdccPenalty); |
| 1195 | |
| 1196 | LLVM_DEBUG(dbgs() << " Initial cost: " << Cost << "\n" ); |
| 1197 | |
| 1198 | // Check if we're done. This can happen due to bonuses and penalties. |
| 1199 | if (Cost >= Threshold && !ComputeFullInlineCost) |
| 1200 | return InlineResult::failure(Reason: "high cost" ); |
| 1201 | |
| 1202 | return InlineResult::success(); |
| 1203 | } |
| 1204 | |
| 1205 | public: |
| 1206 | InlineCostCallAnalyzer( |
| 1207 | Function &Callee, CallBase &Call, const InlineParams &Params, |
| 1208 | const TargetTransformInfo &TTI, |
| 1209 | function_ref<AssumptionCache &(Function &)> GetAssumptionCache, |
| 1210 | function_ref<BlockFrequencyInfo &(Function &)> GetBFI = nullptr, |
| 1211 | function_ref<const TargetLibraryInfo &(Function &)> GetTLI = nullptr, |
| 1212 | ProfileSummaryInfo *PSI = nullptr, |
| 1213 | OptimizationRemarkEmitter *ORE = nullptr, bool BoostIndirect = true, |
| 1214 | bool IgnoreThreshold = false, |
| 1215 | function_ref<EphemeralValuesCache &(Function &)> GetEphValuesCache = |
| 1216 | nullptr) |
| 1217 | : CallAnalyzer(Callee, Call, TTI, GetAssumptionCache, GetBFI, GetTLI, PSI, |
| 1218 | ORE, GetEphValuesCache), |
| 1219 | ComputeFullInlineCost(OptComputeFullInlineCost || |
| 1220 | Params.ComputeFullInlineCost || ORE || |
| 1221 | isCostBenefitAnalysisEnabled()), |
| 1222 | Params(Params), Threshold(Params.DefaultThreshold), |
| 1223 | BoostIndirectCalls(BoostIndirect), IgnoreThreshold(IgnoreThreshold), |
| 1224 | CostBenefitAnalysisEnabled(isCostBenefitAnalysisEnabled()), |
| 1225 | Writer(this) { |
| 1226 | AllowRecursiveCall = *Params.AllowRecursiveCall; |
| 1227 | } |
| 1228 | |
| 1229 | /// Annotation Writer for instruction details |
| 1230 | InlineCostAnnotationWriter Writer; |
| 1231 | |
| 1232 | void dump(); |
| 1233 | |
| 1234 | // Prints the same analysis as dump(), but its definition is not dependent |
| 1235 | // on the build. |
| 1236 | void print(raw_ostream &OS); |
| 1237 | |
| 1238 | std::optional<InstructionCostDetail> getCostDetails(const Instruction *I) { |
| 1239 | auto It = InstructionCostDetailMap.find(Val: I); |
| 1240 | if (It != InstructionCostDetailMap.end()) |
| 1241 | return It->second; |
| 1242 | return std::nullopt; |
| 1243 | } |
| 1244 | |
| 1245 | ~InlineCostCallAnalyzer() override = default; |
| 1246 | int getThreshold() const { return Threshold; } |
| 1247 | int getCost() const { return Cost; } |
| 1248 | int getStaticBonusApplied() const { return StaticBonusApplied; } |
| 1249 | std::optional<CostBenefitPair> getCostBenefitPair() { return CostBenefit; } |
| 1250 | bool wasDecidedByCostBenefit() const { return DecidedByCostBenefit; } |
| 1251 | bool wasDecidedByCostThreshold() const { return DecidedByCostThreshold; } |
| 1252 | }; |
| 1253 | |
| 1254 | // Return true if CB is the sole call to local function Callee. |
| 1255 | static bool isSoleCallToLocalFunction(const CallBase &CB, |
| 1256 | const Function &Callee) { |
| 1257 | return Callee.hasLocalLinkage() && Callee.hasOneLiveUse() && |
| 1258 | &Callee == CB.getCalledFunction(); |
| 1259 | } |
| 1260 | |
| 1261 | class InlineCostFeaturesAnalyzer final : public CallAnalyzer { |
| 1262 | private: |
| 1263 | InlineCostFeatures Cost = {}; |
| 1264 | |
| 1265 | // FIXME: These constants are taken from the heuristic-based cost visitor. |
| 1266 | // These should be removed entirely in a later revision to avoid reliance on |
| 1267 | // heuristics in the ML inliner. |
| 1268 | static constexpr int JTCostMultiplier = 2; |
| 1269 | static constexpr int CaseClusterCostMultiplier = 2; |
| 1270 | static constexpr int SwitchDefaultDestCostMultiplier = 2; |
| 1271 | static constexpr int SwitchCostMultiplier = 2; |
| 1272 | |
| 1273 | // FIXME: These are taken from the heuristic-based cost visitor: we should |
| 1274 | // eventually abstract these to the CallAnalyzer to avoid duplication. |
| 1275 | unsigned SROACostSavingOpportunities = 0; |
| 1276 | int VectorBonus = 0; |
| 1277 | int SingleBBBonus = 0; |
| 1278 | int Threshold = 5; |
| 1279 | |
| 1280 | DenseMap<AllocaInst *, unsigned> SROACosts; |
| 1281 | |
| 1282 | void increment(InlineCostFeatureIndex Feature, int64_t Delta = 1) { |
| 1283 | Cost[static_cast<size_t>(Feature)] += Delta; |
| 1284 | } |
| 1285 | |
| 1286 | void set(InlineCostFeatureIndex Feature, int64_t Value) { |
| 1287 | Cost[static_cast<size_t>(Feature)] = Value; |
| 1288 | } |
| 1289 | |
| 1290 | void onDisableSROA(AllocaInst *Arg) override { |
| 1291 | auto CostIt = SROACosts.find(Val: Arg); |
| 1292 | if (CostIt == SROACosts.end()) |
| 1293 | return; |
| 1294 | |
| 1295 | increment(Feature: InlineCostFeatureIndex::sroa_losses, Delta: CostIt->second); |
| 1296 | SROACostSavingOpportunities -= CostIt->second; |
| 1297 | SROACosts.erase(I: CostIt); |
| 1298 | } |
| 1299 | |
| 1300 | void onDisableLoadElimination() override { |
| 1301 | set(Feature: InlineCostFeatureIndex::load_elimination, Value: 1); |
| 1302 | } |
| 1303 | |
| 1304 | void onCallPenalty() override { |
| 1305 | increment(Feature: InlineCostFeatureIndex::call_penalty, Delta: CallPenalty); |
| 1306 | } |
| 1307 | |
| 1308 | void onCallArgumentSetup(const CallBase &Call) override { |
| 1309 | increment(Feature: InlineCostFeatureIndex::call_argument_setup, |
| 1310 | Delta: Call.arg_size() * InstrCost); |
| 1311 | } |
| 1312 | |
| 1313 | void onLoadRelativeIntrinsic() override { |
| 1314 | increment(Feature: InlineCostFeatureIndex::load_relative_intrinsic, Delta: 3 * InstrCost); |
| 1315 | } |
| 1316 | |
| 1317 | void onLoweredCall(Function *F, CallBase &Call, |
| 1318 | bool IsIndirectCall) override { |
| 1319 | increment(Feature: InlineCostFeatureIndex::lowered_call_arg_setup, |
| 1320 | Delta: Call.arg_size() * InstrCost); |
| 1321 | |
| 1322 | if (IsIndirectCall) { |
| 1323 | InlineParams IndirectCallParams = {/* DefaultThreshold*/ 0, |
| 1324 | /*HintThreshold*/ {}, |
| 1325 | /*ColdThreshold*/ {}, |
| 1326 | /*OptSizeThreshold*/ {}, |
| 1327 | /*OptMinSizeThreshold*/ {}, |
| 1328 | /*HotCallSiteThreshold*/ {}, |
| 1329 | /*LocallyHotCallSiteThreshold*/ {}, |
| 1330 | /*ColdCallSiteThreshold*/ {}, |
| 1331 | /*ComputeFullInlineCost*/ true, |
| 1332 | /*EnableDeferral*/ true}; |
| 1333 | IndirectCallParams.DefaultThreshold = |
| 1334 | InlineConstants::IndirectCallThreshold; |
| 1335 | |
| 1336 | InlineCostCallAnalyzer CA(*F, Call, IndirectCallParams, TTI, |
| 1337 | GetAssumptionCache, GetBFI, GetTLI, PSI, ORE, |
| 1338 | false, true); |
| 1339 | if (CA.analyze().isSuccess()) { |
| 1340 | increment(Feature: InlineCostFeatureIndex::nested_inline_cost_estimate, |
| 1341 | Delta: CA.getCost()); |
| 1342 | increment(Feature: InlineCostFeatureIndex::nested_inlines, Delta: 1); |
| 1343 | } |
| 1344 | } else { |
| 1345 | onCallPenalty(); |
| 1346 | } |
| 1347 | } |
| 1348 | |
| 1349 | void onFinalizeSwitch(unsigned JumpTableSize, unsigned NumCaseCluster, |
| 1350 | bool DefaultDestUnreachable) override { |
| 1351 | if (JumpTableSize) { |
| 1352 | if (!DefaultDestUnreachable) |
| 1353 | increment(Feature: InlineCostFeatureIndex::switch_default_dest_penalty, |
| 1354 | Delta: SwitchDefaultDestCostMultiplier * InstrCost); |
| 1355 | int64_t JTCost = static_cast<int64_t>(JumpTableSize) * InstrCost + |
| 1356 | JTCostMultiplier * InstrCost; |
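|      | // For illustration (hypothetical numbers; an InstrCost of 5 is an assumption
|      | // made for this sketch, not something this function fixes): a 16-entry jump
|      | // table would add JTCost = 16 * 5 + 2 * 5 = 90 to the jump_table_penalty
|      | // feature.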
| 1357 | increment(Feature: InlineCostFeatureIndex::jump_table_penalty, Delta: JTCost); |
| 1358 | return; |
| 1359 | } |
| 1360 | |
| 1361 | if (NumCaseCluster <= 3) { |
| 1362 | increment(Feature: InlineCostFeatureIndex::case_cluster_penalty, |
| 1363 | Delta: (NumCaseCluster - DefaultDestUnreachable) * |
| 1364 | CaseClusterCostMultiplier * InstrCost); |
| 1365 | return; |
| 1366 | } |
| 1367 | |
| 1368 | int64_t ExpectedNumberOfCompare = |
| 1369 | getExpectedNumberOfCompare(NumCaseCluster); |
| 1370 | |
| 1371 | int64_t SwitchCost = |
| 1372 | ExpectedNumberOfCompare * SwitchCostMultiplier * InstrCost; |
| 1373 | increment(Feature: InlineCostFeatureIndex::switch_penalty, Delta: SwitchCost); |
| 1374 | } |
| 1375 | |
| 1376 | void onMissedSimplification() override { |
| 1377 | increment(Feature: InlineCostFeatureIndex::unsimplified_common_instructions, |
| 1378 | Delta: InstrCost); |
| 1379 | } |
| 1380 | |
| 1381 | void onInitializeSROAArg(AllocaInst *Arg) override { |
| 1382 | auto SROAArgCost = TTI.getCallerAllocaCost(CB: &CandidateCall, AI: Arg); |
| 1383 | SROACosts[Arg] = SROAArgCost; |
| 1384 | SROACostSavingOpportunities += SROAArgCost; |
| 1385 | } |
| 1386 | |
| 1387 | void onAggregateSROAUse(AllocaInst *Arg) override { |
| 1388 | SROACosts.find(Val: Arg)->second += InstrCost; |
| 1389 | SROACostSavingOpportunities += InstrCost; |
| 1390 | } |
| 1391 | |
| 1392 | void onBlockAnalyzed(const BasicBlock *BB) override { |
| 1393 | if (BB->getTerminator()->getNumSuccessors() > 1) |
| 1394 | set(Feature: InlineCostFeatureIndex::is_multiple_blocks, Value: 1); |
| 1395 | Threshold -= SingleBBBonus; |
| 1396 | } |
| 1397 | |
| 1398 | InlineResult finalizeAnalysis() override { |
| 1399 | auto *Caller = CandidateCall.getFunction(); |
| 1400 | if (Caller->hasMinSize()) { |
| 1401 | DominatorTree DT(F); |
| 1402 | LoopInfo LI(DT); |
| 1403 | for (Loop *L : LI) { |
| 1404 | // Ignore loops that will not be executed |
| 1405 | if (DeadBlocks.count(Ptr: L->getHeader())) |
| 1406 | continue; |
| 1407 | increment(Feature: InlineCostFeatureIndex::num_loops, |
| 1408 | Delta: InlineConstants::LoopPenalty); |
| 1409 | } |
| 1410 | } |
| 1411 | set(Feature: InlineCostFeatureIndex::dead_blocks, Value: DeadBlocks.size()); |
| 1412 | set(Feature: InlineCostFeatureIndex::simplified_instructions, |
| 1413 | Value: NumInstructionsSimplified); |
| 1414 | set(Feature: InlineCostFeatureIndex::constant_args, Value: NumConstantArgs); |
| 1415 | set(Feature: InlineCostFeatureIndex::constant_offset_ptr_args, |
| 1416 | Value: NumConstantOffsetPtrArgs); |
| 1417 | set(Feature: InlineCostFeatureIndex::sroa_savings, Value: SROACostSavingOpportunities); |
| 1418 | |
| 1419 | if (NumVectorInstructions <= NumInstructions / 10) |
| 1420 | Threshold -= VectorBonus; |
| 1421 | else if (NumVectorInstructions <= NumInstructions / 2) |
| 1422 | Threshold -= VectorBonus / 2; |
| 1423 | |
| 1424 | set(Feature: InlineCostFeatureIndex::threshold, Value: Threshold); |
| 1425 | |
| 1426 | return InlineResult::success(); |
| 1427 | } |
| 1428 | |
| 1429 | bool shouldStop() override { return false; } |
| 1430 | |
| 1431 | void onLoadEliminationOpportunity() override { |
| 1432 | increment(Feature: InlineCostFeatureIndex::load_elimination, Delta: 1); |
| 1433 | } |
| 1434 | |
| 1435 | InlineResult onAnalysisStart() override { |
| 1436 | increment(Feature: InlineCostFeatureIndex::callsite_cost, |
| 1437 | Delta: -1 * getCallsiteCost(TTI, Call: this->CandidateCall, DL)); |
| 1438 | |
| 1439 | set(Feature: InlineCostFeatureIndex::cold_cc_penalty, |
| 1440 | Value: (F.getCallingConv() == CallingConv::Cold)); |
| 1441 | |
| 1442 | set(Feature: InlineCostFeatureIndex::last_call_to_static_bonus, |
| 1443 | Value: isSoleCallToLocalFunction(CB: CandidateCall, Callee: F)); |
| 1444 | |
| 1445 | // FIXME: we shouldn't repeat this logic in both the Features and Cost |
| 1446 | // analyzer - instead, we should abstract it to a common method in the |
| 1447 | // CallAnalyzer |
| 1448 | int SingleBBBonusPercent = 50; |
| 1449 | int VectorBonusPercent = TTI.getInlinerVectorBonusPercent(); |
| 1450 | Threshold += TTI.adjustInliningThreshold(CB: &CandidateCall); |
| 1451 | Threshold *= TTI.getInliningThresholdMultiplier(); |
| 1452 | SingleBBBonus = Threshold * SingleBBBonusPercent / 100; |
| 1453 | VectorBonus = Threshold * VectorBonusPercent / 100; |
| 1454 | Threshold += (SingleBBBonus + VectorBonus); |
| 1455 | |
| 1456 | return InlineResult::success(); |
| 1457 | } |
| 1458 | |
| 1459 | public: |
| 1460 | InlineCostFeaturesAnalyzer( |
| 1461 | const TargetTransformInfo &TTI, |
| 1462 | function_ref<AssumptionCache &(Function &)> &GetAssumptionCache, |
| 1463 | function_ref<BlockFrequencyInfo &(Function &)> GetBFI, |
| 1464 | function_ref<const TargetLibraryInfo &(Function &)> GetTLI, |
| 1465 | ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE, Function &Callee, |
| 1466 | CallBase &Call) |
| 1467 | : CallAnalyzer(Callee, Call, TTI, GetAssumptionCache, GetBFI, GetTLI, |
| 1468 | PSI) {} |
| 1469 | |
| 1470 | const InlineCostFeatures &features() const { return Cost; } |
| 1471 | }; |
| 1472 | |
| 1473 | } // namespace |
| 1474 | |
| 1475 | /// Test whether the given value is an Alloca-derived function argument. |
| 1476 | bool CallAnalyzer::isAllocaDerivedArg(Value *V) { |
| 1477 | return SROAArgValues.count(Val: V); |
| 1478 | } |
| 1479 | |
| 1480 | void CallAnalyzer::disableSROAForArg(AllocaInst *SROAArg) { |
| 1481 | onDisableSROA(Arg: SROAArg); |
| 1482 | EnabledSROAAllocas.erase(V: SROAArg); |
| 1483 | disableLoadElimination(); |
| 1484 | } |
| 1485 | |
| 1486 | void InlineCostAnnotationWriter::emitInstructionAnnot( |
| 1487 | const Instruction *I, formatted_raw_ostream &OS) { |
| 1488 | // The cost of inlining the given instruction is always printed.
| 1489 | // The threshold delta is printed only when it is non-zero, which happens
| 1490 | // when we decide to grant a bonus at a particular instruction.
| 1491 | std::optional<InstructionCostDetail> Record = ICCA->getCostDetails(I); |
| 1492 | if (!Record) |
| 1493 | OS << "; No analysis for the instruction" ; |
| 1494 | else { |
| 1495 | OS << "; cost before = " << Record->CostBefore |
| 1496 | << ", cost after = " << Record->CostAfter |
| 1497 | << ", threshold before = " << Record->ThresholdBefore |
| 1498 | << ", threshold after = " << Record->ThresholdAfter << ", " ; |
| 1499 | OS << "cost delta = " << Record->getCostDelta(); |
| 1500 | if (Record->hasThresholdChanged()) |
| 1501 | OS << ", threshold delta = " << Record->getThresholdDelta(); |
| 1502 | } |
| 1503 | auto *V = ICCA->getSimplifiedValueUnchecked(V: const_cast<Instruction *>(I)); |
| 1504 | if (V) { |
| 1505 | OS << ", simplified to " ; |
| 1506 | V->print(O&: OS, IsForDebug: true); |
| 1507 | if (auto *VI = dyn_cast<Instruction>(Val: V)) { |
| 1508 | if (VI->getFunction() != I->getFunction()) |
| 1509 | OS << " (caller instruction)" ; |
| 1510 | } else if (auto *VArg = dyn_cast<Argument>(Val: V)) { |
| 1511 | if (VArg->getParent() != I->getFunction()) |
| 1512 | OS << " (caller argument)" ; |
| 1513 | } |
| 1514 | } |
| 1515 | OS << "\n" ; |
| 1516 | } |
| 1517 | |
| 1518 | /// If 'V' maps to a SROA candidate, disable SROA for it. |
| 1519 | void CallAnalyzer::disableSROA(Value *V) { |
| 1520 | if (auto *SROAArg = getSROAArgForValueOrNull(V)) { |
| 1521 | disableSROAForArg(SROAArg); |
| 1522 | } |
| 1523 | } |
| 1524 | |
| 1525 | void CallAnalyzer::disableLoadElimination() { |
| 1526 | if (EnableLoadElimination) { |
| 1527 | onDisableLoadElimination(); |
| 1528 | EnableLoadElimination = false; |
| 1529 | } |
| 1530 | } |
| 1531 | |
| 1532 | /// Accumulate a constant GEP offset into an APInt if possible. |
| 1533 | /// |
| 1534 | /// Returns false if unable to compute the offset for any reason. Respects any |
| 1535 | /// simplified values known during the analysis of this callsite. |
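|      | /// For illustration (a hypothetical GEP, assuming a typical data layout):
|      | ///   %p = getelementptr inbounds {i32, i64}, ptr %base, i32 0, i32 1
|      | /// contributes the field offset of element 1 (commonly 8 bytes), so Offset
|      | /// would accumulate to 8.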
| 1536 | bool CallAnalyzer::accumulateGEPOffset(GEPOperator &GEP, APInt &Offset) { |
| 1537 | unsigned IntPtrWidth = DL.getIndexTypeSizeInBits(Ty: GEP.getType()); |
| 1538 | assert(IntPtrWidth == Offset.getBitWidth()); |
| 1539 | |
| 1540 | for (gep_type_iterator GTI = gep_type_begin(GEP), GTE = gep_type_end(GEP); |
| 1541 | GTI != GTE; ++GTI) { |
| 1542 | ConstantInt *OpC = |
| 1543 | getDirectOrSimplifiedValue<ConstantInt>(V: GTI.getOperand()); |
| 1544 | if (!OpC) |
| 1545 | return false; |
| 1546 | if (OpC->isZero()) |
| 1547 | continue; |
| 1548 | |
| 1549 | // Handle a struct index, which adds its field offset to the pointer. |
| 1550 | if (StructType *STy = GTI.getStructTypeOrNull()) { |
| 1551 | unsigned ElementIdx = OpC->getZExtValue(); |
| 1552 | const StructLayout *SL = DL.getStructLayout(Ty: STy); |
| 1553 | Offset += APInt(IntPtrWidth, SL->getElementOffset(Idx: ElementIdx)); |
| 1554 | continue; |
| 1555 | } |
| 1556 | |
| 1557 | APInt TypeSize(IntPtrWidth, GTI.getSequentialElementStride(DL)); |
| 1558 | Offset += OpC->getValue().sextOrTrunc(width: IntPtrWidth) * TypeSize; |
| 1559 | } |
| 1560 | return true; |
| 1561 | } |
| 1562 | |
| 1563 | /// Use TTI to check whether a GEP is free. |
| 1564 | /// |
| 1565 | /// Respects any simplified values known during the analysis of this callsite. |
| 1566 | bool CallAnalyzer::isGEPFree(GetElementPtrInst &GEP) { |
| 1567 | SmallVector<Value *, 4> Operands; |
| 1568 | Operands.push_back(Elt: GEP.getOperand(i_nocapture: 0)); |
| 1569 | for (const Use &Op : GEP.indices()) |
| 1570 | if (Constant *SimpleOp = getSimplifiedValue<Constant>(V: Op)) |
| 1571 | Operands.push_back(Elt: SimpleOp); |
| 1572 | else |
| 1573 | Operands.push_back(Elt: Op); |
| 1574 | return TTI.getInstructionCost(U: &GEP, Operands, |
| 1575 | CostKind: TargetTransformInfo::TCK_SizeAndLatency) == |
| 1576 | TargetTransformInfo::TCC_Free; |
| 1577 | } |
| 1578 | |
| 1579 | bool CallAnalyzer::visitAlloca(AllocaInst &I) { |
| 1580 | disableSROA(V: I.getOperand(i_nocapture: 0)); |
| 1581 | |
| 1582 | // Check whether inlining will turn a dynamic alloca into a static |
| 1583 | // alloca and handle that case. |
| 1584 | if (I.isArrayAllocation()) { |
| 1585 | Constant *Size = getSimplifiedValue<Constant>(V: I.getArraySize()); |
| 1586 | if (auto *AllocSize = dyn_cast_or_null<ConstantInt>(Val: Size)) { |
| 1587 | // Sometimes a dynamic alloca could be converted into a static alloca |
| 1588 | // after this constant prop, and become a huge static alloca on an |
| 1589 | // unconditional CFG path. Avoid inlining if this is going to happen above |
| 1590 | // a threshold. |
| 1591 | // FIXME: If the threshold is removed or lowered too much, we could end up |
| 1592 | // being too pessimistic and prevent inlining non-problematic code. This |
| 1593 | // could result in unintended perf regressions. A better overall strategy |
| 1594 | // is needed to track stack usage during inlining. |
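|      | // For illustration (a hypothetical callee, not taken from this file): with
|      | //   %buf = alloca i32, i32 %n
|      | // where %n simplifies to 1024 at this call site, inlining would produce a
|      | // 4096-byte static alloca; if the accumulated size exceeds the limit below,
|      | // we conservatively treat it as a dynamic alloca instead.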
| 1595 | Type *Ty = I.getAllocatedType(); |
| 1596 | AllocatedSize = SaturatingMultiplyAdd( |
| 1597 | X: AllocSize->getLimitedValue(), |
| 1598 | Y: DL.getTypeAllocSize(Ty).getKnownMinValue(), A: AllocatedSize); |
| 1599 | if (AllocatedSize > InlineConstants::MaxSimplifiedDynamicAllocaToInline) |
| 1600 | HasDynamicAlloca = true; |
| 1601 | return false; |
| 1602 | } |
| 1603 | } |
| 1604 | |
| 1605 | if (I.isStaticAlloca()) { |
| 1606 | // Accumulate the allocated size if constant and executed once. |
| 1607 | // Note: if AllocSize is a vscale value, this is an underestimate of the
| 1608 | // allocated size, and the allocation also incurs some of the cost of a
| 1609 | // dynamic alloca, even though it is recorded here as a constant-size alloca.
| 1610 | TypeSize AllocSize = I.getAllocationSize(DL).value_or(u: TypeSize::getZero()); |
| 1611 | AllocatedSize = SaturatingAdd(X: AllocSize.getKnownMinValue(), Y: AllocatedSize); |
| 1612 | } else { |
| 1613 | // FIXME: This is overly conservative. Dynamic allocas are inefficient for |
| 1614 | // a variety of reasons, and so we would like to not inline them into |
| 1615 | // functions which don't currently have a dynamic alloca. This simply |
| 1616 | // disables inlining altogether in the presence of a dynamic alloca. |
| 1617 | HasDynamicAlloca = true; |
| 1618 | } |
| 1619 | |
| 1620 | return false; |
| 1621 | } |
| 1622 | |
| 1623 | bool CallAnalyzer::visitPHI(PHINode &I) { |
| 1624 | // FIXME: We need to propagate SROA *disabling* through phi nodes, even |
| 1625 | // though we don't want to propagate its bonuses. The idea is to disable
| 1626 | // SROA if it *might* be used in an inappropriate manner. |
| 1627 | |
| 1628 | // Phi nodes are always zero-cost. |
| 1629 | // FIXME: Pointer sizes may differ between different address spaces, so do we |
| 1630 | // need to use the correct address space in the call to getPointerSizeInBits here?
| 1631 | // Or could we skip the getPointerSizeInBits call completely? As far as I can |
| 1632 | // see the ZeroOffset is used as a dummy value, so we can probably use any |
| 1633 | // bit width for the ZeroOffset? |
| 1634 | APInt ZeroOffset = APInt::getZero(numBits: DL.getPointerSizeInBits(AS: 0)); |
| 1635 | bool CheckSROA = I.getType()->isPointerTy(); |
| 1636 | |
| 1637 | // Track the constant or pointer with constant offset we've seen so far. |
| 1638 | Constant *FirstC = nullptr; |
| 1639 | std::pair<Value *, APInt> FirstBaseAndOffset = {nullptr, ZeroOffset}; |
| 1640 | Value *FirstV = nullptr; |
| 1641 | |
| 1642 | for (unsigned i = 0, e = I.getNumIncomingValues(); i != e; ++i) { |
| 1643 | BasicBlock *Pred = I.getIncomingBlock(i); |
| 1644 | // If the incoming block is dead, skip the incoming block. |
| 1645 | if (DeadBlocks.count(Ptr: Pred)) |
| 1646 | continue; |
| 1647 | // If the parent block of phi is not the known successor of the incoming |
| 1648 | // block, skip the incoming block. |
| 1649 | BasicBlock *KnownSuccessor = KnownSuccessors[Pred]; |
| 1650 | if (KnownSuccessor && KnownSuccessor != I.getParent()) |
| 1651 | continue; |
| 1652 | |
| 1653 | Value *V = I.getIncomingValue(i); |
| 1654 | // If the incoming value is this phi itself, skip the incoming value. |
| 1655 | if (&I == V) |
| 1656 | continue; |
| 1657 | |
| 1658 | Constant *C = getDirectOrSimplifiedValue<Constant>(V); |
| 1659 | |
| 1660 | std::pair<Value *, APInt> BaseAndOffset = {nullptr, ZeroOffset}; |
| 1661 | if (!C && CheckSROA) |
| 1662 | BaseAndOffset = ConstantOffsetPtrs.lookup(Val: V); |
| 1663 | |
| 1664 | if (!C && !BaseAndOffset.first) |
| 1665 | // The incoming value is neither a constant nor a pointer with constant |
| 1666 | // offset; exit early.
| 1667 | return true; |
| 1668 | |
| 1669 | if (FirstC) { |
| 1670 | if (FirstC == C) |
| 1671 | // If we've seen a constant incoming value before and it is the same |
| 1672 | // constant we see this time, continue checking the next incoming value. |
| 1673 | continue; |
| 1674 | // Otherwise early exit because we either see a different constant or saw |
| 1675 | // a constant before but we have a pointer with constant offset this time. |
| 1676 | return true; |
| 1677 | } |
| 1678 | |
| 1679 | if (FirstV) { |
| 1680 | // The same logic as above, but check pointer with constant offset here. |
| 1681 | if (FirstBaseAndOffset == BaseAndOffset) |
| 1682 | continue; |
| 1683 | return true; |
| 1684 | } |
| 1685 | |
| 1686 | if (C) { |
| 1687 | // This is the first time we've seen a constant; record it.
| 1688 | FirstC = C; |
| 1689 | continue; |
| 1690 | } |
| 1691 | |
| 1692 | // The remaining case is that this is the first time we've seen a pointer
| 1693 | // with a constant offset; record it.
| 1694 | FirstV = V; |
| 1695 | FirstBaseAndOffset = BaseAndOffset; |
| 1696 | } |
| 1697 | |
| 1698 | // Check if we can map phi to a constant. |
| 1699 | if (FirstC) { |
| 1700 | SimplifiedValues[&I] = FirstC; |
| 1701 | return true; |
| 1702 | } |
| 1703 | |
| 1704 | // Check if we can map phi to a pointer with constant offset. |
| 1705 | if (FirstBaseAndOffset.first) { |
| 1706 | ConstantOffsetPtrs[&I] = FirstBaseAndOffset; |
| 1707 | |
| 1708 | if (auto *SROAArg = getSROAArgForValueOrNull(V: FirstV)) |
| 1709 | SROAArgValues[&I] = SROAArg; |
| 1710 | } |
| 1711 | |
| 1712 | return true; |
| 1713 | } |
| 1714 | |
| 1715 | /// Check whether we can fold GEPs of constant-offset call site argument pointers.
| 1716 | /// This requires target data and inbounds GEPs. |
| 1717 | /// |
| 1718 | /// \return true if the specified GEP can be folded. |
| 1719 | bool CallAnalyzer::canFoldInboundsGEP(GetElementPtrInst &I) { |
| 1720 | // Check if we have a base + offset for the pointer. |
| 1721 | std::pair<Value *, APInt> BaseAndOffset = |
| 1722 | ConstantOffsetPtrs.lookup(Val: I.getPointerOperand()); |
| 1723 | if (!BaseAndOffset.first) |
| 1724 | return false; |
| 1725 | |
| 1726 | // Check if the offset of this GEP is constant, and if so accumulate it |
| 1727 | // into Offset. |
| 1728 | if (!accumulateGEPOffset(GEP&: cast<GEPOperator>(Val&: I), Offset&: BaseAndOffset.second)) |
| 1729 | return false; |
| 1730 | |
| 1731 | // Add the result as a new mapping to Base + Offset. |
| 1732 | ConstantOffsetPtrs[&I] = std::move(BaseAndOffset); |
| 1733 | |
| 1734 | return true; |
| 1735 | } |
| 1736 | |
| 1737 | bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) { |
| 1738 | auto *SROAArg = getSROAArgForValueOrNull(V: I.getPointerOperand()); |
| 1739 | |
| 1740 | // Lambda to check whether a GEP's indices are all constant. |
| 1741 | auto IsGEPOffsetConstant = [&](GetElementPtrInst &GEP) { |
| 1742 | for (const Use &Op : GEP.indices()) |
| 1743 | if (!getDirectOrSimplifiedValue<Constant>(V: Op)) |
| 1744 | return false; |
| 1745 | return true; |
| 1746 | }; |
| 1747 | |
| 1748 | if (!DisableGEPConstOperand) |
| 1749 | if (simplifyInstruction(I)) |
| 1750 | return true; |
| 1751 | |
| 1752 | if ((I.isInBounds() && canFoldInboundsGEP(I)) || IsGEPOffsetConstant(I)) { |
| 1753 | if (SROAArg) |
| 1754 | SROAArgValues[&I] = SROAArg; |
| 1755 | |
| 1756 | // Constant GEPs are modeled as free. |
| 1757 | return true; |
| 1758 | } |
| 1759 | |
| 1760 | // Variable GEPs will require math and will disable SROA. |
| 1761 | if (SROAArg) |
| 1762 | disableSROAForArg(SROAArg); |
| 1763 | return isGEPFree(GEP&: I); |
| 1764 | } |
| 1765 | |
| 1766 | // Simplify \p Cmp if its RHS is constant and its LHS can be value-tracked.
| 1767 | // This only handles the case where the Cmp instruction guards a recursive
| 1768 | // call and the call's arguments make the Cmp fail/succeed for the recursive call.
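|      | // For illustration (a hypothetical callee, not taken from this file):
|      | //   void f(int n) { if (n > 0) f(0); }
|      | // When analyzing the recursive call f(0), substituting the call argument 0
|      | // for n makes the guarding compare n > 0 fold to false, so the recursion is
|      | // known not to continue past this call site.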
| 1769 | bool CallAnalyzer::simplifyCmpInstForRecCall(CmpInst &Cmp) { |
| 1770 | // Bail out if LHS is not a function argument or RHS is NOT const: |
| 1771 | if (!isa<Argument>(Val: Cmp.getOperand(i_nocapture: 0)) || !isa<Constant>(Val: Cmp.getOperand(i_nocapture: 1))) |
| 1772 | return false; |
| 1773 | auto *CmpOp = Cmp.getOperand(i_nocapture: 0); |
| 1774 | // Make sure that the callsite is recursive: |
| 1775 | if (CandidateCall.getCaller() != &F) |
| 1776 | return false; |
| 1777 | // Only handle the case when the callsite has a single predecessor: |
| 1778 | auto *CallBB = CandidateCall.getParent(); |
| 1779 | auto *Predecessor = CallBB->getSinglePredecessor(); |
| 1780 | if (!Predecessor) |
| 1781 | return false; |
| 1782 | // Check if the callsite is guarded by the same Cmp instruction: |
| 1783 | auto *Br = dyn_cast<BranchInst>(Val: Predecessor->getTerminator()); |
| 1784 | if (!Br || Br->isUnconditional() || Br->getCondition() != &Cmp) |
| 1785 | return false; |
| 1786 | |
| 1787 | // Check whether any argument of the recursive callsite affects the cmp
| 1788 | // instruction:
| 1789 | bool ArgFound = false; |
| 1790 | Value *FuncArg = nullptr, *CallArg = nullptr; |
| 1791 | for (unsigned ArgNum = 0; |
| 1792 | ArgNum < F.arg_size() && ArgNum < CandidateCall.arg_size(); ArgNum++) { |
| 1793 | FuncArg = F.getArg(i: ArgNum); |
| 1794 | CallArg = CandidateCall.getArgOperand(i: ArgNum); |
| 1795 | if (FuncArg == CmpOp && CallArg != CmpOp) { |
| 1796 | ArgFound = true; |
| 1797 | break; |
| 1798 | } |
| 1799 | } |
| 1800 | if (!ArgFound) |
| 1801 | return false; |
| 1802 | |
| 1803 | // Now we have a recursive call that is guarded by a cmp instruction. |
| 1804 | // Check if this cmp can be simplified: |
| 1805 | SimplifyQuery SQ(DL, dyn_cast<Instruction>(Val: CallArg)); |
| 1806 | CondContext CC(&Cmp); |
| 1807 | CC.Invert = (CallBB != Br->getSuccessor(i: 0)); |
| 1808 | SQ.CC = &CC; |
| 1809 | CC.AffectedValues.insert(Ptr: FuncArg); |
| 1810 | Value *SimplifiedInstruction = llvm::simplifyInstructionWithOperands( |
| 1811 | I: cast<CmpInst>(Val: &Cmp), NewOps: {CallArg, Cmp.getOperand(i_nocapture: 1)}, Q: SQ); |
| 1812 | if (auto *ConstVal = dyn_cast_or_null<ConstantInt>(Val: SimplifiedInstruction)) { |
| 1813 | // Make sure that the BB of the recursive call is NOT the true successor |
| 1814 | // of the icmp. In other words, make sure that the recursion depth is 1. |
| 1815 | if ((ConstVal->isOne() && CC.Invert) || |
| 1816 | (ConstVal->isZero() && !CC.Invert)) { |
| 1817 | SimplifiedValues[&Cmp] = ConstVal; |
| 1818 | return true; |
| 1819 | } |
| 1820 | } |
| 1821 | return false; |
| 1822 | } |
| 1823 | |
| 1824 | /// Simplify \p I if its operands are constants and update SimplifiedValues. |
| 1825 | bool CallAnalyzer::simplifyInstruction(Instruction &I) { |
| 1826 | SmallVector<Constant *> COps; |
| 1827 | for (Value *Op : I.operands()) { |
| 1828 | Constant *COp = getDirectOrSimplifiedValue<Constant>(V: Op); |
| 1829 | if (!COp) |
| 1830 | return false; |
| 1831 | COps.push_back(Elt: COp); |
| 1832 | } |
| 1833 | auto *C = ConstantFoldInstOperands(I: &I, Ops: COps, DL); |
| 1834 | if (!C) |
| 1835 | return false; |
| 1836 | SimplifiedValues[&I] = C; |
| 1837 | return true; |
| 1838 | } |
| 1839 | |
| 1840 | /// Try to simplify a call to llvm.is.constant. |
| 1841 | /// |
| 1842 | /// Duplicate the argument checking from CallAnalyzer::simplifyCallSite since |
| 1843 | /// we expect calls of this specific intrinsic to be infrequent. |
| 1844 | /// |
| 1845 | /// FIXME: Given that we know CB's parent (F) caller |
| 1846 | /// (CandidateCall->getParent()->getParent()), we might be able to determine |
| 1847 | /// whether inlining F into F's caller would change how the call to |
| 1848 | /// llvm.is.constant would evaluate. |
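|      | /// For illustration (a sketch of the intent, not a guarantee): if the sole
|      | /// argument maps to a Constant in this inlining context, the call folds to 1;
|      | /// otherwise it folds to 0, on the assumption that later optimization of the
|      | /// inlined body would not make the value constant anyway.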
| 1849 | bool CallAnalyzer::simplifyIntrinsicCallIsConstant(CallBase &CB) { |
| 1850 | Value *Arg = CB.getArgOperand(i: 0); |
| 1851 | auto *C = getDirectOrSimplifiedValue<Constant>(V: Arg); |
| 1852 | |
| 1853 | Type *RT = CB.getFunctionType()->getReturnType(); |
| 1854 | SimplifiedValues[&CB] = ConstantInt::get(Ty: RT, V: C ? 1 : 0); |
| 1855 | return true; |
| 1856 | } |
| 1857 | |
| 1858 | bool CallAnalyzer::simplifyIntrinsicCallObjectSize(CallBase &CB) { |
| 1859 | // As per the langref, "The fourth argument to llvm.objectsize determines if |
| 1860 | // the value should be evaluated at runtime." |
| 1861 | if (cast<ConstantInt>(Val: CB.getArgOperand(i: 3))->isOne()) |
| 1862 | return false; |
| 1863 | |
| 1864 | Value *V = lowerObjectSizeCall(ObjectSize: &cast<IntrinsicInst>(Val&: CB), DL, TLI: nullptr, |
| 1865 | /*MustSucceed=*/true); |
| 1866 | Constant *C = dyn_cast_or_null<Constant>(Val: V); |
| 1867 | if (C) |
| 1868 | SimplifiedValues[&CB] = C; |
| 1869 | return C; |
| 1870 | } |
| 1871 | |
| 1872 | bool CallAnalyzer::visitBitCast(BitCastInst &I) { |
| 1873 | // Propagate constants through bitcasts. |
| 1874 | if (simplifyInstruction(I)) |
| 1875 | return true; |
| 1876 | |
| 1877 | // Track base/offsets through casts |
| 1878 | std::pair<Value *, APInt> BaseAndOffset = |
| 1879 | ConstantOffsetPtrs.lookup(Val: I.getOperand(i_nocapture: 0)); |
| 1880 | // Casts don't change the offset, just wrap it up. |
| 1881 | if (BaseAndOffset.first) |
| 1882 | ConstantOffsetPtrs[&I] = BaseAndOffset; |
| 1883 | |
| 1884 | // Also look for SROA candidates here. |
| 1885 | if (auto *SROAArg = getSROAArgForValueOrNull(V: I.getOperand(i_nocapture: 0))) |
| 1886 | SROAArgValues[&I] = SROAArg; |
| 1887 | |
| 1888 | // Bitcasts are always zero cost. |
| 1889 | return true; |
| 1890 | } |
| 1891 | |
| 1892 | bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) { |
| 1893 | // Propagate constants through ptrtoint. |
| 1894 | if (simplifyInstruction(I)) |
| 1895 | return true; |
| 1896 | |
| 1897 | // Track base/offset pairs when converted to a plain integer provided the |
| 1898 | // integer is large enough to represent the pointer. |
| 1899 | unsigned IntegerSize = I.getType()->getScalarSizeInBits(); |
| 1900 | unsigned AS = I.getOperand(i_nocapture: 0)->getType()->getPointerAddressSpace(); |
| 1901 | if (IntegerSize == DL.getPointerSizeInBits(AS)) { |
| 1902 | std::pair<Value *, APInt> BaseAndOffset = |
| 1903 | ConstantOffsetPtrs.lookup(Val: I.getOperand(i_nocapture: 0)); |
| 1904 | if (BaseAndOffset.first) |
| 1905 | ConstantOffsetPtrs[&I] = BaseAndOffset; |
| 1906 | } |
| 1907 | |
| 1908 | // This is really weird. Technically, ptrtoint will disable SROA. However, |
| 1909 | // unless that ptrtoint is *used* somewhere in the live basic blocks after |
| 1910 | // inlining, it will be nuked, and SROA should proceed. All of the uses which |
| 1911 | // would block SROA would also block SROA if applied directly to a pointer, |
| 1912 | // and so we can just add the integer in here. The only places where SROA is |
| 1913 | // preserved either cannot fire on an integer, or won't in-and-of themselves |
| 1914 | // disable SROA (ext) w/o some later use that we would see and disable. |
| 1915 | if (auto *SROAArg = getSROAArgForValueOrNull(V: I.getOperand(i_nocapture: 0))) |
| 1916 | SROAArgValues[&I] = SROAArg; |
| 1917 | |
| 1918 | return TTI.getInstructionCost(U: &I, CostKind: TargetTransformInfo::TCK_SizeAndLatency) == |
| 1919 | TargetTransformInfo::TCC_Free; |
| 1920 | } |
| 1921 | |
| 1922 | bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) { |
| 1923 | // Propagate constants through inttoptr.
| 1924 | if (simplifyInstruction(I)) |
| 1925 | return true; |
| 1926 | |
| 1927 | // Track base/offset pairs when round-tripped through a pointer without |
| 1928 | // modifications provided the integer is not too large. |
| 1929 | Value *Op = I.getOperand(i_nocapture: 0); |
| 1930 | unsigned IntegerSize = Op->getType()->getScalarSizeInBits(); |
| 1931 | if (IntegerSize <= DL.getPointerTypeSizeInBits(I.getType())) { |
| 1932 | std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Val: Op); |
| 1933 | if (BaseAndOffset.first) |
| 1934 | ConstantOffsetPtrs[&I] = BaseAndOffset; |
| 1935 | } |
| 1936 | |
| 1937 | // "Propagate" SROA here in the same manner as we do for ptrtoint above. |
| 1938 | if (auto *SROAArg = getSROAArgForValueOrNull(V: Op)) |
| 1939 | SROAArgValues[&I] = SROAArg; |
| 1940 | |
| 1941 | return TTI.getInstructionCost(U: &I, CostKind: TargetTransformInfo::TCK_SizeAndLatency) == |
| 1942 | TargetTransformInfo::TCC_Free; |
| 1943 | } |
| 1944 | |
| 1945 | bool CallAnalyzer::visitCastInst(CastInst &I) { |
| 1946 | // Propagate constants through casts. |
| 1947 | if (simplifyInstruction(I)) |
| 1948 | return true; |
| 1949 | |
| 1950 | // Disable SROA in the face of arbitrary casts we don't explicitly list |
| 1951 | // elsewhere. |
| 1952 | disableSROA(V: I.getOperand(i_nocapture: 0)); |
| 1953 | |
| 1954 | // If this is a floating-point cast, and the target says this operation |
| 1955 | // is expensive, this may eventually become a library call. Treat the cost |
| 1956 | // as such. |
| 1957 | switch (I.getOpcode()) { |
| 1958 | case Instruction::FPTrunc: |
| 1959 | case Instruction::FPExt: |
| 1960 | case Instruction::UIToFP: |
| 1961 | case Instruction::SIToFP: |
| 1962 | case Instruction::FPToUI: |
| 1963 | case Instruction::FPToSI: |
| 1964 | if (TTI.getFPOpCost(Ty: I.getType()) == TargetTransformInfo::TCC_Expensive) |
| 1965 | onCallPenalty(); |
| 1966 | break; |
| 1967 | default: |
| 1968 | break; |
| 1969 | } |
| 1970 | |
| 1971 | return TTI.getInstructionCost(U: &I, CostKind: TargetTransformInfo::TCK_SizeAndLatency) == |
| 1972 | TargetTransformInfo::TCC_Free; |
| 1973 | } |
| 1974 | |
| 1975 | bool CallAnalyzer::paramHasAttr(Argument *A, Attribute::AttrKind Attr) { |
| 1976 | return CandidateCall.paramHasAttr(ArgNo: A->getArgNo(), Kind: Attr); |
| 1977 | } |
| 1978 | |
| 1979 | bool CallAnalyzer::isKnownNonNullInCallee(Value *V) { |
| 1980 | // Does the *call site* have the NonNull attribute set on an argument? We |
| 1981 | // use the attribute on the call site to memoize any analysis done in the |
| 1982 | // caller. This will also trip if the callee function has a non-null |
| 1983 | // parameter attribute, but that's a less interesting case because hopefully |
| 1984 | // the callee would already have been simplified based on that. |
| 1985 | if (Argument *A = dyn_cast<Argument>(Val: V)) |
| 1986 | if (paramHasAttr(A, Attr: Attribute::NonNull)) |
| 1987 | return true; |
| 1988 | |
| 1989 | // Is this an alloca in the caller? This is distinct from the attribute case |
| 1990 | // above because attributes aren't updated within the inliner itself and we |
| 1991 | // always want to catch the alloca derived case. |
| 1992 | if (isAllocaDerivedArg(V)) |
| 1993 | // We can actually predict the result of comparisons between an |
| 1994 | // alloca-derived value and null. Note that this fires regardless of |
| 1995 | // SROA firing. |
| 1996 | return true; |
| 1997 | |
| 1998 | return false; |
| 1999 | } |
| 2000 | |
| 2001 | bool CallAnalyzer::allowSizeGrowth(CallBase &Call) { |
| 2002 | // If the normal destination of the invoke or the parent block of the call |
| 2003 | // site is unreachable-terminated, there is little point in inlining this |
| 2004 | // unless there is literally zero cost. |
| 2005 | // FIXME: Note that it is possible that an unreachable-terminated block has a |
| 2006 | // hot entry. For example, in the scenario below, inlining hot_call_X() may
| 2007 | // be beneficial:
| 2008 | // main() { |
| 2009 | // hot_call_1(); |
| 2010 | // ... |
| 2011 | // hot_call_N() |
| 2012 | // exit(0); |
| 2013 | // } |
| 2014 | // For now, we are not handling this corner case here as it is rare in real |
| 2015 | // code. In future, we should elaborate this based on BPI and BFI in more |
| 2016 | // general threshold adjusting heuristics in updateThreshold(). |
| 2017 | if (InvokeInst *II = dyn_cast<InvokeInst>(Val: &Call)) { |
| 2018 | if (isa<UnreachableInst>(Val: II->getNormalDest()->getTerminator())) |
| 2019 | return false; |
| 2020 | } else if (isa<UnreachableInst>(Val: Call.getParent()->getTerminator())) |
| 2021 | return false; |
| 2022 | |
| 2023 | return true; |
| 2024 | } |
| 2025 | |
| 2026 | bool InlineCostCallAnalyzer::isColdCallSite(CallBase &Call, |
| 2027 | BlockFrequencyInfo *CallerBFI) { |
| 2028 | // If global profile summary is available, then callsite's coldness is |
| 2029 | // determined based on that. |
| 2030 | if (PSI && PSI->hasProfileSummary()) |
| 2031 | return PSI->isColdCallSite(CB: Call, BFI: CallerBFI); |
| 2032 | |
| 2033 | // Otherwise we need BFI to be available. |
| 2034 | if (!CallerBFI) |
| 2035 | return false; |
| 2036 | |
| 2037 | // Determine if the callsite is cold relative to caller's entry. We could |
| 2038 | // potentially cache the computation of scaled entry frequency, but the added |
| 2039 | // complexity is not worth it unless this scaling shows up high in the |
| 2040 | // profiles. |
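|      | // For example (illustrative numbers; the cutoff comes from the
|      | // ColdCallSiteRelFreq option): with a relative frequency of, say, 2, a
|      | // callsite executed less than 2% as often as the caller's entry block is
|      | // treated as cold.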
| 2041 | const BranchProbability ColdProb(ColdCallSiteRelFreq, 100); |
| 2042 | auto CallSiteBB = Call.getParent(); |
| 2043 | auto CallSiteFreq = CallerBFI->getBlockFreq(BB: CallSiteBB); |
| 2044 | auto CallerEntryFreq = |
| 2045 | CallerBFI->getBlockFreq(BB: &(Call.getCaller()->getEntryBlock())); |
| 2046 | return CallSiteFreq < CallerEntryFreq * ColdProb; |
| 2047 | } |
| 2048 | |
| 2049 | std::optional<int> |
| 2050 | InlineCostCallAnalyzer::getHotCallSiteThreshold(CallBase &Call, |
| 2051 | BlockFrequencyInfo *CallerBFI) { |
| 2052 | |
| 2053 | // If global profile summary is available, then callsite's hotness is |
| 2054 | // determined based on that. |
| 2055 | if (PSI && PSI->hasProfileSummary() && PSI->isHotCallSite(CB: Call, BFI: CallerBFI)) |
| 2056 | return Params.HotCallSiteThreshold; |
| 2057 | |
| 2058 | // Otherwise we need BFI to be available and to have a locally hot callsite |
| 2059 | // threshold. |
| 2060 | if (!CallerBFI || !Params.LocallyHotCallSiteThreshold) |
| 2061 | return std::nullopt; |
| 2062 | |
| 2063 | // Determine if the callsite is hot relative to caller's entry. We could |
| 2064 | // potentially cache the computation of scaled entry frequency, but the added |
| 2065 | // complexity is not worth it unless this scaling shows up high in the |
| 2066 | // profiles. |
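|      | // For example (illustrative numbers; the cutoff comes from the
|      | // HotCallSiteRelFreq option): with a relative frequency of, say, 60, a
|      | // callsite whose block frequency is at least 60x the caller's entry
|      | // frequency qualifies for the locally hot threshold.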
| 2067 | const BasicBlock *CallSiteBB = Call.getParent(); |
| 2068 | BlockFrequency CallSiteFreq = CallerBFI->getBlockFreq(BB: CallSiteBB); |
| 2069 | BlockFrequency CallerEntryFreq = CallerBFI->getEntryFreq(); |
| 2070 | std::optional<BlockFrequency> Limit = CallerEntryFreq.mul(Factor: HotCallSiteRelFreq); |
| 2071 | if (Limit && CallSiteFreq >= *Limit) |
| 2072 | return Params.LocallyHotCallSiteThreshold; |
| 2073 | |
| 2074 | // Otherwise treat it normally. |
| 2075 | return std::nullopt; |
| 2076 | } |
| 2077 | |
| 2078 | void InlineCostCallAnalyzer::updateThreshold(CallBase &Call, Function &Callee) { |
| 2079 | // If no size growth is allowed for this inlining, set Threshold to 0. |
| 2080 | if (!allowSizeGrowth(Call)) { |
| 2081 | Threshold = 0; |
| 2082 | return; |
| 2083 | } |
| 2084 | |
| 2085 | Function *Caller = Call.getCaller(); |
| 2086 | |
| 2087 | // return min(A, B) if B is valid. |
| 2088 | auto MinIfValid = [](int A, std::optional<int> B) { |
| 2089 | return B ? std::min(a: A, b: *B) : A; |
| 2090 | }; |
| 2091 | |
| 2092 | // return max(A, B) if B is valid. |
| 2093 | auto MaxIfValid = [](int A, std::optional<int> B) { |
| 2094 | return B ? std::max(a: A, b: *B) : A; |
| 2095 | }; |
| 2096 | |
| 2097 | // Various bonus percentages. These are multiplied by Threshold to get the |
| 2098 | // bonus values. |
| 2099 | // SingleBBBonus: This bonus is applied if the callee has a single reachable |
| 2100 | // basic block at the given callsite context. This is speculatively applied |
| 2101 | // and withdrawn if more than one basic block is seen. |
| 2102 | // |
| 2103 | // LastCallToStaticBonus: This large bonus is applied to ensure the inlining
| 2104 | // of the last call to a static function as inlining such functions is |
| 2105 | // guaranteed to reduce code size. |
| 2106 | // |
| 2107 | // These bonus percentages may be set to 0 based on properties of the caller |
| 2108 | // and the callsite. |
| 2109 | int SingleBBBonusPercent = 50; |
| 2110 | int VectorBonusPercent = TTI.getInlinerVectorBonusPercent(); |
| 2111 | int LastCallToStaticBonus = TTI.getInliningLastCallToStaticBonus(); |
| 2112 | |
| 2113 | // Lambda to set all the above bonus and bonus percentages to 0. |
| 2114 | auto DisallowAllBonuses = [&]() { |
| 2115 | SingleBBBonusPercent = 0; |
| 2116 | VectorBonusPercent = 0; |
| 2117 | LastCallToStaticBonus = 0; |
| 2118 | }; |
| 2119 | |
| 2120 | // Use the OptMinSizeThreshold or OptSizeThreshold knob if they are available |
| 2121 | // and reduce the threshold if the caller has the necessary attribute. |
| 2122 | if (Caller->hasMinSize()) { |
| 2123 | Threshold = MinIfValid(Threshold, Params.OptMinSizeThreshold); |
| 2124 | // For minsize, we want to disable the single BB bonus and the vector |
| 2125 | // bonuses, but not the last-call-to-static bonus. Inlining the last call to |
| 2126 | // a static function will, at the minimum, eliminate the parameter setup and |
| 2127 | // call/return instructions. |
| 2128 | SingleBBBonusPercent = 0; |
| 2129 | VectorBonusPercent = 0; |
| 2130 | } else if (Caller->hasOptSize()) |
| 2131 | Threshold = MinIfValid(Threshold, Params.OptSizeThreshold); |
| 2132 | |
| 2133 | // Adjust the threshold based on inlinehint attribute and profile based |
| 2134 | // hotness information if the caller does not have MinSize attribute. |
| 2135 | if (!Caller->hasMinSize()) { |
| 2136 | if (Callee.hasFnAttribute(Kind: Attribute::InlineHint)) |
| 2137 | Threshold = MaxIfValid(Threshold, Params.HintThreshold); |
| 2138 | |
| 2139 | // FIXME: After switching to the new passmanager, simplify the logic below |
| 2140 | // by checking only the callsite hotness/coldness as we will reliably |
| 2141 | // have local profile information. |
| 2142 | // |
| 2143 | // Callsite hotness and coldness can be determined if sample profile is |
| 2144 | // used (which adds hotness metadata to calls) or if caller's |
| 2145 | // BlockFrequencyInfo is available. |
| 2146 | BlockFrequencyInfo *CallerBFI = GetBFI ? &(GetBFI(*Caller)) : nullptr; |
| 2147 | auto HotCallSiteThreshold = getHotCallSiteThreshold(Call, CallerBFI); |
| 2148 | if (!Caller->hasOptSize() && HotCallSiteThreshold) { |
| 2149 | LLVM_DEBUG(dbgs() << "Hot callsite.\n" ); |
| 2150 | // FIXME: This should update the threshold only if it exceeds the |
| 2151 | // current threshold, but AutoFDO + ThinLTO currently relies on this |
| 2152 | // behavior to prevent inlining of hot callsites during ThinLTO |
| 2153 | // compile phase. |
| 2154 | Threshold = *HotCallSiteThreshold; |
| 2155 | } else if (isColdCallSite(Call, CallerBFI)) { |
| 2156 | LLVM_DEBUG(dbgs() << "Cold callsite.\n" ); |
| 2157 | // Do not apply bonuses for a cold callsite including the |
| 2158 | // LastCallToStatic bonus. While this bonus might result in code size |
| 2159 | // reduction, it can cause the size of a non-cold caller to increase |
| 2160 | // preventing it from being inlined. |
| 2161 | DisallowAllBonuses(); |
| 2162 | Threshold = MinIfValid(Threshold, Params.ColdCallSiteThreshold); |
| 2163 | } else if (PSI) { |
| 2164 | // Use callee's global profile information only if we have no way of |
| 2165 | // determining this via callsite information. |
| 2166 | if (PSI->isFunctionEntryHot(F: &Callee)) { |
| 2167 | LLVM_DEBUG(dbgs() << "Hot callee.\n" ); |
| 2168 | // If callsite hotness can not be determined, we may still know |
| 2169 | // that the callee is hot and treat it as a weaker hint for threshold |
| 2170 | // increase. |
| 2171 | Threshold = MaxIfValid(Threshold, Params.HintThreshold); |
| 2172 | } else if (PSI->isFunctionEntryCold(F: &Callee)) { |
| 2173 | LLVM_DEBUG(dbgs() << "Cold callee.\n" ); |
| 2174 | // Do not apply bonuses for a cold callee including the |
| 2175 | // LastCallToStatic bonus. While this bonus might result in code size |
| 2176 | // reduction, it can cause the size of a non-cold caller to increase |
| 2177 | // preventing it from being inlined. |
| 2178 | DisallowAllBonuses(); |
| 2179 | Threshold = MinIfValid(Threshold, Params.ColdThreshold); |
| 2180 | } |
| 2181 | } |
| 2182 | } |
| 2183 | |
| 2184 | Threshold += TTI.adjustInliningThreshold(CB: &Call); |
| 2185 | |
| 2186 | // Finally, take the target-specific inlining threshold multiplier into |
| 2187 | // account. |
| 2188 | Threshold *= TTI.getInliningThresholdMultiplier(); |
| 2189 | |
| 2190 | SingleBBBonus = Threshold * SingleBBBonusPercent / 100; |
| 2191 | VectorBonus = Threshold * VectorBonusPercent / 100; |
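|      | // For example (illustrative numbers only): with Threshold = 225, a single-BB
|      | // bonus of 50% and a vector bonus of, say, 150%, SingleBBBonus = 112 and
|      | // VectorBonus = 337 after integer division.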
| 2192 | |
| 2193 | // If there is only one call of the function, and it has internal linkage, |
| 2194 | // the cost of inlining it drops dramatically. It may seem odd to update |
| 2195 | // Cost in updateThreshold, but the bonus depends on the logic in this method. |
| 2196 | if (isSoleCallToLocalFunction(CB: Call, Callee: F)) { |
| 2197 | addCost(Inc: -LastCallToStaticBonus); |
| 2198 | StaticBonusApplied = LastCallToStaticBonus; |
| 2199 | } |
| 2200 | } |
| 2201 | |
| 2202 | bool CallAnalyzer::visitCmpInst(CmpInst &I) { |
| 2203 | Value *LHS = I.getOperand(i_nocapture: 0), *RHS = I.getOperand(i_nocapture: 1); |
| 2204 | // First try to handle simplified comparisons. |
| 2205 | if (simplifyInstruction(I)) |
| 2206 | return true; |
| 2207 | |
| 2208 | // Try to handle comparison that can be simplified using ValueTracking. |
| 2209 | if (simplifyCmpInstForRecCall(Cmp&: I)) |
| 2210 | return true; |
| 2211 | |
| 2212 | if (I.getOpcode() == Instruction::FCmp) |
| 2213 | return false; |
| 2214 | |
| 2215 | // Otherwise look for a comparison between constant offset pointers with |
| 2216 | // a common base. |
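|      | // For example (illustrative): if LHS is known to be (%base + 8) and RHS is
|      | // (%base + 4), an unsigned greater-than compare folds to true here at no
|      | // cost.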
| 2217 | Value *LHSBase, *RHSBase; |
| 2218 | APInt LHSOffset, RHSOffset; |
| 2219 | std::tie(args&: LHSBase, args&: LHSOffset) = ConstantOffsetPtrs.lookup(Val: LHS); |
| 2220 | if (LHSBase) { |
| 2221 | std::tie(args&: RHSBase, args&: RHSOffset) = ConstantOffsetPtrs.lookup(Val: RHS); |
| 2222 | if (RHSBase && LHSBase == RHSBase) { |
| 2223 | // We have common bases, fold the icmp to a constant based on the |
| 2224 | // offsets. |
| 2225 | SimplifiedValues[&I] = ConstantInt::getBool( |
| 2226 | Ty: I.getType(), |
| 2227 | V: ICmpInst::compare(LHS: LHSOffset, RHS: RHSOffset, Pred: I.getPredicate())); |
| 2228 | ++NumConstantPtrCmps; |
| 2229 | return true; |
| 2230 | } |
| 2231 | } |
| 2232 | |
| 2233 | auto isImplicitNullCheckCmp = [](const CmpInst &I) { |
| 2234 | for (auto *User : I.users()) |
| 2235 | if (auto *Instr = dyn_cast<Instruction>(Val: User)) |
| 2236 | if (!Instr->getMetadata(KindID: LLVMContext::MD_make_implicit)) |
| 2237 | return false; |
| 2238 | return true; |
| 2239 | }; |
| 2240 | |
| 2241 | // If the comparison is an equality comparison with null, we can simplify it |
| 2242 | // if we know the value (argument) can't be null |
| 2243 | if (I.isEquality() && isa<ConstantPointerNull>(Val: I.getOperand(i_nocapture: 1))) { |
| 2244 | if (isKnownNonNullInCallee(V: I.getOperand(i_nocapture: 0))) { |
| 2245 | bool IsNotEqual = I.getPredicate() == CmpInst::ICMP_NE; |
| 2246 | SimplifiedValues[&I] = IsNotEqual ? ConstantInt::getTrue(Ty: I.getType()) |
| 2247 | : ConstantInt::getFalse(Ty: I.getType()); |
| 2248 | return true; |
| 2249 | } |
| 2250 | // Implicit null checks act as unconditional branches and their comparisons |
| 2251 | // should be treated as simplified and free of cost. |
| 2252 | if (isImplicitNullCheckCmp(I)) |
| 2253 | return true; |
| 2254 | } |
| 2255 | return handleSROA(V: I.getOperand(i_nocapture: 0), DoNotDisable: isa<ConstantPointerNull>(Val: I.getOperand(i_nocapture: 1))); |
| 2256 | } |
| 2257 | |
| 2258 | bool CallAnalyzer::visitSub(BinaryOperator &I) { |
| 2259 | // Try to handle a special case: we can fold computing the difference of two |
| 2260 | // constant-related pointers. |
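|      | // For example (illustrative): if LHS is known to be (%base + 16) and RHS is
|      | // (%base + 4) for the same %base, the subtract folds to the constant 12.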
| 2261 | Value *LHS = I.getOperand(i_nocapture: 0), *RHS = I.getOperand(i_nocapture: 1); |
| 2262 | Value *LHSBase, *RHSBase; |
| 2263 | APInt LHSOffset, RHSOffset; |
| 2264 | std::tie(args&: LHSBase, args&: LHSOffset) = ConstantOffsetPtrs.lookup(Val: LHS); |
| 2265 | if (LHSBase) { |
| 2266 | std::tie(args&: RHSBase, args&: RHSOffset) = ConstantOffsetPtrs.lookup(Val: RHS); |
| 2267 | if (RHSBase && LHSBase == RHSBase) { |
| 2268 | // We have common bases, fold the subtract to a constant based on the |
| 2269 | // offsets. |
| 2270 | Constant *CLHS = ConstantInt::get(Context&: LHS->getContext(), V: LHSOffset); |
| 2271 | Constant *CRHS = ConstantInt::get(Context&: RHS->getContext(), V: RHSOffset); |
| 2272 | if (Constant *C = ConstantExpr::getSub(C1: CLHS, C2: CRHS)) { |
| 2273 | SimplifiedValues[&I] = C; |
| 2274 | ++NumConstantPtrDiffs; |
| 2275 | return true; |
| 2276 | } |
| 2277 | } |
| 2278 | } |
| 2279 | |
| 2280 | // Otherwise, fall back to the generic logic for simplifying and handling |
| 2281 | // instructions. |
| 2282 | return Base::visitSub(I); |
| 2283 | } |
| 2284 | |
| 2285 | bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) { |
| 2286 | Value *LHS = I.getOperand(i_nocapture: 0), *RHS = I.getOperand(i_nocapture: 1); |
| 2287 | Constant *CLHS = getDirectOrSimplifiedValue<Constant>(V: LHS); |
| 2288 | Constant *CRHS = getDirectOrSimplifiedValue<Constant>(V: RHS); |
| 2289 | |
| 2290 | Value *SimpleV = nullptr; |
| 2291 | if (auto FI = dyn_cast<FPMathOperator>(Val: &I)) |
| 2292 | SimpleV = simplifyBinOp(Opcode: I.getOpcode(), LHS: CLHS ? CLHS : LHS, RHS: CRHS ? CRHS : RHS, |
| 2293 | FMF: FI->getFastMathFlags(), Q: DL); |
| 2294 | else |
| 2295 | SimpleV = |
| 2296 | simplifyBinOp(Opcode: I.getOpcode(), LHS: CLHS ? CLHS : LHS, RHS: CRHS ? CRHS : RHS, Q: DL); |
| 2297 | |
| 2298 | if (Constant *C = dyn_cast_or_null<Constant>(Val: SimpleV)) |
| 2299 | SimplifiedValues[&I] = C; |
| 2300 | |
| 2301 | if (SimpleV) |
| 2302 | return true; |
| 2303 | |
| 2304 | // Disable any SROA on arguments to arbitrary, unsimplified binary operators. |
| 2305 | disableSROA(V: LHS); |
| 2306 | disableSROA(V: RHS); |
| 2307 | |
| 2308 | // If the instruction is floating point, and the target says this operation |
| 2309 | // is expensive, this may eventually become a library call. Treat the cost |
| 2310 | // as such, unless it's fneg, which can be implemented with an xor.
| 2311 | using namespace llvm::PatternMatch; |
| 2312 | if (I.getType()->isFloatingPointTy() && |
| 2313 | TTI.getFPOpCost(Ty: I.getType()) == TargetTransformInfo::TCC_Expensive && |
| 2314 | !match(V: &I, P: m_FNeg(X: m_Value()))) |
| 2315 | onCallPenalty(); |
| 2316 | |
| 2317 | return false; |
| 2318 | } |
| 2319 | |
| 2320 | bool CallAnalyzer::visitFNeg(UnaryOperator &I) { |
| 2321 | Value *Op = I.getOperand(i_nocapture: 0); |
| 2322 | Constant *COp = getDirectOrSimplifiedValue<Constant>(V: Op); |
| 2323 | |
| 2324 | Value *SimpleV = simplifyFNegInst( |
| 2325 | Op: COp ? COp : Op, FMF: cast<FPMathOperator>(Val&: I).getFastMathFlags(), Q: DL); |
| 2326 | |
| 2327 | if (Constant *C = dyn_cast_or_null<Constant>(Val: SimpleV)) |
| 2328 | SimplifiedValues[&I] = C; |
| 2329 | |
| 2330 | if (SimpleV) |
| 2331 | return true; |
| 2332 | |
| 2333 | // Disable any SROA on arguments to arbitrary, unsimplified fneg. |
| 2334 | disableSROA(V: Op); |
| 2335 | |
| 2336 | return false; |
| 2337 | } |
| 2338 | |
| 2339 | bool CallAnalyzer::visitLoad(LoadInst &I) { |
| 2340 | if (handleSROA(V: I.getPointerOperand(), DoNotDisable: I.isSimple())) |
| 2341 | return true; |
| 2342 | |
| 2343 | // If the data is already loaded from this address and hasn't been clobbered |
| 2344 | // by any stores or calls, this load is likely to be redundant and can be |
| 2345 | // eliminated. |
| 2346 | if (EnableLoadElimination && |
| 2347 | !LoadAddrSet.insert(Ptr: I.getPointerOperand()).second && I.isUnordered()) { |
| 2348 | onLoadEliminationOpportunity(); |
| 2349 | return true; |
| 2350 | } |
| 2351 | |
| 2352 | onMemAccess(); |
| 2353 | return false; |
| 2354 | } |
| 2355 | |
| 2356 | bool CallAnalyzer::visitStore(StoreInst &I) { |
| 2357 | if (handleSROA(V: I.getPointerOperand(), DoNotDisable: I.isSimple())) |
| 2358 | return true; |
| 2359 | |
| 2360 | // The store can potentially clobber loads and prevent repeated loads from |
| 2361 | // being eliminated. |
| 2362 | // FIXME: |
| 2363 | // 1. We can probably keep an initial set of eliminable loads subtracted
| 2364 | // from the cost even when we finally see a store. We just need to disable |
| 2365 | // *further* accumulation of elimination savings. |
| 2366 | // 2. We should probably at some point thread MemorySSA for the callee into |
| 2367 | // this and then use that to actually compute *really* precise savings. |
| 2368 | disableLoadElimination(); |
| 2369 | |
| 2370 | onMemAccess(); |
| 2371 | return false; |
| 2372 | } |
| 2373 | |
| 2374 | bool CallAnalyzer::visitExtractValue(ExtractValueInst &I) {
| 2375 | Value *Op = I.getAggregateOperand(); |
| 2376 | |
| 2377 | // Special handling, because we want to simplify extractvalue with a |
| 2378 | // potential insertvalue from the caller. |
| 2379 | if (Value *SimpleOp = getSimplifiedValueUnchecked(V: Op)) { |
| 2380 | SimplifyQuery SQ(DL); |
| 2381 | Value *SimpleV = simplifyExtractValueInst(Agg: SimpleOp, Idxs: I.getIndices(), Q: SQ); |
| 2382 | if (SimpleV) { |
| 2383 | SimplifiedValues[&I] = SimpleV; |
| 2384 | return true; |
| 2385 | } |
| 2386 | } |
| 2387 | |
| 2388 | // SROA can't look through these, but they may be free. |
| 2389 | return Base::visitExtractValue(I); |
| 2390 | } |
| 2391 | |
| 2392 | bool CallAnalyzer::visitInsertValue(InsertValueInst &I) { |
| 2393 | // Constant folding for insert value is trivial. |
| 2394 | if (simplifyInstruction(I)) |
| 2395 | return true; |
| 2396 | |
| 2397 | // SROA can't look through these, but they may be free. |
| 2398 | return Base::visitInsertValue(I); |
| 2399 | } |
| 2400 | |
| 2401 | /// Try to simplify a call site. |
| 2402 | /// |
| 2403 | /// Takes a concrete function and callsite and tries to actually simplify it by |
| 2404 | /// analyzing the arguments and call itself with instsimplify. Returns true if |
| 2405 | /// it has simplified the callsite to some other entity (a constant), making it |
| 2406 | /// free. |
| 2407 | bool CallAnalyzer::simplifyCallSite(Function *F, CallBase &Call) { |
| 2408 | // FIXME: Using the instsimplify logic directly for this is inefficient |
| 2409 | // because we have to continually rebuild the argument list even when no |
| 2410 | // simplifications can be performed. Until that is fixed with remapping |
| 2411 | // inside of instsimplify, directly constant fold calls here. |
| 2412 | if (!canConstantFoldCallTo(Call: &Call, F)) |
| 2413 | return false; |
| 2414 | |
| 2415 | // Try to re-map the arguments to constants. |
| 2416 | SmallVector<Constant *, 4> ConstantArgs; |
| 2417 | ConstantArgs.reserve(N: Call.arg_size()); |
| 2418 | for (Value *I : Call.args()) { |
| 2419 | Constant *C = getDirectOrSimplifiedValue<Constant>(V: I); |
| 2420 | if (!C) |
| 2421 | return false; // This argument doesn't map to a constant. |
| 2422 | |
| 2423 | ConstantArgs.push_back(Elt: C); |
| 2424 | } |
| 2425 | if (Constant *C = ConstantFoldCall(Call: &Call, F, Operands: ConstantArgs)) { |
| 2426 | SimplifiedValues[&Call] = C; |
| 2427 | return true; |
| 2428 | } |
| 2429 | |
| 2430 | return false; |
| 2431 | } |
| 2432 | |
| 2433 | bool CallAnalyzer::isLoweredToCall(Function *F, CallBase &Call) { |
| 2434 | const TargetLibraryInfo *TLI = GetTLI ? &GetTLI(*F) : nullptr; |
| 2435 | LibFunc LF; |
| 2436 | if (!TLI || !TLI->getLibFunc(FDecl: *F, F&: LF) || !TLI->has(F: LF)) |
| 2437 | return TTI.isLoweredToCall(F); |
| 2438 | |
| 2439 | switch (LF) { |
| 2440 | case LibFunc_memcpy_chk: |
| 2441 | case LibFunc_memmove_chk: |
| 2442 | case LibFunc_mempcpy_chk: |
| 2443 | case LibFunc_memset_chk: { |
| 2444 | // Calls to __memcpy_chk whose length is known to fit within the object |
| 2445 | // size will eventually be replaced by inline stores. Therefore, these |
| 2446 | // should not incur a call penalty. This is only really relevant on |
| 2447 | // platforms whose headers redirect memcpy to __memcpy_chk (e.g. Darwin), as |
| 2448 | // other platforms use memcpy intrinsics, which are already exempt from the |
| 2449 | // call penalty. |
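|  |     // For illustration only (hypothetical source): a fortified call such as |
|  |     // __memcpy_chk(dst, src, /*len=*/16, /*objsize=*/64), where both the |
|  |     // length and object size are constants and 16 <= 64, is expected to |
|  |     // become inline stores and therefore gets no call penalty. |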
| 2450 | auto *LenOp = getDirectOrSimplifiedValue<ConstantInt>(V: Call.getOperand(i_nocapture: 2)); |
| 2451 | auto *ObjSizeOp = |
| 2452 | getDirectOrSimplifiedValue<ConstantInt>(V: Call.getOperand(i_nocapture: 3)); |
| 2453 | if (LenOp && ObjSizeOp && |
| 2454 | LenOp->getLimitedValue() <= ObjSizeOp->getLimitedValue()) { |
| 2455 | return false; |
| 2456 | } |
| 2457 | break; |
| 2458 | } |
| 2459 | default: |
| 2460 | break; |
| 2461 | } |
| 2462 | |
| 2463 | return TTI.isLoweredToCall(F); |
| 2464 | } |
| 2465 | |
| 2466 | bool CallAnalyzer::visitCallBase(CallBase &Call) { |
| 2467 | if (!onCallBaseVisitStart(Call)) |
| 2468 | return true; |
| 2469 | |
| 2470 | if (Call.hasFnAttr(Kind: Attribute::ReturnsTwice) && |
| 2471 | !F.hasFnAttribute(Kind: Attribute::ReturnsTwice)) { |
| 2472 | // This aborts the entire analysis. |
| 2473 | ExposesReturnsTwice = true; |
| 2474 | return false; |
| 2475 | } |
| 2476 | if (isa<CallInst>(Val: Call) && cast<CallInst>(Val&: Call).cannotDuplicate()) |
| 2477 | ContainsNoDuplicateCall = true; |
| 2478 | |
| 2479 | if (InlineAsm *InlineAsmOp = dyn_cast<InlineAsm>(Val: Call.getCalledOperand())) |
| 2480 | onInlineAsm(Arg: *InlineAsmOp); |
| 2481 | |
| 2482 | Function *F = Call.getCalledFunction(); |
| 2483 | bool IsIndirectCall = !F; |
| 2484 | if (IsIndirectCall) { |
| 2485 | // Check if this happens to be an indirect function call to a known function |
| 2486 | // in this inline context. If not, we've done all we can. |
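|  |     // For illustration: a callee that invokes a function-pointer parameter |
|  |     // can be analyzed as a direct call here when this particular caller |
|  |     // passes a known function constant for that parameter. |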
| 2487 | Value *Callee = Call.getCalledOperand(); |
| 2488 | F = getSimplifiedValue<Function>(V: Callee); |
| 2489 | if (!F || F->getFunctionType() != Call.getFunctionType()) { |
| 2490 | onCallArgumentSetup(Call); |
| 2491 | |
| 2492 | if (!Call.onlyReadsMemory()) |
| 2493 | disableLoadElimination(); |
| 2494 | return Base::visitCallBase(I&: Call); |
| 2495 | } |
| 2496 | } |
| 2497 | |
| 2498 | assert(F && "Expected a call to a known function" ); |
| 2499 | |
| 2500 | // When we have a concrete function, first try to simplify it directly. |
| 2501 | if (simplifyCallSite(F, Call)) |
| 2502 | return true; |
| 2503 | |
| 2504 | // Next check if it is an intrinsic we know about. |
| 2505 | // FIXME: Lift this into part of the InstVisitor. |
| 2506 | if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Val: &Call)) { |
| 2507 | switch (II->getIntrinsicID()) { |
| 2508 | default: |
| 2509 | if (!Call.onlyReadsMemory() && !isAssumeLikeIntrinsic(I: II)) |
| 2510 | disableLoadElimination(); |
| 2511 | return Base::visitCallBase(I&: Call); |
| 2512 | |
| 2513 | case Intrinsic::load_relative: |
| 2514 | onLoadRelativeIntrinsic(); |
| 2515 | return false; |
| 2516 | |
| 2517 | case Intrinsic::memset: |
| 2518 | case Intrinsic::memcpy: |
| 2519 | case Intrinsic::memmove: |
| 2520 | disableLoadElimination(); |
| 2521 | // SROA can usually chew through these intrinsics, but they aren't free. |
| 2522 | return false; |
| 2523 | case Intrinsic::icall_branch_funnel: |
| 2524 | case Intrinsic::localescape: |
| 2525 | HasUninlineableIntrinsic = true; |
| 2526 | return false; |
| 2527 | case Intrinsic::vastart: |
| 2528 | InitsVargArgs = true; |
| 2529 | return false; |
| 2530 | case Intrinsic::launder_invariant_group: |
| 2531 | case Intrinsic::strip_invariant_group: |
| 2532 | if (auto *SROAArg = getSROAArgForValueOrNull(V: II->getOperand(i_nocapture: 0))) |
| 2533 | SROAArgValues[II] = SROAArg; |
| 2534 | return true; |
| 2535 | case Intrinsic::is_constant: |
| 2536 | return simplifyIntrinsicCallIsConstant(CB&: Call); |
| 2537 | case Intrinsic::objectsize: |
| 2538 | return simplifyIntrinsicCallObjectSize(CB&: Call); |
| 2539 | } |
| 2540 | } |
| 2541 | |
| 2542 | if (F == Call.getFunction()) { |
| 2543 | // This flag will fully abort the analysis, so don't bother with anything |
| 2544 | // else. |
| 2545 | IsRecursiveCall = true; |
| 2546 | if (!AllowRecursiveCall) |
| 2547 | return false; |
| 2548 | } |
| 2549 | |
| 2550 | if (isLoweredToCall(F, Call)) { |
| 2551 | onLoweredCall(F, Call, IsIndirectCall); |
| 2552 | } |
| 2553 | |
| 2554 | if (!(Call.onlyReadsMemory() || (IsIndirectCall && F->onlyReadsMemory()))) |
| 2555 | disableLoadElimination(); |
| 2556 | return Base::visitCallBase(I&: Call); |
| 2557 | } |
| 2558 | |
| 2559 | bool CallAnalyzer::visitReturnInst(ReturnInst &RI) { |
| 2560 | // At least one return instruction will be free after inlining. |
| 2561 | bool Free = !HasReturn; |
| 2562 | HasReturn = true; |
| 2563 | return Free; |
| 2564 | } |
| 2565 | |
| 2566 | bool CallAnalyzer::visitBranchInst(BranchInst &BI) { |
| 2567 | // We model unconditional branches as essentially free -- they really |
| 2568 | // shouldn't exist at all, but handling them makes the behavior of the |
| 2569 | // inliner more regular and predictable. Interestingly, conditional branches |
| 2570 | // which will fold away are also free. |
| 2571 | return BI.isUnconditional() || |
| 2572 | getDirectOrSimplifiedValue<ConstantInt>(V: BI.getCondition()) || |
| 2573 | BI.getMetadata(KindID: LLVMContext::MD_make_implicit); |
| 2574 | } |
| 2575 | |
| 2576 | bool CallAnalyzer::visitSelectInst(SelectInst &SI) { |
| 2577 | bool CheckSROA = SI.getType()->isPointerTy(); |
| 2578 | Value *TrueVal = SI.getTrueValue(); |
| 2579 | Value *FalseVal = SI.getFalseValue(); |
| 2580 | |
| 2581 | Constant *TrueC = getDirectOrSimplifiedValue<Constant>(V: TrueVal); |
| 2582 | Constant *FalseC = getDirectOrSimplifiedValue<Constant>(V: FalseVal); |
| 2583 | Constant *CondC = getSimplifiedValue<Constant>(V: SI.getCondition()); |
| 2584 | |
| 2585 | if (!CondC) { |
| 2586 | // Select C, X, X => X |
| 2587 | if (TrueC == FalseC && TrueC) { |
| 2588 | SimplifiedValues[&SI] = TrueC; |
| 2589 | return true; |
| 2590 | } |
| 2591 | |
| 2592 | if (!CheckSROA) |
| 2593 | return Base::visitSelectInst(I&: SI); |
| 2594 | |
| 2595 | std::pair<Value *, APInt> TrueBaseAndOffset = |
| 2596 | ConstantOffsetPtrs.lookup(Val: TrueVal); |
| 2597 | std::pair<Value *, APInt> FalseBaseAndOffset = |
| 2598 | ConstantOffsetPtrs.lookup(Val: FalseVal); |
| 2599 | if (TrueBaseAndOffset == FalseBaseAndOffset && TrueBaseAndOffset.first) { |
| 2600 | ConstantOffsetPtrs[&SI] = TrueBaseAndOffset; |
| 2601 | |
| 2602 | if (auto *SROAArg = getSROAArgForValueOrNull(V: TrueVal)) |
| 2603 | SROAArgValues[&SI] = SROAArg; |
| 2604 | return true; |
| 2605 | } |
| 2606 | |
| 2607 | return Base::visitSelectInst(I&: SI); |
| 2608 | } |
| 2609 | |
| 2610 | // Select condition is a constant. |
| 2611 | Value *SelectedV = CondC->isAllOnesValue() ? TrueVal |
| 2612 | : (CondC->isNullValue()) ? FalseVal |
| 2613 | : nullptr; |
| 2614 | if (!SelectedV) { |
| 2615 |     // Condition is a vector constant that is not all 1s or all 0s. If all |
| 2616 |     // operands are constants, ConstantFoldSelectInstruction() can handle |
| 2617 |     // cases such as vector selects. |
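|  |     // For example (illustrative), a condition such as <i1 true, i1 false> |
|  |     // selects lanes from both operands, so folding is only possible when |
|  |     // both TrueVal and FalseVal are themselves constants. |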
| 2618 | if (TrueC && FalseC) { |
| 2619 | if (auto *C = ConstantFoldSelectInstruction(Cond: CondC, V1: TrueC, V2: FalseC)) { |
| 2620 | SimplifiedValues[&SI] = C; |
| 2621 | return true; |
| 2622 | } |
| 2623 | } |
| 2624 | return Base::visitSelectInst(I&: SI); |
| 2625 | } |
| 2626 | |
| 2627 | // Condition is either all 1s or all 0s. SI can be simplified. |
| 2628 | if (Constant *SelectedC = dyn_cast<Constant>(Val: SelectedV)) { |
| 2629 | SimplifiedValues[&SI] = SelectedC; |
| 2630 | return true; |
| 2631 | } |
| 2632 | |
| 2633 | if (!CheckSROA) |
| 2634 | return true; |
| 2635 | |
| 2636 | std::pair<Value *, APInt> BaseAndOffset = |
| 2637 | ConstantOffsetPtrs.lookup(Val: SelectedV); |
| 2638 | if (BaseAndOffset.first) { |
| 2639 | ConstantOffsetPtrs[&SI] = BaseAndOffset; |
| 2640 | |
| 2641 | if (auto *SROAArg = getSROAArgForValueOrNull(V: SelectedV)) |
| 2642 | SROAArgValues[&SI] = SROAArg; |
| 2643 | } |
| 2644 | |
| 2645 | return true; |
| 2646 | } |
| 2647 | |
| 2648 | bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) { |
| 2649 | // We model unconditional switches as free, see the comments on handling |
| 2650 | // branches. |
| 2651 | if (getDirectOrSimplifiedValue<ConstantInt>(V: SI.getCondition())) |
| 2652 | return true; |
| 2653 | |
| 2654 | // Assume the most general case where the switch is lowered into |
| 2655 | // either a jump table, bit test, or a balanced binary tree consisting of |
| 2656 | // case clusters without merging adjacent clusters with the same |
| 2657 | // destination. We do not consider the switches that are lowered with a mix |
| 2658 | // of jump table/bit test/binary search tree. The cost of the switch is |
| 2659 | // proportional to the size of the tree or the size of jump table range. |
| 2660 | // |
| 2661 | // NB: We convert large switches which are just used to initialize large phi |
| 2662 | // nodes to lookup tables instead in simplifycfg, so this shouldn't prevent |
| 2663 | // inlining those. It will prevent inlining in cases where the optimization |
| 2664 | // does not (yet) fire. |
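|  |   // For illustration: a switch lowered as a jump table over a 0..255 case |
|  |   // range is charged in proportion to that 256-entry range, while one |
|  |   // lowered as a balanced binary search tree is charged in proportion to |
|  |   // its number of case clusters. |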
| 2665 | |
| 2666 | unsigned JumpTableSize = 0; |
| 2667 | BlockFrequencyInfo *BFI = GetBFI ? &(GetBFI(F)) : nullptr; |
| 2668 | unsigned NumCaseCluster = |
| 2669 | TTI.getEstimatedNumberOfCaseClusters(SI, JTSize&: JumpTableSize, PSI, BFI); |
| 2670 | |
| 2671 | onFinalizeSwitch(JumpTableSize, NumCaseCluster, DefaultDestUnreachable: SI.defaultDestUnreachable()); |
| 2672 | return false; |
| 2673 | } |
| 2674 | |
| 2675 | bool CallAnalyzer::visitIndirectBrInst(IndirectBrInst &IBI) { |
| 2676 |   // We never want to inline functions that contain an indirectbr. Inlining |
| 2677 |   // would be incorrect because all the blockaddresses (in static global |
| 2678 |   // initializers, for example) would still refer to the original function, |
| 2679 |   // and the indirect jump would jump from the inlined copy of the function |
| 2680 |   // into the original function, which is undefined behavior. |
| 2681 | // FIXME: This logic isn't really right; we can safely inline functions with |
| 2682 | // indirectbr's as long as no other function or global references the |
| 2683 | // blockaddress of a block within the current function. |
| 2684 | HasIndirectBr = true; |
| 2685 | return false; |
| 2686 | } |
| 2687 | |
| 2688 | bool CallAnalyzer::visitResumeInst(ResumeInst &RI) { |
| 2689 | // FIXME: It's not clear that a single instruction is an accurate model for |
| 2690 | // the inline cost of a resume instruction. |
| 2691 | return false; |
| 2692 | } |
| 2693 | |
| 2694 | bool CallAnalyzer::visitCleanupReturnInst(CleanupReturnInst &CRI) { |
| 2695 | // FIXME: It's not clear that a single instruction is an accurate model for |
| 2696 | // the inline cost of a cleanupret instruction. |
| 2697 | return false; |
| 2698 | } |
| 2699 | |
| 2700 | bool CallAnalyzer::visitCatchReturnInst(CatchReturnInst &CRI) { |
| 2701 | // FIXME: It's not clear that a single instruction is an accurate model for |
| 2702 | // the inline cost of a catchret instruction. |
| 2703 | return false; |
| 2704 | } |
| 2705 | |
| 2706 | bool CallAnalyzer::visitUnreachableInst(UnreachableInst &I) { |
| 2707 |   // FIXME: It might be reasonable to discount the cost of instructions leading |
| 2708 | // to unreachable as they have the lowest possible impact on both runtime and |
| 2709 | // code size. |
| 2710 | return true; // No actual code is needed for unreachable. |
| 2711 | } |
| 2712 | |
| 2713 | bool CallAnalyzer::visitInstruction(Instruction &I) { |
| 2714 | // Some instructions are free. All of the free intrinsics can also be |
| 2715 | // handled by SROA, etc. |
| 2716 | if (TTI.getInstructionCost(U: &I, CostKind: TargetTransformInfo::TCK_SizeAndLatency) == |
| 2717 | TargetTransformInfo::TCC_Free) |
| 2718 | return true; |
| 2719 | |
| 2720 | // We found something we don't understand or can't handle. Mark any SROA-able |
| 2721 | // values in the operand list as no longer viable. |
| 2722 | for (const Use &Op : I.operands()) |
| 2723 | disableSROA(V: Op); |
| 2724 | |
| 2725 | return false; |
| 2726 | } |
| 2727 | |
| 2728 | /// Analyze a basic block for its contribution to the inline cost. |
| 2729 | /// |
| 2730 | /// This method walks the analyzer over every instruction in the given basic |
| 2731 | /// block and accounts for their cost during inlining at this callsite. It |
| 2732 | /// aborts early if the threshold has been exceeded or an impossible to inline |
| 2733 | /// construct has been detected. It returns false if inlining is no longer |
| 2734 | /// viable, and true if inlining remains viable. |
| 2735 | InlineResult |
| 2736 | CallAnalyzer::analyzeBlock(BasicBlock *BB, |
| 2737 | const SmallPtrSetImpl<const Value *> &EphValues) { |
| 2738 | for (Instruction &I : *BB) { |
| 2739 |     // FIXME: Currently, the number of instructions in a function, regardless of |
| 2740 |     // our ability to simplify them during inlining to constants or dead code, |
| 2741 |     // is used by the vector bonus heuristic. As long as that's true, |
| 2742 | // we have to special case debug intrinsics here to prevent differences in |
| 2743 | // inlining due to debug symbols. Eventually, the number of unsimplified |
| 2744 | // instructions shouldn't factor into the cost computation, but until then, |
| 2745 | // hack around it here. |
| 2746 | // Similarly, skip pseudo-probes. |
| 2747 | if (I.isDebugOrPseudoInst()) |
| 2748 | continue; |
| 2749 | |
| 2750 | // Skip ephemeral values. |
| 2751 | if (EphValues.count(Ptr: &I)) |
| 2752 | continue; |
| 2753 | |
| 2754 | ++NumInstructions; |
| 2755 | if (isa<ExtractElementInst>(Val: I) || I.getType()->isVectorTy()) |
| 2756 | ++NumVectorInstructions; |
| 2757 | |
| 2758 | // If the instruction simplified to a constant, there is no cost to this |
| 2759 | // instruction. Visit the instructions using our InstVisitor to account for |
| 2760 | // all of the per-instruction logic. The visit tree returns true if we |
| 2761 | // consumed the instruction in any way, and false if the instruction's base |
| 2762 | // cost should count against inlining. |
| 2763 | onInstructionAnalysisStart(I: &I); |
| 2764 | |
| 2765 | if (Base::visit(I: &I)) |
| 2766 | ++NumInstructionsSimplified; |
| 2767 | else |
| 2768 | onMissedSimplification(); |
| 2769 | |
| 2770 | onInstructionAnalysisFinish(I: &I); |
| 2771 | using namespace ore; |
| 2772 |     // If visiting this instruction detected an uninlinable pattern, abort. |
| 2773 | InlineResult IR = InlineResult::success(); |
| 2774 | if (IsRecursiveCall && !AllowRecursiveCall) |
| 2775 | IR = InlineResult::failure(Reason: "recursive" ); |
| 2776 | else if (ExposesReturnsTwice) |
| 2777 | IR = InlineResult::failure(Reason: "exposes returns twice" ); |
| 2778 | else if (HasDynamicAlloca) |
| 2779 | IR = InlineResult::failure(Reason: "dynamic alloca" ); |
| 2780 | else if (HasIndirectBr) |
| 2781 | IR = InlineResult::failure(Reason: "indirect branch" ); |
| 2782 | else if (HasUninlineableIntrinsic) |
| 2783 | IR = InlineResult::failure(Reason: "uninlinable intrinsic" ); |
| 2784 | else if (InitsVargArgs) |
| 2785 | IR = InlineResult::failure(Reason: "varargs" ); |
| 2786 | if (!IR.isSuccess()) { |
| 2787 | if (ORE) |
| 2788 | ORE->emit(RemarkBuilder: [&]() { |
| 2789 | return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline" , |
| 2790 | &CandidateCall) |
| 2791 | << NV("Callee" , &F) << " has uninlinable pattern (" |
| 2792 | << NV("InlineResult" , IR.getFailureReason()) |
| 2793 | << ") and cost is not fully computed" ; |
| 2794 | }); |
| 2795 | return IR; |
| 2796 | } |
| 2797 | |
| 2798 | // If the caller is a recursive function then we don't want to inline |
| 2799 | // functions which allocate a lot of stack space because it would increase |
| 2800 | // the caller stack usage dramatically. |
| 2801 | if (IsCallerRecursive && AllocatedSize > RecurStackSizeThreshold) { |
| 2802 | auto IR = |
| 2803 | InlineResult::failure(Reason: "recursive and allocates too much stack space" ); |
| 2804 | if (ORE) |
| 2805 | ORE->emit(RemarkBuilder: [&]() { |
| 2806 | return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline" , |
| 2807 | &CandidateCall) |
| 2808 | << NV("Callee" , &F) << " is " |
| 2809 | << NV("InlineResult" , IR.getFailureReason()) |
| 2810 | << ". Cost is not fully computed" ; |
| 2811 | }); |
| 2812 | return IR; |
| 2813 | } |
| 2814 | |
| 2815 | if (shouldStop()) |
| 2816 | return InlineResult::failure( |
| 2817 | Reason: "Call site analysis is not favorable to inlining." ); |
| 2818 | } |
| 2819 | |
| 2820 | return InlineResult::success(); |
| 2821 | } |
| 2822 | |
| 2823 | /// Compute the base pointer and cumulative constant offsets for V. |
| 2824 | /// |
| 2825 | /// This strips all constant offsets off of V, leaving it the base pointer, and |
| 2826 | /// accumulates the total constant offset applied in the returned constant. It |
| 2827 | /// returns null if V is not a pointer, and returns the constant '0' if there |
| 2828 | /// are no constant offsets applied. |
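|  | /// |
|  | /// For illustration only (hypothetical IR): given |
|  | ///   %p = getelementptr inbounds [4 x i32], ptr %base, i64 0, i64 2 |
|  | /// this rewrites V to the base pointer %base and returns the byte offset 8. |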
| 2829 | ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) { |
| 2830 | if (!V->getType()->isPointerTy()) |
| 2831 | return nullptr; |
| 2832 | |
| 2833 | unsigned AS = V->getType()->getPointerAddressSpace(); |
| 2834 | unsigned IntPtrWidth = DL.getIndexSizeInBits(AS); |
| 2835 | APInt Offset = APInt::getZero(numBits: IntPtrWidth); |
| 2836 | |
| 2837 | // Even though we don't look through PHI nodes, we could be called on an |
| 2838 | // instruction in an unreachable block, which may be on a cycle. |
| 2839 | SmallPtrSet<Value *, 4> Visited; |
| 2840 | Visited.insert(Ptr: V); |
| 2841 | do { |
| 2842 | if (GEPOperator *GEP = dyn_cast<GEPOperator>(Val: V)) { |
| 2843 | if (!GEP->isInBounds() || !accumulateGEPOffset(GEP&: *GEP, Offset)) |
| 2844 | return nullptr; |
| 2845 | V = GEP->getPointerOperand(); |
| 2846 | } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(Val: V)) { |
| 2847 | if (GA->isInterposable()) |
| 2848 | break; |
| 2849 | V = GA->getAliasee(); |
| 2850 | } else { |
| 2851 | break; |
| 2852 | } |
| 2853 | assert(V->getType()->isPointerTy() && "Unexpected operand type!" ); |
| 2854 | } while (Visited.insert(Ptr: V).second); |
| 2855 | |
| 2856 | Type *IdxPtrTy = DL.getIndexType(PtrTy: V->getType()); |
| 2857 | return cast<ConstantInt>(Val: ConstantInt::get(Ty: IdxPtrTy, V: Offset)); |
| 2858 | } |
| 2859 | |
| 2860 | /// Find dead blocks due to deleted CFG edges during inlining. |
| 2861 | /// |
| 2862 | /// If we know the successor of the current block, \p CurrBB, has to be \p |
| 2863 | /// NextBB, the other successors of \p CurrBB are dead if these successors have |
| 2864 | /// no live incoming CFG edges. If one block is found to be dead, we can |
| 2865 | /// continue growing the dead block list by checking the successors of the dead |
| 2866 | /// blocks to see if all their incoming edges are dead or not. |
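|  | /// |
|  | /// For illustration: if \p CurrBB ends in a conditional branch whose condition |
|  | /// folded to a constant for this call site, \p NextBB is the only live |
|  | /// successor; the untaken successor is dead unless it has another live |
|  | /// incoming edge, and blocks reachable only through it become dead as well. |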
| 2867 | void CallAnalyzer::findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB) { |
| 2868 | auto IsEdgeDead = [&](BasicBlock *Pred, BasicBlock *Succ) { |
| 2869 | // A CFG edge is dead if the predecessor is dead or the predecessor has a |
| 2870 | // known successor which is not the one under exam. |
| 2871 | if (DeadBlocks.count(Ptr: Pred)) |
| 2872 | return true; |
| 2873 | BasicBlock *KnownSucc = KnownSuccessors[Pred]; |
| 2874 | return KnownSucc && KnownSucc != Succ; |
| 2875 | }; |
| 2876 | |
| 2877 | auto IsNewlyDead = [&](BasicBlock *BB) { |
| 2878 | // If all the edges to a block are dead, the block is also dead. |
| 2879 | return (!DeadBlocks.count(Ptr: BB) && |
| 2880 | llvm::all_of(Range: predecessors(BB), |
| 2881 | P: [&](BasicBlock *P) { return IsEdgeDead(P, BB); })); |
| 2882 | }; |
| 2883 | |
| 2884 | for (BasicBlock *Succ : successors(BB: CurrBB)) { |
| 2885 | if (Succ == NextBB || !IsNewlyDead(Succ)) |
| 2886 | continue; |
| 2887 | SmallVector<BasicBlock *, 4> NewDead; |
| 2888 | NewDead.push_back(Elt: Succ); |
| 2889 | while (!NewDead.empty()) { |
| 2890 | BasicBlock *Dead = NewDead.pop_back_val(); |
| 2891 | if (DeadBlocks.insert(Ptr: Dead).second) |
| 2892 | // Continue growing the dead block lists. |
| 2893 | for (BasicBlock *S : successors(BB: Dead)) |
| 2894 | if (IsNewlyDead(S)) |
| 2895 | NewDead.push_back(Elt: S); |
| 2896 | } |
| 2897 | } |
| 2898 | } |
| 2899 | |
| 2900 | /// Analyze a call site for potential inlining. |
| 2901 | /// |
| 2902 | /// Returns true if inlining this call is viable, and false if it is not |
| 2903 | /// viable. It computes the cost and adjusts the threshold based on numerous |
| 2904 | /// factors and heuristics. If this method returns false but the computed cost |
| 2905 | /// is below the computed threshold, then inlining was forcibly disabled by |
| 2906 | /// some artifact of the routine. |
| 2907 | InlineResult CallAnalyzer::analyze() { |
| 2908 | ++NumCallsAnalyzed; |
| 2909 | |
| 2910 | auto Result = onAnalysisStart(); |
| 2911 | if (!Result.isSuccess()) |
| 2912 | return Result; |
| 2913 | |
| 2914 | if (F.empty()) |
| 2915 | return InlineResult::success(); |
| 2916 | |
| 2917 | Function *Caller = CandidateCall.getFunction(); |
| 2918 | // Check if the caller function is recursive itself. |
| 2919 | for (User *U : Caller->users()) { |
| 2920 | CallBase *Call = dyn_cast<CallBase>(Val: U); |
| 2921 | if (Call && Call->getFunction() == Caller) { |
| 2922 | IsCallerRecursive = true; |
| 2923 | break; |
| 2924 | } |
| 2925 | } |
| 2926 | |
| 2927 | // Populate our simplified values by mapping from function arguments to call |
| 2928 | // arguments with known important simplifications. |
| 2929 | auto CAI = CandidateCall.arg_begin(); |
| 2930 | for (Argument &FAI : F.args()) { |
| 2931 | assert(CAI != CandidateCall.arg_end()); |
| 2932 | SimplifiedValues[&FAI] = *CAI; |
| 2933 | if (isa<Constant>(Val: *CAI)) |
| 2934 | ++NumConstantArgs; |
| 2935 | |
| 2936 | Value *PtrArg = *CAI; |
| 2937 | if (ConstantInt *C = stripAndComputeInBoundsConstantOffsets(V&: PtrArg)) { |
| 2938 | ConstantOffsetPtrs[&FAI] = std::make_pair(x&: PtrArg, y: C->getValue()); |
| 2939 | |
| 2940 | // We can SROA any pointer arguments derived from alloca instructions. |
| 2941 | if (auto *SROAArg = dyn_cast<AllocaInst>(Val: PtrArg)) { |
| 2942 | SROAArgValues[&FAI] = SROAArg; |
| 2943 | onInitializeSROAArg(Arg: SROAArg); |
| 2944 | EnabledSROAAllocas.insert(V: SROAArg); |
| 2945 | } |
| 2946 | } |
| 2947 | ++CAI; |
| 2948 | } |
| 2949 | NumConstantOffsetPtrArgs = ConstantOffsetPtrs.size(); |
| 2950 | NumAllocaArgs = SROAArgValues.size(); |
| 2951 | |
| 2952 | // Collecting the ephemeral values of `F` can be expensive, so use the |
| 2953 | // ephemeral values cache if available. |
| 2954 | SmallPtrSet<const Value *, 32> EphValuesStorage; |
| 2955 | const SmallPtrSetImpl<const Value *> *EphValues = &EphValuesStorage; |
| 2956 | if (GetEphValuesCache) |
| 2957 | EphValues = &GetEphValuesCache(F).ephValues(); |
| 2958 | else |
| 2959 | CodeMetrics::collectEphemeralValues(L: &F, AC: &GetAssumptionCache(F), |
| 2960 | EphValues&: EphValuesStorage); |
| 2961 | |
| 2962 | // The worklist of live basic blocks in the callee *after* inlining. We avoid |
| 2963 | // adding basic blocks of the callee which can be proven to be dead for this |
| 2964 | // particular call site in order to get more accurate cost estimates. This |
| 2965 | // requires a somewhat heavyweight iteration pattern: we need to walk the |
| 2966 |   // basic blocks in a breadth-first order as we insert live successors. To |
| 2967 |   // accomplish this we use a small-size optimized SetVector, favoring small |
| 2968 |   // iteration counts because we exit early once we cross our threshold. |
| 2969 | typedef SmallSetVector<BasicBlock *, 16> BBSetVector; |
| 2970 | BBSetVector BBWorklist; |
| 2971 | BBWorklist.insert(X: &F.getEntryBlock()); |
| 2972 | |
| 2973 | // Note that we *must not* cache the size, this loop grows the worklist. |
| 2974 | for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) { |
| 2975 | if (shouldStop()) |
| 2976 | break; |
| 2977 | |
| 2978 | BasicBlock *BB = BBWorklist[Idx]; |
| 2979 | if (BB->empty()) |
| 2980 | continue; |
| 2981 | |
| 2982 | onBlockStart(BB); |
| 2983 | |
| 2984 | // Disallow inlining a blockaddress. |
| 2985 | // A blockaddress only has defined behavior for an indirect branch in the |
| 2986 | // same function, and we do not currently support inlining indirect |
| 2987 | // branches. But, the inliner may not see an indirect branch that ends up |
| 2988 | // being dead code at a particular call site. If the blockaddress escapes |
| 2989 | // the function, e.g., via a global variable, inlining may lead to an |
| 2990 | // invalid cross-function reference. |
| 2991 | // FIXME: pr/39560: continue relaxing this overt restriction. |
| 2992 | if (BB->hasAddressTaken()) |
| 2993 | return InlineResult::failure(Reason: "blockaddress used" ); |
| 2994 | |
| 2995 | // Analyze the cost of this block. If we blow through the threshold, this |
| 2996 |     // returns false, and we can bail out. |
| 2997 | InlineResult IR = analyzeBlock(BB, EphValues: *EphValues); |
| 2998 | if (!IR.isSuccess()) |
| 2999 | return IR; |
| 3000 | |
| 3001 | Instruction *TI = BB->getTerminator(); |
| 3002 | |
| 3003 |     // Add in the live successors by first checking whether we have a terminator |
| 3004 | // that may be simplified based on the values simplified by this call. |
| 3005 | if (BranchInst *BI = dyn_cast<BranchInst>(Val: TI)) { |
| 3006 | if (BI->isConditional()) { |
| 3007 | Value *Cond = BI->getCondition(); |
| 3008 | if (ConstantInt *SimpleCond = getSimplifiedValue<ConstantInt>(V: Cond)) { |
| 3009 | BasicBlock *NextBB = BI->getSuccessor(i: SimpleCond->isZero() ? 1 : 0); |
| 3010 | BBWorklist.insert(X: NextBB); |
| 3011 | KnownSuccessors[BB] = NextBB; |
| 3012 | findDeadBlocks(CurrBB: BB, NextBB); |
| 3013 | continue; |
| 3014 | } |
| 3015 | } |
| 3016 | } else if (SwitchInst *SI = dyn_cast<SwitchInst>(Val: TI)) { |
| 3017 | Value *Cond = SI->getCondition(); |
| 3018 | if (ConstantInt *SimpleCond = getSimplifiedValue<ConstantInt>(V: Cond)) { |
| 3019 | BasicBlock *NextBB = SI->findCaseValue(C: SimpleCond)->getCaseSuccessor(); |
| 3020 | BBWorklist.insert(X: NextBB); |
| 3021 | KnownSuccessors[BB] = NextBB; |
| 3022 | findDeadBlocks(CurrBB: BB, NextBB); |
| 3023 | continue; |
| 3024 | } |
| 3025 | } |
| 3026 | |
| 3027 | // If we're unable to select a particular successor, just count all of |
| 3028 | // them. |
| 3029 | BBWorklist.insert_range(R: successors(BB)); |
| 3030 | |
| 3031 | onBlockAnalyzed(BB); |
| 3032 | } |
| 3033 | |
| 3034 | // If this is a noduplicate call, we can still inline as long as |
| 3035 | // inlining this would cause the removal of the caller (so the instruction |
| 3036 | // is not actually duplicated, just moved). |
| 3037 | if (!isSoleCallToLocalFunction(CB: CandidateCall, Callee: F) && ContainsNoDuplicateCall) |
| 3038 | return InlineResult::failure(Reason: "noduplicate" ); |
| 3039 | |
| 3040 | // If the callee's stack size exceeds the user-specified threshold, |
| 3041 | // do not let it be inlined. |
| 3042 | // The command line option overrides a limit set in the function attributes. |
| 3043 | size_t FinalStackSizeThreshold = StackSizeThreshold; |
| 3044 | if (!StackSizeThreshold.getNumOccurrences()) |
| 3045 | if (std::optional<int> AttrMaxStackSize = getStringFnAttrAsInt( |
| 3046 | F: Caller, AttrKind: InlineConstants::MaxInlineStackSizeAttributeName)) |
| 3047 | FinalStackSizeThreshold = *AttrMaxStackSize; |
| 3048 | if (AllocatedSize > FinalStackSizeThreshold) |
| 3049 | return InlineResult::failure(Reason: "stacksize" ); |
| 3050 | |
| 3051 | return finalizeAnalysis(); |
| 3052 | } |
| 3053 | |
| 3054 | void InlineCostCallAnalyzer::print(raw_ostream &OS) { |
| 3055 | #define DEBUG_PRINT_STAT(x) OS << " " #x ": " << x << "\n" |
| 3056 | if (PrintInstructionComments) |
| 3057 | F.print(OS, AAW: &Writer); |
| 3058 | DEBUG_PRINT_STAT(NumConstantArgs); |
| 3059 | DEBUG_PRINT_STAT(NumConstantOffsetPtrArgs); |
| 3060 | DEBUG_PRINT_STAT(NumAllocaArgs); |
| 3061 | DEBUG_PRINT_STAT(NumConstantPtrCmps); |
| 3062 | DEBUG_PRINT_STAT(NumConstantPtrDiffs); |
| 3063 | DEBUG_PRINT_STAT(NumInstructionsSimplified); |
| 3064 | DEBUG_PRINT_STAT(NumInstructions); |
| 3065 | DEBUG_PRINT_STAT(NumInlineAsmInstructions); |
| 3066 | DEBUG_PRINT_STAT(SROACostSavings); |
| 3067 | DEBUG_PRINT_STAT(SROACostSavingsLost); |
| 3068 | DEBUG_PRINT_STAT(LoadEliminationCost); |
| 3069 | DEBUG_PRINT_STAT(ContainsNoDuplicateCall); |
| 3070 | DEBUG_PRINT_STAT(Cost); |
| 3071 | DEBUG_PRINT_STAT(Threshold); |
| 3072 | #undef DEBUG_PRINT_STAT |
| 3073 | } |
| 3074 | |
| 3075 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 3076 | /// Dump stats about this call's analysis. |
| 3077 | LLVM_DUMP_METHOD void InlineCostCallAnalyzer::dump() { print(dbgs()); } |
| 3078 | #endif |
| 3079 | |
| 3080 | /// Test that there are no attribute conflicts between Caller and Callee |
| 3081 | /// that prevent inlining. |
| 3082 | static bool functionsHaveCompatibleAttributes( |
| 3083 | Function *Caller, Function *Callee, TargetTransformInfo &TTI, |
| 3084 | function_ref<const TargetLibraryInfo &(Function &)> &GetTLI) { |
| 3085 | // Note that CalleeTLI must be a copy not a reference. The legacy pass manager |
| 3086 | // caches the most recently created TLI in the TargetLibraryInfoWrapperPass |
| 3087 | // object, and always returns the same object (which is overwritten on each |
| 3088 | // GetTLI call). Therefore we copy the first result. |
| 3089 | auto CalleeTLI = GetTLI(*Callee); |
| 3090 | return (IgnoreTTIInlineCompatible || |
| 3091 | TTI.areInlineCompatible(Caller, Callee)) && |
| 3092 | GetTLI(*Caller).areInlineCompatible(CalleeTLI, |
| 3093 | AllowCallerSuperset: InlineCallerSupersetNoBuiltin) && |
| 3094 | AttributeFuncs::areInlineCompatible(Caller: *Caller, Callee: *Callee); |
| 3095 | } |
| 3096 | |
| 3097 | int llvm::getCallsiteCost(const TargetTransformInfo &TTI, const CallBase &Call, |
| 3098 | const DataLayout &DL) { |
| 3099 | int64_t Cost = 0; |
| 3100 | for (unsigned I = 0, E = Call.arg_size(); I != E; ++I) { |
| 3101 | if (Call.isByValArgument(ArgNo: I)) { |
| 3102 | // We approximate the number of loads and stores needed by dividing the |
| 3103 | // size of the byval type by the target's pointer size. |
| 3104 | PointerType *PTy = cast<PointerType>(Val: Call.getArgOperand(i: I)->getType()); |
| 3105 | unsigned TypeSize = DL.getTypeSizeInBits(Ty: Call.getParamByValType(ArgNo: I)); |
| 3106 | unsigned AS = PTy->getAddressSpace(); |
| 3107 | unsigned PointerSize = DL.getPointerSizeInBits(AS); |
| 3108 | // Ceiling division. |
| 3109 | unsigned NumStores = (TypeSize + PointerSize - 1) / PointerSize; |
| 3110 | |
| 3111 | // If it generates more than 8 stores it is likely to be expanded as an |
| 3112 | // inline memcpy so we take that as an upper bound. Otherwise we assume |
| 3113 | // one load and one store per word copied. |
| 3114 | // FIXME: The maxStoresPerMemcpy setting from the target should be used |
| 3115 | // here instead of a magic number of 8, but it's not available via |
| 3116 | // DataLayout. |
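|  |       // Worked example (illustrative): a 64-byte byval aggregate with 64-bit |
|  |       // pointers gives TypeSize = 512 and PointerSize = 64, so NumStores = 8 |
|  |       // (exactly at the cap below) and the argument adds 2 * 8 * InstrCost. |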
| 3117 | NumStores = std::min(a: NumStores, b: 8U); |
| 3118 | |
| 3119 | Cost += 2 * NumStores * InstrCost; |
| 3120 | } else { |
| 3121 | // For non-byval arguments subtract off one instruction per call |
| 3122 | // argument. |
| 3123 | Cost += InstrCost; |
| 3124 | } |
| 3125 | } |
| 3126 | // The call instruction also disappears after inlining. |
| 3127 | Cost += InstrCost; |
| 3128 | Cost += TTI.getInlineCallPenalty(F: Call.getCaller(), Call, DefaultCallPenalty: CallPenalty); |
| 3129 | |
| 3130 | return std::min<int64_t>(a: Cost, INT_MAX); |
| 3131 | } |
| 3132 | |
| 3133 | InlineCost llvm::getInlineCost( |
| 3134 | CallBase &Call, const InlineParams &Params, TargetTransformInfo &CalleeTTI, |
| 3135 | function_ref<AssumptionCache &(Function &)> GetAssumptionCache, |
| 3136 | function_ref<const TargetLibraryInfo &(Function &)> GetTLI, |
| 3137 | function_ref<BlockFrequencyInfo &(Function &)> GetBFI, |
| 3138 | ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE, |
| 3139 | function_ref<EphemeralValuesCache &(Function &)> GetEphValuesCache) { |
| 3140 | return getInlineCost(Call, Callee: Call.getCalledFunction(), Params, CalleeTTI, |
| 3141 | GetAssumptionCache, GetTLI, GetBFI, PSI, ORE, |
| 3142 | GetEphValuesCache); |
| 3143 | } |
| 3144 | |
| 3145 | std::optional<int> llvm::getInliningCostEstimate( |
| 3146 | CallBase &Call, TargetTransformInfo &CalleeTTI, |
| 3147 | function_ref<AssumptionCache &(Function &)> GetAssumptionCache, |
| 3148 | function_ref<BlockFrequencyInfo &(Function &)> GetBFI, |
| 3149 | function_ref<const TargetLibraryInfo &(Function &)> GetTLI, |
| 3150 | ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) { |
| 3151 | const InlineParams Params = {/* DefaultThreshold*/ 0, |
| 3152 | /*HintThreshold*/ {}, |
| 3153 | /*ColdThreshold*/ {}, |
| 3154 | /*OptSizeThreshold*/ {}, |
| 3155 | /*OptMinSizeThreshold*/ {}, |
| 3156 | /*HotCallSiteThreshold*/ {}, |
| 3157 | /*LocallyHotCallSiteThreshold*/ {}, |
| 3158 | /*ColdCallSiteThreshold*/ {}, |
| 3159 | /*ComputeFullInlineCost*/ true, |
| 3160 | /*EnableDeferral*/ true}; |
| 3161 | |
| 3162 | InlineCostCallAnalyzer CA(*Call.getCalledFunction(), Call, Params, CalleeTTI, |
| 3163 | GetAssumptionCache, GetBFI, GetTLI, PSI, ORE, true, |
| 3164 | /*IgnoreThreshold*/ true); |
| 3165 | auto R = CA.analyze(); |
| 3166 | if (!R.isSuccess()) |
| 3167 | return std::nullopt; |
| 3168 | return CA.getCost(); |
| 3169 | } |
| 3170 | |
| 3171 | std::optional<InlineCostFeatures> llvm::getInliningCostFeatures( |
| 3172 | CallBase &Call, TargetTransformInfo &CalleeTTI, |
| 3173 | function_ref<AssumptionCache &(Function &)> GetAssumptionCache, |
| 3174 | function_ref<BlockFrequencyInfo &(Function &)> GetBFI, |
| 3175 | function_ref<const TargetLibraryInfo &(Function &)> GetTLI, |
| 3176 | ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) { |
| 3177 | InlineCostFeaturesAnalyzer CFA(CalleeTTI, GetAssumptionCache, GetBFI, GetTLI, |
| 3178 | PSI, ORE, *Call.getCalledFunction(), Call); |
| 3179 | auto R = CFA.analyze(); |
| 3180 | if (!R.isSuccess()) |
| 3181 | return std::nullopt; |
| 3182 | return CFA.features(); |
| 3183 | } |
| 3184 | |
| 3185 | std::optional<InlineResult> llvm::getAttributeBasedInliningDecision( |
| 3186 | CallBase &Call, Function *Callee, TargetTransformInfo &CalleeTTI, |
| 3187 | function_ref<const TargetLibraryInfo &(Function &)> GetTLI) { |
| 3188 | |
| 3189 | // Cannot inline indirect calls. |
| 3190 | if (!Callee) |
| 3191 | return InlineResult::failure(Reason: "indirect call" ); |
| 3192 | |
| 3193 |   // When a callee coroutine function is inlined into a caller coroutine |
| 3194 |   // function before the coro-split pass, the coro-early pass cannot handle it |
| 3195 |   // well. So we won't inline a coroutine function that has not yet been |
| 3196 |   // split. |
| 3197 | if (Callee->isPresplitCoroutine()) |
| 3198 | return InlineResult::failure(Reason: "unsplited coroutine call" ); |
| 3199 | |
| 3200 |   // Never inline calls with byval arguments that do not have the alloca |
| 3201 | // address space. Since byval arguments can be replaced with a copy to an |
| 3202 | // alloca, the inlined code would need to be adjusted to handle that the |
| 3203 | // argument is in the alloca address space (so it is a little bit complicated |
| 3204 | // to solve). |
| 3205 | unsigned AllocaAS = Callee->getDataLayout().getAllocaAddrSpace(); |
| 3206 | for (unsigned I = 0, E = Call.arg_size(); I != E; ++I) |
| 3207 | if (Call.isByValArgument(ArgNo: I)) { |
| 3208 | PointerType *PTy = cast<PointerType>(Val: Call.getArgOperand(i: I)->getType()); |
| 3209 | if (PTy->getAddressSpace() != AllocaAS) |
| 3210 | return InlineResult::failure(Reason: "byval arguments without alloca" |
| 3211 | " address space" ); |
| 3212 | } |
| 3213 | |
| 3214 | // Calls to functions with always-inline attributes should be inlined |
| 3215 | // whenever possible. |
| 3216 | if (Call.hasFnAttr(Kind: Attribute::AlwaysInline)) { |
| 3217 | if (Call.getAttributes().hasFnAttr(Kind: Attribute::NoInline)) |
| 3218 | return InlineResult::failure(Reason: "noinline call site attribute" ); |
| 3219 | |
| 3220 | auto IsViable = isInlineViable(Callee&: *Callee); |
| 3221 | if (IsViable.isSuccess()) |
| 3222 | return InlineResult::success(); |
| 3223 | return InlineResult::failure(Reason: IsViable.getFailureReason()); |
| 3224 | } |
| 3225 | |
| 3226 | // Never inline functions with conflicting attributes (unless callee has |
| 3227 | // always-inline attribute). |
| 3228 | Function *Caller = Call.getCaller(); |
| 3229 | if (!functionsHaveCompatibleAttributes(Caller, Callee, TTI&: CalleeTTI, GetTLI)) |
| 3230 | return InlineResult::failure(Reason: "conflicting attributes" ); |
| 3231 | |
| 3232 | // Don't inline this call if the caller has the optnone attribute. |
| 3233 | if (Caller->hasOptNone()) |
| 3234 | return InlineResult::failure(Reason: "optnone attribute" ); |
| 3235 | |
| 3236 | // Don't inline a function that treats null pointer as valid into a caller |
| 3237 | // that does not have this attribute. |
| 3238 | if (!Caller->nullPointerIsDefined() && Callee->nullPointerIsDefined()) |
| 3239 | return InlineResult::failure(Reason: "nullptr definitions incompatible" ); |
| 3240 | |
| 3241 | // Don't inline functions which can be interposed at link-time. |
| 3242 | if (Callee->isInterposable()) |
| 3243 | return InlineResult::failure(Reason: "interposable" ); |
| 3244 | |
| 3245 | // Don't inline functions marked noinline. |
| 3246 | if (Callee->hasFnAttribute(Kind: Attribute::NoInline)) |
| 3247 | return InlineResult::failure(Reason: "noinline function attribute" ); |
| 3248 | |
| 3249 | // Don't inline call sites marked noinline. |
| 3250 | if (Call.isNoInline()) |
| 3251 | return InlineResult::failure(Reason: "noinline call site attribute" ); |
| 3252 | |
| 3253 | // Don't inline functions that are loader replaceable. |
| 3254 | if (Callee->hasFnAttribute(Kind: "loader-replaceable" )) |
| 3255 | return InlineResult::failure(Reason: "loader replaceable function attribute" ); |
| 3256 | |
| 3257 | return std::nullopt; |
| 3258 | } |
| 3259 | |
| 3260 | InlineCost llvm::getInlineCost( |
| 3261 | CallBase &Call, Function *Callee, const InlineParams &Params, |
| 3262 | TargetTransformInfo &CalleeTTI, |
| 3263 | function_ref<AssumptionCache &(Function &)> GetAssumptionCache, |
| 3264 | function_ref<const TargetLibraryInfo &(Function &)> GetTLI, |
| 3265 | function_ref<BlockFrequencyInfo &(Function &)> GetBFI, |
| 3266 | ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE, |
| 3267 | function_ref<EphemeralValuesCache &(Function &)> GetEphValuesCache) { |
| 3268 | |
| 3269 | auto UserDecision = |
| 3270 | llvm::getAttributeBasedInliningDecision(Call, Callee, CalleeTTI, GetTLI); |
| 3271 | |
| 3272 | if (UserDecision) { |
| 3273 | if (UserDecision->isSuccess()) |
| 3274 | return llvm::InlineCost::getAlways(Reason: "always inline attribute" ); |
| 3275 | return llvm::InlineCost::getNever(Reason: UserDecision->getFailureReason()); |
| 3276 | } |
| 3277 | |
| 3278 | if (InlineAllViableCalls && isInlineViable(Callee&: *Callee).isSuccess()) |
| 3279 | return llvm::InlineCost::getAlways( |
| 3280 | Reason: "Inlining forced by -inline-all-viable-calls" ); |
| 3281 | |
| 3282 | LLVM_DEBUG(llvm::dbgs() << " Analyzing call of " << Callee->getName() |
| 3283 | << "... (caller:" << Call.getCaller()->getName() |
| 3284 | << ")\n" ); |
| 3285 | |
| 3286 | InlineCostCallAnalyzer CA(*Callee, Call, Params, CalleeTTI, |
| 3287 | GetAssumptionCache, GetBFI, GetTLI, PSI, ORE, |
| 3288 | /*BoostIndirect=*/true, /*IgnoreThreshold=*/false, |
| 3289 | GetEphValuesCache); |
| 3290 | InlineResult ShouldInline = CA.analyze(); |
| 3291 | |
| 3292 | LLVM_DEBUG(CA.dump()); |
| 3293 | |
| 3294 | // Always make cost benefit based decision explicit. |
| 3295 | // We use always/never here since threshold is not meaningful, |
| 3296 | // as it's not what drives cost-benefit analysis. |
| 3297 | if (CA.wasDecidedByCostBenefit()) { |
| 3298 | if (ShouldInline.isSuccess()) |
| 3299 | return InlineCost::getAlways(Reason: "benefit over cost" , |
| 3300 | CostBenefit: CA.getCostBenefitPair()); |
| 3301 | else |
| 3302 | return InlineCost::getNever(Reason: "cost over benefit" , CostBenefit: CA.getCostBenefitPair()); |
| 3303 | } |
| 3304 | |
| 3305 | if (CA.wasDecidedByCostThreshold()) |
| 3306 | return InlineCost::get(Cost: CA.getCost(), Threshold: CA.getThreshold(), |
| 3307 | StaticBonus: CA.getStaticBonusApplied()); |
| 3308 | |
| 3309 | // No details on how the decision was made, simply return always or never. |
| 3310 | return ShouldInline.isSuccess() |
| 3311 | ? InlineCost::getAlways(Reason: "empty function" ) |
| 3312 | : InlineCost::getNever(Reason: ShouldInline.getFailureReason()); |
| 3313 | } |
| 3314 | |
| 3315 | InlineResult llvm::isInlineViable(Function &F) { |
| 3316 | bool ReturnsTwice = F.hasFnAttribute(Kind: Attribute::ReturnsTwice); |
| 3317 | for (BasicBlock &BB : F) { |
| 3318 | // Disallow inlining of functions which contain indirect branches. |
| 3319 | if (isa<IndirectBrInst>(Val: BB.getTerminator())) |
| 3320 | return InlineResult::failure(Reason: "contains indirect branches" ); |
| 3321 | |
| 3322 | // Disallow inlining of blockaddresses. |
| 3323 | if (BB.hasAddressTaken()) |
| 3324 | return InlineResult::failure(Reason: "blockaddress used" ); |
| 3325 | |
| 3326 | for (auto &II : BB) { |
| 3327 | CallBase *Call = dyn_cast<CallBase>(Val: &II); |
| 3328 | if (!Call) |
| 3329 | continue; |
| 3330 | |
| 3331 | // Disallow recursive calls. |
| 3332 | Function *Callee = Call->getCalledFunction(); |
| 3333 | if (&F == Callee) |
| 3334 | return InlineResult::failure(Reason: "recursive call" ); |
| 3335 | |
| 3336 | // Disallow calls which expose returns-twice to a function not previously |
| 3337 | // attributed as such. |
| 3338 | if (!ReturnsTwice && isa<CallInst>(Val: Call) && |
| 3339 | cast<CallInst>(Val: Call)->canReturnTwice()) |
| 3340 | return InlineResult::failure(Reason: "exposes returns-twice attribute" ); |
| 3341 | |
| 3342 | if (Callee) |
| 3343 | switch (Callee->getIntrinsicID()) { |
| 3344 | default: |
| 3345 | break; |
| 3346 | case llvm::Intrinsic::icall_branch_funnel: |
| 3347 | // Disallow inlining of @llvm.icall.branch.funnel because current |
| 3348 | // backend can't separate call targets from call arguments. |
| 3349 | return InlineResult::failure( |
| 3350 | Reason: "disallowed inlining of @llvm.icall.branch.funnel" ); |
| 3351 | case llvm::Intrinsic::localescape: |
| 3352 | // Disallow inlining functions that call @llvm.localescape. Doing this |
| 3353 | // correctly would require major changes to the inliner. |
| 3354 | return InlineResult::failure( |
| 3355 | Reason: "disallowed inlining of @llvm.localescape" ); |
| 3356 | case llvm::Intrinsic::vastart: |
| 3357 | // Disallow inlining of functions that initialize VarArgs with |
| 3358 | // va_start. |
| 3359 | return InlineResult::failure( |
| 3360 | Reason: "contains VarArgs initialized with va_start" ); |
| 3361 | } |
| 3362 | } |
| 3363 | } |
| 3364 | |
| 3365 | return InlineResult::success(); |
| 3366 | } |
| 3367 | |
| 3368 | // APIs to create InlineParams based on command line flags and/or other |
| 3369 | // parameters. |
| 3370 | |
| 3371 | InlineParams llvm::getInlineParams(int Threshold) { |
| 3372 | InlineParams Params; |
| 3373 | |
| 3374 | // This field is the threshold to use for a callee by default. This is |
| 3375 | // derived from one or more of: |
| 3376 | // * optimization or size-optimization levels, |
| 3377 | // * a value passed to createFunctionInliningPass function, or |
| 3378 | // * the -inline-threshold flag. |
| 3379 | // If the -inline-threshold flag is explicitly specified, that is used |
| 3380 | // irrespective of anything else. |
| 3381 | if (InlineThreshold.getNumOccurrences() > 0) |
| 3382 | Params.DefaultThreshold = InlineThreshold; |
| 3383 | else |
| 3384 | Params.DefaultThreshold = Threshold; |
| 3385 | |
| 3386 | // Set the HintThreshold knob from the -inlinehint-threshold. |
| 3387 | Params.HintThreshold = HintThreshold; |
| 3388 | |
| 3389 | // Set the HotCallSiteThreshold knob from the -hot-callsite-threshold. |
| 3390 | Params.HotCallSiteThreshold = HotCallSiteThreshold; |
| 3391 | |
| 3392 | // If the -locally-hot-callsite-threshold is explicitly specified, use it to |
| 3393 | // populate LocallyHotCallSiteThreshold. Later, we populate |
| 3394 | // Params.LocallyHotCallSiteThreshold from -locally-hot-callsite-threshold if |
| 3395 | // we know that optimization level is O3 (in the getInlineParams variant that |
| 3396 | // takes the opt and size levels). |
| 3397 | // FIXME: Remove this check (and make the assignment unconditional) after |
| 3398 | // addressing size regression issues at O2. |
| 3399 | if (LocallyHotCallSiteThreshold.getNumOccurrences() > 0) |
| 3400 | Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold; |
| 3401 | |
| 3402 | // Set the ColdCallSiteThreshold knob from the |
| 3403 | // -inline-cold-callsite-threshold. |
| 3404 | Params.ColdCallSiteThreshold = ColdCallSiteThreshold; |
| 3405 | |
| 3406 | // Set the OptMinSizeThreshold and OptSizeThreshold params only if the |
| 3407 | // -inlinehint-threshold commandline option is not explicitly given. If that |
| 3408 | // option is present, then its value applies even for callees with size and |
| 3409 | // minsize attributes. |
| 3410 | // If the -inline-threshold is not specified, set the ColdThreshold from the |
| 3411 | // -inlinecold-threshold even if it is not explicitly passed. If |
| 3412 | // -inline-threshold is specified, then -inlinecold-threshold needs to be |
| 3413 | // explicitly specified to set the ColdThreshold knob |
| 3414 | if (InlineThreshold.getNumOccurrences() == 0) { |
| 3415 | Params.OptMinSizeThreshold = InlineConstants::OptMinSizeThreshold; |
| 3416 | Params.OptSizeThreshold = InlineConstants::OptSizeThreshold; |
| 3417 | Params.ColdThreshold = ColdThreshold; |
| 3418 | } else if (ColdThreshold.getNumOccurrences() > 0) { |
| 3419 | Params.ColdThreshold = ColdThreshold; |
| 3420 | } |
| 3421 | return Params; |
| 3422 | } |
| 3423 | |
| 3424 | InlineParams llvm::getInlineParams() { |
| 3425 | return getInlineParams(Threshold: DefaultThreshold); |
| 3426 | } |
| 3427 | |
| 3428 | // Compute the default threshold for inlining based on the opt level and the |
| 3429 | // size opt level. |
| 3430 | static int computeThresholdFromOptLevels(unsigned OptLevel, |
| 3431 | unsigned SizeOptLevel) { |
| 3432 | if (OptLevel > 2) |
| 3433 | return InlineConstants::OptAggressiveThreshold; |
| 3434 | if (SizeOptLevel == 1) // -Os |
| 3435 | return InlineConstants::OptSizeThreshold; |
| 3436 | if (SizeOptLevel == 2) // -Oz |
| 3437 | return InlineConstants::OptMinSizeThreshold; |
| 3438 | return DefaultThreshold; |
| 3439 | } |
| 3440 | |
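|  | // For example, getInlineParams(3, 0) seeds DefaultThreshold with |
|  | // OptAggressiveThreshold, getInlineParams(2, 2) (i.e. -Oz) uses |
|  | // OptMinSizeThreshold, and an explicit -inline-threshold overrides either. |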
| 3441 | InlineParams llvm::getInlineParams(unsigned OptLevel, unsigned SizeOptLevel) { |
| 3442 | auto Params = |
| 3443 | getInlineParams(Threshold: computeThresholdFromOptLevels(OptLevel, SizeOptLevel)); |
| 3444 | // At O3, use the value of -locally-hot-callsite-threshold option to populate |
| 3445 | // Params.LocallyHotCallSiteThreshold. Below O3, this flag has effect only |
| 3446 | // when it is specified explicitly. |
| 3447 | if (OptLevel > 2) |
| 3448 | Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold; |
| 3449 | return Params; |
| 3450 | } |
| 3451 | |
| 3452 | PreservedAnalyses |
| 3453 | InlineCostAnnotationPrinterPass::run(Function &F, |
| 3454 | FunctionAnalysisManager &FAM) { |
| 3455 | PrintInstructionComments = true; |
| 3456 | std::function<AssumptionCache &(Function &)> GetAssumptionCache = |
| 3457 | [&](Function &F) -> AssumptionCache & { |
| 3458 | return FAM.getResult<AssumptionAnalysis>(IR&: F); |
| 3459 | }; |
| 3460 | |
| 3461 | auto &MAMProxy = FAM.getResult<ModuleAnalysisManagerFunctionProxy>(IR&: F); |
| 3462 | ProfileSummaryInfo *PSI = |
| 3463 | MAMProxy.getCachedResult<ProfileSummaryAnalysis>(IR&: *F.getParent()); |
| 3464 | const TargetTransformInfo &TTI = FAM.getResult<TargetIRAnalysis>(IR&: F); |
| 3465 | |
| 3466 | // FIXME: Redesign the usage of InlineParams to expand the scope of this pass. |
| 3467 | // In the current implementation, the type of InlineParams doesn't matter as |
| 3468 | // the pass serves only for verification of inliner's decisions. |
| 3469 | // We can add a flag which determines InlineParams for this run. Right now, |
| 3470 | // the default InlineParams are used. |
| 3471 | const InlineParams Params = llvm::getInlineParams(); |
| 3472 | for (BasicBlock &BB : F) { |
| 3473 | for (Instruction &I : BB) { |
| 3474 | if (auto *CB = dyn_cast<CallBase>(Val: &I)) { |
| 3475 | Function *CalledFunction = CB->getCalledFunction(); |
| 3476 | if (!CalledFunction || CalledFunction->isDeclaration()) |
| 3477 | continue; |
| 3478 | OptimizationRemarkEmitter ORE(CalledFunction); |
| 3479 | InlineCostCallAnalyzer ICCA(*CalledFunction, *CB, Params, TTI, |
| 3480 | GetAssumptionCache, nullptr, nullptr, PSI, |
| 3481 | &ORE); |
| 3482 | ICCA.analyze(); |
| 3483 | OS << " Analyzing call of " << CalledFunction->getName() |
| 3484 | << "... (caller:" << CB->getCaller()->getName() << ")\n" ; |
| 3485 | ICCA.print(OS); |
| 3486 | OS << "\n" ; |
| 3487 | } |
| 3488 | } |
| 3489 | } |
| 3490 | return PreservedAnalyses::all(); |
| 3491 | } |
| 3492 | |