| 1 | //===- InlineCost.cpp - Cost analysis for inliner -------------------------===// | 
|---|
| 2 | // | 
|---|
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | 
|---|
| 4 | // See https://llvm.org/LICENSE.txt for license information. | 
|---|
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | 
|---|
| 6 | // | 
|---|
| 7 | //===----------------------------------------------------------------------===// | 
|---|
| 8 | // | 
|---|
| 9 | // This file implements inline cost analysis. | 
|---|
| 10 | // | 
|---|
| 11 | //===----------------------------------------------------------------------===// | 
|---|
| 12 |  | 
|---|
| 13 | #include "llvm/Analysis/InlineCost.h" | 
|---|
| 14 | #include "llvm/ADT/STLExtras.h" | 
|---|
| 15 | #include "llvm/ADT/SetVector.h" | 
|---|
| 16 | #include "llvm/ADT/SmallPtrSet.h" | 
|---|
| 17 | #include "llvm/ADT/SmallVector.h" | 
|---|
| 18 | #include "llvm/ADT/Statistic.h" | 
|---|
| 19 | #include "llvm/Analysis/AssumptionCache.h" | 
|---|
| 20 | #include "llvm/Analysis/BlockFrequencyInfo.h" | 
|---|
| 21 | #include "llvm/Analysis/CodeMetrics.h" | 
|---|
| 22 | #include "llvm/Analysis/ConstantFolding.h" | 
|---|
| 23 | #include "llvm/Analysis/DomConditionCache.h" | 
|---|
| 24 | #include "llvm/Analysis/EphemeralValuesCache.h" | 
|---|
| 25 | #include "llvm/Analysis/InstructionSimplify.h" | 
|---|
| 26 | #include "llvm/Analysis/LoopInfo.h" | 
|---|
| 27 | #include "llvm/Analysis/MemoryBuiltins.h" | 
|---|
| 28 | #include "llvm/Analysis/OptimizationRemarkEmitter.h" | 
|---|
| 29 | #include "llvm/Analysis/ProfileSummaryInfo.h" | 
|---|
| 30 | #include "llvm/Analysis/TargetLibraryInfo.h" | 
|---|
| 31 | #include "llvm/Analysis/TargetTransformInfo.h" | 
|---|
| 32 | #include "llvm/Analysis/ValueTracking.h" | 
|---|
| 33 | #include "llvm/Config/llvm-config.h" | 
|---|
| 34 | #include "llvm/IR/AssemblyAnnotationWriter.h" | 
|---|
| 35 | #include "llvm/IR/CallingConv.h" | 
|---|
| 36 | #include "llvm/IR/DataLayout.h" | 
|---|
| 37 | #include "llvm/IR/Dominators.h" | 
|---|
| 38 | #include "llvm/IR/GetElementPtrTypeIterator.h" | 
|---|
| 39 | #include "llvm/IR/GlobalAlias.h" | 
|---|
| 40 | #include "llvm/IR/InstVisitor.h" | 
|---|
| 41 | #include "llvm/IR/IntrinsicInst.h" | 
|---|
| 42 | #include "llvm/IR/Operator.h" | 
|---|
| 43 | #include "llvm/IR/PatternMatch.h" | 
|---|
| 44 | #include "llvm/Support/CommandLine.h" | 
|---|
| 45 | #include "llvm/Support/Debug.h" | 
|---|
| 46 | #include "llvm/Support/FormattedStream.h" | 
|---|
| 47 | #include "llvm/Support/raw_ostream.h" | 
|---|
| 48 | #include <climits> | 
|---|
| 49 | #include <limits> | 
|---|
| 50 | #include <optional> | 
|---|
| 51 |  | 
|---|
| 52 | using namespace llvm; | 
|---|
| 53 |  | 
|---|
| 54 | #define DEBUG_TYPE "inline-cost" | 
|---|
| 55 |  | 
|---|
| 56 | STATISTIC(NumCallsAnalyzed, "Number of call sites analyzed"); | 
|---|
| 57 |  | 
|---|
| 58 | static cl::opt<int> | 
|---|
| 59 | DefaultThreshold( "inlinedefault-threshold", cl::Hidden, cl::init(Val: 225), | 
|---|
| 60 | cl::desc( "Default amount of inlining to perform")); | 
|---|
| 61 |  | 
|---|
| 62 | // We introduce this option since there is a minor compile-time win by avoiding | 
|---|
| 63 | // addition of TTI attributes (target-features in particular) to inline | 
|---|
| 64 | // candidates when they are guaranteed to be the same as top level methods in | 
|---|
| 65 | // some use cases. If we avoid adding the attribute, we need an option to avoid | 
|---|
| 66 | // checking these attributes. | 
|---|
| 67 | static cl::opt<bool> IgnoreTTIInlineCompatible( | 
|---|
| 68 | "ignore-tti-inline-compatible", cl::Hidden, cl::init(Val: false), | 
|---|
| 69 | cl::desc( "Ignore TTI attributes compatibility check between callee/caller " | 
|---|
| 70 | "during inline cost calculation")); | 
|---|
| 71 |  | 
|---|
| 72 | static cl::opt<bool> ( | 
|---|
| 73 | "print-instruction-comments", cl::Hidden, cl::init(Val: false), | 
|---|
| 74 | cl::desc( "Prints comments for instruction based on inline cost analysis")); | 
|---|
| 75 |  | 
|---|
| 76 | static cl::opt<int> InlineThreshold( | 
|---|
| 77 | "inline-threshold", cl::Hidden, cl::init(Val: 225), | 
|---|
| 78 | cl::desc( "Control the amount of inlining to perform (default = 225)")); | 
|---|
| 79 |  | 
|---|
| 80 | static cl::opt<int> HintThreshold( | 
|---|
| 81 | "inlinehint-threshold", cl::Hidden, cl::init(Val: 325), | 
|---|
| 82 | cl::desc( "Threshold for inlining functions with inline hint")); | 
|---|
| 83 |  | 
|---|
| 84 | static cl::opt<int> | 
|---|
| 85 | ColdCallSiteThreshold( "inline-cold-callsite-threshold", cl::Hidden, | 
|---|
| 86 | cl::init(Val: 45), | 
|---|
| 87 | cl::desc( "Threshold for inlining cold callsites")); | 
|---|
| 88 |  | 
|---|
| 89 | static cl::opt<bool> InlineEnableCostBenefitAnalysis( | 
|---|
| 90 | "inline-enable-cost-benefit-analysis", cl::Hidden, cl::init(Val: false), | 
|---|
| 91 | cl::desc( "Enable the cost-benefit analysis for the inliner")); | 
|---|
| 92 |  | 
|---|
| 93 | // InlineSavingsMultiplier overrides per TTI multipliers iff it is | 
|---|
| 94 | // specified explicitly in command line options. This option is exposed | 
|---|
| 95 | // for tuning and testing. | 
|---|
| 96 | static cl::opt<int> InlineSavingsMultiplier( | 
|---|
| 97 | "inline-savings-multiplier", cl::Hidden, cl::init(Val: 8), | 
|---|
| 98 | cl::desc( "Multiplier to multiply cycle savings by during inlining")); | 
|---|
| 99 |  | 
|---|
| 100 | // InlineSavingsProfitableMultiplier overrides per TTI multipliers iff it is | 
|---|
| 101 | // specified explicitly in command line options. This option is exposed | 
|---|
| 102 | // for tuning and testing. | 
|---|
| 103 | static cl::opt<int> InlineSavingsProfitableMultiplier( | 
|---|
| 104 | "inline-savings-profitable-multiplier", cl::Hidden, cl::init(Val: 4), | 
|---|
| 105 | cl::desc( "A multiplier on top of cycle savings to decide whether the " | 
|---|
| 106 | "savings won't justify the cost")); | 
|---|
| 107 |  | 
|---|
| 108 | static cl::opt<int> | 
|---|
| 109 | InlineSizeAllowance( "inline-size-allowance", cl::Hidden, cl::init(Val: 100), | 
|---|
| 110 | cl::desc( "The maximum size of a callee that get's " | 
|---|
| 111 | "inlined without sufficient cycle savings")); | 
|---|
| 112 |  | 
|---|
| 113 | // We introduce this threshold to help performance of instrumentation based | 
|---|
| 114 | // PGO before we actually hook up inliner with analysis passes such as BPI and | 
|---|
| 115 | // BFI. | 
|---|
| 116 | static cl::opt<int> ColdThreshold( | 
|---|
| 117 | "inlinecold-threshold", cl::Hidden, cl::init(Val: 45), | 
|---|
| 118 | cl::desc( "Threshold for inlining functions with cold attribute")); | 
|---|
| 119 |  | 
|---|
| 120 | static cl::opt<int> | 
|---|
| 121 | HotCallSiteThreshold( "hot-callsite-threshold", cl::Hidden, cl::init(Val: 3000), | 
|---|
| 122 | cl::desc( "Threshold for hot callsites ")); | 
|---|
| 123 |  | 
|---|
| 124 | static cl::opt<int> LocallyHotCallSiteThreshold( | 
|---|
| 125 | "locally-hot-callsite-threshold", cl::Hidden, cl::init(Val: 525), | 
|---|
| 126 | cl::desc( "Threshold for locally hot callsites ")); | 
|---|
| 127 |  | 
|---|
| 128 | static cl::opt<int> ColdCallSiteRelFreq( | 
|---|
| 129 | "cold-callsite-rel-freq", cl::Hidden, cl::init(Val: 2), | 
|---|
| 130 | cl::desc( "Maximum block frequency, expressed as a percentage of caller's " | 
|---|
| 131 | "entry frequency, for a callsite to be cold in the absence of " | 
|---|
| 132 | "profile information.")); | 
|---|
| 133 |  | 
|---|
| 134 | static cl::opt<uint64_t> HotCallSiteRelFreq( | 
|---|
| 135 | "hot-callsite-rel-freq", cl::Hidden, cl::init(Val: 60), | 
|---|
| 136 | cl::desc( "Minimum block frequency, expressed as a multiple of caller's " | 
|---|
| 137 | "entry frequency, for a callsite to be hot in the absence of " | 
|---|
| 138 | "profile information.")); | 
|---|
| 139 |  | 
|---|
| 140 | static cl::opt<int> | 
|---|
| 141 | InstrCost( "inline-instr-cost", cl::Hidden, cl::init(Val: 5), | 
|---|
| 142 | cl::desc( "Cost of a single instruction when inlining")); | 
|---|
| 143 |  | 
|---|
| 144 | static cl::opt<int> | 
|---|
| 145 | MemAccessCost( "inline-memaccess-cost", cl::Hidden, cl::init(Val: 0), | 
|---|
| 146 | cl::desc( "Cost of load/store instruction when inlining")); | 
|---|
| 147 |  | 
|---|
| 148 | static cl::opt<int> CallPenalty( | 
|---|
| 149 | "inline-call-penalty", cl::Hidden, cl::init(Val: 25), | 
|---|
| 150 | cl::desc( "Call penalty that is applied per callsite when inlining")); | 
|---|
| 151 |  | 
|---|
| 152 | static cl::opt<size_t> | 
|---|
| 153 | StackSizeThreshold( "inline-max-stacksize", cl::Hidden, | 
|---|
| 154 | cl::init(Val: std::numeric_limits<size_t>::max()), | 
|---|
| 155 | cl::desc( "Do not inline functions with a stack size " | 
|---|
| 156 | "that exceeds the specified limit")); | 
|---|
| 157 |  | 
|---|
| 158 | static cl::opt<size_t> RecurStackSizeThreshold( | 
|---|
| 159 | "recursive-inline-max-stacksize", cl::Hidden, | 
|---|
| 160 | cl::init(Val: InlineConstants::TotalAllocaSizeRecursiveCaller), | 
|---|
| 161 | cl::desc( "Do not inline recursive functions with a stack " | 
|---|
| 162 | "size that exceeds the specified limit")); | 
|---|
| 163 |  | 
|---|
| 164 | static cl::opt<bool> OptComputeFullInlineCost( | 
|---|
| 165 | "inline-cost-full", cl::Hidden, | 
|---|
| 166 | cl::desc( "Compute the full inline cost of a call site even when the cost " | 
|---|
| 167 | "exceeds the threshold.")); | 
|---|
| 168 |  | 
|---|
| 169 | static cl::opt<bool> InlineCallerSupersetNoBuiltin( | 
|---|
| 170 | "inline-caller-superset-nobuiltin", cl::Hidden, cl::init(Val: true), | 
|---|
| 171 | cl::desc( "Allow inlining when caller has a superset of callee's nobuiltin " | 
|---|
| 172 | "attributes.")); | 
|---|
| 173 |  | 
|---|
| 174 | static cl::opt<bool> DisableGEPConstOperand( | 
|---|
| 175 | "disable-gep-const-evaluation", cl::Hidden, cl::init(Val: false), | 
|---|
| 176 | cl::desc( "Disables evaluation of GetElementPtr with constant operands")); | 
|---|
| 177 |  | 
|---|
| 178 | namespace llvm { | 
|---|
| 179 | std::optional<int> getStringFnAttrAsInt(const Attribute &Attr) { | 
|---|
| 180 | if (Attr.isValid()) { | 
|---|
| 181 | int AttrValue = 0; | 
|---|
| 182 | if (!Attr.getValueAsString().getAsInteger(Radix: 10, Result&: AttrValue)) | 
|---|
| 183 | return AttrValue; | 
|---|
| 184 | } | 
|---|
| 185 | return std::nullopt; | 
|---|
| 186 | } | 
|---|
| 187 |  | 
|---|
| 188 | std::optional<int> getStringFnAttrAsInt(CallBase &CB, StringRef AttrKind) { | 
|---|
| 189 | return getStringFnAttrAsInt(Attr: CB.getFnAttr(Kind: AttrKind)); | 
|---|
| 190 | } | 
|---|
| 191 |  | 
|---|
| 192 | std::optional<int> getStringFnAttrAsInt(Function *F, StringRef AttrKind) { | 
|---|
| 193 | return getStringFnAttrAsInt(Attr: F->getFnAttribute(Kind: AttrKind)); | 
|---|
| 194 | } | 
|---|
| 195 |  | 
|---|
| 196 | namespace InlineConstants { | 
|---|
| 197 | int getInstrCost() { return InstrCost; } | 
|---|
| 198 |  | 
|---|
| 199 | } // namespace InlineConstants | 
|---|
| 200 |  | 
|---|
| 201 | } // namespace llvm | 
|---|
| 202 |  | 
|---|
| 203 | namespace { | 
|---|
| 204 | class InlineCostCallAnalyzer; | 
|---|
| 205 |  | 
|---|
| 206 | // This struct is used to store information about inline cost of a | 
|---|
| 207 | // particular instruction | 
|---|
| 208 | struct InstructionCostDetail { | 
|---|
| 209 | int CostBefore = 0; | 
|---|
| 210 | int CostAfter = 0; | 
|---|
| 211 | int ThresholdBefore = 0; | 
|---|
| 212 | int ThresholdAfter = 0; | 
|---|
| 213 |  | 
|---|
| 214 | int getThresholdDelta() const { return ThresholdAfter - ThresholdBefore; } | 
|---|
| 215 |  | 
|---|
| 216 | int getCostDelta() const { return CostAfter - CostBefore; } | 
|---|
| 217 |  | 
|---|
| 218 | bool hasThresholdChanged() const { return ThresholdAfter != ThresholdBefore; } | 
|---|
| 219 | }; | 
|---|
| 220 |  | 
|---|
| 221 | class InlineCostAnnotationWriter : public AssemblyAnnotationWriter { | 
|---|
| 222 | private: | 
|---|
| 223 | InlineCostCallAnalyzer *const ICCA; | 
|---|
| 224 |  | 
|---|
| 225 | public: | 
|---|
| 226 | InlineCostAnnotationWriter(InlineCostCallAnalyzer *ICCA) : ICCA(ICCA) {} | 
|---|
| 227 | void emitInstructionAnnot(const Instruction *I, | 
|---|
| 228 | formatted_raw_ostream &OS) override; | 
|---|
| 229 | }; | 
|---|
| 230 |  | 
|---|
| 231 | /// Carry out call site analysis, in order to evaluate inlinability. | 
|---|
| 232 | /// NOTE: the type is currently used as implementation detail of functions such | 
|---|
| 233 | /// as llvm::getInlineCost. Note the function_ref constructor parameters - the | 
|---|
| 234 | /// expectation is that they come from the outer scope, from the wrapper | 
|---|
| 235 | /// functions. If we want to support constructing CallAnalyzer objects where | 
|---|
| 236 | /// lambdas are provided inline at construction, or where the object needs to | 
|---|
| 237 | /// otherwise survive past the scope of the provided functions, we need to | 
|---|
| 238 | /// revisit the argument types. | 
|---|
| 239 | class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> { | 
|---|
| 240 | typedef InstVisitor<CallAnalyzer, bool> Base; | 
|---|
| 241 | friend class InstVisitor<CallAnalyzer, bool>; | 
|---|
| 242 |  | 
|---|
| 243 | protected: | 
|---|
| 244 | virtual ~CallAnalyzer() = default; | 
|---|
| 245 | /// The TargetTransformInfo available for this compilation. | 
|---|
| 246 | const TargetTransformInfo &TTI; | 
|---|
| 247 |  | 
|---|
| 248 | /// Getter for the cache of @llvm.assume intrinsics. | 
|---|
| 249 | function_ref<AssumptionCache &(Function &)> GetAssumptionCache; | 
|---|
| 250 |  | 
|---|
| 251 | /// Getter for BlockFrequencyInfo | 
|---|
| 252 | function_ref<BlockFrequencyInfo &(Function &)> GetBFI; | 
|---|
| 253 |  | 
|---|
| 254 | /// Getter for TargetLibraryInfo | 
|---|
| 255 | function_ref<const TargetLibraryInfo &(Function &)> GetTLI; | 
|---|
| 256 |  | 
|---|
| 257 | /// Profile summary information. | 
|---|
| 258 | ProfileSummaryInfo *PSI; | 
|---|
| 259 |  | 
|---|
| 260 | /// The called function. | 
|---|
| 261 | Function &F; | 
|---|
| 262 |  | 
|---|
| 263 | // Cache the DataLayout since we use it a lot. | 
|---|
| 264 | const DataLayout &DL; | 
|---|
| 265 |  | 
|---|
| 266 | /// The OptimizationRemarkEmitter available for this compilation. | 
|---|
| 267 | OptimizationRemarkEmitter *ORE; | 
|---|
| 268 |  | 
|---|
| 269 | /// The candidate callsite being analyzed. Please do not use this to do | 
|---|
| 270 | /// analysis in the caller function; we want the inline cost query to be | 
|---|
| 271 | /// easily cacheable. Instead, use the cover function paramHasAttr. | 
|---|
| 272 | CallBase &CandidateCall; | 
|---|
| 273 |  | 
|---|
| 274 | /// Getter for the cache of ephemeral values. | 
|---|
| 275 | function_ref<EphemeralValuesCache &(Function &)> GetEphValuesCache = nullptr; | 
|---|
| 276 |  | 
|---|
| 277 | /// Extension points for handling callsite features. | 
|---|
| 278 | // Called before a basic block was analyzed. | 
|---|
| 279 | virtual void onBlockStart(const BasicBlock *BB) {} | 
|---|
| 280 |  | 
|---|
| 281 | /// Called after a basic block was analyzed. | 
|---|
| 282 | virtual void onBlockAnalyzed(const BasicBlock *BB) {} | 
|---|
| 283 |  | 
|---|
| 284 | /// Called before an instruction was analyzed | 
|---|
| 285 | virtual void onInstructionAnalysisStart(const Instruction *I) {} | 
|---|
| 286 |  | 
|---|
| 287 | /// Called after an instruction was analyzed | 
|---|
| 288 | virtual void onInstructionAnalysisFinish(const Instruction *I) {} | 
|---|
| 289 |  | 
|---|
| 290 | /// Called at the end of the analysis of the callsite. Return the outcome of | 
|---|
| 291 | /// the analysis, i.e. 'InlineResult(true)' if the inlining may happen, or | 
|---|
| 292 | /// the reason it can't. | 
|---|
| 293 | virtual InlineResult finalizeAnalysis() { return InlineResult::success(); } | 
|---|
| 294 | /// Called when we're about to start processing a basic block, and every time | 
|---|
| 295 | /// we are done processing an instruction. Return true if there is no point in | 
|---|
| 296 | /// continuing the analysis (e.g. we've determined already the call site is | 
|---|
| 297 | /// too expensive to inline) | 
|---|
| 298 | virtual bool shouldStop() { return false; } | 
|---|
| 299 |  | 
|---|
| 300 | /// Called before the analysis of the callee body starts (with callsite | 
|---|
| 301 | /// contexts propagated).  It checks callsite-specific information. Return a | 
|---|
| 302 | /// reason analysis can't continue if that's the case, or 'true' if it may | 
|---|
| 303 | /// continue. | 
|---|
| 304 | virtual InlineResult onAnalysisStart() { return InlineResult::success(); } | 
|---|
| 305 | /// Called if the analysis engine decides SROA cannot be done for the given | 
|---|
| 306 | /// alloca. | 
|---|
| 307 | virtual void onDisableSROA(AllocaInst *Arg) {} | 
|---|
| 308 |  | 
|---|
| 309 | /// Called the analysis engine determines load elimination won't happen. | 
|---|
| 310 | virtual void onDisableLoadElimination() {} | 
|---|
| 311 |  | 
|---|
| 312 | /// Called when we visit a CallBase, before the analysis starts. Return false | 
|---|
| 313 | /// to stop further processing of the instruction. | 
|---|
| 314 | virtual bool onCallBaseVisitStart(CallBase &Call) { return true; } | 
|---|
| 315 |  | 
|---|
| 316 | /// Called to account for a call. | 
|---|
| 317 | virtual void onCallPenalty() {} | 
|---|
| 318 |  | 
|---|
| 319 | /// Called to account for a load or store. | 
|---|
| 320 | virtual void onMemAccess(){}; | 
|---|
| 321 |  | 
|---|
| 322 | /// Called to account for the expectation the inlining would result in a load | 
|---|
| 323 | /// elimination. | 
|---|
| 324 | virtual void onLoadEliminationOpportunity() {} | 
|---|
| 325 |  | 
|---|
| 326 | /// Called to account for the cost of argument setup for the Call in the | 
|---|
| 327 | /// callee's body (not the callsite currently under analysis). | 
|---|
| 328 | virtual void onCallArgumentSetup(const CallBase &Call) {} | 
|---|
| 329 |  | 
|---|
| 330 | /// Called to account for a load relative intrinsic. | 
|---|
| 331 | virtual void onLoadRelativeIntrinsic() {} | 
|---|
| 332 |  | 
|---|
| 333 | /// Called to account for a lowered call. | 
|---|
| 334 | virtual void onLoweredCall(Function *F, CallBase &Call, bool IsIndirectCall) { | 
|---|
| 335 | } | 
|---|
| 336 |  | 
|---|
| 337 | /// Account for a jump table of given size. Return false to stop further | 
|---|
| 338 | /// processing the switch instruction | 
|---|
| 339 | virtual bool onJumpTable(unsigned JumpTableSize) { return true; } | 
|---|
| 340 |  | 
|---|
| 341 | /// Account for a case cluster of given size. Return false to stop further | 
|---|
| 342 | /// processing of the instruction. | 
|---|
| 343 | virtual bool onCaseCluster(unsigned NumCaseCluster) { return true; } | 
|---|
| 344 |  | 
|---|
| 345 | /// Called at the end of processing a switch instruction, with the given | 
|---|
| 346 | /// number of case clusters. | 
|---|
| 347 | virtual void onFinalizeSwitch(unsigned JumpTableSize, unsigned NumCaseCluster, | 
|---|
| 348 | bool DefaultDestUnreachable) {} | 
|---|
| 349 |  | 
|---|
| 350 | /// Called to account for any other instruction not specifically accounted | 
|---|
| 351 | /// for. | 
|---|
| 352 | virtual void onMissedSimplification() {} | 
|---|
| 353 |  | 
|---|
| 354 | /// Start accounting potential benefits due to SROA for the given alloca. | 
|---|
| 355 | virtual void onInitializeSROAArg(AllocaInst *Arg) {} | 
|---|
| 356 |  | 
|---|
| 357 | /// Account SROA savings for the AllocaInst value. | 
|---|
| 358 | virtual void onAggregateSROAUse(AllocaInst *V) {} | 
|---|
| 359 |  | 
|---|
| 360 | bool handleSROA(Value *V, bool DoNotDisable) { | 
|---|
| 361 | // Check for SROA candidates in comparisons. | 
|---|
| 362 | if (auto *SROAArg = getSROAArgForValueOrNull(V)) { | 
|---|
| 363 | if (DoNotDisable) { | 
|---|
| 364 | onAggregateSROAUse(V: SROAArg); | 
|---|
| 365 | return true; | 
|---|
| 366 | } | 
|---|
| 367 | disableSROAForArg(SROAArg); | 
|---|
| 368 | } | 
|---|
| 369 | return false; | 
|---|
| 370 | } | 
|---|
| 371 |  | 
|---|
| 372 | bool IsCallerRecursive = false; | 
|---|
| 373 | bool IsRecursiveCall = false; | 
|---|
| 374 | bool ExposesReturnsTwice = false; | 
|---|
| 375 | bool HasDynamicAlloca = false; | 
|---|
| 376 | bool ContainsNoDuplicateCall = false; | 
|---|
| 377 | bool HasReturn = false; | 
|---|
| 378 | bool HasIndirectBr = false; | 
|---|
| 379 | bool HasUninlineableIntrinsic = false; | 
|---|
| 380 | bool InitsVargArgs = false; | 
|---|
| 381 |  | 
|---|
| 382 | /// Number of bytes allocated statically by the callee. | 
|---|
| 383 | uint64_t AllocatedSize = 0; | 
|---|
| 384 | unsigned NumInstructions = 0; | 
|---|
| 385 | unsigned NumVectorInstructions = 0; | 
|---|
| 386 |  | 
|---|
| 387 | /// While we walk the potentially-inlined instructions, we build up and | 
|---|
| 388 | /// maintain a mapping of simplified values specific to this callsite. The | 
|---|
| 389 | /// idea is to propagate any special information we have about arguments to | 
|---|
| 390 | /// this call through the inlinable section of the function, and account for | 
|---|
| 391 | /// likely simplifications post-inlining. The most important aspect we track | 
|---|
| 392 | /// is CFG altering simplifications -- when we prove a basic block dead, that | 
|---|
| 393 | /// can cause dramatic shifts in the cost of inlining a function. | 
|---|
| 394 | /// Note: The simplified Value may be owned by the caller function. | 
|---|
| 395 | DenseMap<Value *, Value *> SimplifiedValues; | 
|---|
| 396 |  | 
|---|
| 397 | /// Keep track of the values which map back (through function arguments) to | 
|---|
| 398 | /// allocas on the caller stack which could be simplified through SROA. | 
|---|
| 399 | DenseMap<Value *, AllocaInst *> SROAArgValues; | 
|---|
| 400 |  | 
|---|
| 401 | /// Keep track of Allocas for which we believe we may get SROA optimization. | 
|---|
| 402 | DenseSet<AllocaInst *> EnabledSROAAllocas; | 
|---|
| 403 |  | 
|---|
| 404 | /// Keep track of values which map to a pointer base and constant offset. | 
|---|
| 405 | DenseMap<Value *, std::pair<Value *, APInt>> ConstantOffsetPtrs; | 
|---|
| 406 |  | 
|---|
| 407 | /// Keep track of dead blocks due to the constant arguments. | 
|---|
| 408 | SmallPtrSet<BasicBlock *, 16> DeadBlocks; | 
|---|
| 409 |  | 
|---|
| 410 | /// The mapping of the blocks to their known unique successors due to the | 
|---|
| 411 | /// constant arguments. | 
|---|
| 412 | DenseMap<BasicBlock *, BasicBlock *> KnownSuccessors; | 
|---|
| 413 |  | 
|---|
| 414 | /// Model the elimination of repeated loads that is expected to happen | 
|---|
| 415 | /// whenever we simplify away the stores that would otherwise cause them to be | 
|---|
| 416 | /// loads. | 
|---|
| 417 | bool EnableLoadElimination = true; | 
|---|
| 418 |  | 
|---|
| 419 | /// Whether we allow inlining for recursive call. | 
|---|
| 420 | bool AllowRecursiveCall = false; | 
|---|
| 421 |  | 
|---|
| 422 | SmallPtrSet<Value *, 16> LoadAddrSet; | 
|---|
| 423 |  | 
|---|
| 424 | AllocaInst *getSROAArgForValueOrNull(Value *V) const { | 
|---|
| 425 | auto It = SROAArgValues.find(Val: V); | 
|---|
| 426 | if (It == SROAArgValues.end() || EnabledSROAAllocas.count(V: It->second) == 0) | 
|---|
| 427 | return nullptr; | 
|---|
| 428 | return It->second; | 
|---|
| 429 | } | 
|---|
| 430 |  | 
|---|
| 431 | /// Use a value in its given form directly if possible, otherwise try looking | 
|---|
| 432 | /// for it in SimplifiedValues. | 
|---|
| 433 | template <typename T> T *getDirectOrSimplifiedValue(Value *V) const { | 
|---|
| 434 | if (auto *Direct = dyn_cast<T>(V)) | 
|---|
| 435 | return Direct; | 
|---|
| 436 | return getSimplifiedValue<T>(V); | 
|---|
| 437 | } | 
|---|
| 438 |  | 
|---|
| 439 | // Custom simplification helper routines. | 
|---|
| 440 | bool isAllocaDerivedArg(Value *V); | 
|---|
| 441 | void disableSROAForArg(AllocaInst *SROAArg); | 
|---|
| 442 | void disableSROA(Value *V); | 
|---|
| 443 | void findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB); | 
|---|
| 444 | void disableLoadElimination(); | 
|---|
| 445 | bool isGEPFree(GetElementPtrInst &GEP); | 
|---|
| 446 | bool canFoldInboundsGEP(GetElementPtrInst &I); | 
|---|
| 447 | bool accumulateGEPOffset(GEPOperator &GEP, APInt &Offset); | 
|---|
| 448 | bool simplifyCallSite(Function *F, CallBase &Call); | 
|---|
| 449 | bool simplifyCmpInstForRecCall(CmpInst &Cmp); | 
|---|
| 450 | bool simplifyInstruction(Instruction &I); | 
|---|
| 451 | bool simplifyIntrinsicCallIsConstant(CallBase &CB); | 
|---|
| 452 | bool simplifyIntrinsicCallObjectSize(CallBase &CB); | 
|---|
| 453 | ConstantInt *stripAndComputeInBoundsConstantOffsets(Value *&V); | 
|---|
| 454 | bool isLoweredToCall(Function *F, CallBase &Call); | 
|---|
| 455 |  | 
|---|
| 456 | /// Return true if the given argument to the function being considered for | 
|---|
| 457 | /// inlining has the given attribute set either at the call site or the | 
|---|
| 458 | /// function declaration.  Primarily used to inspect call site specific | 
|---|
| 459 | /// attributes since these can be more precise than the ones on the callee | 
|---|
| 460 | /// itself. | 
|---|
| 461 | bool paramHasAttr(Argument *A, Attribute::AttrKind Attr); | 
|---|
| 462 |  | 
|---|
| 463 | /// Return true if the given value is known non null within the callee if | 
|---|
| 464 | /// inlined through this particular callsite. | 
|---|
| 465 | bool isKnownNonNullInCallee(Value *V); | 
|---|
| 466 |  | 
|---|
| 467 | /// Return true if size growth is allowed when inlining the callee at \p Call. | 
|---|
| 468 | bool allowSizeGrowth(CallBase &Call); | 
|---|
| 469 |  | 
|---|
| 470 | // Custom analysis routines. | 
|---|
| 471 | InlineResult analyzeBlock(BasicBlock *BB, | 
|---|
| 472 | const SmallPtrSetImpl<const Value *> &EphValues); | 
|---|
| 473 |  | 
|---|
| 474 | // Disable several entry points to the visitor so we don't accidentally use | 
|---|
| 475 | // them by declaring but not defining them here. | 
|---|
| 476 | void visit(Module *); | 
|---|
| 477 | void visit(Module &); | 
|---|
| 478 | void visit(Function *); | 
|---|
| 479 | void visit(Function &); | 
|---|
| 480 | void visit(BasicBlock *); | 
|---|
| 481 | void visit(BasicBlock &); | 
|---|
| 482 |  | 
|---|
| 483 | // Provide base case for our instruction visit. | 
|---|
| 484 | bool visitInstruction(Instruction &I); | 
|---|
| 485 |  | 
|---|
| 486 | // Our visit overrides. | 
|---|
| 487 | bool visitAlloca(AllocaInst &I); | 
|---|
| 488 | bool visitPHI(PHINode &I); | 
|---|
| 489 | bool visitGetElementPtr(GetElementPtrInst &I); | 
|---|
| 490 | bool visitBitCast(BitCastInst &I); | 
|---|
| 491 | bool visitPtrToInt(PtrToIntInst &I); | 
|---|
| 492 | bool visitIntToPtr(IntToPtrInst &I); | 
|---|
| 493 | bool visitCastInst(CastInst &I); | 
|---|
| 494 | bool visitCmpInst(CmpInst &I); | 
|---|
| 495 | bool visitSub(BinaryOperator &I); | 
|---|
| 496 | bool visitBinaryOperator(BinaryOperator &I); | 
|---|
| 497 | bool visitFNeg(UnaryOperator &I); | 
|---|
| 498 | bool visitLoad(LoadInst &I); | 
|---|
| 499 | bool visitStore(StoreInst &I); | 
|---|
| 500 | bool visitExtractValue(ExtractValueInst &I); | 
|---|
| 501 | bool visitInsertValue(InsertValueInst &I); | 
|---|
| 502 | bool visitCallBase(CallBase &Call); | 
|---|
| 503 | bool visitReturnInst(ReturnInst &RI); | 
|---|
| 504 | bool visitBranchInst(BranchInst &BI); | 
|---|
| 505 | bool visitSelectInst(SelectInst &SI); | 
|---|
| 506 | bool visitSwitchInst(SwitchInst &SI); | 
|---|
| 507 | bool visitIndirectBrInst(IndirectBrInst &IBI); | 
|---|
| 508 | bool visitResumeInst(ResumeInst &RI); | 
|---|
| 509 | bool visitCleanupReturnInst(CleanupReturnInst &RI); | 
|---|
| 510 | bool visitCatchReturnInst(CatchReturnInst &RI); | 
|---|
| 511 | bool visitUnreachableInst(UnreachableInst &I); | 
|---|
| 512 |  | 
|---|
| 513 | public: | 
|---|
| 514 | CallAnalyzer( | 
|---|
| 515 | Function &Callee, CallBase &Call, const TargetTransformInfo &TTI, | 
|---|
| 516 | function_ref<AssumptionCache &(Function &)> GetAssumptionCache, | 
|---|
| 517 | function_ref<BlockFrequencyInfo &(Function &)> GetBFI = nullptr, | 
|---|
| 518 | function_ref<const TargetLibraryInfo &(Function &)> GetTLI = nullptr, | 
|---|
| 519 | ProfileSummaryInfo *PSI = nullptr, | 
|---|
| 520 | OptimizationRemarkEmitter *ORE = nullptr, | 
|---|
| 521 | function_ref<EphemeralValuesCache &(Function &)> GetEphValuesCache = | 
|---|
| 522 | nullptr) | 
|---|
| 523 | : TTI(TTI), GetAssumptionCache(GetAssumptionCache), GetBFI(GetBFI), | 
|---|
| 524 | GetTLI(GetTLI), PSI(PSI), F(Callee), DL(F.getDataLayout()), ORE(ORE), | 
|---|
| 525 | CandidateCall(Call), GetEphValuesCache(GetEphValuesCache) {} | 
|---|
| 526 |  | 
|---|
| 527 | InlineResult analyze(); | 
|---|
| 528 |  | 
|---|
| 529 | /// Lookup simplified Value. May return a value owned by the caller. | 
|---|
| 530 | Value *getSimplifiedValueUnchecked(Value *V) const { | 
|---|
| 531 | return SimplifiedValues.lookup(Val: V); | 
|---|
| 532 | } | 
|---|
| 533 |  | 
|---|
| 534 | /// Lookup simplified Value, but return nullptr if the simplified value is | 
|---|
| 535 | /// owned by the caller. | 
|---|
| 536 | template <typename T> T *getSimplifiedValue(Value *V) const { | 
|---|
| 537 | Value *SimpleV = SimplifiedValues.lookup(Val: V); | 
|---|
| 538 | if (!SimpleV) | 
|---|
| 539 | return nullptr; | 
|---|
| 540 |  | 
|---|
| 541 | // Skip checks if we know T is a global. This has a small, but measurable | 
|---|
| 542 | // impact on compile-time. | 
|---|
| 543 | if constexpr (std::is_base_of_v<Constant, T>) | 
|---|
| 544 | return dyn_cast<T>(SimpleV); | 
|---|
| 545 |  | 
|---|
| 546 | // Make sure the simplified Value is owned by this function | 
|---|
| 547 | if (auto *I = dyn_cast<Instruction>(Val: SimpleV)) { | 
|---|
| 548 | if (I->getFunction() != &F) | 
|---|
| 549 | return nullptr; | 
|---|
| 550 | } else if (auto *Arg = dyn_cast<Argument>(Val: SimpleV)) { | 
|---|
| 551 | if (Arg->getParent() != &F) | 
|---|
| 552 | return nullptr; | 
|---|
| 553 | } else if (!isa<Constant>(Val: SimpleV)) | 
|---|
| 554 | return nullptr; | 
|---|
| 555 | return dyn_cast<T>(SimpleV); | 
|---|
| 556 | } | 
|---|
| 557 |  | 
|---|
| 558 | // Keep a bunch of stats about the cost savings found so we can print them | 
|---|
| 559 | // out when debugging. | 
|---|
| 560 | unsigned NumConstantArgs = 0; | 
|---|
| 561 | unsigned NumConstantOffsetPtrArgs = 0; | 
|---|
| 562 | unsigned NumAllocaArgs = 0; | 
|---|
| 563 | unsigned NumConstantPtrCmps = 0; | 
|---|
| 564 | unsigned NumConstantPtrDiffs = 0; | 
|---|
| 565 | unsigned NumInstructionsSimplified = 0; | 
|---|
| 566 |  | 
|---|
| 567 | void dump(); | 
|---|
| 568 | }; | 
|---|
| 569 |  | 
|---|
| 570 | // Considering forming a binary search, we should find the number of nodes | 
|---|
| 571 | // which is same as the number of comparisons when lowered. For a given | 
|---|
| 572 | // number of clusters, n, we can define a recursive function, f(n), to find | 
|---|
| 573 | // the number of nodes in the tree. The recursion is : | 
|---|
| 574 | // f(n) = 1 + f(n/2) + f (n - n/2), when n > 3, | 
|---|
| 575 | // and f(n) = n, when n <= 3. | 
|---|
| 576 | // This will lead a binary tree where the leaf should be either f(2) or f(3) | 
|---|
| 577 | // when n > 3.  So, the number of comparisons from leaves should be n, while | 
|---|
| 578 | // the number of non-leaf should be : | 
|---|
| 579 | //   2^(log2(n) - 1) - 1 | 
|---|
| 580 | //   = 2^log2(n) * 2^-1 - 1 | 
|---|
| 581 | //   = n / 2 - 1. | 
|---|
| 582 | // Considering comparisons from leaf and non-leaf nodes, we can estimate the | 
|---|
| 583 | // number of comparisons in a simple closed form : | 
|---|
| 584 | //   n + n / 2 - 1 = n * 3 / 2 - 1 | 
|---|
| 585 | int64_t getExpectedNumberOfCompare(int NumCaseCluster) { | 
|---|
| 586 | return 3 * static_cast<int64_t>(NumCaseCluster) / 2 - 1; | 
|---|
| 587 | } | 
|---|
| 588 |  | 
|---|
| 589 | /// FIXME: if it is necessary to derive from InlineCostCallAnalyzer, note | 
|---|
| 590 | /// the FIXME in onLoweredCall, when instantiating an InlineCostCallAnalyzer | 
|---|
| 591 | class InlineCostCallAnalyzer final : public CallAnalyzer { | 
|---|
| 592 | const bool ComputeFullInlineCost; | 
|---|
| 593 | int LoadEliminationCost = 0; | 
|---|
| 594 | /// Bonus to be applied when percentage of vector instructions in callee is | 
|---|
| 595 | /// high (see more details in updateThreshold). | 
|---|
| 596 | int VectorBonus = 0; | 
|---|
| 597 | /// Bonus to be applied when the callee has only one reachable basic block. | 
|---|
| 598 | int SingleBBBonus = 0; | 
|---|
| 599 |  | 
|---|
| 600 | /// Tunable parameters that control the analysis. | 
|---|
| 601 | const InlineParams &Params; | 
|---|
| 602 |  | 
|---|
| 603 | // This DenseMap stores the delta change in cost and threshold after | 
|---|
| 604 | // accounting for the given instruction. The map is filled only with the | 
|---|
| 605 | // flag PrintInstructionComments on. | 
|---|
| 606 | DenseMap<const Instruction *, InstructionCostDetail> InstructionCostDetailMap; | 
|---|
| 607 |  | 
|---|
| 608 | /// Upper bound for the inlining cost. Bonuses are being applied to account | 
|---|
| 609 | /// for speculative "expected profit" of the inlining decision. | 
|---|
| 610 | int Threshold = 0; | 
|---|
| 611 |  | 
|---|
| 612 | /// The amount of StaticBonus applied. | 
|---|
| 613 | int StaticBonusApplied = 0; | 
|---|
| 614 |  | 
|---|
| 615 | /// Attempt to evaluate indirect calls to boost its inline cost. | 
|---|
| 616 | const bool BoostIndirectCalls; | 
|---|
| 617 |  | 
|---|
| 618 | /// Ignore the threshold when finalizing analysis. | 
|---|
| 619 | const bool IgnoreThreshold; | 
|---|
| 620 |  | 
|---|
| 621 | // True if the cost-benefit-analysis-based inliner is enabled. | 
|---|
| 622 | const bool CostBenefitAnalysisEnabled; | 
|---|
| 623 |  | 
|---|
| 624 | /// Inlining cost measured in abstract units, accounts for all the | 
|---|
| 625 | /// instructions expected to be executed for a given function invocation. | 
|---|
| 626 | /// Instructions that are statically proven to be dead based on call-site | 
|---|
| 627 | /// arguments are not counted here. | 
|---|
| 628 | int Cost = 0; | 
|---|
| 629 |  | 
|---|
| 630 | // The cumulative cost at the beginning of the basic block being analyzed.  At | 
|---|
| 631 | // the end of analyzing each basic block, "Cost - CostAtBBStart" represents | 
|---|
| 632 | // the size of that basic block. | 
|---|
| 633 | int CostAtBBStart = 0; | 
|---|
| 634 |  | 
|---|
| 635 | // The static size of live but cold basic blocks.  This is "static" in the | 
|---|
| 636 | // sense that it's not weighted by profile counts at all. | 
|---|
| 637 | int ColdSize = 0; | 
|---|
| 638 |  | 
|---|
| 639 | // Whether inlining is decided by cost-threshold analysis. | 
|---|
| 640 | bool DecidedByCostThreshold = false; | 
|---|
| 641 |  | 
|---|
| 642 | // Whether inlining is decided by cost-benefit analysis. | 
|---|
| 643 | bool DecidedByCostBenefit = false; | 
|---|
| 644 |  | 
|---|
| 645 | // The cost-benefit pair computed by cost-benefit analysis. | 
|---|
| 646 | std::optional<CostBenefitPair> CostBenefit; | 
|---|
| 647 |  | 
|---|
| 648 | bool SingleBB = true; | 
|---|
| 649 |  | 
|---|
| 650 | unsigned SROACostSavings = 0; | 
|---|
| 651 | unsigned SROACostSavingsLost = 0; | 
|---|
| 652 |  | 
|---|
| 653 | /// The mapping of caller Alloca values to their accumulated cost savings. If | 
|---|
| 654 | /// we have to disable SROA for one of the allocas, this tells us how much | 
|---|
| 655 | /// cost must be added. | 
|---|
| 656 | DenseMap<AllocaInst *, int> SROAArgCosts; | 
|---|
| 657 |  | 
|---|
| 658 | /// Return true if \p Call is a cold callsite. | 
|---|
| 659 | bool isColdCallSite(CallBase &Call, BlockFrequencyInfo *CallerBFI); | 
|---|
| 660 |  | 
|---|
| 661 | /// Update Threshold based on callsite properties such as callee | 
|---|
| 662 | /// attributes and callee hotness for PGO builds. The Callee is explicitly | 
|---|
| 663 | /// passed to support analyzing indirect calls whose target is inferred by | 
|---|
| 664 | /// analysis. | 
|---|
| 665 | void updateThreshold(CallBase &Call, Function &Callee); | 
|---|
| 666 | /// Return a higher threshold if \p Call is a hot callsite. | 
|---|
| 667 | std::optional<int> getHotCallSiteThreshold(CallBase &Call, | 
|---|
| 668 | BlockFrequencyInfo *CallerBFI); | 
|---|
| 669 |  | 
|---|
| 670 | /// Handle a capped 'int' increment for Cost. | 
|---|
| 671 | void addCost(int64_t Inc) { | 
|---|
| 672 | Inc = std::clamp<int64_t>(val: Inc, INT_MIN, INT_MAX); | 
|---|
| 673 | Cost = std::clamp<int64_t>(val: Inc + Cost, INT_MIN, INT_MAX); | 
|---|
| 674 | } | 
|---|
| 675 |  | 
|---|
| 676 | void onDisableSROA(AllocaInst *Arg) override { | 
|---|
| 677 | auto CostIt = SROAArgCosts.find(Val: Arg); | 
|---|
| 678 | if (CostIt == SROAArgCosts.end()) | 
|---|
| 679 | return; | 
|---|
| 680 | addCost(Inc: CostIt->second); | 
|---|
| 681 | SROACostSavings -= CostIt->second; | 
|---|
| 682 | SROACostSavingsLost += CostIt->second; | 
|---|
| 683 | SROAArgCosts.erase(I: CostIt); | 
|---|
| 684 | } | 
|---|
| 685 |  | 
|---|
| 686 | void onDisableLoadElimination() override { | 
|---|
| 687 | addCost(Inc: LoadEliminationCost); | 
|---|
| 688 | LoadEliminationCost = 0; | 
|---|
| 689 | } | 
|---|
| 690 |  | 
|---|
| 691 | bool onCallBaseVisitStart(CallBase &Call) override { | 
|---|
| 692 | if (std::optional<int> AttrCallThresholdBonus = | 
|---|
| 693 | getStringFnAttrAsInt(CB&: Call, AttrKind: "call-threshold-bonus")) | 
|---|
| 694 | Threshold += *AttrCallThresholdBonus; | 
|---|
| 695 |  | 
|---|
| 696 | if (std::optional<int> AttrCallCost = | 
|---|
| 697 | getStringFnAttrAsInt(CB&: Call, AttrKind: "call-inline-cost")) { | 
|---|
| 698 | addCost(Inc: *AttrCallCost); | 
|---|
| 699 | // Prevent further processing of the call since we want to override its | 
|---|
| 700 | // inline cost, not just add to it. | 
|---|
| 701 | return false; | 
|---|
| 702 | } | 
|---|
| 703 | return true; | 
|---|
| 704 | } | 
|---|
| 705 |  | 
|---|
| 706 | void onCallPenalty() override { addCost(Inc: CallPenalty); } | 
|---|
| 707 |  | 
|---|
| 708 | void onMemAccess() override { addCost(Inc: MemAccessCost); } | 
|---|
| 709 |  | 
|---|
| 710 | void onCallArgumentSetup(const CallBase &Call) override { | 
|---|
| 711 | // Pay the price of the argument setup. We account for the average 1 | 
|---|
| 712 | // instruction per call argument setup here. | 
|---|
| 713 | addCost(Inc: Call.arg_size() * InstrCost); | 
|---|
| 714 | } | 
|---|
| 715 | void onLoadRelativeIntrinsic() override { | 
|---|
| 716 | // This is normally lowered to 4 LLVM instructions. | 
|---|
| 717 | addCost(Inc: 3 * InstrCost); | 
|---|
| 718 | } | 
|---|
| 719 | void onLoweredCall(Function *F, CallBase &Call, | 
|---|
| 720 | bool IsIndirectCall) override { | 
|---|
| 721 | // We account for the average 1 instruction per call argument setup here. | 
|---|
| 722 | addCost(Inc: Call.arg_size() * InstrCost); | 
|---|
| 723 |  | 
|---|
| 724 | // If we have a constant that we are calling as a function, we can peer | 
|---|
| 725 | // through it and see the function target. This happens not infrequently | 
|---|
| 726 | // during devirtualization and so we want to give it a hefty bonus for | 
|---|
| 727 | // inlining, but cap that bonus in the event that inlining wouldn't pan out. | 
|---|
| 728 | // Pretend to inline the function, with a custom threshold. | 
|---|
| 729 | if (IsIndirectCall && BoostIndirectCalls) { | 
|---|
| 730 | auto IndirectCallParams = Params; | 
|---|
| 731 | IndirectCallParams.DefaultThreshold = | 
|---|
| 732 | InlineConstants::IndirectCallThreshold; | 
|---|
| 733 | /// FIXME: if InlineCostCallAnalyzer is derived from, this may need | 
|---|
| 734 | /// to instantiate the derived class. | 
|---|
| 735 | InlineCostCallAnalyzer CA(*F, Call, IndirectCallParams, TTI, | 
|---|
| 736 | GetAssumptionCache, GetBFI, GetTLI, PSI, ORE, | 
|---|
| 737 | false); | 
|---|
| 738 | if (CA.analyze().isSuccess()) { | 
|---|
| 739 | // We were able to inline the indirect call! Subtract the cost from the | 
|---|
| 740 | // threshold to get the bonus we want to apply, but don't go below zero. | 
|---|
| 741 | Cost -= std::max(a: 0, b: CA.getThreshold() - CA.getCost()); | 
|---|
| 742 | } | 
|---|
| 743 | } else | 
|---|
| 744 | // Otherwise simply add the cost for merely making the call. | 
|---|
| 745 | addCost(Inc: TTI.getInlineCallPenalty(F: CandidateCall.getCaller(), Call, | 
|---|
| 746 | DefaultCallPenalty: CallPenalty)); | 
|---|
| 747 | } | 
|---|
| 748 |  | 
|---|
| 749 | void onFinalizeSwitch(unsigned JumpTableSize, unsigned NumCaseCluster, | 
|---|
| 750 | bool DefaultDestUnreachable) override { | 
|---|
| 751 | // If suitable for a jump table, consider the cost for the table size and | 
|---|
| 752 | // branch to destination. | 
|---|
| 753 | // Maximum valid cost increased in this function. | 
|---|
| 754 | if (JumpTableSize) { | 
|---|
| 755 | // Suppose a default branch includes one compare and one conditional | 
|---|
| 756 | // branch if it's reachable. | 
|---|
| 757 | if (!DefaultDestUnreachable) | 
|---|
| 758 | addCost(Inc: 2 * InstrCost); | 
|---|
| 759 | // Suppose a jump table requires one load and one jump instruction. | 
|---|
| 760 | int64_t JTCost = | 
|---|
| 761 | static_cast<int64_t>(JumpTableSize) * InstrCost + 2 * InstrCost; | 
|---|
| 762 | addCost(Inc: JTCost); | 
|---|
| 763 | return; | 
|---|
| 764 | } | 
|---|
| 765 |  | 
|---|
| 766 | if (NumCaseCluster <= 3) { | 
|---|
| 767 | // Suppose a comparison includes one compare and one conditional branch. | 
|---|
| 768 | // We can reduce a set of instructions if the default branch is | 
|---|
| 769 | // undefined. | 
|---|
| 770 | addCost(Inc: (NumCaseCluster - DefaultDestUnreachable) * 2 * InstrCost); | 
|---|
| 771 | return; | 
|---|
| 772 | } | 
|---|
| 773 |  | 
|---|
| 774 | int64_t ExpectedNumberOfCompare = | 
|---|
| 775 | getExpectedNumberOfCompare(NumCaseCluster); | 
|---|
| 776 | int64_t SwitchCost = ExpectedNumberOfCompare * 2 * InstrCost; | 
|---|
| 777 |  | 
|---|
| 778 | addCost(Inc: SwitchCost); | 
|---|
| 779 | } | 
|---|
| 780 | void onMissedSimplification() override { addCost(Inc: InstrCost); } | 
|---|
| 781 |  | 
|---|
| 782 | void onInitializeSROAArg(AllocaInst *Arg) override { | 
|---|
| 783 | assert(Arg != nullptr && | 
|---|
| 784 | "Should not initialize SROA costs for null value."); | 
|---|
| 785 | auto SROAArgCost = TTI.getCallerAllocaCost(CB: &CandidateCall, AI: Arg); | 
|---|
| 786 | SROACostSavings += SROAArgCost; | 
|---|
| 787 | SROAArgCosts[Arg] = SROAArgCost; | 
|---|
| 788 | } | 
|---|
| 789 |  | 
|---|
| 790 | void onAggregateSROAUse(AllocaInst *SROAArg) override { | 
|---|
| 791 | auto CostIt = SROAArgCosts.find(Val: SROAArg); | 
|---|
| 792 | assert(CostIt != SROAArgCosts.end() && | 
|---|
| 793 | "expected this argument to have a cost"); | 
|---|
| 794 | CostIt->second += InstrCost; | 
|---|
| 795 | SROACostSavings += InstrCost; | 
|---|
| 796 | } | 
|---|
| 797 |  | 
|---|
| 798 | void onBlockStart(const BasicBlock *BB) override { CostAtBBStart = Cost; } | 
|---|
| 799 |  | 
|---|
| 800 | void onBlockAnalyzed(const BasicBlock *BB) override { | 
|---|
| 801 | if (CostBenefitAnalysisEnabled) { | 
|---|
| 802 | // Keep track of the static size of live but cold basic blocks.  For now, | 
|---|
| 803 | // we define a cold basic block to be one that's never executed. | 
|---|
| 804 | assert(GetBFI && "GetBFI must be available"); | 
|---|
| 805 | BlockFrequencyInfo *BFI = &(GetBFI(F)); | 
|---|
| 806 | assert(BFI && "BFI must be available"); | 
|---|
| 807 | auto ProfileCount = BFI->getBlockProfileCount(BB); | 
|---|
| 808 | if (*ProfileCount == 0) | 
|---|
| 809 | ColdSize += Cost - CostAtBBStart; | 
|---|
| 810 | } | 
|---|
| 811 |  | 
|---|
| 812 | auto *TI = BB->getTerminator(); | 
|---|
| 813 | // If we had any successors at this point, than post-inlining is likely to | 
|---|
| 814 | // have them as well. Note that we assume any basic blocks which existed | 
|---|
| 815 | // due to branches or switches which folded above will also fold after | 
|---|
| 816 | // inlining. | 
|---|
| 817 | if (SingleBB && TI->getNumSuccessors() > 1) { | 
|---|
| 818 | // Take off the bonus we applied to the threshold. | 
|---|
| 819 | Threshold -= SingleBBBonus; | 
|---|
| 820 | SingleBB = false; | 
|---|
| 821 | } | 
|---|
| 822 | } | 
|---|
| 823 |  | 
|---|
| 824 | void onInstructionAnalysisStart(const Instruction *I) override { | 
|---|
| 825 | // This function is called to store the initial cost of inlining before | 
|---|
| 826 | // the given instruction was assessed. | 
|---|
| 827 | if (!PrintInstructionComments) | 
|---|
| 828 | return; | 
|---|
| 829 | auto &CostDetail = InstructionCostDetailMap[I]; | 
|---|
| 830 | CostDetail.CostBefore = Cost; | 
|---|
| 831 | CostDetail.ThresholdBefore = Threshold; | 
|---|
| 832 | } | 
|---|
| 833 |  | 
|---|
| 834 | void onInstructionAnalysisFinish(const Instruction *I) override { | 
|---|
| 835 | // This function is called to find new values of cost and threshold after | 
|---|
| 836 | // the instruction has been assessed. | 
|---|
| 837 | if (!PrintInstructionComments) | 
|---|
| 838 | return; | 
|---|
| 839 | auto &CostDetail = InstructionCostDetailMap[I]; | 
|---|
| 840 | CostDetail.CostAfter = Cost; | 
|---|
| 841 | CostDetail.ThresholdAfter = Threshold; | 
|---|
| 842 | } | 
|---|
| 843 |  | 
|---|
| 844 | bool isCostBenefitAnalysisEnabled() { | 
|---|
| 845 | if (!PSI || !PSI->hasProfileSummary()) | 
|---|
| 846 | return false; | 
|---|
| 847 |  | 
|---|
| 848 | if (!GetBFI) | 
|---|
| 849 | return false; | 
|---|
| 850 |  | 
|---|
| 851 | if (InlineEnableCostBenefitAnalysis.getNumOccurrences()) { | 
|---|
| 852 | // Honor the explicit request from the user. | 
|---|
| 853 | if (!InlineEnableCostBenefitAnalysis) | 
|---|
| 854 | return false; | 
|---|
| 855 | } else { | 
|---|
| 856 | // Otherwise, require instrumentation profile. | 
|---|
| 857 | if (!PSI->hasInstrumentationProfile()) | 
|---|
| 858 | return false; | 
|---|
| 859 | } | 
|---|
| 860 |  | 
|---|
| 861 | auto *Caller = CandidateCall.getParent()->getParent(); | 
|---|
| 862 | if (!Caller->getEntryCount()) | 
|---|
| 863 | return false; | 
|---|
| 864 |  | 
|---|
| 865 | BlockFrequencyInfo *CallerBFI = &(GetBFI(*Caller)); | 
|---|
| 866 | if (!CallerBFI) | 
|---|
| 867 | return false; | 
|---|
| 868 |  | 
|---|
| 869 | // For now, limit to hot call site. | 
|---|
| 870 | if (!PSI->isHotCallSite(CB: CandidateCall, BFI: CallerBFI)) | 
|---|
| 871 | return false; | 
|---|
| 872 |  | 
|---|
| 873 | // Make sure we have a nonzero entry count. | 
|---|
| 874 | auto EntryCount = F.getEntryCount(); | 
|---|
| 875 | if (!EntryCount || !EntryCount->getCount()) | 
|---|
| 876 | return false; | 
|---|
| 877 |  | 
|---|
| 878 | BlockFrequencyInfo *CalleeBFI = &(GetBFI(F)); | 
|---|
| 879 | if (!CalleeBFI) | 
|---|
| 880 | return false; | 
|---|
| 881 |  | 
|---|
| 882 | return true; | 
|---|
| 883 | } | 
|---|
| 884 |  | 
|---|
| 885 | // A helper function to choose between command line override and default. | 
|---|
| 886 | unsigned getInliningCostBenefitAnalysisSavingsMultiplier() const { | 
|---|
| 887 | if (InlineSavingsMultiplier.getNumOccurrences()) | 
|---|
| 888 | return InlineSavingsMultiplier; | 
|---|
| 889 | return TTI.getInliningCostBenefitAnalysisSavingsMultiplier(); | 
|---|
| 890 | } | 
|---|
| 891 |  | 
|---|
| 892 | // A helper function to choose between command line override and default. | 
|---|
| 893 | unsigned getInliningCostBenefitAnalysisProfitableMultiplier() const { | 
|---|
| 894 | if (InlineSavingsProfitableMultiplier.getNumOccurrences()) | 
|---|
| 895 | return InlineSavingsProfitableMultiplier; | 
|---|
| 896 | return TTI.getInliningCostBenefitAnalysisProfitableMultiplier(); | 
|---|
| 897 | } | 
|---|
| 898 |  | 
|---|
| 899 | void OverrideCycleSavingsAndSizeForTesting(APInt &CycleSavings, int &Size) { | 
|---|
| 900 | if (std::optional<int> AttrCycleSavings = getStringFnAttrAsInt( | 
|---|
| 901 | CB&: CandidateCall, AttrKind: "inline-cycle-savings-for-test")) { | 
|---|
| 902 | CycleSavings = *AttrCycleSavings; | 
|---|
| 903 | } | 
|---|
| 904 |  | 
|---|
| 905 | if (std::optional<int> AttrRuntimeCost = getStringFnAttrAsInt( | 
|---|
| 906 | CB&: CandidateCall, AttrKind: "inline-runtime-cost-for-test")) { | 
|---|
| 907 | Size = *AttrRuntimeCost; | 
|---|
| 908 | } | 
|---|
| 909 | } | 
|---|
| 910 |  | 
|---|
| 911 | // Determine whether we should inline the given call site, taking into account | 
|---|
| 912 | // both the size cost and the cycle savings.  Return std::nullopt if we don't | 
|---|
| 913 | // have sufficient profiling information to determine. | 
|---|
| 914 | std::optional<bool> costBenefitAnalysis() { | 
|---|
| 915 | if (!CostBenefitAnalysisEnabled) | 
|---|
| 916 | return std::nullopt; | 
|---|
| 917 |  | 
|---|
| 918 | // buildInlinerPipeline in the pass builder sets HotCallSiteThreshold to 0 | 
|---|
| 919 | // for the prelink phase of the AutoFDO + ThinLTO build.  Honor the logic by | 
|---|
| 920 | // falling back to the cost-based metric. | 
|---|
| 921 | // TODO: Improve this hacky condition. | 
|---|
| 922 | if (Threshold == 0) | 
|---|
| 923 | return std::nullopt; | 
|---|
| 924 |  | 
|---|
| 925 | assert(GetBFI); | 
|---|
| 926 | BlockFrequencyInfo *CalleeBFI = &(GetBFI(F)); | 
|---|
| 927 | assert(CalleeBFI); | 
|---|
| 928 |  | 
|---|
| 929 | // The cycle savings expressed as the sum of InstrCost | 
|---|
| 930 | // multiplied by the estimated dynamic count of each instruction we can | 
|---|
| 931 | // avoid.  Savings come from the call site cost, such as argument setup and | 
|---|
| 932 | // the call instruction, as well as the instructions that are folded. | 
|---|
| 933 | // | 
|---|
| 934 | // We use 128-bit APInt here to avoid potential overflow.  This variable | 
|---|
| 935 | // should stay well below 10^^24 (or 2^^80) in practice.  This "worst" case | 
|---|
| 936 | // assumes that we can avoid or fold a billion instructions, each with a | 
|---|
| 937 | // profile count of 10^^15 -- roughly the number of cycles for a 24-hour | 
|---|
| 938 | // period on a 4GHz machine. | 
|---|
| 939 | APInt CycleSavings(128, 0); | 
|---|
| 940 |  | 
|---|
| 941 | for (auto &BB : F) { | 
|---|
| 942 | APInt CurrentSavings(128, 0); | 
|---|
| 943 | for (auto &I : BB) { | 
|---|
| 944 | if (BranchInst *BI = dyn_cast<BranchInst>(Val: &I)) { | 
|---|
| 945 | // Count a conditional branch as savings if it becomes unconditional. | 
|---|
| 946 | if (BI->isConditional() && | 
|---|
| 947 | getSimplifiedValue<ConstantInt>(V: BI->getCondition())) { | 
|---|
| 948 | CurrentSavings += InstrCost; | 
|---|
| 949 | } | 
|---|
| 950 | } else if (SwitchInst *SI = dyn_cast<SwitchInst>(Val: &I)) { | 
|---|
| 951 | if (getSimplifiedValue<ConstantInt>(V: SI->getCondition())) | 
|---|
| 952 | CurrentSavings += InstrCost; | 
|---|
| 953 | } else if (Value *V = dyn_cast<Value>(Val: &I)) { | 
|---|
| 954 | // Count an instruction as savings if we can fold it. | 
|---|
| 955 | if (SimplifiedValues.count(Val: V)) { | 
|---|
| 956 | CurrentSavings += InstrCost; | 
|---|
| 957 | } | 
|---|
| 958 | } | 
|---|
| 959 | } | 
|---|
| 960 |  | 
|---|
| 961 | auto ProfileCount = CalleeBFI->getBlockProfileCount(BB: &BB); | 
|---|
| 962 | CurrentSavings *= *ProfileCount; | 
|---|
| 963 | CycleSavings += CurrentSavings; | 
|---|
| 964 | } | 
|---|
| 965 |  | 
|---|
| 966 | // Compute the cycle savings per call. | 
|---|
| 967 | auto EntryProfileCount = F.getEntryCount(); | 
|---|
| 968 | assert(EntryProfileCount && EntryProfileCount->getCount()); | 
|---|
| 969 | auto EntryCount = EntryProfileCount->getCount(); | 
|---|
| 970 | CycleSavings += EntryCount / 2; | 
|---|
| 971 | CycleSavings = CycleSavings.udiv(RHS: EntryCount); | 
|---|
| 972 |  | 
|---|
| 973 | // Compute the total savings for the call site. | 
|---|
| 974 | auto *CallerBB = CandidateCall.getParent(); | 
|---|
| 975 | BlockFrequencyInfo *CallerBFI = &(GetBFI(*(CallerBB->getParent()))); | 
|---|
| 976 | CycleSavings += getCallsiteCost(TTI, Call: this->CandidateCall, DL); | 
|---|
| 977 | CycleSavings *= *CallerBFI->getBlockProfileCount(BB: CallerBB); | 
|---|
| 978 |  | 
|---|
| 979 | // Remove the cost of the cold basic blocks to model the runtime cost more | 
|---|
| 980 | // accurately. Both machine block placement and function splitting could | 
|---|
| 981 | // place cold blocks further from hot blocks. | 
|---|
| 982 | int Size = Cost - ColdSize; | 
|---|
| 983 |  | 
|---|
| 984 | // Allow tiny callees to be inlined regardless of whether they meet the | 
|---|
| 985 | // savings threshold. | 
|---|
| 986 | Size = Size > InlineSizeAllowance ? Size - InlineSizeAllowance : 1; | 
|---|
| 987 |  | 
|---|
| 988 | OverrideCycleSavingsAndSizeForTesting(CycleSavings, Size); | 
|---|
| 989 | CostBenefit.emplace(args: APInt(128, Size), args&: CycleSavings); | 
|---|
| 990 |  | 
|---|
| 991 | // Let R be the ratio of CycleSavings to Size.  We accept the inlining | 
|---|
| 992 | // opportunity if R is really high and reject if R is really low.  If R is | 
|---|
| 993 | // somewhere in the middle, we fall back to the cost-based analysis. | 
|---|
| 994 | // | 
|---|
| 995 | // Specifically, let R = CycleSavings / Size, we accept the inlining | 
|---|
| 996 | // opportunity if: | 
|---|
| 997 | // | 
|---|
| 998 | //             PSI->getOrCompHotCountThreshold() | 
|---|
| 999 | // R > ------------------------------------------------- | 
|---|
| 1000 | //     getInliningCostBenefitAnalysisSavingsMultiplier() | 
|---|
| 1001 | // | 
|---|
| 1002 | // and reject the inlining opportunity if: | 
|---|
| 1003 | // | 
|---|
| 1004 | //                PSI->getOrCompHotCountThreshold() | 
|---|
| 1005 | // R <= ---------------------------------------------------- | 
|---|
| 1006 | //      getInliningCostBenefitAnalysisProfitableMultiplier() | 
|---|
| 1007 | // | 
|---|
| 1008 | // Otherwise, we fall back to the cost-based analysis. | 
|---|
| 1009 | // | 
|---|
| 1010 | // Implementation-wise, use multiplication (CycleSavings * Multiplier, | 
|---|
| 1011 | // HotCountThreshold * Size) rather than division to avoid precision loss. | 
|---|
| 1012 | APInt Threshold(128, PSI->getOrCompHotCountThreshold()); | 
|---|
| 1013 | Threshold *= Size; | 
|---|
| 1014 |  | 
|---|
| 1015 | APInt UpperBoundCycleSavings = CycleSavings; | 
|---|
| 1016 | UpperBoundCycleSavings *= getInliningCostBenefitAnalysisSavingsMultiplier(); | 
|---|
| 1017 | if (UpperBoundCycleSavings.uge(RHS: Threshold)) | 
|---|
| 1018 | return true; | 
|---|
| 1019 |  | 
|---|
| 1020 | APInt LowerBoundCycleSavings = CycleSavings; | 
|---|
| 1021 | LowerBoundCycleSavings *= | 
|---|
| 1022 | getInliningCostBenefitAnalysisProfitableMultiplier(); | 
|---|
| 1023 | if (LowerBoundCycleSavings.ult(RHS: Threshold)) | 
|---|
| 1024 | return false; | 
|---|
| 1025 |  | 
|---|
| 1026 | // Otherwise, fall back to the cost-based analysis. | 
|---|
| 1027 | return std::nullopt; | 
|---|
| 1028 | } | 
|---|
| 1029 |  | 
|---|
| 1030 | InlineResult finalizeAnalysis() override { | 
|---|
| 1031 | // Loops generally act a lot like calls in that they act like barriers to | 
|---|
| 1032 | // movement, require a certain amount of setup, etc. So when optimising for | 
|---|
| 1033 | // size, we penalise any call sites that perform loops. We do this after all | 
|---|
| 1034 | // other costs here, so will likely only be dealing with relatively small | 
|---|
| 1035 | // functions (and hence DT and LI will hopefully be cheap). | 
|---|
| 1036 | auto *Caller = CandidateCall.getFunction(); | 
|---|
| 1037 | if (Caller->hasMinSize()) { | 
|---|
| 1038 | DominatorTree DT(F); | 
|---|
| 1039 | LoopInfo LI(DT); | 
|---|
| 1040 | int NumLoops = 0; | 
|---|
| 1041 | for (Loop *L : LI) { | 
|---|
| 1042 | // Ignore loops that will not be executed | 
|---|
| 1043 | if (DeadBlocks.count(Ptr: L->getHeader())) | 
|---|
| 1044 | continue; | 
|---|
| 1045 | NumLoops++; | 
|---|
| 1046 | } | 
|---|
| 1047 | addCost(Inc: NumLoops * InlineConstants::LoopPenalty); | 
|---|
| 1048 | } | 
|---|
| 1049 |  | 
|---|
| 1050 | // We applied the maximum possible vector bonus at the beginning. Now, | 
|---|
| 1051 | // subtract the excess bonus, if any, from the Threshold before | 
|---|
| 1052 | // comparing against Cost. | 
|---|
| 1053 | if (NumVectorInstructions <= NumInstructions / 10) | 
|---|
| 1054 | Threshold -= VectorBonus; | 
|---|
| 1055 | else if (NumVectorInstructions <= NumInstructions / 2) | 
|---|
| 1056 | Threshold -= VectorBonus / 2; | 
|---|
| 1057 |  | 
|---|
| 1058 | if (std::optional<int> AttrCost = | 
|---|
| 1059 | getStringFnAttrAsInt(CB&: CandidateCall, AttrKind: "function-inline-cost")) | 
|---|
| 1060 | Cost = *AttrCost; | 
|---|
| 1061 |  | 
|---|
| 1062 | if (std::optional<int> AttrCostMult = getStringFnAttrAsInt( | 
|---|
| 1063 | CB&: CandidateCall, | 
|---|
| 1064 | AttrKind: InlineConstants::FunctionInlineCostMultiplierAttributeName)) | 
|---|
| 1065 | Cost *= *AttrCostMult; | 
|---|
| 1066 |  | 
|---|
| 1067 | if (std::optional<int> AttrThreshold = | 
|---|
| 1068 | getStringFnAttrAsInt(CB&: CandidateCall, AttrKind: "function-inline-threshold")) | 
|---|
| 1069 | Threshold = *AttrThreshold; | 
|---|
| 1070 |  | 
|---|
| 1071 | if (auto Result = costBenefitAnalysis()) { | 
|---|
| 1072 | DecidedByCostBenefit = true; | 
|---|
| 1073 | if (*Result) | 
|---|
| 1074 | return InlineResult::success(); | 
|---|
| 1075 | else | 
|---|
| 1076 | return InlineResult::failure(Reason: "Cost over threshold."); | 
|---|
| 1077 | } | 
|---|
| 1078 |  | 
|---|
| 1079 | if (IgnoreThreshold) | 
|---|
| 1080 | return InlineResult::success(); | 
|---|
| 1081 |  | 
|---|
| 1082 | DecidedByCostThreshold = true; | 
|---|
| 1083 | return Cost < std::max(a: 1, b: Threshold) | 
|---|
| 1084 | ? InlineResult::success() | 
|---|
| 1085 | : InlineResult::failure(Reason: "Cost over threshold."); | 
|---|
| 1086 | } | 
|---|
| 1087 |  | 
|---|
| 1088 | bool shouldStop() override { | 
|---|
| 1089 | if (IgnoreThreshold || ComputeFullInlineCost) | 
|---|
| 1090 | return false; | 
|---|
| 1091 | // Bail out the moment we cross the threshold. This means we'll under-count | 
|---|
| 1092 | // the cost, but only when undercounting doesn't matter. | 
|---|
| 1093 | if (Cost < Threshold) | 
|---|
| 1094 | return false; | 
|---|
| 1095 | DecidedByCostThreshold = true; | 
|---|
| 1096 | return true; | 
|---|
| 1097 | } | 
|---|
| 1098 |  | 
|---|
| 1099 | void onLoadEliminationOpportunity() override { | 
|---|
| 1100 | LoadEliminationCost += InstrCost; | 
|---|
| 1101 | } | 
|---|
| 1102 |  | 
|---|
| 1103 | InlineResult onAnalysisStart() override { | 
|---|
| 1104 | // Perform some tweaks to the cost and threshold based on the direct | 
|---|
| 1105 | // callsite information. | 
|---|
| 1106 |  | 
|---|
| 1107 | // We want to more aggressively inline vector-dense kernels, so up the | 
|---|
| 1108 | // threshold, and we'll lower it if the % of vector instructions gets too | 
|---|
| 1109 | // low. Note that these bonuses are some what arbitrary and evolved over | 
|---|
| 1110 | // time by accident as much as because they are principled bonuses. | 
|---|
| 1111 | // | 
|---|
| 1112 | // FIXME: It would be nice to remove all such bonuses. At least it would be | 
|---|
| 1113 | // nice to base the bonus values on something more scientific. | 
|---|
| 1114 | assert(NumInstructions == 0); | 
|---|
| 1115 | assert(NumVectorInstructions == 0); | 
|---|
| 1116 |  | 
|---|
| 1117 | // Update the threshold based on callsite properties | 
|---|
| 1118 | updateThreshold(Call&: CandidateCall, Callee&: F); | 
|---|
| 1119 |  | 
|---|
| 1120 | // While Threshold depends on commandline options that can take negative | 
|---|
| 1121 | // values, we want to enforce the invariant that the computed threshold and | 
|---|
| 1122 | // bonuses are non-negative. | 
|---|
| 1123 | assert(Threshold >= 0); | 
|---|
| 1124 | assert(SingleBBBonus >= 0); | 
|---|
| 1125 | assert(VectorBonus >= 0); | 
|---|
| 1126 |  | 
|---|
| 1127 | // Speculatively apply all possible bonuses to Threshold. If cost exceeds | 
|---|
| 1128 | // this Threshold any time, and cost cannot decrease, we can stop processing | 
|---|
| 1129 | // the rest of the function body. | 
|---|
| 1130 | Threshold += (SingleBBBonus + VectorBonus); | 
|---|
| 1131 |  | 
|---|
| 1132 | // Give out bonuses for the callsite, as the instructions setting them up | 
|---|
| 1133 | // will be gone after inlining. | 
|---|
| 1134 | addCost(Inc: -getCallsiteCost(TTI, Call: this->CandidateCall, DL)); | 
|---|
| 1135 |  | 
|---|
| 1136 | // If this function uses the coldcc calling convention, prefer not to inline | 
|---|
| 1137 | // it. | 
|---|
| 1138 | if (F.getCallingConv() == CallingConv::Cold) | 
|---|
| 1139 | Cost += InlineConstants::ColdccPenalty; | 
|---|
| 1140 |  | 
|---|
| 1141 | LLVM_DEBUG(dbgs() << "      Initial cost: "<< Cost << "\n"); | 
|---|
| 1142 |  | 
|---|
| 1143 | // Check if we're done. This can happen due to bonuses and penalties. | 
|---|
| 1144 | if (Cost >= Threshold && !ComputeFullInlineCost) | 
|---|
| 1145 | return InlineResult::failure(Reason: "high cost"); | 
|---|
| 1146 |  | 
|---|
| 1147 | return InlineResult::success(); | 
|---|
| 1148 | } | 
|---|
| 1149 |  | 
|---|
| 1150 | public: | 
|---|
| 1151 | InlineCostCallAnalyzer( | 
|---|
| 1152 | Function &Callee, CallBase &Call, const InlineParams &Params, | 
|---|
| 1153 | const TargetTransformInfo &TTI, | 
|---|
| 1154 | function_ref<AssumptionCache &(Function &)> GetAssumptionCache, | 
|---|
| 1155 | function_ref<BlockFrequencyInfo &(Function &)> GetBFI = nullptr, | 
|---|
| 1156 | function_ref<const TargetLibraryInfo &(Function &)> GetTLI = nullptr, | 
|---|
| 1157 | ProfileSummaryInfo *PSI = nullptr, | 
|---|
| 1158 | OptimizationRemarkEmitter *ORE = nullptr, bool BoostIndirect = true, | 
|---|
| 1159 | bool IgnoreThreshold = false, | 
|---|
| 1160 | function_ref<EphemeralValuesCache &(Function &)> GetEphValuesCache = | 
|---|
| 1161 | nullptr) | 
|---|
| 1162 | : CallAnalyzer(Callee, Call, TTI, GetAssumptionCache, GetBFI, GetTLI, PSI, | 
|---|
| 1163 | ORE, GetEphValuesCache), | 
|---|
| 1164 | ComputeFullInlineCost(OptComputeFullInlineCost || | 
|---|
| 1165 | Params.ComputeFullInlineCost || ORE || | 
|---|
| 1166 | isCostBenefitAnalysisEnabled()), | 
|---|
| 1167 | Params(Params), Threshold(Params.DefaultThreshold), | 
|---|
| 1168 | BoostIndirectCalls(BoostIndirect), IgnoreThreshold(IgnoreThreshold), | 
|---|
| 1169 | CostBenefitAnalysisEnabled(isCostBenefitAnalysisEnabled()), | 
|---|
| 1170 | Writer(this) { | 
|---|
| 1171 | AllowRecursiveCall = *Params.AllowRecursiveCall; | 
|---|
| 1172 | } | 
|---|
| 1173 |  | 
|---|
| 1174 | /// Annotation Writer for instruction details | 
|---|
| 1175 | InlineCostAnnotationWriter Writer; | 
|---|
| 1176 |  | 
|---|
| 1177 | void dump(); | 
|---|
| 1178 |  | 
|---|
| 1179 | // Prints the same analysis as dump(), but its definition is not dependent | 
|---|
| 1180 | // on the build. | 
|---|
| 1181 | void print(raw_ostream &OS); | 
|---|
| 1182 |  | 
|---|
| 1183 | std::optional<InstructionCostDetail> getCostDetails(const Instruction *I) { | 
|---|
| 1184 | auto It = InstructionCostDetailMap.find(Val: I); | 
|---|
| 1185 | if (It != InstructionCostDetailMap.end()) | 
|---|
| 1186 | return It->second; | 
|---|
| 1187 | return std::nullopt; | 
|---|
| 1188 | } | 
|---|
| 1189 |  | 
|---|
| 1190 | virtual ~InlineCostCallAnalyzer() = default; | 
|---|
| 1191 | int getThreshold() const { return Threshold; } | 
|---|
| 1192 | int getCost() const { return Cost; } | 
|---|
| 1193 | int getStaticBonusApplied() const { return StaticBonusApplied; } | 
|---|
| 1194 | std::optional<CostBenefitPair> getCostBenefitPair() { return CostBenefit; } | 
|---|
| 1195 | bool wasDecidedByCostBenefit() const { return DecidedByCostBenefit; } | 
|---|
| 1196 | bool wasDecidedByCostThreshold() const { return DecidedByCostThreshold; } | 
|---|
| 1197 | }; | 
|---|
| 1198 |  | 
|---|
| 1199 | // Return true if CB is the sole call to local function Callee. | 
|---|
| 1200 | static bool isSoleCallToLocalFunction(const CallBase &CB, | 
|---|
| 1201 | const Function &Callee) { | 
|---|
| 1202 | return Callee.hasLocalLinkage() && Callee.hasOneLiveUse() && | 
|---|
| 1203 | &Callee == CB.getCalledFunction(); | 
|---|
| 1204 | } | 
|---|
| 1205 |  | 
|---|
| 1206 | class InlineCostFeaturesAnalyzer final : public CallAnalyzer { | 
|---|
| 1207 | private: | 
|---|
| 1208 | InlineCostFeatures Cost = {}; | 
|---|
| 1209 |  | 
|---|
| 1210 | // FIXME: These constants are taken from the heuristic-based cost visitor. | 
|---|
| 1211 | // These should be removed entirely in a later revision to avoid reliance on | 
|---|
| 1212 | // heuristics in the ML inliner. | 
|---|
| 1213 | static constexpr int JTCostMultiplier = 2; | 
|---|
| 1214 | static constexpr int CaseClusterCostMultiplier = 2; | 
|---|
| 1215 | static constexpr int SwitchDefaultDestCostMultiplier = 2; | 
|---|
| 1216 | static constexpr int SwitchCostMultiplier = 2; | 
|---|
| 1217 |  | 
|---|
| 1218 | // FIXME: These are taken from the heuristic-based cost visitor: we should | 
|---|
| 1219 | // eventually abstract these to the CallAnalyzer to avoid duplication. | 
|---|
| 1220 | unsigned SROACostSavingOpportunities = 0; | 
|---|
| 1221 | int VectorBonus = 0; | 
|---|
| 1222 | int SingleBBBonus = 0; | 
|---|
| 1223 | int Threshold = 5; | 
|---|
| 1224 |  | 
|---|
| 1225 | DenseMap<AllocaInst *, unsigned> SROACosts; | 
|---|
| 1226 |  | 
|---|
| 1227 | void increment(InlineCostFeatureIndex Feature, int64_t Delta = 1) { | 
|---|
| 1228 | Cost[static_cast<size_t>(Feature)] += Delta; | 
|---|
| 1229 | } | 
|---|
| 1230 |  | 
|---|
| 1231 | void set(InlineCostFeatureIndex Feature, int64_t Value) { | 
|---|
| 1232 | Cost[static_cast<size_t>(Feature)] = Value; | 
|---|
| 1233 | } | 
|---|
| 1234 |  | 
|---|
| 1235 | void onDisableSROA(AllocaInst *Arg) override { | 
|---|
| 1236 | auto CostIt = SROACosts.find(Val: Arg); | 
|---|
| 1237 | if (CostIt == SROACosts.end()) | 
|---|
| 1238 | return; | 
|---|
| 1239 |  | 
|---|
| 1240 | increment(Feature: InlineCostFeatureIndex::sroa_losses, Delta: CostIt->second); | 
|---|
| 1241 | SROACostSavingOpportunities -= CostIt->second; | 
|---|
| 1242 | SROACosts.erase(I: CostIt); | 
|---|
| 1243 | } | 
|---|
| 1244 |  | 
|---|
| 1245 | void onDisableLoadElimination() override { | 
|---|
| 1246 | set(Feature: InlineCostFeatureIndex::load_elimination, Value: 1); | 
|---|
| 1247 | } | 
|---|
| 1248 |  | 
|---|
| 1249 | void onCallPenalty() override { | 
|---|
| 1250 | increment(Feature: InlineCostFeatureIndex::call_penalty, Delta: CallPenalty); | 
|---|
| 1251 | } | 
|---|
| 1252 |  | 
|---|
| 1253 | void onCallArgumentSetup(const CallBase &Call) override { | 
|---|
| 1254 | increment(Feature: InlineCostFeatureIndex::call_argument_setup, | 
|---|
| 1255 | Delta: Call.arg_size() * InstrCost); | 
|---|
| 1256 | } | 
|---|
| 1257 |  | 
|---|
| 1258 | void onLoadRelativeIntrinsic() override { | 
|---|
| 1259 | increment(Feature: InlineCostFeatureIndex::load_relative_intrinsic, Delta: 3 * InstrCost); | 
|---|
| 1260 | } | 
|---|
| 1261 |  | 
|---|
| 1262 | void onLoweredCall(Function *F, CallBase &Call, | 
|---|
| 1263 | bool IsIndirectCall) override { | 
|---|
| 1264 | increment(Feature: InlineCostFeatureIndex::lowered_call_arg_setup, | 
|---|
| 1265 | Delta: Call.arg_size() * InstrCost); | 
|---|
| 1266 |  | 
|---|
| 1267 | if (IsIndirectCall) { | 
|---|
| 1268 | InlineParams IndirectCallParams = {/* DefaultThreshold*/ 0, | 
|---|
| 1269 | /*HintThreshold*/ {}, | 
|---|
| 1270 | /*ColdThreshold*/ {}, | 
|---|
| 1271 | /*OptSizeThreshold*/ {}, | 
|---|
| 1272 | /*OptMinSizeThreshold*/ {}, | 
|---|
| 1273 | /*HotCallSiteThreshold*/ {}, | 
|---|
| 1274 | /*LocallyHotCallSiteThreshold*/ {}, | 
|---|
| 1275 | /*ColdCallSiteThreshold*/ {}, | 
|---|
| 1276 | /*ComputeFullInlineCost*/ true, | 
|---|
| 1277 | /*EnableDeferral*/ true}; | 
|---|
| 1278 | IndirectCallParams.DefaultThreshold = | 
|---|
| 1279 | InlineConstants::IndirectCallThreshold; | 
|---|
| 1280 |  | 
|---|
| 1281 | InlineCostCallAnalyzer CA(*F, Call, IndirectCallParams, TTI, | 
|---|
| 1282 | GetAssumptionCache, GetBFI, GetTLI, PSI, ORE, | 
|---|
| 1283 | false, true); | 
|---|
| 1284 | if (CA.analyze().isSuccess()) { | 
|---|
| 1285 | increment(Feature: InlineCostFeatureIndex::nested_inline_cost_estimate, | 
|---|
| 1286 | Delta: CA.getCost()); | 
|---|
| 1287 | increment(Feature: InlineCostFeatureIndex::nested_inlines, Delta: 1); | 
|---|
| 1288 | } | 
|---|
| 1289 | } else { | 
|---|
| 1290 | onCallPenalty(); | 
|---|
| 1291 | } | 
|---|
| 1292 | } | 
|---|
| 1293 |  | 
|---|
| 1294 | void onFinalizeSwitch(unsigned JumpTableSize, unsigned NumCaseCluster, | 
|---|
| 1295 | bool DefaultDestUnreachable) override { | 
|---|
| 1296 | if (JumpTableSize) { | 
|---|
| 1297 | if (!DefaultDestUnreachable) | 
|---|
| 1298 | increment(Feature: InlineCostFeatureIndex::switch_default_dest_penalty, | 
|---|
| 1299 | Delta: SwitchDefaultDestCostMultiplier * InstrCost); | 
|---|
| 1300 | int64_t JTCost = static_cast<int64_t>(JumpTableSize) * InstrCost + | 
|---|
| 1301 | JTCostMultiplier * InstrCost; | 
|---|
| 1302 | increment(Feature: InlineCostFeatureIndex::jump_table_penalty, Delta: JTCost); | 
|---|
| 1303 | return; | 
|---|
| 1304 | } | 
|---|
| 1305 |  | 
|---|
| 1306 | if (NumCaseCluster <= 3) { | 
|---|
| 1307 | increment(Feature: InlineCostFeatureIndex::case_cluster_penalty, | 
|---|
| 1308 | Delta: (NumCaseCluster - DefaultDestUnreachable) * | 
|---|
| 1309 | CaseClusterCostMultiplier * InstrCost); | 
|---|
| 1310 | return; | 
|---|
| 1311 | } | 
|---|
| 1312 |  | 
|---|
| 1313 | int64_t ExpectedNumberOfCompare = | 
|---|
| 1314 | getExpectedNumberOfCompare(NumCaseCluster); | 
|---|
| 1315 |  | 
|---|
| 1316 | int64_t SwitchCost = | 
|---|
| 1317 | ExpectedNumberOfCompare * SwitchCostMultiplier * InstrCost; | 
|---|
| 1318 | increment(Feature: InlineCostFeatureIndex::switch_penalty, Delta: SwitchCost); | 
|---|
| 1319 | } | 
|---|
| 1320 |  | 
|---|
| 1321 | void onMissedSimplification() override { | 
|---|
| 1322 | increment(Feature: InlineCostFeatureIndex::unsimplified_common_instructions, | 
|---|
| 1323 | Delta: InstrCost); | 
|---|
| 1324 | } | 
|---|
| 1325 |  | 
|---|
| 1326 | void onInitializeSROAArg(AllocaInst *Arg) override { | 
|---|
| 1327 | auto SROAArgCost = TTI.getCallerAllocaCost(CB: &CandidateCall, AI: Arg); | 
|---|
| 1328 | SROACosts[Arg] = SROAArgCost; | 
|---|
| 1329 | SROACostSavingOpportunities += SROAArgCost; | 
|---|
| 1330 | } | 
|---|
| 1331 |  | 
|---|
| 1332 | void onAggregateSROAUse(AllocaInst *Arg) override { | 
|---|
| 1333 | SROACosts.find(Val: Arg)->second += InstrCost; | 
|---|
| 1334 | SROACostSavingOpportunities += InstrCost; | 
|---|
| 1335 | } | 
|---|
| 1336 |  | 
|---|
| 1337 | void onBlockAnalyzed(const BasicBlock *BB) override { | 
|---|
| 1338 | if (BB->getTerminator()->getNumSuccessors() > 1) | 
|---|
| 1339 | set(Feature: InlineCostFeatureIndex::is_multiple_blocks, Value: 1); | 
|---|
| 1340 | Threshold -= SingleBBBonus; | 
|---|
| 1341 | } | 
|---|
| 1342 |  | 
|---|
| 1343 | InlineResult finalizeAnalysis() override { | 
|---|
| 1344 | auto *Caller = CandidateCall.getFunction(); | 
|---|
| 1345 | if (Caller->hasMinSize()) { | 
|---|
| 1346 | DominatorTree DT(F); | 
|---|
| 1347 | LoopInfo LI(DT); | 
|---|
| 1348 | for (Loop *L : LI) { | 
|---|
| 1349 | // Ignore loops that will not be executed | 
|---|
| 1350 | if (DeadBlocks.count(Ptr: L->getHeader())) | 
|---|
| 1351 | continue; | 
|---|
| 1352 | increment(Feature: InlineCostFeatureIndex::num_loops, | 
|---|
| 1353 | Delta: InlineConstants::LoopPenalty); | 
|---|
| 1354 | } | 
|---|
| 1355 | } | 
|---|
| 1356 | set(Feature: InlineCostFeatureIndex::dead_blocks, Value: DeadBlocks.size()); | 
|---|
| 1357 | set(Feature: InlineCostFeatureIndex::simplified_instructions, | 
|---|
| 1358 | Value: NumInstructionsSimplified); | 
|---|
| 1359 | set(Feature: InlineCostFeatureIndex::constant_args, Value: NumConstantArgs); | 
|---|
| 1360 | set(Feature: InlineCostFeatureIndex::constant_offset_ptr_args, | 
|---|
| 1361 | Value: NumConstantOffsetPtrArgs); | 
|---|
| 1362 | set(Feature: InlineCostFeatureIndex::sroa_savings, Value: SROACostSavingOpportunities); | 
|---|
| 1363 |  | 
|---|
| 1364 | if (NumVectorInstructions <= NumInstructions / 10) | 
|---|
| 1365 | Threshold -= VectorBonus; | 
|---|
| 1366 | else if (NumVectorInstructions <= NumInstructions / 2) | 
|---|
| 1367 | Threshold -= VectorBonus / 2; | 
|---|
| 1368 |  | 
|---|
| 1369 | set(Feature: InlineCostFeatureIndex::threshold, Value: Threshold); | 
|---|
| 1370 |  | 
|---|
| 1371 | return InlineResult::success(); | 
|---|
| 1372 | } | 
|---|
| 1373 |  | 
|---|
| 1374 | bool shouldStop() override { return false; } | 
|---|
| 1375 |  | 
|---|
| 1376 | void onLoadEliminationOpportunity() override { | 
|---|
| 1377 | increment(Feature: InlineCostFeatureIndex::load_elimination, Delta: 1); | 
|---|
| 1378 | } | 
|---|
| 1379 |  | 
|---|
| 1380 | InlineResult onAnalysisStart() override { | 
|---|
| 1381 | increment(Feature: InlineCostFeatureIndex::callsite_cost, | 
|---|
| 1382 | Delta: -1 * getCallsiteCost(TTI, Call: this->CandidateCall, DL)); | 
|---|
| 1383 |  | 
|---|
| 1384 | set(Feature: InlineCostFeatureIndex::cold_cc_penalty, | 
|---|
| 1385 | Value: (F.getCallingConv() == CallingConv::Cold)); | 
|---|
| 1386 |  | 
|---|
| 1387 | set(Feature: InlineCostFeatureIndex::last_call_to_static_bonus, | 
|---|
| 1388 | Value: isSoleCallToLocalFunction(CB: CandidateCall, Callee: F)); | 
|---|
| 1389 |  | 
|---|
| 1390 | // FIXME: we shouldn't repeat this logic in both the Features and Cost | 
|---|
| 1391 | // analyzer - instead, we should abstract it to a common method in the | 
|---|
| 1392 | // CallAnalyzer | 
|---|
| 1393 | int SingleBBBonusPercent = 50; | 
|---|
| 1394 | int VectorBonusPercent = TTI.getInlinerVectorBonusPercent(); | 
|---|
| 1395 | Threshold += TTI.adjustInliningThreshold(CB: &CandidateCall); | 
|---|
| 1396 | Threshold *= TTI.getInliningThresholdMultiplier(); | 
|---|
| 1397 | SingleBBBonus = Threshold * SingleBBBonusPercent / 100; | 
|---|
| 1398 | VectorBonus = Threshold * VectorBonusPercent / 100; | 
|---|
| 1399 | Threshold += (SingleBBBonus + VectorBonus); | 
|---|
| 1400 |  | 
|---|
| 1401 | return InlineResult::success(); | 
|---|
| 1402 | } | 
|---|
| 1403 |  | 
|---|
| 1404 | public: | 
|---|
| 1405 | InlineCostFeaturesAnalyzer( | 
|---|
| 1406 | const TargetTransformInfo &TTI, | 
|---|
| 1407 | function_ref<AssumptionCache &(Function &)> &GetAssumptionCache, | 
|---|
| 1408 | function_ref<BlockFrequencyInfo &(Function &)> GetBFI, | 
|---|
| 1409 | function_ref<const TargetLibraryInfo &(Function &)> GetTLI, | 
|---|
| 1410 | ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE, Function &Callee, | 
|---|
| 1411 | CallBase &Call) | 
|---|
| 1412 | : CallAnalyzer(Callee, Call, TTI, GetAssumptionCache, GetBFI, GetTLI, | 
|---|
| 1413 | PSI) {} | 
|---|
| 1414 |  | 
|---|
| 1415 | const InlineCostFeatures &features() const { return Cost; } | 
|---|
| 1416 | }; | 
|---|
| 1417 |  | 
|---|
| 1418 | } // namespace | 
|---|
| 1419 |  | 
|---|
| 1420 | /// Test whether the given value is an Alloca-derived function argument. | 
|---|
| 1421 | bool CallAnalyzer::isAllocaDerivedArg(Value *V) { | 
|---|
| 1422 | return SROAArgValues.count(Val: V); | 
|---|
| 1423 | } | 
|---|
| 1424 |  | 
|---|
| 1425 | void CallAnalyzer::disableSROAForArg(AllocaInst *SROAArg) { | 
|---|
| 1426 | onDisableSROA(Arg: SROAArg); | 
|---|
| 1427 | EnabledSROAAllocas.erase(V: SROAArg); | 
|---|
| 1428 | disableLoadElimination(); | 
|---|
| 1429 | } | 
|---|
| 1430 |  | 
|---|
| 1431 | void InlineCostAnnotationWriter::emitInstructionAnnot( | 
|---|
| 1432 | const Instruction *I, formatted_raw_ostream &OS) { | 
|---|
| 1433 | // The cost of inlining of the given instruction is printed always. | 
|---|
| 1434 | // The threshold delta is printed only when it is non-zero. It happens | 
|---|
| 1435 | // when we decided to give a bonus at a particular instruction. | 
|---|
| 1436 | std::optional<InstructionCostDetail> Record = ICCA->getCostDetails(I); | 
|---|
| 1437 | if (!Record) | 
|---|
| 1438 | OS << "; No analysis for the instruction"; | 
|---|
| 1439 | else { | 
|---|
| 1440 | OS << "; cost before = "<< Record->CostBefore | 
|---|
| 1441 | << ", cost after = "<< Record->CostAfter | 
|---|
| 1442 | << ", threshold before = "<< Record->ThresholdBefore | 
|---|
| 1443 | << ", threshold after = "<< Record->ThresholdAfter << ", "; | 
|---|
| 1444 | OS << "cost delta = "<< Record->getCostDelta(); | 
|---|
| 1445 | if (Record->hasThresholdChanged()) | 
|---|
| 1446 | OS << ", threshold delta = "<< Record->getThresholdDelta(); | 
|---|
| 1447 | } | 
|---|
| 1448 | auto *V = ICCA->getSimplifiedValueUnchecked(V: const_cast<Instruction *>(I)); | 
|---|
| 1449 | if (V) { | 
|---|
| 1450 | OS << ", simplified to "; | 
|---|
| 1451 | V->print(O&: OS, IsForDebug: true); | 
|---|
| 1452 | if (auto *VI = dyn_cast<Instruction>(Val: V)) { | 
|---|
| 1453 | if (VI->getFunction() != I->getFunction()) | 
|---|
| 1454 | OS << " (caller instruction)"; | 
|---|
| 1455 | } else if (auto *VArg = dyn_cast<Argument>(Val: V)) { | 
|---|
| 1456 | if (VArg->getParent() != I->getFunction()) | 
|---|
| 1457 | OS << " (caller argument)"; | 
|---|
| 1458 | } | 
|---|
| 1459 | } | 
|---|
| 1460 | OS << "\n"; | 
|---|
| 1461 | } | 
|---|
| 1462 |  | 
|---|
| 1463 | /// If 'V' maps to a SROA candidate, disable SROA for it. | 
|---|
| 1464 | void CallAnalyzer::disableSROA(Value *V) { | 
|---|
| 1465 | if (auto *SROAArg = getSROAArgForValueOrNull(V)) { | 
|---|
| 1466 | disableSROAForArg(SROAArg); | 
|---|
| 1467 | } | 
|---|
| 1468 | } | 
|---|
| 1469 |  | 
|---|
| 1470 | void CallAnalyzer::disableLoadElimination() { | 
|---|
| 1471 | if (EnableLoadElimination) { | 
|---|
| 1472 | onDisableLoadElimination(); | 
|---|
| 1473 | EnableLoadElimination = false; | 
|---|
| 1474 | } | 
|---|
| 1475 | } | 
|---|
| 1476 |  | 
|---|
| 1477 | /// Accumulate a constant GEP offset into an APInt if possible. | 
|---|
| 1478 | /// | 
|---|
| 1479 | /// Returns false if unable to compute the offset for any reason. Respects any | 
|---|
| 1480 | /// simplified values known during the analysis of this callsite. | 
|---|
| 1481 | bool CallAnalyzer::accumulateGEPOffset(GEPOperator &GEP, APInt &Offset) { | 
|---|
| 1482 | unsigned IntPtrWidth = DL.getIndexTypeSizeInBits(Ty: GEP.getType()); | 
|---|
| 1483 | assert(IntPtrWidth == Offset.getBitWidth()); | 
|---|
| 1484 |  | 
|---|
| 1485 | for (gep_type_iterator GTI = gep_type_begin(GEP), GTE = gep_type_end(GEP); | 
|---|
| 1486 | GTI != GTE; ++GTI) { | 
|---|
| 1487 | ConstantInt *OpC = | 
|---|
| 1488 | getDirectOrSimplifiedValue<ConstantInt>(V: GTI.getOperand()); | 
|---|
| 1489 | if (!OpC) | 
|---|
| 1490 | return false; | 
|---|
| 1491 | if (OpC->isZero()) | 
|---|
| 1492 | continue; | 
|---|
| 1493 |  | 
|---|
| 1494 | // Handle a struct index, which adds its field offset to the pointer. | 
|---|
| 1495 | if (StructType *STy = GTI.getStructTypeOrNull()) { | 
|---|
| 1496 | unsigned ElementIdx = OpC->getZExtValue(); | 
|---|
| 1497 | const StructLayout *SL = DL.getStructLayout(Ty: STy); | 
|---|
| 1498 | Offset += APInt(IntPtrWidth, SL->getElementOffset(Idx: ElementIdx)); | 
|---|
| 1499 | continue; | 
|---|
| 1500 | } | 
|---|
| 1501 |  | 
|---|
| 1502 | APInt TypeSize(IntPtrWidth, GTI.getSequentialElementStride(DL)); | 
|---|
| 1503 | Offset += OpC->getValue().sextOrTrunc(width: IntPtrWidth) * TypeSize; | 
|---|
| 1504 | } | 
|---|
| 1505 | return true; | 
|---|
| 1506 | } | 
|---|
| 1507 |  | 
|---|
| 1508 | /// Use TTI to check whether a GEP is free. | 
|---|
| 1509 | /// | 
|---|
| 1510 | /// Respects any simplified values known during the analysis of this callsite. | 
|---|
| 1511 | bool CallAnalyzer::isGEPFree(GetElementPtrInst &GEP) { | 
|---|
| 1512 | SmallVector<Value *, 4> Operands; | 
|---|
| 1513 | Operands.push_back(Elt: GEP.getOperand(i_nocapture: 0)); | 
|---|
| 1514 | for (const Use &Op : GEP.indices()) | 
|---|
| 1515 | if (Constant *SimpleOp = getSimplifiedValue<Constant>(V: Op)) | 
|---|
| 1516 | Operands.push_back(Elt: SimpleOp); | 
|---|
| 1517 | else | 
|---|
| 1518 | Operands.push_back(Elt: Op); | 
|---|
| 1519 | return TTI.getInstructionCost(U: &GEP, Operands, | 
|---|
| 1520 | CostKind: TargetTransformInfo::TCK_SizeAndLatency) == | 
|---|
| 1521 | TargetTransformInfo::TCC_Free; | 
|---|
| 1522 | } | 
|---|
| 1523 |  | 
|---|
| 1524 | bool CallAnalyzer::visitAlloca(AllocaInst &I) { | 
|---|
| 1525 | disableSROA(V: I.getOperand(i_nocapture: 0)); | 
|---|
| 1526 |  | 
|---|
| 1527 | // Check whether inlining will turn a dynamic alloca into a static | 
|---|
| 1528 | // alloca and handle that case. | 
|---|
| 1529 | if (I.isArrayAllocation()) { | 
|---|
| 1530 | Constant *Size = getSimplifiedValue<Constant>(V: I.getArraySize()); | 
|---|
| 1531 | if (auto *AllocSize = dyn_cast_or_null<ConstantInt>(Val: Size)) { | 
|---|
| 1532 | // Sometimes a dynamic alloca could be converted into a static alloca | 
|---|
| 1533 | // after this constant prop, and become a huge static alloca on an | 
|---|
| 1534 | // unconditional CFG path. Avoid inlining if this is going to happen above | 
|---|
| 1535 | // a threshold. | 
|---|
| 1536 | // FIXME: If the threshold is removed or lowered too much, we could end up | 
|---|
| 1537 | // being too pessimistic and prevent inlining non-problematic code. This | 
|---|
| 1538 | // could result in unintended perf regressions. A better overall strategy | 
|---|
| 1539 | // is needed to track stack usage during inlining. | 
|---|
| 1540 | Type *Ty = I.getAllocatedType(); | 
|---|
| 1541 | AllocatedSize = SaturatingMultiplyAdd( | 
|---|
| 1542 | X: AllocSize->getLimitedValue(), | 
|---|
| 1543 | Y: DL.getTypeAllocSize(Ty).getKnownMinValue(), A: AllocatedSize); | 
|---|
| 1544 | if (AllocatedSize > InlineConstants::MaxSimplifiedDynamicAllocaToInline) | 
|---|
| 1545 | HasDynamicAlloca = true; | 
|---|
| 1546 | return false; | 
|---|
| 1547 | } | 
|---|
| 1548 | } | 
|---|
| 1549 |  | 
|---|
| 1550 | // Accumulate the allocated size. | 
|---|
| 1551 | if (I.isStaticAlloca()) { | 
|---|
| 1552 | Type *Ty = I.getAllocatedType(); | 
|---|
| 1553 | AllocatedSize = SaturatingAdd(X: DL.getTypeAllocSize(Ty).getKnownMinValue(), | 
|---|
| 1554 | Y: AllocatedSize); | 
|---|
| 1555 | } | 
|---|
| 1556 |  | 
|---|
| 1557 | // FIXME: This is overly conservative. Dynamic allocas are inefficient for | 
|---|
| 1558 | // a variety of reasons, and so we would like to not inline them into | 
|---|
| 1559 | // functions which don't currently have a dynamic alloca. This simply | 
|---|
| 1560 | // disables inlining altogether in the presence of a dynamic alloca. | 
|---|
| 1561 | if (!I.isStaticAlloca()) | 
|---|
| 1562 | HasDynamicAlloca = true; | 
|---|
| 1563 |  | 
|---|
| 1564 | return false; | 
|---|
| 1565 | } | 
|---|
| 1566 |  | 
|---|
| 1567 | bool CallAnalyzer::visitPHI(PHINode &I) { | 
|---|
| 1568 | // FIXME: We need to propagate SROA *disabling* through phi nodes, even | 
|---|
| 1569 | // though we don't want to propagate it's bonuses. The idea is to disable | 
|---|
| 1570 | // SROA if it *might* be used in an inappropriate manner. | 
|---|
| 1571 |  | 
|---|
| 1572 | // Phi nodes are always zero-cost. | 
|---|
| 1573 | // FIXME: Pointer sizes may differ between different address spaces, so do we | 
|---|
| 1574 | // need to use correct address space in the call to getPointerSizeInBits here? | 
|---|
| 1575 | // Or could we skip the getPointerSizeInBits call completely? As far as I can | 
|---|
| 1576 | // see the ZeroOffset is used as a dummy value, so we can probably use any | 
|---|
| 1577 | // bit width for the ZeroOffset? | 
|---|
| 1578 | APInt ZeroOffset = APInt::getZero(numBits: DL.getPointerSizeInBits(AS: 0)); | 
|---|
| 1579 | bool CheckSROA = I.getType()->isPointerTy(); | 
|---|
| 1580 |  | 
|---|
| 1581 | // Track the constant or pointer with constant offset we've seen so far. | 
|---|
| 1582 | Constant *FirstC = nullptr; | 
|---|
| 1583 | std::pair<Value *, APInt> FirstBaseAndOffset = {nullptr, ZeroOffset}; | 
|---|
| 1584 | Value *FirstV = nullptr; | 
|---|
| 1585 |  | 
|---|
| 1586 | for (unsigned i = 0, e = I.getNumIncomingValues(); i != e; ++i) { | 
|---|
| 1587 | BasicBlock *Pred = I.getIncomingBlock(i); | 
|---|
| 1588 | // If the incoming block is dead, skip the incoming block. | 
|---|
| 1589 | if (DeadBlocks.count(Ptr: Pred)) | 
|---|
| 1590 | continue; | 
|---|
| 1591 | // If the parent block of phi is not the known successor of the incoming | 
|---|
| 1592 | // block, skip the incoming block. | 
|---|
| 1593 | BasicBlock *KnownSuccessor = KnownSuccessors[Pred]; | 
|---|
| 1594 | if (KnownSuccessor && KnownSuccessor != I.getParent()) | 
|---|
| 1595 | continue; | 
|---|
| 1596 |  | 
|---|
| 1597 | Value *V = I.getIncomingValue(i); | 
|---|
| 1598 | // If the incoming value is this phi itself, skip the incoming value. | 
|---|
| 1599 | if (&I == V) | 
|---|
| 1600 | continue; | 
|---|
| 1601 |  | 
|---|
| 1602 | Constant *C = getDirectOrSimplifiedValue<Constant>(V); | 
|---|
| 1603 |  | 
|---|
| 1604 | std::pair<Value *, APInt> BaseAndOffset = {nullptr, ZeroOffset}; | 
|---|
| 1605 | if (!C && CheckSROA) | 
|---|
| 1606 | BaseAndOffset = ConstantOffsetPtrs.lookup(Val: V); | 
|---|
| 1607 |  | 
|---|
| 1608 | if (!C && !BaseAndOffset.first) | 
|---|
| 1609 | // The incoming value is neither a constant nor a pointer with constant | 
|---|
| 1610 | // offset, exit early. | 
|---|
| 1611 | return true; | 
|---|
| 1612 |  | 
|---|
| 1613 | if (FirstC) { | 
|---|
| 1614 | if (FirstC == C) | 
|---|
| 1615 | // If we've seen a constant incoming value before and it is the same | 
|---|
| 1616 | // constant we see this time, continue checking the next incoming value. | 
|---|
| 1617 | continue; | 
|---|
| 1618 | // Otherwise early exit because we either see a different constant or saw | 
|---|
| 1619 | // a constant before but we have a pointer with constant offset this time. | 
|---|
| 1620 | return true; | 
|---|
| 1621 | } | 
|---|
| 1622 |  | 
|---|
| 1623 | if (FirstV) { | 
|---|
| 1624 | // The same logic as above, but check pointer with constant offset here. | 
|---|
| 1625 | if (FirstBaseAndOffset == BaseAndOffset) | 
|---|
| 1626 | continue; | 
|---|
| 1627 | return true; | 
|---|
| 1628 | } | 
|---|
| 1629 |  | 
|---|
| 1630 | if (C) { | 
|---|
| 1631 | // This is the 1st time we've seen a constant, record it. | 
|---|
| 1632 | FirstC = C; | 
|---|
| 1633 | continue; | 
|---|
| 1634 | } | 
|---|
| 1635 |  | 
|---|
| 1636 | // The remaining case is that this is the 1st time we've seen a pointer with | 
|---|
| 1637 | // constant offset, record it. | 
|---|
| 1638 | FirstV = V; | 
|---|
| 1639 | FirstBaseAndOffset = BaseAndOffset; | 
|---|
| 1640 | } | 
|---|
| 1641 |  | 
|---|
| 1642 | // Check if we can map phi to a constant. | 
|---|
| 1643 | if (FirstC) { | 
|---|
| 1644 | SimplifiedValues[&I] = FirstC; | 
|---|
| 1645 | return true; | 
|---|
| 1646 | } | 
|---|
| 1647 |  | 
|---|
| 1648 | // Check if we can map phi to a pointer with constant offset. | 
|---|
| 1649 | if (FirstBaseAndOffset.first) { | 
|---|
| 1650 | ConstantOffsetPtrs[&I] = FirstBaseAndOffset; | 
|---|
| 1651 |  | 
|---|
| 1652 | if (auto *SROAArg = getSROAArgForValueOrNull(V: FirstV)) | 
|---|
| 1653 | SROAArgValues[&I] = SROAArg; | 
|---|
| 1654 | } | 
|---|
| 1655 |  | 
|---|
| 1656 | return true; | 
|---|
| 1657 | } | 
|---|
| 1658 |  | 
|---|
| 1659 | /// Check we can fold GEPs of constant-offset call site argument pointers. | 
|---|
| 1660 | /// This requires target data and inbounds GEPs. | 
|---|
| 1661 | /// | 
|---|
| 1662 | /// \return true if the specified GEP can be folded. | 
|---|
| 1663 | bool CallAnalyzer::canFoldInboundsGEP(GetElementPtrInst &I) { | 
|---|
| 1664 | // Check if we have a base + offset for the pointer. | 
|---|
| 1665 | std::pair<Value *, APInt> BaseAndOffset = | 
|---|
| 1666 | ConstantOffsetPtrs.lookup(Val: I.getPointerOperand()); | 
|---|
| 1667 | if (!BaseAndOffset.first) | 
|---|
| 1668 | return false; | 
|---|
| 1669 |  | 
|---|
| 1670 | // Check if the offset of this GEP is constant, and if so accumulate it | 
|---|
| 1671 | // into Offset. | 
|---|
| 1672 | if (!accumulateGEPOffset(GEP&: cast<GEPOperator>(Val&: I), Offset&: BaseAndOffset.second)) | 
|---|
| 1673 | return false; | 
|---|
| 1674 |  | 
|---|
| 1675 | // Add the result as a new mapping to Base + Offset. | 
|---|
| 1676 | ConstantOffsetPtrs[&I] = BaseAndOffset; | 
|---|
| 1677 |  | 
|---|
| 1678 | return true; | 
|---|
| 1679 | } | 
|---|
| 1680 |  | 
|---|
| 1681 | bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) { | 
|---|
| 1682 | auto *SROAArg = getSROAArgForValueOrNull(V: I.getPointerOperand()); | 
|---|
| 1683 |  | 
|---|
| 1684 | // Lambda to check whether a GEP's indices are all constant. | 
|---|
| 1685 | auto IsGEPOffsetConstant = [&](GetElementPtrInst &GEP) { | 
|---|
| 1686 | for (const Use &Op : GEP.indices()) | 
|---|
| 1687 | if (!getDirectOrSimplifiedValue<Constant>(V: Op)) | 
|---|
| 1688 | return false; | 
|---|
| 1689 | return true; | 
|---|
| 1690 | }; | 
|---|
| 1691 |  | 
|---|
| 1692 | if (!DisableGEPConstOperand) | 
|---|
| 1693 | if (simplifyInstruction(I)) | 
|---|
| 1694 | return true; | 
|---|
| 1695 |  | 
|---|
| 1696 | if ((I.isInBounds() && canFoldInboundsGEP(I)) || IsGEPOffsetConstant(I)) { | 
|---|
| 1697 | if (SROAArg) | 
|---|
| 1698 | SROAArgValues[&I] = SROAArg; | 
|---|
| 1699 |  | 
|---|
| 1700 | // Constant GEPs are modeled as free. | 
|---|
| 1701 | return true; | 
|---|
| 1702 | } | 
|---|
| 1703 |  | 
|---|
| 1704 | // Variable GEPs will require math and will disable SROA. | 
|---|
| 1705 | if (SROAArg) | 
|---|
| 1706 | disableSROAForArg(SROAArg); | 
|---|
| 1707 | return isGEPFree(GEP&: I); | 
|---|
| 1708 | } | 
|---|
| 1709 |  | 
|---|
| 1710 | // Simplify \p Cmp if RHS is const and we can ValueTrack LHS. | 
|---|
| 1711 | // This handles the case only when the Cmp instruction is guarding a recursive | 
|---|
| 1712 | // call that will cause the Cmp to fail/succeed for the recursive call. | 
|---|
| 1713 | bool CallAnalyzer::simplifyCmpInstForRecCall(CmpInst &Cmp) { | 
|---|
| 1714 | // Bail out if LHS is not a function argument or RHS is NOT const: | 
|---|
| 1715 | if (!isa<Argument>(Val: Cmp.getOperand(i_nocapture: 0)) || !isa<Constant>(Val: Cmp.getOperand(i_nocapture: 1))) | 
|---|
| 1716 | return false; | 
|---|
| 1717 | auto *CmpOp = Cmp.getOperand(i_nocapture: 0); | 
|---|
| 1718 | // Make sure that the callsite is recursive: | 
|---|
| 1719 | if (CandidateCall.getCaller() != &F) | 
|---|
| 1720 | return false; | 
|---|
| 1721 | // Only handle the case when the callsite has a single predecessor: | 
|---|
| 1722 | auto *CallBB = CandidateCall.getParent(); | 
|---|
| 1723 | auto *Predecessor = CallBB->getSinglePredecessor(); | 
|---|
| 1724 | if (!Predecessor) | 
|---|
| 1725 | return false; | 
|---|
| 1726 | // Check if the callsite is guarded by the same Cmp instruction: | 
|---|
| 1727 | auto *Br = dyn_cast<BranchInst>(Val: Predecessor->getTerminator()); | 
|---|
| 1728 | if (!Br || Br->isUnconditional() || Br->getCondition() != &Cmp) | 
|---|
| 1729 | return false; | 
|---|
| 1730 |  | 
|---|
| 1731 | // Check if there is any arg of the recursive callsite is affecting the cmp | 
|---|
| 1732 | // instr: | 
|---|
| 1733 | bool ArgFound = false; | 
|---|
| 1734 | Value *FuncArg = nullptr, *CallArg = nullptr; | 
|---|
| 1735 | for (unsigned ArgNum = 0; | 
|---|
| 1736 | ArgNum < F.arg_size() && ArgNum < CandidateCall.arg_size(); ArgNum++) { | 
|---|
| 1737 | FuncArg = F.getArg(i: ArgNum); | 
|---|
| 1738 | CallArg = CandidateCall.getArgOperand(i: ArgNum); | 
|---|
| 1739 | if (FuncArg == CmpOp && CallArg != CmpOp) { | 
|---|
| 1740 | ArgFound = true; | 
|---|
| 1741 | break; | 
|---|
| 1742 | } | 
|---|
| 1743 | } | 
|---|
| 1744 | if (!ArgFound) | 
|---|
| 1745 | return false; | 
|---|
| 1746 |  | 
|---|
| 1747 | // Now we have a recursive call that is guarded by a cmp instruction. | 
|---|
| 1748 | // Check if this cmp can be simplified: | 
|---|
| 1749 | SimplifyQuery SQ(DL, dyn_cast<Instruction>(Val: CallArg)); | 
|---|
| 1750 | CondContext CC(&Cmp); | 
|---|
| 1751 | CC.Invert = (CallBB != Br->getSuccessor(i: 0)); | 
|---|
| 1752 | SQ.CC = &CC; | 
|---|
| 1753 | CC.AffectedValues.insert(Ptr: FuncArg); | 
|---|
| 1754 | Value *SimplifiedInstruction = llvm::simplifyInstructionWithOperands( | 
|---|
| 1755 | I: cast<CmpInst>(Val: &Cmp), NewOps: {CallArg, Cmp.getOperand(i_nocapture: 1)}, Q: SQ); | 
|---|
| 1756 | if (auto *ConstVal = dyn_cast_or_null<ConstantInt>(Val: SimplifiedInstruction)) { | 
|---|
| 1757 | // Make sure that the BB of the recursive call is NOT the true successor | 
|---|
| 1758 | // of the icmp. In other words, make sure that the recursion depth is 1. | 
|---|
| 1759 | if ((ConstVal->isOne() && CC.Invert) || | 
|---|
| 1760 | (ConstVal->isZero() && !CC.Invert)) { | 
|---|
| 1761 | SimplifiedValues[&Cmp] = ConstVal; | 
|---|
| 1762 | return true; | 
|---|
| 1763 | } | 
|---|
| 1764 | } | 
|---|
| 1765 | return false; | 
|---|
| 1766 | } | 
|---|
| 1767 |  | 
|---|
| 1768 | /// Simplify \p I if its operands are constants and update SimplifiedValues. | 
|---|
| 1769 | bool CallAnalyzer::simplifyInstruction(Instruction &I) { | 
|---|
| 1770 | SmallVector<Constant *> COps; | 
|---|
| 1771 | for (Value *Op : I.operands()) { | 
|---|
| 1772 | Constant *COp = getDirectOrSimplifiedValue<Constant>(V: Op); | 
|---|
| 1773 | if (!COp) | 
|---|
| 1774 | return false; | 
|---|
| 1775 | COps.push_back(Elt: COp); | 
|---|
| 1776 | } | 
|---|
| 1777 | auto *C = ConstantFoldInstOperands(I: &I, Ops: COps, DL); | 
|---|
| 1778 | if (!C) | 
|---|
| 1779 | return false; | 
|---|
| 1780 | SimplifiedValues[&I] = C; | 
|---|
| 1781 | return true; | 
|---|
| 1782 | } | 
|---|
| 1783 |  | 
|---|
| 1784 | /// Try to simplify a call to llvm.is.constant. | 
|---|
| 1785 | /// | 
|---|
| 1786 | /// Duplicate the argument checking from CallAnalyzer::simplifyCallSite since | 
|---|
| 1787 | /// we expect calls of this specific intrinsic to be infrequent. | 
|---|
| 1788 | /// | 
|---|
| 1789 | /// FIXME: Given that we know CB's parent (F) caller | 
|---|
| 1790 | /// (CandidateCall->getParent()->getParent()), we might be able to determine | 
|---|
| 1791 | /// whether inlining F into F's caller would change how the call to | 
|---|
| 1792 | /// llvm.is.constant would evaluate. | 
|---|
| 1793 | bool CallAnalyzer::simplifyIntrinsicCallIsConstant(CallBase &CB) { | 
|---|
| 1794 | Value *Arg = CB.getArgOperand(i: 0); | 
|---|
| 1795 | auto *C = getDirectOrSimplifiedValue<Constant>(V: Arg); | 
|---|
| 1796 |  | 
|---|
| 1797 | Type *RT = CB.getFunctionType()->getReturnType(); | 
|---|
| 1798 | SimplifiedValues[&CB] = ConstantInt::get(Ty: RT, V: C ? 1 : 0); | 
|---|
| 1799 | return true; | 
|---|
| 1800 | } | 
|---|
| 1801 |  | 
|---|
| 1802 | bool CallAnalyzer::simplifyIntrinsicCallObjectSize(CallBase &CB) { | 
|---|
| 1803 | // As per the langref, "The fourth argument to llvm.objectsize determines if | 
|---|
| 1804 | // the value should be evaluated at runtime." | 
|---|
| 1805 | if (cast<ConstantInt>(Val: CB.getArgOperand(i: 3))->isOne()) | 
|---|
| 1806 | return false; | 
|---|
| 1807 |  | 
|---|
| 1808 | Value *V = lowerObjectSizeCall(ObjectSize: &cast<IntrinsicInst>(Val&: CB), DL, TLI: nullptr, | 
|---|
| 1809 | /*MustSucceed=*/true); | 
|---|
| 1810 | Constant *C = dyn_cast_or_null<Constant>(Val: V); | 
|---|
| 1811 | if (C) | 
|---|
| 1812 | SimplifiedValues[&CB] = C; | 
|---|
| 1813 | return C; | 
|---|
| 1814 | } | 
|---|
| 1815 |  | 
|---|
| 1816 | bool CallAnalyzer::visitBitCast(BitCastInst &I) { | 
|---|
| 1817 | // Propagate constants through bitcasts. | 
|---|
| 1818 | if (simplifyInstruction(I)) | 
|---|
| 1819 | return true; | 
|---|
| 1820 |  | 
|---|
| 1821 | // Track base/offsets through casts | 
|---|
| 1822 | std::pair<Value *, APInt> BaseAndOffset = | 
|---|
| 1823 | ConstantOffsetPtrs.lookup(Val: I.getOperand(i_nocapture: 0)); | 
|---|
| 1824 | // Casts don't change the offset, just wrap it up. | 
|---|
| 1825 | if (BaseAndOffset.first) | 
|---|
| 1826 | ConstantOffsetPtrs[&I] = BaseAndOffset; | 
|---|
| 1827 |  | 
|---|
| 1828 | // Also look for SROA candidates here. | 
|---|
| 1829 | if (auto *SROAArg = getSROAArgForValueOrNull(V: I.getOperand(i_nocapture: 0))) | 
|---|
| 1830 | SROAArgValues[&I] = SROAArg; | 
|---|
| 1831 |  | 
|---|
| 1832 | // Bitcasts are always zero cost. | 
|---|
| 1833 | return true; | 
|---|
| 1834 | } | 
|---|
| 1835 |  | 
|---|
| 1836 | bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) { | 
|---|
| 1837 | // Propagate constants through ptrtoint. | 
|---|
| 1838 | if (simplifyInstruction(I)) | 
|---|
| 1839 | return true; | 
|---|
| 1840 |  | 
|---|
| 1841 | // Track base/offset pairs when converted to a plain integer provided the | 
|---|
| 1842 | // integer is large enough to represent the pointer. | 
|---|
| 1843 | unsigned IntegerSize = I.getType()->getScalarSizeInBits(); | 
|---|
| 1844 | unsigned AS = I.getOperand(i_nocapture: 0)->getType()->getPointerAddressSpace(); | 
|---|
| 1845 | if (IntegerSize == DL.getPointerSizeInBits(AS)) { | 
|---|
| 1846 | std::pair<Value *, APInt> BaseAndOffset = | 
|---|
| 1847 | ConstantOffsetPtrs.lookup(Val: I.getOperand(i_nocapture: 0)); | 
|---|
| 1848 | if (BaseAndOffset.first) | 
|---|
| 1849 | ConstantOffsetPtrs[&I] = BaseAndOffset; | 
|---|
| 1850 | } | 
|---|
| 1851 |  | 
|---|
| 1852 | // This is really weird. Technically, ptrtoint will disable SROA. However, | 
|---|
| 1853 | // unless that ptrtoint is *used* somewhere in the live basic blocks after | 
|---|
| 1854 | // inlining, it will be nuked, and SROA should proceed. All of the uses which | 
|---|
| 1855 | // would block SROA would also block SROA if applied directly to a pointer, | 
|---|
| 1856 | // and so we can just add the integer in here. The only places where SROA is | 
|---|
| 1857 | // preserved either cannot fire on an integer, or won't in-and-of themselves | 
|---|
| 1858 | // disable SROA (ext) w/o some later use that we would see and disable. | 
|---|
| 1859 | if (auto *SROAArg = getSROAArgForValueOrNull(V: I.getOperand(i_nocapture: 0))) | 
|---|
| 1860 | SROAArgValues[&I] = SROAArg; | 
|---|
| 1861 |  | 
|---|
| 1862 | return TTI.getInstructionCost(U: &I, CostKind: TargetTransformInfo::TCK_SizeAndLatency) == | 
|---|
| 1863 | TargetTransformInfo::TCC_Free; | 
|---|
| 1864 | } | 
|---|
| 1865 |  | 
|---|
| 1866 | bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) { | 
|---|
| 1867 | // Propagate constants through ptrtoint. | 
|---|
| 1868 | if (simplifyInstruction(I)) | 
|---|
| 1869 | return true; | 
|---|
| 1870 |  | 
|---|
| 1871 | // Track base/offset pairs when round-tripped through a pointer without | 
|---|
| 1872 | // modifications provided the integer is not too large. | 
|---|
| 1873 | Value *Op = I.getOperand(i_nocapture: 0); | 
|---|
| 1874 | unsigned IntegerSize = Op->getType()->getScalarSizeInBits(); | 
|---|
| 1875 | if (IntegerSize <= DL.getPointerTypeSizeInBits(I.getType())) { | 
|---|
| 1876 | std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Val: Op); | 
|---|
| 1877 | if (BaseAndOffset.first) | 
|---|
| 1878 | ConstantOffsetPtrs[&I] = BaseAndOffset; | 
|---|
| 1879 | } | 
|---|
| 1880 |  | 
|---|
| 1881 | // "Propagate" SROA here in the same manner as we do for ptrtoint above. | 
|---|
| 1882 | if (auto *SROAArg = getSROAArgForValueOrNull(V: Op)) | 
|---|
| 1883 | SROAArgValues[&I] = SROAArg; | 
|---|
| 1884 |  | 
|---|
| 1885 | return TTI.getInstructionCost(U: &I, CostKind: TargetTransformInfo::TCK_SizeAndLatency) == | 
|---|
| 1886 | TargetTransformInfo::TCC_Free; | 
|---|
| 1887 | } | 
|---|
| 1888 |  | 
|---|
| 1889 | bool CallAnalyzer::visitCastInst(CastInst &I) { | 
|---|
| 1890 | // Propagate constants through casts. | 
|---|
| 1891 | if (simplifyInstruction(I)) | 
|---|
| 1892 | return true; | 
|---|
| 1893 |  | 
|---|
| 1894 | // Disable SROA in the face of arbitrary casts we don't explicitly list | 
|---|
| 1895 | // elsewhere. | 
|---|
| 1896 | disableSROA(V: I.getOperand(i_nocapture: 0)); | 
|---|
| 1897 |  | 
|---|
| 1898 | // If this is a floating-point cast, and the target says this operation | 
|---|
| 1899 | // is expensive, this may eventually become a library call. Treat the cost | 
|---|
| 1900 | // as such. | 
|---|
| 1901 | switch (I.getOpcode()) { | 
|---|
| 1902 | case Instruction::FPTrunc: | 
|---|
| 1903 | case Instruction::FPExt: | 
|---|
| 1904 | case Instruction::UIToFP: | 
|---|
| 1905 | case Instruction::SIToFP: | 
|---|
| 1906 | case Instruction::FPToUI: | 
|---|
| 1907 | case Instruction::FPToSI: | 
|---|
| 1908 | if (TTI.getFPOpCost(Ty: I.getType()) == TargetTransformInfo::TCC_Expensive) | 
|---|
| 1909 | onCallPenalty(); | 
|---|
| 1910 | break; | 
|---|
| 1911 | default: | 
|---|
| 1912 | break; | 
|---|
| 1913 | } | 
|---|
| 1914 |  | 
|---|
| 1915 | return TTI.getInstructionCost(U: &I, CostKind: TargetTransformInfo::TCK_SizeAndLatency) == | 
|---|
| 1916 | TargetTransformInfo::TCC_Free; | 
|---|
| 1917 | } | 
|---|
| 1918 |  | 
|---|
| 1919 | bool CallAnalyzer::paramHasAttr(Argument *A, Attribute::AttrKind Attr) { | 
|---|
| 1920 | return CandidateCall.paramHasAttr(ArgNo: A->getArgNo(), Kind: Attr); | 
|---|
| 1921 | } | 
|---|
| 1922 |  | 
|---|
| 1923 | bool CallAnalyzer::isKnownNonNullInCallee(Value *V) { | 
|---|
| 1924 | // Does the *call site* have the NonNull attribute set on an argument?  We | 
|---|
| 1925 | // use the attribute on the call site to memoize any analysis done in the | 
|---|
| 1926 | // caller. This will also trip if the callee function has a non-null | 
|---|
| 1927 | // parameter attribute, but that's a less interesting case because hopefully | 
|---|
| 1928 | // the callee would already have been simplified based on that. | 
|---|
| 1929 | if (Argument *A = dyn_cast<Argument>(Val: V)) | 
|---|
| 1930 | if (paramHasAttr(A, Attr: Attribute::NonNull)) | 
|---|
| 1931 | return true; | 
|---|
| 1932 |  | 
|---|
| 1933 | // Is this an alloca in the caller?  This is distinct from the attribute case | 
|---|
| 1934 | // above because attributes aren't updated within the inliner itself and we | 
|---|
| 1935 | // always want to catch the alloca derived case. | 
|---|
| 1936 | if (isAllocaDerivedArg(V)) | 
|---|
| 1937 | // We can actually predict the result of comparisons between an | 
|---|
| 1938 | // alloca-derived value and null. Note that this fires regardless of | 
|---|
| 1939 | // SROA firing. | 
|---|
| 1940 | return true; | 
|---|
| 1941 |  | 
|---|
| 1942 | return false; | 
|---|
| 1943 | } | 
|---|
| 1944 |  | 
|---|
| 1945 | bool CallAnalyzer::allowSizeGrowth(CallBase &Call) { | 
|---|
| 1946 | // If the normal destination of the invoke or the parent block of the call | 
|---|
| 1947 | // site is unreachable-terminated, there is little point in inlining this | 
|---|
| 1948 | // unless there is literally zero cost. | 
|---|
| 1949 | // FIXME: Note that it is possible that an unreachable-terminated block has a | 
|---|
| 1950 | // hot entry. For example, in below scenario inlining hot_call_X() may be | 
|---|
| 1951 | // beneficial : | 
|---|
| 1952 | // main() { | 
|---|
| 1953 | //   hot_call_1(); | 
|---|
| 1954 | //   ... | 
|---|
| 1955 | //   hot_call_N() | 
|---|
| 1956 | //   exit(0); | 
|---|
| 1957 | // } | 
|---|
| 1958 | // For now, we are not handling this corner case here as it is rare in real | 
|---|
| 1959 | // code. In future, we should elaborate this based on BPI and BFI in more | 
|---|
| 1960 | // general threshold adjusting heuristics in updateThreshold(). | 
|---|
| 1961 | if (InvokeInst *II = dyn_cast<InvokeInst>(Val: &Call)) { | 
|---|
| 1962 | if (isa<UnreachableInst>(Val: II->getNormalDest()->getTerminator())) | 
|---|
| 1963 | return false; | 
|---|
| 1964 | } else if (isa<UnreachableInst>(Val: Call.getParent()->getTerminator())) | 
|---|
| 1965 | return false; | 
|---|
| 1966 |  | 
|---|
| 1967 | return true; | 
|---|
| 1968 | } | 
|---|
| 1969 |  | 
|---|
| 1970 | bool InlineCostCallAnalyzer::isColdCallSite(CallBase &Call, | 
|---|
| 1971 | BlockFrequencyInfo *CallerBFI) { | 
|---|
| 1972 | // If global profile summary is available, then callsite's coldness is | 
|---|
| 1973 | // determined based on that. | 
|---|
| 1974 | if (PSI && PSI->hasProfileSummary()) | 
|---|
| 1975 | return PSI->isColdCallSite(CB: Call, BFI: CallerBFI); | 
|---|
| 1976 |  | 
|---|
| 1977 | // Otherwise we need BFI to be available. | 
|---|
| 1978 | if (!CallerBFI) | 
|---|
| 1979 | return false; | 
|---|
| 1980 |  | 
|---|
| 1981 | // Determine if the callsite is cold relative to caller's entry. We could | 
|---|
| 1982 | // potentially cache the computation of scaled entry frequency, but the added | 
|---|
| 1983 | // complexity is not worth it unless this scaling shows up high in the | 
|---|
| 1984 | // profiles. | 
|---|
| 1985 | const BranchProbability ColdProb(ColdCallSiteRelFreq, 100); | 
|---|
| 1986 | auto CallSiteBB = Call.getParent(); | 
|---|
| 1987 | auto CallSiteFreq = CallerBFI->getBlockFreq(BB: CallSiteBB); | 
|---|
| 1988 | auto CallerEntryFreq = | 
|---|
| 1989 | CallerBFI->getBlockFreq(BB: &(Call.getCaller()->getEntryBlock())); | 
|---|
| 1990 | return CallSiteFreq < CallerEntryFreq * ColdProb; | 
|---|
| 1991 | } | 
|---|
| 1992 |  | 
|---|
| 1993 | std::optional<int> | 
|---|
| 1994 | InlineCostCallAnalyzer::getHotCallSiteThreshold(CallBase &Call, | 
|---|
| 1995 | BlockFrequencyInfo *CallerBFI) { | 
|---|
| 1996 |  | 
|---|
| 1997 | // If global profile summary is available, then callsite's hotness is | 
|---|
| 1998 | // determined based on that. | 
|---|
| 1999 | if (PSI && PSI->hasProfileSummary() && PSI->isHotCallSite(CB: Call, BFI: CallerBFI)) | 
|---|
| 2000 | return Params.HotCallSiteThreshold; | 
|---|
| 2001 |  | 
|---|
| 2002 | // Otherwise we need BFI to be available and to have a locally hot callsite | 
|---|
| 2003 | // threshold. | 
|---|
| 2004 | if (!CallerBFI || !Params.LocallyHotCallSiteThreshold) | 
|---|
| 2005 | return std::nullopt; | 
|---|
| 2006 |  | 
|---|
| 2007 | // Determine if the callsite is hot relative to caller's entry. We could | 
|---|
| 2008 | // potentially cache the computation of scaled entry frequency, but the added | 
|---|
| 2009 | // complexity is not worth it unless this scaling shows up high in the | 
|---|
| 2010 | // profiles. | 
|---|
| 2011 | const BasicBlock *CallSiteBB = Call.getParent(); | 
|---|
| 2012 | BlockFrequency CallSiteFreq = CallerBFI->getBlockFreq(BB: CallSiteBB); | 
|---|
| 2013 | BlockFrequency CallerEntryFreq = CallerBFI->getEntryFreq(); | 
|---|
| 2014 | std::optional<BlockFrequency> Limit = CallerEntryFreq.mul(Factor: HotCallSiteRelFreq); | 
|---|
| 2015 | if (Limit && CallSiteFreq >= *Limit) | 
|---|
| 2016 | return Params.LocallyHotCallSiteThreshold; | 
|---|
| 2017 |  | 
|---|
| 2018 | // Otherwise treat it normally. | 
|---|
| 2019 | return std::nullopt; | 
|---|
| 2020 | } | 
|---|
| 2021 |  | 
|---|
| 2022 | void InlineCostCallAnalyzer::updateThreshold(CallBase &Call, Function &Callee) { | 
|---|
| 2023 | // If no size growth is allowed for this inlining, set Threshold to 0. | 
|---|
| 2024 | if (!allowSizeGrowth(Call)) { | 
|---|
| 2025 | Threshold = 0; | 
|---|
| 2026 | return; | 
|---|
| 2027 | } | 
|---|
| 2028 |  | 
|---|
| 2029 | Function *Caller = Call.getCaller(); | 
|---|
| 2030 |  | 
|---|
| 2031 | // return min(A, B) if B is valid. | 
|---|
| 2032 | auto MinIfValid = [](int A, std::optional<int> B) { | 
|---|
| 2033 | return B ? std::min(a: A, b: *B) : A; | 
|---|
| 2034 | }; | 
|---|
| 2035 |  | 
|---|
| 2036 | // return max(A, B) if B is valid. | 
|---|
| 2037 | auto MaxIfValid = [](int A, std::optional<int> B) { | 
|---|
| 2038 | return B ? std::max(a: A, b: *B) : A; | 
|---|
| 2039 | }; | 
|---|
| 2040 |  | 
|---|
| 2041 | // Various bonus percentages. These are multiplied by Threshold to get the | 
|---|
| 2042 | // bonus values. | 
|---|
| 2043 | // SingleBBBonus: This bonus is applied if the callee has a single reachable | 
|---|
| 2044 | // basic block at the given callsite context. This is speculatively applied | 
|---|
| 2045 | // and withdrawn if more than one basic block is seen. | 
|---|
| 2046 | // | 
|---|
| 2047 | // LstCallToStaticBonus: This large bonus is applied to ensure the inlining | 
|---|
| 2048 | // of the last call to a static function as inlining such functions is | 
|---|
| 2049 | // guaranteed to reduce code size. | 
|---|
| 2050 | // | 
|---|
| 2051 | // These bonus percentages may be set to 0 based on properties of the caller | 
|---|
| 2052 | // and the callsite. | 
|---|
| 2053 | int SingleBBBonusPercent = 50; | 
|---|
| 2054 | int VectorBonusPercent = TTI.getInlinerVectorBonusPercent(); | 
|---|
| 2055 | int LastCallToStaticBonus = TTI.getInliningLastCallToStaticBonus(); | 
|---|
| 2056 |  | 
|---|
| 2057 | // Lambda to set all the above bonus and bonus percentages to 0. | 
|---|
| 2058 | auto DisallowAllBonuses = [&]() { | 
|---|
| 2059 | SingleBBBonusPercent = 0; | 
|---|
| 2060 | VectorBonusPercent = 0; | 
|---|
| 2061 | LastCallToStaticBonus = 0; | 
|---|
| 2062 | }; | 
|---|
| 2063 |  | 
|---|
| 2064 | // Use the OptMinSizeThreshold or OptSizeThreshold knob if they are available | 
|---|
| 2065 | // and reduce the threshold if the caller has the necessary attribute. | 
|---|
| 2066 | if (Caller->hasMinSize()) { | 
|---|
| 2067 | Threshold = MinIfValid(Threshold, Params.OptMinSizeThreshold); | 
|---|
| 2068 | // For minsize, we want to disable the single BB bonus and the vector | 
|---|
| 2069 | // bonuses, but not the last-call-to-static bonus. Inlining the last call to | 
|---|
| 2070 | // a static function will, at the minimum, eliminate the parameter setup and | 
|---|
| 2071 | // call/return instructions. | 
|---|
| 2072 | SingleBBBonusPercent = 0; | 
|---|
| 2073 | VectorBonusPercent = 0; | 
|---|
| 2074 | } else if (Caller->hasOptSize()) | 
|---|
| 2075 | Threshold = MinIfValid(Threshold, Params.OptSizeThreshold); | 
|---|
| 2076 |  | 
|---|
| 2077 | // Adjust the threshold based on inlinehint attribute and profile based | 
|---|
| 2078 | // hotness information if the caller does not have MinSize attribute. | 
|---|
| 2079 | if (!Caller->hasMinSize()) { | 
|---|
| 2080 | if (Callee.hasFnAttribute(Kind: Attribute::InlineHint)) | 
|---|
| 2081 | Threshold = MaxIfValid(Threshold, Params.HintThreshold); | 
|---|
| 2082 |  | 
|---|
| 2083 | // FIXME: After switching to the new passmanager, simplify the logic below | 
|---|
| 2084 | // by checking only the callsite hotness/coldness as we will reliably | 
|---|
| 2085 | // have local profile information. | 
|---|
| 2086 | // | 
|---|
| 2087 | // Callsite hotness and coldness can be determined if sample profile is | 
|---|
| 2088 | // used (which adds hotness metadata to calls) or if caller's | 
|---|
| 2089 | // BlockFrequencyInfo is available. | 
|---|
| 2090 | BlockFrequencyInfo *CallerBFI = GetBFI ? &(GetBFI(*Caller)) : nullptr; | 
|---|
| 2091 | auto HotCallSiteThreshold = getHotCallSiteThreshold(Call, CallerBFI); | 
|---|
| 2092 | if (!Caller->hasOptSize() && HotCallSiteThreshold) { | 
|---|
| 2093 | LLVM_DEBUG(dbgs() << "Hot callsite.\n"); | 
|---|
| 2094 | // FIXME: This should update the threshold only if it exceeds the | 
|---|
| 2095 | // current threshold, but AutoFDO + ThinLTO currently relies on this | 
|---|
| 2096 | // behavior to prevent inlining of hot callsites during ThinLTO | 
|---|
| 2097 | // compile phase. | 
|---|
| 2098 | Threshold = *HotCallSiteThreshold; | 
|---|
| 2099 | } else if (isColdCallSite(Call, CallerBFI)) { | 
|---|
| 2100 | LLVM_DEBUG(dbgs() << "Cold callsite.\n"); | 
|---|
| 2101 | // Do not apply bonuses for a cold callsite including the | 
|---|
| 2102 | // LastCallToStatic bonus. While this bonus might result in code size | 
|---|
| 2103 | // reduction, it can cause the size of a non-cold caller to increase | 
|---|
| 2104 | // preventing it from being inlined. | 
|---|
| 2105 | DisallowAllBonuses(); | 
|---|
| 2106 | Threshold = MinIfValid(Threshold, Params.ColdCallSiteThreshold); | 
|---|
| 2107 | } else if (PSI) { | 
|---|
| 2108 | // Use callee's global profile information only if we have no way of | 
|---|
| 2109 | // determining this via callsite information. | 
|---|
| 2110 | if (PSI->isFunctionEntryHot(F: &Callee)) { | 
|---|
| 2111 | LLVM_DEBUG(dbgs() << "Hot callee.\n"); | 
|---|
| 2112 | // If callsite hotness can not be determined, we may still know | 
|---|
| 2113 | // that the callee is hot and treat it as a weaker hint for threshold | 
|---|
| 2114 | // increase. | 
|---|
| 2115 | Threshold = MaxIfValid(Threshold, Params.HintThreshold); | 
|---|
| 2116 | } else if (PSI->isFunctionEntryCold(F: &Callee)) { | 
|---|
| 2117 | LLVM_DEBUG(dbgs() << "Cold callee.\n"); | 
|---|
| 2118 | // Do not apply bonuses for a cold callee including the | 
|---|
| 2119 | // LastCallToStatic bonus. While this bonus might result in code size | 
|---|
| 2120 | // reduction, it can cause the size of a non-cold caller to increase | 
|---|
| 2121 | // preventing it from being inlined. | 
|---|
| 2122 | DisallowAllBonuses(); | 
|---|
| 2123 | Threshold = MinIfValid(Threshold, Params.ColdThreshold); | 
|---|
| 2124 | } | 
|---|
| 2125 | } | 
|---|
| 2126 | } | 
|---|
| 2127 |  | 
|---|
| 2128 | Threshold += TTI.adjustInliningThreshold(CB: &Call); | 
|---|
| 2129 |  | 
|---|
| 2130 | // Finally, take the target-specific inlining threshold multiplier into | 
|---|
| 2131 | // account. | 
|---|
| 2132 | Threshold *= TTI.getInliningThresholdMultiplier(); | 
|---|
| 2133 |  | 
|---|
| 2134 | SingleBBBonus = Threshold * SingleBBBonusPercent / 100; | 
|---|
| 2135 | VectorBonus = Threshold * VectorBonusPercent / 100; | 
|---|
| 2136 |  | 
|---|
| 2137 | // If there is only one call of the function, and it has internal linkage, | 
|---|
| 2138 | // the cost of inlining it drops dramatically. It may seem odd to update | 
|---|
| 2139 | // Cost in updateThreshold, but the bonus depends on the logic in this method. | 
|---|
| 2140 | if (isSoleCallToLocalFunction(CB: Call, Callee: F)) { | 
|---|
| 2141 | Cost -= LastCallToStaticBonus; | 
|---|
| 2142 | StaticBonusApplied = LastCallToStaticBonus; | 
|---|
| 2143 | } | 
|---|
| 2144 | } | 
|---|
| 2145 |  | 
|---|
| 2146 | bool CallAnalyzer::visitCmpInst(CmpInst &I) { | 
|---|
| 2147 | Value *LHS = I.getOperand(i_nocapture: 0), *RHS = I.getOperand(i_nocapture: 1); | 
|---|
| 2148 | // First try to handle simplified comparisons. | 
|---|
| 2149 | if (simplifyInstruction(I)) | 
|---|
| 2150 | return true; | 
|---|
| 2151 |  | 
|---|
| 2152 | // Try to handle comparison that can be simplified using ValueTracking. | 
|---|
| 2153 | if (simplifyCmpInstForRecCall(Cmp&: I)) | 
|---|
| 2154 | return true; | 
|---|
| 2155 |  | 
|---|
| 2156 | if (I.getOpcode() == Instruction::FCmp) | 
|---|
| 2157 | return false; | 
|---|
| 2158 |  | 
|---|
| 2159 | // Otherwise look for a comparison between constant offset pointers with | 
|---|
| 2160 | // a common base. | 
|---|
| 2161 | Value *LHSBase, *RHSBase; | 
|---|
| 2162 | APInt LHSOffset, RHSOffset; | 
|---|
| 2163 | std::tie(args&: LHSBase, args&: LHSOffset) = ConstantOffsetPtrs.lookup(Val: LHS); | 
|---|
| 2164 | if (LHSBase) { | 
|---|
| 2165 | std::tie(args&: RHSBase, args&: RHSOffset) = ConstantOffsetPtrs.lookup(Val: RHS); | 
|---|
| 2166 | if (RHSBase && LHSBase == RHSBase) { | 
|---|
| 2167 | // We have common bases, fold the icmp to a constant based on the | 
|---|
| 2168 | // offsets. | 
|---|
| 2169 | SimplifiedValues[&I] = ConstantInt::getBool( | 
|---|
| 2170 | Ty: I.getType(), | 
|---|
| 2171 | V: ICmpInst::compare(LHS: LHSOffset, RHS: RHSOffset, Pred: I.getPredicate())); | 
|---|
| 2172 | ++NumConstantPtrCmps; | 
|---|
| 2173 | return true; | 
|---|
| 2174 | } | 
|---|
| 2175 | } | 
|---|
| 2176 |  | 
|---|
| 2177 | auto isImplicitNullCheckCmp = [](const CmpInst &I) { | 
|---|
| 2178 | for (auto *User : I.users()) | 
|---|
| 2179 | if (auto *Instr = dyn_cast<Instruction>(Val: User)) | 
|---|
| 2180 | if (!Instr->getMetadata(KindID: LLVMContext::MD_make_implicit)) | 
|---|
| 2181 | return false; | 
|---|
| 2182 | return true; | 
|---|
| 2183 | }; | 
|---|
| 2184 |  | 
|---|
| 2185 | // If the comparison is an equality comparison with null, we can simplify it | 
|---|
| 2186 | // if we know the value (argument) can't be null | 
|---|
| 2187 | if (I.isEquality() && isa<ConstantPointerNull>(Val: I.getOperand(i_nocapture: 1))) { | 
|---|
| 2188 | if (isKnownNonNullInCallee(V: I.getOperand(i_nocapture: 0))) { | 
|---|
| 2189 | bool IsNotEqual = I.getPredicate() == CmpInst::ICMP_NE; | 
|---|
| 2190 | SimplifiedValues[&I] = IsNotEqual ? ConstantInt::getTrue(Ty: I.getType()) | 
|---|
| 2191 | : ConstantInt::getFalse(Ty: I.getType()); | 
|---|
| 2192 | return true; | 
|---|
| 2193 | } | 
|---|
| 2194 | // Implicit null checks act as unconditional branches and their comparisons | 
|---|
| 2195 | // should be treated as simplified and free of cost. | 
|---|
| 2196 | if (isImplicitNullCheckCmp(I)) | 
|---|
| 2197 | return true; | 
|---|
| 2198 | } | 
|---|
| 2199 | return handleSROA(V: I.getOperand(i_nocapture: 0), DoNotDisable: isa<ConstantPointerNull>(Val: I.getOperand(i_nocapture: 1))); | 
|---|
| 2200 | } | 
|---|
| 2201 |  | 
|---|
| 2202 | bool CallAnalyzer::visitSub(BinaryOperator &I) { | 
|---|
| 2203 | // Try to handle a special case: we can fold computing the difference of two | 
|---|
| 2204 | // constant-related pointers. | 
|---|
| 2205 | Value *LHS = I.getOperand(i_nocapture: 0), *RHS = I.getOperand(i_nocapture: 1); | 
|---|
| 2206 | Value *LHSBase, *RHSBase; | 
|---|
| 2207 | APInt LHSOffset, RHSOffset; | 
|---|
| 2208 | std::tie(args&: LHSBase, args&: LHSOffset) = ConstantOffsetPtrs.lookup(Val: LHS); | 
|---|
| 2209 | if (LHSBase) { | 
|---|
| 2210 | std::tie(args&: RHSBase, args&: RHSOffset) = ConstantOffsetPtrs.lookup(Val: RHS); | 
|---|
| 2211 | if (RHSBase && LHSBase == RHSBase) { | 
|---|
| 2212 | // We have common bases, fold the subtract to a constant based on the | 
|---|
| 2213 | // offsets. | 
|---|
| 2214 | Constant *CLHS = ConstantInt::get(Context&: LHS->getContext(), V: LHSOffset); | 
|---|
| 2215 | Constant *CRHS = ConstantInt::get(Context&: RHS->getContext(), V: RHSOffset); | 
|---|
| 2216 | if (Constant *C = ConstantExpr::getSub(C1: CLHS, C2: CRHS)) { | 
|---|
| 2217 | SimplifiedValues[&I] = C; | 
|---|
| 2218 | ++NumConstantPtrDiffs; | 
|---|
| 2219 | return true; | 
|---|
| 2220 | } | 
|---|
| 2221 | } | 
|---|
| 2222 | } | 
|---|
| 2223 |  | 
|---|
| 2224 | // Otherwise, fall back to the generic logic for simplifying and handling | 
|---|
| 2225 | // instructions. | 
|---|
| 2226 | return Base::visitSub(I); | 
|---|
| 2227 | } | 
|---|
| 2228 |  | 
|---|
| 2229 | bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) { | 
|---|
| 2230 | Value *LHS = I.getOperand(i_nocapture: 0), *RHS = I.getOperand(i_nocapture: 1); | 
|---|
| 2231 | Constant *CLHS = getDirectOrSimplifiedValue<Constant>(V: LHS); | 
|---|
| 2232 | Constant *CRHS = getDirectOrSimplifiedValue<Constant>(V: RHS); | 
|---|
| 2233 |  | 
|---|
| 2234 | Value *SimpleV = nullptr; | 
|---|
| 2235 | if (auto FI = dyn_cast<FPMathOperator>(Val: &I)) | 
|---|
| 2236 | SimpleV = simplifyBinOp(Opcode: I.getOpcode(), LHS: CLHS ? CLHS : LHS, RHS: CRHS ? CRHS : RHS, | 
|---|
| 2237 | FMF: FI->getFastMathFlags(), Q: DL); | 
|---|
| 2238 | else | 
|---|
| 2239 | SimpleV = | 
|---|
| 2240 | simplifyBinOp(Opcode: I.getOpcode(), LHS: CLHS ? CLHS : LHS, RHS: CRHS ? CRHS : RHS, Q: DL); | 
|---|
| 2241 |  | 
|---|
| 2242 | if (Constant *C = dyn_cast_or_null<Constant>(Val: SimpleV)) | 
|---|
| 2243 | SimplifiedValues[&I] = C; | 
|---|
| 2244 |  | 
|---|
| 2245 | if (SimpleV) | 
|---|
| 2246 | return true; | 
|---|
| 2247 |  | 
|---|
| 2248 | // Disable any SROA on arguments to arbitrary, unsimplified binary operators. | 
|---|
| 2249 | disableSROA(V: LHS); | 
|---|
| 2250 | disableSROA(V: RHS); | 
|---|
| 2251 |  | 
|---|
| 2252 | // If the instruction is floating point, and the target says this operation | 
|---|
| 2253 | // is expensive, this may eventually become a library call. Treat the cost | 
|---|
| 2254 | // as such. Unless it's fneg which can be implemented with an xor. | 
|---|
| 2255 | using namespace llvm::PatternMatch; | 
|---|
| 2256 | if (I.getType()->isFloatingPointTy() && | 
|---|
| 2257 | TTI.getFPOpCost(Ty: I.getType()) == TargetTransformInfo::TCC_Expensive && | 
|---|
| 2258 | !match(V: &I, P: m_FNeg(X: m_Value()))) | 
|---|
| 2259 | onCallPenalty(); | 
|---|
| 2260 |  | 
|---|
| 2261 | return false; | 
|---|
| 2262 | } | 
|---|
| 2263 |  | 
|---|
| 2264 | bool CallAnalyzer::visitFNeg(UnaryOperator &I) { | 
|---|
| 2265 | Value *Op = I.getOperand(i_nocapture: 0); | 
|---|
| 2266 | Constant *COp = getDirectOrSimplifiedValue<Constant>(V: Op); | 
|---|
| 2267 |  | 
|---|
| 2268 | Value *SimpleV = simplifyFNegInst( | 
|---|
| 2269 | Op: COp ? COp : Op, FMF: cast<FPMathOperator>(Val&: I).getFastMathFlags(), Q: DL); | 
|---|
| 2270 |  | 
|---|
| 2271 | if (Constant *C = dyn_cast_or_null<Constant>(Val: SimpleV)) | 
|---|
| 2272 | SimplifiedValues[&I] = C; | 
|---|
| 2273 |  | 
|---|
| 2274 | if (SimpleV) | 
|---|
| 2275 | return true; | 
|---|
| 2276 |  | 
|---|
| 2277 | // Disable any SROA on arguments to arbitrary, unsimplified fneg. | 
|---|
| 2278 | disableSROA(V: Op); | 
|---|
| 2279 |  | 
|---|
| 2280 | return false; | 
|---|
| 2281 | } | 
|---|
| 2282 |  | 
|---|
| 2283 | bool CallAnalyzer::visitLoad(LoadInst &I) { | 
|---|
| 2284 | if (handleSROA(V: I.getPointerOperand(), DoNotDisable: I.isSimple())) | 
|---|
| 2285 | return true; | 
|---|
| 2286 |  | 
|---|
| 2287 | // If the data is already loaded from this address and hasn't been clobbered | 
|---|
| 2288 | // by any stores or calls, this load is likely to be redundant and can be | 
|---|
| 2289 | // eliminated. | 
|---|
| 2290 | if (EnableLoadElimination && | 
|---|
| 2291 | !LoadAddrSet.insert(Ptr: I.getPointerOperand()).second && I.isUnordered()) { | 
|---|
| 2292 | onLoadEliminationOpportunity(); | 
|---|
| 2293 | return true; | 
|---|
| 2294 | } | 
|---|
| 2295 |  | 
|---|
| 2296 | onMemAccess(); | 
|---|
| 2297 | return false; | 
|---|
| 2298 | } | 
|---|
| 2299 |  | 
|---|
| 2300 | bool CallAnalyzer::visitStore(StoreInst &I) { | 
|---|
| 2301 | if (handleSROA(V: I.getPointerOperand(), DoNotDisable: I.isSimple())) | 
|---|
| 2302 | return true; | 
|---|
| 2303 |  | 
|---|
| 2304 | // The store can potentially clobber loads and prevent repeated loads from | 
|---|
| 2305 | // being eliminated. | 
|---|
| 2306 | // FIXME: | 
|---|
| 2307 | // 1. We can probably keep an initial set of eliminatable loads substracted | 
|---|
| 2308 | // from the cost even when we finally see a store. We just need to disable | 
|---|
| 2309 | // *further* accumulation of elimination savings. | 
|---|
| 2310 | // 2. We should probably at some point thread MemorySSA for the callee into | 
|---|
| 2311 | // this and then use that to actually compute *really* precise savings. | 
|---|
| 2312 | disableLoadElimination(); | 
|---|
| 2313 |  | 
|---|
| 2314 | onMemAccess(); | 
|---|
| 2315 | return false; | 
|---|
| 2316 | } | 
|---|
| 2317 |  | 
|---|
| 2318 | bool CallAnalyzer::(ExtractValueInst &I) { | 
|---|
| 2319 | Value *Op = I.getAggregateOperand(); | 
|---|
| 2320 |  | 
|---|
| 2321 | // Special handling, because we want to simplify extractvalue with a | 
|---|
| 2322 | // potential insertvalue from the caller. | 
|---|
| 2323 | if (Value *SimpleOp = getSimplifiedValueUnchecked(V: Op)) { | 
|---|
| 2324 | SimplifyQuery SQ(DL); | 
|---|
| 2325 | Value *SimpleV = simplifyExtractValueInst(Agg: SimpleOp, Idxs: I.getIndices(), Q: SQ); | 
|---|
| 2326 | if (SimpleV) { | 
|---|
| 2327 | SimplifiedValues[&I] = SimpleV; | 
|---|
| 2328 | return true; | 
|---|
| 2329 | } | 
|---|
| 2330 | } | 
|---|
| 2331 |  | 
|---|
| 2332 | // SROA can't look through these, but they may be free. | 
|---|
| 2333 | return Base::visitExtractValue(I); | 
|---|
| 2334 | } | 
|---|
| 2335 |  | 
|---|
| 2336 | bool CallAnalyzer::visitInsertValue(InsertValueInst &I) { | 
|---|
| 2337 | // Constant folding for insert value is trivial. | 
|---|
| 2338 | if (simplifyInstruction(I)) | 
|---|
| 2339 | return true; | 
|---|
| 2340 |  | 
|---|
| 2341 | // SROA can't look through these, but they may be free. | 
|---|
| 2342 | return Base::visitInsertValue(I); | 
|---|
| 2343 | } | 
|---|
| 2344 |  | 
|---|
| 2345 | /// Try to simplify a call site. | 
|---|
| 2346 | /// | 
|---|
| 2347 | /// Takes a concrete function and callsite and tries to actually simplify it by | 
|---|
| 2348 | /// analyzing the arguments and call itself with instsimplify. Returns true if | 
|---|
| 2349 | /// it has simplified the callsite to some other entity (a constant), making it | 
|---|
| 2350 | /// free. | 
|---|
| 2351 | bool CallAnalyzer::simplifyCallSite(Function *F, CallBase &Call) { | 
|---|
| 2352 | // FIXME: Using the instsimplify logic directly for this is inefficient | 
|---|
| 2353 | // because we have to continually rebuild the argument list even when no | 
|---|
| 2354 | // simplifications can be performed. Until that is fixed with remapping | 
|---|
| 2355 | // inside of instsimplify, directly constant fold calls here. | 
|---|
| 2356 | if (!canConstantFoldCallTo(Call: &Call, F)) | 
|---|
| 2357 | return false; | 
|---|
| 2358 |  | 
|---|
| 2359 | // Try to re-map the arguments to constants. | 
|---|
| 2360 | SmallVector<Constant *, 4> ConstantArgs; | 
|---|
| 2361 | ConstantArgs.reserve(N: Call.arg_size()); | 
|---|
| 2362 | for (Value *I : Call.args()) { | 
|---|
| 2363 | Constant *C = getDirectOrSimplifiedValue<Constant>(V: I); | 
|---|
| 2364 | if (!C) | 
|---|
| 2365 | return false; // This argument doesn't map to a constant. | 
|---|
| 2366 |  | 
|---|
| 2367 | ConstantArgs.push_back(Elt: C); | 
|---|
| 2368 | } | 
|---|
| 2369 | if (Constant *C = ConstantFoldCall(Call: &Call, F, Operands: ConstantArgs)) { | 
|---|
| 2370 | SimplifiedValues[&Call] = C; | 
|---|
| 2371 | return true; | 
|---|
| 2372 | } | 
|---|
| 2373 |  | 
|---|
| 2374 | return false; | 
|---|
| 2375 | } | 
|---|
| 2376 |  | 
|---|
| 2377 | bool CallAnalyzer::isLoweredToCall(Function *F, CallBase &Call) { | 
|---|
| 2378 | const TargetLibraryInfo *TLI = GetTLI ? &GetTLI(*F) : nullptr; | 
|---|
| 2379 | LibFunc LF; | 
|---|
| 2380 | if (!TLI || !TLI->getLibFunc(FDecl: *F, F&: LF) || !TLI->has(F: LF)) | 
|---|
| 2381 | return TTI.isLoweredToCall(F); | 
|---|
| 2382 |  | 
|---|
| 2383 | switch (LF) { | 
|---|
| 2384 | case LibFunc_memcpy_chk: | 
|---|
| 2385 | case LibFunc_memmove_chk: | 
|---|
| 2386 | case LibFunc_mempcpy_chk: | 
|---|
| 2387 | case LibFunc_memset_chk: { | 
|---|
| 2388 | // Calls to  __memcpy_chk whose length is known to fit within the object | 
|---|
| 2389 | // size will eventually be replaced by inline stores. Therefore, these | 
|---|
| 2390 | // should not incur a call penalty. This is only really relevant on | 
|---|
| 2391 | // platforms whose headers redirect memcpy to __memcpy_chk (e.g. Darwin), as | 
|---|
| 2392 | // other platforms use memcpy intrinsics, which are already exempt from the | 
|---|
| 2393 | // call penalty. | 
|---|
| 2394 | auto *LenOp = getDirectOrSimplifiedValue<ConstantInt>(V: Call.getOperand(i_nocapture: 2)); | 
|---|
| 2395 | auto *ObjSizeOp = | 
|---|
| 2396 | getDirectOrSimplifiedValue<ConstantInt>(V: Call.getOperand(i_nocapture: 3)); | 
|---|
| 2397 | if (LenOp && ObjSizeOp && | 
|---|
| 2398 | LenOp->getLimitedValue() <= ObjSizeOp->getLimitedValue()) { | 
|---|
| 2399 | return false; | 
|---|
| 2400 | } | 
|---|
| 2401 | break; | 
|---|
| 2402 | } | 
|---|
| 2403 | default: | 
|---|
| 2404 | break; | 
|---|
| 2405 | } | 
|---|
| 2406 |  | 
|---|
| 2407 | return TTI.isLoweredToCall(F); | 
|---|
| 2408 | } | 
|---|
| 2409 |  | 
|---|
| 2410 | bool CallAnalyzer::visitCallBase(CallBase &Call) { | 
|---|
| 2411 | if (!onCallBaseVisitStart(Call)) | 
|---|
| 2412 | return true; | 
|---|
| 2413 |  | 
|---|
| 2414 | if (Call.hasFnAttr(Kind: Attribute::ReturnsTwice) && | 
|---|
| 2415 | !F.hasFnAttribute(Kind: Attribute::ReturnsTwice)) { | 
|---|
| 2416 | // This aborts the entire analysis. | 
|---|
| 2417 | ExposesReturnsTwice = true; | 
|---|
| 2418 | return false; | 
|---|
| 2419 | } | 
|---|
| 2420 | if (isa<CallInst>(Val: Call) && cast<CallInst>(Val&: Call).cannotDuplicate()) | 
|---|
| 2421 | ContainsNoDuplicateCall = true; | 
|---|
| 2422 |  | 
|---|
| 2423 | Function *F = Call.getCalledFunction(); | 
|---|
| 2424 | bool IsIndirectCall = !F; | 
|---|
| 2425 | if (IsIndirectCall) { | 
|---|
| 2426 | // Check if this happens to be an indirect function call to a known function | 
|---|
| 2427 | // in this inline context. If not, we've done all we can. | 
|---|
| 2428 | Value *Callee = Call.getCalledOperand(); | 
|---|
| 2429 | F = getSimplifiedValue<Function>(V: Callee); | 
|---|
| 2430 | if (!F || F->getFunctionType() != Call.getFunctionType()) { | 
|---|
| 2431 | onCallArgumentSetup(Call); | 
|---|
| 2432 |  | 
|---|
| 2433 | if (!Call.onlyReadsMemory()) | 
|---|
| 2434 | disableLoadElimination(); | 
|---|
| 2435 | return Base::visitCallBase(I&: Call); | 
|---|
| 2436 | } | 
|---|
| 2437 | } | 
|---|
| 2438 |  | 
|---|
| 2439 | assert(F && "Expected a call to a known function"); | 
|---|
| 2440 |  | 
|---|
| 2441 | // When we have a concrete function, first try to simplify it directly. | 
|---|
| 2442 | if (simplifyCallSite(F, Call)) | 
|---|
| 2443 | return true; | 
|---|
| 2444 |  | 
|---|
| 2445 | // Next check if it is an intrinsic we know about. | 
|---|
| 2446 | // FIXME: Lift this into part of the InstVisitor. | 
|---|
| 2447 | if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Val: &Call)) { | 
|---|
| 2448 | switch (II->getIntrinsicID()) { | 
|---|
| 2449 | default: | 
|---|
| 2450 | if (!Call.onlyReadsMemory() && !isAssumeLikeIntrinsic(I: II)) | 
|---|
| 2451 | disableLoadElimination(); | 
|---|
| 2452 | return Base::visitCallBase(I&: Call); | 
|---|
| 2453 |  | 
|---|
| 2454 | case Intrinsic::load_relative: | 
|---|
| 2455 | onLoadRelativeIntrinsic(); | 
|---|
| 2456 | return false; | 
|---|
| 2457 |  | 
|---|
| 2458 | case Intrinsic::memset: | 
|---|
| 2459 | case Intrinsic::memcpy: | 
|---|
| 2460 | case Intrinsic::memmove: | 
|---|
| 2461 | disableLoadElimination(); | 
|---|
| 2462 | // SROA can usually chew through these intrinsics, but they aren't free. | 
|---|
| 2463 | return false; | 
|---|
| 2464 | case Intrinsic::icall_branch_funnel: | 
|---|
| 2465 | case Intrinsic::localescape: | 
|---|
| 2466 | HasUninlineableIntrinsic = true; | 
|---|
| 2467 | return false; | 
|---|
| 2468 | case Intrinsic::vastart: | 
|---|
| 2469 | InitsVargArgs = true; | 
|---|
| 2470 | return false; | 
|---|
| 2471 | case Intrinsic::launder_invariant_group: | 
|---|
| 2472 | case Intrinsic::strip_invariant_group: | 
|---|
| 2473 | if (auto *SROAArg = getSROAArgForValueOrNull(V: II->getOperand(i_nocapture: 0))) | 
|---|
| 2474 | SROAArgValues[II] = SROAArg; | 
|---|
| 2475 | return true; | 
|---|
| 2476 | case Intrinsic::is_constant: | 
|---|
| 2477 | return simplifyIntrinsicCallIsConstant(CB&: Call); | 
|---|
| 2478 | case Intrinsic::objectsize: | 
|---|
| 2479 | return simplifyIntrinsicCallObjectSize(CB&: Call); | 
|---|
| 2480 | } | 
|---|
| 2481 | } | 
|---|
| 2482 |  | 
|---|
| 2483 | if (F == Call.getFunction()) { | 
|---|
| 2484 | // This flag will fully abort the analysis, so don't bother with anything | 
|---|
| 2485 | // else. | 
|---|
| 2486 | IsRecursiveCall = true; | 
|---|
| 2487 | if (!AllowRecursiveCall) | 
|---|
| 2488 | return false; | 
|---|
| 2489 | } | 
|---|
| 2490 |  | 
|---|
| 2491 | if (isLoweredToCall(F, Call)) { | 
|---|
| 2492 | onLoweredCall(F, Call, IsIndirectCall); | 
|---|
| 2493 | } | 
|---|
| 2494 |  | 
|---|
| 2495 | if (!(Call.onlyReadsMemory() || (IsIndirectCall && F->onlyReadsMemory()))) | 
|---|
| 2496 | disableLoadElimination(); | 
|---|
| 2497 | return Base::visitCallBase(I&: Call); | 
|---|
| 2498 | } | 
|---|
| 2499 |  | 
|---|
| 2500 | bool CallAnalyzer::visitReturnInst(ReturnInst &RI) { | 
|---|
| 2501 | // At least one return instruction will be free after inlining. | 
|---|
| 2502 | bool Free = !HasReturn; | 
|---|
| 2503 | HasReturn = true; | 
|---|
| 2504 | return Free; | 
|---|
| 2505 | } | 
|---|
| 2506 |  | 
|---|
| 2507 | bool CallAnalyzer::visitBranchInst(BranchInst &BI) { | 
|---|
| 2508 | // We model unconditional branches as essentially free -- they really | 
|---|
| 2509 | // shouldn't exist at all, but handling them makes the behavior of the | 
|---|
| 2510 | // inliner more regular and predictable. Interestingly, conditional branches | 
|---|
| 2511 | // which will fold away are also free. | 
|---|
| 2512 | return BI.isUnconditional() || | 
|---|
| 2513 | getDirectOrSimplifiedValue<ConstantInt>(V: BI.getCondition()) || | 
|---|
| 2514 | BI.getMetadata(KindID: LLVMContext::MD_make_implicit); | 
|---|
| 2515 | } | 
|---|
| 2516 |  | 
|---|
| 2517 | bool CallAnalyzer::visitSelectInst(SelectInst &SI) { | 
|---|
| 2518 | bool CheckSROA = SI.getType()->isPointerTy(); | 
|---|
| 2519 | Value *TrueVal = SI.getTrueValue(); | 
|---|
| 2520 | Value *FalseVal = SI.getFalseValue(); | 
|---|
| 2521 |  | 
|---|
| 2522 | Constant *TrueC = getDirectOrSimplifiedValue<Constant>(V: TrueVal); | 
|---|
| 2523 | Constant *FalseC = getDirectOrSimplifiedValue<Constant>(V: FalseVal); | 
|---|
| 2524 | Constant *CondC = getSimplifiedValue<Constant>(V: SI.getCondition()); | 
|---|
| 2525 |  | 
|---|
| 2526 | if (!CondC) { | 
|---|
| 2527 | // Select C, X, X => X | 
|---|
| 2528 | if (TrueC == FalseC && TrueC) { | 
|---|
| 2529 | SimplifiedValues[&SI] = TrueC; | 
|---|
| 2530 | return true; | 
|---|
| 2531 | } | 
|---|
| 2532 |  | 
|---|
| 2533 | if (!CheckSROA) | 
|---|
| 2534 | return Base::visitSelectInst(I&: SI); | 
|---|
| 2535 |  | 
|---|
| 2536 | std::pair<Value *, APInt> TrueBaseAndOffset = | 
|---|
| 2537 | ConstantOffsetPtrs.lookup(Val: TrueVal); | 
|---|
| 2538 | std::pair<Value *, APInt> FalseBaseAndOffset = | 
|---|
| 2539 | ConstantOffsetPtrs.lookup(Val: FalseVal); | 
|---|
| 2540 | if (TrueBaseAndOffset == FalseBaseAndOffset && TrueBaseAndOffset.first) { | 
|---|
| 2541 | ConstantOffsetPtrs[&SI] = TrueBaseAndOffset; | 
|---|
| 2542 |  | 
|---|
| 2543 | if (auto *SROAArg = getSROAArgForValueOrNull(V: TrueVal)) | 
|---|
| 2544 | SROAArgValues[&SI] = SROAArg; | 
|---|
| 2545 | return true; | 
|---|
| 2546 | } | 
|---|
| 2547 |  | 
|---|
| 2548 | return Base::visitSelectInst(I&: SI); | 
|---|
| 2549 | } | 
|---|
| 2550 |  | 
|---|
| 2551 | // Select condition is a constant. | 
|---|
| 2552 | Value *SelectedV = CondC->isAllOnesValue()  ? TrueVal | 
|---|
| 2553 | : (CondC->isNullValue()) ? FalseVal | 
|---|
| 2554 | : nullptr; | 
|---|
| 2555 | if (!SelectedV) { | 
|---|
| 2556 | // Condition is a vector constant that is not all 1s or all 0s.  If all | 
|---|
| 2557 | // operands are constants, ConstantFoldSelectInstruction() can handle the | 
|---|
| 2558 | // cases such as select vectors. | 
|---|
| 2559 | if (TrueC && FalseC) { | 
|---|
| 2560 | if (auto *C = ConstantFoldSelectInstruction(Cond: CondC, V1: TrueC, V2: FalseC)) { | 
|---|
| 2561 | SimplifiedValues[&SI] = C; | 
|---|
| 2562 | return true; | 
|---|
| 2563 | } | 
|---|
| 2564 | } | 
|---|
| 2565 | return Base::visitSelectInst(I&: SI); | 
|---|
| 2566 | } | 
|---|
| 2567 |  | 
|---|
| 2568 | // Condition is either all 1s or all 0s. SI can be simplified. | 
|---|
| 2569 | if (Constant *SelectedC = dyn_cast<Constant>(Val: SelectedV)) { | 
|---|
| 2570 | SimplifiedValues[&SI] = SelectedC; | 
|---|
| 2571 | return true; | 
|---|
| 2572 | } | 
|---|
| 2573 |  | 
|---|
| 2574 | if (!CheckSROA) | 
|---|
| 2575 | return true; | 
|---|
| 2576 |  | 
|---|
| 2577 | std::pair<Value *, APInt> BaseAndOffset = | 
|---|
| 2578 | ConstantOffsetPtrs.lookup(Val: SelectedV); | 
|---|
| 2579 | if (BaseAndOffset.first) { | 
|---|
| 2580 | ConstantOffsetPtrs[&SI] = BaseAndOffset; | 
|---|
| 2581 |  | 
|---|
| 2582 | if (auto *SROAArg = getSROAArgForValueOrNull(V: SelectedV)) | 
|---|
| 2583 | SROAArgValues[&SI] = SROAArg; | 
|---|
| 2584 | } | 
|---|
| 2585 |  | 
|---|
| 2586 | return true; | 
|---|
| 2587 | } | 
|---|
| 2588 |  | 
|---|
| 2589 | bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) { | 
|---|
| 2590 | // We model unconditional switches as free, see the comments on handling | 
|---|
| 2591 | // branches. | 
|---|
| 2592 | if (getDirectOrSimplifiedValue<ConstantInt>(V: SI.getCondition())) | 
|---|
| 2593 | return true; | 
|---|
| 2594 |  | 
|---|
| 2595 | // Assume the most general case where the switch is lowered into | 
|---|
| 2596 | // either a jump table, bit test, or a balanced binary tree consisting of | 
|---|
| 2597 | // case clusters without merging adjacent clusters with the same | 
|---|
| 2598 | // destination. We do not consider the switches that are lowered with a mix | 
|---|
| 2599 | // of jump table/bit test/binary search tree. The cost of the switch is | 
|---|
| 2600 | // proportional to the size of the tree or the size of jump table range. | 
|---|
| 2601 | // | 
|---|
| 2602 | // NB: We convert large switches which are just used to initialize large phi | 
|---|
| 2603 | // nodes to lookup tables instead in simplifycfg, so this shouldn't prevent | 
|---|
| 2604 | // inlining those. It will prevent inlining in cases where the optimization | 
|---|
| 2605 | // does not (yet) fire. | 
|---|
| 2606 |  | 
|---|
| 2607 | unsigned JumpTableSize = 0; | 
|---|
| 2608 | BlockFrequencyInfo *BFI = GetBFI ? &(GetBFI(F)) : nullptr; | 
|---|
| 2609 | unsigned NumCaseCluster = | 
|---|
| 2610 | TTI.getEstimatedNumberOfCaseClusters(SI, JTSize&: JumpTableSize, PSI, BFI); | 
|---|
| 2611 |  | 
|---|
| 2612 | onFinalizeSwitch(JumpTableSize, NumCaseCluster, DefaultDestUnreachable: SI.defaultDestUnreachable()); | 
|---|
| 2613 | return false; | 
|---|
| 2614 | } | 
|---|
| 2615 |  | 
|---|
| 2616 | bool CallAnalyzer::visitIndirectBrInst(IndirectBrInst &IBI) { | 
|---|
| 2617 | // We never want to inline functions that contain an indirectbr.  This is | 
|---|
| 2618 | // incorrect because all the blockaddress's (in static global initializers | 
|---|
| 2619 | // for example) would be referring to the original function, and this | 
|---|
| 2620 | // indirect jump would jump from the inlined copy of the function into the | 
|---|
| 2621 | // original function which is extremely undefined behavior. | 
|---|
| 2622 | // FIXME: This logic isn't really right; we can safely inline functions with | 
|---|
| 2623 | // indirectbr's as long as no other function or global references the | 
|---|
| 2624 | // blockaddress of a block within the current function. | 
|---|
| 2625 | HasIndirectBr = true; | 
|---|
| 2626 | return false; | 
|---|
| 2627 | } | 
|---|
| 2628 |  | 
|---|
| 2629 | bool CallAnalyzer::visitResumeInst(ResumeInst &RI) { | 
|---|
| 2630 | // FIXME: It's not clear that a single instruction is an accurate model for | 
|---|
| 2631 | // the inline cost of a resume instruction. | 
|---|
| 2632 | return false; | 
|---|
| 2633 | } | 
|---|
| 2634 |  | 
|---|
| 2635 | bool CallAnalyzer::visitCleanupReturnInst(CleanupReturnInst &CRI) { | 
|---|
| 2636 | // FIXME: It's not clear that a single instruction is an accurate model for | 
|---|
| 2637 | // the inline cost of a cleanupret instruction. | 
|---|
| 2638 | return false; | 
|---|
| 2639 | } | 
|---|
| 2640 |  | 
|---|
| 2641 | bool CallAnalyzer::visitCatchReturnInst(CatchReturnInst &CRI) { | 
|---|
| 2642 | // FIXME: It's not clear that a single instruction is an accurate model for | 
|---|
| 2643 | // the inline cost of a catchret instruction. | 
|---|
| 2644 | return false; | 
|---|
| 2645 | } | 
|---|
| 2646 |  | 
|---|
| 2647 | bool CallAnalyzer::visitUnreachableInst(UnreachableInst &I) { | 
|---|
| 2648 | // FIXME: It might be reasonably to discount the cost of instructions leading | 
|---|
| 2649 | // to unreachable as they have the lowest possible impact on both runtime and | 
|---|
| 2650 | // code size. | 
|---|
| 2651 | return true; // No actual code is needed for unreachable. | 
|---|
| 2652 | } | 
|---|
| 2653 |  | 
|---|
| 2654 | bool CallAnalyzer::visitInstruction(Instruction &I) { | 
|---|
| 2655 | // Some instructions are free. All of the free intrinsics can also be | 
|---|
| 2656 | // handled by SROA, etc. | 
|---|
| 2657 | if (TTI.getInstructionCost(U: &I, CostKind: TargetTransformInfo::TCK_SizeAndLatency) == | 
|---|
| 2658 | TargetTransformInfo::TCC_Free) | 
|---|
| 2659 | return true; | 
|---|
| 2660 |  | 
|---|
| 2661 | // We found something we don't understand or can't handle. Mark any SROA-able | 
|---|
| 2662 | // values in the operand list as no longer viable. | 
|---|
| 2663 | for (const Use &Op : I.operands()) | 
|---|
| 2664 | disableSROA(V: Op); | 
|---|
| 2665 |  | 
|---|
| 2666 | return false; | 
|---|
| 2667 | } | 
|---|
| 2668 |  | 
|---|
| 2669 | /// Analyze a basic block for its contribution to the inline cost. | 
|---|
| 2670 | /// | 
|---|
| 2671 | /// This method walks the analyzer over every instruction in the given basic | 
|---|
| 2672 | /// block and accounts for their cost during inlining at this callsite. It | 
|---|
| 2673 | /// aborts early if the threshold has been exceeded or an impossible to inline | 
|---|
| 2674 | /// construct has been detected. It returns false if inlining is no longer | 
|---|
| 2675 | /// viable, and true if inlining remains viable. | 
|---|
| 2676 | InlineResult | 
|---|
| 2677 | CallAnalyzer::analyzeBlock(BasicBlock *BB, | 
|---|
| 2678 | const SmallPtrSetImpl<const Value *> &EphValues) { | 
|---|
| 2679 | for (Instruction &I : *BB) { | 
|---|
| 2680 | // FIXME: Currently, the number of instructions in a function regardless of | 
|---|
| 2681 | // our ability to simplify them during inline to constants or dead code, | 
|---|
| 2682 | // are actually used by the vector bonus heuristic. As long as that's true, | 
|---|
| 2683 | // we have to special case debug intrinsics here to prevent differences in | 
|---|
| 2684 | // inlining due to debug symbols. Eventually, the number of unsimplified | 
|---|
| 2685 | // instructions shouldn't factor into the cost computation, but until then, | 
|---|
| 2686 | // hack around it here. | 
|---|
| 2687 | // Similarly, skip pseudo-probes. | 
|---|
| 2688 | if (I.isDebugOrPseudoInst()) | 
|---|
| 2689 | continue; | 
|---|
| 2690 |  | 
|---|
| 2691 | // Skip ephemeral values. | 
|---|
| 2692 | if (EphValues.count(Ptr: &I)) | 
|---|
| 2693 | continue; | 
|---|
| 2694 |  | 
|---|
| 2695 | ++NumInstructions; | 
|---|
| 2696 | if (isa<ExtractElementInst>(Val: I) || I.getType()->isVectorTy()) | 
|---|
| 2697 | ++NumVectorInstructions; | 
|---|
| 2698 |  | 
|---|
| 2699 | // If the instruction simplified to a constant, there is no cost to this | 
|---|
| 2700 | // instruction. Visit the instructions using our InstVisitor to account for | 
|---|
| 2701 | // all of the per-instruction logic. The visit tree returns true if we | 
|---|
| 2702 | // consumed the instruction in any way, and false if the instruction's base | 
|---|
| 2703 | // cost should count against inlining. | 
|---|
| 2704 | onInstructionAnalysisStart(I: &I); | 
|---|
| 2705 |  | 
|---|
| 2706 | if (Base::visit(I: &I)) | 
|---|
| 2707 | ++NumInstructionsSimplified; | 
|---|
| 2708 | else | 
|---|
| 2709 | onMissedSimplification(); | 
|---|
| 2710 |  | 
|---|
| 2711 | onInstructionAnalysisFinish(I: &I); | 
|---|
| 2712 | using namespace ore; | 
|---|
| 2713 | // If the visit this instruction detected an uninlinable pattern, abort. | 
|---|
| 2714 | InlineResult IR = InlineResult::success(); | 
|---|
| 2715 | if (IsRecursiveCall && !AllowRecursiveCall) | 
|---|
| 2716 | IR = InlineResult::failure(Reason: "recursive"); | 
|---|
| 2717 | else if (ExposesReturnsTwice) | 
|---|
| 2718 | IR = InlineResult::failure(Reason: "exposes returns twice"); | 
|---|
| 2719 | else if (HasDynamicAlloca) | 
|---|
| 2720 | IR = InlineResult::failure(Reason: "dynamic alloca"); | 
|---|
| 2721 | else if (HasIndirectBr) | 
|---|
| 2722 | IR = InlineResult::failure(Reason: "indirect branch"); | 
|---|
| 2723 | else if (HasUninlineableIntrinsic) | 
|---|
| 2724 | IR = InlineResult::failure(Reason: "uninlinable intrinsic"); | 
|---|
| 2725 | else if (InitsVargArgs) | 
|---|
| 2726 | IR = InlineResult::failure(Reason: "varargs"); | 
|---|
| 2727 | if (!IR.isSuccess()) { | 
|---|
| 2728 | if (ORE) | 
|---|
| 2729 | ORE->emit(RemarkBuilder: [&]() { | 
|---|
| 2730 | return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", | 
|---|
| 2731 | &CandidateCall) | 
|---|
| 2732 | << NV( "Callee", &F) << " has uninlinable pattern (" | 
|---|
| 2733 | << NV( "InlineResult", IR.getFailureReason()) | 
|---|
| 2734 | << ") and cost is not fully computed"; | 
|---|
| 2735 | }); | 
|---|
| 2736 | return IR; | 
|---|
| 2737 | } | 
|---|
| 2738 |  | 
|---|
| 2739 | // If the caller is a recursive function then we don't want to inline | 
|---|
| 2740 | // functions which allocate a lot of stack space because it would increase | 
|---|
| 2741 | // the caller stack usage dramatically. | 
|---|
| 2742 | if (IsCallerRecursive && AllocatedSize > RecurStackSizeThreshold) { | 
|---|
| 2743 | auto IR = | 
|---|
| 2744 | InlineResult::failure(Reason: "recursive and allocates too much stack space"); | 
|---|
| 2745 | if (ORE) | 
|---|
| 2746 | ORE->emit(RemarkBuilder: [&]() { | 
|---|
| 2747 | return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", | 
|---|
| 2748 | &CandidateCall) | 
|---|
| 2749 | << NV( "Callee", &F) << " is " | 
|---|
| 2750 | << NV( "InlineResult", IR.getFailureReason()) | 
|---|
| 2751 | << ". Cost is not fully computed"; | 
|---|
| 2752 | }); | 
|---|
| 2753 | return IR; | 
|---|
| 2754 | } | 
|---|
| 2755 |  | 
|---|
| 2756 | if (shouldStop()) | 
|---|
| 2757 | return InlineResult::failure( | 
|---|
| 2758 | Reason: "Call site analysis is not favorable to inlining."); | 
|---|
| 2759 | } | 
|---|
| 2760 |  | 
|---|
| 2761 | return InlineResult::success(); | 
|---|
| 2762 | } | 
|---|
| 2763 |  | 
|---|
| 2764 | /// Compute the base pointer and cumulative constant offsets for V. | 
|---|
| 2765 | /// | 
|---|
| 2766 | /// This strips all constant offsets off of V, leaving it the base pointer, and | 
|---|
| 2767 | /// accumulates the total constant offset applied in the returned constant. It | 
|---|
| 2768 | /// returns 0 if V is not a pointer, and returns the constant '0' if there are | 
|---|
| 2769 | /// no constant offsets applied. | 
|---|
| 2770 | ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) { | 
|---|
| 2771 | if (!V->getType()->isPointerTy()) | 
|---|
| 2772 | return nullptr; | 
|---|
| 2773 |  | 
|---|
| 2774 | unsigned AS = V->getType()->getPointerAddressSpace(); | 
|---|
| 2775 | unsigned IntPtrWidth = DL.getIndexSizeInBits(AS); | 
|---|
| 2776 | APInt Offset = APInt::getZero(numBits: IntPtrWidth); | 
|---|
| 2777 |  | 
|---|
| 2778 | // Even though we don't look through PHI nodes, we could be called on an | 
|---|
| 2779 | // instruction in an unreachable block, which may be on a cycle. | 
|---|
| 2780 | SmallPtrSet<Value *, 4> Visited; | 
|---|
| 2781 | Visited.insert(Ptr: V); | 
|---|
| 2782 | do { | 
|---|
| 2783 | if (GEPOperator *GEP = dyn_cast<GEPOperator>(Val: V)) { | 
|---|
| 2784 | if (!GEP->isInBounds() || !accumulateGEPOffset(GEP&: *GEP, Offset)) | 
|---|
| 2785 | return nullptr; | 
|---|
| 2786 | V = GEP->getPointerOperand(); | 
|---|
| 2787 | } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(Val: V)) { | 
|---|
| 2788 | if (GA->isInterposable()) | 
|---|
| 2789 | break; | 
|---|
| 2790 | V = GA->getAliasee(); | 
|---|
| 2791 | } else { | 
|---|
| 2792 | break; | 
|---|
| 2793 | } | 
|---|
| 2794 | assert(V->getType()->isPointerTy() && "Unexpected operand type!"); | 
|---|
| 2795 | } while (Visited.insert(Ptr: V).second); | 
|---|
| 2796 |  | 
|---|
| 2797 | Type *IdxPtrTy = DL.getIndexType(PtrTy: V->getType()); | 
|---|
| 2798 | return cast<ConstantInt>(Val: ConstantInt::get(Ty: IdxPtrTy, V: Offset)); | 
|---|
| 2799 | } | 
|---|
| 2800 |  | 
|---|
| 2801 | /// Find dead blocks due to deleted CFG edges during inlining. | 
|---|
| 2802 | /// | 
|---|
| 2803 | /// If we know the successor of the current block, \p CurrBB, has to be \p | 
|---|
| 2804 | /// NextBB, the other successors of \p CurrBB are dead if these successors have | 
|---|
| 2805 | /// no live incoming CFG edges.  If one block is found to be dead, we can | 
|---|
| 2806 | /// continue growing the dead block list by checking the successors of the dead | 
|---|
| 2807 | /// blocks to see if all their incoming edges are dead or not. | 
|---|
| 2808 | void CallAnalyzer::findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB) { | 
|---|
| 2809 | auto IsEdgeDead = [&](BasicBlock *Pred, BasicBlock *Succ) { | 
|---|
| 2810 | // A CFG edge is dead if the predecessor is dead or the predecessor has a | 
|---|
| 2811 | // known successor which is not the one under exam. | 
|---|
| 2812 | if (DeadBlocks.count(Ptr: Pred)) | 
|---|
| 2813 | return true; | 
|---|
| 2814 | BasicBlock *KnownSucc = KnownSuccessors[Pred]; | 
|---|
| 2815 | return KnownSucc && KnownSucc != Succ; | 
|---|
| 2816 | }; | 
|---|
| 2817 |  | 
|---|
| 2818 | auto IsNewlyDead = [&](BasicBlock *BB) { | 
|---|
| 2819 | // If all the edges to a block are dead, the block is also dead. | 
|---|
| 2820 | return (!DeadBlocks.count(Ptr: BB) && | 
|---|
| 2821 | llvm::all_of(Range: predecessors(BB), | 
|---|
| 2822 | P: [&](BasicBlock *P) { return IsEdgeDead(P, BB); })); | 
|---|
| 2823 | }; | 
|---|
| 2824 |  | 
|---|
| 2825 | for (BasicBlock *Succ : successors(BB: CurrBB)) { | 
|---|
| 2826 | if (Succ == NextBB || !IsNewlyDead(Succ)) | 
|---|
| 2827 | continue; | 
|---|
| 2828 | SmallVector<BasicBlock *, 4> NewDead; | 
|---|
| 2829 | NewDead.push_back(Elt: Succ); | 
|---|
| 2830 | while (!NewDead.empty()) { | 
|---|
| 2831 | BasicBlock *Dead = NewDead.pop_back_val(); | 
|---|
| 2832 | if (DeadBlocks.insert(Ptr: Dead).second) | 
|---|
| 2833 | // Continue growing the dead block lists. | 
|---|
| 2834 | for (BasicBlock *S : successors(BB: Dead)) | 
|---|
| 2835 | if (IsNewlyDead(S)) | 
|---|
| 2836 | NewDead.push_back(Elt: S); | 
|---|
| 2837 | } | 
|---|
| 2838 | } | 
|---|
| 2839 | } | 
|---|
| 2840 |  | 
|---|
| 2841 | /// Analyze a call site for potential inlining. | 
|---|
| 2842 | /// | 
|---|
| 2843 | /// Returns true if inlining this call is viable, and false if it is not | 
|---|
| 2844 | /// viable. It computes the cost and adjusts the threshold based on numerous | 
|---|
| 2845 | /// factors and heuristics. If this method returns false but the computed cost | 
|---|
| 2846 | /// is below the computed threshold, then inlining was forcibly disabled by | 
|---|
| 2847 | /// some artifact of the routine. | 
|---|
| 2848 | InlineResult CallAnalyzer::analyze() { | 
|---|
| 2849 | ++NumCallsAnalyzed; | 
|---|
| 2850 |  | 
|---|
| 2851 | auto Result = onAnalysisStart(); | 
|---|
| 2852 | if (!Result.isSuccess()) | 
|---|
| 2853 | return Result; | 
|---|
| 2854 |  | 
|---|
| 2855 | if (F.empty()) | 
|---|
| 2856 | return InlineResult::success(); | 
|---|
| 2857 |  | 
|---|
| 2858 | Function *Caller = CandidateCall.getFunction(); | 
|---|
| 2859 | // Check if the caller function is recursive itself. | 
|---|
| 2860 | for (User *U : Caller->users()) { | 
|---|
| 2861 | CallBase *Call = dyn_cast<CallBase>(Val: U); | 
|---|
| 2862 | if (Call && Call->getFunction() == Caller) { | 
|---|
| 2863 | IsCallerRecursive = true; | 
|---|
| 2864 | break; | 
|---|
| 2865 | } | 
|---|
| 2866 | } | 
|---|
| 2867 |  | 
|---|
| 2868 | // Populate our simplified values by mapping from function arguments to call | 
|---|
| 2869 | // arguments with known important simplifications. | 
|---|
| 2870 | auto CAI = CandidateCall.arg_begin(); | 
|---|
| 2871 | for (Argument &FAI : F.args()) { | 
|---|
| 2872 | assert(CAI != CandidateCall.arg_end()); | 
|---|
| 2873 | SimplifiedValues[&FAI] = *CAI; | 
|---|
| 2874 | if (isa<Constant>(Val: *CAI)) | 
|---|
| 2875 | ++NumConstantArgs; | 
|---|
| 2876 |  | 
|---|
| 2877 | Value *PtrArg = *CAI; | 
|---|
| 2878 | if (ConstantInt *C = stripAndComputeInBoundsConstantOffsets(V&: PtrArg)) { | 
|---|
| 2879 | ConstantOffsetPtrs[&FAI] = std::make_pair(x&: PtrArg, y: C->getValue()); | 
|---|
| 2880 |  | 
|---|
| 2881 | // We can SROA any pointer arguments derived from alloca instructions. | 
|---|
| 2882 | if (auto *SROAArg = dyn_cast<AllocaInst>(Val: PtrArg)) { | 
|---|
| 2883 | SROAArgValues[&FAI] = SROAArg; | 
|---|
| 2884 | onInitializeSROAArg(Arg: SROAArg); | 
|---|
| 2885 | EnabledSROAAllocas.insert(V: SROAArg); | 
|---|
| 2886 | } | 
|---|
| 2887 | } | 
|---|
| 2888 | ++CAI; | 
|---|
| 2889 | } | 
|---|
| 2890 | NumConstantOffsetPtrArgs = ConstantOffsetPtrs.size(); | 
|---|
| 2891 | NumAllocaArgs = SROAArgValues.size(); | 
|---|
| 2892 |  | 
|---|
| 2893 | // Collecting the ephemeral values of `F` can be expensive, so use the | 
|---|
| 2894 | // ephemeral values cache if available. | 
|---|
| 2895 | SmallPtrSet<const Value *, 32> EphValuesStorage; | 
|---|
| 2896 | const SmallPtrSetImpl<const Value *> *EphValues = &EphValuesStorage; | 
|---|
| 2897 | if (GetEphValuesCache) | 
|---|
| 2898 | EphValues = &GetEphValuesCache(F).ephValues(); | 
|---|
| 2899 | else | 
|---|
| 2900 | CodeMetrics::collectEphemeralValues(L: &F, AC: &GetAssumptionCache(F), | 
|---|
| 2901 | EphValues&: EphValuesStorage); | 
|---|
| 2902 |  | 
|---|
| 2903 | // The worklist of live basic blocks in the callee *after* inlining. We avoid | 
|---|
| 2904 | // adding basic blocks of the callee which can be proven to be dead for this | 
|---|
| 2905 | // particular call site in order to get more accurate cost estimates. This | 
|---|
| 2906 | // requires a somewhat heavyweight iteration pattern: we need to walk the | 
|---|
| 2907 | // basic blocks in a breadth-first order as we insert live successors. To | 
|---|
| 2908 | // accomplish this, prioritizing for small iterations because we exit after | 
|---|
| 2909 | // crossing our threshold, we use a small-size optimized SetVector. | 
|---|
| 2910 | typedef SmallSetVector<BasicBlock *, 16> BBSetVector; | 
|---|
| 2911 | BBSetVector BBWorklist; | 
|---|
| 2912 | BBWorklist.insert(X: &F.getEntryBlock()); | 
|---|
| 2913 |  | 
|---|
| 2914 | // Note that we *must not* cache the size, this loop grows the worklist. | 
|---|
| 2915 | for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) { | 
|---|
| 2916 | if (shouldStop()) | 
|---|
| 2917 | break; | 
|---|
| 2918 |  | 
|---|
| 2919 | BasicBlock *BB = BBWorklist[Idx]; | 
|---|
| 2920 | if (BB->empty()) | 
|---|
| 2921 | continue; | 
|---|
| 2922 |  | 
|---|
| 2923 | onBlockStart(BB); | 
|---|
| 2924 |  | 
|---|
| 2925 | // Disallow inlining a blockaddress with uses other than strictly callbr. | 
|---|
| 2926 | // A blockaddress only has defined behavior for an indirect branch in the | 
|---|
| 2927 | // same function, and we do not currently support inlining indirect | 
|---|
| 2928 | // branches.  But, the inliner may not see an indirect branch that ends up | 
|---|
| 2929 | // being dead code at a particular call site. If the blockaddress escapes | 
|---|
| 2930 | // the function, e.g., via a global variable, inlining may lead to an | 
|---|
| 2931 | // invalid cross-function reference. | 
|---|
| 2932 | // FIXME: pr/39560: continue relaxing this overt restriction. | 
|---|
| 2933 | if (BB->hasAddressTaken()) | 
|---|
| 2934 | for (User *U : BlockAddress::get(BB: &*BB)->users()) | 
|---|
| 2935 | if (!isa<CallBrInst>(Val: *U)) | 
|---|
| 2936 | return InlineResult::failure(Reason: "blockaddress used outside of callbr"); | 
|---|
| 2937 |  | 
|---|
| 2938 | // Analyze the cost of this block. If we blow through the threshold, this | 
|---|
| 2939 | // returns false, and we can bail on out. | 
|---|
| 2940 | InlineResult IR = analyzeBlock(BB, EphValues: *EphValues); | 
|---|
| 2941 | if (!IR.isSuccess()) | 
|---|
| 2942 | return IR; | 
|---|
| 2943 |  | 
|---|
| 2944 | Instruction *TI = BB->getTerminator(); | 
|---|
| 2945 |  | 
|---|
| 2946 | // Add in the live successors by first checking whether we have terminator | 
|---|
| 2947 | // that may be simplified based on the values simplified by this call. | 
|---|
| 2948 | if (BranchInst *BI = dyn_cast<BranchInst>(Val: TI)) { | 
|---|
| 2949 | if (BI->isConditional()) { | 
|---|
| 2950 | Value *Cond = BI->getCondition(); | 
|---|
| 2951 | if (ConstantInt *SimpleCond = getSimplifiedValue<ConstantInt>(V: Cond)) { | 
|---|
| 2952 | BasicBlock *NextBB = BI->getSuccessor(i: SimpleCond->isZero() ? 1 : 0); | 
|---|
| 2953 | BBWorklist.insert(X: NextBB); | 
|---|
| 2954 | KnownSuccessors[BB] = NextBB; | 
|---|
| 2955 | findDeadBlocks(CurrBB: BB, NextBB); | 
|---|
| 2956 | continue; | 
|---|
| 2957 | } | 
|---|
| 2958 | } | 
|---|
| 2959 | } else if (SwitchInst *SI = dyn_cast<SwitchInst>(Val: TI)) { | 
|---|
| 2960 | Value *Cond = SI->getCondition(); | 
|---|
| 2961 | if (ConstantInt *SimpleCond = getSimplifiedValue<ConstantInt>(V: Cond)) { | 
|---|
| 2962 | BasicBlock *NextBB = SI->findCaseValue(C: SimpleCond)->getCaseSuccessor(); | 
|---|
| 2963 | BBWorklist.insert(X: NextBB); | 
|---|
| 2964 | KnownSuccessors[BB] = NextBB; | 
|---|
| 2965 | findDeadBlocks(CurrBB: BB, NextBB); | 
|---|
| 2966 | continue; | 
|---|
| 2967 | } | 
|---|
| 2968 | } | 
|---|
| 2969 |  | 
|---|
| 2970 | // If we're unable to select a particular successor, just count all of | 
|---|
| 2971 | // them. | 
|---|
| 2972 | BBWorklist.insert_range(R: successors(BB)); | 
|---|
| 2973 |  | 
|---|
| 2974 | onBlockAnalyzed(BB); | 
|---|
| 2975 | } | 
|---|
| 2976 |  | 
|---|
| 2977 | // If this is a noduplicate call, we can still inline as long as | 
|---|
| 2978 | // inlining this would cause the removal of the caller (so the instruction | 
|---|
| 2979 | // is not actually duplicated, just moved). | 
|---|
| 2980 | if (!isSoleCallToLocalFunction(CB: CandidateCall, Callee: F) && ContainsNoDuplicateCall) | 
|---|
| 2981 | return InlineResult::failure(Reason: "noduplicate"); | 
|---|
| 2982 |  | 
|---|
| 2983 | // If the callee's stack size exceeds the user-specified threshold, | 
|---|
| 2984 | // do not let it be inlined. | 
|---|
| 2985 | // The command line option overrides a limit set in the function attributes. | 
|---|
| 2986 | size_t FinalStackSizeThreshold = StackSizeThreshold; | 
|---|
| 2987 | if (!StackSizeThreshold.getNumOccurrences()) | 
|---|
| 2988 | if (std::optional<int> AttrMaxStackSize = getStringFnAttrAsInt( | 
|---|
| 2989 | F: Caller, AttrKind: InlineConstants::MaxInlineStackSizeAttributeName)) | 
|---|
| 2990 | FinalStackSizeThreshold = *AttrMaxStackSize; | 
|---|
| 2991 | if (AllocatedSize > FinalStackSizeThreshold) | 
|---|
| 2992 | return InlineResult::failure(Reason: "stacksize"); | 
|---|
| 2993 |  | 
|---|
| 2994 | return finalizeAnalysis(); | 
|---|
| 2995 | } | 
|---|
| 2996 |  | 
|---|
| 2997 | void InlineCostCallAnalyzer::print(raw_ostream &OS) { | 
|---|
| 2998 | #define DEBUG_PRINT_STAT(x) OS << "      " #x ": " << x << "\n" | 
|---|
| 2999 | if (PrintInstructionComments) | 
|---|
| 3000 | F.print(OS, AAW: &Writer); | 
|---|
| 3001 | DEBUG_PRINT_STAT(NumConstantArgs); | 
|---|
| 3002 | DEBUG_PRINT_STAT(NumConstantOffsetPtrArgs); | 
|---|
| 3003 | DEBUG_PRINT_STAT(NumAllocaArgs); | 
|---|
| 3004 | DEBUG_PRINT_STAT(NumConstantPtrCmps); | 
|---|
| 3005 | DEBUG_PRINT_STAT(NumConstantPtrDiffs); | 
|---|
| 3006 | DEBUG_PRINT_STAT(NumInstructionsSimplified); | 
|---|
| 3007 | DEBUG_PRINT_STAT(NumInstructions); | 
|---|
| 3008 | DEBUG_PRINT_STAT(SROACostSavings); | 
|---|
| 3009 | DEBUG_PRINT_STAT(SROACostSavingsLost); | 
|---|
| 3010 | DEBUG_PRINT_STAT(LoadEliminationCost); | 
|---|
| 3011 | DEBUG_PRINT_STAT(ContainsNoDuplicateCall); | 
|---|
| 3012 | DEBUG_PRINT_STAT(Cost); | 
|---|
| 3013 | DEBUG_PRINT_STAT(Threshold); | 
|---|
| 3014 | #undef DEBUG_PRINT_STAT | 
|---|
| 3015 | } | 
|---|
| 3016 |  | 
|---|
| 3017 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) | 
|---|
| 3018 | /// Dump stats about this call's analysis. | 
|---|
| 3019 | LLVM_DUMP_METHOD void InlineCostCallAnalyzer::dump() { print(dbgs()); } | 
|---|
| 3020 | #endif | 
|---|
| 3021 |  | 
|---|
| 3022 | /// Test that there are no attribute conflicts between Caller and Callee | 
|---|
| 3023 | ///        that prevent inlining. | 
|---|
| 3024 | static bool functionsHaveCompatibleAttributes( | 
|---|
| 3025 | Function *Caller, Function *Callee, TargetTransformInfo &TTI, | 
|---|
| 3026 | function_ref<const TargetLibraryInfo &(Function &)> &GetTLI) { | 
|---|
| 3027 | // Note that CalleeTLI must be a copy not a reference. The legacy pass manager | 
|---|
| 3028 | // caches the most recently created TLI in the TargetLibraryInfoWrapperPass | 
|---|
| 3029 | // object, and always returns the same object (which is overwritten on each | 
|---|
| 3030 | // GetTLI call). Therefore we copy the first result. | 
|---|
| 3031 | auto CalleeTLI = GetTLI(*Callee); | 
|---|
| 3032 | return (IgnoreTTIInlineCompatible || | 
|---|
| 3033 | TTI.areInlineCompatible(Caller, Callee)) && | 
|---|
| 3034 | GetTLI(*Caller).areInlineCompatible(CalleeTLI, | 
|---|
| 3035 | AllowCallerSuperset: InlineCallerSupersetNoBuiltin) && | 
|---|
| 3036 | AttributeFuncs::areInlineCompatible(Caller: *Caller, Callee: *Callee); | 
|---|
| 3037 | } | 
|---|
| 3038 |  | 
|---|
| 3039 | int llvm::getCallsiteCost(const TargetTransformInfo &TTI, const CallBase &Call, | 
|---|
| 3040 | const DataLayout &DL) { | 
|---|
| 3041 | int64_t Cost = 0; | 
|---|
| 3042 | for (unsigned I = 0, E = Call.arg_size(); I != E; ++I) { | 
|---|
| 3043 | if (Call.isByValArgument(ArgNo: I)) { | 
|---|
| 3044 | // We approximate the number of loads and stores needed by dividing the | 
|---|
| 3045 | // size of the byval type by the target's pointer size. | 
|---|
| 3046 | PointerType *PTy = cast<PointerType>(Val: Call.getArgOperand(i: I)->getType()); | 
|---|
| 3047 | unsigned TypeSize = DL.getTypeSizeInBits(Ty: Call.getParamByValType(ArgNo: I)); | 
|---|
| 3048 | unsigned AS = PTy->getAddressSpace(); | 
|---|
| 3049 | unsigned PointerSize = DL.getPointerSizeInBits(AS); | 
|---|
| 3050 | // Ceiling division. | 
|---|
| 3051 | unsigned NumStores = (TypeSize + PointerSize - 1) / PointerSize; | 
|---|
| 3052 |  | 
|---|
| 3053 | // If it generates more than 8 stores it is likely to be expanded as an | 
|---|
| 3054 | // inline memcpy so we take that as an upper bound. Otherwise we assume | 
|---|
| 3055 | // one load and one store per word copied. | 
|---|
| 3056 | // FIXME: The maxStoresPerMemcpy setting from the target should be used | 
|---|
| 3057 | // here instead of a magic number of 8, but it's not available via | 
|---|
| 3058 | // DataLayout. | 
|---|
| 3059 | NumStores = std::min(a: NumStores, b: 8U); | 
|---|
| 3060 |  | 
|---|
| 3061 | Cost += 2 * NumStores * InstrCost; | 
|---|
| 3062 | } else { | 
|---|
| 3063 | // For non-byval arguments subtract off one instruction per call | 
|---|
| 3064 | // argument. | 
|---|
| 3065 | Cost += InstrCost; | 
|---|
| 3066 | } | 
|---|
| 3067 | } | 
|---|
| 3068 | // The call instruction also disappears after inlining. | 
|---|
| 3069 | Cost += InstrCost; | 
|---|
| 3070 | Cost += TTI.getInlineCallPenalty(F: Call.getCaller(), Call, DefaultCallPenalty: CallPenalty); | 
|---|
| 3071 |  | 
|---|
| 3072 | return std::min<int64_t>(a: Cost, INT_MAX); | 
|---|
| 3073 | } | 
|---|
| 3074 |  | 
|---|
| 3075 | InlineCost llvm::getInlineCost( | 
|---|
| 3076 | CallBase &Call, const InlineParams &Params, TargetTransformInfo &CalleeTTI, | 
|---|
| 3077 | function_ref<AssumptionCache &(Function &)> GetAssumptionCache, | 
|---|
| 3078 | function_ref<const TargetLibraryInfo &(Function &)> GetTLI, | 
|---|
| 3079 | function_ref<BlockFrequencyInfo &(Function &)> GetBFI, | 
|---|
| 3080 | ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE, | 
|---|
| 3081 | function_ref<EphemeralValuesCache &(Function &)> GetEphValuesCache) { | 
|---|
| 3082 | return getInlineCost(Call, Callee: Call.getCalledFunction(), Params, CalleeTTI, | 
|---|
| 3083 | GetAssumptionCache, GetTLI, GetBFI, PSI, ORE, | 
|---|
| 3084 | GetEphValuesCache); | 
|---|
| 3085 | } | 
|---|
| 3086 |  | 
|---|
| 3087 | std::optional<int> llvm::getInliningCostEstimate( | 
|---|
| 3088 | CallBase &Call, TargetTransformInfo &CalleeTTI, | 
|---|
| 3089 | function_ref<AssumptionCache &(Function &)> GetAssumptionCache, | 
|---|
| 3090 | function_ref<BlockFrequencyInfo &(Function &)> GetBFI, | 
|---|
| 3091 | function_ref<const TargetLibraryInfo &(Function &)> GetTLI, | 
|---|
| 3092 | ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) { | 
|---|
| 3093 | const InlineParams Params = {/* DefaultThreshold*/ 0, | 
|---|
| 3094 | /*HintThreshold*/ {}, | 
|---|
| 3095 | /*ColdThreshold*/ {}, | 
|---|
| 3096 | /*OptSizeThreshold*/ {}, | 
|---|
| 3097 | /*OptMinSizeThreshold*/ {}, | 
|---|
| 3098 | /*HotCallSiteThreshold*/ {}, | 
|---|
| 3099 | /*LocallyHotCallSiteThreshold*/ {}, | 
|---|
| 3100 | /*ColdCallSiteThreshold*/ {}, | 
|---|
| 3101 | /*ComputeFullInlineCost*/ true, | 
|---|
| 3102 | /*EnableDeferral*/ true}; | 
|---|
| 3103 |  | 
|---|
| 3104 | InlineCostCallAnalyzer CA(*Call.getCalledFunction(), Call, Params, CalleeTTI, | 
|---|
| 3105 | GetAssumptionCache, GetBFI, GetTLI, PSI, ORE, true, | 
|---|
| 3106 | /*IgnoreThreshold*/ true); | 
|---|
| 3107 | auto R = CA.analyze(); | 
|---|
| 3108 | if (!R.isSuccess()) | 
|---|
| 3109 | return std::nullopt; | 
|---|
| 3110 | return CA.getCost(); | 
|---|
| 3111 | } | 
|---|
| 3112 |  | 
|---|
| 3113 | std::optional<InlineCostFeatures> llvm::getInliningCostFeatures( | 
|---|
| 3114 | CallBase &Call, TargetTransformInfo &CalleeTTI, | 
|---|
| 3115 | function_ref<AssumptionCache &(Function &)> GetAssumptionCache, | 
|---|
| 3116 | function_ref<BlockFrequencyInfo &(Function &)> GetBFI, | 
|---|
| 3117 | function_ref<const TargetLibraryInfo &(Function &)> GetTLI, | 
|---|
| 3118 | ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) { | 
|---|
| 3119 | InlineCostFeaturesAnalyzer CFA(CalleeTTI, GetAssumptionCache, GetBFI, GetTLI, | 
|---|
| 3120 | PSI, ORE, *Call.getCalledFunction(), Call); | 
|---|
| 3121 | auto R = CFA.analyze(); | 
|---|
| 3122 | if (!R.isSuccess()) | 
|---|
| 3123 | return std::nullopt; | 
|---|
| 3124 | return CFA.features(); | 
|---|
| 3125 | } | 
|---|
| 3126 |  | 
|---|
| 3127 | std::optional<InlineResult> llvm::getAttributeBasedInliningDecision( | 
|---|
| 3128 | CallBase &Call, Function *Callee, TargetTransformInfo &CalleeTTI, | 
|---|
| 3129 | function_ref<const TargetLibraryInfo &(Function &)> GetTLI) { | 
|---|
| 3130 |  | 
|---|
| 3131 | // Cannot inline indirect calls. | 
|---|
| 3132 | if (!Callee) | 
|---|
| 3133 | return InlineResult::failure(Reason: "indirect call"); | 
|---|
| 3134 |  | 
|---|
| 3135 | // When callee coroutine function is inlined into caller coroutine function | 
|---|
| 3136 | // before coro-split pass, | 
|---|
| 3137 | // coro-early pass can not handle this quiet well. | 
|---|
| 3138 | // So we won't inline the coroutine function if it have not been unsplited | 
|---|
| 3139 | if (Callee->isPresplitCoroutine()) | 
|---|
| 3140 | return InlineResult::failure(Reason: "unsplited coroutine call"); | 
|---|
| 3141 |  | 
|---|
| 3142 | // Never inline calls with byval arguments that does not have the alloca | 
|---|
| 3143 | // address space. Since byval arguments can be replaced with a copy to an | 
|---|
| 3144 | // alloca, the inlined code would need to be adjusted to handle that the | 
|---|
| 3145 | // argument is in the alloca address space (so it is a little bit complicated | 
|---|
| 3146 | // to solve). | 
|---|
| 3147 | unsigned AllocaAS = Callee->getDataLayout().getAllocaAddrSpace(); | 
|---|
| 3148 | for (unsigned I = 0, E = Call.arg_size(); I != E; ++I) | 
|---|
| 3149 | if (Call.isByValArgument(ArgNo: I)) { | 
|---|
| 3150 | PointerType *PTy = cast<PointerType>(Val: Call.getArgOperand(i: I)->getType()); | 
|---|
| 3151 | if (PTy->getAddressSpace() != AllocaAS) | 
|---|
| 3152 | return InlineResult::failure(Reason: "byval arguments without alloca" | 
|---|
| 3153 | " address space"); | 
|---|
| 3154 | } | 
|---|
| 3155 |  | 
|---|
| 3156 | // Calls to functions with always-inline attributes should be inlined | 
|---|
| 3157 | // whenever possible. | 
|---|
| 3158 | if (Call.hasFnAttr(Kind: Attribute::AlwaysInline)) { | 
|---|
| 3159 | if (Call.getAttributes().hasFnAttr(Kind: Attribute::NoInline)) | 
|---|
| 3160 | return InlineResult::failure(Reason: "noinline call site attribute"); | 
|---|
| 3161 |  | 
|---|
| 3162 | auto IsViable = isInlineViable(Callee&: *Callee); | 
|---|
| 3163 | if (IsViable.isSuccess()) | 
|---|
| 3164 | return InlineResult::success(); | 
|---|
| 3165 | return InlineResult::failure(Reason: IsViable.getFailureReason()); | 
|---|
| 3166 | } | 
|---|
| 3167 |  | 
|---|
| 3168 | // Never inline functions with conflicting attributes (unless callee has | 
|---|
| 3169 | // always-inline attribute). | 
|---|
| 3170 | Function *Caller = Call.getCaller(); | 
|---|
| 3171 | if (!functionsHaveCompatibleAttributes(Caller, Callee, TTI&: CalleeTTI, GetTLI)) | 
|---|
| 3172 | return InlineResult::failure(Reason: "conflicting attributes"); | 
|---|
| 3173 |  | 
|---|
| 3174 | // Don't inline this call if the caller has the optnone attribute. | 
|---|
| 3175 | if (Caller->hasOptNone()) | 
|---|
| 3176 | return InlineResult::failure(Reason: "optnone attribute"); | 
|---|
| 3177 |  | 
|---|
| 3178 | // Don't inline a function that treats null pointer as valid into a caller | 
|---|
| 3179 | // that does not have this attribute. | 
|---|
| 3180 | if (!Caller->nullPointerIsDefined() && Callee->nullPointerIsDefined()) | 
|---|
| 3181 | return InlineResult::failure(Reason: "nullptr definitions incompatible"); | 
|---|
| 3182 |  | 
|---|
| 3183 | // Don't inline functions which can be interposed at link-time. | 
|---|
| 3184 | if (Callee->isInterposable()) | 
|---|
| 3185 | return InlineResult::failure(Reason: "interposable"); | 
|---|
| 3186 |  | 
|---|
| 3187 | // Don't inline functions marked noinline. | 
|---|
| 3188 | if (Callee->hasFnAttribute(Kind: Attribute::NoInline)) | 
|---|
| 3189 | return InlineResult::failure(Reason: "noinline function attribute"); | 
|---|
| 3190 |  | 
|---|
| 3191 | // Don't inline call sites marked noinline. | 
|---|
| 3192 | if (Call.isNoInline()) | 
|---|
| 3193 | return InlineResult::failure(Reason: "noinline call site attribute"); | 
|---|
| 3194 |  | 
|---|
| 3195 | // Don't inline functions that are loader replaceable. | 
|---|
| 3196 | if (Callee->hasFnAttribute(Kind: "loader-replaceable")) | 
|---|
| 3197 | return InlineResult::failure(Reason: "loader replaceable function attribute"); | 
|---|
| 3198 |  | 
|---|
| 3199 | return std::nullopt; | 
|---|
| 3200 | } | 
|---|
| 3201 |  | 
|---|
| 3202 | InlineCost llvm::getInlineCost( | 
|---|
| 3203 | CallBase &Call, Function *Callee, const InlineParams &Params, | 
|---|
| 3204 | TargetTransformInfo &CalleeTTI, | 
|---|
| 3205 | function_ref<AssumptionCache &(Function &)> GetAssumptionCache, | 
|---|
| 3206 | function_ref<const TargetLibraryInfo &(Function &)> GetTLI, | 
|---|
| 3207 | function_ref<BlockFrequencyInfo &(Function &)> GetBFI, | 
|---|
| 3208 | ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE, | 
|---|
| 3209 | function_ref<EphemeralValuesCache &(Function &)> GetEphValuesCache) { | 
|---|
| 3210 |  | 
|---|
| 3211 | auto UserDecision = | 
|---|
| 3212 | llvm::getAttributeBasedInliningDecision(Call, Callee, CalleeTTI, GetTLI); | 
|---|
| 3213 |  | 
|---|
| 3214 | if (UserDecision) { | 
|---|
| 3215 | if (UserDecision->isSuccess()) | 
|---|
| 3216 | return llvm::InlineCost::getAlways(Reason: "always inline attribute"); | 
|---|
| 3217 | return llvm::InlineCost::getNever(Reason: UserDecision->getFailureReason()); | 
|---|
| 3218 | } | 
|---|
| 3219 |  | 
|---|
| 3220 | LLVM_DEBUG(llvm::dbgs() << "      Analyzing call of "<< Callee->getName() | 
|---|
| 3221 | << "... (caller:"<< Call.getCaller()->getName() | 
|---|
| 3222 | << ")\n"); | 
|---|
| 3223 |  | 
|---|
| 3224 | InlineCostCallAnalyzer CA(*Callee, Call, Params, CalleeTTI, | 
|---|
| 3225 | GetAssumptionCache, GetBFI, GetTLI, PSI, ORE, | 
|---|
| 3226 | /*BoostIndirect=*/true, /*IgnoreThreshold=*/false, | 
|---|
| 3227 | GetEphValuesCache); | 
|---|
| 3228 | InlineResult ShouldInline = CA.analyze(); | 
|---|
| 3229 |  | 
|---|
| 3230 | LLVM_DEBUG(CA.dump()); | 
|---|
| 3231 |  | 
|---|
| 3232 | // Always make cost benefit based decision explicit. | 
|---|
| 3233 | // We use always/never here since threshold is not meaningful, | 
|---|
| 3234 | // as it's not what drives cost-benefit analysis. | 
|---|
| 3235 | if (CA.wasDecidedByCostBenefit()) { | 
|---|
| 3236 | if (ShouldInline.isSuccess()) | 
|---|
| 3237 | return InlineCost::getAlways(Reason: "benefit over cost", | 
|---|
| 3238 | CostBenefit: CA.getCostBenefitPair()); | 
|---|
| 3239 | else | 
|---|
| 3240 | return InlineCost::getNever(Reason: "cost over benefit", CostBenefit: CA.getCostBenefitPair()); | 
|---|
| 3241 | } | 
|---|
| 3242 |  | 
|---|
| 3243 | if (CA.wasDecidedByCostThreshold()) | 
|---|
| 3244 | return InlineCost::get(Cost: CA.getCost(), Threshold: CA.getThreshold(), | 
|---|
| 3245 | StaticBonus: CA.getStaticBonusApplied()); | 
|---|
| 3246 |  | 
|---|
| 3247 | // No details on how the decision was made, simply return always or never. | 
|---|
| 3248 | return ShouldInline.isSuccess() | 
|---|
| 3249 | ? InlineCost::getAlways(Reason: "empty function") | 
|---|
| 3250 | : InlineCost::getNever(Reason: ShouldInline.getFailureReason()); | 
|---|
| 3251 | } | 
|---|
| 3252 |  | 
|---|
| 3253 | InlineResult llvm::isInlineViable(Function &F) { | 
|---|
| 3254 | bool ReturnsTwice = F.hasFnAttribute(Kind: Attribute::ReturnsTwice); | 
|---|
| 3255 | for (BasicBlock &BB : F) { | 
|---|
| 3256 | // Disallow inlining of functions which contain indirect branches. | 
|---|
| 3257 | if (isa<IndirectBrInst>(Val: BB.getTerminator())) | 
|---|
| 3258 | return InlineResult::failure(Reason: "contains indirect branches"); | 
|---|
| 3259 |  | 
|---|
| 3260 | // Disallow inlining of blockaddresses which are used by non-callbr | 
|---|
| 3261 | // instructions. | 
|---|
| 3262 | if (BB.hasAddressTaken()) | 
|---|
| 3263 | for (User *U : BlockAddress::get(BB: &BB)->users()) | 
|---|
| 3264 | if (!isa<CallBrInst>(Val: *U)) | 
|---|
| 3265 | return InlineResult::failure(Reason: "blockaddress used outside of callbr"); | 
|---|
| 3266 |  | 
|---|
| 3267 | for (auto &II : BB) { | 
|---|
| 3268 | CallBase *Call = dyn_cast<CallBase>(Val: &II); | 
|---|
| 3269 | if (!Call) | 
|---|
| 3270 | continue; | 
|---|
| 3271 |  | 
|---|
| 3272 | // Disallow recursive calls. | 
|---|
| 3273 | Function *Callee = Call->getCalledFunction(); | 
|---|
| 3274 | if (&F == Callee) | 
|---|
| 3275 | return InlineResult::failure(Reason: "recursive call"); | 
|---|
| 3276 |  | 
|---|
| 3277 | // Disallow calls which expose returns-twice to a function not previously | 
|---|
| 3278 | // attributed as such. | 
|---|
| 3279 | if (!ReturnsTwice && isa<CallInst>(Val: Call) && | 
|---|
| 3280 | cast<CallInst>(Val: Call)->canReturnTwice()) | 
|---|
| 3281 | return InlineResult::failure(Reason: "exposes returns-twice attribute"); | 
|---|
| 3282 |  | 
|---|
| 3283 | if (Callee) | 
|---|
| 3284 | switch (Callee->getIntrinsicID()) { | 
|---|
| 3285 | default: | 
|---|
| 3286 | break; | 
|---|
| 3287 | case llvm::Intrinsic::icall_branch_funnel: | 
|---|
| 3288 | // Disallow inlining of @llvm.icall.branch.funnel because current | 
|---|
| 3289 | // backend can't separate call targets from call arguments. | 
|---|
| 3290 | return InlineResult::failure( | 
|---|
| 3291 | Reason: "disallowed inlining of @llvm.icall.branch.funnel"); | 
|---|
| 3292 | case llvm::Intrinsic::localescape: | 
|---|
| 3293 | // Disallow inlining functions that call @llvm.localescape. Doing this | 
|---|
| 3294 | // correctly would require major changes to the inliner. | 
|---|
| 3295 | return InlineResult::failure( | 
|---|
| 3296 | Reason: "disallowed inlining of @llvm.localescape"); | 
|---|
| 3297 | case llvm::Intrinsic::vastart: | 
|---|
| 3298 | // Disallow inlining of functions that initialize VarArgs with | 
|---|
| 3299 | // va_start. | 
|---|
| 3300 | return InlineResult::failure( | 
|---|
| 3301 | Reason: "contains VarArgs initialized with va_start"); | 
|---|
| 3302 | } | 
|---|
| 3303 | } | 
|---|
| 3304 | } | 
|---|
| 3305 |  | 
|---|
| 3306 | return InlineResult::success(); | 
|---|
| 3307 | } | 
|---|
| 3308 |  | 
|---|
| 3309 | // APIs to create InlineParams based on command line flags and/or other | 
|---|
| 3310 | // parameters. | 
|---|
| 3311 |  | 
|---|
| 3312 | InlineParams llvm::getInlineParams(int Threshold) { | 
|---|
| 3313 | InlineParams Params; | 
|---|
| 3314 |  | 
|---|
| 3315 | // This field is the threshold to use for a callee by default. This is | 
|---|
| 3316 | // derived from one or more of: | 
|---|
| 3317 | //  * optimization or size-optimization levels, | 
|---|
| 3318 | //  * a value passed to createFunctionInliningPass function, or | 
|---|
| 3319 | //  * the -inline-threshold flag. | 
|---|
| 3320 | //  If the -inline-threshold flag is explicitly specified, that is used | 
|---|
| 3321 | //  irrespective of anything else. | 
|---|
| 3322 | if (InlineThreshold.getNumOccurrences() > 0) | 
|---|
| 3323 | Params.DefaultThreshold = InlineThreshold; | 
|---|
| 3324 | else | 
|---|
| 3325 | Params.DefaultThreshold = Threshold; | 
|---|
| 3326 |  | 
|---|
| 3327 | // Set the HintThreshold knob from the -inlinehint-threshold. | 
|---|
| 3328 | Params.HintThreshold = HintThreshold; | 
|---|
| 3329 |  | 
|---|
| 3330 | // Set the HotCallSiteThreshold knob from the -hot-callsite-threshold. | 
|---|
| 3331 | Params.HotCallSiteThreshold = HotCallSiteThreshold; | 
|---|
| 3332 |  | 
|---|
| 3333 | // If the -locally-hot-callsite-threshold is explicitly specified, use it to | 
|---|
| 3334 | // populate LocallyHotCallSiteThreshold. Later, we populate | 
|---|
| 3335 | // Params.LocallyHotCallSiteThreshold from -locally-hot-callsite-threshold if | 
|---|
| 3336 | // we know that optimization level is O3 (in the getInlineParams variant that | 
|---|
| 3337 | // takes the opt and size levels). | 
|---|
| 3338 | // FIXME: Remove this check (and make the assignment unconditional) after | 
|---|
| 3339 | // addressing size regression issues at O2. | 
|---|
| 3340 | if (LocallyHotCallSiteThreshold.getNumOccurrences() > 0) | 
|---|
| 3341 | Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold; | 
|---|
| 3342 |  | 
|---|
| 3343 | // Set the ColdCallSiteThreshold knob from the | 
|---|
| 3344 | // -inline-cold-callsite-threshold. | 
|---|
| 3345 | Params.ColdCallSiteThreshold = ColdCallSiteThreshold; | 
|---|
| 3346 |  | 
|---|
| 3347 | // Set the OptMinSizeThreshold and OptSizeThreshold params only if the | 
|---|
| 3348 | // -inlinehint-threshold commandline option is not explicitly given. If that | 
|---|
| 3349 | // option is present, then its value applies even for callees with size and | 
|---|
| 3350 | // minsize attributes. | 
|---|
| 3351 | // If the -inline-threshold is not specified, set the ColdThreshold from the | 
|---|
| 3352 | // -inlinecold-threshold even if it is not explicitly passed. If | 
|---|
| 3353 | // -inline-threshold is specified, then -inlinecold-threshold needs to be | 
|---|
| 3354 | // explicitly specified to set the ColdThreshold knob | 
|---|
| 3355 | if (InlineThreshold.getNumOccurrences() == 0) { | 
|---|
| 3356 | Params.OptMinSizeThreshold = InlineConstants::OptMinSizeThreshold; | 
|---|
| 3357 | Params.OptSizeThreshold = InlineConstants::OptSizeThreshold; | 
|---|
| 3358 | Params.ColdThreshold = ColdThreshold; | 
|---|
| 3359 | } else if (ColdThreshold.getNumOccurrences() > 0) { | 
|---|
| 3360 | Params.ColdThreshold = ColdThreshold; | 
|---|
| 3361 | } | 
|---|
| 3362 | return Params; | 
|---|
| 3363 | } | 
|---|
| 3364 |  | 
|---|
| 3365 | InlineParams llvm::getInlineParams() { | 
|---|
| 3366 | return getInlineParams(Threshold: DefaultThreshold); | 
|---|
| 3367 | } | 
|---|
| 3368 |  | 
|---|
| 3369 | // Compute the default threshold for inlining based on the opt level and the | 
|---|
| 3370 | // size opt level. | 
|---|
| 3371 | static int computeThresholdFromOptLevels(unsigned OptLevel, | 
|---|
| 3372 | unsigned SizeOptLevel) { | 
|---|
| 3373 | if (OptLevel > 2) | 
|---|
| 3374 | return InlineConstants::OptAggressiveThreshold; | 
|---|
| 3375 | if (SizeOptLevel == 1) // -Os | 
|---|
| 3376 | return InlineConstants::OptSizeThreshold; | 
|---|
| 3377 | if (SizeOptLevel == 2) // -Oz | 
|---|
| 3378 | return InlineConstants::OptMinSizeThreshold; | 
|---|
| 3379 | return DefaultThreshold; | 
|---|
| 3380 | } | 
|---|
| 3381 |  | 
|---|
| 3382 | InlineParams llvm::getInlineParams(unsigned OptLevel, unsigned SizeOptLevel) { | 
|---|
| 3383 | auto Params = | 
|---|
| 3384 | getInlineParams(Threshold: computeThresholdFromOptLevels(OptLevel, SizeOptLevel)); | 
|---|
| 3385 | // At O3, use the value of -locally-hot-callsite-threshold option to populate | 
|---|
| 3386 | // Params.LocallyHotCallSiteThreshold. Below O3, this flag has effect only | 
|---|
| 3387 | // when it is specified explicitly. | 
|---|
| 3388 | if (OptLevel > 2) | 
|---|
| 3389 | Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold; | 
|---|
| 3390 | return Params; | 
|---|
| 3391 | } | 
|---|
| 3392 |  | 
|---|
| 3393 | PreservedAnalyses | 
|---|
| 3394 | InlineCostAnnotationPrinterPass::run(Function &F, | 
|---|
| 3395 | FunctionAnalysisManager &FAM) { | 
|---|
| 3396 | PrintInstructionComments = true; | 
|---|
| 3397 | std::function<AssumptionCache &(Function &)> GetAssumptionCache = | 
|---|
| 3398 | [&](Function &F) -> AssumptionCache & { | 
|---|
| 3399 | return FAM.getResult<AssumptionAnalysis>(IR&: F); | 
|---|
| 3400 | }; | 
|---|
| 3401 |  | 
|---|
| 3402 | auto &MAMProxy = FAM.getResult<ModuleAnalysisManagerFunctionProxy>(IR&: F); | 
|---|
| 3403 | ProfileSummaryInfo *PSI = | 
|---|
| 3404 | MAMProxy.getCachedResult<ProfileSummaryAnalysis>(IR&: *F.getParent()); | 
|---|
| 3405 | const TargetTransformInfo &TTI = FAM.getResult<TargetIRAnalysis>(IR&: F); | 
|---|
| 3406 |  | 
|---|
| 3407 | // FIXME: Redesign the usage of InlineParams to expand the scope of this pass. | 
|---|
| 3408 | // In the current implementation, the type of InlineParams doesn't matter as | 
|---|
| 3409 | // the pass serves only for verification of inliner's decisions. | 
|---|
| 3410 | // We can add a flag which determines InlineParams for this run. Right now, | 
|---|
| 3411 | // the default InlineParams are used. | 
|---|
| 3412 | const InlineParams Params = llvm::getInlineParams(); | 
|---|
| 3413 | for (BasicBlock &BB : F) { | 
|---|
| 3414 | for (Instruction &I : BB) { | 
|---|
| 3415 | if (auto *CB = dyn_cast<CallBase>(Val: &I)) { | 
|---|
| 3416 | Function *CalledFunction = CB->getCalledFunction(); | 
|---|
| 3417 | if (!CalledFunction || CalledFunction->isDeclaration()) | 
|---|
| 3418 | continue; | 
|---|
| 3419 | OptimizationRemarkEmitter ORE(CalledFunction); | 
|---|
| 3420 | InlineCostCallAnalyzer ICCA(*CalledFunction, *CB, Params, TTI, | 
|---|
| 3421 | GetAssumptionCache, nullptr, nullptr, PSI, | 
|---|
| 3422 | &ORE); | 
|---|
| 3423 | ICCA.analyze(); | 
|---|
| 3424 | OS << "      Analyzing call of "<< CalledFunction->getName() | 
|---|
| 3425 | << "... (caller:"<< CB->getCaller()->getName() << ")\n"; | 
|---|
| 3426 | ICCA.print(OS); | 
|---|
| 3427 | OS << "\n"; | 
|---|
| 3428 | } | 
|---|
| 3429 | } | 
|---|
| 3430 | } | 
|---|
| 3431 | return PreservedAnalyses::all(); | 
|---|
| 3432 | } | 
|---|
| 3433 |  | 
|---|