//===- FunctionSpecialization.cpp - Function Specialization ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/IPO/FunctionSpecialization.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/InlineCost.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueLattice.h"
#include "llvm/Analysis/ValueLatticeUtils.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Transforms/Scalar/SCCP.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/SCCPSolver.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include <cmath>

using namespace llvm;

#define DEBUG_TYPE "function-specialization"

STATISTIC(NumSpecsCreated, "Number of specializations created");

static cl::opt<bool> ForceSpecialization(
    "force-specialization", cl::init(false), cl::Hidden, cl::desc(
    "Force function specialization for every call site with a constant "
    "argument"));

static cl::opt<unsigned> MaxClones(
    "funcspec-max-clones", cl::init(3), cl::Hidden, cl::desc(
    "The maximum number of clones allowed for a single function "
    "specialization"));

static cl::opt<unsigned>
    MaxDiscoveryIterations("funcspec-max-discovery-iterations", cl::init(100),
                           cl::Hidden,
                           cl::desc("The maximum number of iterations allowed "
                                    "when searching for transitive "
                                    "phis"));

static cl::opt<unsigned> MaxIncomingPhiValues(
    "funcspec-max-incoming-phi-values", cl::init(8), cl::Hidden,
    cl::desc("The maximum number of incoming values a PHI node can have to be "
             "considered during the specialization bonus estimation"));

static cl::opt<unsigned> MaxBlockPredecessors(
    "funcspec-max-block-predecessors", cl::init(2), cl::Hidden, cl::desc(
    "The maximum number of predecessors a basic block can have to be "
    "considered during the estimation of dead code"));

static cl::opt<unsigned> MinFunctionSize(
    "funcspec-min-function-size", cl::init(500), cl::Hidden,
    cl::desc("Don't specialize functions that have fewer than this number of "
             "instructions"));

static cl::opt<unsigned> MaxCodeSizeGrowth(
    "funcspec-max-codesize-growth", cl::init(3), cl::Hidden, cl::desc(
    "Maximum codesize growth allowed per function"));

static cl::opt<unsigned> MinCodeSizeSavings(
    "funcspec-min-codesize-savings", cl::init(20), cl::Hidden,
    cl::desc("Reject specializations whose codesize savings are less than "
             "this percentage of the original function size"));

static cl::opt<unsigned> MinLatencySavings(
    "funcspec-min-latency-savings", cl::init(20), cl::Hidden,
    cl::desc("Reject specializations whose latency savings are less than "
             "this percentage of the original function size"));

static cl::opt<unsigned> MinInliningBonus(
    "funcspec-min-inlining-bonus", cl::init(300), cl::Hidden,
    cl::desc("Reject specializations whose inlining bonus is less than "
             "this percentage of the original function size"));

static cl::opt<bool> SpecializeOnAddress(
    "funcspec-on-address", cl::init(false), cl::Hidden, cl::desc(
    "Enable function specialization on the address of global values"));

static cl::opt<bool> SpecializeLiteralConstant(
    "funcspec-for-literal-constant", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable specialization of functions that take a literal constant as "
        "an argument"));

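/// Return true if successor \p Succ of block \p BB would become dead once
/// \p BB is eliminated: every predecessor must be \p BB, \p Succ itself, or a
/// block already found to be dead. Conservatively gives up once more than
/// MaxBlockPredecessors predecessors have been examined.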
bool InstCostVisitor::canEliminateSuccessor(BasicBlock *BB,
                                            BasicBlock *Succ) const {
  unsigned I = 0;
  return all_of(predecessors(Succ), [&I, BB, Succ, this](BasicBlock *Pred) {
    return I++ < MaxBlockPredecessors &&
           (Pred == BB || Pred == Succ || !isBlockExecutable(Pred));
  });
}

// Estimates the codesize savings due to dead code after constant propagation.
// \p WorkList represents the basic blocks of a specialization which will
// eventually become dead once we replace instructions that are known to be
// constants. The successors of such blocks are added to the list as long as
// the \p Solver found they were executable prior to specialization, and only
// if all their predecessors are dead.
Cost InstCostVisitor::estimateBasicBlocks(
    SmallVectorImpl<BasicBlock *> &WorkList) {
  Cost CodeSize = 0;
  // Accumulate the codesize savings of each basic block.
  while (!WorkList.empty()) {
    BasicBlock *BB = WorkList.pop_back_val();

    // These blocks are considered dead as far as the InstCostVisitor
    // is concerned. They haven't been proven dead yet by the Solver,
    // but may become dead if we propagate the specialization arguments.
    assert(Solver.isBlockExecutable(BB) && "BB already found dead by IPSCCP!");
    if (!DeadBlocks.insert(BB).second)
      continue;

    for (Instruction &I : *BB) {
      // If it's a known constant we have already accounted for it.
      if (KnownConstants.contains(&I))
        continue;

      Cost C = TTI.getInstructionCost(&I, TargetTransformInfo::TCK_CodeSize);

      LLVM_DEBUG(dbgs() << "FnSpecialization: CodeSize " << C
                        << " for user " << I << "\n");
      CodeSize += C;
    }

    // Keep adding dead successors to the list as long as they are
    // executable and only reachable from dead blocks.
    for (BasicBlock *SuccBB : successors(BB))
      if (isBlockExecutable(SuccBB) && canEliminateSuccessor(BB, SuccBB))
        WorkList.push_back(SuccBB);
  }
  return CodeSize;
}

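/// Return the constant known for \p V, if any: \p V itself if it is a
/// Constant, a constant deduced by the Solver, or one recorded in
/// KnownConstants while visiting instructions. Returns null otherwise.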
Constant *InstCostVisitor::findConstantFor(Value *V) const {
  if (auto *C = dyn_cast<Constant>(V))
    return C;
  if (auto *C = Solver.getConstantOrNull(V))
    return C;
  return KnownConstants.lookup(V);
}

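/// Revisit the PHI nodes deferred while propagating the constant arguments,
/// and accumulate the codesize savings of those which still reside in
/// executable blocks.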
Cost InstCostVisitor::getCodeSizeSavingsFromPendingPHIs() {
  Cost CodeSize;
  while (!PendingPHIs.empty()) {
    Instruction *Phi = PendingPHIs.pop_back_val();
    // The pending PHIs could have been proven dead by now.
    if (isBlockExecutable(Phi->getParent()))
      CodeSize += getCodeSizeSavingsForUser(Phi);
  }
  return CodeSize;
}

/// Compute the codesize savings for replacing argument \p A with constant \p C.
Cost InstCostVisitor::getCodeSizeSavingsForArg(Argument *A, Constant *C) {
  LLVM_DEBUG(dbgs() << "FnSpecialization: Analysing bonus for constant: "
                    << C->getNameOrAsOperand() << "\n");
  Cost CodeSize;
  for (auto *U : A->users())
    if (auto *UI = dyn_cast<Instruction>(U))
      if (isBlockExecutable(UI->getParent()))
        CodeSize += getCodeSizeSavingsForUser(UI, A, C);

  LLVM_DEBUG(dbgs() << "FnSpecialization: Accumulated bonus {CodeSize = "
                    << CodeSize << "} for argument " << *A << "\n");
  return CodeSize;
}

/// Compute the latency savings from replacing all arguments with constants for
/// a specialization candidate. As this function computes the latency savings
/// for all Instructions in KnownConstants at once, it should be called only
/// after every instruction has been visited, i.e. after:
///
/// * getCodeSizeSavingsForArg has been run for every constant argument of a
///   specialization candidate
///
/// * getCodeSizeSavingsFromPendingPHIs has been run
///
/// to ensure that the latency savings are calculated for all Instructions we
/// have visited and found to be constant.
Cost InstCostVisitor::getLatencySavingsForKnownConstants() {
  auto &BFI = GetBFI(*F);
  Cost TotalLatency = 0;

  for (auto Pair : KnownConstants) {
    Instruction *I = dyn_cast<Instruction>(Pair.first);
    if (!I)
      continue;

    uint64_t Weight = BFI.getBlockFreq(I->getParent()).getFrequency() /
                      BFI.getEntryFreq().getFrequency();

    Cost Latency =
        Weight * TTI.getInstructionCost(I, TargetTransformInfo::TCK_Latency);

    LLVM_DEBUG(dbgs() << "FnSpecialization: {Latency = " << Latency
                      << "} for instruction " << *I << "\n");

    TotalLatency += Latency;
  }

  return TotalLatency;
}

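/// Recursively estimate the codesize savings of \p User and its transitive
/// users, given that its operand \p Use is replaced by constant \p C. Each
/// newly deduced constant is recorded in KnownConstants so that a user is
/// never accounted for twice.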
Cost InstCostVisitor::getCodeSizeSavingsForUser(Instruction *User, Value *Use,
                                                Constant *C) {
  // We have already propagated a constant for this user.
  if (KnownConstants.contains(User))
    return 0;

  // Cache the iterator before visiting.
  LastVisited = Use ? KnownConstants.insert({Use, C}).first
                    : KnownConstants.end();

  Cost CodeSize = 0;
  if (auto *I = dyn_cast<SwitchInst>(User)) {
    CodeSize = estimateSwitchInst(*I);
  } else if (auto *I = dyn_cast<BranchInst>(User)) {
    CodeSize = estimateBranchInst(*I);
  } else {
    C = visit(*User);
    if (!C)
      return 0;
  }

  // Although it doesn't make sense to bind switch and branch instructions
  // with a constant, unlike any other instruction type, inserting them here
  // prevents estimating their bonus multiple times.
  KnownConstants.insert({User, C});

  CodeSize += TTI.getInstructionCost(User, TargetTransformInfo::TCK_CodeSize);

  LLVM_DEBUG(dbgs() << "FnSpecialization: {CodeSize = " << CodeSize
                    << "} for user " << *User << "\n");

  for (auto *U : User->users())
    if (auto *UI = dyn_cast<Instruction>(U))
      if (UI != User && isBlockExecutable(UI->getParent()))
        CodeSize += getCodeSizeSavingsForUser(UI, User, C);

  return CodeSize;
}

Cost InstCostVisitor::estimateSwitchInst(SwitchInst &I) {
  assert(LastVisited != KnownConstants.end() && "Invalid iterator!");

  if (I.getCondition() != LastVisited->first)
    return 0;

  auto *C = dyn_cast<ConstantInt>(LastVisited->second);
  if (!C)
    return 0;

  BasicBlock *Succ = I.findCaseValue(C)->getCaseSuccessor();
  // Initialize the worklist with the dead basic blocks. These are the
  // destination labels which are different from the one corresponding
  // to \p C. They should be executable and have a unique predecessor.
  SmallVector<BasicBlock *> WorkList;
  for (const auto &Case : I.cases()) {
    BasicBlock *BB = Case.getCaseSuccessor();
    if (BB != Succ && isBlockExecutable(BB) &&
        canEliminateSuccessor(I.getParent(), BB))
      WorkList.push_back(BB);
  }

  return estimateBasicBlocks(WorkList);
}

Cost InstCostVisitor::estimateBranchInst(BranchInst &I) {
  assert(LastVisited != KnownConstants.end() && "Invalid iterator!");

  if (I.getCondition() != LastVisited->first)
    return 0;

  BasicBlock *Succ = I.getSuccessor(LastVisited->second->isOneValue());
  // Initialize the worklist with the dead successor as long as
  // it is executable and has a unique predecessor.
  SmallVector<BasicBlock *> WorkList;
  if (isBlockExecutable(Succ) && canEliminateSuccessor(I.getParent(), Succ))
    WorkList.push_back(Succ);

  return estimateBasicBlocks(WorkList);
}

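/// Starting from \p Root, check that every PHI node reachable through the
/// incoming values can only take the single constant \p Const, collecting the
/// visited PHIs in \p TransitivePHIs. Returns false if a different constant,
/// a value we cannot reason about, or one of the discovery limits is
/// encountered.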
bool InstCostVisitor::discoverTransitivelyIncomingValues(
    Constant *Const, PHINode *Root, DenseSet<PHINode *> &TransitivePHIs) {

  SmallVector<PHINode *, 64> WorkList;
  WorkList.push_back(Root);
  unsigned Iter = 0;

  while (!WorkList.empty()) {
    PHINode *PN = WorkList.pop_back_val();

    if (++Iter > MaxDiscoveryIterations ||
        PN->getNumIncomingValues() > MaxIncomingPhiValues)
      return false;

    if (!TransitivePHIs.insert(PN).second)
      continue;

    for (unsigned I = 0, E = PN->getNumIncomingValues(); I != E; ++I) {
      Value *V = PN->getIncomingValue(I);

      // Disregard self-references and dead incoming values.
      if (auto *Inst = dyn_cast<Instruction>(V))
        if (Inst == PN || !isBlockExecutable(PN->getIncomingBlock(I)))
          continue;

      if (Constant *C = findConstantFor(V)) {
        // Not all incoming values are the same constant. Bail immediately.
        if (C != Const)
          return false;
        continue;
      }

      if (auto *Phi = dyn_cast<PHINode>(V)) {
        WorkList.push_back(Phi);
        continue;
      }

      // We can't reason about anything else.
      return false;
    }
  }
  return true;
}

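/// Try to deduce a single constant for PHI node \p I. On the first visit, if
/// some incoming values are not yet known, the PHI is deferred to PendingPHIs
/// for a retry once all constant arguments have been propagated. Incoming
/// PHIs are tolerated and verified afterwards as transitive PHIs.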
Constant *InstCostVisitor::visitPHINode(PHINode &I) {
  if (I.getNumIncomingValues() > MaxIncomingPhiValues)
    return nullptr;

  bool Inserted = VisitedPHIs.insert(&I).second;
  Constant *Const = nullptr;
  bool HaveSeenIncomingPHI = false;

  for (unsigned Idx = 0, E = I.getNumIncomingValues(); Idx != E; ++Idx) {
    Value *V = I.getIncomingValue(Idx);

    // Disregard self-references and dead incoming values.
    if (auto *Inst = dyn_cast<Instruction>(V))
      if (Inst == &I || !isBlockExecutable(I.getIncomingBlock(Idx)))
        continue;

    if (Constant *C = findConstantFor(V)) {
      if (!Const)
        Const = C;
      // Not all incoming values are the same constant. Bail immediately.
      if (C != Const)
        return nullptr;
      continue;
    }

    if (Inserted) {
      // First time we are seeing this phi. We will retry later, after
      // all the constant arguments have been propagated. Bail for now.
      PendingPHIs.push_back(&I);
      return nullptr;
    }

    if (isa<PHINode>(V)) {
      // Perhaps it is a Transitive Phi. We will confirm later.
      HaveSeenIncomingPHI = true;
      continue;
    }

    // We can't reason about anything else.
    return nullptr;
  }

  if (!Const)
    return nullptr;

  if (!HaveSeenIncomingPHI)
    return Const;

  DenseSet<PHINode *> TransitivePHIs;
  if (!discoverTransitivelyIncomingValues(Const, &I, TransitivePHIs))
    return nullptr;

  return Const;
}

Constant *InstCostVisitor::visitFreezeInst(FreezeInst &I) {
  assert(LastVisited != KnownConstants.end() && "Invalid iterator!");

  if (isGuaranteedNotToBeUndefOrPoison(LastVisited->second))
    return LastVisited->second;
  return nullptr;
}

Constant *InstCostVisitor::visitCallBase(CallBase &I) {
  assert(LastVisited != KnownConstants.end() && "Invalid iterator!");

  // Look through calls to ssa_copy intrinsics.
  if (auto *II = dyn_cast<IntrinsicInst>(&I);
      II && II->getIntrinsicID() == Intrinsic::ssa_copy) {
    return LastVisited->second;
  }

  Function *F = I.getCalledFunction();
  if (!F || !canConstantFoldCallTo(&I, F))
    return nullptr;

  SmallVector<Constant *, 8> Operands;
  Operands.reserve(I.getNumOperands());

  for (unsigned Idx = 0, E = I.getNumOperands() - 1; Idx != E; ++Idx) {
    Value *V = I.getOperand(Idx);
    if (isa<MetadataAsValue>(V))
      return nullptr;
    Constant *C = findConstantFor(V);
    if (!C)
      return nullptr;
    Operands.push_back(C);
  }

  auto Ops = ArrayRef(Operands.begin(), Operands.end());
  return ConstantFoldCall(&I, F, Ops);
}

Constant *InstCostVisitor::visitLoadInst(LoadInst &I) {
  assert(LastVisited != KnownConstants.end() && "Invalid iterator!");

  if (isa<ConstantPointerNull>(LastVisited->second))
    return nullptr;
  return ConstantFoldLoadFromConstPtr(LastVisited->second, I.getType(), DL);
}

Constant *InstCostVisitor::visitGetElementPtrInst(GetElementPtrInst &I) {
  SmallVector<Constant *, 8> Operands;
  Operands.reserve(I.getNumOperands());

  for (unsigned Idx = 0, E = I.getNumOperands(); Idx != E; ++Idx) {
    Value *V = I.getOperand(Idx);
    Constant *C = findConstantFor(V);
    if (!C)
      return nullptr;
    Operands.push_back(C);
  }

  auto Ops = ArrayRef(Operands.begin(), Operands.end());
  return ConstantFoldInstOperands(&I, Ops, DL);
}

Constant *InstCostVisitor::visitSelectInst(SelectInst &I) {
  assert(LastVisited != KnownConstants.end() && "Invalid iterator!");

  if (I.getCondition() == LastVisited->first) {
    Value *V = LastVisited->second->isZeroValue() ? I.getFalseValue()
                                                  : I.getTrueValue();
    return findConstantFor(V);
  }
  if (Constant *Condition = findConstantFor(I.getCondition()))
    if ((I.getTrueValue() == LastVisited->first && Condition->isOneValue()) ||
        (I.getFalseValue() == LastVisited->first && Condition->isZeroValue()))
      return LastVisited->second;
  return nullptr;
}

Constant *InstCostVisitor::visitCastInst(CastInst &I) {
  return ConstantFoldCastOperand(I.getOpcode(), LastVisited->second,
                                 I.getType(), DL);
}

Constant *InstCostVisitor::visitCmpInst(CmpInst &I) {
  assert(LastVisited != KnownConstants.end() && "Invalid iterator!");

  Constant *Const = LastVisited->second;
  bool ConstOnRHS = I.getOperand(1) == LastVisited->first;
  Value *V = ConstOnRHS ? I.getOperand(0) : I.getOperand(1);
  Constant *Other = findConstantFor(V);

  if (Other) {
    if (ConstOnRHS)
      std::swap(Const, Other);
    return ConstantFoldCompareInstOperands(I.getPredicate(), Const, Other, DL);
  }

  // If we haven't found Other to be a specific constant value, we may still be
  // able to constant fold using information from the lattice value.
  const ValueLatticeElement &ConstLV = ValueLatticeElement::get(Const);
  const ValueLatticeElement &OtherLV = Solver.getLatticeValueFor(V);
  auto &V1State = ConstOnRHS ? OtherLV : ConstLV;
  auto &V2State = ConstOnRHS ? ConstLV : OtherLV;
  return V1State.getCompare(I.getPredicate(), I.getType(), V2State, DL);
}

Constant *InstCostVisitor::visitUnaryOperator(UnaryOperator &I) {
  assert(LastVisited != KnownConstants.end() && "Invalid iterator!");

  return ConstantFoldUnaryOpOperand(I.getOpcode(), LastVisited->second, DL);
}

Constant *InstCostVisitor::visitBinaryOperator(BinaryOperator &I) {
  assert(LastVisited != KnownConstants.end() && "Invalid iterator!");

  bool ConstOnRHS = I.getOperand(1) == LastVisited->first;
  Value *V = ConstOnRHS ? I.getOperand(0) : I.getOperand(1);
  Constant *Other = findConstantFor(V);
  Value *OtherVal = Other ? Other : V;
  Value *ConstVal = LastVisited->second;

  if (ConstOnRHS)
    std::swap(ConstVal, OtherVal);

  return dyn_cast_or_null<Constant>(
      simplifyBinOp(I.getOpcode(), ConstVal, OtherVal, SimplifyQuery(DL)));
}

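/// Check whether \p Alloca would be promotable if its use in \p Call were
/// disregarded: the only other use must be a single non-volatile store.
/// Return the stored value if it is a candidate constant, otherwise null.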
Constant *FunctionSpecializer::getPromotableAlloca(AllocaInst *Alloca,
                                                   CallInst *Call) {
  Value *StoreValue = nullptr;
  for (auto *User : Alloca->users()) {
    // We can't use llvm::isAllocaPromotable() as that would fail because of
    // the usage in the CallInst, which is what we check here.
    if (User == Call)
      continue;

    if (auto *Store = dyn_cast<StoreInst>(User)) {
      // This is a duplicate store, bail out.
      if (StoreValue || Store->isVolatile())
        return nullptr;
      StoreValue = Store->getValueOperand();
      continue;
    }
    // Bail if there is any other unknown usage.
    return nullptr;
  }

  if (!StoreValue)
    return nullptr;

  return getCandidateConstant(StoreValue);
}

// A constant stack value is an AllocaInst that has a single constant
// value stored to it. Return this constant if such an alloca stack value
// is a function argument.
Constant *FunctionSpecializer::getConstantStackValue(CallInst *Call,
                                                     Value *Val) {
  if (!Val)
    return nullptr;
  Val = Val->stripPointerCasts();
  if (auto *ConstVal = dyn_cast<ConstantInt>(Val))
    return ConstVal;
  auto *Alloca = dyn_cast<AllocaInst>(Val);
  if (!Alloca || !Alloca->getAllocatedType()->isIntegerTy())
    return nullptr;
  return getPromotableAlloca(Alloca, Call);
}

// To support specializing recursive functions, it is important to propagate
// constant arguments because after a first iteration of specialisation, a
// reduced example may look like this:
//
// define internal void @RecursiveFn(i32* arg1) {
//   %temp = alloca i32, align 4
//   store i32 2, i32* %temp, align 4
//   call void @RecursiveFn.1(i32* nonnull %temp)
//   ret void
// }
//
// Before the next iteration, we need to promote the stored constant like so,
// which allows further specialization in following iterations:
//
// @specialized.arg.1 = internal constant i32 2
//
// define internal void @RecursiveFn(i32* arg1) {
//   call void @RecursiveFn.1(i32* nonnull @specialized.arg.1)
//   ret void
// }
//
// See if there are any new constant values for the callers of \p F via
// stack variables and promote them to global variables.
void FunctionSpecializer::promoteConstantStackValues(Function *F) {
  for (User *U : F->users()) {

    auto *Call = dyn_cast<CallInst>(U);
    if (!Call)
      continue;

    if (!Solver.isBlockExecutable(Call->getParent()))
      continue;

    for (const Use &U : Call->args()) {
      unsigned Idx = Call->getArgOperandNo(&U);
      Value *ArgOp = Call->getArgOperand(Idx);
      Type *ArgOpType = ArgOp->getType();

      if (!Call->onlyReadsMemory(Idx) || !ArgOpType->isPointerTy())
        continue;

      auto *ConstVal = getConstantStackValue(Call, ArgOp);
      if (!ConstVal)
        continue;

      Value *GV = new GlobalVariable(M, ConstVal->getType(), true,
                                     GlobalValue::InternalLinkage, ConstVal,
                                     "specialized.arg." + Twine(++NGlobals));
      Call->setArgOperand(Idx, GV);
    }
  }
}

// ssa_copy intrinsics are introduced by the SCCP solver. These intrinsics
// interfere with the promoteConstantStackValues() optimization.
static void removeSSACopy(Function &F) {
  for (BasicBlock &BB : F) {
    for (Instruction &Inst : llvm::make_early_inc_range(BB)) {
      auto *II = dyn_cast<IntrinsicInst>(&Inst);
      if (!II)
        continue;
      if (II->getIntrinsicID() != Intrinsic::ssa_copy)
        continue;
      Inst.replaceAllUsesWith(II->getOperand(0));
      Inst.eraseFromParent();
    }
  }
}

/// Remove any ssa_copy intrinsics that may have been introduced.
void FunctionSpecializer::cleanUpSSA() {
  for (Function *F : Specializations)
    removeSSACopy(*F);
}

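// Provide DenseMap traits for SpecSig so that specialisation signatures can
// be used as map keys: reserved empty/tombstone keys, hashing, and equality.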
template <> struct llvm::DenseMapInfo<SpecSig> {
  static inline SpecSig getEmptyKey() { return {~0U, {}}; }

  static inline SpecSig getTombstoneKey() { return {~1U, {}}; }

  static unsigned getHashValue(const SpecSig &S) {
    return static_cast<unsigned>(hash_value(S));
  }

  static bool isEqual(const SpecSig &LHS, const SpecSig &RHS) {
    return LHS == RHS;
  }
};

FunctionSpecializer::~FunctionSpecializer() {
  LLVM_DEBUG(
    if (NumSpecsCreated > 0)
      dbgs() << "FnSpecialization: Created " << NumSpecsCreated
             << " specializations in module " << M.getName() << "\n");
  // Eliminate dead code.
  removeDeadFunctions();
  cleanUpSSA();
}

/// Get the unsigned Value of a given Cost object. Assumes the Cost is always
/// non-negative, which is true for both TCK_CodeSize and TCK_Latency, and
/// always Valid.
static unsigned getCostValue(const Cost &C) {
  int64_t Value = C.getValue();

  assert(Value >= 0 && "CodeSize and Latency cannot be negative");
  // It is safe to down cast since we know the arguments cannot be negative and
  // Cost is of type int64_t.
  return static_cast<unsigned>(Value);
}

/// Attempt to specialize functions in the module to enable constant
/// propagation across function boundaries.
///
/// \returns true if at least one function is specialized.
bool FunctionSpecializer::run() {
  // Find possible specializations for each function.
  SpecMap SM;
  SmallVector<Spec, 32> AllSpecs;
  unsigned NumCandidates = 0;
  for (Function &F : M) {
    if (!isCandidateFunction(&F))
      continue;

    auto [It, Inserted] = FunctionMetrics.try_emplace(&F);
    CodeMetrics &Metrics = It->second;
    // Analyze the function.
    if (Inserted) {
      SmallPtrSet<const Value *, 32> EphValues;
      CodeMetrics::collectEphemeralValues(&F, &GetAC(F), EphValues);
      for (BasicBlock &BB : F)
        Metrics.analyzeBasicBlock(&BB, GetTTI(F), EphValues);
    }

    // When specializing literal constants is enabled, always require functions
    // to be larger than MinFunctionSize, to prevent excessive specialization.
    const bool RequireMinSize =
        !ForceSpecialization &&
        (SpecializeLiteralConstant || !F.hasFnAttribute(Attribute::NoInline));

    // If the code metrics reveal that we shouldn't duplicate the function,
    // or if the code size implies that this function is easy to get inlined,
    // then we shouldn't specialize it.
    if (Metrics.notDuplicatable || !Metrics.NumInsts.isValid() ||
        (RequireMinSize && Metrics.NumInsts < MinFunctionSize))
      continue;

    // When specialization on literal constants is disabled, only consider
    // recursive functions when running multiple times to save wasted analysis,
    // as we will not be able to specialize on any newly found literal constant
    // return values.
    if (!SpecializeLiteralConstant && !Inserted && !Metrics.isRecursive)
      continue;

    int64_t Sz = Metrics.NumInsts.getValue();
    assert(Sz > 0 && "CodeSize should be positive");
    // It is safe to down cast from int64_t since NumInsts is always positive.
    unsigned FuncSize = static_cast<unsigned>(Sz);

    LLVM_DEBUG(dbgs() << "FnSpecialization: Specialization cost for "
                      << F.getName() << " is " << FuncSize << "\n");

    if (Inserted && Metrics.isRecursive)
      promoteConstantStackValues(&F);

    if (!findSpecializations(&F, FuncSize, AllSpecs, SM)) {
      LLVM_DEBUG(
          dbgs() << "FnSpecialization: No possible specializations found for "
                 << F.getName() << "\n");
      continue;
    }

    ++NumCandidates;
  }

  if (!NumCandidates) {
    LLVM_DEBUG(
        dbgs()
        << "FnSpecialization: No possible specializations found in module\n");
    return false;
  }

  // Choose the most profitable specialisations that fit within the module
  // specialization budget, which is derived from the maximum number of
  // specializations allowed per candidate function.
  auto CompareScore = [&AllSpecs](unsigned I, unsigned J) {
    if (AllSpecs[I].Score != AllSpecs[J].Score)
      return AllSpecs[I].Score > AllSpecs[J].Score;
    return I > J;
  };
  const unsigned NSpecs =
      std::min(NumCandidates * MaxClones, unsigned(AllSpecs.size()));
  SmallVector<unsigned> BestSpecs(NSpecs + 1);
  std::iota(BestSpecs.begin(), BestSpecs.begin() + NSpecs, 0);
  if (AllSpecs.size() > NSpecs) {
    LLVM_DEBUG(dbgs() << "FnSpecialization: Number of candidates exceeds "
                      << "the maximum number of clones threshold.\n"
                      << "FnSpecialization: Specializing the "
                      << NSpecs
                      << " most profitable candidates.\n");
    std::make_heap(BestSpecs.begin(), BestSpecs.begin() + NSpecs, CompareScore);
    for (unsigned I = NSpecs, N = AllSpecs.size(); I < N; ++I) {
      BestSpecs[NSpecs] = I;
      std::push_heap(BestSpecs.begin(), BestSpecs.end(), CompareScore);
      std::pop_heap(BestSpecs.begin(), BestSpecs.end(), CompareScore);
    }
  }

  LLVM_DEBUG(dbgs() << "FnSpecialization: List of specializations \n";
             for (unsigned I = 0; I < NSpecs; ++I) {
               const Spec &S = AllSpecs[BestSpecs[I]];
               dbgs() << "FnSpecialization: Function " << S.F->getName()
                      << " , score " << S.Score << "\n";
               for (const ArgInfo &Arg : S.Sig.Args)
                 dbgs() << "FnSpecialization: FormalArg = "
                        << Arg.Formal->getNameOrAsOperand()
                        << ", ActualArg = " << Arg.Actual->getNameOrAsOperand()
                        << "\n";
             });

  // Create the chosen specializations.
  SmallPtrSet<Function *, 8> OriginalFuncs;
  SmallVector<Function *> Clones;
  for (unsigned I = 0; I < NSpecs; ++I) {
    Spec &S = AllSpecs[BestSpecs[I]];

    // Accumulate the codesize growth for the function, now that we are
    // creating the specialization.
    FunctionGrowth[S.F] += S.CodeSize;

    S.Clone = createSpecialization(S.F, S.Sig);

    // Update the known call sites to call the clone.
    for (CallBase *Call : S.CallSites) {
      LLVM_DEBUG(dbgs() << "FnSpecialization: Redirecting " << *Call
                        << " to call " << S.Clone->getName() << "\n");
      Call->setCalledFunction(S.Clone);
    }

    Clones.push_back(S.Clone);
    OriginalFuncs.insert(S.F);
  }

  Solver.solveWhileResolvedUndefsIn(Clones);

  // Update the rest of the call sites - these are the recursive calls, calls
  // to discarded specialisations and calls that may match a specialisation
  // after the solver runs.
  for (Function *F : OriginalFuncs) {
    auto [Begin, End] = SM[F];
    updateCallSites(F, AllSpecs.begin() + Begin, AllSpecs.begin() + End);
  }

  for (Function *F : Clones) {
    if (F->getReturnType()->isVoidTy())
      continue;
    if (F->getReturnType()->isStructTy()) {
      auto *STy = cast<StructType>(F->getReturnType());
      if (!Solver.isStructLatticeConstant(F, STy))
        continue;
    } else {
      auto It = Solver.getTrackedRetVals().find(F);
      assert(It != Solver.getTrackedRetVals().end() &&
             "Return value ought to be tracked");
      if (SCCPSolver::isOverdefined(It->second))
        continue;
    }
    for (User *U : F->users()) {
      if (auto *CS = dyn_cast<CallBase>(U)) {
        // The user instruction does not call our function.
        if (CS->getCalledFunction() != F)
          continue;
        Solver.resetLatticeValueFor(CS);
      }
    }
  }

  // Rerun the solver to notify the users of the modified callsites.
  Solver.solveWhileResolvedUndefs();

  for (Function *F : OriginalFuncs)
    if (FunctionMetrics[F].isRecursive)
      promoteConstantStackValues(F);

  return true;
}

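/// Erase the functions that have been fully replaced by their
/// specializations, clearing any analyses cached for them first.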
void FunctionSpecializer::removeDeadFunctions() {
  for (Function *F : FullySpecialized) {
    LLVM_DEBUG(dbgs() << "FnSpecialization: Removing dead function "
                      << F->getName() << "\n");
    if (FAM)
      FAM->clear(*F, F->getName());
    F->eraseFromParent();
  }
  FullySpecialized.clear();
}

/// Clone the function \p F and remove the ssa_copy intrinsics added by
/// the SCCPSolver in the cloned version.
static Function *cloneCandidateFunction(Function *F, unsigned NSpecs) {
  ValueToValueMapTy Mappings;
  Function *Clone = CloneFunction(F, Mappings);
  Clone->setName(F->getName() + ".specialized." + Twine(NSpecs));
  removeSSACopy(*Clone);
  return Clone;
}

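/// Examine the call sites of \p F: for each one that passes constants for the
/// interesting arguments, build a specialisation signature, estimate its
/// profitability, and record the profitable candidates in \p AllSpecs and
/// the per-function index range in \p SM.
/// \returns true if at least one candidate specialisation was found.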
bool FunctionSpecializer::findSpecializations(Function *F, unsigned FuncSize,
                                              SmallVectorImpl<Spec> &AllSpecs,
                                              SpecMap &SM) {
  // A mapping from a specialisation signature to the index of the respective
  // entry in the all specialisation array. Used to ensure uniqueness of
  // specialisations.
  DenseMap<SpecSig, unsigned> UniqueSpecs;

  // Get a list of interesting arguments.
  SmallVector<Argument *> Args;
  for (Argument &Arg : F->args())
    if (isArgumentInteresting(&Arg))
      Args.push_back(&Arg);

  if (Args.empty())
    return false;

  for (User *U : F->users()) {
    if (!isa<CallInst>(U) && !isa<InvokeInst>(U))
      continue;
    auto &CS = *cast<CallBase>(U);

    // The user instruction does not call our function.
    if (CS.getCalledFunction() != F)
      continue;

    // If the call site has the minsize attribute set, it won't be specialized.
    if (CS.hasFnAttr(Attribute::MinSize))
      continue;

    // If the parent of the call site will never be executed, we don't need
    // to worry about the passed value.
    if (!Solver.isBlockExecutable(CS.getParent()))
      continue;

    // Examine arguments and create a specialisation candidate from the
    // constant operands of this call site.
    SpecSig S;
    for (Argument *A : Args) {
      Constant *C = getCandidateConstant(CS.getArgOperand(A->getArgNo()));
      if (!C)
        continue;
      LLVM_DEBUG(dbgs() << "FnSpecialization: Found interesting argument "
                        << A->getName() << " : " << C->getNameOrAsOperand()
                        << "\n");
      S.Args.push_back({A, C});
    }

    if (S.Args.empty())
      continue;

    // Check if we have encountered the same specialisation already.
    if (auto It = UniqueSpecs.find(S); It != UniqueSpecs.end()) {
      // Existing specialisation. Add the call to the list to rewrite, unless
      // it's a recursive call. A specialisation generated because of a
      // recursive call may end up not being the best specialisation for all
      // the cloned instances of this call site, which result from specialising
      // functions. Hence we don't rewrite the call directly, but match it with
      // the best specialisation once all specialisations are known.
      if (CS.getFunction() == F)
        continue;
      const unsigned Index = It->second;
      AllSpecs[Index].CallSites.push_back(&CS);
    } else {
      // Calculate the specialisation gain.
      Cost CodeSize;
      unsigned Score = 0;
      InstCostVisitor Visitor = getInstCostVisitorFor(F);
      for (ArgInfo &A : S.Args) {
        CodeSize += Visitor.getCodeSizeSavingsForArg(A.Formal, A.Actual);
        Score += getInliningBonus(A.Formal, A.Actual);
      }
      CodeSize += Visitor.getCodeSizeSavingsFromPendingPHIs();

      unsigned CodeSizeSavings = getCostValue(CodeSize);
      unsigned SpecSize = FuncSize - CodeSizeSavings;

      auto IsProfitable = [&]() -> bool {
        // No check required.
        if (ForceSpecialization)
          return true;

        LLVM_DEBUG(
            dbgs() << "FnSpecialization: Specialization bonus {Inlining = "
                   << Score << " (" << (Score * 100 / FuncSize) << "%)}\n");

        // Minimum inlining bonus.
        if (Score > MinInliningBonus * FuncSize / 100)
          return true;

        LLVM_DEBUG(
            dbgs() << "FnSpecialization: Specialization bonus {CodeSize = "
                   << CodeSizeSavings << " ("
                   << (CodeSizeSavings * 100 / FuncSize) << "%)}\n");

        // Minimum codesize savings.
        if (CodeSizeSavings < MinCodeSizeSavings * FuncSize / 100)
          return false;

        // Lazily compute the Latency, to avoid unnecessarily computing BFI.
        unsigned LatencySavings =
            getCostValue(Visitor.getLatencySavingsForKnownConstants());

        LLVM_DEBUG(
            dbgs() << "FnSpecialization: Specialization bonus {Latency = "
                   << LatencySavings << " ("
                   << (LatencySavings * 100 / FuncSize) << "%)}\n");

        // Minimum latency savings.
        if (LatencySavings < MinLatencySavings * FuncSize / 100)
          return false;
        // Maximum codesize growth.
        if ((FunctionGrowth[F] + SpecSize) / FuncSize > MaxCodeSizeGrowth)
          return false;

        Score += std::max(CodeSizeSavings, LatencySavings);
        return true;
      };

      // Discard unprofitable specialisations.
      if (!IsProfitable())
        continue;

      // Create a new specialisation entry.
      auto &Spec = AllSpecs.emplace_back(F, S, Score, SpecSize);
      if (CS.getFunction() != F)
        Spec.CallSites.push_back(&CS);
      const unsigned Index = AllSpecs.size() - 1;
      UniqueSpecs[S] = Index;
      if (auto [It, Inserted] = SM.try_emplace(F, Index, Index + 1); !Inserted)
        It->second.second = Index + 1;
    }
  }

  return !UniqueSpecs.empty();
}

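/// Check whether \p F is eligible for specialization: it must be defined,
/// take arguments, be duplicatable, not already be a clone, not be optimized
/// for size, have an executable entry block, and not be guaranteed to be
/// inlined anyway.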
bool FunctionSpecializer::isCandidateFunction(Function *F) {
  if (F->isDeclaration() || F->arg_empty())
    return false;

  if (F->hasFnAttribute(Attribute::NoDuplicate))
    return false;

  // Do not specialize the cloned function again.
  if (Specializations.contains(F))
    return false;

  // If we're optimizing the function for size, we shouldn't specialize it.
  if (shouldOptimizeForSize(F, nullptr, nullptr, PGSOQueryType::IRPass))
    return false;

  // Exit if the function is not executable. There's no point in specializing
  // a dead function.
  if (!Solver.isBlockExecutable(&F->getEntryBlock()))
    return false;

  // It is a waste of time to specialize a function which would eventually get
  // inlined anyway.
  if (F->hasFnAttribute(Attribute::AlwaysInline))
    return false;

  LLVM_DEBUG(dbgs() << "FnSpecialization: Try function: " << F->getName()
                    << "\n");
  return true;
}

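/// Clone \p F for the specialisation signature \p S, give the clone internal
/// linkage, and seed the Solver with the constant lattice values of the
/// specialised arguments so it can propagate them through the clone.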
Function *FunctionSpecializer::createSpecialization(Function *F,
                                                    const SpecSig &S) {
  Function *Clone = cloneCandidateFunction(F, Specializations.size() + 1);

  // The original function does not necessarily have internal linkage, but the
  // clone must.
  Clone->setLinkage(GlobalValue::InternalLinkage);

  // Initialize the lattice state of the arguments of the function clone,
  // marking the argument on which we specialized the function constant
  // with the given value.
  Solver.setLatticeValueForSpecializationArguments(Clone, S.Args);
  Solver.markBlockExecutable(&Clone->front());
  Solver.addArgumentTrackedFunction(Clone);
  Solver.addTrackedFunction(Clone);

  // Mark all the specialized functions.
  Specializations.insert(Clone);
  ++NumSpecsCreated;

  return Clone;
}

/// Compute the inlining bonus for replacing argument \p A with constant \p C.
/// The below heuristic is only concerned with exposing inlining
/// opportunities via indirect call promotion. If the argument is not a
/// (potentially casted) function pointer, give up.
unsigned FunctionSpecializer::getInliningBonus(Argument *A, Constant *C) {
  Function *CalledFunction = dyn_cast<Function>(C->stripPointerCasts());
  if (!CalledFunction)
    return 0;

  // Get TTI for the called function (used for the inline cost).
  auto &CalleeTTI = (GetTTI)(*CalledFunction);

  // Look at all the call sites whose called value is the argument.
  // Specializing the function on the argument would allow these indirect
  // calls to be promoted to direct calls. If the indirect call promotion
  // would likely enable the called function to be inlined, specializing is a
  // good idea.
  int InliningBonus = 0;
  for (User *U : A->users()) {
    if (!isa<CallInst>(U) && !isa<InvokeInst>(U))
      continue;
    auto *CS = cast<CallBase>(U);
    if (CS->getCalledOperand() != A)
      continue;
    if (CS->getFunctionType() != CalledFunction->getFunctionType())
      continue;

    // Get the cost of inlining the called function at this call site. Note
    // that this is only an estimate. The called function may eventually
    // change in a way that leads to it not being inlined here, even though
    // inlining looks profitable now. For example, one of its called
    // functions may be inlined into it, making the called function too large
    // to be inlined into this call site.
    //
    // We apply a boost for performing indirect call promotion by increasing
    // the default threshold by the threshold for indirect calls.
    auto Params = getInlineParams();
    Params.DefaultThreshold += InlineConstants::IndirectCallThreshold;
    InlineCost IC =
        getInlineCost(*CS, CalledFunction, Params, CalleeTTI, GetAC, GetTLI);

    // We clamp the bonus for this call to be between zero and the default
    // threshold.
    if (IC.isAlways())
      InliningBonus += Params.DefaultThreshold;
    else if (IC.isVariable() && IC.getCostDelta() > 0)
      InliningBonus += IC.getCostDelta();

    LLVM_DEBUG(dbgs() << "FnSpecialization: Inlining bonus " << InliningBonus
                      << " for user " << *U << "\n");
  }

  return InliningBonus > 0 ? static_cast<unsigned>(InliningBonus) : 0;
}

/// Determine if it is possible to specialise the function for constant values
/// of the formal parameter \p A.
bool FunctionSpecializer::isArgumentInteresting(Argument *A) {
  // No point in specialization if the argument is unused.
  if (A->user_empty())
    return false;

  Type *Ty = A->getType();
  if (!Ty->isPointerTy() && (!SpecializeLiteralConstant ||
      (!Ty->isIntegerTy() && !Ty->isFloatingPointTy() && !Ty->isStructTy())))
    return false;

  // SCCP solver does not record an argument that will be constructed on
  // the stack.
  if (A->hasByValAttr() && !A->getParent()->onlyReadsMemory())
    return false;

  // For non-argument-tracked functions every argument is overdefined.
  if (!Solver.isArgumentTrackedFunction(A->getParent()))
    return true;

  // Check the lattice value and decide if we should attempt to specialize,
  // based on this argument. No point in specialization if the lattice value
  // is already a constant.
  bool IsOverdefined = Ty->isStructTy()
    ? any_of(Solver.getStructLatticeValueFor(A), SCCPSolver::isOverdefined)
    : SCCPSolver::isOverdefined(Solver.getLatticeValueFor(A));

  LLVM_DEBUG(
    if (IsOverdefined)
      dbgs() << "FnSpecialization: Found interesting parameter "
             << A->getNameOrAsOperand() << "\n";
    else
      dbgs() << "FnSpecialization: Nothing to do, parameter "
             << A->getNameOrAsOperand() << " is already constant\n";
  );
  return IsOverdefined;
}

/// Check if the value \p V (an actual argument) is a constant or can only
/// have a constant value. Return that constant.
Constant *FunctionSpecializer::getCandidateConstant(Value *V) {
  if (isa<PoisonValue>(V))
    return nullptr;

  // Select for possible specialisation values that are constants or
  // are deduced to be constants or constant ranges with a single element.
  Constant *C = dyn_cast<Constant>(V);
  if (!C)
    C = Solver.getConstantOrNull(V);

  // Don't specialize on (anything derived from) the address of a non-constant
  // global variable, unless explicitly enabled.
  if (C && C->getType()->isPointerTy() && !C->isNullValue())
    if (auto *GV = dyn_cast<GlobalVariable>(getUnderlyingObject(C));
        GV && !(GV->isConstant() || SpecializeOnAddress))
      return nullptr;

  return C;
}

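/// Redirect the remaining executable call sites of \p F to the best matching
/// specialisation in the range [\p Begin, \p End). If no calls to \p F are
/// left afterwards, mark the original function unreachable so it can later be
/// removed as dead.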
void FunctionSpecializer::updateCallSites(Function *F, const Spec *Begin,
                                          const Spec *End) {
  // Collect the call sites that need updating.
  SmallVector<CallBase *> ToUpdate;
  for (User *U : F->users())
    if (auto *CS = dyn_cast<CallBase>(U);
        CS && CS->getCalledFunction() == F &&
        Solver.isBlockExecutable(CS->getParent()))
      ToUpdate.push_back(CS);

  unsigned NCallsLeft = ToUpdate.size();
  for (CallBase *CS : ToUpdate) {
    bool ShouldDecrementCount = CS->getFunction() == F;

    // Find the best matching specialisation.
    const Spec *BestSpec = nullptr;
    for (const Spec &S : make_range(Begin, End)) {
      if (!S.Clone || (BestSpec && S.Score <= BestSpec->Score))
        continue;

      if (any_of(S.Sig.Args, [CS, this](const ArgInfo &Arg) {
            unsigned ArgNo = Arg.Formal->getArgNo();
            return getCandidateConstant(CS->getArgOperand(ArgNo)) != Arg.Actual;
          }))
        continue;

      BestSpec = &S;
    }

    if (BestSpec) {
      LLVM_DEBUG(dbgs() << "FnSpecialization: Redirecting " << *CS
                        << " to call " << BestSpec->Clone->getName() << "\n");
      CS->setCalledFunction(BestSpec->Clone);
      ShouldDecrementCount = true;
    }

    if (ShouldDecrementCount)
      --NCallsLeft;
  }

  // If the function has been completely specialized, the original function
  // is no longer needed. Mark it unreachable.
  if (NCallsLeft == 0 && Solver.isArgumentTrackedFunction(F)) {
    Solver.markFunctionUnreachable(F);
    FullySpecialized.insert(F);
  }
}