//===- FunctionSpecialization.cpp - Function Specialization --------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/IPO/FunctionSpecialization.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/InlineCost.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueLattice.h"
#include "llvm/Analysis/ValueLatticeUtils.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Transforms/Scalar/SCCP.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/SCCPSolver.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include <cmath>

using namespace llvm;

#define DEBUG_TYPE "function-specialization"

STATISTIC(NumSpecsCreated, "Number of specializations created");

static cl::opt<bool> ForceSpecialization(
    "force-specialization", cl::init(false), cl::Hidden,
    cl::desc("Force function specialization for every call site with a "
             "constant argument"));

static cl::opt<unsigned> MaxClones(
    "funcspec-max-clones", cl::init(3), cl::Hidden,
    cl::desc("The maximum number of clones allowed for a single function "
             "specialization"));

static cl::opt<unsigned> MaxDiscoveryIterations(
    "funcspec-max-discovery-iterations", cl::init(100), cl::Hidden,
    cl::desc("The maximum number of iterations allowed when searching for "
             "transitive phis"));

static cl::opt<unsigned> MaxIncomingPhiValues(
    "funcspec-max-incoming-phi-values", cl::init(8), cl::Hidden,
    cl::desc("The maximum number of incoming values a PHI node can have to be "
             "considered during the specialization bonus estimation"));

static cl::opt<unsigned> MaxBlockPredecessors(
    "funcspec-max-block-predecessors", cl::init(2), cl::Hidden,
    cl::desc("The maximum number of predecessors a basic block can have to be "
             "considered during the estimation of dead code"));

static cl::opt<unsigned> MinFunctionSize(
    "funcspec-min-function-size", cl::init(500), cl::Hidden,
    cl::desc("Don't specialize functions that have less than this number of "
             "instructions"));

static cl::opt<unsigned> MaxCodeSizeGrowth(
    "funcspec-max-codesize-growth", cl::init(3), cl::Hidden,
    cl::desc("Maximum codesize growth allowed per function"));

static cl::opt<unsigned> MinCodeSizeSavings(
    "funcspec-min-codesize-savings", cl::init(20), cl::Hidden,
    cl::desc("Reject specializations whose codesize savings are less than "
             "this much percent of the original function size"));

static cl::opt<unsigned> MinLatencySavings(
    "funcspec-min-latency-savings", cl::init(20), cl::Hidden,
    cl::desc("Reject specializations whose latency savings are less than "
             "this much percent of the original function size"));

static cl::opt<unsigned> MinInliningBonus(
    "funcspec-min-inlining-bonus", cl::init(300), cl::Hidden,
    cl::desc("Reject specializations whose inlining bonus is less than "
             "this much percent of the original function size"));

static cl::opt<bool> SpecializeOnAddress(
    "funcspec-on-address", cl::init(false), cl::Hidden,
    cl::desc("Enable function specialization on the address of global values"));

static cl::opt<bool> SpecializeLiteralConstant(
    "funcspec-for-literal-constant", cl::init(true), cl::Hidden,
    cl::desc("Enable specialization of functions that take a literal constant "
             "as an argument"));

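/// Return true if \p Succ will become dead once the edge from \p BB to it is
/// removed: every live predecessor must be either \p BB, \p Succ itself, or a
/// block we already consider dead. Inspects at most MaxBlockPredecessors
/// predecessors to bound compile time.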
bool InstCostVisitor::canEliminateSuccessor(BasicBlock *BB,
                                            BasicBlock *Succ) const {
  unsigned I = 0;
  return all_of(predecessors(Succ), [&I, BB, Succ, this](BasicBlock *Pred) {
    return I++ < MaxBlockPredecessors &&
           (Pred == BB || Pred == Succ || !isBlockExecutable(Pred));
  });
}

// Estimates the codesize savings due to dead code after constant propagation.
// \p WorkList represents the basic blocks of a specialization which will
// eventually become dead once we replace instructions that are known to be
// constants. The successors of such blocks are added to the list as long as
// the \p Solver found they were executable prior to specialization, and only
// if all their predecessors are dead.
Cost InstCostVisitor::estimateBasicBlocks(
    SmallVectorImpl<BasicBlock *> &WorkList) {
  Cost CodeSize = 0;
  // Accumulate the codesize savings of each basic block.
  while (!WorkList.empty()) {
    BasicBlock *BB = WorkList.pop_back_val();

    // These blocks are considered dead as far as the InstCostVisitor
    // is concerned. They haven't been proven dead yet by the Solver,
    // but may become dead once we propagate the specialization arguments.
    assert(Solver.isBlockExecutable(BB) && "BB already found dead by IPSCCP!");
    if (!DeadBlocks.insert(BB).second)
      continue;

    for (Instruction &I : *BB) {
      // If it's a known constant we have already accounted for it.
      if (KnownConstants.contains(&I))
        continue;

      Cost C = TTI.getInstructionCost(&I, TargetTransformInfo::TCK_CodeSize);

      LLVM_DEBUG(dbgs() << "FnSpecialization: CodeSize " << C
                        << " for user " << I << "\n");
      CodeSize += C;
    }

    // Keep adding dead successors to the list as long as they are
    // executable and only reachable from dead blocks.
    for (BasicBlock *SuccBB : successors(BB))
      if (isBlockExecutable(SuccBB) && canEliminateSuccessor(BB, SuccBB))
        WorkList.push_back(SuccBB);
  }
  return CodeSize;
}

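// Return a constant for \p V: either \p V itself if it is already a Constant,
// the constant the SCCP solver has proven for it, or a constant discovered
// earlier during this estimation (from KnownConstants). Returns nullptr if
// none applies.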
Constant *InstCostVisitor::findConstantFor(Value *V) const {
  if (auto *C = dyn_cast<Constant>(V))
    return C;
  if (auto *C = Solver.getConstantOrNull(V))
    return C;
  return KnownConstants.lookup(V);
}

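// Revisit the PHI nodes deferred while propagating the constant arguments.
// By this point all arguments of a specialization candidate have been
// propagated, so PHIs whose incoming values were not known on the first
// visit can be re-evaluated, unless their blocks have been proven dead.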
Cost InstCostVisitor::getCodeSizeSavingsFromPendingPHIs() {
  Cost CodeSize;
  while (!PendingPHIs.empty()) {
    Instruction *Phi = PendingPHIs.pop_back_val();
    // The pending PHIs could have been proven dead by now.
    if (isBlockExecutable(Phi->getParent()))
      CodeSize += getCodeSizeSavingsForUser(Phi);
  }
  return CodeSize;
}

/// Compute the codesize savings for replacing argument \p A with constant \p C.
Cost InstCostVisitor::getCodeSizeSavingsForArg(Argument *A, Constant *C) {
  LLVM_DEBUG(dbgs() << "FnSpecialization: Analysing bonus for constant: "
                    << C->getNameOrAsOperand() << "\n");
  Cost CodeSize;
  for (auto *U : A->users())
    if (auto *UI = dyn_cast<Instruction>(U))
      if (isBlockExecutable(UI->getParent()))
        CodeSize += getCodeSizeSavingsForUser(UI, A, C);

  LLVM_DEBUG(dbgs() << "FnSpecialization: Accumulated bonus {CodeSize = "
                    << CodeSize << "} for argument " << *A << "\n");
  return CodeSize;
}

/// Compute the latency savings from replacing all arguments with constants for
/// a specialization candidate. As this function computes the latency savings
/// for all Instructions in KnownConstants at once, it should be called only
/// after every instruction has been visited, i.e. after:
///
/// * getCodeSizeSavingsForArg has been run for every constant argument of a
///   specialization candidate
///
/// * getCodeSizeSavingsFromPendingPHIs has been run
///
/// to ensure that the latency savings are calculated for all Instructions we
/// have visited and found to be constant.
Cost InstCostVisitor::getLatencySavingsForKnownConstants() {
  auto &BFI = GetBFI(*F);
  Cost TotalLatency = 0;

  for (auto Pair : KnownConstants) {
    Instruction *I = dyn_cast<Instruction>(Pair.first);
    if (!I)
      continue;

    uint64_t Weight = BFI.getBlockFreq(I->getParent()).getFrequency() /
                      BFI.getEntryFreq().getFrequency();

    Cost Latency =
        Weight * TTI.getInstructionCost(I, TargetTransformInfo::TCK_Latency);

    LLVM_DEBUG(dbgs() << "FnSpecialization: {Latency = " << Latency
                      << "} for instruction " << *I << "\n");

    TotalLatency += Latency;
  }

  return TotalLatency;
}

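// Compute the codesize savings from folding \p User, given that its operand
// \p Use (when non-null) is newly known to be the constant \p C, then recurse
// into the users of \p User whenever it folds to a constant itself. Returns
// zero if \p User cannot be folded.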
Cost InstCostVisitor::getCodeSizeSavingsForUser(Instruction *User, Value *Use,
                                                Constant *C) {
  // We have already propagated a constant for this user.
  if (KnownConstants.contains(User))
    return 0;

  // Cache the iterator before visiting.
  LastVisited = Use ? KnownConstants.insert({Use, C}).first
                    : KnownConstants.end();

  Cost CodeSize = 0;
  if (auto *I = dyn_cast<SwitchInst>(User)) {
    CodeSize = estimateSwitchInst(*I);
  } else if (auto *I = dyn_cast<BranchInst>(User)) {
    CodeSize = estimateBranchInst(*I);
  } else {
    C = visit(*User);
    if (!C)
      return 0;
  }

  // Although it doesn't make sense to bind a switch or branch instruction to
  // a constant (unlike any other instruction type), inserting it into the map
  // prevents us from estimating its bonus multiple times.
  KnownConstants.insert({User, C});

  CodeSize += TTI.getInstructionCost(User, TargetTransformInfo::TCK_CodeSize);

  LLVM_DEBUG(dbgs() << "FnSpecialization: {CodeSize = " << CodeSize
                    << "} for user " << *User << "\n");

  for (auto *U : User->users())
    if (auto *UI = dyn_cast<Instruction>(U))
      if (UI != User && isBlockExecutable(UI->getParent()))
        CodeSize += getCodeSizeSavingsForUser(UI, User, C);

  return CodeSize;
}

Cost InstCostVisitor::estimateSwitchInst(SwitchInst &I) {
  assert(LastVisited != KnownConstants.end() && "Invalid iterator!");

  if (I.getCondition() != LastVisited->first)
    return 0;

  auto *C = dyn_cast<ConstantInt>(LastVisited->second);
  if (!C)
    return 0;

  BasicBlock *Succ = I.findCaseValue(C)->getCaseSuccessor();
  // Initialize the worklist with the dead basic blocks. These are the
  // destination labels which are different from the one corresponding
  // to \p C. They should be executable and have a unique predecessor.
  SmallVector<BasicBlock *> WorkList;
  for (const auto &Case : I.cases()) {
    BasicBlock *BB = Case.getCaseSuccessor();
    if (BB != Succ && isBlockExecutable(BB) &&
        canEliminateSuccessor(I.getParent(), BB))
      WorkList.push_back(BB);
  }

  return estimateBasicBlocks(WorkList);
}

Cost InstCostVisitor::estimateBranchInst(BranchInst &I) {
  assert(LastVisited != KnownConstants.end() && "Invalid iterator!");

  if (I.getCondition() != LastVisited->first)
    return 0;

  BasicBlock *Succ = I.getSuccessor(LastVisited->second->isOneValue());
  // Initialize the worklist with the dead successor as long as
  // it is executable and has a unique predecessor.
  SmallVector<BasicBlock *> WorkList;
  if (isBlockExecutable(Succ) && canEliminateSuccessor(I.getParent(), Succ))
    WorkList.push_back(Succ);

  return estimateBasicBlocks(WorkList);
}

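// Walk the chain of PHI nodes starting at \p Root, collecting every PHI in
// \p TransitivePHIs. Return true if every live, non-self-referencing incoming
// value that is not itself a PHI in the chain is the same constant \p Const.
// Bounded by MaxDiscoveryIterations and MaxIncomingPhiValues to keep compile
// time in check.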
bool InstCostVisitor::discoverTransitivelyIncomingValues(
    Constant *Const, PHINode *Root, DenseSet<PHINode *> &TransitivePHIs) {

  SmallVector<PHINode *, 64> WorkList;
  WorkList.push_back(Root);
  unsigned Iter = 0;

  while (!WorkList.empty()) {
    PHINode *PN = WorkList.pop_back_val();

    if (++Iter > MaxDiscoveryIterations ||
        PN->getNumIncomingValues() > MaxIncomingPhiValues)
      return false;

    if (!TransitivePHIs.insert(PN).second)
      continue;

    for (unsigned I = 0, E = PN->getNumIncomingValues(); I != E; ++I) {
      Value *V = PN->getIncomingValue(I);

      // Disregard self-references and dead incoming values.
      if (auto *Inst = dyn_cast<Instruction>(V))
        if (Inst == PN || !isBlockExecutable(PN->getIncomingBlock(I)))
          continue;

      if (Constant *C = findConstantFor(V)) {
        // Not all incoming values are the same constant. Bail immediately.
        if (C != Const)
          return false;
        continue;
      }

      if (auto *Phi = dyn_cast<PHINode>(V)) {
        WorkList.push_back(Phi);
        continue;
      }

      // We can't reason about anything else.
      return false;
    }
  }
  return true;
}

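// A PHI node folds to a constant only if all live incoming values agree on
// the same constant. On the first visit some incoming values may not be
// known yet, so the PHI is pushed to PendingPHIs and retried once all the
// constant arguments have been propagated. Incoming values that are
// themselves PHIs are confirmed by discoverTransitivelyIncomingValues.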
Constant *InstCostVisitor::visitPHINode(PHINode &I) {
  if (I.getNumIncomingValues() > MaxIncomingPhiValues)
    return nullptr;

  bool Inserted = VisitedPHIs.insert(&I).second;
  Constant *Const = nullptr;
  bool HaveSeenIncomingPHI = false;

  for (unsigned Idx = 0, E = I.getNumIncomingValues(); Idx != E; ++Idx) {
    Value *V = I.getIncomingValue(Idx);

    // Disregard self-references and dead incoming values.
    if (auto *Inst = dyn_cast<Instruction>(V))
      if (Inst == &I || !isBlockExecutable(I.getIncomingBlock(Idx)))
        continue;

    if (Constant *C = findConstantFor(V)) {
      if (!Const)
        Const = C;
      // Not all incoming values are the same constant. Bail immediately.
      if (C != Const)
        return nullptr;
      continue;
    }

    if (Inserted) {
      // First time we are seeing this phi. We will retry later, after
      // all the constant arguments have been propagated. Bail for now.
      PendingPHIs.push_back(&I);
      return nullptr;
    }

    if (isa<PHINode>(V)) {
      // Perhaps it is a Transitive Phi. We will confirm later.
      HaveSeenIncomingPHI = true;
      continue;
    }

    // We can't reason about anything else.
    return nullptr;
  }

  if (!Const)
    return nullptr;

  if (!HaveSeenIncomingPHI)
    return Const;

  DenseSet<PHINode *> TransitivePHIs;
  if (!discoverTransitivelyIncomingValues(Const, &I, TransitivePHIs))
    return nullptr;

  return Const;
}

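// A freeze of a constant folds to that constant, provided the value can be
// shown to be neither undef nor poison.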
Constant *InstCostVisitor::visitFreezeInst(FreezeInst &I) {
  assert(LastVisited != KnownConstants.end() && "Invalid iterator!");

  if (isGuaranteedNotToBeUndefOrPoison(LastVisited->second))
    return LastVisited->second;
  return nullptr;
}

Constant *InstCostVisitor::visitCallBase(CallBase &I) {
  assert(LastVisited != KnownConstants.end() && "Invalid iterator!");

  // Look through calls to ssa_copy intrinsics.
  if (auto *II = dyn_cast<IntrinsicInst>(&I);
      II && II->getIntrinsicID() == Intrinsic::ssa_copy) {
    return LastVisited->second;
  }

  Function *F = I.getCalledFunction();
  if (!F || !canConstantFoldCallTo(&I, F))
    return nullptr;

  SmallVector<Constant *, 8> Operands;
  Operands.reserve(I.getNumOperands());

  for (unsigned Idx = 0, E = I.getNumOperands() - 1; Idx != E; ++Idx) {
    Value *V = I.getOperand(Idx);
    if (isa<MetadataAsValue>(V))
      return nullptr;
    Constant *C = findConstantFor(V);
    if (!C)
      return nullptr;
    Operands.push_back(C);
  }

  auto Ops = ArrayRef(Operands.begin(), Operands.end());
  return ConstantFoldCall(&I, F, Ops);
}

Constant *InstCostVisitor::visitLoadInst(LoadInst &I) {
  assert(LastVisited != KnownConstants.end() && "Invalid iterator!");

  if (isa<ConstantPointerNull>(LastVisited->second))
    return nullptr;
  return ConstantFoldLoadFromConstPtr(LastVisited->second, I.getType(), DL);
}

Constant *InstCostVisitor::visitGetElementPtrInst(GetElementPtrInst &I) {
  SmallVector<Constant *, 8> Operands;
  Operands.reserve(I.getNumOperands());

  for (unsigned Idx = 0, E = I.getNumOperands(); Idx != E; ++Idx) {
    Value *V = I.getOperand(Idx);
    Constant *C = findConstantFor(V);
    if (!C)
      return nullptr;
    Operands.push_back(C);
  }

  auto Ops = ArrayRef(Operands.begin(), Operands.end());
  return ConstantFoldInstOperands(&I, Ops, DL);
}

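// Two cases fold here: if the condition is the newly known constant, the
// select collapses to the corresponding operand; otherwise, if the condition
// is some other known constant and it picks the operand we are visiting,
// the select yields our constant.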
Constant *InstCostVisitor::visitSelectInst(SelectInst &I) {
  assert(LastVisited != KnownConstants.end() && "Invalid iterator!");

  if (I.getCondition() == LastVisited->first) {
    Value *V = LastVisited->second->isZeroValue() ? I.getFalseValue()
                                                  : I.getTrueValue();
    return findConstantFor(V);
  }
  if (Constant *Condition = findConstantFor(I.getCondition()))
    if ((I.getTrueValue() == LastVisited->first && Condition->isOneValue()) ||
        (I.getFalseValue() == LastVisited->first && Condition->isZeroValue()))
      return LastVisited->second;
  return nullptr;
}

Constant *InstCostVisitor::visitCastInst(CastInst &I) {
  return ConstantFoldCastOperand(I.getOpcode(), LastVisited->second,
                                 I.getType(), DL);
}

Constant *InstCostVisitor::visitCmpInst(CmpInst &I) {
  assert(LastVisited != KnownConstants.end() && "Invalid iterator!");

  Constant *Const = LastVisited->second;
  bool ConstOnRHS = I.getOperand(1) == LastVisited->first;
  Value *V = ConstOnRHS ? I.getOperand(0) : I.getOperand(1);
  Constant *Other = findConstantFor(V);

  if (Other) {
    if (ConstOnRHS)
      std::swap(Const, Other);
    return ConstantFoldCompareInstOperands(I.getPredicate(), Const, Other, DL);
  }

  // If we haven't found Other to be a specific constant value, we may still be
  // able to constant fold using information from the lattice value.
  const ValueLatticeElement &ConstLV = ValueLatticeElement::get(Const);
  const ValueLatticeElement &OtherLV = Solver.getLatticeValueFor(V);
  auto &V1State = ConstOnRHS ? OtherLV : ConstLV;
  auto &V2State = ConstOnRHS ? ConstLV : OtherLV;
  return V1State.getCompare(I.getPredicate(), I.getType(), V2State, DL);
}

Constant *InstCostVisitor::visitUnaryOperator(UnaryOperator &I) {
  assert(LastVisited != KnownConstants.end() && "Invalid iterator!");

  return ConstantFoldUnaryOpOperand(I.getOpcode(), LastVisited->second, DL);
}

Constant *InstCostVisitor::visitBinaryOperator(BinaryOperator &I) {
  assert(LastVisited != KnownConstants.end() && "Invalid iterator!");

  bool ConstOnRHS = I.getOperand(1) == LastVisited->first;
  Value *V = ConstOnRHS ? I.getOperand(0) : I.getOperand(1);
  Constant *Other = findConstantFor(V);
  Value *OtherVal = Other ? Other : V;
  Value *ConstVal = LastVisited->second;

  if (ConstOnRHS)
    std::swap(ConstVal, OtherVal);

  return dyn_cast_or_null<Constant>(
      simplifyBinOp(I.getOpcode(), ConstVal, OtherVal, SimplifyQuery(DL)));
}

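// Look for the single constant value stored to \p Alloca, ignoring its use
// in \p Call. Bail on volatile or multiple stores, or on any other kind of
// user, since the alloca would then not be promotable to a constant.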
Constant *FunctionSpecializer::getPromotableAlloca(AllocaInst *Alloca,
                                                   CallInst *Call) {
  Value *StoreValue = nullptr;
  for (auto *User : Alloca->users()) {
    // We can't use llvm::isAllocaPromotable() as that would fail because of
    // the usage in the CallInst, which is what we check here.
    if (User == Call)
      continue;

    if (auto *Store = dyn_cast<StoreInst>(User)) {
      // This is a duplicate store, bail out.
      if (StoreValue || Store->isVolatile())
        return nullptr;
      StoreValue = Store->getValueOperand();
      continue;
    }
    // Bail if there is any other unknown usage.
    return nullptr;
  }

  if (!StoreValue)
    return nullptr;

  return getCandidateConstant(StoreValue);
}

// A constant stack value is an AllocaInst that has a single constant
// value stored to it. Return this constant if such an alloca stack value
// is a function argument.
Constant *FunctionSpecializer::getConstantStackValue(CallInst *Call,
                                                     Value *Val) {
  if (!Val)
    return nullptr;
  Val = Val->stripPointerCasts();
  if (auto *ConstVal = dyn_cast<ConstantInt>(Val))
    return ConstVal;
  auto *Alloca = dyn_cast<AllocaInst>(Val);
  if (!Alloca || !Alloca->getAllocatedType()->isIntegerTy())
    return nullptr;
  return getPromotableAlloca(Alloca, Call);
}

// To support specializing recursive functions, it is important to propagate
// constant arguments because after a first iteration of specialisation, a
// reduced example may look like this:
//
//     define internal void @RecursiveFn(i32* arg1) {
//       %temp = alloca i32, align 4
//       store i32 2, i32* %temp, align 4
//       call void @RecursiveFn.1(i32* nonnull %temp)
//       ret void
//     }
//
// Before the next iteration, we need to propagate the constant like so,
// which allows further specialization in subsequent iterations:
//
//     @funcspec.arg = internal constant i32 2
//
//     define internal void @RecursiveFn(i32* arg1) {
//       call void @RecursiveFn.1(i32* nonnull @funcspec.arg)
//       ret void
//     }
//
// See if there are any new constant values for the callers of \p F via
// stack variables and promote them to global variables.
void FunctionSpecializer::promoteConstantStackValues(Function *F) {
  for (User *U : F->users()) {

    auto *Call = dyn_cast<CallInst>(U);
    if (!Call)
      continue;

    if (!Solver.isBlockExecutable(Call->getParent()))
      continue;

    for (const Use &U : Call->args()) {
      unsigned Idx = Call->getArgOperandNo(&U);
      Value *ArgOp = Call->getArgOperand(Idx);
      Type *ArgOpType = ArgOp->getType();

      if (!Call->onlyReadsMemory(Idx) || !ArgOpType->isPointerTy())
        continue;

      auto *ConstVal = getConstantStackValue(Call, ArgOp);
      if (!ConstVal)
        continue;

      Value *GV = new GlobalVariable(M, ConstVal->getType(), true,
                                     GlobalValue::InternalLinkage, ConstVal,
                                     "specialized.arg." + Twine(++NGlobals));
      Call->setArgOperand(Idx, GV);
    }
  }
}

// ssa_copy intrinsics are introduced by the SCCP solver. These intrinsics
// interfere with the promoteConstantStackValues() optimization.
static void removeSSACopy(Function &F) {
  for (BasicBlock &BB : F) {
    for (Instruction &Inst : llvm::make_early_inc_range(BB)) {
      auto *II = dyn_cast<IntrinsicInst>(&Inst);
      if (!II)
        continue;
      if (II->getIntrinsicID() != Intrinsic::ssa_copy)
        continue;
      Inst.replaceAllUsesWith(II->getOperand(0));
      Inst.eraseFromParent();
    }
  }
}

/// Remove any ssa_copy intrinsics that may have been introduced.
void FunctionSpecializer::cleanUpSSA() {
  for (Function *F : Specializations)
    removeSSACopy(*F);
}

template <> struct llvm::DenseMapInfo<SpecSig> {
  static inline SpecSig getEmptyKey() { return {~0U, {}}; }

  static inline SpecSig getTombstoneKey() { return {~1U, {}}; }

  static unsigned getHashValue(const SpecSig &S) {
    return static_cast<unsigned>(hash_value(S));
  }

  static bool isEqual(const SpecSig &LHS, const SpecSig &RHS) {
    return LHS == RHS;
  }
};

FunctionSpecializer::~FunctionSpecializer() {
  LLVM_DEBUG(
    if (NumSpecsCreated > 0)
      dbgs() << "FnSpecialization: Created " << NumSpecsCreated
             << " specializations in module " << M.getName() << "\n");
  // Eliminate dead code.
  removeDeadFunctions();
  cleanUpSSA();
}

/// Get the unsigned value of the given Cost object. Assumes the Cost is
/// always non-negative, which is true for both TCK_CodeSize and TCK_Latency,
/// and always valid.
static unsigned getCostValue(const Cost &C) {
  int64_t Value = C.getValue();

  assert(Value >= 0 && "CodeSize and Latency cannot be negative");
  // It is safe to down cast since we know the arguments cannot be negative and
  // Cost is of type int64_t.
  return static_cast<unsigned>(Value);
}

/// Attempt to specialize functions in the module to enable constant
/// propagation across function boundaries.
///
/// \returns true if at least one function is specialized.
bool FunctionSpecializer::run() {
  // Find possible specializations for each function.
  SpecMap SM;
  SmallVector<Spec, 32> AllSpecs;
  unsigned NumCandidates = 0;
  for (Function &F : M) {
    if (!isCandidateFunction(&F))
      continue;

    auto [It, Inserted] = FunctionMetrics.try_emplace(&F);
    CodeMetrics &Metrics = It->second;
    // Analyze the function.
    if (Inserted) {
      SmallPtrSet<const Value *, 32> EphValues;
      CodeMetrics::collectEphemeralValues(&F, &GetAC(F), EphValues);
      for (BasicBlock &BB : F)
        Metrics.analyzeBasicBlock(&BB, GetTTI(F), EphValues);
    }

    // When specializing literal constants is enabled, always require functions
    // to be larger than MinFunctionSize, to prevent excessive specialization.
    const bool RequireMinSize =
        !ForceSpecialization &&
        (SpecializeLiteralConstant || !F.hasFnAttribute(Attribute::NoInline));

    // If the code metrics reveal that we shouldn't duplicate the function,
    // or if the code size implies that this function is easy to get inlined,
    // then we shouldn't specialize it.
    if (Metrics.notDuplicatable || !Metrics.NumInsts.isValid() ||
        (RequireMinSize && Metrics.NumInsts < MinFunctionSize))
      continue;

    // When specialization on literal constants is disabled, only consider
    // recursive functions when running multiple times to save wasted analysis,
    // as we will not be able to specialize on any newly found literal constant
    // return values.
    if (!SpecializeLiteralConstant && !Inserted && !Metrics.isRecursive)
      continue;

    int64_t Sz = Metrics.NumInsts.getValue();
    assert(Sz > 0 && "CodeSize should be positive");
    // It is safe to down cast from int64_t, NumInsts is always positive.
    unsigned FuncSize = static_cast<unsigned>(Sz);

    LLVM_DEBUG(dbgs() << "FnSpecialization: Specialization cost for "
                      << F.getName() << " is " << FuncSize << "\n");

    if (Inserted && Metrics.isRecursive)
      promoteConstantStackValues(&F);

    if (!findSpecializations(&F, FuncSize, AllSpecs, SM)) {
      LLVM_DEBUG(
          dbgs() << "FnSpecialization: No possible specializations found for "
                 << F.getName() << "\n");
      continue;
    }

    ++NumCandidates;
  }

  if (!NumCandidates) {
    LLVM_DEBUG(
        dbgs()
        << "FnSpecialization: No possible specializations found in module\n");
    return false;
  }

  // Choose the most profitable specialisations that fit within the module
  // specialization budget, which is derived from the maximum number of
  // specializations allowed per candidate function.
  auto CompareScore = [&AllSpecs](unsigned I, unsigned J) {
    if (AllSpecs[I].Score != AllSpecs[J].Score)
      return AllSpecs[I].Score > AllSpecs[J].Score;
    return I > J;
  };
  const unsigned NSpecs =
      std::min(NumCandidates * MaxClones, unsigned(AllSpecs.size()));
  SmallVector<unsigned> BestSpecs(NSpecs + 1);
  std::iota(BestSpecs.begin(), BestSpecs.begin() + NSpecs, 0);
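  // Keep the best NSpecs candidates with a bounded heap: under CompareScore
  // the heap front holds the lowest-scored of the current best, so each
  // remaining candidate is pushed in and the worst one popped back out.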
  if (AllSpecs.size() > NSpecs) {
    LLVM_DEBUG(dbgs() << "FnSpecialization: Number of candidates exceeds "
                      << "the maximum number of clones threshold.\n"
                      << "FnSpecialization: Specializing the "
                      << NSpecs
                      << " most profitable candidates.\n");
    std::make_heap(BestSpecs.begin(), BestSpecs.begin() + NSpecs, CompareScore);
    for (unsigned I = NSpecs, N = AllSpecs.size(); I < N; ++I) {
      BestSpecs[NSpecs] = I;
      std::push_heap(BestSpecs.begin(), BestSpecs.end(), CompareScore);
      std::pop_heap(BestSpecs.begin(), BestSpecs.end(), CompareScore);
    }
  }

  LLVM_DEBUG(dbgs() << "FnSpecialization: List of specializations \n";
    for (unsigned I = 0; I < NSpecs; ++I) {
      const Spec &S = AllSpecs[BestSpecs[I]];
      dbgs() << "FnSpecialization: Function " << S.F->getName()
             << " , score " << S.Score << "\n";
      for (const ArgInfo &Arg : S.Sig.Args)
        dbgs() << "FnSpecialization: FormalArg = "
               << Arg.Formal->getNameOrAsOperand()
               << ", ActualArg = " << Arg.Actual->getNameOrAsOperand()
               << "\n";
    });

  // Create the chosen specializations.
  SmallPtrSet<Function *, 8> OriginalFuncs;
  SmallVector<Function *> Clones;
  for (unsigned I = 0; I < NSpecs; ++I) {
    Spec &S = AllSpecs[BestSpecs[I]];

    // Accumulate the codesize growth for the function, now that we are
    // creating the specialization.
    FunctionGrowth[S.F] += S.CodeSize;

    S.Clone = createSpecialization(S.F, S.Sig);

    // Update the known call sites to call the clone.
    for (CallBase *Call : S.CallSites) {
      LLVM_DEBUG(dbgs() << "FnSpecialization: Redirecting " << *Call
                        << " to call " << S.Clone->getName() << "\n");
      Call->setCalledFunction(S.Clone);
    }

    Clones.push_back(S.Clone);
    OriginalFuncs.insert(S.F);
  }

  Solver.solveWhileResolvedUndefsIn(Clones);

  // Update the rest of the call sites - these are the recursive calls, calls
  // to discarded specialisations and calls that may match a specialisation
  // after the solver runs.
  for (Function *F : OriginalFuncs) {
    auto [Begin, End] = SM[F];
    updateCallSites(F, AllSpecs.begin() + Begin, AllSpecs.begin() + End);
  }

  for (Function *F : Clones) {
    if (F->getReturnType()->isVoidTy())
      continue;
    if (F->getReturnType()->isStructTy()) {
      auto *STy = cast<StructType>(F->getReturnType());
      if (!Solver.isStructLatticeConstant(F, STy))
        continue;
    } else {
      auto It = Solver.getTrackedRetVals().find(F);
      assert(It != Solver.getTrackedRetVals().end() &&
             "Return value ought to be tracked");
      if (SCCPSolver::isOverdefined(It->second))
        continue;
    }
    for (User *U : F->users()) {
      if (auto *CS = dyn_cast<CallBase>(U)) {
        // The user instruction does not call our function.
        if (CS->getCalledFunction() != F)
          continue;
        Solver.resetLatticeValueFor(CS);
      }
    }
  }

  // Rerun the solver to notify the users of the modified callsites.
  Solver.solveWhileResolvedUndefs();

  for (Function *F : OriginalFuncs)
    if (FunctionMetrics[F].isRecursive)
      promoteConstantStackValues(F);

  return true;
}

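/// Remove the original functions that were fully replaced by their
/// specializations and marked unreachable by updateCallSites().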
void FunctionSpecializer::removeDeadFunctions() {
  for (Function *F : FullySpecialized) {
    LLVM_DEBUG(dbgs() << "FnSpecialization: Removing dead function "
                      << F->getName() << "\n");
    if (FAM)
      FAM->clear(*F, F->getName());
    F->eraseFromParent();
  }
  FullySpecialized.clear();
}

/// Clone the function \p F and remove the ssa_copy intrinsics added by
/// the SCCPSolver in the cloned version.
static Function *cloneCandidateFunction(Function *F, unsigned NSpecs) {
  ValueToValueMapTy Mappings;
  Function *Clone = CloneFunction(F, Mappings);
  Clone->setName(F->getName() + ".specialized." + Twine(NSpecs));
  removeSSACopy(*Clone);
  return Clone;
}

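/// Look for specialization opportunities for \p F: for every executable call
/// site, build a specialization signature from the interesting formal
/// parameters that are passed constants, score it, and append profitable
/// candidates to \p AllSpecs. Returns true if any candidate was found.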
bool FunctionSpecializer::findSpecializations(Function *F, unsigned FuncSize,
                                              SmallVectorImpl<Spec> &AllSpecs,
                                              SpecMap &SM) {
  // A mapping from a specialisation signature to the index of the respective
  // entry in the all specialisation array. Used to ensure uniqueness of
  // specialisations.
  DenseMap<SpecSig, unsigned> UniqueSpecs;

  // Get a list of interesting arguments.
  SmallVector<Argument *> Args;
  for (Argument &Arg : F->args())
    if (isArgumentInteresting(&Arg))
      Args.push_back(&Arg);

  if (Args.empty())
    return false;

  for (User *U : F->users()) {
    if (!isa<CallInst>(U) && !isa<InvokeInst>(U))
      continue;
    auto &CS = *cast<CallBase>(U);

    // The user instruction does not call our function.
    if (CS.getCalledFunction() != F)
      continue;

    // If the call site has attribute minsize set, that callsite won't be
    // specialized.
    if (CS.hasFnAttr(Attribute::MinSize))
      continue;

    // If the parent of the call site will never be executed, we don't need
    // to worry about the passed value.
    if (!Solver.isBlockExecutable(CS.getParent()))
      continue;

    // Examine arguments and create a specialisation candidate from the
    // constant operands of this call site.
    SpecSig S;
    for (Argument *A : Args) {
      Constant *C = getCandidateConstant(CS.getArgOperand(A->getArgNo()));
      if (!C)
        continue;
      LLVM_DEBUG(dbgs() << "FnSpecialization: Found interesting argument "
                        << A->getName() << " : " << C->getNameOrAsOperand()
                        << "\n");
      S.Args.push_back({A, C});
    }

    if (S.Args.empty())
      continue;

    // Check if we have encountered the same specialisation already.
    if (auto It = UniqueSpecs.find(S); It != UniqueSpecs.end()) {
      // Existing specialisation. Add the call to the list to rewrite, unless
      // it's a recursive call. A specialisation generated because of a
      // recursive call may end up not being the best specialisation for all
      // the cloned instances of this call, which result from specialising
      // functions. Hence we don't rewrite the call directly, but match it with
      // the best specialisation once all specialisations are known.
      if (CS.getFunction() == F)
        continue;
      const unsigned Index = It->second;
      AllSpecs[Index].CallSites.push_back(&CS);
    } else {
      // Calculate the specialisation gain.
      Cost CodeSize;
      unsigned Score = 0;
      InstCostVisitor Visitor = getInstCostVisitorFor(F);
      for (ArgInfo &A : S.Args) {
        CodeSize += Visitor.getCodeSizeSavingsForArg(A.Formal, A.Actual);
        Score += getInliningBonus(A.Formal, A.Actual);
      }
      CodeSize += Visitor.getCodeSizeSavingsFromPendingPHIs();

      unsigned CodeSizeSavings = getCostValue(CodeSize);
      unsigned SpecSize = FuncSize - CodeSizeSavings;

      auto IsProfitable = [&]() -> bool {
        // No check required.
        if (ForceSpecialization)
          return true;

        LLVM_DEBUG(
            dbgs() << "FnSpecialization: Specialization bonus {Inlining = "
                   << Score << " (" << (Score * 100 / FuncSize) << "%)}\n");

        // Minimum inlining bonus.
        if (Score > MinInliningBonus * FuncSize / 100)
          return true;

        LLVM_DEBUG(
            dbgs() << "FnSpecialization: Specialization bonus {CodeSize = "
                   << CodeSizeSavings << " ("
                   << (CodeSizeSavings * 100 / FuncSize) << "%)}\n");

        // Minimum codesize savings.
        if (CodeSizeSavings < MinCodeSizeSavings * FuncSize / 100)
          return false;

        // Lazily compute the Latency, to avoid unnecessarily computing BFI.
        unsigned LatencySavings =
            getCostValue(Visitor.getLatencySavingsForKnownConstants());

        LLVM_DEBUG(
            dbgs() << "FnSpecialization: Specialization bonus {Latency = "
                   << LatencySavings << " ("
                   << (LatencySavings * 100 / FuncSize) << "%)}\n");

        // Minimum latency savings.
        if (LatencySavings < MinLatencySavings * FuncSize / 100)
          return false;
        // Maximum codesize growth.
        if ((FunctionGrowth[F] + SpecSize) / FuncSize > MaxCodeSizeGrowth)
          return false;

        Score += std::max(CodeSizeSavings, LatencySavings);
        return true;
      };

      // Discard unprofitable specialisations.
      if (!IsProfitable())
        continue;

      // Create a new specialisation entry.
      auto &Spec = AllSpecs.emplace_back(F, S, Score, SpecSize);
      if (CS.getFunction() != F)
        Spec.CallSites.push_back(&CS);
      const unsigned Index = AllSpecs.size() - 1;
      UniqueSpecs[S] = Index;
      if (auto [It, Inserted] = SM.try_emplace(F, Index, Index + 1); !Inserted)
        It->second.second = Index + 1;
    }
  }

  return !UniqueSpecs.empty();
}

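/// Return true if \p F is eligible for specialization: a reachable definition
/// with arguments that is not optimized for size, was not itself created by
/// this pass, and is not marked noduplicate or alwaysinline.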
bool FunctionSpecializer::isCandidateFunction(Function *F) {
  if (F->isDeclaration() || F->arg_empty())
    return false;

  if (F->hasFnAttribute(Attribute::NoDuplicate))
    return false;

  // Do not specialize the cloned function again.
  if (Specializations.contains(F))
    return false;

  // If we're optimizing the function for size, we shouldn't specialize it.
  if (shouldOptimizeForSize(F, nullptr, nullptr, PGSOQueryType::IRPass))
    return false;

  // Exit if the function is not executable. There's no point in specializing
  // a dead function.
  if (!Solver.isBlockExecutable(&F->getEntryBlock()))
    return false;

  // It wastes time to specialize a function which would eventually get
  // inlined anyway.
  if (F->hasFnAttribute(Attribute::AlwaysInline))
    return false;

  LLVM_DEBUG(dbgs() << "FnSpecialization: Try function: " << F->getName()
                    << "\n");
  return true;
}

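/// Clone \p F for the signature \p S, give the clone internal linkage, and
/// seed the solver's lattice with the constant arguments so a subsequent
/// solver run can propagate them through the clone's body.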
Function *FunctionSpecializer::createSpecialization(Function *F,
                                                    const SpecSig &S) {
  Function *Clone = cloneCandidateFunction(F, Specializations.size() + 1);

  // The original function does not necessarily have internal linkage, but the
  // clone must.
  Clone->setLinkage(GlobalValue::InternalLinkage);

  // Initialize the lattice state of the arguments of the function clone,
  // marking the argument on which we specialized the function constant
  // with the given value.
  Solver.setLatticeValueForSpecializationArguments(Clone, S.Args);
  Solver.markBlockExecutable(&Clone->front());
  Solver.addArgumentTrackedFunction(Clone);
  Solver.addTrackedFunction(Clone);

  // Mark all the specialized functions.
  Specializations.insert(Clone);
  ++NumSpecsCreated;

  return Clone;
}

/// Compute the inlining bonus for replacing argument \p A with constant \p C.
/// The below heuristic is only concerned with exposing inlining
/// opportunities via indirect call promotion. If the argument is not a
/// (potentially casted) function pointer, give up.
unsigned FunctionSpecializer::getInliningBonus(Argument *A, Constant *C) {
  Function *CalledFunction = dyn_cast<Function>(C->stripPointerCasts());
  if (!CalledFunction)
    return 0;

  // Get TTI for the called function (used for the inline cost).
  auto &CalleeTTI = (GetTTI)(*CalledFunction);

  // Look at all the call sites whose called value is the argument.
  // Specializing the function on the argument would allow these indirect
  // calls to be promoted to direct calls. If the indirect call promotion
  // would likely enable the called function to be inlined, specializing is a
  // good idea.
  int InliningBonus = 0;
  for (User *U : A->users()) {
    if (!isa<CallInst>(U) && !isa<InvokeInst>(U))
      continue;
    auto *CS = cast<CallBase>(U);
    if (CS->getCalledOperand() != A)
      continue;
    if (CS->getFunctionType() != CalledFunction->getFunctionType())
      continue;

    // Get the cost of inlining the called function at this call site. Note
    // that this is only an estimate. The called function may eventually
    // change in a way that leads to it not being inlined here, even though
    // inlining looks profitable now. For example, one of its called
    // functions may be inlined into it, making the called function too large
    // to be inlined into this call site.
    //
    // We apply a boost for performing indirect call promotion by increasing
    // the default threshold by the threshold for indirect calls.
    auto Params = getInlineParams();
    Params.DefaultThreshold += InlineConstants::IndirectCallThreshold;
    InlineCost IC =
        getInlineCost(*CS, CalledFunction, Params, CalleeTTI, GetAC, GetTLI);

    // We clamp the bonus for this call to be between zero and the default
    // threshold.
    if (IC.isAlways())
      InliningBonus += Params.DefaultThreshold;
    else if (IC.isVariable() && IC.getCostDelta() > 0)
      InliningBonus += IC.getCostDelta();

    LLVM_DEBUG(dbgs() << "FnSpecialization: Inlining bonus " << InliningBonus
                      << " for user " << *U << "\n");
  }

  return InliningBonus > 0 ? static_cast<unsigned>(InliningBonus) : 0;
}

/// Determine if it is possible to specialise the function for constant values
/// of the formal parameter \p A.
bool FunctionSpecializer::isArgumentInteresting(Argument *A) {
  // No point in specialization if the argument is unused.
  if (A->user_empty())
    return false;

  Type *Ty = A->getType();
  if (!Ty->isPointerTy() &&
      (!SpecializeLiteralConstant ||
       (!Ty->isIntegerTy() && !Ty->isFloatingPointTy() && !Ty->isStructTy())))
    return false;

  // The SCCP solver does not record an argument that will be constructed on
  // the stack.
  if (A->hasByValAttr() && !A->getParent()->onlyReadsMemory())
    return false;

  // For non-argument-tracked functions every argument is overdefined.
  if (!Solver.isArgumentTrackedFunction(A->getParent()))
    return true;

  // Check the lattice value and decide if we should attempt to specialize,
  // based on this argument. No point in specialization, if the lattice value
  // is already a constant.
  bool IsOverdefined =
      Ty->isStructTy()
          ? any_of(Solver.getStructLatticeValueFor(A),
                   SCCPSolver::isOverdefined)
          : SCCPSolver::isOverdefined(Solver.getLatticeValueFor(A));

  LLVM_DEBUG(
    if (IsOverdefined)
      dbgs() << "FnSpecialization: Found interesting parameter "
             << A->getNameOrAsOperand() << "\n";
    else
      dbgs() << "FnSpecialization: Nothing to do, parameter "
             << A->getNameOrAsOperand() << " is already constant\n";
  );
  return IsOverdefined;
}

/// Check if the value \p V (an actual argument) is a constant or can only
/// have a constant value. Return that constant.
Constant *FunctionSpecializer::getCandidateConstant(Value *V) {
  if (isa<PoisonValue>(V))
    return nullptr;

  // Select for possible specialisation values that are constants or
  // are deduced to be constants or constant ranges with a single element.
  Constant *C = dyn_cast<Constant>(V);
  if (!C)
    C = Solver.getConstantOrNull(V);

  // Don't specialize on (anything derived from) the address of a non-constant
  // global variable, unless explicitly enabled.
  if (C && C->getType()->isPointerTy() && !C->isNullValue())
    if (auto *GV = dyn_cast<GlobalVariable>(getUnderlyingObject(C));
        GV && !(GV->isConstant() || SpecializeOnAddress))
      return nullptr;

  return C;
}

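/// Rewrite the remaining executable call sites of \p F (recursive calls,
/// calls to discarded specialisations, and late matches) to the best-scoring
/// compatible specialization in [\p Begin, \p End). If no executable call
/// site of the original remains afterwards, mark \p F unreachable so it can
/// be removed later.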
void FunctionSpecializer::updateCallSites(Function *F, const Spec *Begin,
                                          const Spec *End) {
  // Collect the call sites that need updating.
  SmallVector<CallBase *> ToUpdate;
  for (User *U : F->users())
    if (auto *CS = dyn_cast<CallBase>(U);
        CS && CS->getCalledFunction() == F &&
        Solver.isBlockExecutable(CS->getParent()))
      ToUpdate.push_back(CS);

  unsigned NCallsLeft = ToUpdate.size();
  for (CallBase *CS : ToUpdate) {
    bool ShouldDecrementCount = CS->getFunction() == F;

    // Find the best matching specialisation.
    const Spec *BestSpec = nullptr;
    for (const Spec &S : make_range(Begin, End)) {
      if (!S.Clone || (BestSpec && S.Score <= BestSpec->Score))
        continue;

      if (any_of(S.Sig.Args, [CS, this](const ArgInfo &Arg) {
            unsigned ArgNo = Arg.Formal->getArgNo();
            return getCandidateConstant(CS->getArgOperand(ArgNo)) != Arg.Actual;
          }))
        continue;

      BestSpec = &S;
    }

    if (BestSpec) {
      LLVM_DEBUG(dbgs() << "FnSpecialization: Redirecting " << *CS
                        << " to call " << BestSpec->Clone->getName() << "\n");
      CS->setCalledFunction(BestSpec->Clone);
      ShouldDecrementCount = true;
    }

    if (ShouldDecrementCount)
      --NCallsLeft;
  }

  // If the function has been completely specialized, the original function
  // is no longer needed. Mark it unreachable.
  if (NCallsLeft == 0 && Solver.isArgumentTrackedFunction(F)) {
    Solver.markFunctionUnreachable(F);
    FullySpecialized.insert(F);
  }
}