//===- TailRecursionElimination.cpp - Eliminate Tail Calls ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file transforms calls of the current function (self recursion) followed
// by a return instruction with a branch to the entry of the function, creating
// a loop. This pass also implements the following extensions to the basic
// algorithm:
//
//  1. Trivial instructions between the call and return do not prevent the
//     transformation from taking place, though currently the analysis cannot
//     support moving any really useful instructions (only dead ones).
//  2. This pass transforms functions that are prevented from being tail
//     recursive by an associative and commutative expression to use an
//     accumulator variable, thus compiling the typical naive factorial or
//     'fib' implementation into efficient code.
//  3. TRE is performed if the function returns void, if the return
//     returns the result returned by the call, or if the function returns a
//     run-time constant on all exits from the function. It is possible, though
//     unlikely, that the return returns something else (like constant 0), and
//     can still be TRE'd. It can be TRE'd if ALL OTHER return instructions in
//     the function return the exact same value.
//  4. If it can prove that callees do not access their caller stack frame,
//     they are marked as eligible for tail call elimination (by the code
//     generator).
//
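// As an illustration of extensions 2 and 3, a naive factorial such as
//
//   int fact(int n) { return n <= 1 ? 1 : n * fact(n - 1); }
//
// is rewritten into roughly the moral equivalent of
//
//   int fact(int n) {
//     int acc = 1;                     // identity of the accumulating 'mul'
//     while (n > 1) { acc *= n; --n; }
//     return acc;
//   }
//
// except that the accumulator lives in a PHI node rather than a real local
// variable.
//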
// There are several improvements that could be made:
//
//  1. If the function has any alloca instructions, these instructions will be
//     moved out of the entry block of the function, causing them to be
//     evaluated each time through the tail recursion. Safely keeping allocas
//     in the entry block requires analysis to prove that the tail-called
//     function does not read or write the stack object.
//  2. Tail recursion is only performed if the call immediately precedes the
//     return instruction. It's possible that there could be a jump between
//     the call and the return.
//  3. There can be intervening operations between the call and the return that
//     prevent the TRE from occurring. For example, there could be GEP's and
//     stores to memory that will not be read or written by the call. This
//     requires some substantial analysis (such as with DSA) to prove safe to
//     move ahead of the call, but doing so could allow many more TREs to be
//     performed, for example in TreeAdd/TreeAlloc from the treeadd benchmark.
//  4. The algorithm we use to detect if callees access their caller stack
//     frames is very primitive.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Scalar/TailRecursionElimination.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/DomTreeUpdater.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/PostDominators.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
using namespace llvm;

#define DEBUG_TYPE "tailcallelim"

STATISTIC(NumEliminated, "Number of tail calls removed");
STATISTIC(NumRetDuped, "Number of returns duplicated");
STATISTIC(NumAccumAdded, "Number of accumulators introduced");

/// Scan the specified function for alloca instructions.
/// If it contains any dynamic allocas, returns false.
static bool canTRE(Function &F) {
  // TODO: We don't do TRE if dynamic allocas are used.
  // Dynamic allocas allocate stack space which should be
  // deallocated before a new iteration starts. That is
  // currently not implemented.
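  //
  // For example (illustrative IR), an alloca such as
  //   %buf = alloca i8, i64 %n
  // is dynamic because its size depends on a runtime value, so it fails the
  // isStaticAlloca() check below and disables TRE for the whole function.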
  return llvm::all_of(instructions(F), [](Instruction &I) {
    auto *AI = dyn_cast<AllocaInst>(&I);
    return !AI || AI->isStaticAlloca();
  });
}

namespace {
struct AllocaDerivedValueTracker {
  // Start at a root value and walk its use-def chain to mark calls that use the
  // value or a derived value in AllocaUsers, and places where it may escape in
  // EscapePoints.
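  //
  // For example, given "%a = alloca i32", the walk follows bitcasts, GEPs,
  // PHIs and selects of %a transitively; any call or invoke taking such a
  // pointer is recorded in AllocaUsers, and an instruction like
  // "store ptr %a, ptr @g" (storing the pointer itself) is an escape point.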
  void walk(Value *Root) {
    SmallVector<Use *, 32> Worklist;
    SmallPtrSet<Use *, 32> Visited;

    auto AddUsesToWorklist = [&](Value *V) {
      for (auto &U : V->uses()) {
        if (!Visited.insert(&U).second)
          continue;
        Worklist.push_back(&U);
      }
    };

    AddUsesToWorklist(Root);

    while (!Worklist.empty()) {
      Use *U = Worklist.pop_back_val();
      Instruction *I = cast<Instruction>(U->getUser());

      switch (I->getOpcode()) {
      case Instruction::Call:
      case Instruction::Invoke: {
        auto &CB = cast<CallBase>(*I);
        // If the alloca-derived argument is passed byval it is not an escape
        // point, or a use of an alloca. Calling with byval copies the contents
        // of the alloca into argument registers or stack slots, which exist
        // beyond the lifetime of the current frame.
        if (CB.isArgOperand(U) && CB.isByValArgument(CB.getArgOperandNo(U)))
          continue;
        bool IsNocapture =
            CB.isDataOperand(U) && CB.doesNotCapture(CB.getDataOperandNo(U));
        callUsesLocalStack(CB, IsNocapture);
        if (IsNocapture) {
          // If the alloca-derived argument is passed in as nocapture, then it
          // can't propagate to the call's return. That would be capturing.
          continue;
        }
        break;
      }
      case Instruction::Load: {
        // The result of a load is not alloca-derived (unless an alloca has
        // otherwise escaped, but this is a local analysis).
        continue;
      }
      case Instruction::Store: {
        if (U->getOperandNo() == 0)
          EscapePoints.insert(I);
        continue; // Stores have no users to analyze.
      }
      case Instruction::BitCast:
      case Instruction::GetElementPtr:
      case Instruction::PHI:
      case Instruction::Select:
      case Instruction::AddrSpaceCast:
        break;
      default:
        EscapePoints.insert(I);
        break;
      }

      AddUsesToWorklist(I);
    }
  }

  void callUsesLocalStack(CallBase &CB, bool IsNocapture) {
    // Add it to the list of alloca users.
    AllocaUsers.insert(&CB);

    // If it's nocapture then it can't capture this alloca.
    if (IsNocapture)
      return;

    // If it can write to memory, it can leak the alloca value.
    if (!CB.onlyReadsMemory())
      EscapePoints.insert(&CB);
  }

  SmallPtrSet<Instruction *, 32> AllocaUsers;
  SmallPtrSet<Instruction *, 32> EscapePoints;
};
}

static bool markTails(Function &F, OptimizationRemarkEmitter *ORE) {
  if (F.callsFunctionThatReturnsTwice())
    return false;

  // The local stack holds all alloca instructions and all byval arguments.
  AllocaDerivedValueTracker Tracker;
  for (Argument &Arg : F.args()) {
    if (Arg.hasByValAttr())
      Tracker.walk(&Arg);
  }
  for (auto &BB : F) {
    for (auto &I : BB)
      if (AllocaInst *AI = dyn_cast<AllocaInst>(&I))
        Tracker.walk(AI);
  }

  bool Modified = false;

  // Track whether a block is reachable after an alloca has escaped. Blocks
  // that contain the escaping instruction will be marked as being visited
  // without an escaped alloca, since that is how the block began.
  enum VisitType {
    UNVISITED,
    UNESCAPED,
    ESCAPED
  };
  DenseMap<BasicBlock *, VisitType> Visited;

  // We propagate the fact that an alloca has escaped from block to successor.
  // Visit the blocks that are propagating the escapedness first. To do this,
  // we maintain two worklists.
  SmallVector<BasicBlock *, 32> WorklistUnescaped, WorklistEscaped;

  // We may enter a block and visit it thinking that no alloca has escaped yet,
  // then see an escape point and go back around a loop edge and come back to
  // the same block twice. Because of this, we defer setting tail on calls when
  // we first encounter them in a block. Every entry in this list does not
  // statically use an alloca via use-def chain analysis, but may find an
  // alloca through other means if the block turns out to be reachable after an
  // escape point.
  SmallVector<CallInst *, 32> DeferredTails;

  BasicBlock *BB = &F.getEntryBlock();
  VisitType Escaped = UNESCAPED;
  do {
    for (auto &I : *BB) {
      if (Tracker.EscapePoints.count(&I))
        Escaped = ESCAPED;

      CallInst *CI = dyn_cast<CallInst>(&I);
      // A PseudoProbeInst has the IntrInaccessibleMemOnly tag hence it is
      // considered accessing memory and will be marked as a tail call if we
      // don't bail out here.
      if (!CI || CI->isTailCall() || isa<DbgInfoIntrinsic>(&I) ||
          isa<PseudoProbeInst>(&I))
        continue;

      // Special-case operand bundles "clang.arc.attachedcall", "ptrauth", and
      // "kcfi".
      bool IsNoTail = CI->isNoTailCall() ||
                      CI->hasOperandBundlesOtherThan(
                          {LLVMContext::OB_clang_arc_attachedcall,
                           LLVMContext::OB_ptrauth, LLVMContext::OB_kcfi});

      if (!IsNoTail && CI->doesNotAccessMemory()) {
        // A call to a readnone function whose arguments are all things
        // computed outside this function can be marked tail. Even if you
        // stored the alloca address into a global, a readnone function can't
        // load the global anyhow.
        //
        // Note that this runs whether we know an alloca has escaped or not. If
        // it has, then we can't trust Tracker.AllocaUsers to be accurate.
        bool SafeToTail = true;
        for (auto &Arg : CI->args()) {
          if (isa<Constant>(Arg.getUser()))
            continue;
          if (Argument *A = dyn_cast<Argument>(Arg.getUser()))
            if (!A->hasByValAttr())
              continue;
          SafeToTail = false;
          break;
        }
        if (SafeToTail) {
          using namespace ore;
          ORE->emit([&]() {
            return OptimizationRemark(DEBUG_TYPE, "tailcall-readnone", CI)
                   << "marked as tail call candidate (readnone)";
          });
          CI->setTailCall();
          Modified = true;
          continue;
        }
      }

      if (!IsNoTail && Escaped == UNESCAPED && !Tracker.AllocaUsers.count(CI))
        DeferredTails.push_back(CI);
    }

    for (auto *SuccBB : successors(BB)) {
      auto &State = Visited[SuccBB];
      if (State < Escaped) {
        State = Escaped;
        if (State == ESCAPED)
          WorklistEscaped.push_back(SuccBB);
        else
          WorklistUnescaped.push_back(SuccBB);
      }
    }

    if (!WorklistEscaped.empty()) {
      BB = WorklistEscaped.pop_back_val();
      Escaped = ESCAPED;
    } else {
      BB = nullptr;
      while (!WorklistUnescaped.empty()) {
        auto *NextBB = WorklistUnescaped.pop_back_val();
        if (Visited[NextBB] == UNESCAPED) {
          BB = NextBB;
          Escaped = UNESCAPED;
          break;
        }
      }
    }
  } while (BB);

  for (CallInst *CI : DeferredTails) {
    if (Visited[CI->getParent()] != ESCAPED) {
      // If the escape point was part way through the block, calls after the
      // escape point wouldn't have been put into DeferredTails.
      LLVM_DEBUG(dbgs() << "Marked as tail call candidate: " << *CI << "\n");
      CI->setTailCall();
      Modified = true;
    }
  }

  return Modified;
}

/// Return true if it is safe to move the specified
/// instruction from after the call to before the call, assuming that all
/// instructions between the call and this instruction are movable.
///
static bool canMoveAboveCall(Instruction *I, CallInst *CI, AliasAnalysis *AA) {
  if (isa<DbgInfoIntrinsic>(I))
    return true;

  if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I))
    if (II->getIntrinsicID() == Intrinsic::lifetime_end &&
        llvm::findAllocaForValue(II->getArgOperand(1)))
      return true;

  // FIXME: We can move load/store/call/free instructions above the call if the
  // call does not mod/ref the memory location being processed.
  if (I->mayHaveSideEffects()) // This also handles volatile loads.
    return false;

  if (LoadInst *L = dyn_cast<LoadInst>(I)) {
    // Loads may always be moved above calls without side effects.
    if (CI->mayHaveSideEffects()) {
      // Non-volatile loads may be moved above a call with side effects if it
      // does not write to memory and the load provably won't trap.
      // Writes to memory only matter if they may alias the pointer
      // being loaded from.
      const DataLayout &DL = L->getDataLayout();
      if (isModSet(AA->getModRefInfo(CI, MemoryLocation::get(L))) ||
          !isSafeToLoadUnconditionally(L->getPointerOperand(), L->getType(),
                                       L->getAlign(), DL, L))
        return false;
    }
  }

  // Otherwise, if this is a side-effect free instruction, check to make sure
  // that it does not use the return value of the call. If it doesn't use the
  // return value of the call, it must only use things that are defined before
  // the call, or movable instructions between the call and the instruction
  // itself.
  return !is_contained(I->operands(), CI);
}

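/// Return true if the specified instruction is an associative and commutative
/// operation that combines the recursive call's result with some other value,
/// has the call as exactly one of its two operands, and is only used by the
/// return -- e.g. the 'mul' in "return n * fact(n - 1)". Such an instruction
/// can be rewritten to update an accumulator PHI instead.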
static bool canTransformAccumulatorRecursion(Instruction *I, CallInst *CI) {
  if (!I->isAssociative() || !I->isCommutative())
    return false;

  assert(I->getNumOperands() >= 2 &&
         "Associative/commutative operations should have at least 2 args!");

  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
    // Accumulators must have an identity.
    if (!ConstantExpr::getIntrinsicIdentity(II->getIntrinsicID(),
                                            I->getType()))
      return false;
  }

  // Exactly one operand should be the result of the call instruction.
  if ((I->getOperand(0) == CI && I->getOperand(1) == CI) ||
      (I->getOperand(0) != CI && I->getOperand(1) != CI))
    return false;

  // The only user of this instruction we allow is a single return instruction.
  if (!I->hasOneUse() || !isa<ReturnInst>(I->user_back()))
    return false;

  return true;
}

static Instruction *firstNonDbg(BasicBlock::iterator I) {
  while (isa<DbgInfoIntrinsic>(I))
    ++I;
  return &*I;
}

namespace {
class TailRecursionEliminator {
  Function &F;
  const TargetTransformInfo *TTI;
  AliasAnalysis *AA;
  OptimizationRemarkEmitter *ORE;
  DomTreeUpdater &DTU;

  // The below are shared state we want to have available when eliminating any
  // calls in the function. These values should be populated by
  // createTailRecurseLoopHeader the first time we find a call we can eliminate.
  BasicBlock *HeaderBB = nullptr;
  SmallVector<PHINode *, 8> ArgumentPHIs;

  // PHI node to store our return value.
  PHINode *RetPN = nullptr;

  // i1 PHI node to track if we have a valid return value stored in RetPN.
  PHINode *RetKnownPN = nullptr;

  // Vector of select instructions we inserted. These selects use RetKnownPN
  // to either propagate RetPN or select a new return value.
  SmallVector<SelectInst *, 8> RetSelects;

  // The below are shared state needed when performing accumulator recursion.
  // These values should be populated by insertAccumulator the first time we
  // find an elimination that requires an accumulator.

  // PHI node to store our current accumulated value.
  PHINode *AccPN = nullptr;

  // The instruction doing the accumulating.
  Instruction *AccumulatorRecursionInstr = nullptr;

  TailRecursionEliminator(Function &F, const TargetTransformInfo *TTI,
                          AliasAnalysis *AA, OptimizationRemarkEmitter *ORE,
                          DomTreeUpdater &DTU)
      : F(F), TTI(TTI), AA(AA), ORE(ORE), DTU(DTU) {}

  CallInst *findTRECandidate(BasicBlock *BB);

  void createTailRecurseLoopHeader(CallInst *CI);

  void insertAccumulator(Instruction *AccRecInstr);

  bool eliminateCall(CallInst *CI);

  void cleanupAndFinalize();

  bool processBlock(BasicBlock &BB);

  void copyByValueOperandIntoLocalTemp(CallInst *CI, int OpndIdx);

  void copyLocalTempOfByValueOperandIntoArguments(CallInst *CI, int OpndIdx);

public:
  static bool eliminate(Function &F, const TargetTransformInfo *TTI,
                        AliasAnalysis *AA, OptimizationRemarkEmitter *ORE,
                        DomTreeUpdater &DTU);
};
} // namespace

CallInst *TailRecursionEliminator::findTRECandidate(BasicBlock *BB) {
  Instruction *TI = BB->getTerminator();

  if (&BB->front() == TI) // Make sure there is something before the terminator.
    return nullptr;

  // Scan backwards from the return, checking to see if there is a tail call in
  // this block. If so, set CI to it.
  CallInst *CI = nullptr;
  BasicBlock::iterator BBI(TI);
  while (true) {
    CI = dyn_cast<CallInst>(BBI);
    if (CI && CI->getCalledFunction() == &F)
      break;

    if (BBI == BB->begin())
      return nullptr; // Didn't find a potential tail call.
    --BBI;
  }

  assert((!CI->isTailCall() || !CI->isNoTailCall()) &&
         "Incompatible call site attributes(Tail,NoTail)");
  if (!CI->isTailCall())
    return nullptr;

  // As a special case, detect code like this:
  //   double fabs(double f) { return __builtin_fabs(f); } // a 'fabs' call
  // and disable this xform in this case, because the code generator will
  // lower the call to fabs into inline code.
  if (BB == &F.getEntryBlock() &&
      firstNonDbg(BB->front().getIterator()) == CI &&
      firstNonDbg(std::next(BB->begin())) == TI && CI->getCalledFunction() &&
      !TTI->isLoweredToCall(CI->getCalledFunction())) {
    // A single-block function with just a call and a return. Check that
    // the arguments match.
    auto I = CI->arg_begin(), E = CI->arg_end();
    Function::arg_iterator FI = F.arg_begin(), FE = F.arg_end();
    for (; I != E && FI != FE; ++I, ++FI)
      if (*I != &*FI) break;
    if (I == E && FI == FE)
      return nullptr;
  }

  return CI;
}

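// Split a fresh entry block off the original one so that eliminated calls can
// branch back to it, turning the recursion into a loop. The resulting CFG is
// roughly:
//
//   entry:            ; NewEntry: keeps the fixed-size allocas
//     br label %tailrecurse
//   tailrecurse:      ; HeaderBB: one PHI per argument, plus RetPN and
//     ...             ; RetKnownPN if the function does not return void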
void TailRecursionEliminator::createTailRecurseLoopHeader(CallInst *CI) {
  HeaderBB = &F.getEntryBlock();
  BasicBlock *NewEntry = BasicBlock::Create(F.getContext(), "", &F, HeaderBB);
  NewEntry->takeName(HeaderBB);
  HeaderBB->setName("tailrecurse");
  BranchInst::Create(HeaderBB, NewEntry);
  // If the new branch preserves the debug location of CI, it could result in
  // misleading stepping, if CI is located in a conditional branch.
  // So, here we don't give any debug location to the new branch.

  // Move all fixed sized allocas from HeaderBB to NewEntry.
  for (BasicBlock::iterator OEBI = HeaderBB->begin(), E = HeaderBB->end(),
                            NEBI = NewEntry->begin();
       OEBI != E;)
    if (AllocaInst *AI = dyn_cast<AllocaInst>(OEBI++))
      if (isa<ConstantInt>(AI->getArraySize()))
        AI->moveBefore(&*NEBI);

  // Now that we have created a new block, which jumps to the entry
  // block, insert a PHI node for each argument of the function.
  // For now, we initialize each PHI to only have the real arguments
  // which are passed in.
  BasicBlock::iterator InsertPos = HeaderBB->begin();
  for (Function::arg_iterator I = F.arg_begin(), E = F.arg_end(); I != E; ++I) {
    PHINode *PN = PHINode::Create(I->getType(), 2, I->getName() + ".tr");
    PN->insertBefore(InsertPos);
    I->replaceAllUsesWith(PN); // Everyone use the PHI node now!
    PN->addIncoming(&*I, NewEntry);
    ArgumentPHIs.push_back(PN);
  }

  // If the function doesn't return void, create the RetPN and RetKnownPN PHI
  // nodes to track our return value. We initialize RetPN with poison and
  // RetKnownPN with false since we can't know our return value at function
  // entry.
  Type *RetType = F.getReturnType();
  if (!RetType->isVoidTy()) {
    Type *BoolType = Type::getInt1Ty(F.getContext());
    RetPN = PHINode::Create(RetType, 2, "ret.tr");
    RetPN->insertBefore(InsertPos);
    RetKnownPN = PHINode::Create(BoolType, 2, "ret.known.tr");
    RetKnownPN->insertBefore(InsertPos);

    RetPN->addIncoming(PoisonValue::get(RetType), NewEntry);
    RetKnownPN->addIncoming(ConstantInt::getFalse(BoolType), NewEntry);
  }

  // The entry block was changed from HeaderBB to NewEntry.
  // The forward DominatorTree needs to be recalculated when the EntryBB is
  // changed. In this corner-case we recalculate the entire tree.
  DTU.recalculate(*NewEntry->getParent());
}

void TailRecursionEliminator::insertAccumulator(Instruction *AccRecInstr) {
  assert(!AccPN && "Trying to insert multiple accumulators");

  AccumulatorRecursionInstr = AccRecInstr;

  // Start by inserting a new PHI node for the accumulator.
  pred_iterator PB = pred_begin(HeaderBB), PE = pred_end(HeaderBB);
  AccPN = PHINode::Create(F.getReturnType(), std::distance(PB, PE) + 1,
                          "accumulator.tr");
  AccPN->insertBefore(HeaderBB->begin());

  // Loop over all of the predecessors of the tail recursion block. For the
  // real entry into the function we seed the PHI with the identity constant
  // for the accumulation operation. For any other existing branches to this
  // block (due to other tail recursions eliminated) the accumulator is not
  // modified. Because we haven't added the branch in the current block to
  // HeaderBB yet, it will not show up as a predecessor.
  for (pred_iterator PI = PB; PI != PE; ++PI) {
    BasicBlock *P = *PI;
    if (P == &F.getEntryBlock()) {
      Constant *Identity =
          ConstantExpr::getIdentity(AccRecInstr, AccRecInstr->getType());
      AccPN->addIncoming(Identity, P);
    } else {
      AccPN->addIncoming(AccPN, P);
    }
  }

  ++NumAccumAdded;
}

// Creates a copy of the contents of the ByVal operand of the specified
// call instruction into a newly created temporary variable.
void TailRecursionEliminator::copyByValueOperandIntoLocalTemp(CallInst *CI,
                                                              int OpndIdx) {
  Type *AggTy = CI->getParamByValType(OpndIdx);
  assert(AggTy);
  const DataLayout &DL = F.getDataLayout();

  // Get alignment of byVal operand.
  Align Alignment(CI->getParamAlign(OpndIdx).valueOrOne());

  // Create an alloca for the temporary byval operand.
  // Put the alloca into the entry block.
  Value *NewAlloca = new AllocaInst(
      AggTy, DL.getAllocaAddrSpace(), nullptr, Alignment,
      CI->getArgOperand(OpndIdx)->getName(), F.getEntryBlock().begin());

  IRBuilder<> Builder(CI);
  Value *Size = Builder.getInt64(DL.getTypeAllocSize(AggTy));

  // Copy data from the byval operand into the temporary variable.
  Builder.CreateMemCpy(NewAlloca, /*DstAlign*/ Alignment,
                       CI->getArgOperand(OpndIdx),
                       /*SrcAlign*/ Alignment, Size);
  CI->setArgOperand(OpndIdx, NewAlloca);
}

// Creates a copy from the temporary variable (holding the value of the ByVal
// argument) into the corresponding function argument location.
void TailRecursionEliminator::copyLocalTempOfByValueOperandIntoArguments(
    CallInst *CI, int OpndIdx) {
  Type *AggTy = CI->getParamByValType(OpndIdx);
  assert(AggTy);
  const DataLayout &DL = F.getDataLayout();

  // Get alignment of byVal operand.
  Align Alignment(CI->getParamAlign(OpndIdx).valueOrOne());

  IRBuilder<> Builder(CI);
  Value *Size = Builder.getInt64(DL.getTypeAllocSize(AggTy));

  // Copy data from the temporary variable into the corresponding
  // function argument location.
  Builder.CreateMemCpy(F.getArg(OpndIdx), /*DstAlign*/ Alignment,
                       CI->getArgOperand(OpndIdx),
                       /*SrcAlign*/ Alignment, Size);
}

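// Try to eliminate the given recursive tail call: check that everything
// between the call and the return can be moved above the call (or folded into
// an accumulator), wire the call's actual arguments into the argument PHIs,
// and replace the call/ret pair with a branch back to HeaderBB.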
bool TailRecursionEliminator::eliminateCall(CallInst *CI) {
  ReturnInst *Ret = cast<ReturnInst>(CI->getParent()->getTerminator());

  // Ok, we found a potential tail call. We can currently only transform the
  // tail call if all of the instructions between the call and the return are
  // movable to above the call itself, leaving the call next to the return.
  // Check that this is the case now.
  Instruction *AccRecInstr = nullptr;
  BasicBlock::iterator BBI(CI);
  for (++BBI; &*BBI != Ret; ++BBI) {
    if (canMoveAboveCall(&*BBI, CI, AA))
      continue;

    // If we can't move the instruction above the call, it might be because it
    // is an associative and commutative operation that could be transformed
    // using accumulator recursion elimination. Check to see if this is the
    // case, and if so, remember which instruction accumulates for later.
    if (AccPN || !canTransformAccumulatorRecursion(&*BBI, CI))
      return false; // We cannot eliminate the tail recursion!

    // Yes, this is accumulator recursion. Remember which instruction
    // accumulates.
    AccRecInstr = &*BBI;
  }

  BasicBlock *BB = Ret->getParent();

  using namespace ore;
  ORE->emit([&]() {
    return OptimizationRemark(DEBUG_TYPE, "tailcall-recursion", CI)
           << "transforming tail recursion into loop";
  });

  // OK! We can transform this tail call. If this is the first one found,
  // create the new entry block, allowing us to branch back to the old entry.
  if (!HeaderBB)
    createTailRecurseLoopHeader(CI);

  // Copy values of ByVal operands into local temporary variables.
  for (unsigned I = 0, E = CI->arg_size(); I != E; ++I) {
    if (CI->isByValArgument(I))
      copyByValueOperandIntoLocalTemp(CI, I);
  }

  // Ok, now that we know we have a pseudo-entry block WITH all of the
  // required PHI nodes, add entries into the PHI node for the actual
  // parameters passed into the tail-recursive call.
  for (unsigned I = 0, E = CI->arg_size(); I != E; ++I) {
    if (CI->isByValArgument(I)) {
      copyLocalTempOfByValueOperandIntoArguments(CI, I);
      // When eliminating a tail call, we modify the values of the arguments.
      // Therefore, if the byval parameter has a readonly attribute, we have to
      // remove it. It is safe because, from the perspective of a caller, the
      // byval parameter is always treated as "readonly," even if the readonly
      // attribute is removed.
      F.removeParamAttr(I, Attribute::ReadOnly);
      ArgumentPHIs[I]->addIncoming(F.getArg(I), BB);
    } else
      ArgumentPHIs[I]->addIncoming(CI->getArgOperand(I), BB);
  }

  if (AccRecInstr) {
    insertAccumulator(AccRecInstr);

    // Rewrite the accumulator recursion instruction so that it does not use
    // the result of the call anymore, instead, use the PHI node we just
    // inserted.
    AccRecInstr->setOperand(AccRecInstr->getOperand(0) != CI, AccPN);
  }

  // Update our return value tracking.
  if (RetPN) {
    if (Ret->getReturnValue() == CI || AccRecInstr) {
      // Defer selecting a return value.
      RetPN->addIncoming(RetPN, BB);
      RetKnownPN->addIncoming(RetKnownPN, BB);
    } else {
      // We found a return value we want to use, insert a select instruction to
      // select it if we don't already know what our return value will be and
      // store the result in our return value PHI node.
      SelectInst *SI =
          SelectInst::Create(RetKnownPN, RetPN, Ret->getReturnValue(),
                             "current.ret.tr", Ret->getIterator());
      RetSelects.push_back(SI);

      RetPN->addIncoming(SI, BB);
      RetKnownPN->addIncoming(ConstantInt::getTrue(RetKnownPN->getType()), BB);
    }

    if (AccPN)
      AccPN->addIncoming(AccRecInstr ? AccRecInstr : AccPN, BB);
  }

  // Now that all of the PHI nodes are in place, remove the call and
  // ret instructions, replacing them with an unconditional branch.
  BranchInst *NewBI = BranchInst::Create(HeaderBB, Ret->getIterator());
  NewBI->setDebugLoc(CI->getDebugLoc());

  Ret->eraseFromParent(); // Remove return.
  CI->eraseFromParent();  // Remove call.
  DTU.applyUpdates({{DominatorTree::Insert, BB, HeaderBB}});
  ++NumEliminated;
  return true;
}

void TailRecursionEliminator::cleanupAndFinalize() {
  // If we eliminated any tail recursions, it's possible that we inserted some
  // silly PHI nodes which just merge an initial value (the incoming operand)
  // with themselves. Check to see if we did and clean up our mess if so. This
  // occurs when a function passes an argument straight through to its tail
  // call.
  for (PHINode *PN : ArgumentPHIs) {
    // If the PHI Node is a dynamic constant, replace it with the value it is.
    if (Value *PNV = simplifyInstruction(PN, F.getDataLayout())) {
      PN->replaceAllUsesWith(PNV);
      PN->eraseFromParent();
    }
  }

  if (RetPN) {
    if (RetSelects.empty()) {
      // If we didn't insert any select instructions, then we know we didn't
      // store a return value and we can remove the PHI nodes we inserted.
      RetPN->dropAllReferences();
      RetPN->eraseFromParent();

      RetKnownPN->dropAllReferences();
      RetKnownPN->eraseFromParent();

      if (AccPN) {
        // We need to insert a copy of our accumulator instruction before any
        // return in the function, and return its result instead.
        Instruction *AccRecInstr = AccumulatorRecursionInstr;
        for (BasicBlock &BB : F) {
          ReturnInst *RI = dyn_cast<ReturnInst>(BB.getTerminator());
          if (!RI)
            continue;

          Instruction *AccRecInstrNew = AccRecInstr->clone();
          AccRecInstrNew->setName("accumulator.ret.tr");
          AccRecInstrNew->setOperand(AccRecInstr->getOperand(0) == AccPN,
                                     RI->getOperand(0));
          AccRecInstrNew->insertBefore(RI);
          AccRecInstrNew->dropLocation();
          RI->setOperand(0, AccRecInstrNew);
        }
      }
    } else {
      // We need to insert a select instruction before any return left in the
      // function to select our stored return value if we have one.
      for (BasicBlock &BB : F) {
        ReturnInst *RI = dyn_cast<ReturnInst>(BB.getTerminator());
        if (!RI)
          continue;

        SelectInst *SI =
            SelectInst::Create(RetKnownPN, RetPN, RI->getOperand(0),
                               "current.ret.tr", RI->getIterator());
        RetSelects.push_back(SI);
        RI->setOperand(0, SI);
      }

      if (AccPN) {
        // We need to insert a copy of our accumulator instruction before any
        // of the selects we inserted, and select its result instead.
        Instruction *AccRecInstr = AccumulatorRecursionInstr;
        for (SelectInst *SI : RetSelects) {
          Instruction *AccRecInstrNew = AccRecInstr->clone();
          AccRecInstrNew->setName("accumulator.ret.tr");
          AccRecInstrNew->setOperand(AccRecInstr->getOperand(0) == AccPN,
                                     SI->getFalseValue());
          AccRecInstrNew->insertBefore(SI);
          AccRecInstrNew->dropLocation();
          SI->setFalseValue(AccRecInstrNew);
        }
      }
    }
  }
}

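// Look for a TRE candidate ending BB. If BB ends in an unconditional branch
// to a return-only block, first duplicate the return into BB with
// FoldReturnIntoUncondBranch so the call and return become adjacent, then try
// to eliminate the call.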
bool TailRecursionEliminator::processBlock(BasicBlock &BB) {
  Instruction *TI = BB.getTerminator();

  if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
    if (BI->isConditional())
      return false;

    BasicBlock *Succ = BI->getSuccessor(0);
    ReturnInst *Ret = dyn_cast<ReturnInst>(Succ->getFirstNonPHIOrDbg(true));

    if (!Ret)
      return false;

    CallInst *CI = findTRECandidate(&BB);

    if (!CI)
      return false;

    LLVM_DEBUG(dbgs() << "FOLDING: " << *Succ
                      << "INTO UNCOND BRANCH PRED: " << BB);
    FoldReturnIntoUncondBranch(Ret, Succ, &BB, &DTU);
    ++NumRetDuped;

    // If all predecessors of Succ have been eliminated by
    // FoldReturnIntoUncondBranch, delete it. It is important to empty it,
    // because the ret instruction in there is still using a value which
    // eliminateCall will attempt to remove. This block can only contain
    // instructions that can't have uses, therefore it is safe to remove.
    if (pred_empty(Succ))
      DTU.deleteBB(Succ);

    eliminateCall(CI);
    return true;
  } else if (isa<ReturnInst>(TI)) {
    CallInst *CI = findTRECandidate(&BB);

    if (CI)
      return eliminateCall(CI);
  }

  return false;
}

bool TailRecursionEliminator::eliminate(Function &F,
                                        const TargetTransformInfo *TTI,
                                        AliasAnalysis *AA,
                                        OptimizationRemarkEmitter *ORE,
                                        DomTreeUpdater &DTU) {
  if (F.getFnAttribute("disable-tail-calls").getValueAsBool())
    return false;

  bool MadeChange = false;
  MadeChange |= markTails(F, ORE);

  // If this function is a varargs function, we won't be able to PHI the args
  // right, so don't even try to convert it...
  if (F.getFunctionType()->isVarArg())
    return MadeChange;

  if (!canTRE(F))
    return MadeChange;

  // Change any tail recursive calls to loops.
  TailRecursionEliminator TRE(F, TTI, AA, ORE, DTU);

  for (BasicBlock &BB : F)
    MadeChange |= TRE.processBlock(BB);

  TRE.cleanupAndFinalize();

  return MadeChange;
}

namespace {
struct TailCallElim : public FunctionPass {
  static char ID; // Pass identification, replacement for typeid
  TailCallElim() : FunctionPass(ID) {
    initializeTailCallElimPass(*PassRegistry::getPassRegistry());
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<TargetTransformInfoWrapperPass>();
    AU.addRequired<AAResultsWrapperPass>();
    AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
    AU.addPreserved<GlobalsAAWrapperPass>();
    AU.addPreserved<DominatorTreeWrapperPass>();
    AU.addPreserved<PostDominatorTreeWrapperPass>();
  }

  bool runOnFunction(Function &F) override {
    if (skipFunction(F))
      return false;

    auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
    auto *DT = DTWP ? &DTWP->getDomTree() : nullptr;
    auto *PDTWP = getAnalysisIfAvailable<PostDominatorTreeWrapperPass>();
    auto *PDT = PDTWP ? &PDTWP->getPostDomTree() : nullptr;
    // There is no noticeable performance difference here between Lazy and
    // Eager UpdateStrategy based on some test results. It is feasible to
    // switch the UpdateStrategy to Lazy if we find it profitable later.
    DomTreeUpdater DTU(DT, PDT, DomTreeUpdater::UpdateStrategy::Eager);

    return TailRecursionEliminator::eliminate(
        F, &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F),
        &getAnalysis<AAResultsWrapperPass>().getAAResults(),
        &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(), DTU);
  }
};
}

char TailCallElim::ID = 0;
INITIALIZE_PASS_BEGIN(TailCallElim, "tailcallelim", "Tail Call Elimination",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
INITIALIZE_PASS_END(TailCallElim, "tailcallelim", "Tail Call Elimination",
                    false, false)

// Public interface to the TailCallElimination pass.
FunctionPass *llvm::createTailCallEliminationPass() {
  return new TailCallElim();
}

PreservedAnalyses TailCallElimPass::run(Function &F,
                                        FunctionAnalysisManager &AM) {

  TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);
  AliasAnalysis &AA = AM.getResult<AAManager>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
  auto *DT = AM.getCachedResult<DominatorTreeAnalysis>(F);
  auto *PDT = AM.getCachedResult<PostDominatorTreeAnalysis>(F);
  // There is no noticeable performance difference here between Lazy and Eager
  // UpdateStrategy based on some test results. It is feasible to switch the
  // UpdateStrategy to Lazy if we find it profitable later.
  DomTreeUpdater DTU(DT, PDT, DomTreeUpdater::UpdateStrategy::Eager);
  bool Changed = TailRecursionEliminator::eliminate(F, &TTI, &AA, &ORE, DTU);

  if (!Changed)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;
  PA.preserve<DominatorTreeAnalysis>();
  PA.preserve<PostDominatorTreeAnalysis>();
  return PA;
}