1//===-- LoopUnrollAndJam.cpp - Loop unrolling utilities -------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements loop unroll and jam as a routine, much like
10// LoopUnroll.cpp implements loop unroll.
11//
12//===----------------------------------------------------------------------===//
13
14#include "llvm/ADT/ArrayRef.h"
15#include "llvm/ADT/DenseMap.h"
16#include "llvm/ADT/STLExtras.h"
17#include "llvm/ADT/SmallPtrSet.h"
18#include "llvm/ADT/SmallVector.h"
19#include "llvm/ADT/Statistic.h"
20#include "llvm/ADT/StringRef.h"
21#include "llvm/ADT/Twine.h"
22#include "llvm/Analysis/AssumptionCache.h"
23#include "llvm/Analysis/DependenceAnalysis.h"
24#include "llvm/Analysis/DomTreeUpdater.h"
25#include "llvm/Analysis/LoopInfo.h"
26#include "llvm/Analysis/LoopIterator.h"
27#include "llvm/Analysis/MustExecute.h"
28#include "llvm/Analysis/OptimizationRemarkEmitter.h"
29#include "llvm/Analysis/ScalarEvolution.h"
30#include "llvm/IR/BasicBlock.h"
31#include "llvm/IR/DebugInfoMetadata.h"
32#include "llvm/IR/DebugLoc.h"
33#include "llvm/IR/DiagnosticInfo.h"
34#include "llvm/IR/Dominators.h"
35#include "llvm/IR/Function.h"
36#include "llvm/IR/Instruction.h"
37#include "llvm/IR/Instructions.h"
38#include "llvm/IR/IntrinsicInst.h"
39#include "llvm/IR/User.h"
40#include "llvm/IR/Value.h"
41#include "llvm/IR/ValueHandle.h"
42#include "llvm/IR/ValueMap.h"
43#include "llvm/Support/Casting.h"
44#include "llvm/Support/Debug.h"
45#include "llvm/Support/ErrorHandling.h"
46#include "llvm/Support/GenericDomTree.h"
47#include "llvm/Support/raw_ostream.h"
48#include "llvm/Transforms/Utils/BasicBlockUtils.h"
49#include "llvm/Transforms/Utils/Cloning.h"
50#include "llvm/Transforms/Utils/LoopUtils.h"
51#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
52#include "llvm/Transforms/Utils/UnrollLoop.h"
53#include "llvm/Transforms/Utils/ValueMapper.h"
54#include <assert.h>
55#include <memory>
56#include <type_traits>
57#include <vector>
58
59using namespace llvm;
60
61#define DEBUG_TYPE "loop-unroll-and-jam"
62
63STATISTIC(NumUnrolledAndJammed, "Number of loops unroll and jammed");
64STATISTIC(NumCompletelyUnrolledAndJammed, "Number of loops unroll and jammed");
65
66typedef SmallPtrSet<BasicBlock *, 4> BasicBlockSet;
67
68// Partition blocks in an outer/inner loop pair into blocks before and after
69// the loop
70static bool partitionLoopBlocks(Loop &L, BasicBlockSet &ForeBlocks,
71 BasicBlockSet &AftBlocks, DominatorTree &DT) {
72 Loop *SubLoop = L.getSubLoops()[0];
73 BasicBlock *SubLoopLatch = SubLoop->getLoopLatch();
74
75 for (BasicBlock *BB : L.blocks()) {
76 if (!SubLoop->contains(BB)) {
77 if (DT.dominates(A: SubLoopLatch, B: BB))
78 AftBlocks.insert(Ptr: BB);
79 else
80 ForeBlocks.insert(Ptr: BB);
81 }
82 }
83
84 // Check that all blocks in ForeBlocks together dominate the subloop
85 // TODO: This might ideally be done better with a dominator/postdominators.
86 BasicBlock *SubLoopPreHeader = SubLoop->getLoopPreheader();
87 for (BasicBlock *BB : ForeBlocks) {
88 if (BB == SubLoopPreHeader)
89 continue;
90 Instruction *TI = BB->getTerminator();
91 for (BasicBlock *Succ : successors(I: TI))
92 if (!ForeBlocks.count(Ptr: Succ))
93 return false;
94 }
95
96 return true;
97}
98
99/// Partition blocks in a loop nest into blocks before and after each inner
100/// loop.
101static bool partitionOuterLoopBlocks(
102 Loop &Root, Loop &JamLoop, BasicBlockSet &JamLoopBlocks,
103 DenseMap<Loop *, BasicBlockSet> &ForeBlocksMap,
104 DenseMap<Loop *, BasicBlockSet> &AftBlocksMap, DominatorTree &DT) {
105 JamLoopBlocks.insert_range(R: JamLoop.blocks());
106
107 for (Loop *L : Root.getLoopsInPreorder()) {
108 if (L == &JamLoop)
109 break;
110
111 if (!partitionLoopBlocks(L&: *L, ForeBlocks&: ForeBlocksMap[L], AftBlocks&: AftBlocksMap[L], DT))
112 return false;
113 }
114
115 return true;
116}
117
118// TODO Remove when UnrollAndJamLoop changed to support unroll and jamming more
119// than 2 levels loop.
120static bool partitionOuterLoopBlocks(Loop *L, Loop *SubLoop,
121 BasicBlockSet &ForeBlocks,
122 BasicBlockSet &SubLoopBlocks,
123 BasicBlockSet &AftBlocks,
124 DominatorTree *DT) {
125 SubLoopBlocks.insert_range(R: SubLoop->blocks());
126 return partitionLoopBlocks(L&: *L, ForeBlocks, AftBlocks, DT&: *DT);
127}
128
129// Looks at the phi nodes in Header for values coming from Latch. For these
130// instructions and all their operands calls Visit on them, keeping going for
131// all the operands in AftBlocks. Returns false if Visit returns false,
132// otherwise returns true. This is used to process the instructions in the
133// Aft blocks that need to be moved before the subloop. It is used in two
134// places. One to check that the required set of instructions can be moved
135// before the loop. Then to collect the instructions to actually move in
136// moveHeaderPhiOperandsToForeBlocks.
137template <typename T>
138static bool processHeaderPhiOperands(BasicBlock *Header, BasicBlock *Latch,
139 BasicBlockSet &AftBlocks, T Visit) {
140 SmallPtrSet<Instruction *, 8> VisitedInstr;
141
142 std::function<bool(Instruction * I)> ProcessInstr = [&](Instruction *I) {
143 if (!VisitedInstr.insert(Ptr: I).second)
144 return true;
145
146 if (AftBlocks.count(Ptr: I->getParent()))
147 for (auto &U : I->operands())
148 if (Instruction *II = dyn_cast<Instruction>(Val&: U))
149 if (!ProcessInstr(II))
150 return false;
151
152 return Visit(I);
153 };
154
155 for (auto &Phi : Header->phis()) {
156 Value *V = Phi.getIncomingValueForBlock(BB: Latch);
157 if (Instruction *I = dyn_cast<Instruction>(Val: V))
158 if (!ProcessInstr(I))
159 return false;
160 }
161
162 return true;
163}
164
165// Move the phi operands of Header from Latch out of AftBlocks to InsertLoc.
166static void moveHeaderPhiOperandsToForeBlocks(BasicBlock *Header,
167 BasicBlock *Latch,
168 BasicBlock::iterator InsertLoc,
169 BasicBlockSet &AftBlocks) {
170 // We need to ensure we move the instructions in the correct order,
171 // starting with the earliest required instruction and moving forward.
172 processHeaderPhiOperands(Header, Latch, AftBlocks,
173 Visit: [&AftBlocks, &InsertLoc](Instruction *I) {
174 if (AftBlocks.count(Ptr: I->getParent()))
175 I->moveBefore(InsertPos: InsertLoc);
176 return true;
177 });
178}
179
180/*
181 This method performs Unroll and Jam. For a simple loop like:
182 for (i = ..)
183 Fore(i)
184 for (j = ..)
185 SubLoop(i, j)
186 Aft(i)
187
188 Instead of doing normal inner or outer unrolling, we do:
189 for (i = .., i+=2)
190 Fore(i)
191 Fore(i+1)
192 for (j = ..)
193 SubLoop(i, j)
194 SubLoop(i+1, j)
195 Aft(i)
196 Aft(i+1)
197
198 So the outer loop is essetially unrolled and then the inner loops are fused
199 ("jammed") together into a single loop. This can increase speed when there
200 are loads in SubLoop that are invariant to i, as they become shared between
201 the now jammed inner loops.
202
203 We do this by spliting the blocks in the loop into Fore, Subloop and Aft.
204 Fore blocks are those before the inner loop, Aft are those after. Normal
205 Unroll code is used to copy each of these sets of blocks and the results are
206 combined together into the final form above.
207
208 isSafeToUnrollAndJam should be used prior to calling this to make sure the
209 unrolling will be valid. Checking profitablility is also advisable.
210
211 If EpilogueLoop is non-null, it receives the epilogue loop (if it was
212 necessary to create one and not fully unrolled).
213*/
214LoopUnrollResult
215llvm::UnrollAndJamLoop(Loop *L, unsigned Count, unsigned TripCount,
216 unsigned TripMultiple, bool UnrollRemainder,
217 LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
218 AssumptionCache *AC, const TargetTransformInfo *TTI,
219 OptimizationRemarkEmitter *ORE, Loop **EpilogueLoop) {
220
221 // When we enter here we should have already checked that it is safe
222 BasicBlock *Header = L->getHeader();
223 assert(Header && "No header.");
224 assert(L->getSubLoops().size() == 1);
225 Loop *SubLoop = *L->begin();
226
227 // Don't enter the unroll code if there is nothing to do.
228 if (TripCount == 0 && Count < 2) {
229 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; almost nothing to do\n");
230 return LoopUnrollResult::Unmodified;
231 }
232
233 assert(Count > 0);
234 assert(TripMultiple > 0);
235 assert(TripCount == 0 || TripCount % TripMultiple == 0);
236
237 // Are we eliminating the loop control altogether?
238 bool CompletelyUnroll = (Count == TripCount);
239
240 // We use the runtime remainder in cases where we don't know trip multiple
241 if (TripMultiple % Count != 0) {
242 if (!UnrollRuntimeLoopRemainder(L, Count, /*AllowExpensiveTripCount*/ false,
243 /*UseEpilogRemainder*/ true,
244 UnrollRemainder, /*ForgetAllSCEV*/ false,
245 LI, SE, DT, AC, TTI, PreserveLCSSA: true,
246 SCEVExpansionBudget: SCEVCheapExpansionBudget, RuntimeUnrollMultiExit: EpilogueLoop)) {
247 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; remainder loop could not be "
248 "generated when assuming runtime trip count\n");
249 return LoopUnrollResult::Unmodified;
250 }
251 }
252
253 // Notify ScalarEvolution that the loop will be substantially changed,
254 // if not outright eliminated.
255 if (SE) {
256 SE->forgetLoop(L);
257 SE->forgetBlockAndLoopDispositions();
258 }
259
260 using namespace ore;
261 // Report the unrolling decision.
262 if (CompletelyUnroll) {
263 LLVM_DEBUG(dbgs() << "COMPLETELY UNROLL AND JAMMING loop %"
264 << Header->getName() << " with trip count " << TripCount
265 << "!\n");
266 ORE->emit(OptDiag: OptimizationRemark(DEBUG_TYPE, "FullyUnrolled", L->getStartLoc(),
267 L->getHeader())
268 << "completely unroll and jammed loop with "
269 << NV("UnrollCount", TripCount) << " iterations");
270 } else {
271 auto DiagBuilder = [&]() {
272 OptimizationRemark Diag(DEBUG_TYPE, "PartialUnrolled", L->getStartLoc(),
273 L->getHeader());
274 return Diag << "unroll and jammed loop by a factor of "
275 << NV("UnrollCount", Count);
276 };
277
278 LLVM_DEBUG(dbgs() << "UNROLL AND JAMMING loop %" << Header->getName()
279 << " by " << Count);
280 if (TripMultiple != 1) {
281 LLVM_DEBUG(dbgs() << " with " << TripMultiple << " trips per branch");
282 ORE->emit(RemarkBuilder: [&]() {
283 return DiagBuilder() << " with " << NV("TripMultiple", TripMultiple)
284 << " trips per branch";
285 });
286 } else {
287 LLVM_DEBUG(dbgs() << " with run-time trip count");
288 ORE->emit(RemarkBuilder: [&]() { return DiagBuilder() << " with run-time trip count"; });
289 }
290 LLVM_DEBUG(dbgs() << "!\n");
291 }
292
293 BasicBlock *Preheader = L->getLoopPreheader();
294 BasicBlock *LatchBlock = L->getLoopLatch();
295 assert(Preheader && "No preheader");
296 assert(LatchBlock && "No latch block");
297 BranchInst *BI = dyn_cast<BranchInst>(Val: LatchBlock->getTerminator());
298 assert(BI && !BI->isUnconditional());
299 bool ContinueOnTrue = L->contains(BB: BI->getSuccessor(i: 0));
300 BasicBlock *LoopExit = BI->getSuccessor(i: ContinueOnTrue);
301 bool SubLoopContinueOnTrue = SubLoop->contains(
302 BB: SubLoop->getLoopLatch()->getTerminator()->getSuccessor(Idx: 0));
303
304 // Partition blocks in an outer/inner loop pair into blocks before and after
305 // the loop
306 BasicBlockSet SubLoopBlocks;
307 BasicBlockSet ForeBlocks;
308 BasicBlockSet AftBlocks;
309 partitionOuterLoopBlocks(L, SubLoop, ForeBlocks, SubLoopBlocks, AftBlocks,
310 DT);
311
312 // We keep track of the entering/first and exiting/last block of each of
313 // Fore/SubLoop/Aft in each iteration. This helps make the stapling up of
314 // blocks easier.
315 std::vector<BasicBlock *> ForeBlocksFirst;
316 std::vector<BasicBlock *> ForeBlocksLast;
317 std::vector<BasicBlock *> SubLoopBlocksFirst;
318 std::vector<BasicBlock *> SubLoopBlocksLast;
319 std::vector<BasicBlock *> AftBlocksFirst;
320 std::vector<BasicBlock *> AftBlocksLast;
321 ForeBlocksFirst.push_back(x: Header);
322 ForeBlocksLast.push_back(x: SubLoop->getLoopPreheader());
323 SubLoopBlocksFirst.push_back(x: SubLoop->getHeader());
324 SubLoopBlocksLast.push_back(x: SubLoop->getExitingBlock());
325 AftBlocksFirst.push_back(x: SubLoop->getExitBlock());
326 AftBlocksLast.push_back(x: L->getExitingBlock());
327 // Maps Blocks[0] -> Blocks[It]
328 ValueToValueMapTy LastValueMap;
329
330 // Move any instructions from fore phi operands from AftBlocks into Fore.
331 moveHeaderPhiOperandsToForeBlocks(
332 Header, Latch: LatchBlock, InsertLoc: ForeBlocksLast[0]->getTerminator()->getIterator(),
333 AftBlocks);
334
335 // The current on-the-fly SSA update requires blocks to be processed in
336 // reverse postorder so that LastValueMap contains the correct value at each
337 // exit.
338 LoopBlocksDFS DFS(L);
339 DFS.perform(LI);
340 // Stash the DFS iterators before adding blocks to the loop.
341 LoopBlocksDFS::RPOIterator BlockBegin = DFS.beginRPO();
342 LoopBlocksDFS::RPOIterator BlockEnd = DFS.endRPO();
343
344 // When a FSDiscriminator is enabled, we don't need to add the multiply
345 // factors to the discriminators.
346 if (Header->getParent()->shouldEmitDebugInfoForProfiling() &&
347 !EnableFSDiscriminator)
348 for (BasicBlock *BB : L->getBlocks())
349 for (Instruction &I : *BB)
350 if (!I.isDebugOrPseudoInst())
351 if (const DILocation *DIL = I.getDebugLoc()) {
352 auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(DF: Count);
353 if (NewDIL)
354 I.setDebugLoc(*NewDIL);
355 else
356 LLVM_DEBUG(dbgs()
357 << "Failed to create new discriminator: "
358 << DIL->getFilename() << " Line: " << DIL->getLine());
359 }
360
361 // Copy all blocks
362 for (unsigned It = 1; It != Count; ++It) {
363 SmallVector<BasicBlock *, 8> NewBlocks;
364 // Maps Blocks[It] -> Blocks[It-1]
365 DenseMap<Value *, Value *> PrevItValueMap;
366 SmallDenseMap<const Loop *, Loop *, 4> NewLoops;
367 NewLoops[L] = L;
368 NewLoops[SubLoop] = SubLoop;
369
370 for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) {
371 ValueToValueMapTy VMap;
372 BasicBlock *New = CloneBasicBlock(BB: *BB, VMap, NameSuffix: "." + Twine(It));
373 Header->getParent()->insert(Position: Header->getParent()->end(), BB: New);
374
375 // Tell LI about New.
376 addClonedBlockToLoopInfo(OriginalBB: *BB, ClonedBB: New, LI, NewLoops);
377
378 if (ForeBlocks.count(Ptr: *BB)) {
379 if (*BB == ForeBlocksFirst[0])
380 ForeBlocksFirst.push_back(x: New);
381 if (*BB == ForeBlocksLast[0])
382 ForeBlocksLast.push_back(x: New);
383 } else if (SubLoopBlocks.count(Ptr: *BB)) {
384 if (*BB == SubLoopBlocksFirst[0])
385 SubLoopBlocksFirst.push_back(x: New);
386 if (*BB == SubLoopBlocksLast[0])
387 SubLoopBlocksLast.push_back(x: New);
388 } else if (AftBlocks.count(Ptr: *BB)) {
389 if (*BB == AftBlocksFirst[0])
390 AftBlocksFirst.push_back(x: New);
391 if (*BB == AftBlocksLast[0])
392 AftBlocksLast.push_back(x: New);
393 } else {
394 llvm_unreachable("BB being cloned should be in Fore/Sub/Aft");
395 }
396
397 // Update our running maps of newest clones
398 auto &Last = LastValueMap[*BB];
399 PrevItValueMap[New] = (It == 1 ? *BB : Last);
400 Last = New;
401 for (ValueToValueMapTy::iterator VI = VMap.begin(), VE = VMap.end();
402 VI != VE; ++VI) {
403 auto &LVM = LastValueMap[VI->first];
404 PrevItValueMap[VI->second] =
405 const_cast<Value *>(It == 1 ? VI->first : LVM);
406 LVM = VI->second;
407 }
408
409 NewBlocks.push_back(Elt: New);
410
411 // Update DomTree:
412 if (*BB == ForeBlocksFirst[0])
413 DT->addNewBlock(BB: New, DomBB: ForeBlocksLast[It - 1]);
414 else if (*BB == SubLoopBlocksFirst[0])
415 DT->addNewBlock(BB: New, DomBB: SubLoopBlocksLast[It - 1]);
416 else if (*BB == AftBlocksFirst[0])
417 DT->addNewBlock(BB: New, DomBB: AftBlocksLast[It - 1]);
418 else {
419 // Each set of blocks (Fore/Sub/Aft) will have the same internal domtree
420 // structure.
421 auto BBDomNode = DT->getNode(BB: *BB);
422 auto BBIDom = BBDomNode->getIDom();
423 BasicBlock *OriginalBBIDom = BBIDom->getBlock();
424 assert(OriginalBBIDom);
425 assert(LastValueMap[cast<Value>(OriginalBBIDom)]);
426 DT->addNewBlock(
427 BB: New, DomBB: cast<BasicBlock>(Val&: LastValueMap[cast<Value>(Val: OriginalBBIDom)]));
428 }
429 }
430
431 // Remap all instructions in the most recent iteration
432 remapInstructionsInBlocks(Blocks: NewBlocks, VMap&: LastValueMap);
433 for (BasicBlock *NewBlock : NewBlocks) {
434 for (Instruction &I : *NewBlock) {
435 if (auto *II = dyn_cast<AssumeInst>(Val: &I))
436 AC->registerAssumption(CI: II);
437 }
438 }
439
440 // Alter the ForeBlocks phi's, pointing them at the latest version of the
441 // value from the previous iteration's phis
442 for (PHINode &Phi : ForeBlocksFirst[It]->phis()) {
443 Value *OldValue = Phi.getIncomingValueForBlock(BB: AftBlocksLast[It]);
444 assert(OldValue && "should have incoming edge from Aft[It]");
445 Value *NewValue = OldValue;
446 if (Value *PrevValue = PrevItValueMap[OldValue])
447 NewValue = PrevValue;
448
449 assert(Phi.getNumOperands() == 2);
450 Phi.setIncomingBlock(i: 0, BB: ForeBlocksLast[It - 1]);
451 Phi.setIncomingValue(i: 0, V: NewValue);
452 Phi.removeIncomingValue(Idx: 1);
453 }
454 }
455
456 // Now that all the basic blocks for the unrolled iterations are in place,
457 // finish up connecting the blocks and phi nodes. At this point LastValueMap
458 // is the last unrolled iterations values.
459
460 // Update Phis in BB from OldBB to point to NewBB and use the latest value
461 // from LastValueMap
462 auto updatePHIBlocksAndValues = [](BasicBlock *BB, BasicBlock *OldBB,
463 BasicBlock *NewBB,
464 ValueToValueMapTy &LastValueMap) {
465 for (PHINode &Phi : BB->phis()) {
466 for (unsigned b = 0; b < Phi.getNumIncomingValues(); ++b) {
467 if (Phi.getIncomingBlock(i: b) == OldBB) {
468 Value *OldValue = Phi.getIncomingValue(i: b);
469 if (Value *LastValue = LastValueMap[OldValue])
470 Phi.setIncomingValue(i: b, V: LastValue);
471 Phi.setIncomingBlock(i: b, BB: NewBB);
472 break;
473 }
474 }
475 }
476 };
477 // Move all the phis from Src into Dest
478 auto movePHIs = [](BasicBlock *Src, BasicBlock *Dest) {
479 BasicBlock::iterator insertPoint = Dest->getFirstNonPHIIt();
480 while (PHINode *Phi = dyn_cast<PHINode>(Val: Src->begin()))
481 Phi->moveBefore(BB&: *Dest, I: insertPoint);
482 };
483
484 // Update the PHI values outside the loop to point to the last block
485 updatePHIBlocksAndValues(LoopExit, AftBlocksLast[0], AftBlocksLast.back(),
486 LastValueMap);
487
488 // Update ForeBlocks successors and phi nodes
489 BranchInst *ForeTerm =
490 cast<BranchInst>(Val: ForeBlocksLast.back()->getTerminator());
491 assert(ForeTerm->getNumSuccessors() == 1 && "Expecting one successor");
492 ForeTerm->setSuccessor(idx: 0, NewSucc: SubLoopBlocksFirst[0]);
493
494 if (CompletelyUnroll) {
495 while (PHINode *Phi = dyn_cast<PHINode>(Val: ForeBlocksFirst[0]->begin())) {
496 Phi->replaceAllUsesWith(V: Phi->getIncomingValueForBlock(BB: Preheader));
497 Phi->eraseFromParent();
498 }
499 } else {
500 // Update the PHI values to point to the last aft block
501 updatePHIBlocksAndValues(ForeBlocksFirst[0], AftBlocksLast[0],
502 AftBlocksLast.back(), LastValueMap);
503 }
504
505 for (unsigned It = 1; It != Count; It++) {
506 // Remap ForeBlock successors from previous iteration to this
507 BranchInst *ForeTerm =
508 cast<BranchInst>(Val: ForeBlocksLast[It - 1]->getTerminator());
509 assert(ForeTerm->getNumSuccessors() == 1 && "Expecting one successor");
510 ForeTerm->setSuccessor(idx: 0, NewSucc: ForeBlocksFirst[It]);
511 }
512
513 // Subloop successors and phis
514 BranchInst *SubTerm =
515 cast<BranchInst>(Val: SubLoopBlocksLast.back()->getTerminator());
516 SubTerm->setSuccessor(idx: !SubLoopContinueOnTrue, NewSucc: SubLoopBlocksFirst[0]);
517 SubTerm->setSuccessor(idx: SubLoopContinueOnTrue, NewSucc: AftBlocksFirst[0]);
518 SubLoopBlocksFirst[0]->replacePhiUsesWith(Old: ForeBlocksLast[0],
519 New: ForeBlocksLast.back());
520 SubLoopBlocksFirst[0]->replacePhiUsesWith(Old: SubLoopBlocksLast[0],
521 New: SubLoopBlocksLast.back());
522
523 for (unsigned It = 1; It != Count; It++) {
524 // Replace the conditional branch of the previous iteration subloop with an
525 // unconditional one to this one
526 BranchInst *SubTerm =
527 cast<BranchInst>(Val: SubLoopBlocksLast[It - 1]->getTerminator());
528 BranchInst::Create(IfTrue: SubLoopBlocksFirst[It], InsertBefore: SubTerm->getIterator());
529 SubTerm->eraseFromParent();
530
531 SubLoopBlocksFirst[It]->replacePhiUsesWith(Old: ForeBlocksLast[It],
532 New: ForeBlocksLast.back());
533 SubLoopBlocksFirst[It]->replacePhiUsesWith(Old: SubLoopBlocksLast[It],
534 New: SubLoopBlocksLast.back());
535 movePHIs(SubLoopBlocksFirst[It], SubLoopBlocksFirst[0]);
536 }
537
538 // Aft blocks successors and phis
539 BranchInst *AftTerm = cast<BranchInst>(Val: AftBlocksLast.back()->getTerminator());
540 if (CompletelyUnroll) {
541 BranchInst::Create(IfTrue: LoopExit, InsertBefore: AftTerm->getIterator());
542 AftTerm->eraseFromParent();
543 } else {
544 AftTerm->setSuccessor(idx: !ContinueOnTrue, NewSucc: ForeBlocksFirst[0]);
545 assert(AftTerm->getSuccessor(ContinueOnTrue) == LoopExit &&
546 "Expecting the ContinueOnTrue successor of AftTerm to be LoopExit");
547 }
548 AftBlocksFirst[0]->replacePhiUsesWith(Old: SubLoopBlocksLast[0],
549 New: SubLoopBlocksLast.back());
550
551 for (unsigned It = 1; It != Count; It++) {
552 // Replace the conditional branch of the previous iteration subloop with an
553 // unconditional one to this one
554 BranchInst *AftTerm =
555 cast<BranchInst>(Val: AftBlocksLast[It - 1]->getTerminator());
556 BranchInst::Create(IfTrue: AftBlocksFirst[It], InsertBefore: AftTerm->getIterator());
557 AftTerm->eraseFromParent();
558
559 AftBlocksFirst[It]->replacePhiUsesWith(Old: SubLoopBlocksLast[It],
560 New: SubLoopBlocksLast.back());
561 movePHIs(AftBlocksFirst[It], AftBlocksFirst[0]);
562 }
563
564 DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
565 // Dominator Tree. Remove the old links between Fore, Sub and Aft, adding the
566 // new ones required.
567 if (Count != 1) {
568 SmallVector<DominatorTree::UpdateType, 4> DTUpdates;
569 DTUpdates.emplace_back(Args: DominatorTree::UpdateKind::Delete, Args&: ForeBlocksLast[0],
570 Args&: SubLoopBlocksFirst[0]);
571 DTUpdates.emplace_back(Args: DominatorTree::UpdateKind::Delete,
572 Args&: SubLoopBlocksLast[0], Args&: AftBlocksFirst[0]);
573
574 DTUpdates.emplace_back(Args: DominatorTree::UpdateKind::Insert,
575 Args&: ForeBlocksLast.back(), Args&: SubLoopBlocksFirst[0]);
576 DTUpdates.emplace_back(Args: DominatorTree::UpdateKind::Insert,
577 Args&: SubLoopBlocksLast.back(), Args&: AftBlocksFirst[0]);
578 DTU.applyUpdatesPermissive(Updates: DTUpdates);
579 }
580
581 // Merge adjacent basic blocks, if possible.
582 SmallPtrSet<BasicBlock *, 16> MergeBlocks;
583 MergeBlocks.insert_range(R&: ForeBlocksLast);
584 MergeBlocks.insert_range(R&: SubLoopBlocksLast);
585 MergeBlocks.insert_range(R&: AftBlocksLast);
586
587 MergeBlockSuccessorsIntoGivenBlocks(MergeBlocks, L, DTU: &DTU, LI);
588
589 // Apply updates to the DomTree.
590 DT = &DTU.getDomTree();
591
592 // At this point, the code is well formed. We now do a quick sweep over the
593 // inserted code, doing constant propagation and dead code elimination as we
594 // go.
595 simplifyLoopAfterUnroll(L: SubLoop, SimplifyIVs: true, LI, SE, DT, AC, TTI);
596 simplifyLoopAfterUnroll(L, SimplifyIVs: !CompletelyUnroll && Count > 1, LI, SE, DT, AC,
597 TTI);
598
599 NumCompletelyUnrolledAndJammed += CompletelyUnroll;
600 ++NumUnrolledAndJammed;
601
602 // Update LoopInfo if the loop is completely removed.
603 if (CompletelyUnroll)
604 LI->erase(L);
605
606#ifndef NDEBUG
607 // We shouldn't have done anything to break loop simplify form or LCSSA.
608 Loop *OutestLoop = SubLoop->getParentLoop()
609 ? SubLoop->getParentLoop()->getParentLoop()
610 ? SubLoop->getParentLoop()->getParentLoop()
611 : SubLoop->getParentLoop()
612 : SubLoop;
613 assert(DT->verify());
614 LI->verify(*DT);
615 assert(OutestLoop->isRecursivelyLCSSAForm(*DT, *LI));
616 if (!CompletelyUnroll)
617 assert(L->isLoopSimplifyForm());
618 assert(SubLoop->isLoopSimplifyForm());
619 SE->verify();
620#endif
621
622 return CompletelyUnroll ? LoopUnrollResult::FullyUnrolled
623 : LoopUnrollResult::PartiallyUnrolled;
624}
625
626static bool getLoadsAndStores(BasicBlockSet &Blocks,
627 SmallVector<Instruction *, 4> &MemInstr) {
628 // Scan the BBs and collect legal loads and stores.
629 // Returns false if non-simple loads/stores are found.
630 for (BasicBlock *BB : Blocks) {
631 for (Instruction &I : *BB) {
632 if (auto *Ld = dyn_cast<LoadInst>(Val: &I)) {
633 if (!Ld->isSimple())
634 return false;
635 MemInstr.push_back(Elt: &I);
636 } else if (auto *St = dyn_cast<StoreInst>(Val: &I)) {
637 if (!St->isSimple())
638 return false;
639 MemInstr.push_back(Elt: &I);
640 } else if (I.mayReadOrWriteMemory()) {
641 return false;
642 }
643 }
644 }
645 return true;
646}
647
648static bool preservesForwardDependence(Instruction *Src, Instruction *Dst,
649 unsigned UnrollLevel, unsigned JamLevel,
650 bool Sequentialized, Dependence *D) {
651 // UnrollLevel might carry the dependency Src --> Dst
652 // Does a different loop after unrolling?
653 for (unsigned CurLoopDepth = UnrollLevel + 1; CurLoopDepth <= JamLevel;
654 ++CurLoopDepth) {
655 auto JammedDir = D->getDirection(Level: CurLoopDepth);
656 if (JammedDir == Dependence::DVEntry::LT)
657 return true;
658
659 if (JammedDir & Dependence::DVEntry::GT)
660 return false;
661 }
662
663 return true;
664}
665
666static bool preservesBackwardDependence(Instruction *Src, Instruction *Dst,
667 unsigned UnrollLevel, unsigned JamLevel,
668 bool Sequentialized, Dependence *D) {
669 // UnrollLevel might carry the dependency Dst --> Src
670 for (unsigned CurLoopDepth = UnrollLevel + 1; CurLoopDepth <= JamLevel;
671 ++CurLoopDepth) {
672 auto JammedDir = D->getDirection(Level: CurLoopDepth);
673 if (JammedDir == Dependence::DVEntry::GT)
674 return true;
675
676 if (JammedDir & Dependence::DVEntry::LT)
677 return false;
678 }
679
680 // Backward dependencies are only preserved if not interleaved.
681 return Sequentialized;
682}
683
684// Check whether it is semantically safe Src and Dst considering any potential
685// dependency between them.
686//
687// @param UnrollLevel The level of the loop being unrolled
688// @param JamLevel The level of the loop being jammed; if Src and Dst are on
689// different levels, the outermost common loop counts as jammed level
690//
691// @return true if is safe and false if there is a dependency violation.
692static bool checkDependency(Instruction *Src, Instruction *Dst,
693 unsigned UnrollLevel, unsigned JamLevel,
694 bool Sequentialized, DependenceInfo &DI) {
695 assert(UnrollLevel <= JamLevel &&
696 "Expecting JamLevel to be at least UnrollLevel");
697
698 if (Src == Dst)
699 return true;
700 // Ignore Input dependencies.
701 if (isa<LoadInst>(Val: Src) && isa<LoadInst>(Val: Dst))
702 return true;
703
704 // Check whether unroll-and-jam may violate a dependency.
705 // By construction, every dependency will be lexicographically non-negative
706 // (if it was, it would violate the current execution order), such as
707 // (0,0,>,*,*)
708 // Unroll-and-jam changes the GT execution of two executions to the same
709 // iteration of the chosen unroll level. That is, a GT dependence becomes a GE
710 // dependence (or EQ, if we fully unrolled the loop) at the loop's position:
711 // (0,0,>=,*,*)
712 // Now, the dependency is not necessarily non-negative anymore, i.e.
713 // unroll-and-jam may violate correctness.
714 std::unique_ptr<Dependence> D = DI.depends(Src, Dst);
715 if (!D)
716 return true;
717 assert(D->isOrdered() && "Expected an output, flow or anti dep.");
718
719 if (D->isConfused()) {
720 LLVM_DEBUG(dbgs() << " Confused dependency between:\n"
721 << " " << *Src << "\n"
722 << " " << *Dst << "\n");
723 return false;
724 }
725
726 // If outer levels (levels enclosing the loop being unroll-and-jammed) have a
727 // non-equal direction, then the locations accessed in the inner levels cannot
728 // overlap in memory. We assumes the indexes never overlap into neighboring
729 // dimensions.
730 for (unsigned CurLoopDepth = 1; CurLoopDepth < UnrollLevel; ++CurLoopDepth)
731 if (!(D->getDirection(Level: CurLoopDepth) & Dependence::DVEntry::EQ))
732 return true;
733
734 auto UnrollDirection = D->getDirection(Level: UnrollLevel);
735
736 // If the distance carried by the unrolled loop is 0, then after unrolling
737 // that distance will become non-zero resulting in non-overlapping accesses in
738 // the inner loops.
739 if (UnrollDirection == Dependence::DVEntry::EQ)
740 return true;
741
742 if (UnrollDirection & Dependence::DVEntry::LT &&
743 !preservesForwardDependence(Src, Dst, UnrollLevel, JamLevel,
744 Sequentialized, D: D.get()))
745 return false;
746
747 if (UnrollDirection & Dependence::DVEntry::GT &&
748 !preservesBackwardDependence(Src, Dst, UnrollLevel, JamLevel,
749 Sequentialized, D: D.get()))
750 return false;
751
752 return true;
753}
754
755static bool
756checkDependencies(Loop &Root, const BasicBlockSet &SubLoopBlocks,
757 const DenseMap<Loop *, BasicBlockSet> &ForeBlocksMap,
758 const DenseMap<Loop *, BasicBlockSet> &AftBlocksMap,
759 DependenceInfo &DI, LoopInfo &LI) {
760 SmallVector<BasicBlockSet, 8> AllBlocks;
761 for (Loop *L : Root.getLoopsInPreorder())
762 if (ForeBlocksMap.contains(Val: L))
763 AllBlocks.push_back(Elt: ForeBlocksMap.lookup(Val: L));
764 AllBlocks.push_back(Elt: SubLoopBlocks);
765 for (Loop *L : Root.getLoopsInPreorder())
766 if (AftBlocksMap.contains(Val: L))
767 AllBlocks.push_back(Elt: AftBlocksMap.lookup(Val: L));
768
769 unsigned LoopDepth = Root.getLoopDepth();
770 SmallVector<Instruction *, 4> EarlierLoadsAndStores;
771 SmallVector<Instruction *, 4> CurrentLoadsAndStores;
772 for (BasicBlockSet &Blocks : AllBlocks) {
773 CurrentLoadsAndStores.clear();
774 if (!getLoadsAndStores(Blocks, MemInstr&: CurrentLoadsAndStores))
775 return false;
776
777 Loop *CurLoop = LI.getLoopFor(BB: (*Blocks.begin())->front().getParent());
778 unsigned CurLoopDepth = CurLoop->getLoopDepth();
779
780 for (auto *Earlier : EarlierLoadsAndStores) {
781 Loop *EarlierLoop = LI.getLoopFor(BB: Earlier->getParent());
782 unsigned EarlierDepth = EarlierLoop->getLoopDepth();
783 unsigned CommonLoopDepth = std::min(a: EarlierDepth, b: CurLoopDepth);
784 for (auto *Later : CurrentLoadsAndStores) {
785 if (!checkDependency(Src: Earlier, Dst: Later, UnrollLevel: LoopDepth, JamLevel: CommonLoopDepth, Sequentialized: false,
786 DI))
787 return false;
788 }
789 }
790
791 size_t NumInsts = CurrentLoadsAndStores.size();
792 for (size_t I = 0; I < NumInsts; ++I) {
793 for (size_t J = I; J < NumInsts; ++J) {
794 if (!checkDependency(Src: CurrentLoadsAndStores[I], Dst: CurrentLoadsAndStores[J],
795 UnrollLevel: LoopDepth, JamLevel: CurLoopDepth, Sequentialized: true, DI))
796 return false;
797 }
798 }
799
800 EarlierLoadsAndStores.append(in_start: CurrentLoadsAndStores.begin(),
801 in_end: CurrentLoadsAndStores.end());
802 }
803 return true;
804}
805
806static bool isEligibleLoopForm(const Loop &Root) {
807 // Root must have a child.
808 if (Root.getSubLoops().size() != 1)
809 return false;
810
811 const Loop *L = &Root;
812 do {
813 // All loops in Root need to be in simplify and rotated form.
814 if (!L->isLoopSimplifyForm())
815 return false;
816
817 if (!L->isRotatedForm())
818 return false;
819
820 if (L->getHeader()->hasAddressTaken()) {
821 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Address taken\n");
822 return false;
823 }
824
825 unsigned SubLoopsSize = L->getSubLoops().size();
826 if (SubLoopsSize == 0)
827 return true;
828
829 // Only one child is allowed.
830 if (SubLoopsSize != 1)
831 return false;
832
833 // Only loops with a single exit block can be unrolled and jammed.
834 // The function getExitBlock() is used for this check, rather than
835 // getUniqueExitBlock() to ensure loops with mulitple exit edges are
836 // disallowed.
837 if (!L->getExitBlock()) {
838 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; only loops with single exit "
839 "blocks can be unrolled and jammed.\n");
840 return false;
841 }
842
843 // Only loops with a single exiting block can be unrolled and jammed.
844 if (!L->getExitingBlock()) {
845 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; only loops with single "
846 "exiting blocks can be unrolled and jammed.\n");
847 return false;
848 }
849
850 L = L->getSubLoops()[0];
851 } while (L);
852
853 return true;
854}
855
856static Loop *getInnerMostLoop(Loop *L) {
857 while (!L->getSubLoops().empty())
858 L = L->getSubLoops()[0];
859 return L;
860}
861
862bool llvm::isSafeToUnrollAndJam(Loop *L, ScalarEvolution &SE, DominatorTree &DT,
863 DependenceInfo &DI, LoopInfo &LI) {
864 if (!isEligibleLoopForm(Root: *L)) {
865 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Ineligible loop form\n");
866 return false;
867 }
868
869 /* We currently handle outer loops like this:
870 |
871 ForeFirst <------\ }
872 Blocks | } ForeBlocks of L
873 ForeLast | }
874 | |
875 ... |
876 | |
877 ForeFirst <----\ | }
878 Blocks | | } ForeBlocks of a inner loop of L
879 ForeLast | | }
880 | | |
881 JamLoopFirst <\ | | }
882 Blocks | | | } JamLoopBlocks of the innermost loop
883 JamLoopLast -/ | | }
884 | | |
885 AftFirst | | }
886 Blocks | | } AftBlocks of a inner loop of L
887 AftLast ------/ | }
888 | |
889 ... |
890 | |
891 AftFirst | }
892 Blocks | } AftBlocks of L
893 AftLast --------/ }
894 |
895
896 There are (theoretically) any number of blocks in ForeBlocks, SubLoopBlocks
897 and AftBlocks, providing that there is one edge from Fores to SubLoops,
898 one edge from SubLoops to Afts and a single outer loop exit (from Afts).
899 In practice we currently limit Aft blocks to a single block, and limit
900 things further in the profitablility checks of the unroll and jam pass.
901
902 Because of the way we rearrange basic blocks, we also require that
903 the Fore blocks of L on all unrolled iterations are safe to move before the
904 blocks of the direct child of L of all iterations. So we require that the
905 phi node looping operands of ForeHeader can be moved to at least the end of
906 ForeEnd, so that we can arrange cloned Fore Blocks before the subloop and
907 match up Phi's correctly.
908
909 i.e. The old order of blocks used to be
910 (F1)1 (F2)1 J1_1 J1_2 (A2)1 (A1)1 (F1)2 (F2)2 J2_1 J2_2 (A2)2 (A1)2.
911 It needs to be safe to transform this to
912 (F1)1 (F1)2 (F2)1 (F2)2 J1_1 J1_2 J2_1 J2_2 (A2)1 (A2)2 (A1)1 (A1)2.
913
914 There are then a number of checks along the lines of no calls, no
915 exceptions, inner loop IV is consistent, etc. Note that for loops requiring
916 runtime unrolling, UnrollRuntimeLoopRemainder can also fail in
917 UnrollAndJamLoop if the trip count cannot be easily calculated.
918 */
919
920 // Split blocks into Fore/SubLoop/Aft based on dominators
921 Loop *JamLoop = getInnerMostLoop(L);
922 BasicBlockSet SubLoopBlocks;
923 DenseMap<Loop *, BasicBlockSet> ForeBlocksMap;
924 DenseMap<Loop *, BasicBlockSet> AftBlocksMap;
925 if (!partitionOuterLoopBlocks(Root&: *L, JamLoop&: *JamLoop, JamLoopBlocks&: SubLoopBlocks, ForeBlocksMap,
926 AftBlocksMap, DT)) {
927 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Incompatible loop layout\n");
928 return false;
929 }
930
931 // Aft blocks may need to move instructions to fore blocks, which becomes more
932 // difficult if there are multiple (potentially conditionally executed)
933 // blocks. For now we just exclude loops with multiple aft blocks.
934 if (AftBlocksMap[L].size() != 1) {
935 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Can't currently handle "
936 "multiple blocks after the loop\n");
937 return false;
938 }
939
940 // Check inner loop backedge count is consistent on all iterations of the
941 // outer loop
942 if (any_of(Range: L->getLoopsInPreorder(), P: [&SE](Loop *SubLoop) {
943 return !hasIterationCountInvariantInParent(L: SubLoop, SE);
944 })) {
945 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Inner loop iteration count is "
946 "not consistent on each iteration\n");
947 return false;
948 }
949
950 // Check the loop safety info for exceptions.
951 SimpleLoopSafetyInfo LSI;
952 LSI.computeLoopSafetyInfo(CurLoop: L);
953 if (LSI.anyBlockMayThrow()) {
954 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Something may throw\n");
955 return false;
956 }
957
958 // We've ruled out the easy stuff and now need to check that there are no
959 // interdependencies which may prevent us from moving the:
960 // ForeBlocks before Subloop and AftBlocks.
961 // Subloop before AftBlocks.
962 // ForeBlock phi operands before the subloop
963
964 // Make sure we can move all instructions we need to before the subloop
965 BasicBlock *Header = L->getHeader();
966 BasicBlock *Latch = L->getLoopLatch();
967 BasicBlockSet AftBlocks = AftBlocksMap[L];
968 Loop *SubLoop = L->getSubLoops()[0];
969 if (!processHeaderPhiOperands(
970 Header, Latch, AftBlocks, Visit: [&AftBlocks, &SubLoop](Instruction *I) {
971 if (SubLoop->contains(BB: I->getParent()))
972 return false;
973 if (AftBlocks.count(Ptr: I->getParent())) {
974 // If we hit a phi node in afts we know we are done (probably
975 // LCSSA)
976 if (isa<PHINode>(Val: I))
977 return false;
978 // Can't move instructions with side effects or memory
979 // reads/writes
980 if (I->mayHaveSideEffects() || I->mayReadOrWriteMemory())
981 return false;
982 }
983 // Keep going
984 return true;
985 })) {
986 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; can't move required "
987 "instructions after subloop to before it\n");
988 return false;
989 }
990
991 // Check for memory dependencies which prohibit the unrolling we are doing.
992 // Because of the way we are unrolling Fore/Sub/Aft blocks, we need to check
993 // there are no dependencies between Fore-Sub, Fore-Aft, Sub-Aft and Sub-Sub.
994 if (!checkDependencies(Root&: *L, SubLoopBlocks, ForeBlocksMap, AftBlocksMap, DI,
995 LI)) {
996 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; failed dependency check\n");
997 return false;
998 }
999
1000 return true;
1001}
1002