1//===- ModuloSchedule.cpp - Software pipeline schedule expansion ----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#include "llvm/CodeGen/ModuloSchedule.h"
10#include "llvm/ADT/StringExtras.h"
11#include "llvm/Analysis/MemoryLocation.h"
12#include "llvm/CodeGen/LiveIntervals.h"
13#include "llvm/CodeGen/MachineBasicBlock.h"
14#include "llvm/CodeGen/MachineInstrBuilder.h"
15#include "llvm/CodeGen/MachineLoopInfo.h"
16#include "llvm/CodeGen/MachineRegisterInfo.h"
17#include "llvm/InitializePasses.h"
18#include "llvm/MC/MCContext.h"
19#include "llvm/Support/Debug.h"
20#include "llvm/Support/ErrorHandling.h"
21#include "llvm/Support/raw_ostream.h"
22
23#define DEBUG_TYPE "pipeliner"
24using namespace llvm;
25
26static cl::opt<bool> SwapBranchTargetsMVE(
27 "pipeliner-swap-branch-targets-mve", cl::Hidden, cl::init(Val: false),
28 cl::desc("Swap target blocks of a conditional branch for MVE expander"));
29
30void ModuloSchedule::print(raw_ostream &OS) {
31 for (MachineInstr *MI : ScheduledInstrs)
32 OS << "[stage " << getStage(MI) << " @" << getCycle(MI) << "c] " << *MI;
33}
34
35//===----------------------------------------------------------------------===//
36// ModuloScheduleExpander implementation
37//===----------------------------------------------------------------------===//
38
39/// Return the register values for the operands of a Phi instruction.
/// This function assumes the instruction is a Phi.
41static void getPhiRegs(MachineInstr &Phi, MachineBasicBlock *Loop,
42 Register &InitVal, Register &LoopVal) {
43 assert(Phi.isPHI() && "Expecting a Phi.");
44
45 InitVal = Register();
46 LoopVal = Register();
47 for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
48 if (Phi.getOperand(i: i + 1).getMBB() != Loop)
49 InitVal = Phi.getOperand(i).getReg();
50 else
51 LoopVal = Phi.getOperand(i).getReg();
52
53 assert(InitVal && LoopVal && "Unexpected Phi structure.");
54}
55
56/// Return the Phi register value that comes from the incoming block.
57static Register getInitPhiReg(MachineInstr &Phi, MachineBasicBlock *LoopBB) {
58 for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
59 if (Phi.getOperand(i: i + 1).getMBB() != LoopBB)
60 return Phi.getOperand(i).getReg();
61 return Register();
62}
63
/// Return the Phi register value that comes from the loop block.
65static Register getLoopPhiReg(MachineInstr &Phi, MachineBasicBlock *LoopBB) {
66 for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
67 if (Phi.getOperand(i: i + 1).getMBB() == LoopBB)
68 return Phi.getOperand(i).getReg();
69 return Register();
70}
71
72void ModuloScheduleExpander::expand() {
73 BB = Schedule.getLoop()->getTopBlock();
74 Preheader = *BB->pred_begin();
75 if (Preheader == BB)
76 Preheader = *std::next(x: BB->pred_begin());
77
78 // Iterate over the definitions in each instruction, and compute the
79 // stage difference for each use. Keep the maximum value.
80 for (MachineInstr *MI : Schedule.getInstructions()) {
81 int DefStage = Schedule.getStage(MI);
82 for (const MachineOperand &Op : MI->all_defs()) {
83 Register Reg = Op.getReg();
84 unsigned MaxDiff = 0;
85 bool PhiIsSwapped = false;
86 for (MachineOperand &UseOp : MRI.use_operands(Reg)) {
87 MachineInstr *UseMI = UseOp.getParent();
88 int UseStage = Schedule.getStage(MI: UseMI);
89 unsigned Diff = 0;
90 if (UseStage != -1 && UseStage >= DefStage)
91 Diff = UseStage - DefStage;
92 if (MI->isPHI()) {
93 if (isLoopCarried(Phi&: *MI))
94 ++Diff;
95 else
96 PhiIsSwapped = true;
97 }
98 MaxDiff = std::max(a: Diff, b: MaxDiff);
99 }
100 RegToStageDiff[Reg] = std::make_pair(x&: MaxDiff, y&: PhiIsSwapped);
101 }
102 }
103
104 generatePipelinedLoop();
105}
106
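/// Generate the pipelined loop from the modulo schedule: prolog blocks that
/// ramp the pipeline up, a kernel block that executes the steady state, and
/// epilog blocks that drain the iterations still in flight when the kernel
/// exits.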
107void ModuloScheduleExpander::generatePipelinedLoop() {
108 LoopInfo = TII->analyzeLoopForPipelining(LoopBB: BB);
109 assert(LoopInfo && "Must be able to analyze loop!");
110
111 // Create a new basic block for the kernel and add it to the CFG.
112 MachineBasicBlock *KernelBB = MF.CreateMachineBasicBlock(BB: BB->getBasicBlock());
113
114 unsigned MaxStageCount = Schedule.getNumStages() - 1;
115
116 // Remember the registers that are used in different stages. The index is
117 // the iteration, or stage, that the instruction is scheduled in. This is
118 // a map between register names in the original block and the names created
119 // in each stage of the pipelined loop.
120 ValueMapTy *VRMap = new ValueMapTy[(MaxStageCount + 1) * 2];
121
  // The renaming destinations created by Phis for the registers that cross
  // stages. This map is updated during Phi generation to point to the most
  // recent renaming destination.
125 ValueMapTy *VRMapPhi = new ValueMapTy[(MaxStageCount + 1) * 2];
126
127 InstrMapTy InstrMap;
128
129 SmallVector<MachineBasicBlock *, 4> PrologBBs;
130
131 // Generate the prolog instructions that set up the pipeline.
132 generateProlog(LastStage: MaxStageCount, KernelBB, VRMap, PrologBBs);
133 MF.insert(MBBI: BB->getIterator(), MBB: KernelBB);
134 LIS.insertMBBInMaps(MBB: KernelBB);
135
136 // Rearrange the instructions to generate the new, pipelined loop,
137 // and update register names as needed.
138 for (MachineInstr *CI : Schedule.getInstructions()) {
139 if (CI->isPHI())
140 continue;
141 unsigned StageNum = Schedule.getStage(MI: CI);
142 MachineInstr *NewMI = cloneInstr(OldMI: CI, CurStageNum: MaxStageCount, InstStageNum: StageNum);
143 updateInstruction(NewMI, LastDef: false, CurStageNum: MaxStageCount, InstrStageNum: StageNum, VRMap);
144 KernelBB->push_back(MI: NewMI);
145 LIS.InsertMachineInstrInMaps(MI&: *NewMI);
146 InstrMap[NewMI] = CI;
147 }
148
149 // Copy any terminator instructions to the new kernel, and update
150 // names as needed.
151 for (MachineInstr &MI : BB->terminators()) {
152 MachineInstr *NewMI = MF.CloneMachineInstr(Orig: &MI);
153 updateInstruction(NewMI, LastDef: false, CurStageNum: MaxStageCount, InstrStageNum: 0, VRMap);
154 KernelBB->push_back(MI: NewMI);
155 LIS.InsertMachineInstrInMaps(MI&: *NewMI);
156 InstrMap[NewMI] = &MI;
157 }
158
159 NewKernel = KernelBB;
160 KernelBB->transferSuccessors(FromMBB: BB);
161 KernelBB->replaceSuccessor(Old: BB, New: KernelBB);
162
163 generateExistingPhis(NewBB: KernelBB, BB1: PrologBBs.back(), BB2: KernelBB, KernelBB, VRMap,
164 InstrMap, LastStageNum: MaxStageCount, CurStageNum: MaxStageCount, IsLast: false);
165 generatePhis(NewBB: KernelBB, BB1: PrologBBs.back(), BB2: KernelBB, KernelBB, VRMap, VRMapPhi,
166 InstrMap, LastStageNum: MaxStageCount, CurStageNum: MaxStageCount, IsLast: false);
167
168 LLVM_DEBUG(dbgs() << "New block\n"; KernelBB->dump(););
169
170 SmallVector<MachineBasicBlock *, 4> EpilogBBs;
171 // Generate the epilog instructions to complete the pipeline.
172 generateEpilog(LastStage: MaxStageCount, KernelBB, OrigBB: BB, VRMap, VRMapPhi, EpilogBBs,
173 PrologBBs);
174
  // We need this step because the register allocator doesn't handle some
  // situations well, so we insert copies to help out.
177 splitLifetimes(KernelBB, EpilogBBs);
178
179 // Remove dead instructions due to loop induction variables.
180 removeDeadInstructions(KernelBB, EpilogBBs);
181
182 // Add branches between prolog and epilog blocks.
183 addBranches(PreheaderBB&: *Preheader, PrologBBs, KernelBB, EpilogBBs, VRMap);
184
185 delete[] VRMap;
186 delete[] VRMapPhi;
187}
188
189void ModuloScheduleExpander::cleanup() {
190 // Remove the original loop since it's no longer referenced.
191 for (auto &I : *BB)
192 LIS.RemoveMachineInstrFromMaps(MI&: I);
193 BB->clear();
194 BB->eraseFromParent();
195}
196
197/// Generate the pipeline prolog code.
198void ModuloScheduleExpander::generateProlog(unsigned LastStage,
199 MachineBasicBlock *KernelBB,
200 ValueMapTy *VRMap,
201 MBBVectorTy &PrologBBs) {
202 MachineBasicBlock *PredBB = Preheader;
203 InstrMapTy InstrMap;
204
205 // Generate a basic block for each stage, not including the last stage,
206 // which will be generated in the kernel. Each basic block may contain
207 // instructions from multiple stages/iterations.
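  // Prolog block i holds clones of the instructions scheduled in stages
  // 0 through i, so each successive prolog block starts one more iteration.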
208 for (unsigned i = 0; i < LastStage; ++i) {
209 // Create and insert the prolog basic block prior to the original loop
210 // basic block. The original loop is removed later.
211 MachineBasicBlock *NewBB = MF.CreateMachineBasicBlock(BB: BB->getBasicBlock());
212 PrologBBs.push_back(Elt: NewBB);
213 MF.insert(MBBI: BB->getIterator(), MBB: NewBB);
214 NewBB->transferSuccessors(FromMBB: PredBB);
215 PredBB->addSuccessor(Succ: NewBB);
216 PredBB = NewBB;
217 LIS.insertMBBInMaps(MBB: NewBB);
218
219 // Generate instructions for each appropriate stage. Process instructions
220 // in original program order.
221 for (int StageNum = i; StageNum >= 0; --StageNum) {
222 for (MachineBasicBlock::iterator BBI = BB->instr_begin(),
223 BBE = BB->getFirstTerminator();
224 BBI != BBE; ++BBI) {
225 if (Schedule.getStage(MI: &*BBI) == StageNum) {
226 if (BBI->isPHI())
227 continue;
228 MachineInstr *NewMI =
229 cloneAndChangeInstr(OldMI: &*BBI, CurStageNum: i, InstStageNum: (unsigned)StageNum);
230 updateInstruction(NewMI, LastDef: false, CurStageNum: i, InstrStageNum: (unsigned)StageNum, VRMap);
231 NewBB->push_back(MI: NewMI);
232 LIS.InsertMachineInstrInMaps(MI&: *NewMI);
233 InstrMap[NewMI] = &*BBI;
234 }
235 }
236 }
237 rewritePhiValues(NewBB, StageNum: i, VRMap, InstrMap);
238 LLVM_DEBUG({
239 dbgs() << "prolog:\n";
240 NewBB->dump();
241 });
242 }
243
244 PredBB->replaceSuccessor(Old: BB, New: KernelBB);
245
246 // Check if we need to remove the branch from the preheader to the original
247 // loop, and replace it with a branch to the new loop.
248 unsigned numBranches = TII->removeBranch(MBB&: *Preheader);
249 if (numBranches) {
250 SmallVector<MachineOperand, 0> Cond;
251 TII->insertBranch(MBB&: *Preheader, TBB: PrologBBs[0], FBB: nullptr, Cond, DL: DebugLoc());
252 }
253}
254
255/// Generate the pipeline epilog code. The epilog code finishes the iterations
256/// that were started in either the prolog or the kernel. We create a basic
257/// block for each stage that needs to complete.
258void ModuloScheduleExpander::generateEpilog(
259 unsigned LastStage, MachineBasicBlock *KernelBB, MachineBasicBlock *OrigBB,
260 ValueMapTy *VRMap, ValueMapTy *VRMapPhi, MBBVectorTy &EpilogBBs,
261 MBBVectorTy &PrologBBs) {
262 // We need to change the branch from the kernel to the first epilog block, so
263 // this call to analyze branch uses the kernel rather than the original BB.
264 MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
265 SmallVector<MachineOperand, 4> Cond;
266 bool checkBranch = TII->analyzeBranch(MBB&: *KernelBB, TBB, FBB, Cond);
267 assert(!checkBranch && "generateEpilog must be able to analyze the branch");
268 if (checkBranch)
269 return;
270
271 MachineBasicBlock::succ_iterator LoopExitI = KernelBB->succ_begin();
272 if (*LoopExitI == KernelBB)
273 ++LoopExitI;
274 assert(LoopExitI != KernelBB->succ_end() && "Expecting a successor");
275 MachineBasicBlock *LoopExitBB = *LoopExitI;
276
277 MachineBasicBlock *PredBB = KernelBB;
278 MachineBasicBlock *EpilogStart = LoopExitBB;
279 InstrMapTy InstrMap;
280
281 // Generate a basic block for each stage, not including the last stage,
282 // which was generated for the kernel. Each basic block may contain
283 // instructions from multiple stages/iterations.
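  // The epilog block created for counter value i holds clones of the
  // instructions scheduled in stages i through LastStage, completing the
  // iterations begun in the prologs and the kernel.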
284 int EpilogStage = LastStage + 1;
285 for (unsigned i = LastStage; i >= 1; --i, ++EpilogStage) {
286 MachineBasicBlock *NewBB = MF.CreateMachineBasicBlock();
287 EpilogBBs.push_back(Elt: NewBB);
288 MF.insert(MBBI: BB->getIterator(), MBB: NewBB);
289
290 PredBB->replaceSuccessor(Old: LoopExitBB, New: NewBB);
291 NewBB->addSuccessor(Succ: LoopExitBB);
292 LIS.insertMBBInMaps(MBB: NewBB);
293
294 if (EpilogStart == LoopExitBB)
295 EpilogStart = NewBB;
296
297 // Add instructions to the epilog depending on the current block.
298 // Process instructions in original program order.
299 for (unsigned StageNum = i; StageNum <= LastStage; ++StageNum) {
300 for (auto &BBI : *BB) {
301 if (BBI.isPHI())
302 continue;
303 MachineInstr *In = &BBI;
304 if ((unsigned)Schedule.getStage(MI: In) == StageNum) {
305 // Instructions with memoperands in the epilog are updated with
306 // conservative values.
307 MachineInstr *NewMI = cloneInstr(OldMI: In, UINT_MAX, InstStageNum: 0);
308 updateInstruction(NewMI, LastDef: i == 1, CurStageNum: EpilogStage, InstrStageNum: 0, VRMap);
309 NewBB->push_back(MI: NewMI);
310 LIS.InsertMachineInstrInMaps(MI&: *NewMI);
311 InstrMap[NewMI] = In;
312 }
313 }
314 }
315 generateExistingPhis(NewBB, BB1: PrologBBs[i - 1], BB2: PredBB, KernelBB, VRMap,
316 InstrMap, LastStageNum: LastStage, CurStageNum: EpilogStage, IsLast: i == 1);
317 generatePhis(NewBB, BB1: PrologBBs[i - 1], BB2: PredBB, KernelBB, VRMap, VRMapPhi,
318 InstrMap, LastStageNum: LastStage, CurStageNum: EpilogStage, IsLast: i == 1);
319 PredBB = NewBB;
320
321 LLVM_DEBUG({
322 dbgs() << "epilog:\n";
323 NewBB->dump();
324 });
325 }
326
327 // Fix any Phi nodes in the loop exit block.
328 LoopExitBB->replacePhiUsesWith(Old: BB, New: PredBB);
329
330 // Create a branch to the new epilog from the kernel.
331 // Remove the original branch and add a new branch to the epilog.
332 TII->removeBranch(MBB&: *KernelBB);
333 assert((OrigBB == TBB || OrigBB == FBB) &&
334 "Unable to determine looping branch direction");
335 if (OrigBB != TBB)
336 TII->insertBranch(MBB&: *KernelBB, TBB: EpilogStart, FBB: KernelBB, Cond, DL: DebugLoc());
337 else
338 TII->insertBranch(MBB&: *KernelBB, TBB: KernelBB, FBB: EpilogStart, Cond, DL: DebugLoc());
339 // Add a branch to the loop exit.
340 if (EpilogBBs.size() > 0) {
341 MachineBasicBlock *LastEpilogBB = EpilogBBs.back();
342 SmallVector<MachineOperand, 4> Cond1;
343 TII->insertBranch(MBB&: *LastEpilogBB, TBB: LoopExitBB, FBB: nullptr, Cond: Cond1, DL: DebugLoc());
344 }
345}
346
347/// Replace all uses of FromReg that appear outside the specified
348/// basic block with ToReg.
349static void replaceRegUsesAfterLoop(Register FromReg, Register ToReg,
350 MachineBasicBlock *MBB,
351 MachineRegisterInfo &MRI) {
352 for (MachineOperand &O :
353 llvm::make_early_inc_range(Range: MRI.use_operands(Reg: FromReg)))
354 if (O.getParent()->getParent() != MBB)
355 O.setReg(ToReg);
356}
357
358/// Return true if the register has a use that occurs outside the
359/// specified loop.
360static bool hasUseAfterLoop(Register Reg, MachineBasicBlock *BB,
361 MachineRegisterInfo &MRI) {
362 for (const MachineOperand &MO : MRI.use_operands(Reg))
363 if (MO.getParent()->getParent() != BB)
364 return true;
365 return false;
366}
367
/// Generate Phis for the specified block in the generated pipelined code.
369/// This function looks at the Phis from the original code to guide the
370/// creation of new Phis.
371void ModuloScheduleExpander::generateExistingPhis(
372 MachineBasicBlock *NewBB, MachineBasicBlock *BB1, MachineBasicBlock *BB2,
373 MachineBasicBlock *KernelBB, ValueMapTy *VRMap, InstrMapTy &InstrMap,
374 unsigned LastStageNum, unsigned CurStageNum, bool IsLast) {
  // Compute the stage number for the initial value of the Phi, which
  // comes from the prolog. The prolog to use depends on which kernel or
  // epilog block we're adding the Phi to.
378 unsigned PrologStage = 0;
379 unsigned PrevStage = 0;
380 bool InKernel = (LastStageNum == CurStageNum);
381 if (InKernel) {
382 PrologStage = LastStageNum - 1;
383 PrevStage = CurStageNum;
384 } else {
385 PrologStage = LastStageNum - (CurStageNum - LastStageNum);
386 PrevStage = LastStageNum + (CurStageNum - LastStageNum) - 1;
387 }
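  // For example, with three stages (LastStageNum == 2): in the kernel,
  // PrologStage == 1 and PrevStage == 2; in the first epilog
  // (CurStageNum == 3), PrologStage == 1 and PrevStage == 2; in the second
  // epilog (CurStageNum == 4), PrologStage == 0 and PrevStage == 3.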
388
389 for (MachineBasicBlock::iterator BBI = BB->instr_begin(),
390 BBE = BB->getFirstNonPHI();
391 BBI != BBE; ++BBI) {
392 Register Def = BBI->getOperand(i: 0).getReg();
393
394 Register InitVal;
395 Register LoopVal;
396 getPhiRegs(Phi&: *BBI, Loop: BB, InitVal, LoopVal);
397
398 Register PhiOp1;
399 // The Phi value from the loop body typically is defined in the loop, but
400 // not always. So, we need to check if the value is defined in the loop.
401 Register PhiOp2 = LoopVal;
402 if (auto It = VRMap[LastStageNum].find(Val: LoopVal);
403 It != VRMap[LastStageNum].end())
404 PhiOp2 = It->second;
405
406 int StageScheduled = Schedule.getStage(MI: &*BBI);
407 int LoopValStage = Schedule.getStage(MI: MRI.getVRegDef(Reg: LoopVal));
408 unsigned NumStages = getStagesForReg(Reg: Def, CurStage: CurStageNum);
409 if (NumStages == 0) {
410 // We don't need to generate a Phi anymore, but we need to rename any uses
411 // of the Phi value.
412 Register NewReg = VRMap[PrevStage][LoopVal];
413 rewriteScheduledInstr(BB: NewBB, InstrMap, CurStageNum, PhiNum: 0, Phi: &*BBI, OldReg: Def,
414 NewReg: InitVal, PrevReg: NewReg);
415 auto It = VRMap[CurStageNum].find(Val: LoopVal);
416 if (It != VRMap[CurStageNum].end()) {
417 Register Reg = It->second;
418 VRMap[CurStageNum][Def] = Reg;
419 }
420 }
421 // Adjust the number of Phis needed depending on the number of prologs left,
422 // and the distance from where the Phi is first scheduled. The number of
423 // Phis cannot exceed the number of prolog stages. Each stage can
424 // potentially define two values.
425 unsigned MaxPhis = PrologStage + 2;
426 if (!InKernel && (int)PrologStage <= LoopValStage)
427 MaxPhis = std::max(a: (int)MaxPhis - LoopValStage, b: 1);
428 unsigned NumPhis = std::min(a: NumStages, b: MaxPhis);
429
430 Register NewReg;
431 unsigned AccessStage = (LoopValStage != -1) ? LoopValStage : StageScheduled;
432 // In the epilog, we may need to look back one stage to get the correct
433 // Phi name, because the epilog and prolog blocks execute the same stage.
434 // The correct name is from the previous block only when the Phi has
    // been completely scheduled prior to the epilog, and the Phi value is not
436 // needed in multiple stages.
437 int StageDiff = 0;
438 if (!InKernel && StageScheduled >= LoopValStage && AccessStage == 0 &&
439 NumPhis == 1)
440 StageDiff = 1;
441 // Adjust the computations below when the phi and the loop definition
442 // are scheduled in different stages.
443 if (InKernel && LoopValStage != -1 && StageScheduled > LoopValStage)
444 StageDiff = StageScheduled - LoopValStage;
445 for (unsigned np = 0; np < NumPhis; ++np) {
446 // If the Phi hasn't been scheduled, then use the initial Phi operand
447 // value. Otherwise, use the scheduled version of the instruction. This
448 // is a little complicated when a Phi references another Phi.
449 if (np > PrologStage || StageScheduled >= (int)LastStageNum)
450 PhiOp1 = InitVal;
451 // Check if the Phi has already been scheduled in a prolog stage.
452 else if (PrologStage >= AccessStage + StageDiff + np &&
453 VRMap[PrologStage - StageDiff - np].count(Val: LoopVal) != 0)
454 PhiOp1 = VRMap[PrologStage - StageDiff - np][LoopVal];
455 // Check if the Phi has already been scheduled, but the loop instruction
456 // is either another Phi, or doesn't occur in the loop.
457 else if (PrologStage >= AccessStage + StageDiff + np) {
458 // If the Phi references another Phi, we need to examine the other
459 // Phi to get the correct value.
460 PhiOp1 = LoopVal;
461 MachineInstr *InstOp1 = MRI.getVRegDef(Reg: PhiOp1);
462 int Indirects = 1;
463 while (InstOp1 && InstOp1->isPHI() && InstOp1->getParent() == BB) {
464 int PhiStage = Schedule.getStage(MI: InstOp1);
465 if ((int)(PrologStage - StageDiff - np) < PhiStage + Indirects)
466 PhiOp1 = getInitPhiReg(Phi&: *InstOp1, LoopBB: BB);
467 else
468 PhiOp1 = getLoopPhiReg(Phi&: *InstOp1, LoopBB: BB);
469 InstOp1 = MRI.getVRegDef(Reg: PhiOp1);
470 int PhiOpStage = Schedule.getStage(MI: InstOp1);
471 int StageAdj = (PhiOpStage != -1 ? PhiStage - PhiOpStage : 0);
472 if (PhiOpStage != -1 && PrologStage - StageAdj >= Indirects + np) {
473 auto &M = VRMap[PrologStage - StageAdj - Indirects - np];
474 if (auto It = M.find(Val: PhiOp1); It != M.end()) {
475 PhiOp1 = It->second;
476 break;
477 }
478 }
479 ++Indirects;
480 }
481 } else
482 PhiOp1 = InitVal;
483 // If this references a generated Phi in the kernel, get the Phi operand
484 // from the incoming block.
485 if (MachineInstr *InstOp1 = MRI.getVRegDef(Reg: PhiOp1))
486 if (InstOp1->isPHI() && InstOp1->getParent() == KernelBB)
487 PhiOp1 = getInitPhiReg(Phi&: *InstOp1, LoopBB: KernelBB);
488
489 MachineInstr *PhiInst = MRI.getVRegDef(Reg: LoopVal);
490 bool LoopDefIsPhi = PhiInst && PhiInst->isPHI();
      // In the epilog, a map lookup is needed to get the value from the
      // kernel, or from the previous epilog block. Which map entry to use
      // depends on whether the instruction was scheduled in the previous
      // block.
494 if (!InKernel) {
495 int StageDiffAdj = 0;
496 if (LoopValStage != -1 && StageScheduled > LoopValStage)
497 StageDiffAdj = StageScheduled - LoopValStage;
498 // Use the loop value defined in the kernel, unless the kernel
499 // contains the last definition of the Phi.
500 if (np == 0 && PrevStage == LastStageNum &&
501 (StageScheduled != 0 || LoopValStage != 0) &&
502 VRMap[PrevStage - StageDiffAdj].count(Val: LoopVal))
503 PhiOp2 = VRMap[PrevStage - StageDiffAdj][LoopVal];
504 // Use the value defined by the Phi. We add one because we switch
505 // from looking at the loop value to the Phi definition.
506 else if (np > 0 && PrevStage == LastStageNum &&
507 VRMap[PrevStage - np + 1].count(Val: Def))
508 PhiOp2 = VRMap[PrevStage - np + 1][Def];
509 // Use the loop value defined in the kernel.
510 else if (static_cast<unsigned>(LoopValStage) > PrologStage + 1 &&
511 VRMap[PrevStage - StageDiffAdj - np].count(Val: LoopVal))
512 PhiOp2 = VRMap[PrevStage - StageDiffAdj - np][LoopVal];
513 // Use the value defined by the Phi, unless we're generating the first
514 // epilog and the Phi refers to a Phi in a different stage.
515 else if (VRMap[PrevStage - np].count(Val: Def) &&
516 (!LoopDefIsPhi || (PrevStage != LastStageNum) ||
517 (LoopValStage == StageScheduled)))
518 PhiOp2 = VRMap[PrevStage - np][Def];
519 }
520
521 // Check if we can reuse an existing Phi. This occurs when a Phi
522 // references another Phi, and the other Phi is scheduled in an
523 // earlier stage. We can try to reuse an existing Phi up until the last
524 // stage of the current Phi.
525 if (LoopDefIsPhi) {
526 if (static_cast<int>(PrologStage - np) >= StageScheduled) {
527 int LVNumStages = getStagesForPhi(Reg: LoopVal);
528 int StageDiff = (StageScheduled - LoopValStage);
529 LVNumStages -= StageDiff;
530 // Make sure the loop value Phi has been processed already.
531 if (LVNumStages > (int)np && VRMap[CurStageNum].count(Val: LoopVal)) {
532 NewReg = PhiOp2;
533 unsigned ReuseStage = CurStageNum;
534 if (isLoopCarried(Phi&: *PhiInst))
535 ReuseStage -= LVNumStages;
536 // Check if the Phi to reuse has been generated yet. If not, then
537 // there is nothing to reuse.
538 if (VRMap[ReuseStage - np].count(Val: LoopVal)) {
539 NewReg = VRMap[ReuseStage - np][LoopVal];
540
541 rewriteScheduledInstr(BB: NewBB, InstrMap, CurStageNum, PhiNum: np, Phi: &*BBI,
542 OldReg: Def, NewReg);
543 // Update the map with the new Phi name.
544 VRMap[CurStageNum - np][Def] = NewReg;
545 PhiOp2 = NewReg;
546 if (VRMap[LastStageNum - np - 1].count(Val: LoopVal))
547 PhiOp2 = VRMap[LastStageNum - np - 1][LoopVal];
548
549 if (IsLast && np == NumPhis - 1)
550 replaceRegUsesAfterLoop(FromReg: Def, ToReg: NewReg, MBB: BB, MRI);
551 continue;
552 }
553 }
554 }
555 if (InKernel && StageDiff > 0 &&
556 VRMap[CurStageNum - StageDiff - np].count(Val: LoopVal))
557 PhiOp2 = VRMap[CurStageNum - StageDiff - np][LoopVal];
558 }
559
560 const TargetRegisterClass *RC = MRI.getRegClass(Reg: Def);
561 NewReg = MRI.createVirtualRegister(RegClass: RC);
562
563 MachineInstrBuilder NewPhi =
564 BuildMI(BB&: *NewBB, I: NewBB->getFirstNonPHI(), MIMD: DebugLoc(),
565 MCID: TII->get(Opcode: TargetOpcode::PHI), DestReg: NewReg);
566 NewPhi.addReg(RegNo: PhiOp1).addMBB(MBB: BB1);
567 NewPhi.addReg(RegNo: PhiOp2).addMBB(MBB: BB2);
568 LIS.InsertMachineInstrInMaps(MI&: *NewPhi);
569 if (np == 0)
570 InstrMap[NewPhi] = &*BBI;
571
572 // We define the Phis after creating the new pipelined code, so
573 // we need to rename the Phi values in scheduled instructions.
574
575 Register PrevReg;
576 if (InKernel && VRMap[PrevStage - np].count(Val: LoopVal))
577 PrevReg = VRMap[PrevStage - np][LoopVal];
578 rewriteScheduledInstr(BB: NewBB, InstrMap, CurStageNum, PhiNum: np, Phi: &*BBI, OldReg: Def,
579 NewReg, PrevReg);
580 // If the Phi has been scheduled, use the new name for rewriting.
581 if (VRMap[CurStageNum - np].count(Val: Def)) {
582 Register R = VRMap[CurStageNum - np][Def];
583 rewriteScheduledInstr(BB: NewBB, InstrMap, CurStageNum, PhiNum: np, Phi: &*BBI, OldReg: R,
584 NewReg);
585 }
586
      // Check if we need to rename any uses that occur after the loop. The
588 // register to replace depends on whether the Phi is scheduled in the
589 // epilog.
590 if (IsLast && np == NumPhis - 1)
591 replaceRegUsesAfterLoop(FromReg: Def, ToReg: NewReg, MBB: BB, MRI);
592
593 // In the kernel, a dependent Phi uses the value from this Phi.
594 if (InKernel)
595 PhiOp2 = NewReg;
596
597 // Update the map with the new Phi name.
598 VRMap[CurStageNum - np][Def] = NewReg;
599 }
600
601 while (NumPhis++ < NumStages) {
602 rewriteScheduledInstr(BB: NewBB, InstrMap, CurStageNum, PhiNum: NumPhis, Phi: &*BBI, OldReg: Def,
603 NewReg, PrevReg: 0);
604 }
605
606 // Check if we need to rename a Phi that has been eliminated due to
607 // scheduling.
608 if (NumStages == 0 && IsLast) {
609 auto &CurStageMap = VRMap[CurStageNum];
610 auto It = CurStageMap.find(Val: LoopVal);
611 if (It != CurStageMap.end())
612 replaceRegUsesAfterLoop(FromReg: Def, ToReg: It->second, MBB: BB, MRI);
613 }
614 }
615}
616
617/// Generate Phis for the specified block in the generated pipelined code.
618/// These are new Phis needed because the definition is scheduled after the
619/// use in the pipelined sequence.
620void ModuloScheduleExpander::generatePhis(
621 MachineBasicBlock *NewBB, MachineBasicBlock *BB1, MachineBasicBlock *BB2,
622 MachineBasicBlock *KernelBB, ValueMapTy *VRMap, ValueMapTy *VRMapPhi,
623 InstrMapTy &InstrMap, unsigned LastStageNum, unsigned CurStageNum,
624 bool IsLast) {
625 // Compute the stage number that contains the initial Phi value, and
626 // the Phi from the previous stage.
627 unsigned PrologStage = 0;
628 unsigned PrevStage = 0;
629 unsigned StageDiff = CurStageNum - LastStageNum;
630 bool InKernel = (StageDiff == 0);
631 if (InKernel) {
632 PrologStage = LastStageNum - 1;
633 PrevStage = CurStageNum;
634 } else {
635 PrologStage = LastStageNum - StageDiff;
636 PrevStage = LastStageNum + StageDiff - 1;
637 }
638
639 for (MachineBasicBlock::iterator BBI = BB->getFirstNonPHI(),
640 BBE = BB->instr_end();
641 BBI != BBE; ++BBI) {
642 for (unsigned i = 0, e = BBI->getNumOperands(); i != e; ++i) {
643 MachineOperand &MO = BBI->getOperand(i);
644 if (!MO.isReg() || !MO.isDef() || !MO.getReg().isVirtual())
645 continue;
646
647 int StageScheduled = Schedule.getStage(MI: &*BBI);
648 assert(StageScheduled != -1 && "Expecting scheduled instruction.");
649 Register Def = MO.getReg();
650 unsigned NumPhis = getStagesForReg(Reg: Def, CurStage: CurStageNum);
      // An instruction that is scheduled in stage 0 and used after the loop
652 // requires a phi in the epilog for the last definition from either
653 // the kernel or prolog.
654 if (!InKernel && NumPhis == 0 && StageScheduled == 0 &&
655 hasUseAfterLoop(Reg: Def, BB, MRI))
656 NumPhis = 1;
657 if (!InKernel && (unsigned)StageScheduled > PrologStage)
658 continue;
659
660 Register PhiOp2;
661 if (InKernel) {
662 PhiOp2 = VRMap[PrevStage][Def];
663 if (MachineInstr *InstOp2 = MRI.getVRegDef(Reg: PhiOp2))
664 if (InstOp2->isPHI() && InstOp2->getParent() == NewBB)
665 PhiOp2 = getLoopPhiReg(Phi&: *InstOp2, LoopBB: BB2);
666 }
667 // The number of Phis can't exceed the number of prolog stages. The
668 // prolog stage number is zero based.
669 if (NumPhis > PrologStage + 1 - StageScheduled)
670 NumPhis = PrologStage + 1 - StageScheduled;
671 for (unsigned np = 0; np < NumPhis; ++np) {
672 // Example for
673 // Org:
674 // %Org = ... (Scheduled at Stage#0, NumPhi = 2)
675 //
676 // Prolog0 (Stage0):
677 // %Clone0 = ...
678 // Prolog1 (Stage1):
679 // %Clone1 = ...
680 // Kernel (Stage2):
681 // %Phi0 = Phi %Clone1, Prolog1, %Clone2, Kernel
682 // %Phi1 = Phi %Clone0, Prolog1, %Phi0, Kernel
683 // %Clone2 = ...
684 // Epilog0 (Stage3):
685 // %Phi2 = Phi %Clone1, Prolog1, %Clone2, Kernel
686 // %Phi3 = Phi %Clone0, Prolog1, %Phi0, Kernel
687 // Epilog1 (Stage4):
688 // %Phi4 = Phi %Clone0, Prolog0, %Phi2, Epilog0
689 //
690 // VRMap = {0: %Clone0, 1: %Clone1, 2: %Clone2}
691 // VRMapPhi (after Kernel) = {0: %Phi1, 1: %Phi0}
692 // VRMapPhi (after Epilog0) = {0: %Phi3, 1: %Phi2}
693
694 Register PhiOp1 = VRMap[PrologStage][Def];
695 if (np <= PrologStage)
696 PhiOp1 = VRMap[PrologStage - np][Def];
697 if (!InKernel) {
698 if (PrevStage == LastStageNum && np == 0)
699 PhiOp2 = VRMap[LastStageNum][Def];
700 else
701 PhiOp2 = VRMapPhi[PrevStage - np][Def];
702 }
703
704 const TargetRegisterClass *RC = MRI.getRegClass(Reg: Def);
705 Register NewReg = MRI.createVirtualRegister(RegClass: RC);
706
707 MachineInstrBuilder NewPhi =
708 BuildMI(BB&: *NewBB, I: NewBB->getFirstNonPHI(), MIMD: DebugLoc(),
709 MCID: TII->get(Opcode: TargetOpcode::PHI), DestReg: NewReg);
710 NewPhi.addReg(RegNo: PhiOp1).addMBB(MBB: BB1);
711 NewPhi.addReg(RegNo: PhiOp2).addMBB(MBB: BB2);
712 LIS.InsertMachineInstrInMaps(MI&: *NewPhi);
713 if (np == 0)
714 InstrMap[NewPhi] = &*BBI;
715
716 // Rewrite uses and update the map. The actions depend upon whether
        // we are generating code for the kernel or epilog blocks.
718 if (InKernel) {
719 rewriteScheduledInstr(BB: NewBB, InstrMap, CurStageNum, PhiNum: np, Phi: &*BBI, OldReg: PhiOp1,
720 NewReg);
721 rewriteScheduledInstr(BB: NewBB, InstrMap, CurStageNum, PhiNum: np, Phi: &*BBI, OldReg: PhiOp2,
722 NewReg);
723
724 PhiOp2 = NewReg;
725 VRMapPhi[PrevStage - np - 1][Def] = NewReg;
726 } else {
727 VRMapPhi[CurStageNum - np][Def] = NewReg;
728 if (np == NumPhis - 1)
729 rewriteScheduledInstr(BB: NewBB, InstrMap, CurStageNum, PhiNum: np, Phi: &*BBI, OldReg: Def,
730 NewReg);
731 }
732 if (IsLast && np == NumPhis - 1)
733 replaceRegUsesAfterLoop(FromReg: Def, ToReg: NewReg, MBB: BB, MRI);
734 }
735 }
736 }
737}
738
739/// Remove instructions that generate values with no uses.
740/// Typically, these are induction variable operations that generate values
741/// used in the loop itself. A dead instruction has a definition with
742/// no uses, or uses that occur in the original loop only.
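/// Kernel Phis that are left without any uses after this removal are deleted
/// as well.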
743void ModuloScheduleExpander::removeDeadInstructions(MachineBasicBlock *KernelBB,
744 MBBVectorTy &EpilogBBs) {
745 // For each epilog block, check that the value defined by each instruction
746 // is used. If not, delete it.
747 for (MachineBasicBlock *MBB : llvm::reverse(C&: EpilogBBs))
748 for (MachineBasicBlock::reverse_instr_iterator MI = MBB->instr_rbegin(),
749 ME = MBB->instr_rend();
750 MI != ME;) {
      // From DeadMachineInstructionElim. Don't delete inline assembly.
752 if (MI->isInlineAsm()) {
753 ++MI;
754 continue;
755 }
756 bool SawStore = false;
757 // Check if it's safe to remove the instruction due to side effects.
758 // We can, and want to, remove Phis here.
759 if (!MI->isSafeToMove(SawStore) && !MI->isPHI()) {
760 ++MI;
761 continue;
762 }
763 bool used = true;
764 for (const MachineOperand &MO : MI->all_defs()) {
765 Register reg = MO.getReg();
766 // Assume physical registers are used, unless they are marked dead.
767 if (reg.isPhysical()) {
768 used = !MO.isDead();
769 if (used)
770 break;
771 continue;
772 }
773 unsigned realUses = 0;
774 for (const MachineOperand &U : MRI.use_operands(Reg: reg)) {
          // A use that occurs only in the original loop block is not a real
          // use; only uses outside the original loop keep the value alive.
777 if (U.getParent()->getParent() != BB) {
778 realUses++;
779 used = true;
780 break;
781 }
782 }
783 if (realUses > 0)
784 break;
785 used = false;
786 }
787 if (!used) {
788 LIS.RemoveMachineInstrFromMaps(MI&: *MI);
789 MI++->eraseFromParent();
790 continue;
791 }
792 ++MI;
793 }
794 // In the kernel block, check if we can remove a Phi that generates a value
795 // used in an instruction removed in the epilog block.
796 for (MachineInstr &MI : llvm::make_early_inc_range(Range: KernelBB->phis())) {
797 Register reg = MI.getOperand(i: 0).getReg();
798 if (MRI.use_begin(RegNo: reg) == MRI.use_end()) {
799 LIS.RemoveMachineInstrFromMaps(MI);
800 MI.eraseFromParent();
801 }
802 }
803}
804
805/// For loop carried definitions, we split the lifetime of a virtual register
806/// that has uses past the definition in the next iteration. A copy with a new
807/// virtual register is inserted before the definition, which helps with
808/// generating a better register assignment.
809///
///   v1 = phi(a, v2)     v1 = phi(a, v2)
///   v2 = phi(b, v3)     v2 = phi(b, v3)
///   v3 = ..             v4 = copy v1
///   .. = v1             v3 = ..
///                       .. = v4
815void ModuloScheduleExpander::splitLifetimes(MachineBasicBlock *KernelBB,
816 MBBVectorTy &EpilogBBs) {
817 const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
818 for (auto &PHI : KernelBB->phis()) {
819 Register Def = PHI.getOperand(i: 0).getReg();
    // Check for any Phi definition that is used as an operand of another Phi
821 // in the same block.
822 for (MachineRegisterInfo::use_instr_iterator I = MRI.use_instr_begin(RegNo: Def),
823 E = MRI.use_instr_end();
824 I != E; ++I) {
825 if (I->isPHI() && I->getParent() == KernelBB) {
826 // Get the loop carried definition.
827 Register LCDef = getLoopPhiReg(Phi&: PHI, LoopBB: KernelBB);
828 if (!LCDef)
829 continue;
830 MachineInstr *MI = MRI.getVRegDef(Reg: LCDef);
831 if (!MI || MI->getParent() != KernelBB || MI->isPHI())
832 continue;
833 // Search through the rest of the block looking for uses of the Phi
834 // definition. If one occurs, then split the lifetime.
835 Register SplitReg;
836 for (auto &BBJ : make_range(x: MachineBasicBlock::instr_iterator(MI),
837 y: KernelBB->instr_end()))
838 if (BBJ.readsRegister(Reg: Def, /*TRI=*/nullptr)) {
839 // We split the lifetime when we find the first use.
840 if (!SplitReg) {
841 SplitReg = MRI.createVirtualRegister(RegClass: MRI.getRegClass(Reg: Def));
842 MachineInstr *newCopy =
843 BuildMI(BB&: *KernelBB, I: MI, MIMD: MI->getDebugLoc(),
844 MCID: TII->get(Opcode: TargetOpcode::COPY), DestReg: SplitReg)
845 .addReg(RegNo: Def);
846 LIS.InsertMachineInstrInMaps(MI&: *newCopy);
847 }
848 BBJ.substituteRegister(FromReg: Def, ToReg: SplitReg, SubIdx: 0, RegInfo: *TRI);
849 }
850 if (!SplitReg)
851 continue;
852 // Search through each of the epilog blocks for any uses to be renamed.
853 for (auto &Epilog : EpilogBBs)
854 for (auto &I : *Epilog)
855 if (I.readsRegister(Reg: Def, /*TRI=*/nullptr))
856 I.substituteRegister(FromReg: Def, ToReg: SplitReg, SubIdx: 0, RegInfo: *TRI);
857 break;
858 }
859 }
860 }
861}
862
863/// Create branches from each prolog basic block to the appropriate epilog
864/// block. These edges are needed if the loop ends before reaching the
865/// kernel.
866void ModuloScheduleExpander::addBranches(MachineBasicBlock &PreheaderBB,
867 MBBVectorTy &PrologBBs,
868 MachineBasicBlock *KernelBB,
869 MBBVectorTy &EpilogBBs,
870 ValueMapTy *VRMap) {
871 assert(PrologBBs.size() == EpilogBBs.size() && "Prolog/Epilog mismatch");
872 MachineBasicBlock *LastPro = KernelBB;
873 MachineBasicBlock *LastEpi = KernelBB;
874
875 // Start from the blocks connected to the kernel and work "out"
876 // to the first prolog and the last epilog blocks.
877 unsigned MaxIter = PrologBBs.size() - 1;
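  // Iteration i pairs PrologBBs[MaxIter - i] (the prolog closest to the
  // kernel first) with EpilogBBs[i] (the epilog closest to the kernel first),
  // so an early exit from a prolog branches to the epilog that drains the
  // iterations that prolog started.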
878 for (unsigned i = 0, j = MaxIter; i <= MaxIter; ++i, --j) {
879 // Add branches to the prolog that go to the corresponding
880 // epilog, and the fall-thru prolog/kernel block.
881 MachineBasicBlock *Prolog = PrologBBs[j];
882 MachineBasicBlock *Epilog = EpilogBBs[i];
883
884 SmallVector<MachineOperand, 4> Cond;
885 std::optional<bool> StaticallyGreater =
886 LoopInfo->createTripCountGreaterCondition(TC: j + 1, MBB&: *Prolog, Cond);
887 unsigned numAdded = 0;
888 if (!StaticallyGreater) {
889 Prolog->addSuccessor(Succ: Epilog);
890 numAdded = TII->insertBranch(MBB&: *Prolog, TBB: Epilog, FBB: LastPro, Cond, DL: DebugLoc());
891 } else if (*StaticallyGreater == false) {
892 Prolog->addSuccessor(Succ: Epilog);
893 Prolog->removeSuccessor(Succ: LastPro);
894 LastEpi->removeSuccessor(Succ: Epilog);
895 numAdded = TII->insertBranch(MBB&: *Prolog, TBB: Epilog, FBB: nullptr, Cond, DL: DebugLoc());
896 Epilog->removePHIsIncomingValuesForPredecessor(PredMBB: *LastEpi);
897 // Remove the blocks that are no longer referenced.
898 if (LastPro != LastEpi) {
899 for (auto &MI : *LastEpi)
900 LIS.RemoveMachineInstrFromMaps(MI);
901 LastEpi->clear();
902 LastEpi->eraseFromParent();
903 }
904 if (LastPro == KernelBB) {
905 LoopInfo->disposed(LIS: &LIS);
906 NewKernel = nullptr;
907 }
908 for (auto &MI : *LastPro)
909 LIS.RemoveMachineInstrFromMaps(MI);
910 LastPro->clear();
911 LastPro->eraseFromParent();
912 } else {
913 numAdded = TII->insertBranch(MBB&: *Prolog, TBB: LastPro, FBB: nullptr, Cond, DL: DebugLoc());
914 Epilog->removePHIsIncomingValuesForPredecessor(PredMBB: *Prolog);
915 }
916 LastPro = Prolog;
917 LastEpi = Epilog;
918 for (MachineBasicBlock::reverse_instr_iterator I = Prolog->instr_rbegin(),
919 E = Prolog->instr_rend();
920 I != E && numAdded > 0; ++I, --numAdded)
921 updateInstruction(NewMI: &*I, LastDef: false, CurStageNum: j, InstrStageNum: 0, VRMap);
922 }
923
924 if (NewKernel) {
925 LoopInfo->setPreheader(PrologBBs[MaxIter]);
926 LoopInfo->adjustTripCount(TripCountAdjust: -(MaxIter + 1));
927 }
928}
929
930/// Return true if we can compute the amount the instruction changes
931/// during each iteration. Set Delta to the amount of the change.
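/// For example, a load whose base register is incremented by 4 on each
/// iteration of the loop yields Delta == 4.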
932bool ModuloScheduleExpander::computeDelta(MachineInstr &MI, unsigned &Delta) {
933 const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
934 const MachineOperand *BaseOp;
935 int64_t Offset;
936 bool OffsetIsScalable;
937 if (!TII->getMemOperandWithOffset(MI, BaseOp, Offset, OffsetIsScalable, TRI))
938 return false;
939
940 // FIXME: This algorithm assumes instructions have fixed-size offsets.
941 if (OffsetIsScalable)
942 return false;
943
944 if (!BaseOp->isReg())
945 return false;
946
947 Register BaseReg = BaseOp->getReg();
948
949 MachineRegisterInfo &MRI = MF.getRegInfo();
950 // Check if there is a Phi. If so, get the definition in the loop.
951 MachineInstr *BaseDef = MRI.getVRegDef(Reg: BaseReg);
952 if (BaseDef && BaseDef->isPHI()) {
953 BaseReg = getLoopPhiReg(Phi&: *BaseDef, LoopBB: MI.getParent());
954 BaseDef = MRI.getVRegDef(Reg: BaseReg);
955 }
956 if (!BaseDef)
957 return false;
958
959 int D = 0;
960 if (!TII->getIncrementValue(MI: *BaseDef, Value&: D) && D >= 0)
961 return false;
962
963 Delta = D;
964 return true;
965}
966
967/// Update the memory operand with a new offset when the pipeliner
968/// generates a new copy of the instruction that refers to a
969/// different memory location.
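/// A clone that is Num stages away from the original instruction accesses
/// memory at the original offset plus Delta * Num; if the adjustment cannot
/// be computed, the memory operand is made conservative (unknown size).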
970void ModuloScheduleExpander::updateMemOperands(MachineInstr &NewMI,
971 MachineInstr &OldMI,
972 unsigned Num) {
973 if (Num == 0)
974 return;
975 // If the instruction has memory operands, then adjust the offset
976 // when the instruction appears in different stages.
977 if (NewMI.memoperands_empty())
978 return;
979 SmallVector<MachineMemOperand *, 2> NewMMOs;
980 for (MachineMemOperand *MMO : NewMI.memoperands()) {
981 // TODO: Figure out whether isAtomic is really necessary (see D57601).
982 if (MMO->isVolatile() || MMO->isAtomic() ||
983 (MMO->isInvariant() && MMO->isDereferenceable()) ||
984 (!MMO->getValue())) {
985 NewMMOs.push_back(Elt: MMO);
986 continue;
987 }
988 unsigned Delta;
989 if (Num != UINT_MAX && computeDelta(MI&: OldMI, Delta)) {
990 int64_t AdjOffset = Delta * Num;
991 NewMMOs.push_back(
992 Elt: MF.getMachineMemOperand(MMO, Offset: AdjOffset, Size: MMO->getSize()));
993 } else {
994 NewMMOs.push_back(Elt: MF.getMachineMemOperand(
995 MMO, Offset: 0, Size: LocationSize::beforeOrAfterPointer()));
996 }
997 }
998 NewMI.setMemRefs(MF, MemRefs: NewMMOs);
999}
1000
1001/// Clone the instruction for the new pipelined loop and update the
1002/// memory operands, if needed.
1003MachineInstr *ModuloScheduleExpander::cloneInstr(MachineInstr *OldMI,
1004 unsigned CurStageNum,
1005 unsigned InstStageNum) {
1006 MachineInstr *NewMI = MF.CloneMachineInstr(Orig: OldMI);
1007 updateMemOperands(NewMI&: *NewMI, OldMI&: *OldMI, Num: CurStageNum - InstStageNum);
1008 return NewMI;
1009}
1010
1011/// Clone the instruction for the new pipelined loop. If needed, this
1012/// function updates the instruction using the values saved in the
1013/// InstrChanges structure.
1014MachineInstr *ModuloScheduleExpander::cloneAndChangeInstr(
1015 MachineInstr *OldMI, unsigned CurStageNum, unsigned InstStageNum) {
1016 MachineInstr *NewMI = MF.CloneMachineInstr(Orig: OldMI);
1017 auto It = InstrChanges.find(Val: OldMI);
1018 if (It != InstrChanges.end()) {
1019 std::pair<Register, int64_t> RegAndOffset = It->second;
1020 unsigned BasePos, OffsetPos;
1021 if (!TII->getBaseAndOffsetPosition(MI: *OldMI, BasePos, OffsetPos))
1022 return nullptr;
1023 int64_t NewOffset = OldMI->getOperand(i: OffsetPos).getImm();
1024 MachineInstr *LoopDef = findDefInLoop(Reg: RegAndOffset.first);
1025 if (Schedule.getStage(MI: LoopDef) > (signed)InstStageNum)
1026 NewOffset += RegAndOffset.second * (CurStageNum - InstStageNum);
1027 NewMI->getOperand(i: OffsetPos).setImm(NewOffset);
1028 }
1029 updateMemOperands(NewMI&: *NewMI, OldMI&: *OldMI, Num: CurStageNum - InstStageNum);
1030 return NewMI;
1031}
1032
1033/// Update the machine instruction with new virtual registers. This
1034/// function may change the definitions and/or uses.
1035void ModuloScheduleExpander::updateInstruction(MachineInstr *NewMI,
1036 bool LastDef,
1037 unsigned CurStageNum,
1038 unsigned InstrStageNum,
1039 ValueMapTy *VRMap) {
1040 for (MachineOperand &MO : NewMI->operands()) {
1041 if (!MO.isReg() || !MO.getReg().isVirtual())
1042 continue;
1043 Register reg = MO.getReg();
1044 if (MO.isDef()) {
1045 // Create a new virtual register for the definition.
1046 const TargetRegisterClass *RC = MRI.getRegClass(Reg: reg);
1047 Register NewReg = MRI.createVirtualRegister(RegClass: RC);
1048 MO.setReg(NewReg);
1049 VRMap[CurStageNum][reg] = NewReg;
1050 if (LastDef)
1051 replaceRegUsesAfterLoop(FromReg: reg, ToReg: NewReg, MBB: BB, MRI);
1052 } else if (MO.isUse()) {
1053 MachineInstr *Def = MRI.getVRegDef(Reg: reg);
      // Compute the stage that contains the last definition for this
      // instruction.
1055 int DefStageNum = Schedule.getStage(MI: Def);
1056 unsigned StageNum = CurStageNum;
1057 if (DefStageNum != -1 && (int)InstrStageNum > DefStageNum) {
        // Compute the difference in stages between the definition and the use.
1059 unsigned StageDiff = (InstrStageNum - DefStageNum);
1060 // Make an adjustment to get the last definition.
1061 StageNum -= StageDiff;
1062 }
1063 if (auto It = VRMap[StageNum].find(Val: reg); It != VRMap[StageNum].end())
1064 MO.setReg(It->second);
1065 }
1066 }
1067}
1068
1069/// Return the instruction in the loop that defines the register.
1070/// If the definition is a Phi, then follow the Phi operand to
1071/// the instruction in the loop.
1072MachineInstr *ModuloScheduleExpander::findDefInLoop(Register Reg) {
1073 SmallPtrSet<MachineInstr *, 8> Visited;
1074 MachineInstr *Def = MRI.getVRegDef(Reg);
1075 while (Def->isPHI()) {
1076 if (!Visited.insert(Ptr: Def).second)
1077 break;
1078 for (unsigned i = 1, e = Def->getNumOperands(); i < e; i += 2)
1079 if (Def->getOperand(i: i + 1).getMBB() == BB) {
1080 Def = MRI.getVRegDef(Reg: Def->getOperand(i).getReg());
1081 break;
1082 }
1083 }
1084 return Def;
1085}
1086
1087/// Return the new name for the value from the previous stage.
1088Register ModuloScheduleExpander::getPrevMapVal(
1089 unsigned StageNum, unsigned PhiStage, Register LoopVal, unsigned LoopStage,
1090 ValueMapTy *VRMap, MachineBasicBlock *BB) {
1091 Register PrevVal;
1092 if (StageNum > PhiStage) {
1093 MachineInstr *LoopInst = MRI.getVRegDef(Reg: LoopVal);
1094 if (PhiStage == LoopStage && VRMap[StageNum - 1].count(Val: LoopVal))
1095 // The name is defined in the previous stage.
1096 PrevVal = VRMap[StageNum - 1][LoopVal];
1097 else if (VRMap[StageNum].count(Val: LoopVal))
1098 // The previous name is defined in the current stage when the instruction
1099 // order is swapped.
1100 PrevVal = VRMap[StageNum][LoopVal];
1101 else if (!LoopInst->isPHI() || LoopInst->getParent() != BB)
1102 // The loop value hasn't yet been scheduled.
1103 PrevVal = LoopVal;
1104 else if (StageNum == PhiStage + 1)
1105 // The loop value is another phi, which has not been scheduled.
1106 PrevVal = getInitPhiReg(Phi&: *LoopInst, LoopBB: BB);
1107 else if (StageNum > PhiStage + 1 && LoopInst->getParent() == BB)
1108 // The loop value is another phi, which has been scheduled.
1109 PrevVal =
1110 getPrevMapVal(StageNum: StageNum - 1, PhiStage, LoopVal: getLoopPhiReg(Phi&: *LoopInst, LoopBB: BB),
1111 LoopStage, VRMap, BB);
1112 }
1113 return PrevVal;
1114}
1115
1116/// Rewrite the Phi values in the specified block to use the mappings
1117/// from the initial operand. Once the Phi is scheduled, we switch
1118/// to using the loop value instead of the Phi value, so those names
1119/// do not need to be rewritten.
1120void ModuloScheduleExpander::rewritePhiValues(MachineBasicBlock *NewBB,
1121 unsigned StageNum,
1122 ValueMapTy *VRMap,
1123 InstrMapTy &InstrMap) {
1124 for (auto &PHI : BB->phis()) {
1125 Register InitVal;
1126 Register LoopVal;
1127 getPhiRegs(Phi&: PHI, Loop: BB, InitVal, LoopVal);
1128 Register PhiDef = PHI.getOperand(i: 0).getReg();
1129
1130 unsigned PhiStage = (unsigned)Schedule.getStage(MI: MRI.getVRegDef(Reg: PhiDef));
1131 unsigned LoopStage = (unsigned)Schedule.getStage(MI: MRI.getVRegDef(Reg: LoopVal));
1132 unsigned NumPhis = getStagesForPhi(Reg: PhiDef);
1133 if (NumPhis > StageNum)
1134 NumPhis = StageNum;
1135 for (unsigned np = 0; np <= NumPhis; ++np) {
1136 Register NewVal =
1137 getPrevMapVal(StageNum: StageNum - np, PhiStage, LoopVal, LoopStage, VRMap, BB);
1138 if (!NewVal)
1139 NewVal = InitVal;
1140 rewriteScheduledInstr(BB: NewBB, InstrMap, CurStageNum: StageNum - np, PhiNum: np, Phi: &PHI, OldReg: PhiDef,
1141 NewReg: NewVal);
1142 }
1143 }
1144}
1145
1146/// Rewrite a previously scheduled instruction to use the register value
1147/// from the new instruction. Make sure the instruction occurs in the
1148/// basic block, and we don't change the uses in the new instruction.
1149void ModuloScheduleExpander::rewriteScheduledInstr(
1150 MachineBasicBlock *BB, InstrMapTy &InstrMap, unsigned CurStageNum,
1151 unsigned PhiNum, MachineInstr *Phi, Register OldReg, Register NewReg,
1152 Register PrevReg) {
1153 bool InProlog = (CurStageNum < (unsigned)Schedule.getNumStages() - 1);
1154 int StagePhi = Schedule.getStage(MI: Phi) + PhiNum;
1155 // Rewrite uses that have been scheduled already to use the new
1156 // Phi register.
1157 for (MachineOperand &UseOp :
1158 llvm::make_early_inc_range(Range: MRI.use_operands(Reg: OldReg))) {
1159 MachineInstr *UseMI = UseOp.getParent();
1160 if (UseMI->getParent() != BB)
1161 continue;
1162 if (UseMI->isPHI()) {
1163 if (!Phi->isPHI() && UseMI->getOperand(i: 0).getReg() == NewReg)
1164 continue;
1165 if (getLoopPhiReg(Phi&: *UseMI, LoopBB: BB) != OldReg)
1166 continue;
1167 }
1168 InstrMapTy::iterator OrigInstr = InstrMap.find(Val: UseMI);
1169 assert(OrigInstr != InstrMap.end() && "Instruction not scheduled.");
1170 MachineInstr *OrigMI = OrigInstr->second;
1171 int StageSched = Schedule.getStage(MI: OrigMI);
1172 int CycleSched = Schedule.getCycle(MI: OrigMI);
1173 Register ReplaceReg;
1174 // This is the stage for the scheduled instruction.
1175 if (StagePhi == StageSched && Phi->isPHI()) {
1176 int CyclePhi = Schedule.getCycle(MI: Phi);
1177 if (PrevReg && InProlog)
1178 ReplaceReg = PrevReg;
1179 else if (PrevReg && !isLoopCarried(Phi&: *Phi) &&
1180 (CyclePhi <= CycleSched || OrigMI->isPHI()))
1181 ReplaceReg = PrevReg;
1182 else
1183 ReplaceReg = NewReg;
1184 }
1185 // The scheduled instruction occurs before the scheduled Phi, and the
1186 // Phi is not loop carried.
1187 if (!InProlog && StagePhi + 1 == StageSched && !isLoopCarried(Phi&: *Phi))
1188 ReplaceReg = NewReg;
1189 if (StagePhi > StageSched && Phi->isPHI())
1190 ReplaceReg = NewReg;
1191 if (!InProlog && !Phi->isPHI() && StagePhi < StageSched)
1192 ReplaceReg = NewReg;
1193 if (ReplaceReg) {
1194 const TargetRegisterClass *NRC =
1195 MRI.constrainRegClass(Reg: ReplaceReg, RC: MRI.getRegClass(Reg: OldReg));
1196 if (NRC)
1197 UseOp.setReg(ReplaceReg);
1198 else {
1199 Register SplitReg = MRI.createVirtualRegister(RegClass: MRI.getRegClass(Reg: OldReg));
1200 MachineInstr *newCopy = BuildMI(BB&: *BB, I: UseMI, MIMD: UseMI->getDebugLoc(),
1201 MCID: TII->get(Opcode: TargetOpcode::COPY), DestReg: SplitReg)
1202 .addReg(RegNo: ReplaceReg);
1203 UseOp.setReg(SplitReg);
1204 LIS.InsertMachineInstrInMaps(MI&: *newCopy);
1205 }
1206 }
1207 }
1208}
1209
1210bool ModuloScheduleExpander::isLoopCarried(MachineInstr &Phi) {
1211 if (!Phi.isPHI())
1212 return false;
1213 int DefCycle = Schedule.getCycle(MI: &Phi);
1214 int DefStage = Schedule.getStage(MI: &Phi);
1215
1216 Register InitVal;
1217 Register LoopVal;
1218 getPhiRegs(Phi, Loop: Phi.getParent(), InitVal, LoopVal);
1219 MachineInstr *Use = MRI.getVRegDef(Reg: LoopVal);
1220 if (!Use || Use->isPHI())
1221 return true;
1222 int LoopCycle = Schedule.getCycle(MI: Use);
1223 int LoopStage = Schedule.getStage(MI: Use);
1224 return (LoopCycle > DefCycle) || (LoopStage <= DefStage);
1225}
1226
1227//===----------------------------------------------------------------------===//
1228// PeelingModuloScheduleExpander implementation
1229//===----------------------------------------------------------------------===//
1230// This is a reimplementation of ModuloScheduleExpander that works by creating
1231// a fully correct steady-state kernel and peeling off the prolog and epilogs.
1232//===----------------------------------------------------------------------===//
1233
1234namespace {
1235// Remove any dead phis in MBB. Dead phis either have only one block as input
1236// (in which case they are the identity) or have no uses.
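// Single-input phis are folded by rewriting their uses to the incoming
// register; the scan repeats until no more phis can be removed.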
1237void EliminateDeadPhis(MachineBasicBlock *MBB, MachineRegisterInfo &MRI,
1238 LiveIntervals *LIS, bool KeepSingleSrcPhi = false) {
1239 bool Changed = true;
1240 while (Changed) {
1241 Changed = false;
1242 for (MachineInstr &MI : llvm::make_early_inc_range(Range: MBB->phis())) {
1243 assert(MI.isPHI());
1244 if (MRI.use_empty(RegNo: MI.getOperand(i: 0).getReg())) {
1245 if (LIS)
1246 LIS->RemoveMachineInstrFromMaps(MI);
1247 MI.eraseFromParent();
1248 Changed = true;
1249 } else if (!KeepSingleSrcPhi && MI.getNumExplicitOperands() == 3) {
1250 const TargetRegisterClass *ConstrainRegClass =
1251 MRI.constrainRegClass(Reg: MI.getOperand(i: 1).getReg(),
1252 RC: MRI.getRegClass(Reg: MI.getOperand(i: 0).getReg()));
1253 assert(ConstrainRegClass &&
1254 "Expected a valid constrained register class!");
1255 (void)ConstrainRegClass;
1256 MRI.replaceRegWith(FromReg: MI.getOperand(i: 0).getReg(),
1257 ToReg: MI.getOperand(i: 1).getReg());
1258 if (LIS)
1259 LIS->RemoveMachineInstrFromMaps(MI);
1260 MI.eraseFromParent();
1261 Changed = true;
1262 }
1263 }
1264 }
1265}
1266
1267/// Rewrites the kernel block in-place to adhere to the given schedule.
1268/// KernelRewriter holds all of the state required to perform the rewriting.
1269class KernelRewriter {
1270 ModuloSchedule &S;
1271 MachineBasicBlock *BB;
1272 MachineBasicBlock *PreheaderBB, *ExitBB;
1273 MachineRegisterInfo &MRI;
1274 const TargetInstrInfo *TII;
1275 LiveIntervals *LIS;
1276
1277 // Map from register class to canonical undef register for that class.
1278 DenseMap<const TargetRegisterClass *, Register> Undefs;
1279 // Map from <LoopReg, InitReg> to phi register for all created phis. Note that
1280 // this map is only used when InitReg is non-undef.
1281 DenseMap<std::pair<Register, Register>, Register> Phis;
1282 // Map from LoopReg to phi register where the InitReg is undef.
1283 DenseMap<Register, Register> UndefPhis;
1284
1285 // Reg is used by MI. Return the new register MI should use to adhere to the
1286 // schedule. Insert phis as necessary.
1287 Register remapUse(Register Reg, MachineInstr &MI);
1288 // Insert a phi that carries LoopReg from the loop body and InitReg otherwise.
1289 // If InitReg is not given it is chosen arbitrarily. It will either be undef
1290 // or will be chosen so as to share another phi.
1291 Register phi(Register LoopReg, std::optional<Register> InitReg = {},
1292 const TargetRegisterClass *RC = nullptr);
1293 // Create an undef register of the given register class.
1294 Register undef(const TargetRegisterClass *RC);
1295
1296public:
1297 KernelRewriter(MachineLoop &L, ModuloSchedule &S, MachineBasicBlock *LoopBB,
1298 LiveIntervals *LIS = nullptr);
1299 void rewrite();
1300};
1301} // namespace
1302
1303KernelRewriter::KernelRewriter(MachineLoop &L, ModuloSchedule &S,
1304 MachineBasicBlock *LoopBB, LiveIntervals *LIS)
1305 : S(S), BB(LoopBB), PreheaderBB(L.getLoopPreheader()),
1306 ExitBB(L.getExitBlock()), MRI(BB->getParent()->getRegInfo()),
1307 TII(BB->getParent()->getSubtarget().getInstrInfo()), LIS(LIS) {
1308 PreheaderBB = *BB->pred_begin();
1309 if (PreheaderBB == BB)
1310 PreheaderBB = *std::next(x: BB->pred_begin());
1311}
1312
1313void KernelRewriter::rewrite() {
1314 // Rearrange the loop to be in schedule order. Note that the schedule may
1315 // contain instructions that are not owned by the loop block (InstrChanges and
1316 // friends), so we gracefully handle unowned instructions and delete any
1317 // instructions that weren't in the schedule.
1318 auto InsertPt = BB->getFirstTerminator();
1319 MachineInstr *FirstMI = nullptr;
1320 for (MachineInstr *MI : S.getInstructions()) {
1321 if (MI->isPHI())
1322 continue;
1323 if (MI->getParent())
1324 MI->removeFromParent();
1325 BB->insert(I: InsertPt, MI);
1326 if (!FirstMI)
1327 FirstMI = MI;
1328 }
1329 assert(FirstMI && "Failed to find first MI in schedule");
1330
1331 // At this point all of the scheduled instructions are between FirstMI
1332 // and the end of the block. Kill from the first non-phi to FirstMI.
1333 for (auto I = BB->getFirstNonPHI(); I != FirstMI->getIterator();) {
1334 if (LIS)
1335 LIS->RemoveMachineInstrFromMaps(MI&: *I);
1336 (I++)->eraseFromParent();
1337 }
1338
1339 // Now remap every instruction in the loop.
1340 for (MachineInstr &MI : *BB) {
1341 if (MI.isPHI() || MI.isTerminator())
1342 continue;
1343 for (MachineOperand &MO : MI.uses()) {
1344 if (!MO.isReg() || MO.getReg().isPhysical() || MO.isImplicit())
1345 continue;
1346 Register Reg = remapUse(Reg: MO.getReg(), MI);
1347 MO.setReg(Reg);
1348 }
1349 }
1350 EliminateDeadPhis(MBB: BB, MRI, LIS);
1351
1352 // Ensure a phi exists for all instructions that are either referenced by
1353 // an illegal phi or by an instruction outside the loop. This allows us to
1354 // treat remaps of these values the same as "normal" values that come from
1355 // loop-carried phis.
1356 for (auto MI = BB->getFirstNonPHI(); MI != BB->end(); ++MI) {
1357 if (MI->isPHI()) {
1358 Register R = MI->getOperand(i: 0).getReg();
1359 phi(LoopReg: R);
1360 continue;
1361 }
1362
1363 for (MachineOperand &Def : MI->defs()) {
1364 for (MachineInstr &MI : MRI.use_instructions(Reg: Def.getReg())) {
1365 if (MI.getParent() != BB) {
1366 phi(LoopReg: Def.getReg());
1367 break;
1368 }
1369 }
1370 }
1371 }
1372}
1373
1374Register KernelRewriter::remapUse(Register Reg, MachineInstr &MI) {
1375 MachineInstr *Producer = MRI.getUniqueVRegDef(Reg);
1376 if (!Producer)
1377 return Reg;
1378
1379 int ConsumerStage = S.getStage(MI: &MI);
1380 if (!Producer->isPHI()) {
1381 // Non-phi producers are simple to remap. Insert as many phis as the
1382 // difference between the consumer and producer stages.
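    // For example, a stage-0 producer feeding a stage-2 consumer needs a
    // chain of two phis, so the consumer reads the value produced two
    // iterations earlier in the steady state.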
1383 if (Producer->getParent() != BB)
1384 // Producer was not inside the loop. Use the register as-is.
1385 return Reg;
1386 int ProducerStage = S.getStage(MI: Producer);
1387 assert(ConsumerStage != -1 &&
1388 "In-loop consumer should always be scheduled!");
1389 assert(ConsumerStage >= ProducerStage);
1390 unsigned StageDiff = ConsumerStage - ProducerStage;
1391
1392 for (unsigned I = 0; I < StageDiff; ++I)
1393 Reg = phi(LoopReg: Reg);
1394 return Reg;
1395 }
1396
1397 // First, dive through the phi chain to find the defaults for the generated
1398 // phis.
1399 SmallVector<std::optional<Register>, 4> Defaults;
1400 Register LoopReg = Reg;
1401 auto LoopProducer = Producer;
1402 while (LoopProducer->isPHI() && LoopProducer->getParent() == BB) {
1403 LoopReg = getLoopPhiReg(Phi&: *LoopProducer, LoopBB: BB);
1404 Defaults.emplace_back(Args: getInitPhiReg(Phi&: *LoopProducer, LoopBB: BB));
1405 LoopProducer = MRI.getUniqueVRegDef(Reg: LoopReg);
1406 assert(LoopProducer);
1407 }
1408 int LoopProducerStage = S.getStage(MI: LoopProducer);
1409
1410 std::optional<Register> IllegalPhiDefault;
1411
1412 if (LoopProducerStage == -1) {
1413 // Do nothing.
1414 } else if (LoopProducerStage > ConsumerStage) {
1415 // This schedule is only representable if ProducerStage == ConsumerStage+1.
1416 // In addition, Consumer's cycle must be scheduled after Producer in the
1417 // rescheduled loop. This is enforced by the pipeliner's ASAP and ALAP
1418 // functions.
1419#ifndef NDEBUG // Silence unused variables in non-asserts mode.
1420 int LoopProducerCycle = S.getCycle(LoopProducer);
1421 int ConsumerCycle = S.getCycle(&MI);
1422#endif
1423 assert(LoopProducerCycle <= ConsumerCycle);
1424 assert(LoopProducerStage == ConsumerStage + 1);
1425 // Peel off the first phi from Defaults and insert a phi between producer
1426 // and consumer. This phi will not be at the front of the block so we
1427 // consider it illegal. It will only exist during the rewrite process; it
1428 // needs to exist while we peel off prologs because these could take the
1429 // default value. After that we can replace all uses with the loop producer
1430 // value.
1431 IllegalPhiDefault = Defaults.front();
1432 Defaults.erase(CI: Defaults.begin());
1433 } else {
1434 assert(ConsumerStage >= LoopProducerStage);
1435 int StageDiff = ConsumerStage - LoopProducerStage;
1436 if (StageDiff > 0) {
1437 LLVM_DEBUG(dbgs() << " -- padding defaults array from " << Defaults.size()
1438 << " to " << (Defaults.size() + StageDiff) << "\n");
1439 // If we need more phis than we have defaults for, pad out the earliest
1440 // phis (at the end of the defaults chain, which is in reverse order) with
1441 // the last known default, or undef if there are no defaults at all.
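      // For example (illustrative): with Defaults == {d0} and StageDiff == 2 the
      // chain becomes {d0, d0, d0}; with no defaults at all it becomes two undefs.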
1442 Defaults.resize(N: Defaults.size() + StageDiff,
1443 NV: Defaults.empty() ? std::optional<Register>()
1444 : Defaults.back());
1445 }
1446 }
1447
1448 // Now we know the number of stages to jump back, insert the phi chain.
1449 auto DefaultI = Defaults.rbegin();
1450 while (DefaultI != Defaults.rend())
1451 LoopReg = phi(LoopReg, InitReg: *DefaultI++, RC: MRI.getRegClass(Reg));
1452
1453 if (IllegalPhiDefault) {
1454 // The consumer optionally consumes LoopProducer in the same iteration
1455 // (because the producer is scheduled at an earlier cycle than the consumer)
1456 // or the initial value. To facilitate this we create an illegal phi here
1457 // by embedding it in the middle of the block. We will fix this up
1458 // immediately prior to pruning.
1459 auto RC = MRI.getRegClass(Reg);
1460 Register R = MRI.createVirtualRegister(RegClass: RC);
1461 MachineInstr *IllegalPhi =
1462 BuildMI(BB&: *BB, I&: MI, MIMD: DebugLoc(), MCID: TII->get(Opcode: TargetOpcode::PHI), DestReg: R)
1463 .addReg(RegNo: *IllegalPhiDefault)
1464 .addMBB(MBB: PreheaderBB) // Block choice is arbitrary and has no effect.
1465 .addReg(RegNo: LoopReg)
1466 .addMBB(MBB: BB); // Block choice is arbitrary and has no effect.
1467 // Illegal phi should belong to the producer stage so that it can be
1468 // filtered correctly during peeling.
1469 S.setStage(MI: IllegalPhi, MIStage: LoopProducerStage);
1470 return R;
1471 }
1472
1473 return LoopReg;
1474}
1475
1476Register KernelRewriter::phi(Register LoopReg, std::optional<Register> InitReg,
1477 const TargetRegisterClass *RC) {
1478 // If the init register is not undef, try and find an existing phi.
1479 if (InitReg) {
1480 auto I = Phis.find(Val: {LoopReg, *InitReg});
1481 if (I != Phis.end())
1482 return I->second;
1483 } else {
1484 for (auto &KV : Phis) {
1485 if (KV.first.first == LoopReg)
1486 return KV.second;
1487 }
1488 }
1489
1490 // InitReg is either undef or no existing phi takes InitReg as input. Try and
1491 // find a phi that takes undef as input.
1492 auto I = UndefPhis.find(Val: LoopReg);
1493 if (I != UndefPhis.end()) {
1494 Register R = I->second;
1495 if (!InitReg)
1496 // Found a phi taking undef as input, and this input is undef so return
1497 // without any more changes.
1498 return R;
1499 // Found a phi taking undef as input, so rewrite it to take InitReg.
1500 MachineInstr *MI = MRI.getVRegDef(Reg: R);
1501 MI->getOperand(i: 1).setReg(*InitReg);
1502 Phis.insert(KV: {{LoopReg, *InitReg}, R});
1503 const TargetRegisterClass *ConstrainRegClass =
1504 MRI.constrainRegClass(Reg: R, RC: MRI.getRegClass(Reg: *InitReg));
1505 assert(ConstrainRegClass && "Expected a valid constrained register class!");
1506 (void)ConstrainRegClass;
1507 UndefPhis.erase(I);
1508 return R;
1509 }
1510
1511 // Failed to find any existing phi to reuse, so create a new one.
1512 if (!RC)
1513 RC = MRI.getRegClass(Reg: LoopReg);
1514 Register R = MRI.createVirtualRegister(RegClass: RC);
1515 if (InitReg) {
1516 const TargetRegisterClass *ConstrainRegClass =
1517 MRI.constrainRegClass(Reg: R, RC: MRI.getRegClass(Reg: *InitReg));
1518 assert(ConstrainRegClass && "Expected a valid constrained register class!");
1519 (void)ConstrainRegClass;
1520 }
1521 BuildMI(BB&: *BB, I: BB->getFirstNonPHI(), MIMD: DebugLoc(), MCID: TII->get(Opcode: TargetOpcode::PHI), DestReg: R)
1522 .addReg(RegNo: InitReg ? *InitReg : undef(RC))
1523 .addMBB(MBB: PreheaderBB)
1524 .addReg(RegNo: LoopReg)
1525 .addMBB(MBB: BB);
1526 if (!InitReg)
1527 UndefPhis[LoopReg] = R;
1528 else
1529 Phis[{LoopReg, *InitReg}] = R;
1530 return R;
1531}
1532
1533Register KernelRewriter::undef(const TargetRegisterClass *RC) {
1534 Register &R = Undefs[RC];
1535 if (R == 0) {
1536 // Create an IMPLICIT_DEF that defines this register if we need it.
1537 // All uses of this should be removed by the time we have finished unrolling
1538 // prologs and epilogs.
1539 R = MRI.createVirtualRegister(RegClass: RC);
1540 auto *InsertBB = &PreheaderBB->getParent()->front();
1541 BuildMI(BB&: *InsertBB, I: InsertBB->getFirstTerminator(), MIMD: DebugLoc(),
1542 MCID: TII->get(Opcode: TargetOpcode::IMPLICIT_DEF), DestReg: R);
1543 }
1544 return R;
1545}
1546
1547namespace {
1548/// Describes an operand in the kernel of a pipelined loop. Characteristics of
1549/// the operand are discovered, such as how many in-loop PHIs it has to jump
1550/// through and defaults for these phis.
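/// For example (illustrative): a use of %a, where %a = PHI %i0, <pre>, %b, <loop>
/// and %b = PHI %i1, <pre>, %c, <loop>, and %c is defined by a non-phi instruction
/// in the loop, has a distance of two and PhiDefaults == {%i0, %i1}.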
1551class KernelOperandInfo {
1552 MachineBasicBlock *BB;
1553 MachineRegisterInfo &MRI;
1554 SmallVector<Register, 4> PhiDefaults;
1555 MachineOperand *Source;
1556 MachineOperand *Target;
1557
1558public:
1559 KernelOperandInfo(MachineOperand *MO, MachineRegisterInfo &MRI,
1560 const SmallPtrSetImpl<MachineInstr *> &IllegalPhis)
1561 : MRI(MRI) {
1562 Source = MO;
1563 BB = MO->getParent()->getParent();
1564 while (isRegInLoop(MO)) {
1565 MachineInstr *MI = MRI.getVRegDef(Reg: MO->getReg());
1566 if (MI->isFullCopy()) {
1567 MO = &MI->getOperand(i: 1);
1568 continue;
1569 }
1570 if (!MI->isPHI())
1571 break;
1572 // If this is an illegal phi, don't count it in distance.
1573 if (IllegalPhis.count(Ptr: MI)) {
1574 MO = &MI->getOperand(i: 3);
1575 continue;
1576 }
1577
1578 Register Default = getInitPhiReg(Phi&: *MI, LoopBB: BB);
1579 MO = MI->getOperand(i: 2).getMBB() == BB ? &MI->getOperand(i: 1)
1580 : &MI->getOperand(i: 3);
1581 PhiDefaults.push_back(Elt: Default);
1582 }
1583 Target = MO;
1584 }
1585
1586 bool operator==(const KernelOperandInfo &Other) const {
1587 return PhiDefaults.size() == Other.PhiDefaults.size();
1588 }
1589
1590 void print(raw_ostream &OS) const {
1591 OS << "use of " << *Source << ": distance(" << PhiDefaults.size() << ") in "
1592 << *Source->getParent();
1593 }
1594
1595private:
1596 bool isRegInLoop(MachineOperand *MO) {
1597 return MO->isReg() && MO->getReg().isVirtual() &&
1598 MRI.getVRegDef(Reg: MO->getReg())->getParent() == BB;
1599 }
1600};
1601} // namespace
1602
1603MachineBasicBlock *
1604PeelingModuloScheduleExpander::peelKernel(LoopPeelDirection LPD) {
1605 MachineBasicBlock *NewBB = PeelSingleBlockLoop(Direction: LPD, Loop: BB, MRI, TII);
1606 if (LPD == LPD_Front)
1607 PeeledFront.push_back(x: NewBB);
1608 else
1609 PeeledBack.push_front(x: NewBB);
1610 for (auto I = BB->begin(), NI = NewBB->begin(); !I->isTerminator();
1611 ++I, ++NI) {
1612 CanonicalMIs[&*I] = &*I;
1613 CanonicalMIs[&*NI] = &*I;
1614 BlockMIs[{NewBB, &*I}] = &*NI;
1615 BlockMIs[{BB, &*I}] = &*I;
1616 }
1617 return NewBB;
1618}
1619
1620void PeelingModuloScheduleExpander::filterInstructions(MachineBasicBlock *MB,
1621 int MinStage) {
1622 for (auto I = MB->getFirstInstrTerminator()->getReverseIterator();
1623 I != std::next(x: MB->getFirstNonPHI()->getReverseIterator());) {
1624 MachineInstr *MI = &*I++;
1625 int Stage = getStage(MI);
1626 if (Stage == -1 || Stage >= MinStage)
1627 continue;
1628
1629 for (MachineOperand &DefMO : MI->defs()) {
1630 SmallVector<std::pair<MachineInstr *, Register>, 4> Subs;
1631 for (MachineInstr &UseMI : MRI.use_instructions(Reg: DefMO.getReg())) {
1632 // Only PHIs can use values from this block by construction.
1633 // Match with the equivalent PHI in B.
1634 assert(UseMI.isPHI());
1635 Register Reg = getEquivalentRegisterIn(Reg: UseMI.getOperand(i: 0).getReg(),
1636 BB: MI->getParent());
1637 Subs.emplace_back(Args: &UseMI, Args&: Reg);
1638 }
1639 for (auto &Sub : Subs)
1640 Sub.first->substituteRegister(FromReg: DefMO.getReg(), ToReg: Sub.second, /*SubIdx=*/0,
1641 RegInfo: *MRI.getTargetRegisterInfo());
1642 }
1643 if (LIS)
1644 LIS->RemoveMachineInstrFromMaps(MI&: *MI);
1645 MI->eraseFromParent();
1646 }
1647}
1648
1649void PeelingModuloScheduleExpander::moveStageBetweenBlocks(
1650 MachineBasicBlock *DestBB, MachineBasicBlock *SourceBB, unsigned Stage) {
1651 auto InsertPt = DestBB->getFirstNonPHI();
1652 DenseMap<Register, Register> Remaps;
1653 for (MachineInstr &MI : llvm::make_early_inc_range(
1654 Range: llvm::make_range(x: SourceBB->getFirstNonPHI(), y: SourceBB->end()))) {
1655 if (MI.isPHI()) {
1656 // This is an illegal PHI. If we move any instructions using an illegal
1657 // PHI, we need to create a legal Phi.
1658 if (getStage(MI: &MI) != Stage) {
1659 // The legal Phi is not necessary if the illegal phi's stage
1660 // is being moved.
1661 Register PhiR = MI.getOperand(i: 0).getReg();
1662 auto RC = MRI.getRegClass(Reg: PhiR);
1663 Register NR = MRI.createVirtualRegister(RegClass: RC);
1664 MachineInstr *NI = BuildMI(BB&: *DestBB, I: DestBB->getFirstNonPHI(),
1665 MIMD: DebugLoc(), MCID: TII->get(Opcode: TargetOpcode::PHI), DestReg: NR)
1666 .addReg(RegNo: PhiR)
1667 .addMBB(MBB: SourceBB);
1668 BlockMIs[{DestBB, CanonicalMIs[&MI]}] = NI;
1669 CanonicalMIs[NI] = CanonicalMIs[&MI];
1670 Remaps[PhiR] = NR;
1671 }
1672 }
1673 if (getStage(MI: &MI) != Stage)
1674 continue;
1675 MI.removeFromParent();
1676 DestBB->insert(I: InsertPt, MI: &MI);
1677 auto *KernelMI = CanonicalMIs[&MI];
1678 BlockMIs[{DestBB, KernelMI}] = &MI;
1679 BlockMIs.erase(Val: {SourceBB, KernelMI});
1680 }
1681 SmallVector<MachineInstr *, 4> PhiToDelete;
1682 for (MachineInstr &MI : DestBB->phis()) {
1683 assert(MI.getNumOperands() == 3);
1684 MachineInstr *Def = MRI.getVRegDef(Reg: MI.getOperand(i: 1).getReg());
1685 // If the instruction referenced by the phi is moved inside the block
1686 // we don't need the phi anymore.
1687 if (getStage(MI: Def) == Stage) {
1688 Register PhiReg = MI.getOperand(i: 0).getReg();
1689 assert(Def->findRegisterDefOperandIdx(MI.getOperand(1).getReg(),
1690 /*TRI=*/nullptr) != -1);
1691 MRI.replaceRegWith(FromReg: MI.getOperand(i: 0).getReg(), ToReg: MI.getOperand(i: 1).getReg());
1692 MI.getOperand(i: 0).setReg(PhiReg);
1693 PhiToDelete.push_back(Elt: &MI);
1694 }
1695 }
1696 for (auto *P : PhiToDelete)
1697 P->eraseFromParent();
1698 InsertPt = DestBB->getFirstNonPHI();
1699 // Helper to clone Phi instructions into the destination block. We clone Phi
1700 // greedily to avoid combinatorial explosion of Phi instructions.
1701 auto clonePhi = [&](MachineInstr *Phi) {
1702 MachineInstr *NewMI = MF.CloneMachineInstr(Orig: Phi);
1703 DestBB->insert(I: InsertPt, MI: NewMI);
1704 Register OrigR = Phi->getOperand(i: 0).getReg();
1705 Register R = MRI.createVirtualRegister(RegClass: MRI.getRegClass(Reg: OrigR));
1706 NewMI->getOperand(i: 0).setReg(R);
1707 NewMI->getOperand(i: 1).setReg(OrigR);
1708 NewMI->getOperand(i: 2).setMBB(*DestBB->pred_begin());
1709 Remaps[OrigR] = R;
1710 CanonicalMIs[NewMI] = CanonicalMIs[Phi];
1711 BlockMIs[{DestBB, CanonicalMIs[Phi]}] = NewMI;
1712 PhiNodeLoopIteration[NewMI] = PhiNodeLoopIteration[Phi];
1713 return R;
1714 };
1715 for (auto I = DestBB->getFirstNonPHI(); I != DestBB->end(); ++I) {
1716 for (MachineOperand &MO : I->uses()) {
1717 if (!MO.isReg())
1718 continue;
1719 if (auto It = Remaps.find(Val: MO.getReg()); It != Remaps.end())
1720 MO.setReg(It->second);
1721 else {
1722 // If we are using a phi from the source block we need to add a new phi
1723 // pointing to the old one.
1724 MachineInstr *Use = MRI.getUniqueVRegDef(Reg: MO.getReg());
1725 if (Use && Use->isPHI() && Use->getParent() == SourceBB) {
1726 Register R = clonePhi(Use);
1727 MO.setReg(R);
1728 }
1729 }
1730 }
1731 }
1732}
1733
1734Register
1735PeelingModuloScheduleExpander::getPhiCanonicalReg(MachineInstr *CanonicalPhi,
1736 MachineInstr *Phi) {
1737 unsigned distance = PhiNodeLoopIteration[Phi];
1738 MachineInstr *CanonicalUse = CanonicalPhi;
1739 Register CanonicalUseReg = CanonicalUse->getOperand(i: 0).getReg();
1740 for (unsigned I = 0; I < distance; ++I) {
1741 assert(CanonicalUse->isPHI());
1742 assert(CanonicalUse->getNumOperands() == 5);
1743 unsigned LoopRegIdx = 3, InitRegIdx = 1;
1744 if (CanonicalUse->getOperand(i: 2).getMBB() == CanonicalUse->getParent())
1745 std::swap(a&: LoopRegIdx, b&: InitRegIdx);
1746 CanonicalUseReg = CanonicalUse->getOperand(i: LoopRegIdx).getReg();
1747 CanonicalUse = MRI.getVRegDef(Reg: CanonicalUseReg);
1748 }
1749 return CanonicalUseReg;
1750}
1751
1752void PeelingModuloScheduleExpander::peelPrologAndEpilogs() {
1753 BitVector LS(Schedule.getNumStages(), true);
1754 BitVector AS(Schedule.getNumStages(), true);
1755 LiveStages[BB] = LS;
1756 AvailableStages[BB] = AS;
1757
1758 // Peel out the prologs.
1759 LS.reset();
1760 for (int I = 0; I < Schedule.getNumStages() - 1; ++I) {
1761 LS[I] = true;
1762 Prologs.push_back(Elt: peelKernel(LPD: LPD_Front));
1763 LiveStages[Prologs.back()] = LS;
1764 AvailableStages[Prologs.back()] = LS;
1765 }
1766
1767 // Create a block that will end up as the new loop exiting block (dominated by
1768 // all prologs and epilogs). It will only contain PHIs, in the same order as
1769 // BB's PHIs. This gives us a poor-man's LCSSA with the inductive property
1770 // that the exiting block is a (sub) clone of BB. This in turn gives us the
1771 // property that any value defined in BB but used outside of BB is used by a
1772 // PHI in the exiting block.
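  // For example (illustrative): if %v is the loop-carried value of a phi in BB
  // and is also used after the loop, ExitingBB gets a single-input phi
  // %v.e = PHI %v, BB, and uses of %v outside of BB are rewritten to %v.e.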
1773 MachineBasicBlock *ExitingBB = CreateLCSSAExitingBlock();
1774 EliminateDeadPhis(MBB: ExitingBB, MRI, LIS, /*KeepSingleSrcPhi=*/true);
1775 // Push out the epilogs, again in reverse order.
1776 // We can't assume anything about the minimum loop trip count at this point,
1777 // so emit a fairly complex epilog.
1778
1779 // We first peel number of stages minus one epilogue. Then we remove dead
1780 // stages and reorder instructions based on their stage. If we have 3 stages
1781 // we generate first:
1782 // E0[3, 2, 1]
1783 // E1[3', 2']
1784 // E2[3'']
1785 // And then we move instructions based on their stages to have:
1786 // E0[3]
1787 // E1[2, 3']
1788 // E2[1, 2', 3'']
1789 // The transformation is legal because we only move instructions past
1790 // instructions of a previous loop iteration.
1791 for (int I = 1; I <= Schedule.getNumStages() - 1; ++I) {
1792 Epilogs.push_back(Elt: peelKernel(LPD: LPD_Back));
1793 MachineBasicBlock *B = Epilogs.back();
1794 filterInstructions(MB: B, MinStage: Schedule.getNumStages() - I);
1795 // Keep track at which iteration each phi belongs to. We need it to know
1796 // what version of the variable to use during prologue/epilogue stitching.
1797 EliminateDeadPhis(MBB: B, MRI, LIS, /*KeepSingleSrcPhi=*/true);
1798 for (MachineInstr &Phi : B->phis())
1799 PhiNodeLoopIteration[&Phi] = Schedule.getNumStages() - I;
1800 }
1801 for (size_t I = 0; I < Epilogs.size(); I++) {
1802 LS.reset();
1803 for (size_t J = I; J < Epilogs.size(); J++) {
1804 int Iteration = J;
1805 unsigned Stage = Schedule.getNumStages() - 1 + I - J;
1806 // Move stage one block at a time so that Phi nodes are updated correctly.
1807 for (size_t K = Iteration; K > I; K--)
1808 moveStageBetweenBlocks(DestBB: Epilogs[K - 1], SourceBB: Epilogs[K], Stage);
1809 LS[Stage] = true;
1810 }
1811 LiveStages[Epilogs[I]] = LS;
1812 AvailableStages[Epilogs[I]] = AS;
1813 }
1814
1815 // Now we've defined all the prolog and epilog blocks as a fallthrough
1816 // sequence, add the edges that will be followed if the loop trip count is
1817 // lower than the number of stages (connecting prologs directly with epilogs).
1818 auto PI = Prologs.begin();
1819 auto EI = Epilogs.begin();
1820 assert(Prologs.size() == Epilogs.size());
1821 for (; PI != Prologs.end(); ++PI, ++EI) {
1822 MachineBasicBlock *Pred = *(*EI)->pred_begin();
1823 (*PI)->addSuccessor(Succ: *EI);
1824 for (MachineInstr &MI : (*EI)->phis()) {
1825 Register Reg = MI.getOperand(i: 1).getReg();
1826 MachineInstr *Use = MRI.getUniqueVRegDef(Reg);
1827 if (Use && Use->getParent() == Pred) {
1828 MachineInstr *CanonicalUse = CanonicalMIs[Use];
1829 if (CanonicalUse->isPHI()) {
1830 // If the use comes from a phi we need to skip as many phis as the
1831 // distance between the epilogue and the kernel. Trace through the phi
1832 // chain to find the right value.
1833 Reg = getPhiCanonicalReg(CanonicalPhi: CanonicalUse, Phi: Use);
1834 }
1835 Reg = getEquivalentRegisterIn(Reg, BB: *PI);
1836 }
1837 MI.addOperand(Op: MachineOperand::CreateReg(Reg, /*isDef=*/false));
1838 MI.addOperand(Op: MachineOperand::CreateMBB(MBB: *PI));
1839 }
1840 }
1841
1842 // Create a list of all blocks in order.
1843 SmallVector<MachineBasicBlock *, 8> Blocks;
1844 llvm::append_range(C&: Blocks, R&: PeeledFront);
1845 Blocks.push_back(Elt: BB);
1846 llvm::append_range(C&: Blocks, R&: PeeledBack);
1847
1848 // Iterate in reverse order over all instructions, remapping as we go.
1849 for (MachineBasicBlock *B : reverse(C&: Blocks)) {
1850 for (auto I = B->instr_rbegin();
1851 I != std::next(x: B->getFirstNonPHI()->getReverseIterator());) {
1852 MachineBasicBlock::reverse_instr_iterator MI = I++;
1853 rewriteUsesOf(MI: &*MI);
1854 }
1855 }
1856 for (auto *MI : IllegalPhisToDelete) {
1857 if (LIS)
1858 LIS->RemoveMachineInstrFromMaps(MI&: *MI);
1859 MI->eraseFromParent();
1860 }
1861 IllegalPhisToDelete.clear();
1862
1863 // Now all remapping has been done, we're free to optimize the generated code.
1864 for (MachineBasicBlock *B : reverse(C&: Blocks))
1865 EliminateDeadPhis(MBB: B, MRI, LIS);
1866 EliminateDeadPhis(MBB: ExitingBB, MRI, LIS);
1867}
1868
1869MachineBasicBlock *PeelingModuloScheduleExpander::CreateLCSSAExitingBlock() {
1870 MachineFunction &MF = *BB->getParent();
1871 MachineBasicBlock *Exit = *BB->succ_begin();
1872 if (Exit == BB)
1873 Exit = *std::next(x: BB->succ_begin());
1874
1875 MachineBasicBlock *NewBB = MF.CreateMachineBasicBlock(BB: BB->getBasicBlock());
1876 MF.insert(MBBI: std::next(x: BB->getIterator()), MBB: NewBB);
1877
1878 // Clone all phis in BB into NewBB and rewrite.
1879 for (MachineInstr &MI : BB->phis()) {
1880 auto RC = MRI.getRegClass(Reg: MI.getOperand(i: 0).getReg());
1881 Register OldR = MI.getOperand(i: 3).getReg();
1882 Register R = MRI.createVirtualRegister(RegClass: RC);
1883 SmallVector<MachineInstr *, 4> Uses;
1884 for (MachineInstr &Use : MRI.use_instructions(Reg: OldR))
1885 if (Use.getParent() != BB)
1886 Uses.push_back(Elt: &Use);
1887 for (MachineInstr *Use : Uses)
1888 Use->substituteRegister(FromReg: OldR, ToReg: R, /*SubIdx=*/0,
1889 RegInfo: *MRI.getTargetRegisterInfo());
1890 MachineInstr *NI = BuildMI(BB: NewBB, MIMD: DebugLoc(), MCID: TII->get(Opcode: TargetOpcode::PHI), DestReg: R)
1891 .addReg(RegNo: OldR)
1892 .addMBB(MBB: BB);
1893 BlockMIs[{NewBB, &MI}] = NI;
1894 CanonicalMIs[NI] = &MI;
1895 }
1896 BB->replaceSuccessor(Old: Exit, New: NewBB);
1897 Exit->replacePhiUsesWith(Old: BB, New: NewBB);
1898 NewBB->addSuccessor(Succ: Exit);
1899
1900 MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
1901 SmallVector<MachineOperand, 4> Cond;
1902 bool CanAnalyzeBr = !TII->analyzeBranch(MBB&: *BB, TBB, FBB, Cond);
1903 (void)CanAnalyzeBr;
1904 assert(CanAnalyzeBr && "Must be able to analyze the loop branch!");
1905 TII->removeBranch(MBB&: *BB);
1906 TII->insertBranch(MBB&: *BB, TBB: TBB == Exit ? NewBB : TBB, FBB: FBB == Exit ? NewBB : FBB,
1907 Cond, DL: DebugLoc());
1908 TII->insertUnconditionalBranch(MBB&: *NewBB, DestBB: Exit, DL: DebugLoc());
1909 return NewBB;
1910}
1911
1912Register
1913PeelingModuloScheduleExpander::getEquivalentRegisterIn(Register Reg,
1914 MachineBasicBlock *BB) {
1915 MachineInstr *MI = MRI.getUniqueVRegDef(Reg);
1916 unsigned OpIdx = MI->findRegisterDefOperandIdx(Reg, /*TRI=*/nullptr);
1917 return BlockMIs[{BB, CanonicalMIs[MI]}]->getOperand(i: OpIdx).getReg();
1918}
1919
1920void PeelingModuloScheduleExpander::rewriteUsesOf(MachineInstr *MI) {
1921 if (MI->isPHI()) {
1922 // This is an illegal PHI. The loop-carried (desired) value is operand 3,
1923 // and it is produced by this block.
1924 Register PhiR = MI->getOperand(i: 0).getReg();
1925 Register R = MI->getOperand(i: 3).getReg();
1926 int RMIStage = getStage(MI: MRI.getUniqueVRegDef(Reg: R));
1927 if (RMIStage != -1 && !AvailableStages[MI->getParent()].test(Idx: RMIStage))
1928 R = MI->getOperand(i: 1).getReg();
1929 MRI.setRegClass(Reg: R, RC: MRI.getRegClass(Reg: PhiR));
1930 MRI.replaceRegWith(FromReg: PhiR, ToReg: R);
1931 // Postpone deleting the Phi as it may be referenced by BlockMIs and used
1932 // later to figure out how to remap registers.
1933 MI->getOperand(i: 0).setReg(PhiR);
1934 IllegalPhisToDelete.push_back(Elt: MI);
1935 return;
1936 }
1937
1938 int Stage = getStage(MI);
1939 if (Stage == -1 || LiveStages.count(Val: MI->getParent()) == 0 ||
1940 LiveStages[MI->getParent()].test(Idx: Stage))
1941 // Instruction is live, no rewriting to do.
1942 return;
1943
1944 for (MachineOperand &DefMO : MI->defs()) {
1945 SmallVector<std::pair<MachineInstr *, Register>, 4> Subs;
1946 for (MachineInstr &UseMI : MRI.use_instructions(Reg: DefMO.getReg())) {
1947 // Only PHIs can use values from this block by construction.
1948 // Match with the equivalent PHI in B.
1949 assert(UseMI.isPHI());
1950 Register Reg = getEquivalentRegisterIn(Reg: UseMI.getOperand(i: 0).getReg(),
1951 BB: MI->getParent());
1952 Subs.emplace_back(Args: &UseMI, Args&: Reg);
1953 }
1954 for (auto &Sub : Subs)
1955 Sub.first->substituteRegister(FromReg: DefMO.getReg(), ToReg: Sub.second, /*SubIdx=*/0,
1956 RegInfo: *MRI.getTargetRegisterInfo());
1957 }
1958 if (LIS)
1959 LIS->RemoveMachineInstrFromMaps(MI&: *MI);
1960 MI->eraseFromParent();
1961}
1962
1963void PeelingModuloScheduleExpander::fixupBranches() {
1964 // Work outwards from the kernel.
1965 bool KernelDisposed = false;
1966 int TC = Schedule.getNumStages() - 1;
1967 for (auto PI = Prologs.rbegin(), EI = Epilogs.rbegin(); PI != Prologs.rend();
1968 ++PI, ++EI, --TC) {
1969 MachineBasicBlock *Prolog = *PI;
1970 MachineBasicBlock *Fallthrough = *Prolog->succ_begin();
1971 MachineBasicBlock *Epilog = *EI;
1972 SmallVector<MachineOperand, 4> Cond;
1973 TII->removeBranch(MBB&: *Prolog);
1974 std::optional<bool> StaticallyGreater =
1975 LoopInfo->createTripCountGreaterCondition(TC, MBB&: *Prolog, Cond);
1976 if (!StaticallyGreater) {
1977 LLVM_DEBUG(dbgs() << "Dynamic: TC > " << TC << "\n");
1978 // Dynamically branch based on Cond.
1979 TII->insertBranch(MBB&: *Prolog, TBB: Epilog, FBB: Fallthrough, Cond, DL: DebugLoc());
1980 } else if (*StaticallyGreater == false) {
1981 LLVM_DEBUG(dbgs() << "Static-false: TC > " << TC << "\n");
1982 // Prolog never falls through; branch to epilog and orphan interior
1983 // blocks. Leave it to unreachable-block-elim to clean up.
1984 Prolog->removeSuccessor(Succ: Fallthrough);
1985 for (MachineInstr &P : Fallthrough->phis()) {
1986 P.removeOperand(OpNo: 2);
1987 P.removeOperand(OpNo: 1);
1988 }
1989 TII->insertUnconditionalBranch(MBB&: *Prolog, DestBB: Epilog, DL: DebugLoc());
1990 KernelDisposed = true;
1991 } else {
1992 LLVM_DEBUG(dbgs() << "Static-true: TC > " << TC << "\n");
1993 // Prolog always falls through; remove incoming values in epilog.
1994 Prolog->removeSuccessor(Succ: Epilog);
1995 for (MachineInstr &P : Epilog->phis()) {
1996 P.removeOperand(OpNo: 4);
1997 P.removeOperand(OpNo: 3);
1998 }
1999 }
2000 }
2001
2002 if (!KernelDisposed) {
2003 LoopInfo->adjustTripCount(TripCountAdjust: -(Schedule.getNumStages() - 1));
2004 LoopInfo->setPreheader(Prologs.back());
2005 } else {
2006 LoopInfo->disposed();
2007 }
2008}
2009
2010void PeelingModuloScheduleExpander::rewriteKernel() {
2011 KernelRewriter KR(*Schedule.getLoop(), Schedule, BB);
2012 KR.rewrite();
2013}
2014
2015void PeelingModuloScheduleExpander::expand() {
2016 BB = Schedule.getLoop()->getTopBlock();
2017 Preheader = Schedule.getLoop()->getLoopPreheader();
2018 LLVM_DEBUG(Schedule.dump());
2019 LoopInfo = TII->analyzeLoopForPipelining(LoopBB: BB);
2020 assert(LoopInfo);
2021
2022 rewriteKernel();
2023 peelPrologAndEpilogs();
2024 fixupBranches();
2025}
2026
2027void PeelingModuloScheduleExpander::validateAgainstModuloScheduleExpander() {
2028 BB = Schedule.getLoop()->getTopBlock();
2029 Preheader = Schedule.getLoop()->getLoopPreheader();
2030
2031 // Dump the schedule before we invalidate and remap all its instructions.
2032 // Stash it in a string so we can print it if we found an error.
2033 std::string ScheduleDump;
2034 raw_string_ostream OS(ScheduleDump);
2035 Schedule.print(OS);
2036
2037 // First, run the normal ModuloScheduleExpander. We don't support any
2038 // InstrChanges.
2039 assert(LIS && "Requires LiveIntervals!");
2040 ModuloScheduleExpander MSE(MF, Schedule, *LIS,
2041 ModuloScheduleExpander::InstrChangesTy());
2042 MSE.expand();
2043 MachineBasicBlock *ExpandedKernel = MSE.getRewrittenKernel();
2044 if (!ExpandedKernel) {
2045 // The expander optimized away the kernel. We can't do any useful checking.
2046 MSE.cleanup();
2047 return;
2048 }
2049 // Before running the KernelRewriter, re-add BB into the CFG.
2050 Preheader->addSuccessor(Succ: BB);
2051
2052 // Now run the new expansion algorithm.
2053 KernelRewriter KR(*Schedule.getLoop(), Schedule, BB);
2054 KR.rewrite();
2055 peelPrologAndEpilogs();
2056
2057 // Collect all illegal phis that the new algorithm created. We'll give these
2058 // to KernelOperandInfo.
2059 SmallPtrSet<MachineInstr *, 4> IllegalPhis;
2060 for (auto NI = BB->getFirstNonPHI(); NI != BB->end(); ++NI) {
2061 if (NI->isPHI())
2062 IllegalPhis.insert(Ptr: &*NI);
2063 }
2064
2065 // Co-iterate across both kernels. We expect them to be identical apart from
2066 // phis and full COPYs (we look through both).
2067 SmallVector<std::pair<KernelOperandInfo, KernelOperandInfo>, 8> KOIs;
2068 auto OI = ExpandedKernel->begin();
2069 auto NI = BB->begin();
2070 for (; !OI->isTerminator() && !NI->isTerminator(); ++OI, ++NI) {
2071 while (OI->isPHI() || OI->isFullCopy())
2072 ++OI;
2073 while (NI->isPHI() || NI->isFullCopy())
2074 ++NI;
2075 assert(OI->getOpcode() == NI->getOpcode() && "Opcodes don't match?!");
2076 // Analyze every operand separately.
2077 for (auto OOpI = OI->operands_begin(), NOpI = NI->operands_begin();
2078 OOpI != OI->operands_end(); ++OOpI, ++NOpI)
2079 KOIs.emplace_back(Args: KernelOperandInfo(&*OOpI, MRI, IllegalPhis),
2080 Args: KernelOperandInfo(&*NOpI, MRI, IllegalPhis));
2081 }
2082
2083 bool Failed = false;
2084 for (auto &OldAndNew : KOIs) {
2085 if (OldAndNew.first == OldAndNew.second)
2086 continue;
2087 Failed = true;
2088 errs() << "Modulo kernel validation error: [\n";
2089 errs() << " [golden] ";
2090 OldAndNew.first.print(OS&: errs());
2091 errs() << " ";
2092 OldAndNew.second.print(OS&: errs());
2093 errs() << "]\n";
2094 }
2095
2096 if (Failed) {
2097 errs() << "Golden reference kernel:\n";
2098 ExpandedKernel->print(OS&: errs());
2099 errs() << "New kernel:\n";
2100 BB->print(OS&: errs());
2101 errs() << ScheduleDump;
2102 report_fatal_error(
2103 reason: "Modulo kernel validation (-pipeliner-experimental-cg) failed");
2104 }
2105
2106 // Cleanup by removing BB from the CFG again as the original
2107 // ModuloScheduleExpander intended.
2108 Preheader->removeSuccessor(Succ: BB);
2109 MSE.cleanup();
2110}
2111
2112MachineInstr *ModuloScheduleExpanderMVE::cloneInstr(MachineInstr *OldMI) {
2113 MachineInstr *NewMI = MF.CloneMachineInstr(Orig: OldMI);
2114
2115 // TODO: Offset information needs to be corrected.
2116 NewMI->dropMemRefs(MF);
2117
2118 return NewMI;
2119}
2120
2121/// Create a dedicated exit for Loop. Exit is the original exit for Loop.
2122/// If it is already a dedicated exit, return it. Otherwise, insert a new
2123/// block between them and return the new block.
2124static MachineBasicBlock *createDedicatedExit(MachineBasicBlock *Loop,
2125 MachineBasicBlock *Exit,
2126 LiveIntervals &LIS) {
2127 if (Exit->pred_size() == 1)
2128 return Exit;
2129
2130 MachineFunction *MF = Loop->getParent();
2131 const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
2132
2133 MachineBasicBlock *NewExit =
2134 MF->CreateMachineBasicBlock(BB: Loop->getBasicBlock());
2135 MF->insert(MBBI: Loop->getIterator(), MBB: NewExit);
2136 LIS.insertMBBInMaps(MBB: NewExit);
2137
2138 MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
2139 SmallVector<MachineOperand, 4> Cond;
2140 TII->analyzeBranch(MBB&: *Loop, TBB, FBB, Cond);
2141 if (TBB == Loop)
2142 FBB = NewExit;
2143 else if (FBB == Loop)
2144 TBB = NewExit;
2145 else
2146 llvm_unreachable("unexpected loop structure");
2147 TII->removeBranch(MBB&: *Loop);
2148 TII->insertBranch(MBB&: *Loop, TBB, FBB, Cond, DL: DebugLoc());
2149 Loop->replaceSuccessor(Old: Exit, New: NewExit);
2150 TII->insertUnconditionalBranch(MBB&: *NewExit, DestBB: Exit, DL: DebugLoc());
2151 NewExit->addSuccessor(Succ: Exit);
2152
2153 Exit->replacePhiUsesWith(Old: Loop, New: NewExit);
2154
2155 return NewExit;
2156}
2157
2158/// Insert branch code into the end of MBB. It branches to GreaterThan if the
2159/// remaining trip count for instructions in LastStage0Insts is greater than
2160/// RequiredTC, and to Otherwise if it is not.
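/// As used below, the kernel block passes RequiredTC = NumUnroll - 1 so that it
/// keeps looping while at least one more full unrolled iteration remains, and
/// the epilog passes RequiredTC = 0 to decide whether the original loop must
/// execute the remaining iterations.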
2161void ModuloScheduleExpanderMVE::insertCondBranch(MachineBasicBlock &MBB,
2162 int RequiredTC,
2163 InstrMapTy &LastStage0Insts,
2164 MachineBasicBlock &GreaterThan,
2165 MachineBasicBlock &Otherwise) {
2166 SmallVector<MachineOperand, 4> Cond;
2167 LoopInfo->createRemainingIterationsGreaterCondition(TC: RequiredTC, MBB, Cond,
2168 LastStage0Insts);
2169
2170 if (SwapBranchTargetsMVE) {
2171 // Set SwapBranchTargetsMVE to true if a target prefers to swap TBB and
2172 // FBB for optimal performance.
2173 if (TII->reverseBranchCondition(Cond))
2174 llvm_unreachable("can not reverse branch condition");
2175 TII->insertBranch(MBB, TBB: &Otherwise, FBB: &GreaterThan, Cond, DL: DebugLoc());
2176 } else {
2177 TII->insertBranch(MBB, TBB: &GreaterThan, FBB: &Otherwise, Cond, DL: DebugLoc());
2178 }
2179}
2180
2181/// Generate a pipelined loop that is unrolled by using MVE algorithm and any
2182/// other necessary blocks. The control flow is modified to execute the
2183/// pipelined loop if the trip count satisfies the condition, otherwise the
2184/// original loop. The original loop is also used to execute the remainder
2185/// iterations which occur due to unrolling.
2186void ModuloScheduleExpanderMVE::generatePipelinedLoop() {
2187 // The control flow for pipelining with MVE:
2188 //
2189 // OrigPreheader:
2190 // // The block that is originally the loop preheader
2191 // goto Check
2192 //
2193 // Check:
2194 // // Check whether the trip count satisfies the requirements to pipeline.
2195 // if (LoopCounter > NumStages + NumUnroll - 2)
2196 // // The minimum number of iterations to pipeline =
2197 // // iterations executed in prolog/epilog (NumStages-1) +
2198 // // iterations executed in one kernel run (NumUnroll)
2199 // goto Prolog
2200 // // fallback to the original loop
2201 // goto NewPreheader
2202 //
2203 // Prolog:
2204 // // All prolog stages. There are no direct branches to the epilogue.
2205 // goto NewKernel
2206 //
2207 // NewKernel:
2208 // // NumUnroll copies of the kernel
2209 // if (LoopCounter > MVE-1)
2210 // goto NewKernel
2211 // goto Epilog
2212 //
2213 // Epilog:
2214 // // All epilog stages.
2215 // if (LoopCounter > 0)
2216 // // The remainder is executed in the original loop
2217 // goto NewPreheader
2218 // goto NewExit
2219 //
2220 // NewPreheader:
2221 // // Newly created preheader for the original loop.
2222 // // The initial values of the phis in the loop are merged from two paths.
2223 // NewInitVal = Phi OrigInitVal, Check, PipelineLastVal, Epilog
2224 // goto OrigKernel
2225 //
2226 // OrigKernel:
2227 // // The original loop block.
2228 // if (LoopCounter != 0)
2229 // goto OrigKernel
2230 // goto NewExit
2231 //
2232 // NewExit:
2233 // // Newly created dedicated exit for the original loop.
2234 // // Merge values which are referenced after the loop
2235 // Merged = Phi OrigVal, OrigKernel, PipelineVal, Epilog
2236 // goto OrigExit
2237 //
2238 // OrigExit:
2239 // // The block that is originally the loop exit.
2240 // // If it is already a dedicated exit, NewExit is not created.
2241
2242 // An example of where each stage is executed:
2243 // Assume #Stages 3, #MVE 4, #Iterations 12
2244 // Iter 0 1 2 3 4 5 6 7 8 9 10-11
2245 // -------------------------------------------------
2246 // Stage 0 Prolog#0
2247 // Stage 1 0 Prolog#1
2248 // Stage 2 1 0 Kernel Unroll#0 Iter#0
2249 // Stage 2 1 0 Kernel Unroll#1 Iter#0
2250 // Stage 2 1 0 Kernel Unroll#2 Iter#0
2251 // Stage 2 1 0 Kernel Unroll#3 Iter#0
2252 // Stage 2 1 0 Kernel Unroll#0 Iter#1
2253 // Stage 2 1 0 Kernel Unroll#1 Iter#1
2254 // Stage 2 1 0 Kernel Unroll#2 Iter#1
2255 // Stage 2 1 0 Kernel Unroll#3 Iter#1
2256 // Stage 2 1 Epilog#0
2257 // Stage 2 Epilog#1
2258 // Stage 0-2 OrigKernel
2259
2260 LoopInfo = TII->analyzeLoopForPipelining(LoopBB: OrigKernel);
2261 assert(LoopInfo && "Must be able to analyze loop!");
2262
2263 calcNumUnroll();
2264
2265 Check = MF.CreateMachineBasicBlock(BB: OrigKernel->getBasicBlock());
2266 Prolog = MF.CreateMachineBasicBlock(BB: OrigKernel->getBasicBlock());
2267 NewKernel = MF.CreateMachineBasicBlock(BB: OrigKernel->getBasicBlock());
2268 Epilog = MF.CreateMachineBasicBlock(BB: OrigKernel->getBasicBlock());
2269 NewPreheader = MF.CreateMachineBasicBlock(BB: OrigKernel->getBasicBlock());
2270
2271 MF.insert(MBBI: OrigKernel->getIterator(), MBB: Check);
2272 LIS.insertMBBInMaps(MBB: Check);
2273 MF.insert(MBBI: OrigKernel->getIterator(), MBB: Prolog);
2274 LIS.insertMBBInMaps(MBB: Prolog);
2275 MF.insert(MBBI: OrigKernel->getIterator(), MBB: NewKernel);
2276 LIS.insertMBBInMaps(MBB: NewKernel);
2277 MF.insert(MBBI: OrigKernel->getIterator(), MBB: Epilog);
2278 LIS.insertMBBInMaps(MBB: Epilog);
2279 MF.insert(MBBI: OrigKernel->getIterator(), MBB: NewPreheader);
2280 LIS.insertMBBInMaps(MBB: NewPreheader);
2281
2282 NewExit = createDedicatedExit(Loop: OrigKernel, Exit: OrigExit, LIS);
2283
2284 NewPreheader->transferSuccessorsAndUpdatePHIs(FromMBB: OrigPreheader);
2285 TII->insertUnconditionalBranch(MBB&: *NewPreheader, DestBB: OrigKernel, DL: DebugLoc());
2286
2287 OrigPreheader->addSuccessor(Succ: Check);
2288 TII->removeBranch(MBB&: *OrigPreheader);
2289 TII->insertUnconditionalBranch(MBB&: *OrigPreheader, DestBB: Check, DL: DebugLoc());
2290
2291 Check->addSuccessor(Succ: Prolog);
2292 Check->addSuccessor(Succ: NewPreheader);
2293
2294 Prolog->addSuccessor(Succ: NewKernel);
2295
2296 NewKernel->addSuccessor(Succ: NewKernel);
2297 NewKernel->addSuccessor(Succ: Epilog);
2298
2299 Epilog->addSuccessor(Succ: NewPreheader);
2300 Epilog->addSuccessor(Succ: NewExit);
2301
2302 InstrMapTy LastStage0Insts;
2303 insertCondBranch(MBB&: *Check, RequiredTC: Schedule.getNumStages() + NumUnroll - 2,
2304 LastStage0Insts, GreaterThan&: *Prolog, Otherwise&: *NewPreheader);
2305
2306 // VRMaps map (prolog/kernel/epilog phase#, original register#) to new
2307 // register#
2308 SmallVector<ValueMapTy> PrologVRMap, KernelVRMap, EpilogVRMap;
2309 generateProlog(VRMap&: PrologVRMap);
2310 generateKernel(PrologVRMap, KernelVRMap, LastStage0Insts);
2311 generateEpilog(KernelVRMap, EpilogVRMap, LastStage0Insts);
2312}
2313
2314/// Replace MI's use operands according to the maps.
2315void ModuloScheduleExpanderMVE::updateInstrUse(
2316 MachineInstr *MI, int StageNum, int PhaseNum,
2317 SmallVectorImpl<ValueMapTy> &CurVRMap,
2318 SmallVectorImpl<ValueMapTy> *PrevVRMap) {
2319 // If MI is in the prolog/kernel/epilog block, CurVRMap is
2320 // PrologVRMap/KernelVRMap/EpilogVRMap respectively.
2321 // PrevVRMap is nullptr/PhiVRMap/KernelVRMap respectively.
2322 // Refer to the appropriate map according to the stage difference between
2323 // MI and the definition of an operand.
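  // For example (illustrative): in the kernel, an operand of an instruction in
  // unroll copy 2 (PhaseNum == 2) whose non-phi definition is one stage earlier
  // (DiffStage == 1) is taken from KernelVRMap[1]; if DiffStage exceeded
  // PhaseNum, the value would come from PhiVRMap through PrevVRMap instead.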
2324
2325 for (MachineOperand &UseMO : MI->uses()) {
2326 if (!UseMO.isReg() || !UseMO.getReg().isVirtual())
2327 continue;
2328 int DiffStage = 0;
2329 Register OrigReg = UseMO.getReg();
2330 MachineInstr *DefInst = MRI.getVRegDef(Reg: OrigReg);
2331 if (!DefInst || DefInst->getParent() != OrigKernel)
2332 continue;
2333 Register InitReg;
2334 Register DefReg = OrigReg;
2335 if (DefInst->isPHI()) {
2336 ++DiffStage;
2337 Register LoopReg;
2338 getPhiRegs(Phi&: *DefInst, Loop: OrigKernel, InitVal&: InitReg, LoopVal&: LoopReg);
2339 // LoopReg is guaranteed to be defined within the loop by canApply()
2340 DefReg = LoopReg;
2341 DefInst = MRI.getVRegDef(Reg: LoopReg);
2342 }
2343 unsigned DefStageNum = Schedule.getStage(MI: DefInst);
2344 DiffStage += StageNum - DefStageNum;
2345 Register NewReg;
2346 if (PhaseNum >= DiffStage && CurVRMap[PhaseNum - DiffStage].count(Val: DefReg))
2347 // NewReg is defined in a previous phase of the same block
2348 NewReg = CurVRMap[PhaseNum - DiffStage][DefReg];
2349 else if (!PrevVRMap)
2350 // Since this is the first iteration, refer to the initial register of
2351 // the loop.
2352 NewReg = InitReg;
2353 else
2354 // Cases where DiffStage is larger than PhaseNum.
2355 // If MI is in the kernel block, the value is defined by the previous
2356 // iteration and PhiVRMap is referenced. If MI is in the epilog block, the
2357 // value is defined in the kernel block and KernelVRMap is referenced.
2358 NewReg = (*PrevVRMap)[PrevVRMap->size() - (DiffStage - PhaseNum)][DefReg];
2359
2360 const TargetRegisterClass *NRC =
2361 MRI.constrainRegClass(Reg: NewReg, RC: MRI.getRegClass(Reg: OrigReg));
2362 if (NRC)
2363 UseMO.setReg(NewReg);
2364 else {
2365 Register SplitReg = MRI.createVirtualRegister(RegClass: MRI.getRegClass(Reg: OrigReg));
2366 MachineInstr *NewCopy = BuildMI(BB&: *OrigKernel, I: MI, MIMD: MI->getDebugLoc(),
2367 MCID: TII->get(Opcode: TargetOpcode::COPY), DestReg: SplitReg)
2368 .addReg(RegNo: NewReg);
2369 LIS.InsertMachineInstrInMaps(MI&: *NewCopy);
2370 UseMO.setReg(SplitReg);
2371 }
2372 }
2373}
2374
2375/// Return the phi that uses Reg as its loop-carried value, if any.
2376/// canApply() guarantees that at most one such phi exists.
2377static MachineInstr *getLoopPhiUser(Register Reg, MachineBasicBlock *Loop) {
2378 for (MachineInstr &Phi : Loop->phis()) {
2379 Register InitVal, LoopVal;
2380 getPhiRegs(Phi, Loop, InitVal, LoopVal);
2381 if (LoopVal == Reg)
2382 return &Phi;
2383 }
2384 return nullptr;
2385}
2386
2387/// Generate phis for registers defined by OrigMI.
2388void ModuloScheduleExpanderMVE::generatePhi(
2389 MachineInstr *OrigMI, int UnrollNum,
2390 SmallVectorImpl<ValueMapTy> &PrologVRMap,
2391 SmallVectorImpl<ValueMapTy> &KernelVRMap,
2392 SmallVectorImpl<ValueMapTy> &PhiVRMap) {
2393 int StageNum = Schedule.getStage(MI: OrigMI);
2394 bool UsePrologReg;
2395 if (Schedule.getNumStages() - NumUnroll + UnrollNum - 1 >= StageNum)
2396 UsePrologReg = true;
2397 else if (Schedule.getNumStages() - NumUnroll + UnrollNum == StageNum)
2398 UsePrologReg = false;
2399 else
2400 return;
2401
2402 // Examples that show which stages are merged by phi.
2403 // Meaning of the symbol following the stage number:
2404 // a/b: Stages with the same letter are merged (UsePrologReg == true)
2405 // +: Merged with the initial value (UsePrologReg == false)
2406 // *: No phis required
2407 //
2408 // #Stages 3, #MVE 4
2409 // Iter 0 1 2 3 4 5 6 7 8
2410 // -----------------------------------------
2411 // Stage 0a Prolog#0
2412 // Stage 1a 0b Prolog#1
2413 // Stage 2* 1* 0* Kernel Unroll#0
2414 // Stage 2* 1* 0+ Kernel Unroll#1
2415 // Stage 2* 1+ 0a Kernel Unroll#2
2416 // Stage 2+ 1a 0b Kernel Unroll#3
2417 //
2418 // #Stages 3, #MVE 2
2419 // Iter 0 1 2 3 4 5 6 7 8
2420 // -----------------------------------------
2421 // Stage 0a Prolog#0
2422 // Stage 1a 0b Prolog#1
2423 // Stage 2* 1+ 0a Kernel Unroll#0
2424 // Stage 2+ 1a 0b Kernel Unroll#1
2425 //
2426 // #Stages 3, #MVE 1
2427 // Iter 0 1 2 3 4 5 6 7 8
2428 // -----------------------------------------
2429 // Stage 0* Prolog#0
2430 // Stage 1a 0b Prolog#1
2431 // Stage 2+ 1a 0b Kernel Unroll#0
2432
2433 for (MachineOperand &DefMO : OrigMI->defs()) {
2434 if (!DefMO.isReg() || DefMO.isDead())
2435 continue;
2436 Register OrigReg = DefMO.getReg();
2437 auto NewReg = KernelVRMap[UnrollNum].find(Val: OrigReg);
2438 if (NewReg == KernelVRMap[UnrollNum].end())
2439 continue;
2440 Register CorrespondReg;
2441 if (UsePrologReg) {
2442 int PrologNum = Schedule.getNumStages() - NumUnroll + UnrollNum - 1;
2443 CorrespondReg = PrologVRMap[PrologNum][OrigReg];
2444 } else {
2445 MachineInstr *Phi = getLoopPhiUser(Reg: OrigReg, Loop: OrigKernel);
2446 if (!Phi)
2447 continue;
2448 CorrespondReg = getInitPhiReg(Phi&: *Phi, LoopBB: OrigKernel);
2449 }
2450
2451 assert(CorrespondReg.isValid());
2452 Register PhiReg = MRI.createVirtualRegister(RegClass: MRI.getRegClass(Reg: OrigReg));
2453 MachineInstr *NewPhi =
2454 BuildMI(BB&: *NewKernel, I: NewKernel->getFirstNonPHI(), MIMD: DebugLoc(),
2455 MCID: TII->get(Opcode: TargetOpcode::PHI), DestReg: PhiReg)
2456 .addReg(RegNo: NewReg->second)
2457 .addMBB(MBB: NewKernel)
2458 .addReg(RegNo: CorrespondReg)
2459 .addMBB(MBB: Prolog);
2460 LIS.InsertMachineInstrInMaps(MI&: *NewPhi);
2461 PhiVRMap[UnrollNum][OrigReg] = PhiReg;
2462 }
2463}
2464
2465static void replacePhiSrc(MachineInstr &Phi, Register OrigReg, Register NewReg,
2466 MachineBasicBlock *NewMBB) {
2467 for (unsigned Idx = 1; Idx < Phi.getNumOperands(); Idx += 2) {
2468 if (Phi.getOperand(i: Idx).getReg() == OrigReg) {
2469 Phi.getOperand(i: Idx).setReg(NewReg);
2470 Phi.getOperand(i: Idx + 1).setMBB(NewMBB);
2471 return;
2472 }
2473 }
2474}
2475
2476/// Generate phis that merge values from multiple routes
2477void ModuloScheduleExpanderMVE::mergeRegUsesAfterPipeline(Register OrigReg,
2478 Register NewReg) {
2479 SmallVector<MachineOperand *> UsesAfterLoop;
2480 SmallVector<MachineInstr *> LoopPhis;
2481 for (MachineRegisterInfo::use_iterator I = MRI.use_begin(RegNo: OrigReg),
2482 E = MRI.use_end();
2483 I != E; ++I) {
2484 MachineOperand &O = *I;
2485 if (O.getParent()->getParent() != OrigKernel &&
2486 O.getParent()->getParent() != Prolog &&
2487 O.getParent()->getParent() != NewKernel &&
2488 O.getParent()->getParent() != Epilog)
2489 UsesAfterLoop.push_back(Elt: &O);
2490 if (O.getParent()->getParent() == OrigKernel && O.getParent()->isPHI())
2491 LoopPhis.push_back(Elt: O.getParent());
2492 }
2493
2494 // Merge the route that only executes the pipelined loop (when there are
2495 // no remaining iterations) with the route that executes the original loop.
2496 if (!UsesAfterLoop.empty()) {
2497 Register PhiReg = MRI.createVirtualRegister(RegClass: MRI.getRegClass(Reg: OrigReg));
2498 MachineInstr *NewPhi =
2499 BuildMI(BB&: *NewExit, I: NewExit->getFirstNonPHI(), MIMD: DebugLoc(),
2500 MCID: TII->get(Opcode: TargetOpcode::PHI), DestReg: PhiReg)
2501 .addReg(RegNo: OrigReg)
2502 .addMBB(MBB: OrigKernel)
2503 .addReg(RegNo: NewReg)
2504 .addMBB(MBB: Epilog);
2505 LIS.InsertMachineInstrInMaps(MI&: *NewPhi);
2506
2507 for (MachineOperand *MO : UsesAfterLoop)
2508 MO->setReg(PhiReg);
2509
2510 // The interval of OrigReg is invalid and should be recalculated when
2511 // LiveInterval::getInterval() is called.
2512 if (LIS.hasInterval(Reg: OrigReg))
2513 LIS.removeInterval(Reg: OrigReg);
2514 }
2515
2516 // Merge the values coming from the pipelined loop with those from the
2517 // bypassed route, in the new preheader of the original loop.
2518 if (!LoopPhis.empty()) {
2519 for (MachineInstr *Phi : LoopPhis) {
2520 Register InitReg, LoopReg;
2521 getPhiRegs(Phi&: *Phi, Loop: OrigKernel, InitVal&: InitReg, LoopVal&: LoopReg);
2522 Register NewInit = MRI.createVirtualRegister(RegClass: MRI.getRegClass(Reg: InitReg));
2523 MachineInstr *NewPhi =
2524 BuildMI(BB&: *NewPreheader, I: NewPreheader->getFirstNonPHI(),
2525 MIMD: Phi->getDebugLoc(), MCID: TII->get(Opcode: TargetOpcode::PHI), DestReg: NewInit)
2526 .addReg(RegNo: InitReg)
2527 .addMBB(MBB: Check)
2528 .addReg(RegNo: NewReg)
2529 .addMBB(MBB: Epilog);
2530 LIS.InsertMachineInstrInMaps(MI&: *NewPhi);
2531 replacePhiSrc(Phi&: *Phi, OrigReg: InitReg, NewReg: NewInit, NewMBB: NewPreheader);
2532 }
2533 }
2534}
2535
2536void ModuloScheduleExpanderMVE::generateProlog(
2537 SmallVectorImpl<ValueMapTy> &PrologVRMap) {
2538 PrologVRMap.clear();
2539 PrologVRMap.resize(N: Schedule.getNumStages() - 1);
2540 DenseMap<MachineInstr *, std::pair<int, int>> NewMIMap;
2541 for (int PrologNum = 0; PrologNum < Schedule.getNumStages() - 1;
2542 ++PrologNum) {
2543 for (MachineInstr *MI : Schedule.getInstructions()) {
2544 if (MI->isPHI())
2545 continue;
2546 int StageNum = Schedule.getStage(MI);
2547 if (StageNum > PrologNum)
2548 continue;
2549 MachineInstr *NewMI = cloneInstr(OldMI: MI);
2550 updateInstrDef(NewMI, VRMap&: PrologVRMap[PrologNum], LastDef: false);
2551 NewMIMap[NewMI] = {PrologNum, StageNum};
2552 Prolog->push_back(MI: NewMI);
2553 LIS.InsertMachineInstrInMaps(MI&: *NewMI);
2554 }
2555 }
2556
2557 for (auto I : NewMIMap) {
2558 MachineInstr *MI = I.first;
2559 int PrologNum = I.second.first;
2560 int StageNum = I.second.second;
2561 updateInstrUse(MI, StageNum, PhaseNum: PrologNum, CurVRMap&: PrologVRMap, PrevVRMap: nullptr);
2562 }
2563
2564 LLVM_DEBUG({
2565 dbgs() << "prolog:\n";
2566 Prolog->dump();
2567 });
2568}
2569
2570void ModuloScheduleExpanderMVE::generateKernel(
2571 SmallVectorImpl<ValueMapTy> &PrologVRMap,
2572 SmallVectorImpl<ValueMapTy> &KernelVRMap, InstrMapTy &LastStage0Insts) {
2573 KernelVRMap.clear();
2574 KernelVRMap.resize(N: NumUnroll);
2575 SmallVector<ValueMapTy> PhiVRMap;
2576 PhiVRMap.resize(N: NumUnroll);
2577 DenseMap<MachineInstr *, std::pair<int, int>> NewMIMap;
2578 for (int UnrollNum = 0; UnrollNum < NumUnroll; ++UnrollNum) {
2579 for (MachineInstr *MI : Schedule.getInstructions()) {
2580 if (MI->isPHI())
2581 continue;
2582 int StageNum = Schedule.getStage(MI);
2583 MachineInstr *NewMI = cloneInstr(OldMI: MI);
2584 if (UnrollNum == NumUnroll - 1)
2585 LastStage0Insts[MI] = NewMI;
2586 updateInstrDef(NewMI, VRMap&: KernelVRMap[UnrollNum],
2587 LastDef: (UnrollNum == NumUnroll - 1 && StageNum == 0));
2588 generatePhi(OrigMI: MI, UnrollNum, PrologVRMap, KernelVRMap, PhiVRMap);
2589 NewMIMap[NewMI] = {UnrollNum, StageNum};
2590 NewKernel->push_back(MI: NewMI);
2591 LIS.InsertMachineInstrInMaps(MI&: *NewMI);
2592 }
2593 }
2594
2595 for (auto I : NewMIMap) {
2596 MachineInstr *MI = I.first;
2597 int UnrollNum = I.second.first;
2598 int StageNum = I.second.second;
2599 updateInstrUse(MI, StageNum, PhaseNum: UnrollNum, CurVRMap&: KernelVRMap, PrevVRMap: &PhiVRMap);
2600 }
2601
2602 // If the remaining trip count is greater than NumUnroll-1, the loop continues.
2603 insertCondBranch(MBB&: *NewKernel, RequiredTC: NumUnroll - 1, LastStage0Insts, GreaterThan&: *NewKernel,
2604 Otherwise&: *Epilog);
2605
2606 LLVM_DEBUG({
2607 dbgs() << "kernel:\n";
2608 NewKernel->dump();
2609 });
2610}
2611
2612void ModuloScheduleExpanderMVE::generateEpilog(
2613 SmallVectorImpl<ValueMapTy> &KernelVRMap,
2614 SmallVectorImpl<ValueMapTy> &EpilogVRMap, InstrMapTy &LastStage0Insts) {
2615 EpilogVRMap.clear();
2616 EpilogVRMap.resize(N: Schedule.getNumStages() - 1);
2617 DenseMap<MachineInstr *, std::pair<int, int>> NewMIMap;
2618 for (int EpilogNum = 0; EpilogNum < Schedule.getNumStages() - 1;
2619 ++EpilogNum) {
2620 for (MachineInstr *MI : Schedule.getInstructions()) {
2621 if (MI->isPHI())
2622 continue;
2623 int StageNum = Schedule.getStage(MI);
2624 if (StageNum <= EpilogNum)
2625 continue;
2626 MachineInstr *NewMI = cloneInstr(OldMI: MI);
2627 updateInstrDef(NewMI, VRMap&: EpilogVRMap[EpilogNum], LastDef: StageNum - 1 == EpilogNum);
2628 NewMIMap[NewMI] = {EpilogNum, StageNum};
2629 Epilog->push_back(MI: NewMI);
2630 LIS.InsertMachineInstrInMaps(MI&: *NewMI);
2631 }
2632 }
2633
2634 for (auto I : NewMIMap) {
2635 MachineInstr *MI = I.first;
2636 int EpilogNum = I.second.first;
2637 int StageNum = I.second.second;
2638 updateInstrUse(MI, StageNum, PhaseNum: EpilogNum, CurVRMap&: EpilogVRMap, PrevVRMap: &KernelVRMap);
2639 }
2640
2641 // If there are remaining iterations, they are executed in the original loop.
2642 // Instructions related to loop control, such as loop counter comparison,
2643 // are indicated by shouldIgnoreForPipelining() and are assumed to be placed
2644 // in stage 0. Thus, the map refers to the last unrolled copy of the kernel.
2645 insertCondBranch(MBB&: *Epilog, RequiredTC: 0, LastStage0Insts, GreaterThan&: *NewPreheader, Otherwise&: *NewExit);
2646
2647 LLVM_DEBUG({
2648 dbgs() << "epilog:\n";
2649 Epilog->dump();
2650 });
2651}
2652
2653/// Calculate the number of kernel unrolls required and store it in NumUnroll.
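/// For example (illustrative): a stage-2 use of a value defined by a stage-0
/// non-phi instruction that appears earlier in the schedule order requires
/// NumUnrollLocal = 1 + (2 - 0) = 3 copies of the kernel.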
2654void ModuloScheduleExpanderMVE::calcNumUnroll() {
2655 DenseMap<MachineInstr *, unsigned> Inst2Idx;
2656 NumUnroll = 1;
2657 for (unsigned I = 0; I < Schedule.getInstructions().size(); ++I)
2658 Inst2Idx[Schedule.getInstructions()[I]] = I;
2659
2660 for (MachineInstr *MI : Schedule.getInstructions()) {
2661 if (MI->isPHI())
2662 continue;
2663 int StageNum = Schedule.getStage(MI);
2664 for (const MachineOperand &MO : MI->uses()) {
2665 if (!MO.isReg() || !MO.getReg().isVirtual())
2666 continue;
2667 MachineInstr *DefMI = MRI.getVRegDef(Reg: MO.getReg());
2668 if (DefMI->getParent() != OrigKernel)
2669 continue;
2670
2671 int NumUnrollLocal = 1;
2672 if (DefMI->isPHI()) {
2673 ++NumUnrollLocal;
2674 // canApply() guarantees that DefMI is not a phi and is an instruction
2675 // inside the loop.
2676 DefMI = MRI.getVRegDef(Reg: getLoopPhiReg(Phi&: *DefMI, LoopBB: OrigKernel));
2677 }
2678 NumUnrollLocal += StageNum - Schedule.getStage(MI: DefMI);
2679 if (Inst2Idx[MI] <= Inst2Idx[DefMI])
2680 --NumUnrollLocal;
2681 NumUnroll = std::max(a: NumUnroll, b: NumUnrollLocal);
2682 }
2683 }
2684 LLVM_DEBUG(dbgs() << "NumUnroll: " << NumUnroll << "\n");
2685}
2686
2687/// Create new virtual registers for definitions of NewMI and update NewMI.
2688/// If the definitions are referenced after the pipelined loop, phis are
2689/// created to merge with other routes.
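/// For example (illustrative): a definition of %old gets a fresh register %new
/// and VRMap records the mapping; when LastDef is true,
/// mergeRegUsesAfterPipeline() additionally builds the phis that select between
/// the original-loop value and the pipelined value.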
2690void ModuloScheduleExpanderMVE::updateInstrDef(MachineInstr *NewMI,
2691 ValueMapTy &VRMap,
2692 bool LastDef) {
2693 for (MachineOperand &MO : NewMI->all_defs()) {
2694 if (!MO.getReg().isVirtual())
2695 continue;
2696 Register Reg = MO.getReg();
2697 const TargetRegisterClass *RC = MRI.getRegClass(Reg);
2698 Register NewReg = MRI.createVirtualRegister(RegClass: RC);
2699 MO.setReg(NewReg);
2700 VRMap[Reg] = NewReg;
2701 if (LastDef)
2702 mergeRegUsesAfterPipeline(OrigReg: Reg, NewReg);
2703 }
2704}
2705
2706void ModuloScheduleExpanderMVE::expand() {
2707 OrigKernel = Schedule.getLoop()->getTopBlock();
2708 OrigPreheader = Schedule.getLoop()->getLoopPreheader();
2709 OrigExit = Schedule.getLoop()->getExitBlock();
2710
2711 LLVM_DEBUG(Schedule.dump());
2712
2713 generatePipelinedLoop();
2714}
2715
2716/// Check if ModuloScheduleExpanderMVE can be applied to L
2717bool ModuloScheduleExpanderMVE::canApply(MachineLoop &L) {
2718 if (!L.getExitBlock()) {
2719 LLVM_DEBUG(dbgs() << "Can not apply MVE expander: No single exit block.\n");
2720 return false;
2721 }
2722
2723 MachineBasicBlock *BB = L.getTopBlock();
2724 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
2725
2726 // Put some constraints on the operands of the phis to simplify the
2727 // transformation
2728 DenseSet<Register> UsedByPhi;
2729 for (MachineInstr &MI : BB->phis()) {
2730 // Registers defined by phis must be used only inside the loop and must
2731 // never be used by phis.
2732 for (MachineOperand &MO : MI.defs())
2733 if (MO.isReg())
2734 for (MachineInstr &Ref : MRI.use_instructions(Reg: MO.getReg()))
2735 if (Ref.getParent() != BB || Ref.isPHI()) {
2736 LLVM_DEBUG(dbgs() << "Can not apply MVE expander: A phi result is "
2737 "referenced outside of the loop or by phi.\n");
2738 return false;
2739 }
2740
2741 // A source register from the loop block must be defined inside the loop.
2742 // A register defined inside the loop must be referenced by at most one
2743 // phi.
2744 Register InitVal, LoopVal;
2745 getPhiRegs(Phi&: MI, Loop: MI.getParent(), InitVal, LoopVal);
2746 if (!Register(LoopVal).isVirtual() ||
2747 MRI.getVRegDef(Reg: LoopVal)->getParent() != BB) {
2748 LLVM_DEBUG(
2749 dbgs() << "Can not apply MVE expander: A phi source value coming "
2750 "from the loop is not defined in the loop.\n");
2751 return false;
2752 }
2753 if (UsedByPhi.count(V: LoopVal)) {
2754 LLVM_DEBUG(dbgs() << "Can not apply MVE expander: A value defined in the "
2755 "loop is referenced by two or more phis.\n");
2756 return false;
2757 }
2758 UsedByPhi.insert(V: LoopVal);
2759 }
2760
2761 return true;
2762}
2763
2764//===----------------------------------------------------------------------===//
2765// ModuloScheduleTestPass implementation
2766//===----------------------------------------------------------------------===//
2767// This pass constructs a ModuloSchedule from its module and runs
2768// ModuloScheduleExpander.
2769//
2770// The module is expected to contain a single-block analyzable loop.
2771// The total order of instructions is taken from the loop as-is.
2772// Instructions are expected to be annotated with a PostInstrSymbol.
2773// This PostInstrSymbol must have the following format:
2774// "Stage=%d Cycle=%d".
2775//===----------------------------------------------------------------------===//
2776
2777namespace {
2778class ModuloScheduleTest : public MachineFunctionPass {
2779public:
2780 static char ID;
2781
2782 ModuloScheduleTest() : MachineFunctionPass(ID) {}
2783
2784 bool runOnMachineFunction(MachineFunction &MF) override;
2785 void runOnLoop(MachineFunction &MF, MachineLoop &L);
2786
2787 void getAnalysisUsage(AnalysisUsage &AU) const override {
2788 AU.addRequired<MachineLoopInfoWrapperPass>();
2789 AU.addRequired<LiveIntervalsWrapperPass>();
2790 MachineFunctionPass::getAnalysisUsage(AU);
2791 }
2792};
2793} // namespace
2794
2795char ModuloScheduleTest::ID = 0;
2796
2797INITIALIZE_PASS_BEGIN(ModuloScheduleTest, "modulo-schedule-test",
2798 "Modulo Schedule test pass", false, false)
2799INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass)
2800INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass)
2801INITIALIZE_PASS_END(ModuloScheduleTest, "modulo-schedule-test",
2802 "Modulo Schedule test pass", false, false)
2803
2804bool ModuloScheduleTest::runOnMachineFunction(MachineFunction &MF) {
2805 MachineLoopInfo &MLI = getAnalysis<MachineLoopInfoWrapperPass>().getLI();
2806 for (auto *L : MLI) {
2807 if (L->getTopBlock() != L->getBottomBlock())
2808 continue;
2809 runOnLoop(MF, L&: *L);
2810 return false;
2811 }
2812 return false;
2813}
2814
2815static void parseSymbolString(StringRef S, int &Cycle, int &Stage) {
2816 std::pair<StringRef, StringRef> StageAndCycle = getToken(Source: S, Delimiters: "_");
2817 std::pair<StringRef, StringRef> StageTokenAndValue =
2818 getToken(Source: StageAndCycle.first, Delimiters: "-");
2819 std::pair<StringRef, StringRef> CycleTokenAndValue =
2820 getToken(Source: StageAndCycle.second, Delimiters: "-");
2821 if (StageTokenAndValue.first != "Stage" ||
2822 CycleTokenAndValue.first != "_Cycle") {
2823 llvm_unreachable(
2824 "Bad post-instr symbol syntax: see comment in ModuloScheduleTest");
2825 return;
2826 }
2827
2828 StageTokenAndValue.second.drop_front().getAsInteger(Radix: 10, Result&: Stage);
2829 CycleTokenAndValue.second.drop_front().getAsInteger(Radix: 10, Result&: Cycle);
2830
2831 dbgs() << " Stage=" << Stage << ", Cycle=" << Cycle << "\n";
2832}
2833
2834void ModuloScheduleTest::runOnLoop(MachineFunction &MF, MachineLoop &L) {
2835 LiveIntervals &LIS = getAnalysis<LiveIntervalsWrapperPass>().getLIS();
2836 MachineBasicBlock *BB = L.getTopBlock();
2837 dbgs() << "--- ModuloScheduleTest running on BB#" << BB->getNumber() << "\n";
2838
2839 DenseMap<MachineInstr *, int> Cycle, Stage;
2840 std::vector<MachineInstr *> Instrs;
2841 for (MachineInstr &MI : *BB) {
2842 if (MI.isTerminator())
2843 continue;
2844 Instrs.push_back(x: &MI);
2845 if (MCSymbol *Sym = MI.getPostInstrSymbol()) {
2846 dbgs() << "Parsing post-instr symbol for " << MI;
2847 parseSymbolString(S: Sym->getName(), Cycle&: Cycle[&MI], Stage&: Stage[&MI]);
2848 }
2849 }
2850
2851 ModuloSchedule MS(MF, &L, std::move(Instrs), std::move(Cycle),
2852 std::move(Stage));
2853 ModuloScheduleExpander MSE(
2854 MF, MS, LIS, /*InstrChanges=*/ModuloScheduleExpander::InstrChangesTy());
2855 MSE.expand();
2856 MSE.cleanup();
2857}
2858
2859//===----------------------------------------------------------------------===//
2860// ModuloScheduleTestAnnotater implementation
2861//===----------------------------------------------------------------------===//
2862
2863void ModuloScheduleTestAnnotater::annotate() {
2864 for (MachineInstr *MI : S.getInstructions()) {
2865 SmallVector<char, 16> SV;
2866 raw_svector_ostream OS(SV);
2867 OS << "Stage-" << S.getStage(MI) << "_Cycle-" << S.getCycle(MI);
2868 MCSymbol *Sym = MF.getContext().getOrCreateSymbol(Name: OS.str());
2869 MI->setPostInstrSymbol(MF, Symbol: Sym);
2870 }
2871}
2872