//===- ModuloSchedule.cpp - Software pipeline schedule expansion ----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "llvm/CodeGen/ModuloSchedule.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/InitializePasses.h"
#include "llvm/MC/MCContext.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"

#define DEBUG_TYPE "pipeliner"
using namespace llvm;

static cl::opt<bool> SwapBranchTargetsMVE(
    "pipeliner-swap-branch-targets-mve", cl::Hidden, cl::init(false),
    cl::desc("Swap target blocks of a conditional branch for MVE expander"));

void ModuloSchedule::print(raw_ostream &OS) {
  for (MachineInstr *MI : ScheduledInstrs)
    OS << "[stage " << getStage(MI) << " @" << getCycle(MI) << "c] " << *MI;
}

//===----------------------------------------------------------------------===//
// ModuloScheduleExpander implementation
//===----------------------------------------------------------------------===//

/// Return the register values for the operands of a Phi instruction.
/// This function assumes the instruction is a Phi.
static void getPhiRegs(MachineInstr &Phi, MachineBasicBlock *Loop,
                       Register &InitVal, Register &LoopVal) {
  assert(Phi.isPHI() && "Expecting a Phi.");

  InitVal = Register();
  LoopVal = Register();
  for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
    if (Phi.getOperand(i + 1).getMBB() != Loop)
      InitVal = Phi.getOperand(i).getReg();
    else
      LoopVal = Phi.getOperand(i).getReg();

  assert(InitVal && LoopVal && "Unexpected Phi structure.");
}

/// Return the Phi register value that comes from the incoming block.
static Register getInitPhiReg(MachineInstr &Phi, MachineBasicBlock *LoopBB) {
  for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
    if (Phi.getOperand(i + 1).getMBB() != LoopBB)
      return Phi.getOperand(i).getReg();
  return Register();
}

/// Return the Phi register value that comes from the loop block.
static Register getLoopPhiReg(MachineInstr &Phi, MachineBasicBlock *LoopBB) {
  for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
    if (Phi.getOperand(i + 1).getMBB() == LoopBB)
      return Phi.getOperand(i).getReg();
  return Register();
}

void ModuloScheduleExpander::expand() {
  BB = Schedule.getLoop()->getTopBlock();
  Preheader = *BB->pred_begin();
  if (Preheader == BB)
    Preheader = *std::next(BB->pred_begin());

  // Iterate over the definitions in each instruction, and compute the
  // stage difference for each use. Keep the maximum value.
  for (MachineInstr *MI : Schedule.getInstructions()) {
    int DefStage = Schedule.getStage(MI);
    for (const MachineOperand &Op : MI->all_defs()) {
      Register Reg = Op.getReg();
      unsigned MaxDiff = 0;
      bool PhiIsSwapped = false;
      for (MachineOperand &UseOp : MRI.use_operands(Reg)) {
        MachineInstr *UseMI = UseOp.getParent();
        int UseStage = Schedule.getStage(UseMI);
        unsigned Diff = 0;
        if (UseStage != -1 && UseStage >= DefStage)
          Diff = UseStage - DefStage;
        if (MI->isPHI()) {
          if (isLoopCarried(*MI))
            ++Diff;
          else
            PhiIsSwapped = true;
        }
        MaxDiff = std::max(Diff, MaxDiff);
      }
      RegToStageDiff[Reg] = std::make_pair(MaxDiff, PhiIsSwapped);
    }
  }

  generatePipelinedLoop();
}

void ModuloScheduleExpander::generatePipelinedLoop() {
  LoopInfo = TII->analyzeLoopForPipelining(BB);
  assert(LoopInfo && "Must be able to analyze loop!");

  // Create a new basic block for the kernel and add it to the CFG.
  MachineBasicBlock *KernelBB = MF.CreateMachineBasicBlock(BB->getBasicBlock());

  unsigned MaxStageCount = Schedule.getNumStages() - 1;

  // Remember the registers that are used in different stages. The index is
  // the iteration, or stage, that the instruction is scheduled in. This is
  // a map between register names in the original block and the names created
  // in each stage of the pipelined loop.
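  // The prolog blocks fill entries 0..MaxStageCount-1, the kernel fills entry
  // MaxStageCount, and the epilog blocks continue the numbering past the
  // kernel, which is why both arrays below are sized for twice the number of
  // stages.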
  ValueMapTy *VRMap = new ValueMapTy[(MaxStageCount + 1) * 2];

  // The renaming destination by Phis for the registers across stages.
  // This map is updated during Phis generation to point to the most recent
  // renaming destination.
  ValueMapTy *VRMapPhi = new ValueMapTy[(MaxStageCount + 1) * 2];

  InstrMapTy InstrMap;

  SmallVector<MachineBasicBlock *, 4> PrologBBs;

  // Generate the prolog instructions that set up the pipeline.
  generateProlog(MaxStageCount, KernelBB, VRMap, PrologBBs);
  MF.insert(BB->getIterator(), KernelBB);
  LIS.insertMBBInMaps(KernelBB);

  // Rearrange the instructions to generate the new, pipelined loop,
  // and update register names as needed.
  for (MachineInstr *CI : Schedule.getInstructions()) {
    if (CI->isPHI())
      continue;
    unsigned StageNum = Schedule.getStage(CI);
    MachineInstr *NewMI = cloneInstr(CI, MaxStageCount, StageNum);
    updateInstruction(NewMI, false, MaxStageCount, StageNum, VRMap);
    KernelBB->push_back(NewMI);
    LIS.InsertMachineInstrInMaps(*NewMI);
    InstrMap[NewMI] = CI;
  }

  // Copy any terminator instructions to the new kernel, and update
  // names as needed.
  for (MachineInstr &MI : BB->terminators()) {
    MachineInstr *NewMI = MF.CloneMachineInstr(&MI);
    updateInstruction(NewMI, false, MaxStageCount, 0, VRMap);
    KernelBB->push_back(NewMI);
    LIS.InsertMachineInstrInMaps(*NewMI);
    InstrMap[NewMI] = &MI;
  }

  NewKernel = KernelBB;
  KernelBB->transferSuccessors(BB);
  KernelBB->replaceSuccessor(BB, KernelBB);

  generateExistingPhis(KernelBB, PrologBBs.back(), KernelBB, KernelBB, VRMap,
                       InstrMap, MaxStageCount, MaxStageCount, false);
  generatePhis(KernelBB, PrologBBs.back(), KernelBB, KernelBB, VRMap, VRMapPhi,
               InstrMap, MaxStageCount, MaxStageCount, false);

  LLVM_DEBUG(dbgs() << "New block\n"; KernelBB->dump(););

  SmallVector<MachineBasicBlock *, 4> EpilogBBs;
  // Generate the epilog instructions to complete the pipeline.
  generateEpilog(MaxStageCount, KernelBB, BB, VRMap, VRMapPhi, EpilogBBs,
                 PrologBBs);

  // We need this step because the register allocation doesn't handle some
  // situations well, so we insert copies to help out.
  splitLifetimes(KernelBB, EpilogBBs);

  // Remove dead instructions due to loop induction variables.
  removeDeadInstructions(KernelBB, EpilogBBs);

  // Add branches between prolog and epilog blocks.
  addBranches(*Preheader, PrologBBs, KernelBB, EpilogBBs, VRMap);

  delete[] VRMap;
  delete[] VRMapPhi;
}

void ModuloScheduleExpander::cleanup() {
  // Remove the original loop since it's no longer referenced.
  for (auto &I : *BB)
    LIS.RemoveMachineInstrFromMaps(I);
  BB->clear();
  BB->eraseFromParent();
}

/// Generate the pipeline prolog code.
void ModuloScheduleExpander::generateProlog(unsigned LastStage,
                                            MachineBasicBlock *KernelBB,
                                            ValueMapTy *VRMap,
                                            MBBVectorTy &PrologBBs) {
  MachineBasicBlock *PredBB = Preheader;
  InstrMapTy InstrMap;

  // Generate a basic block for each stage, not including the last stage,
  // which will be generated in the kernel. Each basic block may contain
  // instructions from multiple stages/iterations.
  for (unsigned i = 0; i < LastStage; ++i) {
    // Create and insert the prolog basic block prior to the original loop
    // basic block. The original loop is removed later.
    MachineBasicBlock *NewBB = MF.CreateMachineBasicBlock(BB->getBasicBlock());
    PrologBBs.push_back(NewBB);
    MF.insert(BB->getIterator(), NewBB);
    NewBB->transferSuccessors(PredBB);
    PredBB->addSuccessor(NewBB);
    PredBB = NewBB;
    LIS.insertMBBInMaps(NewBB);

    // Generate instructions for each appropriate stage. Process instructions
    // in original program order.
    for (int StageNum = i; StageNum >= 0; --StageNum) {
      for (MachineBasicBlock::iterator BBI = BB->instr_begin(),
                                       BBE = BB->getFirstTerminator();
           BBI != BBE; ++BBI) {
        if (Schedule.getStage(&*BBI) == StageNum) {
          if (BBI->isPHI())
            continue;
          MachineInstr *NewMI =
              cloneAndChangeInstr(&*BBI, i, (unsigned)StageNum);
          updateInstruction(NewMI, false, i, (unsigned)StageNum, VRMap);
          NewBB->push_back(NewMI);
          LIS.InsertMachineInstrInMaps(*NewMI);
          InstrMap[NewMI] = &*BBI;
        }
      }
    }
    rewritePhiValues(NewBB, i, VRMap, InstrMap);
    LLVM_DEBUG({
      dbgs() << "prolog:\n";
      NewBB->dump();
    });
  }

  PredBB->replaceSuccessor(BB, KernelBB);

  // Check if we need to remove the branch from the preheader to the original
  // loop, and replace it with a branch to the new loop.
  unsigned numBranches = TII->removeBranch(*Preheader);
  if (numBranches) {
    SmallVector<MachineOperand, 0> Cond;
    TII->insertBranch(*Preheader, PrologBBs[0], nullptr, Cond, DebugLoc());
  }
}

/// Generate the pipeline epilog code. The epilog code finishes the iterations
/// that were started in either the prolog or the kernel. We create a basic
/// block for each stage that needs to complete.
void ModuloScheduleExpander::generateEpilog(
    unsigned LastStage, MachineBasicBlock *KernelBB, MachineBasicBlock *OrigBB,
    ValueMapTy *VRMap, ValueMapTy *VRMapPhi, MBBVectorTy &EpilogBBs,
    MBBVectorTy &PrologBBs) {
  // We need to change the branch from the kernel to the first epilog block, so
  // this call to analyze branch uses the kernel rather than the original BB.
  MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
  SmallVector<MachineOperand, 4> Cond;
  bool checkBranch = TII->analyzeBranch(*KernelBB, TBB, FBB, Cond);
  assert(!checkBranch && "generateEpilog must be able to analyze the branch");
  if (checkBranch)
    return;

  MachineBasicBlock::succ_iterator LoopExitI = KernelBB->succ_begin();
  if (*LoopExitI == KernelBB)
    ++LoopExitI;
  assert(LoopExitI != KernelBB->succ_end() && "Expecting a successor");
  MachineBasicBlock *LoopExitBB = *LoopExitI;

  MachineBasicBlock *PredBB = KernelBB;
  MachineBasicBlock *EpilogStart = LoopExitBB;
  InstrMapTy InstrMap;

  // Generate a basic block for each stage, not including the last stage,
  // which was generated for the kernel. Each basic block may contain
  // instructions from multiple stages/iterations.
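  // Epilog stages continue the numbering after the kernel stage (LastStage),
  // so the VRMap/VRMapPhi entries written for epilog blocks do not collide
  // with those written for the prolog blocks and the kernel.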
  int EpilogStage = LastStage + 1;
  for (unsigned i = LastStage; i >= 1; --i, ++EpilogStage) {
    MachineBasicBlock *NewBB = MF.CreateMachineBasicBlock();
    EpilogBBs.push_back(NewBB);
    MF.insert(BB->getIterator(), NewBB);

    PredBB->replaceSuccessor(LoopExitBB, NewBB);
    NewBB->addSuccessor(LoopExitBB);
    LIS.insertMBBInMaps(NewBB);

    if (EpilogStart == LoopExitBB)
      EpilogStart = NewBB;

    // Add instructions to the epilog depending on the current block.
    // Process instructions in original program order.
    for (unsigned StageNum = i; StageNum <= LastStage; ++StageNum) {
      for (auto &BBI : *BB) {
        if (BBI.isPHI())
          continue;
        MachineInstr *In = &BBI;
        if ((unsigned)Schedule.getStage(In) == StageNum) {
          // Instructions with memoperands in the epilog are updated with
          // conservative values.
          MachineInstr *NewMI = cloneInstr(In, UINT_MAX, 0);
          updateInstruction(NewMI, i == 1, EpilogStage, 0, VRMap);
          NewBB->push_back(NewMI);
          LIS.InsertMachineInstrInMaps(*NewMI);
          InstrMap[NewMI] = In;
        }
      }
    }
    generateExistingPhis(NewBB, PrologBBs[i - 1], PredBB, KernelBB, VRMap,
                         InstrMap, LastStage, EpilogStage, i == 1);
    generatePhis(NewBB, PrologBBs[i - 1], PredBB, KernelBB, VRMap, VRMapPhi,
                 InstrMap, LastStage, EpilogStage, i == 1);
    PredBB = NewBB;

    LLVM_DEBUG({
      dbgs() << "epilog:\n";
      NewBB->dump();
    });
  }

  // Fix any Phi nodes in the loop exit block.
  LoopExitBB->replacePhiUsesWith(BB, PredBB);

  // Create a branch to the new epilog from the kernel.
  // Remove the original branch and add a new branch to the epilog.
  TII->removeBranch(*KernelBB);
  assert((OrigBB == TBB || OrigBB == FBB) &&
         "Unable to determine looping branch direction");
  if (OrigBB != TBB)
    TII->insertBranch(*KernelBB, EpilogStart, KernelBB, Cond, DebugLoc());
  else
    TII->insertBranch(*KernelBB, KernelBB, EpilogStart, Cond, DebugLoc());
  // Add a branch to the loop exit.
  if (EpilogBBs.size() > 0) {
    MachineBasicBlock *LastEpilogBB = EpilogBBs.back();
    SmallVector<MachineOperand, 4> Cond1;
    TII->insertBranch(*LastEpilogBB, LoopExitBB, nullptr, Cond1, DebugLoc());
  }
}

/// Replace all uses of FromReg that appear outside the specified
/// basic block with ToReg.
static void replaceRegUsesAfterLoop(Register FromReg, Register ToReg,
                                    MachineBasicBlock *MBB,
                                    MachineRegisterInfo &MRI) {
  for (MachineOperand &O :
       llvm::make_early_inc_range(MRI.use_operands(FromReg)))
    if (O.getParent()->getParent() != MBB)
      O.setReg(ToReg);
}

/// Return true if the register has a use that occurs outside the
/// specified loop.
static bool hasUseAfterLoop(Register Reg, MachineBasicBlock *BB,
                            MachineRegisterInfo &MRI) {
  for (const MachineOperand &MO : MRI.use_operands(Reg))
    if (MO.getParent()->getParent() != BB)
      return true;
  return false;
}

/// Generate Phis for the specific block in the generated pipelined code.
/// This function looks at the Phis from the original code to guide the
/// creation of new Phis.
void ModuloScheduleExpander::generateExistingPhis(
    MachineBasicBlock *NewBB, MachineBasicBlock *BB1, MachineBasicBlock *BB2,
    MachineBasicBlock *KernelBB, ValueMapTy *VRMap, InstrMapTy &InstrMap,
    unsigned LastStageNum, unsigned CurStageNum, bool IsLast) {
  // Compute the stage number for the initial value of the Phi, which
  // comes from the prolog. The prolog to use depends on to which kernel/
  // epilog that we're adding the Phi.
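  // For the kernel (CurStageNum == LastStageNum) the initial value comes from
  // the final prolog block. For epilog block k (CurStageNum == LastStageNum +
  // k), PrologStage steps back k prolog blocks and PrevStage refers to the
  // kernel/epilog block that was generated just before this one.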
  unsigned PrologStage = 0;
  unsigned PrevStage = 0;
  bool InKernel = (LastStageNum == CurStageNum);
  if (InKernel) {
    PrologStage = LastStageNum - 1;
    PrevStage = CurStageNum;
  } else {
    PrologStage = LastStageNum - (CurStageNum - LastStageNum);
    PrevStage = LastStageNum + (CurStageNum - LastStageNum) - 1;
  }

  for (MachineBasicBlock::iterator BBI = BB->instr_begin(),
                                   BBE = BB->getFirstNonPHI();
       BBI != BBE; ++BBI) {
    Register Def = BBI->getOperand(0).getReg();

    Register InitVal;
    Register LoopVal;
    getPhiRegs(*BBI, BB, InitVal, LoopVal);

    Register PhiOp1;
    // The Phi value from the loop body typically is defined in the loop, but
    // not always. So, we need to check if the value is defined in the loop.
    Register PhiOp2 = LoopVal;
    if (auto It = VRMap[LastStageNum].find(LoopVal);
        It != VRMap[LastStageNum].end())
      PhiOp2 = It->second;

    int StageScheduled = Schedule.getStage(&*BBI);
    int LoopValStage = Schedule.getStage(MRI.getVRegDef(LoopVal));
    unsigned NumStages = getStagesForReg(Def, CurStageNum);
    if (NumStages == 0) {
      // We don't need to generate a Phi anymore, but we need to rename any uses
      // of the Phi value.
      Register NewReg = VRMap[PrevStage][LoopVal];
      rewriteScheduledInstr(NewBB, InstrMap, CurStageNum, 0, &*BBI, Def,
                            InitVal, NewReg);
      auto It = VRMap[CurStageNum].find(LoopVal);
      if (It != VRMap[CurStageNum].end()) {
        Register Reg = It->second;
        VRMap[CurStageNum][Def] = Reg;
      }
    }
    // Adjust the number of Phis needed depending on the number of prologs
    // left, and the distance from where the Phi is first scheduled. The number
    // of Phis cannot exceed the number of prolog stages. Each stage can
    // potentially define two values.
    unsigned MaxPhis = PrologStage + 2;
    if (!InKernel && (int)PrologStage <= LoopValStage)
      MaxPhis = std::max((int)MaxPhis - (int)LoopValStage, 1);
    unsigned NumPhis = std::min(NumStages, MaxPhis);

    Register NewReg;
    unsigned AccessStage = (LoopValStage != -1) ? LoopValStage : StageScheduled;
    // In the epilog, we may need to look back one stage to get the correct
    // Phi name, because the epilog and prolog blocks execute the same stage.
    // The correct name is from the previous block only when the Phi has
    // been completely scheduled prior to the epilog, and Phi value is not
    // needed in multiple stages.
    int StageDiff = 0;
    if (!InKernel && StageScheduled >= LoopValStage && AccessStage == 0 &&
        NumPhis == 1)
      StageDiff = 1;
    // Adjust the computations below when the phi and the loop definition
    // are scheduled in different stages.
    if (InKernel && LoopValStage != -1 && StageScheduled > LoopValStage)
      StageDiff = StageScheduled - LoopValStage;
    for (unsigned np = 0; np < NumPhis; ++np) {
      // If the Phi hasn't been scheduled, then use the initial Phi operand
      // value. Otherwise, use the scheduled version of the instruction. This
      // is a little complicated when a Phi references another Phi.
      if (np > PrologStage || StageScheduled >= (int)LastStageNum)
        PhiOp1 = InitVal;
      // Check if the Phi has already been scheduled in a prolog stage.
      else if (PrologStage >= AccessStage + StageDiff + np &&
               VRMap[PrologStage - StageDiff - np].count(LoopVal) != 0)
        PhiOp1 = VRMap[PrologStage - StageDiff - np][LoopVal];
      // Check if the Phi has already been scheduled, but the loop instruction
      // is either another Phi, or doesn't occur in the loop.
      else if (PrologStage >= AccessStage + StageDiff + np) {
        // If the Phi references another Phi, we need to examine the other
        // Phi to get the correct value.
        PhiOp1 = LoopVal;
        MachineInstr *InstOp1 = MRI.getVRegDef(PhiOp1);
        int Indirects = 1;
        while (InstOp1 && InstOp1->isPHI() && InstOp1->getParent() == BB) {
          int PhiStage = Schedule.getStage(InstOp1);
          if ((int)(PrologStage - StageDiff - np) < PhiStage + Indirects)
            PhiOp1 = getInitPhiReg(*InstOp1, BB);
          else
            PhiOp1 = getLoopPhiReg(*InstOp1, BB);
          InstOp1 = MRI.getVRegDef(PhiOp1);
          int PhiOpStage = Schedule.getStage(InstOp1);
          int StageAdj = (PhiOpStage != -1 ? PhiStage - PhiOpStage : 0);
          if (PhiOpStage != -1 && PrologStage - StageAdj >= Indirects + np) {
            auto &M = VRMap[PrologStage - StageAdj - Indirects - np];
            if (auto It = M.find(PhiOp1); It != M.end()) {
              PhiOp1 = It->second;
              break;
            }
          }
          ++Indirects;
        }
      } else
        PhiOp1 = InitVal;
      // If this references a generated Phi in the kernel, get the Phi operand
      // from the incoming block.
      if (MachineInstr *InstOp1 = MRI.getVRegDef(PhiOp1))
        if (InstOp1->isPHI() && InstOp1->getParent() == KernelBB)
          PhiOp1 = getInitPhiReg(*InstOp1, KernelBB);

      MachineInstr *PhiInst = MRI.getVRegDef(LoopVal);
      bool LoopDefIsPhi = PhiInst && PhiInst->isPHI();
      // In the epilog, a map lookup is needed to get the value from the kernel,
      // or previous epilog block. How this is done depends on whether the
      // instruction is scheduled in the previous block.
      if (!InKernel) {
        int StageDiffAdj = 0;
        if (LoopValStage != -1 && StageScheduled > LoopValStage)
          StageDiffAdj = StageScheduled - LoopValStage;
        // Use the loop value defined in the kernel, unless the kernel
        // contains the last definition of the Phi.
        if (np == 0 && PrevStage == LastStageNum &&
            (StageScheduled != 0 || LoopValStage != 0) &&
            VRMap[PrevStage - StageDiffAdj].count(LoopVal))
          PhiOp2 = VRMap[PrevStage - StageDiffAdj][LoopVal];
        // Use the value defined by the Phi. We add one because we switch
        // from looking at the loop value to the Phi definition.
        else if (np > 0 && PrevStage == LastStageNum &&
                 VRMap[PrevStage - np + 1].count(Def))
          PhiOp2 = VRMap[PrevStage - np + 1][Def];
        // Use the loop value defined in the kernel.
        else if (static_cast<unsigned>(LoopValStage) > PrologStage + 1 &&
                 VRMap[PrevStage - StageDiffAdj - np].count(LoopVal))
          PhiOp2 = VRMap[PrevStage - StageDiffAdj - np][LoopVal];
        // Use the value defined by the Phi, unless we're generating the first
        // epilog and the Phi refers to a Phi in a different stage.
        else if (VRMap[PrevStage - np].count(Def) &&
                 (!LoopDefIsPhi || (PrevStage != LastStageNum) ||
                  (LoopValStage == StageScheduled)))
          PhiOp2 = VRMap[PrevStage - np][Def];
      }

      // Check if we can reuse an existing Phi. This occurs when a Phi
      // references another Phi, and the other Phi is scheduled in an
      // earlier stage. We can try to reuse an existing Phi up until the last
      // stage of the current Phi.
      if (LoopDefIsPhi) {
        if (static_cast<int>(PrologStage - np) >= StageScheduled) {
          int LVNumStages = getStagesForPhi(LoopVal);
          int StageDiff = (StageScheduled - LoopValStage);
          LVNumStages -= StageDiff;
          // Make sure the loop value Phi has been processed already.
          if (LVNumStages > (int)np && VRMap[CurStageNum].count(LoopVal)) {
            NewReg = PhiOp2;
            unsigned ReuseStage = CurStageNum;
            if (isLoopCarried(*PhiInst))
              ReuseStage -= LVNumStages;
            // Check if the Phi to reuse has been generated yet. If not, then
            // there is nothing to reuse.
            if (VRMap[ReuseStage - np].count(LoopVal)) {
              NewReg = VRMap[ReuseStage - np][LoopVal];

              rewriteScheduledInstr(NewBB, InstrMap, CurStageNum, np, &*BBI,
                                    Def, NewReg);
              // Update the map with the new Phi name.
              VRMap[CurStageNum - np][Def] = NewReg;
              PhiOp2 = NewReg;
              if (VRMap[LastStageNum - np - 1].count(LoopVal))
                PhiOp2 = VRMap[LastStageNum - np - 1][LoopVal];

              if (IsLast && np == NumPhis - 1)
                replaceRegUsesAfterLoop(Def, NewReg, BB, MRI);
              continue;
            }
          }
        }
        if (InKernel && StageDiff > 0 &&
            VRMap[CurStageNum - StageDiff - np].count(LoopVal))
          PhiOp2 = VRMap[CurStageNum - StageDiff - np][LoopVal];
      }

      const TargetRegisterClass *RC = MRI.getRegClass(Def);
      NewReg = MRI.createVirtualRegister(RC);

      MachineInstrBuilder NewPhi =
          BuildMI(*NewBB, NewBB->getFirstNonPHI(), DebugLoc(),
                  TII->get(TargetOpcode::PHI), NewReg);
      NewPhi.addReg(PhiOp1).addMBB(BB1);
      NewPhi.addReg(PhiOp2).addMBB(BB2);
      LIS.InsertMachineInstrInMaps(*NewPhi);
      if (np == 0)
        InstrMap[NewPhi] = &*BBI;

      // We define the Phis after creating the new pipelined code, so
      // we need to rename the Phi values in scheduled instructions.

      Register PrevReg;
      if (InKernel && VRMap[PrevStage - np].count(LoopVal))
        PrevReg = VRMap[PrevStage - np][LoopVal];
      rewriteScheduledInstr(NewBB, InstrMap, CurStageNum, np, &*BBI, Def,
                            NewReg, PrevReg);
      // If the Phi has been scheduled, use the new name for rewriting.
      if (VRMap[CurStageNum - np].count(Def)) {
        Register R = VRMap[CurStageNum - np][Def];
        rewriteScheduledInstr(NewBB, InstrMap, CurStageNum, np, &*BBI, R,
                              NewReg);
      }

      // Check if we need to rename any uses that occurs after the loop. The
      // register to replace depends on whether the Phi is scheduled in the
      // epilog.
      if (IsLast && np == NumPhis - 1)
        replaceRegUsesAfterLoop(Def, NewReg, BB, MRI);

      // In the kernel, a dependent Phi uses the value from this Phi.
      if (InKernel)
        PhiOp2 = NewReg;

      // Update the map with the new Phi name.
      VRMap[CurStageNum - np][Def] = NewReg;
    }

    while (NumPhis++ < NumStages) {
      rewriteScheduledInstr(NewBB, InstrMap, CurStageNum, NumPhis, &*BBI, Def,
                            NewReg, 0);
    }

    // Check if we need to rename a Phi that has been eliminated due to
    // scheduling.
    if (NumStages == 0 && IsLast) {
      auto &CurStageMap = VRMap[CurStageNum];
      auto It = CurStageMap.find(LoopVal);
      if (It != CurStageMap.end())
        replaceRegUsesAfterLoop(Def, It->second, BB, MRI);
    }
  }
}

/// Generate Phis for the specified block in the generated pipelined code.
/// These are new Phis needed because the definition is scheduled after the
/// use in the pipelined sequence.
void ModuloScheduleExpander::generatePhis(
    MachineBasicBlock *NewBB, MachineBasicBlock *BB1, MachineBasicBlock *BB2,
    MachineBasicBlock *KernelBB, ValueMapTy *VRMap, ValueMapTy *VRMapPhi,
    InstrMapTy &InstrMap, unsigned LastStageNum, unsigned CurStageNum,
    bool IsLast) {
  // Compute the stage number that contains the initial Phi value, and
  // the Phi from the previous stage.
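  // As in generateExistingPhis: for the kernel, PrologStage is the final
  // prolog block; for epilog block k, PrologStage steps back k prolog blocks
  // and PrevStage is the kernel/epilog block generated just before this one.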
  unsigned PrologStage = 0;
  unsigned PrevStage = 0;
  unsigned StageDiff = CurStageNum - LastStageNum;
  bool InKernel = (StageDiff == 0);
  if (InKernel) {
    PrologStage = LastStageNum - 1;
    PrevStage = CurStageNum;
  } else {
    PrologStage = LastStageNum - StageDiff;
    PrevStage = LastStageNum + StageDiff - 1;
  }

  for (MachineBasicBlock::iterator BBI = BB->getFirstNonPHI(),
                                   BBE = BB->instr_end();
       BBI != BBE; ++BBI) {
    for (unsigned i = 0, e = BBI->getNumOperands(); i != e; ++i) {
      MachineOperand &MO = BBI->getOperand(i);
      if (!MO.isReg() || !MO.isDef() || !MO.getReg().isVirtual())
        continue;

      int StageScheduled = Schedule.getStage(&*BBI);
      assert(StageScheduled != -1 && "Expecting scheduled instruction.");
      Register Def = MO.getReg();
      unsigned NumPhis = getStagesForReg(Def, CurStageNum);
      // An instruction scheduled in stage 0 and is used after the loop
      // requires a phi in the epilog for the last definition from either
      // the kernel or prolog.
      if (!InKernel && NumPhis == 0 && StageScheduled == 0 &&
          hasUseAfterLoop(Def, BB, MRI))
        NumPhis = 1;
      if (!InKernel && (unsigned)StageScheduled > PrologStage)
        continue;

      Register PhiOp2;
      if (InKernel) {
        PhiOp2 = VRMap[PrevStage][Def];
        if (MachineInstr *InstOp2 = MRI.getVRegDef(PhiOp2))
          if (InstOp2->isPHI() && InstOp2->getParent() == NewBB)
            PhiOp2 = getLoopPhiReg(*InstOp2, BB2);
      }
      // The number of Phis can't exceed the number of prolog stages. The
      // prolog stage number is zero based.
      if (NumPhis > PrologStage + 1 - StageScheduled)
        NumPhis = PrologStage + 1 - StageScheduled;
      for (unsigned np = 0; np < NumPhis; ++np) {
        // Example for
        // Org:
        //   %Org = ... (Scheduled at Stage#0, NumPhi = 2)
        //
        // Prolog0 (Stage0):
        //   %Clone0 = ...
        // Prolog1 (Stage1):
        //   %Clone1 = ...
        // Kernel (Stage2):
        //   %Phi0 = Phi %Clone1, Prolog1, %Clone2, Kernel
        //   %Phi1 = Phi %Clone0, Prolog1, %Phi0, Kernel
        //   %Clone2 = ...
        // Epilog0 (Stage3):
        //   %Phi2 = Phi %Clone1, Prolog1, %Clone2, Kernel
        //   %Phi3 = Phi %Clone0, Prolog1, %Phi0, Kernel
        // Epilog1 (Stage4):
        //   %Phi4 = Phi %Clone0, Prolog0, %Phi2, Epilog0
        //
        // VRMap = {0: %Clone0, 1: %Clone1, 2: %Clone2}
        // VRMapPhi (after Kernel) = {0: %Phi1, 1: %Phi0}
        // VRMapPhi (after Epilog0) = {0: %Phi3, 1: %Phi2}

        Register PhiOp1 = VRMap[PrologStage][Def];
        if (np <= PrologStage)
          PhiOp1 = VRMap[PrologStage - np][Def];
        if (!InKernel) {
          if (PrevStage == LastStageNum && np == 0)
            PhiOp2 = VRMap[LastStageNum][Def];
          else
            PhiOp2 = VRMapPhi[PrevStage - np][Def];
        }

        const TargetRegisterClass *RC = MRI.getRegClass(Def);
        Register NewReg = MRI.createVirtualRegister(RC);

        MachineInstrBuilder NewPhi =
            BuildMI(*NewBB, NewBB->getFirstNonPHI(), DebugLoc(),
                    TII->get(TargetOpcode::PHI), NewReg);
        NewPhi.addReg(PhiOp1).addMBB(BB1);
        NewPhi.addReg(PhiOp2).addMBB(BB2);
        LIS.InsertMachineInstrInMaps(*NewPhi);
        if (np == 0)
          InstrMap[NewPhi] = &*BBI;

        // Rewrite uses and update the map. The actions depend upon whether
        // we are generating code for the kernel or epilog blocks.
        if (InKernel) {
          rewriteScheduledInstr(NewBB, InstrMap, CurStageNum, np, &*BBI, PhiOp1,
                                NewReg);
          rewriteScheduledInstr(NewBB, InstrMap, CurStageNum, np, &*BBI, PhiOp2,
                                NewReg);

          PhiOp2 = NewReg;
          VRMapPhi[PrevStage - np - 1][Def] = NewReg;
        } else {
          VRMapPhi[CurStageNum - np][Def] = NewReg;
          if (np == NumPhis - 1)
            rewriteScheduledInstr(NewBB, InstrMap, CurStageNum, np, &*BBI, Def,
                                  NewReg);
        }
        if (IsLast && np == NumPhis - 1)
          replaceRegUsesAfterLoop(Def, NewReg, BB, MRI);
      }
    }
  }
}

/// Remove instructions that generate values with no uses.
/// Typically, these are induction variable operations that generate values
/// used in the loop itself. A dead instruction has a definition with
/// no uses, or uses that occur in the original loop only.
void ModuloScheduleExpander::removeDeadInstructions(MachineBasicBlock *KernelBB,
                                                    MBBVectorTy &EpilogBBs) {
  // For each epilog block, check that the value defined by each instruction
  // is used. If not, delete it.
  for (MachineBasicBlock *MBB : llvm::reverse(EpilogBBs))
    for (MachineBasicBlock::reverse_instr_iterator MI = MBB->instr_rbegin(),
                                                    ME = MBB->instr_rend();
         MI != ME;) {
      // From DeadMachineInstructionElim. Don't delete inline assembly.
      if (MI->isInlineAsm()) {
        ++MI;
        continue;
      }
      bool SawStore = false;
      // Check if it's safe to remove the instruction due to side effects.
      // We can, and want to, remove Phis here.
      if (!MI->isSafeToMove(SawStore) && !MI->isPHI()) {
        ++MI;
        continue;
      }
      bool used = true;
      for (const MachineOperand &MO : MI->all_defs()) {
        Register reg = MO.getReg();
        // Assume physical registers are used, unless they are marked dead.
        if (reg.isPhysical()) {
          used = !MO.isDead();
          if (used)
            break;
          continue;
        }
        unsigned realUses = 0;
        for (const MachineOperand &U : MRI.use_operands(reg)) {
          // Check if there are any uses that occur only in the original
          // loop. If so, that's not a real use.
          if (U.getParent()->getParent() != BB) {
            realUses++;
            used = true;
            break;
          }
        }
        if (realUses > 0)
          break;
        used = false;
      }
      if (!used) {
        LIS.RemoveMachineInstrFromMaps(*MI);
        MI++->eraseFromParent();
        continue;
      }
      ++MI;
    }
  // In the kernel block, check if we can remove a Phi that generates a value
  // used in an instruction removed in the epilog block.
  for (MachineInstr &MI : llvm::make_early_inc_range(KernelBB->phis())) {
    Register reg = MI.getOperand(0).getReg();
    if (MRI.use_begin(reg) == MRI.use_end()) {
      LIS.RemoveMachineInstrFromMaps(MI);
      MI.eraseFromParent();
    }
  }
}

/// For loop carried definitions, we split the lifetime of a virtual register
/// that has uses past the definition in the next iteration. A copy with a new
/// virtual register is inserted before the definition, which helps with
/// generating a better register assignment.
///
///   v1 = phi(a, v2)     v1 = phi(a, v2)
///   v2 = phi(b, v3)     v2 = phi(b, v3)
///   v3 = ..             v4 = copy v1
///   .. = V1             v3 = ..
///                       .. = v4
void ModuloScheduleExpander::splitLifetimes(MachineBasicBlock *KernelBB,
                                            MBBVectorTy &EpilogBBs) {
  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
  for (auto &PHI : KernelBB->phis()) {
    Register Def = PHI.getOperand(0).getReg();
    // Check for any Phi definition that is used as an operand of another Phi
    // in the same block.
    for (MachineRegisterInfo::use_instr_iterator I = MRI.use_instr_begin(Def),
                                                 E = MRI.use_instr_end();
         I != E; ++I) {
      if (I->isPHI() && I->getParent() == KernelBB) {
        // Get the loop carried definition.
        Register LCDef = getLoopPhiReg(PHI, KernelBB);
        if (!LCDef)
          continue;
        MachineInstr *MI = MRI.getVRegDef(LCDef);
        if (!MI || MI->getParent() != KernelBB || MI->isPHI())
          continue;
        // Search through the rest of the block looking for uses of the Phi
        // definition. If one occurs, then split the lifetime.
        Register SplitReg;
        for (auto &BBJ : make_range(MachineBasicBlock::instr_iterator(MI),
                                    KernelBB->instr_end()))
          if (BBJ.readsRegister(Def, /*TRI=*/nullptr)) {
            // We split the lifetime when we find the first use.
            if (!SplitReg) {
              SplitReg = MRI.createVirtualRegister(MRI.getRegClass(Def));
              MachineInstr *newCopy =
                  BuildMI(*KernelBB, MI, MI->getDebugLoc(),
                          TII->get(TargetOpcode::COPY), SplitReg)
                      .addReg(Def);
              LIS.InsertMachineInstrInMaps(*newCopy);
            }
            BBJ.substituteRegister(Def, SplitReg, 0, *TRI);
          }
        if (!SplitReg)
          continue;
        // Search through each of the epilog blocks for any uses to be renamed.
        for (auto &Epilog : EpilogBBs)
          for (auto &I : *Epilog)
            if (I.readsRegister(Def, /*TRI=*/nullptr))
              I.substituteRegister(Def, SplitReg, 0, *TRI);
        break;
      }
    }
  }
}

/// Remove the incoming block from the Phis in a basic block.
static void removePhis(MachineBasicBlock *BB, MachineBasicBlock *Incoming) {
  for (MachineInstr &MI : *BB) {
    if (!MI.isPHI())
      break;
    for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2)
      if (MI.getOperand(i + 1).getMBB() == Incoming) {
        MI.removeOperand(i + 1);
        MI.removeOperand(i);
        break;
      }
  }
}

/// Create branches from each prolog basic block to the appropriate epilog
/// block. These edges are needed if the loop ends before reaching the
/// kernel.
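/// Prologs and epilogs are paired working outward from the kernel: the last
/// prolog branches to the first epilog, and so on. A conditional branch is
/// needed whenever the trip count is not statically known to be large enough
/// to reach the kernel from that prolog.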
void ModuloScheduleExpander::addBranches(MachineBasicBlock &PreheaderBB,
                                         MBBVectorTy &PrologBBs,
                                         MachineBasicBlock *KernelBB,
                                         MBBVectorTy &EpilogBBs,
                                         ValueMapTy *VRMap) {
  assert(PrologBBs.size() == EpilogBBs.size() && "Prolog/Epilog mismatch");
  MachineBasicBlock *LastPro = KernelBB;
  MachineBasicBlock *LastEpi = KernelBB;

  // Start from the blocks connected to the kernel and work "out"
  // to the first prolog and the last epilog blocks.
  unsigned MaxIter = PrologBBs.size() - 1;
  for (unsigned i = 0, j = MaxIter; i <= MaxIter; ++i, --j) {
    // Add branches to the prolog that go to the corresponding
    // epilog, and the fall-thru prolog/kernel block.
    MachineBasicBlock *Prolog = PrologBBs[j];
    MachineBasicBlock *Epilog = EpilogBBs[i];

    SmallVector<MachineOperand, 4> Cond;
    std::optional<bool> StaticallyGreater =
        LoopInfo->createTripCountGreaterCondition(j + 1, *Prolog, Cond);
    unsigned numAdded = 0;
    if (!StaticallyGreater) {
      Prolog->addSuccessor(Epilog);
      numAdded = TII->insertBranch(*Prolog, Epilog, LastPro, Cond, DebugLoc());
    } else if (*StaticallyGreater == false) {
      Prolog->addSuccessor(Epilog);
      Prolog->removeSuccessor(LastPro);
      LastEpi->removeSuccessor(Epilog);
      numAdded = TII->insertBranch(*Prolog, Epilog, nullptr, Cond, DebugLoc());
      removePhis(Epilog, LastEpi);
      // Remove the blocks that are no longer referenced.
      if (LastPro != LastEpi) {
        for (auto &MI : *LastEpi)
          LIS.RemoveMachineInstrFromMaps(MI);
        LastEpi->clear();
        LastEpi->eraseFromParent();
      }
      if (LastPro == KernelBB) {
        LoopInfo->disposed(&LIS);
        NewKernel = nullptr;
      }
      for (auto &MI : *LastPro)
        LIS.RemoveMachineInstrFromMaps(MI);
      LastPro->clear();
      LastPro->eraseFromParent();
    } else {
      numAdded = TII->insertBranch(*Prolog, LastPro, nullptr, Cond, DebugLoc());
      removePhis(Epilog, Prolog);
    }
    LastPro = Prolog;
    LastEpi = Epilog;
    for (MachineBasicBlock::reverse_instr_iterator I = Prolog->instr_rbegin(),
                                                   E = Prolog->instr_rend();
         I != E && numAdded > 0; ++I, --numAdded)
      updateInstruction(&*I, false, j, 0, VRMap);
  }

  if (NewKernel) {
    LoopInfo->setPreheader(PrologBBs[MaxIter]);
    LoopInfo->adjustTripCount(-(MaxIter + 1));
  }
}

/// Return true if we can compute the amount the instruction changes
/// during each iteration. Set Delta to the amount of the change.
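/// For example, if the base register of MI's memory operand is advanced by an
/// increment of 8 on every loop iteration, Delta is set to 8.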
bool ModuloScheduleExpander::computeDelta(MachineInstr &MI, unsigned &Delta) {
  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
  const MachineOperand *BaseOp;
  int64_t Offset;
  bool OffsetIsScalable;
  if (!TII->getMemOperandWithOffset(MI, BaseOp, Offset, OffsetIsScalable, TRI))
    return false;

  // FIXME: This algorithm assumes instructions have fixed-size offsets.
  if (OffsetIsScalable)
    return false;

  if (!BaseOp->isReg())
    return false;

  Register BaseReg = BaseOp->getReg();

  MachineRegisterInfo &MRI = MF.getRegInfo();
  // Check if there is a Phi. If so, get the definition in the loop.
  MachineInstr *BaseDef = MRI.getVRegDef(BaseReg);
  if (BaseDef && BaseDef->isPHI()) {
    BaseReg = getLoopPhiReg(*BaseDef, MI.getParent());
    BaseDef = MRI.getVRegDef(BaseReg);
  }
  if (!BaseDef)
    return false;

  int D = 0;
  if (!TII->getIncrementValue(*BaseDef, D) && D >= 0)
    return false;

  Delta = D;
  return true;
}

/// Update the memory operand with a new offset when the pipeliner
/// generates a new copy of the instruction that refers to a
/// different memory location.
void ModuloScheduleExpander::updateMemOperands(MachineInstr &NewMI,
                                               MachineInstr &OldMI,
                                               unsigned Num) {
  if (Num == 0)
    return;
  // If the instruction has memory operands, then adjust the offset
  // when the instruction appears in different stages.
  if (NewMI.memoperands_empty())
    return;
  SmallVector<MachineMemOperand *, 2> NewMMOs;
  for (MachineMemOperand *MMO : NewMI.memoperands()) {
    // TODO: Figure out whether isAtomic is really necessary (see D57601).
    if (MMO->isVolatile() || MMO->isAtomic() ||
        (MMO->isInvariant() && MMO->isDereferenceable()) ||
        (!MMO->getValue())) {
      NewMMOs.push_back(MMO);
      continue;
    }
    unsigned Delta;
    if (Num != UINT_MAX && computeDelta(OldMI, Delta)) {
      int64_t AdjOffset = Delta * Num;
      NewMMOs.push_back(
          MF.getMachineMemOperand(MMO, AdjOffset, MMO->getSize()));
    } else {
      NewMMOs.push_back(MF.getMachineMemOperand(
          MMO, 0, LocationSize::beforeOrAfterPointer()));
    }
  }
  NewMI.setMemRefs(MF, NewMMOs);
}

/// Clone the instruction for the new pipelined loop and update the
/// memory operands, if needed.
MachineInstr *ModuloScheduleExpander::cloneInstr(MachineInstr *OldMI,
                                                 unsigned CurStageNum,
                                                 unsigned InstStageNum) {
  MachineInstr *NewMI = MF.CloneMachineInstr(OldMI);
  updateMemOperands(*NewMI, *OldMI, CurStageNum - InstStageNum);
  return NewMI;
}

/// Clone the instruction for the new pipelined loop. If needed, this
/// function updates the instruction using the values saved in the
/// InstrChanges structure.
MachineInstr *ModuloScheduleExpander::cloneAndChangeInstr(
    MachineInstr *OldMI, unsigned CurStageNum, unsigned InstStageNum) {
  MachineInstr *NewMI = MF.CloneMachineInstr(OldMI);
  auto It = InstrChanges.find(OldMI);
  if (It != InstrChanges.end()) {
    std::pair<Register, int64_t> RegAndOffset = It->second;
    unsigned BasePos, OffsetPos;
    if (!TII->getBaseAndOffsetPosition(*OldMI, BasePos, OffsetPos))
      return nullptr;
    int64_t NewOffset = OldMI->getOperand(OffsetPos).getImm();
    MachineInstr *LoopDef = findDefInLoop(RegAndOffset.first);
    if (Schedule.getStage(LoopDef) > (signed)InstStageNum)
      NewOffset += RegAndOffset.second * (CurStageNum - InstStageNum);
    NewMI->getOperand(OffsetPos).setImm(NewOffset);
  }
  updateMemOperands(*NewMI, *OldMI, CurStageNum - InstStageNum);
  return NewMI;
}

/// Update the machine instruction with new virtual registers. This
/// function may change the definitions and/or uses.
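/// Each definition receives a fresh virtual register, recorded in VRMap for
/// the current stage; each use is rewritten to the copy created in the stage
/// that holds the last definition of the original register, which may be an
/// earlier stage than the one being generated.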
void ModuloScheduleExpander::updateInstruction(MachineInstr *NewMI,
                                               bool LastDef,
                                               unsigned CurStageNum,
                                               unsigned InstrStageNum,
                                               ValueMapTy *VRMap) {
  for (MachineOperand &MO : NewMI->operands()) {
    if (!MO.isReg() || !MO.getReg().isVirtual())
      continue;
    Register reg = MO.getReg();
    if (MO.isDef()) {
      // Create a new virtual register for the definition.
      const TargetRegisterClass *RC = MRI.getRegClass(reg);
      Register NewReg = MRI.createVirtualRegister(RC);
      MO.setReg(NewReg);
      VRMap[CurStageNum][reg] = NewReg;
      if (LastDef)
        replaceRegUsesAfterLoop(reg, NewReg, BB, MRI);
    } else if (MO.isUse()) {
      MachineInstr *Def = MRI.getVRegDef(reg);
      // Compute the stage that contains the last definition for instruction.
      int DefStageNum = Schedule.getStage(Def);
      unsigned StageNum = CurStageNum;
      if (DefStageNum != -1 && (int)InstrStageNum > DefStageNum) {
        // Compute the difference in stages between the definition and the use.
        unsigned StageDiff = (InstrStageNum - DefStageNum);
        // Make an adjustment to get the last definition.
        StageNum -= StageDiff;
      }
      if (auto It = VRMap[StageNum].find(reg); It != VRMap[StageNum].end())
        MO.setReg(It->second);
    }
  }
}

/// Return the instruction in the loop that defines the register.
/// If the definition is a Phi, then follow the Phi operand to
/// the instruction in the loop.
MachineInstr *ModuloScheduleExpander::findDefInLoop(Register Reg) {
  SmallPtrSet<MachineInstr *, 8> Visited;
  MachineInstr *Def = MRI.getVRegDef(Reg);
  while (Def->isPHI()) {
    if (!Visited.insert(Def).second)
      break;
    for (unsigned i = 1, e = Def->getNumOperands(); i < e; i += 2)
      if (Def->getOperand(i + 1).getMBB() == BB) {
        Def = MRI.getVRegDef(Def->getOperand(i).getReg());
        break;
      }
  }
  return Def;
}

/// Return the new name for the value from the previous stage.
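/// The lookup may recurse when the loop value is itself a Phi that has
/// already been scheduled, stepping back one stage at a time.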
Register ModuloScheduleExpander::getPrevMapVal(
    unsigned StageNum, unsigned PhiStage, Register LoopVal, unsigned LoopStage,
    ValueMapTy *VRMap, MachineBasicBlock *BB) {
  Register PrevVal;
  if (StageNum > PhiStage) {
    MachineInstr *LoopInst = MRI.getVRegDef(LoopVal);
    if (PhiStage == LoopStage && VRMap[StageNum - 1].count(LoopVal))
      // The name is defined in the previous stage.
      PrevVal = VRMap[StageNum - 1][LoopVal];
    else if (VRMap[StageNum].count(LoopVal))
      // The previous name is defined in the current stage when the instruction
      // order is swapped.
      PrevVal = VRMap[StageNum][LoopVal];
    else if (!LoopInst->isPHI() || LoopInst->getParent() != BB)
      // The loop value hasn't yet been scheduled.
      PrevVal = LoopVal;
    else if (StageNum == PhiStage + 1)
      // The loop value is another phi, which has not been scheduled.
      PrevVal = getInitPhiReg(*LoopInst, BB);
    else if (StageNum > PhiStage + 1 && LoopInst->getParent() == BB)
      // The loop value is another phi, which has been scheduled.
      PrevVal =
          getPrevMapVal(StageNum - 1, PhiStage, getLoopPhiReg(*LoopInst, BB),
                        LoopStage, VRMap, BB);
  }
  return PrevVal;
}

/// Rewrite the Phi values in the specified block to use the mappings
/// from the initial operand. Once the Phi is scheduled, we switch
/// to using the loop value instead of the Phi value, so those names
/// do not need to be rewritten.
void ModuloScheduleExpander::rewritePhiValues(MachineBasicBlock *NewBB,
                                              unsigned StageNum,
                                              ValueMapTy *VRMap,
                                              InstrMapTy &InstrMap) {
  for (auto &PHI : BB->phis()) {
    Register InitVal;
    Register LoopVal;
    getPhiRegs(PHI, BB, InitVal, LoopVal);
    Register PhiDef = PHI.getOperand(0).getReg();

    unsigned PhiStage = (unsigned)Schedule.getStage(MRI.getVRegDef(PhiDef));
    unsigned LoopStage = (unsigned)Schedule.getStage(MRI.getVRegDef(LoopVal));
    unsigned NumPhis = getStagesForPhi(PhiDef);
    if (NumPhis > StageNum)
      NumPhis = StageNum;
    for (unsigned np = 0; np <= NumPhis; ++np) {
      Register NewVal =
          getPrevMapVal(StageNum - np, PhiStage, LoopVal, LoopStage, VRMap, BB);
      if (!NewVal)
        NewVal = InitVal;
      rewriteScheduledInstr(NewBB, InstrMap, StageNum - np, np, &PHI, PhiDef,
                            NewVal);
    }
  }
}

/// Rewrite a previously scheduled instruction to use the register value
/// from the new instruction. Make sure the instruction occurs in the
/// basic block, and we don't change the uses in the new instruction.
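/// The replacement register is chosen between NewReg and PrevReg based on the
/// relative stages (and cycles) of the Phi and the previously scheduled use.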
void ModuloScheduleExpander::rewriteScheduledInstr(
    MachineBasicBlock *BB, InstrMapTy &InstrMap, unsigned CurStageNum,
    unsigned PhiNum, MachineInstr *Phi, Register OldReg, Register NewReg,
    Register PrevReg) {
  bool InProlog = (CurStageNum < (unsigned)Schedule.getNumStages() - 1);
  int StagePhi = Schedule.getStage(Phi) + PhiNum;
  // Rewrite uses that have been scheduled already to use the new
  // Phi register.
  for (MachineOperand &UseOp :
       llvm::make_early_inc_range(MRI.use_operands(OldReg))) {
    MachineInstr *UseMI = UseOp.getParent();
    if (UseMI->getParent() != BB)
      continue;
    if (UseMI->isPHI()) {
      if (!Phi->isPHI() && UseMI->getOperand(0).getReg() == NewReg)
        continue;
      if (getLoopPhiReg(*UseMI, BB) != OldReg)
        continue;
    }
    InstrMapTy::iterator OrigInstr = InstrMap.find(UseMI);
    assert(OrigInstr != InstrMap.end() && "Instruction not scheduled.");
    MachineInstr *OrigMI = OrigInstr->second;
    int StageSched = Schedule.getStage(OrigMI);
    int CycleSched = Schedule.getCycle(OrigMI);
    Register ReplaceReg;
    // This is the stage for the scheduled instruction.
    if (StagePhi == StageSched && Phi->isPHI()) {
      int CyclePhi = Schedule.getCycle(Phi);
      if (PrevReg && InProlog)
        ReplaceReg = PrevReg;
      else if (PrevReg && !isLoopCarried(*Phi) &&
               (CyclePhi <= CycleSched || OrigMI->isPHI()))
        ReplaceReg = PrevReg;
      else
        ReplaceReg = NewReg;
    }
    // The scheduled instruction occurs before the scheduled Phi, and the
    // Phi is not loop carried.
    if (!InProlog && StagePhi + 1 == StageSched && !isLoopCarried(*Phi))
      ReplaceReg = NewReg;
    if (StagePhi > StageSched && Phi->isPHI())
      ReplaceReg = NewReg;
    if (!InProlog && !Phi->isPHI() && StagePhi < StageSched)
      ReplaceReg = NewReg;
    if (ReplaceReg) {
      const TargetRegisterClass *NRC =
          MRI.constrainRegClass(ReplaceReg, MRI.getRegClass(OldReg));
      if (NRC)
        UseOp.setReg(ReplaceReg);
      else {
        Register SplitReg = MRI.createVirtualRegister(MRI.getRegClass(OldReg));
        MachineInstr *newCopy = BuildMI(*BB, UseMI, UseMI->getDebugLoc(),
                                        TII->get(TargetOpcode::COPY), SplitReg)
                                    .addReg(ReplaceReg);
        UseOp.setReg(SplitReg);
        LIS.InsertMachineInstrInMaps(*newCopy);
      }
    }
  }
}

bool ModuloScheduleExpander::isLoopCarried(MachineInstr &Phi) {
  if (!Phi.isPHI())
    return false;
  int DefCycle = Schedule.getCycle(&Phi);
  int DefStage = Schedule.getStage(&Phi);

  Register InitVal;
  Register LoopVal;
  getPhiRegs(Phi, Phi.getParent(), InitVal, LoopVal);
  MachineInstr *Use = MRI.getVRegDef(LoopVal);
  if (!Use || Use->isPHI())
    return true;
  int LoopCycle = Schedule.getCycle(Use);
  int LoopStage = Schedule.getStage(Use);
  return (LoopCycle > DefCycle) || (LoopStage <= DefStage);
}

//===----------------------------------------------------------------------===//
// PeelingModuloScheduleExpander implementation
//===----------------------------------------------------------------------===//
// This is a reimplementation of ModuloScheduleExpander that works by creating
// a fully correct steady-state kernel and peeling off the prolog and epilogs.
//===----------------------------------------------------------------------===//

namespace {
// Remove any dead phis in MBB. Dead phis either have only one block as input
// (in which case they are the identity) or have no uses.
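// When KeepSingleSrcPhi is set, single-input phis are preserved and only
// phis with no uses are removed.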
void EliminateDeadPhis(MachineBasicBlock *MBB, MachineRegisterInfo &MRI,
                       LiveIntervals *LIS, bool KeepSingleSrcPhi = false) {
  bool Changed = true;
  while (Changed) {
    Changed = false;
    for (MachineInstr &MI : llvm::make_early_inc_range(MBB->phis())) {
      assert(MI.isPHI());
      if (MRI.use_empty(MI.getOperand(0).getReg())) {
        if (LIS)
          LIS->RemoveMachineInstrFromMaps(MI);
        MI.eraseFromParent();
        Changed = true;
      } else if (!KeepSingleSrcPhi && MI.getNumExplicitOperands() == 3) {
        const TargetRegisterClass *ConstrainRegClass =
            MRI.constrainRegClass(MI.getOperand(1).getReg(),
                                  MRI.getRegClass(MI.getOperand(0).getReg()));
        assert(ConstrainRegClass &&
               "Expected a valid constrained register class!");
        (void)ConstrainRegClass;
        MRI.replaceRegWith(MI.getOperand(0).getReg(),
                           MI.getOperand(1).getReg());
        if (LIS)
          LIS->RemoveMachineInstrFromMaps(MI);
        MI.eraseFromParent();
        Changed = true;
      }
    }
  }
}

/// Rewrites the kernel block in-place to adhere to the given schedule.
/// KernelRewriter holds all of the state required to perform the rewriting.
class KernelRewriter {
  ModuloSchedule &S;
  MachineBasicBlock *BB;
  MachineBasicBlock *PreheaderBB, *ExitBB;
  MachineRegisterInfo &MRI;
  const TargetInstrInfo *TII;
  LiveIntervals *LIS;

  // Map from register class to canonical undef register for that class.
  DenseMap<const TargetRegisterClass *, Register> Undefs;
  // Map from <LoopReg, InitReg> to phi register for all created phis. Note
  // that this map is only used when InitReg is non-undef.
  DenseMap<std::pair<Register, Register>, Register> Phis;
  // Map from LoopReg to phi register where the InitReg is undef.
  DenseMap<Register, Register> UndefPhis;

  // Reg is used by MI. Return the new register MI should use to adhere to the
  // schedule. Insert phis as necessary.
  Register remapUse(Register Reg, MachineInstr &MI);
  // Insert a phi that carries LoopReg from the loop body and InitReg
  // otherwise. If InitReg is not given it is chosen arbitrarily. It will
  // either be undef or will be chosen so as to share another phi.
  Register phi(Register LoopReg, std::optional<Register> InitReg = {},
               const TargetRegisterClass *RC = nullptr);
  // Create an undef register of the given register class.
  Register undef(const TargetRegisterClass *RC);

public:
  KernelRewriter(MachineLoop &L, ModuloSchedule &S, MachineBasicBlock *LoopBB,
                 LiveIntervals *LIS = nullptr);
  void rewrite();
};
} // namespace

KernelRewriter::KernelRewriter(MachineLoop &L, ModuloSchedule &S,
                               MachineBasicBlock *LoopBB, LiveIntervals *LIS)
    : S(S), BB(LoopBB), PreheaderBB(L.getLoopPreheader()),
      ExitBB(L.getExitBlock()), MRI(BB->getParent()->getRegInfo()),
      TII(BB->getParent()->getSubtarget().getInstrInfo()), LIS(LIS) {
  PreheaderBB = *BB->pred_begin();
  if (PreheaderBB == BB)
    PreheaderBB = *std::next(BB->pred_begin());
}

void KernelRewriter::rewrite() {
  // Rearrange the loop to be in schedule order. Note that the schedule may
  // contain instructions that are not owned by the loop block (InstrChanges
  // and friends), so we gracefully handle unowned instructions and delete any
  // instructions that weren't in the schedule.
  auto InsertPt = BB->getFirstTerminator();
  MachineInstr *FirstMI = nullptr;
  for (MachineInstr *MI : S.getInstructions()) {
    if (MI->isPHI())
      continue;
    if (MI->getParent())
      MI->removeFromParent();
    BB->insert(InsertPt, MI);
    if (!FirstMI)
      FirstMI = MI;
  }
  assert(FirstMI && "Failed to find first MI in schedule");

  // At this point all of the scheduled instructions are between FirstMI
  // and the end of the block. Kill from the first non-phi to FirstMI.
  for (auto I = BB->getFirstNonPHI(); I != FirstMI->getIterator();) {
    if (LIS)
      LIS->RemoveMachineInstrFromMaps(*I);
    (I++)->eraseFromParent();
  }

  // Now remap every instruction in the loop.
  for (MachineInstr &MI : *BB) {
    if (MI.isPHI() || MI.isTerminator())
      continue;
    for (MachineOperand &MO : MI.uses()) {
      if (!MO.isReg() || MO.getReg().isPhysical() || MO.isImplicit())
        continue;
      Register Reg = remapUse(MO.getReg(), MI);
      MO.setReg(Reg);
    }
  }
  EliminateDeadPhis(BB, MRI, LIS);

  // Ensure a phi exists for all instructions that are either referenced by
  // an illegal phi or by an instruction outside the loop. This allows us to
  // treat remaps of these values the same as "normal" values that come from
  // loop-carried phis.
  for (auto MI = BB->getFirstNonPHI(); MI != BB->end(); ++MI) {
    if (MI->isPHI()) {
      Register R = MI->getOperand(0).getReg();
      phi(R);
      continue;
    }

    for (MachineOperand &Def : MI->defs()) {
      for (MachineInstr &MI : MRI.use_instructions(Def.getReg())) {
        if (MI.getParent() != BB) {
          phi(Def.getReg());
          break;
        }
      }
    }
  }
}

1387Register KernelRewriter::remapUse(Register Reg, MachineInstr &MI) {
1388 MachineInstr *Producer = MRI.getUniqueVRegDef(Reg);
1389 if (!Producer)
1390 return Reg;
1391
1392 int ConsumerStage = S.getStage(MI: &MI);
1393 if (!Producer->isPHI()) {
1394 // Non-phi producers are simple to remap. Insert as many phis as the
1395 // difference between the consumer and producer stages.
1396 if (Producer->getParent() != BB)
1397 // Producer was not inside the loop. Use the register as-is.
1398 return Reg;
1399 int ProducerStage = S.getStage(MI: Producer);
1400 assert(ConsumerStage != -1 &&
1401 "In-loop consumer should always be scheduled!");
1402 assert(ConsumerStage >= ProducerStage);
1403 unsigned StageDiff = ConsumerStage - ProducerStage;
1404
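// For example, with StageDiff == 2 the use is rewritten through two chained
// loop-carried phis, Reg -> phi(Reg) -> phi(phi(Reg)), so the consumer reads
// the value produced two kernel iterations earlier.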
1405 for (unsigned I = 0; I < StageDiff; ++I)
1406 Reg = phi(LoopReg: Reg);
1407 return Reg;
1408 }
1409
1410 // First, dive through the phi chain to find the defaults for the generated
1411 // phis.
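// For example, if Reg is %r2 with %r2 = PHI [%init2, Preheader], [%r1, BB] and
// %r1 = PHI [%init1, Preheader], [%r0, BB], this walk leaves LoopReg = %r0 and
// Defaults = {%init2, %init1} (consumer-side default first).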
1412 SmallVector<std::optional<Register>, 4> Defaults;
1413 Register LoopReg = Reg;
1414 auto LoopProducer = Producer;
1415 while (LoopProducer->isPHI() && LoopProducer->getParent() == BB) {
1416 LoopReg = getLoopPhiReg(Phi&: *LoopProducer, LoopBB: BB);
1417 Defaults.emplace_back(Args: getInitPhiReg(Phi&: *LoopProducer, LoopBB: BB));
1418 LoopProducer = MRI.getUniqueVRegDef(Reg: LoopReg);
1419 assert(LoopProducer);
1420 }
1421 int LoopProducerStage = S.getStage(MI: LoopProducer);
1422
1423 std::optional<Register> IllegalPhiDefault;
1424
1425 if (LoopProducerStage == -1) {
1426 // Do nothing.
1427 } else if (LoopProducerStage > ConsumerStage) {
1428 // This schedule is only representable if ProducerStage == ConsumerStage+1.
1429 // In addition, Consumer's cycle must be scheduled after Producer in the
1430 // rescheduled loop. This is enforced by the pipeliner's ASAP and ALAP
1431 // functions.
1432#ifndef NDEBUG // Silence unused variables in non-asserts mode.
1433 int LoopProducerCycle = S.getCycle(LoopProducer);
1434 int ConsumerCycle = S.getCycle(&MI);
1435#endif
1436 assert(LoopProducerCycle <= ConsumerCycle);
1437 assert(LoopProducerStage == ConsumerStage + 1);
1438 // Peel off the first phi from Defaults and insert a phi between producer
1439 // and consumer. This phi will not be at the front of the block so we
1440 // consider it illegal. It will only exist during the rewrite process; it
1441 // needs to exist while we peel off prologs because these could take the
1442 // default value. After that we can replace all uses with the loop producer
1443 // value.
1444 IllegalPhiDefault = Defaults.front();
1445 Defaults.erase(CI: Defaults.begin());
1446 } else {
1447 assert(ConsumerStage >= LoopProducerStage);
1448 int StageDiff = ConsumerStage - LoopProducerStage;
1449 if (StageDiff > 0) {
1450 LLVM_DEBUG(dbgs() << " -- padding defaults array from " << Defaults.size()
1451 << " to " << (Defaults.size() + StageDiff) << "\n");
1452 // If we need more phis than we have defaults for, pad out with undefs for
1453 // the earliest phis, which are at the end of the defaults chain (the
1454 // chain is in reverse order).
1455 Defaults.resize(N: Defaults.size() + StageDiff,
1456 NV: Defaults.empty() ? std::optional<Register>()
1457 : Defaults.back());
1458 }
1459 }
1460
1461 // Now we know the number of stages to jump back, insert the phi chain.
1462 auto DefaultI = Defaults.rbegin();
1463 while (DefaultI != Defaults.rend())
1464 LoopReg = phi(LoopReg, InitReg: *DefaultI++, RC: MRI.getRegClass(Reg));
1465
1466 if (IllegalPhiDefault) {
1467 // The consumer optionally consumes LoopProducer in the same iteration
1468 // (because the producer is scheduled at an earlier cycle than the consumer)
1469 // or the initial value. To facilitate this we make the block temporarily
1470 // illegal by embedding a phi in the middle of it. We will fix this up
1471 // immediately prior to pruning.
1472 auto RC = MRI.getRegClass(Reg);
1473 Register R = MRI.createVirtualRegister(RegClass: RC);
1474 MachineInstr *IllegalPhi =
1475 BuildMI(BB&: *BB, I&: MI, MIMD: DebugLoc(), MCID: TII->get(Opcode: TargetOpcode::PHI), DestReg: R)
1476 .addReg(RegNo: *IllegalPhiDefault)
1477 .addMBB(MBB: PreheaderBB) // Block choice is arbitrary and has no effect.
1478 .addReg(RegNo: LoopReg)
1479 .addMBB(MBB: BB); // Block choice is arbitrary and has no effect.
1480 // Illegal phi should belong to the producer stage so that it can be
1481 // filtered correctly during peeling.
1482 S.setStage(MI: IllegalPhi, MIStage: LoopProducerStage);
1483 return R;
1484 }
1485
1486 return LoopReg;
1487}
1488
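// Create (or reuse) a loop-carried phi in BB of the form
//   R = PHI [InitReg, PreheaderBB], [LoopReg, BB]
// An existing phi (possibly one whose init operand was undef) is reused when
// possible; if no InitReg is given, the preheader value is an IMPLICIT_DEF.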
1489Register KernelRewriter::phi(Register LoopReg, std::optional<Register> InitReg,
1490 const TargetRegisterClass *RC) {
1491 // If the init register is not undef, try and find an existing phi.
1492 if (InitReg) {
1493 auto I = Phis.find(Val: {LoopReg, *InitReg});
1494 if (I != Phis.end())
1495 return I->second;
1496 } else {
1497 for (auto &KV : Phis) {
1498 if (KV.first.first == LoopReg)
1499 return KV.second;
1500 }
1501 }
1502
1503 // InitReg is either undef or no existing phi takes InitReg as input. Try and
1504 // find a phi that takes undef as input.
1505 auto I = UndefPhis.find(Val: LoopReg);
1506 if (I != UndefPhis.end()) {
1507 Register R = I->second;
1508 if (!InitReg)
1509 // Found a phi taking undef as input, and this input is undef so return
1510 // without any more changes.
1511 return R;
1512 // Found a phi taking undef as input, so rewrite it to take InitReg.
1513 MachineInstr *MI = MRI.getVRegDef(Reg: R);
1514 MI->getOperand(i: 1).setReg(*InitReg);
1515 Phis.insert(KV: {{LoopReg, *InitReg}, R});
1516 const TargetRegisterClass *ConstrainRegClass =
1517 MRI.constrainRegClass(Reg: R, RC: MRI.getRegClass(Reg: *InitReg));
1518 assert(ConstrainRegClass && "Expected a valid constrained register class!");
1519 (void)ConstrainRegClass;
1520 UndefPhis.erase(I);
1521 return R;
1522 }
1523
1524 // Failed to find any existing phi to reuse, so create a new one.
1525 if (!RC)
1526 RC = MRI.getRegClass(Reg: LoopReg);
1527 Register R = MRI.createVirtualRegister(RegClass: RC);
1528 if (InitReg) {
1529 const TargetRegisterClass *ConstrainRegClass =
1530 MRI.constrainRegClass(Reg: R, RC: MRI.getRegClass(Reg: *InitReg));
1531 assert(ConstrainRegClass && "Expected a valid constrained register class!");
1532 (void)ConstrainRegClass;
1533 }
1534 BuildMI(BB&: *BB, I: BB->getFirstNonPHI(), MIMD: DebugLoc(), MCID: TII->get(Opcode: TargetOpcode::PHI), DestReg: R)
1535 .addReg(RegNo: InitReg ? *InitReg : undef(RC))
1536 .addMBB(MBB: PreheaderBB)
1537 .addReg(RegNo: LoopReg)
1538 .addMBB(MBB: BB);
1539 if (!InitReg)
1540 UndefPhis[LoopReg] = R;
1541 else
1542 Phis[{LoopReg, *InitReg}] = R;
1543 return R;
1544}
1545
1546Register KernelRewriter::undef(const TargetRegisterClass *RC) {
1547 Register &R = Undefs[RC];
1548 if (R == 0) {
1549 // Create an IMPLICIT_DEF that defines this register if we need it.
1550 // All uses of this should be removed by the time we have finished unrolling
1551 // prologs and epilogs.
1552 R = MRI.createVirtualRegister(RegClass: RC);
1553 auto *InsertBB = &PreheaderBB->getParent()->front();
1554 BuildMI(BB&: *InsertBB, I: InsertBB->getFirstTerminator(), MIMD: DebugLoc(),
1555 MCID: TII->get(Opcode: TargetOpcode::IMPLICIT_DEF), DestReg: R);
1556 }
1557 return R;
1558}
1559
1560namespace {
1561/// Describes an operand in the kernel of a pipelined loop. Characteristics of
1562/// the operand are discovered, such as how many in-loop PHIs it has to jump
1563/// through and defaults for these phis.
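/// For example, a use of %a where %a = PHI [%i0, Pre], [%b, BB] and
/// %b = PHI [%i1, Pre], [%c, BB] jumps through two phis, has defaults
/// {%i0, %i1}, and ultimately resolves to %c.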
1564class KernelOperandInfo {
1565 MachineBasicBlock *BB;
1566 MachineRegisterInfo &MRI;
1567 SmallVector<Register, 4> PhiDefaults;
1568 MachineOperand *Source;
1569 MachineOperand *Target;
1570
1571public:
1572 KernelOperandInfo(MachineOperand *MO, MachineRegisterInfo &MRI,
1573 const SmallPtrSetImpl<MachineInstr *> &IllegalPhis)
1574 : MRI(MRI) {
1575 Source = MO;
1576 BB = MO->getParent()->getParent();
1577 while (isRegInLoop(MO)) {
1578 MachineInstr *MI = MRI.getVRegDef(Reg: MO->getReg());
1579 if (MI->isFullCopy()) {
1580 MO = &MI->getOperand(i: 1);
1581 continue;
1582 }
1583 if (!MI->isPHI())
1584 break;
1585 // If this is an illegal phi, don't count it in distance.
1586 if (IllegalPhis.count(Ptr: MI)) {
1587 MO = &MI->getOperand(i: 3);
1588 continue;
1589 }
1590
1591 Register Default = getInitPhiReg(Phi&: *MI, LoopBB: BB);
1592 MO = MI->getOperand(i: 2).getMBB() == BB ? &MI->getOperand(i: 1)
1593 : &MI->getOperand(i: 3);
1594 PhiDefaults.push_back(Elt: Default);
1595 }
1596 Target = MO;
1597 }
1598
1599 bool operator==(const KernelOperandInfo &Other) const {
1600 return PhiDefaults.size() == Other.PhiDefaults.size();
1601 }
1602
1603 void print(raw_ostream &OS) const {
1604 OS << "use of " << *Source << ": distance(" << PhiDefaults.size() << ") in "
1605 << *Source->getParent();
1606 }
1607
1608private:
1609 bool isRegInLoop(MachineOperand *MO) {
1610 return MO->isReg() && MO->getReg().isVirtual() &&
1611 MRI.getVRegDef(Reg: MO->getReg())->getParent() == BB;
1612 }
1613};
1614} // namespace
1615
1616MachineBasicBlock *
1617PeelingModuloScheduleExpander::peelKernel(LoopPeelDirection LPD) {
1618 MachineBasicBlock *NewBB = PeelSingleBlockLoop(Direction: LPD, Loop: BB, MRI, TII);
1619 if (LPD == LPD_Front)
1620 PeeledFront.push_back(x: NewBB);
1621 else
1622 PeeledBack.push_front(x: NewBB);
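// Record the correspondence between each instruction in the original kernel
// (its canonical form) and its clone in the peeled block, so that later steps
// can look up the per-block copy of any kernel instruction.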
1623 for (auto I = BB->begin(), NI = NewBB->begin(); !I->isTerminator();
1624 ++I, ++NI) {
1625 CanonicalMIs[&*I] = &*I;
1626 CanonicalMIs[&*NI] = &*I;
1627 BlockMIs[{NewBB, &*I}] = &*NI;
1628 BlockMIs[{BB, &*I}] = &*I;
1629 }
1630 return NewBB;
1631}
1632
1633void PeelingModuloScheduleExpander::filterInstructions(MachineBasicBlock *MB,
1634 int MinStage) {
1635 for (auto I = MB->getFirstInstrTerminator()->getReverseIterator();
1636 I != std::next(x: MB->getFirstNonPHI()->getReverseIterator());) {
1637 MachineInstr *MI = &*I++;
1638 int Stage = getStage(MI);
1639 if (Stage == -1 || Stage >= MinStage)
1640 continue;
1641
1642 for (MachineOperand &DefMO : MI->defs()) {
1643 SmallVector<std::pair<MachineInstr *, Register>, 4> Subs;
1644 for (MachineInstr &UseMI : MRI.use_instructions(Reg: DefMO.getReg())) {
1645 // Only PHIs can use values from this block by construction.
1646 // Match with the equivalent PHI in B.
1647 assert(UseMI.isPHI());
1648 Register Reg = getEquivalentRegisterIn(Reg: UseMI.getOperand(i: 0).getReg(),
1649 BB: MI->getParent());
1650 Subs.emplace_back(Args: &UseMI, Args&: Reg);
1651 }
1652 for (auto &Sub : Subs)
1653 Sub.first->substituteRegister(FromReg: DefMO.getReg(), ToReg: Sub.second, /*SubIdx=*/0,
1654 RegInfo: *MRI.getTargetRegisterInfo());
1655 }
1656 if (LIS)
1657 LIS->RemoveMachineInstrFromMaps(MI&: *MI);
1658 MI->eraseFromParent();
1659 }
1660}
1661
1662void PeelingModuloScheduleExpander::moveStageBetweenBlocks(
1663 MachineBasicBlock *DestBB, MachineBasicBlock *SourceBB, unsigned Stage) {
1664 auto InsertPt = DestBB->getFirstNonPHI();
1665 DenseMap<Register, Register> Remaps;
1666 for (MachineInstr &MI : llvm::make_early_inc_range(
1667 Range: llvm::make_range(x: SourceBB->getFirstNonPHI(), y: SourceBB->end()))) {
1668 if (MI.isPHI()) {
1669 // This is an illegal PHI. If we move any instructions using an illegal
1670 // PHI, we need to create a legal Phi.
1671 if (getStage(MI: &MI) != Stage) {
1672 // The legal Phi is not necessary if the illegal phi's stage
1673 // is being moved.
1674 Register PhiR = MI.getOperand(i: 0).getReg();
1675 auto RC = MRI.getRegClass(Reg: PhiR);
1676 Register NR = MRI.createVirtualRegister(RegClass: RC);
1677 MachineInstr *NI = BuildMI(BB&: *DestBB, I: DestBB->getFirstNonPHI(),
1678 MIMD: DebugLoc(), MCID: TII->get(Opcode: TargetOpcode::PHI), DestReg: NR)
1679 .addReg(RegNo: PhiR)
1680 .addMBB(MBB: SourceBB);
1681 BlockMIs[{DestBB, CanonicalMIs[&MI]}] = NI;
1682 CanonicalMIs[NI] = CanonicalMIs[&MI];
1683 Remaps[PhiR] = NR;
1684 }
1685 }
1686 if (getStage(MI: &MI) != Stage)
1687 continue;
1688 MI.removeFromParent();
1689 DestBB->insert(I: InsertPt, MI: &MI);
1690 auto *KernelMI = CanonicalMIs[&MI];
1691 BlockMIs[{DestBB, KernelMI}] = &MI;
1692 BlockMIs.erase(Val: {SourceBB, KernelMI});
1693 }
1694 SmallVector<MachineInstr *, 4> PhiToDelete;
1695 for (MachineInstr &MI : DestBB->phis()) {
1696 assert(MI.getNumOperands() == 3);
1697 MachineInstr *Def = MRI.getVRegDef(Reg: MI.getOperand(i: 1).getReg());
1698 // If the instruction referenced by the phi is moved inside the block
1699 // we don't need the phi anymore.
1700 if (getStage(MI: Def) == Stage) {
1701 Register PhiReg = MI.getOperand(i: 0).getReg();
1702 assert(Def->findRegisterDefOperandIdx(MI.getOperand(1).getReg(),
1703 /*TRI=*/nullptr) != -1);
1704 MRI.replaceRegWith(FromReg: MI.getOperand(i: 0).getReg(), ToReg: MI.getOperand(i: 1).getReg());
1705 MI.getOperand(i: 0).setReg(PhiReg);
1706 PhiToDelete.push_back(Elt: &MI);
1707 }
1708 }
1709 for (auto *P : PhiToDelete)
1710 P->eraseFromParent();
1711 InsertPt = DestBB->getFirstNonPHI();
1712 // Helper to clone Phi instructions into the destination block. We clone Phis
1713 // greedily to avoid a combinatorial explosion of Phi instructions.
1714 auto clonePhi = [&](MachineInstr *Phi) {
1715 MachineInstr *NewMI = MF.CloneMachineInstr(Orig: Phi);
1716 DestBB->insert(I: InsertPt, MI: NewMI);
1717 Register OrigR = Phi->getOperand(i: 0).getReg();
1718 Register R = MRI.createVirtualRegister(RegClass: MRI.getRegClass(Reg: OrigR));
1719 NewMI->getOperand(i: 0).setReg(R);
1720 NewMI->getOperand(i: 1).setReg(OrigR);
1721 NewMI->getOperand(i: 2).setMBB(*DestBB->pred_begin());
1722 Remaps[OrigR] = R;
1723 CanonicalMIs[NewMI] = CanonicalMIs[Phi];
1724 BlockMIs[{DestBB, CanonicalMIs[Phi]}] = NewMI;
1725 PhiNodeLoopIteration[NewMI] = PhiNodeLoopIteration[Phi];
1726 return R;
1727 };
1728 for (auto I = DestBB->getFirstNonPHI(); I != DestBB->end(); ++I) {
1729 for (MachineOperand &MO : I->uses()) {
1730 if (!MO.isReg())
1731 continue;
1732 if (auto It = Remaps.find(Val: MO.getReg()); It != Remaps.end())
1733 MO.setReg(It->second);
1734 else {
1735 // If we are using a phi from the source block we need to add a new phi
1736 // pointing to the old one.
1737 MachineInstr *Use = MRI.getUniqueVRegDef(Reg: MO.getReg());
1738 if (Use && Use->isPHI() && Use->getParent() == SourceBB) {
1739 Register R = clonePhi(Use);
1740 MO.setReg(R);
1741 }
1742 }
1743 }
1744 }
1745}
1746
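// Starting from CanonicalPhi, follow the loop-carried phi operands for as many
// steps as the loop iteration recorded for Phi in PhiNodeLoopIteration, and
// return the register reached.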
1747Register
1748PeelingModuloScheduleExpander::getPhiCanonicalReg(MachineInstr *CanonicalPhi,
1749 MachineInstr *Phi) {
1750 unsigned distance = PhiNodeLoopIteration[Phi];
1751 MachineInstr *CanonicalUse = CanonicalPhi;
1752 Register CanonicalUseReg = CanonicalUse->getOperand(i: 0).getReg();
1753 for (unsigned I = 0; I < distance; ++I) {
1754 assert(CanonicalUse->isPHI());
1755 assert(CanonicalUse->getNumOperands() == 5);
1756 unsigned LoopRegIdx = 3, InitRegIdx = 1;
1757 if (CanonicalUse->getOperand(i: 2).getMBB() == CanonicalUse->getParent())
1758 std::swap(a&: LoopRegIdx, b&: InitRegIdx);
1759 CanonicalUseReg = CanonicalUse->getOperand(i: LoopRegIdx).getReg();
1760 CanonicalUse = MRI.getVRegDef(Reg: CanonicalUseReg);
1761 }
1762 return CanonicalUseReg;
1763}
1764
1765void PeelingModuloScheduleExpander::peelPrologAndEpilogs() {
1766 BitVector LS(Schedule.getNumStages(), true);
1767 BitVector AS(Schedule.getNumStages(), true);
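// LiveStages records the stages whose instructions remain in a given block
// after filtering; AvailableStages records the stages whose values may be used
// from that block (see rewriteUsesOf). In the kernel itself every stage is
// both live and available.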
1768 LiveStages[BB] = LS;
1769 AvailableStages[BB] = AS;
1770
1771 // Peel out the prologs.
1772 LS.reset();
1773 for (int I = 0; I < Schedule.getNumStages() - 1; ++I) {
1774 LS[I] = true;
1775 Prologs.push_back(Elt: peelKernel(LPD: LPD_Front));
1776 LiveStages[Prologs.back()] = LS;
1777 AvailableStages[Prologs.back()] = LS;
1778 }
1779
1780 // Create a block that will end up as the new loop exiting block (dominated by
1781 // all prologs and epilogs). It will only contain PHIs, in the same order as
1782 // BB's PHIs. This gives us a poor-man's LCSSA with the inductive property
1783 // that the exiting block is a (sub) clone of BB. This in turn gives us the
1784 // property that any value defined in BB but used outside of BB is used by a
1785 // PHI in the exiting block.
1786 MachineBasicBlock *ExitingBB = CreateLCSSAExitingBlock();
1787 EliminateDeadPhis(MBB: ExitingBB, MRI, LIS, /*KeepSingleSrcPhi=*/true);
1788 // Push out the epilogs, again in reverse order.
1789 // We can't assume anything about the minimum loop trip count at this point,
1790 // so emit a fairly complex epilog.
1791
1792 // We first peel number of stages minus one epilogue. Then we remove dead
1793 // stages and reorder instructions based on their stage. If we have 3 stages
1794 // we generate first:
1795 // E0[3, 2, 1]
1796 // E1[3', 2']
1797 // E2[3'']
1798 // And then we move instructions based on their stages to have:
1799 // E0[3]
1800 // E1[2, 3']
1801 // E2[1, 2', 3'']
1802 // The transformation is legal because we only move instructions past
1803 // instructions of a previous loop iteration.
1804 for (int I = 1; I <= Schedule.getNumStages() - 1; ++I) {
1805 Epilogs.push_back(Elt: peelKernel(LPD: LPD_Back));
1806 MachineBasicBlock *B = Epilogs.back();
1807 filterInstructions(MB: B, MinStage: Schedule.getNumStages() - I);
1808 // Keep track at which iteration each phi belongs to. We need it to know
1809 // what version of the variable to use during prologue/epilogue stitching.
1810 EliminateDeadPhis(MBB: B, MRI, LIS, /*KeepSingleSrcPhi=*/true);
1811 for (MachineInstr &Phi : B->phis())
1812 PhiNodeLoopIteration[&Phi] = Schedule.getNumStages() - I;
1813 }
1814 for (size_t I = 0; I < Epilogs.size(); I++) {
1815 LS.reset();
1816 for (size_t J = I; J < Epilogs.size(); J++) {
1817 int Iteration = J;
1818 unsigned Stage = Schedule.getNumStages() - 1 + I - J;
1819 // Move stage one block at a time so that Phi nodes are updated correctly.
1820 for (size_t K = Iteration; K > I; K--)
1821 moveStageBetweenBlocks(DestBB: Epilogs[K - 1], SourceBB: Epilogs[K], Stage);
1822 LS[Stage] = true;
1823 }
1824 LiveStages[Epilogs[I]] = LS;
1825 AvailableStages[Epilogs[I]] = AS;
1826 }
1827
1828 // Now that we've defined all the prolog and epilog blocks as a fallthrough
1829 // sequence, add the edges that will be followed if the loop trip count is
1830 // lower than the number of stages (connecting prologs directly with epilogs).
1831 auto PI = Prologs.begin();
1832 auto EI = Epilogs.begin();
1833 assert(Prologs.size() == Epilogs.size());
1834 for (; PI != Prologs.end(); ++PI, ++EI) {
1835 MachineBasicBlock *Pred = *(*EI)->pred_begin();
1836 (*PI)->addSuccessor(Succ: *EI);
1837 for (MachineInstr &MI : (*EI)->phis()) {
1838 Register Reg = MI.getOperand(i: 1).getReg();
1839 MachineInstr *Use = MRI.getUniqueVRegDef(Reg);
1840 if (Use && Use->getParent() == Pred) {
1841 MachineInstr *CanonicalUse = CanonicalMIs[Use];
1842 if (CanonicalUse->isPHI()) {
1843 // If the use comes from a phi, we need to skip as many phis as the
1844 // distance between the epilogue and the kernel. Trace through the phi
1845 // chain to find the right value.
1846 Reg = getPhiCanonicalReg(CanonicalPhi: CanonicalUse, Phi: Use);
1847 }
1848 Reg = getEquivalentRegisterIn(Reg, BB: *PI);
1849 }
1850 MI.addOperand(Op: MachineOperand::CreateReg(Reg, /*isDef=*/false));
1851 MI.addOperand(Op: MachineOperand::CreateMBB(MBB: *PI));
1852 }
1853 }
1854
1855 // Create a list of all blocks in order.
1856 SmallVector<MachineBasicBlock *, 8> Blocks;
1857 llvm::append_range(C&: Blocks, R&: PeeledFront);
1858 Blocks.push_back(Elt: BB);
1859 llvm::append_range(C&: Blocks, R&: PeeledBack);
1860
1861 // Iterate in reverse order over all instructions, remapping as we go.
1862 for (MachineBasicBlock *B : reverse(C&: Blocks)) {
1863 for (auto I = B->instr_rbegin();
1864 I != std::next(x: B->getFirstNonPHI()->getReverseIterator());) {
1865 MachineBasicBlock::reverse_instr_iterator MI = I++;
1866 rewriteUsesOf(MI: &*MI);
1867 }
1868 }
1869 for (auto *MI : IllegalPhisToDelete) {
1870 if (LIS)
1871 LIS->RemoveMachineInstrFromMaps(MI&: *MI);
1872 MI->eraseFromParent();
1873 }
1874 IllegalPhisToDelete.clear();
1875
1876 // Now all remapping has been done, we're free to optimize the generated code.
1877 for (MachineBasicBlock *B : reverse(C&: Blocks))
1878 EliminateDeadPhis(MBB: B, MRI, LIS);
1879 EliminateDeadPhis(MBB: ExitingBB, MRI, LIS);
1880}
1881
1882MachineBasicBlock *PeelingModuloScheduleExpander::CreateLCSSAExitingBlock() {
1883 MachineFunction &MF = *BB->getParent();
1884 MachineBasicBlock *Exit = *BB->succ_begin();
1885 if (Exit == BB)
1886 Exit = *std::next(x: BB->succ_begin());
1887
1888 MachineBasicBlock *NewBB = MF.CreateMachineBasicBlock(BB: BB->getBasicBlock());
1889 MF.insert(MBBI: std::next(x: BB->getIterator()), MBB: NewBB);
1890
1891 // Clone all phis in BB into NewBB and rewrite.
1892 for (MachineInstr &MI : BB->phis()) {
1893 auto RC = MRI.getRegClass(Reg: MI.getOperand(i: 0).getReg());
1894 Register OldR = MI.getOperand(i: 3).getReg();
1895 Register R = MRI.createVirtualRegister(RegClass: RC);
1896 SmallVector<MachineInstr *, 4> Uses;
1897 for (MachineInstr &Use : MRI.use_instructions(Reg: OldR))
1898 if (Use.getParent() != BB)
1899 Uses.push_back(Elt: &Use);
1900 for (MachineInstr *Use : Uses)
1901 Use->substituteRegister(FromReg: OldR, ToReg: R, /*SubIdx=*/0,
1902 RegInfo: *MRI.getTargetRegisterInfo());
1903 MachineInstr *NI = BuildMI(BB: NewBB, MIMD: DebugLoc(), MCID: TII->get(Opcode: TargetOpcode::PHI), DestReg: R)
1904 .addReg(RegNo: OldR)
1905 .addMBB(MBB: BB);
1906 BlockMIs[{NewBB, &MI}] = NI;
1907 CanonicalMIs[NI] = &MI;
1908 }
1909 BB->replaceSuccessor(Old: Exit, New: NewBB);
1910 Exit->replacePhiUsesWith(Old: BB, New: NewBB);
1911 NewBB->addSuccessor(Succ: Exit);
1912
1913 MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
1914 SmallVector<MachineOperand, 4> Cond;
1915 bool CanAnalyzeBr = !TII->analyzeBranch(MBB&: *BB, TBB, FBB, Cond);
1916 (void)CanAnalyzeBr;
1917 assert(CanAnalyzeBr && "Must be able to analyze the loop branch!");
1918 TII->removeBranch(MBB&: *BB);
1919 TII->insertBranch(MBB&: *BB, TBB: TBB == Exit ? NewBB : TBB, FBB: FBB == Exit ? NewBB : FBB,
1920 Cond, DL: DebugLoc());
1921 TII->insertUnconditionalBranch(MBB&: *NewBB, DestBB: Exit, DL: DebugLoc());
1922 return NewBB;
1923}
1924
1925Register
1926PeelingModuloScheduleExpander::getEquivalentRegisterIn(Register Reg,
1927 MachineBasicBlock *BB) {
1928 MachineInstr *MI = MRI.getUniqueVRegDef(Reg);
1929 unsigned OpIdx = MI->findRegisterDefOperandIdx(Reg, /*TRI=*/nullptr);
1930 return BlockMIs[{BB, CanonicalMIs[MI]}]->getOperand(i: OpIdx).getReg();
1931}
1932
1933void PeelingModuloScheduleExpander::rewriteUsesOf(MachineInstr *MI) {
1934 if (MI->isPHI()) {
1935 // This is an illegal PHI. The loop-carried (desired) value is operand 3,
1936 // and it is produced by this block.
1937 Register PhiR = MI->getOperand(i: 0).getReg();
1938 Register R = MI->getOperand(i: 3).getReg();
1939 int RMIStage = getStage(MI: MRI.getUniqueVRegDef(Reg: R));
1940 if (RMIStage != -1 && !AvailableStages[MI->getParent()].test(Idx: RMIStage))
1941 R = MI->getOperand(i: 1).getReg();
1942 MRI.setRegClass(Reg: R, RC: MRI.getRegClass(Reg: PhiR));
1943 MRI.replaceRegWith(FromReg: PhiR, ToReg: R);
1944 // Postpone deleting the Phi as it may be referenced by BlockMIs and used
1945 // later to figure out how to remap registers.
1946 MI->getOperand(i: 0).setReg(PhiR);
1947 IllegalPhisToDelete.push_back(Elt: MI);
1948 return;
1949 }
1950
1951 int Stage = getStage(MI);
1952 if (Stage == -1 || LiveStages.count(Val: MI->getParent()) == 0 ||
1953 LiveStages[MI->getParent()].test(Idx: Stage))
1954 // Instruction is live, no rewriting to do.
1955 return;
1956
1957 for (MachineOperand &DefMO : MI->defs()) {
1958 SmallVector<std::pair<MachineInstr *, Register>, 4> Subs;
1959 for (MachineInstr &UseMI : MRI.use_instructions(Reg: DefMO.getReg())) {
1960 // Only PHIs can use values from this block by construction.
1961 // Match with the equivalent PHI in B.
1962 assert(UseMI.isPHI());
1963 Register Reg = getEquivalentRegisterIn(Reg: UseMI.getOperand(i: 0).getReg(),
1964 BB: MI->getParent());
1965 Subs.emplace_back(Args: &UseMI, Args&: Reg);
1966 }
1967 for (auto &Sub : Subs)
1968 Sub.first->substituteRegister(FromReg: DefMO.getReg(), ToReg: Sub.second, /*SubIdx=*/0,
1969 RegInfo: *MRI.getTargetRegisterInfo());
1970 }
1971 if (LIS)
1972 LIS->RemoveMachineInstrFromMaps(MI&: *MI);
1973 MI->eraseFromParent();
1974}
1975
1976void PeelingModuloScheduleExpander::fixupBranches() {
1977 // Work outwards from the kernel.
1978 bool KernelDisposed = false;
1979 int TC = Schedule.getNumStages() - 1;
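// TC is the trip-count threshold tested at each prolog: it starts at
// NumStages - 1 for the innermost prolog (the kernel only runs if the loop
// iterates at least NumStages times) and decreases by one per prolog moving
// outwards.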
1980 for (auto PI = Prologs.rbegin(), EI = Epilogs.rbegin(); PI != Prologs.rend();
1981 ++PI, ++EI, --TC) {
1982 MachineBasicBlock *Prolog = *PI;
1983 MachineBasicBlock *Fallthrough = *Prolog->succ_begin();
1984 MachineBasicBlock *Epilog = *EI;
1985 SmallVector<MachineOperand, 4> Cond;
1986 TII->removeBranch(MBB&: *Prolog);
1987 std::optional<bool> StaticallyGreater =
1988 LoopInfo->createTripCountGreaterCondition(TC, MBB&: *Prolog, Cond);
1989 if (!StaticallyGreater) {
1990 LLVM_DEBUG(dbgs() << "Dynamic: TC > " << TC << "\n");
1991 // Dynamically branch based on Cond.
1992 TII->insertBranch(MBB&: *Prolog, TBB: Epilog, FBB: Fallthrough, Cond, DL: DebugLoc());
1993 } else if (*StaticallyGreater == false) {
1994 LLVM_DEBUG(dbgs() << "Static-false: TC > " << TC << "\n");
1995 // Prolog never falls through; branch to epilog and orphan interior
1996 // blocks. Leave it to unreachable-block-elim to clean up.
1997 Prolog->removeSuccessor(Succ: Fallthrough);
1998 for (MachineInstr &P : Fallthrough->phis()) {
1999 P.removeOperand(OpNo: 2);
2000 P.removeOperand(OpNo: 1);
2001 }
2002 TII->insertUnconditionalBranch(MBB&: *Prolog, DestBB: Epilog, DL: DebugLoc());
2003 KernelDisposed = true;
2004 } else {
2005 LLVM_DEBUG(dbgs() << "Static-true: TC > " << TC << "\n");
2006 // Prolog always falls through; remove incoming values in epilog.
2007 Prolog->removeSuccessor(Succ: Epilog);
2008 for (MachineInstr &P : Epilog->phis()) {
2009 P.removeOperand(OpNo: 4);
2010 P.removeOperand(OpNo: 3);
2011 }
2012 }
2013 }
2014
2015 if (!KernelDisposed) {
2016 LoopInfo->adjustTripCount(TripCountAdjust: -(Schedule.getNumStages() - 1));
2017 LoopInfo->setPreheader(Prologs.back());
2018 } else {
2019 LoopInfo->disposed();
2020 }
2021}
2022
2023void PeelingModuloScheduleExpander::rewriteKernel() {
2024 KernelRewriter KR(*Schedule.getLoop(), Schedule, BB);
2025 KR.rewrite();
2026}
2027
2028void PeelingModuloScheduleExpander::expand() {
2029 BB = Schedule.getLoop()->getTopBlock();
2030 Preheader = Schedule.getLoop()->getLoopPreheader();
2031 LLVM_DEBUG(Schedule.dump());
2032 LoopInfo = TII->analyzeLoopForPipelining(LoopBB: BB);
2033 assert(LoopInfo);
2034
2035 rewriteKernel();
2036 peelPrologAndEpilogs();
2037 fixupBranches();
2038}
2039
2040void PeelingModuloScheduleExpander::validateAgainstModuloScheduleExpander() {
2041 BB = Schedule.getLoop()->getTopBlock();
2042 Preheader = Schedule.getLoop()->getLoopPreheader();
2043
2044 // Dump the schedule before we invalidate and remap all its instructions.
2045 // Stash it in a string so we can print it if we found an error.
2046 std::string ScheduleDump;
2047 raw_string_ostream OS(ScheduleDump);
2048 Schedule.print(OS);
2049 OS.flush();
2050
2051 // First, run the normal ModuloScheduleExpander. We don't support any
2052 // InstrChanges.
2053 assert(LIS && "Requires LiveIntervals!");
2054 ModuloScheduleExpander MSE(MF, Schedule, *LIS,
2055 ModuloScheduleExpander::InstrChangesTy());
2056 MSE.expand();
2057 MachineBasicBlock *ExpandedKernel = MSE.getRewrittenKernel();
2058 if (!ExpandedKernel) {
2059 // The expander optimized away the kernel. We can't do any useful checking.
2060 MSE.cleanup();
2061 return;
2062 }
2063 // Before running the KernelRewriter, re-add BB into the CFG.
2064 Preheader->addSuccessor(Succ: BB);
2065
2066 // Now run the new expansion algorithm.
2067 KernelRewriter KR(*Schedule.getLoop(), Schedule, BB);
2068 KR.rewrite();
2069 peelPrologAndEpilogs();
2070
2071 // Collect all illegal phis that the new algorithm created. We'll give these
2072 // to KernelOperandInfo.
2073 SmallPtrSet<MachineInstr *, 4> IllegalPhis;
2074 for (auto NI = BB->getFirstNonPHI(); NI != BB->end(); ++NI) {
2075 if (NI->isPHI())
2076 IllegalPhis.insert(Ptr: &*NI);
2077 }
2078
2079 // Co-iterate across both kernels. We expect them to be identical apart from
2080 // phis and full COPYs (we look through both).
2081 SmallVector<std::pair<KernelOperandInfo, KernelOperandInfo>, 8> KOIs;
2082 auto OI = ExpandedKernel->begin();
2083 auto NI = BB->begin();
2084 for (; !OI->isTerminator() && !NI->isTerminator(); ++OI, ++NI) {
2085 while (OI->isPHI() || OI->isFullCopy())
2086 ++OI;
2087 while (NI->isPHI() || NI->isFullCopy())
2088 ++NI;
2089 assert(OI->getOpcode() == NI->getOpcode() && "Opcodes don't match?!");
2090 // Analyze every operand separately.
2091 for (auto OOpI = OI->operands_begin(), NOpI = NI->operands_begin();
2092 OOpI != OI->operands_end(); ++OOpI, ++NOpI)
2093 KOIs.emplace_back(Args: KernelOperandInfo(&*OOpI, MRI, IllegalPhis),
2094 Args: KernelOperandInfo(&*NOpI, MRI, IllegalPhis));
2095 }
2096
2097 bool Failed = false;
2098 for (auto &OldAndNew : KOIs) {
2099 if (OldAndNew.first == OldAndNew.second)
2100 continue;
2101 Failed = true;
2102 errs() << "Modulo kernel validation error: [\n";
2103 errs() << " [golden] ";
2104 OldAndNew.first.print(OS&: errs());
2105 errs() << " ";
2106 OldAndNew.second.print(OS&: errs());
2107 errs() << "]\n";
2108 }
2109
2110 if (Failed) {
2111 errs() << "Golden reference kernel:\n";
2112 ExpandedKernel->print(OS&: errs());
2113 errs() << "New kernel:\n";
2114 BB->print(OS&: errs());
2115 errs() << ScheduleDump;
2116 report_fatal_error(
2117 reason: "Modulo kernel validation (-pipeliner-experimental-cg) failed");
2118 }
2119
2120 // Cleanup by removing BB from the CFG again as the original
2121 // ModuloScheduleExpander intended.
2122 Preheader->removeSuccessor(Succ: BB);
2123 MSE.cleanup();
2124}
2125
2126MachineInstr *ModuloScheduleExpanderMVE::cloneInstr(MachineInstr *OldMI) {
2127 MachineInstr *NewMI = MF.CloneMachineInstr(Orig: OldMI);
2128
2129 // TODO: Offset information needs to be corrected.
2130 NewMI->dropMemRefs(MF);
2131
2132 return NewMI;
2133}
2134
2135/// Create a dedicated exit for Loop. Exit is the original exit for Loop.
2136/// If it is already a dedicated exit, return it. Otherwise, insert a new
2137/// block between them and return the new block.
2138static MachineBasicBlock *createDedicatedExit(MachineBasicBlock *Loop,
2139 MachineBasicBlock *Exit,
2140 LiveIntervals &LIS) {
2141 if (Exit->pred_size() == 1)
2142 return Exit;
2143
2144 MachineFunction *MF = Loop->getParent();
2145 const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
2146
2147 MachineBasicBlock *NewExit =
2148 MF->CreateMachineBasicBlock(BB: Loop->getBasicBlock());
2149 MF->insert(MBBI: Loop->getIterator(), MBB: NewExit);
2150 LIS.insertMBBInMaps(MBB: NewExit);
2151
2152 MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
2153 SmallVector<MachineOperand, 4> Cond;
2154 TII->analyzeBranch(MBB&: *Loop, TBB, FBB, Cond);
2155 if (TBB == Loop)
2156 FBB = NewExit;
2157 else if (FBB == Loop)
2158 TBB = NewExit;
2159 else
2160 llvm_unreachable("unexpected loop structure");
2161 TII->removeBranch(MBB&: *Loop);
2162 TII->insertBranch(MBB&: *Loop, TBB, FBB, Cond, DL: DebugLoc());
2163 Loop->replaceSuccessor(Old: Exit, New: NewExit);
2164 TII->insertUnconditionalBranch(MBB&: *NewExit, DestBB: Exit, DL: DebugLoc());
2165 NewExit->addSuccessor(Succ: Exit);
2166
2167 Exit->replacePhiUsesWith(Old: Loop, New: NewExit);
2168
2169 return NewExit;
2170}
2171
2172/// Insert branch code into the end of MBB. It branches to GreaterThan if the
2173/// remaining trip count for instructions in LastStage0Insts is greater than
2174/// RequiredTC, and to Otherwise otherwise.
2175void ModuloScheduleExpanderMVE::insertCondBranch(MachineBasicBlock &MBB,
2176 int RequiredTC,
2177 InstrMapTy &LastStage0Insts,
2178 MachineBasicBlock &GreaterThan,
2179 MachineBasicBlock &Otherwise) {
2180 SmallVector<MachineOperand, 4> Cond;
2181 LoopInfo->createRemainingIterationsGreaterCondition(TC: RequiredTC, MBB, Cond,
2182 LastStage0Insts);
2183
2184 if (SwapBranchTargetsMVE) {
2185 // Set SwapBranchTargetsMVE to true if a target prefers to replace TBB and
2186 // FBB for optimal performance.
2187 if (TII->reverseBranchCondition(Cond))
2188 llvm_unreachable("can not reverse branch condition");
2189 TII->insertBranch(MBB, TBB: &Otherwise, FBB: &GreaterThan, Cond, DL: DebugLoc());
2190 } else {
2191 TII->insertBranch(MBB, TBB: &GreaterThan, FBB: &Otherwise, Cond, DL: DebugLoc());
2192 }
2193}
2194
2195/// Generate a pipelined loop that is unrolled using the MVE algorithm, plus
2196/// any other necessary blocks. The control flow is modified to execute the
2197/// pipelined loop if the trip count satisfies the condition, otherwise the
2198/// original loop. The original loop is also used to execute the remainder
2199/// iterations which occur due to unrolling.
2200void ModuloScheduleExpanderMVE::generatePipelinedLoop() {
2201 // The control flow for pipelining with MVE:
2202 //
2203 // OrigPreheader:
2204 // // The block that is originally the loop preheader
2205 // goto Check
2206 //
2207 // Check:
2208 // // Check whether the trip count satisfies the requirements to pipeline.
2209 // if (LoopCounter > NumStages + NumUnroll - 2)
2210 // // The minimum number of iterations to pipeline =
2211 // // iterations executed in prolog/epilog (NumStages-1) +
2212 // // iterations executed in one kernel run (NumUnroll)
2213 // goto Prolog
2214 // // fallback to the original loop
2215 // goto NewPreheader
2216 //
2217 // Prolog:
2218 // // All prolog stages. There are no direct branches to the epilogue.
2219 // goto NewKernel
2220 //
2221 // NewKernel:
2222 // // NumUnroll copies of the kernel
2223 // if (LoopCounter > MVE-1)
2224 // goto NewKernel
2225 // goto Epilog
2226 //
2227 // Epilog:
2228 // // All epilog stages.
2229 // if (LoopCounter > 0)
2230 // // The remainder is executed in the original loop
2231 // goto NewPreheader
2232 // goto NewExit
2233 //
2234 // NewPreheader:
2235 // // Newly created preheader for the original loop.
2236 // // The initial values of the phis in the loop are merged from two paths.
2237 // NewInitVal = Phi OrigInitVal, Check, PipelineLastVal, Epilog
2238 // goto OrigKernel
2239 //
2240 // OrigKernel:
2241 // // The original loop block.
2242 // if (LoopCounter != 0)
2243 // goto OrigKernel
2244 // goto NewExit
2245 //
2246 // NewExit:
2247 // // Newly created dedicated exit for the original loop.
2248 // // Merge values which are referenced after the loop
2249 // Merged = Phi OrigVal, OrigKernel, PipelineVal, Epilog
2250 // goto OrigExit
2251 //
2252 // OrigExit:
2253 // // The block that is originally the loop exit.
2254 // // If it is already a dedicated exit, NewExit is not created.
2255
2256 // An example of where each stage is executed:
2257 // Assume #Stages 3, #MVE 4, #Iterations 12
2258 // Iter 0 1 2 3 4 5 6 7 8 9 10-11
2259 // -------------------------------------------------
2260 // Stage 0 Prolog#0
2261 // Stage 1 0 Prolog#1
2262 // Stage 2 1 0 Kernel Unroll#0 Iter#0
2263 // Stage 2 1 0 Kernel Unroll#1 Iter#0
2264 // Stage 2 1 0 Kernel Unroll#2 Iter#0
2265 // Stage 2 1 0 Kernel Unroll#3 Iter#0
2266 // Stage 2 1 0 Kernel Unroll#0 Iter#1
2267 // Stage 2 1 0 Kernel Unroll#1 Iter#1
2268 // Stage 2 1 0 Kernel Unroll#2 Iter#1
2269 // Stage 2 1 0 Kernel Unroll#3 Iter#1
2270 // Stage 2 1 Epilog#0
2271 // Stage 2 Epilog#1
2272 // Stage 0-2 OrigKernel
2273
2274 LoopInfo = TII->analyzeLoopForPipelining(LoopBB: OrigKernel);
2275 assert(LoopInfo && "Must be able to analyze loop!");
2276
2277 calcNumUnroll();
2278
2279 Check = MF.CreateMachineBasicBlock(BB: OrigKernel->getBasicBlock());
2280 Prolog = MF.CreateMachineBasicBlock(BB: OrigKernel->getBasicBlock());
2281 NewKernel = MF.CreateMachineBasicBlock(BB: OrigKernel->getBasicBlock());
2282 Epilog = MF.CreateMachineBasicBlock(BB: OrigKernel->getBasicBlock());
2283 NewPreheader = MF.CreateMachineBasicBlock(BB: OrigKernel->getBasicBlock());
2284
2285 MF.insert(MBBI: OrigKernel->getIterator(), MBB: Check);
2286 LIS.insertMBBInMaps(MBB: Check);
2287 MF.insert(MBBI: OrigKernel->getIterator(), MBB: Prolog);
2288 LIS.insertMBBInMaps(MBB: Prolog);
2289 MF.insert(MBBI: OrigKernel->getIterator(), MBB: NewKernel);
2290 LIS.insertMBBInMaps(MBB: NewKernel);
2291 MF.insert(MBBI: OrigKernel->getIterator(), MBB: Epilog);
2292 LIS.insertMBBInMaps(MBB: Epilog);
2293 MF.insert(MBBI: OrigKernel->getIterator(), MBB: NewPreheader);
2294 LIS.insertMBBInMaps(MBB: NewPreheader);
2295
2296 NewExit = createDedicatedExit(Loop: OrigKernel, Exit: OrigExit, LIS);
2297
2298 NewPreheader->transferSuccessorsAndUpdatePHIs(FromMBB: OrigPreheader);
2299 TII->insertUnconditionalBranch(MBB&: *NewPreheader, DestBB: OrigKernel, DL: DebugLoc());
2300
2301 OrigPreheader->addSuccessor(Succ: Check);
2302 TII->removeBranch(MBB&: *OrigPreheader);
2303 TII->insertUnconditionalBranch(MBB&: *OrigPreheader, DestBB: Check, DL: DebugLoc());
2304
2305 Check->addSuccessor(Succ: Prolog);
2306 Check->addSuccessor(Succ: NewPreheader);
2307
2308 Prolog->addSuccessor(Succ: NewKernel);
2309
2310 NewKernel->addSuccessor(Succ: NewKernel);
2311 NewKernel->addSuccessor(Succ: Epilog);
2312
2313 Epilog->addSuccessor(Succ: NewPreheader);
2314 Epilog->addSuccessor(Succ: NewExit);
2315
2316 InstrMapTy LastStage0Insts;
2317 insertCondBranch(MBB&: *Check, RequiredTC: Schedule.getNumStages() + NumUnroll - 2,
2318 LastStage0Insts, GreaterThan&: *Prolog, Otherwise&: *NewPreheader);
2319
2320 // VRMaps map (prolog/kernel/epilog phase#, original register#) to new
2321 // register#
2322 SmallVector<ValueMapTy> PrologVRMap, KernelVRMap, EpilogVRMap;
2323 generateProlog(VRMap&: PrologVRMap);
2324 generateKernel(PrologVRMap, KernelVRMap, LastStage0Insts);
2325 generateEpilog(KernelVRMap, EpilogVRMap, LastStage0Insts);
2326}
2327
2328/// Replace MI's use operands according to the maps.
2329void ModuloScheduleExpanderMVE::updateInstrUse(
2330 MachineInstr *MI, int StageNum, int PhaseNum,
2331 SmallVectorImpl<ValueMapTy> &CurVRMap,
2332 SmallVectorImpl<ValueMapTy> *PrevVRMap) {
2333 // If MI is in the prolog/kernel/epilog block, CurVRMap is
2334 // PrologVRMap/KernelVRMap/EpilogVRMap respectively.
2335 // PrevVRMap is nullptr/PhiVRMap/KernelVRMap respectively.
2336 // Refer to the appropriate map according to the stage difference between
2337 // MI and the definition of an operand.
2338
2339 for (MachineOperand &UseMO : MI->uses()) {
2340 if (!UseMO.isReg() || !UseMO.getReg().isVirtual())
2341 continue;
2342 int DiffStage = 0;
2343 Register OrigReg = UseMO.getReg();
2344 MachineInstr *DefInst = MRI.getVRegDef(Reg: OrigReg);
2345 if (!DefInst || DefInst->getParent() != OrigKernel)
2346 continue;
2347 Register InitReg;
2348 Register DefReg = OrigReg;
2349 if (DefInst->isPHI()) {
2350 ++DiffStage;
2351 Register LoopReg;
2352 getPhiRegs(Phi&: *DefInst, Loop: OrigKernel, InitVal&: InitReg, LoopVal&: LoopReg);
2353 // LoopReg is guaranteed to be defined within the loop by canApply()
2354 DefReg = LoopReg;
2355 DefInst = MRI.getVRegDef(Reg: LoopReg);
2356 }
2357 unsigned DefStageNum = Schedule.getStage(MI: DefInst);
2358 DiffStage += StageNum - DefStageNum;
2359 Register NewReg;
2360 if (PhaseNum >= DiffStage && CurVRMap[PhaseNum - DiffStage].count(Val: DefReg))
2361 // NewReg is defined in a previous phase of the same block
2362 NewReg = CurVRMap[PhaseNum - DiffStage][DefReg];
2363 else if (!PrevVRMap)
2364 // Since this is the first iteration, refer to the initial register of
2365 // the loop.
2366 NewReg = InitReg;
2367 else
2368 // Cases where DiffStage is larger than PhaseNum.
2369 // If MI is in the kernel block, the value is defined by the previous
2370 // iteration and PhiVRMap is referenced. If MI is in the epilog block, the
2371 // value is defined in the kernel block and KernelVRMap is referenced.
2372 NewReg = (*PrevVRMap)[PrevVRMap->size() - (DiffStage - PhaseNum)][DefReg];
2373
2374 const TargetRegisterClass *NRC =
2375 MRI.constrainRegClass(Reg: NewReg, RC: MRI.getRegClass(Reg: OrigReg));
2376 if (NRC)
2377 UseMO.setReg(NewReg);
2378 else {
2379 Register SplitReg = MRI.createVirtualRegister(RegClass: MRI.getRegClass(Reg: OrigReg));
2380 MachineInstr *NewCopy = BuildMI(BB&: *OrigKernel, I: MI, MIMD: MI->getDebugLoc(),
2381 MCID: TII->get(Opcode: TargetOpcode::COPY), DestReg: SplitReg)
2382 .addReg(RegNo: NewReg);
2383 LIS.InsertMachineInstrInMaps(MI&: *NewCopy);
2384 UseMO.setReg(SplitReg);
2385 }
2386 }
2387}
2388
2389/// Return a phi if Reg is referenced by the phi.
2390/// canApply() guarantees that at most only one such phi exists.
2391static MachineInstr *getLoopPhiUser(Register Reg, MachineBasicBlock *Loop) {
2392 for (MachineInstr &Phi : Loop->phis()) {
2393 Register InitVal, LoopVal;
2394 getPhiRegs(Phi, Loop, InitVal, LoopVal);
2395 if (LoopVal == Reg)
2396 return &Phi;
2397 }
2398 return nullptr;
2399}
2400
2401/// Generate phis for registers defined by OrigMI.
2402void ModuloScheduleExpanderMVE::generatePhi(
2403 MachineInstr *OrigMI, int UnrollNum,
2404 SmallVectorImpl<ValueMapTy> &PrologVRMap,
2405 SmallVectorImpl<ValueMapTy> &KernelVRMap,
2406 SmallVectorImpl<ValueMapTy> &PhiVRMap) {
2407 int StageNum = Schedule.getStage(MI: OrigMI);
2408 bool UsePrologReg;
2409 if (Schedule.getNumStages() - NumUnroll + UnrollNum - 1 >= StageNum)
2410 UsePrologReg = true;
2411 else if (Schedule.getNumStages() - NumUnroll + UnrollNum == StageNum)
2412 UsePrologReg = false;
2413 else
2414 return;
2415
2416 // Examples that show which stages are merged by phi.
2417 // Meaning of the symbol following the stage number:
2418 // a/b: Stages with the same letter are merged (UsePrologReg == true)
2419 // +: Merged with the initial value (UsePrologReg == false)
2420 // *: No phis required
2421 //
2422 // #Stages 3, #MVE 4
2423 // Iter 0 1 2 3 4 5 6 7 8
2424 // -----------------------------------------
2425 // Stage 0a Prolog#0
2426 // Stage 1a 0b Prolog#1
2427 // Stage 2* 1* 0* Kernel Unroll#0
2428 // Stage 2* 1* 0+ Kernel Unroll#1
2429 // Stage 2* 1+ 0a Kernel Unroll#2
2430 // Stage 2+ 1a 0b Kernel Unroll#3
2431 //
2432 // #Stages 3, #MVE 2
2433 // Iter 0 1 2 3 4 5 6 7 8
2434 // -----------------------------------------
2435 // Stage 0a Prolog#0
2436 // Stage 1a 0b Prolog#1
2437 // Stage 2* 1+ 0a Kernel Unroll#0
2438 // Stage 2+ 1a 0b Kernel Unroll#1
2439 //
2440 // #Stages 3, #MVE 1
2441 // Iter 0 1 2 3 4 5 6 7 8
2442 // -----------------------------------------
2443 // Stage 0* Prolog#0
2444 // Stage 1a 0b Prolog#1
2445 // Stage 2+ 1a 0b Kernel Unroll#0
2446
2447 for (MachineOperand &DefMO : OrigMI->defs()) {
2448 if (!DefMO.isReg() || DefMO.isDead())
2449 continue;
2450 Register OrigReg = DefMO.getReg();
2451 auto NewReg = KernelVRMap[UnrollNum].find(Val: OrigReg);
2452 if (NewReg == KernelVRMap[UnrollNum].end())
2453 continue;
2454 Register CorrespondReg;
2455 if (UsePrologReg) {
2456 int PrologNum = Schedule.getNumStages() - NumUnroll + UnrollNum - 1;
2457 CorrespondReg = PrologVRMap[PrologNum][OrigReg];
2458 } else {
2459 MachineInstr *Phi = getLoopPhiUser(Reg: OrigReg, Loop: OrigKernel);
2460 if (!Phi)
2461 continue;
2462 CorrespondReg = getInitPhiReg(Phi&: *Phi, LoopBB: OrigKernel);
2463 }
2464
2465 assert(CorrespondReg.isValid());
2466 Register PhiReg = MRI.createVirtualRegister(RegClass: MRI.getRegClass(Reg: OrigReg));
2467 MachineInstr *NewPhi =
2468 BuildMI(BB&: *NewKernel, I: NewKernel->getFirstNonPHI(), MIMD: DebugLoc(),
2469 MCID: TII->get(Opcode: TargetOpcode::PHI), DestReg: PhiReg)
2470 .addReg(RegNo: NewReg->second)
2471 .addMBB(MBB: NewKernel)
2472 .addReg(RegNo: CorrespondReg)
2473 .addMBB(MBB: Prolog);
2474 LIS.InsertMachineInstrInMaps(MI&: *NewPhi);
2475 PhiVRMap[UnrollNum][OrigReg] = PhiReg;
2476 }
2477}
2478
2479static void replacePhiSrc(MachineInstr &Phi, Register OrigReg, Register NewReg,
2480 MachineBasicBlock *NewMBB) {
2481 for (unsigned Idx = 1; Idx < Phi.getNumOperands(); Idx += 2) {
2482 if (Phi.getOperand(i: Idx).getReg() == OrigReg) {
2483 Phi.getOperand(i: Idx).setReg(NewReg);
2484 Phi.getOperand(i: Idx + 1).setMBB(NewMBB);
2485 return;
2486 }
2487 }
2488}
2489
2490/// Generate phis that merge values from multiple routes
2491void ModuloScheduleExpanderMVE::mergeRegUsesAfterPipeline(Register OrigReg,
2492 Register NewReg) {
2493 SmallVector<MachineOperand *> UsesAfterLoop;
2494 SmallVector<MachineInstr *> LoopPhis;
2495 for (MachineRegisterInfo::use_iterator I = MRI.use_begin(RegNo: OrigReg),
2496 E = MRI.use_end();
2497 I != E; ++I) {
2498 MachineOperand &O = *I;
2499 if (O.getParent()->getParent() != OrigKernel &&
2500 O.getParent()->getParent() != Prolog &&
2501 O.getParent()->getParent() != NewKernel &&
2502 O.getParent()->getParent() != Epilog)
2503 UsesAfterLoop.push_back(Elt: &O);
2504 if (O.getParent()->getParent() == OrigKernel && O.getParent()->isPHI())
2505 LoopPhis.push_back(Elt: O.getParent());
2506 }
2507
2508 // Merge the route that only executes the pipelined loop (when there are no
2509 // remaining iterations) with the route that executes the original loop.
2510 if (!UsesAfterLoop.empty()) {
2511 Register PhiReg = MRI.createVirtualRegister(RegClass: MRI.getRegClass(Reg: OrigReg));
2512 MachineInstr *NewPhi =
2513 BuildMI(BB&: *NewExit, I: NewExit->getFirstNonPHI(), MIMD: DebugLoc(),
2514 MCID: TII->get(Opcode: TargetOpcode::PHI), DestReg: PhiReg)
2515 .addReg(RegNo: OrigReg)
2516 .addMBB(MBB: OrigKernel)
2517 .addReg(RegNo: NewReg)
2518 .addMBB(MBB: Epilog);
2519 LIS.InsertMachineInstrInMaps(MI&: *NewPhi);
2520
2521 for (MachineOperand *MO : UsesAfterLoop)
2522 MO->setReg(PhiReg);
2523
2524 // The interval of OrigReg is invalid and should be recalculated when
2525 // LiveInterval::getInterval() is called.
2526 if (LIS.hasInterval(Reg: OrigReg))
2527 LIS.removeInterval(Reg: OrigReg);
2528 }
2529
2530 // Before the original loop, merge the value coming from the pipelined loop
2531 // with the initial value coming along the bypass route.
2532 if (!LoopPhis.empty()) {
2533 for (MachineInstr *Phi : LoopPhis) {
2534 Register InitReg, LoopReg;
2535 getPhiRegs(Phi&: *Phi, Loop: OrigKernel, InitVal&: InitReg, LoopVal&: LoopReg);
2536 Register NewInit = MRI.createVirtualRegister(RegClass: MRI.getRegClass(Reg: InitReg));
2537 MachineInstr *NewPhi =
2538 BuildMI(BB&: *NewPreheader, I: NewPreheader->getFirstNonPHI(),
2539 MIMD: Phi->getDebugLoc(), MCID: TII->get(Opcode: TargetOpcode::PHI), DestReg: NewInit)
2540 .addReg(RegNo: InitReg)
2541 .addMBB(MBB: Check)
2542 .addReg(RegNo: NewReg)
2543 .addMBB(MBB: Epilog);
2544 LIS.InsertMachineInstrInMaps(MI&: *NewPhi);
2545 replacePhiSrc(Phi&: *Phi, OrigReg: InitReg, NewReg: NewInit, NewMBB: NewPreheader);
2546 }
2547 }
2548}
2549
2550void ModuloScheduleExpanderMVE::generateProlog(
2551 SmallVectorImpl<ValueMapTy> &PrologVRMap) {
2552 PrologVRMap.clear();
2553 PrologVRMap.resize(N: Schedule.getNumStages() - 1);
2554 DenseMap<MachineInstr *, std::pair<int, int>> NewMIMap;
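// The PrologNum-th prolog copy contains every non-phi instruction whose stage
// is at most PrologNum (see the stage diagram in generatePipelinedLoop()).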
2555 for (int PrologNum = 0; PrologNum < Schedule.getNumStages() - 1;
2556 ++PrologNum) {
2557 for (MachineInstr *MI : Schedule.getInstructions()) {
2558 if (MI->isPHI())
2559 continue;
2560 int StageNum = Schedule.getStage(MI);
2561 if (StageNum > PrologNum)
2562 continue;
2563 MachineInstr *NewMI = cloneInstr(OldMI: MI);
2564 updateInstrDef(NewMI, VRMap&: PrologVRMap[PrologNum], LastDef: false);
2565 NewMIMap[NewMI] = {PrologNum, StageNum};
2566 Prolog->push_back(MI: NewMI);
2567 LIS.InsertMachineInstrInMaps(MI&: *NewMI);
2568 }
2569 }
2570
2571 for (auto I : NewMIMap) {
2572 MachineInstr *MI = I.first;
2573 int PrologNum = I.second.first;
2574 int StageNum = I.second.second;
2575 updateInstrUse(MI, StageNum, PhaseNum: PrologNum, CurVRMap&: PrologVRMap, PrevVRMap: nullptr);
2576 }
2577
2578 LLVM_DEBUG({
2579 dbgs() << "prolog:\n";
2580 Prolog->dump();
2581 });
2582}
2583
2584void ModuloScheduleExpanderMVE::generateKernel(
2585 SmallVectorImpl<ValueMapTy> &PrologVRMap,
2586 SmallVectorImpl<ValueMapTy> &KernelVRMap, InstrMapTy &LastStage0Insts) {
2587 KernelVRMap.clear();
2588 KernelVRMap.resize(N: NumUnroll);
2589 SmallVector<ValueMapTy> PhiVRMap;
2590 PhiVRMap.resize(N: NumUnroll);
2591 DenseMap<MachineInstr *, std::pair<int, int>> NewMIMap;
2592 for (int UnrollNum = 0; UnrollNum < NumUnroll; ++UnrollNum) {
2593 for (MachineInstr *MI : Schedule.getInstructions()) {
2594 if (MI->isPHI())
2595 continue;
2596 int StageNum = Schedule.getStage(MI);
2597 MachineInstr *NewMI = cloneInstr(OldMI: MI);
2598 if (UnrollNum == NumUnroll - 1)
2599 LastStage0Insts[MI] = NewMI;
2600 updateInstrDef(NewMI, VRMap&: KernelVRMap[UnrollNum],
2601 LastDef: (UnrollNum == NumUnroll - 1 && StageNum == 0));
2602 generatePhi(OrigMI: MI, UnrollNum, PrologVRMap, KernelVRMap, PhiVRMap);
2603 NewMIMap[NewMI] = {UnrollNum, StageNum};
2604 NewKernel->push_back(MI: NewMI);
2605 LIS.InsertMachineInstrInMaps(MI&: *NewMI);
2606 }
2607 }
2608
2609 for (auto I : NewMIMap) {
2610 MachineInstr *MI = I.first;
2611 int UnrollNum = I.second.first;
2612 int StageNum = I.second.second;
2613 updateInstrUse(MI, StageNum, PhaseNum: UnrollNum, CurVRMap&: KernelVRMap, PrevVRMap: &PhiVRMap);
2614 }
2615
2616 // If the remaining trip count is greater than NumUnroll-1, the loop continues.
2617 insertCondBranch(MBB&: *NewKernel, RequiredTC: NumUnroll - 1, LastStage0Insts, GreaterThan&: *NewKernel,
2618 Otherwise&: *Epilog);
2619
2620 LLVM_DEBUG({
2621 dbgs() << "kernel:\n";
2622 NewKernel->dump();
2623 });
2624}
2625
2626void ModuloScheduleExpanderMVE::generateEpilog(
2627 SmallVectorImpl<ValueMapTy> &KernelVRMap,
2628 SmallVectorImpl<ValueMapTy> &EpilogVRMap, InstrMapTy &LastStage0Insts) {
2629 EpilogVRMap.clear();
2630 EpilogVRMap.resize(N: Schedule.getNumStages() - 1);
2631 DenseMap<MachineInstr *, std::pair<int, int>> NewMIMap;
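// The EpilogNum-th epilog copy contains every non-phi instruction whose stage
// is greater than EpilogNum, completing the iterations still in flight when
// the kernel exits.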
2632 for (int EpilogNum = 0; EpilogNum < Schedule.getNumStages() - 1;
2633 ++EpilogNum) {
2634 for (MachineInstr *MI : Schedule.getInstructions()) {
2635 if (MI->isPHI())
2636 continue;
2637 int StageNum = Schedule.getStage(MI);
2638 if (StageNum <= EpilogNum)
2639 continue;
2640 MachineInstr *NewMI = cloneInstr(OldMI: MI);
2641 updateInstrDef(NewMI, VRMap&: EpilogVRMap[EpilogNum], LastDef: StageNum - 1 == EpilogNum);
2642 NewMIMap[NewMI] = {EpilogNum, StageNum};
2643 Epilog->push_back(MI: NewMI);
2644 LIS.InsertMachineInstrInMaps(MI&: *NewMI);
2645 }
2646 }
2647
2648 for (auto I : NewMIMap) {
2649 MachineInstr *MI = I.first;
2650 int EpilogNum = I.second.first;
2651 int StageNum = I.second.second;
2652 updateInstrUse(MI, StageNum, PhaseNum: EpilogNum, CurVRMap&: EpilogVRMap, PrevVRMap: &KernelVRMap);
2653 }
2654
2655 // If there are remaining iterations, they are executed in the original loop.
2656 // Instructions related to loop control, such as loop counter comparison,
2657 // are indicated by shouldIgnoreForPipelining() and are assumed to be placed
2658 // in stage 0. Thus, the map gives their copies in the last kernel unroll.
2659 insertCondBranch(MBB&: *Epilog, RequiredTC: 0, LastStage0Insts, GreaterThan&: *NewPreheader, Otherwise&: *NewExit);
2660
2661 LLVM_DEBUG({
2662 dbgs() << "epilog:\n";
2663 Epilog->dump();
2664 });
2665}
2666
2667/// Calculate the required unroll count and store it in NumUnroll.
2668void ModuloScheduleExpanderMVE::calcNumUnroll() {
2669 DenseMap<MachineInstr *, unsigned> Inst2Idx;
2670 NumUnroll = 1;
2671 for (unsigned I = 0; I < Schedule.getInstructions().size(); ++I)
2672 Inst2Idx[Schedule.getInstructions()[I]] = I;
2673
2674 for (MachineInstr *MI : Schedule.getInstructions()) {
2675 if (MI->isPHI())
2676 continue;
2677 int StageNum = Schedule.getStage(MI);
2678 for (const MachineOperand &MO : MI->uses()) {
2679 if (!MO.isReg() || !MO.getReg().isVirtual())
2680 continue;
2681 MachineInstr *DefMI = MRI.getVRegDef(Reg: MO.getReg());
2682 if (DefMI->getParent() != OrigKernel)
2683 continue;
2684
2685 int NumUnrollLocal = 1;
2686 if (DefMI->isPHI()) {
2687 ++NumUnrollLocal;
2688 // canApply() guarantees that DefMI is not a phi and is an instruction in
2689 // the loop.
2690 DefMI = MRI.getVRegDef(Reg: getLoopPhiReg(Phi&: *DefMI, LoopBB: OrigKernel));
2691 }
2692 NumUnrollLocal += StageNum - Schedule.getStage(MI: DefMI);
2693 if (Inst2Idx[MI] <= Inst2Idx[DefMI])
2694 --NumUnrollLocal;
2695 NumUnroll = std::max(a: NumUnroll, b: NumUnrollLocal);
2696 }
2697 }
2698 LLVM_DEBUG(dbgs() << "NumUnroll: " << NumUnroll << "\n");
2699}
2700
2701/// Create new virtual registers for definitions of NewMI and update NewMI.
2702/// If the definitions are referenced after the pipelined loop, phis are
2703/// created to merge with other routes.
2704void ModuloScheduleExpanderMVE::updateInstrDef(MachineInstr *NewMI,
2705 ValueMapTy &VRMap,
2706 bool LastDef) {
2707 for (MachineOperand &MO : NewMI->all_defs()) {
2708 if (!MO.getReg().isVirtual())
2709 continue;
2710 Register Reg = MO.getReg();
2711 const TargetRegisterClass *RC = MRI.getRegClass(Reg);
2712 Register NewReg = MRI.createVirtualRegister(RegClass: RC);
2713 MO.setReg(NewReg);
2714 VRMap[Reg] = NewReg;
2715 if (LastDef)
2716 mergeRegUsesAfterPipeline(OrigReg: Reg, NewReg);
2717 }
2718}
2719
2720void ModuloScheduleExpanderMVE::expand() {
2721 OrigKernel = Schedule.getLoop()->getTopBlock();
2722 OrigPreheader = Schedule.getLoop()->getLoopPreheader();
2723 OrigExit = Schedule.getLoop()->getExitBlock();
2724
2725 LLVM_DEBUG(Schedule.dump());
2726
2727 generatePipelinedLoop();
2728}
2729
2730/// Check if ModuloScheduleExpanderMVE can be applied to L
2731bool ModuloScheduleExpanderMVE::canApply(MachineLoop &L) {
2732 if (!L.getExitBlock()) {
2733 LLVM_DEBUG(dbgs() << "Can not apply MVE expander: No single exit block.\n");
2734 return false;
2735 }
2736
2737 MachineBasicBlock *BB = L.getTopBlock();
2738 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
2739
2740 // Put some constraints on the operands of the phis to simplify the
2741 // transformation
2742 DenseSet<Register> UsedByPhi;
2743 for (MachineInstr &MI : BB->phis()) {
2744 // Registers defined by phis must be used only inside the loop and must
2745 // never be used by phis.
2746 for (MachineOperand &MO : MI.defs())
2747 if (MO.isReg())
2748 for (MachineInstr &Ref : MRI.use_instructions(Reg: MO.getReg()))
2749 if (Ref.getParent() != BB || Ref.isPHI()) {
2750 LLVM_DEBUG(dbgs() << "Can not apply MVE expander: A phi result is "
2751 "referenced outside of the loop or by phi.\n");
2752 return false;
2753 }
2754
2755 // A source register from the loop block must be defined inside the loop.
2756 // A register defined inside the loop must be referenced by at most one
2757 // phi.
2758 Register InitVal, LoopVal;
2759 getPhiRegs(Phi&: MI, Loop: MI.getParent(), InitVal, LoopVal);
2760 if (!Register(LoopVal).isVirtual() ||
2761 MRI.getVRegDef(Reg: LoopVal)->getParent() != BB) {
2762 LLVM_DEBUG(
2763 dbgs() << "Can not apply MVE expander: A phi source value coming "
2764 "from the loop is not defined in the loop.\n");
2765 return false;
2766 }
2767 if (UsedByPhi.count(V: LoopVal)) {
2768 LLVM_DEBUG(dbgs() << "Can not apply MVE expander: A value defined in the "
2769 "loop is referenced by two or more phis.\n");
2770 return false;
2771 }
2772 UsedByPhi.insert(V: LoopVal);
2773 }
2774
2775 return true;
2776}
2777
2778//===----------------------------------------------------------------------===//
2779// ModuloScheduleTestPass implementation
2780//===----------------------------------------------------------------------===//
2781// This pass constructs a ModuloSchedule from its module and runs
2782// ModuloScheduleExpander.
2783//
2784// The module is expected to contain a single-block analyzable loop.
2785// The total order of instructions is taken from the loop as-is.
2786// Instructions are expected to be annotated with a PostInstrSymbol.
2787// This PostInstrSymbol must have the following format:
2788// "Stage=%d Cycle=%d".
2789//===----------------------------------------------------------------------===//
2790
2791namespace {
2792class ModuloScheduleTest : public MachineFunctionPass {
2793public:
2794 static char ID;
2795
2796 ModuloScheduleTest() : MachineFunctionPass(ID) {
2797 initializeModuloScheduleTestPass(*PassRegistry::getPassRegistry());
2798 }
2799
2800 bool runOnMachineFunction(MachineFunction &MF) override;
2801 void runOnLoop(MachineFunction &MF, MachineLoop &L);
2802
2803 void getAnalysisUsage(AnalysisUsage &AU) const override {
2804 AU.addRequired<MachineLoopInfoWrapperPass>();
2805 AU.addRequired<LiveIntervalsWrapperPass>();
2806 MachineFunctionPass::getAnalysisUsage(AU);
2807 }
2808};
2809} // namespace
2810
2811char ModuloScheduleTest::ID = 0;
2812
2813INITIALIZE_PASS_BEGIN(ModuloScheduleTest, "modulo-schedule-test",
2814 "Modulo Schedule test pass", false, false)
2815INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass)
2816INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass)
2817INITIALIZE_PASS_END(ModuloScheduleTest, "modulo-schedule-test",
2818 "Modulo Schedule test pass", false, false)
2819
2820bool ModuloScheduleTest::runOnMachineFunction(MachineFunction &MF) {
2821 MachineLoopInfo &MLI = getAnalysis<MachineLoopInfoWrapperPass>().getLI();
2822 for (auto *L : MLI) {
2823 if (L->getTopBlock() != L->getBottomBlock())
2824 continue;
2825 runOnLoop(MF, L&: *L);
2826 return false;
2827 }
2828 return false;
2829}
2830
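// Parse a post-instruction symbol of the form "Stage-%d_Cycle-%d", as emitted
// by ModuloScheduleTestAnnotater::annotate(); e.g. "Stage-1_Cycle-4" yields
// Stage = 1 and Cycle = 4.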
2831static void parseSymbolString(StringRef S, int &Cycle, int &Stage) {
2832 std::pair<StringRef, StringRef> StageAndCycle = getToken(Source: S, Delimiters: "_");
2833 std::pair<StringRef, StringRef> StageTokenAndValue =
2834 getToken(Source: StageAndCycle.first, Delimiters: "-");
2835 std::pair<StringRef, StringRef> CycleTokenAndValue =
2836 getToken(Source: StageAndCycle.second, Delimiters: "-");
2837 if (StageTokenAndValue.first != "Stage" ||
2838 CycleTokenAndValue.first != "_Cycle") {
2839 llvm_unreachable(
2840 "Bad post-instr symbol syntax: see comment in ModuloScheduleTest");
2841 return;
2842 }
2843
2844 StageTokenAndValue.second.drop_front().getAsInteger(Radix: 10, Result&: Stage);
2845 CycleTokenAndValue.second.drop_front().getAsInteger(Radix: 10, Result&: Cycle);
2846
2847 dbgs() << " Stage=" << Stage << ", Cycle=" << Cycle << "\n";
2848}
2849
2850void ModuloScheduleTest::runOnLoop(MachineFunction &MF, MachineLoop &L) {
2851 LiveIntervals &LIS = getAnalysis<LiveIntervalsWrapperPass>().getLIS();
2852 MachineBasicBlock *BB = L.getTopBlock();
2853 dbgs() << "--- ModuloScheduleTest running on BB#" << BB->getNumber() << "\n";
2854
2855 DenseMap<MachineInstr *, int> Cycle, Stage;
2856 std::vector<MachineInstr *> Instrs;
2857 for (MachineInstr &MI : *BB) {
2858 if (MI.isTerminator())
2859 continue;
2860 Instrs.push_back(x: &MI);
2861 if (MCSymbol *Sym = MI.getPostInstrSymbol()) {
2862 dbgs() << "Parsing post-instr symbol for " << MI;
2863 parseSymbolString(S: Sym->getName(), Cycle&: Cycle[&MI], Stage&: Stage[&MI]);
2864 }
2865 }
2866
2867 ModuloSchedule MS(MF, &L, std::move(Instrs), std::move(Cycle),
2868 std::move(Stage));
2869 ModuloScheduleExpander MSE(
2870 MF, MS, LIS, /*InstrChanges=*/ModuloScheduleExpander::InstrChangesTy());
2871 MSE.expand();
2872 MSE.cleanup();
2873}
2874
2875//===----------------------------------------------------------------------===//
2876// ModuloScheduleTestAnnotater implementation
2877//===----------------------------------------------------------------------===//
2878
2879void ModuloScheduleTestAnnotater::annotate() {
2880 for (MachineInstr *MI : S.getInstructions()) {
2881 SmallVector<char, 16> SV;
2882 raw_svector_ostream OS(SV);
2883 OS << "Stage-" << S.getStage(MI) << "_Cycle-" << S.getCycle(MI);
2884 MCSymbol *Sym = MF.getContext().getOrCreateSymbol(Name: OS.str());
2885 MI->setPostInstrSymbol(MF, Symbol: Sym);
2886 }
2887}
2888