1//===- ModuloSchedule.h - Software pipeline schedule expansion ------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// Software pipelining (SWP) is an instruction scheduling technique for loops
10// that overlaps loop iterations and exploits ILP via compiler transformations.
11//
12// There are multiple methods for analyzing a loop and creating a schedule.
13// An example algorithm is Swing Modulo Scheduling (implemented by the
14// MachinePipeliner). The details of how a schedule is arrived at are irrelevant
15// for the task of actually rewriting a loop to adhere to the schedule, which
16// is what this file does.
17//
18// A schedule is, for every instruction in a block, a Cycle and a Stage. Note
19// that we only support single-block loops, so "block" and "loop" can be used
20// interchangably.
21//
22// The Cycle of an instruction defines a partial order of the instructions in
23// the remapped loop. Instructions within a cycle must not consume the output
24// of any instruction in the same cycle. Cycle information is assumed to have
25// been calculated such that the processor will execute instructions in
26// lock-step (for example in a VLIW ISA).
27//
28// The Stage of an instruction defines the mapping between logical loop
29// iterations and pipelined loop iterations. An example (unrolled) pipeline
30// may look something like:
31//
32// I0[0] Execute instruction I0 of iteration 0
33// I1[0], I0[1] Execute I0 of iteration 1 and I1 of iteration 1
34// I1[1], I0[2]
35// I1[2], I0[3]
36//
37// In the schedule for this unrolled sequence we would say that I0 was scheduled
38// in stage 0 and I1 in stage 1:
39//
40// loop:
41// [stage 0] x = I0
42// [stage 1] I1 x (from stage 0)
43//
44// And to actually generate valid code we must insert a phi:
45//
46// loop:
47// x' = phi(x)
48// x = I0
49// I1 x'
50//
51// This is a simple example; the rules for how to generate correct code given
52// an arbitrary schedule containing loop-carried values are complex.
53//
54// Note that these examples only mention the steady-state kernel of the
55// generated loop; prologs and epilogs must be generated also that prime and
56// flush the pipeline. Doing so is nontrivial.
57//
58//===----------------------------------------------------------------------===//
59
60#ifndef LLVM_CODEGEN_MODULOSCHEDULE_H
61#define LLVM_CODEGEN_MODULOSCHEDULE_H
62
63#include "llvm/CodeGen/MachineFunction.h"
64#include "llvm/CodeGen/MachineLoopUtils.h"
65#include "llvm/CodeGen/TargetInstrInfo.h"
66#include "llvm/CodeGen/TargetSubtargetInfo.h"
67#include <deque>
68#include <map>
69#include <vector>
70
71namespace llvm {
72class MachineBasicBlock;
73class MachineLoop;
74class MachineRegisterInfo;
75class MachineInstr;
76class LiveIntervals;
77
78/// Represents a schedule for a single-block loop. For every instruction we
79/// maintain a Cycle and Stage.
80class ModuloSchedule {
81private:
82 /// The block containing the loop instructions.
83 MachineLoop *Loop;
84
85 /// The instructions to be generated, in total order. Cycle provides a partial
86 /// order; the total order within cycles has been decided by the schedule
87 /// producer.
88 std::vector<MachineInstr *> ScheduledInstrs;
89
90 /// The cycle for each instruction.
91 DenseMap<MachineInstr *, int> Cycle;
92
93 /// The stage for each instruction.
94 DenseMap<MachineInstr *, int> Stage;
95
96 /// The number of stages in this schedule (Max(Stage) + 1).
97 int NumStages;
98
99public:
100 /// Create a new ModuloSchedule.
101 /// \arg ScheduledInstrs The new loop instructions, in total resequenced
102 /// order.
103 /// \arg Cycle Cycle index for all instructions in ScheduledInstrs. Cycle does
104 /// not need to start at zero. ScheduledInstrs must be partially ordered by
105 /// Cycle.
106 /// \arg Stage Stage index for all instructions in ScheduleInstrs.
107 ModuloSchedule(MachineFunction &MF, MachineLoop *Loop,
108 std::vector<MachineInstr *> ScheduledInstrs,
109 DenseMap<MachineInstr *, int> Cycle,
110 DenseMap<MachineInstr *, int> Stage)
111 : Loop(Loop), ScheduledInstrs(ScheduledInstrs), Cycle(std::move(Cycle)),
112 Stage(std::move(Stage)) {
113 NumStages = 0;
114 for (auto &KV : this->Stage)
115 NumStages = std::max(a: NumStages, b: KV.second);
116 ++NumStages;
117 }
118
119 /// Return the single-block loop being scheduled.
120 MachineLoop *getLoop() const { return Loop; }
121
122 /// Return the number of stages contained in this schedule, which is the
123 /// largest stage index + 1.
124 int getNumStages() const { return NumStages; }
125
126 /// Return the first cycle in the schedule, which is the cycle index of the
127 /// first instruction.
128 int getFirstCycle() { return Cycle[ScheduledInstrs.front()]; }
129
130 /// Return the final cycle in the schedule, which is the cycle index of the
131 /// last instruction.
132 int getFinalCycle() { return Cycle[ScheduledInstrs.back()]; }
133
134 /// Return the stage that MI is scheduled in, or -1.
135 int getStage(MachineInstr *MI) {
136 auto I = Stage.find(Val: MI);
137 return I == Stage.end() ? -1 : I->second;
138 }
139
140 /// Return the cycle that MI is scheduled at, or -1.
141 int getCycle(MachineInstr *MI) {
142 auto I = Cycle.find(Val: MI);
143 return I == Cycle.end() ? -1 : I->second;
144 }
145
146 /// Set the stage of a newly created instruction.
147 void setStage(MachineInstr *MI, int MIStage) {
148 assert(Stage.count(MI) == 0);
149 Stage[MI] = MIStage;
150 }
151
152 /// Return the rescheduled instructions in order.
153 ArrayRef<MachineInstr *> getInstructions() { return ScheduledInstrs; }
154
155 void dump() { print(OS&: dbgs()); }
156 void print(raw_ostream &OS);
157};
158
159/// The ModuloScheduleExpander takes a ModuloSchedule and expands it in-place,
160/// rewriting the old loop and inserting prologs and epilogs as required.
161class ModuloScheduleExpander {
162public:
163 using InstrChangesTy = DenseMap<MachineInstr *, std::pair<unsigned, int64_t>>;
164
165private:
166 using ValueMapTy = DenseMap<unsigned, unsigned>;
167 using MBBVectorTy = SmallVectorImpl<MachineBasicBlock *>;
168 using InstrMapTy = DenseMap<MachineInstr *, MachineInstr *>;
169
170 ModuloSchedule &Schedule;
171 MachineFunction &MF;
172 const TargetSubtargetInfo &ST;
173 MachineRegisterInfo &MRI;
174 const TargetInstrInfo *TII = nullptr;
175 LiveIntervals &LIS;
176
177 MachineBasicBlock *BB = nullptr;
178 MachineBasicBlock *Preheader = nullptr;
179 MachineBasicBlock *NewKernel = nullptr;
180 std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo> LoopInfo;
181
182 /// Map for each register and the max difference between its uses and def.
183 /// The first element in the pair is the max difference in stages. The
184 /// second is true if the register defines a Phi value and loop value is
185 /// scheduled before the Phi.
186 std::map<unsigned, std::pair<unsigned, bool>> RegToStageDiff;
187
188 /// Instructions to change when emitting the final schedule.
189 InstrChangesTy InstrChanges;
190
191 void generatePipelinedLoop();
192 void generateProlog(unsigned LastStage, MachineBasicBlock *KernelBB,
193 ValueMapTy *VRMap, MBBVectorTy &PrologBBs);
194 void generateEpilog(unsigned LastStage, MachineBasicBlock *KernelBB,
195 MachineBasicBlock *OrigBB, ValueMapTy *VRMap,
196 ValueMapTy *VRMapPhi, MBBVectorTy &EpilogBBs,
197 MBBVectorTy &PrologBBs);
198 void generateExistingPhis(MachineBasicBlock *NewBB, MachineBasicBlock *BB1,
199 MachineBasicBlock *BB2, MachineBasicBlock *KernelBB,
200 ValueMapTy *VRMap, InstrMapTy &InstrMap,
201 unsigned LastStageNum, unsigned CurStageNum,
202 bool IsLast);
203 void generatePhis(MachineBasicBlock *NewBB, MachineBasicBlock *BB1,
204 MachineBasicBlock *BB2, MachineBasicBlock *KernelBB,
205 ValueMapTy *VRMap, ValueMapTy *VRMapPhi,
206 InstrMapTy &InstrMap, unsigned LastStageNum,
207 unsigned CurStageNum, bool IsLast);
208 void removeDeadInstructions(MachineBasicBlock *KernelBB,
209 MBBVectorTy &EpilogBBs);
210 void splitLifetimes(MachineBasicBlock *KernelBB, MBBVectorTy &EpilogBBs);
211 void addBranches(MachineBasicBlock &PreheaderBB, MBBVectorTy &PrologBBs,
212 MachineBasicBlock *KernelBB, MBBVectorTy &EpilogBBs,
213 ValueMapTy *VRMap);
214 bool computeDelta(MachineInstr &MI, unsigned &Delta);
215 void updateMemOperands(MachineInstr &NewMI, MachineInstr &OldMI,
216 unsigned Num);
217 MachineInstr *cloneInstr(MachineInstr *OldMI, unsigned CurStageNum,
218 unsigned InstStageNum);
219 MachineInstr *cloneAndChangeInstr(MachineInstr *OldMI, unsigned CurStageNum,
220 unsigned InstStageNum);
221 void updateInstruction(MachineInstr *NewMI, bool LastDef,
222 unsigned CurStageNum, unsigned InstrStageNum,
223 ValueMapTy *VRMap);
224 MachineInstr *findDefInLoop(unsigned Reg);
225 unsigned getPrevMapVal(unsigned StageNum, unsigned PhiStage, unsigned LoopVal,
226 unsigned LoopStage, ValueMapTy *VRMap,
227 MachineBasicBlock *BB);
228 void rewritePhiValues(MachineBasicBlock *NewBB, unsigned StageNum,
229 ValueMapTy *VRMap, InstrMapTy &InstrMap);
230 void rewriteScheduledInstr(MachineBasicBlock *BB, InstrMapTy &InstrMap,
231 unsigned CurStageNum, unsigned PhiNum,
232 MachineInstr *Phi, unsigned OldReg,
233 unsigned NewReg, unsigned PrevReg = 0);
234 bool isLoopCarried(MachineInstr &Phi);
235
236 /// Return the max. number of stages/iterations that can occur between a
237 /// register definition and its uses.
238 unsigned getStagesForReg(int Reg, unsigned CurStage) {
239 std::pair<unsigned, bool> Stages = RegToStageDiff[Reg];
240 if ((int)CurStage > Schedule.getNumStages() - 1 && Stages.first == 0 &&
241 Stages.second)
242 return 1;
243 return Stages.first;
244 }
245
246 /// The number of stages for a Phi is a little different than other
247 /// instructions. The minimum value computed in RegToStageDiff is 1
248 /// because we assume the Phi is needed for at least 1 iteration.
249 /// This is not the case if the loop value is scheduled prior to the
250 /// Phi in the same stage. This function returns the number of stages
251 /// or iterations needed between the Phi definition and any uses.
252 unsigned getStagesForPhi(int Reg) {
253 std::pair<unsigned, bool> Stages = RegToStageDiff[Reg];
254 if (Stages.second)
255 return Stages.first;
256 return Stages.first - 1;
257 }
258
259public:
260 /// Create a new ModuloScheduleExpander.
261 /// \arg InstrChanges Modifications to make to instructions with memory
262 /// operands.
263 /// FIXME: InstrChanges is opaque and is an implementation detail of an
264 /// optimization in MachinePipeliner that crosses abstraction boundaries.
265 ModuloScheduleExpander(MachineFunction &MF, ModuloSchedule &S,
266 LiveIntervals &LIS, InstrChangesTy InstrChanges)
267 : Schedule(S), MF(MF), ST(MF.getSubtarget()), MRI(MF.getRegInfo()),
268 TII(ST.getInstrInfo()), LIS(LIS),
269 InstrChanges(std::move(InstrChanges)) {}
270
271 /// Performs the actual expansion.
272 void expand();
273 /// Performs final cleanup after expansion.
274 void cleanup();
275
276 /// Returns the newly rewritten kernel block, or nullptr if this was
277 /// optimized away.
278 MachineBasicBlock *getRewrittenKernel() { return NewKernel; }
279};
280
281/// A reimplementation of ModuloScheduleExpander. It works by generating a
282/// standalone kernel loop and peeling out the prologs and epilogs.
283class PeelingModuloScheduleExpander {
284public:
285 PeelingModuloScheduleExpander(MachineFunction &MF, ModuloSchedule &S,
286 LiveIntervals *LIS)
287 : Schedule(S), MF(MF), ST(MF.getSubtarget()), MRI(MF.getRegInfo()),
288 TII(ST.getInstrInfo()), LIS(LIS) {}
289
290 void expand();
291
292 /// Runs ModuloScheduleExpander and treats it as a golden input to validate
293 /// aspects of the code generated by PeelingModuloScheduleExpander.
294 void validateAgainstModuloScheduleExpander();
295
296protected:
297 ModuloSchedule &Schedule;
298 MachineFunction &MF;
299 const TargetSubtargetInfo &ST;
300 MachineRegisterInfo &MRI;
301 const TargetInstrInfo *TII = nullptr;
302 LiveIntervals *LIS = nullptr;
303
304 /// The original loop block that gets rewritten in-place.
305 MachineBasicBlock *BB = nullptr;
306 /// The original loop preheader.
307 MachineBasicBlock *Preheader = nullptr;
308 /// All prolog and epilog blocks.
309 SmallVector<MachineBasicBlock *, 4> Prologs, Epilogs;
310 /// For every block, the stages that are produced.
311 DenseMap<MachineBasicBlock *, BitVector> LiveStages;
312 /// For every block, the stages that are available. A stage can be available
313 /// but not produced (in the epilog) or produced but not available (in the
314 /// prolog).
315 DenseMap<MachineBasicBlock *, BitVector> AvailableStages;
316 /// When peeling the epilogue keep track of the distance between the phi
317 /// nodes and the kernel.
318 DenseMap<MachineInstr *, unsigned> PhiNodeLoopIteration;
319
320 /// CanonicalMIs and BlockMIs form a bidirectional map between any of the
321 /// loop kernel clones.
322 DenseMap<MachineInstr *, MachineInstr *> CanonicalMIs;
323 DenseMap<std::pair<MachineBasicBlock *, MachineInstr *>, MachineInstr *>
324 BlockMIs;
325
326 /// State passed from peelKernel to peelPrologAndEpilogs().
327 std::deque<MachineBasicBlock *> PeeledFront, PeeledBack;
328 /// Illegal phis that need to be deleted once we re-link stages.
329 SmallVector<MachineInstr *, 4> IllegalPhisToDelete;
330
331 /// Converts BB from the original loop body to the rewritten, pipelined
332 /// steady-state.
333 void rewriteKernel();
334
335 /// Peels one iteration of the rewritten kernel (BB) in the specified
336 /// direction.
337 MachineBasicBlock *peelKernel(LoopPeelDirection LPD);
338 // Delete instructions whose stage is less than MinStage in the given basic
339 // block.
340 void filterInstructions(MachineBasicBlock *MB, int MinStage);
341 // Move instructions of the given stage from sourceBB to DestBB. Remap the phi
342 // instructions to keep a valid IR.
343 void moveStageBetweenBlocks(MachineBasicBlock *DestBB,
344 MachineBasicBlock *SourceBB, unsigned Stage);
345 /// Peel the kernel forwards and backwards to produce prologs and epilogs,
346 /// and stitch them together.
347 void peelPrologAndEpilogs();
348 /// All prolog and epilog blocks are clones of the kernel, so any produced
349 /// register in one block has an corollary in all other blocks.
350 Register getEquivalentRegisterIn(Register Reg, MachineBasicBlock *BB);
351 /// Change all users of MI, if MI is predicated out
352 /// (LiveStages[MI->getParent()] == false).
353 void rewriteUsesOf(MachineInstr *MI);
354 /// Insert branches between prologs, kernel and epilogs.
355 void fixupBranches();
356 /// Create a poor-man's LCSSA by cloning only the PHIs from the kernel block
357 /// to a block dominated by all prologs and epilogs. This allows us to treat
358 /// the loop exiting block as any other kernel clone.
359 MachineBasicBlock *CreateLCSSAExitingBlock();
360 /// Helper to get the stage of an instruction in the schedule.
361 unsigned getStage(MachineInstr *MI) {
362 if (CanonicalMIs.count(Val: MI))
363 MI = CanonicalMIs[MI];
364 return Schedule.getStage(MI);
365 }
366 /// Helper function to find the right canonical register for a phi instruction
367 /// coming from a peeled out prologue.
368 Register getPhiCanonicalReg(MachineInstr* CanonicalPhi, MachineInstr* Phi);
369 /// Target loop info before kernel peeling.
370 std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo> LoopInfo;
371};
372
373/// Expand the kernel using modulo variable expansion algorithm (MVE).
374/// It unrolls the kernel enough to avoid overlap of register lifetime.
375class ModuloScheduleExpanderMVE {
376private:
377 using ValueMapTy = DenseMap<unsigned, unsigned>;
378 using MBBVectorTy = SmallVectorImpl<MachineBasicBlock *>;
379 using InstrMapTy = DenseMap<MachineInstr *, MachineInstr *>;
380
381 ModuloSchedule &Schedule;
382 MachineFunction &MF;
383 const TargetSubtargetInfo &ST;
384 MachineRegisterInfo &MRI;
385 const TargetInstrInfo *TII = nullptr;
386 LiveIntervals &LIS;
387
388 MachineBasicBlock *OrigKernel = nullptr;
389 MachineBasicBlock *OrigPreheader = nullptr;
390 MachineBasicBlock *OrigExit = nullptr;
391 MachineBasicBlock *Check = nullptr;
392 MachineBasicBlock *Prolog = nullptr;
393 MachineBasicBlock *NewKernel = nullptr;
394 MachineBasicBlock *Epilog = nullptr;
395 MachineBasicBlock *NewPreheader = nullptr;
396 MachineBasicBlock *NewExit = nullptr;
397 std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo> LoopInfo;
398
399 /// The number of unroll required to avoid overlap of live ranges.
400 /// NumUnroll = 1 means no unrolling.
401 int NumUnroll;
402
403 void calcNumUnroll();
404 void generatePipelinedLoop();
405 void generateProlog(SmallVectorImpl<ValueMapTy> &VRMap);
406 void generatePhi(MachineInstr *OrigMI, int UnrollNum,
407 SmallVectorImpl<ValueMapTy> &PrologVRMap,
408 SmallVectorImpl<ValueMapTy> &KernelVRMap,
409 SmallVectorImpl<ValueMapTy> &PhiVRMap);
410 void generateKernel(SmallVectorImpl<ValueMapTy> &PrologVRMap,
411 SmallVectorImpl<ValueMapTy> &KernelVRMap,
412 InstrMapTy &LastStage0Insts);
413 void generateEpilog(SmallVectorImpl<ValueMapTy> &KernelVRMap,
414 SmallVectorImpl<ValueMapTy> &EpilogVRMap,
415 InstrMapTy &LastStage0Insts);
416 void mergeRegUsesAfterPipeline(Register OrigReg, Register NewReg);
417
418 MachineInstr *cloneInstr(MachineInstr *OldMI);
419
420 void updateInstrDef(MachineInstr *NewMI, ValueMapTy &VRMap, bool LastDef);
421
422 void generateKernelPhi(Register OrigLoopVal, Register NewLoopVal,
423 unsigned UnrollNum,
424 SmallVectorImpl<ValueMapTy> &VRMapProlog,
425 SmallVectorImpl<ValueMapTy> &VRMapPhi);
426 void updateInstrUse(MachineInstr *MI, int StageNum, int PhaseNum,
427 SmallVectorImpl<ValueMapTy> &CurVRMap,
428 SmallVectorImpl<ValueMapTy> *PrevVRMap);
429
430 void insertCondBranch(MachineBasicBlock &MBB, int RequiredTC,
431 InstrMapTy &LastStage0Insts,
432 MachineBasicBlock &GreaterThan,
433 MachineBasicBlock &Otherwise);
434
435public:
436 ModuloScheduleExpanderMVE(MachineFunction &MF, ModuloSchedule &S,
437 LiveIntervals &LIS)
438 : Schedule(S), MF(MF), ST(MF.getSubtarget()), MRI(MF.getRegInfo()),
439 TII(ST.getInstrInfo()), LIS(LIS) {}
440
441 void expand();
442 static bool canApply(MachineLoop &L);
443};
444
445/// Expander that simply annotates each scheduled instruction with a post-instr
446/// symbol that can be consumed by the ModuloScheduleTest pass.
447///
448/// The post-instr symbol is a way of annotating an instruction that can be
449/// roundtripped in MIR. The syntax is:
450/// MYINST %0, post-instr-symbol <mcsymbol Stage-1_Cycle-5>
451class ModuloScheduleTestAnnotater {
452 MachineFunction &MF;
453 ModuloSchedule &S;
454
455public:
456 ModuloScheduleTestAnnotater(MachineFunction &MF, ModuloSchedule &S)
457 : MF(MF), S(S) {}
458
459 /// Performs the annotation.
460 void annotate();
461};
462
463} // end namespace llvm
464
465#endif // LLVM_CODEGEN_MODULOSCHEDULE_H
466