1 | //===- ModuloSchedule.h - Software pipeline schedule expansion ------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // Software pipelining (SWP) is an instruction scheduling technique for loops |
10 | // that overlaps loop iterations and exploits ILP via compiler transformations. |
11 | // |
12 | // There are multiple methods for analyzing a loop and creating a schedule. |
13 | // An example algorithm is Swing Modulo Scheduling (implemented by the |
14 | // MachinePipeliner). The details of how a schedule is arrived at are irrelevant |
15 | // for the task of actually rewriting a loop to adhere to the schedule, which |
16 | // is what this file does. |
17 | // |
18 | // A schedule is, for every instruction in a block, a Cycle and a Stage. Note |
19 | // that we only support single-block loops, so "block" and "loop" can be used |
20 | // interchangably. |
21 | // |
22 | // The Cycle of an instruction defines a partial order of the instructions in |
23 | // the remapped loop. Instructions within a cycle must not consume the output |
24 | // of any instruction in the same cycle. Cycle information is assumed to have |
25 | // been calculated such that the processor will execute instructions in |
26 | // lock-step (for example in a VLIW ISA). |
27 | // |
28 | // The Stage of an instruction defines the mapping between logical loop |
29 | // iterations and pipelined loop iterations. An example (unrolled) pipeline |
30 | // may look something like: |
31 | // |
32 | // I0[0] Execute instruction I0 of iteration 0 |
33 | // I1[0], I0[1] Execute I0 of iteration 1 and I1 of iteration 1 |
34 | // I1[1], I0[2] |
35 | // I1[2], I0[3] |
36 | // |
37 | // In the schedule for this unrolled sequence we would say that I0 was scheduled |
38 | // in stage 0 and I1 in stage 1: |
39 | // |
40 | // loop: |
41 | // [stage 0] x = I0 |
42 | // [stage 1] I1 x (from stage 0) |
43 | // |
44 | // And to actually generate valid code we must insert a phi: |
45 | // |
46 | // loop: |
47 | // x' = phi(x) |
48 | // x = I0 |
49 | // I1 x' |
50 | // |
51 | // This is a simple example; the rules for how to generate correct code given |
52 | // an arbitrary schedule containing loop-carried values are complex. |
53 | // |
54 | // Note that these examples only mention the steady-state kernel of the |
55 | // generated loop; prologs and epilogs must be generated also that prime and |
56 | // flush the pipeline. Doing so is nontrivial. |
57 | // |
58 | //===----------------------------------------------------------------------===// |
59 | |
60 | #ifndef LLVM_CODEGEN_MODULOSCHEDULE_H |
61 | #define LLVM_CODEGEN_MODULOSCHEDULE_H |
62 | |
63 | #include "llvm/CodeGen/MachineFunction.h" |
64 | #include "llvm/CodeGen/MachineLoopUtils.h" |
65 | #include "llvm/CodeGen/TargetInstrInfo.h" |
66 | #include "llvm/CodeGen/TargetSubtargetInfo.h" |
67 | #include <deque> |
68 | #include <map> |
69 | #include <vector> |
70 | |
71 | namespace llvm { |
72 | class MachineBasicBlock; |
73 | class MachineLoop; |
74 | class MachineRegisterInfo; |
75 | class MachineInstr; |
76 | class LiveIntervals; |
77 | |
78 | /// Represents a schedule for a single-block loop. For every instruction we |
79 | /// maintain a Cycle and Stage. |
80 | class ModuloSchedule { |
81 | private: |
82 | /// The block containing the loop instructions. |
83 | MachineLoop *Loop; |
84 | |
85 | /// The instructions to be generated, in total order. Cycle provides a partial |
86 | /// order; the total order within cycles has been decided by the schedule |
87 | /// producer. |
88 | std::vector<MachineInstr *> ScheduledInstrs; |
89 | |
90 | /// The cycle for each instruction. |
91 | DenseMap<MachineInstr *, int> Cycle; |
92 | |
93 | /// The stage for each instruction. |
94 | DenseMap<MachineInstr *, int> Stage; |
95 | |
96 | /// The number of stages in this schedule (Max(Stage) + 1). |
97 | int NumStages; |
98 | |
99 | public: |
100 | /// Create a new ModuloSchedule. |
101 | /// \arg ScheduledInstrs The new loop instructions, in total resequenced |
102 | /// order. |
103 | /// \arg Cycle Cycle index for all instructions in ScheduledInstrs. Cycle does |
104 | /// not need to start at zero. ScheduledInstrs must be partially ordered by |
105 | /// Cycle. |
106 | /// \arg Stage Stage index for all instructions in ScheduleInstrs. |
107 | ModuloSchedule(MachineFunction &MF, MachineLoop *Loop, |
108 | std::vector<MachineInstr *> ScheduledInstrs, |
109 | DenseMap<MachineInstr *, int> Cycle, |
110 | DenseMap<MachineInstr *, int> Stage) |
111 | : Loop(Loop), ScheduledInstrs(ScheduledInstrs), Cycle(std::move(Cycle)), |
112 | Stage(std::move(Stage)) { |
113 | NumStages = 0; |
114 | for (auto &KV : this->Stage) |
115 | NumStages = std::max(a: NumStages, b: KV.second); |
116 | ++NumStages; |
117 | } |
118 | |
119 | /// Return the single-block loop being scheduled. |
120 | MachineLoop *getLoop() const { return Loop; } |
121 | |
122 | /// Return the number of stages contained in this schedule, which is the |
123 | /// largest stage index + 1. |
124 | int getNumStages() const { return NumStages; } |
125 | |
126 | /// Return the first cycle in the schedule, which is the cycle index of the |
127 | /// first instruction. |
128 | int getFirstCycle() { return Cycle[ScheduledInstrs.front()]; } |
129 | |
130 | /// Return the final cycle in the schedule, which is the cycle index of the |
131 | /// last instruction. |
132 | int getFinalCycle() { return Cycle[ScheduledInstrs.back()]; } |
133 | |
134 | /// Return the stage that MI is scheduled in, or -1. |
135 | int getStage(MachineInstr *MI) { |
136 | auto I = Stage.find(Val: MI); |
137 | return I == Stage.end() ? -1 : I->second; |
138 | } |
139 | |
140 | /// Return the cycle that MI is scheduled at, or -1. |
141 | int getCycle(MachineInstr *MI) { |
142 | auto I = Cycle.find(Val: MI); |
143 | return I == Cycle.end() ? -1 : I->second; |
144 | } |
145 | |
146 | /// Set the stage of a newly created instruction. |
147 | void setStage(MachineInstr *MI, int MIStage) { |
148 | assert(Stage.count(MI) == 0); |
149 | Stage[MI] = MIStage; |
150 | } |
151 | |
152 | /// Return the rescheduled instructions in order. |
153 | ArrayRef<MachineInstr *> getInstructions() { return ScheduledInstrs; } |
154 | |
155 | void dump() { print(OS&: dbgs()); } |
156 | void print(raw_ostream &OS); |
157 | }; |
158 | |
159 | /// The ModuloScheduleExpander takes a ModuloSchedule and expands it in-place, |
160 | /// rewriting the old loop and inserting prologs and epilogs as required. |
161 | class ModuloScheduleExpander { |
162 | public: |
163 | using InstrChangesTy = DenseMap<MachineInstr *, std::pair<unsigned, int64_t>>; |
164 | |
165 | private: |
166 | using ValueMapTy = DenseMap<unsigned, unsigned>; |
167 | using MBBVectorTy = SmallVectorImpl<MachineBasicBlock *>; |
168 | using InstrMapTy = DenseMap<MachineInstr *, MachineInstr *>; |
169 | |
170 | ModuloSchedule &Schedule; |
171 | MachineFunction &MF; |
172 | const TargetSubtargetInfo &ST; |
173 | MachineRegisterInfo &MRI; |
174 | const TargetInstrInfo *TII = nullptr; |
175 | LiveIntervals &LIS; |
176 | |
177 | MachineBasicBlock *BB = nullptr; |
178 | MachineBasicBlock *Preheader = nullptr; |
179 | MachineBasicBlock *NewKernel = nullptr; |
180 | std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo> LoopInfo; |
181 | |
182 | /// Map for each register and the max difference between its uses and def. |
183 | /// The first element in the pair is the max difference in stages. The |
184 | /// second is true if the register defines a Phi value and loop value is |
185 | /// scheduled before the Phi. |
186 | std::map<unsigned, std::pair<unsigned, bool>> RegToStageDiff; |
187 | |
188 | /// Instructions to change when emitting the final schedule. |
189 | InstrChangesTy InstrChanges; |
190 | |
191 | void generatePipelinedLoop(); |
192 | void generateProlog(unsigned LastStage, MachineBasicBlock *KernelBB, |
193 | ValueMapTy *VRMap, MBBVectorTy &PrologBBs); |
194 | void generateEpilog(unsigned LastStage, MachineBasicBlock *KernelBB, |
195 | MachineBasicBlock *OrigBB, ValueMapTy *VRMap, |
196 | ValueMapTy *VRMapPhi, MBBVectorTy &EpilogBBs, |
197 | MBBVectorTy &PrologBBs); |
198 | void generateExistingPhis(MachineBasicBlock *NewBB, MachineBasicBlock *BB1, |
199 | MachineBasicBlock *BB2, MachineBasicBlock *KernelBB, |
200 | ValueMapTy *VRMap, InstrMapTy &InstrMap, |
201 | unsigned LastStageNum, unsigned CurStageNum, |
202 | bool IsLast); |
203 | void generatePhis(MachineBasicBlock *NewBB, MachineBasicBlock *BB1, |
204 | MachineBasicBlock *BB2, MachineBasicBlock *KernelBB, |
205 | ValueMapTy *VRMap, ValueMapTy *VRMapPhi, |
206 | InstrMapTy &InstrMap, unsigned LastStageNum, |
207 | unsigned CurStageNum, bool IsLast); |
208 | void removeDeadInstructions(MachineBasicBlock *KernelBB, |
209 | MBBVectorTy &EpilogBBs); |
210 | void splitLifetimes(MachineBasicBlock *KernelBB, MBBVectorTy &EpilogBBs); |
211 | void addBranches(MachineBasicBlock &, MBBVectorTy &PrologBBs, |
212 | MachineBasicBlock *KernelBB, MBBVectorTy &EpilogBBs, |
213 | ValueMapTy *VRMap); |
214 | bool computeDelta(MachineInstr &MI, unsigned &Delta); |
215 | void updateMemOperands(MachineInstr &NewMI, MachineInstr &OldMI, |
216 | unsigned Num); |
217 | MachineInstr *cloneInstr(MachineInstr *OldMI, unsigned CurStageNum, |
218 | unsigned InstStageNum); |
219 | MachineInstr *cloneAndChangeInstr(MachineInstr *OldMI, unsigned CurStageNum, |
220 | unsigned InstStageNum); |
221 | void updateInstruction(MachineInstr *NewMI, bool LastDef, |
222 | unsigned CurStageNum, unsigned InstrStageNum, |
223 | ValueMapTy *VRMap); |
224 | MachineInstr *findDefInLoop(unsigned Reg); |
225 | unsigned getPrevMapVal(unsigned StageNum, unsigned PhiStage, unsigned LoopVal, |
226 | unsigned LoopStage, ValueMapTy *VRMap, |
227 | MachineBasicBlock *BB); |
228 | void rewritePhiValues(MachineBasicBlock *NewBB, unsigned StageNum, |
229 | ValueMapTy *VRMap, InstrMapTy &InstrMap); |
230 | void rewriteScheduledInstr(MachineBasicBlock *BB, InstrMapTy &InstrMap, |
231 | unsigned CurStageNum, unsigned PhiNum, |
232 | MachineInstr *Phi, unsigned OldReg, |
233 | unsigned NewReg, unsigned PrevReg = 0); |
234 | bool isLoopCarried(MachineInstr &Phi); |
235 | |
236 | /// Return the max. number of stages/iterations that can occur between a |
237 | /// register definition and its uses. |
238 | unsigned getStagesForReg(int Reg, unsigned CurStage) { |
239 | std::pair<unsigned, bool> Stages = RegToStageDiff[Reg]; |
240 | if ((int)CurStage > Schedule.getNumStages() - 1 && Stages.first == 0 && |
241 | Stages.second) |
242 | return 1; |
243 | return Stages.first; |
244 | } |
245 | |
246 | /// The number of stages for a Phi is a little different than other |
247 | /// instructions. The minimum value computed in RegToStageDiff is 1 |
248 | /// because we assume the Phi is needed for at least 1 iteration. |
249 | /// This is not the case if the loop value is scheduled prior to the |
250 | /// Phi in the same stage. This function returns the number of stages |
251 | /// or iterations needed between the Phi definition and any uses. |
252 | unsigned getStagesForPhi(int Reg) { |
253 | std::pair<unsigned, bool> Stages = RegToStageDiff[Reg]; |
254 | if (Stages.second) |
255 | return Stages.first; |
256 | return Stages.first - 1; |
257 | } |
258 | |
259 | public: |
260 | /// Create a new ModuloScheduleExpander. |
261 | /// \arg InstrChanges Modifications to make to instructions with memory |
262 | /// operands. |
263 | /// FIXME: InstrChanges is opaque and is an implementation detail of an |
264 | /// optimization in MachinePipeliner that crosses abstraction boundaries. |
265 | ModuloScheduleExpander(MachineFunction &MF, ModuloSchedule &S, |
266 | LiveIntervals &LIS, InstrChangesTy InstrChanges) |
267 | : Schedule(S), MF(MF), ST(MF.getSubtarget()), MRI(MF.getRegInfo()), |
268 | TII(ST.getInstrInfo()), LIS(LIS), |
269 | InstrChanges(std::move(InstrChanges)) {} |
270 | |
271 | /// Performs the actual expansion. |
272 | void expand(); |
273 | /// Performs final cleanup after expansion. |
274 | void cleanup(); |
275 | |
276 | /// Returns the newly rewritten kernel block, or nullptr if this was |
277 | /// optimized away. |
278 | MachineBasicBlock *getRewrittenKernel() { return NewKernel; } |
279 | }; |
280 | |
281 | /// A reimplementation of ModuloScheduleExpander. It works by generating a |
282 | /// standalone kernel loop and peeling out the prologs and epilogs. |
283 | class PeelingModuloScheduleExpander { |
284 | public: |
285 | PeelingModuloScheduleExpander(MachineFunction &MF, ModuloSchedule &S, |
286 | LiveIntervals *LIS) |
287 | : Schedule(S), MF(MF), ST(MF.getSubtarget()), MRI(MF.getRegInfo()), |
288 | TII(ST.getInstrInfo()), LIS(LIS) {} |
289 | |
290 | void expand(); |
291 | |
292 | /// Runs ModuloScheduleExpander and treats it as a golden input to validate |
293 | /// aspects of the code generated by PeelingModuloScheduleExpander. |
294 | void validateAgainstModuloScheduleExpander(); |
295 | |
296 | protected: |
297 | ModuloSchedule &Schedule; |
298 | MachineFunction &MF; |
299 | const TargetSubtargetInfo &ST; |
300 | MachineRegisterInfo &MRI; |
301 | const TargetInstrInfo *TII = nullptr; |
302 | LiveIntervals *LIS = nullptr; |
303 | |
304 | /// The original loop block that gets rewritten in-place. |
305 | MachineBasicBlock *BB = nullptr; |
306 | /// The original loop preheader. |
307 | MachineBasicBlock *Preheader = nullptr; |
308 | /// All prolog and epilog blocks. |
309 | SmallVector<MachineBasicBlock *, 4> Prologs, Epilogs; |
310 | /// For every block, the stages that are produced. |
311 | DenseMap<MachineBasicBlock *, BitVector> LiveStages; |
312 | /// For every block, the stages that are available. A stage can be available |
313 | /// but not produced (in the epilog) or produced but not available (in the |
314 | /// prolog). |
315 | DenseMap<MachineBasicBlock *, BitVector> AvailableStages; |
316 | /// When peeling the epilogue keep track of the distance between the phi |
317 | /// nodes and the kernel. |
318 | DenseMap<MachineInstr *, unsigned> PhiNodeLoopIteration; |
319 | |
320 | /// CanonicalMIs and BlockMIs form a bidirectional map between any of the |
321 | /// loop kernel clones. |
322 | DenseMap<MachineInstr *, MachineInstr *> CanonicalMIs; |
323 | DenseMap<std::pair<MachineBasicBlock *, MachineInstr *>, MachineInstr *> |
324 | BlockMIs; |
325 | |
326 | /// State passed from peelKernel to peelPrologAndEpilogs(). |
327 | std::deque<MachineBasicBlock *> PeeledFront, PeeledBack; |
328 | /// Illegal phis that need to be deleted once we re-link stages. |
329 | SmallVector<MachineInstr *, 4> IllegalPhisToDelete; |
330 | |
331 | /// Converts BB from the original loop body to the rewritten, pipelined |
332 | /// steady-state. |
333 | void rewriteKernel(); |
334 | |
335 | /// Peels one iteration of the rewritten kernel (BB) in the specified |
336 | /// direction. |
337 | MachineBasicBlock *peelKernel(LoopPeelDirection LPD); |
338 | // Delete instructions whose stage is less than MinStage in the given basic |
339 | // block. |
340 | void filterInstructions(MachineBasicBlock *MB, int MinStage); |
341 | // Move instructions of the given stage from sourceBB to DestBB. Remap the phi |
342 | // instructions to keep a valid IR. |
343 | void moveStageBetweenBlocks(MachineBasicBlock *DestBB, |
344 | MachineBasicBlock *SourceBB, unsigned Stage); |
345 | /// Peel the kernel forwards and backwards to produce prologs and epilogs, |
346 | /// and stitch them together. |
347 | void peelPrologAndEpilogs(); |
348 | /// All prolog and epilog blocks are clones of the kernel, so any produced |
349 | /// register in one block has an corollary in all other blocks. |
350 | Register getEquivalentRegisterIn(Register Reg, MachineBasicBlock *BB); |
351 | /// Change all users of MI, if MI is predicated out |
352 | /// (LiveStages[MI->getParent()] == false). |
353 | void rewriteUsesOf(MachineInstr *MI); |
354 | /// Insert branches between prologs, kernel and epilogs. |
355 | void fixupBranches(); |
356 | /// Create a poor-man's LCSSA by cloning only the PHIs from the kernel block |
357 | /// to a block dominated by all prologs and epilogs. This allows us to treat |
358 | /// the loop exiting block as any other kernel clone. |
359 | MachineBasicBlock *CreateLCSSAExitingBlock(); |
360 | /// Helper to get the stage of an instruction in the schedule. |
361 | unsigned getStage(MachineInstr *MI) { |
362 | if (CanonicalMIs.count(Val: MI)) |
363 | MI = CanonicalMIs[MI]; |
364 | return Schedule.getStage(MI); |
365 | } |
366 | /// Helper function to find the right canonical register for a phi instruction |
367 | /// coming from a peeled out prologue. |
368 | Register getPhiCanonicalReg(MachineInstr* CanonicalPhi, MachineInstr* Phi); |
369 | /// Target loop info before kernel peeling. |
370 | std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo> LoopInfo; |
371 | }; |
372 | |
373 | /// Expand the kernel using modulo variable expansion algorithm (MVE). |
374 | /// It unrolls the kernel enough to avoid overlap of register lifetime. |
375 | class ModuloScheduleExpanderMVE { |
376 | private: |
377 | using ValueMapTy = DenseMap<unsigned, unsigned>; |
378 | using MBBVectorTy = SmallVectorImpl<MachineBasicBlock *>; |
379 | using InstrMapTy = DenseMap<MachineInstr *, MachineInstr *>; |
380 | |
381 | ModuloSchedule &Schedule; |
382 | MachineFunction &MF; |
383 | const TargetSubtargetInfo &ST; |
384 | MachineRegisterInfo &MRI; |
385 | const TargetInstrInfo *TII = nullptr; |
386 | LiveIntervals &LIS; |
387 | |
388 | MachineBasicBlock *OrigKernel = nullptr; |
389 | MachineBasicBlock *OrigPreheader = nullptr; |
390 | MachineBasicBlock *OrigExit = nullptr; |
391 | MachineBasicBlock *Check = nullptr; |
392 | MachineBasicBlock *Prolog = nullptr; |
393 | MachineBasicBlock *NewKernel = nullptr; |
394 | MachineBasicBlock *Epilog = nullptr; |
395 | MachineBasicBlock *NewPreheader = nullptr; |
396 | MachineBasicBlock *NewExit = nullptr; |
397 | std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo> LoopInfo; |
398 | |
399 | /// The number of unroll required to avoid overlap of live ranges. |
400 | /// NumUnroll = 1 means no unrolling. |
401 | int NumUnroll; |
402 | |
403 | void calcNumUnroll(); |
404 | void generatePipelinedLoop(); |
405 | void generateProlog(SmallVectorImpl<ValueMapTy> &VRMap); |
406 | void generatePhi(MachineInstr *OrigMI, int UnrollNum, |
407 | SmallVectorImpl<ValueMapTy> &PrologVRMap, |
408 | SmallVectorImpl<ValueMapTy> &KernelVRMap, |
409 | SmallVectorImpl<ValueMapTy> &PhiVRMap); |
410 | void generateKernel(SmallVectorImpl<ValueMapTy> &PrologVRMap, |
411 | SmallVectorImpl<ValueMapTy> &KernelVRMap, |
412 | InstrMapTy &LastStage0Insts); |
413 | void generateEpilog(SmallVectorImpl<ValueMapTy> &KernelVRMap, |
414 | SmallVectorImpl<ValueMapTy> &EpilogVRMap, |
415 | InstrMapTy &LastStage0Insts); |
416 | void mergeRegUsesAfterPipeline(Register OrigReg, Register NewReg); |
417 | |
418 | MachineInstr *cloneInstr(MachineInstr *OldMI); |
419 | |
420 | void updateInstrDef(MachineInstr *NewMI, ValueMapTy &VRMap, bool LastDef); |
421 | |
422 | void generateKernelPhi(Register OrigLoopVal, Register NewLoopVal, |
423 | unsigned UnrollNum, |
424 | SmallVectorImpl<ValueMapTy> &VRMapProlog, |
425 | SmallVectorImpl<ValueMapTy> &VRMapPhi); |
426 | void updateInstrUse(MachineInstr *MI, int StageNum, int PhaseNum, |
427 | SmallVectorImpl<ValueMapTy> &CurVRMap, |
428 | SmallVectorImpl<ValueMapTy> *PrevVRMap); |
429 | |
430 | void insertCondBranch(MachineBasicBlock &MBB, int RequiredTC, |
431 | InstrMapTy &LastStage0Insts, |
432 | MachineBasicBlock &GreaterThan, |
433 | MachineBasicBlock &Otherwise); |
434 | |
435 | public: |
436 | ModuloScheduleExpanderMVE(MachineFunction &MF, ModuloSchedule &S, |
437 | LiveIntervals &LIS) |
438 | : Schedule(S), MF(MF), ST(MF.getSubtarget()), MRI(MF.getRegInfo()), |
439 | TII(ST.getInstrInfo()), LIS(LIS) {} |
440 | |
441 | void expand(); |
442 | static bool canApply(MachineLoop &L); |
443 | }; |
444 | |
445 | /// Expander that simply annotates each scheduled instruction with a post-instr |
446 | /// symbol that can be consumed by the ModuloScheduleTest pass. |
447 | /// |
448 | /// The post-instr symbol is a way of annotating an instruction that can be |
449 | /// roundtripped in MIR. The syntax is: |
450 | /// MYINST %0, post-instr-symbol <mcsymbol Stage-1_Cycle-5> |
451 | class ModuloScheduleTestAnnotater { |
452 | MachineFunction &MF; |
453 | ModuloSchedule &S; |
454 | |
455 | public: |
456 | ModuloScheduleTestAnnotater(MachineFunction &MF, ModuloSchedule &S) |
457 | : MF(MF), S(S) {} |
458 | |
459 | /// Performs the annotation. |
460 | void annotate(); |
461 | }; |
462 | |
463 | } // end namespace llvm |
464 | |
465 | #endif // LLVM_CODEGEN_MODULOSCHEDULE_H |
466 | |