ModuloSchedule.h source code [llvm_projects/llvm/include/llvm/CodeGen/ModuloSchedule.h]

1	//===- ModuloSchedule.h - Software pipeline schedule expansion ------------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	// Software pipelining (SWP) is an instruction scheduling technique for loops
10	// that overlaps loop iterations and exploits ILP via compiler transformations.
11	//
12	// There are multiple methods for analyzing a loop and creating a schedule.
13	// An example algorithm is Swing Modulo Scheduling (implemented by the
14	// MachinePipeliner). The details of how a schedule is arrived at are irrelevant
15	// for the task of actually rewriting a loop to adhere to the schedule, which
16	// is what this file does.
17	//
18	// A schedule is, for every instruction in a block, a Cycle and a Stage. Note
19	// that we only support single-block loops, so "block" and "loop" can be used
20	// interchangably.
21	//
22	// The Cycle of an instruction defines a partial order of the instructions in
23	// the remapped loop. Instructions within a cycle must not consume the output
24	// of any instruction in the same cycle. Cycle information is assumed to have
25	// been calculated such that the processor will execute instructions in
26	// lock-step (for example in a VLIW ISA).
27	//
28	// The Stage of an instruction defines the mapping between logical loop
29	// iterations and pipelined loop iterations. An example (unrolled) pipeline
30	// may look something like:
31	//
32	// I0[0] Execute instruction I0 of iteration 0
33	// I1[0], I0[1] Execute I0 of iteration 1 and I1 of iteration 1
34	// I1[1], I0[2]
35	// I1[2], I0[3]
36	//
37	// In the schedule for this unrolled sequence we would say that I0 was scheduled
38	// in stage 0 and I1 in stage 1:
39	//
40	// loop:
41	// [stage 0] x = I0
42	// [stage 1] I1 x (from stage 0)
43	//
44	// And to actually generate valid code we must insert a phi:
45	//
46	// loop:
47	// x' = phi(x)
48	// x = I0
49	// I1 x'
50	//
51	// This is a simple example; the rules for how to generate correct code given
52	// an arbitrary schedule containing loop-carried values are complex.
53	//
54	// Note that these examples only mention the steady-state kernel of the
55	// generated loop; prologs and epilogs must be generated also that prime and
56	// flush the pipeline. Doing so is nontrivial.
57	//
58	//===----------------------------------------------------------------------===//
59
60	#ifndef LLVM_CODEGEN_MODULOSCHEDULE_H
61	#define LLVM_CODEGEN_MODULOSCHEDULE_H
62
63	#include "llvm/CodeGen/MachineFunction.h"
64	#include "llvm/CodeGen/MachineLoopUtils.h"
65	#include "llvm/CodeGen/TargetInstrInfo.h"
66	#include "llvm/CodeGen/TargetSubtargetInfo.h"
67	#include <deque>
68	#include <map>
69	#include <vector>
70
71	namespace llvm {
72	class MachineBasicBlock;
73	class MachineLoop;
74	class MachineRegisterInfo;
75	class MachineInstr;
76	class LiveIntervals;
77
78	/// Represents a schedule for a single-block loop. For every instruction we
79	/// maintain a Cycle and Stage.
80	class ModuloSchedule {
81	private:
82	/// The block containing the loop instructions.
83	MachineLoop *Loop;
84
85	/// The instructions to be generated, in total order. Cycle provides a partial
86	/// order; the total order within cycles has been decided by the schedule
87	/// producer.
88	std::vector<MachineInstr *> ScheduledInstrs;
89
90	/// The cycle for each instruction.
91	DenseMap<MachineInstr , int*> Cycle;
92
93	/// The stage for each instruction.
94	DenseMap<MachineInstr , int*> Stage;
95
96	/// The number of stages in this schedule (Max(Stage) + 1).
97	int NumStages;
98
99	public:
100	/// Create a new ModuloSchedule.
101	/// \arg ScheduledInstrs The new loop instructions, in total resequenced
102	/// order.
103	/// \arg Cycle Cycle index for all instructions in ScheduledInstrs. Cycle does
104	/// not need to start at zero. ScheduledInstrs must be partially ordered by
105	/// Cycle.
106	/// \arg Stage Stage index for all instructions in ScheduleInstrs.
107	ModuloSchedule(MachineFunction &MF, MachineLoop *Loop,
108	std::vector<MachineInstr *> ScheduledInstrs,
109	DenseMap<MachineInstr , int*> Cycle,
110	DenseMap<MachineInstr , int*> Stage)
111	: Loop(Loop), ScheduledInstrs (ScheduledInstrs), Cycle (std::move(Cycle)),
112	Stage (std::move(Stage)) {
113	NumStages = `0`;
114	for (auto &KV : this->Stage)
115	NumStages = std::max(a: NumStages, b: KV.second);
116	++NumStages;
117	}
118
119	/// Return the single-block loop being scheduled.
120	MachineLoop getLoop() const* { return Loop; }
121
122	/// Return the number of stages contained in this schedule, which is the
123	/// largest stage index + 1.
124	int getNumStages() const { return NumStages; }
125
126	/// Return the first cycle in the schedule, which is the cycle index of the
127	/// first instruction.
128	int getFirstCycle() { return Cycle [ScheduledInstrs.front()]; }
129
130	/// Return the final cycle in the schedule, which is the cycle index of the
131	/// last instruction.
132	int getFinalCycle() { return Cycle [ScheduledInstrs.back()]; }
133
134	/// Return the stage that MI is scheduled in, or -1.
135	int getStage(MachineInstr *MI) {
136	auto I = Stage.find(Val: MI);
137	return I == Stage.end() ? -`1` : I ->second;
138	}
139
140	/// Return the cycle that MI is scheduled at, or -1.
141	int getCycle(MachineInstr *MI) {
142	auto I = Cycle.find(Val: MI);
143	return I == Cycle.end() ? -`1` : I ->second;
144	}
145
146	/// Set the stage of a newly created instruction.
147	void setStage(MachineInstr MI, int* MIStage) {
148	assert(Stage.count(MI) == `0`);
149	Stage [MI] = MIStage;
150	}
151
152	/// Return the rescheduled instructions in order.
153	ArrayRef<MachineInstr > getInstructions() { return* ScheduledInstrs; }
154
155	void dump() { print(OS&: dbgs()); }
156	void print(raw_ostream &OS);
157	};
158
159	/// The ModuloScheduleExpander takes a ModuloSchedule and expands it in-place,
160	/// rewriting the old loop and inserting prologs and epilogs as required.
161	class ModuloScheduleExpander {
162	public:
163	using InstrChangesTy = DenseMap<MachineInstr , std::pair<unsigned*, int64_t>>;
164
165	private:
166	using ValueMapTy = DenseMap<unsigned, unsigned>;
167	using MBBVectorTy = SmallVectorImpl<MachineBasicBlock *>;
168	using InstrMapTy = DenseMap<MachineInstr , MachineInstr >;
169
170	ModuloSchedule &Schedule;
171	MachineFunction &MF;
172	const TargetSubtargetInfo &ST;
173	MachineRegisterInfo &MRI;
174	const TargetInstrInfo TII = nullptr*;
175	LiveIntervals &LIS;
176
177	MachineBasicBlock BB = nullptr*;
178	MachineBasicBlock Preheader = nullptr*;
179	MachineBasicBlock NewKernel = nullptr*;
180	std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo> LoopInfo;
181
182	/// Map for each register and the max difference between its uses and def.
183	/// The first element in the pair is the max difference in stages. The
184	/// second is true if the register defines a Phi value and loop value is
185	/// scheduled before the Phi.
186	std::map<unsigned, std::pair<unsigned, bool>> RegToStageDiff;
187
188	/// Instructions to change when emitting the final schedule.
189	InstrChangesTy InstrChanges;
190
191	void generatePipelinedLoop();
192	void generateProlog(unsigned LastStage, MachineBasicBlock *KernelBB,
193	ValueMapTy *VRMap, MBBVectorTy &PrologBBs);
194	void generateEpilog(unsigned LastStage, MachineBasicBlock *KernelBB,
195	MachineBasicBlock OrigBB, ValueMapTy VRMap,
196	ValueMapTy *VRMapPhi, MBBVectorTy &EpilogBBs,
197	MBBVectorTy &PrologBBs);
198	void generateExistingPhis(MachineBasicBlock NewBB, MachineBasicBlock BB1,
199	MachineBasicBlock BB2, MachineBasicBlock KernelBB,
200	ValueMapTy *VRMap, InstrMapTy &InstrMap,
201	unsigned LastStageNum, unsigned CurStageNum,
202	bool IsLast);
203	void generatePhis(MachineBasicBlock NewBB, MachineBasicBlock BB1,
204	MachineBasicBlock BB2, MachineBasicBlock KernelBB,
205	ValueMapTy VRMap, ValueMapTy VRMapPhi,
206	InstrMapTy &InstrMap, unsigned LastStageNum,
207	unsigned CurStageNum, bool IsLast);
208	void removeDeadInstructions(MachineBasicBlock *KernelBB,
209	MBBVectorTy &EpilogBBs);
210	void splitLifetimes(MachineBasicBlock *KernelBB, MBBVectorTy &EpilogBBs);
211	void addBranches(MachineBasicBlock &PreheaderBB, MBBVectorTy &PrologBBs,
212	MachineBasicBlock *KernelBB, MBBVectorTy &EpilogBBs,
213	ValueMapTy *VRMap);
214	bool computeDelta(MachineInstr &MI, unsigned &Delta);
215	void updateMemOperands(MachineInstr &NewMI, MachineInstr &OldMI,
216	unsigned Num);
217	MachineInstr cloneInstr(MachineInstr OldMI, unsigned CurStageNum,
218	unsigned InstStageNum);
219	MachineInstr cloneAndChangeInstr(MachineInstr OldMI, unsigned CurStageNum,
220	unsigned InstStageNum);
221	void updateInstruction(MachineInstr NewMI, bool* LastDef,
222	unsigned CurStageNum, unsigned InstrStageNum,
223	ValueMapTy *VRMap);
224	MachineInstr findDefInLoop(unsigned* Reg);
225	unsigned getPrevMapVal(unsigned StageNum, unsigned PhiStage, unsigned LoopVal,
226	unsigned LoopStage, ValueMapTy *VRMap,
227	MachineBasicBlock *BB);
228	void rewritePhiValues(MachineBasicBlock NewBB, unsigned* StageNum,
229	ValueMapTy *VRMap, InstrMapTy &InstrMap);
230	void rewriteScheduledInstr(MachineBasicBlock *BB, InstrMapTy &InstrMap,
231	unsigned CurStageNum, unsigned PhiNum,
232	MachineInstr Phi, unsigned* OldReg,
233	unsigned NewReg, unsigned PrevReg = `0`);
234	bool isLoopCarried(MachineInstr &Phi);
235
236	/// Return the max. number of stages/iterations that can occur between a
237	/// register definition and its uses.
238	unsigned getStagesForReg(int Reg, unsigned CurStage) {
239	std::pair<unsigned, bool> Stages = RegToStageDiff [Reg];
240	if ((int)CurStage > Schedule.getNumStages() - `1` && Stages.first == `0` &&
241	Stages.second)
242	return `1`;
243	return Stages.first;
244	}
245
246	/// The number of stages for a Phi is a little different than other
247	/// instructions. The minimum value computed in RegToStageDiff is 1
248	/// because we assume the Phi is needed for at least 1 iteration.
249	/// This is not the case if the loop value is scheduled prior to the
250	/// Phi in the same stage. This function returns the number of stages
251	/// or iterations needed between the Phi definition and any uses.
252	unsigned getStagesForPhi(int Reg) {
253	std::pair<unsigned, bool> Stages = RegToStageDiff [Reg];
254	if (Stages.second)
255	return Stages.first;
256	return Stages.first - `1`;
257	}
258
259	public:
260	/// Create a new ModuloScheduleExpander.
261	/// \arg InstrChanges Modifications to make to instructions with memory
262	/// operands.
263	/// FIXME: InstrChanges is opaque and is an implementation detail of an
264	/// optimization in MachinePipeliner that crosses abstraction boundaries.
265	ModuloScheduleExpander(MachineFunction &MF, ModuloSchedule &S,
266	LiveIntervals &LIS, InstrChangesTy InstrChanges)
267	: Schedule(S), MF(MF), ST(MF.getSubtarget()), MRI(MF.getRegInfo()),
268	TII(ST.getInstrInfo()), LIS(LIS),
269	InstrChanges (std::move(InstrChanges)) {}
270
271	/// Performs the actual expansion.
272	void expand();
273	/// Performs final cleanup after expansion.
274	void cleanup();
275
276	/// Returns the newly rewritten kernel block, or nullptr if this was
277	/// optimized away.
278	MachineBasicBlock getRewrittenKernel() { return* NewKernel; }
279	};
280
281	/// A reimplementation of ModuloScheduleExpander. It works by generating a
282	/// standalone kernel loop and peeling out the prologs and epilogs.
283	class PeelingModuloScheduleExpander {
284	public:
285	PeelingModuloScheduleExpander(MachineFunction &MF, ModuloSchedule &S,
286	LiveIntervals *LIS)
287	: Schedule(S), MF(MF), ST(MF.getSubtarget()), MRI(MF.getRegInfo()),
288	TII(ST.getInstrInfo()), LIS(LIS) {}
289
290	void expand();
291
292	/// Runs ModuloScheduleExpander and treats it as a golden input to validate
293	/// aspects of the code generated by PeelingModuloScheduleExpander.
294	void validateAgainstModuloScheduleExpander();
295
296	protected:
297	ModuloSchedule &Schedule;
298	MachineFunction &MF;
299	const TargetSubtargetInfo &ST;
300	MachineRegisterInfo &MRI;
301	const TargetInstrInfo TII = nullptr*;
302	LiveIntervals LIS = nullptr*;
303
304	/// The original loop block that gets rewritten in-place.
305	MachineBasicBlock BB = nullptr*;
306	/// The original loop preheader.
307	MachineBasicBlock Preheader = nullptr*;
308	/// All prolog and epilog blocks.
309	SmallVector<MachineBasicBlock *, `4`> Prologs, Epilogs;
310	/// For every block, the stages that are produced.
311	DenseMap<MachineBasicBlock *, BitVector> LiveStages;
312	/// For every block, the stages that are available. A stage can be available
313	/// but not produced (in the epilog) or produced but not available (in the
314	/// prolog).
315	DenseMap<MachineBasicBlock *, BitVector> AvailableStages;
316	/// When peeling the epilogue keep track of the distance between the phi
317	/// nodes and the kernel.
318	DenseMap<MachineInstr , unsigned*> PhiNodeLoopIteration;
319
320	/// CanonicalMIs and BlockMIs form a bidirectional map between any of the
321	/// loop kernel clones.
322	DenseMap<MachineInstr , MachineInstr > CanonicalMIs;
323	DenseMap<std::pair<MachineBasicBlock , MachineInstr >, MachineInstr *>
324	BlockMIs;
325
326	/// State passed from peelKernel to peelPrologAndEpilogs().
327	std::deque<MachineBasicBlock *> PeeledFront, PeeledBack;
328	/// Illegal phis that need to be deleted once we re-link stages.
329	SmallVector<MachineInstr *, `4`> IllegalPhisToDelete;
330
331	/// Converts BB from the original loop body to the rewritten, pipelined
332	/// steady-state.
333	void rewriteKernel();
334
335	/// Peels one iteration of the rewritten kernel (BB) in the specified
336	/// direction.
337	MachineBasicBlock *peelKernel(LoopPeelDirection LPD);
338	// Delete instructions whose stage is less than MinStage in the given basic
339	// block.
340	void filterInstructions(MachineBasicBlock MB, int* MinStage);
341	// Move instructions of the given stage from sourceBB to DestBB. Remap the phi
342	// instructions to keep a valid IR.
343	void moveStageBetweenBlocks(MachineBasicBlock *DestBB,
344	MachineBasicBlock SourceBB, unsigned* Stage);
345	/// Peel the kernel forwards and backwards to produce prologs and epilogs,
346	/// and stitch them together.
347	void peelPrologAndEpilogs();
348	/// All prolog and epilog blocks are clones of the kernel, so any produced
349	/// register in one block has an corollary in all other blocks.
350	Register getEquivalentRegisterIn(Register Reg, MachineBasicBlock *BB);
351	/// Change all users of MI, if MI is predicated out
352	/// (LiveStages[MI->getParent()] == false).
353	void rewriteUsesOf(MachineInstr *MI);
354	/// Insert branches between prologs, kernel and epilogs.
355	void fixupBranches();
356	/// Create a poor-man's LCSSA by cloning only the PHIs from the kernel block
357	/// to a block dominated by all prologs and epilogs. This allows us to treat
358	/// the loop exiting block as any other kernel clone.
359	MachineBasicBlock *CreateLCSSAExitingBlock();
360	/// Helper to get the stage of an instruction in the schedule.
361	unsigned getStage(MachineInstr *MI) {
362	if (CanonicalMIs.count(Val: MI))
363	MI = CanonicalMIs [MI];
364	return Schedule.getStage(MI);
365	}
366	/// Helper function to find the right canonical register for a phi instruction
367	/// coming from a peeled out prologue.
368	Register getPhiCanonicalReg(MachineInstr* CanonicalPhi, MachineInstr* Phi);
369	/// Target loop info before kernel peeling.
370	std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo> LoopInfo;
371	};
372
373	/// Expand the kernel using modulo variable expansion algorithm (MVE).
374	/// It unrolls the kernel enough to avoid overlap of register lifetime.
375	class ModuloScheduleExpanderMVE {
376	private:
377	using ValueMapTy = DenseMap<unsigned, unsigned>;
378	using MBBVectorTy = SmallVectorImpl<MachineBasicBlock *>;
379	using InstrMapTy = DenseMap<MachineInstr , MachineInstr >;
380
381	ModuloSchedule &Schedule;
382	MachineFunction &MF;
383	const TargetSubtargetInfo &ST;
384	MachineRegisterInfo &MRI;
385	const TargetInstrInfo TII = nullptr*;
386	LiveIntervals &LIS;
387
388	MachineBasicBlock OrigKernel = nullptr*;
389	MachineBasicBlock OrigPreheader = nullptr*;
390	MachineBasicBlock OrigExit = nullptr*;
391	MachineBasicBlock Check = nullptr*;
392	MachineBasicBlock Prolog = nullptr*;
393	MachineBasicBlock NewKernel = nullptr*;
394	MachineBasicBlock Epilog = nullptr*;
395	MachineBasicBlock NewPreheader = nullptr*;
396	MachineBasicBlock NewExit = nullptr*;
397	std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo> LoopInfo;
398
399	/// The number of unroll required to avoid overlap of live ranges.
400	/// NumUnroll = 1 means no unrolling.
401	int NumUnroll;
402
403	void calcNumUnroll();
404	void generatePipelinedLoop();
405	void generateProlog(SmallVectorImpl<ValueMapTy> &VRMap);
406	void generatePhi(MachineInstr OrigMI, int* UnrollNum,
407	SmallVectorImpl<ValueMapTy> &PrologVRMap,
408	SmallVectorImpl<ValueMapTy> &KernelVRMap,
409	SmallVectorImpl<ValueMapTy> &PhiVRMap);
410	void generateKernel(SmallVectorImpl<ValueMapTy> &PrologVRMap,
411	SmallVectorImpl<ValueMapTy> &KernelVRMap,
412	InstrMapTy &LastStage0Insts);
413	void generateEpilog(SmallVectorImpl<ValueMapTy> &KernelVRMap,
414	SmallVectorImpl<ValueMapTy> &EpilogVRMap,
415	InstrMapTy &LastStage0Insts);
416	void mergeRegUsesAfterPipeline(Register OrigReg, Register NewReg);
417
418	MachineInstr cloneInstr(MachineInstr OldMI);
419
420	void updateInstrDef(MachineInstr NewMI, ValueMapTy &VRMap, bool* LastDef);
421
422	void generateKernelPhi(Register OrigLoopVal, Register NewLoopVal,
423	unsigned UnrollNum,
424	SmallVectorImpl<ValueMapTy> &VRMapProlog,
425	SmallVectorImpl<ValueMapTy> &VRMapPhi);
426	void updateInstrUse(MachineInstr MI, int* StageNum, int PhaseNum,
427	SmallVectorImpl<ValueMapTy> &CurVRMap,
428	SmallVectorImpl<ValueMapTy> *PrevVRMap);
429
430	void insertCondBranch(MachineBasicBlock &MBB, int RequiredTC,
431	InstrMapTy &LastStage0Insts,
432	MachineBasicBlock &GreaterThan,
433	MachineBasicBlock &Otherwise);
434
435	public:
436	ModuloScheduleExpanderMVE(MachineFunction &MF, ModuloSchedule &S,
437	LiveIntervals &LIS)
438	: Schedule(S), MF(MF), ST(MF.getSubtarget()), MRI(MF.getRegInfo()),
439	TII(ST.getInstrInfo()), LIS(LIS) {}
440
441	void expand();
442	static bool canApply(MachineLoop &L);
443	};
444
445	/// Expander that simply annotates each scheduled instruction with a post-instr
446	/// symbol that can be consumed by the ModuloScheduleTest pass.
447	///
448	/// The post-instr symbol is a way of annotating an instruction that can be
449	/// roundtripped in MIR. The syntax is:
450	/// MYINST %0, post-instr-symbol <mcsymbol Stage-1_Cycle-5>
451	class ModuloScheduleTestAnnotater {
452	MachineFunction &MF;
453	ModuloSchedule &S;
454
455	public:
456	ModuloScheduleTestAnnotater(MachineFunction &MF, ModuloSchedule &S)
457	: MF(MF), S(S) {}
458
459	/// Performs the annotation.
460	void annotate();
461	};
462
463	} // end namespace llvm
464
465	#endif // LLVM_CODEGEN_MODULOSCHEDULE_H
466

Browse the source code of llvm_projects/llvm/include/llvm/CodeGen/ModuloSchedule.h