1 | //===- AMDGPUInsertDelayAlu.cpp - Insert s_delay_alu instructions ---------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | /// \file |
10 | /// Insert s_delay_alu instructions to avoid stalls on GFX11+. |
11 | // |
12 | //===----------------------------------------------------------------------===// |
13 | |
14 | #include "AMDGPU.h" |
15 | #include "GCNSubtarget.h" |
16 | #include "MCTargetDesc/AMDGPUMCTargetDesc.h" |
17 | #include "SIInstrInfo.h" |
18 | #include "llvm/ADT/SetVector.h" |
19 | |
20 | using namespace llvm; |
21 | |
22 | #define DEBUG_TYPE "amdgpu-insert-delay-alu" |
23 | |
24 | namespace { |
25 | |
class AMDGPUInsertDelayAlu {
public:
  // Target hooks, initialized in run().
  const SIInstrInfo *SII;
  const TargetRegisterInfo *TRI;

  // Scheduling model used to compute operand latencies.
  const TargetSchedModel *SchedModel;

  // Return true if MI waits for all outstanding VALU instructions to complete.
  static bool instructionWaitsForVALU(const MachineInstr &MI) {
    // These instruction types wait for VA_VDST==0 before issuing.
    const uint64_t VA_VDST_0 = SIInstrFlags::DS | SIInstrFlags::EXP |
                               SIInstrFlags::FLAT | SIInstrFlags::MIMG |
                               SIInstrFlags::MTBUF | SIInstrFlags::MUBUF;
    if (MI.getDesc().TSFlags & VA_VDST_0)
      return true;
    // S_SENDMSG_RTN_* also waits for outstanding VALU instructions.
    if (MI.getOpcode() == AMDGPU::S_SENDMSG_RTN_B32 ||
        MI.getOpcode() == AMDGPU::S_SENDMSG_RTN_B64)
      return true;
    // An explicit wait on VA_VDST==0 via s_waitcnt_depctr.
    if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
        AMDGPU::DepCtr::decodeFieldVaVdst(MI.getOperand(0).getImm()) == 0)
      return true;
    return false;
  }
49 | |
50 | static bool instructionWaitsForSGPRWrites(const MachineInstr &MI) { |
51 | // These instruction types wait for VA_SDST==0 before issuing. |
52 | uint64_t MIFlags = MI.getDesc().TSFlags; |
53 | if (MIFlags & SIInstrFlags::SMRD) |
54 | return true; |
55 | |
56 | if (MIFlags & SIInstrFlags::SALU) { |
57 | for (auto &Op : MI.operands()) { |
58 | if (Op.isReg()) |
59 | return true; |
60 | } |
61 | } |
62 | return false; |
63 | } |
64 | |
65 | // Types of delay that can be encoded in an s_delay_alu instruction. |
66 | enum DelayType { VALU, TRANS, SALU, OTHER }; |
67 | |
68 | // Get the delay type for an instruction with the specified TSFlags. |
69 | static DelayType getDelayType(uint64_t TSFlags) { |
70 | if (TSFlags & SIInstrFlags::TRANS) |
71 | return TRANS; |
72 | if (TSFlags & SIInstrFlags::VALU) |
73 | return VALU; |
74 | if (TSFlags & SIInstrFlags::SALU) |
75 | return SALU; |
76 | return OTHER; |
77 | } |
78 | |
  // Information about the last instruction(s) that wrote to a particular
  // regunit. In straight-line code there will only be one such instruction, but
  // when control flow converges we merge the delay information from each path
  // to represent the union of the worst-case delays of each type.
  struct DelayInfo {
    // One larger than the maximum number of (non-TRANS) VALU instructions we
    // can encode in an s_delay_alu instruction.
    static constexpr unsigned VALU_MAX = 5;

    // One larger than the maximum number of TRANS instructions we can encode in
    // an s_delay_alu instruction.
    static constexpr unsigned TRANS_MAX = 4;

    // One larger than the maximum number of SALU cycles we can encode in an
    // s_delay_alu instruction.
    static constexpr unsigned SALU_CYCLES_MAX = 4;

    // If it was written by a (non-TRANS) VALU, remember how many clock cycles
    // are left until it completes, and how many other (non-TRANS) VALU we have
    // seen since it was issued.
    uint8_t VALUCycles = 0;
    uint8_t VALUNum = VALU_MAX; // VALU_MAX means "no outstanding VALU write".

    // If it was written by a TRANS, remember how many clock cycles are left
    // until it completes, and how many other TRANS we have seen since it was
    // issued.
    uint8_t TRANSCycles = 0;
    uint8_t TRANSNum = TRANS_MAX; // TRANS_MAX means "no outstanding TRANS".
    // Also remember how many other (non-TRANS) VALU we have seen since it was
    // issued. When an instruction depends on both a prior TRANS and a prior
    // non-TRANS VALU, this is used to decide whether to encode a wait for just
    // one or both of them.
    uint8_t TRANSNumVALU = VALU_MAX;

    // If it was written by an SALU, remember how many clock cycles are left
    // until it completes.
    uint8_t SALUCycles = 0;

    DelayInfo() = default;

    // Construct the delay info for a single write by an instruction of the
    // given delay type with the given latency in cycles.
    DelayInfo(DelayType Type, unsigned Cycles) {
      switch (Type) {
      default:
        llvm_unreachable("unexpected type");
      case VALU:
        VALUCycles = Cycles;
        VALUNum = 0;
        break;
      case TRANS:
        TRANSCycles = Cycles;
        TRANSNum = 0;
        TRANSNumVALU = 0;
        break;
      case SALU:
        // Guard against pseudo-instructions like SI_CALL which are marked as
        // SALU but with a very high latency.
        SALUCycles = std::min(Cycles, SALU_CYCLES_MAX);
        break;
      }
    }

    bool operator==(const DelayInfo &RHS) const {
      return VALUCycles == RHS.VALUCycles && VALUNum == RHS.VALUNum &&
             TRANSCycles == RHS.TRANSCycles && TRANSNum == RHS.TRANSNum &&
             TRANSNumVALU == RHS.TRANSNumVALU && SALUCycles == RHS.SALUCycles;
    }

    bool operator!=(const DelayInfo &RHS) const { return !(*this == RHS); }

    // Merge another DelayInfo into this one, to represent the union of the
    // worst-case delays of each type. Worst case means the most cycles still
    // outstanding (max) and the fewest instructions seen since issue (min).
    void merge(const DelayInfo &RHS) {
      VALUCycles = std::max(VALUCycles, RHS.VALUCycles);
      VALUNum = std::min(VALUNum, RHS.VALUNum);
      TRANSCycles = std::max(TRANSCycles, RHS.TRANSCycles);
      TRANSNum = std::min(TRANSNum, RHS.TRANSNum);
      TRANSNumVALU = std::min(TRANSNumVALU, RHS.TRANSNumVALU);
      SALUCycles = std::max(SALUCycles, RHS.SALUCycles);
    }

    // Update this DelayInfo after issuing an instruction of the specified type.
    // Cycles is the number of cycles it takes to issue the instruction. Return
    // true if there is no longer any useful delay info.
    bool advance(DelayType Type, unsigned Cycles) {
      bool Erase = true;

      VALUNum += (Type == VALU);
      if (VALUNum >= VALU_MAX || VALUCycles <= Cycles) {
        // Forget about the VALU instruction. It was too far back or has
        // definitely completed by now.
        VALUNum = VALU_MAX;
        VALUCycles = 0;
      } else {
        VALUCycles -= Cycles;
        Erase = false;
      }

      TRANSNum += (Type == TRANS);
      TRANSNumVALU += (Type == VALU);
      if (TRANSNum >= TRANS_MAX || TRANSCycles <= Cycles) {
        // Forget about any TRANS instruction. It was too far back or has
        // definitely completed by now.
        TRANSNum = TRANS_MAX;
        TRANSNumVALU = VALU_MAX;
        TRANSCycles = 0;
      } else {
        TRANSCycles -= Cycles;
        Erase = false;
      }

      if (SALUCycles <= Cycles) {
        // Forget about any SALU instruction. It has definitely completed by
        // now.
        SALUCycles = 0;
      } else {
        SALUCycles -= Cycles;
        Erase = false;
      }

      return Erase;
    }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    // Print only the fields that still carry useful delay information.
    void dump() const {
      if (VALUCycles)
        dbgs() << " VALUCycles=" << (int)VALUCycles;
      if (VALUNum < VALU_MAX)
        dbgs() << " VALUNum=" << (int)VALUNum;
      if (TRANSCycles)
        dbgs() << " TRANSCycles=" << (int)TRANSCycles;
      if (TRANSNum < TRANS_MAX)
        dbgs() << " TRANSNum=" << (int)TRANSNum;
      if (TRANSNumVALU < VALU_MAX)
        dbgs() << " TRANSNumVALU=" << (int)TRANSNumVALU;
      if (SALUCycles)
        dbgs() << " SALUCycles=" << (int)SALUCycles;
    }
#endif
  };
218 | |
  // A map from regunits to the delay info for that regunit.
  struct DelayState : DenseMap<unsigned, DelayInfo> {
    // Merge another DelayState into this one by merging the delay info for each
    // regunit.
    void merge(const DelayState &RHS) {
      for (const auto &KV : RHS) {
        iterator It;
        bool Inserted;
        std::tie(It, Inserted) = insert(KV);
        // If the regunit was already tracked, combine worst-case delays.
        if (!Inserted)
          It->second.merge(KV.second);
      }
    }

    // Advance the delay info for each regunit, erasing any that are no longer
    // useful.
    void advance(DelayType Type, unsigned Cycles) {
      iterator Next;
      for (auto I = begin(), E = end(); I != E; I = Next) {
        // Grab the successor first: erase(I) invalidates I.
        Next = std::next(I);
        if (I->second.advance(Type, Cycles))
          erase(I);
      }
    }

    // Erase the delay info for every regunit whose VALU write is at least
    // VALUNum instructions old (i.e. at least as old as the given entry).
    void advanceByVALUNum(unsigned VALUNum) {
      iterator Next;
      for (auto I = begin(), E = end(); I != E; I = Next) {
        // Grab the successor first: erase(I) invalidates I.
        Next = std::next(I);
        if (I->second.VALUNum >= VALUNum && I->second.VALUCycles > 0) {
          erase(I);
        }
      }
    }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    void dump(const TargetRegisterInfo *TRI) const {
      if (empty()) {
        dbgs() << " empty\n";
        return;
      }

      // Dump DelayInfo for each RegUnit in numerical order.
      SmallVector<const_iterator, 8> Order;
      Order.reserve(size());
      for (const_iterator I = begin(), E = end(); I != E; ++I)
        Order.push_back(I);
      llvm::sort(Order, [](const const_iterator &A, const const_iterator &B) {
        return A->first < B->first;
      });
      for (const_iterator I : Order) {
        dbgs() << " " << printRegUnit(I->first, TRI);
        I->second.dump();
        dbgs() << "\n";
      }
    }
#endif
  };
277 | |
  // The saved delay state at the end of each basic block.
  DenseMap<MachineBasicBlock *, DelayState> BlockState;

  // Emit an s_delay_alu instruction if necessary before MI, encoding the
  // delays described by Delay. LastDelayAlu is the most recently emitted
  // s_delay_alu that still has a free second slot (or nullptr); returns the
  // value the caller should use as LastDelayAlu from now on.
  //
  // Immediate layout used below: bits [3:0] hold the first dependency,
  // bits [6:4] a "skip" instruction count, bits [10:7] a second dependency.
  MachineInstr *emitDelayAlu(MachineInstr &MI, DelayInfo Delay,
                             MachineInstr *LastDelayAlu) {
    unsigned Imm = 0;

    // Wait for a TRANS instruction. The TRANS dependency encodings start at 4.
    if (Delay.TRANSNum < DelayInfo::TRANS_MAX)
      Imm |= 4 + Delay.TRANSNum;

    // Wait for a VALU instruction (if it's more recent than any TRANS
    // instruction that we're also waiting for).
    if (Delay.VALUNum < DelayInfo::VALU_MAX &&
        Delay.VALUNum <= Delay.TRANSNumVALU) {
      if (Imm & 0xf)
        // First slot already holds the TRANS delay; use the second slot.
        Imm |= Delay.VALUNum << 7;
      else
        Imm |= Delay.VALUNum;
    }

    // Wait for an SALU instruction. The SALU cycle encodings start at 9.
    if (Delay.SALUCycles) {
      assert(Delay.SALUCycles < DelayInfo::SALU_CYCLES_MAX);
      if (Imm & 0x780) {
        // We have already encoded a VALU and a TRANS delay. There's no room in
        // the encoding for an SALU delay as well, so just drop it.
      } else if (Imm & 0xf) {
        Imm |= (Delay.SALUCycles + 8) << 7;
      } else {
        Imm |= Delay.SALUCycles + 8;
      }
    }

    // Don't emit the s_delay_alu instruction if there's nothing to wait for.
    if (!Imm)
      return LastDelayAlu;

    // If we only need to wait for one instruction, try encoding it in the last
    // s_delay_alu that we emitted.
    if (!(Imm & 0x780) && LastDelayAlu) {
      // Count the real (non-bundle, non-meta) instructions between the old
      // s_delay_alu and MI, to fill in its skip field.
      unsigned Skip = 0;
      for (auto I = MachineBasicBlock::instr_iterator(LastDelayAlu),
                E = MachineBasicBlock::instr_iterator(MI);
           ++I != E;) {
        if (!I->isBundle() && !I->isMetaInstruction())
          ++Skip;
      }
      // The skip field can only represent up to 5 intervening instructions.
      if (Skip < 6) {
        MachineOperand &Op = LastDelayAlu->getOperand(0);
        unsigned LastImm = Op.getImm();
        assert((LastImm & ~0xf) == 0 &&
               "Remembered an s_delay_alu with no room for another delay!");
        LastImm |= Imm << 7 | Skip << 4;
        Op.setImm(LastImm);
        return nullptr;
      }
    }

    auto &MBB = *MI.getParent();
    MachineInstr *DelayAlu =
        BuildMI(MBB, MI, DebugLoc(), SII->get(AMDGPU::S_DELAY_ALU)).addImm(Imm);
    // Remember the s_delay_alu for next time if there is still room in it to
    // encode another delay.
    return (Imm & 0x780) ? nullptr : DelayAlu;
  }
345 | |
  // Propagate the delay state through MBB, starting from the merge of all
  // predecessors' saved exit states. When Emit is false this only updates
  // BlockState and returns whether MBB's saved exit state changed (used to
  // drive the fixed-point iteration); when Emit is true it also inserts
  // s_delay_alu instructions and asserts the state is already stable.
  bool runOnMachineBasicBlock(MachineBasicBlock &MBB, bool Emit) {
    DelayState State;
    for (auto *Pred : MBB.predecessors())
      State.merge(BlockState[Pred]);

    LLVM_DEBUG(dbgs() << " State at start of " << printMBBReference(MBB)
                      << "\n";
               State.dump(TRI););

    bool Changed = false;
    // The last s_delay_alu we emitted that still has a free second slot.
    MachineInstr *LastDelayAlu = nullptr;

    // First regunit of the most recent SGPR written by a VALU, if any.
    MCRegUnit LastSGPRFromVALU = 0;
    // Iterate over the contents of bundles, but don't emit any instructions
    // inside a bundle.
    for (auto &MI : MBB.instrs()) {
      if (MI.isBundle() || MI.isMetaInstruction())
        continue;

      // Ignore some more instructions that do not generate any code.
      switch (MI.getOpcode()) {
      case AMDGPU::SI_RETURN_TO_EPILOG:
        continue;
      }

      DelayType Type = getDelayType(MI.getDesc().TSFlags);

      // If MI waits for outstanding SGPR writes from VALUs, then any VALU
      // write at least as old as the remembered one has landed — drop the
      // corresponding delay entries so we don't emit redundant waits.
      if (instructionWaitsForSGPRWrites(MI)) {
        auto It = State.find(LastSGPRFromVALU);
        if (It != State.end()) {
          DelayInfo Info = It->getSecond();
          State.advanceByVALUNum(Info.VALUNum);
          LastSGPRFromVALU = 0;
        }
      }

      if (instructionWaitsForVALU(MI)) {
        // Forget about all outstanding VALU delays.
        // TODO: This is overkill since it also forgets about SALU delays.
        State = DelayState();
      } else if (Type != OTHER) {
        // Accumulate the worst-case delay over all regunits MI reads.
        DelayInfo Delay;
        // TODO: Scan implicit uses too?
        for (const auto &Op : MI.explicit_uses()) {
          if (Op.isReg()) {
            // One of the operands of the writelane is also the output operand.
            // This creates the insertion of redundant delays. Hence, we have to
            // ignore this operand.
            if (MI.getOpcode() == AMDGPU::V_WRITELANE_B32 && Op.isTied())
              continue;
            for (MCRegUnit Unit : TRI->regunits(Op.getReg())) {
              auto It = State.find(Unit);
              if (It != State.end()) {
                Delay.merge(It->second);
                // The wait we emit satisfies this entry; drop it.
                State.erase(Unit);
              }
            }
          }
        }

        // Remember the first SGPR def of a VALU so that a later instruction
        // waiting for SGPR writes can be modelled above.
        if (SII->isVALU(MI.getOpcode())) {
          for (const auto &Op : MI.defs()) {
            Register Reg = Op.getReg();
            if (AMDGPU::isSGPR(Reg, TRI)) {
              LastSGPRFromVALU = *TRI->regunits(Reg).begin();
              break;
            }
          }
        }

        if (Emit && !MI.isBundledWithPred()) {
          // TODO: For VALU->SALU delays should we use s_delay_alu or s_nop or
          // just ignore them?
          LastDelayAlu = emitDelayAlu(MI, Delay, LastDelayAlu);
        }
      }

      if (Type != OTHER) {
        // Record a delay entry for every regunit MI defines.
        // TODO: Scan implicit defs too?
        for (const auto &Op : MI.defs()) {
          unsigned Latency = SchedModel->computeOperandLatency(
              &MI, Op.getOperandNo(), nullptr, 0);
          for (MCRegUnit Unit : TRI->regunits(Op.getReg()))
            State[Unit] = DelayInfo(Type, Latency);
        }
      }

      // Advance by the number of cycles it takes to issue this instruction.
      // TODO: Use a more advanced model that accounts for instructions that
      // take multiple cycles to issue on a particular pipeline.
      unsigned Cycles = SIInstrInfo::getNumWaitStates(MI);
      // TODO: In wave64 mode, double the number of cycles for VALU and VMEM
      // instructions on the assumption that they will usually have to be issued
      // twice?
      State.advance(Type, Cycles);

      LLVM_DEBUG(dbgs() << " State after " << MI; State.dump(TRI););
    }

    if (Emit) {
      assert(State == BlockState[&MBB] &&
             "Basic block state should not have changed on final pass!");
    } else if (DelayState &BS = BlockState[&MBB]; State != BS) {
      BS = std::move(State);
      Changed = true;
    }
    return Changed;
  }
454 | |
455 | bool run(MachineFunction &MF) { |
456 | LLVM_DEBUG(dbgs() << "AMDGPUInsertDelayAlu running on " << MF.getName() |
457 | << "\n" ); |
458 | |
459 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); |
460 | if (!ST.hasDelayAlu()) |
461 | return false; |
462 | |
463 | SII = ST.getInstrInfo(); |
464 | TRI = ST.getRegisterInfo(); |
465 | SchedModel = &SII->getSchedModel(); |
466 | |
467 | // Calculate the delay state for each basic block, iterating until we reach |
468 | // a fixed point. |
469 | SetVector<MachineBasicBlock *> WorkList; |
470 | for (auto &MBB : reverse(C&: MF)) |
471 | WorkList.insert(X: &MBB); |
472 | while (!WorkList.empty()) { |
473 | auto &MBB = *WorkList.pop_back_val(); |
474 | bool Changed = runOnMachineBasicBlock(MBB, Emit: false); |
475 | if (Changed) |
476 | WorkList.insert_range(R: MBB.successors()); |
477 | } |
478 | |
479 | LLVM_DEBUG(dbgs() << "Final pass over all BBs\n" ); |
480 | |
481 | // Make one last pass over all basic blocks to emit s_delay_alu |
482 | // instructions. |
483 | bool Changed = false; |
484 | for (auto &MBB : MF) |
485 | Changed |= runOnMachineBasicBlock(MBB, Emit: true); |
486 | return Changed; |
487 | } |
488 | }; |
489 | |
490 | class AMDGPUInsertDelayAluLegacy : public MachineFunctionPass { |
491 | public: |
492 | static char ID; |
493 | |
494 | AMDGPUInsertDelayAluLegacy() : MachineFunctionPass(ID) {} |
495 | |
496 | void getAnalysisUsage(AnalysisUsage &AU) const override { |
497 | AU.setPreservesCFG(); |
498 | MachineFunctionPass::getAnalysisUsage(AU); |
499 | } |
500 | |
501 | bool runOnMachineFunction(MachineFunction &MF) override { |
502 | if (skipFunction(F: MF.getFunction())) |
503 | return false; |
504 | AMDGPUInsertDelayAlu Impl; |
505 | return Impl.run(MF); |
506 | } |
507 | }; |
508 | } // namespace |
509 | |
// New pass-manager entry point: run the shared implementation and report
// which analyses survive.
PreservedAnalyses
AMDGPUInsertDelayAluPass::run(MachineFunction &MF,
                              MachineFunctionAnalysisManager &MFAM) {
  if (!AMDGPUInsertDelayAlu().run(MF))
    return PreservedAnalyses::all();
  // Instructions were inserted, but the CFG is unchanged.
  auto PA = getMachineFunctionPassPreservedAnalyses();
  PA.preserveSet<CFGAnalyses>();
  return PA;
}
519 | |
char AMDGPUInsertDelayAluLegacy::ID = 0;

// Externally visible handle used to refer to this pass by ID.
char &llvm::AMDGPUInsertDelayAluID = AMDGPUInsertDelayAluLegacy::ID;

INITIALIZE_PASS(AMDGPUInsertDelayAluLegacy, DEBUG_TYPE,
                "AMDGPU Insert Delay ALU", false, false)
526 | |