1//===- AMDGPUInsertDelayAlu.cpp - Insert s_delay_alu instructions ---------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Insert s_delay_alu instructions to avoid stalls on GFX11+.
11//
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPU.h"
15#include "GCNSubtarget.h"
16#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
17#include "SIInstrInfo.h"
18#include "llvm/ADT/SetVector.h"
19
20using namespace llvm;
21
22#define DEBUG_TYPE "amdgpu-insert-delay-alu"
23
24namespace {
25
// Worker that computes where s_delay_alu instructions are needed in a machine
// function and inserts them. Shared by the legacy and new-PM pass wrappers
// below.
class AMDGPUInsertDelayAlu {
public:
  const GCNSubtarget *ST;
  const SIInstrInfo *SII;
  const TargetRegisterInfo *TRI;

  // Scheduling model used to compute def-operand latencies.
  const TargetSchedModel *SchedModel;

  // Return true if MI waits for all outstanding VALU instructions to complete.
  static bool instructionWaitsForVALU(const MachineInstr &MI) {
    // These instruction types wait for VA_VDST==0 before issuing.
    const uint64_t VA_VDST_0 = SIInstrFlags::DS | SIInstrFlags::EXP |
                               SIInstrFlags::FLAT | SIInstrFlags::MIMG |
                               SIInstrFlags::MTBUF | SIInstrFlags::MUBUF;
    if (MI.getDesc().TSFlags & VA_VDST_0)
      return true;
    if (MI.getOpcode() == AMDGPU::S_SENDMSG_RTN_B32 ||
        MI.getOpcode() == AMDGPU::S_SENDMSG_RTN_B64)
      return true;
    // An explicit s_waitcnt_depctr whose VA_VDST field is zero also drains
    // all outstanding VALU writes.
    if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
        AMDGPU::DepCtr::decodeFieldVaVdst(MI.getOperand(0).getImm()) == 0)
      return true;
    return false;
  }

  // Return true if MI waits for all outstanding VALU writes to SGPRs to
  // complete before issuing.
  static bool instructionWaitsForSGPRWrites(const MachineInstr &MI) {
    // These instruction types wait for VA_SDST==0 before issuing.
    uint64_t MIFlags = MI.getDesc().TSFlags;
    if (MIFlags & SIInstrFlags::SMRD)
      return true;

    // Any SALU instruction that touches a register also waits.
    if (MIFlags & SIInstrFlags::SALU) {
      for (auto &Op : MI.operands()) {
        if (Op.isReg())
          return true;
      }
    }
    return false;
  }

  // Types of delay that can be encoded in an s_delay_alu instruction.
  enum DelayType { VALU, TRANS, SALU, OTHER };

  // Get the delay type for a MachineInstr.
  DelayType getDelayType(const MachineInstr &MI) {
    // Non-F64 TRANS instructions use a separate delay type.
    if (SIInstrInfo::isTRANS(MI) &&
        !AMDGPU::isDPMACCInstruction(MI.getOpcode()))
      return TRANS;
    // WMMA XDL ops are treated the same as TRANS.
    if (ST->hasGFX1250Insts() && SII->isXDLWMMA(MI))
      return TRANS;
    if (SIInstrInfo::isVALU(MI))
      return VALU;
    if (SIInstrInfo::isSALU(MI))
      return SALU;
    return OTHER;
  }

  // Information about the last instruction(s) that wrote to a particular
  // regunit. In straight-line code there will only be one such instruction, but
  // when control flow converges we merge the delay information from each path
  // to represent the union of the worst-case delays of each type.
  struct DelayInfo {
    // One larger than the maximum number of (non-TRANS) VALU instructions we
    // can encode in an s_delay_alu instruction.
    static constexpr unsigned VALU_MAX = 5;

    // One larger than the maximum number of TRANS instructions we can encode in
    // an s_delay_alu instruction.
    static constexpr unsigned TRANS_MAX = 4;

    // One larger than the maximum number of SALU cycles we can encode in an
    // s_delay_alu instruction.
    static constexpr unsigned SALU_CYCLES_MAX = 4;

    // If it was written by a (non-TRANS) VALU, remember how many clock cycles
    // are left until it completes, and how many other (non-TRANS) VALU we have
    // seen since it was issued.
    uint8_t VALUCycles = 0;
    uint8_t VALUNum = VALU_MAX;

    // If it was written by a TRANS, remember how many clock cycles are left
    // until it completes, and how many other TRANS we have seen since it was
    // issued.
    uint8_t TRANSCycles = 0;
    uint8_t TRANSNum = TRANS_MAX;
    // Also remember how many other (non-TRANS) VALU we have seen since it was
    // issued. When an instruction depends on both a prior TRANS and a prior
    // non-TRANS VALU, this is used to decide whether to encode a wait for just
    // one or both of them.
    uint8_t TRANSNumVALU = VALU_MAX;

    // If it was written by an SALU, remember how many clock cycles are left
    // until it completes.
    uint8_t SALUCycles = 0;

    DelayInfo() = default;

    // Record a single write of the given type with the given latency.
    DelayInfo(DelayType Type, unsigned Cycles) {
      switch (Type) {
      default:
        llvm_unreachable("unexpected type");
      case VALU:
        VALUCycles = Cycles;
        VALUNum = 0;
        break;
      case TRANS:
        TRANSCycles = Cycles;
        TRANSNum = 0;
        TRANSNumVALU = 0;
        break;
      case SALU:
        // Guard against pseudo-instructions like SI_CALL which are marked as
        // SALU but with a very high latency.
        SALUCycles = std::min(Cycles, SALU_CYCLES_MAX);
        break;
      }
    }

    bool operator==(const DelayInfo &RHS) const {
      return VALUCycles == RHS.VALUCycles && VALUNum == RHS.VALUNum &&
             TRANSCycles == RHS.TRANSCycles && TRANSNum == RHS.TRANSNum &&
             TRANSNumVALU == RHS.TRANSNumVALU && SALUCycles == RHS.SALUCycles;
    }

    bool operator!=(const DelayInfo &RHS) const { return !(*this == RHS); }

    // Merge another DelayInfo into this one, to represent the union of the
    // worst-case delays of each type.
    void merge(const DelayInfo &RHS) {
      // Cycles-remaining counters take the maximum (slowest path); the
      // instructions-since-issue counters take the minimum (most recent).
      VALUCycles = std::max(VALUCycles, RHS.VALUCycles);
      VALUNum = std::min(VALUNum, RHS.VALUNum);
      TRANSCycles = std::max(TRANSCycles, RHS.TRANSCycles);
      TRANSNum = std::min(TRANSNum, RHS.TRANSNum);
      TRANSNumVALU = std::min(TRANSNumVALU, RHS.TRANSNumVALU);
      SALUCycles = std::max(SALUCycles, RHS.SALUCycles);
    }

    // Update this DelayInfo after issuing an instruction of the specified type.
    // Cycles is the number of cycles it takes to issue the instruction. Return
    // true if there is no longer any useful delay info.
    bool advance(DelayType Type, unsigned Cycles) {
      bool Erase = true;

      VALUNum += (Type == VALU);
      if (VALUNum >= VALU_MAX || VALUCycles <= Cycles) {
        // Forget about the VALU instruction. It was too far back or has
        // definitely completed by now.
        VALUNum = VALU_MAX;
        VALUCycles = 0;
      } else {
        VALUCycles -= Cycles;
        Erase = false;
      }

      TRANSNum += (Type == TRANS);
      TRANSNumVALU += (Type == VALU);
      if (TRANSNum >= TRANS_MAX || TRANSCycles <= Cycles) {
        // Forget about any TRANS instruction. It was too far back or has
        // definitely completed by now.
        TRANSNum = TRANS_MAX;
        TRANSNumVALU = VALU_MAX;
        TRANSCycles = 0;
      } else {
        TRANSCycles -= Cycles;
        Erase = false;
      }

      if (SALUCycles <= Cycles) {
        // Forget about any SALU instruction. It has definitely completed by
        // now.
        SALUCycles = 0;
      } else {
        SALUCycles -= Cycles;
        Erase = false;
      }

      return Erase;
    }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    // Print only the fields that still carry useful information.
    void dump() const {
      if (VALUCycles)
        dbgs() << " VALUCycles=" << (int)VALUCycles;
      if (VALUNum < VALU_MAX)
        dbgs() << " VALUNum=" << (int)VALUNum;
      if (TRANSCycles)
        dbgs() << " TRANSCycles=" << (int)TRANSCycles;
      if (TRANSNum < TRANS_MAX)
        dbgs() << " TRANSNum=" << (int)TRANSNum;
      if (TRANSNumVALU < VALU_MAX)
        dbgs() << " TRANSNumVALU=" << (int)TRANSNumVALU;
      if (SALUCycles)
        dbgs() << " SALUCycles=" << (int)SALUCycles;
    }
#endif
  };

  // A map from regunits to the delay info for that regunit.
  struct DelayState : DenseMap<MCRegUnit, DelayInfo> {
    // Merge another DelayState into this one by merging the delay info for each
    // regunit.
    void merge(const DelayState &RHS) {
      for (const auto &KV : RHS) {
        iterator It;
        bool Inserted;
        std::tie(It, Inserted) = insert(KV);
        if (!Inserted)
          It->second.merge(KV.second);
      }
    }

    // Advance the delay info for each regunit, erasing any that are no longer
    // useful.
    void advance(DelayType Type, unsigned Cycles) {
      // Grab the successor up front because erase(I) invalidates I.
      iterator Next;
      for (auto I = begin(), E = end(); I != E; I = Next) {
        Next = std::next(I);
        if (I->second.advance(Type, Cycles))
          erase(I);
      }
    }

    // Drop every (non-TRANS) VALU entry that is at least VALUNum instructions
    // old: the implicit wait that triggered this call already covers them.
    void advanceByVALUNum(unsigned VALUNum) {
      iterator Next;
      for (auto I = begin(), E = end(); I != E; I = Next) {
        Next = std::next(I);
        if (I->second.VALUNum >= VALUNum && I->second.VALUCycles > 0) {
          erase(I);
        }
      }
    }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    void dump(const TargetRegisterInfo *TRI) const {
      if (empty()) {
        dbgs() << "    empty\n";
        return;
      }

      // Dump DelayInfo for each RegUnit in numerical order.
      SmallVector<const_iterator, 8> Order;
      Order.reserve(size());
      for (const_iterator I = begin(), E = end(); I != E; ++I)
        Order.push_back(I);
      llvm::sort(Order, [](const const_iterator &A, const const_iterator &B) {
        return A->first < B->first;
      });
      for (const_iterator I : Order) {
        dbgs() << "    " << printRegUnit(I->first, TRI);
        I->second.dump();
        dbgs() << "\n";
      }
    }
#endif
  };

  // The saved delay state at the end of each basic block.
  DenseMap<MachineBasicBlock *, DelayState> BlockState;

  // Emit an s_delay_alu instruction if necessary before MI.
  //
  // The immediate is assembled as: first instruction-id in bits [3:0] (mask
  // 0xf), a skip count in bits [6:4], and a second instruction-id in bits
  // [10:7] (mask 0x780) -- hence the shifts by 4 and 7 below. Returns the
  // emitted instruction when it still has a free second slot that a later
  // call may fill in, otherwise nullptr (or LastDelayAlu unchanged when
  // nothing was emitted).
  MachineInstr *emitDelayAlu(MachineInstr &MI, DelayInfo Delay,
                             MachineInstr *LastDelayAlu) {
    unsigned Imm = 0;

    // Wait for a TRANS instruction. (TRANS dependencies encode as 4 + N.)
    if (Delay.TRANSNum < DelayInfo::TRANS_MAX)
      Imm |= 4 + Delay.TRANSNum;

    // Wait for a VALU instruction (if it's more recent than any TRANS
    // instruction that we're also waiting for).
    if (Delay.VALUNum < DelayInfo::VALU_MAX &&
        Delay.VALUNum <= Delay.TRANSNumVALU) {
      if (Imm & 0xf)
        Imm |= Delay.VALUNum << 7;
      else
        Imm |= Delay.VALUNum;
    }

    // Wait for an SALU instruction. (SALU cycle waits encode as 8 + N.)
    if (Delay.SALUCycles) {
      assert(Delay.SALUCycles < DelayInfo::SALU_CYCLES_MAX);
      if (Imm & 0x780) {
        // We have already encoded a VALU and a TRANS delay. There's no room in
        // the encoding for an SALU delay as well, so just drop it.
      } else if (Imm & 0xf) {
        Imm |= (Delay.SALUCycles + 8) << 7;
      } else {
        Imm |= Delay.SALUCycles + 8;
      }
    }

    // Don't emit the s_delay_alu instruction if there's nothing to wait for.
    if (!Imm)
      return LastDelayAlu;

    // If we only need to wait for one instruction, try encoding it in the last
    // s_delay_alu that we emitted.
    if (!(Imm & 0x780) && LastDelayAlu) {
      // Count the real instructions between the previous s_delay_alu and MI
      // to see whether MI is close enough for the skip field.
      unsigned Skip = 0;
      for (auto I = MachineBasicBlock::instr_iterator(LastDelayAlu),
                E = MachineBasicBlock::instr_iterator(MI);
           ++I != E;) {
        if (I->getOpcode() == AMDGPU::S_SET_VGPR_MSB) {
          // It is not deterministic whether the skip count counts
          // S_SET_VGPR_MSB instructions or not, so do not include them in a
          // skip region.
          Skip = 6;
          break;
        }
        if (!I->isBundle() && !I->isMetaInstruction())
          ++Skip;
      }
      if (Skip < 6) {
        // Splice this delay into the previous instruction's free second slot.
        MachineOperand &Op = LastDelayAlu->getOperand(0);
        unsigned LastImm = Op.getImm();
        assert((LastImm & ~0xf) == 0 &&
               "Remembered an s_delay_alu with no room for another delay!");
        LastImm |= Imm << 7 | Skip << 4;
        Op.setImm(LastImm);
        return nullptr;
      }
    }

    auto &MBB = *MI.getParent();
    MachineInstr *DelayAlu =
        BuildMI(MBB, MI, DebugLoc(), SII->get(AMDGPU::S_DELAY_ALU)).addImm(Imm);
    // Remember the s_delay_alu for next time if there is still room in it to
    // encode another delay.
    return (Imm & 0x780) ? nullptr : DelayAlu;
  }

  // Process one basic block: with Emit=false just recompute its end-of-block
  // delay state; with Emit=true also insert the s_delay_alu instructions.
  // Returns true if the recorded end-of-block state changed.
  //
  // NOTE(review): in Emit mode Changed is never set, even when emitDelayAlu
  // inserts instructions, so the final pass always reports "no change" to its
  // caller -- confirm this is intended before relying on the return value.
  bool runOnMachineBasicBlock(MachineBasicBlock &MBB, bool Emit) {
    // Start from the union of the predecessors' end-of-block states.
    DelayState State;
    for (auto *Pred : MBB.predecessors())
      State.merge(BlockState[Pred]);

    LLVM_DEBUG(dbgs() << "  State at start of " << printMBBReference(MBB)
                      << "\n";
               State.dump(TRI););

    bool Changed = false;
    MachineInstr *LastDelayAlu = nullptr;

    // FIXME: 0 is a valid register unit.
    MCRegUnit LastSGPRFromVALU = static_cast<MCRegUnit>(0);
    // Iterate over the contents of bundles, but don't emit any instructions
    // inside a bundle.
    for (auto &MI : MBB.instrs()) {
      if (MI.isBundle() || MI.isMetaInstruction())
        continue;

      // Ignore some more instructions that do not generate any code.
      switch (MI.getOpcode()) {
      case AMDGPU::SI_RETURN_TO_EPILOG:
        continue;
      }

      DelayType Type = getDelayType(MI);

      if (instructionWaitsForSGPRWrites(MI)) {
        // MI implicitly waits for the VALU that last wrote an SGPR, which
        // also covers every at-least-as-old VALU write we are tracking.
        auto It = State.find(LastSGPRFromVALU);
        if (It != State.end()) {
          DelayInfo Info = It->getSecond();
          State.advanceByVALUNum(Info.VALUNum);
          // FIXME: 0 is a valid register unit.
          LastSGPRFromVALU = static_cast<MCRegUnit>(0);
        }
      }

      if (instructionWaitsForVALU(MI)) {
        // Forget about all outstanding VALU delays.
        // TODO: This is overkill since it also forgets about SALU delays.
        State = DelayState();
      } else if (Type != OTHER) {
        DelayInfo Delay;
        // TODO: Scan implicit uses too?
        for (const auto &Op : MI.explicit_uses()) {
          if (Op.isReg()) {
            // One of the operands of the writelane is also the output operand.
            // This creates the insertion of redundant delays. Hence, we have to
            // ignore this operand.
            if (MI.getOpcode() == AMDGPU::V_WRITELANE_B32 && Op.isTied())
              continue;
            for (MCRegUnit Unit : TRI->regunits(Op.getReg())) {
              auto It = State.find(Unit);
              if (It != State.end()) {
                // Accumulate the worst-case delay for this use and stop
                // tracking the regunit: the wait we emit satisfies it.
                Delay.merge(It->second);
                State.erase(Unit);
              }
            }
          }
        }

        // Remember the first SGPR def of a VALU so that a later implicit
        // VA_SDST wait (see above) can prune the state.
        if (SII->isVALU(MI.getOpcode())) {
          for (const auto &Op : MI.defs()) {
            Register Reg = Op.getReg();
            if (AMDGPU::isSGPR(Reg, TRI)) {
              LastSGPRFromVALU = *TRI->regunits(Reg).begin();
              break;
            }
          }
        }

        if (Emit && !MI.isBundledWithPred()) {
          // TODO: For VALU->SALU delays should we use s_delay_alu or s_nop or
          // just ignore them?
          LastDelayAlu = emitDelayAlu(MI, Delay, LastDelayAlu);
        }
      }

      if (Type != OTHER) {
        // Record this instruction's writes so later readers can wait on them.
        // TODO: Scan implicit defs too?
        for (const auto &Op : MI.defs()) {
          unsigned Latency = SchedModel->computeOperandLatency(
              &MI, Op.getOperandNo(), nullptr, 0);
          for (MCRegUnit Unit : TRI->regunits(Op.getReg()))
            State[Unit] = DelayInfo(Type, Latency);
        }
      }

      // Advance by the number of cycles it takes to issue this instruction.
      // TODO: Use a more advanced model that accounts for instructions that
      // take multiple cycles to issue on a particular pipeline.
      unsigned Cycles = SIInstrInfo::getNumWaitStates(MI);
      // TODO: In wave64 mode, double the number of cycles for VALU and VMEM
      // instructions on the assumption that they will usually have to be issued
      // twice?
      State.advance(Type, Cycles);

      LLVM_DEBUG(dbgs() << "  State after " << MI; State.dump(TRI););
    }

    if (Emit) {
      assert(State == BlockState[&MBB] &&
             "Basic block state should not have changed on final pass!");
    } else if (DelayState &BS = BlockState[&MBB]; State != BS) {
      BS = std::move(State);
      Changed = true;
    }
    return Changed;
  }

  // Entry point: compute the per-block delay states to a fixed point, then
  // make a final pass that actually inserts the s_delay_alu instructions.
  bool run(MachineFunction &MF) {
    LLVM_DEBUG(dbgs() << "AMDGPUInsertDelayAlu running on " << MF.getName()
                      << "\n");

    ST = &MF.getSubtarget<GCNSubtarget>();
    // Only GFX11+ subtargets have s_delay_alu.
    if (!ST->hasDelayAlu())
      return false;

    SII = ST->getInstrInfo();
    TRI = ST->getRegisterInfo();
    SchedModel = &SII->getSchedModel();

    // Calculate the delay state for each basic block, iterating until we reach
    // a fixed point.
    SetVector<MachineBasicBlock *> WorkList;
    for (auto &MBB : reverse(MF))
      WorkList.insert(&MBB);
    while (!WorkList.empty()) {
      auto &MBB = *WorkList.pop_back_val();
      bool Changed = runOnMachineBasicBlock(MBB, false);
      if (Changed)
        WorkList.insert_range(MBB.successors());
    }

    LLVM_DEBUG(dbgs() << "Final pass over all BBs\n");

    // Make one last pass over all basic blocks to emit s_delay_alu
    // instructions.
    bool Changed = false;
    for (auto &MBB : MF)
      Changed |= runOnMachineBasicBlock(MBB, true);
    return Changed;
  }
};
504
505class AMDGPUInsertDelayAluLegacy : public MachineFunctionPass {
506public:
507 static char ID;
508
509 AMDGPUInsertDelayAluLegacy() : MachineFunctionPass(ID) {}
510
511 void getAnalysisUsage(AnalysisUsage &AU) const override {
512 AU.setPreservesCFG();
513 MachineFunctionPass::getAnalysisUsage(AU);
514 }
515
516 bool runOnMachineFunction(MachineFunction &MF) override {
517 if (skipFunction(F: MF.getFunction()))
518 return false;
519 AMDGPUInsertDelayAlu Impl;
520 return Impl.run(MF);
521 }
522};
523} // namespace
524
// New pass manager entry point: delegate to the shared worker and report
// which analyses survive.
PreservedAnalyses
AMDGPUInsertDelayAluPass::run(MachineFunction &MF,
                              MachineFunctionAnalysisManager &MFAM) {
  if (!AMDGPUInsertDelayAlu().run(MF))
    return PreservedAnalyses::all();
  // Instructions were inserted, but the CFG shape is unchanged.
  auto PA = getMachineFunctionPassPreservedAnalyses();
  PA.preserveSet<CFGAnalyses>();
  return PA;
}
534
char AMDGPUInsertDelayAluLegacy::ID = 0;

// Opaque pass identifier other code can use to reference this pass.
char &llvm::AMDGPUInsertDelayAluID = AMDGPUInsertDelayAluLegacy::ID;

// Register the legacy pass with the pass registry.
INITIALIZE_PASS(AMDGPUInsertDelayAluLegacy, DEBUG_TYPE,
                "AMDGPU Insert Delay ALU", false, false)
541