1//===-- GCNHazardRecognizers.cpp - GCN Hazard Recognizer Impls ------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements hazard recognizers for scheduling on GCN processors.
10//
11//===----------------------------------------------------------------------===//
12
13#include "GCNHazardRecognizer.h"
14#include "GCNSubtarget.h"
15#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
16#include "SIMachineFunctionInfo.h"
17#include "llvm/CodeGen/MachineFrameInfo.h"
18#include "llvm/CodeGen/MachineFunction.h"
19#include "llvm/CodeGen/ScheduleDAG.h"
20#include "llvm/TargetParser/TargetParser.h"
21
22using namespace llvm;
23
24namespace {
25
26struct MFMAPaddingRatioParser : public cl::parser<unsigned> {
27 MFMAPaddingRatioParser(cl::Option &O) : cl::parser<unsigned>(O) {}
28
29 bool parse(cl::Option &O, StringRef ArgName, StringRef Arg, unsigned &Value) {
30 if (Arg.getAsInteger(Radix: 0, Result&: Value))
31 return O.error(Message: "'" + Arg + "' value invalid for uint argument!");
32
33 if (Value > 100)
34 return O.error(Message: "'" + Arg + "' value must be in the range [0, 100]!");
35
36 return false;
37 }
38};
39
40} // end anonymous namespace
41
42static cl::opt<unsigned, false, MFMAPaddingRatioParser>
43 MFMAPaddingRatio("amdgpu-mfma-padding-ratio", cl::init(Val: 0), cl::Hidden,
44 cl::desc("Fill a percentage of the latency between "
45 "neighboring MFMA with s_nops."));
46
47//===----------------------------------------------------------------------===//
48// Hazard Recognizer Implementation
49//===----------------------------------------------------------------------===//
50
51static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
52 const GCNSubtarget &ST);
53
54GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) :
55 IsHazardRecognizerMode(false),
56 CurrCycleInstr(nullptr),
57 MF(MF),
58 ST(MF.getSubtarget<GCNSubtarget>()),
59 TII(*ST.getInstrInfo()),
60 TRI(TII.getRegisterInfo()),
61 ClauseUses(TRI.getNumRegUnits()),
62 ClauseDefs(TRI.getNumRegUnits()) {
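  // If AGPRs are in use the function most likely contains MFMA instructions,
  // whose hazards need a much deeper lookahead window than the default of 5
  // wait states.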
63 MaxLookAhead = MF.getRegInfo().isPhysRegUsed(PhysReg: AMDGPU::AGPR0) ? 19 : 5;
64 TSchedModel.init(TSInfo: &ST);
65 RunLdsBranchVmemWARHazardFixup = shouldRunLdsBranchVmemWARHazardFixup(MF, ST);
66}
67
68void GCNHazardRecognizer::Reset() {
69 EmittedInstrs.clear();
70}
71
72void GCNHazardRecognizer::EmitInstruction(SUnit *SU) {
73 EmitInstruction(MI: SU->getInstr());
74}
75
76void GCNHazardRecognizer::EmitInstruction(MachineInstr *MI) {
77 CurrCycleInstr = MI;
78}
79
80static bool isDivFMas(unsigned Opcode) {
81 return Opcode == AMDGPU::V_DIV_FMAS_F32_e64 || Opcode == AMDGPU::V_DIV_FMAS_F64_e64;
82}
83
84static bool isSGetReg(unsigned Opcode) {
85 return Opcode == AMDGPU::S_GETREG_B32;
86}
87
88static bool isSSetReg(unsigned Opcode) {
89 switch (Opcode) {
90 case AMDGPU::S_SETREG_B32:
91 case AMDGPU::S_SETREG_B32_mode:
92 case AMDGPU::S_SETREG_IMM32_B32:
93 case AMDGPU::S_SETREG_IMM32_B32_mode:
94 return true;
95 }
96 return false;
97}
98
99static bool isRWLane(unsigned Opcode) {
100 return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;
101}
102
103static bool isRFE(unsigned Opcode) {
104 return Opcode == AMDGPU::S_RFE_B64;
105}
106
107static bool isSMovRel(unsigned Opcode) {
108 switch (Opcode) {
109 case AMDGPU::S_MOVRELS_B32:
110 case AMDGPU::S_MOVRELS_B64:
111 case AMDGPU::S_MOVRELD_B32:
112 case AMDGPU::S_MOVRELD_B64:
113 return true;
114 default:
115 return false;
116 }
117}
118
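// Classify MAI instructions: isDGEMM matches the double-precision MFMA
// opcodes, and isXDL matches MAI instructions that execute on the XDL pipe
// (everything except DGEMM and the ACCVGPR read/write pseudos; on GFX940+ the
// opcode table is consulted directly).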
119static bool isDGEMM(unsigned Opcode) {
120 return AMDGPU::getMAIIsDGEMM(Opc: Opcode);
121}
122
123static bool isXDL(const GCNSubtarget &ST, const MachineInstr &MI) {
124 unsigned Opcode = MI.getOpcode();
125
126 if (!SIInstrInfo::isMAI(MI) ||
127 isDGEMM(Opcode) ||
128 Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
129 Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64)
130 return false;
131
132 if (!ST.hasGFX940Insts())
133 return true;
134
135 return AMDGPU::getMAIIsGFX940XDL(Opc: Opcode);
136}
137
138static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII,
139 const MachineInstr &MI) {
140 if (TII.isAlwaysGDS(Opcode: MI.getOpcode()))
141 return true;
142
143 switch (MI.getOpcode()) {
144 case AMDGPU::S_SENDMSG:
145 case AMDGPU::S_SENDMSGHALT:
146 case AMDGPU::S_TTRACEDATA:
147 return true;
148 // These DS opcodes don't support GDS.
149 case AMDGPU::DS_NOP:
150 case AMDGPU::DS_PERMUTE_B32:
151 case AMDGPU::DS_BPERMUTE_B32:
152 return false;
153 default:
154 if (TII.isDS(Opcode: MI.getOpcode())) {
155 int GDS = AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(),
156 NamedIdx: AMDGPU::OpName::gds);
157 if (MI.getOperand(i: GDS).getImm())
158 return true;
159 }
160 return false;
161 }
162}
163
164static bool isPermlane(const MachineInstr &MI) {
165 unsigned Opcode = MI.getOpcode();
166 return Opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
167 Opcode == AMDGPU::V_PERMLANE64_B32 ||
168 Opcode == AMDGPU::V_PERMLANEX16_B32_e64 ||
169 Opcode == AMDGPU::V_PERMLANE16_VAR_B32_e64 ||
170 Opcode == AMDGPU::V_PERMLANEX16_VAR_B32_e64;
171}
172
173static bool isLdsDma(const MachineInstr &MI) {
174 return SIInstrInfo::isVALU(MI) &&
175 (SIInstrInfo::isMUBUF(MI) || SIInstrInfo::isFLAT(MI));
176}
177
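// Decode the hardware register id from the simm16 operand of an
// S_GETREG/S_SETREG style instruction.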
178static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
179 const MachineOperand *RegOp = TII->getNamedOperand(MI: RegInstr,
180 OpName: AMDGPU::OpName::simm16);
181 return std::get<0>(t: AMDGPU::Hwreg::HwregEncoding::decode(Encoded: RegOp->getImm()));
182}
183
184ScheduleHazardRecognizer::HazardType
185GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
186 MachineInstr *MI = SU->getInstr();
187 // If we are not in "HazardRecognizerMode" and therefore not being run from
188 // the scheduler, track possible stalls from hazards but don't insert noops.
189 auto HazardType = IsHazardRecognizerMode ? NoopHazard : Hazard;
190
191 if (MI->isBundle())
192 return NoHazard;
193
194 if (SIInstrInfo::isSMRD(MI: *MI) && checkSMRDHazards(SMRD: MI) > 0)
195 return HazardType;
196
197 if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0)
198 return HazardType;
199
200 if (checkFPAtomicToDenormModeHazard(MI) > 0)
201 return HazardType;
202
203 if (ST.hasNoDataDepHazard())
204 return NoHazard;
205
206 // FIXME: Should flat be considered vmem?
207 if ((SIInstrInfo::isVMEM(MI: *MI) ||
208 SIInstrInfo::isFLAT(MI: *MI))
209 && checkVMEMHazards(VMEM: MI) > 0)
210 return HazardType;
211
212 if (SIInstrInfo::isVALU(MI: *MI) && checkVALUHazards(VALU: MI) > 0)
213 return HazardType;
214
215 if (SIInstrInfo::isDPP(MI: *MI) && checkDPPHazards(DPP: MI) > 0)
216 return HazardType;
217
218 if (isDivFMas(Opcode: MI->getOpcode()) && checkDivFMasHazards(DivFMas: MI) > 0)
219 return HazardType;
220
221 if (isRWLane(Opcode: MI->getOpcode()) && checkRWLaneHazards(RWLane: MI) > 0)
222 return HazardType;
223
224 if ((SIInstrInfo::isVALU(MI: *MI) || SIInstrInfo::isVMEM(MI: *MI) ||
225 SIInstrInfo::isFLAT(MI: *MI) || SIInstrInfo::isDS(MI: *MI) ||
226 SIInstrInfo::isEXP(MI: *MI)) && checkMAIVALUHazards(MI) > 0)
227 return HazardType;
228
229 if (isSGetReg(Opcode: MI->getOpcode()) && checkGetRegHazards(GetRegInstr: MI) > 0)
230 return HazardType;
231
232 if (isSSetReg(Opcode: MI->getOpcode()) && checkSetRegHazards(SetRegInstr: MI) > 0)
233 return HazardType;
234
235 if (isRFE(Opcode: MI->getOpcode()) && checkRFEHazards(RFE: MI) > 0)
236 return HazardType;
237
238 if (((ST.hasReadM0MovRelInterpHazard() &&
239 (TII.isVINTRP(MI: *MI) || isSMovRel(Opcode: MI->getOpcode()) ||
240 MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
241 MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
242 (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, MI: *MI)) ||
243 (ST.hasReadM0LdsDmaHazard() && isLdsDma(MI: *MI)) ||
244 (ST.hasReadM0LdsDirectHazard() &&
245 MI->readsRegister(Reg: AMDGPU::LDS_DIRECT, /*TRI=*/nullptr))) &&
246 checkReadM0Hazards(SMovRel: MI) > 0)
247 return HazardType;
248
249 if (SIInstrInfo::isMAI(MI: *MI) && checkMAIHazards(MI) > 0)
250 return HazardType;
251
252 if ((SIInstrInfo::isVMEM(MI: *MI) ||
253 SIInstrInfo::isFLAT(MI: *MI) ||
254 SIInstrInfo::isDS(MI: *MI)) && checkMAILdStHazards(MI) > 0)
255 return HazardType;
256
257 if (MI->isInlineAsm() && checkInlineAsmHazards(IA: MI) > 0)
258 return HazardType;
259
260 return NoHazard;
261}
262
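// Emit S_NOPs in front of MI inside its bundle. A single S_NOP encodes 1 to 8
// wait states (imm + 1), so larger quantities are split into chunks of at
// most 8.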
263static void insertNoopsInBundle(MachineInstr *MI, const SIInstrInfo &TII,
264 unsigned Quantity) {
265 while (Quantity > 0) {
266 unsigned Arg = std::min(a: Quantity, b: 8u);
267 Quantity -= Arg;
268 BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(), MCID: TII.get(Opcode: AMDGPU::S_NOP))
269 .addImm(Val: Arg - 1);
270 }
271}
272
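// Pipeline latency of an MFMA, taken from the ReleaseAtCycle of the first
// write resource of MI's scheduling class in the machine model.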
273unsigned
274GCNHazardRecognizer::getMFMAPipelineWaitStates(const MachineInstr &MI) const {
275 const MCSchedClassDesc *SC = TSchedModel.resolveSchedClass(MI: &MI);
276 assert(TSchedModel.getWriteProcResBegin(SC) !=
277 TSchedModel.getWriteProcResEnd(SC));
278 return TSchedModel.getWriteProcResBegin(SC)->ReleaseAtCycle;
279}
280
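// Run the hazard checks on each instruction inside the bundle headed by
// CurrCycleInstr and, when running as a standalone pass, insert the required
// S_NOPs directly into the bundle.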
281void GCNHazardRecognizer::processBundle() {
282 MachineBasicBlock::instr_iterator MI = std::next(x: CurrCycleInstr->getIterator());
283 MachineBasicBlock::instr_iterator E = CurrCycleInstr->getParent()->instr_end();
284 // Check bundled MachineInstr's for hazards.
285 for (; MI != E && MI->isInsideBundle(); ++MI) {
286 CurrCycleInstr = &*MI;
287 unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr);
288
289 if (IsHazardRecognizerMode) {
290 fixHazards(MI: CurrCycleInstr);
291
292 insertNoopsInBundle(MI: CurrCycleInstr, TII, Quantity: WaitStates);
293 }
294
295 // It’s unnecessary to track more than MaxLookAhead instructions. Since we
296 // include the bundled MI directly after, only add a maximum of
297 // (MaxLookAhead - 1) noops to EmittedInstrs.
298 for (unsigned i = 0, e = std::min(a: WaitStates, b: MaxLookAhead - 1); i < e; ++i)
299 EmittedInstrs.push_front(x: nullptr);
300
301 EmittedInstrs.push_front(x: CurrCycleInstr);
302 EmittedInstrs.resize(new_size: MaxLookAhead);
303 }
304 CurrCycleInstr = nullptr;
305}
306
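// Per-instruction driver for the standalone hazard-recognizer pass: compute
// and insert any noops required before MI (inside the bundle if necessary),
// then record MI and advance the cycle.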
307void GCNHazardRecognizer::runOnInstruction(MachineInstr *MI) {
308 assert(IsHazardRecognizerMode);
309
310 unsigned NumPreNoops = PreEmitNoops(MI);
311 EmitNoops(Quantity: NumPreNoops);
312 if (MI->isInsideBundle())
313 insertNoopsInBundle(MI, TII, Quantity: NumPreNoops);
314 else
315 TII.insertNoops(MBB&: *MI->getParent(), MI: MachineBasicBlock::iterator(MI),
316 Quantity: NumPreNoops);
317 EmitInstruction(MI);
318 AdvanceCycle();
319}
320
321unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
322 IsHazardRecognizerMode = true;
323 CurrCycleInstr = MI;
324 unsigned W = PreEmitNoopsCommon(MI);
325 fixHazards(MI);
326 CurrCycleInstr = nullptr;
327 return W;
328}
329
330unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
331 if (MI->isBundle())
332 return 0;
333
334 int WaitStates = 0;
335
336 if (SIInstrInfo::isSMRD(MI: *MI))
337 return std::max(a: WaitStates, b: checkSMRDHazards(SMRD: MI));
338
339 if (ST.hasNSAtoVMEMBug())
340 WaitStates = std::max(a: WaitStates, b: checkNSAtoVMEMHazard(MI));
341
342 WaitStates = std::max(a: WaitStates, b: checkFPAtomicToDenormModeHazard(MI));
343
344 if (ST.hasNoDataDepHazard())
345 return WaitStates;
346
347 if (SIInstrInfo::isVMEM(MI: *MI) || SIInstrInfo::isFLAT(MI: *MI))
348 WaitStates = std::max(a: WaitStates, b: checkVMEMHazards(VMEM: MI));
349
350 if (SIInstrInfo::isVALU(MI: *MI))
351 WaitStates = std::max(a: WaitStates, b: checkVALUHazards(VALU: MI));
352
353 if (SIInstrInfo::isDPP(MI: *MI))
354 WaitStates = std::max(a: WaitStates, b: checkDPPHazards(DPP: MI));
355
356 if (isDivFMas(Opcode: MI->getOpcode()))
357 WaitStates = std::max(a: WaitStates, b: checkDivFMasHazards(DivFMas: MI));
358
359 if (isRWLane(Opcode: MI->getOpcode()))
360 WaitStates = std::max(a: WaitStates, b: checkRWLaneHazards(RWLane: MI));
361
362 if ((SIInstrInfo::isVALU(MI: *MI) || SIInstrInfo::isVMEM(MI: *MI) ||
363 SIInstrInfo::isFLAT(MI: *MI) || SIInstrInfo::isDS(MI: *MI) ||
364 SIInstrInfo::isEXP(MI: *MI)) && checkMAIVALUHazards(MI) > 0)
365 WaitStates = std::max(a: WaitStates, b: checkMAIVALUHazards(MI));
366
367 if (MI->isInlineAsm())
368 return std::max(a: WaitStates, b: checkInlineAsmHazards(IA: MI));
369
370 if (isSGetReg(Opcode: MI->getOpcode()))
371 return std::max(a: WaitStates, b: checkGetRegHazards(GetRegInstr: MI));
372
373 if (isSSetReg(Opcode: MI->getOpcode()))
374 return std::max(a: WaitStates, b: checkSetRegHazards(SetRegInstr: MI));
375
376 if (isRFE(Opcode: MI->getOpcode()))
377 return std::max(a: WaitStates, b: checkRFEHazards(RFE: MI));
378
379 if ((ST.hasReadM0MovRelInterpHazard() &&
380 (TII.isVINTRP(MI: *MI) || isSMovRel(Opcode: MI->getOpcode()) ||
381 MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
382 MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
383 (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, MI: *MI)) ||
384 (ST.hasReadM0LdsDmaHazard() && isLdsDma(MI: *MI)) ||
385 (ST.hasReadM0LdsDirectHazard() &&
386 MI->readsRegister(Reg: AMDGPU::LDS_DIRECT, /*TRI=*/nullptr)))
387 return std::max(a: WaitStates, b: checkReadM0Hazards(SMovRel: MI));
388
389 if (SIInstrInfo::isMAI(MI: *MI))
390 return std::max(a: WaitStates, b: checkMAIHazards(MI));
391
392 if (SIInstrInfo::isVMEM(MI: *MI) ||
393 SIInstrInfo::isFLAT(MI: *MI) ||
394 SIInstrInfo::isDS(MI: *MI))
395 return std::max(a: WaitStates, b: checkMAILdStHazards(MI));
396
397 return WaitStates;
398}
399
400void GCNHazardRecognizer::EmitNoop() {
401 EmittedInstrs.push_front(x: nullptr);
402}
403
404void GCNHazardRecognizer::AdvanceCycle() {
405 // When the scheduler detects a stall, it will call AdvanceCycle() without
406 // emitting any instructions.
407 if (!CurrCycleInstr) {
408 EmittedInstrs.push_front(x: nullptr);
409 return;
410 }
411
412 if (CurrCycleInstr->isBundle()) {
413 processBundle();
414 return;
415 }
416
417 unsigned NumWaitStates = TII.getNumWaitStates(MI: *CurrCycleInstr);
418 if (!NumWaitStates) {
419 CurrCycleInstr = nullptr;
420 return;
421 }
422
423 // Keep track of emitted instructions
424 EmittedInstrs.push_front(x: CurrCycleInstr);
425
426 // Add a nullptr for each additional wait state after the first. Make sure
427 // not to add more than getMaxLookAhead() items to the list, since we
428 // truncate the list to that size right after this loop.
429 for (unsigned i = 1, e = std::min(a: NumWaitStates, b: getMaxLookAhead());
430 i < e; ++i) {
431 EmittedInstrs.push_front(x: nullptr);
432 }
433
  // getMaxLookAhead() is the largest number of wait states we will ever need
  // to insert, so there is no point in keeping track of more than that many
  // wait states.
437 EmittedInstrs.resize(new_size: getMaxLookAhead());
438
439 CurrCycleInstr = nullptr;
440}
441
442void GCNHazardRecognizer::RecedeCycle() {
443 llvm_unreachable("hazard recognizer does not support bottom-up scheduling.");
444}
445
446//===----------------------------------------------------------------------===//
447// Helper Functions
448//===----------------------------------------------------------------------===//
449
450using HazardFnResult = enum { HazardFound, HazardExpired, NoHazardFound };
451
452using IsExpiredFn = function_ref<bool(const MachineInstr &, int WaitStates)>;
453using GetNumWaitStatesFn = function_ref<unsigned int(const MachineInstr &)>;
454
455// Search for a hazard in a block and its predecessors.
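// IsHazard reports HazardFound / HazardExpired / NoHazardFound for each
// instruction scanned in reverse; UpdateState folds scanned instructions into
// the caller-provided state. BUNDLE headers are skipped entirely, while inline
// asm and meta instructions are checked but do not update the state.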
456template <typename StateT>
457static bool
458hasHazard(StateT State,
459 function_ref<HazardFnResult(StateT &, const MachineInstr &)> IsHazard,
460 function_ref<void(StateT &, const MachineInstr &)> UpdateState,
461 const MachineBasicBlock *MBB,
462 MachineBasicBlock::const_reverse_instr_iterator I,
463 DenseSet<const MachineBasicBlock *> &Visited) {
464 for (auto E = MBB->instr_rend(); I != E; ++I) {
465 // No need to look at parent BUNDLE instructions.
466 if (I->isBundle())
467 continue;
468
469 switch (IsHazard(State, *I)) {
470 case HazardFound:
471 return true;
472 case HazardExpired:
473 return false;
474 default:
475 // Continue search
476 break;
477 }
478
479 if (I->isInlineAsm() || I->isMetaInstruction())
480 continue;
481
482 UpdateState(State, *I);
483 }
484
485 for (MachineBasicBlock *Pred : MBB->predecessors()) {
486 if (!Visited.insert(V: Pred).second)
487 continue;
488
489 if (hasHazard(State, IsHazard, UpdateState, Pred, Pred->instr_rbegin(),
490 Visited))
491 return true;
492 }
493
494 return false;
495}
496
// Returns the minimum number of wait states since \p I, walking all
// predecessors. Only scans until \p IsExpired returns true.
// Can only be run in hazard recognizer mode.
500static int getWaitStatesSince(
501 GCNHazardRecognizer::IsHazardFn IsHazard, const MachineBasicBlock *MBB,
502 MachineBasicBlock::const_reverse_instr_iterator I, int WaitStates,
503 IsExpiredFn IsExpired, DenseSet<const MachineBasicBlock *> &Visited,
504 GetNumWaitStatesFn GetNumWaitStates = SIInstrInfo::getNumWaitStates) {
505 for (auto E = MBB->instr_rend(); I != E; ++I) {
506 // Don't add WaitStates for parent BUNDLE instructions.
507 if (I->isBundle())
508 continue;
509
510 if (IsHazard(*I))
511 return WaitStates;
512
513 if (I->isInlineAsm())
514 continue;
515
516 WaitStates += GetNumWaitStates(*I);
517
518 if (IsExpired(*I, WaitStates))
519 return std::numeric_limits<int>::max();
520 }
521
522 int MinWaitStates = std::numeric_limits<int>::max();
523 for (MachineBasicBlock *Pred : MBB->predecessors()) {
524 if (!Visited.insert(V: Pred).second)
525 continue;
526
527 int W = getWaitStatesSince(IsHazard, MBB: Pred, I: Pred->instr_rbegin(), WaitStates,
528 IsExpired, Visited, GetNumWaitStates);
529
530 MinWaitStates = std::min(a: MinWaitStates, b: W);
531 }
532
533 return MinWaitStates;
534}
535
536static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
537 const MachineInstr *MI, IsExpiredFn IsExpired) {
538 DenseSet<const MachineBasicBlock *> Visited;
539 return getWaitStatesSince(IsHazard, MBB: MI->getParent(),
540 I: std::next(x: MI->getReverseIterator()),
541 WaitStates: 0, IsExpired, Visited);
542}
543
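// In hazard-recognizer mode, walk the CFG backwards from CurrCycleInstr for up
// to Limit wait states; otherwise scan the scheduler's EmittedInstrs history,
// counting one wait state per recorded slot. Returns INT_MAX when no hazard is
// found within Limit.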
544int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
545 if (IsHazardRecognizerMode) {
546 auto IsExpiredFn = [Limit](const MachineInstr &, int WaitStates) {
547 return WaitStates >= Limit;
548 };
549 return ::getWaitStatesSince(IsHazard, MI: CurrCycleInstr, IsExpired: IsExpiredFn);
550 }
551
552 int WaitStates = 0;
553 for (MachineInstr *MI : EmittedInstrs) {
554 if (MI) {
555 if (IsHazard(*MI))
556 return WaitStates;
557
558 if (MI->isInlineAsm())
559 continue;
560 }
561 ++WaitStates;
562
563 if (WaitStates >= Limit)
564 break;
565 }
566 return std::numeric_limits<int>::max();
567}
568
569int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
570 IsHazardFn IsHazardDef,
571 int Limit) {
572 const SIRegisterInfo *TRI = ST.getRegisterInfo();
573
574 auto IsHazardFn = [IsHazardDef, TRI, Reg](const MachineInstr &MI) {
575 return IsHazardDef(MI) && MI.modifiesRegister(Reg, TRI);
576 };
577
578 return getWaitStatesSince(IsHazard: IsHazardFn, Limit);
579}
580
581int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
582 int Limit) {
583 auto IsHazardFn = [IsHazard](const MachineInstr &MI) {
584 return isSSetReg(Opcode: MI.getOpcode()) && IsHazard(MI);
585 };
586
587 return getWaitStatesSince(IsHazard: IsHazardFn, Limit);
588}
589
590//===----------------------------------------------------------------------===//
591// No-op Hazard Detection
592//===----------------------------------------------------------------------===//
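// The check*Hazards routines below return the number of wait states that must
// still be inserted before the queried instruction; a value of zero or less
// means no hazard remains.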
593
594static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV,
595 MCRegister Reg) {
596 for (MCRegUnit Unit : TRI.regunits(Reg))
597 BV.set(Unit);
598}
599
600static void addRegsToSet(const SIRegisterInfo &TRI,
601 iterator_range<MachineInstr::const_mop_iterator> Ops,
602 BitVector &DefSet, BitVector &UseSet) {
603 for (const MachineOperand &Op : Ops) {
604 if (Op.isReg())
605 addRegUnits(TRI, BV&: Op.isDef() ? DefSet : UseSet, Reg: Op.getReg().asMCReg());
606 }
607}
608
609void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) {
610 addRegsToSet(TRI, Ops: MI.operands(), DefSet&: ClauseDefs, UseSet&: ClauseUses);
611}
612
613static bool breaksSMEMSoftClause(MachineInstr *MI) {
614 return !SIInstrInfo::isSMRD(MI: *MI);
615}
616
617static bool breaksVMEMSoftClause(MachineInstr *MI) {
618 return !SIInstrInfo::isVMEM(MI: *MI) && !SIInstrInfo::isFLAT(MI: *MI);
619}
620
621int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
  // SMEM soft clauses are only present on VI+, and only matter if XNACK is
  // enabled.
624 if (!ST.isXNACKEnabled())
625 return 0;
626
627 bool IsSMRD = TII.isSMRD(MI: *MEM);
628
629 resetClause();
630
631 // A soft-clause is any group of consecutive SMEM instructions. The
632 // instructions in this group may return out of order and/or may be
633 // replayed (i.e. the same instruction issued more than once).
634 //
635 // In order to handle these situations correctly we need to make sure that
636 // when a clause has more than one instruction, no instruction in the clause
637 // writes to a register that is read by another instruction in the clause
638 // (including itself). If we encounter this situation, we need to break the
  // clause by inserting a non-SMEM instruction.
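  //
  // For example (hypothetical):
  //   s_load_dwordx2 s[0:1], s[4:5], 0x0
  //   s_load_dword   s2, s[0:1], 0x0     ; reads the s[0:1] just defined
  // would require a break (an intervening non-SMEM instruction) between the
  // two loads.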
640
641 for (MachineInstr *MI : EmittedInstrs) {
642 // When we hit a non-SMEM instruction then we have passed the start of the
643 // clause and we can stop.
644 if (!MI)
645 break;
646
647 if (IsSMRD ? breaksSMEMSoftClause(MI) : breaksVMEMSoftClause(MI))
648 break;
649
650 addClauseInst(MI: *MI);
651 }
652
653 if (ClauseDefs.none())
654 return 0;
655
656 // We need to make sure not to put loads and stores in the same clause if they
657 // use the same address. For now, just start a new clause whenever we see a
658 // store.
659 if (MEM->mayStore())
660 return 1;
661
662 addClauseInst(MI: *MEM);
663
664 // If the set of defs and uses intersect then we cannot add this instruction
665 // to the clause, so we have a hazard.
666 return ClauseDefs.anyCommon(RHS: ClauseUses) ? 1 : 0;
667}
668
669int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
670 int WaitStatesNeeded = 0;
671
672 WaitStatesNeeded = checkSoftClauseHazards(MEM: SMRD);
673
674 // This SMRD hazard only affects SI.
675 if (!ST.hasSMRDReadVALUDefHazard())
676 return WaitStatesNeeded;
677
678 // A read of an SGPR by SMRD instruction requires 4 wait states when the
679 // SGPR was written by a VALU instruction.
680 int SmrdSgprWaitStates = 4;
681 auto IsHazardDefFn = [this](const MachineInstr &MI) {
682 return TII.isVALU(MI);
683 };
684 auto IsBufferHazardDefFn = [this](const MachineInstr &MI) {
685 return TII.isSALU(MI);
686 };
687
688 bool IsBufferSMRD = TII.isBufferSMRD(MI: *SMRD);
689
690 for (const MachineOperand &Use : SMRD->uses()) {
691 if (!Use.isReg())
692 continue;
693 int WaitStatesNeededForUse =
694 SmrdSgprWaitStates - getWaitStatesSinceDef(Reg: Use.getReg(), IsHazardDef: IsHazardDefFn,
695 Limit: SmrdSgprWaitStates);
696 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
697
    // This fixes what appears to be undocumented hardware behavior in SI where
    // an s_mov writing a descriptor and an s_buffer_load_dword reading that
    // descriptor need some number of nops in between. We don't know how many
    // we need, but let's use 4. This wasn't discovered before, probably
    // because the only case where it happens is when we expand a 64-bit
    // pointer into a full descriptor and use s_buffer_load_dword instead of
    // s_load_dword, which was probably never encountered in closed-source
    // land.
705 if (IsBufferSMRD) {
706 int WaitStatesNeededForUse =
707 SmrdSgprWaitStates - getWaitStatesSinceDef(Reg: Use.getReg(),
708 IsHazardDef: IsBufferHazardDefFn,
709 Limit: SmrdSgprWaitStates);
710 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
711 }
712 }
713
714 return WaitStatesNeeded;
715}
716
717int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) {
718 if (!ST.hasVMEMReadSGPRVALUDefHazard())
719 return 0;
720
721 int WaitStatesNeeded = checkSoftClauseHazards(MEM: VMEM);
722
723 // A read of an SGPR by a VMEM instruction requires 5 wait states when the
724 // SGPR was written by a VALU Instruction.
725 const int VmemSgprWaitStates = 5;
726 auto IsHazardDefFn = [this](const MachineInstr &MI) {
727 return TII.isVALU(MI);
728 };
729 for (const MachineOperand &Use : VMEM->uses()) {
730 if (!Use.isReg() || TRI.isVectorRegister(MRI: MF.getRegInfo(), Reg: Use.getReg()))
731 continue;
732
733 int WaitStatesNeededForUse =
734 VmemSgprWaitStates - getWaitStatesSinceDef(Reg: Use.getReg(), IsHazardDef: IsHazardDefFn,
735 Limit: VmemSgprWaitStates);
736 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
737 }
738 return WaitStatesNeeded;
739}
740
741int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) {
742 const SIRegisterInfo *TRI = ST.getRegisterInfo();
743 const SIInstrInfo *TII = ST.getInstrInfo();
744
745 // Check for DPP VGPR read after VALU VGPR write and EXEC write.
746 int DppVgprWaitStates = 2;
747 int DppExecWaitStates = 5;
748 int WaitStatesNeeded = 0;
749 auto IsHazardDefFn = [TII](const MachineInstr &MI) {
750 return TII->isVALU(MI);
751 };
752
753 for (const MachineOperand &Use : DPP->uses()) {
754 if (!Use.isReg() || !TRI->isVGPR(MRI: MF.getRegInfo(), Reg: Use.getReg()))
755 continue;
756 int WaitStatesNeededForUse =
757 DppVgprWaitStates - getWaitStatesSinceDef(
758 Reg: Use.getReg(),
759 IsHazardDef: [](const MachineInstr &) { return true; },
760 Limit: DppVgprWaitStates);
761 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
762 }
763
764 WaitStatesNeeded = std::max(
765 a: WaitStatesNeeded,
766 b: DppExecWaitStates - getWaitStatesSinceDef(Reg: AMDGPU::EXEC, IsHazardDef: IsHazardDefFn,
767 Limit: DppExecWaitStates));
768
769 return WaitStatesNeeded;
770}
771
772int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) {
773 const SIInstrInfo *TII = ST.getInstrInfo();
774
775 // v_div_fmas requires 4 wait states after a write to vcc from a VALU
776 // instruction.
777 const int DivFMasWaitStates = 4;
778 auto IsHazardDefFn = [TII](const MachineInstr &MI) {
779 return TII->isVALU(MI);
780 };
781 int WaitStatesNeeded = getWaitStatesSinceDef(Reg: AMDGPU::VCC, IsHazardDef: IsHazardDefFn,
782 Limit: DivFMasWaitStates);
783
784 return DivFMasWaitStates - WaitStatesNeeded;
785}
786
787int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) {
788 const SIInstrInfo *TII = ST.getInstrInfo();
789 unsigned GetRegHWReg = getHWReg(TII, RegInstr: *GetRegInstr);
790
791 const int GetRegWaitStates = 2;
792 auto IsHazardFn = [TII, GetRegHWReg](const MachineInstr &MI) {
793 return GetRegHWReg == getHWReg(TII, RegInstr: MI);
794 };
795 int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazard: IsHazardFn, Limit: GetRegWaitStates);
796
797 return GetRegWaitStates - WaitStatesNeeded;
798}
799
800int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) {
801 const SIInstrInfo *TII = ST.getInstrInfo();
802 unsigned HWReg = getHWReg(TII, RegInstr: *SetRegInstr);
803
804 const int SetRegWaitStates = ST.getSetRegWaitStates();
805 auto IsHazardFn = [TII, HWReg](const MachineInstr &MI) {
806 return HWReg == getHWReg(TII, RegInstr: MI);
807 };
808 int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazard: IsHazardFn, Limit: SetRegWaitStates);
809 return SetRegWaitStates - WaitStatesNeeded;
810}
811
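// If MI is a store whose data operand is wider than 64 bits (and, for
// MUBUF/MTBUF, does not use a register soffset), return the index of that data
// operand; otherwise return -1. Used by the 12-dword-store hazard checks.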
812int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {
813 if (!MI.mayStore())
814 return -1;
815
816 const SIInstrInfo *TII = ST.getInstrInfo();
817 unsigned Opcode = MI.getOpcode();
818 const MCInstrDesc &Desc = MI.getDesc();
819
820 int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, NamedIdx: AMDGPU::OpName::vdata);
821 int VDataRCID = -1;
822 if (VDataIdx != -1)
823 VDataRCID = Desc.operands()[VDataIdx].RegClass;
824
825 if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {
826 // There is no hazard if the instruction does not use vector regs
827 // (like wbinvl1)
828 if (VDataIdx == -1)
829 return -1;
830 // For MUBUF/MTBUF instructions this hazard only exists if the
831 // instruction is not using a register in the soffset field.
832 const MachineOperand *SOffset =
833 TII->getNamedOperand(MI, OpName: AMDGPU::OpName::soffset);
834 // If we have no soffset operand, then assume this field has been
835 // hardcoded to zero.
836 if (AMDGPU::getRegBitWidth(RCID: VDataRCID) > 64 &&
837 (!SOffset || !SOffset->isReg()))
838 return VDataIdx;
839 }
840
841 // MIMG instructions create a hazard if they don't use a 256-bit T# and
842 // the store size is greater than 8 bytes and they have more than two bits
843 // of their dmask set.
844 // All our MIMG definitions use a 256-bit T#, so we can skip checking for them.
845 if (TII->isMIMG(MI)) {
846 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, NamedIdx: AMDGPU::OpName::srsrc);
847 assert(SRsrcIdx != -1 &&
848 AMDGPU::getRegBitWidth(Desc.operands()[SRsrcIdx].RegClass) == 256);
849 (void)SRsrcIdx;
850 }
851
852 if (TII->isFLAT(MI)) {
853 int DataIdx = AMDGPU::getNamedOperandIdx(Opcode, NamedIdx: AMDGPU::OpName::vdata);
854 if (AMDGPU::getRegBitWidth(RCID: Desc.operands()[DataIdx].RegClass) > 64)
855 return DataIdx;
856 }
857
858 return -1;
859}
860
861int
862GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
863 const MachineRegisterInfo &MRI) {
  // Helper to check for the hazard where VMEM instructions that store more
  // than 8 bytes can have their store data overwritten by the next
  // instruction.
866 const SIRegisterInfo *TRI = ST.getRegisterInfo();
867
868 const int VALUWaitStates = ST.hasGFX940Insts() ? 2 : 1;
869 int WaitStatesNeeded = 0;
870
871 if (!TRI->isVectorRegister(MRI, Reg: Def.getReg()))
872 return WaitStatesNeeded;
873 Register Reg = Def.getReg();
874 auto IsHazardFn = [this, Reg, TRI](const MachineInstr &MI) {
875 int DataIdx = createsVALUHazard(MI);
876 return DataIdx >= 0 &&
877 TRI->regsOverlap(RegA: MI.getOperand(i: DataIdx).getReg(), RegB: Reg);
878 };
879 int WaitStatesNeededForDef =
880 VALUWaitStates - getWaitStatesSince(IsHazard: IsHazardFn, Limit: VALUWaitStates);
881 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForDef);
882
883 return WaitStatesNeeded;
884}
885
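// Data hazards against a VALU consumer: forwarding restrictions after TRANS
// results and dst_sel/op_sel writes, VALU writes of SGPRs, VCC or EXEC that
// are read by the consumer or by lane instructions, and the 12-dword-store
// data-overwrite hazard handled via checkVALUHazardsHelper.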
886int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
887 int WaitStatesNeeded = 0;
888
889 if (ST.hasTransForwardingHazard() && !SIInstrInfo::isTRANS(MI: *VALU)) {
890 const int TransDefWaitstates = 1;
891
892 auto IsTransDefFn = [this, VALU](const MachineInstr &MI) {
893 if (!SIInstrInfo::isTRANS(MI))
894 return false;
895 const SIRegisterInfo *TRI = ST.getRegisterInfo();
896 const SIInstrInfo *TII = ST.getInstrInfo();
897 Register Def = TII->getNamedOperand(MI, OpName: AMDGPU::OpName::vdst)->getReg();
898
899 for (const MachineOperand &Use : VALU->explicit_uses()) {
900 if (Use.isReg() && TRI->regsOverlap(RegA: Def, RegB: Use.getReg()))
901 return true;
902 }
903
904 return false;
905 };
906
907 int WaitStatesNeededForDef =
908 TransDefWaitstates -
909 getWaitStatesSince(IsHazard: IsTransDefFn, Limit: TransDefWaitstates);
910 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForDef);
911 }
912
913 if (ST.hasDstSelForwardingHazard()) {
914 const int Shift16DefWaitstates = 1;
915
916 auto IsShift16BitDefFn = [this, VALU](const MachineInstr &MI) {
917 if (!SIInstrInfo::isVALU(MI))
918 return false;
919 const SIInstrInfo *TII = ST.getInstrInfo();
920 if (SIInstrInfo::isSDWA(MI)) {
921 if (auto *DstSel = TII->getNamedOperand(MI, OpName: AMDGPU::OpName::dst_sel))
922 if (DstSel->getImm() == AMDGPU::SDWA::DWORD)
923 return false;
924 } else {
925 if (!AMDGPU::hasNamedOperand(Opcode: MI.getOpcode(), NamedIdx: AMDGPU::OpName::op_sel) ||
926 !(TII->getNamedOperand(MI, OpName: AMDGPU::OpName::src0_modifiers)
927 ->getImm() &
928 SISrcMods::DST_OP_SEL))
929 return false;
930 }
931 const SIRegisterInfo *TRI = ST.getRegisterInfo();
932 if (auto *Dst = TII->getNamedOperand(MI, OpName: AMDGPU::OpName::vdst)) {
933 Register Def = Dst->getReg();
934
935 for (const MachineOperand &Use : VALU->explicit_uses()) {
936 if (Use.isReg() && TRI->regsOverlap(RegA: Def, RegB: Use.getReg()))
937 return true;
938 }
939 }
940
941 return false;
942 };
943
944 int WaitStatesNeededForDef =
945 Shift16DefWaitstates -
946 getWaitStatesSince(IsHazard: IsShift16BitDefFn, Limit: Shift16DefWaitstates);
947 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForDef);
948 }
949
950 if (ST.hasVDecCoExecHazard()) {
951 const int VALUWriteSGPRVALUReadWaitstates = 2;
952 const int VALUWriteEXECRWLane = 4;
953 const int VALUWriteVGPRReadlaneRead = 1;
954
955 const SIRegisterInfo *TRI = ST.getRegisterInfo();
956 const MachineRegisterInfo &MRI = MF.getRegInfo();
957 Register UseReg;
958 auto IsVALUDefSGPRFn = [&UseReg, TRI](const MachineInstr &MI) {
959 if (!SIInstrInfo::isVALU(MI))
960 return false;
961 return MI.modifiesRegister(Reg: UseReg, TRI);
962 };
963
964 for (const MachineOperand &Use : VALU->explicit_uses()) {
965 if (!Use.isReg())
966 continue;
967
968 UseReg = Use.getReg();
969 if (TRI->isSGPRReg(MRI, Reg: UseReg)) {
970 int WaitStatesNeededForDef =
971 VALUWriteSGPRVALUReadWaitstates -
972 getWaitStatesSince(IsHazard: IsVALUDefSGPRFn,
973 Limit: VALUWriteSGPRVALUReadWaitstates);
974 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForDef);
975 }
976 }
977
978 if (VALU->readsRegister(Reg: AMDGPU::VCC, TRI)) {
979 UseReg = AMDGPU::VCC;
980 int WaitStatesNeededForDef =
981 VALUWriteSGPRVALUReadWaitstates -
982 getWaitStatesSince(IsHazard: IsVALUDefSGPRFn, Limit: VALUWriteSGPRVALUReadWaitstates);
983 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForDef);
984 }
985
986 switch (VALU->getOpcode()) {
987 case AMDGPU::V_READLANE_B32:
988 case AMDGPU::V_READFIRSTLANE_B32: {
989 MachineOperand *Src = TII.getNamedOperand(MI&: *VALU, OperandName: AMDGPU::OpName::src0);
990 UseReg = Src->getReg();
991 int WaitStatesNeededForDef =
992 VALUWriteVGPRReadlaneRead -
993 getWaitStatesSince(IsHazard: IsVALUDefSGPRFn, Limit: VALUWriteVGPRReadlaneRead);
994 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForDef);
995 }
996 [[fallthrough]];
997 case AMDGPU::V_WRITELANE_B32: {
998 UseReg = AMDGPU::EXEC;
999 int WaitStatesNeededForDef =
1000 VALUWriteEXECRWLane -
1001 getWaitStatesSince(IsHazard: IsVALUDefSGPRFn, Limit: VALUWriteEXECRWLane);
1002 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForDef);
1003 break;
1004 }
1005 default:
1006 break;
1007 }
1008 }
1009
  // This checks for the hazard where VMEM instructions that store more than
  // 8 bytes can have their store data overwritten by the next instruction.
1012 if (!ST.has12DWordStoreHazard())
1013 return WaitStatesNeeded;
1014
1015 const MachineRegisterInfo &MRI = MF.getRegInfo();
1016
1017 for (const MachineOperand &Def : VALU->defs()) {
1018 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: checkVALUHazardsHelper(Def, MRI));
1019 }
1020
1021 return WaitStatesNeeded;
1022}
1023
1024int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
1025 // This checks for hazards associated with inline asm statements.
1026 // Since inline asms can contain just about anything, we use this
1027 // to call/leverage other check*Hazard routines. Note that
1028 // this function doesn't attempt to address all possible inline asm
1029 // hazards (good luck), but is a collection of what has been
1030 // problematic thus far.
1031
1032 // see checkVALUHazards()
1033 if (!ST.has12DWordStoreHazard())
1034 return 0;
1035
1036 const MachineRegisterInfo &MRI = MF.getRegInfo();
1037 int WaitStatesNeeded = 0;
1038
1039 for (const MachineOperand &Op :
1040 llvm::drop_begin(RangeOrContainer: IA->operands(), N: InlineAsm::MIOp_FirstOperand)) {
1041 if (Op.isReg() && Op.isDef()) {
1042 WaitStatesNeeded =
1043 std::max(a: WaitStatesNeeded, b: checkVALUHazardsHelper(Def: Op, MRI));
1044 }
1045 }
1046
1047 return WaitStatesNeeded;
1048}
1049
1050int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) {
1051 const SIInstrInfo *TII = ST.getInstrInfo();
1052 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1053 const MachineRegisterInfo &MRI = MF.getRegInfo();
1054
1055 const MachineOperand *LaneSelectOp =
1056 TII->getNamedOperand(MI&: *RWLane, OperandName: AMDGPU::OpName::src1);
1057
1058 if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, Reg: LaneSelectOp->getReg()))
1059 return 0;
1060
1061 Register LaneSelectReg = LaneSelectOp->getReg();
1062 auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isVALU(MI); };
1063
1064 const int RWLaneWaitStates = 4;
1065 int WaitStatesSince = getWaitStatesSinceDef(Reg: LaneSelectReg, IsHazardDef: IsHazardFn,
1066 Limit: RWLaneWaitStates);
1067 return RWLaneWaitStates - WaitStatesSince;
1068}
1069
1070int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
1071 if (!ST.hasRFEHazards())
1072 return 0;
1073
1074 const SIInstrInfo *TII = ST.getInstrInfo();
1075
1076 const int RFEWaitStates = 1;
1077
1078 auto IsHazardFn = [TII](const MachineInstr &MI) {
1079 return getHWReg(TII, RegInstr: MI) == AMDGPU::Hwreg::ID_TRAPSTS;
1080 };
1081 int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazard: IsHazardFn, Limit: RFEWaitStates);
1082 return RFEWaitStates - WaitStatesNeeded;
1083}
1084
1085int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) {
1086 const SIInstrInfo *TII = ST.getInstrInfo();
1087 const int ReadM0WaitStates = 1;
1088 auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isSALU(MI); };
1089 return ReadM0WaitStates -
1090 getWaitStatesSinceDef(Reg: AMDGPU::M0, IsHazardDef: IsHazardFn, Limit: ReadM0WaitStates);
1091}
1092
1093void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
1094 fixVMEMtoScalarWriteHazards(MI);
1095 fixVcmpxPermlaneHazards(MI);
1096 fixSMEMtoVectorWriteHazards(MI);
1097 fixVcmpxExecWARHazard(MI);
1098 fixLdsBranchVmemWARHazard(MI);
1099 if (ST.hasLdsDirect()) {
1100 fixLdsDirectVALUHazard(MI);
1101 fixLdsDirectVMEMHazard(MI);
1102 }
1103 fixVALUPartialForwardingHazard(MI);
1104 fixVALUTransUseHazard(MI);
1105 fixWMMAHazards(MI);
1106 fixShift64HighRegBug(MI);
1107 fixVALUMaskWriteHazard(MI);
1108 fixRequiredExportPriority(MI);
1109}
1110
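// On subtargets with the VCMPX/permlane hazard, a V_PERMLANE* that follows a
// VALU compare writing EXEC needs an intervening VALU other than V_NOP; insert
// a V_MOV_B32 of the permlane's src0 VGPR onto itself to break the sequence.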
1111bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
1112 if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(MI: *MI))
1113 return false;
1114
1115 const SIInstrInfo *TII = ST.getInstrInfo();
1116 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1117 auto IsHazardFn = [TII, TRI](const MachineInstr &MI) {
1118 return (TII->isVOPC(MI) ||
1119 ((TII->isVOP3(MI) || TII->isSDWA(MI)) && MI.isCompare())) &&
1120 MI.modifiesRegister(Reg: AMDGPU::EXEC, TRI);
1121 };
1122
1123 auto IsExpiredFn = [](const MachineInstr &MI, int) {
1124 unsigned Opc = MI.getOpcode();
1125 return SIInstrInfo::isVALU(MI) && Opc != AMDGPU::V_NOP_e32 &&
1126 Opc != AMDGPU::V_NOP_e64 && Opc != AMDGPU::V_NOP_sdwa;
1127 };
1128
1129 if (::getWaitStatesSince(IsHazard: IsHazardFn, MI, IsExpired: IsExpiredFn) ==
1130 std::numeric_limits<int>::max())
1131 return false;
1132
1133 // V_NOP will be discarded by SQ.
1134 // Use V_MOV_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE*
1135 // which is always a VGPR and available.
1136 auto *Src0 = TII->getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::src0);
1137 Register Reg = Src0->getReg();
1138 bool IsUndef = Src0->isUndef();
1139 BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(),
1140 MCID: TII->get(Opcode: AMDGPU::V_MOV_B32_e32))
1141 .addReg(RegNo: Reg, flags: RegState::Define | (IsUndef ? RegState::Dead : 0))
1142 .addReg(RegNo: Reg, flags: IsUndef ? RegState::Undef : RegState::Kill);
1143
1144 return true;
1145}
1146
1147bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
1148 if (!ST.hasVMEMtoScalarWriteHazard())
1149 return false;
1150 assert(!ST.hasExtendedWaitCounts());
1151
1152 if (!SIInstrInfo::isSALU(MI: *MI) && !SIInstrInfo::isSMRD(MI: *MI))
1153 return false;
1154
1155 if (MI->getNumDefs() == 0)
1156 return false;
1157
1158 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1159
1160 auto IsHazardFn = [TRI, MI](const MachineInstr &I) {
1161 if (!SIInstrInfo::isVMEM(MI: I) && !SIInstrInfo::isDS(MI: I) &&
1162 !SIInstrInfo::isFLAT(MI: I))
1163 return false;
1164
1165 for (const MachineOperand &Def : MI->defs()) {
1166 const MachineOperand *Op =
1167 I.findRegisterUseOperand(Reg: Def.getReg(), TRI, isKill: false);
1168 if (!Op)
1169 continue;
1170 return true;
1171 }
1172 return false;
1173 };
1174
1175 auto IsExpiredFn = [](const MachineInstr &MI, int) {
1176 return SIInstrInfo::isVALU(MI) ||
1177 (MI.getOpcode() == AMDGPU::S_WAITCNT &&
1178 !MI.getOperand(i: 0).getImm()) ||
1179 (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1180 AMDGPU::DepCtr::decodeFieldVmVsrc(Encoded: MI.getOperand(i: 0).getImm()) == 0);
1181 };
1182
1183 if (::getWaitStatesSince(IsHazard: IsHazardFn, MI, IsExpired: IsExpiredFn) ==
1184 std::numeric_limits<int>::max())
1185 return false;
1186
1187 const SIInstrInfo *TII = ST.getInstrInfo();
1188 BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(),
1189 MCID: TII->get(Opcode: AMDGPU::S_WAITCNT_DEPCTR))
1190 .addImm(Val: AMDGPU::DepCtr::encodeFieldVmVsrc(VmVsrc: 0));
1191 return true;
1192}
1193
1194bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
1195 if (!ST.hasSMEMtoVectorWriteHazard())
1196 return false;
1197 assert(!ST.hasExtendedWaitCounts());
1198
1199 if (!SIInstrInfo::isVALU(MI: *MI))
1200 return false;
1201
1202 unsigned SDSTName;
1203 switch (MI->getOpcode()) {
1204 case AMDGPU::V_READLANE_B32:
1205 case AMDGPU::V_READFIRSTLANE_B32:
1206 SDSTName = AMDGPU::OpName::vdst;
1207 break;
1208 default:
1209 SDSTName = AMDGPU::OpName::sdst;
1210 break;
1211 }
1212
1213 const SIInstrInfo *TII = ST.getInstrInfo();
1214 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1215 const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(GPU: ST.getCPU());
1216 const MachineOperand *SDST = TII->getNamedOperand(MI&: *MI, OperandName: SDSTName);
1217 if (!SDST) {
1218 for (const auto &MO : MI->implicit_operands()) {
1219 if (MO.isDef() && TRI->isSGPRClass(RC: TRI->getPhysRegBaseClass(Reg: MO.getReg()))) {
1220 SDST = &MO;
1221 break;
1222 }
1223 }
1224 }
1225
1226 if (!SDST)
1227 return false;
1228
1229 const Register SDSTReg = SDST->getReg();
1230 auto IsHazardFn = [SDSTReg, TRI](const MachineInstr &I) {
1231 return SIInstrInfo::isSMRD(MI: I) && I.readsRegister(Reg: SDSTReg, TRI);
1232 };
1233
1234 auto IsExpiredFn = [TII, IV](const MachineInstr &MI, int) {
1235 if (TII->isSALU(MI)) {
1236 switch (MI.getOpcode()) {
1237 case AMDGPU::S_SETVSKIP:
1238 case AMDGPU::S_VERSION:
1239 case AMDGPU::S_WAITCNT_VSCNT:
1240 case AMDGPU::S_WAITCNT_VMCNT:
1241 case AMDGPU::S_WAITCNT_EXPCNT:
        // These instructions cannot mitigate the hazard.
1243 return false;
1244 case AMDGPU::S_WAITCNT_LGKMCNT:
1245 // Reducing lgkmcnt count to 0 always mitigates the hazard.
1246 return (MI.getOperand(i: 1).getImm() == 0) &&
1247 (MI.getOperand(i: 0).getReg() == AMDGPU::SGPR_NULL);
1248 case AMDGPU::S_WAITCNT: {
1249 const int64_t Imm = MI.getOperand(i: 0).getImm();
1250 AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(Version: IV, Encoded: Imm);
1251 // DsCnt corresponds to LGKMCnt here.
1252 return (Decoded.DsCnt == 0);
1253 }
1254 default:
1255 // SOPP instructions cannot mitigate the hazard.
1256 if (TII->isSOPP(MI))
1257 return false;
1258 // At this point the SALU can be assumed to mitigate the hazard
1259 // because either:
1260 // (a) it is independent of the at risk SMEM (breaking chain),
1261 // or
1262 // (b) it is dependent on the SMEM, in which case an appropriate
1263 // s_waitcnt lgkmcnt _must_ exist between it and the at risk
1264 // SMEM instruction.
1265 return true;
1266 }
1267 }
1268 return false;
1269 };
1270
1271 if (::getWaitStatesSince(IsHazard: IsHazardFn, MI, IsExpired: IsExpiredFn) ==
1272 std::numeric_limits<int>::max())
1273 return false;
1274
1275 BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(),
1276 MCID: TII->get(Opcode: AMDGPU::S_MOV_B32), DestReg: AMDGPU::SGPR_NULL)
1277 .addImm(Val: 0);
1278 return true;
1279}
1280
1281bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
1282 if (!ST.hasVcmpxExecWARHazard())
1283 return false;
1284 assert(!ST.hasExtendedWaitCounts());
1285
1286 if (!SIInstrInfo::isVALU(MI: *MI))
1287 return false;
1288
1289 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1290 if (!MI->modifiesRegister(Reg: AMDGPU::EXEC, TRI))
1291 return false;
1292
1293 auto IsHazardFn = [TRI](const MachineInstr &I) {
1294 if (SIInstrInfo::isVALU(MI: I))
1295 return false;
1296 return I.readsRegister(Reg: AMDGPU::EXEC, TRI);
1297 };
1298
1299 const SIInstrInfo *TII = ST.getInstrInfo();
1300 auto IsExpiredFn = [TII, TRI](const MachineInstr &MI, int) {
1301 if (SIInstrInfo::isVALU(MI)) {
1302 if (TII->getNamedOperand(MI, OpName: AMDGPU::OpName::sdst))
1303 return true;
1304 for (auto MO : MI.implicit_operands())
1305 if (MO.isDef() && TRI->isSGPRClass(RC: TRI->getPhysRegBaseClass(Reg: MO.getReg())))
1306 return true;
1307 }
1308 if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1309 AMDGPU::DepCtr::decodeFieldSaSdst(Encoded: MI.getOperand(i: 0).getImm()) == 0)
1310 return true;
1311 return false;
1312 };
1313
1314 if (::getWaitStatesSince(IsHazard: IsHazardFn, MI, IsExpired: IsExpiredFn) ==
1315 std::numeric_limits<int>::max())
1316 return false;
1317
1318 BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(),
1319 MCID: TII->get(Opcode: AMDGPU::S_WAITCNT_DEPCTR))
1320 .addImm(Val: AMDGPU::DepCtr::encodeFieldSaSdst(SaSdst: 0));
1321 return true;
1322}
1323
1324static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
1325 const GCNSubtarget &ST) {
1326 if (!ST.hasLdsBranchVmemWARHazard())
1327 return false;
1328
1329 // Check if the necessary condition for the hazard is met: both LDS and VMEM
1330 // instructions need to appear in the same function.
1331 bool HasLds = false;
1332 bool HasVmem = false;
1333 for (auto &MBB : MF) {
1334 for (auto &MI : MBB) {
1335 HasLds |= SIInstrInfo::isDS(MI);
1336 HasVmem |=
1337 SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI);
1338 if (HasLds && HasVmem)
1339 return true;
1340 }
1341 }
1342 return false;
1343}
1344
1345static bool isStoreCountWaitZero(const MachineInstr &I) {
1346 return I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
1347 I.getOperand(i: 0).getReg() == AMDGPU::SGPR_NULL &&
1348 !I.getOperand(i: 1).getImm();
1349}
1350
1351bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
1352 if (!RunLdsBranchVmemWARHazardFixup)
1353 return false;
1354
1355 assert(ST.hasLdsBranchVmemWARHazard());
1356 assert(!ST.hasExtendedWaitCounts());
1357
1358 auto IsHazardInst = [](const MachineInstr &MI) {
1359 if (SIInstrInfo::isDS(MI))
1360 return 1;
1361 if (SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI))
1362 return 2;
1363 return 0;
1364 };
1365
1366 auto InstType = IsHazardInst(*MI);
1367 if (!InstType)
1368 return false;
1369
1370 auto IsExpiredFn = [&IsHazardInst](const MachineInstr &I, int) {
1371 return IsHazardInst(I) || isStoreCountWaitZero(I);
1372 };
1373
1374 auto IsHazardFn = [InstType, &IsHazardInst](const MachineInstr &I) {
1375 if (!I.isBranch())
1376 return false;
1377
1378 auto IsHazardFn = [InstType, IsHazardInst](const MachineInstr &I) {
1379 auto InstType2 = IsHazardInst(I);
1380 return InstType2 && InstType != InstType2;
1381 };
1382
1383 auto IsExpiredFn = [InstType, &IsHazardInst](const MachineInstr &I, int) {
1384 auto InstType2 = IsHazardInst(I);
1385 if (InstType == InstType2)
1386 return true;
1387
1388 return isStoreCountWaitZero(I);
1389 };
1390
1391 return ::getWaitStatesSince(IsHazard: IsHazardFn, MI: &I, IsExpired: IsExpiredFn) !=
1392 std::numeric_limits<int>::max();
1393 };
1394
1395 if (::getWaitStatesSince(IsHazard: IsHazardFn, MI, IsExpired: IsExpiredFn) ==
1396 std::numeric_limits<int>::max())
1397 return false;
1398
1399 const SIInstrInfo *TII = ST.getInstrInfo();
1400 BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(),
1401 MCID: TII->get(Opcode: AMDGPU::S_WAITCNT_VSCNT))
1402 .addReg(RegNo: AMDGPU::SGPR_NULL, flags: RegState::Undef)
1403 .addImm(Val: 0);
1404
1405 return true;
1406}
1407
1408bool GCNHazardRecognizer::fixLdsDirectVALUHazard(MachineInstr *MI) {
1409 if (!SIInstrInfo::isLDSDIR(MI: *MI))
1410 return false;
1411
1412 const int NoHazardWaitStates = 15;
1413 const MachineOperand *VDST = TII.getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::vdst);
1414 const Register VDSTReg = VDST->getReg();
1415
1416 bool VisitedTrans = false;
1417 auto IsHazardFn = [this, VDSTReg, &VisitedTrans](const MachineInstr &I) {
1418 if (!SIInstrInfo::isVALU(MI: I))
1419 return false;
1420 VisitedTrans = VisitedTrans || SIInstrInfo::isTRANS(MI: I);
1421 // Cover both WAR and WAW
1422 return I.readsRegister(Reg: VDSTReg, TRI: &TRI) || I.modifiesRegister(Reg: VDSTReg, TRI: &TRI);
1423 };
1424 auto IsExpiredFn = [&](const MachineInstr &I, int WaitStates) {
1425 if (WaitStates >= NoHazardWaitStates)
1426 return true;
1427 // Instructions which cause va_vdst==0 expire hazard
1428 return SIInstrInfo::isVMEM(MI: I) || SIInstrInfo::isFLAT(MI: I) ||
1429 SIInstrInfo::isDS(MI: I) || SIInstrInfo::isEXP(MI: I);
1430 };
1431 auto GetWaitStatesFn = [](const MachineInstr &MI) {
1432 return SIInstrInfo::isVALU(MI) ? 1 : 0;
1433 };
1434
1435 DenseSet<const MachineBasicBlock *> Visited;
1436 auto Count = ::getWaitStatesSince(IsHazard: IsHazardFn, MBB: MI->getParent(),
1437 I: std::next(x: MI->getReverseIterator()), WaitStates: 0,
1438 IsExpired: IsExpiredFn, Visited, GetNumWaitStates: GetWaitStatesFn);
1439
1440 // Transcendentals can execute in parallel to other VALUs.
1441 // This makes va_vdst count unusable with a mixture of VALU and TRANS.
1442 if (VisitedTrans)
1443 Count = 0;
1444
1445 MachineOperand *WaitVdstOp =
1446 TII.getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::waitvdst);
1447 WaitVdstOp->setImm(std::min(a: Count, b: NoHazardWaitStates));
1448
1449 return true;
1450}
1451
1452bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(MachineInstr *MI) {
1453 if (!SIInstrInfo::isLDSDIR(MI: *MI))
1454 return false;
1455
1456 const MachineOperand *VDST = TII.getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::vdst);
1457 const Register VDSTReg = VDST->getReg();
1458
1459 auto IsHazardFn = [this, VDSTReg](const MachineInstr &I) {
1460 if (!SIInstrInfo::isVMEM(MI: I) && !SIInstrInfo::isFLAT(MI: I) &&
1461 !SIInstrInfo::isDS(MI: I))
1462 return false;
1463 return I.readsRegister(Reg: VDSTReg, TRI: &TRI) || I.modifiesRegister(Reg: VDSTReg, TRI: &TRI);
1464 };
1465 bool LdsdirCanWait = ST.hasLdsWaitVMSRC();
1466 // TODO: On GFX12 the hazard should expire on S_WAIT_LOADCNT/SAMPLECNT/BVHCNT
1467 // according to the type of VMEM instruction.
1468 auto IsExpiredFn = [this, LdsdirCanWait](const MachineInstr &I, int) {
1469 return SIInstrInfo::isVALU(MI: I) || SIInstrInfo::isEXP(MI: I) ||
1470 (I.getOpcode() == AMDGPU::S_WAITCNT && !I.getOperand(i: 0).getImm()) ||
1471 (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1472 AMDGPU::DepCtr::decodeFieldVmVsrc(Encoded: I.getOperand(i: 0).getImm()) == 0) ||
1473 (LdsdirCanWait && SIInstrInfo::isLDSDIR(MI: I) &&
1474 !TII.getNamedOperand(MI: I, OpName: AMDGPU::OpName::waitvsrc)->getImm());
1475 };
1476
1477 if (::getWaitStatesSince(IsHazard: IsHazardFn, MI, IsExpired: IsExpiredFn) ==
1478 std::numeric_limits<int>::max())
1479 return false;
1480
1481 if (LdsdirCanWait) {
1482 TII.getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::waitvsrc)->setImm(0);
1483 } else {
1484 BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(),
1485 MCID: TII.get(Opcode: AMDGPU::S_WAITCNT_DEPCTR))
1486 .addImm(Val: AMDGPU::DepCtr::encodeFieldVmVsrc(VmVsrc: 0));
1487 }
1488
1489 return true;
1490}
1491
1492bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(MachineInstr *MI) {
1493 if (!ST.hasVALUPartialForwardingHazard())
1494 return false;
1495 assert(!ST.hasExtendedWaitCounts());
1496
1497 if (!ST.isWave64() || !SIInstrInfo::isVALU(MI: *MI))
1498 return false;
1499
1500 SmallSetVector<Register, 4> SrcVGPRs;
1501
1502 for (const MachineOperand &Use : MI->explicit_uses()) {
1503 if (Use.isReg() && TRI.isVGPR(MRI: MF.getRegInfo(), Reg: Use.getReg()))
1504 SrcVGPRs.insert(X: Use.getReg());
1505 }
1506
1507 // Only applies with >= 2 unique VGPR sources
1508 if (SrcVGPRs.size() <= 1)
1509 return false;
1510
1511 // Look for the following pattern:
1512 // Va <- VALU [PreExecPos]
1513 // intv1
1514 // Exec <- SALU [ExecPos]
1515 // intv2
1516 // Vb <- VALU [PostExecPos]
1517 // intv3
1518 // MI Va, Vb (WaitState = 0)
1519 //
1520 // Where:
1521 // intv1 + intv2 <= 2 VALUs
1522 // intv3 <= 4 VALUs
1523 //
1524 // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.
1525
1526 const int Intv1plus2MaxVALUs = 2;
1527 const int Intv3MaxVALUs = 4;
1528 const int IntvMaxVALUs = 6;
1529 const int NoHazardVALUWaitStates = IntvMaxVALUs + 2;
1530
1531 struct StateType {
1532 SmallDenseMap<Register, int, 4> DefPos;
1533 int ExecPos = std::numeric_limits<int>::max();
1534 int VALUs = 0;
1535 };
1536
1537 StateType State;
1538
1539 // This overloads expiry testing with all the hazard detection
1540 auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
1541 // Too many VALU states have passed
1542 if (State.VALUs > NoHazardVALUWaitStates)
1543 return HazardExpired;
1544
1545 // Instructions which cause va_vdst==0 expire hazard
1546 if (SIInstrInfo::isVMEM(MI: I) || SIInstrInfo::isFLAT(MI: I) ||
1547 SIInstrInfo::isDS(MI: I) || SIInstrInfo::isEXP(MI: I) ||
1548 (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1549 AMDGPU::DepCtr::decodeFieldVaVdst(Encoded: I.getOperand(i: 0).getImm()) == 0))
1550 return HazardExpired;
1551
    // Track register writes
1553 bool Changed = false;
1554 if (SIInstrInfo::isVALU(MI: I)) {
1555 for (Register Src : SrcVGPRs) {
1556 if (!State.DefPos.count(Val: Src) && I.modifiesRegister(Reg: Src, TRI: &TRI)) {
1557 State.DefPos[Src] = State.VALUs;
1558 Changed = true;
1559 }
1560 }
1561 } else if (SIInstrInfo::isSALU(MI: I)) {
1562 if (State.ExecPos == std::numeric_limits<int>::max()) {
1563 if (!State.DefPos.empty() && I.modifiesRegister(Reg: AMDGPU::EXEC, TRI: &TRI)) {
1564 State.ExecPos = State.VALUs;
1565 Changed = true;
1566 }
1567 }
1568 }
1569
1570 // Early expiration: too many VALUs in intv3
1571 if (State.VALUs > Intv3MaxVALUs && State.DefPos.empty())
1572 return HazardExpired;
1573
1574 // Only evaluate state if something changed
1575 if (!Changed)
1576 return NoHazardFound;
1577
1578 // Determine positions of VALUs pre/post exec change
1579 if (State.ExecPos == std::numeric_limits<int>::max())
1580 return NoHazardFound;
1581
1582 int PreExecPos = std::numeric_limits<int>::max();
1583 int PostExecPos = std::numeric_limits<int>::max();
1584
1585 for (auto Entry : State.DefPos) {
1586 int DefVALUs = Entry.second;
1587 if (DefVALUs != std::numeric_limits<int>::max()) {
1588 if (DefVALUs >= State.ExecPos)
1589 PreExecPos = std::min(a: PreExecPos, b: DefVALUs);
1590 else
1591 PostExecPos = std::min(a: PostExecPos, b: DefVALUs);
1592 }
1593 }
1594
    // Need a VALU def of a source VGPR after the exec change
1596 if (PostExecPos == std::numeric_limits<int>::max())
1597 return NoHazardFound;
1598
1599 // Too many VALUs in intv3?
1600 int Intv3VALUs = PostExecPos;
1601 if (Intv3VALUs > Intv3MaxVALUs)
1602 return HazardExpired;
1603
1604 // Too many VALUs in intv2?
1605 int Intv2VALUs = (State.ExecPos - PostExecPos) - 1;
1606 if (Intv2VALUs > Intv1plus2MaxVALUs)
1607 return HazardExpired;
1608
    // Need a VALU def of a source VGPR before the exec change
1610 if (PreExecPos == std::numeric_limits<int>::max())
1611 return NoHazardFound;
1612
1613 // Too many VALUs in intv1?
1614 int Intv1VALUs = PreExecPos - State.ExecPos;
1615 if (Intv1VALUs > Intv1plus2MaxVALUs)
1616 return HazardExpired;
1617
1618 // Too many VALUs in intv1 + intv2
1619 if (Intv1VALUs + Intv2VALUs > Intv1plus2MaxVALUs)
1620 return HazardExpired;
1621
1622 return HazardFound;
1623 };
1624 auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
1625 if (SIInstrInfo::isVALU(MI))
1626 State.VALUs += 1;
1627 };
1628
1629 DenseSet<const MachineBasicBlock *> Visited;
1630 if (!hasHazard<StateType>(State, IsHazard: IsHazardFn, UpdateState: UpdateStateFn, MBB: MI->getParent(),
1631 I: std::next(x: MI->getReverseIterator()), Visited))
1632 return false;
1633
1634 BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(),
1635 MCID: TII.get(Opcode: AMDGPU::S_WAITCNT_DEPCTR))
1636 .addImm(Val: 0x0fff);
1637
1638 return true;
1639}
1640
1641bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) {
1642 if (!ST.hasVALUTransUseHazard())
1643 return false;
1644 assert(!ST.hasExtendedWaitCounts());
1645
1646 if (!SIInstrInfo::isVALU(MI: *MI))
1647 return false;
1648
1649 SmallSet<Register, 4> SrcVGPRs;
1650
1651 for (const MachineOperand &Use : MI->explicit_uses()) {
1652 if (Use.isReg() && TRI.isVGPR(MRI: MF.getRegInfo(), Reg: Use.getReg()))
1653 SrcVGPRs.insert(V: Use.getReg());
1654 }
1655
1656 // Look for the following pattern:
1657 // Va <- TRANS VALU
1658 // intv
1659 // MI Va (WaitState = 0)
1660 //
1661 // Where:
1662 // intv <= 5 VALUs / 1 TRANS
1663 //
1664 // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.
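  //
  // Illustrative example (register choices are arbitrary):
  //   v_exp_f32 v0, v1        ; TRANS VALU writes v0
  //   v_add_f32 v2, v0, v3    ; MI reads v0 within the interval
  // is broken up by inserting
  //   s_waitcnt_depctr va_vdst(0)
  // immediately before the consumer.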
1665
1666 const int IntvMaxVALUs = 5;
1667 const int IntvMaxTRANS = 1;
1668
1669 struct StateType {
1670 int VALUs = 0;
1671 int TRANS = 0;
1672 };
1673
1674 StateType State;
1675
1676 // This lambda combines expiry testing with all of the hazard detection.
1677 auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
1678 // Too many VALU states have passed
1679 if (State.VALUs > IntvMaxVALUs || State.TRANS > IntvMaxTRANS)
1680 return HazardExpired;
1681
1682 // Instructions which cause va_vdst==0 expire the hazard.
1683 if (SIInstrInfo::isVMEM(MI: I) || SIInstrInfo::isFLAT(MI: I) ||
1684 SIInstrInfo::isDS(MI: I) || SIInstrInfo::isEXP(MI: I) ||
1685 (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1686 I.getOperand(i: 0).getImm() == 0x0fff))
1687 return HazardExpired;
1688
1689 // Track register writes.
1690 if (SIInstrInfo::isTRANS(MI: I)) {
1691 for (Register Src : SrcVGPRs) {
1692 if (I.modifiesRegister(Reg: Src, TRI: &TRI)) {
1693 return HazardFound;
1694 }
1695 }
1696 }
1697
1698 return NoHazardFound;
1699 };
1700 auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
1701 if (SIInstrInfo::isVALU(MI))
1702 State.VALUs += 1;
1703 if (SIInstrInfo::isTRANS(MI))
1704 State.TRANS += 1;
1705 };
1706
1707 DenseSet<const MachineBasicBlock *> Visited;
1708 if (!hasHazard<StateType>(State, IsHazard: IsHazardFn, UpdateState: UpdateStateFn, MBB: MI->getParent(),
1709 I: std::next(x: MI->getReverseIterator()), Visited))
1710 return false;
1711
1712 // Hazard is observed - insert a wait on the va_vdst counter to ensure the
1713 // hazard is avoided.
1714 BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(),
1715 MCID: TII.get(Opcode: AMDGPU::S_WAITCNT_DEPCTR))
1716 .addImm(Val: AMDGPU::DepCtr::encodeFieldVaVdst(VaVdst: 0));
1717
1718 return true;
1719}
1720
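// Insert a V_NOP between two WMMA/SWMMAC instructions when the second one
// reads, as matrix A, matrix B or (for SWMMAC on GFX12+) the index operand,
// registers written by the first one's matrix D result. Any intervening VALU
// already breaks the dependency, so the search expires on the first VALU seen.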
1721bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) {
1722 if (!SIInstrInfo::isWMMA(MI: *MI) && !SIInstrInfo::isSWMMAC(MI: *MI))
1723 return false;
1724
1725 const SIInstrInfo *TII = ST.getInstrInfo();
1726 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1727
1728 auto IsHazardFn = [MI, TII, TRI, this](const MachineInstr &I) {
1729 if (!SIInstrInfo::isWMMA(MI: I) && !SIInstrInfo::isSWMMAC(MI: I))
1730 return false;
1731
1732 // Src0 (matrix A) or Src1 (matrix B) of the current WMMA instruction
1733 // overlaps with the dest (matrix D) of the previous WMMA.
1734 const Register CurSrc0Reg =
1735 TII->getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::src0)->getReg();
1736 const Register CurSrc1Reg =
1737 TII->getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::src1)->getReg();
1738
1739 const Register PrevDstReg =
1740 TII->getNamedOperand(MI: I, OpName: AMDGPU::OpName::vdst)->getReg();
1741
1742 if (TRI->regsOverlap(RegA: PrevDstReg, RegB: CurSrc0Reg) ||
1743 TRI->regsOverlap(RegA: PrevDstReg, RegB: CurSrc1Reg)) {
1744 return true;
1745 }
1746
1747 // GFX12+ allows overlap of matrix C with PrevDstReg (hardware will stall)
1748 // but Index can't overlap with PrevDstReg.
1749 if (AMDGPU::isGFX12Plus(STI: ST)) {
1750 if (SIInstrInfo::isSWMMAC(MI: *MI)) {
1751 const Register CurIndex =
1752 TII->getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::src2)->getReg();
1753 if (TRI->regsOverlap(RegA: PrevDstReg, RegB: CurIndex))
1754 return true;
1755 }
1756 return false;
1757 }
1758
1759 return false;
1760 };
1761
1762 auto IsExpiredFn = [](const MachineInstr &I, int) {
1763 return SIInstrInfo::isVALU(MI: I);
1764 };
1765
1766 if (::getWaitStatesSince(IsHazard: IsHazardFn, MI, IsExpired: IsExpiredFn) ==
1767 std::numeric_limits<int>::max())
1768 return false;
1769
1770 BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(), MCID: TII->get(Opcode: AMDGPU::V_NOP_e32));
1771
1772 return true;
1773}
1774
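// Work around a hardware bug affecting 64-bit shifts (V_LSHLREV_B64,
// V_LSHRREV_B64, V_ASHRREV_I64) whose shift amount lives in the last VGPR of
// an 8-register allocation block. The amount (and, if it overlaps the other
// operands, its containing 64-bit register) is temporarily moved into an
// unused VGPR with V_SWAP_B32 before the shift and swapped back afterwards;
// an s_waitcnt 0 is emitted first since the scratch register may still have a
// pending wait.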
1775bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) {
1776 if (!ST.hasShift64HighRegBug())
1777 return false;
1778 assert(!ST.hasExtendedWaitCounts());
1779
1780 switch (MI->getOpcode()) {
1781 default:
1782 return false;
1783 case AMDGPU::V_LSHLREV_B64_e64:
1784 case AMDGPU::V_LSHRREV_B64_e64:
1785 case AMDGPU::V_ASHRREV_I64_e64:
1786 break;
1787 }
1788
1789 MachineOperand *Amt = TII.getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::src0);
1790 if (!Amt->isReg())
1791 return false;
1792
1793 Register AmtReg = Amt->getReg();
1794 const MachineRegisterInfo &MRI = MF.getRegInfo();
1795 // Check if this is the last VGPR in the allocation block.
1796 if (!TRI.isVGPR(MRI, Reg: AmtReg) || ((AmtReg - AMDGPU::VGPR0) & 7) != 7)
1797 return false;
1798
1799 if (AmtReg != AMDGPU::VGPR255 && MRI.isPhysRegUsed(PhysReg: AmtReg + 1))
1800 return false;
1801
1802 MachineOperand *Src1 = TII.getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::src1);
1803 bool OverlappedSrc = Src1->isReg() && TRI.regsOverlap(RegA: Src1->getReg(), RegB: AmtReg);
1804 bool OverlappedDst = MI->modifiesRegister(Reg: AmtReg, TRI: &TRI);
1805 bool Overlapped = OverlappedSrc || OverlappedDst;
1806
1807 assert(!OverlappedDst || !OverlappedSrc ||
1808 Src1->getReg() == MI->getOperand(0).getReg());
1809 assert(ST.needsAlignedVGPRs());
1810 static_assert(AMDGPU::VGPR0 + 1 == AMDGPU::VGPR1);
1811
1812 Register NewReg;
1813 for (MCRegister Reg : Overlapped ? AMDGPU::VReg_64_Align2RegClass
1814 : AMDGPU::VGPR_32RegClass) {
1815 if (!MI->modifiesRegister(Reg, TRI: &TRI) && !MI->readsRegister(Reg, TRI: &TRI)) {
1816 NewReg = Reg;
1817 break;
1818 }
1819 }
1820
1821 Register NewAmt = Overlapped ? (Register)TRI.getSubReg(Reg: NewReg, Idx: AMDGPU::sub1)
1822 : NewReg;
1823 Register NewAmtLo;
1824
1825 if (Overlapped)
1826 NewAmtLo = TRI.getSubReg(Reg: NewReg, Idx: AMDGPU::sub0);
1827
1828 DebugLoc DL = MI->getDebugLoc();
1829 MachineBasicBlock *MBB = MI->getParent();
1830 // Insert a full wait count because the found register might be pending a wait.
1831 BuildMI(BB&: *MBB, I: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_WAITCNT))
1832 .addImm(Val: 0);
1833
1834 // Insert V_SWAP_B32 instruction(s) and run hazard recognizer on them.
1835 if (Overlapped)
1836 runOnInstruction(
1837 MI: BuildMI(BB&: *MBB, I: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_SWAP_B32), DestReg: NewAmtLo)
1838 .addDef(RegNo: AmtReg - 1)
1839 .addReg(RegNo: AmtReg - 1, flags: RegState::Undef)
1840 .addReg(RegNo: NewAmtLo, flags: RegState::Undef));
1841 runOnInstruction(MI: BuildMI(BB&: *MBB, I: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_SWAP_B32), DestReg: NewAmt)
1842 .addDef(RegNo: AmtReg)
1843 .addReg(RegNo: AmtReg, flags: RegState::Undef)
1844 .addReg(RegNo: NewAmt, flags: RegState::Undef));
1845
1846 // Instructions emitted after the current instruction will be processed by the
1847 // parent loop of the hazard recognizer in a natural way.
1848 BuildMI(BB&: *MBB, I: std::next(x: MI->getIterator()), MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_SWAP_B32),
1849 DestReg: AmtReg)
1850 .addDef(RegNo: NewAmt)
1851 .addReg(RegNo: NewAmt)
1852 .addReg(RegNo: AmtReg);
1853 if (Overlapped)
1854 BuildMI(BB&: *MBB, I: std::next(x: MI->getIterator()), MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_SWAP_B32),
1855 DestReg: AmtReg - 1)
1856 .addDef(RegNo: NewAmtLo)
1857 .addReg(RegNo: NewAmtLo)
1858 .addReg(RegNo: AmtReg - 1);
1859
1860 // Re-running the hazard recognizer on the modified instruction is not
1861 // necessary: the inserted V_SWAP_B32 instructions have already both read and
1862 // written the new registers, so hazards related to them have already been handled.
1863 Amt->setReg(NewAmt);
1864 Amt->setIsKill(false);
1865 // We do not update liveness, so verifier may see it as undef.
1866 Amt->setIsUndef();
1867 if (OverlappedDst)
1868 MI->getOperand(i: 0).setReg(NewReg);
1869 if (OverlappedSrc) {
1870 Src1->setReg(NewReg);
1871 Src1->setIsKill(false);
1872 Src1->setIsUndef();
1873 }
1874
1875 return true;
1876}
1877
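// A MUBUF/MTBUF instruction whose offset has either of bits 1-2 set requires
// one wait state after an NSA-encoded MIMG instruction (GFX10 NSA encoding,
// 16 bytes or larger).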
1878int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) {
1879 int NSAtoVMEMWaitStates = 1;
1880
1881 if (!ST.hasNSAtoVMEMBug())
1882 return 0;
1883
1884 if (!SIInstrInfo::isMUBUF(MI: *MI) && !SIInstrInfo::isMTBUF(MI: *MI))
1885 return 0;
1886
1887 const SIInstrInfo *TII = ST.getInstrInfo();
1888 const auto *Offset = TII->getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::offset);
1889 if (!Offset || (Offset->getImm() & 6) == 0)
1890 return 0;
1891
1892 auto IsHazardFn = [TII](const MachineInstr &I) {
1893 if (!SIInstrInfo::isMIMG(MI: I))
1894 return false;
1895 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc: I.getOpcode());
1896 return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
1897 TII->getInstSizeInBytes(MI: I) >= 16;
1898 };
1899
1900 return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazard: IsHazardFn, Limit: 1);
1901}
1902
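// S_DENORM_MODE requires three wait states after a floating point atomic
// VMEM/FLAT instruction. Any VALU or wait-count instruction in between
// expires the hazard.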
1903int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) {
1904 int FPAtomicToDenormModeWaitStates = 3;
1905
1906 if (!ST.hasFPAtomicToDenormModeHazard())
1907 return 0;
1908 assert(!ST.hasExtendedWaitCounts());
1909
1910 if (MI->getOpcode() != AMDGPU::S_DENORM_MODE)
1911 return 0;
1912
1913 auto IsHazardFn = [](const MachineInstr &I) {
1914 if (!SIInstrInfo::isVMEM(MI: I) && !SIInstrInfo::isFLAT(MI: I))
1915 return false;
1916 return SIInstrInfo::isFPAtomic(MI: I);
1917 };
1918
1919 auto IsExpiredFn = [](const MachineInstr &MI, int WaitStates) {
1920 if (WaitStates >= 3 || SIInstrInfo::isVALU(MI))
1921 return true;
1922
1923 switch (MI.getOpcode()) {
1924 case AMDGPU::S_WAITCNT:
1925 case AMDGPU::S_WAITCNT_VSCNT:
1926 case AMDGPU::S_WAITCNT_VMCNT:
1927 case AMDGPU::S_WAITCNT_EXPCNT:
1928 case AMDGPU::S_WAITCNT_LGKMCNT:
1929 case AMDGPU::S_WAIT_IDLE:
1930 return true;
1931 default:
1932 break;
1933 }
1934
1935 return false;
1936 };
1937
1938 return FPAtomicToDenormModeWaitStates -
1939 ::getWaitStatesSince(IsHazard: IsHazardFn, MI, IsExpired: IsExpiredFn);
1940}
1941
1942int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
1943 assert(SIInstrInfo::isMAI(*MI));
1944
1945 return ST.hasGFX90AInsts() ? checkMAIHazards90A(MI) : checkMAIHazards908(MI);
1946}
1947
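// Return the number of wait states needed to pad a neighboring MFMA so that
// the gap covers MFMAPaddingRatio percent of that MFMA's pipeline latency.
// Padding is skipped when the ratio is zero or occupancy is below 2. For
// example, with a 16 wait-state neighbor and a ratio of 50, the target gap is
// 8 wait states minus whatever has already elapsed.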
1948int GCNHazardRecognizer::checkMFMAPadding(MachineInstr *MI) {
1949 // Early exit if no padding is requested.
1950 if (MFMAPaddingRatio == 0)
1951 return 0;
1952
1953 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1954 if (!SIInstrInfo::isMFMA(MI: *MI) || MFI->getOccupancy() < 2)
1955 return 0;
1956
1957 int NeighborMFMALatency = 0;
1958 auto IsNeighboringMFMA = [&NeighborMFMALatency,
1959 this](const MachineInstr &MI) {
1960 if (!SIInstrInfo::isMFMA(MI))
1961 return false;
1962
1963 NeighborMFMALatency = this->getMFMAPipelineWaitStates(MI);
1964 return true;
1965 };
1966
1967 const int MaxMFMAPipelineWaitStates = 16;
1968 int WaitStatesSinceNeighborMFMA =
1969 getWaitStatesSince(IsHazard: IsNeighboringMFMA, Limit: MaxMFMAPipelineWaitStates);
1970
1971 int NeighborMFMAPaddingNeeded =
1972 (NeighborMFMALatency * MFMAPaddingRatio / 100) -
1973 WaitStatesSinceNeighborMFMA;
1974
1975 return std::max(a: 0, b: NeighborMFMAPaddingNeeded);
1976}
1977
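// Check gfx908 hazards for MFMA and V_ACCVGPR_READ/WRITE instructions: earlier
// VALU writes of EXEC or of VGPR sources, earlier MFMA writes overlapping AGPR
// operands, earlier V_ACCVGPR_WRITE defs, and (for V_ACCVGPR_WRITE) an earlier
// MFMA reading the destination as src C. Required distances come from the
// wait-state tables below.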
1978int GCNHazardRecognizer::checkMAIHazards908(MachineInstr *MI) {
1979 int WaitStatesNeeded = 0;
1980 unsigned Opc = MI->getOpcode();
1981
1982 auto IsVALUFn = [](const MachineInstr &MI) {
1983 return SIInstrInfo::isVALU(MI) || MI.isInlineAsm();
1984 };
1985
1986 if (Opc != AMDGPU::V_ACCVGPR_READ_B32_e64) { // MFMA or v_accvgpr_write
1987 const int LegacyVALUWritesVGPRWaitStates = 2;
1988 const int VALUWritesExecWaitStates = 4;
1989 const int MaxWaitStates = 4;
1990
1991 int WaitStatesNeededForUse = VALUWritesExecWaitStates -
1992 getWaitStatesSinceDef(Reg: AMDGPU::EXEC, IsHazardDef: IsVALUFn, Limit: MaxWaitStates);
1993 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
1994
1995 if (WaitStatesNeeded < MaxWaitStates) {
1996 for (const MachineOperand &Use : MI->explicit_uses()) {
1997 const int MaxWaitStates = 2;
1998
1999 if (!Use.isReg() || !TRI.isVGPR(MRI: MF.getRegInfo(), Reg: Use.getReg()))
2000 continue;
2001
2002 int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
2003 getWaitStatesSinceDef(Reg: Use.getReg(), IsHazardDef: IsVALUFn, Limit: MaxWaitStates);
2004 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
2005
2006 if (WaitStatesNeeded == MaxWaitStates)
2007 break;
2008 }
2009 }
2010 }
2011
2012 for (const MachineOperand &Op : MI->explicit_operands()) {
2013 if (!Op.isReg() || !TRI.isAGPR(MRI: MF.getRegInfo(), Reg: Op.getReg()))
2014 continue;
2015
2016 if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2017 continue;
2018
2019 const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
2020 const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
2021 const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
2022 const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
2023 const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
2024 const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
2025 const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
2026 const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
2027 const int MaxWaitStates = 18;
2028 Register Reg = Op.getReg();
2029 unsigned HazardDefLatency = 0;
2030
2031 auto IsOverlappedMFMAFn = [Reg, &HazardDefLatency,
2032 this](const MachineInstr &MI) {
2033 if (!SIInstrInfo::isMFMA(MI))
2034 return false;
2035 Register DstReg = MI.getOperand(i: 0).getReg();
2036 if (DstReg == Reg)
2037 return false;
2038 HazardDefLatency =
2039 std::max(a: HazardDefLatency, b: TSchedModel.computeInstrLatency(MI: &MI));
2040 return TRI.regsOverlap(RegA: DstReg, RegB: Reg);
2041 };
2042
2043 int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsHazardDef: IsOverlappedMFMAFn,
2044 Limit: MaxWaitStates);
2045 int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
2046 int SrcCIdx = AMDGPU::getNamedOperandIdx(Opcode: Opc, NamedIdx: AMDGPU::OpName::src2);
2047 int OpNo = Op.getOperandNo();
2048 if (OpNo == SrcCIdx) {
2049 NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
2050 } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) {
2051 switch (HazardDefLatency) {
2052 case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
2053 break;
2054 case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
2055 break;
2056 case 16: [[fallthrough]];
2057 default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
2058 break;
2059 }
2060 } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
2061 switch (HazardDefLatency) {
2062 case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
2063 break;
2064 case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
2065 break;
2066 case 16: [[fallthrough]];
2067 default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
2068 break;
2069 }
2070 }
2071
2072 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2073 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
2074
2075 if (WaitStatesNeeded == MaxWaitStates)
2076 return WaitStatesNeeded; // Early exit.
2077
2078 auto IsAccVgprWriteFn = [Reg, this](const MachineInstr &MI) {
2079 if (MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2080 return false;
2081 Register DstReg = MI.getOperand(i: 0).getReg();
2082 return TRI.regsOverlap(RegA: Reg, RegB: DstReg);
2083 };
2084
2085 const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
2086 const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
2087 const int AccVGPRWriteAccVgprReadWaitStates = 3;
2088 NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
2089 if (OpNo == SrcCIdx)
2090 NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
2091 else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64)
2092 NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;
2093
2094 WaitStatesNeededForUse = NeedWaitStates -
2095 getWaitStatesSinceDef(Reg, IsHazardDef: IsAccVgprWriteFn, Limit: MaxWaitStates);
2096 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
2097
2098 if (WaitStatesNeeded == MaxWaitStates)
2099 return WaitStatesNeeded; // Early exit.
2100 }
2101
2102 if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
2103 const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
2104 const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
2105 const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
2106 const int MaxWaitStates = 13;
2107 Register DstReg = MI->getOperand(i: 0).getReg();
2108 unsigned HazardDefLatency = 0;
2109
2110 auto IsSrcCMFMAFn = [DstReg, &HazardDefLatency,
2111 this](const MachineInstr &MI) {
2112 if (!SIInstrInfo::isMFMA(MI))
2113 return false;
2114 Register Reg = TII.getNamedOperand(MI, OpName: AMDGPU::OpName::src2)->getReg();
2115 HazardDefLatency =
2116 std::max(a: HazardDefLatency, b: TSchedModel.computeInstrLatency(MI: &MI));
2117 return TRI.regsOverlap(RegA: Reg, RegB: DstReg);
2118 };
2119
2120 int WaitStatesSince = getWaitStatesSince(IsHazard: IsSrcCMFMAFn, Limit: MaxWaitStates);
2121 int NeedWaitStates;
2122 switch (HazardDefLatency) {
2123 case 2: NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
2124 break;
2125 case 8: NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
2126 break;
2127 case 16: [[fallthrough]];
2128 default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
2129 break;
2130 }
2131
2132 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
2133 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
2134 }
2135
2136 // Pad neighboring MFMA with noops for better inter-wave performance.
2137 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: checkMFMAPadding(MI));
2138
2139 return WaitStatesNeeded;
2140}
2141
2142static int
2143GFX940_XDL_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(int NumPasses) {
2144 // 2 pass -> 3
2145 // 4 pass -> 5
2146 // 8 pass -> 9
2147 // 16 pass -> 17
2148 return NumPasses + 1;
2149}
2150
2151static int
2152GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(int NumPasses) {
2153 // 2 pass -> 2
2154 // 4 pass -> 4
2155 // 8 pass -> 8
2156 // 16 pass -> 16
2157 return NumPasses;
2158}
2159
2160static int
2161GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses) {
2162 // 2 pass -> 4
2163 // 4 pass -> 6
2164 // 8 pass -> 10
2165 // 16 pass -> 18
2166 return NumPasses + 2;
2167}
2168
2169static int GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses) {
2170 // 2 pass -> 5
2171 // 4 pass -> 7
2172 // 8 pass -> 11
2173 // 16 pass -> 19
2174 return NumPasses + 3;
2175}
2176
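// Check gfx90a/gfx940 hazards for an MFMA instruction: an earlier legacy VALU
// write of EXEC or of a VGPR operand, or an earlier MFMA whose result overlaps
// one of the src A/B/C operands. The required distance depends on whether the
// producer is a DGEMM, XDL or SMFMA op and on its number of passes.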
2177int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) {
2178 int WaitStatesNeeded = 0;
2179 unsigned Opc = MI->getOpcode();
2180
2181 auto IsLegacyVALUFn = [](const MachineInstr &MI) {
2182 return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI);
2183 };
2184
2185 auto IsLegacyVALUNotDotFn = [](const MachineInstr &MI) {
2186 return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI) &&
2187 !SIInstrInfo::isDOT(MI);
2188 };
2189
2190 if (!SIInstrInfo::isMFMA(MI: *MI))
2191 return WaitStatesNeeded;
2192
2193 const int VALUWritesExecWaitStates = 4;
2194 int WaitStatesNeededForUse = VALUWritesExecWaitStates -
2195 getWaitStatesSinceDef(Reg: AMDGPU::EXEC, IsHazardDef: IsLegacyVALUFn,
2196 Limit: VALUWritesExecWaitStates);
2197 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
2198
2199 int SrcCIdx = AMDGPU::getNamedOperandIdx(Opcode: Opc, NamedIdx: AMDGPU::OpName::src2);
2200
2201 // Loop for both DGEMM and S/HGEMM 2nd instruction.
2202 for (const MachineOperand &Use : MI->explicit_uses()) {
2203 const int LegacyVALUNotDotWritesVGPRWaitStates = 2;
2204 const int SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates = 2;
2205 const int SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates = 8;
2206 const int SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates = 16;
2207 const int SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates = 3;
2208 const int SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates = 9;
2209 const int SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates = 17;
2210 const int DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 9;
2211 const int DMFMA4x4WritesVGPROverlappedSrcCWaitStates = 4;
2212 const int SMFMA4x4WritesVGPROverlappedSrcABWaitStates = 5;
2213 const int SMFMA16x16WritesVGPROverlappedSrcABWaitStates = 11;
2214 const int SMFMA32x32WritesVGPROverlappedSrcABWaitStates = 19;
2215 const int DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates = 6;
2216 const int DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 11;
2217 const int DMFMA4x4WritesVGPRFullSrcCWaitStates = 4;
2218 const int GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates = 2;
2219 const int MaxWaitStates = 19;
2220
2221 if (!Use.isReg())
2222 continue;
2223 Register Reg = Use.getReg();
2224 bool FullReg;
2225 const MachineInstr *MI1;
2226
2227 auto IsOverlappedMFMAFn = [Reg, &FullReg, &MI1,
2228 this](const MachineInstr &MI) {
2229 if (!SIInstrInfo::isMFMA(MI))
2230 return false;
2231 Register DstReg = MI.getOperand(i: 0).getReg();
2232 FullReg = (DstReg == Reg);
2233 MI1 = &MI;
2234 return TRI.regsOverlap(RegA: DstReg, RegB: Reg);
2235 };
2236
2237 WaitStatesNeededForUse = LegacyVALUNotDotWritesVGPRWaitStates -
2238 getWaitStatesSinceDef(Reg, IsHazardDef: IsLegacyVALUNotDotFn, Limit: MaxWaitStates);
2239 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
2240
2241 int NumWaitStates =
2242 getWaitStatesSinceDef(Reg, IsHazardDef: IsOverlappedMFMAFn, Limit: MaxWaitStates);
2243 if (NumWaitStates == std::numeric_limits<int>::max())
2244 continue;
2245
2246 int OpNo = Use.getOperandNo();
2247 unsigned Opc1 = MI1->getOpcode();
2248 int NeedWaitStates = 0;
2249 if (OpNo == SrcCIdx) {
2250 if (!isDGEMM(Opcode: Opc) && (!ST.hasGFX940Insts() && isDGEMM(Opcode: Opc1))) {
2251 NeedWaitStates = 0;
2252 } else if (FullReg) {
2253 if ((Opc == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
2254 Opc == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64) &&
2255 (Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
2256 Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64))
2257 NeedWaitStates = DMFMA4x4WritesVGPRFullSrcCWaitStates;
2258 else if (ST.hasGFX940Insts() &&
2259 TSchedModel.computeInstrLatency(MI: MI1) == 2)
2260 NeedWaitStates = GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates;
2261 } else {
2262 switch (Opc1) {
2263 case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
2264 case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
2265 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
2266 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
2267 if (!isXDL(ST, MI: *MI))
2268 NeedWaitStates = DMFMA16x16WritesVGPROverlappedSrcCWaitStates;
2269 break;
2270 case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
2271 case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
2272 if (!isXDL(ST, MI: *MI))
2273 NeedWaitStates = DMFMA4x4WritesVGPROverlappedSrcCWaitStates;
2274 break;
2275 default:
2276 int NumPasses = TSchedModel.computeInstrLatency(MI: MI1);
2277 if (ST.hasGFX940Insts()) {
2278 if (isXDL(ST, MI: *MI) && !isXDL(ST, MI: *MI1))
2279 break;
2280
2281 NeedWaitStates =
2282 isXDL(ST, MI: *MI1)
2283 ? GFX940_XDL_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(
2284 NumPasses)
2285 : GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(
2286 NumPasses);
2287 break;
2288 }
2289
2290 switch (NumPasses) {
2291 case 2:
2292 NeedWaitStates =
2293 isDGEMM(Opcode: Opc) ? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates
2294 : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates;
2295 break;
2296 case 8:
2297 NeedWaitStates =
2298 isDGEMM(Opcode: Opc)
2299 ? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates
2300 : SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates;
2301 break;
2302 case 16:
2303 NeedWaitStates =
2304 isDGEMM(Opcode: Opc)
2305 ? SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates
2306 : SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates;
2307 break;
2308 default:
2309 llvm_unreachable("unexpected number of passes");
2310 }
2311 }
2312 }
2313 } else {
2314 switch (Opc1) {
2315 case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
2316 case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
2317 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
2318 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
2319 NeedWaitStates = DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates;
2320 break;
2321 case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
2322 case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
2323 NeedWaitStates = DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates;
2324 break;
2325 default:
2326 int NumPasses = TSchedModel.computeInstrLatency(MI: MI1);
2327
2328 if (ST.hasGFX940Insts()) {
2329 NeedWaitStates =
2330 isXDL(ST, MI: *MI1)
2331 ? GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(
2332 NumPasses)
2333 : GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(
2334 NumPasses);
2335 break;
2336 }
2337
2338 switch (NumPasses) {
2339 case 2:
2340 NeedWaitStates = SMFMA4x4WritesVGPROverlappedSrcABWaitStates;
2341 break;
2342 case 4:
2343 llvm_unreachable("unexpected number of passes for mfma");
2344 case 8:
2345 NeedWaitStates = SMFMA16x16WritesVGPROverlappedSrcABWaitStates;
2346 break;
2347 case 16:
2348 default:
2349 NeedWaitStates = SMFMA32x32WritesVGPROverlappedSrcABWaitStates;
2350 }
2351 }
2352 }
2353 if (WaitStatesNeeded >= NeedWaitStates)
2354 continue;
2355
2356 WaitStatesNeededForUse = NeedWaitStates - NumWaitStates;
2357 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
2358
2359 if (WaitStatesNeeded == MaxWaitStates)
2360 break;
2361 }
2362
2363 // Pad neighboring MFMA with noops for better inter-wave performance.
2364 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: checkMFMAPadding(MI));
2365
2366 return WaitStatesNeeded;
2367}
2368
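// Check gfx908 hazards on VGPR uses of the instruction: a recent
// V_ACCVGPR_READ writing the register needs up to two wait states, and an
// accvgpr read/write that itself depends on a recent non-MAI VALU write needs
// one.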
2369int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) {
2370 // On gfx90a+ relevant hazards are checked in checkMAIVALUHazards()
2371 if (!ST.hasMAIInsts() || ST.hasGFX90AInsts())
2372 return 0;
2373
2374 int WaitStatesNeeded = 0;
2375
2376 auto IsAccVgprReadFn = [](const MachineInstr &MI) {
2377 return MI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64;
2378 };
2379
2380 for (const MachineOperand &Op : MI->explicit_uses()) {
2381 if (!Op.isReg() || !TRI.isVGPR(MRI: MF.getRegInfo(), Reg: Op.getReg()))
2382 continue;
2383
2384 Register Reg = Op.getReg();
2385
2386 const int AccVgprReadLdStWaitStates = 2;
2387 const int VALUWriteAccVgprRdWrLdStDepVALUWaitStates = 1;
2388 const int MaxWaitStates = 2;
2389
2390 int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
2391 getWaitStatesSinceDef(Reg, IsHazardDef: IsAccVgprReadFn, Limit: MaxWaitStates);
2392 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
2393
2394 if (WaitStatesNeeded == MaxWaitStates)
2395 return WaitStatesNeeded; // Early exit.
2396
2397 auto IsVALUAccVgprRdWrCheckFn = [Reg, this](const MachineInstr &MI) {
2398 if (MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64 &&
2399 MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2400 return false;
2401 auto IsVALUFn = [](const MachineInstr &MI) {
2402 return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMAI(MI);
2403 };
2404 return getWaitStatesSinceDef(Reg, IsHazardDef: IsVALUFn, Limit: 2 /*MaxWaitStates*/) <
2405 std::numeric_limits<int>::max();
2406 };
2407
2408 WaitStatesNeededForUse = VALUWriteAccVgprRdWrLdStDepVALUWaitStates -
2409 getWaitStatesSince(IsHazard: IsVALUAccVgprRdWrCheckFn, Limit: MaxWaitStates);
2410 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
2411 }
2412
2413 return WaitStatesNeeded;
2414}
2415
2416static int GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(int NumPasses) {
2417 // 2 pass -> 4
2418 // 4 pass -> 6
2419 // 8 pass -> 10
2420 // 16 pass -> 18
2421 return NumPasses + 2;
2422}
2423
2424static int GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(int NumPasses) {
2425 // 2 pass -> 5
2426 // 4 pass -> 7
2427 // 8 pass -> 11
2428 // 16 pass -> 19
2429 return NumPasses + 3;
2430}
2431
2432static int GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses) {
2433 // 2 pass -> 5
2434 // 4 pass -> 7
2435 // 8 pass -> 11
2436 // 16 pass -> 19
2437 return NumPasses + 3;
2438}
2439
2440static int GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses) {
2441 // 2 pass -> 4
2442 // 4 pass -> 6
2443 // 8 pass -> 10
2444 // 16 pass -> 18
2445 return NumPasses + 2;
2446}
2447
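// Check gfx90a/gfx940 hazards between MFMA/DOT instructions and a following
// VALU, VMEM/FLAT/DS or export instruction whose operands overlap the
// producer's result (RAW/WAW) or its src C (WAR), plus the DGEMM to
// V_FMA_F64/V_FMAC_F64 case.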
2448int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) {
2449 if (!ST.hasGFX90AInsts())
2450 return 0;
2451
2452 auto IsDGEMMFn = [](const MachineInstr &MI) -> bool {
2453 return isDGEMM(Opcode: MI.getOpcode());
2454 };
2455
2456 // This is checked in checkMAIHazards90A()
2457 if (SIInstrInfo::isMFMA(MI: *MI))
2458 return 0;
2459
2460 const MachineRegisterInfo &MRI = MF.getRegInfo();
2461
2462 int WaitStatesNeeded = 0;
2463
2464 bool IsMem = SIInstrInfo::isVMEM(MI: *MI) ||
2465 SIInstrInfo::isFLAT(MI: *MI) ||
2466 SIInstrInfo::isDS(MI: *MI);
2467 bool IsMemOrExport = IsMem || SIInstrInfo::isEXP(MI: *MI);
2468 bool IsVALU = SIInstrInfo::isVALU(MI: *MI);
2469
2470 const MachineInstr *MFMA = nullptr;
2471 unsigned Reg;
2472 auto IsMFMAWriteFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
2473 if (!SIInstrInfo::isMFMA(MI) ||
2474 !TRI.regsOverlap(RegA: MI.getOperand(i: 0).getReg(), RegB: Reg))
2475 return false;
2476 MFMA = &MI;
2477 return true;
2478 };
2479
2480 const MachineInstr *DOT = nullptr;
2481 auto IsDotWriteFn = [&Reg, &DOT, this](const MachineInstr &MI) {
2482 if (!SIInstrInfo::isDOT(MI) ||
2483 !TRI.regsOverlap(RegA: MI.getOperand(i: 0).getReg(), RegB: Reg))
2484 return false;
2485 DOT = &MI;
2486 return true;
2487 };
2488
2489 bool DGEMMAfterVALUWrite = false;
2490 auto IsDGEMMHazard = [&DGEMMAfterVALUWrite, this](const MachineInstr &MI) {
2491 // Found DGEMM on reverse traversal to def.
2492 if (isDGEMM(Opcode: MI.getOpcode()))
2493 DGEMMAfterVALUWrite = true;
2494
2495 // Only a hazard if the register is defined by a VALU and a DGEMM is found
2496 // after the def.
2497 if (!TII.isVALU(MI) || !DGEMMAfterVALUWrite)
2498 return false;
2499
2500 return true;
2501 };
2502
2503 int SrcCIdx = AMDGPU::getNamedOperandIdx(Opcode: MI->getOpcode(),
2504 NamedIdx: AMDGPU::OpName::src2);
2505
2506 if (IsMemOrExport || IsVALU) {
2507 const int SMFMA4x4WriteVgprVALUMemExpReadWaitStates = 5;
2508 const int SMFMA16x16WriteVgprVALUMemExpReadWaitStates = 11;
2509 const int SMFMA32x32WriteVgprVALUMemExpReadWaitStates = 19;
2510 const int DMFMA4x4WriteVgprMemExpReadWaitStates = 9;
2511 const int DMFMA16x16WriteVgprMemExpReadWaitStates = 18;
2512 const int DMFMA4x4WriteVgprVALUReadWaitStates = 6;
2513 const int DMFMA16x16WriteVgprVALUReadWaitStates = 11;
2514 const int DotWriteSameDotReadSrcAB = 3;
2515 const int DotWriteDifferentVALURead = 3;
2516 const int DMFMABetweenVALUWriteVMEMRead = 2;
2517 const int MaxWaitStates = 19;
2518
2519 for (const MachineOperand &Use : MI->explicit_uses()) {
2520 if (!Use.isReg())
2521 continue;
2522 Reg = Use.getReg();
2523
2524 DOT = nullptr;
2525 int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsHazardDef: IsDotWriteFn,
2526 Limit: MaxWaitStates);
2527 if (DOT) {
2528 int NeedWaitStates = 0;
2529 if (DOT->getOpcode() == MI->getOpcode()) {
2530 if (&Use - &MI->getOperand(i: 0) != SrcCIdx)
2531 NeedWaitStates = DotWriteSameDotReadSrcAB;
2532 } else {
2533 NeedWaitStates = DotWriteDifferentVALURead;
2534 }
2535
2536 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2537 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
2538 }
2539
2540 // Workaround for HW data hazard bug observed only in GFX90A. When there
2541 // is a DGEMM instruction in-between a VALU and a VMEM instruction, it
2542 // causes the SQ to incorrectly omit the two wait states between the two
2543 // instructions that are needed to avoid the data hazard.
2544 if (IsMem && ST.hasGFX90AInsts() && !ST.hasGFX940Insts()) {
2545 DGEMMAfterVALUWrite = false;
2546 if (TRI.isVectorRegister(MRI, Reg)) {
2547 int WaitStatesNeededForUse =
2548 DMFMABetweenVALUWriteVMEMRead -
2549 getWaitStatesSinceDef(Reg, IsHazardDef: IsDGEMMHazard,
2550 Limit: DMFMABetweenVALUWriteVMEMRead);
2551
2552 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
2553 }
2554 }
2555
2556 MFMA = nullptr;
2557 WaitStatesSinceDef =
2558 getWaitStatesSinceDef(Reg, IsHazardDef: IsMFMAWriteFn, Limit: MaxWaitStates);
2559 if (!MFMA)
2560 continue;
2561
2562 unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MI: MFMA);
2563 int NumPasses = HazardDefLatency;
2564 int NeedWaitStates = MaxWaitStates;
2565
2566 if (isDGEMM(Opcode: MFMA->getOpcode())) {
2567 switch (HazardDefLatency) {
2568 case 4:
2569 NeedWaitStates = IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates
2570 : DMFMA4x4WriteVgprVALUReadWaitStates;
2571 break;
2572 case 8:
2573 case 16:
2574 NeedWaitStates = IsMemOrExport
2575 ? DMFMA16x16WriteVgprMemExpReadWaitStates
2576 : DMFMA16x16WriteVgprVALUReadWaitStates;
2577 break;
2578 default:
2579 llvm_unreachable("unexpected dgemm");
2580 }
2581 } else if (ST.hasGFX940Insts()) {
2582 NeedWaitStates =
2583 isXDL(ST, MI: *MFMA)
2584 ? GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(NumPasses)
2585 : GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(
2586 NumPasses);
2587 } else {
2588 switch (HazardDefLatency) {
2589 case 2:
2590 NeedWaitStates = SMFMA4x4WriteVgprVALUMemExpReadWaitStates;
2591 break;
2592 case 8:
2593 NeedWaitStates = SMFMA16x16WriteVgprVALUMemExpReadWaitStates;
2594 break;
2595 case 16:
2596 NeedWaitStates = SMFMA32x32WriteVgprVALUMemExpReadWaitStates;
2597 break;
2598 default:
2599 llvm_unreachable("unexpected number of passes for mfma");
2600 }
2601 }
2602
2603 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2604 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
2605
2606 if (WaitStatesNeeded == MaxWaitStates)
2607 break;
2608 }
2609 }
2610
2611 unsigned Opc = MI->getOpcode();
2612 const int DMFMAToFMA64WaitStates = 2;
2613 if ((Opc == AMDGPU::V_FMA_F64_e64 ||
2614 Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64 ||
2615 Opc == AMDGPU::V_FMAC_F64_dpp) &&
2616 WaitStatesNeeded < DMFMAToFMA64WaitStates) {
2617 int WaitStatesNeededForUse = DMFMAToFMA64WaitStates -
2618 getWaitStatesSince(IsHazard: IsDGEMMFn, Limit: DMFMAToFMA64WaitStates);
2619 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
2620 }
2621
2622 if (!IsVALU && !IsMemOrExport)
2623 return WaitStatesNeeded;
2624
2625 for (const MachineOperand &Def : MI->defs()) {
2626 const int SMFMA4x4WriteVgprVALUWawWaitStates = 5;
2627 const int SMFMA16x16WriteVgprVALUWawWaitStates = 11;
2628 const int SMFMA32x32WriteVgprVALUWawWaitStates = 19;
2629 const int SMFMA4x4ReadVgprVALUWarWaitStates = 1;
2630 const int GFX940_XDL4PassReadVgprVALUWarWaitStates = 3;
2631 const int SMFMA16x16ReadVgprVALUWarWaitStates = 7;
2632 const int SMFMA32x32ReadVgprVALUWarWaitStates = 15;
2633 const int DMFMA4x4WriteVgprVALUWriteWaitStates = 6;
2634 const int DMFMA16x16WriteVgprVALUWriteWaitStates = 11;
2635 const int DotWriteDifferentVALUWrite = 3;
2636 const int MaxWaitStates = 19;
2637 const int MaxWarWaitStates = 15;
2638
2639 Reg = Def.getReg();
2640
2641 DOT = nullptr;
2642 int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsHazardDef: IsDotWriteFn,
2643 Limit: MaxWaitStates);
2644 if (DOT && DOT->getOpcode() != MI->getOpcode())
2645 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: DotWriteDifferentVALUWrite -
2646 WaitStatesSinceDef);
2647
2648 MFMA = nullptr;
2649 WaitStatesSinceDef =
2650 getWaitStatesSinceDef(Reg, IsHazardDef: IsMFMAWriteFn, Limit: MaxWaitStates);
2651 if (MFMA) {
2652 int NeedWaitStates = MaxWaitStates;
2653 int NumPasses = TSchedModel.computeInstrLatency(MI: MFMA);
2654
2655 if (isDGEMM(Opcode: MFMA->getOpcode())) {
2656 switch (NumPasses) {
2657 case 4:
2658 NeedWaitStates = DMFMA4x4WriteVgprVALUWriteWaitStates;
2659 break;
2660 case 8:
2661 case 16:
2662 NeedWaitStates = DMFMA16x16WriteVgprVALUWriteWaitStates;
2663 break;
2664 default:
2665 llvm_unreachable("unexpected number of cycles for dgemm");
2666 }
2667 } else if (ST.hasGFX940Insts()) {
2668 NeedWaitStates =
2669 isXDL(ST, MI: *MFMA)
2670 ? GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(NumPasses)
2671 : GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(NumPasses);
2672 } else {
2673 switch (NumPasses) {
2674 case 2:
2675 NeedWaitStates = SMFMA4x4WriteVgprVALUWawWaitStates;
2676 break;
2677 case 8:
2678 NeedWaitStates = SMFMA16x16WriteVgprVALUWawWaitStates;
2679 break;
2680 case 16:
2681 NeedWaitStates = SMFMA32x32WriteVgprVALUWawWaitStates;
2682 break;
2683 default:
2684 llvm_unreachable("Unexpected number of passes for mfma");
2685 }
2686 }
2687
2688 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2689 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
2690
2691 if (WaitStatesNeeded == MaxWaitStates)
2692 break;
2693 }
2694
2695 auto IsSMFMAReadAsCFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
2696 if (!SIInstrInfo::isMFMA(MI) || isDGEMM(Opcode: MI.getOpcode()) ||
2697 !MI.readsRegister(Reg, TRI: &TRI))
2698 return false;
2699
2700 if (ST.hasGFX940Insts() && !isXDL(ST, MI))
2701 return false;
2702
2703 const MachineOperand *SrcC =
2704 TII.getNamedOperand(MI, OpName: AMDGPU::OpName::src2);
2705 assert(SrcC);
2706 if (!SrcC->isReg() || !TRI.regsOverlap(RegA: SrcC->getReg(), RegB: Reg))
2707 return false;
2708
2709 MFMA = &MI;
2710 return true;
2711 };
2712
2713 MFMA = nullptr;
2714 int WaitStatesSinceUse = getWaitStatesSince(IsHazard: IsSMFMAReadAsCFn,
2715 Limit: MaxWarWaitStates);
2716 if (!MFMA)
2717 continue;
2718
2719 unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MI: MFMA);
2720 int NeedWaitStates = MaxWaitStates;
2721 switch (HazardDefLatency) {
2722 case 2: NeedWaitStates = SMFMA4x4ReadVgprVALUWarWaitStates;
2723 break;
2724 case 4: assert(ST.hasGFX940Insts());
2725 NeedWaitStates = GFX940_XDL4PassReadVgprVALUWarWaitStates;
2726 break;
2727 case 8: NeedWaitStates = SMFMA16x16ReadVgprVALUWarWaitStates;
2728 break;
2729 case 16: [[fallthrough]];
2730 default: NeedWaitStates = SMFMA32x32ReadVgprVALUWarWaitStates;
2731 break;
2732 }
2733
2734 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceUse;
2735 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
2736 }
2737
2738 return WaitStatesNeeded;
2739}
2740
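// Prefer scheduling another candidate when this instruction is an MFMA and a
// previous MFMA is still within its pipeline latency window.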
2741bool GCNHazardRecognizer::ShouldPreferAnother(SUnit *SU) {
2742 if (!SU->isInstr())
2743 return false;
2744
2745 const MachineInstr *MAI = nullptr;
2746
2747 auto IsMFMAFn = [&MAI](const MachineInstr &MI) {
2748 MAI = nullptr;
2749 if (SIInstrInfo::isMFMA(MI))
2750 MAI = &MI;
2751 return MAI != nullptr;
2752 };
2753
2754 MachineInstr *MI = SU->getInstr();
2755 if (IsMFMAFn(*MI)) {
2756 int W = getWaitStatesSince(IsHazard: IsMFMAFn, Limit: 16);
2757 if (MAI)
2758 return W < (int)TSchedModel.computeInstrLatency(MI: MAI);
2759 }
2760
2761 return false;
2762}
2763
2764bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
2765 if (!ST.hasVALUMaskWriteHazard())
2766 return false;
2767 assert(!ST.hasExtendedWaitCounts());
2768
2769 if (!ST.isWave64() || !SIInstrInfo::isSALU(MI: *MI))
2770 return false;
2771
2772 // The hazard sequence is three instructions:
2773 // 1. VALU reads SGPR as mask
2774 // 2. SALU writes SGPR
2775 // 3. SALU reads SGPR
2776 // The hazard can expire if the distance between 2 and 3 is sufficient.
2777 // In practice this happens <10% of the time, hence this always assumes
2778 // the hazard exists if 1 and 2 are present to avoid searching.
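  //
  // Illustrative wave64 example (register choices are arbitrary):
  //   v_cndmask_b32 v0, v1, v2, s[0:1]     ; 1. VALU reads s[0:1] as mask
  //   s_mov_b64     s[0:1], exec           ; 2. SALU writes s[0:1] (this MI)
  //   s_and_b64     s[2:3], s[0:1], s[4:5] ; 3. SALU reads s[0:1]
  // The fix appends s_waitcnt_depctr sa_sdst(0) after the SALU write.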
2779
2780 const MachineOperand *SDSTOp = TII.getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::sdst);
2781 if (!SDSTOp || !SDSTOp->isReg())
2782 return false;
2783
2784 const Register HazardReg = SDSTOp->getReg();
2785 if (HazardReg == AMDGPU::EXEC ||
2786 HazardReg == AMDGPU::EXEC_LO ||
2787 HazardReg == AMDGPU::EXEC_HI ||
2788 HazardReg == AMDGPU::M0)
2789 return false;
2790
2791 auto IsHazardFn = [HazardReg, this](const MachineInstr &I) {
2792 switch (I.getOpcode()) {
2793 case AMDGPU::V_ADDC_U32_e32:
2794 case AMDGPU::V_ADDC_U32_dpp:
2795 case AMDGPU::V_CNDMASK_B16_e32:
2796 case AMDGPU::V_CNDMASK_B16_dpp:
2797 case AMDGPU::V_CNDMASK_B32_e32:
2798 case AMDGPU::V_CNDMASK_B32_dpp:
2799 case AMDGPU::V_DIV_FMAS_F32_e64:
2800 case AMDGPU::V_DIV_FMAS_F64_e64:
2801 case AMDGPU::V_SUBB_U32_e32:
2802 case AMDGPU::V_SUBB_U32_dpp:
2803 case AMDGPU::V_SUBBREV_U32_e32:
2804 case AMDGPU::V_SUBBREV_U32_dpp:
2805 // These implicitly read VCC as mask source.
2806 return HazardReg == AMDGPU::VCC ||
2807 HazardReg == AMDGPU::VCC_LO ||
2808 HazardReg == AMDGPU::VCC_HI;
2809 case AMDGPU::V_ADDC_U32_e64:
2810 case AMDGPU::V_ADDC_U32_e64_dpp:
2811 case AMDGPU::V_CNDMASK_B16_e64:
2812 case AMDGPU::V_CNDMASK_B16_e64_dpp:
2813 case AMDGPU::V_CNDMASK_B32_e64:
2814 case AMDGPU::V_CNDMASK_B32_e64_dpp:
2815 case AMDGPU::V_SUBB_U32_e64:
2816 case AMDGPU::V_SUBB_U32_e64_dpp:
2817 case AMDGPU::V_SUBBREV_U32_e64:
2818 case AMDGPU::V_SUBBREV_U32_e64_dpp: {
2819 // Only check mask register overlaps.
2820 const MachineOperand *SSRCOp = TII.getNamedOperand(MI: I, OpName: AMDGPU::OpName::src2);
2821 assert(SSRCOp);
2822 return TRI.regsOverlap(RegA: SSRCOp->getReg(), RegB: HazardReg);
2823 }
2824 default:
2825 return false;
2826 }
2827 };
2828
2829 const MachineRegisterInfo &MRI = MF.getRegInfo();
2830 auto IsExpiredFn = [&MRI, this](const MachineInstr &I, int) {
2831 // s_waitcnt_depctr sa_sdst(0) mitigates hazard.
2832 if (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
2833 AMDGPU::DepCtr::decodeFieldSaSdst(Encoded: I.getOperand(i: 0).getImm()) == 0)
2834 return true;
2835
2836 // VALU access to any SGPR or literal constant other than HazardReg
2837 // mitigates hazard. No need to check HazardReg here as this will
2838 // only be called when !IsHazardFn.
2839 if (!SIInstrInfo::isVALU(MI: I))
2840 return false;
2841 for (int OpNo = 0, End = I.getNumOperands(); OpNo < End; ++OpNo) {
2842 const MachineOperand &Op = I.getOperand(i: OpNo);
2843 if (Op.isReg()) {
2844 Register OpReg = Op.getReg();
2845 // Only consider uses
2846 if (!Op.isUse())
2847 continue;
2848 // Ignore EXEC
2849 if (OpReg == AMDGPU::EXEC ||
2850 OpReg == AMDGPU::EXEC_LO ||
2851 OpReg == AMDGPU::EXEC_HI)
2852 continue;
2853 // Ignore all implicit uses except VCC
2854 if (Op.isImplicit()) {
2855 if (OpReg == AMDGPU::VCC ||
2856 OpReg == AMDGPU::VCC_LO ||
2857 OpReg == AMDGPU::VCC_HI)
2858 return true;
2859 continue;
2860 }
2861 if (TRI.isSGPRReg(MRI, Reg: OpReg))
2862 return true;
2863 } else {
2864 const MCInstrDesc &InstDesc = I.getDesc();
2865 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
2866 if (!TII.isInlineConstant(MO: Op, OpInfo))
2867 return true;
2868 }
2869 }
2870 return false;
2871 };
2872
2873 // Check for hazard
2874 if (::getWaitStatesSince(IsHazard: IsHazardFn, MI, IsExpired: IsExpiredFn) ==
2875 std::numeric_limits<int>::max())
2876 return false;
2877
2878 auto NextMI = std::next(x: MI->getIterator());
2879
2880 // Add s_waitcnt_depctr sa_sdst(0) after SALU write.
2881 BuildMI(BB&: *MI->getParent(), I: NextMI, MIMD: MI->getDebugLoc(),
2882 MCID: TII.get(Opcode: AMDGPU::S_WAITCNT_DEPCTR))
2883 .addImm(Val: AMDGPU::DepCtr::encodeFieldSaSdst(SaSdst: 0));
2884
2885 // SALU write may be s_getpc in a bundle.
2886 if (MI->getOpcode() == AMDGPU::S_GETPC_B64) {
2887 // Update offsets of any references in the bundle.
2888 while (NextMI != MI->getParent()->end() &&
2889 NextMI->isBundledWithPred()) {
2890 for (auto &Operand : NextMI->operands()) {
2891 if (Operand.isGlobal())
2892 Operand.setOffset(Operand.getOffset() + 4);
2893 }
2894 NextMI++;
2895 }
2896 }
2897
2898 return true;
2899}
2900
2901static bool ensureEntrySetPrio(MachineFunction *MF, int Priority,
2902 const SIInstrInfo &TII) {
2903 MachineBasicBlock &EntryMBB = MF->front();
2904 if (EntryMBB.begin() != EntryMBB.end()) {
2905 auto &EntryMI = *EntryMBB.begin();
2906 if (EntryMI.getOpcode() == AMDGPU::S_SETPRIO &&
2907 EntryMI.getOperand(i: 0).getImm() >= Priority)
2908 return false;
2909 }
2910
2911 BuildMI(BB&: EntryMBB, I: EntryMBB.begin(), MIMD: DebugLoc(), MCID: TII.get(Opcode: AMDGPU::S_SETPRIO))
2912 .addImm(Val: Priority);
2913 return true;
2914}
2915
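// On targets that require exports to run at raised priority, ensure the shader
// entry sets normal priority and, after the last export in a sequence, drop to
// priority 0, wait for expcnt, insert two s_nops, then restore normal priority
// (unless the shader is about to end).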
2916bool GCNHazardRecognizer::fixRequiredExportPriority(MachineInstr *MI) {
2917 if (!ST.hasRequiredExportPriority())
2918 return false;
2919
2920 // Assume the following shader types will never have exports,
2921 // and avoid adding or adjusting S_SETPRIO.
2922 MachineBasicBlock *MBB = MI->getParent();
2923 MachineFunction *MF = MBB->getParent();
2924 auto CC = MF->getFunction().getCallingConv();
2925 switch (CC) {
2926 case CallingConv::AMDGPU_CS:
2927 case CallingConv::AMDGPU_CS_Chain:
2928 case CallingConv::AMDGPU_CS_ChainPreserve:
2929 case CallingConv::AMDGPU_KERNEL:
2930 return false;
2931 default:
2932 break;
2933 }
2934
2935 const int MaxPriority = 3;
2936 const int NormalPriority = 2;
2937 const int PostExportPriority = 0;
2938
2939 auto It = MI->getIterator();
2940 switch (MI->getOpcode()) {
2941 case AMDGPU::S_ENDPGM:
2942 case AMDGPU::S_ENDPGM_SAVED:
2943 case AMDGPU::S_ENDPGM_ORDERED_PS_DONE:
2944 case AMDGPU::SI_RETURN_TO_EPILOG:
2945 // Ensure a shader with calls raises priority at entry.
2946 // This ensures correct priority if exports exist in a callee.
2947 if (MF->getFrameInfo().hasCalls())
2948 return ensureEntrySetPrio(MF, Priority: NormalPriority, TII);
2949 return false;
2950 case AMDGPU::S_SETPRIO: {
2951 // Raise to at least normal priority unless part of the workaround sequence.
2952 auto &PrioOp = MI->getOperand(i: 0);
2953 int Prio = PrioOp.getImm();
2954 bool InWA = (Prio == PostExportPriority) &&
2955 (It != MBB->begin() && TII.isEXP(MI: *std::prev(x: It)));
2956 if (InWA || Prio >= NormalPriority)
2957 return false;
2958 PrioOp.setImm(std::min(a: Prio + NormalPriority, b: MaxPriority));
2959 return true;
2960 }
2961 default:
2962 if (!TII.isEXP(MI: *MI))
2963 return false;
2964 break;
2965 }
2966
2967 // Check entry priority at each export (as there will only be a few).
2968 // Note: amdgpu_gfx can only be a callee, so defer to caller setprio.
2969 bool Changed = false;
2970 if (CC != CallingConv::AMDGPU_Gfx)
2971 Changed = ensureEntrySetPrio(MF, Priority: NormalPriority, TII);
2972
2973 auto NextMI = std::next(x: It);
2974 bool EndOfShader = false;
2975 if (NextMI != MBB->end()) {
2976 // Only need WA at end of sequence of exports.
2977 if (TII.isEXP(MI: *NextMI))
2978 return Changed;
2979 // Assume appropriate S_SETPRIO after export means WA already applied.
2980 if (NextMI->getOpcode() == AMDGPU::S_SETPRIO &&
2981 NextMI->getOperand(i: 0).getImm() == PostExportPriority)
2982 return Changed;
2983 EndOfShader = NextMI->getOpcode() == AMDGPU::S_ENDPGM;
2984 }
2985
2986 const DebugLoc &DL = MI->getDebugLoc();
2987
2988 // Lower priority.
2989 BuildMI(BB&: *MBB, I: NextMI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_SETPRIO))
2990 .addImm(Val: PostExportPriority);
2991
2992 if (!EndOfShader) {
2993 // Wait for exports to complete.
2994 BuildMI(BB&: *MBB, I: NextMI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_WAITCNT_EXPCNT))
2995 .addReg(RegNo: AMDGPU::SGPR_NULL)
2996 .addImm(Val: 0);
2997 }
2998
2999 BuildMI(BB&: *MBB, I: NextMI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_NOP)).addImm(Val: 0);
3000 BuildMI(BB&: *MBB, I: NextMI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_NOP)).addImm(Val: 0);
3001
3002 if (!EndOfShader) {
3003 // Return to normal (higher) priority.
3004 BuildMI(BB&: *MBB, I: NextMI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_SETPRIO))
3005 .addImm(Val: NormalPriority);
3006 }
3007
3008 return true;
3009}
3010