//===-- GCNHazardRecognizers.cpp - GCN Hazard Recognizer Impls ------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements hazard recognizers for scheduling on GCN processors.
//
//===----------------------------------------------------------------------===//

#include "GCNHazardRecognizer.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/TargetParser/TargetParser.h"

using namespace llvm;

namespace {

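// Command-line parser for the MFMA padding ratio option: accepts an unsigned
// value and rejects anything outside the range [0, 100].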
struct MFMAPaddingRatioParser : public cl::parser<unsigned> {
  MFMAPaddingRatioParser(cl::Option &O) : cl::parser<unsigned>(O) {}

  bool parse(cl::Option &O, StringRef ArgName, StringRef Arg, unsigned &Value) {
    if (Arg.getAsInteger(0, Value))
      return O.error("'" + Arg + "' value invalid for uint argument!");

    if (Value > 100)
      return O.error("'" + Arg + "' value must be in the range [0, 100]!");

    return false;
  }
};

} // end anonymous namespace

static cl::opt<unsigned, false, MFMAPaddingRatioParser>
    MFMAPaddingRatio("amdgpu-mfma-padding-ratio", cl::init(0), cl::Hidden,
                     cl::desc("Fill a percentage of the latency between "
                              "neighboring MFMA with s_nops."));

// This is intended for debugging purposes only.
static cl::opt<unsigned>
    NopPadding("amdgpu-snop-padding", cl::init(0), cl::Hidden,
               cl::desc("Insert an s_nop x before every instruction"));

//===----------------------------------------------------------------------===//
// Hazard Recognizer Implementation
//===----------------------------------------------------------------------===//

static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
                                                 const GCNSubtarget &ST);

GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF)
    : IsHazardRecognizerMode(false), CurrCycleInstr(nullptr), MF(MF),
      ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()),
      TRI(TII.getRegisterInfo()), TSchedModel(TII.getSchedModel()),
      ClauseUses(TRI.getNumRegUnits()), ClauseDefs(TRI.getNumRegUnits()) {
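  // Functions that use AGPRs are assumed to contain MFMA code, whose hazards
  // can require many more wait states, so use a deeper look-ahead window for
  // them.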
  MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 19 : 5;
  RunLdsBranchVmemWARHazardFixup = shouldRunLdsBranchVmemWARHazardFixup(MF, ST);
}

void GCNHazardRecognizer::Reset() {
  EmittedInstrs.clear();
}

void GCNHazardRecognizer::EmitInstruction(SUnit *SU) {
  EmitInstruction(SU->getInstr());
}

void GCNHazardRecognizer::EmitInstruction(MachineInstr *MI) {
  CurrCycleInstr = MI;
}

static bool isDivFMas(unsigned Opcode) {
  return Opcode == AMDGPU::V_DIV_FMAS_F32_e64 ||
         Opcode == AMDGPU::V_DIV_FMAS_F64_e64;
}

static bool isSGetReg(unsigned Opcode) {
  return Opcode == AMDGPU::S_GETREG_B32;
}

static bool isSSetReg(unsigned Opcode) {
  switch (Opcode) {
  case AMDGPU::S_SETREG_B32:
  case AMDGPU::S_SETREG_B32_mode:
  case AMDGPU::S_SETREG_IMM32_B32:
  case AMDGPU::S_SETREG_IMM32_B32_mode:
    return true;
  }
  return false;
}

static bool isRWLane(unsigned Opcode) {
  return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;
}

static bool isRFE(unsigned Opcode) {
  return Opcode == AMDGPU::S_RFE_B64;
}

static bool isSMovRel(unsigned Opcode) {
  switch (Opcode) {
  case AMDGPU::S_MOVRELS_B32:
  case AMDGPU::S_MOVRELS_B64:
  case AMDGPU::S_MOVRELD_B32:
  case AMDGPU::S_MOVRELD_B64:
    return true;
  default:
    return false;
  }
}

static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII,
                                    const MachineInstr &MI) {
  if (TII.isAlwaysGDS(MI.getOpcode()))
    return true;

  switch (MI.getOpcode()) {
  case AMDGPU::S_SENDMSG:
  case AMDGPU::S_SENDMSGHALT:
  case AMDGPU::S_TTRACEDATA:
    return true;
  // These DS opcodes don't support GDS.
  case AMDGPU::DS_NOP:
  case AMDGPU::DS_PERMUTE_B32:
  case AMDGPU::DS_BPERMUTE_B32:
    return false;
  default:
    if (TII.isDS(MI.getOpcode())) {
      int GDS = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                           AMDGPU::OpName::gds);
      if (MI.getOperand(GDS).getImm())
        return true;
    }
    return false;
  }
}

static bool isPermlane(const MachineInstr &MI) {
  unsigned Opcode = MI.getOpcode();
  return Opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANE64_B32 ||
         Opcode == AMDGPU::V_PERMLANEX16_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANE16_VAR_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANEX16_VAR_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e32 ||
         Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e32 ||
         Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e64;
}

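// LDS DMA operations (direct loads from memory into LDS) are encoded as MUBUF
// or FLAT instructions that additionally carry the VALU flag.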
static bool isLdsDma(const MachineInstr &MI) {
  return SIInstrInfo::isVALU(MI) &&
         (SIInstrInfo::isMUBUF(MI) || SIInstrInfo::isFLAT(MI));
}

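// Decode the hardware register id from the simm16 operand of an
// s_getreg/s_setreg instruction.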
static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
  const MachineOperand *RegOp = TII->getNamedOperand(RegInstr,
                                                     AMDGPU::OpName::simm16);
  return std::get<0>(AMDGPU::Hwreg::HwregEncoding::decode(RegOp->getImm()));
}

ScheduleHazardRecognizer::HazardType
GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
  MachineInstr *MI = SU->getInstr();
  // If we are not in "HazardRecognizerMode" and therefore not being run from
  // the scheduler, track possible stalls from hazards but don't insert noops.
  auto HazardType = IsHazardRecognizerMode ? NoopHazard : Hazard;

  if (MI->isBundle())
    return NoHazard;

  if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0)
    return HazardType;

  if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0)
    return HazardType;

  if (checkFPAtomicToDenormModeHazard(MI) > 0)
    return HazardType;

  if (ST.hasNoDataDepHazard())
    return NoHazard;

  if (SIInstrInfo::isVMEM(*MI) && checkVMEMHazards(MI) > 0)
    return HazardType;

  if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0)
    return HazardType;

  if (SIInstrInfo::isDPP(*MI) && checkDPPHazards(MI) > 0)
    return HazardType;

  if (isDivFMas(MI->getOpcode()) && checkDivFMasHazards(MI) > 0)
    return HazardType;

  if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0)
    return HazardType;

  if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) ||
       SIInstrInfo::isDS(*MI) || SIInstrInfo::isEXP(*MI)) &&
      checkMAIVALUHazards(MI) > 0)
    return HazardType;

  if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0)
    return HazardType;

  if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0)
    return HazardType;

  if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0)
    return HazardType;

  if (((ST.hasReadM0MovRelInterpHazard() &&
        (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()) ||
         MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
         MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
       (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) ||
       (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) ||
       (ST.hasReadM0LdsDirectHazard() &&
        MI->readsRegister(AMDGPU::LDS_DIRECT, /*TRI=*/nullptr))) &&
      checkReadM0Hazards(MI) > 0)
    return HazardType;

  if (SIInstrInfo::isMAI(*MI) && checkMAIHazards(MI) > 0)
    return HazardType;

  if ((SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isDS(*MI)) &&
      checkMAILdStHazards(MI) > 0)
    return HazardType;

  if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0)
    return HazardType;

  return NoHazard;
}

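// Emit s_nop instructions covering \p Quantity wait states before \p MI; each
// s_nop covers at most eight wait states (its immediate is the count minus
// one).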
static void insertNoopsInBundle(MachineInstr *MI, const SIInstrInfo &TII,
                                unsigned Quantity) {
  while (Quantity > 0) {
    unsigned Arg = std::min(Quantity, 8u);
    Quantity -= Arg;
    BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP))
        .addImm(Arg - 1);
  }
}

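// Return the number of wait states implied by the MFMA pipeline of \p MI,
// i.e. the ReleaseAtCycle of its first write resource in the scheduling model.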
unsigned
GCNHazardRecognizer::getMFMAPipelineWaitStates(const MachineInstr &MI) const {
  const MCSchedClassDesc *SC = TSchedModel.resolveSchedClass(&MI);
  assert(TSchedModel.getWriteProcResBegin(SC) !=
         TSchedModel.getWriteProcResEnd(SC));
  return TSchedModel.getWriteProcResBegin(SC)->ReleaseAtCycle;
}

void GCNHazardRecognizer::processBundle() {
  MachineBasicBlock::instr_iterator MI = std::next(CurrCycleInstr->getIterator());
  MachineBasicBlock::instr_iterator E = CurrCycleInstr->getParent()->instr_end();
  // Check bundled MachineInstr's for hazards.
  for (; MI != E && MI->isInsideBundle(); ++MI) {
    CurrCycleInstr = &*MI;
    unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr);

    if (IsHazardRecognizerMode) {
      fixHazards(CurrCycleInstr);

      insertNoopsInBundle(CurrCycleInstr, TII, WaitStates);
    }

    // It's unnecessary to track more than MaxLookAhead instructions. Since we
    // include the bundled MI directly after, only add a maximum of
    // (MaxLookAhead - 1) noops to EmittedInstrs.
    for (unsigned i = 0, e = std::min(WaitStates, MaxLookAhead - 1); i < e; ++i)
      EmittedInstrs.push_front(nullptr);

    EmittedInstrs.push_front(CurrCycleInstr);
    EmittedInstrs.resize(MaxLookAhead);
  }
  CurrCycleInstr = nullptr;
}

void GCNHazardRecognizer::runOnInstruction(MachineInstr *MI) {
  assert(IsHazardRecognizerMode);

  unsigned NumPreNoops = PreEmitNoops(MI);
  EmitNoops(NumPreNoops);
  if (MI->isInsideBundle())
    insertNoopsInBundle(MI, TII, NumPreNoops);
  else
    TII.insertNoops(*MI->getParent(), MachineBasicBlock::iterator(MI),
                    NumPreNoops);
  EmitInstruction(MI);
  AdvanceCycle();
}

unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
  IsHazardRecognizerMode = true;
  CurrCycleInstr = MI;
  unsigned W = PreEmitNoopsCommon(MI);
  fixHazards(MI);
  CurrCycleInstr = nullptr;
  return std::max(W, NopPadding.getValue());
}

unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
  if (MI->isBundle())
    return 0;

  int WaitStates = 0;

  if (SIInstrInfo::isSMRD(*MI))
    return std::max(WaitStates, checkSMRDHazards(MI));

  if (ST.hasNSAtoVMEMBug())
    WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI));

  WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(MI));

  if (ST.hasNoDataDepHazard())
    return WaitStates;

  if (SIInstrInfo::isVMEM(*MI))
    WaitStates = std::max(WaitStates, checkVMEMHazards(MI));

  if (SIInstrInfo::isVALU(*MI))
    WaitStates = std::max(WaitStates, checkVALUHazards(MI));

  if (SIInstrInfo::isDPP(*MI))
    WaitStates = std::max(WaitStates, checkDPPHazards(MI));

  if (isDivFMas(MI->getOpcode()))
    WaitStates = std::max(WaitStates, checkDivFMasHazards(MI));

  if (isRWLane(MI->getOpcode()))
    WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));

  if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) ||
       SIInstrInfo::isDS(*MI) || SIInstrInfo::isEXP(*MI)) &&
      checkMAIVALUHazards(MI) > 0)
    WaitStates = std::max(WaitStates, checkMAIVALUHazards(MI));

  if (MI->isInlineAsm())
    return std::max(WaitStates, checkInlineAsmHazards(MI));

  if (isSGetReg(MI->getOpcode()))
    return std::max(WaitStates, checkGetRegHazards(MI));

  if (isSSetReg(MI->getOpcode()))
    return std::max(WaitStates, checkSetRegHazards(MI));

  if (isRFE(MI->getOpcode()))
    return std::max(WaitStates, checkRFEHazards(MI));

  if ((ST.hasReadM0MovRelInterpHazard() &&
       (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()) ||
        MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
        MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
      (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) ||
      (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) ||
      (ST.hasReadM0LdsDirectHazard() &&
       MI->readsRegister(AMDGPU::LDS_DIRECT, /*TRI=*/nullptr)))
    return std::max(WaitStates, checkReadM0Hazards(MI));

  if (SIInstrInfo::isMAI(*MI))
    return std::max(WaitStates, checkMAIHazards(MI));

  if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isDS(*MI))
    return std::max(WaitStates, checkMAILdStHazards(MI));

  if (ST.hasGFX950Insts() && isPermlane(*MI))
    return std::max(WaitStates, checkPermlaneHazards(MI));

  return WaitStates;
}

void GCNHazardRecognizer::EmitNoop() {
  EmittedInstrs.push_front(nullptr);
}

void GCNHazardRecognizer::AdvanceCycle() {
  // When the scheduler detects a stall, it will call AdvanceCycle() without
  // emitting any instructions.
  if (!CurrCycleInstr) {
    EmittedInstrs.push_front(nullptr);
    return;
  }

  if (CurrCycleInstr->isBundle()) {
    processBundle();
    return;
  }

  unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr);
  if (!NumWaitStates) {
    CurrCycleInstr = nullptr;
    return;
  }

  // Keep track of emitted instructions
  EmittedInstrs.push_front(CurrCycleInstr);

  // Add a nullptr for each additional wait state after the first. Make sure
  // not to add more than getMaxLookAhead() items to the list, since we
  // truncate the list to that size right after this loop.
  for (unsigned i = 1, e = std::min(NumWaitStates, getMaxLookAhead());
       i < e; ++i) {
    EmittedInstrs.push_front(nullptr);
  }

  // getMaxLookahead() is the largest number of wait states we will ever need
  // to insert, so there is no point in keeping track of more than that many
  // wait states.
  EmittedInstrs.resize(getMaxLookAhead());

  CurrCycleInstr = nullptr;
}

void GCNHazardRecognizer::RecedeCycle() {
  assert(!IsHazardRecognizerMode &&
         "Bottom-up scheduling shouldn't run in hazard recognizer mode");
}

//===----------------------------------------------------------------------===//
// Helper Functions
//===----------------------------------------------------------------------===//

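// Result of a per-instruction hazard query used by hasHazard(): either a
// hazard was found, the search window expired, or the scan should continue.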
using HazardFnResult = enum { HazardFound, HazardExpired, NoHazardFound };

using IsExpiredFn = function_ref<bool(const MachineInstr &, int WaitStates)>;
using GetNumWaitStatesFn = function_ref<unsigned int(const MachineInstr &)>;

// Search for a hazard in a block and its predecessors.
template <typename StateT>
static bool
hasHazard(StateT State,
          function_ref<HazardFnResult(StateT &, const MachineInstr &)> IsHazard,
          function_ref<void(StateT &, const MachineInstr &)> UpdateState,
          const MachineBasicBlock *MBB,
          MachineBasicBlock::const_reverse_instr_iterator I,
          DenseSet<const MachineBasicBlock *> &Visited) {
  for (auto E = MBB->instr_rend(); I != E; ++I) {
    // No need to look at parent BUNDLE instructions.
    if (I->isBundle())
      continue;

    switch (IsHazard(State, *I)) {
    case HazardFound:
      return true;
    case HazardExpired:
      return false;
    default:
      // Continue search
      break;
    }

    if (I->isInlineAsm() || I->isMetaInstruction())
      continue;

    UpdateState(State, *I);
  }

  for (MachineBasicBlock *Pred : MBB->predecessors()) {
    if (!Visited.insert(Pred).second)
      continue;

    if (hasHazard(State, IsHazard, UpdateState, Pred, Pred->instr_rbegin(),
                  Visited))
      return true;
  }

  return false;
}

// Returns the minimum number of wait states since \p I, walking all
// predecessors. Only scans until \p IsExpired returns true.
// Can only be run in hazard recognizer mode.
static int getWaitStatesSince(
    GCNHazardRecognizer::IsHazardFn IsHazard, const MachineBasicBlock *MBB,
    MachineBasicBlock::const_reverse_instr_iterator I, int WaitStates,
    IsExpiredFn IsExpired, DenseSet<const MachineBasicBlock *> &Visited,
    GetNumWaitStatesFn GetNumWaitStates = SIInstrInfo::getNumWaitStates) {
  for (auto E = MBB->instr_rend(); I != E; ++I) {
    // Don't add WaitStates for parent BUNDLE instructions.
    if (I->isBundle())
      continue;

    if (IsHazard(*I))
      return WaitStates;

    if (I->isInlineAsm())
      continue;

    WaitStates += GetNumWaitStates(*I);

    if (IsExpired(*I, WaitStates))
      return std::numeric_limits<int>::max();
  }

  int MinWaitStates = std::numeric_limits<int>::max();
  for (MachineBasicBlock *Pred : MBB->predecessors()) {
    if (!Visited.insert(Pred).second)
      continue;

    int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(), WaitStates,
                               IsExpired, Visited, GetNumWaitStates);

    MinWaitStates = std::min(MinWaitStates, W);
  }

  return MinWaitStates;
}

static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
                              const MachineInstr *MI, IsExpiredFn IsExpired) {
  DenseSet<const MachineBasicBlock *> Visited;
  return getWaitStatesSince(IsHazard, MI->getParent(),
                            std::next(MI->getReverseIterator()),
                            0, IsExpired, Visited);
}

int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
  if (IsHazardRecognizerMode) {
    auto IsExpiredFn = [Limit](const MachineInstr &, int WaitStates) {
      return WaitStates >= Limit;
    };
    return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn);
  }

  int WaitStates = 0;
  for (MachineInstr *MI : EmittedInstrs) {
    if (MI) {
      if (IsHazard(*MI))
        return WaitStates;

      if (MI->isInlineAsm())
        continue;
    }
    ++WaitStates;

    if (WaitStates >= Limit)
      break;
  }
  return std::numeric_limits<int>::max();
}

int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
                                               IsHazardFn IsHazardDef,
                                               int Limit) {
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  auto IsHazardFn = [IsHazardDef, TRI, Reg](const MachineInstr &MI) {
    return IsHazardDef(MI) && MI.modifiesRegister(Reg, TRI);
  };

  return getWaitStatesSince(IsHazardFn, Limit);
}

int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
                                                  int Limit) {
  auto IsHazardFn = [IsHazard](const MachineInstr &MI) {
    return isSSetReg(MI.getOpcode()) && IsHazard(MI);
  };

  return getWaitStatesSince(IsHazardFn, Limit);
}

//===----------------------------------------------------------------------===//
// No-op Hazard Detection
//===----------------------------------------------------------------------===//

static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV,
                        MCRegister Reg) {
  for (MCRegUnit Unit : TRI.regunits(Reg))
    BV.set(Unit);
}

static void addRegsToSet(const SIRegisterInfo &TRI,
                         iterator_range<MachineInstr::const_mop_iterator> Ops,
                         BitVector &DefSet, BitVector &UseSet) {
  for (const MachineOperand &Op : Ops) {
    if (Op.isReg())
      addRegUnits(TRI, Op.isDef() ? DefSet : UseSet, Op.getReg().asMCReg());
  }
}

void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) {
  addRegsToSet(TRI, MI.operands(), ClauseDefs, ClauseUses);
}

static bool breaksSMEMSoftClause(MachineInstr *MI) {
  return !SIInstrInfo::isSMRD(*MI);
}

static bool breaksVMEMSoftClause(MachineInstr *MI) {
  return !SIInstrInfo::isVMEM(*MI);
}

int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
  // SMEM soft clauses are only present on VI+, and only matter if XNACK is
  // enabled.
  if (!ST.isXNACKEnabled())
    return 0;

  bool IsSMRD = TII.isSMRD(*MEM);

  resetClause();

  // A soft-clause is any group of consecutive SMEM instructions. The
  // instructions in this group may return out of order and/or may be
  // replayed (i.e. the same instruction issued more than once).
  //
  // In order to handle these situations correctly we need to make sure that
  // when a clause has more than one instruction, no instruction in the clause
  // writes to a register that is read by another instruction in the clause
  // (including itself). If we encounter this situation, we need to break the
  // clause by inserting a non SMEM instruction.

  for (MachineInstr *MI : EmittedInstrs) {
    // When we hit a non-SMEM instruction then we have passed the start of the
    // clause and we can stop.
    if (!MI)
      break;

    if (IsSMRD ? breaksSMEMSoftClause(MI) : breaksVMEMSoftClause(MI))
      break;

    addClauseInst(*MI);
  }

  if (ClauseDefs.none())
    return 0;

  // We need to make sure not to put loads and stores in the same clause if they
  // use the same address. For now, just start a new clause whenever we see a
  // store.
  if (MEM->mayStore())
    return 1;

  addClauseInst(*MEM);

  // If the set of defs and uses intersect then we cannot add this instruction
  // to the clause, so we have a hazard.
  return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0;
}

int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
  int WaitStatesNeeded = 0;

  WaitStatesNeeded = checkSoftClauseHazards(SMRD);

  // This SMRD hazard only affects SI.
  if (!ST.hasSMRDReadVALUDefHazard())
    return WaitStatesNeeded;

  // A read of an SGPR by an SMRD instruction requires 4 wait states when the
  // SGPR was written by a VALU instruction.
  int SmrdSgprWaitStates = 4;
  auto IsHazardDefFn = [this](const MachineInstr &MI) {
    return TII.isVALU(MI);
  };
  auto IsBufferHazardDefFn = [this](const MachineInstr &MI) {
    return TII.isSALU(MI);
  };

  bool IsBufferSMRD = TII.isBufferSMRD(*SMRD);

  for (const MachineOperand &Use : SMRD->uses()) {
    if (!Use.isReg())
      continue;
    int WaitStatesNeededForUse =
        SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
                                                   SmrdSgprWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    // This fixes what appears to be undocumented hardware behavior in SI where
    // s_mov writing a descriptor and s_buffer_load_dword reading the descriptor
    // needs some number of nops in between. We don't know how many we need, but
    // let's use 4. This wasn't discovered before probably because the only
    // case when this happens is when we expand a 64-bit pointer into a full
    // descriptor and use s_buffer_load_dword instead of s_load_dword, which was
    // probably never encountered in the closed-source land.
    if (IsBufferSMRD) {
      int WaitStatesNeededForUse =
          SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
                                                     IsBufferHazardDefFn,
                                                     SmrdSgprWaitStates);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
    }
  }

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkVMEMHazards(MachineInstr *VMEM) {
  if (!ST.hasVMEMReadSGPRVALUDefHazard())
    return 0;

  int WaitStatesNeeded = checkSoftClauseHazards(VMEM);

  // A read of an SGPR by a VMEM instruction requires 5 wait states when the
  // SGPR was written by a VALU instruction.
  const int VmemSgprWaitStates = 5;
  auto IsHazardDefFn = [this](const MachineInstr &MI) {
    return TII.isVALU(MI);
  };
  for (const MachineOperand &Use : VMEM->uses()) {
    if (!Use.isReg() || TRI.isVectorRegister(MF.getRegInfo(), Use.getReg()))
      continue;

    int WaitStatesNeededForUse =
        VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
                                                   VmemSgprWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }
  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) {
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();

  // Check for DPP VGPR read after VALU VGPR write and EXEC write.
  int DppVgprWaitStates = 2;
  int DppExecWaitStates = 5;
  int WaitStatesNeeded = 0;
  auto IsHazardDefFn = [TII](const MachineInstr &MI) {
    return TII->isVALU(MI);
  };

  for (const MachineOperand &Use : DPP->uses()) {
    if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
      continue;
    int WaitStatesNeededForUse =
        DppVgprWaitStates - getWaitStatesSinceDef(
                                Use.getReg(),
                                [](const MachineInstr &) { return true; },
                                DppVgprWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  WaitStatesNeeded = std::max(
      WaitStatesNeeded,
      DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,
                                                DppExecWaitStates));

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) {
  const SIInstrInfo *TII = ST.getInstrInfo();

  // v_div_fmas requires 4 wait states after a write to vcc from a VALU
  // instruction.
  const int DivFMasWaitStates = 4;
  auto IsHazardDefFn = [TII](const MachineInstr &MI) {
    return TII->isVALU(MI);
  };
  int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
                                               DivFMasWaitStates);

  return DivFMasWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned GetRegHWReg = getHWReg(TII, *GetRegInstr);

  const int GetRegWaitStates = 2;
  auto IsHazardFn = [TII, GetRegHWReg](const MachineInstr &MI) {
    return GetRegHWReg == getHWReg(TII, MI);
  };
  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates);

  return GetRegWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned HWReg = getHWReg(TII, *SetRegInstr);

  const int SetRegWaitStates = ST.getSetRegWaitStates();
  auto IsHazardFn = [TII, HWReg](const MachineInstr &MI) {
    return HWReg == getHWReg(TII, MI);
  };
  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates);
  return SetRegWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {
  if (!MI.mayStore())
    return -1;

  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned Opcode = MI.getOpcode();
  const MCInstrDesc &Desc = MI.getDesc();

  int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
  int VDataRCID = -1;
  if (VDataIdx != -1)
    VDataRCID = Desc.operands()[VDataIdx].RegClass;

  if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {
    // There is no hazard if the instruction does not use vector regs
    // (like wbinvl1)
    if (VDataIdx == -1)
      return -1;
    // For MUBUF/MTBUF instructions this hazard only exists if the
    // instruction is not using a register in the soffset field.
    const MachineOperand *SOffset =
        TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
    // If we have no soffset operand, then assume this field has been
    // hardcoded to zero.
    if (AMDGPU::getRegBitWidth(VDataRCID) > 64 &&
        (!SOffset || !SOffset->isReg()))
      return VDataIdx;
  }

  // MIMG instructions create a hazard if they don't use a 256-bit T# and
  // the store size is greater than 8 bytes and they have more than two bits
  // of their dmask set.
  // All our MIMG definitions use a 256-bit T#, so we can skip checking for them.
  if (TII->isMIMG(MI)) {
    int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
    assert(SRsrcIdx != -1 &&
           AMDGPU::getRegBitWidth(Desc.operands()[SRsrcIdx].RegClass) == 256);
    (void)SRsrcIdx;
  }

  if (TII->isFLAT(MI)) {
    // There is no hazard if the instruction does not use vector regs
    if (VDataIdx == -1)
      return -1;

    if (AMDGPU::getRegBitWidth(VDataRCID) > 64)
      return VDataIdx;
  }

  return -1;
}

int
GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
                                            const MachineRegisterInfo &MRI) {
  // Helper to check for the hazard where VMEM instructions that store more
  // than 8 bytes can have their store data overwritten by the next
  // instruction.
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  const int VALUWaitStates = ST.hasGFX940Insts() ? 2 : 1;
  int WaitStatesNeeded = 0;

  if (!TRI->isVectorRegister(MRI, Def.getReg()))
    return WaitStatesNeeded;
  Register Reg = Def.getReg();
  auto IsHazardFn = [this, Reg, TRI](const MachineInstr &MI) {
    int DataIdx = createsVALUHazard(MI);
    return DataIdx >= 0 &&
           TRI->regsOverlap(MI.getOperand(DataIdx).getReg(), Reg);
  };

  int WaitStatesNeededForDef =
      VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
  WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);

  return WaitStatesNeeded;
}

/// Dest sel forwarding issue occurs if additional logic is needed to swizzle /
/// pack the computed value into the correct bit position of the dest register.
/// This occurs if we have SDWA with dst_sel != DWORD or if we have op_sel with
/// dst_sel that is not aligned to the register. This function analyzes the
/// \p MI and \returns an operand with dst forwarding issue, or nullptr if
/// none exists.
static const MachineOperand *
getDstSelForwardingOperand(const MachineInstr &MI, const GCNSubtarget &ST) {
  if (!SIInstrInfo::isVALU(MI))
    return nullptr;

  const SIInstrInfo *TII = ST.getInstrInfo();

  unsigned Opcode = MI.getOpcode();

  // There are three different types of instructions which produce a forwarded
  // dest: 1. SDWA with dst_sel != DWORD, 2. VOP3 instructions which write the
  // hi bits (e.g. op_sel[3] == 1), and 3. FP8DstSelInst (instructions with
  // dest byte sel, e.g. CVT_SR_BF8_F32) with op_sel[3:2] != 0.
  if (SIInstrInfo::isSDWA(MI)) {
    // Type 1: SDWA with dst_sel != DWORD
    if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel))
      if (DstSel->getImm() != AMDGPU::SDWA::DWORD)
        return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
  }

  AMDGPU::FPType IsFP4OrFP8ConvOpc = AMDGPU::getFPDstSelType(Opcode);
  if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel)) {
    // Type 2: VOP3 which writes the hi bits
    if (TII->getNamedImmOperand(MI, AMDGPU::OpName::src0_modifiers) &
        SISrcMods::DST_OP_SEL)
      return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

    // Type 3: FP8DstSelInst with op_sel[3:2] != 0
    if (IsFP4OrFP8ConvOpc == AMDGPU::FPType::FP8 &&
        (TII->getNamedImmOperand(MI, AMDGPU::OpName::src2_modifiers) &
         SISrcMods::OP_SEL_0))
      return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
  }

  // Special case: a nop is required for all the opsel values of the fp4 sr
  // variant cvt scale instructions.
  if (IsFP4OrFP8ConvOpc == AMDGPU::FPType::FP4)
    return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

  return nullptr;
}

/// Checks whether the provided \p MI "consumes" the operand with a Dest sel
/// forwarding issue \p Dst. We may "consume" the Dst via a standard explicit
/// RAW, or through irregular ways (e.g. implicit RAW, certain types of WAW).
static bool consumesDstSelForwardingOperand(const MachineInstr *VALU,
                                            const MachineOperand *Dst,
                                            const SIRegisterInfo *TRI) {
  // We must consider implicit reads of the VALU. SDWA with dst_sel and
  // UNUSED_PRESERVE will implicitly read the result from the forwarded dest,
  // and we must account for that hazard.
  // We also must account for WAW hazards. In particular, WAW with dest
  // preserve semantics (e.g. VOP3 with op_sel, VOP2 &&
  // !zeroesHigh16BitsOfDest) will read the forwarded dest for parity
  // check for ECC. Without accounting for this hazard, the ECC will be
  // wrong.
  // TODO: limit to RAW (including implicit reads) + problematic WAW (i.e.
  // complete zeroesHigh16BitsOfDest)
  for (auto &Operand : VALU->operands()) {
    if (Operand.isReg() && TRI->regsOverlap(Dst->getReg(), Operand.getReg())) {
      return true;
    }
  }
  return false;
}

int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
  int WaitStatesNeeded = 0;

  if (ST.hasTransForwardingHazard() && !SIInstrInfo::isTRANS(*VALU)) {
    const int TransDefWaitstates = 1;

    auto IsTransDefFn = [this, VALU](const MachineInstr &MI) {
      if (!SIInstrInfo::isTRANS(MI))
        return false;
      const SIRegisterInfo *TRI = ST.getRegisterInfo();
      const SIInstrInfo *TII = ST.getInstrInfo();
      Register Def = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)->getReg();

      for (const MachineOperand &Use : VALU->explicit_uses()) {
        if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg()))
          return true;
      }

      return false;
    };

    int WaitStatesNeededForDef =
        TransDefWaitstates -
        getWaitStatesSince(IsTransDefFn, TransDefWaitstates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
  }

  if (ST.hasDstSelForwardingHazard() || ST.hasCvtScaleForwardingHazard()) {
    const int Shift16DefWaitstates = 1;

    auto IsShift16BitDefFn = [this, VALU](const MachineInstr &ProducerMI) {
      const SIRegisterInfo *TRI = ST.getRegisterInfo();
      const MachineOperand *ForwardedDst =
          getDstSelForwardingOperand(ProducerMI, ST);
      if (ForwardedDst) {
        return consumesDstSelForwardingOperand(VALU, ForwardedDst, TRI);
      }

      if (ProducerMI.isInlineAsm()) {
        // Assume inline asm has dst forwarding hazard
        for (auto &Def : ProducerMI.all_defs()) {
          if (consumesDstSelForwardingOperand(VALU, &Def, TRI))
            return true;
        }
      }

      return false;
    };

    int WaitStatesNeededForDef =
        Shift16DefWaitstates -
        getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
  }

  if (ST.hasVDecCoExecHazard()) {
    const int VALUWriteSGPRVALUReadWaitstates = 2;
    const int VALUWriteEXECRWLane = 4;
    const int VALUWriteVGPRReadlaneRead = 1;

    const SIRegisterInfo *TRI = ST.getRegisterInfo();
    const MachineRegisterInfo &MRI = MF.getRegInfo();
    Register UseReg;
    auto IsVALUDefSGPRFn = [&UseReg, TRI](const MachineInstr &MI) {
      if (!SIInstrInfo::isVALU(MI))
        return false;
      return MI.modifiesRegister(UseReg, TRI);
    };

    for (const MachineOperand &Use : VALU->explicit_uses()) {
      if (!Use.isReg())
        continue;

      UseReg = Use.getReg();
      if (TRI->isSGPRReg(MRI, UseReg)) {
        int WaitStatesNeededForDef =
            VALUWriteSGPRVALUReadWaitstates -
            getWaitStatesSince(IsVALUDefSGPRFn,
                               VALUWriteSGPRVALUReadWaitstates);
        WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
      }
    }

    if (VALU->readsRegister(AMDGPU::VCC, TRI)) {
      UseReg = AMDGPU::VCC;
      int WaitStatesNeededForDef =
          VALUWriteSGPRVALUReadWaitstates -
          getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteSGPRVALUReadWaitstates);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
    }

    switch (VALU->getOpcode()) {
    case AMDGPU::V_READLANE_B32:
    case AMDGPU::V_READFIRSTLANE_B32: {
      MachineOperand *Src = TII.getNamedOperand(*VALU, AMDGPU::OpName::src0);
      UseReg = Src->getReg();
      int WaitStatesNeededForDef =
          VALUWriteVGPRReadlaneRead -
          getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteVGPRReadlaneRead);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
    }
      [[fallthrough]];
    case AMDGPU::V_WRITELANE_B32: {
      UseReg = AMDGPU::EXEC;
      int WaitStatesNeededForDef =
          VALUWriteEXECRWLane -
          getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteEXECRWLane);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
      break;
    }
    default:
      break;
    }
  }

  // This checks for the hazard where VMEM instructions that store more than
  // 8 bytes can have their store data overwritten by the next instruction.
  if (!ST.has12DWordStoreHazard())
    return WaitStatesNeeded;

  const MachineRegisterInfo &MRI = MF.getRegInfo();

  for (const MachineOperand &Def : VALU->defs()) {
    WaitStatesNeeded =
        std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI));
  }

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
  // This checks for hazards associated with inline asm statements.
  // Since inline asms can contain just about anything, we use this
  // to call/leverage other check*Hazard routines. Note that
  // this function doesn't attempt to address all possible inline asm
  // hazards (good luck), but is a collection of what has been
  // problematic thus far.

  // see checkVALUHazards()
  if (!ST.has12DWordStoreHazard() && !ST.hasDstSelForwardingHazard() &&
      !ST.hasCvtScaleForwardingHazard())
    return 0;

  const MachineRegisterInfo &MRI = MF.getRegInfo();
  int WaitStatesNeeded = 0;

  for (const MachineOperand &Op :
       llvm::drop_begin(IA->operands(), InlineAsm::MIOp_FirstOperand)) {
    if (Op.isReg() && Op.isDef()) {
      if (!TRI.isVectorRegister(MRI, Op.getReg()))
        continue;

      if (ST.has12DWordStoreHazard()) {
        WaitStatesNeeded =
            std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));
      }
    }
  }

  if (ST.hasDstSelForwardingHazard()) {
    const int Shift16DefWaitstates = 1;

    auto IsShift16BitDefFn = [this, &IA](const MachineInstr &ProducerMI) {
      const MachineOperand *Dst = getDstSelForwardingOperand(ProducerMI, ST);
      // Assume inline asm reads the dst
      if (Dst)
        return IA->modifiesRegister(Dst->getReg(), &TRI) ||
               IA->readsRegister(Dst->getReg(), &TRI);

      if (ProducerMI.isInlineAsm()) {
        // If MI is inline asm, assume it has dst forwarding hazard
        for (auto &Def : ProducerMI.all_defs()) {
          if (IA->modifiesRegister(Def.getReg(), &TRI) ||
              IA->readsRegister(Def.getReg(), &TRI)) {
            return true;
          }
        }
      }

      return false;
    };

    int WaitStatesNeededForDef =
        Shift16DefWaitstates -
        getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
  }

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const MachineRegisterInfo &MRI = MF.getRegInfo();

  const MachineOperand *LaneSelectOp =
      TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);

  if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, LaneSelectOp->getReg()))
    return 0;

  Register LaneSelectReg = LaneSelectOp->getReg();
  auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isVALU(MI); };

  const int RWLaneWaitStates = 4;
  int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn,
                                              RWLaneWaitStates);
  return RWLaneWaitStates - WaitStatesSince;
}

int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
  if (!ST.hasRFEHazards())
    return 0;

  const SIInstrInfo *TII = ST.getInstrInfo();

  const int RFEWaitStates = 1;

  auto IsHazardFn = [TII](const MachineInstr &MI) {
    return getHWReg(TII, MI) == AMDGPU::Hwreg::ID_TRAPSTS;
  };
  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates);
  return RFEWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  const int ReadM0WaitStates = 1;
  auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isSALU(MI); };
  return ReadM0WaitStates -
         getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn, ReadM0WaitStates);
}

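// Run the fixup routines that mitigate hazards by inserting or rewriting
// instructions; only used in hazard recognizer mode.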
void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
  fixVMEMtoScalarWriteHazards(MI);
  fixVcmpxPermlaneHazards(MI);
  fixSMEMtoVectorWriteHazards(MI);
  fixVcmpxExecWARHazard(MI);
  fixLdsBranchVmemWARHazard(MI);
  if (ST.hasLdsDirect()) {
    fixLdsDirectVALUHazard(MI);
    fixLdsDirectVMEMHazard(MI);
  }
  fixVALUPartialForwardingHazard(MI);
  fixVALUTransUseHazard(MI);
  fixWMMAHazards(MI);
  fixShift64HighRegBug(MI);
  fixVALUMaskWriteHazard(MI);
  fixRequiredExportPriority(MI);
}

static bool isVCmpXWritesExec(const SIInstrInfo &TII, const SIRegisterInfo &TRI,
                              const MachineInstr &MI) {
  return (TII.isVOPC(MI) ||
          (MI.isCompare() && (TII.isVOP3(MI) || TII.isSDWA(MI)))) &&
         MI.modifiesRegister(AMDGPU::EXEC, &TRI);
}

bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
  if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(*MI))
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  auto IsHazardFn = [TII, TRI](const MachineInstr &MI) {
    return isVCmpXWritesExec(*TII, *TRI, MI);
  };

  auto IsExpiredFn = [](const MachineInstr &MI, int) {
    unsigned Opc = MI.getOpcode();
    return SIInstrInfo::isVALU(MI) && Opc != AMDGPU::V_NOP_e32 &&
           Opc != AMDGPU::V_NOP_e64 && Opc != AMDGPU::V_NOP_sdwa;
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  // V_NOP will be discarded by SQ.
  // Use V_MOV_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE*
  // which is always a VGPR and available.
  auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
  Register Reg = Src0->getReg();
  bool IsUndef = Src0->isUndef();
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::V_MOV_B32_e32))
      .addReg(Reg, RegState::Define | (IsUndef ? RegState::Dead : 0))
      .addReg(Reg, IsUndef ? RegState::Undef : RegState::Kill);

  return true;
}

bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
  if (!ST.hasVMEMtoScalarWriteHazard())
    return false;
  assert(!ST.hasExtendedWaitCounts());

  if (!SIInstrInfo::isSALU(*MI) && !SIInstrInfo::isSMRD(*MI))
    return false;

  if (MI->getNumDefs() == 0)
    return false;

  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  auto IsHazardFn = [TRI, MI](const MachineInstr &I) {
    if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isDS(I))
      return false;

    for (const MachineOperand &Def : MI->defs()) {
      const MachineOperand *Op =
          I.findRegisterUseOperand(Def.getReg(), TRI, false);
      if (!Op)
        continue;
      return true;
    }
    return false;
  };

  auto IsExpiredFn = [](const MachineInstr &MI, int) {
    return SIInstrInfo::isVALU(MI) ||
           (MI.getOpcode() == AMDGPU::S_WAITCNT &&
            !MI.getOperand(0).getImm()) ||
           (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
            AMDGPU::DepCtr::decodeFieldVmVsrc(MI.getOperand(0).getImm()) == 0);
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
      .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0));
  return true;
}

bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
  if (!ST.hasSMEMtoVectorWriteHazard())
    return false;
  assert(!ST.hasExtendedWaitCounts());

  if (!SIInstrInfo::isVALU(*MI))
    return false;

  AMDGPU::OpName SDSTName;
  switch (MI->getOpcode()) {
  case AMDGPU::V_READLANE_B32:
  case AMDGPU::V_READFIRSTLANE_B32:
    SDSTName = AMDGPU::OpName::vdst;
    break;
  default:
    SDSTName = AMDGPU::OpName::sdst;
    break;
  }

  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU());
  const MachineOperand *SDST = TII->getNamedOperand(*MI, SDSTName);
  if (!SDST) {
    for (const auto &MO : MI->implicit_operands()) {
      if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg()))) {
        SDST = &MO;
        break;
      }
    }
  }

  if (!SDST)
    return false;

  const Register SDSTReg = SDST->getReg();
  auto IsHazardFn = [SDSTReg, TRI](const MachineInstr &I) {
    return SIInstrInfo::isSMRD(I) && I.readsRegister(SDSTReg, TRI);
  };

  auto IsExpiredFn = [TII, IV](const MachineInstr &MI, int) {
    if (TII->isSALU(MI)) {
      switch (MI.getOpcode()) {
      case AMDGPU::S_SETVSKIP:
      case AMDGPU::S_VERSION:
      case AMDGPU::S_WAITCNT_VSCNT:
      case AMDGPU::S_WAITCNT_VMCNT:
      case AMDGPU::S_WAITCNT_EXPCNT:
        // These instructions cannot mitigate the hazard.
        return false;
      case AMDGPU::S_WAITCNT_LGKMCNT:
        // Reducing lgkmcnt count to 0 always mitigates the hazard.
        return (MI.getOperand(1).getImm() == 0) &&
               (MI.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
      case AMDGPU::S_WAITCNT: {
        const int64_t Imm = MI.getOperand(0).getImm();
        AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm);
        // DsCnt corresponds to LGKMCnt here.
        return (Decoded.DsCnt == 0);
      }
      default:
        // SOPP instructions cannot mitigate the hazard.
        if (TII->isSOPP(MI))
          return false;
        // At this point the SALU can be assumed to mitigate the hazard
        // because either:
        // (a) it is independent of the at risk SMEM (breaking chain),
        // or
        // (b) it is dependent on the SMEM, in which case an appropriate
        //     s_waitcnt lgkmcnt _must_ exist between it and the at risk
        //     SMEM instruction.
        return true;
      }
    }
    return false;
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)
      .addImm(0);
  return true;
}

bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
  if (!ST.hasVcmpxExecWARHazard())
    return false;
  assert(!ST.hasExtendedWaitCounts());

  if (!SIInstrInfo::isVALU(*MI))
    return false;

  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  if (!MI->modifiesRegister(AMDGPU::EXEC, TRI))
    return false;

  auto IsHazardFn = [TRI](const MachineInstr &I) {
    if (SIInstrInfo::isVALU(I))
      return false;
    return I.readsRegister(AMDGPU::EXEC, TRI);
  };

  const SIInstrInfo *TII = ST.getInstrInfo();
  auto IsExpiredFn = [TII, TRI](const MachineInstr &MI, int) {
    if (SIInstrInfo::isVALU(MI)) {
      if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst))
        return true;
      for (auto MO : MI.implicit_operands())
        if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg())))
          return true;
    }
    if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
        AMDGPU::DepCtr::decodeFieldSaSdst(MI.getOperand(0).getImm()) == 0)
      return true;
    return false;
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
      .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
  return true;
}

static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
                                                 const GCNSubtarget &ST) {
  if (!ST.hasLdsBranchVmemWARHazard())
    return false;

  // Check if the necessary condition for the hazard is met: both LDS and VMEM
  // instructions need to appear in the same function.
  bool HasLds = false;
  bool HasVmem = false;
  for (auto &MBB : MF) {
    for (auto &MI : MBB) {
      HasLds |= SIInstrInfo::isDS(MI);
      HasVmem |= (SIInstrInfo::isVMEM(MI) && !SIInstrInfo::isFLAT(MI)) ||
                 SIInstrInfo::isSegmentSpecificFLAT(MI);
      if (HasLds && HasVmem)
        return true;
    }
  }
  return false;
}

1436 | static bool isStoreCountWaitZero(const MachineInstr &I) { |
1437 | return I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT && |
1438 | I.getOperand(i: 0).getReg() == AMDGPU::SGPR_NULL && |
1439 | !I.getOperand(i: 1).getImm(); |
1440 | } |
1441 | |
1442 | bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) { |
1443 | if (!RunLdsBranchVmemWARHazardFixup) |
1444 | return false; |
1445 | |
1446 | assert(ST.hasLdsBranchVmemWARHazard()); |
1447 | assert(!ST.hasExtendedWaitCounts()); |
1448 | |
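  // Classify instructions: 1 for LDS (DS), 2 for VMEM (non-FLAT, or segment
  // specific FLAT). The hazard is an access of one type followed, across a
  // branch, by an access of the other type; s_waitcnt_vscnt null, 0 breaks
  // it.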
1449 | auto IsHazardInst = [](const MachineInstr &MI) { |
1450 | if (SIInstrInfo::isDS(MI)) |
1451 | return 1; |
1452 | if ((SIInstrInfo::isVMEM(MI) && !SIInstrInfo::isFLAT(MI)) || |
1453 | SIInstrInfo::isSegmentSpecificFLAT(MI)) |
1454 | return 2; |
1455 | return 0; |
1456 | }; |
1457 | |
1458 | auto InstType = IsHazardInst(*MI); |
1459 | if (!InstType) |
1460 | return false; |
1461 | |
1462 | auto IsExpiredFn = [&IsHazardInst](const MachineInstr &I, int) { |
1463 | return IsHazardInst(I) || isStoreCountWaitZero(I); |
1464 | }; |
1465 | |
1466 | auto IsHazardFn = [InstType, &IsHazardInst](const MachineInstr &I) { |
1467 | if (!I.isBranch()) |
1468 | return false; |
1469 | |
1470 | auto IsHazardFn = [InstType, IsHazardInst](const MachineInstr &I) { |
1471 | auto InstType2 = IsHazardInst(I); |
1472 | return InstType2 && InstType != InstType2; |
1473 | }; |
1474 | |
1475 | auto IsExpiredFn = [InstType, &IsHazardInst](const MachineInstr &I, int) { |
1476 | auto InstType2 = IsHazardInst(I); |
1477 | if (InstType == InstType2) |
1478 | return true; |
1479 | |
1480 | return isStoreCountWaitZero(I); |
1481 | }; |
1482 | |
1483 | return ::getWaitStatesSince(IsHazard: IsHazardFn, MI: &I, IsExpired: IsExpiredFn) != |
1484 | std::numeric_limits<int>::max(); |
1485 | }; |
1486 | |
1487 | if (::getWaitStatesSince(IsHazard: IsHazardFn, MI, IsExpired: IsExpiredFn) == |
1488 | std::numeric_limits<int>::max()) |
1489 | return false; |
1490 | |
1491 | const SIInstrInfo *TII = ST.getInstrInfo(); |
1492 | BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(), |
1493 | MCID: TII->get(Opcode: AMDGPU::S_WAITCNT_VSCNT)) |
1494 | .addReg(RegNo: AMDGPU::SGPR_NULL, flags: RegState::Undef) |
1495 | .addImm(Val: 0); |
1496 | |
1497 | return true; |
1498 | } |
1499 | |
1500 | bool GCNHazardRecognizer::fixLdsDirectVALUHazard(MachineInstr *MI) { |
1501 | if (!SIInstrInfo::isLDSDIR(MI: *MI)) |
1502 | return false; |
1503 | |
1504 | const int NoHazardWaitStates = 15; |
1505 | const MachineOperand *VDST = TII.getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::vdst); |
1506 | const Register VDSTReg = VDST->getReg(); |
1507 | |
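  // Count VALU instructions back to the most recent VALU access (read or
  // write) of the LDSDIR destination, then encode the distance, clamped to
  // 15 meaning "no hazard", into the waitvdst operand.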
1508 | bool VisitedTrans = false; |
1509 | auto IsHazardFn = [this, VDSTReg, &VisitedTrans](const MachineInstr &I) { |
1510 | if (!SIInstrInfo::isVALU(MI: I)) |
1511 | return false; |
1512 | VisitedTrans = VisitedTrans || SIInstrInfo::isTRANS(MI: I); |
1513 | // Cover both WAR and WAW |
1514 | return I.readsRegister(Reg: VDSTReg, TRI: &TRI) || I.modifiesRegister(Reg: VDSTReg, TRI: &TRI); |
1515 | }; |
1516 | auto IsExpiredFn = [&](const MachineInstr &I, int WaitStates) { |
1517 | if (WaitStates >= NoHazardWaitStates) |
1518 | return true; |
    // Instructions which cause va_vdst==0 expire the hazard.
1520 | return SIInstrInfo::isVMEM(MI: I) || SIInstrInfo::isDS(MI: I) || |
1521 | SIInstrInfo::isEXP(MI: I); |
1522 | }; |
1523 | auto GetWaitStatesFn = [](const MachineInstr &MI) { |
1524 | return SIInstrInfo::isVALU(MI) ? 1 : 0; |
1525 | }; |
1526 | |
1527 | DenseSet<const MachineBasicBlock *> Visited; |
1528 | auto Count = ::getWaitStatesSince(IsHazard: IsHazardFn, MBB: MI->getParent(), |
1529 | I: std::next(x: MI->getReverseIterator()), WaitStates: 0, |
1530 | IsExpired: IsExpiredFn, Visited, GetNumWaitStates: GetWaitStatesFn); |
1531 | |
  // Transcendentals can execute in parallel with other VALUs.
  // This makes the va_vdst count unusable with a mixture of VALU and TRANS.
1534 | if (VisitedTrans) |
1535 | Count = 0; |
1536 | |
1537 | MachineOperand *WaitVdstOp = |
1538 | TII.getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::waitvdst); |
1539 | WaitVdstOp->setImm(std::min(a: Count, b: NoHazardWaitStates)); |
1540 | |
1541 | return true; |
1542 | } |
1543 | |
1544 | bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(MachineInstr *MI) { |
1545 | if (!SIInstrInfo::isLDSDIR(MI: *MI)) |
1546 | return false; |
1547 | |
1548 | const MachineOperand *VDST = TII.getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::vdst); |
1549 | const Register VDSTReg = VDST->getReg(); |
1550 | |
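  // Hazard: a VMEM or DS access to the LDSDIR destination register that may
  // still be in flight when the LDSDIR instruction executes.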
1551 | auto IsHazardFn = [this, VDSTReg](const MachineInstr &I) { |
1552 | if (!SIInstrInfo::isVMEM(MI: I) && !SIInstrInfo::isDS(MI: I)) |
1553 | return false; |
1554 | return I.readsRegister(Reg: VDSTReg, TRI: &TRI) || I.modifiesRegister(Reg: VDSTReg, TRI: &TRI); |
1555 | }; |
1556 | bool LdsdirCanWait = ST.hasLdsWaitVMSRC(); |
1557 | // TODO: On GFX12 the hazard should expire on S_WAIT_LOADCNT/SAMPLECNT/BVHCNT |
1558 | // according to the type of VMEM instruction. |
1559 | auto IsExpiredFn = [this, LdsdirCanWait](const MachineInstr &I, int) { |
1560 | return SIInstrInfo::isVALU(MI: I) || SIInstrInfo::isEXP(MI: I) || |
1561 | (I.getOpcode() == AMDGPU::S_WAITCNT && !I.getOperand(i: 0).getImm()) || |
1562 | (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR && |
1563 | AMDGPU::DepCtr::decodeFieldVmVsrc(Encoded: I.getOperand(i: 0).getImm()) == 0) || |
1564 | (LdsdirCanWait && SIInstrInfo::isLDSDIR(MI: I) && |
1565 | !TII.getNamedOperand(MI: I, OperandName: AMDGPU::OpName::waitvsrc)->getImm()); |
1566 | }; |
1567 | |
1568 | if (::getWaitStatesSince(IsHazard: IsHazardFn, MI, IsExpired: IsExpiredFn) == |
1569 | std::numeric_limits<int>::max()) |
1570 | return false; |
1571 | |
1572 | if (LdsdirCanWait) { |
1573 | TII.getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::waitvsrc)->setImm(0); |
1574 | } else { |
1575 | BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(), |
1576 | MCID: TII.get(Opcode: AMDGPU::S_WAITCNT_DEPCTR)) |
1577 | .addImm(Val: AMDGPU::DepCtr::encodeFieldVmVsrc(VmVsrc: 0)); |
1578 | } |
1579 | |
1580 | return true; |
1581 | } |
1582 | |
1583 | bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(MachineInstr *MI) { |
1584 | if (!ST.hasVALUPartialForwardingHazard()) |
1585 | return false; |
1586 | assert(!ST.hasExtendedWaitCounts()); |
1587 | |
1588 | if (!ST.isWave64() || !SIInstrInfo::isVALU(MI: *MI)) |
1589 | return false; |
1590 | |
1591 | SmallSetVector<Register, 4> SrcVGPRs; |
1592 | |
1593 | for (const MachineOperand &Use : MI->explicit_uses()) { |
1594 | if (Use.isReg() && TRI.isVGPR(MRI: MF.getRegInfo(), Reg: Use.getReg())) |
1595 | SrcVGPRs.insert(X: Use.getReg()); |
1596 | } |
1597 | |
1598 | // Only applies with >= 2 unique VGPR sources |
1599 | if (SrcVGPRs.size() <= 1) |
1600 | return false; |
1601 | |
1602 | // Look for the following pattern: |
1603 | // Va <- VALU [PreExecPos] |
1604 | // intv1 |
1605 | // Exec <- SALU [ExecPos] |
1606 | // intv2 |
1607 | // Vb <- VALU [PostExecPos] |
1608 | // intv3 |
1609 | // MI Va, Vb (WaitState = 0) |
1610 | // |
1611 | // Where: |
1612 | // intv1 + intv2 <= 2 VALUs |
1613 | // intv3 <= 4 VALUs |
1614 | // |
1615 | // If found, insert an appropriate S_WAITCNT_DEPCTR before MI. |
1616 | |
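  // Illustrative (hypothetical) wave64 instance:
  //   v_mov_b32 v0, 0          ; Va write (pre exec change)
  //   s_mov_b64 exec, -1       ; SALU writes EXEC
  //   v_mov_b32 v1, 1          ; Vb write (post exec change)
  //   v_add_nc_u32 v2, v0, v1  ; MI reads both Va and Vb -> hazard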
1617 | const int Intv1plus2MaxVALUs = 2; |
1618 | const int Intv3MaxVALUs = 4; |
1619 | const int IntvMaxVALUs = 6; |
1620 | const int NoHazardVALUWaitStates = IntvMaxVALUs + 2; |
1621 | |
1622 | struct StateType { |
1623 | SmallDenseMap<Register, int, 4> DefPos; |
1624 | int ExecPos = std::numeric_limits<int>::max(); |
1625 | int VALUs = 0; |
1626 | }; |
1627 | |
1628 | StateType State; |
1629 | |
  // Expiry testing is folded into the hazard detection callback.
1631 | auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) { |
1632 | // Too many VALU states have passed |
1633 | if (State.VALUs > NoHazardVALUWaitStates) |
1634 | return HazardExpired; |
1635 | |
    // Instructions which cause va_vdst==0 expire the hazard.
1637 | if (SIInstrInfo::isVMEM(MI: I) || SIInstrInfo::isDS(MI: I) || |
1638 | SIInstrInfo::isEXP(MI: I) || |
1639 | (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR && |
1640 | AMDGPU::DepCtr::decodeFieldVaVdst(Encoded: I.getOperand(i: 0).getImm()) == 0)) |
1641 | return HazardExpired; |
1642 | |
    // Track register writes.
1644 | bool Changed = false; |
1645 | if (SIInstrInfo::isVALU(MI: I)) { |
1646 | for (Register Src : SrcVGPRs) { |
1647 | if (!State.DefPos.count(Val: Src) && I.modifiesRegister(Reg: Src, TRI: &TRI)) { |
1648 | State.DefPos[Src] = State.VALUs; |
1649 | Changed = true; |
1650 | } |
1651 | } |
1652 | } else if (SIInstrInfo::isSALU(MI: I)) { |
1653 | if (State.ExecPos == std::numeric_limits<int>::max()) { |
1654 | if (!State.DefPos.empty() && I.modifiesRegister(Reg: AMDGPU::EXEC, TRI: &TRI)) { |
1655 | State.ExecPos = State.VALUs; |
1656 | Changed = true; |
1657 | } |
1658 | } |
1659 | } |
1660 | |
1661 | // Early expiration: too many VALUs in intv3 |
1662 | if (State.VALUs > Intv3MaxVALUs && State.DefPos.empty()) |
1663 | return HazardExpired; |
1664 | |
1665 | // Only evaluate state if something changed |
1666 | if (!Changed) |
1667 | return NoHazardFound; |
1668 | |
1669 | // Determine positions of VALUs pre/post exec change |
1670 | if (State.ExecPos == std::numeric_limits<int>::max()) |
1671 | return NoHazardFound; |
1672 | |
1673 | int PreExecPos = std::numeric_limits<int>::max(); |
1674 | int PostExecPos = std::numeric_limits<int>::max(); |
1675 | |
1676 | for (auto Entry : State.DefPos) { |
1677 | int DefVALUs = Entry.second; |
1678 | if (DefVALUs != std::numeric_limits<int>::max()) { |
1679 | if (DefVALUs >= State.ExecPos) |
1680 | PreExecPos = std::min(a: PreExecPos, b: DefVALUs); |
1681 | else |
1682 | PostExecPos = std::min(a: PostExecPos, b: DefVALUs); |
1683 | } |
1684 | } |
1685 | |
    // Need a VALU def after the exec change.
1687 | if (PostExecPos == std::numeric_limits<int>::max()) |
1688 | return NoHazardFound; |
1689 | |
1690 | // Too many VALUs in intv3? |
1691 | int Intv3VALUs = PostExecPos; |
1692 | if (Intv3VALUs > Intv3MaxVALUs) |
1693 | return HazardExpired; |
1694 | |
1695 | // Too many VALUs in intv2? |
1696 | int Intv2VALUs = (State.ExecPos - PostExecPos) - 1; |
1697 | if (Intv2VALUs > Intv1plus2MaxVALUs) |
1698 | return HazardExpired; |
1699 | |
    // Need a VALU def before the exec change.
1701 | if (PreExecPos == std::numeric_limits<int>::max()) |
1702 | return NoHazardFound; |
1703 | |
1704 | // Too many VALUs in intv1? |
1705 | int Intv1VALUs = PreExecPos - State.ExecPos; |
1706 | if (Intv1VALUs > Intv1plus2MaxVALUs) |
1707 | return HazardExpired; |
1708 | |
    // Too many VALUs in intv1 + intv2?
1710 | if (Intv1VALUs + Intv2VALUs > Intv1plus2MaxVALUs) |
1711 | return HazardExpired; |
1712 | |
1713 | return HazardFound; |
1714 | }; |
1715 | auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) { |
1716 | if (SIInstrInfo::isVALU(MI)) |
1717 | State.VALUs += 1; |
1718 | }; |
1719 | |
1720 | DenseSet<const MachineBasicBlock *> Visited; |
1721 | if (!hasHazard<StateType>(State, IsHazard: IsHazardFn, UpdateState: UpdateStateFn, MBB: MI->getParent(), |
1722 | I: std::next(x: MI->getReverseIterator()), Visited)) |
1723 | return false; |
1724 | |
1725 | BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(), |
1726 | MCID: TII.get(Opcode: AMDGPU::S_WAITCNT_DEPCTR)) |
1727 | .addImm(Val: 0x0fff); |
1728 | |
1729 | return true; |
1730 | } |
1731 | |
1732 | bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) { |
1733 | if (!ST.hasVALUTransUseHazard()) |
1734 | return false; |
1735 | assert(!ST.hasExtendedWaitCounts()); |
1736 | |
1737 | if (!SIInstrInfo::isVALU(MI: *MI)) |
1738 | return false; |
1739 | |
1740 | SmallSet<Register, 4> SrcVGPRs; |
1741 | |
1742 | for (const MachineOperand &Use : MI->explicit_uses()) { |
1743 | if (Use.isReg() && TRI.isVGPR(MRI: MF.getRegInfo(), Reg: Use.getReg())) |
1744 | SrcVGPRs.insert(V: Use.getReg()); |
1745 | } |
1746 | |
1747 | // Look for the following pattern: |
1748 | // Va <- TRANS VALU |
1749 | // intv |
1750 | // MI Va (WaitState = 0) |
1751 | // |
1752 | // Where: |
1753 | // intv <= 5 VALUs / 1 TRANS |
1754 | // |
1755 | // If found, insert an appropriate S_WAITCNT_DEPCTR before MI. |
1756 | |
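  // Illustrative (hypothetical) instance:
  //   v_exp_f32 v0, v1      ; TRANS writes Va
  //   v_add_f32 v2, v0, v3  ; VALU reads Va within the interval -> hazard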
1757 | const int IntvMaxVALUs = 5; |
1758 | const int IntvMaxTRANS = 1; |
1759 | |
1760 | struct StateType { |
1761 | int VALUs = 0; |
1762 | int TRANS = 0; |
1763 | }; |
1764 | |
1765 | StateType State; |
1766 | |
  // Expiry testing is folded into the hazard detection callback.
1768 | auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) { |
1769 | // Too many VALU states have passed |
1770 | if (State.VALUs > IntvMaxVALUs || State.TRANS > IntvMaxTRANS) |
1771 | return HazardExpired; |
1772 | |
    // Instructions which cause va_vdst==0 expire the hazard.
1774 | if (SIInstrInfo::isVMEM(MI: I) || SIInstrInfo::isDS(MI: I) || |
1775 | SIInstrInfo::isEXP(MI: I) || |
1776 | (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR && |
1777 | I.getOperand(i: 0).getImm() == 0x0fff)) |
1778 | return HazardExpired; |
1779 | |
    // Track register writes.
1781 | if (SIInstrInfo::isTRANS(MI: I)) { |
1782 | for (Register Src : SrcVGPRs) { |
1783 | if (I.modifiesRegister(Reg: Src, TRI: &TRI)) { |
1784 | return HazardFound; |
1785 | } |
1786 | } |
1787 | } |
1788 | |
1789 | return NoHazardFound; |
1790 | }; |
1791 | auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) { |
1792 | if (SIInstrInfo::isVALU(MI)) |
1793 | State.VALUs += 1; |
1794 | if (SIInstrInfo::isTRANS(MI)) |
1795 | State.TRANS += 1; |
1796 | }; |
1797 | |
1798 | DenseSet<const MachineBasicBlock *> Visited; |
1799 | if (!hasHazard<StateType>(State, IsHazard: IsHazardFn, UpdateState: UpdateStateFn, MBB: MI->getParent(), |
1800 | I: std::next(x: MI->getReverseIterator()), Visited)) |
1801 | return false; |
1802 | |
  // Hazard is observed - insert a wait on the va_vdst counter to ensure the
  // hazard is avoided.
1805 | BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(), |
1806 | MCID: TII.get(Opcode: AMDGPU::S_WAITCNT_DEPCTR)) |
1807 | .addImm(Val: AMDGPU::DepCtr::encodeFieldVaVdst(VaVdst: 0)); |
1808 | |
1809 | return true; |
1810 | } |
1811 | |
1812 | bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) { |
1813 | if (!SIInstrInfo::isWMMA(MI: *MI) && !SIInstrInfo::isSWMMAC(MI: *MI)) |
1814 | return false; |
1815 | |
1816 | const SIInstrInfo *TII = ST.getInstrInfo(); |
1817 | const SIRegisterInfo *TRI = ST.getRegisterInfo(); |
1818 | |
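  // Back-to-back WMMA/SWMMAC where the previous result (matrix D) feeds the
  // current A/B inputs (or, on GFX12, the SWMMAC index) requires a
  // separating VALU; a V_NOP is inserted below when none exists.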
1819 | auto IsHazardFn = [MI, TII, TRI, this](const MachineInstr &I) { |
1820 | if (!SIInstrInfo::isWMMA(MI: I) && !SIInstrInfo::isSWMMAC(MI: I)) |
1821 | return false; |
1822 | |
    // Src0 (matrix A) or Src1 (matrix B) of the current WMMA instruction
    // overlaps with the dst (matrix D) of the previous WMMA.
1825 | const Register CurSrc0Reg = |
1826 | TII->getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::src0)->getReg(); |
1827 | const Register CurSrc1Reg = |
1828 | TII->getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::src1)->getReg(); |
1829 | |
1830 | const Register PrevDstReg = |
1831 | TII->getNamedOperand(MI: I, OperandName: AMDGPU::OpName::vdst)->getReg(); |
1832 | |
1833 | if (TRI->regsOverlap(RegA: PrevDstReg, RegB: CurSrc0Reg) || |
1834 | TRI->regsOverlap(RegA: PrevDstReg, RegB: CurSrc1Reg)) { |
1835 | return true; |
1836 | } |
1837 | |
1838 | // GFX12+ allows overlap of matrix C with PrevDstReg (hardware will stall) |
1839 | // but Index can't overlap with PrevDstReg. |
1840 | if (AMDGPU::isGFX12Plus(STI: ST)) { |
1841 | if (SIInstrInfo::isSWMMAC(MI: *MI)) { |
1842 | const Register CurIndex = |
1843 | TII->getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::src2)->getReg(); |
1844 | if (TRI->regsOverlap(RegA: PrevDstReg, RegB: CurIndex)) |
1845 | return true; |
1846 | } |
1847 | return false; |
1848 | } |
1849 | |
1850 | return false; |
1851 | }; |
1852 | |
1853 | auto IsExpiredFn = [](const MachineInstr &I, int) { |
1854 | return SIInstrInfo::isVALU(MI: I); |
1855 | }; |
1856 | |
1857 | if (::getWaitStatesSince(IsHazard: IsHazardFn, MI, IsExpired: IsExpiredFn) == |
1858 | std::numeric_limits<int>::max()) |
1859 | return false; |
1860 | |
1861 | BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(), MCID: TII->get(Opcode: AMDGPU::V_NOP_e32)); |
1862 | |
1863 | return true; |
1864 | } |
1865 | |
1866 | bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) { |
1867 | if (!ST.hasShift64HighRegBug()) |
1868 | return false; |
1869 | assert(!ST.hasExtendedWaitCounts()); |
1870 | |
1871 | switch (MI->getOpcode()) { |
1872 | default: |
1873 | return false; |
1874 | case AMDGPU::V_LSHLREV_B64_e64: |
1875 | case AMDGPU::V_LSHRREV_B64_e64: |
1876 | case AMDGPU::V_ASHRREV_I64_e64: |
1877 | break; |
1878 | } |
1879 | |
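  // Workaround: if the shift amount lives in the last VGPR of an allocation
  // block, V_SWAP_B32 it into a known-safe VGPR, perform the shift there,
  // and swap the registers back afterwards.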
1880 | MachineOperand *Amt = TII.getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::src0); |
1881 | if (!Amt->isReg()) |
1882 | return false; |
1883 | |
1884 | Register AmtReg = Amt->getReg(); |
1885 | const MachineRegisterInfo &MRI = MF.getRegInfo(); |
  // Check if this is the last VGPR in the allocation block.
1887 | if (!TRI.isVGPR(MRI, Reg: AmtReg) || ((AmtReg - AMDGPU::VGPR0) & 7) != 7) |
1888 | return false; |
1889 | |
1890 | if (AmtReg != AMDGPU::VGPR255 && MRI.isPhysRegUsed(PhysReg: AmtReg + 1)) |
1891 | return false; |
1892 | |
1893 | MachineOperand *Src1 = TII.getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::src1); |
1894 | bool OverlappedSrc = Src1->isReg() && TRI.regsOverlap(RegA: Src1->getReg(), RegB: AmtReg); |
1895 | bool OverlappedDst = MI->modifiesRegister(Reg: AmtReg, TRI: &TRI); |
1896 | bool Overlapped = OverlappedSrc || OverlappedDst; |
1897 | |
1898 | assert(!OverlappedDst || !OverlappedSrc || |
1899 | Src1->getReg() == MI->getOperand(0).getReg()); |
1900 | assert(ST.needsAlignedVGPRs()); |
1901 | static_assert(AMDGPU::VGPR0 + 1 == AMDGPU::VGPR1); |
1902 | |
1903 | Register NewReg; |
1904 | for (MCRegister Reg : Overlapped ? AMDGPU::VReg_64_Align2RegClass |
1905 | : AMDGPU::VGPR_32RegClass) { |
1906 | if (!MI->modifiesRegister(Reg, TRI: &TRI) && !MI->readsRegister(Reg, TRI: &TRI)) { |
1907 | NewReg = Reg; |
1908 | break; |
1909 | } |
1910 | } |
1911 | |
1912 | Register NewAmt = Overlapped ? (Register)TRI.getSubReg(Reg: NewReg, Idx: AMDGPU::sub1) |
1913 | : NewReg; |
1914 | Register NewAmtLo; |
1915 | |
1916 | if (Overlapped) |
1917 | NewAmtLo = TRI.getSubReg(Reg: NewReg, Idx: AMDGPU::sub0); |
1918 | |
1919 | DebugLoc DL = MI->getDebugLoc(); |
1920 | MachineBasicBlock *MBB = MI->getParent(); |
  // Insert a full wait count because the found register might be pending a
  // wait.
1922 | BuildMI(BB&: *MBB, I: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_WAITCNT)) |
1923 | .addImm(Val: 0); |
1924 | |
1925 | // Insert V_SWAP_B32 instruction(s) and run hazard recognizer on them. |
1926 | if (Overlapped) |
1927 | runOnInstruction( |
1928 | MI: BuildMI(BB&: *MBB, I: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_SWAP_B32), DestReg: NewAmtLo) |
1929 | .addDef(RegNo: AmtReg - 1) |
1930 | .addReg(RegNo: AmtReg - 1, flags: RegState::Undef) |
1931 | .addReg(RegNo: NewAmtLo, flags: RegState::Undef)); |
1932 | runOnInstruction(MI: BuildMI(BB&: *MBB, I: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_SWAP_B32), DestReg: NewAmt) |
1933 | .addDef(RegNo: AmtReg) |
1934 | .addReg(RegNo: AmtReg, flags: RegState::Undef) |
1935 | .addReg(RegNo: NewAmt, flags: RegState::Undef)); |
1936 | |
1937 | // Instructions emitted after the current instruction will be processed by the |
1938 | // parent loop of the hazard recognizer in a natural way. |
1939 | BuildMI(BB&: *MBB, I: std::next(x: MI->getIterator()), MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_SWAP_B32), |
1940 | DestReg: AmtReg) |
1941 | .addDef(RegNo: NewAmt) |
1942 | .addReg(RegNo: NewAmt) |
1943 | .addReg(RegNo: AmtReg); |
1944 | if (Overlapped) |
1945 | BuildMI(BB&: *MBB, I: std::next(x: MI->getIterator()), MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_SWAP_B32), |
1946 | DestReg: AmtReg - 1) |
1947 | .addDef(RegNo: NewAmtLo) |
1948 | .addReg(RegNo: NewAmtLo) |
1949 | .addReg(RegNo: AmtReg - 1); |
1950 | |
  // Re-running the hazard recognizer on the modified instruction is not
  // necessary: the inserted V_SWAP_B32 has already both read and written the
  // new registers, so hazards related to these registers have already been
  // handled.
1954 | Amt->setReg(NewAmt); |
1955 | Amt->setIsKill(false); |
1956 | // We do not update liveness, so verifier may see it as undef. |
1957 | Amt->setIsUndef(); |
1958 | if (OverlappedDst) |
1959 | MI->getOperand(i: 0).setReg(NewReg); |
1960 | if (OverlappedSrc) { |
1961 | Src1->setReg(NewReg); |
1962 | Src1->setIsKill(false); |
1963 | Src1->setIsUndef(); |
1964 | } |
1965 | |
1966 | return true; |
1967 | } |
1968 | |
1969 | int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) { |
1970 | int NSAtoVMEMWaitStates = 1; |
1971 | |
1972 | if (!ST.hasNSAtoVMEMBug()) |
1973 | return 0; |
1974 | |
1975 | if (!SIInstrInfo::isMUBUF(MI: *MI) && !SIInstrInfo::isMTBUF(MI: *MI)) |
1976 | return 0; |
1977 | |
1978 | const SIInstrInfo *TII = ST.getInstrInfo(); |
1979 | const auto *Offset = TII->getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::offset); |
1980 | if (!Offset || (Offset->getImm() & 6) == 0) |
1981 | return 0; |
1982 | |
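  // Hazard: an NSA-encoded MIMG instruction (GFX10 NSA encoding, >= 16
  // bytes) immediately followed by a MUBUF/MTBUF whose immediate offset has
  // bit 1 or bit 2 set.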
1983 | auto IsHazardFn = [TII](const MachineInstr &I) { |
1984 | if (!SIInstrInfo::isMIMG(MI: I)) |
1985 | return false; |
1986 | const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc: I.getOpcode()); |
1987 | return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA && |
1988 | TII->getInstSizeInBytes(MI: I) >= 16; |
1989 | }; |
1990 | |
1991 | return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazard: IsHazardFn, Limit: 1); |
1992 | } |
1993 | |
1994 | int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) { |
1995 | int FPAtomicToDenormModeWaitStates = 3; |
1996 | |
1997 | if (!ST.hasFPAtomicToDenormModeHazard()) |
1998 | return 0; |
1999 | assert(!ST.hasExtendedWaitCounts()); |
2000 | |
2001 | if (MI->getOpcode() != AMDGPU::S_DENORM_MODE) |
2002 | return 0; |
2003 | |
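  // Hazard: an FP atomic VMEM access followed within 3 wait states by
  // s_denorm_mode, with no intervening VALU or wait instruction.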
2004 | auto IsHazardFn = [](const MachineInstr &I) { |
2005 | if (!SIInstrInfo::isVMEM(MI: I)) |
2006 | return false; |
2007 | return SIInstrInfo::isFPAtomic(MI: I); |
2008 | }; |
2009 | |
2010 | auto IsExpiredFn = [](const MachineInstr &MI, int WaitStates) { |
2011 | if (WaitStates >= 3 || SIInstrInfo::isVALU(MI)) |
2012 | return true; |
2013 | |
2014 | switch (MI.getOpcode()) { |
2015 | case AMDGPU::S_WAITCNT: |
2016 | case AMDGPU::S_WAITCNT_VSCNT: |
2017 | case AMDGPU::S_WAITCNT_VMCNT: |
2018 | case AMDGPU::S_WAITCNT_EXPCNT: |
2019 | case AMDGPU::S_WAITCNT_LGKMCNT: |
2020 | case AMDGPU::S_WAIT_IDLE: |
2021 | return true; |
2022 | default: |
2023 | break; |
2024 | } |
2025 | |
2026 | return false; |
2027 | }; |
2028 | |
2029 | return FPAtomicToDenormModeWaitStates - |
2030 | ::getWaitStatesSince(IsHazard: IsHazardFn, MI, IsExpired: IsExpiredFn); |
2031 | } |
2032 | |
2033 | int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) { |
2034 | assert(SIInstrInfo::isMAI(*MI)); |
2035 | |
2036 | return ST.hasGFX90AInsts() ? checkMAIHazards90A(MI) : checkMAIHazards908(MI); |
2037 | } |
2038 | |
2039 | int GCNHazardRecognizer::checkMFMAPadding(MachineInstr *MI) { |
2040 | // Early exit if no padding is requested. |
2041 | if (MFMAPaddingRatio == 0) |
2042 | return 0; |
2043 | |
2044 | const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); |
2045 | if (!SIInstrInfo::isMFMA(MI: *MI) || MFI->getOccupancy() < 2) |
2046 | return 0; |
2047 | |
2048 | int NeighborMFMALatency = 0; |
2049 | auto IsNeighboringMFMA = [&NeighborMFMALatency, |
2050 | this](const MachineInstr &MI) { |
2051 | if (!SIInstrInfo::isMFMA(MI)) |
2052 | return false; |
2053 | |
2054 | NeighborMFMALatency = this->getMFMAPipelineWaitStates(MI); |
2055 | return true; |
2056 | }; |
2057 | |
2058 | const int MaxMFMAPipelineWaitStates = 16; |
2059 | int WaitStatesSinceNeighborMFMA = |
2060 | getWaitStatesSince(IsHazard: IsNeighboringMFMA, Limit: MaxMFMAPipelineWaitStates); |
2061 | |
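  // E.g. (illustrative): with NeighborMFMALatency = 16 and a padding ratio
  // of 50, pad up to 8 wait states, less those that have already elapsed.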
2062 | int NeighborMFMAPaddingNeeded = |
2063 | (NeighborMFMALatency * MFMAPaddingRatio / 100) - |
2064 | WaitStatesSinceNeighborMFMA; |
2065 | |
2066 | return std::max(a: 0, b: NeighborMFMAPaddingNeeded); |
2067 | } |
2068 | |
2069 | int GCNHazardRecognizer::checkMAIHazards908(MachineInstr *MI) { |
2070 | int WaitStatesNeeded = 0; |
2071 | unsigned Opc = MI->getOpcode(); |
2072 | |
2073 | auto IsVALUFn = [](const MachineInstr &MI) { |
2074 | return SIInstrInfo::isVALU(MI) || MI.isInlineAsm(); |
2075 | }; |
2076 | |
2077 | if (Opc != AMDGPU::V_ACCVGPR_READ_B32_e64) { // MFMA or v_accvgpr_write |
2078 | const int LegacyVALUWritesVGPRWaitStates = 2; |
2079 | const int VALUWritesExecWaitStates = 4; |
2080 | const int MaxWaitStates = 4; |
2081 | |
2082 | int WaitStatesNeededForUse = VALUWritesExecWaitStates - |
2083 | getWaitStatesSinceDef(Reg: AMDGPU::EXEC, IsHazardDef: IsVALUFn, Limit: MaxWaitStates); |
2084 | WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse); |
2085 | |
2086 | if (WaitStatesNeeded < MaxWaitStates) { |
2087 | for (const MachineOperand &Use : MI->explicit_uses()) { |
2088 | const int MaxWaitStates = 2; |
2089 | |
2090 | if (!Use.isReg() || !TRI.isVGPR(MRI: MF.getRegInfo(), Reg: Use.getReg())) |
2091 | continue; |
2092 | |
2093 | int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates - |
2094 | getWaitStatesSinceDef(Reg: Use.getReg(), IsHazardDef: IsVALUFn, Limit: MaxWaitStates); |
2095 | WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse); |
2096 | |
2097 | if (WaitStatesNeeded == MaxWaitStates) |
2098 | break; |
2099 | } |
2100 | } |
2101 | } |
2102 | |
2103 | for (const MachineOperand &Op : MI->explicit_operands()) { |
2104 | if (!Op.isReg() || !TRI.isAGPR(MRI: MF.getRegInfo(), Reg: Op.getReg())) |
2105 | continue; |
2106 | |
2107 | if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32_e64) |
2108 | continue; |
2109 | |
2110 | const int MFMAWritesAGPROverlappedSrcABWaitStates = 4; |
2111 | const int MFMAWritesAGPROverlappedSrcCWaitStates = 2; |
2112 | const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4; |
2113 | const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10; |
2114 | const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18; |
2115 | const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1; |
2116 | const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7; |
2117 | const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15; |
2118 | const int MaxWaitStates = 18; |
2119 | Register Reg = Op.getReg(); |
2120 | unsigned HazardDefLatency = 0; |
2121 | |
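    // The defining MFMA's latency selects the bucket below: 2/8/16 passes
    // correspond to the 4x4/16x16/32x32 variants named in the constants
    // above.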
2122 | auto IsOverlappedMFMAFn = [Reg, &HazardDefLatency, |
2123 | this](const MachineInstr &MI) { |
2124 | if (!SIInstrInfo::isMFMA(MI)) |
2125 | return false; |
2126 | Register DstReg = MI.getOperand(i: 0).getReg(); |
2127 | if (DstReg == Reg) |
2128 | return false; |
2129 | HazardDefLatency = |
2130 | std::max(a: HazardDefLatency, b: TSchedModel.computeInstrLatency(MI: &MI)); |
2131 | return TRI.regsOverlap(RegA: DstReg, RegB: Reg); |
2132 | }; |
2133 | |
2134 | int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsHazardDef: IsOverlappedMFMAFn, |
2135 | Limit: MaxWaitStates); |
2136 | int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates; |
2137 | int SrcCIdx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::src2); |
2138 | int OpNo = Op.getOperandNo(); |
2139 | if (OpNo == SrcCIdx) { |
2140 | NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates; |
2141 | } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) { |
2142 | switch (HazardDefLatency) { |
2143 | case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates; |
2144 | break; |
2145 | case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates; |
2146 | break; |
2147 | case 16: [[fallthrough]]; |
2148 | default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates; |
2149 | break; |
2150 | } |
2151 | } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) { |
2152 | switch (HazardDefLatency) { |
2153 | case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates; |
2154 | break; |
2155 | case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates; |
2156 | break; |
2157 | case 16: [[fallthrough]]; |
2158 | default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates; |
2159 | break; |
2160 | } |
2161 | } |
2162 | |
2163 | int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef; |
2164 | WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse); |
2165 | |
2166 | if (WaitStatesNeeded == MaxWaitStates) |
2167 | return WaitStatesNeeded; // Early exit. |
2168 | |
2169 | auto IsAccVgprWriteFn = [Reg, this](const MachineInstr &MI) { |
2170 | if (MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64) |
2171 | return false; |
2172 | Register DstReg = MI.getOperand(i: 0).getReg(); |
2173 | return TRI.regsOverlap(RegA: Reg, RegB: DstReg); |
2174 | }; |
2175 | |
2176 | const int AccVGPRWriteMFMAReadSrcCWaitStates = 1; |
2177 | const int AccVGPRWriteMFMAReadSrcABWaitStates = 3; |
2178 | const int AccVGPRWriteAccVgprReadWaitStates = 3; |
2179 | NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates; |
2180 | if (OpNo == SrcCIdx) |
2181 | NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates; |
2182 | else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) |
2183 | NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates; |
2184 | |
2185 | WaitStatesNeededForUse = NeedWaitStates - |
2186 | getWaitStatesSinceDef(Reg, IsHazardDef: IsAccVgprWriteFn, Limit: MaxWaitStates); |
2187 | WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse); |
2188 | |
2189 | if (WaitStatesNeeded == MaxWaitStates) |
2190 | return WaitStatesNeeded; // Early exit. |
2191 | } |
2192 | |
2193 | if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) { |
2194 | const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0; |
2195 | const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5; |
2196 | const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13; |
2197 | const int MaxWaitStates = 13; |
2198 | Register DstReg = MI->getOperand(i: 0).getReg(); |
2199 | unsigned HazardDefLatency = 0; |
2200 | |
2201 | auto IsSrcCMFMAFn = [DstReg, &HazardDefLatency, |
2202 | this](const MachineInstr &MI) { |
2203 | if (!SIInstrInfo::isMFMA(MI)) |
2204 | return false; |
2205 | Register Reg = TII.getNamedOperand(MI, OperandName: AMDGPU::OpName::src2)->getReg(); |
2206 | HazardDefLatency = |
2207 | std::max(a: HazardDefLatency, b: TSchedModel.computeInstrLatency(MI: &MI)); |
2208 | return TRI.regsOverlap(RegA: Reg, RegB: DstReg); |
2209 | }; |
2210 | |
2211 | int WaitStatesSince = getWaitStatesSince(IsHazard: IsSrcCMFMAFn, Limit: MaxWaitStates); |
2212 | int NeedWaitStates; |
2213 | switch (HazardDefLatency) { |
2214 | case 2: NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates; |
2215 | break; |
2216 | case 8: NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates; |
2217 | break; |
2218 | case 16: [[fallthrough]]; |
2219 | default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates; |
2220 | break; |
2221 | } |
2222 | |
2223 | int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince; |
2224 | WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse); |
2225 | } |
2226 | |
2227 | // Pad neighboring MFMA with noops for better inter-wave performance. |
2228 | WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: checkMFMAPadding(MI)); |
2229 | |
2230 | return WaitStatesNeeded; |
2231 | } |
2232 | |
2233 | static int |
2234 | GFX940_XDL_N_PassWritesVGPROverlappedXDLOrSMFMASrcCWaitStates(int NumPasses, |
2235 | bool IsGFX950) { |
  // xdl def cycles | gfx940 | gfx950
  //  2 pass        |    3   |    4
  //  4 pass        |    5   |    6
  //  8 pass        |    9   |   10
  // 16 pass        |   17   |   18
2241 | return NumPasses + 1 + IsGFX950; |
2242 | } |
2243 | |
2244 | static int |
2245 | GFX940_XDL_N_PassWritesVGPROverlappedSGEMMDGEMMSrcCWaitStates(int NumPasses, |
2246 | bool IsGFX950) { |
  // xdl def cycles | gfx940 | gfx950
  //  2 pass        |    3   |    3
  //  4 pass        |    5   |    6
  //  8 pass        |    9   |   10
  // 16 pass        |   17   |   18
2252 | return NumPasses + 1 + (NumPasses != 2 && IsGFX950); |
2253 | } |
2254 | |
2255 | static int |
2256 | GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(int NumPasses) { |
2257 | // 2 pass -> 2 |
2258 | // 4 pass -> 4 |
2259 | // 8 pass -> 8 |
2260 | // 16 pass -> 16 |
2261 | return NumPasses; |
2262 | } |
2263 | |
2264 | static int |
2265 | GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses) { |
2266 | // 2 pass -> 4 |
2267 | // 4 pass -> 6 |
2268 | // 8 pass -> 10 |
2269 | // 16 pass -> 18 |
2270 | return NumPasses + 2; |
2271 | } |
2272 | |
2273 | static int GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses, |
2274 | bool IsGFX950) { |
  // xdl def cycles | gfx942 | gfx950
  //  2 pass        |    5   |    5
  //  4 pass        |    7   |    8
  //  8 pass        |   11   |   12
  // 16 pass        |   19   |   20
2280 | return NumPasses + 3 + (NumPasses != 2 && IsGFX950); |
2281 | } |
2282 | |
2283 | int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) { |
2284 | int WaitStatesNeeded = 0; |
2285 | unsigned Opc = MI->getOpcode(); |
2286 | |
2287 | auto IsLegacyVALUFn = [](const MachineInstr &MI) { |
2288 | return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI); |
2289 | }; |
2290 | |
2291 | auto IsLegacyVALUNotDotFn = [](const MachineInstr &MI) { |
2292 | return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI) && |
2293 | !SIInstrInfo::isDOT(MI); |
2294 | }; |
2295 | |
2296 | if (!SIInstrInfo::isMFMA(MI: *MI)) |
2297 | return WaitStatesNeeded; |
2298 | |
2299 | const int VALUWritesExecWaitStates = 4; |
2300 | int WaitStatesNeededForUse = VALUWritesExecWaitStates - |
2301 | getWaitStatesSinceDef(Reg: AMDGPU::EXEC, IsHazardDef: IsLegacyVALUFn, |
2302 | Limit: VALUWritesExecWaitStates); |
2303 | WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse); |
2304 | |
2305 | int SrcCIdx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::src2); |
2306 | |
  // Loop over uses, handling both DGEMM and S/HGEMM as the 2nd instruction.
2308 | for (const MachineOperand &Use : MI->explicit_uses()) { |
2309 | const int LegacyVALUNotDotWritesVGPRWaitStates = 2; |
2310 | const int SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates = 2; |
2311 | const int SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates = 8; |
2312 | const int SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates = 16; |
2313 | const int SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates = 3; |
2314 | const int SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates = 9; |
2315 | const int SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates = 17; |
2316 | const int DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 9; |
2317 | const int GFX950_DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 17; |
2318 | const int DMFMA4x4WritesVGPROverlappedSrcCWaitStates = 4; |
2319 | const int SMFMA4x4WritesVGPROverlappedSrcABWaitStates = 5; |
2320 | const int SMFMA16x16WritesVGPROverlappedSrcABWaitStates = 11; |
2321 | const int SMFMA32x32WritesVGPROverlappedSrcABWaitStates = 19; |
2322 | const int DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates = 6; |
2323 | const int DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 11; |
2324 | const int GFX950_DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 19; |
2325 | const int DMFMA4x4WritesVGPRFullSrcCWaitStates = 4; |
2326 | const int GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates = 2; |
2327 | const int MaxWaitStates = 19; |
2328 | |
2329 | if (!Use.isReg()) |
2330 | continue; |
2331 | Register Reg = Use.getReg(); |
2332 | bool FullReg; |
2333 | const MachineInstr *MI1; |
2334 | |
2335 | auto IsOverlappedMFMAFn = [Reg, &FullReg, &MI1, |
2336 | this](const MachineInstr &MI) { |
2337 | if (!SIInstrInfo::isMFMA(MI)) |
2338 | return false; |
2339 | Register DstReg = MI.getOperand(i: 0).getReg(); |
2340 | FullReg = (DstReg == Reg); |
2341 | MI1 = &MI; |
2342 | return TRI.regsOverlap(RegA: DstReg, RegB: Reg); |
2343 | }; |
2344 | |
2345 | WaitStatesNeededForUse = LegacyVALUNotDotWritesVGPRWaitStates - |
2346 | getWaitStatesSinceDef(Reg, IsHazardDef: IsLegacyVALUNotDotFn, Limit: MaxWaitStates); |
2347 | WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse); |
2348 | |
2349 | int NumWaitStates = |
2350 | getWaitStatesSinceDef(Reg, IsHazardDef: IsOverlappedMFMAFn, Limit: MaxWaitStates); |
2351 | if (NumWaitStates == std::numeric_limits<int>::max()) |
2352 | continue; |
2353 | |
2354 | int OpNo = Use.getOperandNo(); |
2355 | unsigned Opc1 = MI1->getOpcode(); |
2356 | int NeedWaitStates = 0; |
2357 | if (OpNo == SrcCIdx) { |
2358 | if (!SIInstrInfo::isDGEMM(Opcode: Opc) && |
2359 | (!ST.hasGFX940Insts() && SIInstrInfo::isDGEMM(Opcode: Opc1))) { |
2360 | NeedWaitStates = 0; |
2361 | } else if (FullReg) { |
2362 | if ((Opc == AMDGPU::V_MFMA_F64_4X4X4F64_e64 || |
2363 | Opc == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64) && |
2364 | (Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_e64 || |
2365 | Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64)) |
2366 | NeedWaitStates = DMFMA4x4WritesVGPRFullSrcCWaitStates; |
2367 | else if (ST.hasGFX940Insts() && |
2368 | TSchedModel.computeInstrLatency(MI: MI1) == 2) |
2369 | NeedWaitStates = GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates; |
2370 | } else { |
2371 | switch (Opc1) { |
2372 | case AMDGPU::V_MFMA_F64_16X16X4F64_e64: |
2373 | case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64: |
2374 | case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64: |
2375 | case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64: |
2376 | if (!TII.isXDL(MI: *MI)) |
2377 | NeedWaitStates = |
2378 | ST.hasGFX950Insts() |
2379 | ? GFX950_DMFMA16x16WritesVGPROverlappedSrcCWaitStates |
2380 | : DMFMA16x16WritesVGPROverlappedSrcCWaitStates; |
2381 | break; |
2382 | case AMDGPU::V_MFMA_F64_4X4X4F64_e64: |
2383 | case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64: |
2384 | if (!TII.isXDL(MI: *MI)) |
2385 | NeedWaitStates = DMFMA4x4WritesVGPROverlappedSrcCWaitStates; |
2386 | break; |
2387 | default: |
2388 | int NumPasses = TSchedModel.computeInstrLatency(MI: MI1); |
2389 | if (ST.hasGFX940Insts()) { |
2390 | if (TII.isXDL(MI: *MI) && !TII.isXDL(MI: *MI1)) |
2391 | break; |
2392 | |
2393 | NeedWaitStates = |
2394 | TII.isXDL(MI: *MI1) |
2395 | ? (TII.isXDL(MI: *MI) |
2396 | ? GFX940_XDL_N_PassWritesVGPROverlappedXDLOrSMFMASrcCWaitStates( |
2397 | NumPasses, IsGFX950: ST.hasGFX950Insts()) |
2398 | : GFX940_XDL_N_PassWritesVGPROverlappedSGEMMDGEMMSrcCWaitStates( |
2399 | NumPasses, IsGFX950: ST.hasGFX950Insts())) |
2400 | : GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates( |
2401 | NumPasses); |
2402 | break; |
2403 | } |
2404 | |
2405 | switch (NumPasses) { |
2406 | case 2: |
2407 | NeedWaitStates = |
2408 | SIInstrInfo::isDGEMM(Opcode: Opc) |
2409 | ? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates |
2410 | : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates; |
2411 | break; |
2412 | case 8: |
2413 | NeedWaitStates = |
2414 | SIInstrInfo::isDGEMM(Opcode: Opc) |
2415 | ? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates |
2416 | : SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates; |
2417 | break; |
2418 | case 16: |
2419 | NeedWaitStates = |
2420 | SIInstrInfo::isDGEMM(Opcode: Opc) |
2421 | ? SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates |
2422 | : SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates; |
2423 | break; |
2424 | default: |
2425 | llvm_unreachable("unexpected number of passes" ); |
2426 | } |
2427 | } |
2428 | } |
2429 | } else { |
2430 | switch (Opc1) { |
2431 | case AMDGPU::V_MFMA_F64_16X16X4F64_e64: |
2432 | case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64: |
2433 | case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64: |
2434 | case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64: |
2435 | NeedWaitStates = |
2436 | ST.hasGFX950Insts() |
2437 | ? GFX950_DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates |
2438 | : DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates; |
2439 | break; |
2440 | case AMDGPU::V_MFMA_F64_4X4X4F64_e64: |
2441 | case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64: |
2442 | NeedWaitStates = DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates; |
2443 | break; |
2444 | default: |
2445 | int NumPasses = TSchedModel.computeInstrLatency(MI: MI1); |
2446 | |
2447 | if (ST.hasGFX940Insts()) { |
2448 | NeedWaitStates = |
2449 | TII.isXDL(MI: *MI1) |
2450 | ? GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates( |
2451 | NumPasses, IsGFX950: ST.hasGFX950Insts()) |
2452 | : GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates( |
2453 | NumPasses); |
2454 | break; |
2455 | } |
2456 | |
2457 | switch (NumPasses) { |
2458 | case 2: |
2459 | NeedWaitStates = SMFMA4x4WritesVGPROverlappedSrcABWaitStates; |
2460 | break; |
2461 | case 4: |
2462 | llvm_unreachable("unexpected number of passes for mfma" ); |
2463 | case 8: |
2464 | NeedWaitStates = SMFMA16x16WritesVGPROverlappedSrcABWaitStates; |
2465 | break; |
2466 | case 16: |
2467 | default: |
2468 | NeedWaitStates = SMFMA32x32WritesVGPROverlappedSrcABWaitStates; |
2469 | } |
2470 | } |
2471 | } |
2472 | if (WaitStatesNeeded >= NeedWaitStates) |
2473 | continue; |
2474 | |
2475 | WaitStatesNeededForUse = NeedWaitStates - NumWaitStates; |
2476 | WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse); |
2477 | |
2478 | if (WaitStatesNeeded == MaxWaitStates) |
2479 | break; |
2480 | } |
2481 | |
2482 | // Pad neighboring MFMA with noops for better inter-wave performance. |
2483 | WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: checkMFMAPadding(MI)); |
2484 | |
2485 | return WaitStatesNeeded; |
2486 | } |
2487 | |
2488 | int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) { |
  // On gfx90a+, the relevant hazards are checked in checkMAIVALUHazards().
2490 | if (!ST.hasMAIInsts() || ST.hasGFX90AInsts()) |
2491 | return 0; |
2492 | |
2493 | int WaitStatesNeeded = 0; |
2494 | |
2495 | auto IsAccVgprReadFn = [](const MachineInstr &MI) { |
2496 | return MI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64; |
2497 | }; |
2498 | |
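  // Two hazards are checked per VGPR use: a v_accvgpr_read defining it
  // within 2 wait states, and (within 1 wait state) a v_accvgpr_read/write
  // whose register was itself recently written by a plain (non-MAI) VALU.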
2499 | for (const MachineOperand &Op : MI->explicit_uses()) { |
2500 | if (!Op.isReg() || !TRI.isVGPR(MRI: MF.getRegInfo(), Reg: Op.getReg())) |
2501 | continue; |
2502 | |
2503 | Register Reg = Op.getReg(); |
2504 | |
2505 | const int AccVgprReadLdStWaitStates = 2; |
2506 | const int VALUWriteAccVgprRdWrLdStDepVALUWaitStates = 1; |
2507 | const int MaxWaitStates = 2; |
2508 | |
2509 | int WaitStatesNeededForUse = AccVgprReadLdStWaitStates - |
2510 | getWaitStatesSinceDef(Reg, IsHazardDef: IsAccVgprReadFn, Limit: MaxWaitStates); |
2511 | WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse); |
2512 | |
2513 | if (WaitStatesNeeded == MaxWaitStates) |
2514 | return WaitStatesNeeded; // Early exit. |
2515 | |
2516 | auto IsVALUAccVgprRdWrCheckFn = [Reg, this](const MachineInstr &MI) { |
2517 | if (MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64 && |
2518 | MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64) |
2519 | return false; |
2520 | auto IsVALUFn = [](const MachineInstr &MI) { |
2521 | return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMAI(MI); |
2522 | }; |
2523 | return getWaitStatesSinceDef(Reg, IsHazardDef: IsVALUFn, Limit: 2 /*MaxWaitStates*/) < |
2524 | std::numeric_limits<int>::max(); |
2525 | }; |
2526 | |
2527 | WaitStatesNeededForUse = VALUWriteAccVgprRdWrLdStDepVALUWaitStates - |
2528 | getWaitStatesSince(IsHazard: IsVALUAccVgprRdWrCheckFn, Limit: MaxWaitStates); |
2529 | WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse); |
2530 | } |
2531 | |
2532 | return WaitStatesNeeded; |
2533 | } |
2534 | |
2535 | int GCNHazardRecognizer::checkPermlaneHazards(MachineInstr *MI) { |
2536 | assert(!ST.hasVcmpxPermlaneHazard() && |
2537 | "this is a different vcmpx+permlane hazard" ); |
2538 | const SIRegisterInfo *TRI = ST.getRegisterInfo(); |
2539 | const SIInstrInfo *TII = ST.getInstrInfo(); |
2540 | |
2541 | auto IsVCmpXWritesExecFn = [TII, TRI](const MachineInstr &MI) { |
2542 | return isVCmpXWritesExec(TII: *TII, TRI: *TRI, MI); |
2543 | }; |
2544 | |
2545 | auto IsVALUFn = [](const MachineInstr &MI) { |
2546 | return SIInstrInfo::isVALU(MI); |
2547 | }; |
2548 | |
2549 | const int VCmpXWritesExecWaitStates = 4; |
2550 | const int VALUWritesVDstWaitStates = 2; |
2551 | int WaitStatesNeeded = 0; |
2552 | |
2553 | for (const MachineOperand &Op : MI->explicit_uses()) { |
2554 | if (!Op.isReg() || !TRI->isVGPR(MRI: MF.getRegInfo(), Reg: Op.getReg())) |
2555 | continue; |
2556 | Register Reg = Op.getReg(); |
2557 | |
2558 | int WaitStatesSinceDef = |
2559 | VALUWritesVDstWaitStates - |
2560 | getWaitStatesSinceDef(Reg, IsHazardDef: IsVALUFn, |
2561 | /*MaxWaitStates=*/Limit: VALUWritesVDstWaitStates); |
2562 | WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesSinceDef); |
2563 | if (WaitStatesNeeded >= VALUWritesVDstWaitStates) |
2564 | break; |
2565 | } |
2566 | |
2567 | int VCmpXHazardWaits = |
2568 | VCmpXWritesExecWaitStates - |
2569 | getWaitStatesSince(IsHazard: IsVCmpXWritesExecFn, Limit: VCmpXWritesExecWaitStates); |
2570 | |
2571 | WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: VCmpXHazardWaits); |
2572 | return WaitStatesNeeded; |
2573 | } |
2574 | |
2575 | static int GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(int NumPasses) { |
2576 | // 2 pass -> 4 |
2577 | // 4 pass -> 6 |
2578 | // 8 pass -> 10 |
2579 | // 16 pass -> 18 |
2580 | return NumPasses + 2; |
2581 | } |
2582 | |
2583 | static int GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(int NumPasses, |
2584 | bool IsGFX950) { |
  // xdl def cycles | gfx942 | gfx950
  //  2 pass        |    5   |    5
  //  4 pass        |    7   |    8
  //  8 pass        |   11   |   12
  // 16 pass        |   19   |   20
2590 | return NumPasses + 3 + (NumPasses != 2 && IsGFX950); |
2591 | } |
2592 | |
2593 | static int GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses, |
2594 | bool IsGFX950) { |
  // xdl def cycles | gfx942 | gfx950
  //  2 pass        |    5   |    5
  //  4 pass        |    7   |    8
  //  8 pass        |   11   |   12
  // 16 pass        |   19   |   20
2600 | return NumPasses + 3 + (NumPasses != 2 && IsGFX950); |
2601 | } |
2602 | |
2603 | static int GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses) { |
2604 | // 2 pass -> 4 |
2605 | // 4 pass -> 6 |
2606 | // 8 pass -> 10 |
2607 | // 16 pass -> 18 |
2608 | return NumPasses + 2; |
2609 | } |
2610 | |
2611 | int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) { |
2612 | if (!ST.hasGFX90AInsts()) |
2613 | return 0; |
2614 | |
2615 | auto IsDGEMMFn = [](const MachineInstr &MI) -> bool { |
2616 | return SIInstrInfo::isDGEMM(Opcode: MI.getOpcode()); |
2617 | }; |
2618 | |
2619 | // This is checked in checkMAIHazards90A() |
2620 | if (SIInstrInfo::isMFMA(MI: *MI)) |
2621 | return 0; |
2622 | |
2623 | const MachineRegisterInfo &MRI = MF.getRegInfo(); |
2624 | |
2625 | int WaitStatesNeeded = 0; |
2626 | |
2627 | bool IsMem = SIInstrInfo::isVMEM(MI: *MI) || SIInstrInfo::isDS(MI: *MI); |
2628 | bool IsMemOrExport = IsMem || SIInstrInfo::isEXP(MI: *MI); |
2629 | bool IsVALU = SIInstrInfo::isVALU(MI: *MI); |
2630 | |
2631 | const MachineInstr *MFMA = nullptr; |
2632 | unsigned Reg; |
2633 | auto IsMFMAWriteFn = [&Reg, &MFMA, this](const MachineInstr &MI) { |
2634 | if (!SIInstrInfo::isMFMA(MI) || |
2635 | !TRI.regsOverlap(RegA: MI.getOperand(i: 0).getReg(), RegB: Reg)) |
2636 | return false; |
2637 | MFMA = &MI; |
2638 | return true; |
2639 | }; |
2640 | |
2641 | const MachineInstr *DOT = nullptr; |
2642 | auto IsDotWriteFn = [&Reg, &DOT, this](const MachineInstr &MI) { |
2643 | if (!SIInstrInfo::isDOT(MI) || |
2644 | !TRI.regsOverlap(RegA: MI.getOperand(i: 0).getReg(), RegB: Reg)) |
2645 | return false; |
2646 | DOT = &MI; |
2647 | return true; |
2648 | }; |
2649 | |
2650 | bool DGEMMAfterVALUWrite = false; |
2651 | auto IsDGEMMHazard = [&DGEMMAfterVALUWrite, this](const MachineInstr &MI) { |
2652 | // Found DGEMM on reverse traversal to def. |
2653 | if (SIInstrInfo::isDGEMM(Opcode: MI.getOpcode())) |
2654 | DGEMMAfterVALUWrite = true; |
2655 | |
    // Only a hazard if the register is defined by a VALU and a DGEMM is
    // found after the def.
2658 | if (!TII.isVALU(MI) || !DGEMMAfterVALUWrite) |
2659 | return false; |
2660 | |
2661 | return true; |
2662 | }; |
2663 | |
2664 | int SrcCIdx = AMDGPU::getNamedOperandIdx(Opcode: MI->getOpcode(), |
2665 | Name: AMDGPU::OpName::src2); |
2666 | |
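  // First handle RAW hazards on the uses of MI (DOT and MFMA writes feeding
  // a VALU, memory, or export read); WAW and WAR hazards on the defs are
  // handled in the loop further below.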
2667 | if (IsMemOrExport || IsVALU) { |
2668 | const int SMFMA4x4WriteVgprVALUMemExpReadWaitStates = 5; |
2669 | const int SMFMA16x16WriteVgprVALUMemExpReadWaitStates = 11; |
2670 | const int SMFMA32x32WriteVgprVALUMemExpReadWaitStates = 19; |
2671 | const int DMFMA4x4WriteVgprMemExpReadWaitStates = 9; |
2672 | const int DMFMA16x16WriteVgprMemExpReadWaitStates = 18; |
2673 | const int DMFMA4x4WriteVgprVALUReadWaitStates = 6; |
2674 | const int DMFMA16x16WriteVgprVALUReadWaitStates = 11; |
2675 | const int GFX950_DMFMA16x16WriteVgprVALUReadWaitStates = 19; |
2676 | const int DotWriteSameDotReadSrcAB = 3; |
2677 | const int DotWriteDifferentVALURead = 3; |
2678 | const int DMFMABetweenVALUWriteVMEMRead = 2; |
2679 | const int MaxWaitStates = 19; |
2680 | |
2681 | for (const MachineOperand &Use : MI->explicit_uses()) { |
2682 | if (!Use.isReg()) |
2683 | continue; |
2684 | Reg = Use.getReg(); |
2685 | |
2686 | DOT = nullptr; |
2687 | int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsHazardDef: IsDotWriteFn, |
2688 | Limit: MaxWaitStates); |
2689 | if (DOT) { |
2690 | int NeedWaitStates = 0; |
2691 | if (DOT->getOpcode() == MI->getOpcode()) { |
2692 | if (&Use - &MI->getOperand(i: 0) != SrcCIdx) |
2693 | NeedWaitStates = DotWriteSameDotReadSrcAB; |
2694 | } else { |
2695 | NeedWaitStates = DotWriteDifferentVALURead; |
2696 | } |
2697 | |
2698 | int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef; |
2699 | WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse); |
2700 | } |
2701 | |
      // Workaround for a HW data hazard bug observed only on GFX90A. When a
      // DGEMM instruction appears between a VALU and a VMEM instruction, the
      // SQ incorrectly fails to insert the two wait states between those
      // instructions that are needed to avoid the data hazard.
2706 | if (IsMem && ST.hasGFX90AInsts() && !ST.hasGFX940Insts()) { |
2707 | DGEMMAfterVALUWrite = false; |
2708 | if (TRI.isVectorRegister(MRI, Reg)) { |
2709 | int WaitStatesNeededForUse = |
2710 | DMFMABetweenVALUWriteVMEMRead - |
2711 | getWaitStatesSinceDef(Reg, IsHazardDef: IsDGEMMHazard, |
2712 | Limit: DMFMABetweenVALUWriteVMEMRead); |
2713 | |
2714 | WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse); |
2715 | } |
2716 | } |
2717 | |
2718 | MFMA = nullptr; |
2719 | WaitStatesSinceDef = |
2720 | getWaitStatesSinceDef(Reg, IsHazardDef: IsMFMAWriteFn, Limit: MaxWaitStates); |
2721 | if (!MFMA) |
2722 | continue; |
2723 | |
2724 | unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MI: MFMA); |
2725 | int NumPasses = HazardDefLatency; |
2726 | int NeedWaitStates = MaxWaitStates; |
2727 | |
2728 | if (SIInstrInfo::isDGEMM(Opcode: MFMA->getOpcode())) { |
2729 | switch (HazardDefLatency) { |
2730 | case 4: |
2731 | NeedWaitStates = IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates |
2732 | : DMFMA4x4WriteVgprVALUReadWaitStates; |
2733 | break; |
2734 | case 8: |
2735 | case 16: |
2736 | NeedWaitStates = |
2737 | IsMemOrExport |
2738 | ? DMFMA16x16WriteVgprMemExpReadWaitStates |
2739 | : (ST.hasGFX950Insts() |
2740 | ? GFX950_DMFMA16x16WriteVgprVALUReadWaitStates |
2741 | : DMFMA16x16WriteVgprVALUReadWaitStates); |
2742 | break; |
2743 | default: |
2744 | llvm_unreachable("unexpected dgemm" ); |
2745 | } |
2746 | } else if (ST.hasGFX940Insts()) { |
2747 | NeedWaitStates = |
2748 | TII.isXDL(MI: *MFMA) |
2749 | ? GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates( |
2750 | NumPasses, IsGFX950: ST.hasGFX950Insts()) |
2751 | : GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates( |
2752 | NumPasses); |
2753 | } else { |
2754 | switch (HazardDefLatency) { |
2755 | case 2: |
2756 | NeedWaitStates = SMFMA4x4WriteVgprVALUMemExpReadWaitStates; |
2757 | break; |
2758 | case 8: |
2759 | NeedWaitStates = SMFMA16x16WriteVgprVALUMemExpReadWaitStates; |
2760 | break; |
2761 | case 16: |
2762 | NeedWaitStates = SMFMA32x32WriteVgprVALUMemExpReadWaitStates; |
2763 | break; |
2764 | default: |
2765 | llvm_unreachable("unexpected number of passes for mfma" ); |
2766 | } |
2767 | } |
2768 | |
2769 | int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef; |
2770 | WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse); |
2771 | |
2772 | if (WaitStatesNeeded == MaxWaitStates) |
2773 | break; |
2774 | } |
2775 | } |
2776 | |
2777 | unsigned Opc = MI->getOpcode(); |
2778 | const int DMFMAToFMA64WaitStates = 2; |
2779 | if ((Opc == AMDGPU::V_FMA_F64_e64 || |
2780 | Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64 || |
2781 | Opc == AMDGPU::V_FMAC_F64_dpp) && |
2782 | WaitStatesNeeded < DMFMAToFMA64WaitStates) { |
2783 | int WaitStatesNeededForUse = DMFMAToFMA64WaitStates - |
2784 | getWaitStatesSince(IsHazard: IsDGEMMFn, Limit: DMFMAToFMA64WaitStates); |
2785 | WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse); |
2786 | } |
2787 | |
2788 | if (!IsVALU && !IsMemOrExport) |
2789 | return WaitStatesNeeded; |
2790 | |
2791 | for (const MachineOperand &Def : MI->defs()) { |
2792 | const int SMFMA4x4WriteVgprVALUWawWaitStates = 5; |
2793 | const int SMFMA16x16WriteVgprVALUWawWaitStates = 11; |
2794 | const int SMFMA32x32WriteVgprVALUWawWaitStates = 19; |
2795 | const int SMFMA4x4ReadVgprVALUWarWaitStates = 1; |
2796 | const int GFX940_XDL4PassReadVgprVALUWarWaitStates = 3; |
2797 | const int SMFMA16x16ReadVgprVALUWarWaitStates = 7; |
2798 | const int SMFMA32x32ReadVgprVALUWarWaitStates = 15; |
2799 | const int DMFMA4x4WriteVgprVALUWriteWaitStates = 6; |
2800 | const int DMFMA16x16WriteVgprVALUWriteWaitStates = 11; |
2801 | const int DotWriteDifferentVALUWrite = 3; |
2802 | const int MaxWaitStates = 19; |
2803 | const int MaxWarWaitStates = 15; |
2804 | |
2805 | Reg = Def.getReg(); |
2806 | |
2807 | DOT = nullptr; |
2808 | int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsHazardDef: IsDotWriteFn, |
2809 | Limit: MaxWaitStates); |
2810 | if (DOT && DOT->getOpcode() != MI->getOpcode()) |
2811 | WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: DotWriteDifferentVALUWrite - |
2812 | WaitStatesSinceDef); |
2813 | |
2814 | MFMA = nullptr; |
2815 | WaitStatesSinceDef = |
2816 | getWaitStatesSinceDef(Reg, IsHazardDef: IsMFMAWriteFn, Limit: MaxWaitStates); |
2817 | if (MFMA) { |
2818 | int NeedWaitStates = MaxWaitStates; |
2819 | int NumPasses = TSchedModel.computeInstrLatency(MI: MFMA); |
2820 | |
2821 | if (SIInstrInfo::isDGEMM(Opcode: MFMA->getOpcode())) { |
2822 | switch (NumPasses) { |
2823 | case 4: |
2824 | NeedWaitStates = DMFMA4x4WriteVgprVALUWriteWaitStates; |
2825 | break; |
2826 | case 8: |
2827 | case 16: |
2828 | NeedWaitStates = DMFMA16x16WriteVgprVALUWriteWaitStates; |
2829 | break; |
2830 | default: |
2831 | llvm_unreachable("unexpected number of cycles for dgemm" ); |
2832 | } |
2833 | } else if (ST.hasGFX940Insts()) { |
2834 | NeedWaitStates = |
2835 | TII.isXDL(MI: *MFMA) |
2836 | ? GFX940_XDL_N_PassWriteVgprVALUWawWaitStates( |
2837 | NumPasses, IsGFX950: ST.hasGFX950Insts()) |
2838 | : GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(NumPasses); |
2839 | } else { |
2840 | switch (NumPasses) { |
2841 | case 2: |
2842 | NeedWaitStates = SMFMA4x4WriteVgprVALUWawWaitStates; |
2843 | break; |
2844 | case 8: |
2845 | NeedWaitStates = SMFMA16x16WriteVgprVALUWawWaitStates; |
2846 | break; |
2847 | case 16: |
2848 | NeedWaitStates = SMFMA32x32WriteVgprVALUWawWaitStates; |
2849 | break; |
2850 | default: |
2851 | llvm_unreachable("Unexpected number of passes for mfma" ); |
2852 | } |
2853 | } |
2854 | |
2855 | int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef; |
2856 | WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse); |
2857 | |
2858 | if (WaitStatesNeeded == MaxWaitStates) |
2859 | break; |
2860 | } |

    auto IsSMFMAReadAsCFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
      if (!SIInstrInfo::isMFMA(MI) || SIInstrInfo::isDGEMM(MI.getOpcode()) ||
          !MI.readsRegister(Reg, &TRI))
        return false;

      if (ST.hasGFX940Insts() && !TII.isXDL(MI))
        return false;

      const MachineOperand *SrcC =
          TII.getNamedOperand(MI, AMDGPU::OpName::src2);
      assert(SrcC);
      if (!SrcC->isReg() || !TRI.regsOverlap(SrcC->getReg(), Reg))
        return false;

      MFMA = &MI;
      return true;
    };

    MFMA = nullptr;
    int WaitStatesSinceUse = getWaitStatesSince(IsSMFMAReadAsCFn,
                                                MaxWarWaitStates);
    if (!MFMA)
      continue;

    unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
    int NeedWaitStates = MaxWaitStates;
    switch (HazardDefLatency) {
    case 2:
      NeedWaitStates = SMFMA4x4ReadVgprVALUWarWaitStates;
      break;
    case 4:
      assert(ST.hasGFX940Insts());
      NeedWaitStates = GFX940_XDL4PassReadVgprVALUWarWaitStates;
      break;
    case 8:
      NeedWaitStates = SMFMA16x16ReadVgprVALUWarWaitStates;
      break;
    case 16:
      [[fallthrough]];
    default:
      NeedWaitStates = SMFMA32x32ReadVgprVALUWarWaitStates;
      break;
    }

    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceUse;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  return WaitStatesNeeded;
}

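// Ask the scheduler to prefer another candidate when the given unit is an
// MFMA that would issue within the latency window of a previously issued
// MFMA, since issuing MFMAs back to back would stall.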
bool GCNHazardRecognizer::ShouldPreferAnother(SUnit *SU) {
  if (!SU->isInstr())
    return false;

  const MachineInstr *MAI = nullptr;

  auto IsMFMAFn = [&MAI](const MachineInstr &MI) {
    MAI = nullptr;
    if (SIInstrInfo::isMFMA(MI))
      MAI = &MI;
    return MAI != nullptr;
  };

  MachineInstr *MI = SU->getInstr();
  if (IsMFMAFn(*MI)) {
    int W = getWaitStatesSince(IsMFMAFn, 16);
    if (MAI)
      return W < (int)TSchedModel.computeInstrLatency(MAI);
  }

  return false;
}

// Adjust global offsets for instructions bundled with S_GETPC_B64 after
// insertion of a new instruction.
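//
// For example (illustrative only; registers, symbol, and offsets vary):
//   s_getpc_b64 s[0:1]
//   s_waitcnt_depctr sa_sdst(0)        ; newly inserted, 4 bytes
//   s_add_u32 s0, s0, sym@rel32@lo+4   ; offset becomes sym@rel32@lo+8
//   s_addc_u32 s1, s1, sym@rel32@hi+12 ; offset becomes sym@rel32@hi+16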
static void updateGetPCBundle(MachineInstr *NewMI) {
  if (!NewMI->isBundled())
    return;

  // Find the start of the bundle.
  auto I = NewMI->getIterator();
  while (I->isBundledWithPred())
    I--;
  if (I->isBundle())
    I++;

  // Bail if this is not an S_GETPC bundle.
  if (I->getOpcode() != AMDGPU::S_GETPC_B64)
    return;

  // Update offsets of any references in the bundle.
  const unsigned NewBytes = 4;
  assert(NewMI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
         "Unexpected instruction insertion in bundle");
  auto NextMI = std::next(NewMI->getIterator());
  auto End = NewMI->getParent()->end();
  while (NextMI != End && NextMI->isBundledWithPred()) {
    for (auto &Operand : NextMI->operands()) {
      if (Operand.isGlobal())
        Operand.setOffset(Operand.getOffset() + NewBytes);
    }
    NextMI++;
  }
}

bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
  if (!ST.hasVALUMaskWriteHazard())
    return false;
  assert(!ST.hasExtendedWaitCounts());

  if (!ST.isWave64() || !SIInstrInfo::isSALU(*MI))
    return false;

  // The hazard sequence is three instructions:
  //   1. VALU reads SGPR as mask
  //   2. SALU writes SGPR
  //   3. SALU reads SGPR
  // The hazard expires if the distance between 2 and 3 is sufficient, but in
  // practice that happens in fewer than 10% of cases, so always assume the
  // hazard exists whenever 1 and 2 are present, rather than searching for 3.
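  //
  // For example (illustrative only; any non-EXEC, non-M0 SGPR qualifies):
  //   v_cndmask_b32 v0, v1, v2, s[2:3]   ; (1) VALU reads s[2:3] as mask
  //   s_mov_b64 s[2:3], exec             ; (2) SALU writes s[2:3]
  //   s_and_b64 s[4:5], s[2:3], vcc      ; (3) SALU reads s[2:3]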

  const MachineOperand *SDSTOp = TII.getNamedOperand(*MI, AMDGPU::OpName::sdst);
  if (!SDSTOp || !SDSTOp->isReg())
    return false;

  const Register HazardReg = SDSTOp->getReg();
  if (HazardReg == AMDGPU::EXEC || HazardReg == AMDGPU::EXEC_LO ||
      HazardReg == AMDGPU::EXEC_HI || HazardReg == AMDGPU::M0)
    return false;

  auto IsHazardFn = [HazardReg, this](const MachineInstr &I) {
    switch (I.getOpcode()) {
    case AMDGPU::V_ADDC_U32_e32:
    case AMDGPU::V_ADDC_U32_dpp:
    case AMDGPU::V_CNDMASK_B16_t16_e32:
    case AMDGPU::V_CNDMASK_B16_fake16_e32:
    case AMDGPU::V_CNDMASK_B16_t16_dpp:
    case AMDGPU::V_CNDMASK_B16_fake16_dpp:
    case AMDGPU::V_CNDMASK_B32_e32:
    case AMDGPU::V_CNDMASK_B32_dpp:
    case AMDGPU::V_DIV_FMAS_F32_e64:
    case AMDGPU::V_DIV_FMAS_F64_e64:
    case AMDGPU::V_SUBB_U32_e32:
    case AMDGPU::V_SUBB_U32_dpp:
    case AMDGPU::V_SUBBREV_U32_e32:
    case AMDGPU::V_SUBBREV_U32_dpp:
      // These implicitly read VCC as mask source.
      return HazardReg == AMDGPU::VCC || HazardReg == AMDGPU::VCC_LO ||
             HazardReg == AMDGPU::VCC_HI;
    case AMDGPU::V_ADDC_U32_e64:
    case AMDGPU::V_ADDC_U32_e64_dpp:
    case AMDGPU::V_CNDMASK_B16_t16_e64:
    case AMDGPU::V_CNDMASK_B16_fake16_e64:
    case AMDGPU::V_CNDMASK_B16_t16_e64_dpp:
    case AMDGPU::V_CNDMASK_B16_fake16_e64_dpp:
    case AMDGPU::V_CNDMASK_B32_e64:
    case AMDGPU::V_CNDMASK_B32_e64_dpp:
    case AMDGPU::V_SUBB_U32_e64:
    case AMDGPU::V_SUBB_U32_e64_dpp:
    case AMDGPU::V_SUBBREV_U32_e64:
    case AMDGPU::V_SUBBREV_U32_e64_dpp: {
      // Only check mask register overlaps.
      const MachineOperand *SSRCOp =
          TII.getNamedOperand(I, AMDGPU::OpName::src2);
      assert(SSRCOp);
      return TRI.regsOverlap(SSRCOp->getReg(), HazardReg);
    }
    default:
      return false;
    }
  };

  const MachineRegisterInfo &MRI = MF.getRegInfo();
  auto IsExpiredFn = [&MRI, this](const MachineInstr &I, int) {
    // s_waitcnt_depctr sa_sdst(0) mitigates hazard.
    if (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
        AMDGPU::DepCtr::decodeFieldSaSdst(I.getOperand(0).getImm()) == 0)
      return true;

    // VALU access to any SGPR or literal constant other than HazardReg
    // mitigates hazard. No need to check HazardReg here as this will
    // only be called when !IsHazardFn.
    if (!SIInstrInfo::isVALU(I))
      return false;
    for (int OpNo = 0, End = I.getNumOperands(); OpNo < End; ++OpNo) {
      const MachineOperand &Op = I.getOperand(OpNo);
      if (Op.isReg()) {
        Register OpReg = Op.getReg();
        // Only consider uses.
        if (!Op.isUse())
          continue;
        // Ignore EXEC.
        if (OpReg == AMDGPU::EXEC || OpReg == AMDGPU::EXEC_LO ||
            OpReg == AMDGPU::EXEC_HI)
          continue;
        // Ignore all implicit uses except VCC.
        if (Op.isImplicit()) {
          if (OpReg == AMDGPU::VCC || OpReg == AMDGPU::VCC_LO ||
              OpReg == AMDGPU::VCC_HI)
            return true;
          continue;
        }
        if (TRI.isSGPRReg(MRI, OpReg))
          return true;
      } else {
        const MCInstrDesc &InstDesc = I.getDesc();
        const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
        if (!TII.isInlineConstant(Op, OpInfo))
          return true;
      }
    }
    return false;
  };

  // Check for the hazard.
  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  auto NextMI = std::next(MI->getIterator());

  // Add s_waitcnt_depctr sa_sdst(0) after the SALU write.
  auto NewMI = BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(),
                       TII.get(AMDGPU::S_WAITCNT_DEPCTR))
                   .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));

  // The SALU write may be s_getpc in a bundle.
  updateGetPCBundle(NewMI);

  return true;
}

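// Ensure the entry block begins with an S_SETPRIO of at least \p Priority.
// Returns true if a new S_SETPRIO had to be inserted.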
static bool ensureEntrySetPrio(MachineFunction *MF, int Priority,
                               const SIInstrInfo &TII) {
  MachineBasicBlock &EntryMBB = MF->front();
  if (EntryMBB.begin() != EntryMBB.end()) {
    auto &EntryMI = *EntryMBB.begin();
    if (EntryMI.getOpcode() == AMDGPU::S_SETPRIO &&
        EntryMI.getOperand(0).getImm() >= Priority)
      return false;
  }

  BuildMI(EntryMBB, EntryMBB.begin(), DebugLoc(), TII.get(AMDGPU::S_SETPRIO))
      .addImm(Priority);
  return true;
}

bool GCNHazardRecognizer::fixRequiredExportPriority(MachineInstr *MI) {
  if (!ST.hasRequiredExportPriority())
    return false;

  // Assume the following shader types will never have exports,
  // and avoid adding or adjusting S_SETPRIO.
  MachineBasicBlock *MBB = MI->getParent();
  MachineFunction *MF = MBB->getParent();
  auto CC = MF->getFunction().getCallingConv();
  switch (CC) {
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_CS_Chain:
  case CallingConv::AMDGPU_CS_ChainPreserve:
  case CallingConv::AMDGPU_KERNEL:
    return false;
  default:
    break;
  }

  const int MaxPriority = 3;
  const int NormalPriority = 2;
  const int PostExportPriority = 0;

  auto It = MI->getIterator();
  switch (MI->getOpcode()) {
  case AMDGPU::S_ENDPGM:
  case AMDGPU::S_ENDPGM_SAVED:
  case AMDGPU::S_ENDPGM_ORDERED_PS_DONE:
  case AMDGPU::SI_RETURN_TO_EPILOG:
    // Ensure a shader with calls raises its priority at entry; this keeps
    // the priority correct if exports exist in a callee.
    if (MF->getFrameInfo().hasCalls())
      return ensureEntrySetPrio(MF, NormalPriority, TII);
    return false;
  case AMDGPU::S_SETPRIO: {
    // Raise the priority to at least NormalPriority, unless this S_SETPRIO
    // is part of an already applied workaround sequence.
    auto &PrioOp = MI->getOperand(0);
    int Prio = PrioOp.getImm();
    bool InWA = (Prio == PostExportPriority) &&
                (It != MBB->begin() && TII.isEXP(*std::prev(It)));
    if (InWA || Prio >= NormalPriority)
      return false;
    PrioOp.setImm(std::min(Prio + NormalPriority, MaxPriority));
    return true;
  }
  default:
    if (!TII.isEXP(*MI))
      return false;
    break;
  }

  // Check entry priority at each export (as there will only be a few).
  // Note: amdgpu_gfx can only be a callee, so defer to the caller's setprio.
  bool Changed = false;
  if (CC != CallingConv::AMDGPU_Gfx)
    Changed = ensureEntrySetPrio(MF, NormalPriority, TII);

  auto NextMI = std::next(It);
  bool EndOfShader = false;
  if (NextMI != MBB->end()) {
    // The workaround is only needed at the end of a sequence of exports.
    if (TII.isEXP(*NextMI))
      return Changed;
    // An appropriate S_SETPRIO after the export means the workaround has
    // already been applied.
    if (NextMI->getOpcode() == AMDGPU::S_SETPRIO &&
        NextMI->getOperand(0).getImm() == PostExportPriority)
      return Changed;
    EndOfShader = NextMI->getOpcode() == AMDGPU::S_ENDPGM;
  }

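  // The workaround sequence emitted below is, at most:
  //   s_setprio 0           ; drop to post-export priority
  //   s_waitcnt expcnt(0)   ; skipped at the end of the shader
  //   s_nop 0
  //   s_nop 0
  //   s_setprio 2           ; skipped at the end of the shader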
  const DebugLoc &DL = MI->getDebugLoc();

  // Lower priority.
  BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_SETPRIO))
      .addImm(PostExportPriority);

  if (!EndOfShader) {
    // Wait for exports to complete.
    BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_WAITCNT_EXPCNT))
        .addReg(AMDGPU::SGPR_NULL)
        .addImm(0);
  }

  BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_NOP)).addImm(0);
  BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_NOP)).addImm(0);

  if (!EndOfShader) {
    // Return to normal (higher) priority.
    BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_SETPRIO))
        .addImm(NormalPriority);
  }

  return true;
}
