GCNHazardRecognizer.cpp source code [llvm_projects/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp]

1	//===-- GCNHazardRecognizers.cpp - GCN Hazard Recognizer Impls ------------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	// This file implements hazard recognizers for scheduling on GCN processors.
10	//
11	//===----------------------------------------------------------------------===//
12
13	#include "GCNHazardRecognizer.h"
14	#include "GCNSubtarget.h"
15	#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
16	#include "SIMachineFunctionInfo.h"
17	#include "llvm/ADT/Statistic.h"
18	#include "llvm/CodeGen/MachineFrameInfo.h"
19	#include "llvm/CodeGen/MachineFunction.h"
20	#include "llvm/CodeGen/MachineInstrBuilder.h"
21	#include "llvm/CodeGen/ScheduleDAG.h"
22	#include "llvm/Support/Debug.h"
23	#include "llvm/TargetParser/TargetParser.h"
24
25	using namespace llvm;
26
27	#define DEBUG_TYPE "gcn-hazard-recognizer"
28
29	STATISTIC(NumWMMANopsHoisted,
30	"Number of WMMA hazard V_NOPs hoisted from loops");
31	STATISTIC(NumWMMAHoistingBailed,
32	"Number of WMMA hazards where V_NOP hoisting was not possible");
33
34	namespace {
35
36	struct MFMAPaddingRatioParser : public cl::parser<unsigned> {
37	MFMAPaddingRatioParser(cl::Option &O) : cl::parser<unsigned>(O) {}
38
39	bool parse(cl::Option &O, StringRef ArgName, StringRef Arg, unsigned &Value) {
40	if (Arg.getAsInteger(Radix: `0`, Result&: Value))
41	return O.error(Message: "'" + Arg + "' value invalid for uint argument!");
42
43	if (Value > `100`)
44	return O.error(Message: "'" + Arg + "' value must be in the range [0, 100]!");
45
46	return false;
47	}
48	};
49
50	} // end anonymous namespace
51
52	static cl::opt<unsigned, false, MFMAPaddingRatioParser>
53	MFMAPaddingRatio("amdgpu-mfma-padding-ratio", cl::init(Val: `0`), cl::Hidden,
54	cl::desc ("Fill a percentage of the latency between "
55	"neighboring MFMA with s_nops."));
56
57	// This is intended for debugging purposes only.
58	static cl::opt<unsigned>
59	NopPadding("amdgpu-snop-padding", cl::init(Val: `0`), cl::Hidden,
60	cl::desc ("Insert a s_nop x before every instruction"));
61
62	static cl::opt<bool> EnableWMMAVnopHoisting(
63	"amdgpu-wmma-vnop-hoisting", cl::init(Val: true), cl::Hidden,
64	cl::desc ("Hoist WMMA hazard V_NOPs from loops to preheaders"));
65
66	//===----------------------------------------------------------------------===//
67	// Hazard Recognizer Implementation
68	//===----------------------------------------------------------------------===//
69
70	static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
71	const GCNSubtarget &ST);
72
73	GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF,
74	MachineLoopInfo *MLI)
75	: IsHazardRecognizerMode(false), CurrCycleInstr(nullptr), MF(MF),
76	ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()),
77	TRI(TII.getRegisterInfo()), TSchedModel(TII.getSchedModel()), MLI(MLI),
78	ClauseUses (TRI.getNumRegUnits()), ClauseDefs (TRI.getNumRegUnits()) {
79	MaxLookAhead = MF.getRegInfo().isPhysRegUsed(PhysReg: AMDGPU::AGPR0) ? `19` : `5`;
80	RunLdsBranchVmemWARHazardFixup = shouldRunLdsBranchVmemWARHazardFixup(MF, ST);
81	}
82
83	void GCNHazardRecognizer::Reset() {
84	EmittedInstrs.clear();
85	}
86
87	void GCNHazardRecognizer::EmitInstruction(SUnit *SU) {
88	EmitInstruction(MI: SU->getInstr());
89	}
90
91	void GCNHazardRecognizer::EmitInstruction(MachineInstr *MI) {
92	CurrCycleInstr = MI;
93	}
94
95	static bool isDivFMas(unsigned Opcode) {
96	return Opcode == AMDGPU::V_DIV_FMAS_F32_e64 \|\| Opcode == AMDGPU::V_DIV_FMAS_F64_e64;
97	}
98
99	static bool isSGetReg(unsigned Opcode) {
100	return Opcode == AMDGPU::S_GETREG_B32 \|\| Opcode == AMDGPU::S_GETREG_B32_const;
101	}
102
103	static bool isSSetReg(unsigned Opcode) {
104	switch (Opcode) {
105	case AMDGPU::S_SETREG_B32:
106	case AMDGPU::S_SETREG_B32_mode:
107	case AMDGPU::S_SETREG_IMM32_B32:
108	case AMDGPU::S_SETREG_IMM32_B32_mode:
109	return true;
110	}
111	return false;
112	}
113
114	static bool isRWLane(unsigned Opcode) {
115	return Opcode == AMDGPU::V_READLANE_B32 \|\| Opcode == AMDGPU::V_WRITELANE_B32;
116	}
117
118	static bool isRFE(unsigned Opcode) {
119	return Opcode == AMDGPU::S_RFE_B64;
120	}
121
122	static bool isSMovRel(unsigned Opcode) {
123	switch (Opcode) {
124	case AMDGPU::S_MOVRELS_B32:
125	case AMDGPU::S_MOVRELS_B64:
126	case AMDGPU::S_MOVRELD_B32:
127	case AMDGPU::S_MOVRELD_B64:
128	return true;
129	default:
130	return false;
131	}
132	}
133
134	static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII,
135	const MachineInstr &MI) {
136	if (TII.isAlwaysGDS(Opcode: MI.getOpcode()))
137	return true;
138
139	switch (MI.getOpcode()) {
140	case AMDGPU::S_SENDMSG:
141	case AMDGPU::S_SENDMSGHALT:
142	case AMDGPU::S_TTRACEDATA:
143	return true;
144	// These DS opcodes don't support GDS.
145	case AMDGPU::DS_NOP:
146	case AMDGPU::DS_PERMUTE_B32:
147	case AMDGPU::DS_BPERMUTE_B32:
148	return false;
149	default:
150	if (TII.isDS(Opcode: MI.getOpcode())) {
151	int GDS = AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(),
152	Name: AMDGPU::OpName::gds);
153	if (MI.getOperand(i: GDS).getImm())
154	return true;
155	}
156	return false;
157	}
158	}
159
160	static bool isPermlane(const MachineInstr &MI) {
161	unsigned Opcode = MI.getOpcode();
162	return Opcode == AMDGPU::V_PERMLANE16_B32_e64 \|\|
163	Opcode == AMDGPU::V_PERMLANE64_B32 \|\|
164	Opcode == AMDGPU::V_PERMLANEX16_B32_e64 \|\|
165	Opcode == AMDGPU::V_PERMLANE16_VAR_B32_e64 \|\|
166	Opcode == AMDGPU::V_PERMLANEX16_VAR_B32_e64 \|\|
167	Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e32 \|\|
168	Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e64 \|\|
169	Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e32 \|\|
170	Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e64 \|\|
171	Opcode == AMDGPU::V_PERMLANE_BCAST_B32_e64 \|\|
172	Opcode == AMDGPU::V_PERMLANE_UP_B32_e64 \|\|
173	Opcode == AMDGPU::V_PERMLANE_DOWN_B32_e64 \|\|
174	Opcode == AMDGPU::V_PERMLANE_XOR_B32_e64 \|\|
175	Opcode == AMDGPU::V_PERMLANE_IDX_GEN_B32_e64;
176	}
177
178	static bool isLdsDma(const MachineInstr &MI) {
179	return SIInstrInfo::isVALU(MI) &&
180	(SIInstrInfo::isMUBUF(MI) \|\| SIInstrInfo::isFLAT(MI));
181	}
182
183	static unsigned getHWReg(const SIInstrInfo TII, const* MachineInstr &RegInstr) {
184	const MachineOperand *RegOp = TII->getNamedOperand(MI: RegInstr,
185	OperandName: AMDGPU::OpName::simm16);
186	return std::get<`0`>(t: AMDGPU::Hwreg::HwregEncoding::decode(Encoded: RegOp->getImm()));
187	}
188
189	ScheduleHazardRecognizer::HazardType
190	GCNHazardRecognizer::getHazardType(SUnit SU, int* Stalls) {
191	MachineInstr *MI = SU->getInstr();
192	// If we are not in "HazardRecognizerMode" and therefore not being run from
193	// the scheduler, track possible stalls from hazards but don't insert noops.
194	auto HazardType = IsHazardRecognizerMode ? NoopHazard : Hazard;
195
196	if (MI->isBundle())
197	return NoHazard;
198
199	if (SIInstrInfo::isSMRD(MI: *MI) && checkSMRDHazards(SMRD: MI) > `0`)
200	return HazardType;
201
202	if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > `0`)
203	return HazardType;
204
205	if (checkFPAtomicToDenormModeHazard(MI) > `0`)
206	return HazardType;
207
208	// Hazards which cannot be mitigated with S_NOPs.
209	if (!IsHazardRecognizerMode) {
210	if (checkWMMACoexecutionHazards(MI) > `0`)
211	return Hazard;
212	}
213
214	if (ST.hasNoDataDepHazard())
215	return NoHazard;
216
217	if (SIInstrInfo::isVMEM(MI: *MI) && checkVMEMHazards(VMEM: MI) > `0`)
218	return HazardType;
219
220	if (SIInstrInfo::isVALU(MI: *MI) && checkVALUHazards(VALU: MI) > `0`)
221	return HazardType;
222
223	if (SIInstrInfo::isDPP(MI: *MI) && checkDPPHazards(DPP: MI) > `0`)
224	return HazardType;
225
226	if (isDivFMas(Opcode: MI->getOpcode()) && checkDivFMasHazards(DivFMas: MI) > `0`)
227	return HazardType;
228
229	if (isRWLane(Opcode: MI->getOpcode()) && checkRWLaneHazards(RWLane: MI) > `0`)
230	return HazardType;
231
232	if ((SIInstrInfo::isVALU(MI: MI) \|\| SIInstrInfo::isVMEM(MI: MI) \|\|
233	SIInstrInfo::isDS(MI: MI) \|\| SIInstrInfo::isEXP(MI: MI)) &&
234	checkMAIVALUHazards(MI) > `0`)
235	return HazardType;
236
237	if (isSGetReg(Opcode: MI->getOpcode()) && checkGetRegHazards(GetRegInstr: MI) > `0`)
238	return HazardType;
239
240	if (isSSetReg(Opcode: MI->getOpcode()) && checkSetRegHazards(SetRegInstr: MI) > `0`)
241	return HazardType;
242
243	if (isRFE(Opcode: MI->getOpcode()) && checkRFEHazards(RFE: MI) > `0`)
244	return HazardType;
245
246	if (((ST.hasReadM0MovRelInterpHazard() &&
247	(TII.isVINTRP(MI: *MI) \|\| isSMovRel(Opcode: MI->getOpcode()) \|\|
248	MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 \|\|
249	MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) \|\|
250	(ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, MI: *MI)) \|\|
251	(ST.hasReadM0LdsDmaHazard() && isLdsDma(MI: *MI)) \|\|
252	(ST.hasReadM0LdsDirectHazard() &&
253	MI->readsRegister(Reg: AMDGPU::LDS_DIRECT, /TRI=/nullptr))) &&
254	checkReadM0Hazards(SMovRel: MI) > `0`)
255	return HazardType;
256
257	if (SIInstrInfo::isMAI(MI: *MI) && checkMAIHazards(MI) > `0`)
258	return HazardType;
259
260	if ((SIInstrInfo::isVMEM(MI: MI) \|\| SIInstrInfo::isDS(MI: MI)) &&
261	checkMAILdStHazards(MI) > `0`)
262	return HazardType;
263
264	if (MI->isInlineAsm() && checkInlineAsmHazards(IA: MI) > `0`)
265	return HazardType;
266
267	return NoHazard;
268	}
269
270	static void insertNoopsInBundle(MachineInstr MI, const* SIInstrInfo &TII,
271	unsigned Quantity) {
272	while (Quantity > `0`) {
273	unsigned Arg = std::min(a: Quantity, b: `8u`);
274	Quantity -= Arg;
275	BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(), MCID: TII.get(Opcode: AMDGPU::S_NOP))
276	.addImm(Val: Arg - `1`);
277	}
278	}
279
280	unsigned
281	GCNHazardRecognizer::getMFMAPipelineWaitStates(const MachineInstr &MI) const {
282	const MCSchedClassDesc *SC = TSchedModel.resolveSchedClass(MI: &MI);
283	assert(TSchedModel.getWriteProcResBegin(SC) !=
284	TSchedModel.getWriteProcResEnd(SC));
285	return TSchedModel.getWriteProcResBegin(SC)->ReleaseAtCycle;
286	}
287
288	void GCNHazardRecognizer::processBundle() {
289	MachineBasicBlock::instr_iterator MI = std::next(x: CurrCycleInstr->getIterator());
290	MachineBasicBlock::instr_iterator E = CurrCycleInstr->getParent()->instr_end();
291	// Check bundled MachineInstr's for hazards.
292	for (; MI != E && MI ->isInsideBundle(); ++MI) {
293	CurrCycleInstr = &*MI;
294	unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr);
295
296	if (IsHazardRecognizerMode) {
297	fixHazards(MI: CurrCycleInstr);
298
299	insertNoopsInBundle(MI: CurrCycleInstr, TII, Quantity: WaitStates);
300	}
301
302	// It’s unnecessary to track more than MaxLookAhead instructions. Since we
303	// include the bundled MI directly after, only add a maximum of
304	// (MaxLookAhead - 1) noops to EmittedInstrs.
305	for (unsigned i = `0`, e = std::min(a: WaitStates, b: MaxLookAhead - `1`); i < e; ++i)
306	EmittedInstrs.push_front(x: nullptr);
307
308	EmittedInstrs.push_front(x: CurrCycleInstr);
309	EmittedInstrs.resize(new_size: MaxLookAhead);
310	}
311	CurrCycleInstr = nullptr;
312	}
313
314	void GCNHazardRecognizer::runOnInstruction(MachineInstr *MI) {
315	assert(IsHazardRecognizerMode);
316
317	unsigned NumPreNoops = PreEmitNoops(MI);
318	EmitNoops(Quantity: NumPreNoops);
319	if (MI->isInsideBundle())
320	insertNoopsInBundle(MI, TII, Quantity: NumPreNoops);
321	else
322	TII.insertNoops(MBB&: *MI->getParent(), MI: MachineBasicBlock::iterator (MI),
323	Quantity: NumPreNoops);
324	EmitInstruction(MI);
325	AdvanceCycle();
326	}
327
328	unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
329	IsHazardRecognizerMode = true;
330	CurrCycleInstr = MI;
331	unsigned W = PreEmitNoopsCommon(MI);
332	fixHazards(MI);
333	CurrCycleInstr = nullptr;
334	return std::max(a: W, b: NopPadding.getValue());
335	}
336
337	unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr MI) const* {
338	if (MI->isBundle())
339	return `0`;
340
341	int WaitStates = `0`;
342
343	if (SIInstrInfo::isSMRD(MI: *MI))
344	return std::max(a: WaitStates, b: checkSMRDHazards(SMRD: MI));
345
346	if (ST.hasNSAtoVMEMBug())
347	WaitStates = std::max(a: WaitStates, b: checkNSAtoVMEMHazard(MI));
348
349	WaitStates = std::max(a: WaitStates, b: checkFPAtomicToDenormModeHazard(MI));
350
351	if (ST.hasNoDataDepHazard())
352	return WaitStates;
353
354	if (SIInstrInfo::isVMEM(MI: *MI))
355	WaitStates = std::max(a: WaitStates, b: checkVMEMHazards(VMEM: MI));
356
357	if (SIInstrInfo::isVALU(MI: *MI))
358	WaitStates = std::max(a: WaitStates, b: checkVALUHazards(VALU: MI));
359
360	if (SIInstrInfo::isDPP(MI: *MI))
361	WaitStates = std::max(a: WaitStates, b: checkDPPHazards(DPP: MI));
362
363	if (isDivFMas(Opcode: MI->getOpcode()))
364	WaitStates = std::max(a: WaitStates, b: checkDivFMasHazards(DivFMas: MI));
365
366	if (isRWLane(Opcode: MI->getOpcode()))
367	WaitStates = std::max(a: WaitStates, b: checkRWLaneHazards(RWLane: MI));
368
369	if ((SIInstrInfo::isVALU(MI: MI) \|\| SIInstrInfo::isVMEM(MI: MI) \|\|
370	SIInstrInfo::isDS(MI: MI) \|\| SIInstrInfo::isEXP(MI: MI)) &&
371	checkMAIVALUHazards(MI) > `0`)
372	WaitStates = std::max(a: WaitStates, b: checkMAIVALUHazards(MI));
373
374	if (MI->isInlineAsm())
375	return std::max(a: WaitStates, b: checkInlineAsmHazards(IA: MI));
376
377	if (isSGetReg(Opcode: MI->getOpcode()))
378	return std::max(a: WaitStates, b: checkGetRegHazards(GetRegInstr: MI));
379
380	if (isSSetReg(Opcode: MI->getOpcode()))
381	return std::max(a: WaitStates, b: checkSetRegHazards(SetRegInstr: MI));
382
383	if (isRFE(Opcode: MI->getOpcode()))
384	return std::max(a: WaitStates, b: checkRFEHazards(RFE: MI));
385
386	if ((ST.hasReadM0MovRelInterpHazard() &&
387	(TII.isVINTRP(MI: *MI) \|\| isSMovRel(Opcode: MI->getOpcode()) \|\|
388	MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 \|\|
389	MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) \|\|
390	(ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, MI: *MI)) \|\|
391	(ST.hasReadM0LdsDmaHazard() && isLdsDma(MI: *MI)) \|\|
392	(ST.hasReadM0LdsDirectHazard() &&
393	MI->readsRegister(Reg: AMDGPU::LDS_DIRECT, /TRI=/nullptr)))
394	return std::max(a: WaitStates, b: checkReadM0Hazards(SMovRel: MI));
395
396	if (SIInstrInfo::isMAI(MI: *MI))
397	return std::max(a: WaitStates, b: checkMAIHazards(MI));
398
399	if (SIInstrInfo::isVMEM(MI: MI) \|\| SIInstrInfo::isDS(MI: MI))
400	return std::max(a: WaitStates, b: checkMAILdStHazards(MI));
401
402	if (ST.hasGFX950Insts() && isPermlane(MI: *MI))
403	return std::max(a: WaitStates, b: checkPermlaneHazards(MI));
404
405	return WaitStates;
406	}
407
408	void GCNHazardRecognizer::EmitNoop() {
409	EmittedInstrs.push_front(x: nullptr);
410	}
411
412	void GCNHazardRecognizer::AdvanceCycle() {
413	// When the scheduler detects a stall, it will call AdvanceCycle() without
414	// emitting any instructions.
415	if (!CurrCycleInstr) {
416	EmittedInstrs.push_front(x: nullptr);
417	return;
418	}
419
420	if (CurrCycleInstr->isBundle()) {
421	processBundle();
422	return;
423	}
424
425	unsigned NumWaitStates = TII.getNumWaitStates(MI: *CurrCycleInstr);
426	if (!NumWaitStates) {
427	CurrCycleInstr = nullptr;
428	return;
429	}
430
431	// Keep track of emitted instructions
432	EmittedInstrs.push_front(x: CurrCycleInstr);
433
434	// Add a nullptr for each additional wait state after the first. Make sure
435	// not to add more than getMaxLookAhead() items to the list, since we
436	// truncate the list to that size right after this loop.
437	for (unsigned i = `1`, e = std::min(a: NumWaitStates, b: getMaxLookAhead());
438	i < e; ++i) {
439	EmittedInstrs.push_front(x: nullptr);
440	}
441
442	// getMaxLookahead() is the largest number of wait states we will ever need
443	// to insert, so there is no point in keeping track of more than that many
444	// wait states.
445	EmittedInstrs.resize(new_size: getMaxLookAhead());
446
447	CurrCycleInstr = nullptr;
448	}
449
450	void GCNHazardRecognizer::RecedeCycle() {
451	assert(!IsHazardRecognizerMode &&
452	"Bottom-up scheduling shouldn't run in hazard recognizer mode");
453	}
454
455	//===----------------------------------------------------------------------===//
456	// Helper Functions
457	//===----------------------------------------------------------------------===//
458
/// Result of the per-instruction callback passed to hasHazard().
enum HazardFnResult {
  HazardFound,   // Stop: hasHazard() returns true.
  HazardExpired, // Stop scanning this path; its predecessors are not visited.
  NoHazardFound  // Keep scanning.
};
460
461	// Search for a hazard in a block and its predecessors.
462	template <typename StateT>
463	static bool
464	hasHazard(StateT InitialState,
465	function_ref<HazardFnResult(StateT &, const MachineInstr &)> IsHazard,
466	function_ref<void(StateT &, const MachineInstr &)> UpdateState,
467	const MachineBasicBlock *InitialMBB,
468	MachineBasicBlock::const_reverse_instr_iterator InitialI) {
469	struct StateMapKey {
470	SmallVectorImpl<StateT> *States;
471	unsigned Idx;
472	static bool isEqual(const StateMapKey &LHS, const StateMapKey &RHS) {
473	return LHS.States == RHS.States && LHS.Idx == RHS.Idx;
474	}
475	};
476	struct StateMapKeyTraits : DenseMapInfo<StateMapKey> {
477	static inline StateMapKey getEmptyKey() {
478	return {static_cast<SmallVectorImpl<StateT> *>(
479	DenseMapInfo<void *>::getEmptyKey()),
480	DenseMapInfo<unsigned>::getEmptyKey()};
481	}
482	static inline StateMapKey getTombstoneKey() {
483	return {static_cast<SmallVectorImpl<StateT> *>(
484	DenseMapInfo<void *>::getTombstoneKey()),
485	DenseMapInfo<unsigned>::getTombstoneKey()};
486	}
487	static unsigned getHashValue(const StateMapKey &Key) {
488	return StateT::getHashValue((*Key.States)[Key.Idx]);
489	}
490	static unsigned getHashValue(const StateT &State) {
491	return StateT::getHashValue(State);
492	}
493	static bool isEqual(const StateMapKey &LHS, const StateMapKey &RHS) {
494	const auto EKey = getEmptyKey();
495	const auto TKey = getTombstoneKey();
496	if (StateMapKey::isEqual(LHS, EKey) \|\| StateMapKey::isEqual(RHS, EKey) \|\|
497	StateMapKey::isEqual(LHS, TKey) \|\| StateMapKey::isEqual(RHS, TKey))
498	return StateMapKey::isEqual(LHS, RHS);
499	return StateT::isEqual((LHS.States)[LHS.Idx], (RHS.States)[RHS.Idx]);
500	}
501	static bool isEqual(const StateT &LHS, const StateMapKey &RHS) {
502	if (StateMapKey::isEqual(RHS, getEmptyKey()) \|\|
503	StateMapKey::isEqual(RHS, getTombstoneKey()))
504	return false;
505	return StateT::isEqual(LHS, (*RHS.States)[RHS.Idx]);
506	}
507	};
508
509	SmallDenseMap<StateMapKey, unsigned, `8`, StateMapKeyTraits> StateMap;
510	SmallVector<StateT, `8`> States;
511
512	MachineBasicBlock::const_reverse_instr_iterator I = InitialI;
513	const MachineBasicBlock *MBB = InitialMBB;
514	StateT State = InitialState;
515
516	SmallSetVector<std::pair<const MachineBasicBlock , unsigned*>, `16`> Worklist;
517	unsigned WorkIdx = `0`;
518	for (;;) {
519	bool Expired = false;
520	for (auto E = MBB->instr_rend(); I != E; ++I) {
521	// No need to look at parent BUNDLE instructions.
522	if (I ->isBundle())
523	continue;
524
525	auto Result = IsHazard(State, *I);
526	if (Result == HazardFound)
527	return true;
528	if (Result == HazardExpired) {
529	Expired = true;
530	break;
531	}
532
533	if (I ->isInlineAsm() \|\| I ->isMetaInstruction())
534	continue;
535
536	UpdateState(State, *I);
537	}
538
539	if (!Expired) {
540	unsigned StateIdx = States.size();
541	StateMapKey Key = {&States, StateIdx};
542	auto Insertion = StateMap.insert_as(std::pair(Key, StateIdx), State);
543	if (Insertion.second) {
544	States.emplace_back(State);
545	} else {
546	StateIdx = Insertion.first->second;
547	}
548	for (MachineBasicBlock *Pred : MBB->predecessors())
549	Worklist.insert(X: std::pair(Pred, StateIdx));
550	}
551
552	if (WorkIdx == Worklist.size())
553	break;
554
555	unsigned StateIdx;
556	std::tie(args&: MBB, args&: StateIdx) = Worklist [WorkIdx++];
557	State = States[StateIdx];
558	I = MBB->instr_rbegin();
559	}
560
561	return false;
562	}
563
564	// Returns a minimum wait states since \p I walking all predecessors.
565	// Only scans until \p IsExpired does not return true.
566	// Can only be run in a hazard recognizer mode.
567	static int
568	getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
569	const MachineBasicBlock *MBB,
570	MachineBasicBlock::const_reverse_instr_iterator I,
571	int WaitStates, GCNHazardRecognizer::IsExpiredFn IsExpired,
572	DenseSet<const MachineBasicBlock *> &Visited,
573	GCNHazardRecognizer::GetNumWaitStatesFn GetNumWaitStates =
574	SIInstrInfo::getNumWaitStates) {
575	for (auto E = MBB->instr_rend(); I != E; ++I) {
576	// Don't add WaitStates for parent BUNDLE instructions.
577	if (I ->isBundle())
578	continue;
579
580	if (IsHazard (*I))
581	return WaitStates;
582
583	if (I ->isInlineAsm())
584	continue;
585
586	WaitStates += GetNumWaitStates (*I);
587
588	if (IsExpired (*I, WaitStates))
589	return std::numeric_limits<int>::max();
590	}
591
592	int MinWaitStates = std::numeric_limits<int>::max();
593	for (MachineBasicBlock *Pred : MBB->predecessors()) {
594	if (!Visited.insert(V: Pred).second)
595	continue;
596
597	int W = getWaitStatesSince(IsHazard, MBB: Pred, I: Pred->instr_rbegin(), WaitStates,
598	IsExpired, Visited, GetNumWaitStates);
599
600	MinWaitStates = std::min(a: MinWaitStates, b: W);
601	}
602
603	return MinWaitStates;
604	}
605
606	static int
607	getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
608	const MachineInstr *MI,
609	GCNHazardRecognizer::IsExpiredFn IsExpired,
610	GCNHazardRecognizer::GetNumWaitStatesFn GetNumWaitStates =
611	SIInstrInfo::getNumWaitStates) {
612	DenseSet<const MachineBasicBlock *> Visited;
613	return getWaitStatesSince(IsHazard, MBB: MI->getParent(),
614	I: std::next(x: MI->getReverseIterator()), WaitStates: `0`, IsExpired,
615	Visited, GetNumWaitStates);
616	}
617
618	int GCNHazardRecognizer::getWaitStatesSince(
619	IsHazardFn IsHazard, int Limit, GetNumWaitStatesFn GetNumWaitStates) const {
620	if (IsHazardRecognizerMode) {
621	auto IsExpiredFn = [Limit](const MachineInstr &, int WaitStates) {
622	return WaitStates >= Limit;
623	};
624	return ::getWaitStatesSince(IsHazard, MI: CurrCycleInstr, IsExpired: IsExpiredFn,
625	GetNumWaitStates);
626	}
627
628	int WaitStates = `0`;
629	for (MachineInstr *MI : EmittedInstrs) {
630	if (MI) {
631	if (IsHazard (*MI))
632	return WaitStates;
633
634	if (MI->isInlineAsm())
635	continue;
636	}
637	WaitStates += MI ? GetNumWaitStates (*MI) : `1`;
638
639	if (WaitStates >= Limit)
640	break;
641	}
642	return std::numeric_limits<int>::max();
643	}
644
645	int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard,
646	int Limit) const {
647	return getWaitStatesSince(IsHazard, Limit, GetNumWaitStates: SIInstrInfo::getNumWaitStates);
648	}
649
650	int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
651	IsHazardFn IsHazardDef,
652	int Limit) const {
653	const SIRegisterInfo *TRI = ST.getRegisterInfo();
654
655	auto IsHazardFn = [IsHazardDef, TRI, Reg](const MachineInstr &MI) {
656	return IsHazardDef (MI) && MI.modifiesRegister(Reg, TRI);
657	};
658
659	return getWaitStatesSince(IsHazard: IsHazardFn, Limit);
660	}
661
662	int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
663	int Limit) const {
664	auto IsHazardFn = [IsHazard](const MachineInstr &MI) {
665	return isSSetReg(Opcode: MI.getOpcode()) && IsHazard (MI);
666	};
667
668	return getWaitStatesSince(IsHazard: IsHazardFn, Limit);
669	}
670
671	//===----------------------------------------------------------------------===//
672	// No-op Hazard Detection
673	//===----------------------------------------------------------------------===//
674
675	static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV,
676	MCRegister Reg) {
677	for (MCRegUnit Unit : TRI.regunits(Reg))
678	BV.set(static_cast<unsigned>(Unit));
679	}
680
681	static void addRegsToSet(const SIRegisterInfo &TRI,
682	iterator_range<MachineInstr::const_mop_iterator> Ops,
683	BitVector &DefSet, BitVector &UseSet) {
684	for (const MachineOperand &Op : Ops) {
685	if (Op.isReg())
686	addRegUnits(TRI, BV&: Op.isDef() ? DefSet : UseSet, Reg: Op.getReg().asMCReg());
687	}
688	}
689
690	void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) const {
691	addRegsToSet(TRI, Ops: MI.operands(), DefSet&: ClauseDefs, UseSet&: ClauseUses);
692	}
693
694	static bool breaksSMEMSoftClause(MachineInstr *MI) {
695	return !SIInstrInfo::isSMRD(MI: *MI);
696	}
697
698	static bool breaksVMEMSoftClause(MachineInstr *MI) {
699	return !SIInstrInfo::isVMEM(MI: *MI);
700	}
701
702	int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr MEM) const* {
703	// SMEM soft clause are only present on VI+, and only matter if xnack is
704	// enabled.
705	if (!ST.isXNACKEnabled())
706	return `0`;
707
708	bool IsSMRD = TII.isSMRD(MI: *MEM);
709
710	resetClause();
711
712	// A soft-clause is any group of consecutive SMEM instructions. The
713	// instructions in this group may return out of order and/or may be
714	// replayed (i.e. the same instruction issued more than once).
715	//
716	// In order to handle these situations correctly we need to make sure that
717	// when a clause has more than one instruction, no instruction in the clause
718	// writes to a register that is read by another instruction in the clause
719	// (including itself). If we encounter this situation, we need to break the
720	// clause by inserting a non SMEM instruction.
721
722	for (MachineInstr *MI : EmittedInstrs) {
723	// When we hit a non-SMEM instruction then we have passed the start of the
724	// clause and we can stop.
725	if (!MI)
726	break;
727
728	if (IsSMRD ? breaksSMEMSoftClause(MI) : breaksVMEMSoftClause(MI))
729	break;
730
731	addClauseInst(MI: *MI);
732	}
733
734	if (ClauseDefs.none())
735	return `0`;
736
737	// We need to make sure not to put loads and stores in the same clause if they
738	// use the same address. For now, just start a new clause whenever we see a
739	// store.
740	if (MEM->mayStore())
741	return `1`;
742
743	addClauseInst(MI: *MEM);
744
745	// If the set of defs and uses intersect then we cannot add this instruction
746	// to the clause, so we have a hazard.
747	return ClauseDefs.anyCommon(RHS: ClauseUses) ? `1` : `0`;
748	}
749
750	int GCNHazardRecognizer::checkSMRDHazards(MachineInstr SMRD) const* {
751	int WaitStatesNeeded = `0`;
752
753	WaitStatesNeeded = checkSoftClauseHazards(MEM: SMRD);
754
755	// This SMRD hazard only affects SI.
756	if (!ST.hasSMRDReadVALUDefHazard())
757	return WaitStatesNeeded;
758
759	// A read of an SGPR by SMRD instruction requires 4 wait states when the
760	// SGPR was written by a VALU instruction.
761	int SmrdSgprWaitStates = `4`;
762	auto IsHazardDefFn = [this](const MachineInstr &MI) {
763	return TII.isVALU(MI);
764	};
765	auto IsBufferHazardDefFn = [this](const MachineInstr &MI) {
766	return TII.isSALU(MI);
767	};
768
769	bool IsBufferSMRD = TII.isBufferSMRD(MI: *SMRD);
770
771	for (const MachineOperand &Use : SMRD->uses()) {
772	if (!Use.isReg())
773	continue;
774	int WaitStatesNeededForUse =
775	SmrdSgprWaitStates - getWaitStatesSinceDef(Reg: Use.getReg(), IsHazardDef: IsHazardDefFn,
776	Limit: SmrdSgprWaitStates);
777	WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
778
779	// This fixes what appears to be undocumented hardware behavior in SI where
780	// s_mov writing a descriptor and s_buffer_load_dword reading the descriptor
781	// needs some number of nops in between. We don't know how many we need, but
782	// let's use 4. This wasn't discovered before probably because the only
783	// case when this happens is when we expand a 64-bit pointer into a full
784	// descriptor and use s_buffer_load_dword instead of s_load_dword, which was
785	// probably never encountered in the closed-source land.
786	if (IsBufferSMRD) {
787	int WaitStatesNeededForUse =
788	SmrdSgprWaitStates - getWaitStatesSinceDef(Reg: Use.getReg(),
789	IsHazardDef: IsBufferHazardDefFn,
790	Limit: SmrdSgprWaitStates);
791	WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
792	}
793	}
794
795	return WaitStatesNeeded;
796	}
797
798	int GCNHazardRecognizer::checkVMEMHazards(MachineInstr VMEM) const* {
799	if (!ST.hasVMEMReadSGPRVALUDefHazard())
800	return `0`;
801
802	int WaitStatesNeeded = checkSoftClauseHazards(MEM: VMEM);
803
804	// A read of an SGPR by a VMEM instruction requires 5 wait states when the
805	// SGPR was written by a VALU Instruction.
806	const int VmemSgprWaitStates = `5`;
807	auto IsHazardDefFn = [this](const MachineInstr &MI) {
808	return TII.isVALU(MI);
809	};
810	for (const MachineOperand &Use : VMEM->uses()) {
811	if (!Use.isReg() \|\| TRI.isVectorRegister(MRI: MF.getRegInfo(), Reg: Use.getReg()))
812	continue;
813
814	int WaitStatesNeededForUse =
815	VmemSgprWaitStates - getWaitStatesSinceDef(Reg: Use.getReg(), IsHazardDef: IsHazardDefFn,
816	Limit: VmemSgprWaitStates);
817	WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
818	}
819	return WaitStatesNeeded;
820	}
821
822	int GCNHazardRecognizer::checkDPPHazards(MachineInstr DPP) const* {
823	const SIRegisterInfo *TRI = ST.getRegisterInfo();
824	const SIInstrInfo *TII = ST.getInstrInfo();
825
826	// Check for DPP VGPR read after VALU VGPR write and EXEC write.
827	int DppVgprWaitStates = `2`;
828	int DppExecWaitStates = `5`;
829	int WaitStatesNeeded = `0`;
830	auto IsHazardDefFn = [TII](const MachineInstr &MI) {
831	return TII->isVALU(MI);
832	};
833
834	for (const MachineOperand &Use : DPP->uses()) {
835	if (!Use.isReg() \|\| !TRI->isVGPR(MRI: MF.getRegInfo(), Reg: Use.getReg()))
836	continue;
837	int WaitStatesNeededForUse =
838	DppVgprWaitStates - getWaitStatesSinceDef(
839	Reg: Use.getReg(),
840	IsHazardDef: [](const MachineInstr &) { return true; },
841	Limit: DppVgprWaitStates);
842	WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
843	}
844
845	WaitStatesNeeded = std::max(
846	a: WaitStatesNeeded,
847	b: DppExecWaitStates - getWaitStatesSinceDef(Reg: AMDGPU::EXEC, IsHazardDef: IsHazardDefFn,
848	Limit: DppExecWaitStates));
849
850	return WaitStatesNeeded;
851	}
852
853	int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr DivFMas) const* {
854	const SIInstrInfo *TII = ST.getInstrInfo();
855
856	// v_div_fmas requires 4 wait states after a write to vcc from a VALU
857	// instruction.
858	const int DivFMasWaitStates = `4`;
859	auto IsHazardDefFn = [TII](const MachineInstr &MI) {
860	return TII->isVALU(MI);
861	};
862	int WaitStatesNeeded = getWaitStatesSinceDef(Reg: AMDGPU::VCC, IsHazardDef: IsHazardDefFn,
863	Limit: DivFMasWaitStates);
864
865	return DivFMasWaitStates - WaitStatesNeeded;
866	}
867
868	int GCNHazardRecognizer::checkGetRegHazards(MachineInstr GetRegInstr) const* {
869	const SIInstrInfo *TII = ST.getInstrInfo();
870	unsigned GetRegHWReg = getHWReg(TII, RegInstr: *GetRegInstr);
871
872	const int GetRegWaitStates = `2`;
873	auto IsHazardFn = [TII, GetRegHWReg](const MachineInstr &MI) {
874	return GetRegHWReg == getHWReg(TII, RegInstr: MI);
875	};
876	int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazard: IsHazardFn, Limit: GetRegWaitStates);
877
878	return GetRegWaitStates - WaitStatesNeeded;
879	}
880
881	int GCNHazardRecognizer::checkSetRegHazards(MachineInstr SetRegInstr) const* {
882	const SIInstrInfo *TII = ST.getInstrInfo();
883	unsigned HWReg = getHWReg(TII, RegInstr: *SetRegInstr);
884
885	const int SetRegWaitStates = ST.getSetRegWaitStates();
886	auto IsHazardFn = [TII, HWReg](const MachineInstr &MI) {
887	return HWReg == getHWReg(TII, RegInstr: MI);
888	};
889	int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazard: IsHazardFn, Limit: SetRegWaitStates);
890	return SetRegWaitStates - WaitStatesNeeded;
891	}
892
893	int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) const {
894	if (!MI.mayStore())
895	return -`1`;
896
897	const SIInstrInfo *TII = ST.getInstrInfo();
898	unsigned Opcode = MI.getOpcode();
899	const MCInstrDesc &Desc = MI.getDesc();
900
901	int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, Name: AMDGPU::OpName::vdata);
902	int VDataRCID = -`1`;
903	if (VDataIdx != -`1`)
904	VDataRCID = TII->getOpRegClassID(OpInfo: Desc.operands()[VDataIdx]);
905
906	if (TII->isMUBUF(MI) \|\| TII->isMTBUF(MI)) {
907	// There is no hazard if the instruction does not use vector regs
908	// (like wbinvl1)
909	if (VDataIdx == -`1`)
910	return -`1`;
911	// For MUBUF/MTBUF instructions this hazard only exists if the
912	// instruction is not using a register in the soffset field.
913	const MachineOperand *SOffset =
914	TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::soffset);
915	// If we have no soffset operand, then assume this field has been
916	// hardcoded to zero.
917	if (AMDGPU::getRegBitWidth(RCID: VDataRCID) > `64` &&
918	(!SOffset \|\| !SOffset->isReg()))
919	return VDataIdx;
920	}
921
922	// MIMG instructions create a hazard if they don't use a 256-bit T# and
923	// the store size is greater than 8 bytes and they have more than two bits
924	// of their dmask set.
925	// All our MIMG definitions use a 256-bit T#, so we can skip checking for them.
926	if (TII->isMIMG(MI)) {
927	int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, Name: AMDGPU::OpName::srsrc);
928	assert(SRsrcIdx != -`1` && AMDGPU::getRegBitWidth(TII->getOpRegClassID(
929	Desc.operands()[SRsrcIdx])) == `256`);
930	(void)SRsrcIdx;
931	}
932
933	if (TII->isFLAT(MI)) {
934	// There is no hazard if the instruction does not use vector regs
935	if (VDataIdx == -`1`)
936	return -`1`;
937
938	if (AMDGPU::getRegBitWidth(RCID: VDataRCID) > `64`)
939	return VDataIdx;
940	}
941
942	return -`1`;
943	}
944
945	int GCNHazardRecognizer::checkVALUHazardsHelper(
946	const MachineOperand &Def, const MachineRegisterInfo &MRI) const {
947	// Helper to check for the hazard where VMEM instructions that store more than
948	// 8 bytes can have there store data over written by the next instruction.
949	const SIRegisterInfo *TRI = ST.getRegisterInfo();
950
951	const int VALUWaitStates = ST.hasGFX940Insts() ? `2` : `1`;
952	int WaitStatesNeeded = `0`;
953
954	if (!TRI->isVectorRegister(MRI, Reg: Def.getReg()))
955	return WaitStatesNeeded;
956	Register Reg = Def.getReg();
957	auto IsHazardFn = [this, Reg, TRI](const MachineInstr &MI) {
958	int DataIdx = createsVALUHazard(MI);
959	return DataIdx >= `0` &&
960	TRI->regsOverlap(RegA: MI.getOperand(i: DataIdx).getReg(), RegB: Reg);
961	};
962
963	int WaitStatesNeededForDef =
964	VALUWaitStates - getWaitStatesSince(IsHazard: IsHazardFn, Limit: VALUWaitStates);
965	WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForDef);
966
967	return WaitStatesNeeded;
968	}
969
970	/// Dest sel forwarding issue occurs if additional logic is needed to swizzle /
971	/// pack the computed value into correct bit position of the dest register. This
972	/// occurs if we have SDWA with dst_sel != DWORD or if we have op_sel with
973	/// dst_sel that is not aligned to the register. This function analayzes the \p
974	/// MI and \returns an operand with dst forwarding issue, or nullptr if
975	/// none exists.
976	static const MachineOperand *
977	getDstSelForwardingOperand(const MachineInstr &MI, const GCNSubtarget &ST) {
978	if (!SIInstrInfo::isVALU(MI))
979	return nullptr;
980
981	const SIInstrInfo *TII = ST.getInstrInfo();
982
983	unsigned Opcode = MI.getOpcode();
984
985	// There are three different types of instructions
986	// which produce forwarded dest: 1. SDWA with dst_sel != DWORD, 2. VOP3
987	// which write hi bits (e.g. op_sel[3] == 1), and 3. FP8DstSelInst
988	// (instructions with dest byte sel, e.g. CVT_SR_BF8_F32) and
989	// op_sel[3:2]
990	// != 0
991	if (SIInstrInfo::isSDWA(MI)) {
992	// Type 1: SDWA with dst_sel != DWORD
993	if (auto *DstSel = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::dst_sel))
994	if (DstSel->getImm() != AMDGPU::SDWA::DWORD)
995	return TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::vdst);
996	}
997
998	AMDGPU::FPType IsFP4OrFP8ConvOpc = AMDGPU::getFPDstSelType(Opc: Opcode);
999	if (AMDGPU::hasNamedOperand(Opcode, NamedIdx: AMDGPU::OpName::op_sel)) {
1000	// Type 2: VOP3 which write the hi bits
1001	if (TII->getNamedImmOperand(MI, OperandName: AMDGPU::OpName::src0_modifiers) &
1002	SISrcMods::DST_OP_SEL)
1003	return TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::vdst);
1004
1005	// Type 3: FP8DstSelInst with op_sel[3:2] != 0)
1006	if (IsFP4OrFP8ConvOpc == AMDGPU::FPType::FP8 &&
1007	(TII->getNamedImmOperand(MI, OperandName: AMDGPU::OpName::src2_modifiers) &
1008	SISrcMods::OP_SEL_0))
1009	return TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::vdst);
1010	}
1011
1012	// Special case: nop is required for all the opsel values for fp4 sr variant
1013	// cvt scale instructions
1014	if (IsFP4OrFP8ConvOpc == AMDGPU::FPType::FP4)
1015	return TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::vdst);
1016
1017	return nullptr;
1018	}
1019
1020	/// Checks whether the provided \p MI "consumes" the operand with a Dest sel
1021	/// fowarding issue \p Dst . We may "consume" the Dst via a standard explicit
1022	/// RAW, or through irregular ways (e.g implicit RAW, certain types of WAW)
1023	static bool consumesDstSelForwardingOperand(const MachineInstr *VALU,
1024	const MachineOperand *Dst,
1025	const SIRegisterInfo *TRI) {
1026	// We must consider implicit reads of the VALU. SDWA with dst_sel and
1027	// UNUSED_PRESERVE will implicitly read the result from forwarded dest,
1028	// and we must account for that hazard.
1029	// We also must account for WAW hazards. In particular, WAW with dest
1030	// preserve semantics (e.g. VOP3 with op_sel, VOP2 &&
1031	// !zeroesHigh16BitsOfDest) will read the forwarded dest for parity
1032	// check for ECC. Without accounting for this hazard, the ECC will be
1033	// wrong.
1034	// TODO: limit to RAW (including implicit reads) + problematic WAW (i.e.
1035	// complete zeroesHigh16BitsOfDest)
1036	for (auto &Operand : VALU->operands()) {
1037	if (Operand.isReg() && TRI->regsOverlap(RegA: Dst->getReg(), RegB: Operand.getReg())) {
1038	return true;
1039	}
1040	}
1041	return false;
1042	}
1043
1044	int GCNHazardRecognizer::checkVALUHazards(MachineInstr VALU) const* {
1045	int WaitStatesNeeded = `0`;
1046
1047	if (ST.hasTransForwardingHazard() && !SIInstrInfo::isTRANS(MI: *VALU)) {
1048	const int TransDefWaitstates = `1`;
1049
1050	auto IsTransDefFn = [this, VALU](const MachineInstr &MI) {
1051	if (!SIInstrInfo::isTRANS(MI))
1052	return false;
1053	const SIRegisterInfo *TRI = ST.getRegisterInfo();
1054	const SIInstrInfo *TII = ST.getInstrInfo();
1055	Register Def = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::vdst)->getReg();
1056
1057	for (const MachineOperand &Use : VALU->explicit_uses()) {
1058	if (Use.isReg() && TRI->regsOverlap(RegA: Def, RegB: Use.getReg()))
1059	return true;
1060	}
1061
1062	return false;
1063	};
1064
1065	int WaitStatesNeededForDef =
1066	TransDefWaitstates -
1067	getWaitStatesSince(IsHazard: IsTransDefFn, Limit: TransDefWaitstates);
1068	WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForDef);
1069	}
1070
1071	if (ST.hasDstSelForwardingHazard() \|\| ST.hasCvtScaleForwardingHazard()) {
1072	const int Shift16DefWaitstates = `1`;
1073
1074	auto IsShift16BitDefFn = [this, VALU](const MachineInstr &ProducerMI) {
1075	const SIRegisterInfo *TRI = ST.getRegisterInfo();
1076	const MachineOperand *ForwardedDst =
1077	getDstSelForwardingOperand(MI: ProducerMI, ST);
1078	if (ForwardedDst) {
1079	return consumesDstSelForwardingOperand(VALU, Dst: ForwardedDst, TRI);
1080	}
1081
1082	if (ProducerMI.isInlineAsm()) {
1083	// Assume inline asm has dst forwarding hazard
1084	for (auto &Def : ProducerMI.all_defs()) {
1085	if (consumesDstSelForwardingOperand(VALU, Dst: &Def, TRI))
1086	return true;
1087	}
1088	}
1089
1090	return false;
1091	};
1092
1093	int WaitStatesNeededForDef =
1094	Shift16DefWaitstates -
1095	getWaitStatesSince(IsHazard: IsShift16BitDefFn, Limit: Shift16DefWaitstates);
1096	WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForDef);
1097	}
1098
1099	if (ST.hasVDecCoExecHazard()) {
1100	const int VALUWriteSGPRVALUReadWaitstates = `2`;
1101	const int VALUWriteEXECRWLane = `4`;
1102	const int VALUWriteVGPRReadlaneRead = `1`;
1103
1104	const SIRegisterInfo *TRI = ST.getRegisterInfo();
1105	const MachineRegisterInfo &MRI = MF.getRegInfo();
1106	Register UseReg;
1107	auto IsVALUDefSGPRFn = [&UseReg, TRI](const MachineInstr &MI) {
1108	if (!SIInstrInfo::isVALU(MI))
1109	return false;
1110	return MI.modifiesRegister(Reg: UseReg, TRI);
1111	};
1112
1113	for (const MachineOperand &Use : VALU->explicit_uses()) {
1114	if (!Use.isReg())
1115	continue;
1116
1117	UseReg = Use.getReg();
1118	if (TRI->isSGPRReg(MRI, Reg: UseReg)) {
1119	int WaitStatesNeededForDef =
1120	VALUWriteSGPRVALUReadWaitstates -
1121	getWaitStatesSince(IsHazard: IsVALUDefSGPRFn,
1122	Limit: VALUWriteSGPRVALUReadWaitstates);
1123	WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForDef);
1124	}
1125	}
1126
1127	if (VALU->readsRegister(Reg: AMDGPU::VCC, TRI)) {
1128	UseReg = AMDGPU::VCC;
1129	int WaitStatesNeededForDef =
1130	VALUWriteSGPRVALUReadWaitstates -
1131	getWaitStatesSince(IsHazard: IsVALUDefSGPRFn, Limit: VALUWriteSGPRVALUReadWaitstates);
1132	WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForDef);
1133	}
1134
1135	switch (VALU->getOpcode()) {
1136	case AMDGPU::V_READLANE_B32:
1137	case AMDGPU::V_READFIRSTLANE_B32: {
1138	MachineOperand Src = TII.getNamedOperand(MI&: VALU, OperandName: AMDGPU::OpName::src0);
1139	UseReg = Src->getReg();
1140	int WaitStatesNeededForDef =
1141	VALUWriteVGPRReadlaneRead -
1142	getWaitStatesSince(IsHazard: IsVALUDefSGPRFn, Limit: VALUWriteVGPRReadlaneRead);
1143	WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForDef);
1144	}
1145	[[fallthrough]];
1146	case AMDGPU::V_WRITELANE_B32: {
1147	UseReg = AMDGPU::EXEC;
1148	int WaitStatesNeededForDef =
1149	VALUWriteEXECRWLane -
1150	getWaitStatesSince(IsHazard: IsVALUDefSGPRFn, Limit: VALUWriteEXECRWLane);
1151	WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForDef);
1152	break;
1153	}
1154	default:
1155	break;
1156	}
1157	}
1158
1159	// This checks for the hazard where VMEM instructions that store more than
1160	// 8 bytes can have there store data over written by the next instruction.
1161	if (!ST.has12DWordStoreHazard())
1162	return WaitStatesNeeded;
1163
1164	const MachineRegisterInfo &MRI = MF.getRegInfo();
1165
1166	for (const MachineOperand &Def : VALU->defs()) {
1167	WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: checkVALUHazardsHelper(Def, MRI));
1168	}
1169
1170	return WaitStatesNeeded;
1171	}
1172
1173	int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr IA) const* {
1174	// This checks for hazards associated with inline asm statements.
1175	// Since inline asms can contain just about anything, we use this
1176	// to call/leverage other checkHazard routines. Note that*
1177	// this function doesn't attempt to address all possible inline asm
1178	// hazards (good luck), but is a collection of what has been
1179	// problematic thus far.
1180
1181	// see checkVALUHazards()
1182	if (!ST.has12DWordStoreHazard() && !ST.hasDstSelForwardingHazard() &&
1183	!ST.hasCvtScaleForwardingHazard())
1184	return `0`;
1185
1186	const MachineRegisterInfo &MRI = MF.getRegInfo();
1187	int WaitStatesNeeded = `0`;
1188
1189	for (const MachineOperand &Op :
1190	llvm::drop_begin(RangeOrContainer: IA->operands(), N: InlineAsm::MIOp_FirstOperand)) {
1191	if (Op.isReg() && Op.isDef()) {
1192	if (!TRI.isVectorRegister(MRI, Reg: Op.getReg()))
1193	continue;
1194
1195	if (ST.has12DWordStoreHazard()) {
1196	WaitStatesNeeded =
1197	std::max(a: WaitStatesNeeded, b: checkVALUHazardsHelper(Def: Op, MRI));
1198	}
1199	}
1200	}
1201
1202	if (ST.hasDstSelForwardingHazard()) {
1203	const int Shift16DefWaitstates = `1`;
1204
1205	auto IsShift16BitDefFn = [this, &IA](const MachineInstr &ProducerMI) {
1206	const MachineOperand *Dst = getDstSelForwardingOperand(MI: ProducerMI, ST);
1207	// Assume inline asm reads the dst
1208	if (Dst)
1209	return IA->modifiesRegister(Reg: Dst->getReg(), TRI: &TRI) \|\|
1210	IA->readsRegister(Reg: Dst->getReg(), TRI: &TRI);
1211
1212	if (ProducerMI.isInlineAsm()) {
1213	// If MI is inline asm, assume it has dst forwarding hazard
1214	for (auto &Def : ProducerMI.all_defs()) {
1215	if (IA->modifiesRegister(Reg: Def.getReg(), TRI: &TRI) \|\|
1216	IA->readsRegister(Reg: Def.getReg(), TRI: &TRI)) {
1217	return true;
1218	}
1219	}
1220	}
1221
1222	return false;
1223	};
1224
1225	int WaitStatesNeededForDef =
1226	Shift16DefWaitstates -
1227	getWaitStatesSince(IsHazard: IsShift16BitDefFn, Limit: Shift16DefWaitstates);
1228	WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForDef);
1229	}
1230
1231	return WaitStatesNeeded;
1232	}
1233
1234	int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr RWLane) const* {
1235	const SIInstrInfo *TII = ST.getInstrInfo();
1236	const SIRegisterInfo *TRI = ST.getRegisterInfo();
1237	const MachineRegisterInfo &MRI = MF.getRegInfo();
1238
1239	const MachineOperand *LaneSelectOp =
1240	TII->getNamedOperand(MI&: *RWLane, OperandName: AMDGPU::OpName::src1);
1241
1242	if (!LaneSelectOp->isReg() \|\| !TRI->isSGPRReg(MRI, Reg: LaneSelectOp->getReg()))
1243	return `0`;
1244
1245	Register LaneSelectReg = LaneSelectOp->getReg();
1246	auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isVALU(MI); };
1247
1248	const int RWLaneWaitStates = `4`;
1249	int WaitStatesSince = getWaitStatesSinceDef(Reg: LaneSelectReg, IsHazardDef: IsHazardFn,
1250	Limit: RWLaneWaitStates);
1251	return RWLaneWaitStates - WaitStatesSince;
1252	}
1253
1254	int GCNHazardRecognizer::checkRFEHazards(MachineInstr RFE) const* {
1255	if (!ST.hasRFEHazards())
1256	return `0`;
1257
1258	const SIInstrInfo *TII = ST.getInstrInfo();
1259
1260	const int RFEWaitStates = `1`;
1261
1262	auto IsHazardFn = [TII](const MachineInstr &MI) {
1263	return getHWReg(TII, RegInstr: MI) == AMDGPU::Hwreg::ID_TRAPSTS;
1264	};
1265	int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazard: IsHazardFn, Limit: RFEWaitStates);
1266	return RFEWaitStates - WaitStatesNeeded;
1267	}
1268
1269	int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr MI) const* {
1270	const SIInstrInfo *TII = ST.getInstrInfo();
1271	const int ReadM0WaitStates = `1`;
1272	auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isSALU(MI); };
1273	return ReadM0WaitStates -
1274	getWaitStatesSinceDef(Reg: AMDGPU::M0, IsHazardDef: IsHazardFn, Limit: ReadM0WaitStates);
1275	}
1276
1277	void GCNHazardRecognizer::emitVNops(MachineBasicBlock &MBB,
1278	MachineBasicBlock::iterator InsertPt,
1279	int WaitStatesNeeded, bool IsHoisting) {
1280	const DebugLoc &DL = IsHoisting ? DebugLoc () : InsertPt ->getDebugLoc();
1281	for (int I = `0`; I < WaitStatesNeeded; ++I)
1282	BuildMI(BB&: MBB, I: InsertPt, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_NOP_e32));
1283	}
1284
1285	void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
1286	fixVMEMtoScalarWriteHazards(MI);
1287	fixVcmpxPermlaneHazards(MI);
1288	fixSMEMtoVectorWriteHazards(MI);
1289	fixVcmpxExecWARHazard(MI);
1290	fixLdsBranchVmemWARHazard(MI);
1291	if (ST.hasLdsDirect()) {
1292	fixLdsDirectVALUHazard(MI);
1293	fixLdsDirectVMEMHazard(MI);
1294	}
1295	fixVALUPartialForwardingHazard(MI);
1296	fixVALUTransUseHazard(MI);
1297	fixVALUTransCoexecutionHazards(MI);
1298	fixWMMAHazards(MI); // fall-through if co-execution is enabled.
1299	fixWMMACoexecutionHazards(MI);
1300	fixShift64HighRegBug(MI);
1301	fixVALUMaskWriteHazard(MI);
1302	fixRequiredExportPriority(MI);
1303	if (ST.requiresWaitIdleBeforeGetReg())
1304	fixGetRegWaitIdle(MI);
1305	if (ST.hasDsAtomicAsyncBarrierArriveB64PipeBug())
1306	fixDsAtomicAsyncBarrierArriveB64(MI);
1307	if (ST.hasScratchBaseForwardingHazard())
1308	fixScratchBaseForwardingHazard(MI);
1309	if (ST.setRegModeNeedsVNOPs())
1310	fixSetRegMode(MI);
1311	}
1312
1313	static bool isVCmpXWritesExec(const SIInstrInfo &TII, const SIRegisterInfo &TRI,
1314	const MachineInstr &MI) {
1315	return (TII.isVOPC(MI) \|\|
1316	(MI.isCompare() && (TII.isVOP3(MI) \|\| TII.isSDWA(MI)))) &&
1317	MI.modifiesRegister(Reg: AMDGPU::EXEC, TRI: &TRI);
1318	}
1319
1320	bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
1321	if (!ST.hasVcmpxPermlaneHazard() \|\| !isPermlane(MI: *MI))
1322	return false;
1323
1324	const SIInstrInfo *TII = ST.getInstrInfo();
1325	const SIRegisterInfo *TRI = ST.getRegisterInfo();
1326	auto IsHazardFn = [TII, TRI](const MachineInstr &MI) {
1327	return isVCmpXWritesExec(TII: TII, TRI: TRI, MI);
1328	};
1329
1330	auto IsExpiredFn = [](const MachineInstr &MI, int) {
1331	unsigned Opc = MI.getOpcode();
1332	return SIInstrInfo::isVALU(MI) && Opc != AMDGPU::V_NOP_e32 &&
1333	Opc != AMDGPU::V_NOP_e64 && Opc != AMDGPU::V_NOP_sdwa;
1334	};
1335
1336	if (::getWaitStatesSince(IsHazard: IsHazardFn, MI, IsExpired: IsExpiredFn) ==
1337	std::numeric_limits<int>::max())
1338	return false;
1339
1340	// V_NOP will be discarded by SQ.
1341	// Use V_MOV_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE*
1342	// which is always a VGPR and available.
1343	auto Src0 = TII->getNamedOperand(MI&: MI, OperandName: AMDGPU::OpName::src0);
1344	Register Reg = Src0->getReg();
1345	bool IsUndef = Src0->isUndef();
1346	BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(),
1347	MCID: TII->get(Opcode: AMDGPU::V_MOV_B32_e32))
1348	.addReg(RegNo: Reg, Flags: RegState::Define \| getDeadRegState(B: IsUndef))
1349	.addReg(RegNo: Reg, Flags: IsUndef ? RegState::Undef : RegState::Kill);
1350
1351	return true;
1352	}
1353
1354	bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
1355	if (!ST.hasVMEMtoScalarWriteHazard())
1356	return false;
1357	assert(!ST.hasExtendedWaitCounts());
1358
1359	if (!SIInstrInfo::isSALU(MI: MI) && !SIInstrInfo::isSMRD(MI: MI))
1360	return false;
1361
1362	if (MI->getNumDefs() == `0`)
1363	return false;
1364
1365	const SIRegisterInfo *TRI = ST.getRegisterInfo();
1366
1367	auto IsHazardFn = [TRI, MI](const MachineInstr &I) {
1368	if (!SIInstrInfo::isVMEM(MI: I) && !SIInstrInfo::isDS(MI: I))
1369	return false;
1370
1371	for (const MachineOperand &Def : MI->defs()) {
1372	const MachineOperand *Op =
1373	I.findRegisterUseOperand(Reg: Def.getReg(), TRI, isKill: false);
1374	if (!Op)
1375	continue;
1376	return true;
1377	}
1378	return false;
1379	};
1380
1381	auto IsExpiredFn = [](const MachineInstr &MI, int) {
1382	return SIInstrInfo::isVALU(MI) \|\|
1383	(MI.getOpcode() == AMDGPU::S_WAITCNT &&
1384	!MI.getOperand(i: `0`).getImm()) \|\|
1385	(MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1386	AMDGPU::DepCtr::decodeFieldVmVsrc(Encoded: MI.getOperand(i: `0`).getImm()) == `0`);
1387	};
1388
1389	if (::getWaitStatesSince(IsHazard: IsHazardFn, MI, IsExpired: IsExpiredFn) ==
1390	std::numeric_limits<int>::max())
1391	return false;
1392
1393	const SIInstrInfo *TII = ST.getInstrInfo();
1394	BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(),
1395	MCID: TII->get(Opcode: AMDGPU::S_WAITCNT_DEPCTR))
1396	.addImm(Val: AMDGPU::DepCtr::encodeFieldVmVsrc(VmVsrc: `0`, STI: ST));
1397	return true;
1398	}
1399
1400	bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
1401	if (!ST.hasSMEMtoVectorWriteHazard())
1402	return false;
1403	assert(!ST.hasExtendedWaitCounts());
1404
1405	if (!SIInstrInfo::isVALU(MI: *MI))
1406	return false;
1407
1408	AMDGPU::OpName SDSTName;
1409	switch (MI->getOpcode()) {
1410	case AMDGPU::V_READLANE_B32:
1411	case AMDGPU::V_READFIRSTLANE_B32:
1412	SDSTName = AMDGPU::OpName::vdst;
1413	break;
1414	default:
1415	SDSTName = AMDGPU::OpName::sdst;
1416	break;
1417	}
1418
1419	const SIInstrInfo *TII = ST.getInstrInfo();
1420	const SIRegisterInfo *TRI = ST.getRegisterInfo();
1421	const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(GPU: ST.getCPU());
1422	const MachineOperand SDST = TII->getNamedOperand(MI&: MI, OperandName: SDSTName);
1423	if (!SDST) {
1424	for (const auto &MO : MI->implicit_operands()) {
1425	if (MO.isDef() && TRI->isSGPRClass(RC: TRI->getPhysRegBaseClass(Reg: MO.getReg()))) {
1426	SDST = &MO;
1427	break;
1428	}
1429	}
1430	}
1431
1432	if (!SDST)
1433	return false;
1434
1435	const Register SDSTReg = SDST->getReg();
1436	auto IsHazardFn = [SDSTReg, TRI](const MachineInstr &I) {
1437	return SIInstrInfo::isSMRD(MI: I) && I.readsRegister(Reg: SDSTReg, TRI);
1438	};
1439
1440	auto IsExpiredFn = [TII, IV](const MachineInstr &MI, int) {
1441	if (TII->isSALU(MI)) {
1442	switch (MI.getOpcode()) {
1443	case AMDGPU::S_SETVSKIP:
1444	case AMDGPU::S_VERSION:
1445	case AMDGPU::S_WAITCNT_VSCNT:
1446	case AMDGPU::S_WAITCNT_VMCNT:
1447	case AMDGPU::S_WAITCNT_EXPCNT:
1448	// These instructions cannot not mitigate the hazard.
1449	return false;
1450	case AMDGPU::S_WAITCNT_LGKMCNT:
1451	// Reducing lgkmcnt count to 0 always mitigates the hazard.
1452	return (MI.getOperand(i: `1`).getImm() == `0`) &&
1453	(MI.getOperand(i: `0`).getReg() == AMDGPU::SGPR_NULL);
1454	case AMDGPU::S_WAITCNT: {
1455	const int64_t Imm = MI.getOperand(i: `0`).getImm();
1456	AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(Version: IV, Encoded: Imm);
1457	// DsCnt corresponds to LGKMCnt here.
1458	return Decoded.get(T: AMDGPU::DS_CNT) == `0`;
1459	}
1460	default:
1461	assert((!SIInstrInfo::isWaitcnt(MI.getOpcode()) \|\|
1462	MI.getOpcode() == AMDGPU::S_WAIT_IDLE) &&
1463	"unexpected wait count instruction");
1464	// SOPP instructions cannot mitigate the hazard.
1465	if (TII->isSOPP(MI))
1466	return false;
1467	// At this point the SALU can be assumed to mitigate the hazard
1468	// because either:
1469	// (a) it is independent of the at risk SMEM (breaking chain),
1470	// or
1471	// (b) it is dependent on the SMEM, in which case an appropriate
1472	// s_waitcnt lgkmcnt _must_ exist between it and the at risk
1473	// SMEM instruction.
1474	return true;
1475	}
1476	}
1477	return false;
1478	};
1479
1480	if (::getWaitStatesSince(IsHazard: IsHazardFn, MI, IsExpired: IsExpiredFn) ==
1481	std::numeric_limits<int>::max())
1482	return false;
1483
1484	BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(),
1485	MCID: TII->get(Opcode: AMDGPU::S_MOV_B32), DestReg: AMDGPU::SGPR_NULL)
1486	.addImm(Val: `0`);
1487	return true;
1488	}
1489
1490	bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
1491	if (!ST.hasVcmpxExecWARHazard())
1492	return false;
1493	assert(!ST.hasExtendedWaitCounts());
1494
1495	if (!SIInstrInfo::isVALU(MI: *MI))
1496	return false;
1497
1498	const SIRegisterInfo *TRI = ST.getRegisterInfo();
1499	if (!MI->modifiesRegister(Reg: AMDGPU::EXEC, TRI))
1500	return false;
1501
1502	auto IsHazardFn = [TRI](const MachineInstr &I) {
1503	if (SIInstrInfo::isVALU(MI: I))
1504	return false;
1505	return I.readsRegister(Reg: AMDGPU::EXEC, TRI);
1506	};
1507
1508	const SIInstrInfo *TII = ST.getInstrInfo();
1509	auto IsExpiredFn = [TII, TRI](const MachineInstr &MI, int) {
1510	if (SIInstrInfo::isVALU(MI)) {
1511	if (TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::sdst))
1512	return true;
1513	for (auto MO : MI.implicit_operands())
1514	if (MO.isDef() && TRI->isSGPRClass(RC: TRI->getPhysRegBaseClass(Reg: MO.getReg())))
1515	return true;
1516	}
1517	if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1518	AMDGPU::DepCtr::decodeFieldSaSdst(Encoded: MI.getOperand(i: `0`).getImm()) == `0`)
1519	return true;
1520	return false;
1521	};
1522
1523	if (::getWaitStatesSince(IsHazard: IsHazardFn, MI, IsExpired: IsExpiredFn) ==
1524	std::numeric_limits<int>::max())
1525	return false;
1526
1527	BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(),
1528	MCID: TII->get(Opcode: AMDGPU::S_WAITCNT_DEPCTR))
1529	.addImm(Val: AMDGPU::DepCtr::encodeFieldSaSdst(SaSdst: `0`, STI: ST));
1530	return true;
1531	}
1532
1533	static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
1534	const GCNSubtarget &ST) {
1535	if (!ST.hasLdsBranchVmemWARHazard())
1536	return false;
1537
1538	// Check if the necessary condition for the hazard is met: both LDS and VMEM
1539	// instructions need to appear in the same function.
1540	bool HasLds = false;
1541	bool HasVmem = false;
1542	for (auto &MBB : MF) {
1543	for (auto &MI : MBB) {
1544	HasLds \|= SIInstrInfo::isDS(MI) \|\| SIInstrInfo::isLDSDMA(MI);
1545	HasVmem \|= SIInstrInfo::isVMEM(MI);
1546	if (HasLds && HasVmem)
1547	return true;
1548	}
1549	}
1550	return false;
1551	}
1552
1553	static bool isStoreCountWaitZero(const MachineInstr &I) {
1554	return I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
1555	I.getOperand(i: `0`).getReg() == AMDGPU::SGPR_NULL &&
1556	!I.getOperand(i: `1`).getImm();
1557	}
1558
1559	bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
1560	if (!RunLdsBranchVmemWARHazardFixup)
1561	return false;
1562
1563	assert(ST.hasLdsBranchVmemWARHazard());
1564	assert(!ST.hasExtendedWaitCounts());
1565
1566	auto IsHazardInst = [](const MachineInstr &MI) {
1567	if (SIInstrInfo::isDS(MI) \|\| SIInstrInfo::isLDSDMA(MI))
1568	return `1`;
1569	if (SIInstrInfo::isVMEM(MI))
1570	return `2`;
1571	return `0`;
1572	};
1573
1574	auto InstType = IsHazardInst (*MI);
1575	if (!InstType)
1576	return false;
1577
1578	auto IsExpiredFn = [&IsHazardInst](const MachineInstr &I, int) {
1579	return IsHazardInst (I) \|\| isStoreCountWaitZero(I);
1580	};
1581
1582	auto IsHazardFn = [InstType, &IsHazardInst](const MachineInstr &I) {
1583	if (!I.isBranch())
1584	return false;
1585
1586	auto IsHazardFn = [InstType, IsHazardInst](const MachineInstr &I) {
1587	auto InstType2 = IsHazardInst (I);
1588	return InstType2 && InstType != InstType2;
1589	};
1590
1591	auto IsExpiredFn = [InstType, &IsHazardInst](const MachineInstr &I, int) {
1592	auto InstType2 = IsHazardInst (I);
1593	if (InstType == InstType2)
1594	return true;
1595
1596	return isStoreCountWaitZero(I);
1597	};
1598
1599	return ::getWaitStatesSince(IsHazard: IsHazardFn, MI: &I, IsExpired: IsExpiredFn) !=
1600	std::numeric_limits<int>::max();
1601	};
1602
1603	if (::getWaitStatesSince(IsHazard: IsHazardFn, MI, IsExpired: IsExpiredFn) ==
1604	std::numeric_limits<int>::max())
1605	return false;
1606
1607	const SIInstrInfo *TII = ST.getInstrInfo();
1608	BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(),
1609	MCID: TII->get(Opcode: AMDGPU::S_WAITCNT_VSCNT))
1610	.addReg(RegNo: AMDGPU::SGPR_NULL, Flags: RegState::Undef)
1611	.addImm(Val: `0`);
1612
1613	return true;
1614	}
1615
1616	bool GCNHazardRecognizer::fixLdsDirectVALUHazard(MachineInstr *MI) {
1617	if (!SIInstrInfo::isLDSDIR(MI: *MI))
1618	return false;
1619
1620	const int NoHazardWaitStates = `15`;
1621	const MachineOperand VDST = TII.getNamedOperand(MI&: MI, OperandName: AMDGPU::OpName::vdst);
1622	const Register VDSTReg = VDST->getReg();
1623
1624	bool VisitedTrans = false;
1625	auto IsHazardFn = [this, VDSTReg, &VisitedTrans](const MachineInstr &I) {
1626	if (!SIInstrInfo::isVALU(MI: I))
1627	return false;
1628	VisitedTrans = VisitedTrans \|\| SIInstrInfo::isTRANS(MI: I);
1629	// Cover both WAR and WAW
1630	return I.readsRegister(Reg: VDSTReg, TRI: &TRI) \|\| I.modifiesRegister(Reg: VDSTReg, TRI: &TRI);
1631	};
1632	auto IsExpiredFn = [&](const MachineInstr &I, int WaitStates) {
1633	if (WaitStates >= NoHazardWaitStates)
1634	return true;
1635	// Instructions which cause va_vdst==0 expire hazard
1636	return SIInstrInfo::isVMEM(MI: I) \|\| SIInstrInfo::isDS(MI: I) \|\|
1637	SIInstrInfo::isEXP(MI: I);
1638	};
1639	auto GetWaitStatesFn = [](const MachineInstr &MI) {
1640	return SIInstrInfo::isVALU(MI) ? `1` : `0`;
1641	};
1642
1643	DenseSet<const MachineBasicBlock *> Visited;
1644	auto Count = ::getWaitStatesSince(IsHazard: IsHazardFn, MBB: MI->getParent(),
1645	I: std::next(x: MI->getReverseIterator()), WaitStates: `0`,
1646	IsExpired: IsExpiredFn, Visited, GetNumWaitStates: GetWaitStatesFn);
1647
1648	// Transcendentals can execute in parallel to other VALUs.
1649	// This makes va_vdst count unusable with a mixture of VALU and TRANS.
1650	if (VisitedTrans)
1651	Count = `0`;
1652
1653	MachineOperand *WaitVdstOp =
1654	TII.getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::waitvdst);
1655	WaitVdstOp->setImm(std::min(a: Count, b: NoHazardWaitStates));
1656
1657	return true;
1658	}
1659
1660	bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(MachineInstr *MI) {
1661	if (!SIInstrInfo::isLDSDIR(MI: *MI))
1662	return false;
1663
1664	const MachineOperand VDST = TII.getNamedOperand(MI&: MI, OperandName: AMDGPU::OpName::vdst);
1665	const Register VDSTReg = VDST->getReg();
1666
1667	auto IsHazardFn = [this, VDSTReg](const MachineInstr &I) {
1668	if (!SIInstrInfo::isVMEM(MI: I) && !SIInstrInfo::isDS(MI: I))
1669	return false;
1670	return I.readsRegister(Reg: VDSTReg, TRI: &TRI) \|\| I.modifiesRegister(Reg: VDSTReg, TRI: &TRI);
1671	};
1672	bool LdsdirCanWait = ST.hasLdsWaitVMSRC();
1673	// TODO: On GFX12 the hazard should expire on S_WAIT_LOADCNT/SAMPLECNT/BVHCNT
1674	// according to the type of VMEM instruction.
1675	auto IsExpiredFn = [this, LdsdirCanWait](const MachineInstr &I, int) {
1676	return SIInstrInfo::isVALU(MI: I) \|\| SIInstrInfo::isEXP(MI: I) \|\|
1677	(I.getOpcode() == AMDGPU::S_WAITCNT && !I.getOperand(i: `0`).getImm()) \|\|
1678	(I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1679	AMDGPU::DepCtr::decodeFieldVmVsrc(Encoded: I.getOperand(i: `0`).getImm()) == `0`) \|\|
1680	(LdsdirCanWait && SIInstrInfo::isLDSDIR(MI: I) &&
1681	!TII.getNamedOperand(MI: I, OperandName: AMDGPU::OpName::waitvsrc)->getImm());
1682	};
1683
1684	if (::getWaitStatesSince(IsHazard: IsHazardFn, MI, IsExpired: IsExpiredFn) ==
1685	std::numeric_limits<int>::max())
1686	return false;
1687
1688	if (LdsdirCanWait) {
1689	TII.getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::waitvsrc)->setImm(`0`);
1690	} else {
1691	BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(),
1692	MCID: TII.get(Opcode: AMDGPU::S_WAITCNT_DEPCTR))
1693	.addImm(Val: AMDGPU::DepCtr::encodeFieldVmVsrc(VmVsrc: `0`, STI: ST));
1694	}
1695
1696	return true;
1697	}
1698
1699	bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(MachineInstr *MI) {
1700	if (!ST.hasVALUPartialForwardingHazard())
1701	return false;
1702	assert(!ST.hasExtendedWaitCounts());
1703
1704	if (!ST.isWave64() \|\| !SIInstrInfo::isVALU(MI: *MI))
1705	return false;
1706
1707	SmallSetVector<Register, `4`> SrcVGPRs;
1708
1709	for (const MachineOperand &Use : MI->explicit_uses()) {
1710	if (Use.isReg() && TRI.isVGPR(MRI: MF.getRegInfo(), Reg: Use.getReg()))
1711	SrcVGPRs.insert(X: Use.getReg());
1712	}
1713
1714	// Only applies with >= 2 unique VGPR sources
1715	if (SrcVGPRs.size() <= `1`)
1716	return false;
1717
1718	// Look for the following pattern:
1719	// Va <- VALU [PreExecPos]
1720	// intv1
1721	// Exec <- SALU [ExecPos]
1722	// intv2
1723	// Vb <- VALU [PostExecPos]
1724	// intv3
1725	// MI Va, Vb (WaitState = 0)
1726	//
1727	// Where:
1728	// intv1 + intv2 <= 2 VALUs
1729	// intv3 <= 4 VALUs
1730	//
1731	// If found, insert an appropriate S_WAITCNT_DEPCTR before MI.
1732
1733	const int Intv1plus2MaxVALUs = `2`;
1734	const int Intv3MaxVALUs = `4`;
1735	const int IntvMaxVALUs = `6`;
1736	const int NoHazardVALUWaitStates = IntvMaxVALUs + `2`;
1737
1738	struct StateType {
1739	SmallDenseMap<Register, int, `4`> DefPos;
1740	int ExecPos = std::numeric_limits<int>::max();
1741	int VALUs = `0`;
1742
1743	static unsigned getHashValue(const StateType &State) {
1744	return hash_combine(args: State.ExecPos, args: State.VALUs,
1745	args: hash_combine_range(R: State.DefPos));
1746	}
1747	static bool isEqual(const StateType &LHS, const StateType &RHS) {
1748	return LHS.DefPos == RHS.DefPos && LHS.ExecPos == RHS.ExecPos &&
1749	LHS.VALUs == RHS.VALUs;
1750	}
1751	};
1752
1753	StateType State;
1754
1755	// This overloads expiry testing with all the hazard detection
1756	auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
1757	// Too many VALU states have passed
1758	if (State.VALUs > NoHazardVALUWaitStates)
1759	return HazardExpired;
1760
1761	// Instructions which cause va_vdst==0 expire hazard
1762	if (SIInstrInfo::isVMEM(MI: I) \|\| SIInstrInfo::isDS(MI: I) \|\|
1763	SIInstrInfo::isEXP(MI: I) \|\|
1764	(I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1765	AMDGPU::DepCtr::decodeFieldVaVdst(Encoded: I.getOperand(i: `0`).getImm()) == `0`))
1766	return HazardExpired;
1767
1768	// Track registers writes
1769	bool Changed = false;
1770	if (SIInstrInfo::isVALU(MI: I)) {
1771	for (Register Src : SrcVGPRs) {
1772	if (!State.DefPos.count(Val: Src) && I.modifiesRegister(Reg: Src, TRI: &TRI)) {
1773	State.DefPos [Src] = State.VALUs;
1774	Changed = true;
1775	}
1776	}
1777	} else if (SIInstrInfo::isSALU(MI: I)) {
1778	if (State.ExecPos == std::numeric_limits<int>::max()) {
1779	if (!State.DefPos.empty() && I.modifiesRegister(Reg: AMDGPU::EXEC, TRI: &TRI)) {
1780	State.ExecPos = State.VALUs;
1781	Changed = true;
1782	}
1783	}
1784	}
1785
1786	// Early expiration: too many VALUs in intv3
1787	if (State.VALUs > Intv3MaxVALUs && State.DefPos.empty())
1788	return HazardExpired;
1789
1790	// Only evaluate state if something changed
1791	if (!Changed)
1792	return NoHazardFound;
1793
1794	// Determine positions of VALUs pre/post exec change
1795	if (State.ExecPos == std::numeric_limits<int>::max())
1796	return NoHazardFound;
1797
1798	int PreExecPos = std::numeric_limits<int>::max();
1799	int PostExecPos = std::numeric_limits<int>::max();
1800
1801	for (auto Entry : State.DefPos) {
1802	int DefVALUs = Entry.second;
1803	if (DefVALUs != std::numeric_limits<int>::max()) {
1804	if (DefVALUs >= State.ExecPos)
1805	PreExecPos = std::min(a: PreExecPos, b: DefVALUs);
1806	else
1807	PostExecPos = std::min(a: PostExecPos, b: DefVALUs);
1808	}
1809	}
1810
1811	// Need a VALUs post exec change
1812	if (PostExecPos == std::numeric_limits<int>::max())
1813	return NoHazardFound;
1814
1815	// Too many VALUs in intv3?
1816	int Intv3VALUs = PostExecPos;
1817	if (Intv3VALUs > Intv3MaxVALUs)
1818	return HazardExpired;
1819
1820	// Too many VALUs in intv2?
1821	int Intv2VALUs = (State.ExecPos - PostExecPos) - `1`;
1822	if (Intv2VALUs > Intv1plus2MaxVALUs)
1823	return HazardExpired;
1824
1825	// Need a VALUs pre exec change
1826	if (PreExecPos == std::numeric_limits<int>::max())
1827	return NoHazardFound;
1828
1829	// Too many VALUs in intv1?
1830	int Intv1VALUs = PreExecPos - State.ExecPos;
1831	if (Intv1VALUs > Intv1plus2MaxVALUs)
1832	return HazardExpired;
1833
1834	// Too many VALUs in intv1 + intv2
1835	if (Intv1VALUs + Intv2VALUs > Intv1plus2MaxVALUs)
1836	return HazardExpired;
1837
1838	return HazardFound;
1839	};
1840	auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
1841	if (SIInstrInfo::isVALU(MI))
1842	State.VALUs += `1`;
1843	};
1844
1845	if (!hasHazard<StateType>(InitialState: State, IsHazard: IsHazardFn, UpdateState: UpdateStateFn, InitialMBB: MI->getParent(),
1846	InitialI: std::next(x: MI->getReverseIterator())))
1847	return false;
1848
1849	BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(),
1850	MCID: TII.get(Opcode: AMDGPU::S_WAITCNT_DEPCTR))
1851	.addImm(Val: AMDGPU::DepCtr::encodeFieldVaVdst(VaVdst: `0`, STI: ST));
1852
1853	return true;
1854	}
1855
1856	bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) {
1857	if (!ST.hasVALUTransUseHazard())
1858	return false;
1859	assert(!ST.hasExtendedWaitCounts());
1860
1861	if (!SIInstrInfo::isVALU(MI: *MI))
1862	return false;
1863
1864	SmallSet<Register, `4`> SrcVGPRs;
1865
1866	for (const MachineOperand &Use : MI->explicit_uses()) {
1867	if (Use.isReg() && TRI.isVGPR(MRI: MF.getRegInfo(), Reg: Use.getReg()))
1868	SrcVGPRs.insert(V: Use.getReg());
1869	}
1870
1871	// Look for the following pattern:
1872	// Va <- TRANS VALU
1873	// intv
1874	// MI Va (WaitState = 0)
1875	//
1876	// Where:
1877	// intv <= 5 VALUs / 1 TRANS
1878	//
1879	// If found, insert an appropriate S_WAITCNT_DEPCTR before MI.
1880
1881	const int IntvMaxVALUs = `5`;
1882	const int IntvMaxTRANS = `1`;
1883
1884	struct StateType {
1885	int VALUs = `0`;
1886	int TRANS = `0`;
1887
1888	static unsigned getHashValue(const StateType &State) {
1889	return hash_combine(args: State.VALUs, args: State.TRANS);
1890	}
1891	static bool isEqual(const StateType &LHS, const StateType &RHS) {
1892	return LHS.VALUs == RHS.VALUs && LHS.TRANS == RHS.TRANS;
1893	}
1894	};
1895
1896	StateType State;
1897
1898	// This overloads expiry testing with all the hazard detection
1899	auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
1900	// Too many VALU states have passed
1901	if (State.VALUs > IntvMaxVALUs \|\| State.TRANS > IntvMaxTRANS)
1902	return HazardExpired;
1903
1904	// Instructions which cause va_vdst==0 expire hazard
1905	if (SIInstrInfo::isVMEM(MI: I) \|\| SIInstrInfo::isDS(MI: I) \|\|
1906	SIInstrInfo::isEXP(MI: I) \|\|
1907	(I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1908	AMDGPU::DepCtr::decodeFieldVaVdst(Encoded: I.getOperand(i: `0`).getImm()) == `0`))
1909	return HazardExpired;
1910
1911	// Track registers writes
1912	if (SIInstrInfo::isTRANS(MI: I)) {
1913	for (Register Src : SrcVGPRs) {
1914	if (I.modifiesRegister(Reg: Src, TRI: &TRI)) {
1915	return HazardFound;
1916	}
1917	}
1918	}
1919
1920	return NoHazardFound;
1921	};
1922	auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
1923	if (SIInstrInfo::isVALU(MI))
1924	State.VALUs += `1`;
1925	if (SIInstrInfo::isTRANS(MI))
1926	State.TRANS += `1`;
1927	};
1928
1929	if (!hasHazard<StateType>(InitialState: State, IsHazard: IsHazardFn, UpdateState: UpdateStateFn, InitialMBB: MI->getParent(),
1930	InitialI: std::next(x: MI->getReverseIterator())))
1931	return false;
1932
1933	// Hazard is observed - insert a wait on va_dst counter to ensure hazard is
1934	// avoided.
1935	BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(),
1936	MCID: TII.get(Opcode: AMDGPU::S_WAITCNT_DEPCTR))
1937	.addImm(Val: AMDGPU::DepCtr::encodeFieldVaVdst(VaVdst: `0`, STI: ST));
1938
1939	return true;
1940	}
1941
1942	bool GCNHazardRecognizer::fixVALUTransCoexecutionHazards(MachineInstr *MI) {
1943	if (!ST.hasGFX1250Insts() \|\| // Coexecution disabled.
1944	!SIInstrInfo::isVALU(MI: MI) \|\| SIInstrInfo::isTRANS(MI: MI))
1945	return false;
1946
1947	const SIInstrInfo *TII = ST.getInstrInfo();
1948	const SIRegisterInfo *TRI = ST.getRegisterInfo();
1949
1950	auto IsTransHazardFn = [MI, TII, TRI](const MachineInstr &I) {
1951	if (!SIInstrInfo::isTRANS(MI: I))
1952	return false;
1953
1954	// RAW: Trans(I) writes, VALU(MI) reads.
1955	Register TransDef = TII->getNamedOperand(MI: I, OperandName: AMDGPU::OpName::vdst)->getReg();
1956	for (const MachineOperand &ValuUse : MI->explicit_uses()) {
1957	if (ValuUse.isReg() && TRI->regsOverlap(RegA: TransDef, RegB: ValuUse.getReg()))
1958	return true;
1959	}
1960
1961	auto ValuDst = TII->getNamedOperand(MI&: MI, OperandName: AMDGPU::OpName::vdst);
1962	if (!ValuDst \|\| !ValuDst->isReg())
1963	return false;
1964
1965	// WAR: Trans(I) reads, VALU(MI) writes.
1966	Register ValuDef = ValuDst->getReg();
1967	for (const MachineOperand &TransUse : I.explicit_uses()) {
1968	if (TransUse.isReg() && TRI->regsOverlap(RegA: ValuDef, RegB: TransUse.getReg()))
1969	return true;
1970	}
1971
1972	return false;
1973	};
1974
1975	auto IsExpiredFn = [](const MachineInstr &I, int) {
1976	return SIInstrInfo::isVALU(MI: I);
1977	};
1978
1979	const int HasVALU = std::numeric_limits<int>::max();
1980	if (::getWaitStatesSince(IsHazard: IsTransHazardFn, MI, IsExpired: IsExpiredFn) == HasVALU)
1981	return false;
1982
1983	BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(), MCID: TII->get(Opcode: AMDGPU::V_NOP_e32));
1984	return true;
1985	}
1986
1987	bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) {
1988	if (!SIInstrInfo::isWMMA(MI: MI) && !SIInstrInfo::isSWMMAC(MI: MI))
1989	return false;
1990
1991	const SIInstrInfo *TII = ST.getInstrInfo();
1992	const SIRegisterInfo *TRI = ST.getRegisterInfo();
1993
1994	auto IsHazardFn = [MI, TII, TRI, this](const MachineInstr &I) {
1995	if (!SIInstrInfo::isWMMA(MI: I) && !SIInstrInfo::isSWMMAC(MI: I))
1996	return false;
1997
1998	// Src0(matrix A) or Src1(matrix B) of the current wmma instruction overlaps
1999	// with the dest(matrix D) of the previous wmma.
2000	const Register CurSrc0Reg =
2001	TII->getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::src0)->getReg();
2002	const Register CurSrc1Reg =
2003	TII->getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::src1)->getReg();
2004
2005	const Register PrevDstReg =
2006	TII->getNamedOperand(MI: I, OperandName: AMDGPU::OpName::vdst)->getReg();
2007
2008	if (TRI->regsOverlap(RegA: PrevDstReg, RegB: CurSrc0Reg) \|\|
2009	TRI->regsOverlap(RegA: PrevDstReg, RegB: CurSrc1Reg)) {
2010	return true;
2011	}
2012
2013	// GFX12+ allows overlap of matrix C with PrevDstReg (hardware will stall)
2014	// but Index can't overlap with PrevDstReg.
2015	if (AMDGPU::isGFX12Plus(STI: ST)) {
2016	if (SIInstrInfo::isSWMMAC(MI: *MI)) {
2017	const Register CurIndex =
2018	TII->getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::src2)->getReg();
2019	if (TRI->regsOverlap(RegA: PrevDstReg, RegB: CurIndex))
2020	return true;
2021	}
2022	return false;
2023	}
2024
2025	return false;
2026	};
2027
2028	auto IsExpiredFn = [](const MachineInstr &I, int) {
2029	return SIInstrInfo::isVALU(MI: I);
2030	};
2031
2032	if (::getWaitStatesSince(IsHazard: IsHazardFn, MI, IsExpired: IsExpiredFn) ==
2033	std::numeric_limits<int>::max())
2034	return false;
2035
2036	BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(), MCID: TII->get(Opcode: AMDGPU::V_NOP_e32));
2037
2038	return true;
2039	}
2040
2041	static bool isCoexecutableVALUInst(const MachineInstr &MI) {
2042	return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isWMMA(MI) &&
2043	!SIInstrInfo::isSWMMAC(MI) && !SIInstrInfo::isLDSDMA(MI);
2044	}
2045
2046	static bool IsWMMAHazardInstInCategory(const MachineInstr &MI,
2047	const SIInstrInfo TII, unsigned* Latency,
2048	unsigned Category) {
2049	assert(TII->isXDLWMMA(MI) && (Latency == `8` \|\| Latency == `16`) &&
2050	"Handle me if the xdl wmma instruction latency changes");
2051
2052	switch (Category) {
2053	case `0`: // Dense WMMA Instructions:
2054	// WMMA_F16, WMMA_BF16
2055	// WMMA_FP8FP8*
2056	// WMMA_FP8BF8*
2057	// WMMA_BF8FP8*
2058	// WMMA_BF8BF8*
2059	// WMMA_F8F6F4 if SRCA & SRCB != F8*
2060	return Latency == `8` && SIInstrInfo::isWMMA(MI);
2061
2062	case `1`: // Dense WMMA Instructions:
2063	// WMMA_IU8
2064	// WMMA_IU4
2065	// WMMA_F8F6F4 if SRCA OR SRCB == F8*
2066	return Latency == `16` && SIInstrInfo::isWMMA(MI);
2067
2068	case `2`: // Dense SWMMAC Instructions
2069	// SWMMAC_F16, SWMMAC_BF16,
2070	// SWMMAC_FP8FP8*
2071	// SWMMAC_BF8FP8*
2072	// SWMMAC_FP8BF8*
2073	// SWMMAC_BF8BF8*
2074	return Latency == `8` && SIInstrInfo::isSWMMAC(MI);
2075
2076	case `3`: // Sparse WMMA Instructions:
2077	// SWMMAC_IU8
2078	// SWMMAC_IU4
2079	return Latency == `16` && SIInstrInfo::isSWMMAC(MI);
2080	default:
2081	break;
2082	} // end switch.
2083
2084	return false;
2085	}
2086
2087	int GCNHazardRecognizer::checkWMMACoexecutionHazards(MachineInstr MI) const* {
2088	if (!ST.hasGFX1250Insts())
2089	return `0`;
2090
2091	const SIInstrInfo *TII = ST.getInstrInfo();
2092	if (!TII->isXDLWMMA(MI: MI) && !isCoexecutableVALUInst(MI: MI))
2093	return `0`;
2094
2095	// WaitStates here is the number of V_NOPs or unrelated VALU instructions must
2096	// be in between the first WMMA and the second instruction to cover the hazard
2097	// (WMMAWaitStates if the second is also a WMMA, VALUWaitStates if the second
2098	// is a VALU). Refer to SPG 4.6.12.1. "Requirements for WMMA data hazards" for
2099	// numbers, which depends on the category of the first WMMA.
2100	const int WMMAWaitStates[] = {`5`, `9`, `3`, `5`};
2101	const int VALUWaitStates[] = {`4`, `8`, `2`, `4`};
2102	unsigned Category = `0`;
2103
2104	auto IsWMMAHazardFn = [MI, TII, &Category, this](const MachineInstr &I) {
2105	if (!TII->isXDLWMMA(MI: I))
2106	return false;
2107
2108	unsigned Latency = TSchedModel.computeInstrLatency(MI: &I);
2109	if (!IsWMMAHazardInstInCategory(MI: I, TII, Latency, Category))
2110	return false;
2111
2112	return hasWMMAToWMMARegOverlap(WMMA: I, MI: *MI);
2113	};
2114
2115	auto IsVALUHazardFn = [MI, TII, &Category, this](const MachineInstr &I) {
2116	if (!TII->isXDLWMMA(MI: I))
2117	return false;
2118
2119	unsigned Latency = TSchedModel.computeInstrLatency(MI: &I);
2120	if (!IsWMMAHazardInstInCategory(MI: I, TII, Latency, Category))
2121	return false;
2122
2123	return hasWMMAToVALURegOverlap(WMMA: I, MI: *MI);
2124	};
2125
2126	int Limit = `0`;
2127
2128	auto GetWaitStatesFn = [](const MachineInstr &I) {
2129	return SIInstrInfo::isVALU(MI: I) ? `1` : `0`;
2130	};
2131
2132	int WaitStatesNeeded = -`1`;
2133	if (TII->isXDLWMMA(MI: *MI)) {
2134	for (Category = `0`; WaitStatesNeeded < `0` && Category < `4`; Category++) {
2135	Limit = WMMAWaitStates[Category]; // for IsExpiredFn.
2136	// 'getWaitStatesSince' returns the number of VALUs in between if hazard
2137	// exists, and INT_MAX if there is no hazard. As a result, a negative
2138	// WaitStatesNeeded here means no hazard, and we will continue to search
2139	// for other categories.
2140	WaitStatesNeeded =
2141	Limit - getWaitStatesSince(IsHazard: IsWMMAHazardFn, Limit, GetNumWaitStates: GetWaitStatesFn);
2142	}
2143	} else { // Must be a co-executable VALU.
2144	for (Category = `0`; WaitStatesNeeded < `0` && Category < `4`; Category++) {
2145	Limit = VALUWaitStates[Category]; // for IsExpiredFn.
2146	// 'getWaitStatesSince' returns the number of VALUs in between if hazard
2147	// exists, and INT_MAX if there is no hazard. As a result, a negative
2148	// WaitStatesNeeded here means no hazard, and we will continue to search
2149	// for other categories.
2150	WaitStatesNeeded =
2151	Limit - getWaitStatesSince(IsHazard: IsVALUHazardFn, Limit, GetNumWaitStates: GetWaitStatesFn);
2152	}
2153	}
2154
2155	return WaitStatesNeeded;
2156	}
2157
2158	bool GCNHazardRecognizer::hasWMMAToWMMARegOverlap(
2159	const MachineInstr &WMMA, const MachineInstr &MI) const {
2160	Register D0 = TII.getNamedOperand(MI: WMMA, OperandName: AMDGPU::OpName::vdst)->getReg();
2161	Register A1 = TII.getNamedOperand(MI, OperandName: AMDGPU::OpName::src0)->getReg();
2162	Register B1 = TII.getNamedOperand(MI, OperandName: AMDGPU::OpName::src1)->getReg();
2163
2164	// WMMA0 writes (D0), WMMA1 reads (A1/B1/Idx1).
2165	if (TRI.regsOverlap(RegA: D0, RegB: A1) \|\| TRI.regsOverlap(RegA: D0, RegB: B1))
2166	return true;
2167
2168	if (SIInstrInfo::isSWMMAC(MI)) {
2169	Register Idx1 = TII.getNamedOperand(MI, OperandName: AMDGPU::OpName::src2)->getReg();
2170	if (TRI.regsOverlap(RegA: D0, RegB: Idx1))
2171	return true;
2172	}
2173	return false;
2174	}
2175
2176	bool GCNHazardRecognizer::hasWMMAToVALURegOverlap(
2177	const MachineInstr &WMMA, const MachineInstr &MI) const {
2178	// WMMA writes, VALU reads.
2179	Register D0 = TII.getNamedOperand(MI: WMMA, OperandName: AMDGPU::OpName::vdst)->getReg();
2180	for (const MachineOperand &ValuUse : MI.explicit_uses()) {
2181	if (ValuUse.isReg() && TRI.regsOverlap(RegA: D0, RegB: ValuUse.getReg()))
2182	return true;
2183	}
2184
2185	// WMMA reads or writes, VALU writes.
2186	Register A0 = TII.getNamedOperand(MI: WMMA, OperandName: AMDGPU::OpName::src0)->getReg();
2187	Register B0 = TII.getNamedOperand(MI: WMMA, OperandName: AMDGPU::OpName::src1)->getReg();
2188	SmallVector<Register, `4`> WMMARegs({D0, A0, B0});
2189
2190	if (SIInstrInfo::isSWMMAC(MI: WMMA)) {
2191	Register Idx0 = TII.getNamedOperand(MI: WMMA, OperandName: AMDGPU::OpName::src2)->getReg();
2192	WMMARegs.push_back(Elt: Idx0);
2193	}
2194
2195	for (const MachineOperand &ValuDef : MI.defs()) {
2196	Register VDstReg = ValuDef.getReg();
2197	for (Register WMMAReg : WMMARegs) {
2198	if (TRI.regsOverlap(RegA: VDstReg, RegB: WMMAReg))
2199	return true;
2200	}
2201	}
2202	return false;
2203	}
2204
2205	bool GCNHazardRecognizer::isCoexecutionHazardFor(const MachineInstr &I,
2206	const MachineInstr &MI) const {
2207	// I is the potential WMMA hazard source, MI is the instruction being checked
2208	// for hazard.
2209	if (!TII.isXDLWMMA(MI: I))
2210	return false;
2211
2212	// Dispatch based on MI type
2213	if (TII.isXDLWMMA(MI))
2214	return hasWMMAToWMMARegOverlap(WMMA: I, MI);
2215	if (isCoexecutableVALUInst(MI))
2216	return hasWMMAToVALURegOverlap(WMMA: I, MI);
2217
2218	return false;
2219	}
2220
2221	bool GCNHazardRecognizer::hasWMMAHazardInLoop(MachineLoop L, MachineInstr MI,
2222	bool IncludeSubloops) {
2223	// Scan loop for any WMMA that hazards MI.
2224	// TODO: Avoid full loop scan when WMMA is beyond VALU distance.
2225	for (MachineBasicBlock *MBB : L->getBlocks()) {
2226	if (!IncludeSubloops && MLI->getLoopFor(BB: MBB) != L)
2227	continue;
2228	for (MachineInstr &I : *MBB) {
2229	if (&I == MI)
2230	continue;
2231	if (isCoexecutionHazardFor(I, MI: *MI))
2232	return true;
2233	}
2234	}
2235	return false;
2236	}
2237
2238	bool GCNHazardRecognizer::tryHoistWMMAVnopsFromLoop(MachineInstr *MI,
2239	int WaitStatesNeeded) {
2240	if (!MLI)
2241	return false;
2242
2243	MachineLoop *L = MLI->getLoopFor(BB: MI->getParent());
2244	if (!L) {
2245	++NumWMMAHoistingBailed;
2246	return false;
2247	}
2248
2249	// If innermost loop has WMMA hazard, we can't hoist at all
2250	if (hasWMMAHazardInLoop(L, MI)) {
2251	++NumWMMAHoistingBailed;
2252	return false;
2253	}
2254
2255	// Find outermost loop with no internal hazard
2256	MachineLoop *TargetLoop = L;
2257	while (MachineLoop *Parent = TargetLoop->getParentLoop()) {
2258	if (hasWMMAHazardInLoop(L: Parent, MI, IncludeSubloops: false))
2259	break; // Parent has hazard in its own blocks, stop here
2260	TargetLoop = Parent; // Safe to hoist further out
2261	}
2262
2263	// Need valid preheader to insert V_NOPs
2264	MachineBasicBlock *Preheader = TargetLoop->getLoopPreheader();
2265	if (!Preheader) {
2266	++NumWMMAHoistingBailed;
2267	return false;
2268	}
2269
2270	LLVM_DEBUG(dbgs() << "WMMA V_NOP Hoisting: Moving " << WaitStatesNeeded
2271	<< " V_NOPs from loop to " << printMBBReference(*Preheader)
2272	<< "\n");
2273
2274	emitVNops(MBB&: *Preheader, InsertPt: Preheader->getFirstTerminator(), WaitStatesNeeded,
2275	/IsHoisting=/true);
2276	NumWMMANopsHoisted += WaitStatesNeeded;
2277	return true;
2278	}
2279
2280	bool GCNHazardRecognizer::fixWMMACoexecutionHazards(MachineInstr *MI) {
2281	int WaitStatesNeeded = checkWMMACoexecutionHazards(MI);
2282	if (WaitStatesNeeded <= `0`)
2283	return false;
2284
2285	if (EnableWMMAVnopHoisting && tryHoistWMMAVnopsFromLoop(MI, WaitStatesNeeded))
2286	return true;
2287
2288	emitVNops(MBB&: *MI->getParent(), InsertPt: MI->getIterator(), WaitStatesNeeded);
2289	return true;
2290	}
2291
2292	bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) {
2293	if (!ST.hasShift64HighRegBug())
2294	return false;
2295	assert(!ST.hasExtendedWaitCounts());
2296
2297	switch (MI->getOpcode()) {
2298	default:
2299	return false;
2300	case AMDGPU::V_LSHLREV_B64_e64:
2301	case AMDGPU::V_LSHRREV_B64_e64:
2302	case AMDGPU::V_ASHRREV_I64_e64:
2303	break;
2304	}
2305
2306	MachineOperand Amt = TII.getNamedOperand(MI&: MI, OperandName: AMDGPU::OpName::src0);
2307	if (!Amt->isReg())
2308	return false;
2309
2310	Register AmtReg = Amt->getReg();
2311	const MachineRegisterInfo &MRI = MF.getRegInfo();
2312	// Check if this is a last VGPR in the allocation block.
2313	if (!TRI.isVGPR(MRI, Reg: AmtReg) \|\| ((AmtReg - AMDGPU::VGPR0) & `7`) != `7`)
2314	return false;
2315
2316	if (AmtReg != AMDGPU::VGPR255 && MRI.isPhysRegUsed(PhysReg: AmtReg + `1`))
2317	return false;
2318
2319	assert(ST.needsAlignedVGPRs());
2320	static_assert(AMDGPU::VGPR0 + `1` == AMDGPU::VGPR1);
2321
2322	const DebugLoc &DL = MI->getDebugLoc();
2323	MachineBasicBlock *MBB = MI->getParent();
2324	MachineOperand Src1 = TII.getNamedOperand(MI&: MI, OperandName: AMDGPU::OpName::src1);
2325
2326	// In:
2327	//
2328	// Dst = shiftrev64 Amt, Src1
2329	//
2330	// if Dst!=Src1 then avoid the bug with:
2331	//
2332	// Dst.sub0 = Amt
2333	// Dst = shift64 Dst.sub0, Src1
2334
2335	Register DstReg = MI->getOperand(i: `0`).getReg();
2336	if (!Src1->isReg() \|\| Src1->getReg() != DstReg) {
2337	Register DstLo = TRI.getSubReg(Reg: DstReg, Idx: AMDGPU::sub0);
2338	runOnInstruction(
2339	MI: BuildMI(BB&: MBB, I: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_MOV_B32_e32), DestReg: DstLo).add(MO: Amt));
2340	Amt->setReg(DstLo);
2341	Amt->setIsKill(true);
2342	return true;
2343	}
2344
2345	bool Overlapped = MI->modifiesRegister(Reg: AmtReg, TRI: &TRI);
2346	Register NewReg;
2347	for (MCRegister Reg : Overlapped ? AMDGPU::VReg_64_Align2RegClass
2348	: AMDGPU::VGPR_32RegClass) {
2349	if (!MI->modifiesRegister(Reg, TRI: &TRI) && !MI->readsRegister(Reg, TRI: &TRI)) {
2350	NewReg = Reg;
2351	break;
2352	}
2353	}
2354
2355	Register NewAmt = Overlapped ? (Register)TRI.getSubReg(Reg: NewReg, Idx: AMDGPU::sub1)
2356	: NewReg;
2357	Register NewAmtLo;
2358
2359	if (Overlapped)
2360	NewAmtLo = TRI.getSubReg(Reg: NewReg, Idx: AMDGPU::sub0);
2361
2362	// Insert a full wait count because found register might be pending a wait.
2363	BuildMI(BB&: *MBB, I: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_WAITCNT))
2364	.addImm(Val: `0`);
2365
2366	// Insert V_SWAP_B32 instruction(s) and run hazard recognizer on them.
2367	if (Overlapped)
2368	runOnInstruction(
2369	MI: BuildMI(BB&: *MBB, I: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_SWAP_B32), DestReg: NewAmtLo)
2370	.addDef(RegNo: AmtReg - `1`)
2371	.addReg(RegNo: AmtReg - `1`, Flags: RegState::Undef)
2372	.addReg(RegNo: NewAmtLo, Flags: RegState::Undef));
2373	runOnInstruction(MI: BuildMI(BB&: *MBB, I: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_SWAP_B32), DestReg: NewAmt)
2374	.addDef(RegNo: AmtReg)
2375	.addReg(RegNo: AmtReg, Flags: RegState::Undef)
2376	.addReg(RegNo: NewAmt, Flags: RegState::Undef));
2377
2378	// Instructions emitted after the current instruction will be processed by the
2379	// parent loop of the hazard recognizer in a natural way.
2380	BuildMI(BB&: *MBB, I: std::next(x: MI->getIterator()), MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_SWAP_B32),
2381	DestReg: AmtReg)
2382	.addDef(RegNo: NewAmt)
2383	.addReg(RegNo: NewAmt)
2384	.addReg(RegNo: AmtReg);
2385	if (Overlapped)
2386	BuildMI(BB&: *MBB, I: std::next(x: MI->getIterator()), MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_SWAP_B32),
2387	DestReg: AmtReg - `1`)
2388	.addDef(RegNo: NewAmtLo)
2389	.addReg(RegNo: NewAmtLo)
2390	.addReg(RegNo: AmtReg - `1`);
2391
2392	// Re-running hazard recognizer on the modified instruction is not necessary,
2393	// inserted V_SWAP_B32 has already both read and write new registers so
2394	// hazards related to these register has already been handled.
2395	Amt->setReg(NewAmt);
2396	Amt->setIsKill(false);
2397	// We do not update liveness, so verifier may see it as undef.
2398	Amt->setIsUndef();
2399	if (Overlapped) {
2400	MI->getOperand(i: `0`).setReg(NewReg);
2401	Src1->setReg(NewReg);
2402	Src1->setIsKill(false);
2403	Src1->setIsUndef();
2404	}
2405
2406	return true;
2407	}
2408
2409	int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr MI) const* {
2410	int NSAtoVMEMWaitStates = `1`;
2411
2412	if (!ST.hasNSAtoVMEMBug())
2413	return `0`;
2414
2415	if (!SIInstrInfo::isMUBUF(MI: MI) && !SIInstrInfo::isMTBUF(MI: MI))
2416	return `0`;
2417
2418	const SIInstrInfo *TII = ST.getInstrInfo();
2419	const auto Offset = TII->getNamedOperand(MI&: MI, OperandName: AMDGPU::OpName::offset);
2420	if (!Offset \|\| (Offset->getImm() & `6`) == `0`)
2421	return `0`;
2422
2423	auto IsHazardFn = [TII](const MachineInstr &I) {
2424	if (!SIInstrInfo::isMIMG(MI: I))
2425	return false;
2426	const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc: I.getOpcode());
2427	return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
2428	TII->getInstSizeInBytes(MI: I) >= `16`;
2429	};
2430
2431	return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazard: IsHazardFn, Limit: `1`);
2432	}
2433
2434	int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(
2435	MachineInstr MI) const* {
2436	int FPAtomicToDenormModeWaitStates = `3`;
2437
2438	if (!ST.hasFPAtomicToDenormModeHazard())
2439	return `0`;
2440	assert(!ST.hasExtendedWaitCounts());
2441
2442	if (MI->getOpcode() != AMDGPU::S_DENORM_MODE)
2443	return `0`;
2444
2445	auto IsHazardFn = [](const MachineInstr &I) {
2446	if (!SIInstrInfo::isVMEM(MI: I))
2447	return false;
2448	return SIInstrInfo::isFPAtomic(MI: I);
2449	};
2450
2451	auto IsExpiredFn = [](const MachineInstr &MI, int WaitStates) {
2452	if (WaitStates >= `3` \|\| SIInstrInfo::isVALU(MI))
2453	return true;
2454
2455	return SIInstrInfo::isWaitcnt(Opcode: MI.getOpcode());
2456	};
2457
2458	return FPAtomicToDenormModeWaitStates -
2459	::getWaitStatesSince(IsHazard: IsHazardFn, MI, IsExpired: IsExpiredFn);
2460	}
2461
2462	int GCNHazardRecognizer::checkMAIHazards(MachineInstr MI) const* {
2463	assert(SIInstrInfo::isMAI(*MI));
2464
2465	return ST.hasGFX90AInsts() ? checkMAIHazards90A(MI) : checkMAIHazards908(MI);
2466	}
2467
2468	int GCNHazardRecognizer::checkMFMAPadding(MachineInstr MI) const* {
2469	// Early exit if no padding is requested.
2470	if (MFMAPaddingRatio == `0`)
2471	return `0`;
2472
2473	const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2474	if (!SIInstrInfo::isMFMA(MI: *MI) \|\| MFI->getOccupancy() < `2`)
2475	return `0`;
2476
2477	int NeighborMFMALatency = `0`;
2478	auto IsNeighboringMFMA = [&NeighborMFMALatency,
2479	this](const MachineInstr &MI) {
2480	if (!SIInstrInfo::isMFMA(MI))
2481	return false;
2482
2483	NeighborMFMALatency = this->getMFMAPipelineWaitStates(MI);
2484	return true;
2485	};
2486
2487	const int MaxMFMAPipelineWaitStates = `16`;
2488	int WaitStatesSinceNeighborMFMA =
2489	getWaitStatesSince(IsHazard: IsNeighboringMFMA, Limit: MaxMFMAPipelineWaitStates);
2490
2491	int NeighborMFMAPaddingNeeded =
2492	(NeighborMFMALatency * MFMAPaddingRatio / `100`) -
2493	WaitStatesSinceNeighborMFMA;
2494
2495	return std::max(a: `0`, b: NeighborMFMAPaddingNeeded);
2496	}
2497
2498	int GCNHazardRecognizer::checkMAIHazards908(MachineInstr MI) const* {
2499	int WaitStatesNeeded = `0`;
2500	unsigned Opc = MI->getOpcode();
2501
2502	auto IsVALUFn = [](const MachineInstr &MI) {
2503	return SIInstrInfo::isVALU(MI) \|\| MI.isInlineAsm();
2504	};
2505
2506	if (Opc != AMDGPU::V_ACCVGPR_READ_B32_e64) { // MFMA or v_accvgpr_write
2507	const int LegacyVALUWritesVGPRWaitStates = `2`;
2508	const int VALUWritesExecWaitStates = `4`;
2509	const int MaxWaitStates = `4`;
2510
2511	int WaitStatesNeededForUse = VALUWritesExecWaitStates -
2512	getWaitStatesSinceDef(Reg: AMDGPU::EXEC, IsHazardDef: IsVALUFn, Limit: MaxWaitStates);
2513	WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
2514
2515	if (WaitStatesNeeded < MaxWaitStates) {
2516	for (const MachineOperand &Use : MI->explicit_uses()) {
2517	const int MaxWaitStates = `2`;
2518
2519	if (!Use.isReg() \|\| !TRI.isVGPR(MRI: MF.getRegInfo(), Reg: Use.getReg()))
2520	continue;
2521
2522	int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
2523	getWaitStatesSinceDef(Reg: Use.getReg(), IsHazardDef: IsVALUFn, Limit: MaxWaitStates);
2524	WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
2525
2526	if (WaitStatesNeeded == MaxWaitStates)
2527	break;
2528	}
2529	}
2530	}
2531
2532	for (const MachineOperand &Op : MI->explicit_operands()) {
2533	if (!Op.isReg() \|\| !TRI.isAGPR(MRI: MF.getRegInfo(), Reg: Op.getReg()))
2534	continue;
2535
2536	if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2537	continue;
2538
2539	const int MFMAWritesAGPROverlappedSrcABWaitStates = `4`;
2540	const int MFMAWritesAGPROverlappedSrcCWaitStates = `2`;
2541	const int MFMA4x4WritesAGPRAccVgprReadWaitStates = `4`;
2542	const int MFMA16x16WritesAGPRAccVgprReadWaitStates = `10`;
2543	const int MFMA32x32WritesAGPRAccVgprReadWaitStates = `18`;
2544	const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = `1`;
2545	const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = `7`;
2546	const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = `15`;
2547	const int MaxWaitStates = `18`;
2548	Register Reg = Op.getReg();
2549	unsigned HazardDefLatency = `0`;
2550
2551	auto IsOverlappedMFMAFn = [Reg, &HazardDefLatency,
2552	this](const MachineInstr &MI) {
2553	if (!SIInstrInfo::isMFMA(MI))
2554	return false;
2555	Register DstReg = MI.getOperand(i: `0`).getReg();
2556	if (DstReg == Reg)
2557	return false;
2558	HazardDefLatency =
2559	std::max(a: HazardDefLatency, b: TSchedModel.computeInstrLatency(MI: &MI));
2560	return TRI.regsOverlap(RegA: DstReg, RegB: Reg);
2561	};
2562
2563	int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsHazardDef: IsOverlappedMFMAFn,
2564	Limit: MaxWaitStates);
2565	int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
2566	int SrcCIdx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::src2);
2567	int OpNo = Op.getOperandNo();
2568	if (OpNo == SrcCIdx) {
2569	NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
2570	} else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) {
2571	switch (HazardDefLatency) {
2572	case `2`: NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
2573	break;
2574	case `8`: NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
2575	break;
2576	case `16`: [[fallthrough]];
2577	default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
2578	break;
2579	}
2580	} else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
2581	switch (HazardDefLatency) {
2582	case `2`: NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
2583	break;
2584	case `8`: NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
2585	break;
2586	case `16`: [[fallthrough]];
2587	default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
2588	break;
2589	}
2590	}
2591
2592	int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2593	WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
2594
2595	if (WaitStatesNeeded == MaxWaitStates)
2596	return WaitStatesNeeded; // Early exit.
2597
2598	auto IsAccVgprWriteFn = [Reg, this](const MachineInstr &MI) {
2599	if (MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2600	return false;
2601	Register DstReg = MI.getOperand(i: `0`).getReg();
2602	return TRI.regsOverlap(RegA: Reg, RegB: DstReg);
2603	};
2604
2605	const int AccVGPRWriteMFMAReadSrcCWaitStates = `1`;
2606	const int AccVGPRWriteMFMAReadSrcABWaitStates = `3`;
2607	const int AccVGPRWriteAccVgprReadWaitStates = `3`;
2608	NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
2609	if (OpNo == SrcCIdx)
2610	NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
2611	else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64)
2612	NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;
2613
2614	WaitStatesNeededForUse = NeedWaitStates -
2615	getWaitStatesSinceDef(Reg, IsHazardDef: IsAccVgprWriteFn, Limit: MaxWaitStates);
2616	WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
2617
2618	if (WaitStatesNeeded == MaxWaitStates)
2619	return WaitStatesNeeded; // Early exit.
2620	}
2621
2622	if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
2623	const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = `0`;
2624	const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = `5`;
2625	const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = `13`;
2626	const int MaxWaitStates = `13`;
2627	Register DstReg = MI->getOperand(i: `0`).getReg();
2628	unsigned HazardDefLatency = `0`;
2629
2630	auto IsSrcCMFMAFn = [DstReg, &HazardDefLatency,
2631	this](const MachineInstr &MI) {
2632	if (!SIInstrInfo::isMFMA(MI))
2633	return false;
2634	Register Reg = TII.getNamedOperand(MI, OperandName: AMDGPU::OpName::src2)->getReg();
2635	HazardDefLatency =
2636	std::max(a: HazardDefLatency, b: TSchedModel.computeInstrLatency(MI: &MI));
2637	return TRI.regsOverlap(RegA: Reg, RegB: DstReg);
2638	};
2639
2640	int WaitStatesSince = getWaitStatesSince(IsHazard: IsSrcCMFMAFn, Limit: MaxWaitStates);
2641	int NeedWaitStates;
2642	switch (HazardDefLatency) {
2643	case `2`: NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
2644	break;
2645	case `8`: NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
2646	break;
2647	case `16`: [[fallthrough]];
2648	default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
2649	break;
2650	}
2651
2652	int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
2653	WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
2654	}
2655
2656	// Pad neighboring MFMA with noops for better inter-wave performance.
2657	WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: checkMFMAPadding(MI));
2658
2659	return WaitStatesNeeded;
2660	}
2661
/// Wait states chosen by checkMAIHazards90A() on gfx940+ for an overlapped
/// src2 operand when both the earlier MFMA and the current MFMA are XDL.
///
/// \param NumPasses pass count of the earlier MFMA (the caller passes the
///        scheduling-model latency of that instruction).
/// \param IsGFX950 true when the subtarget has gfx950 instructions.
/// \returns NumPasses + 1, with one additional wait state on gfx950.
static int
GFX940_XDL_N_PassWritesVGPROverlappedXDLOrSMFMASrcCWaitStates(int NumPasses,
                                                              bool IsGFX950) {
  //  passes | gfx940 | gfx950
  //  -------+--------+-------
  //       2 |      3 |      4
  //       4 |      5 |      6
  //       8 |      9 |     10
  //      16 |     17 |     18
  const int GFX950Extra = IsGFX950 ? 1 : 0;
  return NumPasses + 1 + GFX950Extra;
}
2672
/// Wait states chosen by checkMAIHazards90A() on gfx940+ for an overlapped
/// src2 operand when the earlier MFMA is XDL and the current MFMA is not.
///
/// \param NumPasses pass count of the earlier MFMA (the caller passes the
///        scheduling-model latency of that instruction).
/// \param IsGFX950 true when the subtarget has gfx950 instructions.
/// \returns NumPasses + 1, with one additional wait state on gfx950 unless
///          the producer is a 2-pass instruction.
static int
GFX940_XDL_N_PassWritesVGPROverlappedSGEMMDGEMMSrcCWaitStates(int NumPasses,
                                                              bool IsGFX950) {
  //  passes | gfx940 | gfx950
  //  -------+--------+-------
  //       2 |      3 |      3
  //       4 |      5 |      6
  //       8 |      9 |     10
  //      16 |     17 |     18
  int WaitStates = NumPasses + 1;
  if (IsGFX950 && NumPasses != 2)
    ++WaitStates;
  return WaitStates;
}
2683
/// Wait states chosen by checkMAIHazards90A() on gfx940+ for an overlapped
/// src2 operand when the earlier MFMA is not XDL.
///
/// \param NumPasses pass count of the earlier MFMA.
/// \returns NumPasses unchanged (2 -> 2, 4 -> 4, 8 -> 8, 16 -> 16).
static int
GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(int NumPasses) {
  const int WaitStates = NumPasses;
  return WaitStates;
}
2692
/// Wait states chosen by checkMAIHazards90A() on gfx940+ for an overlapped
/// operand other than src2 when the earlier MFMA is not XDL.
///
/// \param NumPasses pass count of the earlier MFMA.
/// \returns NumPasses + 2 (2 -> 4, 4 -> 6, 8 -> 10, 16 -> 18).
static int
GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses) {
  constexpr int ExtraWaitStates = 2;
  return NumPasses + ExtraWaitStates;
}
2701
/// Wait states chosen by checkMAIHazards90A() on gfx940+ for an overlapped
/// operand other than src2 when the earlier MFMA is XDL.
///
/// \param NumPasses pass count of the earlier MFMA.
/// \param IsGFX950 true when the subtarget has gfx950 instructions.
/// \returns NumPasses + 3, with one additional wait state on gfx950 unless
///          the producer is a 2-pass instruction.
static int GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses,
                                                                bool IsGFX950) {
  //  passes | gfx942 | gfx950
  //  -------+--------+-------
  //       2 |      5 |      5
  //       4 |      7 |      8
  //       8 |     11 |     12
  //      16 |     19 |     20
  const bool NeedsGFX950Extra = IsGFX950 && NumPasses != 2;
  return NumPasses + 3 + (NeedsGFX950Extra ? 1 : 0);
}
2711
// Computes the number of wait states required before the MFMA instruction MI.
// Returns 0 immediately when MI is not an MFMA. Otherwise the result is the
// maximum of:
//   1. the EXEC hazard: a non-MFMA VALU write of EXEC within the last 4 wait
//      states;
//   2. for every explicit register use of MI, a preceding non-MFMA, non-DOT
//      VALU write of that register, or a preceding MFMA whose destination
//      overlaps it (the required distance depends on which operand is read,
//      on the producer opcode / latency and on the subtarget);
//   3. the padding returned by checkMFMAPadding(MI).
2712	int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr MI) const* {
2713	int WaitStatesNeeded = `0`;
2714	unsigned Opc = MI->getOpcode();
2715
// Matches any VALU instruction that is not an MFMA.
2716	auto IsLegacyVALUFn = [](const MachineInstr &MI) {
2717	return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI);
2718	};
2719
// Same as above, additionally excluding DOT instructions.
2720	auto IsLegacyVALUNotDotFn = [](const MachineInstr &MI) {
2721	return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI) &&
2722	!SIInstrInfo::isDOT(MI);
2723	};
2724
2725	if (!SIInstrInfo::isMFMA(MI: *MI))
2726	return WaitStatesNeeded;
2727
// Hazard 1: a legacy VALU write of EXEC followed by this MFMA.
2728	const int VALUWritesExecWaitStates = `4`;
2729	int WaitStatesNeededForUse = VALUWritesExecWaitStates -
2730	getWaitStatesSinceDef(Reg: AMDGPU::EXEC, IsHazardDef: IsLegacyVALUFn,
2731	Limit: VALUWritesExecWaitStates);
2732	WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
2733
// Operand index of src2 for this opcode; used below to tell a SrcC use from
// any other register use.
2734	int SrcCIdx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::src2);
2735
2736	// Loop for both DGEMM and S/HGEMM 2nd instruction.
2737	for (const MachineOperand &Use : MI->explicit_uses()) {
2738	const int LegacyVALUNotDotWritesVGPRWaitStates = `2`;
2739	const int SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates = `2`;
2740	const int SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates = `8`;
2741	const int SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates = `16`;
2742	const int SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates = `3`;
2743	const int SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates = `9`;
2744	const int SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates = `17`;
2745	const int DMFMA16x16WritesVGPROverlappedSrcCWaitStates = `9`;
2746	const int GFX950_DMFMA16x16WritesVGPROverlappedSrcCWaitStates = `17`;
2747	const int DMFMA4x4WritesVGPROverlappedSrcCWaitStates = `4`;
2748	const int SMFMA4x4WritesVGPROverlappedSrcABWaitStates = `5`;
2749	const int SMFMA16x16WritesVGPROverlappedSrcABWaitStates = `11`;
2750	const int SMFMA32x32WritesVGPROverlappedSrcABWaitStates = `19`;
2751	const int DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates = `6`;
2752	const int DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = `11`;
2753	const int GFX950_DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = `19`;
2754	const int DMFMA4x4WritesVGPRFullSrcCWaitStates = `4`;
2755	const int GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates = `2`;
// NOTE(review): the gfx950 XDL helper functions can return 20 for a 16-pass
// producer, which is larger than this limit of 19 — confirm that is intended.
2756	const int MaxWaitStates = `19`;
2757
2758	if (!Use.isReg())
2759	continue;
2760	Register Reg = Use.getReg();
// Side outputs of IsOverlappedMFMAFn. They are written for every MFMA the
// predicate visits and are only read after the search below reports a hit.
2761	bool FullReg;
2762	const MachineInstr *MI1;
2763
// Matches an MFMA whose operand 0 overlaps Reg; FullReg records whether that
// destination is exactly Reg, MI1 records the MFMA itself.
2764	auto IsOverlappedMFMAFn = [Reg, &FullReg, &MI1,
2765	this](const MachineInstr &MI) {
2766	if (!SIInstrInfo::isMFMA(MI))
2767	return false;
2768	Register DstReg = MI.getOperand(i: `0`).getReg();
2769	FullReg = (DstReg == Reg);
2770	MI1 = &MI;
2771	return TRI.regsOverlap(RegA: DstReg, RegB: Reg);
2772	};
2773
// Hazard 2a: non-DOT legacy VALU write of this register.
2774	WaitStatesNeededForUse = LegacyVALUNotDotWritesVGPRWaitStates -
2775	getWaitStatesSinceDef(Reg, IsHazardDef: IsLegacyVALUNotDotFn, Limit: MaxWaitStates);
2776	WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
2777
// Hazard 2b: an earlier MFMA wrote a register overlapping this use.
2778	int NumWaitStates =
2779	getWaitStatesSinceDef(Reg, IsHazardDef: IsOverlappedMFMAFn, Limit: MaxWaitStates);
// presumably INT_MAX means no matching def was found within the limit, so
// FullReg/MI1 must not be consulted — verify against getWaitStatesSinceDef().
2780	if (NumWaitStates == std::numeric_limits<int>::max())
2781	continue;
2782
2783	int OpNo = Use.getOperandNo();
2784	unsigned Opc1 = MI1->getOpcode();
2785	int NeedWaitStates = `0`;
// Case A: the overlapped register is read as src2 (SrcC) of this MFMA.
2786	if (OpNo == SrcCIdx) {
// Without gfx940 instructions, a non-DGEMM consumer after a DGEMM producer
// requires no wait states.
2787	if (!SIInstrInfo::isDGEMM(Opcode: Opc) &&
2788	(!ST.hasGFX940Insts() && SIInstrInfo::isDGEMM(Opcode: Opc1))) {
2789	NeedWaitStates = `0`;
// The producer's destination is exactly the register read as SrcC. Only the
// two combinations below request wait states; otherwise NeedWaitStates stays 0.
2790	} else if (FullReg) {
2791	if ((Opc == AMDGPU::V_MFMA_F64_4X4X4F64_e64 \|\|
2792	Opc == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64) &&
2793	(Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_e64 \|\|
2794	Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64))
2795	NeedWaitStates = DMFMA4x4WritesVGPRFullSrcCWaitStates;
2796	else if (ST.hasGFX940Insts() &&
2797	TSchedModel.computeInstrLatency(MI: MI1) == `2`)
2798	NeedWaitStates = GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates;
// Partial overlap with SrcC: select by producer opcode, falling back to the
// producer's latency from the scheduling model.
2799	} else {
2800	switch (Opc1) {
2801	case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
2802	case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
2803	case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
2804	case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
2805	if (!TII.isXDL(MI: *MI))
2806	NeedWaitStates =
2807	ST.hasGFX950Insts()
2808	? GFX950_DMFMA16x16WritesVGPROverlappedSrcCWaitStates
2809	: DMFMA16x16WritesVGPROverlappedSrcCWaitStates;
2810	break;
2811	case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
2812	case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
2813	if (!TII.isXDL(MI: *MI))
2814	NeedWaitStates = DMFMA4x4WritesVGPROverlappedSrcCWaitStates;
2815	break;
2816	default:
2817	int NumPasses = TSchedModel.computeInstrLatency(MI: MI1);
2818	if (ST.hasGFX940Insts()) {
// XDL consumer after a non-XDL producer: leave NeedWaitStates at 0.
2819	if (TII.isXDL(MI: MI) && !TII.isXDL(MI: MI1))
2820	break;
2821
2822	NeedWaitStates =
2823	TII.isXDL(MI: *MI1)
2824	? (TII.isXDL(MI: *MI)
2825	? GFX940_XDL_N_PassWritesVGPROverlappedXDLOrSMFMASrcCWaitStates(
2826	NumPasses, IsGFX950: ST.hasGFX950Insts())
2827	: GFX940_XDL_N_PassWritesVGPROverlappedSGEMMDGEMMSrcCWaitStates(
2828	NumPasses, IsGFX950: ST.hasGFX950Insts()))
2829	: GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(
2830	NumPasses);
2831	break;
2832	}
2833
// Without gfx940 instructions: fixed table keyed by producer latency and by
// whether the consumer is a DGEMM. Any other latency is treated as a bug.
2834	switch (NumPasses) {
2835	case `2`:
2836	NeedWaitStates =
2837	SIInstrInfo::isDGEMM(Opcode: Opc)
2838	? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates
2839	: SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates;
2840	break;
2841	case `8`:
2842	NeedWaitStates =
2843	SIInstrInfo::isDGEMM(Opcode: Opc)
2844	? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates
2845	: SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates;
2846	break;
2847	case `16`:
2848	NeedWaitStates =
2849	SIInstrInfo::isDGEMM(Opcode: Opc)
2850	? SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates
2851	: SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates;
2852	break;
2853	default:
2854	llvm_unreachable("unexpected number of passes");
2855	}
2856	}
2857	}
// Case B: the overlapped register is read through any operand other than src2
// (the constant names refer to these as SrcA/SrcB).
2858	} else {
2859	switch (Opc1) {
2860	case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
2861	case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
2862	case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
2863	case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
2864	NeedWaitStates =
2865	ST.hasGFX950Insts()
2866	? GFX950_DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates
2867	: DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates;
2868	break;
2869	case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
2870	case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
2871	NeedWaitStates = DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates;
2872	break;
2873	default:
2874	int NumPasses = TSchedModel.computeInstrLatency(MI: MI1);
2875
2876	if (ST.hasGFX940Insts()) {
2877	NeedWaitStates =
2878	TII.isXDL(MI: *MI1)
2879	? GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(
2880	NumPasses, IsGFX950: ST.hasGFX950Insts())
2881	: GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(
2882	NumPasses);
2883	break;
2884	}
2885
// Without gfx940 instructions: a latency of 4 is asserted unreachable, and
// any latency other than 2 or 8 uses the 32x32 value.
2886	switch (NumPasses) {
2887	case `2`:
2888	NeedWaitStates = SMFMA4x4WritesVGPROverlappedSrcABWaitStates;
2889	break;
2890	case `4`:
2891	llvm_unreachable("unexpected number of passes for mfma");
2892	case `8`:
2893	NeedWaitStates = SMFMA16x16WritesVGPROverlappedSrcABWaitStates;
2894	break;
2895	case `16`:
2896	default:
2897	NeedWaitStates = SMFMA32x32WritesVGPROverlappedSrcABWaitStates;
2898	}
2899	}
2900	}
// The requirement accumulated so far already covers this producer.
2901	if (WaitStatesNeeded >= NeedWaitStates)
2902	continue;
2903
2904	WaitStatesNeededForUse = NeedWaitStates - NumWaitStates;
2905	WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
2906
// Stop scanning the remaining operands once the requirement equals
// MaxWaitStates.
2907	if (WaitStatesNeeded == MaxWaitStates)
2908	break;
2909	}
2910
2911	// Pad neighboring MFMA with noops for better inter-wave performance.
2912	WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: checkMFMAPadding(MI));
2913
2914	return WaitStatesNeeded;
2915	}
2916
2917	int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr MI) const* {
2918	// On gfx90a+ relevant hazards are checked in checkMAIVALUHazards()
2919	if (!ST.hasMAIInsts() \|\| ST.hasGFX90AInsts())
2920	return `0`;
2921
2922	int WaitStatesNeeded = `0`;
2923
2924	auto IsAccVgprReadFn = [](const MachineInstr &MI) {
2925	return MI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64;
2926	};
2927
2928	for (const MachineOperand &Op : MI->explicit_uses()) {
2929	if (!Op.isReg() \|\| !TRI.isVGPR(MRI: MF.getRegInfo(), Reg: Op.getReg()))
2930	continue;
2931
2932	Register Reg = Op.getReg();
2933
2934	const int AccVgprReadLdStWaitStates = `2`;
2935	const int VALUWriteAccVgprRdWrLdStDepVALUWaitStates = `1`;
2936	const int MaxWaitStates = `2`;
2937
2938	int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
2939	getWaitStatesSinceDef(Reg, IsHazardDef: IsAccVgprReadFn, Limit: MaxWaitStates);
2940	WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
2941
2942	if (WaitStatesNeeded == MaxWaitStates)
2943	return WaitStatesNeeded; // Early exit.
2944
2945	auto IsVALUAccVgprRdWrCheckFn = [Reg, this](const MachineInstr &MI) {
2946	if (MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64 &&
2947	MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2948	return false;
2949	auto IsVALUFn = [](const MachineInstr &MI) {
2950	return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMAI(MI);
2951	};
2952	return getWaitStatesSinceDef(Reg, IsHazardDef: IsVALUFn, Limit: `2` /MaxWaitStates/) <
2953	std::numeric_limits<int>::max();
2954	};
2955
2956	WaitStatesNeededForUse = VALUWriteAccVgprRdWrLdStDepVALUWaitStates -
2957	getWaitStatesSince(IsHazard: IsVALUAccVgprRdWrCheckFn, Limit: MaxWaitStates);
2958	WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
2959	}
2960
2961	return WaitStatesNeeded;
2962	}
2963
2964	int GCNHazardRecognizer::checkPermlaneHazards(MachineInstr MI) const* {
2965	assert(!ST.hasVcmpxPermlaneHazard() &&
2966	"this is a different vcmpx+permlane hazard");
2967	const SIRegisterInfo *TRI = ST.getRegisterInfo();
2968	const SIInstrInfo *TII = ST.getInstrInfo();
2969
2970	auto IsVCmpXWritesExecFn = [TII, TRI](const MachineInstr &MI) {
2971	return isVCmpXWritesExec(TII: TII, TRI: TRI, MI);
2972	};
2973
2974	auto IsVALUFn = [](const MachineInstr &MI) {
2975	return SIInstrInfo::isVALU(MI);
2976	};
2977
2978	const int VCmpXWritesExecWaitStates = `4`;
2979	const int VALUWritesVDstWaitStates = `2`;
2980	int WaitStatesNeeded = `0`;
2981
2982	for (const MachineOperand &Op : MI->explicit_uses()) {
2983	if (!Op.isReg() \|\| !TRI->isVGPR(MRI: MF.getRegInfo(), Reg: Op.getReg()))
2984	continue;
2985	Register Reg = Op.getReg();
2986
2987	int WaitStatesSinceDef =
2988	VALUWritesVDstWaitStates -
2989	getWaitStatesSinceDef(Reg, IsHazardDef: IsVALUFn,
2990	/MaxWaitStates=/Limit: VALUWritesVDstWaitStates);
2991	WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesSinceDef);
2992	if (WaitStatesNeeded >= VALUWritesVDstWaitStates)
2993	break;
2994	}
2995
2996	int VCmpXHazardWaits =
2997	VCmpXWritesExecWaitStates -
2998	getWaitStatesSince(IsHazard: IsVCmpXWritesExecFn, Limit: VCmpXWritesExecWaitStates);
2999
3000	WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: VCmpXHazardWaits);
3001	return WaitStatesNeeded;
3002	}
3003
/// Wait states chosen by checkMAIVALUHazards() on gfx940+ when the current
/// instruction defines a register previously written by a non-XDL,
/// non-DGEMM MFMA.
///
/// \param NumPasses pass count of the earlier MFMA.
/// \returns NumPasses + 2 (2 -> 4, 4 -> 6, 8 -> 10, 16 -> 18).
static int GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(int NumPasses) {
  constexpr int ExtraWaitStates = 2;
  return NumPasses + ExtraWaitStates;
}
3011
/// Wait states chosen by checkMAIVALUHazards() on gfx940+ when the current
/// instruction defines a register previously written by an XDL, non-DGEMM
/// MFMA.
///
/// \param NumPasses pass count of the earlier MFMA.
/// \param IsGFX950 true when the subtarget has gfx950 instructions.
/// \returns NumPasses + 3, with one additional wait state on gfx950 unless
///          the producer is a 2-pass instruction.
static int GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(int NumPasses,
                                                       bool IsGFX950) {
  //  passes | gfx942 | gfx950
  //  -------+--------+-------
  //       2 |      5 |      5
  //       4 |      7 |      8
  //       8 |     11 |     12
  //      16 |     19 |     20
  int WaitStates = NumPasses + 3;
  if (IsGFX950 && NumPasses != 2)
    ++WaitStates;
  return WaitStates;
}
3021
/// Wait states chosen by checkMAIVALUHazards() on gfx940+ when the current
/// VALU / memory / export instruction reads a register previously written by
/// an XDL, non-DGEMM MFMA.
///
/// \param NumPasses pass count of the earlier MFMA.
/// \param IsGFX950 true when the subtarget has gfx950 instructions.
/// \returns NumPasses + 3, with one additional wait state on gfx950 unless
///          the producer is a 2-pass instruction.
static int GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses,
                                                              bool IsGFX950) {
  //  passes | gfx942 | gfx950
  //  -------+--------+-------
  //       2 |      5 |      5
  //       4 |      7 |      8
  //       8 |     11 |     12
  //      16 |     19 |     20
  const bool NeedsGFX950Extra = IsGFX950 && NumPasses != 2;
  return NumPasses + 3 + (NeedsGFX950Extra ? 1 : 0);
}
3031
/// Wait states chosen by checkMAIVALUHazards() on gfx940+ when the current
/// VALU / memory / export instruction reads a register previously written by
/// a non-XDL, non-DGEMM MFMA.
///
/// \param NumPasses pass count of the earlier MFMA.
/// \returns NumPasses + 2 (2 -> 4, 4 -> 6, 8 -> 10, 16 -> 18).
static int GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses) {
  const int WaitStates = NumPasses + 2;
  return WaitStates;
}
3039
// Computes the wait states required before the non-MFMA instruction MI on
// subtargets with gfx90a instructions. Returns 0 when the subtarget lacks
// gfx90a instructions or when MI is itself an MFMA (that case is handled by
// checkMAIHazards90A()). The result is the maximum over:
//   - reads: for VALU / VMEM / DS / export instructions, explicit register
//     uses that were written by a DOT or an MFMA instruction;
//   - the workaround for a DGEMM sitting between a VALU write and a memory
//     read (only when the subtarget has gfx90a but not gfx940 instructions);
//   - a DGEMM followed by one of the listed V_FMA_F64 / V_FMAC_F64 opcodes;
//   - writes: registers defined by MI that were written by a DOT or MFMA
//     (write-after-write), or that an earlier MFMA reads as src2
//     (write-after-read).
3040	int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr MI) const* {
3041	if (!ST.hasGFX90AInsts())
3042	return `0`;
3043
3044	auto IsDGEMMFn = [](const MachineInstr &MI) -> bool {
3045	return SIInstrInfo::isDGEMM(Opcode: MI.getOpcode());
3046	};
3047
3048	// This is checked in checkMAIHazards90A()
3049	if (SIInstrInfo::isMFMA(MI: *MI))
3050	return `0`;
3051
3052	const MachineRegisterInfo &MRI = MF.getRegInfo();
3053
3054	int WaitStatesNeeded = `0`;
3055
// Classify MI once: memory means VMEM or DS; exports are added separately.
3056	bool IsMem = SIInstrInfo::isVMEM(MI: MI) \|\| SIInstrInfo::isDS(MI: MI);
3057	bool IsMemOrExport = IsMem \|\| SIInstrInfo::isEXP(MI: *MI);
3058	bool IsVALU = SIInstrInfo::isVALU(MI: *MI);
3059
// Reg, MFMA and DOT are shared with the predicates below by reference: Reg is
// assigned before each search, MFMA / DOT are reset to nullptr before a search
// and hold the matching instruction afterwards.
3060	const MachineInstr MFMA = nullptr*;
3061	unsigned Reg;
// Matches an MFMA whose operand 0 overlaps Reg and records it in MFMA.
3062	auto IsMFMAWriteFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
3063	if (!SIInstrInfo::isMFMA(MI) \|\|
3064	!TRI.regsOverlap(RegA: MI.getOperand(i: `0`).getReg(), RegB: Reg))
3065	return false;
3066	MFMA = &MI;
3067	return true;
3068	};
3069
3070	const MachineInstr DOT = nullptr*;
// Matches a DOT instruction whose operand 0 overlaps Reg and records it in DOT.
3071	auto IsDotWriteFn = [&Reg, &DOT, this](const MachineInstr &MI) {
3072	if (!SIInstrInfo::isDOT(MI) \|\|
3073	!TRI.regsOverlap(RegA: MI.getOperand(i: `0`).getReg(), RegB: Reg))
3074	return false;
3075	DOT = &MI;
3076	return true;
3077	};
3078
// Stateful predicate: DGEMMAfterVALUWrite latches once a DGEMM has been
// visited, and a VALU instruction visited after that point is the hazard.
// The flag is reset by the caller before each search.
3079	bool DGEMMAfterVALUWrite = false;
3080	auto IsDGEMMHazard = [&DGEMMAfterVALUWrite, this](const MachineInstr &MI) {
3081	// Found DGEMM on reverse traversal to def.
3082	if (SIInstrInfo::isDGEMM(Opcode: MI.getOpcode()))
3083	DGEMMAfterVALUWrite = true;
3084
3085	// Only hazard if register is defined by a VALU and a DGEMM is found after
3086	// the def.
3087	if (!TII.isVALU(MI) \|\| !DGEMMAfterVALUWrite)
3088	return false;
3089
3090	return true;
3091	};
3092
3093	int SrcCIdx = AMDGPU::getNamedOperandIdx(Opcode: MI->getOpcode(),
3094	Name: AMDGPU::OpName::src2);
3095
// Read hazards: explicit register uses of a VALU / memory / export MI.
3096	if (IsMemOrExport \|\| IsVALU) {
3097	const int SMFMA4x4WriteVgprVALUMemExpReadWaitStates = `5`;
3098	const int SMFMA16x16WriteVgprVALUMemExpReadWaitStates = `11`;
3099	const int SMFMA32x32WriteVgprVALUMemExpReadWaitStates = `19`;
3100	const int DMFMA4x4WriteVgprMemExpReadWaitStates = `9`;
3101	const int DMFMA16x16WriteVgprMemExpReadWaitStates = `18`;
3102	const int DMFMA4x4WriteVgprVALUReadWaitStates = `6`;
3103	const int DMFMA16x16WriteVgprVALUReadWaitStates = `11`;
3104	const int GFX950_DMFMA16x16WriteVgprVALUReadWaitStates = `19`;
3105	const int DotWriteSameDotReadSrcAB = `3`;
3106	const int DotWriteDifferentVALURead = `3`;
3107	const int DMFMABetweenVALUWriteVMEMRead = `2`;
// NOTE(review): the gfx950 XDL helper can return 20 for a 16-pass producer,
// which is larger than this limit of 19 — confirm that is intended.
3108	const int MaxWaitStates = `19`;
3109
3110	for (const MachineOperand &Use : MI->explicit_uses()) {
3111	if (!Use.isReg())
3112	continue;
3113	Reg = Use.getReg();
3114
// DOT producer. When the DOT has the same opcode as MI, only uses at an
// operand index other than src2 need wait states; a different opcode always
// needs them.
3115	DOT = nullptr;
3116	int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsHazardDef: IsDotWriteFn,
3117	Limit: MaxWaitStates);
3118	if (DOT) {
3119	int NeedWaitStates = `0`;
3120	if (DOT->getOpcode() == MI->getOpcode()) {
// Pointer difference from operand 0 yields the operand index of Use.
3121	if (&Use - &MI->getOperand(i: `0`) != SrcCIdx)
3122	NeedWaitStates = DotWriteSameDotReadSrcAB;
3123	} else {
3124	NeedWaitStates = DotWriteDifferentVALURead;
3125	}
3126
3127	int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
3128	WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
3129	}
3130
3131	// Workaround for HW data hazard bug observed only in GFX90A. When there
3132	// is a DGEMM instruction in-between a VALU and a VMEM instruction it
3133	// causes the SQ to incorrectly not insert two wait states between the two
3134	// instructions needed to avoid data hazard.
3135	if (IsMem && ST.hasGFX90AInsts() && !ST.hasGFX940Insts()) {
3136	DGEMMAfterVALUWrite = false;
3137	if (TRI.isVectorRegister(MRI, Reg)) {
3138	int WaitStatesNeededForUse =
3139	DMFMABetweenVALUWriteVMEMRead -
3140	getWaitStatesSinceDef(Reg, IsHazardDef: IsDGEMMHazard,
3141	Limit: DMFMABetweenVALUWriteVMEMRead);
3142
3143	WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
3144	}
3145	}
3146
// MFMA producer: skip this operand when no overlapping MFMA write was found.
3147	MFMA = nullptr;
3148	WaitStatesSinceDef =
3149	getWaitStatesSinceDef(Reg, IsHazardDef: IsMFMAWriteFn, Limit: MaxWaitStates);
3150	if (!MFMA)
3151	continue;
3152
3153	unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MI: MFMA);
3154	int NumPasses = HazardDefLatency;
3155	int NeedWaitStates = MaxWaitStates;
3156
// Required distance by producer kind: DGEMM table, gfx940+ helper functions,
// or the fixed pre-gfx940 SMFMA table. Unexpected latencies are asserted
// unreachable.
3157	if (SIInstrInfo::isDGEMM(Opcode: MFMA->getOpcode())) {
3158	switch (HazardDefLatency) {
3159	case `4`:
3160	NeedWaitStates = IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates
3161	: DMFMA4x4WriteVgprVALUReadWaitStates;
3162	break;
3163	case `8`:
3164	case `16`:
3165	NeedWaitStates =
3166	IsMemOrExport
3167	? DMFMA16x16WriteVgprMemExpReadWaitStates
3168	: (ST.hasGFX950Insts()
3169	? GFX950_DMFMA16x16WriteVgprVALUReadWaitStates
3170	: DMFMA16x16WriteVgprVALUReadWaitStates);
3171	break;
3172	default:
3173	llvm_unreachable("unexpected dgemm");
3174	}
3175	} else if (ST.hasGFX940Insts()) {
3176	NeedWaitStates =
3177	TII.isXDL(MI: *MFMA)
3178	? GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(
3179	NumPasses, IsGFX950: ST.hasGFX950Insts())
3180	: GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(
3181	NumPasses);
3182	} else {
3183	switch (HazardDefLatency) {
3184	case `2`:
3185	NeedWaitStates = SMFMA4x4WriteVgprVALUMemExpReadWaitStates;
3186	break;
3187	case `8`:
3188	NeedWaitStates = SMFMA16x16WriteVgprVALUMemExpReadWaitStates;
3189	break;
3190	case `16`:
3191	NeedWaitStates = SMFMA32x32WriteVgprVALUMemExpReadWaitStates;
3192	break;
3193	default:
3194	llvm_unreachable("unexpected number of passes for mfma");
3195	}
3196	}
3197
3198	int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
3199	WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
3200
3201	if (WaitStatesNeeded == MaxWaitStates)
3202	break;
3203	}
3204	}
3205
// A DGEMM followed by one of these F64 FMA opcodes needs 2 wait states; the
// search is skipped when that much is already required.
3206	unsigned Opc = MI->getOpcode();
3207	const int DMFMAToFMA64WaitStates = `2`;
3208	if ((Opc == AMDGPU::V_FMA_F64_e64 \|\|
3209	Opc == AMDGPU::V_FMAC_F64_e32 \|\| Opc == AMDGPU::V_FMAC_F64_e64 \|\|
3210	Opc == AMDGPU::V_FMAC_F64_dpp) &&
3211	WaitStatesNeeded < DMFMAToFMA64WaitStates) {
3212	int WaitStatesNeededForUse = DMFMAToFMA64WaitStates -
3213	getWaitStatesSince(IsHazard: IsDGEMMFn, Limit: DMFMAToFMA64WaitStates);
3214	WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
3215	}
3216
3217	if (!IsVALU && !IsMemOrExport)
3218	return WaitStatesNeeded;
3219
// Write hazards: every register defined by MI.
3220	for (const MachineOperand &Def : MI->defs()) {
3221	const int SMFMA4x4WriteVgprVALUWawWaitStates = `5`;
3222	const int SMFMA16x16WriteVgprVALUWawWaitStates = `11`;
3223	const int SMFMA32x32WriteVgprVALUWawWaitStates = `19`;
3224	const int SMFMA4x4ReadVgprVALUWarWaitStates = `1`;
3225	const int GFX940_XDL4PassReadVgprVALUWarWaitStates = `3`;
3226	const int SMFMA16x16ReadVgprVALUWarWaitStates = `7`;
3227	const int SMFMA32x32ReadVgprVALUWarWaitStates = `15`;
3228	const int DMFMA4x4WriteVgprVALUWriteWaitStates = `6`;
3229	const int DMFMA16x16WriteVgprVALUWriteWaitStates = `11`;
3230	const int DotWriteDifferentVALUWrite = `3`;
3231	const int MaxWaitStates = `19`;
3232	const int MaxWarWaitStates = `15`;
3233
3234	Reg = Def.getReg();
3235
// Write-after-write against a DOT with a different opcode than MI.
3236	DOT = nullptr;
3237	int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsHazardDef: IsDotWriteFn,
3238	Limit: MaxWaitStates);
3239	if (DOT && DOT->getOpcode() != MI->getOpcode())
3240	WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: DotWriteDifferentVALUWrite -
3241	WaitStatesSinceDef);
3242
// Write-after-write against an MFMA that wrote an overlapping register.
3243	MFMA = nullptr;
3244	WaitStatesSinceDef =
3245	getWaitStatesSinceDef(Reg, IsHazardDef: IsMFMAWriteFn, Limit: MaxWaitStates);
3246	if (MFMA) {
3247	int NeedWaitStates = MaxWaitStates;
3248	int NumPasses = TSchedModel.computeInstrLatency(MI: MFMA);
3249
3250	if (SIInstrInfo::isDGEMM(Opcode: MFMA->getOpcode())) {
3251	switch (NumPasses) {
3252	case `4`:
3253	NeedWaitStates = DMFMA4x4WriteVgprVALUWriteWaitStates;
3254	break;
3255	case `8`:
3256	case `16`:
3257	NeedWaitStates = DMFMA16x16WriteVgprVALUWriteWaitStates;
3258	break;
3259	default:
3260	llvm_unreachable("unexpected number of cycles for dgemm");
3261	}
3262	} else if (ST.hasGFX940Insts()) {
3263	NeedWaitStates =
3264	TII.isXDL(MI: *MFMA)
3265	? GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(
3266	NumPasses, IsGFX950: ST.hasGFX950Insts())
3267	: GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(NumPasses);
3268	} else {
3269	switch (NumPasses) {
3270	case `2`:
3271	NeedWaitStates = SMFMA4x4WriteVgprVALUWawWaitStates;
3272	break;
3273	case `8`:
3274	NeedWaitStates = SMFMA16x16WriteVgprVALUWawWaitStates;
3275	break;
3276	case `16`:
3277	NeedWaitStates = SMFMA32x32WriteVgprVALUWawWaitStates;
3278	break;
3279	default:
3280	llvm_unreachable("Unexpected number of passes for mfma");
3281	}
3282	}
3283
3284	int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
3285	WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
3286
3287	if (WaitStatesNeeded == MaxWaitStates)
3288	break;
3289	}
3290
// Write-after-read: matches a non-DGEMM MFMA (XDL only when the subtarget has
// gfx940 instructions) whose src2 is a register overlapping Reg, and records
// it in MFMA.
3291	auto IsSMFMAReadAsCFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
3292	if (!SIInstrInfo::isMFMA(MI) \|\| SIInstrInfo::isDGEMM(Opcode: MI.getOpcode()) \|\|
3293	!MI.readsRegister(Reg, TRI: &TRI))
3294	return false;
3295
3296	if (ST.hasGFX940Insts() && !TII.isXDL(MI))
3297	return false;
3298
3299	const MachineOperand *SrcC =
3300	TII.getNamedOperand(MI, OperandName: AMDGPU::OpName::src2);
3301	assert(SrcC);
3302	if (!SrcC->isReg() \|\| !TRI.regsOverlap(RegA: SrcC->getReg(), RegB: Reg))
3303	return false;
3304
3305	MFMA = &MI;
3306	return true;
3307	};
3308
3309	MFMA = nullptr;
3310	int WaitStatesSinceUse = getWaitStatesSince(IsHazard: IsSMFMAReadAsCFn,
3311	Limit: MaxWarWaitStates);
3312	if (!MFMA)
3313	continue;
3314
// The initial value is always overwritten: every switch path assigns
// NeedWaitStates, with unknown latencies using the 32x32 value.
3315	unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MI: MFMA);
3316	int NeedWaitStates = MaxWaitStates;
3317	switch (HazardDefLatency) {
3318	case `2`: NeedWaitStates = SMFMA4x4ReadVgprVALUWarWaitStates;
3319	break;
3320	case `4`: assert(ST.hasGFX940Insts());
3321	NeedWaitStates = GFX940_XDL4PassReadVgprVALUWarWaitStates;
3322	break;
3323	case `8`: NeedWaitStates = SMFMA16x16ReadVgprVALUWarWaitStates;
3324	break;
3325	case `16`: [[fallthrough]];
3326	default: NeedWaitStates = SMFMA32x32ReadVgprVALUWarWaitStates;
3327	break;
3328	}
3329
3330	int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceUse;
3331	WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
3332	}
3333
3334	return WaitStatesNeeded;
3335	}
3336
3337	bool GCNHazardRecognizer::ShouldPreferAnother(SUnit SU) const* {
3338	if (!SU->isInstr())
3339	return false;
3340
3341	const MachineInstr MAI = nullptr*;
3342
3343	auto IsMFMAFn = [&MAI](const MachineInstr &MI) {
3344	MAI = nullptr;
3345	if (SIInstrInfo::isMFMA(MI))
3346	MAI = &MI;
3347	return MAI != nullptr;
3348	};
3349
3350	MachineInstr *MI = SU->getInstr();
3351	if (IsMFMAFn (*MI)) {
3352	int W = getWaitStatesSince(IsHazard: IsMFMAFn, Limit: `16`);
3353	if (MAI)
3354	return W < (int)TSchedModel.computeInstrLatency(MI: MAI);
3355	}
3356
3357	return false;
3358	}
3359
3360	// Adjust global offsets for instructions bundled with S_GETPC_B64 after
3361	// insertion of a new instruction.
3362	static void updateGetPCBundle(MachineInstr *NewMI) {
3363	if (!NewMI->isBundled())
3364	return;
3365
3366	// Find start of bundle.
3367	auto I = NewMI->getIterator();
3368	while (I ->isBundledWithPred())
3369	I --;
3370	if (I ->isBundle())
3371	I ++;
3372
3373	// Bail if this is not an S_GETPC bundle.
3374	if (I ->getOpcode() != AMDGPU::S_GETPC_B64)
3375	return;
3376
3377	// Update offsets of any references in the bundle.
3378	const unsigned NewBytes = `4`;
3379	assert(NewMI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
3380	"Unexpected instruction insertion in bundle");
3381	auto NextMI = std::next(x: NewMI->getIterator());
3382	auto End = NewMI->getParent()->end();
3383	while (NextMI != End && NextMI ->isBundledWithPred()) {
3384	for (auto &Operand : NextMI ->operands()) {
3385	if (Operand.isGlobal())
3386	Operand.setOffset(Operand.getOffset() + NewBytes);
3387	}
3388	NextMI ++;
3389	}
3390	}
3391
3392	bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
3393	if (!ST.hasVALUMaskWriteHazard())
3394	return false;
3395	assert(!ST.hasExtendedWaitCounts());
3396
3397	if (!ST.isWave64())
3398	return false;
3399
3400	const bool IsSALU = SIInstrInfo::isSALU(MI: *MI);
3401	const bool IsVALU = SIInstrInfo::isVALU(MI: *MI);
3402	if (!IsSALU && !IsVALU)
3403	return false;
3404
3405	// The hazard sequence is three instructions:
3406	// 1. VALU reads SGPR as mask
3407	// 2. VALU/SALU writes SGPR
3408	// 3. VALU/SALU reads SGPR
3409	// The hazard can expire if the distance between 2 and 3 is sufficient,
3410	// or (2) is VALU and (3) is SALU.
3411	// In practice this happens <10% of the time, hence always assume the hazard
3412	// exists if (1) and (2) are present to avoid searching all SGPR reads.
3413
3414	const SIRegisterInfo *TRI = ST.getRegisterInfo();
3415	const MachineRegisterInfo &MRI = MF.getRegInfo();
3416
3417	auto IgnoreableSGPR = [](const Register Reg) {
3418	switch (Reg) {
3419	case AMDGPU::EXEC:
3420	case AMDGPU::EXEC_LO:
3421	case AMDGPU::EXEC_HI:
3422	case AMDGPU::M0:
3423	case AMDGPU::SGPR_NULL:
3424	case AMDGPU::SGPR_NULL64:
3425	case AMDGPU::SCC:
3426	return true;
3427	default:
3428	return false;
3429	}
3430	};
3431	auto IsVCC = [](const Register Reg) {
3432	return Reg == AMDGPU::VCC \|\| Reg == AMDGPU::VCC_LO \|\| Reg == AMDGPU::VCC_HI;
3433	};
3434
3435	struct StateType {
3436	SmallSet<Register, `2`> HazardSGPRs;
3437
3438	static unsigned getHashValue(const StateType &State) {
3439	return hash_combine_range(R: State.HazardSGPRs);
3440	}
3441	static bool isEqual(const StateType &LHS, const StateType &RHS) {
3442	return LHS.HazardSGPRs == RHS.HazardSGPRs;
3443	}
3444	};
3445
3446	SmallVector<const MachineInstr *> WaitInstrs;
3447	bool HasSGPRRead = false;
3448	StateType InitialState;
3449
3450	// Look for SGPR write.
3451	MachineOperand HazardDef = nullptr*;
3452	for (MachineOperand &Op : MI->operands()) {
3453	if (!Op.isReg())
3454	continue;
3455	if (Op.isDef() && HazardDef)
3456	continue;
3457
3458	Register Reg = Op.getReg();
3459	if (IgnoreableSGPR (Reg))
3460	continue;
3461	if (!IsVCC (Reg)) {
3462	if (Op.isImplicit())
3463	continue;
3464	if (!TRI->isSGPRReg(MRI, Reg))
3465	continue;
3466	}
3467	// Also check for SGPR reads.
3468	if (Op.isUse()) {
3469	HasSGPRRead = true;
3470	continue;
3471	}
3472
3473	assert(!HazardDef);
3474	HazardDef = &Op;
3475	}
3476
3477	if (!HazardDef)
3478	return false;
3479
3480	// Setup to track writes to individual SGPRs
3481	const Register HazardReg = HazardDef->getReg();
3482	if (AMDGPU::SReg_32RegClass.contains(Reg: HazardReg)) {
3483	InitialState.HazardSGPRs.insert(V: HazardReg);
3484	} else {
3485	assert(AMDGPU::SReg_64RegClass.contains(HazardReg));
3486	InitialState.HazardSGPRs.insert(V: TRI->getSubReg(Reg: HazardReg, Idx: AMDGPU::sub0));
3487	InitialState.HazardSGPRs.insert(V: TRI->getSubReg(Reg: HazardReg, Idx: AMDGPU::sub1));
3488	}
3489
3490	auto IsHazardFn = [&](StateType &State, const MachineInstr &I) {
3491	if (State.HazardSGPRs.empty())
3492	return HazardExpired;
3493
3494	switch (I.getOpcode()) {
3495	case AMDGPU::V_ADDC_U32_e32:
3496	case AMDGPU::V_ADDC_U32_dpp:
3497	case AMDGPU::V_CNDMASK_B16_t16_e32:
3498	case AMDGPU::V_CNDMASK_B16_fake16_e32:
3499	case AMDGPU::V_CNDMASK_B16_t16_dpp:
3500	case AMDGPU::V_CNDMASK_B16_fake16_dpp:
3501	case AMDGPU::V_CNDMASK_B32_e32:
3502	case AMDGPU::V_CNDMASK_B32_dpp:
3503	case AMDGPU::V_DIV_FMAS_F32_e64:
3504	case AMDGPU::V_DIV_FMAS_F64_e64:
3505	case AMDGPU::V_SUBB_U32_e32:
3506	case AMDGPU::V_SUBB_U32_dpp:
3507	case AMDGPU::V_SUBBREV_U32_e32:
3508	case AMDGPU::V_SUBBREV_U32_dpp: {
3509	// These implicitly read VCC as mask source.
3510	return IsVCC (HazardReg) ? HazardFound : NoHazardFound;
3511	}
3512	case AMDGPU::V_ADDC_U32_e64:
3513	case AMDGPU::V_ADDC_U32_e64_dpp:
3514	case AMDGPU::V_CNDMASK_B16_t16_e64:
3515	case AMDGPU::V_CNDMASK_B16_fake16_e64:
3516	case AMDGPU::V_CNDMASK_B16_t16_e64_dpp:
3517	case AMDGPU::V_CNDMASK_B16_fake16_e64_dpp:
3518	case AMDGPU::V_CNDMASK_B32_e64:
3519	case AMDGPU::V_CNDMASK_B32_e64_dpp:
3520	case AMDGPU::V_SUBB_U32_e64:
3521	case AMDGPU::V_SUBB_U32_e64_dpp:
3522	case AMDGPU::V_SUBBREV_U32_e64:
3523	case AMDGPU::V_SUBBREV_U32_e64_dpp: {
3524	// Only check mask register overlaps.
3525	const MachineOperand *SSRCOp = TII.getNamedOperand(MI: I, OperandName: AMDGPU::OpName::src2);
3526	assert(SSRCOp);
3527	bool Result = TRI->regsOverlap(RegA: SSRCOp->getReg(), RegB: HazardReg);
3528	return Result ? HazardFound : NoHazardFound;
3529	}
3530	default:
3531	return NoHazardFound;
3532	}
3533	};
3534
3535	const unsigned ConstantMaskBits = AMDGPU::DepCtr::encodeFieldSaSdst(
3536	Encoded: AMDGPU::DepCtr::encodeFieldVaSdst(Encoded: AMDGPU::DepCtr::encodeFieldVaVcc(VaVcc: `0`, STI: ST),
3537	VaSdst: `0`),
3538	SaSdst: `0`);
3539	auto UpdateStateFn = [&](StateType &State, const MachineInstr &I) {
3540	switch (I.getOpcode()) {
3541	case AMDGPU::S_WAITCNT_DEPCTR:
3542	// Record mergable waits within region of instructions free of SGPR reads.
3543	if (!HasSGPRRead && I.getParent() == MI->getParent() && !I.isBundled() &&
3544	(I.getOperand(i: `0`).getImm() & ConstantMaskBits) == ConstantMaskBits)
3545	WaitInstrs.push_back(Elt: &I);
3546	break;
3547	default:
3548	// Update tracking of SGPR reads and writes.
3549	for (auto &Op : I.operands()) {
3550	if (!Op.isReg())
3551	continue;
3552
3553	Register Reg = Op.getReg();
3554	if (IgnoreableSGPR (Reg))
3555	continue;
3556	if (!IsVCC (Reg)) {
3557	if (Op.isImplicit())
3558	continue;
3559	if (!TRI->isSGPRReg(MRI, Reg))
3560	continue;
3561	}
3562	if (Op.isUse()) {
3563	HasSGPRRead = true;
3564	continue;
3565	}
3566
3567	// Stop tracking any SGPRs with writes on the basis that they will
3568	// already have an appropriate wait inserted afterwards.
3569	SmallVector<Register, `2`> Found;
3570	for (Register SGPR : State.HazardSGPRs) {
3571	if (Reg == SGPR \|\| TRI->regsOverlap(RegA: Reg, RegB: SGPR))
3572	Found.push_back(Elt: SGPR);
3573	}
3574	for (Register SGPR : Found)
3575	State.HazardSGPRs.erase(V: SGPR);
3576	}
3577	break;
3578	}
3579	};
3580
3581	// Check for hazard
3582	if (!hasHazard<StateType>(InitialState, IsHazard: IsHazardFn, UpdateState: UpdateStateFn,
3583	InitialMBB: MI->getParent(),
3584	InitialI: std::next(x: MI->getReverseIterator())))
3585	return false;
3586
3587	// Compute counter mask
3588	unsigned DepCtr =
3589	IsVALU ? (IsVCC (HazardReg) ? AMDGPU::DepCtr::encodeFieldVaVcc(VaVcc: `0`, STI: ST)
3590	: AMDGPU::DepCtr::encodeFieldVaSdst(VaSdst: `0`, STI: ST))
3591	: AMDGPU::DepCtr::encodeFieldSaSdst(SaSdst: `0`, STI: ST);
3592
3593	// Try to merge previous waits into this one for regions with no SGPR reads.
3594	if (!WaitInstrs.empty()) {
3595	// Note: WaitInstrs contains const pointers, so walk backward from MI to
3596	// obtain a mutable pointer to each instruction to be merged.
3597	// This is expected to be a very short walk within the same block.
3598	SmallVector<MachineInstr *> ToErase;
3599	unsigned Found = `0`;
3600	for (MachineBasicBlock::reverse_iterator It = MI->getReverseIterator(),
3601	End = MI->getParent()->rend();
3602	Found < WaitInstrs.size() && It != End; ++It) {
3603	MachineInstr WaitMI = &It;
3604	// Find next wait instruction.
3605	if (std::as_const(t&: WaitMI) != WaitInstrs [Found])
3606	continue;
3607	Found++;
3608	unsigned WaitMask = WaitMI->getOperand(i: `0`).getImm();
3609	assert((WaitMask & ConstantMaskBits) == ConstantMaskBits);
3610	DepCtr = AMDGPU::DepCtr::encodeFieldSaSdst(
3611	Encoded: DepCtr, SaSdst: std::min(a: AMDGPU::DepCtr::decodeFieldSaSdst(Encoded: WaitMask),
3612	b: AMDGPU::DepCtr::decodeFieldSaSdst(Encoded: DepCtr)));
3613	DepCtr = AMDGPU::DepCtr::encodeFieldVaSdst(
3614	Encoded: DepCtr, VaSdst: std::min(a: AMDGPU::DepCtr::decodeFieldVaSdst(Encoded: WaitMask),
3615	b: AMDGPU::DepCtr::decodeFieldVaSdst(Encoded: DepCtr)));
3616	DepCtr = AMDGPU::DepCtr::encodeFieldVaVcc(
3617	Encoded: DepCtr, VaVcc: std::min(a: AMDGPU::DepCtr::decodeFieldVaVcc(Encoded: WaitMask),
3618	b: AMDGPU::DepCtr::decodeFieldVaVcc(Encoded: DepCtr)));
3619	ToErase.push_back(Elt: WaitMI);
3620	}
3621	assert(Found == WaitInstrs.size());
3622	for (MachineInstr *WaitMI : ToErase)
3623	WaitMI->eraseFromParent();
3624	}
3625
3626	// Add s_waitcnt_depctr after SGPR write.
3627	auto NextMI = std::next(x: MI->getIterator());
3628	auto NewMI = BuildMI(BB&: *MI->getParent(), I: NextMI, MIMD: MI->getDebugLoc(),
3629	MCID: TII.get(Opcode: AMDGPU::S_WAITCNT_DEPCTR))
3630	.addImm(Val: DepCtr);
3631
3632	// SALU write may be s_getpc in a bundle.
3633	updateGetPCBundle(NewMI);
3634
3635	return true;
3636	}
3637
3638	static bool ensureEntrySetPrio(MachineFunction MF, int* Priority,
3639	const SIInstrInfo &TII) {
3640	MachineBasicBlock &EntryMBB = MF->front();
3641	if (EntryMBB.begin() != EntryMBB.end()) {
3642	auto &EntryMI = *EntryMBB.begin();
3643	if (EntryMI.getOpcode() == AMDGPU::S_SETPRIO &&
3644	EntryMI.getOperand(i: `0`).getImm() >= Priority)
3645	return false;
3646	}
3647
3648	BuildMI(BB&: EntryMBB, I: EntryMBB.begin(), MIMD: DebugLoc (), MCID: TII.get(Opcode: AMDGPU::S_SETPRIO))
3649	.addImm(Val: Priority);
3650	return true;
3651	}
3652
3653	bool GCNHazardRecognizer::fixRequiredExportPriority(MachineInstr *MI) {
3654	if (!ST.hasRequiredExportPriority())
3655	return false;
3656
3657	// Assume the following shader types will never have exports,
3658	// and avoid adding or adjusting S_SETPRIO.
3659	MachineBasicBlock *MBB = MI->getParent();
3660	MachineFunction *MF = MBB->getParent();
3661	auto CC = MF->getFunction().getCallingConv();
3662	switch (CC) {
3663	case CallingConv::AMDGPU_CS:
3664	case CallingConv::AMDGPU_CS_Chain:
3665	case CallingConv::AMDGPU_CS_ChainPreserve:
3666	case CallingConv::AMDGPU_KERNEL:
3667	return false;
3668	default:
3669	break;
3670	}
3671
3672	const int MaxPriority = `3`;
3673	const int NormalPriority = `2`;
3674	const int PostExportPriority = `0`;
3675
3676	auto It = MI->getIterator();
3677	switch (MI->getOpcode()) {
3678	case AMDGPU::S_ENDPGM:
3679	case AMDGPU::S_ENDPGM_SAVED:
3680	case AMDGPU::S_ENDPGM_ORDERED_PS_DONE:
3681	case AMDGPU::SI_RETURN_TO_EPILOG:
3682	// Ensure shader with calls raises priority at entry.
3683	// This ensures correct priority if exports exist in callee.
3684	if (MF->getFrameInfo().hasCalls())
3685	return ensureEntrySetPrio(MF, Priority: NormalPriority, TII);
3686	return false;
3687	case AMDGPU::S_SETPRIO: {
3688	// Raise minimum priority unless in workaround.
3689	auto &PrioOp = MI->getOperand(i: `0`);
3690	int Prio = PrioOp.getImm();
3691	bool InWA = (Prio == PostExportPriority) &&
3692	(It != MBB->begin() && TII.isEXP(MI: *std::prev(x: It)));
3693	if (InWA \|\| Prio >= NormalPriority)
3694	return false;
3695	PrioOp.setImm(std::min(a: Prio + NormalPriority, b: MaxPriority));
3696	return true;
3697	}
3698	default:
3699	if (!TII.isEXP(MI: *MI))
3700	return false;
3701	break;
3702	}
3703
3704	// Check entry priority at each export (as there will only be a few).
3705	// Note: amdgpu_gfx can only be a callee, so defer to caller setprio.
3706	bool Changed = false;
3707	if (CC != CallingConv::AMDGPU_Gfx && CC != CallingConv::AMDGPU_Gfx_WholeWave)
3708	Changed = ensureEntrySetPrio(MF, Priority: NormalPriority, TII);
3709
3710	auto NextMI = std::next(x: It);
3711	bool EndOfShader = false;
3712	if (NextMI != MBB->end()) {
3713	// Only need WA at end of sequence of exports.
3714	if (TII.isEXP(MI: *NextMI))
3715	return Changed;
3716	// Assume appropriate S_SETPRIO after export means WA already applied.
3717	if (NextMI ->getOpcode() == AMDGPU::S_SETPRIO &&
3718	NextMI ->getOperand(i: `0`).getImm() == PostExportPriority)
3719	return Changed;
3720	EndOfShader = NextMI ->getOpcode() == AMDGPU::S_ENDPGM;
3721	}
3722
3723	const DebugLoc &DL = MI->getDebugLoc();
3724
3725	// Lower priority.
3726	BuildMI(BB&: *MBB, I: NextMI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_SETPRIO))
3727	.addImm(Val: PostExportPriority);
3728
3729	if (!EndOfShader) {
3730	// Wait for exports to complete.
3731	BuildMI(BB&: *MBB, I: NextMI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_WAITCNT_EXPCNT))
3732	.addReg(RegNo: AMDGPU::SGPR_NULL)
3733	.addImm(Val: `0`);
3734	}
3735
3736	BuildMI(BB&: *MBB, I: NextMI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_NOP)).addImm(Val: `0`);
3737	BuildMI(BB&: *MBB, I: NextMI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_NOP)).addImm(Val: `0`);
3738
3739	if (!EndOfShader) {
3740	// Return to normal (higher) priority.
3741	BuildMI(BB&: *MBB, I: NextMI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_SETPRIO))
3742	.addImm(Val: NormalPriority);
3743	}
3744
3745	return true;
3746	}
3747
3748	bool GCNHazardRecognizer::fixGetRegWaitIdle(MachineInstr *MI) {
3749	if (!isSGetReg(Opcode: MI->getOpcode()))
3750	return false;
3751
3752	const SIInstrInfo *TII = ST.getInstrInfo();
3753	switch (getHWReg(TII, RegInstr: *MI)) {
3754	default:
3755	return false;
3756	case AMDGPU::Hwreg::ID_STATUS:
3757	case AMDGPU::Hwreg::ID_STATE_PRIV:
3758	case AMDGPU::Hwreg::ID_EXCP_FLAG_PRIV:
3759	case AMDGPU::Hwreg::ID_EXCP_FLAG_USER:
3760	break;
3761	}
3762
3763	BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(),
3764	MCID: TII->get(Opcode: AMDGPU::S_WAITCNT_DEPCTR))
3765	.addImm(Val: `0`);
3766	return true;
3767	}
3768
3769	bool GCNHazardRecognizer::fixDsAtomicAsyncBarrierArriveB64(MachineInstr *MI) {
3770	if (MI->getOpcode() != AMDGPU::DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64)
3771	return false;
3772
3773	const SIInstrInfo *TII = ST.getInstrInfo();
3774	BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(),
3775	MCID: TII->get(Opcode: AMDGPU::S_WAITCNT_DEPCTR))
3776	.addImm(Val: AMDGPU::DepCtr::encodeFieldVmVsrc(VmVsrc: `0`, STI: ST));
3777	BuildMI(BB&: *MI->getParent(), I: std::next(x: MI->getIterator()), MIMD: MI->getDebugLoc(),
3778	MCID: TII->get(Opcode: AMDGPU::S_WAITCNT_DEPCTR))
3779	.addImm(Val: AMDGPU::DepCtr::encodeFieldVmVsrc(VmVsrc: `0`, STI: ST));
3780
3781	return true;
3782	}
3783
3784	bool GCNHazardRecognizer::fixScratchBaseForwardingHazard(MachineInstr *MI) {
3785	// No reason to check this in pre-RA scheduling, SGPRs have to be allocated
3786	// for hazard to trigger.
3787	if (!IsHazardRecognizerMode)
3788	return false;
3789
3790	const SIRegisterInfo *TRI = ST.getRegisterInfo();
3791	const SIInstrInfo *TII = ST.getInstrInfo();
3792	// Hazard expires after 10 SGPR writes by SALU or 8 SGPR writes by VALU.
3793	const int FlatScrBaseWaitStates = `10`;
3794
3795	bool ReadsFlatScrLo =
3796	MI->readsRegister(Reg: AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, TRI);
3797	bool ReadsFlatScrHi =
3798	MI->readsRegister(Reg: AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, TRI);
3799	if (isSGetReg(Opcode: MI->getOpcode())) {
3800	switch (getHWReg(TII, RegInstr: *MI)) {
3801	default:
3802	break;
3803	case AMDGPU::Hwreg::ID_FLAT_SCR_LO:
3804	ReadsFlatScrLo = true;
3805	break;
3806	case AMDGPU::Hwreg::ID_FLAT_SCR_HI:
3807	ReadsFlatScrHi = true;
3808	break;
3809	}
3810	}
3811
3812	const MachineRegisterInfo &MRI = MF.getRegInfo();
3813
3814	auto IsRegDefHazard = [&](Register Reg) -> bool {
3815	DenseSet<const MachineBasicBlock *> Visited;
3816	auto IsHazardFn = [TRI, Reg](const MachineInstr &MI) {
3817	return MI.modifiesRegister(Reg, TRI);
3818	};
3819
3820	// This literally abuses the idea of waitstates. Instead of waitstates it
3821	// returns 1 for SGPR written and 0 otherwise.
3822	auto IsSGPRDef = [TII, TRI, &MRI](const MachineInstr &MI) -> unsigned {
3823	if (!TII->isSALU(MI) && !TII->isVALU(MI))
3824	return `0`;
3825	for (const MachineOperand &MO : MI.all_defs()) {
3826	if (TRI->isSGPRReg(MRI, Reg: MO.getReg()))
3827	return `1`;
3828	}
3829	return `0`;
3830	};
3831
3832	auto IsExpiredFn = [=](const MachineInstr &MI, int SgprWrites) {
3833	if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR) {
3834	unsigned Wait = MI.getOperand(i: `0`).getImm();
3835	if (AMDGPU::DepCtr::decodeFieldSaSdst(Encoded: Wait) == `0` &&
3836	AMDGPU::DepCtr::decodeFieldVaSdst(Encoded: Wait) == `0`)
3837	return true;
3838	}
3839	return SgprWrites >= FlatScrBaseWaitStates;
3840	};
3841
3842	return ::getWaitStatesSince(
3843	IsHazard: IsHazardFn, MBB: MI->getParent(), I: std::next(x: MI->getReverseIterator()),
3844	WaitStates: `0`, IsExpired: IsExpiredFn, Visited, GetNumWaitStates: IsSGPRDef) < FlatScrBaseWaitStates;
3845	};
3846
3847	if ((!ReadsFlatScrLo \|\| MRI.isConstantPhysReg(PhysReg: AMDGPU::SGPR102) \|\|
3848	!IsRegDefHazard (AMDGPU::SGPR102)) &&
3849	(!ReadsFlatScrHi \|\| MRI.isConstantPhysReg(PhysReg: AMDGPU::SGPR103) \|\|
3850	!IsRegDefHazard (AMDGPU::SGPR103)))
3851	return false;
3852
3853	BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(),
3854	MCID: TII->get(Opcode: AMDGPU::S_WAITCNT_DEPCTR))
3855	.addImm(Val: AMDGPU::DepCtr::encodeFieldVaSdst(
3856	Encoded: AMDGPU::DepCtr::encodeFieldSaSdst(SaSdst: `0`, STI: ST), VaSdst: `0`));
3857	return true;
3858	}
3859
3860	bool GCNHazardRecognizer::fixSetRegMode(MachineInstr *MI) {
3861	if (!isSSetReg(Opcode: MI->getOpcode()) \|\|
3862	MI->getOperand(i: `1`).getImm() != AMDGPU::Hwreg::ID_MODE)
3863	return false;
3864
3865	BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(), MCID: TII.get(Opcode: AMDGPU::V_NOP_e32));
3866	BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(), MCID: TII.get(Opcode: AMDGPU::V_NOP_e32));
3867	return true;
3868	}
3869

Browse the source code of llvm_projects/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp