1//===-- GCNHazardRecognizers.cpp - GCN Hazard Recognizer Impls ------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements hazard recognizers for scheduling on GCN processors.
10//
11//===----------------------------------------------------------------------===//
12
13#include "GCNHazardRecognizer.h"
14#include "GCNSubtarget.h"
15#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
16#include "SIMachineFunctionInfo.h"
17#include "llvm/ADT/Statistic.h"
18#include "llvm/CodeGen/MachineFrameInfo.h"
19#include "llvm/CodeGen/MachineFunction.h"
20#include "llvm/CodeGen/MachineInstrBuilder.h"
21#include "llvm/CodeGen/ScheduleDAG.h"
22#include "llvm/Support/Debug.h"
23#include "llvm/TargetParser/TargetParser.h"
24
25using namespace llvm;
26
27#define DEBUG_TYPE "gcn-hazard-recognizer"
28
29STATISTIC(NumWMMANopsHoisted,
30 "Number of WMMA hazard V_NOPs hoisted from loops");
31STATISTIC(NumWMMAHoistingBailed,
32 "Number of WMMA hazards where V_NOP hoisting was not possible");
33
34namespace {
35
36struct MFMAPaddingRatioParser : public cl::parser<unsigned> {
37 MFMAPaddingRatioParser(cl::Option &O) : cl::parser<unsigned>(O) {}
38
39 bool parse(cl::Option &O, StringRef ArgName, StringRef Arg, unsigned &Value) {
40 if (Arg.getAsInteger(Radix: 0, Result&: Value))
41 return O.error(Message: "'" + Arg + "' value invalid for uint argument!");
42
43 if (Value > 100)
44 return O.error(Message: "'" + Arg + "' value must be in the range [0, 100]!");
45
46 return false;
47 }
48};
49
50} // end anonymous namespace
51
// Percentage (0-100) of the latency between dependent MFMAs to fill with
// s_nop padding; parsed/validated by MFMAPaddingRatioParser above.
static cl::opt<unsigned, false, MFMAPaddingRatioParser>
    MFMAPaddingRatio("amdgpu-mfma-padding-ratio", cl::init(Val: 0), cl::Hidden,
                     cl::desc("Fill a percentage of the latency between "
                              "neighboring MFMA with s_nops."));

// This is intended for debugging purposes only.
// Forces at least this many wait states of s_nop padding before every
// instruction (see PreEmitNoops).
static cl::opt<unsigned>
    NopPadding("amdgpu-snop-padding", cl::init(Val: 0), cl::Hidden,
               cl::desc("Insert a s_nop x before every instruction"));

// Controls whether V_NOPs required for WMMA hazards inside loops may be
// hoisted into the loop preheader (on by default).
static cl::opt<bool> EnableWMMAVnopHoisting(
    "amdgpu-wmma-vnop-hoisting", cl::init(Val: true), cl::Hidden,
    cl::desc("Hoist WMMA hazard V_NOPs from loops to preheaders"));
65
66//===----------------------------------------------------------------------===//
67// Hazard Recognizer Implementation
68//===----------------------------------------------------------------------===//
69
// Forward declaration (definition not in this chunk's view): decides whether
// the LDS-branch-VMEM WAR hazard fixup must run for this function/subtarget.
static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
                                                 const GCNSubtarget &ST);
72
// Construct the hazard recognizer for \p MF. MaxLookAhead bounds how many
// emitted instructions/wait states we keep in history: functions that use
// AGPRs (AGPR0 marked as used) need a deeper window (19) than others (5).
GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF,
                                         MachineLoopInfo *MLI)
    : IsHazardRecognizerMode(false), CurrCycleInstr(nullptr), MF(MF),
      ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()),
      TRI(TII.getRegisterInfo()), TSchedModel(TII.getSchedModel()), MLI(MLI),
      ClauseUses(TRI.getNumRegUnits()), ClauseDefs(TRI.getNumRegUnits()) {
  MaxLookAhead = MF.getRegInfo().isPhysRegUsed(PhysReg: AMDGPU::AGPR0) ? 19 : 5;
  RunLdsBranchVmemWARHazardFixup = shouldRunLdsBranchVmemWARHazardFixup(MF, ST);
}
82
// Discard all tracked emitted-instruction history.
void GCNHazardRecognizer::Reset() {
  EmittedInstrs.clear();
}
86
// Scheduler-facing overload: forward the SUnit's MachineInstr.
void GCNHazardRecognizer::EmitInstruction(SUnit *SU) {
  EmitInstruction(MI: SU->getInstr());
}
90
// Remember \p MI as the instruction issued in the current cycle; it is
// folded into EmittedInstrs by AdvanceCycle().
void GCNHazardRecognizer::EmitInstruction(MachineInstr *MI) {
  CurrCycleInstr = MI;
}
94
95static bool isDivFMas(unsigned Opcode) {
96 return Opcode == AMDGPU::V_DIV_FMAS_F32_e64 || Opcode == AMDGPU::V_DIV_FMAS_F64_e64;
97}
98
99static bool isSGetReg(unsigned Opcode) {
100 return Opcode == AMDGPU::S_GETREG_B32 || Opcode == AMDGPU::S_GETREG_B32_const;
101}
102
103static bool isSSetReg(unsigned Opcode) {
104 switch (Opcode) {
105 case AMDGPU::S_SETREG_B32:
106 case AMDGPU::S_SETREG_B32_mode:
107 case AMDGPU::S_SETREG_IMM32_B32:
108 case AMDGPU::S_SETREG_IMM32_B32_mode:
109 return true;
110 }
111 return false;
112}
113
114static bool isRWLane(unsigned Opcode) {
115 return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;
116}
117
118static bool isRFE(unsigned Opcode) {
119 return Opcode == AMDGPU::S_RFE_B64;
120}
121
122static bool isSMovRel(unsigned Opcode) {
123 switch (Opcode) {
124 case AMDGPU::S_MOVRELS_B32:
125 case AMDGPU::S_MOVRELS_B64:
126 case AMDGPU::S_MOVRELD_B32:
127 case AMDGPU::S_MOVRELD_B64:
128 return true;
129 default:
130 return false;
131 }
132}
133
// True if \p MI is s_sendmsg/s_sendmsghalt/s_ttracedata or a DS instruction
// operating on GDS (either an always-GDS opcode or one with its gds bit set).
static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII,
                                    const MachineInstr &MI) {
  if (TII.isAlwaysGDS(Opcode: MI.getOpcode()))
    return true;

  switch (MI.getOpcode()) {
  case AMDGPU::S_SENDMSG:
  case AMDGPU::S_SENDMSGHALT:
  case AMDGPU::S_TTRACEDATA:
    return true;
  // These DS opcodes don't support GDS.
  case AMDGPU::DS_NOP:
  case AMDGPU::DS_PERMUTE_B32:
  case AMDGPU::DS_BPERMUTE_B32:
    return false;
  default:
    // Other DS instructions: consult the gds operand.
    if (TII.isDS(Opcode: MI.getOpcode())) {
      int GDS = AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(),
                                           Name: AMDGPU::OpName::gds);
      if (MI.getOperand(i: GDS).getImm())
        return true;
    }
    return false;
  }
}
159
160static bool isPermlane(const MachineInstr &MI) {
161 unsigned Opcode = MI.getOpcode();
162 return Opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
163 Opcode == AMDGPU::V_PERMLANE64_B32 ||
164 Opcode == AMDGPU::V_PERMLANEX16_B32_e64 ||
165 Opcode == AMDGPU::V_PERMLANE16_VAR_B32_e64 ||
166 Opcode == AMDGPU::V_PERMLANEX16_VAR_B32_e64 ||
167 Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e32 ||
168 Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e64 ||
169 Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e32 ||
170 Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e64 ||
171 Opcode == AMDGPU::V_PERMLANE_BCAST_B32_e64 ||
172 Opcode == AMDGPU::V_PERMLANE_UP_B32_e64 ||
173 Opcode == AMDGPU::V_PERMLANE_DOWN_B32_e64 ||
174 Opcode == AMDGPU::V_PERMLANE_XOR_B32_e64 ||
175 Opcode == AMDGPU::V_PERMLANE_IDX_GEN_B32_e64;
176}
177
178static bool isLdsDma(const MachineInstr &MI) {
179 return SIInstrInfo::isVALU(MI) &&
180 (SIInstrInfo::isMUBUF(MI) || SIInstrInfo::isFLAT(MI));
181}
182
183static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
184 const MachineOperand *RegOp = TII->getNamedOperand(MI: RegInstr,
185 OperandName: AMDGPU::OpName::simm16);
186 return std::get<0>(t: AMDGPU::Hwreg::HwregEncoding::decode(Encoded: RegOp->getImm()));
187}
188
// Scheduler query: classify the hazard (if any) that issuing \p SU's
// instruction now would create. Returns NoHazard, Hazard (stall), or
// NoopHazard depending on mode; each check* helper returns the number of
// wait states still required (> 0 means a live hazard).
ScheduleHazardRecognizer::HazardType
GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
  MachineInstr *MI = SU->getInstr();
  // If we are not in "HazardRecognizerMode" and therefore not being run from
  // the scheduler, track possible stalls from hazards but don't insert noops.
  auto HazardType = IsHazardRecognizerMode ? NoopHazard : Hazard;

  // Bundles are examined per bundled instruction elsewhere (processBundle).
  if (MI->isBundle())
    return NoHazard;

  if (SIInstrInfo::isSMRD(MI: *MI) && checkSMRDHazards(SMRD: MI) > 0)
    return HazardType;

  if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0)
    return HazardType;

  if (checkFPAtomicToDenormModeHazard(MI) > 0)
    return HazardType;

  // Hazards which cannot be mitigated with S_NOPs.
  if (!IsHazardRecognizerMode) {
    if (checkWMMACoexecutionHazards(MI) > 0)
      return Hazard;
  }

  // Subtargets without data dependency hazards need none of the checks below.
  if (ST.hasNoDataDepHazard())
    return NoHazard;

  if (SIInstrInfo::isVMEM(MI: *MI) && checkVMEMHazards(VMEM: MI) > 0)
    return HazardType;

  if (SIInstrInfo::isVALU(MI: *MI) && checkVALUHazards(VALU: MI) > 0)
    return HazardType;

  if (SIInstrInfo::isDPP(MI: *MI) && checkDPPHazards(DPP: MI) > 0)
    return HazardType;

  if (isDivFMas(Opcode: MI->getOpcode()) && checkDivFMasHazards(DivFMas: MI) > 0)
    return HazardType;

  if (isRWLane(Opcode: MI->getOpcode()) && checkRWLaneHazards(RWLane: MI) > 0)
    return HazardType;

  if ((SIInstrInfo::isVALU(MI: *MI) || SIInstrInfo::isVMEM(MI: *MI) ||
       SIInstrInfo::isDS(MI: *MI) || SIInstrInfo::isEXP(MI: *MI)) &&
      checkMAIVALUHazards(MI) > 0)
    return HazardType;

  if (isSGetReg(Opcode: MI->getOpcode()) && checkGetRegHazards(GetRegInstr: MI) > 0)
    return HazardType;

  if (isSSetReg(Opcode: MI->getOpcode()) && checkSetRegHazards(SetRegInstr: MI) > 0)
    return HazardType;

  if (isRFE(Opcode: MI->getOpcode()) && checkRFEHazards(RFE: MI) > 0)
    return HazardType;

  // M0 read hazards: several unrelated instruction classes read M0 shortly
  // after it is written, gated on per-subtarget feature flags.
  if (((ST.hasReadM0MovRelInterpHazard() &&
        (TII.isVINTRP(MI: *MI) || isSMovRel(Opcode: MI->getOpcode()) ||
         MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
         MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
       (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, MI: *MI)) ||
       (ST.hasReadM0LdsDmaHazard() && isLdsDma(MI: *MI)) ||
       (ST.hasReadM0LdsDirectHazard() &&
        MI->readsRegister(Reg: AMDGPU::LDS_DIRECT, /*TRI=*/nullptr))) &&
      checkReadM0Hazards(SMovRel: MI) > 0)
    return HazardType;

  if (SIInstrInfo::isMAI(MI: *MI) && checkMAIHazards(MI) > 0)
    return HazardType;

  if ((SIInstrInfo::isVMEM(MI: *MI) || SIInstrInfo::isDS(MI: *MI)) &&
      checkMAILdStHazards(MI) > 0)
    return HazardType;

  if (MI->isInlineAsm() && checkInlineAsmHazards(IA: MI) > 0)
    return HazardType;

  return NoHazard;
}
269
270static void insertNoopsInBundle(MachineInstr *MI, const SIInstrInfo &TII,
271 unsigned Quantity) {
272 while (Quantity > 0) {
273 unsigned Arg = std::min(a: Quantity, b: 8u);
274 Quantity -= Arg;
275 BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(), MCID: TII.get(Opcode: AMDGPU::S_NOP))
276 .addImm(Val: Arg - 1);
277 }
278}
279
// Number of pipeline wait states of \p MI derived from the scheduling model:
// the ReleaseAtCycle of its first write processor resource.
unsigned
GCNHazardRecognizer::getMFMAPipelineWaitStates(const MachineInstr &MI) const {
  const MCSchedClassDesc *SC = TSchedModel.resolveSchedClass(MI: &MI);
  // The sched class must define at least one write resource.
  assert(TSchedModel.getWriteProcResBegin(SC) !=
         TSchedModel.getWriteProcResEnd(SC));
  return TSchedModel.getWriteProcResBegin(SC)->ReleaseAtCycle;
}
287
// Walk the instructions inside the current BUNDLE one by one, checking (and,
// in hazard recognizer mode, fixing) hazards and recording each instruction
// plus its wait states in EmittedInstrs.
void GCNHazardRecognizer::processBundle() {
  MachineBasicBlock::instr_iterator MI = std::next(x: CurrCycleInstr->getIterator());
  MachineBasicBlock::instr_iterator E = CurrCycleInstr->getParent()->instr_end();
  // Check bundled MachineInstr's for hazards.
  for (; MI != E && MI->isInsideBundle(); ++MI) {
    CurrCycleInstr = &*MI;
    unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr);

    if (IsHazardRecognizerMode) {
      fixHazards(MI: CurrCycleInstr);

      insertNoopsInBundle(MI: CurrCycleInstr, TII, Quantity: WaitStates);
    }

    // It's unnecessary to track more than MaxLookAhead instructions. Since we
    // include the bundled MI directly after, only add a maximum of
    // (MaxLookAhead - 1) noops to EmittedInstrs.
    for (unsigned i = 0, e = std::min(a: WaitStates, b: MaxLookAhead - 1); i < e; ++i)
      EmittedInstrs.push_front(x: nullptr);

    EmittedInstrs.push_front(x: CurrCycleInstr);
    EmittedInstrs.resize(new_size: MaxLookAhead);
  }
  CurrCycleInstr = nullptr;
}
313
// Process a single instruction in hazard recognizer (MachineFunction pass)
// mode: compute required padding, materialize the s_nops before \p MI, then
// record the instruction and advance the cycle.
void GCNHazardRecognizer::runOnInstruction(MachineInstr *MI) {
  assert(IsHazardRecognizerMode);

  unsigned NumPreNoops = PreEmitNoops(MI);
  EmitNoops(Quantity: NumPreNoops);
  // Inside a bundle the nops must be bundled too; otherwise use the generic
  // insertion helper.
  if (MI->isInsideBundle())
    insertNoopsInBundle(MI, TII, Quantity: NumPreNoops);
  else
    TII.insertNoops(MBB&: *MI->getParent(), MI: MachineBasicBlock::iterator(MI),
                    Quantity: NumPreNoops);
  EmitInstruction(MI);
  AdvanceCycle();
}
327
// Number of noop wait states to emit before \p MI, also applying in-place
// hazard fixes. The result is raised to the NopPadding debug option's value.
unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
  IsHazardRecognizerMode = true;
  CurrCycleInstr = MI;
  unsigned W = PreEmitNoopsCommon(MI);
  fixHazards(MI);
  CurrCycleInstr = nullptr;
  return std::max(a: W, b: NopPadding.getValue());
}
336
337unsigned GCNHazardRecognizer::getHazardWaitStates(MachineInstr *MI) const {
338 return this->PreEmitNoopsCommon(MI);
339}
340
341unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) const {
342 if (MI->isBundle())
343 return 0;
344
345 int WaitStates = 0;
346
347 if (SIInstrInfo::isSMRD(MI: *MI))
348 return std::max(a: WaitStates, b: checkSMRDHazards(SMRD: MI));
349
350 if (ST.hasNSAtoVMEMBug())
351 WaitStates = std::max(a: WaitStates, b: checkNSAtoVMEMHazard(MI));
352
353 WaitStates = std::max(a: WaitStates, b: checkFPAtomicToDenormModeHazard(MI));
354
355 if (ST.hasNoDataDepHazard())
356 return WaitStates;
357
358 if (SIInstrInfo::isVMEM(MI: *MI))
359 WaitStates = std::max(a: WaitStates, b: checkVMEMHazards(VMEM: MI));
360
361 if (SIInstrInfo::isVALU(MI: *MI))
362 WaitStates = std::max(a: WaitStates, b: checkVALUHazards(VALU: MI));
363
364 if (SIInstrInfo::isDPP(MI: *MI))
365 WaitStates = std::max(a: WaitStates, b: checkDPPHazards(DPP: MI));
366
367 if (isDivFMas(Opcode: MI->getOpcode()))
368 WaitStates = std::max(a: WaitStates, b: checkDivFMasHazards(DivFMas: MI));
369
370 if (isRWLane(Opcode: MI->getOpcode()))
371 WaitStates = std::max(a: WaitStates, b: checkRWLaneHazards(RWLane: MI));
372
373 if ((SIInstrInfo::isVALU(MI: *MI) || SIInstrInfo::isVMEM(MI: *MI) ||
374 SIInstrInfo::isDS(MI: *MI) || SIInstrInfo::isEXP(MI: *MI)) &&
375 checkMAIVALUHazards(MI) > 0)
376 WaitStates = std::max(a: WaitStates, b: checkMAIVALUHazards(MI));
377
378 if (MI->isInlineAsm())
379 return std::max(a: WaitStates, b: checkInlineAsmHazards(IA: MI));
380
381 if (isSGetReg(Opcode: MI->getOpcode()))
382 return std::max(a: WaitStates, b: checkGetRegHazards(GetRegInstr: MI));
383
384 if (isSSetReg(Opcode: MI->getOpcode()))
385 return std::max(a: WaitStates, b: checkSetRegHazards(SetRegInstr: MI));
386
387 if (isRFE(Opcode: MI->getOpcode()))
388 return std::max(a: WaitStates, b: checkRFEHazards(RFE: MI));
389
390 if ((ST.hasReadM0MovRelInterpHazard() &&
391 (TII.isVINTRP(MI: *MI) || isSMovRel(Opcode: MI->getOpcode()) ||
392 MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
393 MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
394 (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, MI: *MI)) ||
395 (ST.hasReadM0LdsDmaHazard() && isLdsDma(MI: *MI)) ||
396 (ST.hasReadM0LdsDirectHazard() &&
397 MI->readsRegister(Reg: AMDGPU::LDS_DIRECT, /*TRI=*/nullptr)))
398 return std::max(a: WaitStates, b: checkReadM0Hazards(SMovRel: MI));
399
400 if (SIInstrInfo::isMAI(MI: *MI))
401 return std::max(a: WaitStates, b: checkMAIHazards(MI));
402
403 if (SIInstrInfo::isVMEM(MI: *MI) || SIInstrInfo::isDS(MI: *MI))
404 return std::max(a: WaitStates, b: checkMAILdStHazards(MI));
405
406 if (ST.hasGFX950Insts() && isPermlane(MI: *MI))
407 return std::max(a: WaitStates, b: checkPermlaneHazards(MI));
408
409 return WaitStates;
410}
411
// Record one emitted noop cycle (nullptr entries represent wait states).
void GCNHazardRecognizer::EmitNoop() {
  EmittedInstrs.push_front(x: nullptr);
}
415
// Commit the current-cycle instruction (if any) to the emitted-instruction
// history, padding with nullptr entries for its extra wait states, and clamp
// the history to the lookahead window.
void GCNHazardRecognizer::AdvanceCycle() {
  // When the scheduler detects a stall, it will call AdvanceCycle() without
  // emitting any instructions.
  if (!CurrCycleInstr) {
    EmittedInstrs.push_front(x: nullptr);
    return;
  }

  // Bundles are expanded member-by-member.
  if (CurrCycleInstr->isBundle()) {
    processBundle();
    return;
  }

  unsigned NumWaitStates = TII.getNumWaitStates(MI: *CurrCycleInstr);
  // Zero-wait-state instructions (e.g. meta instructions) leave no trace.
  if (!NumWaitStates) {
    CurrCycleInstr = nullptr;
    return;
  }

  // Keep track of emitted instructions
  EmittedInstrs.push_front(x: CurrCycleInstr);

  // Add a nullptr for each additional wait state after the first. Make sure
  // not to add more than getMaxLookAhead() items to the list, since we
  // truncate the list to that size right after this loop.
  for (unsigned i = 1, e = std::min(a: NumWaitStates, b: getMaxLookAhead());
       i < e; ++i) {
    EmittedInstrs.push_front(x: nullptr);
  }

  // getMaxLookahead() is the largest number of wait states we will ever need
  // to insert, so there is no point in keeping track of more than that many
  // wait states.
  EmittedInstrs.resize(new_size: getMaxLookAhead());

  CurrCycleInstr = nullptr;
}
453
// Bottom-up scheduling hook; intentionally unsupported here.
void GCNHazardRecognizer::RecedeCycle() {
  assert(!IsHazardRecognizerMode &&
         "Bottom-up scheduling shouldn't run in hazard recognizer mode");
}
458
459//===----------------------------------------------------------------------===//
460// Helper Functions
461//===----------------------------------------------------------------------===//
462
// Tri-state result of a hazard-search predicate: a hazard was found, the
// search walked past the window where the hazard could still matter, or
// neither (keep searching).
enum HazardFnResult { HazardFound, HazardExpired, NoHazardFound };
464
465// Search for a hazard in a block and its predecessors.
// Generic backward search starting at \p InitialI in \p InitialMBB and
// continuing into all predecessors. \p IsHazard classifies each instruction
// against the running StateT; \p UpdateState evolves the state across
// non-meta instructions. Deduplicates (block, state) pairs via a hashed
// state map so loops terminate. Returns true iff a hazard was found.
template <typename StateT>
static bool
hasHazard(StateT InitialState,
          function_ref<HazardFnResult(StateT &, const MachineInstr &)> IsHazard,
          function_ref<void(StateT &, const MachineInstr &)> UpdateState,
          const MachineBasicBlock *InitialMBB,
          MachineBasicBlock::const_reverse_instr_iterator InitialI) {
  // Key into StateMap: an index into the States side-table, so states can be
  // hashed/compared without being copied into the map itself.
  struct StateMapKey {
    SmallVectorImpl<StateT> *States;
    unsigned Idx;
    static bool isEqual(const StateMapKey &LHS, const StateMapKey &RHS) {
      return LHS.States == RHS.States && LHS.Idx == RHS.Idx;
    }
  };
  // DenseMap traits delegating hashing/equality to StateT's static helpers,
  // with sentinel (empty/tombstone) keys handled specially.
  struct StateMapKeyTraits : DenseMapInfo<StateMapKey> {
    static inline StateMapKey getEmptyKey() {
      return {static_cast<SmallVectorImpl<StateT> *>(
                  DenseMapInfo<void *>::getEmptyKey()),
              DenseMapInfo<unsigned>::getEmptyKey()};
    }
    static inline StateMapKey getTombstoneKey() {
      return {static_cast<SmallVectorImpl<StateT> *>(
                  DenseMapInfo<void *>::getTombstoneKey()),
              DenseMapInfo<unsigned>::getTombstoneKey()};
    }
    static unsigned getHashValue(const StateMapKey &Key) {
      return StateT::getHashValue((*Key.States)[Key.Idx]);
    }
    static unsigned getHashValue(const StateT &State) {
      return StateT::getHashValue(State);
    }
    static bool isEqual(const StateMapKey &LHS, const StateMapKey &RHS) {
      const auto EKey = getEmptyKey();
      const auto TKey = getTombstoneKey();
      // Sentinel keys must compare by identity, not by dereferencing.
      if (StateMapKey::isEqual(LHS, EKey) || StateMapKey::isEqual(RHS, EKey) ||
          StateMapKey::isEqual(LHS, TKey) || StateMapKey::isEqual(RHS, TKey))
        return StateMapKey::isEqual(LHS, RHS);
      return StateT::isEqual((*LHS.States)[LHS.Idx], (*RHS.States)[RHS.Idx]);
    }
    static bool isEqual(const StateT &LHS, const StateMapKey &RHS) {
      if (StateMapKey::isEqual(RHS, getEmptyKey()) ||
          StateMapKey::isEqual(RHS, getTombstoneKey()))
        return false;
      return StateT::isEqual(LHS, (*RHS.States)[RHS.Idx]);
    }
  };

  SmallDenseMap<StateMapKey, unsigned, 8, StateMapKeyTraits> StateMap;
  SmallVector<StateT, 8> States;

  MachineBasicBlock::const_reverse_instr_iterator I = InitialI;
  const MachineBasicBlock *MBB = InitialMBB;
  StateT State = InitialState;

  // Worklist of (predecessor block, state index) pairs still to visit;
  // WorkIdx is the read cursor (the set-vector also deduplicates entries).
  SmallSetVector<std::pair<const MachineBasicBlock *, unsigned>, 16> Worklist;
  unsigned WorkIdx = 0;
  for (;;) {
    bool Expired = false;
    // Scan the current block backwards from I.
    for (auto E = MBB->instr_rend(); I != E; ++I) {
      // No need to look at parent BUNDLE instructions.
      if (I->isBundle())
        continue;

      auto Result = IsHazard(State, *I);
      if (Result == HazardFound)
        return true;
      if (Result == HazardExpired) {
        Expired = true;
        break;
      }

      // Inline asm and meta instructions do not affect the search state.
      if (I->isInlineAsm() || I->isMetaInstruction())
        continue;

      UpdateState(State, *I);
    }

    if (!Expired) {
      // Intern the state: reuse an existing index for an equal state so each
      // (pred, state) pair is enqueued at most once.
      unsigned StateIdx = States.size();
      StateMapKey Key = {&States, StateIdx};
      auto Insertion = StateMap.insert_as(std::pair(Key, StateIdx), State);
      if (Insertion.second) {
        States.emplace_back(State);
      } else {
        StateIdx = Insertion.first->second;
      }
      for (MachineBasicBlock *Pred : MBB->predecessors())
        Worklist.insert(X: std::pair(Pred, StateIdx));
    }

    if (WorkIdx == Worklist.size())
      break;

    unsigned StateIdx;
    std::tie(args&: MBB, args&: StateIdx) = Worklist[WorkIdx++];
    State = States[StateIdx];
    I = MBB->instr_rbegin();
  }

  return false;
}
567
568// Returns a minimum wait states since \p I walking all predecessors.
569// Only scans until \p IsExpired does not return true.
570// Can only be run in a hazard recognizer mode.
static int
getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
                   const MachineBasicBlock *MBB,
                   MachineBasicBlock::const_reverse_instr_iterator I,
                   int WaitStates, GCNHazardRecognizer::IsExpiredFn IsExpired,
                   DenseSet<const MachineBasicBlock *> &Visited,
                   GCNHazardRecognizer::GetNumWaitStatesFn GetNumWaitStates =
                       SIInstrInfo::getNumWaitStates) {
  // Scan the block backwards, accumulating wait states per instruction.
  for (auto E = MBB->instr_rend(); I != E; ++I) {
    // Don't add WaitStates for parent BUNDLE instructions.
    if (I->isBundle())
      continue;

    if (IsHazard(*I))
      return WaitStates;

    // Inline asm contributes no wait states here.
    if (I->isInlineAsm())
      continue;

    WaitStates += GetNumWaitStates(*I);

    // Past the expiry window: the hazard can no longer matter on this path.
    if (IsExpired(*I, WaitStates))
      return std::numeric_limits<int>::max();
  }

  // Recurse into unvisited predecessors and take the minimum over all paths.
  int MinWaitStates = std::numeric_limits<int>::max();
  for (MachineBasicBlock *Pred : MBB->predecessors()) {
    if (!Visited.insert(V: Pred).second)
      continue;

    int W = getWaitStatesSince(IsHazard, MBB: Pred, I: Pred->instr_rbegin(), WaitStates,
                               IsExpired, Visited, GetNumWaitStates);

    MinWaitStates = std::min(a: MinWaitStates, b: W);
  }

  return MinWaitStates;
}
609
// Convenience wrapper: start the backward search at the instruction just
// before \p MI with a fresh visited set and zero accumulated wait states.
static int
getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
                   const MachineInstr *MI,
                   GCNHazardRecognizer::IsExpiredFn IsExpired,
                   GCNHazardRecognizer::GetNumWaitStatesFn GetNumWaitStates =
                       SIInstrInfo::getNumWaitStates) {
  DenseSet<const MachineBasicBlock *> Visited;
  return getWaitStatesSince(IsHazard, MBB: MI->getParent(),
                            I: std::next(x: MI->getReverseIterator()), WaitStates: 0, IsExpired,
                            Visited, GetNumWaitStates);
}
621
// Wait states elapsed since the most recent instruction matching \p IsHazard,
// or INT_MAX if none is found within \p Limit. In hazard recognizer mode this
// walks the CFG from CurrCycleInstr; otherwise it consults the
// EmittedInstrs history (nullptr entries count as one wait state each).
int GCNHazardRecognizer::getWaitStatesSince(
    IsHazardFn IsHazard, int Limit, GetNumWaitStatesFn GetNumWaitStates) const {
  if (IsHazardRecognizerMode) {
    auto IsExpiredFn = [Limit](const MachineInstr &, int WaitStates) {
      return WaitStates >= Limit;
    };
    return ::getWaitStatesSince(IsHazard, MI: CurrCycleInstr, IsExpired: IsExpiredFn,
                                GetNumWaitStates);
  }

  int WaitStates = 0;
  for (MachineInstr *MI : EmittedInstrs) {
    if (MI) {
      if (IsHazard(*MI))
        return WaitStates;

      // Inline asm contributes no wait states in the history scan either.
      if (MI->isInlineAsm())
        continue;
    }
    WaitStates += MI ? GetNumWaitStates(*MI) : 1;

    if (WaitStates >= Limit)
      break;
  }
  return std::numeric_limits<int>::max();
}
648
649int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard,
650 int Limit) const {
651 return getWaitStatesSince(IsHazard, Limit, GetNumWaitStates: SIInstrInfo::getNumWaitStates);
652}
653
654int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
655 IsHazardFn IsHazardDef,
656 int Limit) const {
657 const SIRegisterInfo *TRI = ST.getRegisterInfo();
658
659 auto IsHazardFn = [IsHazardDef, TRI, Reg](const MachineInstr &MI) {
660 return IsHazardDef(MI) && MI.modifiesRegister(Reg, TRI);
661 };
662
663 return getWaitStatesSince(IsHazard: IsHazardFn, Limit);
664}
665
666int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
667 int Limit) const {
668 auto IsHazardFn = [IsHazard](const MachineInstr &MI) {
669 return isSSetReg(Opcode: MI.getOpcode()) && IsHazard(MI);
670 };
671
672 return getWaitStatesSince(IsHazard: IsHazardFn, Limit);
673}
674
675//===----------------------------------------------------------------------===//
676// No-op Hazard Detection
677//===----------------------------------------------------------------------===//
678
679static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV,
680 MCRegister Reg) {
681 for (MCRegUnit Unit : TRI.regunits(Reg))
682 BV.set(static_cast<unsigned>(Unit));
683}
684
685static void addRegsToSet(const SIRegisterInfo &TRI,
686 iterator_range<MachineInstr::const_mop_iterator> Ops,
687 BitVector &DefSet, BitVector &UseSet) {
688 for (const MachineOperand &Op : Ops) {
689 if (Op.isReg())
690 addRegUnits(TRI, BV&: Op.isDef() ? DefSet : UseSet, Reg: Op.getReg().asMCReg());
691 }
692}
693
// Accumulate \p MI's register defs/uses into the current clause sets.
void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) const {
  addRegsToSet(TRI, Ops: MI.operands(), DefSet&: ClauseDefs, UseSet&: ClauseUses);
}
697
698static bool breaksSMEMSoftClause(MachineInstr *MI) {
699 return !SIInstrInfo::isSMRD(MI: *MI);
700}
701
702static bool breaksVMEMSoftClause(MachineInstr *MI) {
703 return !SIInstrInfo::isVMEM(MI: *MI);
704}
705
// Returns 1 if emitting \p MEM would break a soft clause (so a non-memory
// instruction must be inserted first), 0 otherwise.
int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) const {
  // SMEM soft clause are only present on VI+, and only matter if xnack is
  // enabled.
  if (!ST.isXNACKEnabled())
    return 0;

  bool IsSMRD = TII.isSMRD(MI: *MEM);

  // Clear the per-clause def/use register sets before rebuilding them.
  resetClause();

  // A soft-clause is any group of consecutive SMEM instructions. The
  // instructions in this group may return out of order and/or may be
  // replayed (i.e. the same instruction issued more than once).
  //
  // In order to handle these situations correctly we need to make sure that
  // when a clause has more than one instruction, no instruction in the clause
  // writes to a register that is read by another instruction in the clause
  // (including itself). If we encounter this situation, we need to break the
  // clause by inserting a non SMEM instruction.

  for (MachineInstr *MI : EmittedInstrs) {
    // When we hit a non-SMEM instruction then we have passed the start of the
    // clause and we can stop.
    if (!MI)
      break;

    if (IsSMRD ? breaksSMEMSoftClause(MI) : breaksVMEMSoftClause(MI))
      break;

    addClauseInst(MI: *MI);
  }

  // No defs in the clause so far means no possible WAR/RAW conflict.
  if (ClauseDefs.none())
    return 0;

  // We need to make sure not to put loads and stores in the same clause if they
  // use the same address. For now, just start a new clause whenever we see a
  // store.
  if (MEM->mayStore())
    return 1;

  addClauseInst(MI: *MEM);

  // If the set of defs and uses intersect then we cannot add this instruction
  // to the clause, so we have a hazard.
  return ClauseDefs.anyCommon(RHS: ClauseUses) ? 1 : 0;
}
753
// Wait states needed before SMRD instruction \p SMRD: soft-clause breaks plus
// (on SI only) SGPR-read-after-VALU/SALU-write hazards.
int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) const {
  int WaitStatesNeeded = 0;

  WaitStatesNeeded = checkSoftClauseHazards(MEM: SMRD);

  // This SMRD hazard only affects SI.
  if (!ST.hasSMRDReadVALUDefHazard())
    return WaitStatesNeeded;

  // A read of an SGPR by SMRD instruction requires 4 wait states when the
  // SGPR was written by a VALU instruction.
  int SmrdSgprWaitStates = 4;
  auto IsHazardDefFn = [this](const MachineInstr &MI) {
    return TII.isVALU(MI);
  };
  auto IsBufferHazardDefFn = [this](const MachineInstr &MI) {
    return TII.isSALU(MI);
  };

  bool IsBufferSMRD = TII.isBufferSMRD(MI: *SMRD);

  // Check every register use against recent VALU (and, for buffer SMRDs,
  // SALU) defs of the same register.
  for (const MachineOperand &Use : SMRD->uses()) {
    if (!Use.isReg())
      continue;
    int WaitStatesNeededForUse =
        SmrdSgprWaitStates - getWaitStatesSinceDef(Reg: Use.getReg(), IsHazardDef: IsHazardDefFn,
                                                   Limit: SmrdSgprWaitStates);
    WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);

    // This fixes what appears to be undocumented hardware behavior in SI where
    // s_mov writing a descriptor and s_buffer_load_dword reading the descriptor
    // needs some number of nops in between. We don't know how many we need, but
    // let's use 4. This wasn't discovered before probably because the only
    // case when this happens is when we expand a 64-bit pointer into a full
    // descriptor and use s_buffer_load_dword instead of s_load_dword, which was
    // probably never encountered in the closed-source land.
    if (IsBufferSMRD) {
      int WaitStatesNeededForUse =
        SmrdSgprWaitStates - getWaitStatesSinceDef(Reg: Use.getReg(),
                                                   IsHazardDef: IsBufferHazardDefFn,
                                                   Limit: SmrdSgprWaitStates);
      WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
    }
  }

  return WaitStatesNeeded;
}
801
// Wait states needed before VMEM instruction \p VMEM: soft-clause breaks plus
// the SGPR-read-after-VALU-write hazard on subtargets that have it.
int GCNHazardRecognizer::checkVMEMHazards(MachineInstr *VMEM) const {
  if (!ST.hasVMEMReadSGPRVALUDefHazard())
    return 0;

  int WaitStatesNeeded = checkSoftClauseHazards(MEM: VMEM);

  // A read of an SGPR by a VMEM instruction requires 5 wait states when the
  // SGPR was written by a VALU Instruction.
  const int VmemSgprWaitStates = 5;
  auto IsHazardDefFn = [this](const MachineInstr &MI) {
    return TII.isVALU(MI);
  };
  for (const MachineOperand &Use : VMEM->uses()) {
    // Only scalar register uses matter here; vector register uses are skipped.
    if (!Use.isReg() || TRI.isVectorRegister(MRI: MF.getRegInfo(), Reg: Use.getReg()))
      continue;

    int WaitStatesNeededForUse =
        VmemSgprWaitStates - getWaitStatesSinceDef(Reg: Use.getReg(), IsHazardDef: IsHazardDefFn,
                                                   Limit: VmemSgprWaitStates);
    WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
  }
  return WaitStatesNeeded;
}
825
// Wait states needed before DPP instruction \p DPP: VGPR reads shortly after
// any write (2 states) and EXEC writes by VALU (5 states).
int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) const {
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();

  // Check for DPP VGPR read after VALU VGPR write and EXEC write.
  int DppVgprWaitStates = 2;
  int DppExecWaitStates = 5;
  int WaitStatesNeeded = 0;
  auto IsHazardDefFn = [TII](const MachineInstr &MI) {
    return TII->isVALU(MI);
  };

  for (const MachineOperand &Use : DPP->uses()) {
    if (!Use.isReg() || !TRI->isVGPR(MRI: MF.getRegInfo(), Reg: Use.getReg()))
      continue;
    // Any defining instruction counts for the VGPR-read hazard.
    int WaitStatesNeededForUse =
        DppVgprWaitStates - getWaitStatesSinceDef(
                                Reg: Use.getReg(),
                                IsHazardDef: [](const MachineInstr &) { return true; },
                                Limit: DppVgprWaitStates);
    WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
  }

  WaitStatesNeeded = std::max(
      a: WaitStatesNeeded,
      b: DppExecWaitStates - getWaitStatesSinceDef(Reg: AMDGPU::EXEC, IsHazardDef: IsHazardDefFn,
                                                Limit: DppExecWaitStates));

  return WaitStatesNeeded;
}
856
// Wait states needed before a v_div_fmas. Note the result may be <= 0 when
// no recent VALU VCC write exists; callers treat only positive values as a
// live hazard.
int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) const {
  const SIInstrInfo *TII = ST.getInstrInfo();

  // v_div_fmas requires 4 wait states after a write to vcc from a VALU
  // instruction.
  const int DivFMasWaitStates = 4;
  auto IsHazardDefFn = [TII](const MachineInstr &MI) {
    return TII->isVALU(MI);
  };
  int WaitStatesNeeded = getWaitStatesSinceDef(Reg: AMDGPU::VCC, IsHazardDef: IsHazardDefFn,
                                               Limit: DivFMasWaitStates);

  return DivFMasWaitStates - WaitStatesNeeded;
}
871
// Wait states needed before s_getreg when a recent s_setreg touched the same
// hardware register (2-state window). May be <= 0 when no such setreg exists.
int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) const {
  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned GetRegHWReg = getHWReg(TII, RegInstr: *GetRegInstr);

  const int GetRegWaitStates = 2;
  auto IsHazardFn = [TII, GetRegHWReg](const MachineInstr &MI) {
    return GetRegHWReg == getHWReg(TII, RegInstr: MI);
  };
  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazard: IsHazardFn, Limit: GetRegWaitStates);

  return GetRegWaitStates - WaitStatesNeeded;
}
884
// Wait states needed between two s_setregs of the same hardware register;
// the window size is subtarget-dependent. May be <= 0 when no hazard exists.
int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) const {
  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned HWReg = getHWReg(TII, RegInstr: *SetRegInstr);

  const int SetRegWaitStates = ST.getSetRegWaitStates();
  auto IsHazardFn = [TII, HWReg](const MachineInstr &MI) {
    return HWReg == getHWReg(TII, RegInstr: MI);
  };
  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazard: IsHazardFn, Limit: SetRegWaitStates);
  return SetRegWaitStates - WaitStatesNeeded;
}
896
// If store \p MI can have its store data (vdata) overwritten by the next
// VALU instruction, return the vdata operand index; otherwise return -1.
int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) const {
  // Only stores can expose this hazard.
  if (!MI.mayStore())
    return -1;

  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned Opcode = MI.getOpcode();
  const MCInstrDesc &Desc = MI.getDesc();

  int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, Name: AMDGPU::OpName::vdata);
  int VDataRCID = -1;
  if (VDataIdx != -1)
    VDataRCID = TII->getOpRegClassID(OpInfo: Desc.operands()[VDataIdx]);

  if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {
    // There is no hazard if the instruction does not use vector regs
    // (like wbinvl1)
    if (VDataIdx == -1)
      return -1;
    // For MUBUF/MTBUF instructions this hazard only exists if the
    // instruction is not using a register in the soffset field.
    const MachineOperand *SOffset =
        TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::soffset);
    // If we have no soffset operand, then assume this field has been
    // hardcoded to zero.
    if (AMDGPU::getRegBitWidth(RCID: VDataRCID) > 64 &&
        (!SOffset || !SOffset->isReg()))
      return VDataIdx;
  }

  // MIMG instructions create a hazard if they don't use a 256-bit T# and
  // the store size is greater than 8 bytes and they have more than two bits
  // of their dmask set.
  // All our MIMG definitions use a 256-bit T#, so we can skip checking for them.
  if (TII->isMIMG(MI)) {
    int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, Name: AMDGPU::OpName::srsrc);
    assert(SRsrcIdx != -1 && AMDGPU::getRegBitWidth(TII->getOpRegClassID(
                                 Desc.operands()[SRsrcIdx])) == 256);
    (void)SRsrcIdx;
  }

  if (TII->isFLAT(MI)) {
    // There is no hazard if the instruction does not use vector regs
    if (VDataIdx == -1)
      return -1;

    // FLAT stores wider than 64 bits expose the hazard.
    if (AMDGPU::getRegBitWidth(RCID: VDataRCID) > 64)
      return VDataIdx;
  }

  return -1;
}
948
// \returns the number of wait states needed before the VALU def \p Def to
// avoid overwriting the store data of a recent wide VMEM store.
int GCNHazardRecognizer::checkVALUHazardsHelper(
    const MachineOperand &Def, const MachineRegisterInfo &MRI) const {
  // Helper to check for the hazard where VMEM instructions that store more than
  // 8 bytes can have their store data overwritten by the next instruction.
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  // GFX940 requires one extra wait state for this hazard.
  const int VALUWaitStates = ST.hasGFX940Insts() ? 2 : 1;
  int WaitStatesNeeded = 0;

  // Only vector-register defs can clobber VMEM store data.
  if (!TRI->isVectorRegister(MRI, Reg: Def.getReg()))
    return WaitStatesNeeded;
  Register Reg = Def.getReg();
  // A hazard exists when a recent store's vdata operand overlaps this def.
  auto IsHazardFn = [this, Reg, TRI](const MachineInstr &MI) {
    int DataIdx = createsVALUHazard(MI);
    return DataIdx >= 0 &&
           TRI->regsOverlap(RegA: MI.getOperand(i: DataIdx).getReg(), RegB: Reg);
  };

  int WaitStatesNeededForDef =
      VALUWaitStates - getWaitStatesSince(IsHazard: IsHazardFn, Limit: VALUWaitStates);
  WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForDef);

  return WaitStatesNeeded;
}
973
/// Dest sel forwarding issue occurs if additional logic is needed to swizzle /
/// pack the computed value into correct bit position of the dest register. This
/// occurs if we have SDWA with dst_sel != DWORD or if we have op_sel with
/// dst_sel that is not aligned to the register. This function analyzes the \p
/// MI and \returns an operand with dst forwarding issue, or nullptr if
/// none exists.
static const MachineOperand *
getDstSelForwardingOperand(const MachineInstr &MI, const GCNSubtarget &ST) {
  // Only VALU instructions forward their destination this way.
  if (!SIInstrInfo::isVALU(MI))
    return nullptr;

  const SIInstrInfo *TII = ST.getInstrInfo();

  unsigned Opcode = MI.getOpcode();

  // There are three different types of instructions
  // which produce forwarded dest: 1. SDWA with dst_sel != DWORD, 2. VOP3
  // which write hi bits (e.g. op_sel[3] == 1), and 3. FP8DstSelInst
  // (instructions with dest byte sel, e.g. CVT_SR_BF8_F32) with
  // op_sel[3:2] != 0
  if (SIInstrInfo::isSDWA(MI)) {
    // Type 1: SDWA with dst_sel != DWORD
    if (auto *DstSel = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::dst_sel))
      if (DstSel->getImm() != AMDGPU::SDWA::DWORD)
        return TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::vdst);
  }

  AMDGPU::FPType IsFP4OrFP8ConvOpc = AMDGPU::getFPDstSelType(Opc: Opcode);
  if (AMDGPU::hasNamedOperand(Opcode, NamedIdx: AMDGPU::OpName::op_sel)) {
    // Type 2: VOP3 which write the hi bits
    if (TII->getNamedImmOperand(MI, OperandName: AMDGPU::OpName::src0_modifiers) &
        SISrcMods::DST_OP_SEL)
      return TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::vdst);

    // Type 3: FP8DstSelInst with op_sel[3:2] != 0)
    if (IsFP4OrFP8ConvOpc == AMDGPU::FPType::FP8 &&
        (TII->getNamedImmOperand(MI, OperandName: AMDGPU::OpName::src2_modifiers) &
         SISrcMods::OP_SEL_0))
      return TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::vdst);
  }

  // Special case: nop is required for all the opsel values for fp4 sr variant
  // cvt scale instructions
  if (IsFP4OrFP8ConvOpc == AMDGPU::FPType::FP4)
    return TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::vdst);

  return nullptr;
}
1023
1024/// Checks whether the provided \p MI "consumes" the operand with a Dest sel
1025/// fowarding issue \p Dst . We may "consume" the Dst via a standard explicit
1026/// RAW, or through irregular ways (e.g implicit RAW, certain types of WAW)
1027static bool consumesDstSelForwardingOperand(const MachineInstr *VALU,
1028 const MachineOperand *Dst,
1029 const SIRegisterInfo *TRI) {
1030 // We must consider implicit reads of the VALU. SDWA with dst_sel and
1031 // UNUSED_PRESERVE will implicitly read the result from forwarded dest,
1032 // and we must account for that hazard.
1033 // We also must account for WAW hazards. In particular, WAW with dest
1034 // preserve semantics (e.g. VOP3 with op_sel, VOP2 &&
1035 // !zeroesHigh16BitsOfDest) will read the forwarded dest for parity
1036 // check for ECC. Without accounting for this hazard, the ECC will be
1037 // wrong.
1038 // TODO: limit to RAW (including implicit reads) + problematic WAW (i.e.
1039 // complete zeroesHigh16BitsOfDest)
1040 for (auto &Operand : VALU->operands()) {
1041 if (Operand.isReg() && TRI->regsOverlap(RegA: Dst->getReg(), RegB: Operand.getReg())) {
1042 return true;
1043 }
1044 }
1045 return false;
1046}
1047
// Compute the wait states required before \p VALU to cover the family of
// VALU-related forwarding hazards supported by the current subtarget.
int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) const {
  int WaitStatesNeeded = 0;

  // Hazard: a non-TRANS VALU reading the result of a recent TRANS op.
  if (ST.hasTransForwardingHazard() && !SIInstrInfo::isTRANS(MI: *VALU)) {
    const int TransDefWaitstates = 1;

    auto IsTransDefFn = [this, VALU](const MachineInstr &MI) {
      if (!SIInstrInfo::isTRANS(MI))
        return false;
      const SIRegisterInfo *TRI = ST.getRegisterInfo();
      const SIInstrInfo *TII = ST.getInstrInfo();
      Register Def = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::vdst)->getReg();

      // Hazard only when the TRANS result feeds an explicit use of VALU.
      for (const MachineOperand &Use : VALU->explicit_uses()) {
        if (Use.isReg() && TRI->regsOverlap(RegA: Def, RegB: Use.getReg()))
          return true;
      }

      return false;
    };

    int WaitStatesNeededForDef =
        TransDefWaitstates -
        getWaitStatesSince(IsHazard: IsTransDefFn, Limit: TransDefWaitstates);
    WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForDef);
  }

  // Hazard: consuming a dest with a dst_sel / cvt-scale forwarding issue.
  if (ST.hasDstSelForwardingHazard() || ST.hasCvtScaleForwardingHazard()) {
    const int Shift16DefWaitstates = 1;

    auto IsShift16BitDefFn = [this, VALU](const MachineInstr &ProducerMI) {
      const SIRegisterInfo *TRI = ST.getRegisterInfo();
      const MachineOperand *ForwardedDst =
          getDstSelForwardingOperand(MI: ProducerMI, ST);
      if (ForwardedDst) {
        return consumesDstSelForwardingOperand(VALU, Dst: ForwardedDst, TRI);
      }

      if (ProducerMI.isInlineAsm()) {
        // Assume inline asm has dst forwarding hazard
        for (auto &Def : ProducerMI.all_defs()) {
          if (consumesDstSelForwardingOperand(VALU, Dst: &Def, TRI))
            return true;
        }
      }

      return false;
    };

    int WaitStatesNeededForDef =
        Shift16DefWaitstates -
        getWaitStatesSince(IsHazard: IsShift16BitDefFn, Limit: Shift16DefWaitstates);
    WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForDef);
  }

  // Hazards around VALU writes of SGPRs / EXEC consumed by this instruction.
  if (ST.hasVDecCoExecHazard()) {
    const int VALUWriteSGPRVALUReadWaitstates = 2;
    const int VALUWriteEXECRWLane = 4;
    const int VALUWriteVGPRReadlaneRead = 1;

    const SIRegisterInfo *TRI = ST.getRegisterInfo();
    const MachineRegisterInfo &MRI = MF.getRegInfo();
    // UseReg is rebound before each query below; the lambda reads it by ref.
    Register UseReg;
    auto IsVALUDefSGPRFn = [&UseReg, TRI](const MachineInstr &MI) {
      if (!SIInstrInfo::isVALU(MI))
        return false;
      return MI.modifiesRegister(Reg: UseReg, TRI);
    };

    // Check every SGPR explicitly used by this VALU.
    for (const MachineOperand &Use : VALU->explicit_uses()) {
      if (!Use.isReg())
        continue;

      UseReg = Use.getReg();
      if (TRI->isSGPRReg(MRI, Reg: UseReg)) {
        int WaitStatesNeededForDef =
            VALUWriteSGPRVALUReadWaitstates -
            getWaitStatesSince(IsHazard: IsVALUDefSGPRFn,
                               Limit: VALUWriteSGPRVALUReadWaitstates);
        WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForDef);
      }
    }

    // VCC may be read implicitly, so check it separately.
    if (VALU->readsRegister(Reg: AMDGPU::VCC, TRI)) {
      UseReg = AMDGPU::VCC;
      int WaitStatesNeededForDef =
          VALUWriteSGPRVALUReadWaitstates -
          getWaitStatesSince(IsHazard: IsVALUDefSGPRFn, Limit: VALUWriteSGPRVALUReadWaitstates);
      WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForDef);
    }

    // Lane-access opcodes have additional constraints on src0 and EXEC.
    switch (VALU->getOpcode()) {
    case AMDGPU::V_READLANE_B32:
    case AMDGPU::V_READFIRSTLANE_B32: {
      MachineOperand *Src = TII.getNamedOperand(MI&: *VALU, OperandName: AMDGPU::OpName::src0);
      UseReg = Src->getReg();
      int WaitStatesNeededForDef =
          VALUWriteVGPRReadlaneRead -
          getWaitStatesSince(IsHazard: IsVALUDefSGPRFn, Limit: VALUWriteVGPRReadlaneRead);
      WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForDef);
    }
    [[fallthrough]];
    case AMDGPU::V_WRITELANE_B32: {
      // All three lane opcodes are sensitive to recent VALU writes of EXEC.
      UseReg = AMDGPU::EXEC;
      int WaitStatesNeededForDef =
          VALUWriteEXECRWLane -
          getWaitStatesSince(IsHazard: IsVALUDefSGPRFn, Limit: VALUWriteEXECRWLane);
      WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForDef);
      break;
    }
    default:
      break;
    }
  }

  // This checks for the hazard where VMEM instructions that store more than
  // 8 bytes can have their store data overwritten by the next instruction.
  if (!ST.has12DWordStoreHazard())
    return WaitStatesNeeded;

  const MachineRegisterInfo &MRI = MF.getRegInfo();

  for (const MachineOperand &Def : VALU->defs()) {
    WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: checkVALUHazardsHelper(Def, MRI));
  }

  return WaitStatesNeeded;
}
1176
int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) const {
  // This checks for hazards associated with inline asm statements.
  // Since inline asms can contain just about anything, we use this
  // to call/leverage other check*Hazard routines. Note that
  // this function doesn't attempt to address all possible inline asm
  // hazards (good luck), but is a collection of what has been
  // problematic thus far.

  // see checkVALUHazards()
  if (!ST.has12DWordStoreHazard() && !ST.hasDstSelForwardingHazard() &&
      !ST.hasCvtScaleForwardingHazard())
    return 0;

  const MachineRegisterInfo &MRI = MF.getRegInfo();
  int WaitStatesNeeded = 0;

  // Treat every vector-register def of the asm as a potential clobber of a
  // recent wide VMEM store's data (the 12-dword store hazard).
  for (const MachineOperand &Op :
       llvm::drop_begin(RangeOrContainer: IA->operands(), N: InlineAsm::MIOp_FirstOperand)) {
    if (Op.isReg() && Op.isDef()) {
      if (!TRI.isVectorRegister(MRI, Reg: Op.getReg()))
        continue;

      if (ST.has12DWordStoreHazard()) {
        WaitStatesNeeded =
            std::max(a: WaitStatesNeeded, b: checkVALUHazardsHelper(Def: Op, MRI));
      }
    }
  }

  // Conservatively assume the asm consumes any recently forwarded dest.
  if (ST.hasDstSelForwardingHazard()) {
    const int Shift16DefWaitstates = 1;

    auto IsShift16BitDefFn = [this, &IA](const MachineInstr &ProducerMI) {
      const MachineOperand *Dst = getDstSelForwardingOperand(MI: ProducerMI, ST);
      // Assume inline asm reads the dst
      if (Dst)
        return IA->modifiesRegister(Reg: Dst->getReg(), TRI: &TRI) ||
               IA->readsRegister(Reg: Dst->getReg(), TRI: &TRI);

      if (ProducerMI.isInlineAsm()) {
        // If MI is inline asm, assume it has dst forwarding hazard
        for (auto &Def : ProducerMI.all_defs()) {
          if (IA->modifiesRegister(Reg: Def.getReg(), TRI: &TRI) ||
              IA->readsRegister(Reg: Def.getReg(), TRI: &TRI)) {
            return true;
          }
        }
      }

      return false;
    };

    int WaitStatesNeededForDef =
        Shift16DefWaitstates -
        getWaitStatesSince(IsHazard: IsShift16BitDefFn, Limit: Shift16DefWaitstates);
    WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForDef);
  }

  return WaitStatesNeeded;
}
1237
1238int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) const {
1239 const SIInstrInfo *TII = ST.getInstrInfo();
1240 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1241 const MachineRegisterInfo &MRI = MF.getRegInfo();
1242
1243 const MachineOperand *LaneSelectOp =
1244 TII->getNamedOperand(MI&: *RWLane, OperandName: AMDGPU::OpName::src1);
1245
1246 if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, Reg: LaneSelectOp->getReg()))
1247 return 0;
1248
1249 Register LaneSelectReg = LaneSelectOp->getReg();
1250 auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isVALU(MI); };
1251
1252 const int RWLaneWaitStates = 4;
1253 int WaitStatesSince = getWaitStatesSinceDef(Reg: LaneSelectReg, IsHazardDef: IsHazardFn,
1254 Limit: RWLaneWaitStates);
1255 return RWLaneWaitStates - WaitStatesSince;
1256}
1257
1258int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) const {
1259 if (!ST.hasRFEHazards())
1260 return 0;
1261
1262 const SIInstrInfo *TII = ST.getInstrInfo();
1263
1264 const int RFEWaitStates = 1;
1265
1266 auto IsHazardFn = [TII](const MachineInstr &MI) {
1267 return getHWReg(TII, RegInstr: MI) == AMDGPU::Hwreg::ID_TRAPSTS;
1268 };
1269 int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazard: IsHazardFn, Limit: RFEWaitStates);
1270 return RFEWaitStates - WaitStatesNeeded;
1271}
1272
1273int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) const {
1274 const SIInstrInfo *TII = ST.getInstrInfo();
1275 const int ReadM0WaitStates = 1;
1276 auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isSALU(MI); };
1277 return ReadM0WaitStates -
1278 getWaitStatesSinceDef(Reg: AMDGPU::M0, IsHazardDef: IsHazardFn, Limit: ReadM0WaitStates);
1279}
1280
1281void GCNHazardRecognizer::emitVNops(MachineBasicBlock &MBB,
1282 MachineBasicBlock::iterator InsertPt,
1283 int WaitStatesNeeded, bool IsHoisting) {
1284 const DebugLoc &DL = IsHoisting ? DebugLoc() : InsertPt->getDebugLoc();
1285 for (int I = 0; I < WaitStatesNeeded; ++I)
1286 BuildMI(BB&: MBB, I: InsertPt, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_NOP_e32));
1287}
1288
// Run every hazard fixup applicable to \p MI. The call order is significant:
// some fixups (e.g. the WMMA pair) are written to compose with each other.
void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
  fixVMEMtoScalarWriteHazards(MI);
  fixVcmpxPermlaneHazards(MI);
  fixSMEMtoVectorWriteHazards(MI);
  fixVcmpxExecWARHazard(MI);
  fixLdsBranchVmemWARHazard(MI);
  if (ST.hasLdsDirect()) {
    fixLdsDirectVALUHazard(MI);
    fixLdsDirectVMEMHazard(MI);
  }
  fixVALUPartialForwardingHazard(MI);
  fixVALUTransUseHazard(MI);
  fixVALUTransCoexecutionHazards(MI);
  fixWMMAHazards(MI); // fall-through if co-execution is enabled.
  fixWMMACoexecutionHazards(MI);
  fixShift64HighRegBug(MI);
  fixVALUMaskWriteHazard(MI);
  fixRequiredExportPriority(MI);
  // The remaining fixups only apply on subtargets with the matching quirk.
  if (ST.requiresWaitIdleBeforeGetReg())
    fixGetRegWaitIdle(MI);
  if (ST.hasDsAtomicAsyncBarrierArriveB64PipeBug())
    fixDsAtomicAsyncBarrierArriveB64(MI);
  if (ST.hasScratchBaseForwardingHazard())
    fixScratchBaseForwardingHazard(MI);
  if (ST.setRegModeNeedsVNOPs())
    fixSetRegMode(MI);
}
1316
1317static bool isVCmpXWritesExec(const SIInstrInfo &TII, const SIRegisterInfo &TRI,
1318 const MachineInstr &MI) {
1319 return (TII.isVOPC(MI) ||
1320 (MI.isCompare() && (TII.isVOP3(MI) || TII.isSDWA(MI)))) &&
1321 MI.modifiesRegister(Reg: AMDGPU::EXEC, TRI: &TRI);
1322}
1323
// Fix the hazard between a v_cmpx writing EXEC and a following v_permlane* by
// inserting a VGPR-touching V_MOV before the permlane. \returns true if a fix
// was emitted.
bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
  if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(MI: *MI))
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  auto IsHazardFn = [TII, TRI](const MachineInstr &MI) {
    return isVCmpXWritesExec(TII: *TII, TRI: *TRI, MI);
  };

  // Any "real" VALU (i.e. not a V_NOP variant) between the v_cmpx and the
  // permlane already breaks the hazard.
  auto IsExpiredFn = [](const MachineInstr &MI, int) {
    unsigned Opc = MI.getOpcode();
    return SIInstrInfo::isVALU(MI) && Opc != AMDGPU::V_NOP_e32 &&
           Opc != AMDGPU::V_NOP_e64 && Opc != AMDGPU::V_NOP_sdwa;
  };

  if (::getWaitStatesSince(IsHazard: IsHazardFn, MI, IsExpired: IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  // V_NOP will be discarded by SQ.
  // Use V_MOV_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE*
  // which is always a VGPR and available.
  auto *Src0 = TII->getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::src0);
  Register Reg = Src0->getReg();
  bool IsUndef = Src0->isUndef();
  BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(),
          MCID: TII->get(Opcode: AMDGPU::V_MOV_B32_e32))
      .addReg(RegNo: Reg, Flags: RegState::Define | getDeadRegState(B: IsUndef))
      .addReg(RegNo: Reg, Flags: IsUndef ? RegState::Undef : RegState::Kill);

  return true;
}
1357
// Fix the hazard where a SALU/SMEM write follows a VMEM/DS read of the same
// register by inserting s_waitcnt_depctr vm_vsrc(0). \returns true if a fix
// was emitted.
bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
  if (!ST.hasVMEMtoScalarWriteHazard())
    return false;
  assert(!ST.hasExtendedWaitCounts());

  if (!SIInstrInfo::isSALU(MI: *MI) && !SIInstrInfo::isSMRD(MI: *MI))
    return false;

  if (MI->getNumDefs() == 0)
    return false;

  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  // Hazard: a recent VMEM/DS instruction still uses a register this scalar
  // instruction is about to write.
  auto IsHazardFn = [TRI, MI](const MachineInstr &I) {
    if (!SIInstrInfo::isVMEM(MI: I) && !SIInstrInfo::isDS(MI: I))
      return false;

    for (const MachineOperand &Def : MI->defs()) {
      const MachineOperand *Op =
          I.findRegisterUseOperand(Reg: Def.getReg(), TRI, isKill: false);
      if (!Op)
        continue;
      return true;
    }
    return false;
  };

  // Any VALU, s_waitcnt 0, or vm_vsrc(0) wait already clears the hazard.
  auto IsExpiredFn = [](const MachineInstr &MI, int) {
    return SIInstrInfo::isVALU(MI) ||
           (MI.getOpcode() == AMDGPU::S_WAITCNT &&
            !MI.getOperand(i: 0).getImm()) ||
           (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
            AMDGPU::DepCtr::decodeFieldVmVsrc(Encoded: MI.getOperand(i: 0).getImm()) == 0);
  };

  if (::getWaitStatesSince(IsHazard: IsHazardFn, MI, IsExpired: IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(),
          MCID: TII->get(Opcode: AMDGPU::S_WAITCNT_DEPCTR))
      .addImm(Val: AMDGPU::DepCtr::encodeFieldVmVsrc(VmVsrc: 0, STI: ST));
  return true;
}
1403
// Fix the hazard where a VALU writes an SGPR that a recent SMEM instruction
// reads, by inserting "s_mov_b32 null, 0". \returns true if a fix was emitted.
bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
  if (!ST.hasSMEMtoVectorWriteHazard())
    return false;
  assert(!ST.hasExtendedWaitCounts());

  if (!SIInstrInfo::isVALU(MI: *MI))
    return false;

  // Pick the operand that holds the scalar destination: readlane-style ops
  // write their SGPR result through vdst, everything else through sdst.
  AMDGPU::OpName SDSTName;
  switch (MI->getOpcode()) {
  case AMDGPU::V_READLANE_B32:
  case AMDGPU::V_READFIRSTLANE_B32:
    SDSTName = AMDGPU::OpName::vdst;
    break;
  default:
    SDSTName = AMDGPU::OpName::sdst;
    break;
  }

  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(GPU: ST.getCPU());
  const MachineOperand *SDST = TII->getNamedOperand(MI&: *MI, OperandName: SDSTName);
  if (!SDST) {
    // No named operand: fall back to the first implicit SGPR def (e.g. VCC).
    for (const auto &MO : MI->implicit_operands()) {
      if (MO.isDef() && TRI->isSGPRClass(RC: TRI->getPhysRegBaseClass(Reg: MO.getReg()))) {
        SDST = &MO;
        break;
      }
    }
  }

  if (!SDST)
    return false;

  const Register SDSTReg = SDST->getReg();
  auto IsHazardFn = [SDSTReg, TRI](const MachineInstr &I) {
    return SIInstrInfo::isSMRD(MI: I) && I.readsRegister(Reg: SDSTReg, TRI);
  };

  auto IsExpiredFn = [TII, IV](const MachineInstr &MI, int) {
    if (TII->isSALU(MI)) {
      switch (MI.getOpcode()) {
      case AMDGPU::S_SETVSKIP:
      case AMDGPU::S_VERSION:
      case AMDGPU::S_WAITCNT_VSCNT:
      case AMDGPU::S_WAITCNT_VMCNT:
      case AMDGPU::S_WAITCNT_EXPCNT:
        // These instructions cannot mitigate the hazard.
        return false;
      case AMDGPU::S_WAITCNT_LGKMCNT:
        // Reducing lgkmcnt count to 0 always mitigates the hazard.
        return (MI.getOperand(i: 1).getImm() == 0) &&
               (MI.getOperand(i: 0).getReg() == AMDGPU::SGPR_NULL);
      case AMDGPU::S_WAITCNT: {
        const int64_t Imm = MI.getOperand(i: 0).getImm();
        AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(Version: IV, Encoded: Imm);
        // DsCnt corresponds to LGKMCnt here.
        return Decoded.get(T: AMDGPU::DS_CNT) == 0;
      }
      default:
        assert((!SIInstrInfo::isWaitcnt(MI.getOpcode()) ||
                MI.getOpcode() == AMDGPU::S_WAIT_IDLE) &&
               "unexpected wait count instruction");
        // SOPP instructions cannot mitigate the hazard.
        if (TII->isSOPP(MI))
          return false;
        // At this point the SALU can be assumed to mitigate the hazard
        // because either:
        // (a) it is independent of the at risk SMEM (breaking chain),
        // or
        // (b) it is dependent on the SMEM, in which case an appropriate
        // s_waitcnt lgkmcnt _must_ exist between it and the at risk
        // SMEM instruction.
        return true;
      }
    }
    return false;
  };

  if (::getWaitStatesSince(IsHazard: IsHazardFn, MI, IsExpired: IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  // Insert a harmless SALU (write to the null register) to break the chain.
  BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(),
          MCID: TII->get(Opcode: AMDGPU::S_MOV_B32), DestReg: AMDGPU::SGPR_NULL)
      .addImm(Val: 0);
  return true;
}
1493
// Fix the WAR hazard where a v_cmpx writes EXEC while a recent non-VALU
// instruction still reads it, by inserting s_waitcnt_depctr sa_sdst(0).
// \returns true if a fix was emitted.
bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
  if (!ST.hasVcmpxExecWARHazard())
    return false;
  assert(!ST.hasExtendedWaitCounts());

  if (!SIInstrInfo::isVALU(MI: *MI))
    return false;

  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  if (!MI->modifiesRegister(Reg: AMDGPU::EXEC, TRI))
    return false;

  // Hazard: a non-VALU reader of EXEC is still in flight.
  auto IsHazardFn = [TRI](const MachineInstr &I) {
    if (SIInstrInfo::isVALU(MI: I))
      return false;
    return I.readsRegister(Reg: AMDGPU::EXEC, TRI);
  };

  const SIInstrInfo *TII = ST.getInstrInfo();
  // A VALU writing any SGPR, or an sa_sdst(0) wait, clears the hazard.
  auto IsExpiredFn = [TII, TRI](const MachineInstr &MI, int) {
    if (SIInstrInfo::isVALU(MI)) {
      if (TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::sdst))
        return true;
      for (auto MO : MI.implicit_operands())
        if (MO.isDef() && TRI->isSGPRClass(RC: TRI->getPhysRegBaseClass(Reg: MO.getReg())))
          return true;
    }
    if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
        AMDGPU::DepCtr::decodeFieldSaSdst(Encoded: MI.getOperand(i: 0).getImm()) == 0)
      return true;
    return false;
  };

  if (::getWaitStatesSince(IsHazard: IsHazardFn, MI, IsExpired: IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(),
          MCID: TII->get(Opcode: AMDGPU::S_WAITCNT_DEPCTR))
      .addImm(Val: AMDGPU::DepCtr::encodeFieldSaSdst(SaSdst: 0, STI: ST));
  return true;
}
1536
1537static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
1538 const GCNSubtarget &ST) {
1539 if (!ST.hasLdsBranchVmemWARHazard())
1540 return false;
1541
1542 // Check if the necessary condition for the hazard is met: both LDS and VMEM
1543 // instructions need to appear in the same function.
1544 bool HasLds = false;
1545 bool HasVmem = false;
1546 for (auto &MBB : MF) {
1547 for (auto &MI : MBB) {
1548 HasLds |= SIInstrInfo::isDS(MI) || SIInstrInfo::isLDSDMA(MI);
1549 HasVmem |= SIInstrInfo::isVMEM(MI);
1550 if (HasLds && HasVmem)
1551 return true;
1552 }
1553 }
1554 return false;
1555}
1556
1557static bool isStoreCountWaitZero(const MachineInstr &I) {
1558 return I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
1559 I.getOperand(i: 0).getReg() == AMDGPU::SGPR_NULL &&
1560 !I.getOperand(i: 1).getImm();
1561}
1562
// Fix the LDS/VMEM WAR hazard across branches by inserting
// "s_waitcnt_vscnt null, 0" before \p MI. \returns true if a fix was emitted.
bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
  if (!RunLdsBranchVmemWARHazardFixup)
    return false;

  assert(ST.hasLdsBranchVmemWARHazard());
  assert(!ST.hasExtendedWaitCounts());

  // Classify: 1 = LDS access, 2 = VMEM access, 0 = neither. The hazard needs
  // one kind followed (across a branch) by the other.
  auto IsHazardInst = [](const MachineInstr &MI) {
    if (SIInstrInfo::isDS(MI) || SIInstrInfo::isLDSDMA(MI))
      return 1;
    if (SIInstrInfo::isVMEM(MI))
      return 2;
    return 0;
  };

  auto InstType = IsHazardInst(*MI);
  if (!InstType)
    return false;

  auto IsExpiredFn = [&IsHazardInst](const MachineInstr &I, int) {
    return IsHazardInst(I) || isStoreCountWaitZero(I);
  };

  // Outer search: find a branch; inner search (from the branch) looks for a
  // memory access of the *other* kind on the far side.
  auto IsHazardFn = [InstType, &IsHazardInst](const MachineInstr &I) {
    if (!I.isBranch())
      return false;

    auto IsHazardFn = [InstType, IsHazardInst](const MachineInstr &I) {
      auto InstType2 = IsHazardInst(I);
      return InstType2 && InstType != InstType2;
    };

    auto IsExpiredFn = [InstType, &IsHazardInst](const MachineInstr &I, int) {
      auto InstType2 = IsHazardInst(I);
      if (InstType == InstType2)
        return true;

      return isStoreCountWaitZero(I);
    };

    return ::getWaitStatesSince(IsHazard: IsHazardFn, MI: &I, IsExpired: IsExpiredFn) !=
           std::numeric_limits<int>::max();
  };

  if (::getWaitStatesSince(IsHazard: IsHazardFn, MI, IsExpired: IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(),
          MCID: TII->get(Opcode: AMDGPU::S_WAITCNT_VSCNT))
      .addReg(RegNo: AMDGPU::SGPR_NULL, Flags: RegState::Undef)
      .addImm(Val: 0);

  return true;
}
1619
// Compute and encode the waitvdst count on an LDSDIR instruction so that it
// waits for any VALU that reads or writes its destination VGPR.
// \returns true whenever \p MI is an LDSDIR (the waitvdst field is always set).
bool GCNHazardRecognizer::fixLdsDirectVALUHazard(MachineInstr *MI) {
  if (!SIInstrInfo::isLDSDIR(MI: *MI))
    return false;

  // waitvdst saturates at 15, meaning "no wait needed".
  const int NoHazardWaitStates = 15;
  const MachineOperand *VDST = TII.getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::vdst);
  const Register VDSTReg = VDST->getReg();

  bool VisitedTrans = false;
  auto IsHazardFn = [this, VDSTReg, &VisitedTrans](const MachineInstr &I) {
    if (!SIInstrInfo::isVALU(MI: I))
      return false;
    VisitedTrans = VisitedTrans || SIInstrInfo::isTRANS(MI: I);
    // Cover both WAR and WAW
    return I.readsRegister(Reg: VDSTReg, TRI: &TRI) || I.modifiesRegister(Reg: VDSTReg, TRI: &TRI);
  };
  auto IsExpiredFn = [&](const MachineInstr &I, int WaitStates) {
    if (WaitStates >= NoHazardWaitStates)
      return true;
    // Instructions which cause va_vdst==0 expire hazard
    return SIInstrInfo::isVMEM(MI: I) || SIInstrInfo::isDS(MI: I) ||
           SIInstrInfo::isEXP(MI: I);
  };
  // Only VALU instructions advance the va_vdst counter.
  auto GetWaitStatesFn = [](const MachineInstr &MI) {
    return SIInstrInfo::isVALU(MI) ? 1 : 0;
  };

  DenseSet<const MachineBasicBlock *> Visited;
  auto Count = ::getWaitStatesSince(IsHazard: IsHazardFn, MBB: MI->getParent(),
                                    I: std::next(x: MI->getReverseIterator()), WaitStates: 0,
                                    IsExpired: IsExpiredFn, Visited, GetNumWaitStates: GetWaitStatesFn);

  // Transcendentals can execute in parallel to other VALUs.
  // This makes va_vdst count unusable with a mixture of VALU and TRANS.
  if (VisitedTrans)
    Count = 0;

  MachineOperand *WaitVdstOp =
      TII.getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::waitvdst);
  WaitVdstOp->setImm(std::min(a: Count, b: NoHazardWaitStates));

  return true;
}
1663
// Fix the hazard between an LDSDIR destination and an in-flight VMEM/DS
// access of the same VGPR, either via the waitvsrc field (when supported) or
// by inserting s_waitcnt_depctr vm_vsrc(0). \returns true if a fix was made.
bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(MachineInstr *MI) {
  if (!SIInstrInfo::isLDSDIR(MI: *MI))
    return false;

  const MachineOperand *VDST = TII.getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::vdst);
  const Register VDSTReg = VDST->getReg();

  // Hazard: a VMEM/DS instruction still reads or writes the LDSDIR dest.
  auto IsHazardFn = [this, VDSTReg](const MachineInstr &I) {
    if (!SIInstrInfo::isVMEM(MI: I) && !SIInstrInfo::isDS(MI: I))
      return false;
    return I.readsRegister(Reg: VDSTReg, TRI: &TRI) || I.modifiesRegister(Reg: VDSTReg, TRI: &TRI);
  };
  bool LdsdirCanWait = ST.hasLdsWaitVMSRC();
  // TODO: On GFX12 the hazard should expire on S_WAIT_LOADCNT/SAMPLECNT/BVHCNT
  // according to the type of VMEM instruction.
  auto IsExpiredFn = [this, LdsdirCanWait](const MachineInstr &I, int) {
    return SIInstrInfo::isVALU(MI: I) || SIInstrInfo::isEXP(MI: I) ||
           (I.getOpcode() == AMDGPU::S_WAITCNT && !I.getOperand(i: 0).getImm()) ||
           (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
            AMDGPU::DepCtr::decodeFieldVmVsrc(Encoded: I.getOperand(i: 0).getImm()) == 0) ||
           (LdsdirCanWait && SIInstrInfo::isLDSDIR(MI: I) &&
            !TII.getNamedOperand(MI: I, OperandName: AMDGPU::OpName::waitvsrc)->getImm());
  };

  if (::getWaitStatesSince(IsHazard: IsHazardFn, MI, IsExpired: IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  if (LdsdirCanWait) {
    // Newer subtargets encode the wait directly in the LDSDIR instruction.
    TII.getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::waitvsrc)->setImm(0);
  } else {
    BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(),
            MCID: TII.get(Opcode: AMDGPU::S_WAITCNT_DEPCTR))
        .addImm(Val: AMDGPU::DepCtr::encodeFieldVmVsrc(VmVsrc: 0, STI: ST));
  }

  return true;
}
1702
1703bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(MachineInstr *MI) {
1704 if (!ST.hasVALUPartialForwardingHazard())
1705 return false;
1706 assert(!ST.hasExtendedWaitCounts());
1707
1708 if (!ST.isWave64() || !SIInstrInfo::isVALU(MI: *MI))
1709 return false;
1710
1711 SmallSetVector<Register, 4> SrcVGPRs;
1712
1713 for (const MachineOperand &Use : MI->explicit_uses()) {
1714 if (Use.isReg() && TRI.isVGPR(MRI: MF.getRegInfo(), Reg: Use.getReg()))
1715 SrcVGPRs.insert(X: Use.getReg());
1716 }
1717
1718 // Only applies with >= 2 unique VGPR sources
1719 if (SrcVGPRs.size() <= 1)
1720 return false;
1721
1722 // Look for the following pattern:
1723 // Va <- VALU [PreExecPos]
1724 // intv1
1725 // Exec <- SALU [ExecPos]
1726 // intv2
1727 // Vb <- VALU [PostExecPos]
1728 // intv3
1729 // MI Va, Vb (WaitState = 0)
1730 //
1731 // Where:
1732 // intv1 + intv2 <= 2 VALUs
1733 // intv3 <= 4 VALUs
1734 //
1735 // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.
1736
1737 const int Intv1plus2MaxVALUs = 2;
1738 const int Intv3MaxVALUs = 4;
1739 const int IntvMaxVALUs = 6;
1740 const int NoHazardVALUWaitStates = IntvMaxVALUs + 2;
1741
1742 struct StateType {
1743 SmallDenseMap<Register, int, 4> DefPos;
1744 int ExecPos = std::numeric_limits<int>::max();
1745 int VALUs = 0;
1746
1747 static unsigned getHashValue(const StateType &State) {
1748 return hash_combine(args: State.ExecPos, args: State.VALUs,
1749 args: hash_combine_range(R: State.DefPos));
1750 }
1751 static bool isEqual(const StateType &LHS, const StateType &RHS) {
1752 return LHS.DefPos == RHS.DefPos && LHS.ExecPos == RHS.ExecPos &&
1753 LHS.VALUs == RHS.VALUs;
1754 }
1755 };
1756
1757 StateType State;
1758
1759 // This overloads expiry testing with all the hazard detection
1760 auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
1761 // Too many VALU states have passed
1762 if (State.VALUs > NoHazardVALUWaitStates)
1763 return HazardExpired;
1764
1765 // Instructions which cause va_vdst==0 expire hazard
1766 if (SIInstrInfo::isVMEM(MI: I) || SIInstrInfo::isDS(MI: I) ||
1767 SIInstrInfo::isEXP(MI: I) ||
1768 (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1769 AMDGPU::DepCtr::decodeFieldVaVdst(Encoded: I.getOperand(i: 0).getImm()) == 0))
1770 return HazardExpired;
1771
1772 // Track registers writes
1773 bool Changed = false;
1774 if (SIInstrInfo::isVALU(MI: I)) {
1775 for (Register Src : SrcVGPRs) {
1776 if (!State.DefPos.count(Val: Src) && I.modifiesRegister(Reg: Src, TRI: &TRI)) {
1777 State.DefPos[Src] = State.VALUs;
1778 Changed = true;
1779 }
1780 }
1781 } else if (SIInstrInfo::isSALU(MI: I)) {
1782 if (State.ExecPos == std::numeric_limits<int>::max()) {
1783 if (!State.DefPos.empty() && I.modifiesRegister(Reg: AMDGPU::EXEC, TRI: &TRI)) {
1784 State.ExecPos = State.VALUs;
1785 Changed = true;
1786 }
1787 }
1788 }
1789
1790 // Early expiration: too many VALUs in intv3
1791 if (State.VALUs > Intv3MaxVALUs && State.DefPos.empty())
1792 return HazardExpired;
1793
1794 // Only evaluate state if something changed
1795 if (!Changed)
1796 return NoHazardFound;
1797
1798 // Determine positions of VALUs pre/post exec change
1799 if (State.ExecPos == std::numeric_limits<int>::max())
1800 return NoHazardFound;
1801
1802 int PreExecPos = std::numeric_limits<int>::max();
1803 int PostExecPos = std::numeric_limits<int>::max();
1804
1805 for (auto Entry : State.DefPos) {
1806 int DefVALUs = Entry.second;
1807 if (DefVALUs != std::numeric_limits<int>::max()) {
1808 if (DefVALUs >= State.ExecPos)
1809 PreExecPos = std::min(a: PreExecPos, b: DefVALUs);
1810 else
1811 PostExecPos = std::min(a: PostExecPos, b: DefVALUs);
1812 }
1813 }
1814
1815 // Need a VALUs post exec change
1816 if (PostExecPos == std::numeric_limits<int>::max())
1817 return NoHazardFound;
1818
1819 // Too many VALUs in intv3?
1820 int Intv3VALUs = PostExecPos;
1821 if (Intv3VALUs > Intv3MaxVALUs)
1822 return HazardExpired;
1823
1824 // Too many VALUs in intv2?
1825 int Intv2VALUs = (State.ExecPos - PostExecPos) - 1;
1826 if (Intv2VALUs > Intv1plus2MaxVALUs)
1827 return HazardExpired;
1828
1829 // Need a VALUs pre exec change
1830 if (PreExecPos == std::numeric_limits<int>::max())
1831 return NoHazardFound;
1832
1833 // Too many VALUs in intv1?
1834 int Intv1VALUs = PreExecPos - State.ExecPos;
1835 if (Intv1VALUs > Intv1plus2MaxVALUs)
1836 return HazardExpired;
1837
1838 // Too many VALUs in intv1 + intv2
1839 if (Intv1VALUs + Intv2VALUs > Intv1plus2MaxVALUs)
1840 return HazardExpired;
1841
1842 return HazardFound;
1843 };
1844 auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
1845 if (SIInstrInfo::isVALU(MI))
1846 State.VALUs += 1;
1847 };
1848
1849 if (!hasHazard<StateType>(InitialState: State, IsHazard: IsHazardFn, UpdateState: UpdateStateFn, InitialMBB: MI->getParent(),
1850 InitialI: std::next(x: MI->getReverseIterator())))
1851 return false;
1852
1853 BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(),
1854 MCID: TII.get(Opcode: AMDGPU::S_WAITCNT_DEPCTR))
1855 .addImm(Val: AMDGPU::DepCtr::encodeFieldVaVdst(VaVdst: 0, STI: ST));
1856
1857 return true;
1858}
1859
// Mitigate the TRANS->VALU use hazard (ST.hasVALUTransUseHazard()): a VALU
// that reads a VGPR recently written by a TRANS instruction must wait on the
// va_vdst counter. If the hazard window is still open, insert
// "s_waitcnt_depctr va_vdst(0)" immediately before MI.
// Returns true iff a waitcnt was inserted.
bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) {
  if (!ST.hasVALUTransUseHazard())
    return false;
  assert(!ST.hasExtendedWaitCounts());

  // Only VALU consumers are affected.
  if (!SIInstrInfo::isVALU(MI: *MI))
    return false;

  // Collect the VGPRs MI reads; a hazard exists only if a recent TRANS wrote
  // one of them.
  SmallSet<Register, 4> SrcVGPRs;

  for (const MachineOperand &Use : MI->explicit_uses()) {
    if (Use.isReg() && TRI.isVGPR(MRI: MF.getRegInfo(), Reg: Use.getReg()))
      SrcVGPRs.insert(V: Use.getReg());
  }

  // Look for the following pattern:
  //   Va <- TRANS VALU
  //   intv
  //   MI Va (WaitState = 0)
  //
  // Where:
  //   intv <= 5 VALUs / 1 TRANS
  //
  // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.

  const int IntvMaxVALUs = 5;
  const int IntvMaxTRANS = 1;

  // Backward-scan state: how many VALU/TRANS instructions have been seen
  // between MI and the current scan position.
  struct StateType {
    int VALUs = 0;
    int TRANS = 0;

    static unsigned getHashValue(const StateType &State) {
      return hash_combine(args: State.VALUs, args: State.TRANS);
    }
    static bool isEqual(const StateType &LHS, const StateType &RHS) {
      return LHS.VALUs == RHS.VALUs && LHS.TRANS == RHS.TRANS;
    }
  };

  StateType State;

  // This overloads expiry testing with all the hazard detection
  auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
    // Too many VALU states have passed
    if (State.VALUs > IntvMaxVALUs || State.TRANS > IntvMaxTRANS)
      return HazardExpired;

    // Instructions which cause va_vdst==0 expire hazard
    if (SIInstrInfo::isVMEM(MI: I) || SIInstrInfo::isDS(MI: I) ||
        SIInstrInfo::isEXP(MI: I) ||
        (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
         AMDGPU::DepCtr::decodeFieldVaVdst(Encoded: I.getOperand(i: 0).getImm()) == 0))
      return HazardExpired;

    // Track registers writes
    if (SIInstrInfo::isTRANS(MI: I)) {
      for (Register Src : SrcVGPRs) {
        if (I.modifiesRegister(Reg: Src, TRI: &TRI)) {
          return HazardFound;
        }
      }
    }

    return NoHazardFound;
  };
  auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
    if (SIInstrInfo::isVALU(MI))
      State.VALUs += 1;
    if (SIInstrInfo::isTRANS(MI))
      State.TRANS += 1;
  };

  if (!hasHazard<StateType>(InitialState: State, IsHazard: IsHazardFn, UpdateState: UpdateStateFn, InitialMBB: MI->getParent(),
                            InitialI: std::next(x: MI->getReverseIterator())))
    return false;

  // Hazard is observed - insert a wait on va_dst counter to ensure hazard is
  // avoided.
  BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(),
          MCID: TII.get(Opcode: AMDGPU::S_WAITCNT_DEPCTR))
      .addImm(Val: AMDGPU::DepCtr::encodeFieldVaVdst(VaVdst: 0, STI: ST));

  return true;
}
1945
// GFX1250 VALU/TRANS co-execution hazard: a VALU and the most recent TRANS
// can co-execute, so a register dependence between them (RAW or WAR) requires
// a V_NOP in between. Returns true iff a V_NOP was inserted before MI.
bool GCNHazardRecognizer::fixVALUTransCoexecutionHazards(MachineInstr *MI) {
  if (!ST.hasGFX1250Insts() || // Coexecution disabled.
      !SIInstrInfo::isVALU(MI: *MI) || SIInstrInfo::isTRANS(MI: *MI))
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  auto IsTransHazardFn = [MI, TII, TRI](const MachineInstr &I) {
    if (!SIInstrInfo::isTRANS(MI: I))
      return false;

    // RAW: Trans(I) writes, VALU(MI) reads.
    Register TransDef = TII->getNamedOperand(MI: I, OperandName: AMDGPU::OpName::vdst)->getReg();
    for (const MachineOperand &ValuUse : MI->explicit_uses()) {
      if (ValuUse.isReg() && TRI->regsOverlap(RegA: TransDef, RegB: ValuUse.getReg()))
        return true;
    }

    // No VALU destination means there is nothing for a WAR dependence.
    auto *ValuDst = TII->getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::vdst);
    if (!ValuDst || !ValuDst->isReg())
      return false;

    // WAR: Trans(I) reads, VALU(MI) writes.
    Register ValuDef = ValuDst->getReg();
    for (const MachineOperand &TransUse : I.explicit_uses()) {
      if (TransUse.isReg() && TRI->regsOverlap(RegA: ValuDef, RegB: TransUse.getReg()))
        return true;
    }

    return false;
  };

  // Any intervening VALU closes the co-execution window.
  auto IsExpiredFn = [](const MachineInstr &I, int) {
    return SIInstrInfo::isVALU(MI: I);
  };

  // getWaitStatesSince returns INT_MAX when the search expired (a VALU was
  // seen) before any hazardous TRANS was found.
  const int HasVALU = std::numeric_limits<int>::max();
  if (::getWaitStatesSince(IsHazard: IsTransHazardFn, MI, IsExpired: IsExpiredFn) == HasVALU)
    return false;

  BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(), MCID: TII->get(Opcode: AMDGPU::V_NOP_e32));
  return true;
}
1990
// WMMA->WMMA hazard: insert a V_NOP when the current WMMA/SWMMAC reads a
// register (matrix A, matrix B, or on GFX12+ the SWMMAC index) that the
// previous WMMA/SWMMAC writes. Returns true iff a V_NOP was inserted.
bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) {
  if (!SIInstrInfo::isWMMA(MI: *MI) && !SIInstrInfo::isSWMMAC(MI: *MI))
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  auto IsHazardFn = [MI, TII, TRI, this](const MachineInstr &I) {
    if (!SIInstrInfo::isWMMA(MI: I) && !SIInstrInfo::isSWMMAC(MI: I))
      return false;

    // Src0(matrix A) or Src1(matrix B) of the current wmma instruction overlaps
    // with the dest(matrix D) of the previous wmma.
    const Register CurSrc0Reg =
        TII->getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::src0)->getReg();
    const Register CurSrc1Reg =
        TII->getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::src1)->getReg();

    const Register PrevDstReg =
        TII->getNamedOperand(MI: I, OperandName: AMDGPU::OpName::vdst)->getReg();

    if (TRI->regsOverlap(RegA: PrevDstReg, RegB: CurSrc0Reg) ||
        TRI->regsOverlap(RegA: PrevDstReg, RegB: CurSrc1Reg)) {
      return true;
    }

    // GFX12+ allows overlap of matrix C with PrevDstReg (hardware will stall)
    // but Index can't overlap with PrevDstReg.
    if (AMDGPU::isGFX12Plus(STI: ST)) {
      if (SIInstrInfo::isSWMMAC(MI: *MI)) {
        const Register CurIndex =
            TII->getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::src2)->getReg();
        if (TRI->regsOverlap(RegA: PrevDstReg, RegB: CurIndex))
          return true;
      }
      return false;
    }

    return false;
  };

  // Any intervening VALU resolves the hazard, so the backward search stops at
  // the first VALU.
  auto IsExpiredFn = [](const MachineInstr &I, int) {
    return SIInstrInfo::isVALU(MI: I);
  };

  // INT_MAX from getWaitStatesSince means no hazardous WMMA was found before
  // the search expired.
  if (::getWaitStatesSince(IsHazard: IsHazardFn, MI, IsExpired: IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(), MCID: TII->get(Opcode: AMDGPU::V_NOP_e32));

  return true;
}
2044
2045static bool isCoexecutableVALUInst(const MachineInstr &MI) {
2046 return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isWMMA(MI) &&
2047 !SIInstrInfo::isSWMMAC(MI) && !SIInstrInfo::isLDSDMA(MI);
2048}
2049
2050static bool IsWMMAHazardInstInCategory(const MachineInstr &MI,
2051 const SIInstrInfo *TII, unsigned Latency,
2052 unsigned Category) {
2053 assert(TII->isXDLWMMA(MI) && (Latency == 8 || Latency == 16) &&
2054 "Handle me if the xdl wmma instruction latency changes");
2055
2056 switch (Category) {
2057 case 0: // Dense WMMA Instructions:
2058 // WMMA_*F16, WMMA_*BF16
2059 // WMMA_*FP8FP8
2060 // WMMA_*FP8BF8
2061 // WMMA_*BF8FP8
2062 // WMMA_*BF8BF8
2063 // WMMA_*F8F6F4 if SRCA & SRCB != F8
2064 return Latency == 8 && SIInstrInfo::isWMMA(MI);
2065
2066 case 1: // Dense WMMA Instructions:
2067 // WMMA_IU8
2068 // WMMA_IU4
2069 // WMMA_*F8F6F4 if SRCA OR SRCB == F8
2070 return Latency == 16 && SIInstrInfo::isWMMA(MI);
2071
2072 case 2: // Dense SWMMAC Instructions
2073 // SWMMAC_*F16, SWMMAC_*BF16,
2074 // SWMMAC_*FP8FP8
2075 // SWMMAC_*BF8FP8
2076 // SWMMAC_*FP8BF8
2077 // SWMMAC_*BF8BF8
2078 return Latency == 8 && SIInstrInfo::isSWMMAC(MI);
2079
2080 case 3: // Sparse WMMA Instructions:
2081 // SWMMAC_IU8
2082 // SWMMAC_IU4
2083 return Latency == 16 && SIInstrInfo::isSWMMAC(MI);
2084 default:
2085 break;
2086 } // end switch.
2087
2088 return false;
2089}
2090
// Compute how many VALU-equivalents (e.g. V_NOPs) are still needed between an
// earlier XDL WMMA and MI (either another XDL WMMA or a co-executable VALU)
// to cover the GFX1250 WMMA co-execution hazard. Returns <= 0 when no
// additional padding is required.
int GCNHazardRecognizer::checkWMMACoexecutionHazards(MachineInstr *MI) const {
  if (!ST.hasGFX1250Insts())
    return 0;

  const SIInstrInfo *TII = ST.getInstrInfo();
  if (!TII->isXDLWMMA(MI: *MI) && !isCoexecutableVALUInst(MI: *MI))
    return 0;

  // WaitStates here is the number of V_NOPs or unrelated VALU instructions must
  // be in between the first WMMA and the second instruction to cover the hazard
  // (WMMAWaitStates if the second is also a WMMA, VALUWaitStates if the second
  // is a VALU). Refer to SPG 4.6.12.1. "Requirements for WMMA data hazards" for
  // numbers, which depends on the category of the first WMMA.
  const int WMMAWaitStates[] = {5, 9, 3, 5};
  const int VALUWaitStates[] = {4, 8, 2, 4};
  unsigned Category = 0;

  // Hazard: earlier XDL WMMA of the current Category with a register overlap
  // against MI (WMMA consumer variant).
  auto IsWMMAHazardFn = [MI, TII, &Category, this](const MachineInstr &I) {
    if (!TII->isXDLWMMA(MI: I))
      return false;

    unsigned Latency = TSchedModel.computeInstrLatency(MI: &I);
    if (!IsWMMAHazardInstInCategory(MI: I, TII, Latency, Category))
      return false;

    return hasWMMAToWMMARegOverlap(WMMA: I, MI: *MI);
  };

  // Same as above, but for a co-executable VALU consumer.
  auto IsVALUHazardFn = [MI, TII, &Category, this](const MachineInstr &I) {
    if (!TII->isXDLWMMA(MI: I))
      return false;

    unsigned Latency = TSchedModel.computeInstrLatency(MI: &I);
    if (!IsWMMAHazardInstInCategory(MI: I, TII, Latency, Category))
      return false;

    return hasWMMAToVALURegOverlap(WMMA: I, MI: *MI);
  };

  int Limit = 0;

  // Only VALU instructions consume wait states during the backward scan.
  auto GetWaitStatesFn = [](const MachineInstr &I) {
    return SIInstrInfo::isVALU(MI: I) ? 1 : 0;
  };

  int WaitStatesNeeded = -1;
  if (TII->isXDLWMMA(MI: *MI)) {
    for (Category = 0; WaitStatesNeeded < 0 && Category < 4; Category++) {
      Limit = WMMAWaitStates[Category]; // for IsExpiredFn.
      // 'getWaitStatesSince' returns the number of VALUs in between if hazard
      // exists, and INT_MAX if there is no hazard. As a result, a negative
      // WaitStatesNeeded here means no hazard, and we will continue to search
      // for other categories.
      WaitStatesNeeded =
          Limit - getWaitStatesSince(IsHazard: IsWMMAHazardFn, Limit, GetNumWaitStates: GetWaitStatesFn);
    }
  } else { // Must be a co-executable VALU.
    for (Category = 0; WaitStatesNeeded < 0 && Category < 4; Category++) {
      Limit = VALUWaitStates[Category]; // for IsExpiredFn.
      // 'getWaitStatesSince' returns the number of VALUs in between if hazard
      // exists, and INT_MAX if there is no hazard. As a result, a negative
      // WaitStatesNeeded here means no hazard, and we will continue to search
      // for other categories.
      WaitStatesNeeded =
          Limit - getWaitStatesSince(IsHazard: IsVALUHazardFn, Limit, GetNumWaitStates: GetWaitStatesFn);
    }
  }

  return WaitStatesNeeded;
}
2161
2162bool GCNHazardRecognizer::hasWMMAToWMMARegOverlap(
2163 const MachineInstr &WMMA, const MachineInstr &MI) const {
2164 Register D0 = TII.getNamedOperand(MI: WMMA, OperandName: AMDGPU::OpName::vdst)->getReg();
2165 Register A1 = TII.getNamedOperand(MI, OperandName: AMDGPU::OpName::src0)->getReg();
2166 Register B1 = TII.getNamedOperand(MI, OperandName: AMDGPU::OpName::src1)->getReg();
2167
2168 // WMMA0 writes (D0), WMMA1 reads (A1/B1/Idx1).
2169 if (TRI.regsOverlap(RegA: D0, RegB: A1) || TRI.regsOverlap(RegA: D0, RegB: B1))
2170 return true;
2171
2172 if (SIInstrInfo::isSWMMAC(MI)) {
2173 Register Idx1 = TII.getNamedOperand(MI, OperandName: AMDGPU::OpName::src2)->getReg();
2174 if (TRI.regsOverlap(RegA: D0, RegB: Idx1))
2175 return true;
2176 }
2177 return false;
2178}
2179
2180bool GCNHazardRecognizer::hasWMMAToVALURegOverlap(
2181 const MachineInstr &WMMA, const MachineInstr &MI) const {
2182 // WMMA writes, VALU reads.
2183 Register D0 = TII.getNamedOperand(MI: WMMA, OperandName: AMDGPU::OpName::vdst)->getReg();
2184 for (const MachineOperand &ValuUse : MI.explicit_uses()) {
2185 if (ValuUse.isReg() && TRI.regsOverlap(RegA: D0, RegB: ValuUse.getReg()))
2186 return true;
2187 }
2188
2189 // WMMA reads or writes, VALU writes.
2190 Register A0 = TII.getNamedOperand(MI: WMMA, OperandName: AMDGPU::OpName::src0)->getReg();
2191 Register B0 = TII.getNamedOperand(MI: WMMA, OperandName: AMDGPU::OpName::src1)->getReg();
2192 SmallVector<Register, 4> WMMARegs({D0, A0, B0});
2193
2194 if (SIInstrInfo::isSWMMAC(MI: WMMA)) {
2195 Register Idx0 = TII.getNamedOperand(MI: WMMA, OperandName: AMDGPU::OpName::src2)->getReg();
2196 WMMARegs.push_back(Elt: Idx0);
2197 }
2198
2199 for (const MachineOperand &ValuDef : MI.defs()) {
2200 Register VDstReg = ValuDef.getReg();
2201 for (Register WMMAReg : WMMARegs) {
2202 if (TRI.regsOverlap(RegA: VDstReg, RegB: WMMAReg))
2203 return true;
2204 }
2205 }
2206 return false;
2207}
2208
2209bool GCNHazardRecognizer::isCoexecutionHazardFor(const MachineInstr &I,
2210 const MachineInstr &MI) const {
2211 // I is the potential WMMA hazard source, MI is the instruction being checked
2212 // for hazard.
2213 if (!TII.isXDLWMMA(MI: I))
2214 return false;
2215
2216 // Dispatch based on MI type
2217 if (TII.isXDLWMMA(MI))
2218 return hasWMMAToWMMARegOverlap(WMMA: I, MI);
2219 if (isCoexecutableVALUInst(MI))
2220 return hasWMMAToVALURegOverlap(WMMA: I, MI);
2221
2222 return false;
2223}
2224
2225bool GCNHazardRecognizer::hasWMMAHazardInLoop(MachineLoop *L, MachineInstr *MI,
2226 bool IncludeSubloops) {
2227 // Scan loop for any WMMA that hazards MI.
2228 // TODO: Avoid full loop scan when WMMA is beyond VALU distance.
2229 for (MachineBasicBlock *MBB : L->getBlocks()) {
2230 if (!IncludeSubloops && MLI->getLoopFor(BB: MBB) != L)
2231 continue;
2232 for (MachineInstr &I : *MBB) {
2233 if (&I == MI)
2234 continue;
2235 if (isCoexecutionHazardFor(I, MI: *MI))
2236 return true;
2237 }
2238 }
2239 return false;
2240}
2241
// Try to pay the WMMA hazard V_NOP cost once, in a loop preheader, instead of
// on every iteration. This is only legal when no WMMA inside the loop (nest)
// can re-trigger the hazard for MI. Returns true iff the V_NOPs were emitted
// in a preheader; false means the caller must emit them inline.
bool GCNHazardRecognizer::tryHoistWMMAVnopsFromLoop(MachineInstr *MI,
                                                    int WaitStatesNeeded) {
  // No loop info available; cannot reason about loops.
  if (!MLI)
    return false;

  MachineLoop *L = MLI->getLoopFor(BB: MI->getParent());
  if (!L) {
    ++NumWMMAHoistingBailed;
    return false;
  }

  // If innermost loop has WMMA hazard, we can't hoist at all
  if (hasWMMAHazardInLoop(L, MI)) {
    ++NumWMMAHoistingBailed;
    return false;
  }

  // Find outermost loop with no internal hazard
  MachineLoop *TargetLoop = L;
  while (MachineLoop *Parent = TargetLoop->getParentLoop()) {
    if (hasWMMAHazardInLoop(L: Parent, MI, IncludeSubloops: false))
      break; // Parent has hazard in its own blocks, stop here
    TargetLoop = Parent; // Safe to hoist further out
  }

  // Need valid preheader to insert V_NOPs
  MachineBasicBlock *Preheader = TargetLoop->getLoopPreheader();
  if (!Preheader) {
    ++NumWMMAHoistingBailed;
    return false;
  }

  LLVM_DEBUG(dbgs() << "WMMA V_NOP Hoisting: Moving " << WaitStatesNeeded
                    << " V_NOPs from loop to " << printMBBReference(*Preheader)
                    << "\n");

  emitVNops(MBB&: *Preheader, InsertPt: Preheader->getFirstTerminator(), WaitStatesNeeded,
            /*IsHoisting=*/true);
  NumWMMANopsHoisted += WaitStatesNeeded;
  return true;
}
2283
2284bool GCNHazardRecognizer::fixWMMACoexecutionHazards(MachineInstr *MI) {
2285 int WaitStatesNeeded = checkWMMACoexecutionHazards(MI);
2286 if (WaitStatesNeeded <= 0)
2287 return false;
2288
2289 if (EnableWMMAVnopHoisting && tryHoistWMMAVnopsFromLoop(MI, WaitStatesNeeded))
2290 return true;
2291
2292 emitVNops(MBB&: *MI->getParent(), InsertPt: MI->getIterator(), WaitStatesNeeded);
2293 return true;
2294}
2295
// Work around the 64-bit shift high-register bug (ST.hasShift64HighRegBug()):
// a V_LSHLREV/V_LSHRREV/V_ASHRREV_B64 whose shift amount lives in the last
// VGPR of an allocation block (vN with N % 8 == 7) is rewritten so the amount
// is read from a different register. Returns true iff MI was modified.
bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) {
  if (!ST.hasShift64HighRegBug())
    return false;
  assert(!ST.hasExtendedWaitCounts());

  switch (MI->getOpcode()) {
  default:
    return false;
  case AMDGPU::V_LSHLREV_B64_e64:
  case AMDGPU::V_LSHRREV_B64_e64:
  case AMDGPU::V_ASHRREV_I64_e64:
    break;
  }

  MachineOperand *Amt = TII.getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::src0);
  if (!Amt->isReg())
    return false;

  Register AmtReg = Amt->getReg();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  // Check if this is a last VGPR in the allocation block.
  if (!TRI.isVGPR(MRI, Reg: AmtReg) || ((AmtReg - AMDGPU::VGPR0) & 7) != 7)
    return false;

  // The next VGPR being free lets the cheap fix below use it.
  if (AmtReg != AMDGPU::VGPR255 && MRI.isPhysRegUsed(PhysReg: AmtReg + 1))
    return false;

  assert(ST.needsAlignedVGPRs());
  static_assert(AMDGPU::VGPR0 + 1 == AMDGPU::VGPR1);

  const DebugLoc &DL = MI->getDebugLoc();
  MachineBasicBlock *MBB = MI->getParent();
  MachineOperand *Src1 = TII.getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::src1);

  // In:
  //
  // Dst = shiftrev64 Amt, Src1
  //
  // if Dst!=Src1 then avoid the bug with:
  //
  // Dst.sub0 = Amt
  // Dst = shift64 Dst.sub0, Src1

  Register DstReg = MI->getOperand(i: 0).getReg();
  if (!Src1->isReg() || Src1->getReg() != DstReg) {
    // Cheap fix: copy the amount into Dst.sub0 (which MI overwrites anyway)
    // and read it from there.
    Register DstLo = TRI.getSubReg(Reg: DstReg, Idx: AMDGPU::sub0);
    runOnInstruction(
        MI: BuildMI(BB&: *MBB, I: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_MOV_B32_e32), DestReg: DstLo).add(MO: *Amt));
    Amt->setReg(DstLo);
    Amt->setIsKill(true);
    return true;
  }

  // Expensive fix: temporarily swap the amount (and, if Dst aliases Amt, the
  // whole aligned 64-bit pair) into a register MI does not touch, run the
  // shift from there, and swap back afterwards.
  bool Overlapped = MI->modifiesRegister(Reg: AmtReg, TRI: &TRI);
  Register NewReg;
  for (MCRegister Reg : Overlapped ? AMDGPU::VReg_64_Align2RegClass
                                   : AMDGPU::VGPR_32RegClass) {
    if (!MI->modifiesRegister(Reg, TRI: &TRI) && !MI->readsRegister(Reg, TRI: &TRI)) {
      NewReg = Reg;
      break;
    }
  }

  Register NewAmt = Overlapped ? (Register)TRI.getSubReg(Reg: NewReg, Idx: AMDGPU::sub1)
                               : NewReg;
  Register NewAmtLo;

  if (Overlapped)
    NewAmtLo = TRI.getSubReg(Reg: NewReg, Idx: AMDGPU::sub0);

  // Insert a full wait count because found register might be pending a wait.
  BuildMI(BB&: *MBB, I: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_WAITCNT))
      .addImm(Val: 0);

  // Insert V_SWAP_B32 instruction(s) and run hazard recognizer on them.
  if (Overlapped)
    runOnInstruction(
        MI: BuildMI(BB&: *MBB, I: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_SWAP_B32), DestReg: NewAmtLo)
            .addDef(RegNo: AmtReg - 1)
            .addReg(RegNo: AmtReg - 1, Flags: RegState::Undef)
            .addReg(RegNo: NewAmtLo, Flags: RegState::Undef));
  runOnInstruction(MI: BuildMI(BB&: *MBB, I: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_SWAP_B32), DestReg: NewAmt)
                       .addDef(RegNo: AmtReg)
                       .addReg(RegNo: AmtReg, Flags: RegState::Undef)
                       .addReg(RegNo: NewAmt, Flags: RegState::Undef));

  // Instructions emitted after the current instruction will be processed by the
  // parent loop of the hazard recognizer in a natural way.
  BuildMI(BB&: *MBB, I: std::next(x: MI->getIterator()), MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_SWAP_B32),
          DestReg: AmtReg)
      .addDef(RegNo: NewAmt)
      .addReg(RegNo: NewAmt)
      .addReg(RegNo: AmtReg);
  if (Overlapped)
    BuildMI(BB&: *MBB, I: std::next(x: MI->getIterator()), MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_SWAP_B32),
            DestReg: AmtReg - 1)
        .addDef(RegNo: NewAmtLo)
        .addReg(RegNo: NewAmtLo)
        .addReg(RegNo: AmtReg - 1);

  // Re-running hazard recognizer on the modified instruction is not necessary,
  // inserted V_SWAP_B32 has already both read and write new registers so
  // hazards related to these register has already been handled.
  Amt->setReg(NewAmt);
  Amt->setIsKill(false);
  // We do not update liveness, so verifier may see it as undef.
  Amt->setIsUndef();
  if (Overlapped) {
    MI->getOperand(i: 0).setReg(NewReg);
    Src1->setReg(NewReg);
    Src1->setIsKill(false);
    Src1->setIsUndef();
  }

  return true;
}
2412
2413int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) const {
2414 int NSAtoVMEMWaitStates = 1;
2415
2416 if (!ST.hasNSAtoVMEMBug())
2417 return 0;
2418
2419 if (!SIInstrInfo::isMUBUF(MI: *MI) && !SIInstrInfo::isMTBUF(MI: *MI))
2420 return 0;
2421
2422 const SIInstrInfo *TII = ST.getInstrInfo();
2423 const auto *Offset = TII->getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::offset);
2424 if (!Offset || (Offset->getImm() & 6) == 0)
2425 return 0;
2426
2427 auto IsHazardFn = [TII](const MachineInstr &I) {
2428 if (!SIInstrInfo::isMIMG(MI: I))
2429 return false;
2430 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc: I.getOpcode());
2431 return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
2432 TII->getInstSizeInBytes(MI: I) >= 16;
2433 };
2434
2435 return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazard: IsHazardFn, Limit: 1);
2436}
2437
2438int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(
2439 MachineInstr *MI) const {
2440 int FPAtomicToDenormModeWaitStates = 3;
2441
2442 if (!ST.hasFPAtomicToDenormModeHazard())
2443 return 0;
2444 assert(!ST.hasExtendedWaitCounts());
2445
2446 if (MI->getOpcode() != AMDGPU::S_DENORM_MODE)
2447 return 0;
2448
2449 auto IsHazardFn = [](const MachineInstr &I) {
2450 if (!SIInstrInfo::isVMEM(MI: I))
2451 return false;
2452 return SIInstrInfo::isFPAtomic(MI: I);
2453 };
2454
2455 auto IsExpiredFn = [](const MachineInstr &MI, int WaitStates) {
2456 if (WaitStates >= 3 || SIInstrInfo::isVALU(MI))
2457 return true;
2458
2459 return SIInstrInfo::isWaitcnt(Opcode: MI.getOpcode());
2460 };
2461
2462 return FPAtomicToDenormModeWaitStates -
2463 ::getWaitStatesSince(IsHazard: IsHazardFn, MI, IsExpired: IsExpiredFn);
2464}
2465
2466int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) const {
2467 assert(SIInstrInfo::isMAI(*MI));
2468
2469 return ST.hasGFX90AInsts() ? checkMAIHazards90A(MI) : checkMAIHazards908(MI);
2470}
2471
2472int GCNHazardRecognizer::checkMFMAPadding(MachineInstr *MI) const {
2473 // Early exit if no padding is requested.
2474 if (MFMAPaddingRatio == 0)
2475 return 0;
2476
2477 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2478 if (!SIInstrInfo::isMFMA(MI: *MI) || MFI->getOccupancy() < 2)
2479 return 0;
2480
2481 int NeighborMFMALatency = 0;
2482 auto IsNeighboringMFMA = [&NeighborMFMALatency,
2483 this](const MachineInstr &MI) {
2484 if (!SIInstrInfo::isMFMA(MI))
2485 return false;
2486
2487 NeighborMFMALatency = this->getMFMAPipelineWaitStates(MI);
2488 return true;
2489 };
2490
2491 const int MaxMFMAPipelineWaitStates = 16;
2492 int WaitStatesSinceNeighborMFMA =
2493 getWaitStatesSince(IsHazard: IsNeighboringMFMA, Limit: MaxMFMAPipelineWaitStates);
2494
2495 int NeighborMFMAPaddingNeeded =
2496 (NeighborMFMALatency * MFMAPaddingRatio / 100) -
2497 WaitStatesSinceNeighborMFMA;
2498
2499 return std::max(a: 0, b: NeighborMFMAPaddingNeeded);
2500}
2501
// gfx908 MAI hazards: compute the wait states needed before MI (an MFMA,
// v_accvgpr_write, or v_accvgpr_read) to cover EXEC/VGPR/AGPR dependences on
// earlier VALU, MFMA and v_accvgpr_write instructions. The wait-state tables
// are keyed on the producing MFMA's latency (2/8/16 cycles).
int GCNHazardRecognizer::checkMAIHazards908(MachineInstr *MI) const {
  int WaitStatesNeeded = 0;
  unsigned Opc = MI->getOpcode();

  // Inline asm is conservatively treated as a VALU producer.
  auto IsVALUFn = [](const MachineInstr &MI) {
    return SIInstrInfo::isVALU(MI) || MI.isInlineAsm();
  };

  if (Opc != AMDGPU::V_ACCVGPR_READ_B32_e64) { // MFMA or v_accvgpr_write
    const int LegacyVALUWritesVGPRWaitStates = 2;
    const int VALUWritesExecWaitStates = 4;
    const int MaxWaitStates = 4;

    // VALU writing EXEC shortly before an MFMA/v_accvgpr_write.
    int WaitStatesNeededForUse = VALUWritesExecWaitStates -
      getWaitStatesSinceDef(Reg: AMDGPU::EXEC, IsHazardDef: IsVALUFn, Limit: MaxWaitStates);
    WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);

    if (WaitStatesNeeded < MaxWaitStates) {
      // VALU writing a VGPR source of MI.
      for (const MachineOperand &Use : MI->explicit_uses()) {
        const int MaxWaitStates = 2;

        if (!Use.isReg() || !TRI.isVGPR(MRI: MF.getRegInfo(), Reg: Use.getReg()))
          continue;

        int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
          getWaitStatesSinceDef(Reg: Use.getReg(), IsHazardDef: IsVALUFn, Limit: MaxWaitStates);
        WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);

        if (WaitStatesNeeded == MaxWaitStates)
          break;
      }
    }
  }

  // AGPR operand hazards: earlier MFMA or v_accvgpr_write defining an AGPR
  // that MI reads (or that v_accvgpr_write defines).
  for (const MachineOperand &Op : MI->explicit_operands()) {
    if (!Op.isReg() || !TRI.isAGPR(MRI: MF.getRegInfo(), Reg: Op.getReg()))
      continue;

    if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
      continue;

    const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
    const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
    const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
    const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
    const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
    const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
    const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
    const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
    const int MaxWaitStates = 18;
    Register Reg = Op.getReg();
    unsigned HazardDefLatency = 0;

    // Match an earlier MFMA whose destination overlaps (but is not equal to)
    // Reg; record its latency for the tables below.
    auto IsOverlappedMFMAFn = [Reg, &HazardDefLatency,
                               this](const MachineInstr &MI) {
      if (!SIInstrInfo::isMFMA(MI))
        return false;
      Register DstReg = MI.getOperand(i: 0).getReg();
      if (DstReg == Reg)
        return false;
      HazardDefLatency =
          std::max(a: HazardDefLatency, b: TSchedModel.computeInstrLatency(MI: &MI));
      return TRI.regsOverlap(RegA: DstReg, RegB: Reg);
    };

    int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsHazardDef: IsOverlappedMFMAFn,
                                                   Limit: MaxWaitStates);
    int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
    int SrcCIdx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::src2);
    int OpNo = Op.getOperandNo();
    if (OpNo == SrcCIdx) {
      NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
    } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) {
      switch (HazardDefLatency) {
      case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
               break;
      case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
               break;
      case 16: [[fallthrough]];
      default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
               break;
      }
    } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
      switch (HazardDefLatency) {
      case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
               break;
      case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
               break;
      case 16: [[fallthrough]];
      default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
               break;
      }
    }

    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
    WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      return WaitStatesNeeded; // Early exit.

    // Match an earlier v_accvgpr_write defining Reg.
    auto IsAccVgprWriteFn = [Reg, this](const MachineInstr &MI) {
      if (MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
        return false;
      Register DstReg = MI.getOperand(i: 0).getReg();
      return TRI.regsOverlap(RegA: Reg, RegB: DstReg);
    };

    const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
    const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
    const int AccVGPRWriteAccVgprReadWaitStates = 3;
    NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
    if (OpNo == SrcCIdx)
      NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
    else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64)
      NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;

    WaitStatesNeededForUse = NeedWaitStates -
      getWaitStatesSinceDef(Reg, IsHazardDef: IsAccVgprWriteFn, Limit: MaxWaitStates);
    WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      return WaitStatesNeeded; // Early exit.
  }

  // v_accvgpr_write after an MFMA that reads the same register as SrcC.
  if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
    const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
    const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
    const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
    const int MaxWaitStates = 13;
    Register DstReg = MI->getOperand(i: 0).getReg();
    unsigned HazardDefLatency = 0;

    // Match an earlier MFMA whose SrcC overlaps our destination; record its
    // latency for the table above.
    auto IsSrcCMFMAFn = [DstReg, &HazardDefLatency,
                         this](const MachineInstr &MI) {
      if (!SIInstrInfo::isMFMA(MI))
        return false;
      Register Reg = TII.getNamedOperand(MI, OperandName: AMDGPU::OpName::src2)->getReg();
      HazardDefLatency =
          std::max(a: HazardDefLatency, b: TSchedModel.computeInstrLatency(MI: &MI));
      return TRI.regsOverlap(RegA: Reg, RegB: DstReg);
    };

    int WaitStatesSince = getWaitStatesSince(IsHazard: IsSrcCMFMAFn, Limit: MaxWaitStates);
    int NeedWaitStates;
    switch (HazardDefLatency) {
    case 2:  NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
             break;
    case 8:  NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
             break;
    case 16: [[fallthrough]];
    default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
             break;
    }

    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
    WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
  }

  // Pad neighboring MFMA with noops for better inter-wave performance.
  WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: checkMFMAPadding(MI));

  return WaitStatesNeeded;
}
2665
// Wait states after an N-pass XDL write before an overlapping XDL/SMFMA SrcC
// read:
//   passes:   2   4   8  16
//   gfx940:   3   5   9  17
//   gfx950:   4   6  10  18
static int
GFX940_XDL_N_PassWritesVGPROverlappedXDLOrSMFMASrcCWaitStates(int NumPasses,
                                                              bool IsGFX950) {
  int WaitStates = NumPasses + 1;
  if (IsGFX950)
    ++WaitStates;
  return WaitStates;
}
2676
static int
GFX940_XDL_N_PassWritesVGPROverlappedSGEMMDGEMMSrcCWaitStates(int NumPasses,
                                                              bool IsGFX950) {
  // Wait states grow linearly with the producer's pass count; gfx950 needs
  // one additional state except in the 2-pass case.
  //   passes:  2   4   8  16
  //   gfx940:  3   5   9  17
  //   gfx950:  3   6  10  18
  int WaitStates = NumPasses + 1;
  if (IsGFX950 && NumPasses != 2)
    ++WaitStates;
  return WaitStates;
}
2687
static int
GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(int NumPasses) {
  // For an SMFMA feeding another SMFMA's src C the required wait states
  // exactly equal the producer's pass count (2/4/8/16 -> 2/4/8/16).
  return NumPasses;
}
2696
static int
GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses) {
  // Two extra wait states on top of the producer's pass count:
  //   2/4/8/16 passes -> 4/6/10/18 wait states.
  constexpr int ExtraWaitStates = 2;
  return NumPasses + ExtraWaitStates;
}
2705
static int GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses,
                                                                bool IsGFX950) {
  // Three extra wait states over the pass count; gfx950 adds one more
  // except in the 2-pass case.
  //   passes:  2   4   8  16
  //   gfx942:  5   7  11  19
  //   gfx950:  5   8  12  20
  int WaitStates = NumPasses + 3;
  if (IsGFX950 && NumPasses != 2)
    ++WaitStates;
  return WaitStates;
}
2715
// Compute the number of wait states required before an MFMA on gfx90a-class
// targets. Covers hazards from earlier legacy VALU writes of EXEC, legacy
// VALU writes of source VGPRs, and overlapping MFMA dst writes feeding this
// instruction's src A/B or src C operands. Returns 0 for non-MFMA
// instructions; the result also folds in MFMA padding.
int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) const {
  int WaitStatesNeeded = 0;
  unsigned Opc = MI->getOpcode();

  // Matches any VALU instruction that is not itself an MFMA.
  auto IsLegacyVALUFn = [](const MachineInstr &MI) {
    return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI);
  };

  // Matches any VALU instruction that is neither an MFMA nor a DOT.
  auto IsLegacyVALUNotDotFn = [](const MachineInstr &MI) {
    return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI) &&
           !SIInstrInfo::isDOT(MI);
  };

  if (!SIInstrInfo::isMFMA(MI: *MI))
    return WaitStatesNeeded;

  // Hazard: a legacy VALU wrote EXEC shortly before this MFMA.
  const int VALUWritesExecWaitStates = 4;
  int WaitStatesNeededForUse = VALUWritesExecWaitStates -
    getWaitStatesSinceDef(Reg: AMDGPU::EXEC, IsHazardDef: IsLegacyVALUFn,
                          Limit: VALUWritesExecWaitStates);
  WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);

  int SrcCIdx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::src2);

  // Loop for both DGEMM and S/HGEMM 2nd instruction.
  for (const MachineOperand &Use : MI->explicit_uses()) {
    // Wait-state tables keyed by producer shape (pass count), consumer type
    // (SMFMA/DMFMA/XDL) and operand role (src C vs src A/B).
    const int LegacyVALUNotDotWritesVGPRWaitStates = 2;
    const int SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates = 2;
    const int SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates = 8;
    const int SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates = 16;
    const int SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates = 3;
    const int SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates = 9;
    const int SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates = 17;
    const int DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 9;
    const int GFX950_DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 17;
    const int DMFMA4x4WritesVGPROverlappedSrcCWaitStates = 4;
    const int SMFMA4x4WritesVGPROverlappedSrcABWaitStates = 5;
    const int SMFMA16x16WritesVGPROverlappedSrcABWaitStates = 11;
    const int SMFMA32x32WritesVGPROverlappedSrcABWaitStates = 19;
    const int DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates = 6;
    const int DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 11;
    const int GFX950_DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 19;
    const int DMFMA4x4WritesVGPRFullSrcCWaitStates = 4;
    const int GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates = 2;
    const int MaxWaitStates = 19;

    if (!Use.isReg())
      continue;
    Register Reg = Use.getReg();
    bool FullReg;
    const MachineInstr *MI1;

    // Matches an MFMA whose dst overlaps Reg. Side effects: records whether
    // the def covers the full register (FullReg) and which instruction
    // produced it (MI1).
    auto IsOverlappedMFMAFn = [Reg, &FullReg, &MI1,
                               this](const MachineInstr &MI) {
      if (!SIInstrInfo::isMFMA(MI))
        return false;
      Register DstReg = MI.getOperand(i: 0).getReg();
      FullReg = (DstReg == Reg);
      MI1 = &MI;
      return TRI.regsOverlap(RegA: DstReg, RegB: Reg);
    };

    // Hazard: legacy VALU (non-MFMA, non-DOT) wrote this source VGPR.
    WaitStatesNeededForUse = LegacyVALUNotDotWritesVGPRWaitStates -
      getWaitStatesSinceDef(Reg, IsHazardDef: IsLegacyVALUNotDotFn, Limit: MaxWaitStates);
    WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);

    int NumWaitStates =
        getWaitStatesSinceDef(Reg, IsHazardDef: IsOverlappedMFMAFn, Limit: MaxWaitStates);
    if (NumWaitStates == std::numeric_limits<int>::max())
      continue; // No overlapping MFMA def within the search window.

    int OpNo = Use.getOperandNo();
    unsigned Opc1 = MI1->getOpcode();
    int NeedWaitStates = 0;
    if (OpNo == SrcCIdx) {
      if (!SIInstrInfo::isDGEMM(Opcode: Opc) &&
          (!ST.hasGFX940Insts() && SIInstrInfo::isDGEMM(Opcode: Opc1))) {
        NeedWaitStates = 0;
      } else if (FullReg) {
        // Full-register src C forwarding: reduced latencies apply.
        if ((Opc == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
             Opc == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64) &&
            (Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
             Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64))
          NeedWaitStates = DMFMA4x4WritesVGPRFullSrcCWaitStates;
        else if (ST.hasGFX940Insts() &&
                 TSchedModel.computeInstrLatency(MI: MI1) == 2)
          NeedWaitStates = GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates;
      } else {
        switch (Opc1) {
        case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
        case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
        case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
        case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
          if (!TII.isXDL(MI: *MI))
            NeedWaitStates =
                ST.hasGFX950Insts()
                    ? GFX950_DMFMA16x16WritesVGPROverlappedSrcCWaitStates
                    : DMFMA16x16WritesVGPROverlappedSrcCWaitStates;
          break;
        case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
        case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
          if (!TII.isXDL(MI: *MI))
            NeedWaitStates = DMFMA4x4WritesVGPROverlappedSrcCWaitStates;
          break;
        default:
          // Producer latency (pass count) selects the wait state count.
          int NumPasses = TSchedModel.computeInstrLatency(MI: MI1);
          if (ST.hasGFX940Insts()) {
            // XDL consumer fed by a non-XDL producer needs no wait states.
            if (TII.isXDL(MI: *MI) && !TII.isXDL(MI: *MI1))
              break;

            NeedWaitStates =
                TII.isXDL(MI: *MI1)
                    ? (TII.isXDL(MI: *MI)
                           ? GFX940_XDL_N_PassWritesVGPROverlappedXDLOrSMFMASrcCWaitStates(
                                 NumPasses, IsGFX950: ST.hasGFX950Insts())
                           : GFX940_XDL_N_PassWritesVGPROverlappedSGEMMDGEMMSrcCWaitStates(
                                 NumPasses, IsGFX950: ST.hasGFX950Insts()))
                    : GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(
                          NumPasses);
            break;
          }

          switch (NumPasses) {
          case 2:
            NeedWaitStates =
                SIInstrInfo::isDGEMM(Opcode: Opc)
                    ? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates
                    : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates;
            break;
          case 8:
            NeedWaitStates =
                SIInstrInfo::isDGEMM(Opcode: Opc)
                    ? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates
                    : SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates;
            break;
          case 16:
            NeedWaitStates =
                SIInstrInfo::isDGEMM(Opcode: Opc)
                    ? SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates
                    : SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates;
            break;
          default:
            llvm_unreachable("unexpected number of passes");
          }
        }
      }
    } else {
      // Use is a src A/B operand overlapping an earlier MFMA dst.
      switch (Opc1) {
      case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
      case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
      case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
      case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
        NeedWaitStates =
            ST.hasGFX950Insts()
                ? GFX950_DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates
                : DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates;
        break;
      case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
      case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
        NeedWaitStates = DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates;
        break;
      default:
        int NumPasses = TSchedModel.computeInstrLatency(MI: MI1);

        if (ST.hasGFX940Insts()) {
          NeedWaitStates =
              TII.isXDL(MI: *MI1)
                  ? GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(
                        NumPasses, IsGFX950: ST.hasGFX950Insts())
                  : GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(
                        NumPasses);
          break;
        }

        switch (NumPasses) {
        case 2:
          NeedWaitStates = SMFMA4x4WritesVGPROverlappedSrcABWaitStates;
          break;
        case 4:
          llvm_unreachable("unexpected number of passes for mfma");
        case 8:
          NeedWaitStates = SMFMA16x16WritesVGPROverlappedSrcABWaitStates;
          break;
        case 16:
        default:
          NeedWaitStates = SMFMA32x32WritesVGPROverlappedSrcABWaitStates;
        }
      }
    }
    if (WaitStatesNeeded >= NeedWaitStates)
      continue;

    WaitStatesNeededForUse = NeedWaitStates - NumWaitStates;
    WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      break; // Already at the maximum; no other use can require more.
  }

  // Pad neighboring MFMA with noops for better inter-wave performance.
  WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: checkMFMAPadding(MI));

  return WaitStatesNeeded;
}
2920
// Compute wait states needed before a load/store whose VGPR sources may have
// been produced by V_ACCVGPR_READ, or by an ACCVGPR read/write that itself
// depends on a recent non-MAI VALU def. Applies only to pre-gfx90a MAI
// targets.
int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) const {
  // On gfx90a+ relevant hazards are checked in checkMAIVALUHazards()
  if (!ST.hasMAIInsts() || ST.hasGFX90AInsts())
    return 0;

  int WaitStatesNeeded = 0;

  // Matches a V_ACCVGPR_READ def of the VGPR this ld/st consumes.
  auto IsAccVgprReadFn = [](const MachineInstr &MI) {
    return MI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64;
  };

  for (const MachineOperand &Op : MI->explicit_uses()) {
    if (!Op.isReg() || !TRI.isVGPR(MRI: MF.getRegInfo(), Reg: Op.getReg()))
      continue;

    Register Reg = Op.getReg();

    const int AccVgprReadLdStWaitStates = 2;
    const int VALUWriteAccVgprRdWrLdStDepVALUWaitStates = 1;
    const int MaxWaitStates = 2;

    // Hazard: V_ACCVGPR_READ wrote Reg within the last two wait states.
    int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
      getWaitStatesSinceDef(Reg, IsHazardDef: IsAccVgprReadFn, Limit: MaxWaitStates);
    WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      return WaitStatesNeeded; // Early exit.

    // Matches an ACCVGPR read/write that is itself preceded (within two wait
    // states) by a non-MAI VALU def of Reg — a transitive dependence chain.
    auto IsVALUAccVgprRdWrCheckFn = [Reg, this](const MachineInstr &MI) {
      if (MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64 &&
          MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
        return false;
      auto IsVALUFn = [](const MachineInstr &MI) {
        return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMAI(MI);
      };
      // A finite distance means such a VALU def was actually found.
      return getWaitStatesSinceDef(Reg, IsHazardDef: IsVALUFn, Limit: 2 /*MaxWaitStates*/) <
             std::numeric_limits<int>::max();
    };

    WaitStatesNeededForUse = VALUWriteAccVgprRdWrLdStDepVALUWaitStates -
      getWaitStatesSince(IsHazard: IsVALUAccVgprRdWrCheckFn, Limit: MaxWaitStates);
    WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
  }

  return WaitStatesNeeded;
}
2967
2968int GCNHazardRecognizer::checkPermlaneHazards(MachineInstr *MI) const {
2969 assert(!ST.hasVcmpxPermlaneHazard() &&
2970 "this is a different vcmpx+permlane hazard");
2971 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2972 const SIInstrInfo *TII = ST.getInstrInfo();
2973
2974 auto IsVCmpXWritesExecFn = [TII, TRI](const MachineInstr &MI) {
2975 return isVCmpXWritesExec(TII: *TII, TRI: *TRI, MI);
2976 };
2977
2978 auto IsVALUFn = [](const MachineInstr &MI) {
2979 return SIInstrInfo::isVALU(MI);
2980 };
2981
2982 const int VCmpXWritesExecWaitStates = 4;
2983 const int VALUWritesVDstWaitStates = 2;
2984 int WaitStatesNeeded = 0;
2985
2986 for (const MachineOperand &Op : MI->explicit_uses()) {
2987 if (!Op.isReg() || !TRI->isVGPR(MRI: MF.getRegInfo(), Reg: Op.getReg()))
2988 continue;
2989 Register Reg = Op.getReg();
2990
2991 int WaitStatesSinceDef =
2992 VALUWritesVDstWaitStates -
2993 getWaitStatesSinceDef(Reg, IsHazardDef: IsVALUFn,
2994 /*MaxWaitStates=*/Limit: VALUWritesVDstWaitStates);
2995 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesSinceDef);
2996 if (WaitStatesNeeded >= VALUWritesVDstWaitStates)
2997 break;
2998 }
2999
3000 int VCmpXHazardWaits =
3001 VCmpXWritesExecWaitStates -
3002 getWaitStatesSince(IsHazard: IsVCmpXWritesExecFn, Limit: VCmpXWritesExecWaitStates);
3003
3004 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: VCmpXHazardWaits);
3005 return WaitStatesNeeded;
3006}
3007
static int GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(int NumPasses) {
  // Write-after-write against an SMFMA VGPR def: two extra states over the
  // pass count (2/4/8/16 passes -> 4/6/10/18 wait states).
  constexpr int ExtraWaitStates = 2;
  return NumPasses + ExtraWaitStates;
}
3015
static int GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(int NumPasses,
                                                       bool IsGFX950) {
  // Three extra wait states over the pass count; gfx950 adds one more
  // except in the 2-pass case.
  //   passes:  2   4   8  16
  //   gfx942:  5   7  11  19
  //   gfx950:  5   8  12  20
  int WaitStates = NumPasses + 3;
  if (IsGFX950 && NumPasses != 2)
    ++WaitStates;
  return WaitStates;
}
3025
static int GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses,
                                                              bool IsGFX950) {
  // Three extra wait states over the pass count; gfx950 adds one more
  // except in the 2-pass case.
  //   passes:  2   4   8  16
  //   gfx942:  5   7  11  19
  //   gfx950:  5   8  12  20
  int WaitStates = NumPasses + 3;
  if (IsGFX950 && NumPasses != 2)
    ++WaitStates;
  return WaitStates;
}
3035
static int GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses) {
  // SMFMA VGPR def read by VALU/mem/export: two extra states over the pass
  // count (2/4/8/16 passes -> 4/6/10/18 wait states).
  constexpr int ExtraWaitStates = 2;
  return NumPasses + ExtraWaitStates;
}
3043
// Compute wait states needed before a non-MFMA instruction (VALU, VMEM/DS,
// or export) on gfx90a+ targets, covering RAW hazards against earlier
// MFMA/DOT defs of its sources, WAW hazards against MFMA/DOT defs of its
// dests, and WAR hazards against SMFMA src C reads. MFMA consumers are
// handled in checkMAIHazards90A() instead.
int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) const {
  if (!ST.hasGFX90AInsts())
    return 0;

  auto IsDGEMMFn = [](const MachineInstr &MI) -> bool {
    return SIInstrInfo::isDGEMM(Opcode: MI.getOpcode());
  };

  // This is checked in checkMAIHazards90A()
  if (SIInstrInfo::isMFMA(MI: *MI))
    return 0;

  const MachineRegisterInfo &MRI = MF.getRegInfo();

  int WaitStatesNeeded = 0;

  bool IsMem = SIInstrInfo::isVMEM(MI: *MI) || SIInstrInfo::isDS(MI: *MI);
  bool IsMemOrExport = IsMem || SIInstrInfo::isEXP(MI: *MI);
  bool IsVALU = SIInstrInfo::isVALU(MI: *MI);

  // Matches an MFMA whose dst overlaps the register currently in Reg;
  // records the producer in MFMA. Reg is rebound per-operand below.
  const MachineInstr *MFMA = nullptr;
  unsigned Reg;
  auto IsMFMAWriteFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
    if (!SIInstrInfo::isMFMA(MI) ||
        !TRI.regsOverlap(RegA: MI.getOperand(i: 0).getReg(), RegB: Reg))
      return false;
    MFMA = &MI;
    return true;
  };

  // Matches a DOT instruction whose dst overlaps Reg; records it in DOT.
  const MachineInstr *DOT = nullptr;
  auto IsDotWriteFn = [&Reg, &DOT, this](const MachineInstr &MI) {
    if (!SIInstrInfo::isDOT(MI) ||
        !TRI.regsOverlap(RegA: MI.getOperand(i: 0).getReg(), RegB: Reg))
      return false;
    DOT = &MI;
    return true;
  };

  // Stateful predicate: on the reverse walk toward the def, remember if a
  // DGEMM was seen; the hazard triggers only when the def is a VALU and a
  // DGEMM appeared between it and MI.
  bool DGEMMAfterVALUWrite = false;
  auto IsDGEMMHazard = [&DGEMMAfterVALUWrite, this](const MachineInstr &MI) {
    // Found DGEMM on reverse traversal to def.
    if (SIInstrInfo::isDGEMM(Opcode: MI.getOpcode()))
      DGEMMAfterVALUWrite = true;

    // Only hazard if register is defined by a VALU and a DGEMM is found
    // after the def.
    if (!TII.isVALU(MI) || !DGEMMAfterVALUWrite)
      return false;

    return true;
  };

  int SrcCIdx = AMDGPU::getNamedOperandIdx(Opcode: MI->getOpcode(),
                                           Name: AMDGPU::OpName::src2);

  if (IsMemOrExport || IsVALU) {
    const int SMFMA4x4WriteVgprVALUMemExpReadWaitStates = 5;
    const int SMFMA16x16WriteVgprVALUMemExpReadWaitStates = 11;
    const int SMFMA32x32WriteVgprVALUMemExpReadWaitStates = 19;
    const int DMFMA4x4WriteVgprMemExpReadWaitStates = 9;
    const int DMFMA16x16WriteVgprMemExpReadWaitStates = 18;
    const int DMFMA4x4WriteVgprVALUReadWaitStates = 6;
    const int DMFMA16x16WriteVgprVALUReadWaitStates = 11;
    const int GFX950_DMFMA16x16WriteVgprVALUReadWaitStates = 19;
    const int DotWriteSameDotReadSrcAB = 3;
    const int DotWriteDifferentVALURead = 3;
    const int DMFMABetweenVALUWriteVMEMRead = 2;
    const int MaxWaitStates = 19;

    for (const MachineOperand &Use : MI->explicit_uses()) {
      if (!Use.isReg())
        continue;
      Reg = Use.getReg();

      // RAW hazard: an earlier DOT wrote this source register.
      DOT = nullptr;
      int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsHazardDef: IsDotWriteFn,
                                                     Limit: MaxWaitStates);
      if (DOT) {
        int NeedWaitStates = 0;
        if (DOT->getOpcode() == MI->getOpcode()) {
          // Same DOT opcode: src C forwarding is exempt.
          if (&Use - &MI->getOperand(i: 0) != SrcCIdx)
            NeedWaitStates = DotWriteSameDotReadSrcAB;
        } else {
          NeedWaitStates = DotWriteDifferentVALURead;
        }

        int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
        WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
      }

      // Workaround for HW data hazard bug observed only in GFX90A. When there
      // is a DGEMM instruction in-between a VALU and a VMEM instruction it
      // causes the SQ to incorrectly not insert two wait states between the two
      // instructions needed to avoid data hazard.
      if (IsMem && ST.hasGFX90AInsts() && !ST.hasGFX940Insts()) {
        DGEMMAfterVALUWrite = false;
        if (TRI.isVectorRegister(MRI, Reg)) {
          int WaitStatesNeededForUse =
              DMFMABetweenVALUWriteVMEMRead -
              getWaitStatesSinceDef(Reg, IsHazardDef: IsDGEMMHazard,
                                    Limit: DMFMABetweenVALUWriteVMEMRead);

          WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
        }
      }

      // RAW hazard: an earlier MFMA wrote this source register.
      MFMA = nullptr;
      WaitStatesSinceDef =
          getWaitStatesSinceDef(Reg, IsHazardDef: IsMFMAWriteFn, Limit: MaxWaitStates);
      if (!MFMA)
        continue;

      unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MI: MFMA);
      int NumPasses = HazardDefLatency;
      int NeedWaitStates = MaxWaitStates;

      if (SIInstrInfo::isDGEMM(Opcode: MFMA->getOpcode())) {
        switch (HazardDefLatency) {
        case 4:
          NeedWaitStates = IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates
                                         : DMFMA4x4WriteVgprVALUReadWaitStates;
          break;
        case 8:
        case 16:
          NeedWaitStates =
              IsMemOrExport
                  ? DMFMA16x16WriteVgprMemExpReadWaitStates
                  : (ST.hasGFX950Insts()
                         ? GFX950_DMFMA16x16WriteVgprVALUReadWaitStates
                         : DMFMA16x16WriteVgprVALUReadWaitStates);
          break;
        default:
          llvm_unreachable("unexpected dgemm");
        }
      } else if (ST.hasGFX940Insts()) {
        NeedWaitStates =
            TII.isXDL(MI: *MFMA)
                ? GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(
                      NumPasses, IsGFX950: ST.hasGFX950Insts())
                : GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(
                      NumPasses);
      } else {
        switch (HazardDefLatency) {
        case 2:
          NeedWaitStates = SMFMA4x4WriteVgprVALUMemExpReadWaitStates;
          break;
        case 8:
          NeedWaitStates = SMFMA16x16WriteVgprVALUMemExpReadWaitStates;
          break;
        case 16:
          NeedWaitStates = SMFMA32x32WriteVgprVALUMemExpReadWaitStates;
          break;
        default:
          llvm_unreachable("unexpected number of passes for mfma");
        }
      }

      int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
      WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);

      if (WaitStatesNeeded == MaxWaitStates)
        break; // Already at the maximum; stop scanning uses.
    }
  }

  // Hazard: DGEMM shortly before an FMA64-family VALU.
  unsigned Opc = MI->getOpcode();
  const int DMFMAToFMA64WaitStates = 2;
  if ((Opc == AMDGPU::V_FMA_F64_e64 ||
       Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64 ||
       Opc == AMDGPU::V_FMAC_F64_dpp) &&
      WaitStatesNeeded < DMFMAToFMA64WaitStates) {
    int WaitStatesNeededForUse = DMFMAToFMA64WaitStates -
      getWaitStatesSince(IsHazard: IsDGEMMFn, Limit: DMFMAToFMA64WaitStates);
    WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
  }

  if (!IsVALU && !IsMemOrExport)
    return WaitStatesNeeded;

  // WAW and WAR hazards on this instruction's defs.
  for (const MachineOperand &Def : MI->defs()) {
    const int SMFMA4x4WriteVgprVALUWawWaitStates = 5;
    const int SMFMA16x16WriteVgprVALUWawWaitStates = 11;
    const int SMFMA32x32WriteVgprVALUWawWaitStates = 19;
    const int SMFMA4x4ReadVgprVALUWarWaitStates = 1;
    const int GFX940_XDL4PassReadVgprVALUWarWaitStates = 3;
    const int SMFMA16x16ReadVgprVALUWarWaitStates = 7;
    const int SMFMA32x32ReadVgprVALUWarWaitStates = 15;
    const int DMFMA4x4WriteVgprVALUWriteWaitStates = 6;
    const int DMFMA16x16WriteVgprVALUWriteWaitStates = 11;
    const int DotWriteDifferentVALUWrite = 3;
    const int MaxWaitStates = 19;
    const int MaxWarWaitStates = 15;

    Reg = Def.getReg();

    // WAW hazard: an earlier DOT of a different opcode wrote this dest.
    DOT = nullptr;
    int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsHazardDef: IsDotWriteFn,
                                                   Limit: MaxWaitStates);
    if (DOT && DOT->getOpcode() != MI->getOpcode())
      WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: DotWriteDifferentVALUWrite -
                                                    WaitStatesSinceDef);

    // WAW hazard: an earlier MFMA wrote this dest.
    MFMA = nullptr;
    WaitStatesSinceDef =
        getWaitStatesSinceDef(Reg, IsHazardDef: IsMFMAWriteFn, Limit: MaxWaitStates);
    if (MFMA) {
      int NeedWaitStates = MaxWaitStates;
      int NumPasses = TSchedModel.computeInstrLatency(MI: MFMA);

      if (SIInstrInfo::isDGEMM(Opcode: MFMA->getOpcode())) {
        switch (NumPasses) {
        case 4:
          NeedWaitStates = DMFMA4x4WriteVgprVALUWriteWaitStates;
          break;
        case 8:
        case 16:
          NeedWaitStates = DMFMA16x16WriteVgprVALUWriteWaitStates;
          break;
        default:
          llvm_unreachable("unexpected number of cycles for dgemm");
        }
      } else if (ST.hasGFX940Insts()) {
        NeedWaitStates =
            TII.isXDL(MI: *MFMA)
                ? GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(
                      NumPasses, IsGFX950: ST.hasGFX950Insts())
                : GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(NumPasses);
      } else {
        switch (NumPasses) {
        case 2:
          NeedWaitStates = SMFMA4x4WriteVgprVALUWawWaitStates;
          break;
        case 8:
          NeedWaitStates = SMFMA16x16WriteVgprVALUWawWaitStates;
          break;
        case 16:
          NeedWaitStates = SMFMA32x32WriteVgprVALUWawWaitStates;
          break;
        default:
          llvm_unreachable("Unexpected number of passes for mfma");
        }
      }

      int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
      WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);

      if (WaitStatesNeeded == MaxWaitStates)
        break; // Already at the maximum; stop scanning defs.
    }

    // Matches a non-DGEMM MFMA that reads Reg through its src C operand
    // (on gfx940+ only XDL ops participate); records the reader in MFMA.
    auto IsSMFMAReadAsCFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
      if (!SIInstrInfo::isMFMA(MI) || SIInstrInfo::isDGEMM(Opcode: MI.getOpcode()) ||
          !MI.readsRegister(Reg, TRI: &TRI))
        return false;

      if (ST.hasGFX940Insts() && !TII.isXDL(MI))
        return false;

      const MachineOperand *SrcC =
          TII.getNamedOperand(MI, OperandName: AMDGPU::OpName::src2);
      assert(SrcC);
      if (!SrcC->isReg() || !TRI.regsOverlap(RegA: SrcC->getReg(), RegB: Reg))
        return false;

      MFMA = &MI;
      return true;
    };

    // WAR hazard: an SMFMA is still reading this dest as src C.
    MFMA = nullptr;
    int WaitStatesSinceUse = getWaitStatesSince(IsHazard: IsSMFMAReadAsCFn,
                                                Limit: MaxWarWaitStates);
    if (!MFMA)
      continue;

    unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MI: MFMA);
    int NeedWaitStates = MaxWaitStates;
    switch (HazardDefLatency) {
    case 2:  NeedWaitStates = SMFMA4x4ReadVgprVALUWarWaitStates;
      break;
    case 4:  assert(ST.hasGFX940Insts());
      NeedWaitStates = GFX940_XDL4PassReadVgprVALUWarWaitStates;
      break;
    case 8:  NeedWaitStates = SMFMA16x16ReadVgprVALUWarWaitStates;
      break;
    case 16: [[fallthrough]];
    default: NeedWaitStates = SMFMA32x32ReadVgprVALUWarWaitStates;
      break;
    }

    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceUse;
    WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
  }

  return WaitStatesNeeded;
}
3340
3341bool GCNHazardRecognizer::ShouldPreferAnother(SUnit *SU) const {
3342 if (!SU->isInstr())
3343 return false;
3344
3345 const MachineInstr *MAI = nullptr;
3346
3347 auto IsMFMAFn = [&MAI](const MachineInstr &MI) {
3348 MAI = nullptr;
3349 if (SIInstrInfo::isMFMA(MI))
3350 MAI = &MI;
3351 return MAI != nullptr;
3352 };
3353
3354 MachineInstr *MI = SU->getInstr();
3355 if (IsMFMAFn(*MI)) {
3356 int W = getWaitStatesSince(IsHazard: IsMFMAFn, Limit: 16);
3357 if (MAI)
3358 return W < (int)TSchedModel.computeInstrLatency(MI: MAI);
3359 }
3360
3361 return false;
3362}
3363
3364// Adjust global offsets for instructions bundled with S_GETPC_B64 after
3365// insertion of a new instruction.
3366static void updateGetPCBundle(MachineInstr *NewMI) {
3367 if (!NewMI->isBundled())
3368 return;
3369
3370 // Find start of bundle.
3371 auto I = NewMI->getIterator();
3372 while (I->isBundledWithPred())
3373 I--;
3374 if (I->isBundle())
3375 I++;
3376
3377 // Bail if this is not an S_GETPC bundle.
3378 if (I->getOpcode() != AMDGPU::S_GETPC_B64)
3379 return;
3380
3381 // Update offsets of any references in the bundle.
3382 const unsigned NewBytes = 4;
3383 assert(NewMI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
3384 "Unexpected instruction insertion in bundle");
3385 auto NextMI = std::next(x: NewMI->getIterator());
3386 auto End = NewMI->getParent()->end();
3387 while (NextMI != End && NextMI->isBundledWithPred()) {
3388 for (auto &Operand : NextMI->operands()) {
3389 if (Operand.isGlobal())
3390 Operand.setOffset(Operand.getOffset() + NewBytes);
3391 }
3392 NextMI++;
3393 }
3394}
3395
// Break the wave64 VALU mask-write hazard by inserting an s_waitcnt_depctr
// after an SGPR write that may feed a VALU mask read. Previously emitted
// compatible waits in the same read-free region are merged into the new one.
// Returns true if the IR was changed.
bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
  if (!ST.hasVALUMaskWriteHazard())
    return false;
  assert(!ST.hasExtendedWaitCounts());

  if (!ST.isWave64())
    return false;

  const bool IsSALU = SIInstrInfo::isSALU(MI: *MI);
  const bool IsVALU = SIInstrInfo::isVALU(MI: *MI);
  if (!IsSALU && !IsVALU)
    return false;

  // The hazard sequence is three instructions:
  //   1. VALU reads SGPR as mask
  //   2. VALU/SALU writes SGPR
  //   3. VALU/SALU reads SGPR
  // The hazard can expire if the distance between 2 and 3 is sufficient,
  // or (2) is VALU and (3) is SALU.
  // In practice this happens <10% of the time, hence always assume the hazard
  // exists if (1) and (2) are present to avoid searching all SGPR reads.

  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const MachineRegisterInfo &MRI = MF.getRegInfo();

  // Special SGPRs that never participate in the hazard.
  auto IgnoreableSGPR = [](const Register Reg) {
    switch (Reg) {
    case AMDGPU::EXEC:
    case AMDGPU::EXEC_LO:
    case AMDGPU::EXEC_HI:
    case AMDGPU::M0:
    case AMDGPU::SGPR_NULL:
    case AMDGPU::SGPR_NULL64:
    case AMDGPU::SCC:
      return true;
    default:
      return false;
    }
  };
  auto IsVCC = [](const Register Reg) {
    return Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::VCC_HI;
  };

  // Search state: the set of hazard SGPRs still being tracked. Hashing and
  // equality allow hasHazard() to cache/deduplicate visited states.
  struct StateType {
    SmallSet<Register, 2> HazardSGPRs;

    static unsigned getHashValue(const StateType &State) {
      return hash_combine_range(R: State.HazardSGPRs);
    }
    static bool isEqual(const StateType &LHS, const StateType &RHS) {
      return LHS.HazardSGPRs == RHS.HazardSGPRs;
    }
  };

  SmallVector<const MachineInstr *> WaitInstrs;
  bool HasSGPRRead = false;
  StateType InitialState;

  // Look for SGPR write.
  MachineOperand *HazardDef = nullptr;
  for (MachineOperand &Op : MI->operands()) {
    if (!Op.isReg())
      continue;
    if (Op.isDef() && HazardDef)
      continue;

    Register Reg = Op.getReg();
    if (IgnoreableSGPR(Reg))
      continue;
    if (!IsVCC(Reg)) {
      if (Op.isImplicit())
        continue;
      if (!TRI->isSGPRReg(MRI, Reg))
        continue;
    }
    // Also check for SGPR reads.
    if (Op.isUse()) {
      HasSGPRRead = true;
      continue;
    }

    assert(!HazardDef);
    HazardDef = &Op;
  }

  if (!HazardDef)
    return false;

  // Setup to track writes to individual SGPRs
  const Register HazardReg = HazardDef->getReg();
  if (AMDGPU::SReg_32RegClass.contains(Reg: HazardReg)) {
    InitialState.HazardSGPRs.insert(V: HazardReg);
  } else {
    // 64-bit hazard register: track both 32-bit halves separately.
    assert(AMDGPU::SReg_64RegClass.contains(HazardReg));
    InitialState.HazardSGPRs.insert(V: TRI->getSubReg(Reg: HazardReg, Idx: AMDGPU::sub0));
    InitialState.HazardSGPRs.insert(V: TRI->getSubReg(Reg: HazardReg, Idx: AMDGPU::sub1));
  }

  // Forward scan predicate: a hazard exists if a mask-consuming VALU reads
  // the hazard register (explicitly via src2 or implicitly via VCC) before
  // all tracked SGPRs have been rewritten.
  auto IsHazardFn = [&](StateType &State, const MachineInstr &I) {
    if (State.HazardSGPRs.empty())
      return HazardExpired;

    switch (I.getOpcode()) {
    case AMDGPU::V_ADDC_U32_e32:
    case AMDGPU::V_ADDC_U32_dpp:
    case AMDGPU::V_CNDMASK_B16_t16_e32:
    case AMDGPU::V_CNDMASK_B16_fake16_e32:
    case AMDGPU::V_CNDMASK_B16_t16_dpp:
    case AMDGPU::V_CNDMASK_B16_fake16_dpp:
    case AMDGPU::V_CNDMASK_B32_e32:
    case AMDGPU::V_CNDMASK_B32_dpp:
    case AMDGPU::V_DIV_FMAS_F32_e64:
    case AMDGPU::V_DIV_FMAS_F64_e64:
    case AMDGPU::V_SUBB_U32_e32:
    case AMDGPU::V_SUBB_U32_dpp:
    case AMDGPU::V_SUBBREV_U32_e32:
    case AMDGPU::V_SUBBREV_U32_dpp: {
      // These implicitly read VCC as mask source.
      return IsVCC(HazardReg) ? HazardFound : NoHazardFound;
    }
    case AMDGPU::V_ADDC_U32_e64:
    case AMDGPU::V_ADDC_U32_e64_dpp:
    case AMDGPU::V_CNDMASK_B16_t16_e64:
    case AMDGPU::V_CNDMASK_B16_fake16_e64:
    case AMDGPU::V_CNDMASK_B16_t16_e64_dpp:
    case AMDGPU::V_CNDMASK_B16_fake16_e64_dpp:
    case AMDGPU::V_CNDMASK_B32_e64:
    case AMDGPU::V_CNDMASK_B32_e64_dpp:
    case AMDGPU::V_SUBB_U32_e64:
    case AMDGPU::V_SUBB_U32_e64_dpp:
    case AMDGPU::V_SUBBREV_U32_e64:
    case AMDGPU::V_SUBBREV_U32_e64_dpp: {
      // Only check mask register overlaps.
      const MachineOperand *SSRCOp = TII.getNamedOperand(MI: I, OperandName: AMDGPU::OpName::src2);
      assert(SSRCOp);
      bool Result = TRI->regsOverlap(RegA: SSRCOp->getReg(), RegB: HazardReg);
      return Result ? HazardFound : NoHazardFound;
    }
    default:
      return NoHazardFound;
    }
  };

  // A wait mask with sa_sdst/va_sdst/va_vcc all zeroed — the strongest form
  // this pass emits; existing waits matching it are merge candidates.
  const unsigned ConstantMaskBits = AMDGPU::DepCtr::encodeFieldSaSdst(
      Encoded: AMDGPU::DepCtr::encodeFieldVaSdst(Encoded: AMDGPU::DepCtr::encodeFieldVaVcc(VaVcc: 0, STI: ST),
                                        VaSdst: 0),
      SaSdst: 0);
  auto UpdateStateFn = [&](StateType &State, const MachineInstr &I) {
    switch (I.getOpcode()) {
    case AMDGPU::S_WAITCNT_DEPCTR:
      // Record mergable waits within region of instructions free of SGPR reads.
      if (!HasSGPRRead && I.getParent() == MI->getParent() && !I.isBundled() &&
          (I.getOperand(i: 0).getImm() & ConstantMaskBits) == ConstantMaskBits)
        WaitInstrs.push_back(Elt: &I);
      break;
    default:
      // Update tracking of SGPR reads and writes.
      for (auto &Op : I.operands()) {
        if (!Op.isReg())
          continue;

        Register Reg = Op.getReg();
        if (IgnoreableSGPR(Reg))
          continue;
        if (!IsVCC(Reg)) {
          if (Op.isImplicit())
            continue;
          if (!TRI->isSGPRReg(MRI, Reg))
            continue;
        }
        if (Op.isUse()) {
          HasSGPRRead = true;
          continue;
        }

        // Stop tracking any SGPRs with writes on the basis that they will
        // already have an appropriate wait inserted afterwards.
        SmallVector<Register, 2> Found;
        for (Register SGPR : State.HazardSGPRs) {
          if (Reg == SGPR || TRI->regsOverlap(RegA: Reg, RegB: SGPR))
            Found.push_back(Elt: SGPR);
        }
        for (Register SGPR : Found)
          State.HazardSGPRs.erase(V: SGPR);
      }
      break;
    }
  };

  // Check for hazard
  if (!hasHazard<StateType>(InitialState, IsHazard: IsHazardFn, UpdateState: UpdateStateFn,
                            InitialMBB: MI->getParent(),
                            InitialI: std::next(x: MI->getReverseIterator())))
    return false;

  // Compute counter mask
  unsigned DepCtr =
      IsVALU ? (IsVCC(HazardReg) ? AMDGPU::DepCtr::encodeFieldVaVcc(VaVcc: 0, STI: ST)
                                 : AMDGPU::DepCtr::encodeFieldVaSdst(VaSdst: 0, STI: ST))
             : AMDGPU::DepCtr::encodeFieldSaSdst(SaSdst: 0, STI: ST);

  // Try to merge previous waits into this one for regions with no SGPR reads.
  if (!WaitInstrs.empty()) {
    // Note: WaitInstrs contains const pointers, so walk backward from MI to
    // obtain a mutable pointer to each instruction to be merged.
    // This is expected to be a very short walk within the same block.
    SmallVector<MachineInstr *> ToErase;
    unsigned Found = 0;
    for (MachineBasicBlock::reverse_iterator It = MI->getReverseIterator(),
                                             End = MI->getParent()->rend();
         Found < WaitInstrs.size() && It != End; ++It) {
      MachineInstr *WaitMI = &*It;
      // Find next wait instruction.
      if (std::as_const(t&: WaitMI) != WaitInstrs[Found])
        continue;
      Found++;
      // Merge by taking the minimum (i.e. strictest) value of each field.
      unsigned WaitMask = WaitMI->getOperand(i: 0).getImm();
      assert((WaitMask & ConstantMaskBits) == ConstantMaskBits);
      DepCtr = AMDGPU::DepCtr::encodeFieldSaSdst(
          Encoded: DepCtr, SaSdst: std::min(a: AMDGPU::DepCtr::decodeFieldSaSdst(Encoded: WaitMask),
                          b: AMDGPU::DepCtr::decodeFieldSaSdst(Encoded: DepCtr)));
      DepCtr = AMDGPU::DepCtr::encodeFieldVaSdst(
          Encoded: DepCtr, VaSdst: std::min(a: AMDGPU::DepCtr::decodeFieldVaSdst(Encoded: WaitMask),
                          b: AMDGPU::DepCtr::decodeFieldVaSdst(Encoded: DepCtr)));
      DepCtr = AMDGPU::DepCtr::encodeFieldVaVcc(
          Encoded: DepCtr, VaVcc: std::min(a: AMDGPU::DepCtr::decodeFieldVaVcc(Encoded: WaitMask),
                          b: AMDGPU::DepCtr::decodeFieldVaVcc(Encoded: DepCtr)));
      ToErase.push_back(Elt: WaitMI);
    }
    assert(Found == WaitInstrs.size());
    for (MachineInstr *WaitMI : ToErase)
      WaitMI->eraseFromParent();
  }

  // Add s_waitcnt_depctr after SGPR write.
  auto NextMI = std::next(x: MI->getIterator());
  auto NewMI = BuildMI(BB&: *MI->getParent(), I: NextMI, MIMD: MI->getDebugLoc(),
                       MCID: TII.get(Opcode: AMDGPU::S_WAITCNT_DEPCTR))
                   .addImm(Val: DepCtr);

  // SALU write may be s_getpc in a bundle.
  updateGetPCBundle(NewMI);

  return true;
}
3641
3642static bool ensureEntrySetPrio(MachineFunction *MF, int Priority,
3643 const SIInstrInfo &TII) {
3644 MachineBasicBlock &EntryMBB = MF->front();
3645 if (EntryMBB.begin() != EntryMBB.end()) {
3646 auto &EntryMI = *EntryMBB.begin();
3647 if (EntryMI.getOpcode() == AMDGPU::S_SETPRIO &&
3648 EntryMI.getOperand(i: 0).getImm() >= Priority)
3649 return false;
3650 }
3651
3652 BuildMI(BB&: EntryMBB, I: EntryMBB.begin(), MIMD: DebugLoc(), MCID: TII.get(Opcode: AMDGPU::S_SETPRIO))
3653 .addImm(Val: Priority);
3654 return true;
3655}
3656
// Hardware workaround: on subtargets with hasRequiredExportPriority(), code
// around exports must run at raised priority. This raises priority at shader
// entry / on existing S_SETPRIO, and after the last export of a sequence
// drops priority, drains the export counter and pads with S_NOPs before
// restoring normal priority. Returns true if any instruction was changed or
// inserted.
bool GCNHazardRecognizer::fixRequiredExportPriority(MachineInstr *MI) {
  if (!ST.hasRequiredExportPriority())
    return false;

  // Assume the following shader types will never have exports,
  // and avoid adding or adjusting S_SETPRIO.
  MachineBasicBlock *MBB = MI->getParent();
  MachineFunction *MF = MBB->getParent();
  auto CC = MF->getFunction().getCallingConv();
  switch (CC) {
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_CS_Chain:
  case CallingConv::AMDGPU_CS_ChainPreserve:
  case CallingConv::AMDGPU_KERNEL:
    return false;
  default:
    break;
  }

  // Priority levels used by the workaround: normal execution runs at 2,
  // post-export code temporarily drops to 0, and raises are clamped at 3.
  const int MaxPriority = 3;
  const int NormalPriority = 2;
  const int PostExportPriority = 0;

  auto It = MI->getIterator();
  switch (MI->getOpcode()) {
  case AMDGPU::S_ENDPGM:
  case AMDGPU::S_ENDPGM_SAVED:
  case AMDGPU::S_ENDPGM_ORDERED_PS_DONE:
  case AMDGPU::SI_RETURN_TO_EPILOG:
    // Ensure shader with calls raises priority at entry.
    // This ensures correct priority if exports exist in callee.
    if (MF->getFrameInfo().hasCalls())
      return ensureEntrySetPrio(MF, Priority: NormalPriority, TII);
    return false;
  case AMDGPU::S_SETPRIO: {
    // Raise minimum priority unless in workaround.
    // An S_SETPRIO 0 immediately after an export is the workaround's own
    // post-export lowering; leave it alone.
    auto &PrioOp = MI->getOperand(i: 0);
    int Prio = PrioOp.getImm();
    bool InWA = (Prio == PostExportPriority) &&
                (It != MBB->begin() && TII.isEXP(MI: *std::prev(x: It)));
    if (InWA || Prio >= NormalPriority)
      return false;
    PrioOp.setImm(std::min(a: Prio + NormalPriority, b: MaxPriority));
    return true;
  }
  default:
    // Only exports need the remaining post-export handling.
    if (!TII.isEXP(MI: *MI))
      return false;
    break;
  }

  // Check entry priority at each export (as there will only be a few).
  // Note: amdgpu_gfx can only be a callee, so defer to caller setprio.
  bool Changed = false;
  if (CC != CallingConv::AMDGPU_Gfx && CC != CallingConv::AMDGPU_Gfx_WholeWave)
    Changed = ensureEntrySetPrio(MF, Priority: NormalPriority, TII);

  auto NextMI = std::next(x: It);
  bool EndOfShader = false;
  if (NextMI != MBB->end()) {
    // Only need WA at end of sequence of exports.
    if (TII.isEXP(MI: *NextMI))
      return Changed;
    // Assume appropriate S_SETPRIO after export means WA already applied.
    if (NextMI->getOpcode() == AMDGPU::S_SETPRIO &&
        NextMI->getOperand(i: 0).getImm() == PostExportPriority)
      return Changed;
    EndOfShader = NextMI->getOpcode() == AMDGPU::S_ENDPGM;
  }

  const DebugLoc &DL = MI->getDebugLoc();

  // Lower priority.
  BuildMI(BB&: *MBB, I: NextMI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_SETPRIO))
      .addImm(Val: PostExportPriority);

  if (!EndOfShader) {
    // Wait for exports to complete.
    BuildMI(BB&: *MBB, I: NextMI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_WAITCNT_EXPCNT))
        .addReg(RegNo: AMDGPU::SGPR_NULL)
        .addImm(Val: 0);
  }

  // Pad with two S_NOPs while at lowered priority.
  BuildMI(BB&: *MBB, I: NextMI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_NOP)).addImm(Val: 0);
  BuildMI(BB&: *MBB, I: NextMI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_NOP)).addImm(Val: 0);

  if (!EndOfShader) {
    // Return to normal (higher) priority.
    BuildMI(BB&: *MBB, I: NextMI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_SETPRIO))
        .addImm(Val: NormalPriority);
  }

  return true;
}
3751
3752bool GCNHazardRecognizer::fixGetRegWaitIdle(MachineInstr *MI) {
3753 if (!isSGetReg(Opcode: MI->getOpcode()))
3754 return false;
3755
3756 const SIInstrInfo *TII = ST.getInstrInfo();
3757 switch (getHWReg(TII, RegInstr: *MI)) {
3758 default:
3759 return false;
3760 case AMDGPU::Hwreg::ID_STATUS:
3761 case AMDGPU::Hwreg::ID_STATE_PRIV:
3762 case AMDGPU::Hwreg::ID_EXCP_FLAG_PRIV:
3763 case AMDGPU::Hwreg::ID_EXCP_FLAG_USER:
3764 break;
3765 }
3766
3767 BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(),
3768 MCID: TII->get(Opcode: AMDGPU::S_WAITCNT_DEPCTR))
3769 .addImm(Val: 0);
3770 return true;
3771}
3772
3773bool GCNHazardRecognizer::fixDsAtomicAsyncBarrierArriveB64(MachineInstr *MI) {
3774 if (MI->getOpcode() != AMDGPU::DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64)
3775 return false;
3776
3777 const SIInstrInfo *TII = ST.getInstrInfo();
3778 BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(),
3779 MCID: TII->get(Opcode: AMDGPU::S_WAITCNT_DEPCTR))
3780 .addImm(Val: AMDGPU::DepCtr::encodeFieldVmVsrc(VmVsrc: 0, STI: ST));
3781 BuildMI(BB&: *MI->getParent(), I: std::next(x: MI->getIterator()), MIMD: MI->getDebugLoc(),
3782 MCID: TII->get(Opcode: AMDGPU::S_WAITCNT_DEPCTR))
3783 .addImm(Val: AMDGPU::DepCtr::encodeFieldVmVsrc(VmVsrc: 0, STI: ST));
3784
3785 return true;
3786}
3787
3788bool GCNHazardRecognizer::fixScratchBaseForwardingHazard(MachineInstr *MI) {
3789 // No reason to check this in pre-RA scheduling, SGPRs have to be allocated
3790 // for hazard to trigger.
3791 if (!IsHazardRecognizerMode)
3792 return false;
3793
3794 const SIRegisterInfo *TRI = ST.getRegisterInfo();
3795 const SIInstrInfo *TII = ST.getInstrInfo();
3796 // Hazard expires after 10 SGPR writes by SALU or 8 SGPR writes by VALU.
3797 const int FlatScrBaseWaitStates = 10;
3798
3799 bool ReadsFlatScrLo =
3800 MI->readsRegister(Reg: AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, TRI);
3801 bool ReadsFlatScrHi =
3802 MI->readsRegister(Reg: AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, TRI);
3803 if (isSGetReg(Opcode: MI->getOpcode())) {
3804 switch (getHWReg(TII, RegInstr: *MI)) {
3805 default:
3806 break;
3807 case AMDGPU::Hwreg::ID_FLAT_SCR_LO:
3808 ReadsFlatScrLo = true;
3809 break;
3810 case AMDGPU::Hwreg::ID_FLAT_SCR_HI:
3811 ReadsFlatScrHi = true;
3812 break;
3813 }
3814 }
3815
3816 const MachineRegisterInfo &MRI = MF.getRegInfo();
3817
3818 auto IsRegDefHazard = [&](Register Reg) -> bool {
3819 DenseSet<const MachineBasicBlock *> Visited;
3820 auto IsHazardFn = [TRI, Reg](const MachineInstr &MI) {
3821 return MI.modifiesRegister(Reg, TRI);
3822 };
3823
3824 // This literally abuses the idea of waitstates. Instead of waitstates it
3825 // returns 1 for SGPR written and 0 otherwise.
3826 auto IsSGPRDef = [TII, TRI, &MRI](const MachineInstr &MI) -> unsigned {
3827 if (!TII->isSALU(MI) && !TII->isVALU(MI))
3828 return 0;
3829 for (const MachineOperand &MO : MI.all_defs()) {
3830 if (TRI->isSGPRReg(MRI, Reg: MO.getReg()))
3831 return 1;
3832 }
3833 return 0;
3834 };
3835
3836 auto IsExpiredFn = [=](const MachineInstr &MI, int SgprWrites) {
3837 if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR) {
3838 unsigned Wait = MI.getOperand(i: 0).getImm();
3839 if (AMDGPU::DepCtr::decodeFieldSaSdst(Encoded: Wait) == 0 &&
3840 AMDGPU::DepCtr::decodeFieldVaSdst(Encoded: Wait) == 0)
3841 return true;
3842 }
3843 return SgprWrites >= FlatScrBaseWaitStates;
3844 };
3845
3846 return ::getWaitStatesSince(
3847 IsHazard: IsHazardFn, MBB: MI->getParent(), I: std::next(x: MI->getReverseIterator()),
3848 WaitStates: 0, IsExpired: IsExpiredFn, Visited, GetNumWaitStates: IsSGPRDef) < FlatScrBaseWaitStates;
3849 };
3850
3851 if ((!ReadsFlatScrLo || MRI.isConstantPhysReg(PhysReg: AMDGPU::SGPR102) ||
3852 !IsRegDefHazard(AMDGPU::SGPR102)) &&
3853 (!ReadsFlatScrHi || MRI.isConstantPhysReg(PhysReg: AMDGPU::SGPR103) ||
3854 !IsRegDefHazard(AMDGPU::SGPR103)))
3855 return false;
3856
3857 BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(),
3858 MCID: TII->get(Opcode: AMDGPU::S_WAITCNT_DEPCTR))
3859 .addImm(Val: AMDGPU::DepCtr::encodeFieldVaSdst(
3860 Encoded: AMDGPU::DepCtr::encodeFieldSaSdst(SaSdst: 0, STI: ST), VaSdst: 0));
3861 return true;
3862}
3863
3864bool GCNHazardRecognizer::fixSetRegMode(MachineInstr *MI) {
3865 if (!isSSetReg(Opcode: MI->getOpcode()) ||
3866 MI->getOperand(i: 1).getImm() != AMDGPU::Hwreg::ID_MODE)
3867 return false;
3868
3869 BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(), MCID: TII.get(Opcode: AMDGPU::V_NOP_e32));
3870 BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(), MCID: TII.get(Opcode: AMDGPU::V_NOP_e32));
3871 return true;
3872}
3873