1//===-- GCNHazardRecognizers.cpp - GCN Hazard Recognizer Impls ------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements hazard recognizers for scheduling on GCN processors.
10//
11//===----------------------------------------------------------------------===//
12
13#include "GCNHazardRecognizer.h"
14#include "GCNSubtarget.h"
15#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
16#include "SIMachineFunctionInfo.h"
17#include "llvm/ADT/Statistic.h"
18#include "llvm/CodeGen/MachineFrameInfo.h"
19#include "llvm/CodeGen/MachineFunction.h"
20#include "llvm/CodeGen/MachineInstrBuilder.h"
21#include "llvm/CodeGen/ScheduleDAG.h"
22#include "llvm/Support/Debug.h"
23#include "llvm/TargetParser/TargetParser.h"
24
25using namespace llvm;
26
27#define DEBUG_TYPE "gcn-hazard-recognizer"
28
29STATISTIC(NumWMMANopsHoisted,
30 "Number of WMMA hazard V_NOPs hoisted from loops");
31STATISTIC(NumWMMAHoistingBailed,
32 "Number of WMMA hazards where V_NOP hoisting was not possible");
33
34namespace {
35
// Custom cl::opt parser for -amdgpu-mfma-padding-ratio: parses an unsigned
// integer and additionally enforces that it lies in [0, 100], since the value
// is interpreted as a percentage.
struct MFMAPaddingRatioParser : public cl::parser<unsigned> {
  MFMAPaddingRatioParser(cl::Option &O) : cl::parser<unsigned>(O) {}

  // Returns false on success, true on error (cl::parser convention).
  // Errors are reported through \p O.
  bool parse(cl::Option &O, StringRef ArgName, StringRef Arg, unsigned &Value) {
    // Radix 0 lets getAsInteger auto-detect decimal/hex/octal prefixes.
    if (Arg.getAsInteger(Radix: 0, Result&: Value))
      return O.error(Message: "'" + Arg + "' value invalid for uint argument!");

    if (Value > 100)
      return O.error(Message: "'" + Arg + "' value must be in the range [0, 100]!");

    return false;
  }
};
49
50} // end anonymous namespace
51
// Percentage (0-100, validated by MFMAPaddingRatioParser) of the latency
// between neighboring MFMA instructions to fill with s_nop padding.
static cl::opt<unsigned, false, MFMAPaddingRatioParser>
    MFMAPaddingRatio("amdgpu-mfma-padding-ratio", cl::init(Val: 0), cl::Hidden,
                     cl::desc("Fill a percentage of the latency between "
                              "neighboring MFMA with s_nops."));

// This is intended for debugging purposes only.
static cl::opt<unsigned>
    NopPadding("amdgpu-snop-padding", cl::init(Val: 0), cl::Hidden,
               cl::desc("Insert a s_nop x before every instruction"));

// Enables the optimization that moves V_NOPs required for WMMA hazards out of
// loop bodies into the preheader (counted by the STATISTICs above).
static cl::opt<bool> EnableWMMAVnopHoisting(
    "amdgpu-wmma-vnop-hoisting", cl::init(Val: true), cl::Hidden,
    cl::desc("Hoist WMMA hazard V_NOPs from loops to preheaders"));
65
66//===----------------------------------------------------------------------===//
67// Hazard Recognizer Implementation
68//===----------------------------------------------------------------------===//
69
70static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
71 const GCNSubtarget &ST);
72
// Construct a hazard recognizer for \p MF. Starts in scheduler mode
// (IsHazardRecognizerMode == false); PreEmitNoops() flips it to standalone
// hazard-fixing mode. \p MLI may be null.
GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF,
                                         MachineLoopInfo *MLI)
    : IsHazardRecognizerMode(false), CurrCycleInstr(nullptr), MF(MF),
      ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()),
      TRI(TII.getRegisterInfo()), TSchedModel(TII.getSchedModel()), MLI(MLI),
      ClauseUses(TRI.getNumRegUnits()), ClauseDefs(TRI.getNumRegUnits()) {
  // AGPR usage implies MFMA/MAI code, whose hazards need a much deeper
  // lookahead window (19) than the default (5).
  MaxLookAhead = MF.getRegInfo().isPhysRegUsed(PhysReg: AMDGPU::AGPR0) ? 19 : 5;
  RunLdsBranchVmemWARHazardFixup = shouldRunLdsBranchVmemWARHazardFixup(MF, ST);
}
82
// Discard all tracked emission history (called when the scheduler restarts a
// region).
void GCNHazardRecognizer::Reset() {
  EmittedInstrs.clear();
}
86
// Scheduler entry point: forward the SUnit's MachineInstr to the MI overload.
void GCNHazardRecognizer::EmitInstruction(SUnit *SU) {
  EmitInstruction(MI: SU->getInstr());
}
90
// Record \p MI as the instruction issued this cycle; bookkeeping into
// EmittedInstrs happens later in AdvanceCycle()/processBundle().
void GCNHazardRecognizer::EmitInstruction(MachineInstr *MI) {
  CurrCycleInstr = MI;
}
94
95static bool isDivFMas(unsigned Opcode) {
96 return Opcode == AMDGPU::V_DIV_FMAS_F32_e64 || Opcode == AMDGPU::V_DIV_FMAS_F64_e64;
97}
98
99static bool isSGetReg(unsigned Opcode) {
100 return Opcode == AMDGPU::S_GETREG_B32 || Opcode == AMDGPU::S_GETREG_B32_const;
101}
102
103static bool isSSetReg(unsigned Opcode) {
104 switch (Opcode) {
105 case AMDGPU::S_SETREG_B32:
106 case AMDGPU::S_SETREG_B32_mode:
107 case AMDGPU::S_SETREG_IMM32_B32:
108 case AMDGPU::S_SETREG_IMM32_B32_mode:
109 return true;
110 }
111 return false;
112}
113
114static bool isRWLane(unsigned Opcode) {
115 return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;
116}
117
// Matches s_rfe_b64 (return from exception), which restores status/mode.
static bool isRFE(unsigned Opcode) {
  return Opcode == AMDGPU::S_RFE_B64;
}
121
122static bool isSMovRel(unsigned Opcode) {
123 switch (Opcode) {
124 case AMDGPU::S_MOVRELS_B32:
125 case AMDGPU::S_MOVRELS_B64:
126 case AMDGPU::S_MOVRELD_B32:
127 case AMDGPU::S_MOVRELD_B64:
128 return true;
129 default:
130 return false;
131 }
132}
133
// Returns true if \p MI is s_sendmsg/s_sendmsghalt, s_ttracedata, or a DS
// instruction operating on GDS — the instruction classes that read M0 for
// message/trace/GDS purposes.
static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII,
                                    const MachineInstr &MI) {
  if (TII.isAlwaysGDS(Opcode: MI.getOpcode()))
    return true;

  switch (MI.getOpcode()) {
  case AMDGPU::S_SENDMSG:
  case AMDGPU::S_SENDMSGHALT:
  case AMDGPU::S_TTRACEDATA:
    return true;
  // These DS opcodes don't support GDS.
  case AMDGPU::DS_NOP:
  case AMDGPU::DS_PERMUTE_B32:
  case AMDGPU::DS_BPERMUTE_B32:
    return false;
  default:
    // Any other DS instruction with its gds bit set targets GDS.
    // NOTE(review): this assumes every DS opcode reaching here has a gds
    // operand (GDS != -1) — the explicit non-GDS opcodes are filtered above;
    // confirm against the DS instruction definitions.
    if (TII.isDS(Opcode: MI.getOpcode())) {
      int GDS = AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(),
                                           Name: AMDGPU::OpName::gds);
      if (MI.getOperand(i: GDS).getImm())
        return true;
    }
    return false;
  }
}
159
160static bool isPermlane(const MachineInstr &MI) {
161 unsigned Opcode = MI.getOpcode();
162 return Opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
163 Opcode == AMDGPU::V_PERMLANE64_B32 ||
164 Opcode == AMDGPU::V_PERMLANEX16_B32_e64 ||
165 Opcode == AMDGPU::V_PERMLANE16_VAR_B32_e64 ||
166 Opcode == AMDGPU::V_PERMLANEX16_VAR_B32_e64 ||
167 Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e32 ||
168 Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e64 ||
169 Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e32 ||
170 Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e64 ||
171 Opcode == AMDGPU::V_PERMLANE_BCAST_B32_e64 ||
172 Opcode == AMDGPU::V_PERMLANE_UP_B32_e64 ||
173 Opcode == AMDGPU::V_PERMLANE_DOWN_B32_e64 ||
174 Opcode == AMDGPU::V_PERMLANE_XOR_B32_e64 ||
175 Opcode == AMDGPU::V_PERMLANE_IDX_GEN_B32_e64;
176}
177
// Returns true for LDS DMA instructions: VALU-flagged MUBUF/FLAT operations
// (buffer/flat loads that deposit directly into LDS).
static bool isLdsDma(const MachineInstr &MI) {
  return SIInstrInfo::isVALU(MI) &&
         (SIInstrInfo::isMUBUF(MI) || SIInstrInfo::isFLAT(MI));
}
182
// Extract the hardware-register id from the simm16 operand of an
// s_getreg/s_setreg instruction (first field of the hwreg encoding).
static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
  const MachineOperand *RegOp = TII->getNamedOperand(MI: RegInstr,
                                                     OperandName: AMDGPU::OpName::simm16);
  return std::get<0>(t: AMDGPU::Hwreg::HwregEncoding::decode(Encoded: RegOp->getImm()));
}
188
// Scheduler query: classify whether issuing \p SU now would trigger a hazard.
// Returns NoHazard, Hazard (stall), or NoopHazard (must insert noops). Each
// hazard class is checked in turn against the emission history.
ScheduleHazardRecognizer::HazardType
GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
  MachineInstr *MI = SU->getInstr();
  // If we are not in "HazardRecognizerMode" and therefore not being run from
  // the scheduler, track possible stalls from hazards but don't insert noops.
  auto HazardType = IsHazardRecognizerMode ? NoopHazard : Hazard;

  // Bundle headers carry no hazards themselves; members are checked
  // individually in processBundle().
  if (MI->isBundle())
    return NoHazard;

  if (SIInstrInfo::isSMRD(MI: *MI) && checkSMRDHazards(SMRD: MI) > 0)
    return HazardType;

  if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0)
    return HazardType;

  if (checkFPAtomicToDenormModeHazard(MI) > 0)
    return HazardType;

  // Hazards which cannot be mitigated with S_NOPs.
  if (!IsHazardRecognizerMode) {
    if (checkWMMACoexecutionHazards(MI) > 0)
      return Hazard;
  }

  // Subtargets without data-dependence hazards skip all remaining checks.
  if (ST.hasNoDataDepHazard())
    return NoHazard;

  if (SIInstrInfo::isVMEM(MI: *MI) && checkVMEMHazards(VMEM: MI) > 0)
    return HazardType;

  if (SIInstrInfo::isVALU(MI: *MI) && checkVALUHazards(VALU: MI) > 0)
    return HazardType;

  if (SIInstrInfo::isDPP(MI: *MI) && checkDPPHazards(DPP: MI) > 0)
    return HazardType;

  if (isDivFMas(Opcode: MI->getOpcode()) && checkDivFMasHazards(DivFMas: MI) > 0)
    return HazardType;

  if (isRWLane(Opcode: MI->getOpcode()) && checkRWLaneHazards(RWLane: MI) > 0)
    return HazardType;

  if ((SIInstrInfo::isVALU(MI: *MI) || SIInstrInfo::isVMEM(MI: *MI) ||
       SIInstrInfo::isDS(MI: *MI) || SIInstrInfo::isEXP(MI: *MI)) &&
      checkMAIVALUHazards(MI) > 0)
    return HazardType;

  if (isSGetReg(Opcode: MI->getOpcode()) && checkGetRegHazards(GetRegInstr: MI) > 0)
    return HazardType;

  if (isSSetReg(Opcode: MI->getOpcode()) && checkSetRegHazards(SetRegInstr: MI) > 0)
    return HazardType;

  if (isRFE(Opcode: MI->getOpcode()) && checkRFEHazards(RFE: MI) > 0)
    return HazardType;

  // All the M0-reading instruction classes share one hazard check; which
  // classes apply depends on the subtarget's feature bits.
  if (((ST.hasReadM0MovRelInterpHazard() &&
        (TII.isVINTRP(MI: *MI) || isSMovRel(Opcode: MI->getOpcode()) ||
         MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
         MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
       (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, MI: *MI)) ||
       (ST.hasReadM0LdsDmaHazard() && isLdsDma(MI: *MI)) ||
       (ST.hasReadM0LdsDirectHazard() &&
        MI->readsRegister(Reg: AMDGPU::LDS_DIRECT, /*TRI=*/nullptr))) &&
      checkReadM0Hazards(SMovRel: MI) > 0)
    return HazardType;

  if (SIInstrInfo::isMAI(MI: *MI) && checkMAIHazards(MI) > 0)
    return HazardType;

  if ((SIInstrInfo::isVMEM(MI: *MI) || SIInstrInfo::isDS(MI: *MI)) &&
      checkMAILdStHazards(MI) > 0)
    return HazardType;

  if (MI->isInlineAsm() && checkInlineAsmHazards(IA: MI) > 0)
    return HazardType;

  return NoHazard;
}
269
// Insert \p Quantity wait states worth of s_nop before \p MI (inside its
// bundle). Each s_nop encodes up to 8 wait states; the immediate is the
// count minus one.
static void insertNoopsInBundle(MachineInstr *MI, const SIInstrInfo &TII,
                                unsigned Quantity) {
  while (Quantity > 0) {
    unsigned Arg = std::min(a: Quantity, b: 8u);
    Quantity -= Arg;
    BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(), MCID: TII.get(Opcode: AMDGPU::S_NOP))
        .addImm(Val: Arg - 1);
  }
}
279
// Number of cycles \p MI occupies its MFMA pipeline, taken from the sched
// model's first write-resource entry (ReleaseAtCycle).
unsigned
GCNHazardRecognizer::getMFMAPipelineWaitStates(const MachineInstr &MI) const {
  const MCSchedClassDesc *SC = TSchedModel.resolveSchedClass(MI: &MI);
  // The sched model must define at least one write resource for MFMAs.
  assert(TSchedModel.getWriteProcResBegin(SC) !=
         TSchedModel.getWriteProcResEnd(SC));
  return TSchedModel.getWriteProcResBegin(SC)->ReleaseAtCycle;
}
287
// Handle a BUNDLE header in CurrCycleInstr: walk every instruction inside the
// bundle, compute/fix its hazards, and record it (plus noop placeholders) in
// the emission history.
void GCNHazardRecognizer::processBundle() {
  MachineBasicBlock::instr_iterator MI = std::next(x: CurrCycleInstr->getIterator());
  MachineBasicBlock::instr_iterator E = CurrCycleInstr->getParent()->instr_end();
  // Check bundled MachineInstr's for hazards.
  for (; MI != E && MI->isInsideBundle(); ++MI) {
    CurrCycleInstr = &*MI;
    unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr);

    if (IsHazardRecognizerMode) {
      fixHazards(MI: CurrCycleInstr);

      insertNoopsInBundle(MI: CurrCycleInstr, TII, Quantity: WaitStates);
    }

    // It’s unnecessary to track more than MaxLookAhead instructions. Since we
    // include the bundled MI directly after, only add a maximum of
    // (MaxLookAhead - 1) noops to EmittedInstrs.
    for (unsigned i = 0, e = std::min(a: WaitStates, b: MaxLookAhead - 1); i < e; ++i)
      EmittedInstrs.push_front(x: nullptr);

    EmittedInstrs.push_front(x: CurrCycleInstr);
    EmittedInstrs.resize(new_size: MaxLookAhead);
  }
  CurrCycleInstr = nullptr;
}
313
// Standalone-mode driver for a single instruction: materialize the required
// pre-noops (into the bundle or the block, as appropriate), then advance the
// recognizer state past \p MI.
void GCNHazardRecognizer::runOnInstruction(MachineInstr *MI) {
  assert(IsHazardRecognizerMode);

  unsigned NumPreNoops = PreEmitNoops(MI);
  EmitNoops(Quantity: NumPreNoops);
  if (MI->isInsideBundle())
    insertNoopsInBundle(MI, TII, Quantity: NumPreNoops);
  else
    TII.insertNoops(MBB&: *MI->getParent(), MI: MachineBasicBlock::iterator(MI),
                    Quantity: NumPreNoops);
  EmitInstruction(MI);
  AdvanceCycle();
}
327
// Compute how many noops must precede \p MI and apply structural hazard fixes.
// Switches the recognizer into hazard-recognizer mode permanently. The result
// is clamped up to the -amdgpu-snop-padding debug floor.
unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
  IsHazardRecognizerMode = true;
  CurrCycleInstr = MI;
  unsigned W = PreEmitNoopsCommon(MI);
  fixHazards(MI);
  CurrCycleInstr = nullptr;
  return std::max(a: W, b: NopPadding.getValue());
}
336
337unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) const {
338 if (MI->isBundle())
339 return 0;
340
341 int WaitStates = 0;
342
343 if (SIInstrInfo::isSMRD(MI: *MI))
344 return std::max(a: WaitStates, b: checkSMRDHazards(SMRD: MI));
345
346 if (ST.hasNSAtoVMEMBug())
347 WaitStates = std::max(a: WaitStates, b: checkNSAtoVMEMHazard(MI));
348
349 WaitStates = std::max(a: WaitStates, b: checkFPAtomicToDenormModeHazard(MI));
350
351 if (ST.hasNoDataDepHazard())
352 return WaitStates;
353
354 if (SIInstrInfo::isVMEM(MI: *MI))
355 WaitStates = std::max(a: WaitStates, b: checkVMEMHazards(VMEM: MI));
356
357 if (SIInstrInfo::isVALU(MI: *MI))
358 WaitStates = std::max(a: WaitStates, b: checkVALUHazards(VALU: MI));
359
360 if (SIInstrInfo::isDPP(MI: *MI))
361 WaitStates = std::max(a: WaitStates, b: checkDPPHazards(DPP: MI));
362
363 if (isDivFMas(Opcode: MI->getOpcode()))
364 WaitStates = std::max(a: WaitStates, b: checkDivFMasHazards(DivFMas: MI));
365
366 if (isRWLane(Opcode: MI->getOpcode()))
367 WaitStates = std::max(a: WaitStates, b: checkRWLaneHazards(RWLane: MI));
368
369 if ((SIInstrInfo::isVALU(MI: *MI) || SIInstrInfo::isVMEM(MI: *MI) ||
370 SIInstrInfo::isDS(MI: *MI) || SIInstrInfo::isEXP(MI: *MI)) &&
371 checkMAIVALUHazards(MI) > 0)
372 WaitStates = std::max(a: WaitStates, b: checkMAIVALUHazards(MI));
373
374 if (MI->isInlineAsm())
375 return std::max(a: WaitStates, b: checkInlineAsmHazards(IA: MI));
376
377 if (isSGetReg(Opcode: MI->getOpcode()))
378 return std::max(a: WaitStates, b: checkGetRegHazards(GetRegInstr: MI));
379
380 if (isSSetReg(Opcode: MI->getOpcode()))
381 return std::max(a: WaitStates, b: checkSetRegHazards(SetRegInstr: MI));
382
383 if (isRFE(Opcode: MI->getOpcode()))
384 return std::max(a: WaitStates, b: checkRFEHazards(RFE: MI));
385
386 if ((ST.hasReadM0MovRelInterpHazard() &&
387 (TII.isVINTRP(MI: *MI) || isSMovRel(Opcode: MI->getOpcode()) ||
388 MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
389 MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
390 (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, MI: *MI)) ||
391 (ST.hasReadM0LdsDmaHazard() && isLdsDma(MI: *MI)) ||
392 (ST.hasReadM0LdsDirectHazard() &&
393 MI->readsRegister(Reg: AMDGPU::LDS_DIRECT, /*TRI=*/nullptr)))
394 return std::max(a: WaitStates, b: checkReadM0Hazards(SMovRel: MI));
395
396 if (SIInstrInfo::isMAI(MI: *MI))
397 return std::max(a: WaitStates, b: checkMAIHazards(MI));
398
399 if (SIInstrInfo::isVMEM(MI: *MI) || SIInstrInfo::isDS(MI: *MI))
400 return std::max(a: WaitStates, b: checkMAILdStHazards(MI));
401
402 if (ST.hasGFX950Insts() && isPermlane(MI: *MI))
403 return std::max(a: WaitStates, b: checkPermlaneHazards(MI));
404
405 return WaitStates;
406}
407
// Record one noop cycle in the emission history (nullptr = one wait state).
void GCNHazardRecognizer::EmitNoop() {
  EmittedInstrs.push_front(x: nullptr);
}
411
// Advance the recognizer by one cycle: fold CurrCycleInstr (plus any extra
// wait states it consumes) into the bounded emission history.
void GCNHazardRecognizer::AdvanceCycle() {
  // When the scheduler detects a stall, it will call AdvanceCycle() without
  // emitting any instructions.
  if (!CurrCycleInstr) {
    EmittedInstrs.push_front(x: nullptr);
    return;
  }

  // Bundles are expanded member-by-member.
  if (CurrCycleInstr->isBundle()) {
    processBundle();
    return;
  }

  // Zero-wait-state instructions (e.g. meta instructions) leave no trace in
  // the history.
  unsigned NumWaitStates = TII.getNumWaitStates(MI: *CurrCycleInstr);
  if (!NumWaitStates) {
    CurrCycleInstr = nullptr;
    return;
  }

  // Keep track of emitted instructions
  EmittedInstrs.push_front(x: CurrCycleInstr);

  // Add a nullptr for each additional wait state after the first. Make sure
  // not to add more than getMaxLookAhead() items to the list, since we
  // truncate the list to that size right after this loop.
  for (unsigned i = 1, e = std::min(a: NumWaitStates, b: getMaxLookAhead());
       i < e; ++i) {
    EmittedInstrs.push_front(x: nullptr);
  }

  // getMaxLookahead() is the largest number of wait states we will ever need
  // to insert, so there is no point in keeping track of more than that many
  // wait states.
  EmittedInstrs.resize(new_size: getMaxLookAhead());

  CurrCycleInstr = nullptr;
}
449
// Bottom-up scheduling hook — intentionally unsupported here.
void GCNHazardRecognizer::RecedeCycle() {
  assert(!IsHazardRecognizerMode &&
         "Bottom-up scheduling shouldn't run in hazard recognizer mode");
}
454
455//===----------------------------------------------------------------------===//
456// Helper Functions
457//===----------------------------------------------------------------------===//
458
// Tri-state result for hazard-search predicates: a hazard was found, the
// search window expired (stop scanning this path), or keep scanning.
enum HazardFnResult { HazardFound, HazardExpired, NoHazardFound };
460
461// Search for a hazard in a block and its predecessors.
462template <typename StateT>
463static bool
464hasHazard(StateT InitialState,
465 function_ref<HazardFnResult(StateT &, const MachineInstr &)> IsHazard,
466 function_ref<void(StateT &, const MachineInstr &)> UpdateState,
467 const MachineBasicBlock *InitialMBB,
468 MachineBasicBlock::const_reverse_instr_iterator InitialI) {
469 struct StateMapKey {
470 SmallVectorImpl<StateT> *States;
471 unsigned Idx;
472 static bool isEqual(const StateMapKey &LHS, const StateMapKey &RHS) {
473 return LHS.States == RHS.States && LHS.Idx == RHS.Idx;
474 }
475 };
476 struct StateMapKeyTraits : DenseMapInfo<StateMapKey> {
477 static inline StateMapKey getEmptyKey() {
478 return {static_cast<SmallVectorImpl<StateT> *>(
479 DenseMapInfo<void *>::getEmptyKey()),
480 DenseMapInfo<unsigned>::getEmptyKey()};
481 }
482 static inline StateMapKey getTombstoneKey() {
483 return {static_cast<SmallVectorImpl<StateT> *>(
484 DenseMapInfo<void *>::getTombstoneKey()),
485 DenseMapInfo<unsigned>::getTombstoneKey()};
486 }
487 static unsigned getHashValue(const StateMapKey &Key) {
488 return StateT::getHashValue((*Key.States)[Key.Idx]);
489 }
490 static unsigned getHashValue(const StateT &State) {
491 return StateT::getHashValue(State);
492 }
493 static bool isEqual(const StateMapKey &LHS, const StateMapKey &RHS) {
494 const auto EKey = getEmptyKey();
495 const auto TKey = getTombstoneKey();
496 if (StateMapKey::isEqual(LHS, EKey) || StateMapKey::isEqual(RHS, EKey) ||
497 StateMapKey::isEqual(LHS, TKey) || StateMapKey::isEqual(RHS, TKey))
498 return StateMapKey::isEqual(LHS, RHS);
499 return StateT::isEqual((*LHS.States)[LHS.Idx], (*RHS.States)[RHS.Idx]);
500 }
501 static bool isEqual(const StateT &LHS, const StateMapKey &RHS) {
502 if (StateMapKey::isEqual(RHS, getEmptyKey()) ||
503 StateMapKey::isEqual(RHS, getTombstoneKey()))
504 return false;
505 return StateT::isEqual(LHS, (*RHS.States)[RHS.Idx]);
506 }
507 };
508
509 SmallDenseMap<StateMapKey, unsigned, 8, StateMapKeyTraits> StateMap;
510 SmallVector<StateT, 8> States;
511
512 MachineBasicBlock::const_reverse_instr_iterator I = InitialI;
513 const MachineBasicBlock *MBB = InitialMBB;
514 StateT State = InitialState;
515
516 SmallSetVector<std::pair<const MachineBasicBlock *, unsigned>, 16> Worklist;
517 unsigned WorkIdx = 0;
518 for (;;) {
519 bool Expired = false;
520 for (auto E = MBB->instr_rend(); I != E; ++I) {
521 // No need to look at parent BUNDLE instructions.
522 if (I->isBundle())
523 continue;
524
525 auto Result = IsHazard(State, *I);
526 if (Result == HazardFound)
527 return true;
528 if (Result == HazardExpired) {
529 Expired = true;
530 break;
531 }
532
533 if (I->isInlineAsm() || I->isMetaInstruction())
534 continue;
535
536 UpdateState(State, *I);
537 }
538
539 if (!Expired) {
540 unsigned StateIdx = States.size();
541 StateMapKey Key = {&States, StateIdx};
542 auto Insertion = StateMap.insert_as(std::pair(Key, StateIdx), State);
543 if (Insertion.second) {
544 States.emplace_back(State);
545 } else {
546 StateIdx = Insertion.first->second;
547 }
548 for (MachineBasicBlock *Pred : MBB->predecessors())
549 Worklist.insert(X: std::pair(Pred, StateIdx));
550 }
551
552 if (WorkIdx == Worklist.size())
553 break;
554
555 unsigned StateIdx;
556 std::tie(args&: MBB, args&: StateIdx) = Worklist[WorkIdx++];
557 State = States[StateIdx];
558 I = MBB->instr_rbegin();
559 }
560
561 return false;
562}
563
564// Returns a minimum wait states since \p I walking all predecessors.
565// Only scans until \p IsExpired does not return true.
566// Can only be run in a hazard recognizer mode.
567static int
568getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
569 const MachineBasicBlock *MBB,
570 MachineBasicBlock::const_reverse_instr_iterator I,
571 int WaitStates, GCNHazardRecognizer::IsExpiredFn IsExpired,
572 DenseSet<const MachineBasicBlock *> &Visited,
573 GCNHazardRecognizer::GetNumWaitStatesFn GetNumWaitStates =
574 SIInstrInfo::getNumWaitStates) {
575 for (auto E = MBB->instr_rend(); I != E; ++I) {
576 // Don't add WaitStates for parent BUNDLE instructions.
577 if (I->isBundle())
578 continue;
579
580 if (IsHazard(*I))
581 return WaitStates;
582
583 if (I->isInlineAsm())
584 continue;
585
586 WaitStates += GetNumWaitStates(*I);
587
588 if (IsExpired(*I, WaitStates))
589 return std::numeric_limits<int>::max();
590 }
591
592 int MinWaitStates = std::numeric_limits<int>::max();
593 for (MachineBasicBlock *Pred : MBB->predecessors()) {
594 if (!Visited.insert(V: Pred).second)
595 continue;
596
597 int W = getWaitStatesSince(IsHazard, MBB: Pred, I: Pred->instr_rbegin(), WaitStates,
598 IsExpired, Visited, GetNumWaitStates);
599
600 MinWaitStates = std::min(a: MinWaitStates, b: W);
601 }
602
603 return MinWaitStates;
604}
605
// Convenience wrapper: start the backwards search just before \p MI with a
// fresh visited set and zero accumulated wait states.
static int
getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
                   const MachineInstr *MI,
                   GCNHazardRecognizer::IsExpiredFn IsExpired,
                   GCNHazardRecognizer::GetNumWaitStatesFn GetNumWaitStates =
                       SIInstrInfo::getNumWaitStates) {
  DenseSet<const MachineBasicBlock *> Visited;
  return getWaitStatesSince(IsHazard, MBB: MI->getParent(),
                            I: std::next(x: MI->getReverseIterator()), WaitStates: 0, IsExpired,
                            Visited, GetNumWaitStates);
}
617
// Member front-end: in hazard-recognizer mode do a CFG walk from
// CurrCycleInstr bounded by \p Limit; in scheduler mode scan the bounded
// EmittedInstrs history instead. Returns INT_MAX if no match within Limit.
int GCNHazardRecognizer::getWaitStatesSince(
    IsHazardFn IsHazard, int Limit, GetNumWaitStatesFn GetNumWaitStates) const {
  if (IsHazardRecognizerMode) {
    auto IsExpiredFn = [Limit](const MachineInstr &, int WaitStates) {
      return WaitStates >= Limit;
    };
    return ::getWaitStatesSince(IsHazard, MI: CurrCycleInstr, IsExpired: IsExpiredFn,
                                GetNumWaitStates);
  }

  int WaitStates = 0;
  for (MachineInstr *MI : EmittedInstrs) {
    if (MI) {
      if (IsHazard(*MI))
        return WaitStates;

      if (MI->isInlineAsm())
        continue;
    }
    // nullptr entries in the history each stand for one noop wait state.
    WaitStates += MI ? GetNumWaitStates(*MI) : 1;

    if (WaitStates >= Limit)
      break;
  }
  return std::numeric_limits<int>::max();
}
644
// Overload using the default per-instruction wait-state model.
int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard,
                                            int Limit) const {
  return getWaitStatesSince(IsHazard, Limit, GetNumWaitStates: SIInstrInfo::getNumWaitStates);
}
649
// Wait states since the most recent instruction that both satisfies
// \p IsHazardDef and modifies \p Reg (or an overlapping register).
int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
                                               IsHazardFn IsHazardDef,
                                               int Limit) const {
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  auto IsHazardFn = [IsHazardDef, TRI, Reg](const MachineInstr &MI) {
    return IsHazardDef(MI) && MI.modifiesRegister(Reg, TRI);
  };

  return getWaitStatesSince(IsHazard: IsHazardFn, Limit);
}
661
// Wait states since the most recent s_setreg that also satisfies \p IsHazard.
int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
                                                  int Limit) const {
  auto IsHazardFn = [IsHazard](const MachineInstr &MI) {
    return isSSetReg(Opcode: MI.getOpcode()) && IsHazard(MI);
  };

  return getWaitStatesSince(IsHazard: IsHazardFn, Limit);
}
670
671//===----------------------------------------------------------------------===//
672// No-op Hazard Detection
673//===----------------------------------------------------------------------===//
674
// Set the bit for every register unit of \p Reg in \p BV.
static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV,
                        MCRegister Reg) {
  for (MCRegUnit Unit : TRI.regunits(Reg))
    BV.set(static_cast<unsigned>(Unit));
}
680
// Accumulate the register units of every register operand in \p Ops into
// \p DefSet (for defs) or \p UseSet (for uses).
static void addRegsToSet(const SIRegisterInfo &TRI,
                         iterator_range<MachineInstr::const_mop_iterator> Ops,
                         BitVector &DefSet, BitVector &UseSet) {
  for (const MachineOperand &Op : Ops) {
    if (Op.isReg())
      addRegUnits(TRI, BV&: Op.isDef() ? DefSet : UseSet, Reg: Op.getReg().asMCReg());
  }
}
689
// Fold \p MI's operands into the current clause's def/use register-unit sets.
void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) const {
  addRegsToSet(TRI, Ops: MI.operands(), DefSet&: ClauseDefs, UseSet&: ClauseUses);
}
693
// Any non-SMRD instruction ends an SMEM soft clause.
static bool breaksSMEMSoftClause(MachineInstr *MI) {
  return !SIInstrInfo::isSMRD(MI: *MI);
}
697
// Any non-VMEM instruction ends a VMEM soft clause.
static bool breaksVMEMSoftClause(MachineInstr *MI) {
  return !SIInstrInfo::isVMEM(MI: *MI);
}
701
// Return 1 if adding \p MEM to the in-progress soft clause would create a
// def/use overlap (XNACK replay hazard), 0 otherwise. A nonzero result tells
// the caller to break the clause before \p MEM.
int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) const {
  // SMEM soft clause are only present on VI+, and only matter if xnack is
  // enabled.
  if (!ST.isXNACKEnabled())
    return 0;

  bool IsSMRD = TII.isSMRD(MI: *MEM);

  resetClause();

  // A soft-clause is any group of consecutive SMEM instructions. The
  // instructions in this group may return out of order and/or may be
  // replayed (i.e. the same instruction issued more than once).
  //
  // In order to handle these situations correctly we need to make sure that
  // when a clause has more than one instruction, no instruction in the clause
  // writes to a register that is read by another instruction in the clause
  // (including itself). If we encounter this situation, we need to break the
  // clause by inserting a non SMEM instruction.

  for (MachineInstr *MI : EmittedInstrs) {
    // When we hit a non-SMEM instruction then we have passed the start of the
    // clause and we can stop.
    if (!MI)
      break;

    if (IsSMRD ? breaksSMEMSoftClause(MI) : breaksVMEMSoftClause(MI))
      break;

    addClauseInst(MI: *MI);
  }

  // No defs so far means no possible def/use overlap.
  if (ClauseDefs.none())
    return 0;

  // We need to make sure not to put loads and stores in the same clause if they
  // use the same address. For now, just start a new clause whenever we see a
  // store.
  if (MEM->mayStore())
    return 1;

  addClauseInst(MI: *MEM);

  // If the set of defs and uses intersect then we cannot add this instruction
  // to the clause, so we have a hazard.
  return ClauseDefs.anyCommon(RHS: ClauseUses) ? 1 : 0;
}
749
// Wait states needed before an SMRD instruction: soft-clause breaks plus (on
// SI only) the VALU-writes-SGPR-read-by-SMRD hazard.
int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) const {
  int WaitStatesNeeded = 0;

  WaitStatesNeeded = checkSoftClauseHazards(MEM: SMRD);

  // This SMRD hazard only affects SI.
  if (!ST.hasSMRDReadVALUDefHazard())
    return WaitStatesNeeded;

  // A read of an SGPR by SMRD instruction requires 4 wait states when the
  // SGPR was written by a VALU instruction.
  int SmrdSgprWaitStates = 4;
  auto IsHazardDefFn = [this](const MachineInstr &MI) {
    return TII.isVALU(MI);
  };
  auto IsBufferHazardDefFn = [this](const MachineInstr &MI) {
    return TII.isSALU(MI);
  };

  bool IsBufferSMRD = TII.isBufferSMRD(MI: *SMRD);

  for (const MachineOperand &Use : SMRD->uses()) {
    if (!Use.isReg())
      continue;
    int WaitStatesNeededForUse =
        SmrdSgprWaitStates - getWaitStatesSinceDef(Reg: Use.getReg(), IsHazardDef: IsHazardDefFn,
                                                   Limit: SmrdSgprWaitStates);
    WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);

    // This fixes what appears to be undocumented hardware behavior in SI where
    // s_mov writing a descriptor and s_buffer_load_dword reading the descriptor
    // needs some number of nops in between. We don't know how many we need, but
    // let's use 4. This wasn't discovered before probably because the only
    // case when this happens is when we expand a 64-bit pointer into a full
    // descriptor and use s_buffer_load_dword instead of s_load_dword, which was
    // probably never encountered in the closed-source land.
    if (IsBufferSMRD) {
      int WaitStatesNeededForUse =
        SmrdSgprWaitStates - getWaitStatesSinceDef(Reg: Use.getReg(),
                                                   IsHazardDef: IsBufferHazardDefFn,
                                                   Limit: SmrdSgprWaitStates);
      WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
    }
  }

  return WaitStatesNeeded;
}
797
// Wait states needed before a VMEM instruction: soft-clause breaks plus the
// VALU-writes-SGPR-read-by-VMEM hazard (5 wait states on affected targets).
int GCNHazardRecognizer::checkVMEMHazards(MachineInstr *VMEM) const {
  if (!ST.hasVMEMReadSGPRVALUDefHazard())
    return 0;

  int WaitStatesNeeded = checkSoftClauseHazards(MEM: VMEM);

  // A read of an SGPR by a VMEM instruction requires 5 wait states when the
  // SGPR was written by a VALU Instruction.
  const int VmemSgprWaitStates = 5;
  auto IsHazardDefFn = [this](const MachineInstr &MI) {
    return TII.isVALU(MI);
  };
  for (const MachineOperand &Use : VMEM->uses()) {
    // Only scalar (non-vector) register uses are subject to this hazard.
    if (!Use.isReg() || TRI.isVectorRegister(MRI: MF.getRegInfo(), Reg: Use.getReg()))
      continue;

    int WaitStatesNeededForUse =
        VmemSgprWaitStates - getWaitStatesSinceDef(Reg: Use.getReg(), IsHazardDef: IsHazardDefFn,
                                                   Limit: VmemSgprWaitStates);
    WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
  }
  return WaitStatesNeeded;
}
821
// Wait states needed before a DPP instruction: 2 after any write of a VGPR it
// reads, and 5 after a VALU write of EXEC.
int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) const {
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();

  // Check for DPP VGPR read after VALU VGPR write and EXEC write.
  int DppVgprWaitStates = 2;
  int DppExecWaitStates = 5;
  int WaitStatesNeeded = 0;
  auto IsHazardDefFn = [TII](const MachineInstr &MI) {
    return TII->isVALU(MI);
  };

  for (const MachineOperand &Use : DPP->uses()) {
    if (!Use.isReg() || !TRI->isVGPR(MRI: MF.getRegInfo(), Reg: Use.getReg()))
      continue;
    int WaitStatesNeededForUse =
        DppVgprWaitStates - getWaitStatesSinceDef(
                                Reg: Use.getReg(),
                                // Any def of the VGPR counts, not just VALU.
                                IsHazardDef: [](const MachineInstr &) { return true; },
                                Limit: DppVgprWaitStates);
    WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
  }

  WaitStatesNeeded = std::max(
      a: WaitStatesNeeded,
      b: DppExecWaitStates - getWaitStatesSinceDef(Reg: AMDGPU::EXEC, IsHazardDef: IsHazardDefFn,
                                                Limit: DppExecWaitStates));

  return WaitStatesNeeded;
}
852
// Wait states needed before v_div_fmas: 4 after a VALU write of VCC. The
// result may be <= 0 (no hazard); callers test for > 0.
int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) const {
  const SIInstrInfo *TII = ST.getInstrInfo();

  // v_div_fmas requires 4 wait states after a write to vcc from a VALU
  // instruction.
  const int DivFMasWaitStates = 4;
  auto IsHazardDefFn = [TII](const MachineInstr &MI) {
    return TII->isVALU(MI);
  };
  int WaitStatesNeeded = getWaitStatesSinceDef(Reg: AMDGPU::VCC, IsHazardDef: IsHazardDefFn,
                                               Limit: DivFMasWaitStates);

  return DivFMasWaitStates - WaitStatesNeeded;
}
867
// Wait states needed before s_getreg: 2 after an s_setreg of the same
// hardware register. Result may be <= 0 (no hazard).
int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) const {
  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned GetRegHWReg = getHWReg(TII, RegInstr: *GetRegInstr);

  const int GetRegWaitStates = 2;
  auto IsHazardFn = [TII, GetRegHWReg](const MachineInstr &MI) {
    return GetRegHWReg == getHWReg(TII, RegInstr: MI);
  };
  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazard: IsHazardFn, Limit: GetRegWaitStates);

  return GetRegWaitStates - WaitStatesNeeded;
}
880
// Wait states needed before s_setreg: a subtarget-dependent count after a
// previous s_setreg of the same hardware register. Result may be <= 0.
int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) const {
  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned HWReg = getHWReg(TII, RegInstr: *SetRegInstr);

  const int SetRegWaitStates = ST.getSetRegWaitStates();
  auto IsHazardFn = [TII, HWReg](const MachineInstr &MI) {
    return HWReg == getHWReg(TII, RegInstr: MI);
  };
  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazard: IsHazardFn, Limit: SetRegWaitStates);
  return SetRegWaitStates - WaitStatesNeeded;
}
892
// If \p MI is a store whose vdata operand can be clobbered by the next VALU
// instruction (the ">64-bit store data" hazard), return the vdata operand
// index; otherwise return -1.
int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) const {
  // Only stores carry this hazard.
  if (!MI.mayStore())
    return -1;

  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned Opcode = MI.getOpcode();
  const MCInstrDesc &Desc = MI.getDesc();

  int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, Name: AMDGPU::OpName::vdata);
  int VDataRCID = -1;
  if (VDataIdx != -1)
    VDataRCID = TII->getOpRegClassID(OpInfo: Desc.operands()[VDataIdx]);

  if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {
    // There is no hazard if the instruction does not use vector regs
    // (like wbinvl1)
    if (VDataIdx == -1)
      return -1;
    // For MUBUF/MTBUF instructions this hazard only exists if the
    // instruction is not using a register in the soffset field.
    const MachineOperand *SOffset =
        TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::soffset);
    // If we have no soffset operand, then assume this field has been
    // hardcoded to zero.
    if (AMDGPU::getRegBitWidth(RCID: VDataRCID) > 64 &&
        (!SOffset || !SOffset->isReg()))
      return VDataIdx;
  }

  // MIMG instructions create a hazard if they don't use a 256-bit T# and
  // the store size is greater than 8 bytes and they have more than two bits
  // of their dmask set.
  // All our MIMG definitions use a 256-bit T#, so we can skip checking for them.
  if (TII->isMIMG(MI)) {
    int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, Name: AMDGPU::OpName::srsrc);
    assert(SRsrcIdx != -1 && AMDGPU::getRegBitWidth(TII->getOpRegClassID(
                                 Desc.operands()[SRsrcIdx])) == 256);
    (void)SRsrcIdx;
  }

  if (TII->isFLAT(MI)) {
    // There is no hazard if the instruction does not use vector regs
    if (VDataIdx == -1)
      return -1;

    // FLAT stores wider than 64 bits expose the hazard on their vdata.
    if (AMDGPU::getRegBitWidth(RCID: VDataRCID) > 64)
      return VDataIdx;
  }

  return -1;
}
944
// Check a single def operand \p Def of a VALU against the hazard where VMEM
// instructions that store more than 8 bytes can have their store data
// overwritten by the next instruction. \returns the wait states needed.
int GCNHazardRecognizer::checkVALUHazardsHelper(
    const MachineOperand &Def, const MachineRegisterInfo &MRI) const {
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  // GFX940 needs one extra wait state for this hazard.
  const int VALUWaitStates = ST.hasGFX940Insts() ? 2 : 1;
  int WaitStatesNeeded = 0;

  // Only a write to a vector register can clobber in-flight store data.
  if (!TRI->isVectorRegister(MRI, Reg: Def.getReg()))
    return WaitStatesNeeded;
  Register Reg = Def.getReg();
  auto IsHazardFn = [this, Reg, TRI](const MachineInstr &MI) {
    // Hazard if a recent store's vdata operand overlaps the defined register.
    int DataIdx = createsVALUHazard(MI);
    return DataIdx >= 0 &&
           TRI->regsOverlap(RegA: MI.getOperand(i: DataIdx).getReg(), RegB: Reg);
  };

  int WaitStatesNeededForDef =
      VALUWaitStates - getWaitStatesSince(IsHazard: IsHazardFn, Limit: VALUWaitStates);
  WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForDef);

  return WaitStatesNeeded;
}
969
/// Dest sel forwarding issue occurs if additional logic is needed to swizzle /
/// pack the computed value into correct bit position of the dest register. This
/// occurs if we have SDWA with dst_sel != DWORD or if we have op_sel with
/// dst_sel that is not aligned to the register. This function analyzes the \p
/// MI and \returns an operand with dst forwarding issue, or nullptr if
/// none exists.
static const MachineOperand *
getDstSelForwardingOperand(const MachineInstr &MI, const GCNSubtarget &ST) {
  if (!SIInstrInfo::isVALU(MI))
    return nullptr;

  const SIInstrInfo *TII = ST.getInstrInfo();

  unsigned Opcode = MI.getOpcode();

  // There are three different types of instructions which produce a
  // forwarded dest:
  //   1. SDWA with dst_sel != DWORD,
  //   2. VOP3 which writes the hi bits (e.g. op_sel[3] == 1), and
  //   3. FP8DstSelInst (instructions with dest byte sel, e.g. CVT_SR_BF8_F32)
  //      with op_sel[3:2] != 0.
  if (SIInstrInfo::isSDWA(MI)) {
    // Type 1: SDWA with dst_sel != DWORD
    if (auto *DstSel = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::dst_sel))
      if (DstSel->getImm() != AMDGPU::SDWA::DWORD)
        return TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::vdst);
  }

  AMDGPU::FPType IsFP4OrFP8ConvOpc = AMDGPU::getFPDstSelType(Opc: Opcode);
  if (AMDGPU::hasNamedOperand(Opcode, NamedIdx: AMDGPU::OpName::op_sel)) {
    // Type 2: VOP3 which write the hi bits
    if (TII->getNamedImmOperand(MI, OperandName: AMDGPU::OpName::src0_modifiers) &
        SISrcMods::DST_OP_SEL)
      return TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::vdst);

    // Type 3: FP8DstSelInst with op_sel[3:2] != 0)
    if (IsFP4OrFP8ConvOpc == AMDGPU::FPType::FP8 &&
        (TII->getNamedImmOperand(MI, OperandName: AMDGPU::OpName::src2_modifiers) &
         SISrcMods::OP_SEL_0))
      return TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::vdst);
  }

  // Special case: nop is required for all the opsel values for fp4 sr variant
  // cvt scale instructions
  if (IsFP4OrFP8ConvOpc == AMDGPU::FPType::FP4)
    return TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::vdst);

  return nullptr;
}
1019
/// Checks whether the provided \p MI "consumes" the operand with a Dest sel
/// forwarding issue \p Dst . We may "consume" the Dst via a standard explicit
/// RAW, or through irregular ways (e.g implicit RAW, certain types of WAW)
static bool consumesDstSelForwardingOperand(const MachineInstr *VALU,
                                            const MachineOperand *Dst,
                                            const SIRegisterInfo *TRI) {
  // We must consider implicit reads of the VALU. SDWA with dst_sel and
  // UNUSED_PRESERVE will implicitly read the result from forwarded dest,
  // and we must account for that hazard.
  // We also must account for WAW hazards. In particular, WAW with dest
  // preserve semantics (e.g. VOP3 with op_sel, VOP2 &&
  // !zeroesHigh16BitsOfDest) will read the forwarded dest for parity
  // check for ECC. Without accounting for this hazard, the ECC will be
  // wrong.
  // TODO: limit to RAW (including implicit reads) + problematic WAW (i.e.
  // complete zeroesHigh16BitsOfDest)
  // Conservatively: any register operand (use or def, explicit or implicit)
  // overlapping Dst counts as a consumer.
  for (auto &Operand : VALU->operands()) {
    if (Operand.isReg() && TRI->regsOverlap(RegA: Dst->getReg(), RegB: Operand.getReg())) {
      return true;
    }
  }
  return false;
}
1043
// Compute the wait states needed before \p VALU to avoid the VALU-related
// hazards handled below: TRANS-result forwarding, dst_sel/op_sel destination
// forwarding, VALU SGPR-write consumption, readlane/writelane EXEC and
// lane-select reads, and the 12-dword store data hazard.
int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) const {
  int WaitStatesNeeded = 0;

  if (ST.hasTransForwardingHazard() && !SIInstrInfo::isTRANS(MI: *VALU)) {
    const int TransDefWaitstates = 1;

    // Hazard: this VALU consumes the vdst of a recent TRANS instruction.
    auto IsTransDefFn = [this, VALU](const MachineInstr &MI) {
      if (!SIInstrInfo::isTRANS(MI))
        return false;
      const SIRegisterInfo *TRI = ST.getRegisterInfo();
      const SIInstrInfo *TII = ST.getInstrInfo();
      Register Def = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::vdst)->getReg();

      for (const MachineOperand &Use : VALU->explicit_uses()) {
        if (Use.isReg() && TRI->regsOverlap(RegA: Def, RegB: Use.getReg()))
          return true;
      }

      return false;
    };

    int WaitStatesNeededForDef =
        TransDefWaitstates -
        getWaitStatesSince(IsHazard: IsTransDefFn, Limit: TransDefWaitstates);
    WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForDef);
  }

  if (ST.hasDstSelForwardingHazard() || ST.hasCvtScaleForwardingHazard()) {
    const int Shift16DefWaitstates = 1;

    // Hazard: a recent producer with a dst-sel forwarding issue (see
    // getDstSelForwardingOperand) whose result this VALU consumes.
    auto IsShift16BitDefFn = [this, VALU](const MachineInstr &ProducerMI) {
      const SIRegisterInfo *TRI = ST.getRegisterInfo();
      const MachineOperand *ForwardedDst =
          getDstSelForwardingOperand(MI: ProducerMI, ST);
      if (ForwardedDst) {
        return consumesDstSelForwardingOperand(VALU, Dst: ForwardedDst, TRI);
      }

      if (ProducerMI.isInlineAsm()) {
        // Assume inline asm has dst forwarding hazard
        for (auto &Def : ProducerMI.all_defs()) {
          if (consumesDstSelForwardingOperand(VALU, Dst: &Def, TRI))
            return true;
        }
      }

      return false;
    };

    int WaitStatesNeededForDef =
        Shift16DefWaitstates -
        getWaitStatesSince(IsHazard: IsShift16BitDefFn, Limit: Shift16DefWaitstates);
    WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForDef);
  }

  if (ST.hasVDecCoExecHazard()) {
    const int VALUWriteSGPRVALUReadWaitstates = 2;
    const int VALUWriteEXECRWLane = 4;
    const int VALUWriteVGPRReadlaneRead = 1;

    const SIRegisterInfo *TRI = ST.getRegisterInfo();
    const MachineRegisterInfo &MRI = MF.getRegInfo();
    Register UseReg;
    // Hazard: a recent VALU modified UseReg. UseReg is rebound below for
    // each individual check before the predicate is evaluated.
    auto IsVALUDefSGPRFn = [&UseReg, TRI](const MachineInstr &MI) {
      if (!SIInstrInfo::isVALU(MI))
        return false;
      return MI.modifiesRegister(Reg: UseReg, TRI);
    };

    // Check each explicit SGPR use against recent VALU SGPR writes.
    for (const MachineOperand &Use : VALU->explicit_uses()) {
      if (!Use.isReg())
        continue;

      UseReg = Use.getReg();
      if (TRI->isSGPRReg(MRI, Reg: UseReg)) {
        int WaitStatesNeededForDef =
            VALUWriteSGPRVALUReadWaitstates -
            getWaitStatesSince(IsHazard: IsVALUDefSGPRFn,
                               Limit: VALUWriteSGPRVALUReadWaitstates);
        WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForDef);
      }
    }

    // VCC reads are typically implicit, so check them separately.
    if (VALU->readsRegister(Reg: AMDGPU::VCC, TRI)) {
      UseReg = AMDGPU::VCC;
      int WaitStatesNeededForDef =
          VALUWriteSGPRVALUReadWaitstates -
          getWaitStatesSince(IsHazard: IsVALUDefSGPRFn, Limit: VALUWriteSGPRVALUReadWaitstates);
      WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForDef);
    }

    switch (VALU->getOpcode()) {
    case AMDGPU::V_READLANE_B32:
    case AMDGPU::V_READFIRSTLANE_B32: {
      // Readlane also reads its VGPR source, which may have a recent VALU def.
      MachineOperand *Src = TII.getNamedOperand(MI&: *VALU, OperandName: AMDGPU::OpName::src0);
      UseReg = Src->getReg();
      int WaitStatesNeededForDef =
          VALUWriteVGPRReadlaneRead -
          getWaitStatesSince(IsHazard: IsVALUDefSGPRFn, Limit: VALUWriteVGPRReadlaneRead);
      WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForDef);
    }
    // Fall through: readlane variants also read EXEC, like writelane.
    [[fallthrough]];
    case AMDGPU::V_WRITELANE_B32: {
      UseReg = AMDGPU::EXEC;
      int WaitStatesNeededForDef =
          VALUWriteEXECRWLane -
          getWaitStatesSince(IsHazard: IsVALUDefSGPRFn, Limit: VALUWriteEXECRWLane);
      WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForDef);
      break;
    }
    default:
      break;
    }
  }

  // This checks for the hazard where VMEM instructions that store more than
  // 8 bytes can have their store data overwritten by the next instruction.
  if (!ST.has12DWordStoreHazard())
    return WaitStatesNeeded;

  const MachineRegisterInfo &MRI = MF.getRegInfo();

  for (const MachineOperand &Def : VALU->defs()) {
    WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: checkVALUHazardsHelper(Def, MRI));
  }

  return WaitStatesNeeded;
}
1172
int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) const {
  // This checks for hazards associated with inline asm statements.
  // Since inline asms can contain just about anything, we use this
  // to call/leverage other check*Hazard routines. Note that
  // this function doesn't attempt to address all possible inline asm
  // hazards (good luck), but is a collection of what has been
  // problematic thus far.

  // see checkVALUHazards()
  if (!ST.has12DWordStoreHazard() && !ST.hasDstSelForwardingHazard() &&
      !ST.hasCvtScaleForwardingHazard())
    return 0;

  const MachineRegisterInfo &MRI = MF.getRegInfo();
  int WaitStatesNeeded = 0;

  // Check each vector-register def of the inline asm for the 12-dword store
  // data hazard. Skip the leading non-operand fields of the INLINEASM MI.
  for (const MachineOperand &Op :
       llvm::drop_begin(RangeOrContainer: IA->operands(), N: InlineAsm::MIOp_FirstOperand)) {
    if (Op.isReg() && Op.isDef()) {
      if (!TRI.isVectorRegister(MRI, Reg: Op.getReg()))
        continue;

      if (ST.has12DWordStoreHazard()) {
        WaitStatesNeeded =
            std::max(a: WaitStatesNeeded, b: checkVALUHazardsHelper(Def: Op, MRI));
      }
    }
  }

  if (ST.hasDstSelForwardingHazard()) {
    const int Shift16DefWaitstates = 1;

    // Hazard: a recent producer with a dst-sel forwarding issue whose result
    // the inline asm may read or overwrite.
    auto IsShift16BitDefFn = [this, &IA](const MachineInstr &ProducerMI) {
      const MachineOperand *Dst = getDstSelForwardingOperand(MI: ProducerMI, ST);
      // Assume inline asm reads the dst
      if (Dst)
        return IA->modifiesRegister(Reg: Dst->getReg(), TRI: &TRI) ||
               IA->readsRegister(Reg: Dst->getReg(), TRI: &TRI);

      if (ProducerMI.isInlineAsm()) {
        // If MI is inline asm, assume it has dst forwarding hazard
        for (auto &Def : ProducerMI.all_defs()) {
          if (IA->modifiesRegister(Reg: Def.getReg(), TRI: &TRI) ||
              IA->readsRegister(Reg: Def.getReg(), TRI: &TRI)) {
            return true;
          }
        }
      }

      return false;
    };

    int WaitStatesNeededForDef =
        Shift16DefWaitstates -
        getWaitStatesSince(IsHazard: IsShift16BitDefFn, Limit: Shift16DefWaitstates);
    WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForDef);
  }

  return WaitStatesNeeded;
}
1233
1234int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) const {
1235 const SIInstrInfo *TII = ST.getInstrInfo();
1236 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1237 const MachineRegisterInfo &MRI = MF.getRegInfo();
1238
1239 const MachineOperand *LaneSelectOp =
1240 TII->getNamedOperand(MI&: *RWLane, OperandName: AMDGPU::OpName::src1);
1241
1242 if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, Reg: LaneSelectOp->getReg()))
1243 return 0;
1244
1245 Register LaneSelectReg = LaneSelectOp->getReg();
1246 auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isVALU(MI); };
1247
1248 const int RWLaneWaitStates = 4;
1249 int WaitStatesSince = getWaitStatesSinceDef(Reg: LaneSelectReg, IsHazardDef: IsHazardFn,
1250 Limit: RWLaneWaitStates);
1251 return RWLaneWaitStates - WaitStatesSince;
1252}
1253
1254int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) const {
1255 if (!ST.hasRFEHazards())
1256 return 0;
1257
1258 const SIInstrInfo *TII = ST.getInstrInfo();
1259
1260 const int RFEWaitStates = 1;
1261
1262 auto IsHazardFn = [TII](const MachineInstr &MI) {
1263 return getHWReg(TII, RegInstr: MI) == AMDGPU::Hwreg::ID_TRAPSTS;
1264 };
1265 int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazard: IsHazardFn, Limit: RFEWaitStates);
1266 return RFEWaitStates - WaitStatesNeeded;
1267}
1268
1269int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) const {
1270 const SIInstrInfo *TII = ST.getInstrInfo();
1271 const int ReadM0WaitStates = 1;
1272 auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isSALU(MI); };
1273 return ReadM0WaitStates -
1274 getWaitStatesSinceDef(Reg: AMDGPU::M0, IsHazardDef: IsHazardFn, Limit: ReadM0WaitStates);
1275}
1276
1277void GCNHazardRecognizer::emitVNops(MachineBasicBlock &MBB,
1278 MachineBasicBlock::iterator InsertPt,
1279 int WaitStatesNeeded, bool IsHoisting) {
1280 const DebugLoc &DL = IsHoisting ? DebugLoc() : InsertPt->getDebugLoc();
1281 for (int I = 0; I < WaitStatesNeeded; ++I)
1282 BuildMI(BB&: MBB, I: InsertPt, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_NOP_e32));
1283}
1284
// Run every applicable hazard fixup on \p MI. Fixups may insert instructions
// before MI or modify its operands. Subtarget feature checks here gate fixups
// that only exist on some generations; the remaining routines perform their
// own applicability checks internally. The call order is deliberate.
void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
  fixVMEMtoScalarWriteHazards(MI);
  fixVcmpxPermlaneHazards(MI);
  fixSMEMtoVectorWriteHazards(MI);
  fixVcmpxExecWARHazard(MI);
  fixLdsBranchVmemWARHazard(MI);
  if (ST.hasLdsDirect()) {
    fixLdsDirectVALUHazard(MI);
    fixLdsDirectVMEMHazard(MI);
  }
  fixVALUPartialForwardingHazard(MI);
  fixVALUTransUseHazard(MI);
  fixVALUTransCoexecutionHazards(MI);
  fixWMMAHazards(MI); // fall-through if co-execution is enabled.
  fixWMMACoexecutionHazards(MI);
  fixShift64HighRegBug(MI);
  fixVALUMaskWriteHazard(MI);
  fixRequiredExportPriority(MI);
  if (ST.requiresWaitIdleBeforeGetReg())
    fixGetRegWaitIdle(MI);
  if (ST.hasDsAtomicAsyncBarrierArriveB64PipeBug())
    fixDsAtomicAsyncBarrierArriveB64(MI);
  if (ST.hasScratchBaseForwardingHazard())
    fixScratchBaseForwardingHazard(MI);
  if (ST.setRegModeNeedsVNOPs())
    fixSetRegMode(MI);
}
1312
1313static bool isVCmpXWritesExec(const SIInstrInfo &TII, const SIRegisterInfo &TRI,
1314 const MachineInstr &MI) {
1315 return (TII.isVOPC(MI) ||
1316 (MI.isCompare() && (TII.isVOP3(MI) || TII.isSDWA(MI)))) &&
1317 MI.modifiesRegister(Reg: AMDGPU::EXEC, TRI: &TRI);
1318}
1319
// Mitigate the hazard between a V_CMPX (which writes EXEC) and a following
// V_PERMLANE* \p MI by inserting a VALU instruction between them. A plain
// V_NOP does not suffice (it is discarded by SQ), so a V_MOV_B32 of the
// permlane's src0 register to itself is used instead.
// \returns true if a fix was inserted.
bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
  if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(MI: *MI))
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  auto IsHazardFn = [TII, TRI](const MachineInstr &MI) {
    return isVCmpXWritesExec(TII: *TII, TRI: *TRI, MI);
  };

  // Any "real" VALU (not a V_NOP flavor) between the V_CMPX and the
  // permlane already breaks the hazard.
  auto IsExpiredFn = [](const MachineInstr &MI, int) {
    unsigned Opc = MI.getOpcode();
    return SIInstrInfo::isVALU(MI) && Opc != AMDGPU::V_NOP_e32 &&
           Opc != AMDGPU::V_NOP_e64 && Opc != AMDGPU::V_NOP_sdwa;
  };

  if (::getWaitStatesSince(IsHazard: IsHazardFn, MI, IsExpired: IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  // V_NOP will be discarded by SQ.
  // Use V_MOV_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE*
  // which is always a VGPR and available.
  auto *Src0 = TII->getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::src0);
  Register Reg = Src0->getReg();
  bool IsUndef = Src0->isUndef();
  BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(),
          MCID: TII->get(Opcode: AMDGPU::V_MOV_B32_e32))
      .addReg(RegNo: Reg, Flags: RegState::Define | getDeadRegState(B: IsUndef))
      .addReg(RegNo: Reg, Flags: IsUndef ? RegState::Undef : RegState::Kill);

  return true;
}
1353
// Mitigate the hazard where a VMEM/DS instruction is still reading a register
// that the SALU/SMEM instruction \p MI overwrites. Inserts an
// s_waitcnt_depctr with vm_vsrc == 0 before MI when the hazard is present.
// \returns true if a fix was inserted.
bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
  if (!ST.hasVMEMtoScalarWriteHazard())
    return false;
  assert(!ST.hasExtendedWaitCounts());

  if (!SIInstrInfo::isSALU(MI: *MI) && !SIInstrInfo::isSMRD(MI: *MI))
    return false;

  if (MI->getNumDefs() == 0)
    return false;

  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  // Hazard: an earlier VMEM or DS instruction uses one of MI's defs.
  auto IsHazardFn = [TRI, MI](const MachineInstr &I) {
    if (!SIInstrInfo::isVMEM(MI: I) && !SIInstrInfo::isDS(MI: I))
      return false;

    for (const MachineOperand &Def : MI->defs()) {
      const MachineOperand *Op =
          I.findRegisterUseOperand(Reg: Def.getReg(), TRI, isKill: false);
      if (!Op)
        continue;
      return true;
    }
    return false;
  };

  // Any VALU, s_waitcnt 0, or s_waitcnt_depctr with vm_vsrc == 0 already
  // resolves the hazard.
  auto IsExpiredFn = [](const MachineInstr &MI, int) {
    return SIInstrInfo::isVALU(MI) ||
           (MI.getOpcode() == AMDGPU::S_WAITCNT &&
            !MI.getOperand(i: 0).getImm()) ||
           (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
            AMDGPU::DepCtr::decodeFieldVmVsrc(Encoded: MI.getOperand(i: 0).getImm()) == 0);
  };

  if (::getWaitStatesSince(IsHazard: IsHazardFn, MI, IsExpired: IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(),
          MCID: TII->get(Opcode: AMDGPU::S_WAITCNT_DEPCTR))
      .addImm(Val: AMDGPU::DepCtr::encodeFieldVmVsrc(VmVsrc: 0, STI: ST));
  return true;
}
1399
// Mitigate the hazard where an in-flight SMEM read still uses an SGPR that
// the VALU \p MI writes (its sdst, or the scalar vdst of
// v_readlane/v_readfirstlane). Inserts "s_mov_b32 null, 0" before MI when
// the hazard is present. \returns true if a fix was inserted.
bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
  if (!ST.hasSMEMtoVectorWriteHazard())
    return false;
  assert(!ST.hasExtendedWaitCounts());

  if (!SIInstrInfo::isVALU(MI: *MI))
    return false;

  // Determine which named operand holds the scalar destination.
  AMDGPU::OpName SDSTName;
  switch (MI->getOpcode()) {
  case AMDGPU::V_READLANE_B32:
  case AMDGPU::V_READFIRSTLANE_B32:
    SDSTName = AMDGPU::OpName::vdst;
    break;
  default:
    SDSTName = AMDGPU::OpName::sdst;
    break;
  }

  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(GPU: ST.getCPU());
  const MachineOperand *SDST = TII->getNamedOperand(MI&: *MI, OperandName: SDSTName);
  if (!SDST) {
    // No named scalar dest: fall back to the first implicit SGPR def (e.g.
    // VCC from a carry-writing VALU).
    for (const auto &MO : MI->implicit_operands()) {
      if (MO.isDef() && TRI->isSGPRClass(RC: TRI->getPhysRegBaseClass(Reg: MO.getReg()))) {
        SDST = &MO;
        break;
      }
    }
  }

  if (!SDST)
    return false;

  const Register SDSTReg = SDST->getReg();
  auto IsHazardFn = [SDSTReg, TRI](const MachineInstr &I) {
    return SIInstrInfo::isSMRD(MI: I) && I.readsRegister(Reg: SDSTReg, TRI);
  };

  auto IsExpiredFn = [TII, IV](const MachineInstr &MI, int) {
    if (TII->isSALU(MI)) {
      switch (MI.getOpcode()) {
      case AMDGPU::S_SETVSKIP:
      case AMDGPU::S_VERSION:
      case AMDGPU::S_WAITCNT_VSCNT:
      case AMDGPU::S_WAITCNT_VMCNT:
      case AMDGPU::S_WAITCNT_EXPCNT:
        // These instructions cannot mitigate the hazard.
        return false;
      case AMDGPU::S_WAITCNT_LGKMCNT:
        // Reducing lgkmcnt count to 0 always mitigates the hazard.
        return (MI.getOperand(i: 1).getImm() == 0) &&
               (MI.getOperand(i: 0).getReg() == AMDGPU::SGPR_NULL);
      case AMDGPU::S_WAITCNT: {
        const int64_t Imm = MI.getOperand(i: 0).getImm();
        AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(Version: IV, Encoded: Imm);
        // DsCnt corresponds to LGKMCnt here.
        return Decoded.get(T: AMDGPU::DS_CNT) == 0;
      }
      default:
        assert((!SIInstrInfo::isWaitcnt(MI.getOpcode()) ||
                MI.getOpcode() == AMDGPU::S_WAIT_IDLE) &&
               "unexpected wait count instruction");
        // SOPP instructions cannot mitigate the hazard.
        if (TII->isSOPP(MI))
          return false;
        // At this point the SALU can be assumed to mitigate the hazard
        // because either:
        // (a) it is independent of the at risk SMEM (breaking chain),
        // or
        // (b) it is dependent on the SMEM, in which case an appropriate
        //     s_waitcnt lgkmcnt _must_ exist between it and the at risk
        //     SMEM instruction.
        return true;
      }
    }
    return false;
  };

  if (::getWaitStatesSince(IsHazard: IsHazardFn, MI, IsExpired: IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(),
          MCID: TII->get(Opcode: AMDGPU::S_MOV_B32), DestReg: AMDGPU::SGPR_NULL)
      .addImm(Val: 0);
  return true;
}
1489
1490bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
1491 if (!ST.hasVcmpxExecWARHazard())
1492 return false;
1493 assert(!ST.hasExtendedWaitCounts());
1494
1495 if (!SIInstrInfo::isVALU(MI: *MI))
1496 return false;
1497
1498 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1499 if (!MI->modifiesRegister(Reg: AMDGPU::EXEC, TRI))
1500 return false;
1501
1502 auto IsHazardFn = [TRI](const MachineInstr &I) {
1503 if (SIInstrInfo::isVALU(MI: I))
1504 return false;
1505 return I.readsRegister(Reg: AMDGPU::EXEC, TRI);
1506 };
1507
1508 const SIInstrInfo *TII = ST.getInstrInfo();
1509 auto IsExpiredFn = [TII, TRI](const MachineInstr &MI, int) {
1510 if (SIInstrInfo::isVALU(MI)) {
1511 if (TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::sdst))
1512 return true;
1513 for (auto MO : MI.implicit_operands())
1514 if (MO.isDef() && TRI->isSGPRClass(RC: TRI->getPhysRegBaseClass(Reg: MO.getReg())))
1515 return true;
1516 }
1517 if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1518 AMDGPU::DepCtr::decodeFieldSaSdst(Encoded: MI.getOperand(i: 0).getImm()) == 0)
1519 return true;
1520 return false;
1521 };
1522
1523 if (::getWaitStatesSince(IsHazard: IsHazardFn, MI, IsExpired: IsExpiredFn) ==
1524 std::numeric_limits<int>::max())
1525 return false;
1526
1527 BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(),
1528 MCID: TII->get(Opcode: AMDGPU::S_WAITCNT_DEPCTR))
1529 .addImm(Val: AMDGPU::DepCtr::encodeFieldSaSdst(SaSdst: 0, STI: ST));
1530 return true;
1531}
1532
1533static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
1534 const GCNSubtarget &ST) {
1535 if (!ST.hasLdsBranchVmemWARHazard())
1536 return false;
1537
1538 // Check if the necessary condition for the hazard is met: both LDS and VMEM
1539 // instructions need to appear in the same function.
1540 bool HasLds = false;
1541 bool HasVmem = false;
1542 for (auto &MBB : MF) {
1543 for (auto &MI : MBB) {
1544 HasLds |= SIInstrInfo::isDS(MI) || SIInstrInfo::isLDSDMA(MI);
1545 HasVmem |= SIInstrInfo::isVMEM(MI);
1546 if (HasLds && HasVmem)
1547 return true;
1548 }
1549 }
1550 return false;
1551}
1552
1553static bool isStoreCountWaitZero(const MachineInstr &I) {
1554 return I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
1555 I.getOperand(i: 0).getReg() == AMDGPU::SGPR_NULL &&
1556 !I.getOperand(i: 1).getImm();
1557}
1558
// Mitigate the WAR hazard between LDS and VMEM accesses across branches: if
// \p MI is an LDS or VMEM access and, searching backwards, a branch is found
// beyond which an access of the *other* kind occurred with no intervening
// "s_waitcnt_vscnt null, 0", insert that wait before MI.
// \returns true if a fix was inserted.
bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
  if (!RunLdsBranchVmemWARHazardFixup)
    return false;

  assert(ST.hasLdsBranchVmemWARHazard());
  assert(!ST.hasExtendedWaitCounts());

  // Classify: 1 = LDS (or LDS DMA), 2 = VMEM, 0 = neither.
  auto IsHazardInst = [](const MachineInstr &MI) {
    if (SIInstrInfo::isDS(MI) || SIInstrInfo::isLDSDMA(MI))
      return 1;
    if (SIInstrInfo::isVMEM(MI))
      return 2;
    return 0;
  };

  auto InstType = IsHazardInst(*MI);
  if (!InstType)
    return false;

  // The outer search expires at any LDS/VMEM access or a store-count wait.
  auto IsExpiredFn = [&IsHazardInst](const MachineInstr &I, int) {
    return IsHazardInst(I) || isStoreCountWaitZero(I);
  };

  // Outer hazard: a branch behind which an access of the opposite kind sits.
  auto IsHazardFn = [InstType, &IsHazardInst](const MachineInstr &I) {
    if (!I.isBranch())
      return false;

    // Inner hazard: an access of the opposite kind before the branch.
    auto IsHazardFn = [InstType, IsHazardInst](const MachineInstr &I) {
      auto InstType2 = IsHazardInst(I);
      return InstType2 && InstType != InstType2;
    };

    // Inner expiry: a same-kind access or a store-count wait clears it.
    auto IsExpiredFn = [InstType, &IsHazardInst](const MachineInstr &I, int) {
      auto InstType2 = IsHazardInst(I);
      if (InstType == InstType2)
        return true;

      return isStoreCountWaitZero(I);
    };

    return ::getWaitStatesSince(IsHazard: IsHazardFn, MI: &I, IsExpired: IsExpiredFn) !=
           std::numeric_limits<int>::max();
  };

  if (::getWaitStatesSince(IsHazard: IsHazardFn, MI, IsExpired: IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(),
          MCID: TII->get(Opcode: AMDGPU::S_WAITCNT_VSCNT))
      .addReg(RegNo: AMDGPU::SGPR_NULL, Flags: RegState::Undef)
      .addImm(Val: 0);

  return true;
}
1615
// Compute and set the waitvdst field of the LDSDIR instruction \p MI: the
// number of VALU instructions executed since the last VALU that reads or
// writes the LDSDIR destination (covering WAR and WAW), clamped to the
// no-hazard distance of 15. If any TRANS op was seen, the count is forced to
// 0 because TRANS can execute in parallel with other VALUs, making the
// va_vdst count unreliable. \returns true (the field is always written).
bool GCNHazardRecognizer::fixLdsDirectVALUHazard(MachineInstr *MI) {
  if (!SIInstrInfo::isLDSDIR(MI: *MI))
    return false;

  const int NoHazardWaitStates = 15;
  const MachineOperand *VDST = TII.getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::vdst);
  const Register VDSTReg = VDST->getReg();

  bool VisitedTrans = false;
  auto IsHazardFn = [this, VDSTReg, &VisitedTrans](const MachineInstr &I) {
    if (!SIInstrInfo::isVALU(MI: I))
      return false;
    VisitedTrans = VisitedTrans || SIInstrInfo::isTRANS(MI: I);
    // Cover both WAR and WAW
    return I.readsRegister(Reg: VDSTReg, TRI: &TRI) || I.modifiesRegister(Reg: VDSTReg, TRI: &TRI);
  };
  auto IsExpiredFn = [&](const MachineInstr &I, int WaitStates) {
    if (WaitStates >= NoHazardWaitStates)
      return true;
    // Instructions which cause va_vdst==0 expire hazard
    return SIInstrInfo::isVMEM(MI: I) || SIInstrInfo::isDS(MI: I) ||
           SIInstrInfo::isEXP(MI: I);
  };
  // Only VALU instructions advance the va_vdst counter.
  auto GetWaitStatesFn = [](const MachineInstr &MI) {
    return SIInstrInfo::isVALU(MI) ? 1 : 0;
  };

  DenseSet<const MachineBasicBlock *> Visited;
  auto Count = ::getWaitStatesSince(IsHazard: IsHazardFn, MBB: MI->getParent(),
                                    I: std::next(x: MI->getReverseIterator()), WaitStates: 0,
                                    IsExpired: IsExpiredFn, Visited, GetNumWaitStates: GetWaitStatesFn);

  // Transcendentals can execute in parallel to other VALUs.
  // This makes va_vdst count unusable with a mixture of VALU and TRANS.
  if (VisitedTrans)
    Count = 0;

  MachineOperand *WaitVdstOp =
      TII.getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::waitvdst);
  WaitVdstOp->setImm(std::min(a: Count, b: NoHazardWaitStates));

  return true;
}
1659
// Mitigate the hazard where a VMEM/DS access to the LDSDIR destination
// register of \p MI is still outstanding: either set MI's waitvsrc field to 0
// (on subtargets where LDSDIR can wait itself) or insert an s_waitcnt_depctr
// with vm_vsrc == 0 before MI. \returns true if a fix was applied.
bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(MachineInstr *MI) {
  if (!SIInstrInfo::isLDSDIR(MI: *MI))
    return false;

  const MachineOperand *VDST = TII.getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::vdst);
  const Register VDSTReg = VDST->getReg();

  // Hazard: an earlier VMEM/DS instruction reads or writes the destination.
  auto IsHazardFn = [this, VDSTReg](const MachineInstr &I) {
    if (!SIInstrInfo::isVMEM(MI: I) && !SIInstrInfo::isDS(MI: I))
      return false;
    return I.readsRegister(Reg: VDSTReg, TRI: &TRI) || I.modifiesRegister(Reg: VDSTReg, TRI: &TRI);
  };
  bool LdsdirCanWait = ST.hasLdsWaitVMSRC();
  // TODO: On GFX12 the hazard should expire on S_WAIT_LOADCNT/SAMPLECNT/BVHCNT
  // according to the type of VMEM instruction.
  auto IsExpiredFn = [this, LdsdirCanWait](const MachineInstr &I, int) {
    return SIInstrInfo::isVALU(MI: I) || SIInstrInfo::isEXP(MI: I) ||
           (I.getOpcode() == AMDGPU::S_WAITCNT && !I.getOperand(i: 0).getImm()) ||
           (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
            AMDGPU::DepCtr::decodeFieldVmVsrc(Encoded: I.getOperand(i: 0).getImm()) == 0) ||
           (LdsdirCanWait && SIInstrInfo::isLDSDIR(MI: I) &&
            !TII.getNamedOperand(MI: I, OperandName: AMDGPU::OpName::waitvsrc)->getImm());
  };

  if (::getWaitStatesSince(IsHazard: IsHazardFn, MI, IsExpired: IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  if (LdsdirCanWait) {
    TII.getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::waitvsrc)->setImm(0);
  } else {
    BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(),
            MCID: TII.get(Opcode: AMDGPU::S_WAITCNT_DEPCTR))
        .addImm(Val: AMDGPU::DepCtr::encodeFieldVmVsrc(VmVsrc: 0, STI: ST));
  }

  return true;
}
1698
1699bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(MachineInstr *MI) {
1700 if (!ST.hasVALUPartialForwardingHazard())
1701 return false;
1702 assert(!ST.hasExtendedWaitCounts());
1703
1704 if (!ST.isWave64() || !SIInstrInfo::isVALU(MI: *MI))
1705 return false;
1706
1707 SmallSetVector<Register, 4> SrcVGPRs;
1708
1709 for (const MachineOperand &Use : MI->explicit_uses()) {
1710 if (Use.isReg() && TRI.isVGPR(MRI: MF.getRegInfo(), Reg: Use.getReg()))
1711 SrcVGPRs.insert(X: Use.getReg());
1712 }
1713
1714 // Only applies with >= 2 unique VGPR sources
1715 if (SrcVGPRs.size() <= 1)
1716 return false;
1717
1718 // Look for the following pattern:
1719 // Va <- VALU [PreExecPos]
1720 // intv1
1721 // Exec <- SALU [ExecPos]
1722 // intv2
1723 // Vb <- VALU [PostExecPos]
1724 // intv3
1725 // MI Va, Vb (WaitState = 0)
1726 //
1727 // Where:
1728 // intv1 + intv2 <= 2 VALUs
1729 // intv3 <= 4 VALUs
1730 //
1731 // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.
1732
1733 const int Intv1plus2MaxVALUs = 2;
1734 const int Intv3MaxVALUs = 4;
1735 const int IntvMaxVALUs = 6;
1736 const int NoHazardVALUWaitStates = IntvMaxVALUs + 2;
1737
1738 struct StateType {
1739 SmallDenseMap<Register, int, 4> DefPos;
1740 int ExecPos = std::numeric_limits<int>::max();
1741 int VALUs = 0;
1742
1743 static unsigned getHashValue(const StateType &State) {
1744 return hash_combine(args: State.ExecPos, args: State.VALUs,
1745 args: hash_combine_range(R: State.DefPos));
1746 }
1747 static bool isEqual(const StateType &LHS, const StateType &RHS) {
1748 return LHS.DefPos == RHS.DefPos && LHS.ExecPos == RHS.ExecPos &&
1749 LHS.VALUs == RHS.VALUs;
1750 }
1751 };
1752
1753 StateType State;
1754
1755 // This overloads expiry testing with all the hazard detection
1756 auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
1757 // Too many VALU states have passed
1758 if (State.VALUs > NoHazardVALUWaitStates)
1759 return HazardExpired;
1760
1761 // Instructions which cause va_vdst==0 expire hazard
1762 if (SIInstrInfo::isVMEM(MI: I) || SIInstrInfo::isDS(MI: I) ||
1763 SIInstrInfo::isEXP(MI: I) ||
1764 (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1765 AMDGPU::DepCtr::decodeFieldVaVdst(Encoded: I.getOperand(i: 0).getImm()) == 0))
1766 return HazardExpired;
1767
1768 // Track registers writes
1769 bool Changed = false;
1770 if (SIInstrInfo::isVALU(MI: I)) {
1771 for (Register Src : SrcVGPRs) {
1772 if (!State.DefPos.count(Val: Src) && I.modifiesRegister(Reg: Src, TRI: &TRI)) {
1773 State.DefPos[Src] = State.VALUs;
1774 Changed = true;
1775 }
1776 }
1777 } else if (SIInstrInfo::isSALU(MI: I)) {
1778 if (State.ExecPos == std::numeric_limits<int>::max()) {
1779 if (!State.DefPos.empty() && I.modifiesRegister(Reg: AMDGPU::EXEC, TRI: &TRI)) {
1780 State.ExecPos = State.VALUs;
1781 Changed = true;
1782 }
1783 }
1784 }
1785
1786 // Early expiration: too many VALUs in intv3
1787 if (State.VALUs > Intv3MaxVALUs && State.DefPos.empty())
1788 return HazardExpired;
1789
1790 // Only evaluate state if something changed
1791 if (!Changed)
1792 return NoHazardFound;
1793
1794 // Determine positions of VALUs pre/post exec change
1795 if (State.ExecPos == std::numeric_limits<int>::max())
1796 return NoHazardFound;
1797
1798 int PreExecPos = std::numeric_limits<int>::max();
1799 int PostExecPos = std::numeric_limits<int>::max();
1800
1801 for (auto Entry : State.DefPos) {
1802 int DefVALUs = Entry.second;
1803 if (DefVALUs != std::numeric_limits<int>::max()) {
1804 if (DefVALUs >= State.ExecPos)
1805 PreExecPos = std::min(a: PreExecPos, b: DefVALUs);
1806 else
1807 PostExecPos = std::min(a: PostExecPos, b: DefVALUs);
1808 }
1809 }
1810
1811 // Need a VALUs post exec change
1812 if (PostExecPos == std::numeric_limits<int>::max())
1813 return NoHazardFound;
1814
1815 // Too many VALUs in intv3?
1816 int Intv3VALUs = PostExecPos;
1817 if (Intv3VALUs > Intv3MaxVALUs)
1818 return HazardExpired;
1819
1820 // Too many VALUs in intv2?
1821 int Intv2VALUs = (State.ExecPos - PostExecPos) - 1;
1822 if (Intv2VALUs > Intv1plus2MaxVALUs)
1823 return HazardExpired;
1824
1825 // Need a VALUs pre exec change
1826 if (PreExecPos == std::numeric_limits<int>::max())
1827 return NoHazardFound;
1828
1829 // Too many VALUs in intv1?
1830 int Intv1VALUs = PreExecPos - State.ExecPos;
1831 if (Intv1VALUs > Intv1plus2MaxVALUs)
1832 return HazardExpired;
1833
1834 // Too many VALUs in intv1 + intv2
1835 if (Intv1VALUs + Intv2VALUs > Intv1plus2MaxVALUs)
1836 return HazardExpired;
1837
1838 return HazardFound;
1839 };
1840 auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
1841 if (SIInstrInfo::isVALU(MI))
1842 State.VALUs += 1;
1843 };
1844
1845 if (!hasHazard<StateType>(InitialState: State, IsHazard: IsHazardFn, UpdateState: UpdateStateFn, InitialMBB: MI->getParent(),
1846 InitialI: std::next(x: MI->getReverseIterator())))
1847 return false;
1848
1849 BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(),
1850 MCID: TII.get(Opcode: AMDGPU::S_WAITCNT_DEPCTR))
1851 .addImm(Val: AMDGPU::DepCtr::encodeFieldVaVdst(VaVdst: 0, STI: ST));
1852
1853 return true;
1854}
1855
// Detect and mitigate the VALU-trans-use hazard: MI (a VALU) reads a VGPR
// that was produced by a TRANS (transcendental) VALU instruction with too few
// intervening instructions. If the hazard pattern is found, insert an
// S_WAITCNT_DEPCTR with va_vdst==0 immediately before MI so the read waits
// for the TRANS result. Returns true if a wait was inserted.
bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) {
  if (!ST.hasVALUTransUseHazard())
    return false;
  // Subtargets with extended wait counts do not use this workaround path.
  assert(!ST.hasExtendedWaitCounts());

  if (!SIInstrInfo::isVALU(MI: *MI))
    return false;

  // Collect the VGPRs MI reads; a hazard exists only if a recent TRANS wrote
  // one of them.
  SmallSet<Register, 4> SrcVGPRs;

  for (const MachineOperand &Use : MI->explicit_uses()) {
    if (Use.isReg() && TRI.isVGPR(MRI: MF.getRegInfo(), Reg: Use.getReg()))
      SrcVGPRs.insert(V: Use.getReg());
  }

  // Look for the following pattern:
  //   Va <- TRANS VALU
  //   intv
  //   MI Va (WaitState = 0)
  //
  // Where:
  //   intv <= 5 VALUs / 1 TRANS
  //
  // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.

  const int IntvMaxVALUs = 5;
  const int IntvMaxTRANS = 1;

  // Search state threaded through the backwards walk: counts of VALU and
  // TRANS instructions seen so far. Hashing/equality let hasHazard() memoize
  // states across converging CFG paths.
  struct StateType {
    int VALUs = 0;
    int TRANS = 0;

    static unsigned getHashValue(const StateType &State) {
      return hash_combine(args: State.VALUs, args: State.TRANS);
    }
    static bool isEqual(const StateType &LHS, const StateType &RHS) {
      return LHS.VALUs == RHS.VALUs && LHS.TRANS == RHS.TRANS;
    }
  };

  StateType State;

  // This overloads expiry testing with all the hazard detection
  auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
    // Too many VALU states have passed
    if (State.VALUs > IntvMaxVALUs || State.TRANS > IntvMaxTRANS)
      return HazardExpired;

    // Instructions which cause va_vdst==0 expire hazard
    if (SIInstrInfo::isVMEM(MI: I) || SIInstrInfo::isDS(MI: I) ||
        SIInstrInfo::isEXP(MI: I) ||
        (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
         AMDGPU::DepCtr::decodeFieldVaVdst(Encoded: I.getOperand(i: 0).getImm()) == 0))
      return HazardExpired;

    // Track registers writes
    if (SIInstrInfo::isTRANS(MI: I)) {
      for (Register Src : SrcVGPRs) {
        if (I.modifiesRegister(Reg: Src, TRI: &TRI)) {
          return HazardFound;
        }
      }
    }

    return NoHazardFound;
  };
  auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
    if (SIInstrInfo::isVALU(MI))
      State.VALUs += 1;
    if (SIInstrInfo::isTRANS(MI))
      State.TRANS += 1;
  };

  // Walk backwards (across predecessors) from MI looking for a hazardous
  // TRANS write within the interval limits.
  if (!hasHazard<StateType>(InitialState: State, IsHazard: IsHazardFn, UpdateState: UpdateStateFn, InitialMBB: MI->getParent(),
                            InitialI: std::next(x: MI->getReverseIterator())))
    return false;

  // Hazard is observed - insert a wait on va_dst counter to ensure hazard is
  // avoided.
  BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(),
          MCID: TII.get(Opcode: AMDGPU::S_WAITCNT_DEPCTR))
      .addImm(Val: AMDGPU::DepCtr::encodeFieldVaVdst(VaVdst: 0, STI: ST));

  return true;
}
1941
1942bool GCNHazardRecognizer::fixVALUTransCoexecutionHazards(MachineInstr *MI) {
1943 if (!ST.hasGFX1250Insts() || // Coexecution disabled.
1944 !SIInstrInfo::isVALU(MI: *MI) || SIInstrInfo::isTRANS(MI: *MI))
1945 return false;
1946
1947 const SIInstrInfo *TII = ST.getInstrInfo();
1948 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1949
1950 auto IsTransHazardFn = [MI, TII, TRI](const MachineInstr &I) {
1951 if (!SIInstrInfo::isTRANS(MI: I))
1952 return false;
1953
1954 // RAW: Trans(I) writes, VALU(MI) reads.
1955 Register TransDef = TII->getNamedOperand(MI: I, OperandName: AMDGPU::OpName::vdst)->getReg();
1956 for (const MachineOperand &ValuUse : MI->explicit_uses()) {
1957 if (ValuUse.isReg() && TRI->regsOverlap(RegA: TransDef, RegB: ValuUse.getReg()))
1958 return true;
1959 }
1960
1961 auto *ValuDst = TII->getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::vdst);
1962 if (!ValuDst || !ValuDst->isReg())
1963 return false;
1964
1965 // WAR: Trans(I) reads, VALU(MI) writes.
1966 Register ValuDef = ValuDst->getReg();
1967 for (const MachineOperand &TransUse : I.explicit_uses()) {
1968 if (TransUse.isReg() && TRI->regsOverlap(RegA: ValuDef, RegB: TransUse.getReg()))
1969 return true;
1970 }
1971
1972 return false;
1973 };
1974
1975 auto IsExpiredFn = [](const MachineInstr &I, int) {
1976 return SIInstrInfo::isVALU(MI: I);
1977 };
1978
1979 const int HasVALU = std::numeric_limits<int>::max();
1980 if (::getWaitStatesSince(IsHazard: IsTransHazardFn, MI, IsExpired: IsExpiredFn) == HasVALU)
1981 return false;
1982
1983 BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(), MCID: TII->get(Opcode: AMDGPU::V_NOP_e32));
1984 return true;
1985}
1986
1987bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) {
1988 if (!SIInstrInfo::isWMMA(MI: *MI) && !SIInstrInfo::isSWMMAC(MI: *MI))
1989 return false;
1990
1991 const SIInstrInfo *TII = ST.getInstrInfo();
1992 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1993
1994 auto IsHazardFn = [MI, TII, TRI, this](const MachineInstr &I) {
1995 if (!SIInstrInfo::isWMMA(MI: I) && !SIInstrInfo::isSWMMAC(MI: I))
1996 return false;
1997
1998 // Src0(matrix A) or Src1(matrix B) of the current wmma instruction overlaps
1999 // with the dest(matrix D) of the previous wmma.
2000 const Register CurSrc0Reg =
2001 TII->getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::src0)->getReg();
2002 const Register CurSrc1Reg =
2003 TII->getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::src1)->getReg();
2004
2005 const Register PrevDstReg =
2006 TII->getNamedOperand(MI: I, OperandName: AMDGPU::OpName::vdst)->getReg();
2007
2008 if (TRI->regsOverlap(RegA: PrevDstReg, RegB: CurSrc0Reg) ||
2009 TRI->regsOverlap(RegA: PrevDstReg, RegB: CurSrc1Reg)) {
2010 return true;
2011 }
2012
2013 // GFX12+ allows overlap of matrix C with PrevDstReg (hardware will stall)
2014 // but Index can't overlap with PrevDstReg.
2015 if (AMDGPU::isGFX12Plus(STI: ST)) {
2016 if (SIInstrInfo::isSWMMAC(MI: *MI)) {
2017 const Register CurIndex =
2018 TII->getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::src2)->getReg();
2019 if (TRI->regsOverlap(RegA: PrevDstReg, RegB: CurIndex))
2020 return true;
2021 }
2022 return false;
2023 }
2024
2025 return false;
2026 };
2027
2028 auto IsExpiredFn = [](const MachineInstr &I, int) {
2029 return SIInstrInfo::isVALU(MI: I);
2030 };
2031
2032 if (::getWaitStatesSince(IsHazard: IsHazardFn, MI, IsExpired: IsExpiredFn) ==
2033 std::numeric_limits<int>::max())
2034 return false;
2035
2036 BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(), MCID: TII->get(Opcode: AMDGPU::V_NOP_e32));
2037
2038 return true;
2039}
2040
2041static bool isCoexecutableVALUInst(const MachineInstr &MI) {
2042 return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isWMMA(MI) &&
2043 !SIInstrInfo::isSWMMAC(MI) && !SIInstrInfo::isLDSDMA(MI);
2044}
2045
2046static bool IsWMMAHazardInstInCategory(const MachineInstr &MI,
2047 const SIInstrInfo *TII, unsigned Latency,
2048 unsigned Category) {
2049 assert(TII->isXDLWMMA(MI) && (Latency == 8 || Latency == 16) &&
2050 "Handle me if the xdl wmma instruction latency changes");
2051
2052 switch (Category) {
2053 case 0: // Dense WMMA Instructions:
2054 // WMMA_*F16, WMMA_*BF16
2055 // WMMA_*FP8FP8
2056 // WMMA_*FP8BF8
2057 // WMMA_*BF8FP8
2058 // WMMA_*BF8BF8
2059 // WMMA_*F8F6F4 if SRCA & SRCB != F8
2060 return Latency == 8 && SIInstrInfo::isWMMA(MI);
2061
2062 case 1: // Dense WMMA Instructions:
2063 // WMMA_IU8
2064 // WMMA_IU4
2065 // WMMA_*F8F6F4 if SRCA OR SRCB == F8
2066 return Latency == 16 && SIInstrInfo::isWMMA(MI);
2067
2068 case 2: // Dense SWMMAC Instructions
2069 // SWMMAC_*F16, SWMMAC_*BF16,
2070 // SWMMAC_*FP8FP8
2071 // SWMMAC_*BF8FP8
2072 // SWMMAC_*FP8BF8
2073 // SWMMAC_*BF8BF8
2074 return Latency == 8 && SIInstrInfo::isSWMMAC(MI);
2075
2076 case 3: // Sparse WMMA Instructions:
2077 // SWMMAC_IU8
2078 // SWMMAC_IU4
2079 return Latency == 16 && SIInstrInfo::isSWMMAC(MI);
2080 default:
2081 break;
2082 } // end switch.
2083
2084 return false;
2085}
2086
2087int GCNHazardRecognizer::checkWMMACoexecutionHazards(MachineInstr *MI) const {
2088 if (!ST.hasGFX1250Insts())
2089 return 0;
2090
2091 const SIInstrInfo *TII = ST.getInstrInfo();
2092 if (!TII->isXDLWMMA(MI: *MI) && !isCoexecutableVALUInst(MI: *MI))
2093 return 0;
2094
2095 // WaitStates here is the number of V_NOPs or unrelated VALU instructions must
2096 // be in between the first WMMA and the second instruction to cover the hazard
2097 // (WMMAWaitStates if the second is also a WMMA, VALUWaitStates if the second
2098 // is a VALU). Refer to SPG 4.6.12.1. "Requirements for WMMA data hazards" for
2099 // numbers, which depends on the category of the first WMMA.
2100 const int WMMAWaitStates[] = {5, 9, 3, 5};
2101 const int VALUWaitStates[] = {4, 8, 2, 4};
2102 unsigned Category = 0;
2103
2104 auto IsWMMAHazardFn = [MI, TII, &Category, this](const MachineInstr &I) {
2105 if (!TII->isXDLWMMA(MI: I))
2106 return false;
2107
2108 unsigned Latency = TSchedModel.computeInstrLatency(MI: &I);
2109 if (!IsWMMAHazardInstInCategory(MI: I, TII, Latency, Category))
2110 return false;
2111
2112 return hasWMMAToWMMARegOverlap(WMMA: I, MI: *MI);
2113 };
2114
2115 auto IsVALUHazardFn = [MI, TII, &Category, this](const MachineInstr &I) {
2116 if (!TII->isXDLWMMA(MI: I))
2117 return false;
2118
2119 unsigned Latency = TSchedModel.computeInstrLatency(MI: &I);
2120 if (!IsWMMAHazardInstInCategory(MI: I, TII, Latency, Category))
2121 return false;
2122
2123 return hasWMMAToVALURegOverlap(WMMA: I, MI: *MI);
2124 };
2125
2126 int Limit = 0;
2127
2128 auto GetWaitStatesFn = [](const MachineInstr &I) {
2129 return SIInstrInfo::isVALU(MI: I) ? 1 : 0;
2130 };
2131
2132 int WaitStatesNeeded = -1;
2133 if (TII->isXDLWMMA(MI: *MI)) {
2134 for (Category = 0; WaitStatesNeeded < 0 && Category < 4; Category++) {
2135 Limit = WMMAWaitStates[Category]; // for IsExpiredFn.
2136 // 'getWaitStatesSince' returns the number of VALUs in between if hazard
2137 // exists, and INT_MAX if there is no hazard. As a result, a negative
2138 // WaitStatesNeeded here means no hazard, and we will continue to search
2139 // for other categories.
2140 WaitStatesNeeded =
2141 Limit - getWaitStatesSince(IsHazard: IsWMMAHazardFn, Limit, GetNumWaitStates: GetWaitStatesFn);
2142 }
2143 } else { // Must be a co-executable VALU.
2144 for (Category = 0; WaitStatesNeeded < 0 && Category < 4; Category++) {
2145 Limit = VALUWaitStates[Category]; // for IsExpiredFn.
2146 // 'getWaitStatesSince' returns the number of VALUs in between if hazard
2147 // exists, and INT_MAX if there is no hazard. As a result, a negative
2148 // WaitStatesNeeded here means no hazard, and we will continue to search
2149 // for other categories.
2150 WaitStatesNeeded =
2151 Limit - getWaitStatesSince(IsHazard: IsVALUHazardFn, Limit, GetNumWaitStates: GetWaitStatesFn);
2152 }
2153 }
2154
2155 return WaitStatesNeeded;
2156}
2157
2158bool GCNHazardRecognizer::hasWMMAToWMMARegOverlap(
2159 const MachineInstr &WMMA, const MachineInstr &MI) const {
2160 Register D0 = TII.getNamedOperand(MI: WMMA, OperandName: AMDGPU::OpName::vdst)->getReg();
2161 Register A1 = TII.getNamedOperand(MI, OperandName: AMDGPU::OpName::src0)->getReg();
2162 Register B1 = TII.getNamedOperand(MI, OperandName: AMDGPU::OpName::src1)->getReg();
2163
2164 // WMMA0 writes (D0), WMMA1 reads (A1/B1/Idx1).
2165 if (TRI.regsOverlap(RegA: D0, RegB: A1) || TRI.regsOverlap(RegA: D0, RegB: B1))
2166 return true;
2167
2168 if (SIInstrInfo::isSWMMAC(MI)) {
2169 Register Idx1 = TII.getNamedOperand(MI, OperandName: AMDGPU::OpName::src2)->getReg();
2170 if (TRI.regsOverlap(RegA: D0, RegB: Idx1))
2171 return true;
2172 }
2173 return false;
2174}
2175
2176bool GCNHazardRecognizer::hasWMMAToVALURegOverlap(
2177 const MachineInstr &WMMA, const MachineInstr &MI) const {
2178 // WMMA writes, VALU reads.
2179 Register D0 = TII.getNamedOperand(MI: WMMA, OperandName: AMDGPU::OpName::vdst)->getReg();
2180 for (const MachineOperand &ValuUse : MI.explicit_uses()) {
2181 if (ValuUse.isReg() && TRI.regsOverlap(RegA: D0, RegB: ValuUse.getReg()))
2182 return true;
2183 }
2184
2185 // WMMA reads or writes, VALU writes.
2186 Register A0 = TII.getNamedOperand(MI: WMMA, OperandName: AMDGPU::OpName::src0)->getReg();
2187 Register B0 = TII.getNamedOperand(MI: WMMA, OperandName: AMDGPU::OpName::src1)->getReg();
2188 SmallVector<Register, 4> WMMARegs({D0, A0, B0});
2189
2190 if (SIInstrInfo::isSWMMAC(MI: WMMA)) {
2191 Register Idx0 = TII.getNamedOperand(MI: WMMA, OperandName: AMDGPU::OpName::src2)->getReg();
2192 WMMARegs.push_back(Elt: Idx0);
2193 }
2194
2195 for (const MachineOperand &ValuDef : MI.defs()) {
2196 Register VDstReg = ValuDef.getReg();
2197 for (Register WMMAReg : WMMARegs) {
2198 if (TRI.regsOverlap(RegA: VDstReg, RegB: WMMAReg))
2199 return true;
2200 }
2201 }
2202 return false;
2203}
2204
2205bool GCNHazardRecognizer::isCoexecutionHazardFor(const MachineInstr &I,
2206 const MachineInstr &MI) const {
2207 // I is the potential WMMA hazard source, MI is the instruction being checked
2208 // for hazard.
2209 if (!TII.isXDLWMMA(MI: I))
2210 return false;
2211
2212 // Dispatch based on MI type
2213 if (TII.isXDLWMMA(MI))
2214 return hasWMMAToWMMARegOverlap(WMMA: I, MI);
2215 if (isCoexecutableVALUInst(MI))
2216 return hasWMMAToVALURegOverlap(WMMA: I, MI);
2217
2218 return false;
2219}
2220
2221bool GCNHazardRecognizer::hasWMMAHazardInLoop(MachineLoop *L, MachineInstr *MI,
2222 bool IncludeSubloops) {
2223 // Scan loop for any WMMA that hazards MI.
2224 // TODO: Avoid full loop scan when WMMA is beyond VALU distance.
2225 for (MachineBasicBlock *MBB : L->getBlocks()) {
2226 if (!IncludeSubloops && MLI->getLoopFor(BB: MBB) != L)
2227 continue;
2228 for (MachineInstr &I : *MBB) {
2229 if (&I == MI)
2230 continue;
2231 if (isCoexecutionHazardFor(I, MI: *MI))
2232 return true;
2233 }
2234 }
2235 return false;
2236}
2237
2238bool GCNHazardRecognizer::tryHoistWMMAVnopsFromLoop(MachineInstr *MI,
2239 int WaitStatesNeeded) {
2240 if (!MLI)
2241 return false;
2242
2243 MachineLoop *L = MLI->getLoopFor(BB: MI->getParent());
2244 if (!L) {
2245 ++NumWMMAHoistingBailed;
2246 return false;
2247 }
2248
2249 // If innermost loop has WMMA hazard, we can't hoist at all
2250 if (hasWMMAHazardInLoop(L, MI)) {
2251 ++NumWMMAHoistingBailed;
2252 return false;
2253 }
2254
2255 // Find outermost loop with no internal hazard
2256 MachineLoop *TargetLoop = L;
2257 while (MachineLoop *Parent = TargetLoop->getParentLoop()) {
2258 if (hasWMMAHazardInLoop(L: Parent, MI, IncludeSubloops: false))
2259 break; // Parent has hazard in its own blocks, stop here
2260 TargetLoop = Parent; // Safe to hoist further out
2261 }
2262
2263 // Need valid preheader to insert V_NOPs
2264 MachineBasicBlock *Preheader = TargetLoop->getLoopPreheader();
2265 if (!Preheader) {
2266 ++NumWMMAHoistingBailed;
2267 return false;
2268 }
2269
2270 LLVM_DEBUG(dbgs() << "WMMA V_NOP Hoisting: Moving " << WaitStatesNeeded
2271 << " V_NOPs from loop to " << printMBBReference(*Preheader)
2272 << "\n");
2273
2274 emitVNops(MBB&: *Preheader, InsertPt: Preheader->getFirstTerminator(), WaitStatesNeeded,
2275 /*IsHoisting=*/true);
2276 NumWMMANopsHoisted += WaitStatesNeeded;
2277 return true;
2278}
2279
2280bool GCNHazardRecognizer::fixWMMACoexecutionHazards(MachineInstr *MI) {
2281 int WaitStatesNeeded = checkWMMACoexecutionHazards(MI);
2282 if (WaitStatesNeeded <= 0)
2283 return false;
2284
2285 if (EnableWMMAVnopHoisting && tryHoistWMMAVnopsFromLoop(MI, WaitStatesNeeded))
2286 return true;
2287
2288 emitVNops(MBB&: *MI->getParent(), InsertPt: MI->getIterator(), WaitStatesNeeded);
2289 return true;
2290}
2291
// Work around the subtarget's 64-bit shift high-register bug
// (ST.hasShift64HighRegBug()): V_LSHLREV_B64 / V_LSHRREV_B64 / V_ASHRREV_I64
// are affected when the shift amount (src0) lives in the last VGPR of an
// 8-register allocation block. Rewrites the instruction so the amount is read
// from a safe register, either via a V_MOV into the destination's low half or
// via V_SWAP pairs around the shift. Returns true if MI was modified.
bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) {
  if (!ST.hasShift64HighRegBug())
    return false;
  assert(!ST.hasExtendedWaitCounts());

  // Only the three 64-bit shift opcodes are affected.
  switch (MI->getOpcode()) {
  default:
    return false;
  case AMDGPU::V_LSHLREV_B64_e64:
  case AMDGPU::V_LSHRREV_B64_e64:
  case AMDGPU::V_ASHRREV_I64_e64:
    break;
  }

  // The shift amount is src0 of the *REV forms; only a register amount can
  // trigger the bug.
  MachineOperand *Amt = TII.getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::src0);
  if (!Amt->isReg())
    return false;

  Register AmtReg = Amt->getReg();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  // Check if this is a last VGPR in the allocation block.
  if (!TRI.isVGPR(MRI, Reg: AmtReg) || ((AmtReg - AMDGPU::VGPR0) & 7) != 7)
    return false;

  // If the next VGPR is live we cannot borrow it below; bail out.
  if (AmtReg != AMDGPU::VGPR255 && MRI.isPhysRegUsed(PhysReg: AmtReg + 1))
    return false;

  assert(ST.needsAlignedVGPRs());
  static_assert(AMDGPU::VGPR0 + 1 == AMDGPU::VGPR1);

  const DebugLoc &DL = MI->getDebugLoc();
  MachineBasicBlock *MBB = MI->getParent();
  MachineOperand *Src1 = TII.getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::src1);

  // In:
  //
  //   Dst = shiftrev64 Amt, Src1
  //
  // if Dst!=Src1 then avoid the bug with:
  //
  //   Dst.sub0 = Amt
  //   Dst = shift64 Dst.sub0, Src1

  Register DstReg = MI->getOperand(i: 0).getReg();
  if (!Src1->isReg() || Src1->getReg() != DstReg) {
    Register DstLo = TRI.getSubReg(Reg: DstReg, Idx: AMDGPU::sub0);
    // Run the hazard recognizer on the inserted V_MOV as well.
    runOnInstruction(
        MI: BuildMI(BB&: *MBB, I: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_MOV_B32_e32), DestReg: DstLo).add(MO: *Amt));
    Amt->setReg(DstLo);
    Amt->setIsKill(true);
    return true;
  }

  // Dst aliases Src1: find a scratch register MI neither reads nor writes.
  // If MI also writes AmtReg we need an aligned 64-bit pair, otherwise a
  // single VGPR suffices.
  bool Overlapped = MI->modifiesRegister(Reg: AmtReg, TRI: &TRI);
  Register NewReg;
  for (MCRegister Reg : Overlapped ? AMDGPU::VReg_64_Align2RegClass
                                   : AMDGPU::VGPR_32RegClass) {
    if (!MI->modifiesRegister(Reg, TRI: &TRI) && !MI->readsRegister(Reg, TRI: &TRI)) {
      NewReg = Reg;
      break;
    }
  }

  Register NewAmt = Overlapped ? (Register)TRI.getSubReg(Reg: NewReg, Idx: AMDGPU::sub1)
                               : NewReg;
  Register NewAmtLo;

  if (Overlapped)
    NewAmtLo = TRI.getSubReg(Reg: NewReg, Idx: AMDGPU::sub0);

  // Insert a full wait count because found register might be pending a wait.
  BuildMI(BB&: *MBB, I: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_WAITCNT))
      .addImm(Val: 0);

  // Insert V_SWAP_B32 instruction(s) and run hazard recognizer on them.
  if (Overlapped)
    runOnInstruction(
        MI: BuildMI(BB&: *MBB, I: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_SWAP_B32), DestReg: NewAmtLo)
            .addDef(RegNo: AmtReg - 1)
            .addReg(RegNo: AmtReg - 1, Flags: RegState::Undef)
            .addReg(RegNo: NewAmtLo, Flags: RegState::Undef));
  runOnInstruction(MI: BuildMI(BB&: *MBB, I: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_SWAP_B32), DestReg: NewAmt)
                       .addDef(RegNo: AmtReg)
                       .addReg(RegNo: AmtReg, Flags: RegState::Undef)
                       .addReg(RegNo: NewAmt, Flags: RegState::Undef));

  // Instructions emitted after the current instruction will be processed by the
  // parent loop of the hazard recognizer in a natural way.
  BuildMI(BB&: *MBB, I: std::next(x: MI->getIterator()), MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_SWAP_B32),
          DestReg: AmtReg)
      .addDef(RegNo: NewAmt)
      .addReg(RegNo: NewAmt)
      .addReg(RegNo: AmtReg);
  if (Overlapped)
    BuildMI(BB&: *MBB, I: std::next(x: MI->getIterator()), MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_SWAP_B32),
            DestReg: AmtReg - 1)
        .addDef(RegNo: NewAmtLo)
        .addReg(RegNo: NewAmtLo)
        .addReg(RegNo: AmtReg - 1);

  // Re-running hazard recognizer on the modified instruction is not necessary,
  // inserted V_SWAP_B32 has already both read and write new registers so
  // hazards related to these register has already been handled.
  Amt->setReg(NewAmt);
  Amt->setIsKill(false);
  // We do not update liveness, so verifier may see it as undef.
  Amt->setIsUndef();
  if (Overlapped) {
    MI->getOperand(i: 0).setReg(NewReg);
    Src1->setReg(NewReg);
    Src1->setIsKill(false);
    Src1->setIsUndef();
  }

  return true;
}
2408
2409int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) const {
2410 int NSAtoVMEMWaitStates = 1;
2411
2412 if (!ST.hasNSAtoVMEMBug())
2413 return 0;
2414
2415 if (!SIInstrInfo::isMUBUF(MI: *MI) && !SIInstrInfo::isMTBUF(MI: *MI))
2416 return 0;
2417
2418 const SIInstrInfo *TII = ST.getInstrInfo();
2419 const auto *Offset = TII->getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::offset);
2420 if (!Offset || (Offset->getImm() & 6) == 0)
2421 return 0;
2422
2423 auto IsHazardFn = [TII](const MachineInstr &I) {
2424 if (!SIInstrInfo::isMIMG(MI: I))
2425 return false;
2426 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc: I.getOpcode());
2427 return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
2428 TII->getInstSizeInBytes(MI: I) >= 16;
2429 };
2430
2431 return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazard: IsHazardFn, Limit: 1);
2432}
2433
2434int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(
2435 MachineInstr *MI) const {
2436 int FPAtomicToDenormModeWaitStates = 3;
2437
2438 if (!ST.hasFPAtomicToDenormModeHazard())
2439 return 0;
2440 assert(!ST.hasExtendedWaitCounts());
2441
2442 if (MI->getOpcode() != AMDGPU::S_DENORM_MODE)
2443 return 0;
2444
2445 auto IsHazardFn = [](const MachineInstr &I) {
2446 if (!SIInstrInfo::isVMEM(MI: I))
2447 return false;
2448 return SIInstrInfo::isFPAtomic(MI: I);
2449 };
2450
2451 auto IsExpiredFn = [](const MachineInstr &MI, int WaitStates) {
2452 if (WaitStates >= 3 || SIInstrInfo::isVALU(MI))
2453 return true;
2454
2455 return SIInstrInfo::isWaitcnt(Opcode: MI.getOpcode());
2456 };
2457
2458 return FPAtomicToDenormModeWaitStates -
2459 ::getWaitStatesSince(IsHazard: IsHazardFn, MI, IsExpired: IsExpiredFn);
2460}
2461
2462int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) const {
2463 assert(SIInstrInfo::isMAI(*MI));
2464
2465 return ST.hasGFX90AInsts() ? checkMAIHazards90A(MI) : checkMAIHazards908(MI);
2466}
2467
2468int GCNHazardRecognizer::checkMFMAPadding(MachineInstr *MI) const {
2469 // Early exit if no padding is requested.
2470 if (MFMAPaddingRatio == 0)
2471 return 0;
2472
2473 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2474 if (!SIInstrInfo::isMFMA(MI: *MI) || MFI->getOccupancy() < 2)
2475 return 0;
2476
2477 int NeighborMFMALatency = 0;
2478 auto IsNeighboringMFMA = [&NeighborMFMALatency,
2479 this](const MachineInstr &MI) {
2480 if (!SIInstrInfo::isMFMA(MI))
2481 return false;
2482
2483 NeighborMFMALatency = this->getMFMAPipelineWaitStates(MI);
2484 return true;
2485 };
2486
2487 const int MaxMFMAPipelineWaitStates = 16;
2488 int WaitStatesSinceNeighborMFMA =
2489 getWaitStatesSince(IsHazard: IsNeighboringMFMA, Limit: MaxMFMAPipelineWaitStates);
2490
2491 int NeighborMFMAPaddingNeeded =
2492 (NeighborMFMALatency * MFMAPaddingRatio / 100) -
2493 WaitStatesSinceNeighborMFMA;
2494
2495 return std::max(a: 0, b: NeighborMFMAPaddingNeeded);
2496}
2497
// gfx908 MAI hazard check: compute the wait states needed before MI (an MFMA,
// v_accvgpr_read, or v_accvgpr_write) to cover hazards with earlier VALU
// writes to EXEC/VGPRs, earlier MFMA writes to overlapping AGPRs, and earlier
// accvgpr writes. The required counts depend on which operand overlaps and on
// the producing MFMA's latency (2/8/16 cycles for 4x4/16x16/32x32 shapes).
int GCNHazardRecognizer::checkMAIHazards908(MachineInstr *MI) const {
  int WaitStatesNeeded = 0;
  unsigned Opc = MI->getOpcode();

  // Inline asm is conservatively treated as a VALU here.
  auto IsVALUFn = [](const MachineInstr &MI) {
    return SIInstrInfo::isVALU(MI) || MI.isInlineAsm();
  };

  if (Opc != AMDGPU::V_ACCVGPR_READ_B32_e64) { // MFMA or v_accvgpr_write
    const int LegacyVALUWritesVGPRWaitStates = 2;
    const int VALUWritesExecWaitStates = 4;
    const int MaxWaitStates = 4;

    // Hazard with a recent VALU write to EXEC.
    int WaitStatesNeededForUse = VALUWritesExecWaitStates -
      getWaitStatesSinceDef(Reg: AMDGPU::EXEC, IsHazardDef: IsVALUFn, Limit: MaxWaitStates);
    WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);

    if (WaitStatesNeeded < MaxWaitStates) {
      // Hazard with a recent VALU write to any VGPR source of MI.
      for (const MachineOperand &Use : MI->explicit_uses()) {
        const int MaxWaitStates = 2;

        if (!Use.isReg() || !TRI.isVGPR(MRI: MF.getRegInfo(), Reg: Use.getReg()))
          continue;

        int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
          getWaitStatesSinceDef(Reg: Use.getReg(), IsHazardDef: IsVALUFn, Limit: MaxWaitStates);
        WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);

        if (WaitStatesNeeded == MaxWaitStates)
          break;
      }
    }
  }

  // Check each AGPR operand of MI against earlier MFMA writes and earlier
  // v_accvgpr_write defs that overlap it.
  for (const MachineOperand &Op : MI->explicit_operands()) {
    if (!Op.isReg() || !TRI.isAGPR(MRI: MF.getRegInfo(), Reg: Op.getReg()))
      continue;

    // Defs only matter for v_accvgpr_write itself.
    if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
      continue;

    const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
    const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
    const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
    const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
    const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
    const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
    const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
    const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
    const int MaxWaitStates = 18;
    Register Reg = Op.getReg();
    unsigned HazardDefLatency = 0;

    // Matches an earlier MFMA whose dest overlaps (but is not equal to) Reg,
    // recording the largest producer latency seen.
    auto IsOverlappedMFMAFn = [Reg, &HazardDefLatency,
                               this](const MachineInstr &MI) {
      if (!SIInstrInfo::isMFMA(MI))
        return false;
      Register DstReg = MI.getOperand(i: 0).getReg();
      if (DstReg == Reg)
        return false;
      HazardDefLatency =
          std::max(a: HazardDefLatency, b: TSchedModel.computeInstrLatency(MI: &MI));
      return TRI.regsOverlap(RegA: DstReg, RegB: Reg);
    };

    int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsHazardDef: IsOverlappedMFMAFn,
                                                   Limit: MaxWaitStates);
    int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
    int SrcCIdx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::src2);
    int OpNo = Op.getOperandNo();
    // Pick the table entry by operand role (SrcC vs SrcA/B) or by the
    // accvgpr opcode, keyed on the producing MFMA's latency.
    if (OpNo == SrcCIdx) {
      NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
    } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) {
      switch (HazardDefLatency) {
      case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
               break;
      case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
               break;
      case 16: [[fallthrough]];
      default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
               break;
      }
    } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
      switch (HazardDefLatency) {
      case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
               break;
      case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
               break;
      case 16: [[fallthrough]];
      default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
               break;
      }
    }

    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
    WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      return WaitStatesNeeded; // Early exit.

    // Hazard with an earlier v_accvgpr_write whose dest overlaps Reg.
    auto IsAccVgprWriteFn = [Reg, this](const MachineInstr &MI) {
      if (MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
        return false;
      Register DstReg = MI.getOperand(i: 0).getReg();
      return TRI.regsOverlap(RegA: Reg, RegB: DstReg);
    };

    const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
    const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
    const int AccVGPRWriteAccVgprReadWaitStates = 3;
    NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
    if (OpNo == SrcCIdx)
      NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
    else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64)
      NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;

    WaitStatesNeededForUse = NeedWaitStates -
      getWaitStatesSinceDef(Reg, IsHazardDef: IsAccVgprWriteFn, Limit: MaxWaitStates);
    WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      return WaitStatesNeeded; // Early exit.
  }

  // v_accvgpr_write after an MFMA that reads the same register as SrcC.
  if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
    const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
    const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
    const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
    const int MaxWaitStates = 13;
    Register DstReg = MI->getOperand(i: 0).getReg();
    unsigned HazardDefLatency = 0;

    auto IsSrcCMFMAFn = [DstReg, &HazardDefLatency,
                         this](const MachineInstr &MI) {
      if (!SIInstrInfo::isMFMA(MI))
        return false;
      Register Reg = TII.getNamedOperand(MI, OperandName: AMDGPU::OpName::src2)->getReg();
      HazardDefLatency =
          std::max(a: HazardDefLatency, b: TSchedModel.computeInstrLatency(MI: &MI));
      return TRI.regsOverlap(RegA: Reg, RegB: DstReg);
    };

    int WaitStatesSince = getWaitStatesSince(IsHazard: IsSrcCMFMAFn, Limit: MaxWaitStates);
    int NeedWaitStates;
    switch (HazardDefLatency) {
    case 2:  NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
             break;
    case 8:  NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
             break;
    case 16: [[fallthrough]];
    default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
             break;
    }

    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
    WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
  }

  // Pad neighboring MFMA with noops for better inter-wave performance.
  WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: checkMFMAPadding(MI));

  return WaitStatesNeeded;
}
2661
// Wait states for an N-pass XDL write feeding the SrcC of a later XDL or
// SMFMA op: one more than the pass count on gfx940, two more on gfx950.
//   passes: 2/4/8/16 -> gfx940: 3/5/9/17, gfx950: 4/6/10/18
static int
GFX940_XDL_N_PassWritesVGPROverlappedXDLOrSMFMASrcCWaitStates(int NumPasses,
                                                              bool IsGFX950) {
  if (IsGFX950)
    return NumPasses + 2;
  return NumPasses + 1;
}
2672
// Wait states required when an N-pass XDL def feeds the src2 of a following
// SGEMM/DGEMM op on gfx940-family targets.
//   xdl def cycles | gfx940 | gfx950
//         2 pass   |   3        3
//         4 pass   |   5        6
//         8 pass   |   9       10
//        16 pass   |  17       18
static int
GFX940_XDL_N_PassWritesVGPROverlappedSGEMMDGEMMSrcCWaitStates(int NumPasses,
                                                              bool IsGFX950) {
  // gfx950 adds one cycle except for the 2-pass case.
  const bool ExtraGFX950Cycle = IsGFX950 && NumPasses != 2;
  return NumPasses + (ExtraGFX950Cycle ? 2 : 1);
}
2683
// Wait states when an N-pass SMFMA def feeds the src2 of a following SMFMA:
// the requirement equals the defining op's pass count.
//    2 pass -> 2
//    4 pass -> 4
//    8 pass -> 8
//   16 pass -> 16
static int
GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(int NumPasses) {
  return NumPasses;
}
2692
// Wait states when an N-pass SMFMA def feeds src0/src1 of a following op:
// pass count plus two.
//    2 pass -> 4
//    4 pass -> 6
//    8 pass -> 10
//   16 pass -> 18
static int
GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses) {
  constexpr int SrcABExtraCycles = 2;
  return NumPasses + SrcABExtraCycles;
}
2701
// Wait states when an N-pass XDL def feeds src0/src1 of a following op.
//   xdl def cycles | gfx942 | gfx950
//         2 pass   |   5        5
//         4 pass   |   7        8
//         8 pass   |  11       12
//        16 pass   |  19       20
static int GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses,
                                                                bool IsGFX950) {
  int WaitStates = NumPasses + 3;
  // gfx950 adds one cycle except for the 2-pass case.
  if (IsGFX950 && NumPasses != 2)
    ++WaitStates;
  return WaitStates;
}
2711
// Compute the wait states needed before issuing an MFMA on gfx90a-family
// targets. Checks: a legacy VALU writing EXEC ahead of the MFMA, legacy
// (non-DOT) VALU writes of source VGPRs, and earlier MFMA/DGEMM defs that
// overlap this MFMA's source operands (wait-state tables below keyed by the
// defining op's pass count and whether the use is src2).
int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) const {
  int WaitStatesNeeded = 0;
  unsigned Opc = MI->getOpcode();

  // Any VALU that is not itself an MFMA.
  auto IsLegacyVALUFn = [](const MachineInstr &MI) {
    return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI);
  };

  // Any VALU that is neither an MFMA nor a DOT instruction.
  auto IsLegacyVALUNotDotFn = [](const MachineInstr &MI) {
    return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI) &&
           !SIInstrInfo::isDOT(MI);
  };

  // These hazards are only checked when issuing an MFMA.
  if (!SIInstrInfo::isMFMA(MI: *MI))
    return WaitStatesNeeded;

  // A legacy VALU writing EXEC needs 4 wait states before the MFMA.
  const int VALUWritesExecWaitStates = 4;
  int WaitStatesNeededForUse = VALUWritesExecWaitStates -
    getWaitStatesSinceDef(Reg: AMDGPU::EXEC, IsHazardDef: IsLegacyVALUFn,
                          Limit: VALUWritesExecWaitStates);
  WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);

  int SrcCIdx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::src2);

  // Loop for both DGEMM and S/HGEMM 2nd instruction.
  for (const MachineOperand &Use : MI->explicit_uses()) {
    // Wait-state requirements by defining-op kind/pass count and use kind.
    const int LegacyVALUNotDotWritesVGPRWaitStates = 2;
    const int SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates = 2;
    const int SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates = 8;
    const int SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates = 16;
    const int SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates = 3;
    const int SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates = 9;
    const int SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates = 17;
    const int DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 9;
    const int GFX950_DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 17;
    const int DMFMA4x4WritesVGPROverlappedSrcCWaitStates = 4;
    const int SMFMA4x4WritesVGPROverlappedSrcABWaitStates = 5;
    const int SMFMA16x16WritesVGPROverlappedSrcABWaitStates = 11;
    const int SMFMA32x32WritesVGPROverlappedSrcABWaitStates = 19;
    const int DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates = 6;
    const int DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 11;
    const int GFX950_DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 19;
    const int DMFMA4x4WritesVGPRFullSrcCWaitStates = 4;
    const int GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates = 2;
    const int MaxWaitStates = 19;

    if (!Use.isReg())
      continue;
    Register Reg = Use.getReg();
    bool FullReg;
    const MachineInstr *MI1;

    // Match an earlier MFMA whose dst overlaps Reg. Records the matched
    // instruction in MI1; FullReg is set when the def is exactly Reg.
    auto IsOverlappedMFMAFn = [Reg, &FullReg, &MI1,
                               this](const MachineInstr &MI) {
      if (!SIInstrInfo::isMFMA(MI))
        return false;
      Register DstReg = MI.getOperand(i: 0).getReg();
      FullReg = (DstReg == Reg);
      MI1 = &MI;
      return TRI.regsOverlap(RegA: DstReg, RegB: Reg);
    };

    // Legacy (non-DOT) VALU writes of this source need 2 wait states.
    WaitStatesNeededForUse = LegacyVALUNotDotWritesVGPRWaitStates -
      getWaitStatesSinceDef(Reg, IsHazardDef: IsLegacyVALUNotDotFn, Limit: MaxWaitStates);
    WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);

    int NumWaitStates =
        getWaitStatesSinceDef(Reg, IsHazardDef: IsOverlappedMFMAFn, Limit: MaxWaitStates);
    // No overlapping MFMA def found within the search limit.
    if (NumWaitStates == std::numeric_limits<int>::max())
      continue;

    int OpNo = Use.getOperandNo();
    unsigned Opc1 = MI1->getOpcode();
    int NeedWaitStates = 0;
    if (OpNo == SrcCIdx) {
      // The use is the src2 (accumulator) operand.
      if (!SIInstrInfo::isDGEMM(Opcode: Opc) &&
          (!ST.hasGFX940Insts() && SIInstrInfo::isDGEMM(Opcode: Opc1))) {
        NeedWaitStates = 0;
      } else if (FullReg) {
        // The earlier MFMA wrote exactly this register (no partial overlap).
        if ((Opc == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
             Opc == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64) &&
            (Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
             Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64))
          NeedWaitStates = DMFMA4x4WritesVGPRFullSrcCWaitStates;
        else if (ST.hasGFX940Insts() &&
                 TSchedModel.computeInstrLatency(MI: MI1) == 2)
          NeedWaitStates = GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates;
      } else {
        switch (Opc1) {
        case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
        case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
        case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
        case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
          if (!TII.isXDL(MI: *MI))
            NeedWaitStates =
                ST.hasGFX950Insts()
                    ? GFX950_DMFMA16x16WritesVGPROverlappedSrcCWaitStates
                    : DMFMA16x16WritesVGPROverlappedSrcCWaitStates;
          break;
        case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
        case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
          if (!TII.isXDL(MI: *MI))
            NeedWaitStates = DMFMA4x4WritesVGPROverlappedSrcCWaitStates;
          break;
        default:
          // Non-DGEMM def: requirement depends on its pass count (latency).
          int NumPasses = TSchedModel.computeInstrLatency(MI: MI1);
          if (ST.hasGFX940Insts()) {
            // XDL consuming a non-XDL def has no src2 requirement here.
            if (TII.isXDL(MI: *MI) && !TII.isXDL(MI: *MI1))
              break;

            NeedWaitStates =
                TII.isXDL(MI: *MI1)
                    ? (TII.isXDL(MI: *MI)
                           ? GFX940_XDL_N_PassWritesVGPROverlappedXDLOrSMFMASrcCWaitStates(
                                 NumPasses, IsGFX950: ST.hasGFX950Insts())
                           : GFX940_XDL_N_PassWritesVGPROverlappedSGEMMDGEMMSrcCWaitStates(
                                 NumPasses, IsGFX950: ST.hasGFX950Insts()))
                    : GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(
                          NumPasses);
            break;
          }

          switch (NumPasses) {
          case 2:
            NeedWaitStates =
                SIInstrInfo::isDGEMM(Opcode: Opc)
                    ? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates
                    : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates;
            break;
          case 8:
            NeedWaitStates =
                SIInstrInfo::isDGEMM(Opcode: Opc)
                    ? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates
                    : SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates;
            break;
          case 16:
            NeedWaitStates =
                SIInstrInfo::isDGEMM(Opcode: Opc)
                    ? SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates
                    : SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates;
            break;
          default:
            llvm_unreachable("unexpected number of passes");
          }
        }
      }
    } else {
      // The use is src0/src1.
      switch (Opc1) {
      case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
      case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
      case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
      case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
        NeedWaitStates =
            ST.hasGFX950Insts()
                ? GFX950_DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates
                : DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates;
        break;
      case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
      case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
        NeedWaitStates = DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates;
        break;
      default:
        int NumPasses = TSchedModel.computeInstrLatency(MI: MI1);

        if (ST.hasGFX940Insts()) {
          NeedWaitStates =
              TII.isXDL(MI: *MI1)
                  ? GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(
                        NumPasses, IsGFX950: ST.hasGFX950Insts())
                  : GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(
                        NumPasses);
          break;
        }

        switch (NumPasses) {
        case 2:
          NeedWaitStates = SMFMA4x4WritesVGPROverlappedSrcABWaitStates;
          break;
        case 4:
          llvm_unreachable("unexpected number of passes for mfma");
        case 8:
          NeedWaitStates = SMFMA16x16WritesVGPROverlappedSrcABWaitStates;
          break;
        case 16:
        default:
          NeedWaitStates = SMFMA32x32WritesVGPROverlappedSrcABWaitStates;
        }
      }
    }
    // Skip the max computation when this use cannot raise the requirement.
    if (WaitStatesNeeded >= NeedWaitStates)
      continue;

    WaitStatesNeededForUse = NeedWaitStates - NumWaitStates;
    WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);

    // Already at the maximum any table entry can demand.
    if (WaitStatesNeeded == MaxWaitStates)
      break;
  }

  // Pad neighboring MFMA with noops for better inter-wave performance.
  WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: checkMFMAPadding(MI));

  return WaitStatesNeeded;
}
2916
// Compute wait states for loads/stores that consume VGPRs produced by
// ACCVGPR copies on pre-gfx90a MAI targets. Returns 0 on gfx90a+ where the
// equivalent hazards are handled by checkMAIVALUHazards().
int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) const {
  // On gfx90a+ relevant hazards are checked in checkMAIVALUHazards()
  if (!ST.hasMAIInsts() || ST.hasGFX90AInsts())
    return 0;

  int WaitStatesNeeded = 0;

  // An earlier V_ACCVGPR_READ producing a VGPR this instruction uses.
  auto IsAccVgprReadFn = [](const MachineInstr &MI) {
    return MI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64;
  };

  for (const MachineOperand &Op : MI->explicit_uses()) {
    if (!Op.isReg() || !TRI.isVGPR(MRI: MF.getRegInfo(), Reg: Op.getReg()))
      continue;

    Register Reg = Op.getReg();

    const int AccVgprReadLdStWaitStates = 2;
    const int VALUWriteAccVgprRdWrLdStDepVALUWaitStates = 1;
    const int MaxWaitStates = 2;

    // A V_ACCVGPR_READ def of Reg needs 2 wait states before this use.
    int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
      getWaitStatesSinceDef(Reg, IsHazardDef: IsAccVgprReadFn, Limit: MaxWaitStates);
    WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      return WaitStatesNeeded; // Early exit.

    // An ACCVGPR read/write is only hazardous here when Reg was written by
    // a non-MAI VALU within 2 wait states before that ACCVGPR access.
    auto IsVALUAccVgprRdWrCheckFn = [Reg, this](const MachineInstr &MI) {
      if (MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64 &&
          MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
        return false;
      auto IsVALUFn = [](const MachineInstr &MI) {
        return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMAI(MI);
      };
      return getWaitStatesSinceDef(Reg, IsHazardDef: IsVALUFn, Limit: 2 /*MaxWaitStates*/) <
             std::numeric_limits<int>::max();
    };

    WaitStatesNeededForUse = VALUWriteAccVgprRdWrLdStDepVALUWaitStates -
      getWaitStatesSince(IsHazard: IsVALUAccVgprRdWrCheckFn, Limit: MaxWaitStates);
    WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
  }

  return WaitStatesNeeded;
}
2963
2964int GCNHazardRecognizer::checkPermlaneHazards(MachineInstr *MI) const {
2965 assert(!ST.hasVcmpxPermlaneHazard() &&
2966 "this is a different vcmpx+permlane hazard");
2967 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2968 const SIInstrInfo *TII = ST.getInstrInfo();
2969
2970 auto IsVCmpXWritesExecFn = [TII, TRI](const MachineInstr &MI) {
2971 return isVCmpXWritesExec(TII: *TII, TRI: *TRI, MI);
2972 };
2973
2974 auto IsVALUFn = [](const MachineInstr &MI) {
2975 return SIInstrInfo::isVALU(MI);
2976 };
2977
2978 const int VCmpXWritesExecWaitStates = 4;
2979 const int VALUWritesVDstWaitStates = 2;
2980 int WaitStatesNeeded = 0;
2981
2982 for (const MachineOperand &Op : MI->explicit_uses()) {
2983 if (!Op.isReg() || !TRI->isVGPR(MRI: MF.getRegInfo(), Reg: Op.getReg()))
2984 continue;
2985 Register Reg = Op.getReg();
2986
2987 int WaitStatesSinceDef =
2988 VALUWritesVDstWaitStates -
2989 getWaitStatesSinceDef(Reg, IsHazardDef: IsVALUFn,
2990 /*MaxWaitStates=*/Limit: VALUWritesVDstWaitStates);
2991 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesSinceDef);
2992 if (WaitStatesNeeded >= VALUWritesVDstWaitStates)
2993 break;
2994 }
2995
2996 int VCmpXHazardWaits =
2997 VCmpXWritesExecWaitStates -
2998 getWaitStatesSince(IsHazard: IsVCmpXWritesExecFn, Limit: VCmpXWritesExecWaitStates);
2999
3000 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: VCmpXHazardWaits);
3001 return WaitStatesNeeded;
3002}
3003
// WAW hazard: wait states between an N-pass SMFMA VGPR write and a VALU
// write of the same register.
//    2 pass -> 4
//    4 pass -> 6
//    8 pass -> 10
//   16 pass -> 18
static int GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(int NumPasses) {
  constexpr int WawExtraCycles = 2;
  return NumPasses + WawExtraCycles;
}
3011
// WAW hazard: wait states between an N-pass XDL VGPR write and a VALU write
// of the same register.
//   xdl def cycles | gfx942 | gfx950
//         2 pass   |   5        5
//         4 pass   |   7        8
//         8 pass   |  11       12
//        16 pass   |  19       20
static int GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(int NumPasses,
                                                       bool IsGFX950) {
  int WaitStates = NumPasses + 3;
  // gfx950 adds one cycle except for the 2-pass case.
  if (IsGFX950 && NumPasses != 2)
    ++WaitStates;
  return WaitStates;
}
3021
// Wait states between an N-pass XDL VGPR write and a VALU/mem/export read
// of the same register.
//   xdl def cycles | gfx942 | gfx950
//         2 pass   |   5        5
//         4 pass   |   7        8
//         8 pass   |  11       12
//        16 pass   |  19       20
static int GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses,
                                                              bool IsGFX950) {
  // gfx950 adds one cycle except for the 2-pass case.
  const int GFX950Extra = (IsGFX950 && NumPasses != 2) ? 1 : 0;
  return NumPasses + 3 + GFX950Extra;
}
3031
// Wait states between an N-pass SMFMA VGPR write and a VALU/mem/export read
// of the same register.
//    2 pass -> 4
//    4 pass -> 6
//    8 pass -> 10
//   16 pass -> 18
static int GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses) {
  constexpr int ReadAfterWriteExtraCycles = 2;
  return NumPasses + ReadAfterWriteExtraCycles;
}
3039
// Compute wait states for a non-MFMA instruction (VALU, VMEM/DS, export, or
// V_FMA*/V_FMAC* F64) on gfx90a+ against earlier MFMA/DOT defs and uses.
// Covers RAW (MFMA/DOT writes a register this instruction reads), WAW
// (MFMA/DOT writes a register this instruction writes), and WAR (an SMFMA
// reads as src2 a register this instruction writes). MFMA instructions
// themselves are handled in checkMAIHazards90A().
int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) const {
  if (!ST.hasGFX90AInsts())
    return 0;

  auto IsDGEMMFn = [](const MachineInstr &MI) -> bool {
    return SIInstrInfo::isDGEMM(Opcode: MI.getOpcode());
  };

  // This is checked in checkMAIHazards90A()
  if (SIInstrInfo::isMFMA(MI: *MI))
    return 0;

  const MachineRegisterInfo &MRI = MF.getRegInfo();

  int WaitStatesNeeded = 0;

  bool IsMem = SIInstrInfo::isVMEM(MI: *MI) || SIInstrInfo::isDS(MI: *MI);
  bool IsMemOrExport = IsMem || SIInstrInfo::isEXP(MI: *MI);
  bool IsVALU = SIInstrInfo::isVALU(MI: *MI);

  // The lambdas below communicate through Reg (the register currently being
  // checked) and record the matched instruction in MFMA / DOT.
  const MachineInstr *MFMA = nullptr;
  unsigned Reg;
  auto IsMFMAWriteFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
    if (!SIInstrInfo::isMFMA(MI) ||
        !TRI.regsOverlap(RegA: MI.getOperand(i: 0).getReg(), RegB: Reg))
      return false;
    MFMA = &MI;
    return true;
  };

  const MachineInstr *DOT = nullptr;
  auto IsDotWriteFn = [&Reg, &DOT, this](const MachineInstr &MI) {
    if (!SIInstrInfo::isDOT(MI) ||
        !TRI.regsOverlap(RegA: MI.getOperand(i: 0).getReg(), RegB: Reg))
      return false;
    DOT = &MI;
    return true;
  };

  bool DGEMMAfterVALUWrite = false;
  auto IsDGEMMHazard = [&DGEMMAfterVALUWrite, this](const MachineInstr &MI) {
    // Found DGEMM on reverse traversal to def.
    if (SIInstrInfo::isDGEMM(Opcode: MI.getOpcode()))
      DGEMMAfterVALUWrite = true;

    // Only hazard if register is defined by a VALU and a DGEMM is found after
    // after the def.
    if (!TII.isVALU(MI) || !DGEMMAfterVALUWrite)
      return false;

    return true;
  };

  int SrcCIdx = AMDGPU::getNamedOperandIdx(Opcode: MI->getOpcode(),
                                           Name: AMDGPU::OpName::src2);

  if (IsMemOrExport || IsVALU) {
    // RAW checks: each use register against earlier DOT and MFMA writes.
    const int SMFMA4x4WriteVgprVALUMemExpReadWaitStates = 5;
    const int SMFMA16x16WriteVgprVALUMemExpReadWaitStates = 11;
    const int SMFMA32x32WriteVgprVALUMemExpReadWaitStates = 19;
    const int DMFMA4x4WriteVgprMemExpReadWaitStates = 9;
    const int DMFMA16x16WriteVgprMemExpReadWaitStates = 18;
    const int DMFMA4x4WriteVgprVALUReadWaitStates = 6;
    const int DMFMA16x16WriteVgprVALUReadWaitStates = 11;
    const int GFX950_DMFMA16x16WriteVgprVALUReadWaitStates = 19;
    const int DotWriteSameDotReadSrcAB = 3;
    const int DotWriteDifferentVALURead = 3;
    const int DMFMABetweenVALUWriteVMEMRead = 2;
    const int MaxWaitStates = 19;

    for (const MachineOperand &Use : MI->explicit_uses()) {
      if (!Use.isReg())
        continue;
      Reg = Use.getReg();

      // Check an earlier DOT write of this register.
      DOT = nullptr;
      int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsHazardDef: IsDotWriteFn,
                                                     Limit: MaxWaitStates);
      if (DOT) {
        int NeedWaitStates = 0;
        if (DOT->getOpcode() == MI->getOpcode()) {
          // Same DOT opcode: only src0/src1 reads have a requirement.
          if (&Use - &MI->getOperand(i: 0) != SrcCIdx)
            NeedWaitStates = DotWriteSameDotReadSrcAB;
        } else {
          NeedWaitStates = DotWriteDifferentVALURead;
        }

        int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
        WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
      }

      // Workaround for HW data hazard bug observed only in GFX90A. When there
      // is a DGEMM instruction in-between a VALU and a VMEM instruction it
      // causes the SQ to incorrectly not insert two wait states between the two
      // instructions needed to avoid data hazard.
      if (IsMem && ST.hasGFX90AInsts() && !ST.hasGFX940Insts()) {
        DGEMMAfterVALUWrite = false;
        if (TRI.isVectorRegister(MRI, Reg)) {
          int WaitStatesNeededForUse =
              DMFMABetweenVALUWriteVMEMRead -
              getWaitStatesSinceDef(Reg, IsHazardDef: IsDGEMMHazard,
                                    Limit: DMFMABetweenVALUWriteVMEMRead);

          WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
        }
      }

      // Check an earlier MFMA write of this register.
      MFMA = nullptr;
      WaitStatesSinceDef =
          getWaitStatesSinceDef(Reg, IsHazardDef: IsMFMAWriteFn, Limit: MaxWaitStates);
      if (!MFMA)
        continue;

      unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MI: MFMA);
      int NumPasses = HazardDefLatency;
      int NeedWaitStates = MaxWaitStates;

      if (SIInstrInfo::isDGEMM(Opcode: MFMA->getOpcode())) {
        switch (HazardDefLatency) {
        case 4:
          NeedWaitStates = IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates
                                         : DMFMA4x4WriteVgprVALUReadWaitStates;
          break;
        case 8:
        case 16:
          NeedWaitStates =
              IsMemOrExport
                  ? DMFMA16x16WriteVgprMemExpReadWaitStates
                  : (ST.hasGFX950Insts()
                         ? GFX950_DMFMA16x16WriteVgprVALUReadWaitStates
                         : DMFMA16x16WriteVgprVALUReadWaitStates);
          break;
        default:
          llvm_unreachable("unexpected dgemm");
        }
      } else if (ST.hasGFX940Insts()) {
        NeedWaitStates =
            TII.isXDL(MI: *MFMA)
                ? GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(
                      NumPasses, IsGFX950: ST.hasGFX950Insts())
                : GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(
                      NumPasses);
      } else {
        switch (HazardDefLatency) {
        case 2:
          NeedWaitStates = SMFMA4x4WriteVgprVALUMemExpReadWaitStates;
          break;
        case 8:
          NeedWaitStates = SMFMA16x16WriteVgprVALUMemExpReadWaitStates;
          break;
        case 16:
          NeedWaitStates = SMFMA32x32WriteVgprVALUMemExpReadWaitStates;
          break;
        default:
          llvm_unreachable("unexpected number of passes for mfma");
        }
      }

      int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
      WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);

      if (WaitStatesNeeded == MaxWaitStates)
        break;
    }
  }

  // A double-precision FMA within 2 wait states of a DGEMM is a hazard
  // regardless of register overlap.
  unsigned Opc = MI->getOpcode();
  const int DMFMAToFMA64WaitStates = 2;
  if ((Opc == AMDGPU::V_FMA_F64_e64 ||
       Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64 ||
       Opc == AMDGPU::V_FMAC_F64_dpp) &&
      WaitStatesNeeded < DMFMAToFMA64WaitStates) {
    int WaitStatesNeededForUse = DMFMAToFMA64WaitStates -
      getWaitStatesSince(IsHazard: IsDGEMMFn, Limit: DMFMAToFMA64WaitStates);
    WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
  }

  if (!IsVALU && !IsMemOrExport)
    return WaitStatesNeeded;

  // WAW and WAR checks on each def register.
  for (const MachineOperand &Def : MI->defs()) {
    const int SMFMA4x4WriteVgprVALUWawWaitStates = 5;
    const int SMFMA16x16WriteVgprVALUWawWaitStates = 11;
    const int SMFMA32x32WriteVgprVALUWawWaitStates = 19;
    const int SMFMA4x4ReadVgprVALUWarWaitStates = 1;
    const int GFX940_XDL4PassReadVgprVALUWarWaitStates = 3;
    const int SMFMA16x16ReadVgprVALUWarWaitStates = 7;
    const int SMFMA32x32ReadVgprVALUWarWaitStates = 15;
    const int DMFMA4x4WriteVgprVALUWriteWaitStates = 6;
    const int DMFMA16x16WriteVgprVALUWriteWaitStates = 11;
    const int DotWriteDifferentVALUWrite = 3;
    const int MaxWaitStates = 19;
    const int MaxWarWaitStates = 15;

    Reg = Def.getReg();

    // WAW against an earlier DOT write of a different opcode.
    DOT = nullptr;
    int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsHazardDef: IsDotWriteFn,
                                                   Limit: MaxWaitStates);
    if (DOT && DOT->getOpcode() != MI->getOpcode())
      WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: DotWriteDifferentVALUWrite -
                                                    WaitStatesSinceDef);

    // WAW against an earlier MFMA write.
    MFMA = nullptr;
    WaitStatesSinceDef =
        getWaitStatesSinceDef(Reg, IsHazardDef: IsMFMAWriteFn, Limit: MaxWaitStates);
    if (MFMA) {
      int NeedWaitStates = MaxWaitStates;
      int NumPasses = TSchedModel.computeInstrLatency(MI: MFMA);

      if (SIInstrInfo::isDGEMM(Opcode: MFMA->getOpcode())) {
        switch (NumPasses) {
        case 4:
          NeedWaitStates = DMFMA4x4WriteVgprVALUWriteWaitStates;
          break;
        case 8:
        case 16:
          NeedWaitStates = DMFMA16x16WriteVgprVALUWriteWaitStates;
          break;
        default:
          llvm_unreachable("unexpected number of cycles for dgemm");
        }
      } else if (ST.hasGFX940Insts()) {
        NeedWaitStates =
            TII.isXDL(MI: *MFMA)
                ? GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(
                      NumPasses, IsGFX950: ST.hasGFX950Insts())
                : GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(NumPasses);
      } else {
        switch (NumPasses) {
        case 2:
          NeedWaitStates = SMFMA4x4WriteVgprVALUWawWaitStates;
          break;
        case 8:
          NeedWaitStates = SMFMA16x16WriteVgprVALUWawWaitStates;
          break;
        case 16:
          NeedWaitStates = SMFMA32x32WriteVgprVALUWawWaitStates;
          break;
        default:
          llvm_unreachable("Unexpected number of passes for mfma");
        }
      }

      int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
      WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);

      if (WaitStatesNeeded == MaxWaitStates)
        break;
    }

    // WAR: an earlier SMFMA (on gfx940+: XDL only) reading Reg as src2.
    auto IsSMFMAReadAsCFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
      if (!SIInstrInfo::isMFMA(MI) || SIInstrInfo::isDGEMM(Opcode: MI.getOpcode()) ||
          !MI.readsRegister(Reg, TRI: &TRI))
        return false;

      if (ST.hasGFX940Insts() && !TII.isXDL(MI))
        return false;

      const MachineOperand *SrcC =
          TII.getNamedOperand(MI, OperandName: AMDGPU::OpName::src2);
      assert(SrcC);
      if (!SrcC->isReg() || !TRI.regsOverlap(RegA: SrcC->getReg(), RegB: Reg))
        return false;

      MFMA = &MI;
      return true;
    };

    MFMA = nullptr;
    int WaitStatesSinceUse = getWaitStatesSince(IsHazard: IsSMFMAReadAsCFn,
                                                Limit: MaxWarWaitStates);
    if (!MFMA)
      continue;

    unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MI: MFMA);
    int NeedWaitStates = MaxWaitStates;
    switch (HazardDefLatency) {
    case 2:  NeedWaitStates = SMFMA4x4ReadVgprVALUWarWaitStates;
      break;
    case 4:  assert(ST.hasGFX940Insts());
      NeedWaitStates = GFX940_XDL4PassReadVgprVALUWarWaitStates;
      break;
    case 8:  NeedWaitStates = SMFMA16x16ReadVgprVALUWarWaitStates;
      break;
    case 16: [[fallthrough]];
    default: NeedWaitStates = SMFMA32x32ReadVgprVALUWarWaitStates;
      break;
    }

    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceUse;
    WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
  }

  return WaitStatesNeeded;
}
3336
3337bool GCNHazardRecognizer::ShouldPreferAnother(SUnit *SU) const {
3338 if (!SU->isInstr())
3339 return false;
3340
3341 const MachineInstr *MAI = nullptr;
3342
3343 auto IsMFMAFn = [&MAI](const MachineInstr &MI) {
3344 MAI = nullptr;
3345 if (SIInstrInfo::isMFMA(MI))
3346 MAI = &MI;
3347 return MAI != nullptr;
3348 };
3349
3350 MachineInstr *MI = SU->getInstr();
3351 if (IsMFMAFn(*MI)) {
3352 int W = getWaitStatesSince(IsHazard: IsMFMAFn, Limit: 16);
3353 if (MAI)
3354 return W < (int)TSchedModel.computeInstrLatency(MI: MAI);
3355 }
3356
3357 return false;
3358}
3359
3360// Adjust global offsets for instructions bundled with S_GETPC_B64 after
3361// insertion of a new instruction.
3362static void updateGetPCBundle(MachineInstr *NewMI) {
3363 if (!NewMI->isBundled())
3364 return;
3365
3366 // Find start of bundle.
3367 auto I = NewMI->getIterator();
3368 while (I->isBundledWithPred())
3369 I--;
3370 if (I->isBundle())
3371 I++;
3372
3373 // Bail if this is not an S_GETPC bundle.
3374 if (I->getOpcode() != AMDGPU::S_GETPC_B64)
3375 return;
3376
3377 // Update offsets of any references in the bundle.
3378 const unsigned NewBytes = 4;
3379 assert(NewMI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
3380 "Unexpected instruction insertion in bundle");
3381 auto NextMI = std::next(x: NewMI->getIterator());
3382 auto End = NewMI->getParent()->end();
3383 while (NextMI != End && NextMI->isBundledWithPred()) {
3384 for (auto &Operand : NextMI->operands()) {
3385 if (Operand.isGlobal())
3386 Operand.setOffset(Operand.getOffset() + NewBytes);
3387 }
3388 NextMI++;
3389 }
3390}
3391
// Detect and fix the wave64 VALU-mask-write hazard by inserting (and, where
// possible, merging) an S_WAITCNT_DEPCTR after the SGPR write in MI.
// Returns true if an instruction was inserted.
bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
  if (!ST.hasVALUMaskWriteHazard())
    return false;
  assert(!ST.hasExtendedWaitCounts());

  if (!ST.isWave64())
    return false;

  const bool IsSALU = SIInstrInfo::isSALU(MI: *MI);
  const bool IsVALU = SIInstrInfo::isVALU(MI: *MI);
  if (!IsSALU && !IsVALU)
    return false;

  // The hazard sequence is three instructions:
  //   1. VALU reads SGPR as mask
  //   2. VALU/SALU writes SGPR
  //   3. VALU/SALU reads SGPR
  // The hazard can expire if the distance between 2 and 3 is sufficient,
  // or (2) is VALU and (3) is SALU.
  // In practice this happens <10% of the time, hence always assume the hazard
  // exists if (1) and (2) are present to avoid searching all SGPR reads.

  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const MachineRegisterInfo &MRI = MF.getRegInfo();

  // SGPRs that never participate in this hazard.
  auto IgnoreableSGPR = [](const Register Reg) {
    switch (Reg) {
    case AMDGPU::EXEC:
    case AMDGPU::EXEC_LO:
    case AMDGPU::EXEC_HI:
    case AMDGPU::M0:
    case AMDGPU::SGPR_NULL:
    case AMDGPU::SGPR_NULL64:
    case AMDGPU::SCC:
      return true;
    default:
      return false;
    }
  };
  auto IsVCC = [](const Register Reg) {
    return Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::VCC_HI;
  };

  // Search state: the set of 32-bit SGPRs still considered hazardous.
  struct StateType {
    SmallSet<Register, 2> HazardSGPRs;

    static unsigned getHashValue(const StateType &State) {
      return hash_combine_range(R: State.HazardSGPRs);
    }
    static bool isEqual(const StateType &LHS, const StateType &RHS) {
      return LHS.HazardSGPRs == RHS.HazardSGPRs;
    }
  };

  SmallVector<const MachineInstr *> WaitInstrs;
  bool HasSGPRRead = false;
  StateType InitialState;

  // Look for SGPR write.
  MachineOperand *HazardDef = nullptr;
  for (MachineOperand &Op : MI->operands()) {
    if (!Op.isReg())
      continue;
    if (Op.isDef() && HazardDef)
      continue;

    Register Reg = Op.getReg();
    if (IgnoreableSGPR(Reg))
      continue;
    if (!IsVCC(Reg)) {
      // Non-VCC: only explicit SGPR operands are of interest.
      if (Op.isImplicit())
        continue;
      if (!TRI->isSGPRReg(MRI, Reg))
        continue;
    }
    // Also check for SGPR reads.
    if (Op.isUse()) {
      HasSGPRRead = true;
      continue;
    }

    assert(!HazardDef);
    HazardDef = &Op;
  }

  if (!HazardDef)
    return false;

  // Setup to track writes to individual SGPRs
  const Register HazardReg = HazardDef->getReg();
  if (AMDGPU::SReg_32RegClass.contains(Reg: HazardReg)) {
    InitialState.HazardSGPRs.insert(V: HazardReg);
  } else {
    // 64-bit SGPR def: track both 32-bit halves.
    assert(AMDGPU::SReg_64RegClass.contains(HazardReg));
    InitialState.HazardSGPRs.insert(V: TRI->getSubReg(Reg: HazardReg, Idx: AMDGPU::sub0));
    InitialState.HazardSGPRs.insert(V: TRI->getSubReg(Reg: HazardReg, Idx: AMDGPU::sub1));
  }

  // Hazard test: an earlier VALU using HazardReg as a carry/condition mask.
  auto IsHazardFn = [&](StateType &State, const MachineInstr &I) {
    if (State.HazardSGPRs.empty())
      return HazardExpired;

    switch (I.getOpcode()) {
    case AMDGPU::V_ADDC_U32_e32:
    case AMDGPU::V_ADDC_U32_dpp:
    case AMDGPU::V_CNDMASK_B16_t16_e32:
    case AMDGPU::V_CNDMASK_B16_fake16_e32:
    case AMDGPU::V_CNDMASK_B16_t16_dpp:
    case AMDGPU::V_CNDMASK_B16_fake16_dpp:
    case AMDGPU::V_CNDMASK_B32_e32:
    case AMDGPU::V_CNDMASK_B32_dpp:
    case AMDGPU::V_DIV_FMAS_F32_e64:
    case AMDGPU::V_DIV_FMAS_F64_e64:
    case AMDGPU::V_SUBB_U32_e32:
    case AMDGPU::V_SUBB_U32_dpp:
    case AMDGPU::V_SUBBREV_U32_e32:
    case AMDGPU::V_SUBBREV_U32_dpp: {
      // These implicitly read VCC as mask source.
      return IsVCC(HazardReg) ? HazardFound : NoHazardFound;
    }
    case AMDGPU::V_ADDC_U32_e64:
    case AMDGPU::V_ADDC_U32_e64_dpp:
    case AMDGPU::V_CNDMASK_B16_t16_e64:
    case AMDGPU::V_CNDMASK_B16_fake16_e64:
    case AMDGPU::V_CNDMASK_B16_t16_e64_dpp:
    case AMDGPU::V_CNDMASK_B16_fake16_e64_dpp:
    case AMDGPU::V_CNDMASK_B32_e64:
    case AMDGPU::V_CNDMASK_B32_e64_dpp:
    case AMDGPU::V_SUBB_U32_e64:
    case AMDGPU::V_SUBB_U32_e64_dpp:
    case AMDGPU::V_SUBBREV_U32_e64:
    case AMDGPU::V_SUBBREV_U32_e64_dpp: {
      // Only check mask register overlaps.
      const MachineOperand *SSRCOp = TII.getNamedOperand(MI: I, OperandName: AMDGPU::OpName::src2);
      assert(SSRCOp);
      bool Result = TRI->regsOverlap(RegA: SSRCOp->getReg(), RegB: HazardReg);
      return Result ? HazardFound : NoHazardFound;
    }
    default:
      return NoHazardFound;
    }
  };

  // An existing wait is mergable only if all three fields are at this
  // maximal "no wait" encoding.
  const unsigned ConstantMaskBits = AMDGPU::DepCtr::encodeFieldSaSdst(
      Encoded: AMDGPU::DepCtr::encodeFieldVaSdst(Encoded: AMDGPU::DepCtr::encodeFieldVaVcc(VaVcc: 0, STI: ST),
                                         VaSdst: 0),
      SaSdst: 0);
  auto UpdateStateFn = [&](StateType &State, const MachineInstr &I) {
    switch (I.getOpcode()) {
    case AMDGPU::S_WAITCNT_DEPCTR:
      // Record mergable waits within region of instructions free of SGPR reads.
      if (!HasSGPRRead && I.getParent() == MI->getParent() && !I.isBundled() &&
          (I.getOperand(i: 0).getImm() & ConstantMaskBits) == ConstantMaskBits)
        WaitInstrs.push_back(Elt: &I);
      break;
    default:
      // Update tracking of SGPR reads and writes.
      for (auto &Op : I.operands()) {
        if (!Op.isReg())
          continue;

        Register Reg = Op.getReg();
        if (IgnoreableSGPR(Reg))
          continue;
        if (!IsVCC(Reg)) {
          if (Op.isImplicit())
            continue;
          if (!TRI->isSGPRReg(MRI, Reg))
            continue;
        }
        if (Op.isUse()) {
          HasSGPRRead = true;
          continue;
        }

        // Stop tracking any SGPRs with writes on the basis that they will
        // already have an appropriate wait inserted afterwards.
        SmallVector<Register, 2> Found;
        for (Register SGPR : State.HazardSGPRs) {
          if (Reg == SGPR || TRI->regsOverlap(RegA: Reg, RegB: SGPR))
            Found.push_back(Elt: SGPR);
        }
        for (Register SGPR : Found)
          State.HazardSGPRs.erase(V: SGPR);
      }
      break;
    }
  };

  // Check for hazard
  if (!hasHazard<StateType>(InitialState, IsHazard: IsHazardFn, UpdateState: UpdateStateFn,
                            InitialMBB: MI->getParent(),
                            InitialI: std::next(x: MI->getReverseIterator())))
    return false;

  // Compute counter mask
  unsigned DepCtr =
      IsVALU ? (IsVCC(HazardReg) ? AMDGPU::DepCtr::encodeFieldVaVcc(VaVcc: 0, STI: ST)
                                 : AMDGPU::DepCtr::encodeFieldVaSdst(VaSdst: 0, STI: ST))
             : AMDGPU::DepCtr::encodeFieldSaSdst(SaSdst: 0, STI: ST);

  // Try to merge previous waits into this one for regions with no SGPR reads.
  if (!WaitInstrs.empty()) {
    // Note: WaitInstrs contains const pointers, so walk backward from MI to
    // obtain a mutable pointer to each instruction to be merged.
    // This is expected to be a very short walk within the same block.
    SmallVector<MachineInstr *> ToErase;
    unsigned Found = 0;
    for (MachineBasicBlock::reverse_iterator It = MI->getReverseIterator(),
                                             End = MI->getParent()->rend();
         Found < WaitInstrs.size() && It != End; ++It) {
      MachineInstr *WaitMI = &*It;
      // Find next wait instruction.
      if (std::as_const(t&: WaitMI) != WaitInstrs[Found])
        continue;
      Found++;
      // Fold the old wait's counters into the new one (minimum per field).
      unsigned WaitMask = WaitMI->getOperand(i: 0).getImm();
      assert((WaitMask & ConstantMaskBits) == ConstantMaskBits);
      DepCtr = AMDGPU::DepCtr::encodeFieldSaSdst(
          Encoded: DepCtr, SaSdst: std::min(a: AMDGPU::DepCtr::decodeFieldSaSdst(Encoded: WaitMask),
                             b: AMDGPU::DepCtr::decodeFieldSaSdst(Encoded: DepCtr)));
      DepCtr = AMDGPU::DepCtr::encodeFieldVaSdst(
          Encoded: DepCtr, VaSdst: std::min(a: AMDGPU::DepCtr::decodeFieldVaSdst(Encoded: WaitMask),
                             b: AMDGPU::DepCtr::decodeFieldVaSdst(Encoded: DepCtr)));
      DepCtr = AMDGPU::DepCtr::encodeFieldVaVcc(
          Encoded: DepCtr, VaVcc: std::min(a: AMDGPU::DepCtr::decodeFieldVaVcc(Encoded: WaitMask),
                            b: AMDGPU::DepCtr::decodeFieldVaVcc(Encoded: DepCtr)));
      ToErase.push_back(Elt: WaitMI);
    }
    assert(Found == WaitInstrs.size());
    for (MachineInstr *WaitMI : ToErase)
      WaitMI->eraseFromParent();
  }

  // Add s_waitcnt_depctr after SGPR write.
  auto NextMI = std::next(x: MI->getIterator());
  auto NewMI = BuildMI(BB&: *MI->getParent(), I: NextMI, MIMD: MI->getDebugLoc(),
                       MCID: TII.get(Opcode: AMDGPU::S_WAITCNT_DEPCTR))
                   .addImm(Val: DepCtr);

  // SALU write may be s_getpc in a bundle.
  updateGetPCBundle(NewMI);

  return true;
}
3637
3638static bool ensureEntrySetPrio(MachineFunction *MF, int Priority,
3639 const SIInstrInfo &TII) {
3640 MachineBasicBlock &EntryMBB = MF->front();
3641 if (EntryMBB.begin() != EntryMBB.end()) {
3642 auto &EntryMI = *EntryMBB.begin();
3643 if (EntryMI.getOpcode() == AMDGPU::S_SETPRIO &&
3644 EntryMI.getOperand(i: 0).getImm() >= Priority)
3645 return false;
3646 }
3647
3648 BuildMI(BB&: EntryMBB, I: EntryMBB.begin(), MIMD: DebugLoc(), MCID: TII.get(Opcode: AMDGPU::S_SETPRIO))
3649 .addImm(Val: Priority);
3650 return true;
3651}
3652
// Workaround for subtargets that require exports to run at an elevated
// priority: raise priority at shader entry, and after the last export in a
// sequence drop priority, wait, and pad with s_nops before restoring it.
// Returns true if any instruction was inserted or modified.
bool GCNHazardRecognizer::fixRequiredExportPriority(MachineInstr *MI) {
  if (!ST.hasRequiredExportPriority())
    return false;

  // Assume the following shader types will never have exports,
  // and avoid adding or adjusting S_SETPRIO.
  MachineBasicBlock *MBB = MI->getParent();
  MachineFunction *MF = MBB->getParent();
  auto CC = MF->getFunction().getCallingConv();
  switch (CC) {
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_CS_Chain:
  case CallingConv::AMDGPU_CS_ChainPreserve:
  case CallingConv::AMDGPU_KERNEL:
    return false;
  default:
    break;
  }

  // Priority levels used by the workaround sequence.
  const int MaxPriority = 3;
  const int NormalPriority = 2;
  const int PostExportPriority = 0;

  auto It = MI->getIterator();
  switch (MI->getOpcode()) {
  case AMDGPU::S_ENDPGM:
  case AMDGPU::S_ENDPGM_SAVED:
  case AMDGPU::S_ENDPGM_ORDERED_PS_DONE:
  case AMDGPU::SI_RETURN_TO_EPILOG:
    // Ensure shader with calls raises priority at entry.
    // This ensures correct priority if exports exist in callee.
    if (MF->getFrameInfo().hasCalls())
      return ensureEntrySetPrio(MF, Priority: NormalPriority, TII);
    return false;
  case AMDGPU::S_SETPRIO: {
    // Raise minimum priority unless in workaround.
    auto &PrioOp = MI->getOperand(i: 0);
    int Prio = PrioOp.getImm();
    // An S_SETPRIO to PostExportPriority directly after an export is taken
    // to be part of an already-applied workaround sequence; leave it alone.
    bool InWA = (Prio == PostExportPriority) &&
                (It != MBB->begin() && TII.isEXP(MI: *std::prev(x: It)));
    if (InWA || Prio >= NormalPriority)
      return false;
    // Bump the priority, clamped to the maximum encodable level.
    PrioOp.setImm(std::min(a: Prio + NormalPriority, b: MaxPriority));
    return true;
  }
  default:
    // Beyond this point only export instructions themselves need handling.
    if (!TII.isEXP(MI: *MI))
      return false;
    break;
  }

  // Check entry priority at each export (as there will only be a few).
  // Note: amdgpu_gfx can only be a callee, so defer to caller setprio.
  bool Changed = false;
  if (CC != CallingConv::AMDGPU_Gfx && CC != CallingConv::AMDGPU_Gfx_WholeWave)
    Changed = ensureEntrySetPrio(MF, Priority: NormalPriority, TII);

  auto NextMI = std::next(x: It);
  bool EndOfShader = false;
  if (NextMI != MBB->end()) {
    // Only need WA at end of sequence of exports.
    if (TII.isEXP(MI: *NextMI))
      return Changed;
    // Assume appropriate S_SETPRIO after export means WA already applied.
    if (NextMI->getOpcode() == AMDGPU::S_SETPRIO &&
        NextMI->getOperand(i: 0).getImm() == PostExportPriority)
      return Changed;
    // At S_ENDPGM the wait and priority restore below are unnecessary.
    EndOfShader = NextMI->getOpcode() == AMDGPU::S_ENDPGM;
  }

  const DebugLoc &DL = MI->getDebugLoc();

  // Lower priority.
  BuildMI(BB&: *MBB, I: NextMI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_SETPRIO))
      .addImm(Val: PostExportPriority);

  if (!EndOfShader) {
    // Wait for exports to complete.
    BuildMI(BB&: *MBB, I: NextMI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_WAITCNT_EXPCNT))
        .addReg(RegNo: AMDGPU::SGPR_NULL)
        .addImm(Val: 0);
  }

  // Two s_nops while running at the lowered priority.
  BuildMI(BB&: *MBB, I: NextMI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_NOP)).addImm(Val: 0);
  BuildMI(BB&: *MBB, I: NextMI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_NOP)).addImm(Val: 0);

  if (!EndOfShader) {
    // Return to normal (higher) priority.
    BuildMI(BB&: *MBB, I: NextMI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_SETPRIO))
        .addImm(Val: NormalPriority);
  }

  return true;
}
3747
3748bool GCNHazardRecognizer::fixGetRegWaitIdle(MachineInstr *MI) {
3749 if (!isSGetReg(Opcode: MI->getOpcode()))
3750 return false;
3751
3752 const SIInstrInfo *TII = ST.getInstrInfo();
3753 switch (getHWReg(TII, RegInstr: *MI)) {
3754 default:
3755 return false;
3756 case AMDGPU::Hwreg::ID_STATUS:
3757 case AMDGPU::Hwreg::ID_STATE_PRIV:
3758 case AMDGPU::Hwreg::ID_EXCP_FLAG_PRIV:
3759 case AMDGPU::Hwreg::ID_EXCP_FLAG_USER:
3760 break;
3761 }
3762
3763 BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(),
3764 MCID: TII->get(Opcode: AMDGPU::S_WAITCNT_DEPCTR))
3765 .addImm(Val: 0);
3766 return true;
3767}
3768
3769bool GCNHazardRecognizer::fixDsAtomicAsyncBarrierArriveB64(MachineInstr *MI) {
3770 if (MI->getOpcode() != AMDGPU::DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64)
3771 return false;
3772
3773 const SIInstrInfo *TII = ST.getInstrInfo();
3774 BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(),
3775 MCID: TII->get(Opcode: AMDGPU::S_WAITCNT_DEPCTR))
3776 .addImm(Val: AMDGPU::DepCtr::encodeFieldVmVsrc(VmVsrc: 0, STI: ST));
3777 BuildMI(BB&: *MI->getParent(), I: std::next(x: MI->getIterator()), MIMD: MI->getDebugLoc(),
3778 MCID: TII->get(Opcode: AMDGPU::S_WAITCNT_DEPCTR))
3779 .addImm(Val: AMDGPU::DepCtr::encodeFieldVmVsrc(VmVsrc: 0, STI: ST));
3780
3781 return true;
3782}
3783
// Mitigate a forwarding hazard on reads of the flat scratch base registers
// (SRC_FLAT_SCRATCH_BASE_LO/HI, backed here by SGPR102/SGPR103) following a
// recent SGPR write. If a hazardous write is found within the window, insert
// an s_waitcnt_depctr with sa_sdst(0) and va_sdst(0) before the read.
// Returns true if a wait was inserted.
bool GCNHazardRecognizer::fixScratchBaseForwardingHazard(MachineInstr *MI) {
  // No reason to check this in pre-RA scheduling, SGPRs have to be allocated
  // for hazard to trigger.
  if (!IsHazardRecognizerMode)
    return false;

  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();
  // Hazard expires after 10 SGPR writes by SALU or 8 SGPR writes by VALU.
  const int FlatScrBaseWaitStates = 10;

  // Determine which halves of the flat scratch base this instruction reads,
  // either via an explicit register operand or via s_getreg of the
  // corresponding hardware register.
  bool ReadsFlatScrLo =
      MI->readsRegister(Reg: AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, TRI);
  bool ReadsFlatScrHi =
      MI->readsRegister(Reg: AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, TRI);
  if (isSGetReg(Opcode: MI->getOpcode())) {
    switch (getHWReg(TII, RegInstr: *MI)) {
    default:
      break;
    case AMDGPU::Hwreg::ID_FLAT_SCR_LO:
      ReadsFlatScrLo = true;
      break;
    case AMDGPU::Hwreg::ID_FLAT_SCR_HI:
      ReadsFlatScrHi = true;
      break;
    }
  }

  const MachineRegisterInfo &MRI = MF.getRegInfo();

  // Returns true if a write to \p Reg is found within the hazard window
  // (counted in SGPR writes, not instructions) looking backward from MI.
  auto IsRegDefHazard = [&](Register Reg) -> bool {
    DenseSet<const MachineBasicBlock *> Visited;
    auto IsHazardFn = [TRI, Reg](const MachineInstr &MI) {
      return MI.modifiesRegister(Reg, TRI);
    };

    // This literally abuses the idea of waitstates. Instead of waitstates it
    // returns 1 for SGPR written and 0 otherwise.
    auto IsSGPRDef = [TII, TRI, &MRI](const MachineInstr &MI) -> unsigned {
      if (!TII->isSALU(MI) && !TII->isVALU(MI))
        return 0;
      for (const MachineOperand &MO : MI.all_defs()) {
        if (TRI->isSGPRReg(MRI, Reg: MO.getReg()))
          return 1;
      }
      return 0;
    };

    // The search also expires at an existing s_waitcnt_depctr that already
    // waits on both sa_sdst(0) and va_sdst(0), since that resolves the
    // hazard.
    auto IsExpiredFn = [=](const MachineInstr &MI, int SgprWrites) {
      if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR) {
        unsigned Wait = MI.getOperand(i: 0).getImm();
        if (AMDGPU::DepCtr::decodeFieldSaSdst(Encoded: Wait) == 0 &&
            AMDGPU::DepCtr::decodeFieldVaSdst(Encoded: Wait) == 0)
          return true;
      }
      return SgprWrites >= FlatScrBaseWaitStates;
    };

    return ::getWaitStatesSince(
               IsHazard: IsHazardFn, MBB: MI->getParent(), I: std::next(x: MI->getReverseIterator()),
               WaitStates: 0, IsExpired: IsExpiredFn, Visited, GetNumWaitStates: IsSGPRDef) < FlatScrBaseWaitStates;
  };

  // No hazard if neither half is read, if the backing SGPR is constant, or
  // if no recent def of it was found within the window.
  if ((!ReadsFlatScrLo || MRI.isConstantPhysReg(PhysReg: AMDGPU::SGPR102) ||
       !IsRegDefHazard(AMDGPU::SGPR102)) &&
      (!ReadsFlatScrHi || MRI.isConstantPhysReg(PhysReg: AMDGPU::SGPR103) ||
       !IsRegDefHazard(AMDGPU::SGPR103)))
    return false;

  // Wait for both SALU and VALU SGPR writes to land before the read.
  BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(),
          MCID: TII->get(Opcode: AMDGPU::S_WAITCNT_DEPCTR))
      .addImm(Val: AMDGPU::DepCtr::encodeFieldVaSdst(
          Encoded: AMDGPU::DepCtr::encodeFieldSaSdst(SaSdst: 0, STI: ST), VaSdst: 0));
  return true;
}
3859
3860bool GCNHazardRecognizer::fixSetRegMode(MachineInstr *MI) {
3861 if (!isSSetReg(Opcode: MI->getOpcode()) ||
3862 MI->getOperand(i: 1).getImm() != AMDGPU::Hwreg::ID_MODE)
3863 return false;
3864
3865 BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(), MCID: TII.get(Opcode: AMDGPU::V_NOP_e32));
3866 BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(), MCID: TII.get(Opcode: AMDGPU::V_NOP_e32));
3867 return true;
3868}
3869