1//===-- GCNHazardRecognizers.cpp - GCN Hazard Recognizer Impls ------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements hazard recognizers for scheduling on GCN processors.
10//
11//===----------------------------------------------------------------------===//
12
13#include "GCNHazardRecognizer.h"
14#include "AMDGPUWaitcntUtils.h"
15#include "GCNSubtarget.h"
16#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
17#include "SIMachineFunctionInfo.h"
18#include "llvm/ADT/Statistic.h"
19#include "llvm/CodeGen/MachineFrameInfo.h"
20#include "llvm/CodeGen/MachineFunction.h"
21#include "llvm/CodeGen/MachineInstrBuilder.h"
22#include "llvm/CodeGen/ScheduleDAG.h"
23#include "llvm/Support/Debug.h"
24#include "llvm/TargetParser/AMDGPUTargetParser.h"
25
26using namespace llvm;
27
28#define DEBUG_TYPE "gcn-hazard-recognizer"
29
30STATISTIC(NumWMMANopsHoisted,
31 "Number of WMMA hazard V_NOPs hoisted from loops");
32STATISTIC(NumWMMAHoistingBailed,
33 "Number of WMMA hazards where V_NOP hoisting was not possible");
34
35namespace {
36
37struct MFMAPaddingRatioParser : public cl::parser<unsigned> {
38 MFMAPaddingRatioParser(cl::Option &O) : cl::parser<unsigned>(O) {}
39
40 bool parse(cl::Option &O, StringRef ArgName, StringRef Arg, unsigned &Value) {
41 if (Arg.getAsInteger(Radix: 0, Result&: Value))
42 return O.error(Message: "'" + Arg + "' value invalid for uint argument!");
43
44 if (Value > 100)
45 return O.error(Message: "'" + Arg + "' value must be in the range [0, 100]!");
46
47 return false;
48 }
49};
50
51} // end anonymous namespace
52
53static cl::opt<unsigned, false, MFMAPaddingRatioParser>
54 MFMAPaddingRatio("amdgpu-mfma-padding-ratio", cl::init(Val: 0), cl::Hidden,
55 cl::desc("Fill a percentage of the latency between "
56 "neighboring MFMA with s_nops."));
57
58// This is intended for debugging purposes only.
59static cl::opt<unsigned>
60 NopPadding("amdgpu-snop-padding", cl::init(Val: 0), cl::Hidden,
61 cl::desc("Insert a s_nop x before every instruction"));
62
63static cl::opt<bool> EnableWMMAVnopHoisting(
64 "amdgpu-wmma-vnop-hoisting", cl::init(Val: true), cl::Hidden,
65 cl::desc("Hoist WMMA hazard V_NOPs from loops to preheaders"));
66
67//===----------------------------------------------------------------------===//
68// Hazard Recognizer Implementation
69//===----------------------------------------------------------------------===//
70
71static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
72 const GCNSubtarget &ST);
73
74GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF,
75 MachineLoopInfo *MLI)
76 : IsHazardRecognizerMode(false), CurrCycleInstr(nullptr), MF(MF),
77 ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()),
78 TRI(TII.getRegisterInfo()), TSchedModel(TII.getSchedModel()), MLI(MLI),
79 ClauseUses(TRI.getNumRegUnits()), ClauseDefs(TRI.getNumRegUnits()) {
80 MaxLookAhead = MF.getRegInfo().isPhysRegUsed(PhysReg: AMDGPU::AGPR0) ? 19 : 5;
81 RunLdsBranchVmemWARHazardFixup = shouldRunLdsBranchVmemWARHazardFixup(MF, ST);
82}
83
84void GCNHazardRecognizer::Reset() {
85 EmittedInstrs.clear();
86 EmittedVALUInstrs.clear();
87 HasPendingWMMACoexecHazard = false;
88}
89
90void GCNHazardRecognizer::EmitInstruction(SUnit *SU) {
91 EmitInstruction(MI: SU->getInstr());
92}
93
94void GCNHazardRecognizer::EmitInstruction(MachineInstr *MI) {
95 CurrCycleInstr = MI;
96}
97
98static bool isDivFMas(unsigned Opcode) {
99 return Opcode == AMDGPU::V_DIV_FMAS_F32_e64 || Opcode == AMDGPU::V_DIV_FMAS_F64_e64;
100}
101
102static bool isSGetReg(unsigned Opcode) {
103 return Opcode == AMDGPU::S_GETREG_B32 || Opcode == AMDGPU::S_GETREG_B32_const;
104}
105
106static bool isSSetReg(unsigned Opcode) {
107 switch (Opcode) {
108 case AMDGPU::S_SETREG_B32:
109 case AMDGPU::S_SETREG_B32_mode:
110 case AMDGPU::S_SETREG_IMM32_B32:
111 case AMDGPU::S_SETREG_IMM32_B32_mode:
112 return true;
113 }
114 return false;
115}
116
117static bool isRWLane(unsigned Opcode) {
118 return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;
119}
120
121static bool isRFE(unsigned Opcode) {
122 return Opcode == AMDGPU::S_RFE_B64;
123}
124
125static bool isSMovRel(unsigned Opcode) {
126 switch (Opcode) {
127 case AMDGPU::S_MOVRELS_B32:
128 case AMDGPU::S_MOVRELS_B64:
129 case AMDGPU::S_MOVRELD_B32:
130 case AMDGPU::S_MOVRELD_B64:
131 return true;
132 default:
133 return false;
134 }
135}
136
137static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII,
138 const MachineInstr &MI) {
139 if (TII.isAlwaysGDS(Opcode: MI.getOpcode()))
140 return true;
141
142 switch (MI.getOpcode()) {
143 case AMDGPU::S_SENDMSG:
144 case AMDGPU::S_SENDMSGHALT:
145 case AMDGPU::S_TTRACEDATA:
146 return true;
147 // These DS opcodes don't support GDS.
148 case AMDGPU::DS_NOP:
149 case AMDGPU::DS_PERMUTE_B32:
150 case AMDGPU::DS_BPERMUTE_B32:
151 return false;
152 default:
153 if (TII.isDS(Opcode: MI.getOpcode())) {
154 int GDS = AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(),
155 Name: AMDGPU::OpName::gds);
156 if (MI.getOperand(i: GDS).getImm())
157 return true;
158 }
159 return false;
160 }
161}
162
163static bool isPermlane(const MachineInstr &MI) {
164 unsigned Opcode = MI.getOpcode();
165 return Opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
166 Opcode == AMDGPU::V_PERMLANE64_B32 ||
167 Opcode == AMDGPU::V_PERMLANEX16_B32_e64 ||
168 Opcode == AMDGPU::V_PERMLANE16_VAR_B32_e64 ||
169 Opcode == AMDGPU::V_PERMLANEX16_VAR_B32_e64 ||
170 Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e32 ||
171 Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e64 ||
172 Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e32 ||
173 Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e64 ||
174 Opcode == AMDGPU::V_PERMLANE_BCAST_B32_e64 ||
175 Opcode == AMDGPU::V_PERMLANE_UP_B32_e64 ||
176 Opcode == AMDGPU::V_PERMLANE_DOWN_B32_e64 ||
177 Opcode == AMDGPU::V_PERMLANE_XOR_B32_e64 ||
178 Opcode == AMDGPU::V_PERMLANE_IDX_GEN_B32_e64;
179}
180
181static bool isLdsDma(const MachineInstr &MI) {
182 return SIInstrInfo::isVALU(MI, /*AllowLDSDMA=*/true) &&
183 (SIInstrInfo::isMUBUF(MI) || SIInstrInfo::isFLAT(MI));
184}
185
186static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
187 const MachineOperand *RegOp = TII->getNamedOperand(MI: RegInstr,
188 OperandName: AMDGPU::OpName::simm16);
189 return std::get<0>(t: AMDGPU::Hwreg::HwregEncoding::decode(Encoded: RegOp->getImm()));
190}
191
192ScheduleHazardRecognizer::HazardType
193GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
194 MachineInstr *MI = SU->getInstr();
195 // If we are not in "HazardRecognizerMode" and therefore not being run from
196 // the scheduler, track possible stalls from hazards but don't insert noops.
197 auto HazardType = IsHazardRecognizerMode ? NoopHazard : Hazard;
198
199 if (MI->isBundle())
200 return NoHazard;
201
202 if (SIInstrInfo::isSMRD(MI: *MI) && checkSMRDHazards(SMRD: MI) > 0)
203 return HazardType;
204
205 if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0)
206 return HazardType;
207
208 if (checkFPAtomicToDenormModeHazard(MI) > 0)
209 return HazardType;
210
211 // Hazards which cannot be mitigated with S_NOPs.
212 if (!IsHazardRecognizerMode) {
213 if (checkWMMACoexecutionHazards(MI) > 0) {
214 HasPendingWMMACoexecHazard = true;
215 return Hazard;
216 }
217 }
218
219 if (ST.hasNoDataDepHazard())
220 return NoHazard;
221
222 if (SIInstrInfo::isVMEM(MI: *MI) && checkVMEMHazards(VMEM: MI) > 0)
223 return HazardType;
224
225 if (SIInstrInfo::isVALU(MI: *MI, /*AllowLDSDMA=*/true) &&
226 checkVALUHazards(VALU: MI) > 0)
227 return HazardType;
228
229 if (SIInstrInfo::isDPP(MI: *MI) && checkDPPHazards(DPP: MI) > 0)
230 return HazardType;
231
232 if (isDivFMas(Opcode: MI->getOpcode()) && checkDivFMasHazards(DivFMas: MI) > 0)
233 return HazardType;
234
235 if (isRWLane(Opcode: MI->getOpcode()) && checkRWLaneHazards(RWLane: MI) > 0)
236 return HazardType;
237
238 if ((SIInstrInfo::isVALU(MI: *MI, /*AllowLDSDMA=*/true) ||
239 SIInstrInfo::isVMEM(MI: *MI) || SIInstrInfo::isDS(MI: *MI) ||
240 SIInstrInfo::isEXP(MI: *MI)) &&
241 checkMAIVALUHazards(MI) > 0)
242 return HazardType;
243
244 if (isSGetReg(Opcode: MI->getOpcode()) && checkGetRegHazards(GetRegInstr: MI) > 0)
245 return HazardType;
246
247 if (isSSetReg(Opcode: MI->getOpcode()) && checkSetRegHazards(SetRegInstr: MI) > 0)
248 return HazardType;
249
250 if (isRFE(Opcode: MI->getOpcode()) && checkRFEHazards(RFE: MI) > 0)
251 return HazardType;
252
253 if (((ST.hasReadM0MovRelInterpHazard() &&
254 (TII.isVINTRP(MI: *MI) || isSMovRel(Opcode: MI->getOpcode()) ||
255 MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
256 MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
257 (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, MI: *MI)) ||
258 (ST.hasReadM0LdsDmaHazard() && isLdsDma(MI: *MI)) ||
259 (ST.hasReadM0LdsDirectHazard() &&
260 MI->readsRegister(Reg: AMDGPU::LDS_DIRECT, /*TRI=*/nullptr))) &&
261 checkReadM0Hazards(SMovRel: MI) > 0)
262 return HazardType;
263
264 if (SIInstrInfo::isMAI(MI: *MI) && checkMAIHazards(MI) > 0)
265 return HazardType;
266
267 if ((SIInstrInfo::isVMEM(MI: *MI) || SIInstrInfo::isDS(MI: *MI)) &&
268 checkMAILdStHazards(MI) > 0)
269 return HazardType;
270
271 if (MI->isInlineAsm() && checkInlineAsmHazards(IA: MI) > 0)
272 return HazardType;
273
274 return NoHazard;
275}
276
277static void insertNoopsInBundle(MachineInstr *MI, const SIInstrInfo &TII,
278 unsigned Quantity) {
279 while (Quantity > 0) {
280 unsigned Arg = std::min(a: Quantity, b: 8u);
281 Quantity -= Arg;
282 BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(), MCID: TII.get(Opcode: AMDGPU::S_NOP))
283 .addImm(Val: Arg - 1);
284 }
285}
286
287unsigned
288GCNHazardRecognizer::getMFMAPipelineWaitStates(const MachineInstr &MI) const {
289 const MCSchedClassDesc *SC = TSchedModel.resolveSchedClass(MI: &MI);
290 assert(TSchedModel.getWriteProcResBegin(SC) !=
291 TSchedModel.getWriteProcResEnd(SC));
292 return TSchedModel.getWriteProcResBegin(SC)->ReleaseAtCycle;
293}
294
295void GCNHazardRecognizer::processBundle() {
296 MachineBasicBlock::instr_iterator MI = std::next(x: CurrCycleInstr->getIterator());
297 MachineBasicBlock::instr_iterator E = CurrCycleInstr->getParent()->instr_end();
298 // Check bundled MachineInstr's for hazards.
299 for (; MI != E && MI->isInsideBundle(); ++MI) {
300 CurrCycleInstr = &*MI;
301 unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr);
302
303 if (IsHazardRecognizerMode) {
304 fixHazards(MI: CurrCycleInstr);
305
306 insertNoopsInBundle(MI: CurrCycleInstr, TII, Quantity: WaitStates);
307 }
308
309 // It’s unnecessary to track more than MaxLookAhead instructions. Since we
310 // include the bundled MI directly after, only add a maximum of
311 // (MaxLookAhead - 1) noops to EmittedInstrs.
312 for (unsigned i = 0, e = std::min(a: WaitStates, b: MaxLookAhead - 1); i < e; ++i)
313 EmittedInstrs.push_front(x: nullptr);
314
315 EmittedInstrs.push_front(x: CurrCycleInstr);
316 EmittedInstrs.resize(new_size: MaxLookAhead);
317 }
318 CurrCycleInstr = nullptr;
319}
320
321void GCNHazardRecognizer::runOnInstruction(MachineInstr *MI) {
322 assert(IsHazardRecognizerMode);
323
324 unsigned NumPreNoops = PreEmitNoops(MI);
325 EmitNoops(Quantity: NumPreNoops);
326 if (MI->isInsideBundle())
327 insertNoopsInBundle(MI, TII, Quantity: NumPreNoops);
328 else
329 TII.insertNoops(MBB&: *MI->getParent(), MI: MachineBasicBlock::iterator(MI),
330 Quantity: NumPreNoops);
331 EmitInstruction(MI);
332 AdvanceCycle();
333}
334
335unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
336 IsHazardRecognizerMode = true;
337 CurrCycleInstr = MI;
338 unsigned W = PreEmitNoopsCommon(MI);
339 fixHazards(MI);
340 CurrCycleInstr = nullptr;
341 return std::max(a: W, b: NopPadding.getValue());
342}
343
344unsigned GCNHazardRecognizer::getHazardWaitStates(MachineInstr *MI) const {
345 return this->PreEmitNoopsCommon(MI);
346}
347
348unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) const {
349 if (MI->isBundle())
350 return 0;
351
352 int WaitStates = 0;
353
354 if (SIInstrInfo::isSMRD(MI: *MI))
355 return std::max(a: WaitStates, b: checkSMRDHazards(SMRD: MI));
356
357 if (ST.hasNSAtoVMEMBug())
358 WaitStates = std::max(a: WaitStates, b: checkNSAtoVMEMHazard(MI));
359
360 WaitStates = std::max(a: WaitStates, b: checkFPAtomicToDenormModeHazard(MI));
361
362 if (ST.hasNoDataDepHazard())
363 return WaitStates;
364
365 if (SIInstrInfo::isVMEM(MI: *MI))
366 WaitStates = std::max(a: WaitStates, b: checkVMEMHazards(VMEM: MI));
367
368 if (SIInstrInfo::isVALU(MI: *MI, /*AllowLDSDMA=*/true))
369 WaitStates = std::max(a: WaitStates, b: checkVALUHazards(VALU: MI));
370
371 if (SIInstrInfo::isDPP(MI: *MI))
372 WaitStates = std::max(a: WaitStates, b: checkDPPHazards(DPP: MI));
373
374 if (isDivFMas(Opcode: MI->getOpcode()))
375 WaitStates = std::max(a: WaitStates, b: checkDivFMasHazards(DivFMas: MI));
376
377 if (isRWLane(Opcode: MI->getOpcode()))
378 WaitStates = std::max(a: WaitStates, b: checkRWLaneHazards(RWLane: MI));
379
380 if ((SIInstrInfo::isVALU(MI: *MI, /*AllowLDSDMA=*/true) ||
381 SIInstrInfo::isVMEM(MI: *MI) || SIInstrInfo::isDS(MI: *MI) ||
382 SIInstrInfo::isEXP(MI: *MI)) &&
383 checkMAIVALUHazards(MI) > 0)
384 WaitStates = std::max(a: WaitStates, b: checkMAIVALUHazards(MI));
385
386 if (MI->isInlineAsm())
387 return std::max(a: WaitStates, b: checkInlineAsmHazards(IA: MI));
388
389 if (isSGetReg(Opcode: MI->getOpcode()))
390 return std::max(a: WaitStates, b: checkGetRegHazards(GetRegInstr: MI));
391
392 if (isSSetReg(Opcode: MI->getOpcode()))
393 return std::max(a: WaitStates, b: checkSetRegHazards(SetRegInstr: MI));
394
395 if (isRFE(Opcode: MI->getOpcode()))
396 return std::max(a: WaitStates, b: checkRFEHazards(RFE: MI));
397
398 if ((ST.hasReadM0MovRelInterpHazard() &&
399 (TII.isVINTRP(MI: *MI) || isSMovRel(Opcode: MI->getOpcode()) ||
400 MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
401 MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
402 (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, MI: *MI)) ||
403 (ST.hasReadM0LdsDmaHazard() && isLdsDma(MI: *MI)) ||
404 (ST.hasReadM0LdsDirectHazard() &&
405 MI->readsRegister(Reg: AMDGPU::LDS_DIRECT, /*TRI=*/nullptr)))
406 return std::max(a: WaitStates, b: checkReadM0Hazards(SMovRel: MI));
407
408 if (SIInstrInfo::isMAI(MI: *MI))
409 return std::max(a: WaitStates, b: checkMAIHazards(MI));
410
411 if (SIInstrInfo::isVMEM(MI: *MI) || SIInstrInfo::isDS(MI: *MI))
412 return std::max(a: WaitStates, b: checkMAILdStHazards(MI));
413
414 if (ST.hasGFX950Insts() && isPermlane(MI: *MI))
415 return std::max(a: WaitStates, b: checkPermlaneHazards(MI));
416
417 return WaitStates;
418}
419
420void GCNHazardRecognizer::EmitNoop() {
421 EmittedInstrs.push_front(x: nullptr);
422}
423
424void GCNHazardRecognizer::AdvanceCycle() {
425 // When the scheduler detects a stall, it will call AdvanceCycle() without
426 // emitting any instructions.
427 if (!CurrCycleInstr) {
428 EmittedInstrs.push_front(x: nullptr);
429
430 if (HasPendingWMMACoexecHazard)
431 EmittedVALUInstrs.push_front(x: nullptr);
432 return;
433 }
434
435 HasPendingWMMACoexecHazard = false;
436
437 if (CurrCycleInstr->isBundle()) {
438 processBundle();
439 return;
440 }
441
442 unsigned NumWaitStates = TII.getNumWaitStates(MI: *CurrCycleInstr);
443 if (!NumWaitStates) {
444 CurrCycleInstr = nullptr;
445 return;
446 }
447
448 // Keep track of emitted instructions
449 EmittedInstrs.push_front(x: CurrCycleInstr);
450
451 bool IsVALUOrWMMA =
452 SIInstrInfo::isVALU(MI: *CurrCycleInstr, /*AllowLDSDMA=*/true) ||
453 SIInstrInfo::isWMMA(MI: *CurrCycleInstr) ||
454 SIInstrInfo::isSWMMAC(MI: *CurrCycleInstr);
455 if (IsVALUOrWMMA) {
456 EmittedVALUInstrs.push_front(x: CurrCycleInstr);
457 } else {
458 // A pending WMMA co-execution hazard optimistically records stall cycles as
459 // future V_NOPs. If the scheduler instead stalls for a different
460 // (S_NOP-resolvable) hazard and schedules a non-VALU into those cycles,
461 // they will not resolve the VALU-pipe hazard, so drop them here.
462 while (!EmittedVALUInstrs.empty() && EmittedVALUInstrs.front() == nullptr)
463 EmittedVALUInstrs.pop_front();
464 }
465
466 // Add a nullptr for each additional wait state after the first. Make sure
467 // not to add more than getMaxLookAhead() items to the list, since we
468 // truncate the list to that size right after this loop.
469 for (unsigned i = 1, e = std::min(a: NumWaitStates, b: getMaxLookAhead());
470 i < e; ++i) {
471 EmittedInstrs.push_front(x: nullptr);
472 }
473
474 // getMaxLookahead() is the largest number of wait states we will ever need
475 // to insert, so there is no point in keeping track of more than that many
476 // wait states.
477 EmittedInstrs.resize(new_size: getMaxLookAhead());
478 if (EmittedVALUInstrs.size() > MaxVALULookAhead)
479 EmittedVALUInstrs.resize(new_size: MaxVALULookAhead);
480
481 CurrCycleInstr = nullptr;
482}
483
484void GCNHazardRecognizer::RecedeCycle() {
485 assert(!IsHazardRecognizerMode &&
486 "Bottom-up scheduling shouldn't run in hazard recognizer mode");
487}
488
489//===----------------------------------------------------------------------===//
490// Helper Functions
491//===----------------------------------------------------------------------===//
492
// Result of the per-instruction callback used by hasHazard().
enum HazardFnResult {
  HazardFound,   // Stop the whole search: a hazard exists.
  HazardExpired, // Stop searching along the current path.
  NoHazardFound  // Keep scanning.
};
494
495// Search for a hazard in a block and its predecessors.
496template <typename StateT>
497static bool
498hasHazard(StateT InitialState,
499 function_ref<HazardFnResult(StateT &, const MachineInstr &)> IsHazard,
500 function_ref<void(StateT &, const MachineInstr &)> UpdateState,
501 const MachineBasicBlock *InitialMBB,
502 MachineBasicBlock::const_reverse_instr_iterator InitialI) {
503 struct StateMapKey {
504 SmallVectorImpl<StateT> *States;
505 unsigned Idx;
506 static bool isEqual(const StateMapKey &LHS, const StateMapKey &RHS) {
507 return LHS.States == RHS.States && LHS.Idx == RHS.Idx;
508 }
509 };
510 struct StateMapKeyTraits : DenseMapInfo<StateMapKey> {
511 static unsigned getHashValue(const StateMapKey &Key) {
512 return StateT::getHashValue((*Key.States)[Key.Idx]);
513 }
514 static unsigned getHashValue(const StateT &State) {
515 return StateT::getHashValue(State);
516 }
517 static bool isEqual(const StateMapKey &LHS, const StateMapKey &RHS) {
518 return StateT::isEqual((*LHS.States)[LHS.Idx], (*RHS.States)[RHS.Idx]);
519 }
520 static bool isEqual(const StateT &LHS, const StateMapKey &RHS) {
521 return StateT::isEqual(LHS, (*RHS.States)[RHS.Idx]);
522 }
523 };
524
525 SmallDenseMap<StateMapKey, unsigned, 8, StateMapKeyTraits> StateMap;
526 SmallVector<StateT, 8> States;
527
528 MachineBasicBlock::const_reverse_instr_iterator I = InitialI;
529 const MachineBasicBlock *MBB = InitialMBB;
530 StateT State = InitialState;
531
532 SmallSetVector<std::pair<const MachineBasicBlock *, unsigned>, 16> Worklist;
533 unsigned WorkIdx = 0;
534 for (;;) {
535 bool Expired = false;
536 for (auto E = MBB->instr_rend(); I != E; ++I) {
537 // No need to look at parent BUNDLE instructions.
538 if (I->isBundle())
539 continue;
540
541 auto Result = IsHazard(State, *I);
542 if (Result == HazardFound)
543 return true;
544 if (Result == HazardExpired) {
545 Expired = true;
546 break;
547 }
548
549 if (I->isInlineAsm() || I->isMetaInstruction())
550 continue;
551
552 UpdateState(State, *I);
553 }
554
555 if (!Expired) {
556 unsigned StateIdx = States.size();
557 StateMapKey Key = {&States, StateIdx};
558 auto Insertion = StateMap.insert_as(std::pair(Key, StateIdx), State);
559 if (Insertion.second) {
560 States.emplace_back(State);
561 } else {
562 StateIdx = Insertion.first->second;
563 }
564 for (MachineBasicBlock *Pred : MBB->predecessors())
565 Worklist.insert(X: std::pair(Pred, StateIdx));
566 }
567
568 if (WorkIdx == Worklist.size())
569 break;
570
571 unsigned StateIdx;
572 std::tie(args&: MBB, args&: StateIdx) = Worklist[WorkIdx++];
573 State = States[StateIdx];
574 I = MBB->instr_rbegin();
575 }
576
577 return false;
578}
579
580// Returns a minimum wait states since \p I walking all predecessors.
581// Only scans until \p IsExpired does not return true.
582// Can only be run in a hazard recognizer mode.
583static int
584getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
585 const MachineBasicBlock *MBB,
586 MachineBasicBlock::const_reverse_instr_iterator I,
587 int WaitStates, GCNHazardRecognizer::IsExpiredFn IsExpired,
588 DenseSet<const MachineBasicBlock *> &Visited,
589 GCNHazardRecognizer::GetNumWaitStatesFn GetNumWaitStates =
590 SIInstrInfo::getNumWaitStates) {
591 for (auto E = MBB->instr_rend(); I != E; ++I) {
592 // Don't add WaitStates for parent BUNDLE instructions.
593 if (I->isBundle())
594 continue;
595
596 if (IsHazard(*I))
597 return WaitStates;
598
599 if (I->isInlineAsm())
600 continue;
601
602 WaitStates += GetNumWaitStates(*I);
603
604 if (IsExpired(*I, WaitStates))
605 return std::numeric_limits<int>::max();
606 }
607
608 int MinWaitStates = std::numeric_limits<int>::max();
609 for (MachineBasicBlock *Pred : MBB->predecessors()) {
610 if (!Visited.insert(V: Pred).second)
611 continue;
612
613 int W = getWaitStatesSince(IsHazard, MBB: Pred, I: Pred->instr_rbegin(), WaitStates,
614 IsExpired, Visited, GetNumWaitStates);
615
616 MinWaitStates = std::min(a: MinWaitStates, b: W);
617 }
618
619 return MinWaitStates;
620}
621
622static int
623getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
624 const MachineInstr *MI,
625 GCNHazardRecognizer::IsExpiredFn IsExpired,
626 GCNHazardRecognizer::GetNumWaitStatesFn GetNumWaitStates =
627 SIInstrInfo::getNumWaitStates) {
628 DenseSet<const MachineBasicBlock *> Visited;
629 return getWaitStatesSince(IsHazard, MBB: MI->getParent(),
630 I: std::next(x: MI->getReverseIterator()), WaitStates: 0, IsExpired,
631 Visited, GetNumWaitStates);
632}
633
634int GCNHazardRecognizer::getWaitStatesSince(
635 IsHazardFn IsHazard, int Limit, GetNumWaitStatesFn GetNumWaitStates) const {
636 if (IsHazardRecognizerMode) {
637 auto IsExpiredFn = [Limit](const MachineInstr &, int WaitStates) {
638 return WaitStates >= Limit;
639 };
640 return ::getWaitStatesSince(IsHazard, MI: CurrCycleInstr, IsExpired: IsExpiredFn,
641 GetNumWaitStates);
642 }
643
644 int WaitStates = 0;
645 for (MachineInstr *MI : EmittedInstrs) {
646 if (MI) {
647 if (IsHazard(*MI))
648 return WaitStates;
649
650 if (MI->isInlineAsm())
651 continue;
652 }
653 WaitStates += MI ? GetNumWaitStates(*MI) : 1;
654
655 if (WaitStates >= Limit)
656 break;
657 }
658 return std::numeric_limits<int>::max();
659}
660
661int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard,
662 int Limit) const {
663 return getWaitStatesSince(IsHazard, Limit, GetNumWaitStates: SIInstrInfo::getNumWaitStates);
664}
665
666int GCNHazardRecognizer::getWaitStatesSinceVALU(IsHazardFn IsHazard,
667 int Limit) const {
668 if (IsHazardRecognizerMode) {
669 auto GetVALUWaitStates = [](const MachineInstr &MI) -> unsigned {
670 return SIInstrInfo::isVALU(MI, /*AllowLDSDMA=*/true) ? 1 : 0;
671 };
672 return getWaitStatesSince(IsHazard, Limit, GetNumWaitStates: GetVALUWaitStates);
673 }
674
675 // EmittedVALUInstrs is capped at MaxVALULookAhead, so a Limit beyond that
676 // window could miss a hazard. Keep the cap in sync with the wait-state
677 // tables.
678 assert(Limit <= (int)MaxVALULookAhead &&
679 "Limit exceeds the EmittedVALUInstrs lookahead window");
680 int WaitStates = 0;
681 for (MachineInstr *MI : EmittedVALUInstrs) {
682 if (MI) {
683 if (IsHazard(*MI))
684 return WaitStates;
685 }
686
687 ++WaitStates;
688
689 if (WaitStates >= Limit)
690 break;
691 }
692 return std::numeric_limits<int>::max();
693}
694
695int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
696 IsHazardFn IsHazardDef,
697 int Limit) const {
698 const SIRegisterInfo *TRI = ST.getRegisterInfo();
699
700 auto IsHazardFn = [IsHazardDef, TRI, Reg](const MachineInstr &MI) {
701 return IsHazardDef(MI) && MI.modifiesRegister(Reg, TRI);
702 };
703
704 return getWaitStatesSince(IsHazard: IsHazardFn, Limit);
705}
706
707int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
708 int Limit) const {
709 auto IsHazardFn = [IsHazard](const MachineInstr &MI) {
710 return isSSetReg(Opcode: MI.getOpcode()) && IsHazard(MI);
711 };
712
713 return getWaitStatesSince(IsHazard: IsHazardFn, Limit);
714}
715
716//===----------------------------------------------------------------------===//
717// No-op Hazard Detection
718//===----------------------------------------------------------------------===//
719
720static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV,
721 MCRegister Reg) {
722 for (MCRegUnit Unit : TRI.regunits(Reg))
723 BV.set(static_cast<unsigned>(Unit));
724}
725
726static void addRegsToSet(const SIRegisterInfo &TRI,
727 iterator_range<MachineInstr::const_mop_iterator> Ops,
728 BitVector &DefSet, BitVector &UseSet) {
729 for (const MachineOperand &Op : Ops) {
730 if (Op.isReg())
731 addRegUnits(TRI, BV&: Op.isDef() ? DefSet : UseSet, Reg: Op.getReg().asMCReg());
732 }
733}
734
735void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) const {
736 addRegsToSet(TRI, Ops: MI.operands(), DefSet&: ClauseDefs, UseSet&: ClauseUses);
737}
738
739static bool breaksSMEMSoftClause(MachineInstr *MI) {
740 return !SIInstrInfo::isSMRD(MI: *MI);
741}
742
743static bool breaksVMEMSoftClause(MachineInstr *MI) {
744 return !SIInstrInfo::isVMEM(MI: *MI);
745}
746
747int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) const {
748 // SMEM soft clause are only present on VI+, and only matter if xnack is
749 // enabled.
750 if (!ST.isXNACKEnabled())
751 return 0;
752
753 bool IsSMRD = TII.isSMRD(MI: *MEM);
754
755 resetClause();
756
757 // A soft-clause is any group of consecutive SMEM instructions. The
758 // instructions in this group may return out of order and/or may be
759 // replayed (i.e. the same instruction issued more than once).
760 //
761 // In order to handle these situations correctly we need to make sure that
762 // when a clause has more than one instruction, no instruction in the clause
763 // writes to a register that is read by another instruction in the clause
764 // (including itself). If we encounter this situation, we need to break the
765 // clause by inserting a non SMEM instruction.
766
767 for (MachineInstr *MI : EmittedInstrs) {
768 // When we hit a non-SMEM instruction then we have passed the start of the
769 // clause and we can stop.
770 if (!MI)
771 break;
772
773 if (IsSMRD ? breaksSMEMSoftClause(MI) : breaksVMEMSoftClause(MI))
774 break;
775
776 addClauseInst(MI: *MI);
777 }
778
779 if (ClauseDefs.none())
780 return 0;
781
782 // We need to make sure not to put loads and stores in the same clause if they
783 // use the same address. For now, just start a new clause whenever we see a
784 // store.
785 if (MEM->mayStore())
786 return 1;
787
788 addClauseInst(MI: *MEM);
789
790 // If the set of defs and uses intersect then we cannot add this instruction
791 // to the clause, so we have a hazard.
792 return ClauseDefs.anyCommon(RHS: ClauseUses) ? 1 : 0;
793}
794
795int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) const {
796 int WaitStatesNeeded = 0;
797
798 WaitStatesNeeded = checkSoftClauseHazards(MEM: SMRD);
799
800 // This SMRD hazard only affects SI.
801 if (!ST.hasSMRDReadVALUDefHazard())
802 return WaitStatesNeeded;
803
804 // A read of an SGPR by SMRD instruction requires 4 wait states when the
805 // SGPR was written by a VALU instruction.
806 int SmrdSgprWaitStates = 4;
807 auto IsHazardDefFn = [this](const MachineInstr &MI) {
808 return TII.isVALU(MI, /*AllowLDSDMA=*/true);
809 };
810 auto IsBufferHazardDefFn = [this](const MachineInstr &MI) {
811 return TII.isSALU(MI);
812 };
813
814 bool IsBufferSMRD = TII.isBufferSMRD(MI: *SMRD);
815
816 for (const MachineOperand &Use : SMRD->uses()) {
817 if (!Use.isReg())
818 continue;
819 int WaitStatesNeededForUse =
820 SmrdSgprWaitStates - getWaitStatesSinceDef(Reg: Use.getReg(), IsHazardDef: IsHazardDefFn,
821 Limit: SmrdSgprWaitStates);
822 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
823
824 // This fixes what appears to be undocumented hardware behavior in SI where
825 // s_mov writing a descriptor and s_buffer_load_dword reading the descriptor
826 // needs some number of nops in between. We don't know how many we need, but
827 // let's use 4. This wasn't discovered before probably because the only
828 // case when this happens is when we expand a 64-bit pointer into a full
829 // descriptor and use s_buffer_load_dword instead of s_load_dword, which was
830 // probably never encountered in the closed-source land.
831 if (IsBufferSMRD) {
832 int WaitStatesNeededForUse =
833 SmrdSgprWaitStates - getWaitStatesSinceDef(Reg: Use.getReg(),
834 IsHazardDef: IsBufferHazardDefFn,
835 Limit: SmrdSgprWaitStates);
836 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
837 }
838 }
839
840 return WaitStatesNeeded;
841}
842
843int GCNHazardRecognizer::checkVMEMHazards(MachineInstr *VMEM) const {
844 if (!ST.hasVMEMReadSGPRVALUDefHazard())
845 return 0;
846
847 int WaitStatesNeeded = checkSoftClauseHazards(MEM: VMEM);
848
849 // A read of an SGPR by a VMEM instruction requires 5 wait states when the
850 // SGPR was written by a VALU Instruction.
851 const int VmemSgprWaitStates = 5;
852 auto IsHazardDefFn = [this](const MachineInstr &MI) {
853 return TII.isVALU(MI, /*AllowLDSDMA=*/true);
854 };
855 for (const MachineOperand &Use : VMEM->uses()) {
856 if (!Use.isReg() || TRI.isVectorRegister(MRI: MF.getRegInfo(), Reg: Use.getReg()))
857 continue;
858
859 int WaitStatesNeededForUse =
860 VmemSgprWaitStates - getWaitStatesSinceDef(Reg: Use.getReg(), IsHazardDef: IsHazardDefFn,
861 Limit: VmemSgprWaitStates);
862 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
863 }
864 return WaitStatesNeeded;
865}
866
867int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) const {
868 const SIRegisterInfo *TRI = ST.getRegisterInfo();
869 const SIInstrInfo *TII = ST.getInstrInfo();
870
871 // Check for DPP VGPR read after VALU VGPR write and EXEC write.
872 int DppVgprWaitStates = 2;
873 int DppExecWaitStates = 5;
874 int WaitStatesNeeded = 0;
875 auto IsHazardDefFn = [TII](const MachineInstr &MI) {
876 return TII->isVALU(MI, /*AllowLDSDMA=*/true);
877 };
878
879 for (const MachineOperand &Use : DPP->uses()) {
880 if (!Use.isReg() || !TRI->isVGPR(MRI: MF.getRegInfo(), Reg: Use.getReg()))
881 continue;
882 int WaitStatesNeededForUse =
883 DppVgprWaitStates - getWaitStatesSinceDef(
884 Reg: Use.getReg(),
885 IsHazardDef: [](const MachineInstr &) { return true; },
886 Limit: DppVgprWaitStates);
887 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
888 }
889
890 WaitStatesNeeded = std::max(
891 a: WaitStatesNeeded,
892 b: DppExecWaitStates - getWaitStatesSinceDef(Reg: AMDGPU::EXEC, IsHazardDef: IsHazardDefFn,
893 Limit: DppExecWaitStates));
894
895 return WaitStatesNeeded;
896}
897
898int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) const {
899 const SIInstrInfo *TII = ST.getInstrInfo();
900
901 // v_div_fmas requires 4 wait states after a write to vcc from a VALU
902 // instruction.
903 const int DivFMasWaitStates = 4;
904 auto IsHazardDefFn = [TII](const MachineInstr &MI) {
905 return TII->isVALU(MI, /*AllowLDSDMA=*/true);
906 };
907 int WaitStatesNeeded = getWaitStatesSinceDef(Reg: AMDGPU::VCC, IsHazardDef: IsHazardDefFn,
908 Limit: DivFMasWaitStates);
909
910 return DivFMasWaitStates - WaitStatesNeeded;
911}
912
913int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) const {
914 const SIInstrInfo *TII = ST.getInstrInfo();
915 unsigned GetRegHWReg = getHWReg(TII, RegInstr: *GetRegInstr);
916
917 const int GetRegWaitStates = 2;
918 auto IsHazardFn = [TII, GetRegHWReg](const MachineInstr &MI) {
919 return GetRegHWReg == getHWReg(TII, RegInstr: MI);
920 };
921 int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazard: IsHazardFn, Limit: GetRegWaitStates);
922
923 return GetRegWaitStates - WaitStatesNeeded;
924}
925
926int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) const {
927 const SIInstrInfo *TII = ST.getInstrInfo();
928 unsigned HWReg = getHWReg(TII, RegInstr: *SetRegInstr);
929
930 const int SetRegWaitStates = ST.getSetRegWaitStates();
931 auto IsHazardFn = [TII, HWReg](const MachineInstr &MI) {
932 return HWReg == getHWReg(TII, RegInstr: MI);
933 };
934 int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazard: IsHazardFn, Limit: SetRegWaitStates);
935 return SetRegWaitStates - WaitStatesNeeded;
936}
937
938int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) const {
939 if (!MI.mayStore())
940 return -1;
941
942 const SIInstrInfo *TII = ST.getInstrInfo();
943 unsigned Opcode = MI.getOpcode();
944 const MCInstrDesc &Desc = MI.getDesc();
945
946 int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, Name: AMDGPU::OpName::vdata);
947 int VDataRCID = -1;
948 if (VDataIdx != -1)
949 VDataRCID = TII->getOpRegClassID(OpInfo: Desc.operands()[VDataIdx]);
950
951 if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {
952 // There is no hazard if the instruction does not use vector regs
953 // (like wbinvl1)
954 if (VDataIdx == -1)
955 return -1;
956 if (AMDGPU::getRegBitWidth(RCID: VDataRCID) > 64) {
957 // When SOFFSET-dependent wide-store windows apply, the BUFFER_STORE
958 // source-vgpr WAR hazard exists for every SOFFSET shape; the wait-state
959 // count differs by SOFFSET and is computed in checkVALUHazardsHelper.
960 // Otherwise the hazard only exists if soffset is not an SGPR.
961 if (ST.hasVDecCoExecHazard())
962 return VDataIdx;
963 const MachineOperand *SOffset =
964 TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::soffset);
965 if (!SOffset || !SOffset->isReg())
966 return VDataIdx;
967 }
968 }
969
970 // MIMG instructions create a hazard if they don't use a 256-bit T# and
971 // the store size is greater than 8 bytes and they have more than two bits
972 // of their dmask set.
973 // All our MIMG definitions use a 256-bit T#, so we can skip checking for them.
974 if (TII->isMIMG(MI)) {
975 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, Name: AMDGPU::OpName::srsrc);
976 assert(SRsrcIdx != -1 && AMDGPU::getRegBitWidth(TII->getOpRegClassID(
977 Desc.operands()[SRsrcIdx])) == 256);
978 (void)SRsrcIdx;
979 }
980
981 if (TII->isFLAT(MI)) {
982 // There is no hazard if the instruction does not use vector regs
983 if (VDataIdx == -1)
984 return -1;
985
986 if (AMDGPU::getRegBitWidth(RCID: VDataRCID) > 64)
987 return VDataIdx;
988 }
989
990 return -1;
991}
992
993int GCNHazardRecognizer::checkUniformWindowVALUHazardsHelper(
994 Register Reg) const {
995 // Wide stores need a single wait-state bubble before a VALU that overwrites
996 // store data. createsVALUHazard already excludes MUBUF/MTBUF stores with an
997 // SGPR SOFFSET.
998 const SIRegisterInfo *TRI = ST.getRegisterInfo();
999
1000 auto IsHazard = [&](const MachineInstr &MI) {
1001 int DataIdx = createsVALUHazard(MI);
1002 return DataIdx >= 0 &&
1003 TRI->regsOverlap(RegA: MI.getOperand(i: DataIdx).getReg(), RegB: Reg);
1004 };
1005
1006 return std::max(a: 0, b: 1 - getWaitStatesSince(IsHazard, /*Limit=*/1));
1007}
1008
1009int GCNHazardRecognizer::checkSOFFSETWindowVALUHazardsHelper(
1010 Register Reg) const {
1011 // The required wait-state window depends on the producer's SOFFSET shape:
1012 // - MUBUF/MTBUF wide store with sgpr SOFFSET: 1 wait state.
1013 // - MUBUF/MTBUF wide store with literal/absent SOFFSET, and FLAT wide
1014 // store: 2 wait states.
1015 // The 1-cycle sgpr-SOFFSET window was measured on gfx950.
1016 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1017 const SIInstrInfo *TII = ST.getInstrInfo();
1018
1019 int WaitStatesNeeded = 0;
1020
1021 // Scan each wait-state window separately and take the max padding needed.
1022 // getWaitStatesSince supplies the minimum distance to a producer over paths.
1023 for (int Window = 1; Window <= 2; ++Window) {
1024 auto IsHazard = [&](const MachineInstr &MI) {
1025 int DataIdx = createsVALUHazard(MI);
1026 if (DataIdx < 0 ||
1027 !TRI->regsOverlap(RegA: MI.getOperand(i: DataIdx).getReg(), RegB: Reg))
1028 return false;
1029
1030 // Window 1 matches every hazard producer. Window 2 excludes BUF stores
1031 // with an SGPR SOFFSET, which only require a single wait state.
1032 if (Window == 1 || !TII->isBUF(MI))
1033 return true;
1034
1035 const MachineOperand *SOffset =
1036 TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::soffset);
1037 return !SOffset || !SOffset->isReg();
1038 };
1039 WaitStatesNeeded = std::max(a: WaitStatesNeeded,
1040 b: Window - getWaitStatesSince(IsHazard, Limit: Window));
1041 }
1042
1043 return WaitStatesNeeded;
1044}
1045
1046int GCNHazardRecognizer::checkVALUHazardsHelper(
1047 const MachineOperand &Def, const MachineRegisterInfo &MRI) const {
1048 // Helper to check for the hazard where VMEM instructions that store more
1049 // than 8 bytes can have their store data overwritten by the next
1050 // instruction.
1051 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1052
1053 if (!TRI->isVectorRegister(MRI, Reg: Def.getReg()))
1054 return 0;
1055
1056 if (ST.hasVDecCoExecHazard())
1057 return checkSOFFSETWindowVALUHazardsHelper(Reg: Def.getReg());
1058
1059 return checkUniformWindowVALUHazardsHelper(Reg: Def.getReg());
1060}
1061
1062/// Dest sel forwarding issue occurs if additional logic is needed to swizzle /
1063/// pack the computed value into correct bit position of the dest register. This
1064/// occurs if we have SDWA with dst_sel != DWORD or if we have op_sel with
1065/// dst_sel that is not aligned to the register. This function analayzes the \p
1066/// MI and \returns an operand with dst forwarding issue, or nullptr if
1067/// none exists.
1068static const MachineOperand *
1069getDstSelForwardingOperand(const MachineInstr &MI, const GCNSubtarget &ST) {
1070 if (!SIInstrInfo::isVALU(MI, /*AllowLDSDMA=*/true))
1071 return nullptr;
1072
1073 const SIInstrInfo *TII = ST.getInstrInfo();
1074
1075 unsigned Opcode = MI.getOpcode();
1076
1077 // There are three different types of instructions
1078 // which produce forwarded dest: 1. SDWA with dst_sel != DWORD, 2. VOP3
1079 // which write hi bits (e.g. op_sel[3] == 1), and 3. FP8DstSelInst
1080 // (instructions with dest byte sel, e.g. CVT_SR_BF8_F32) and
1081 // op_sel[3:2]
1082 // != 0
1083 if (SIInstrInfo::isSDWA(MI)) {
1084 // Type 1: SDWA with dst_sel != DWORD
1085 if (auto *DstSel = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::dst_sel))
1086 if (DstSel->getImm() != AMDGPU::SDWA::DWORD)
1087 return TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::vdst);
1088 }
1089
1090 AMDGPU::FPType IsFP4OrFP8ConvOpc = AMDGPU::getFPDstSelType(Opc: Opcode);
1091 if (AMDGPU::hasNamedOperand(Opcode, NamedIdx: AMDGPU::OpName::op_sel)) {
1092 // Type 2: VOP3 which write the hi bits
1093 if (TII->getNamedImmOperand(MI, OperandName: AMDGPU::OpName::src0_modifiers) &
1094 SISrcMods::DST_OP_SEL)
1095 return TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::vdst);
1096
1097 // Type 3: FP8DstSelInst with op_sel[3:2] != 0)
1098 if (IsFP4OrFP8ConvOpc == AMDGPU::FPType::FP8 &&
1099 (TII->getNamedImmOperand(MI, OperandName: AMDGPU::OpName::src2_modifiers) &
1100 SISrcMods::OP_SEL_0))
1101 return TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::vdst);
1102 }
1103
1104 // Special case: nop is required for all the opsel values for fp4 sr variant
1105 // cvt scale instructions
1106 if (IsFP4OrFP8ConvOpc == AMDGPU::FPType::FP4)
1107 return TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::vdst);
1108
1109 return nullptr;
1110}
1111
1112/// Checks whether the provided \p MI "consumes" the operand with a Dest sel
1113/// fowarding issue \p Dst . We may "consume" the Dst via a standard explicit
1114/// RAW, or through irregular ways (e.g implicit RAW, certain types of WAW)
1115static bool consumesDstSelForwardingOperand(const MachineInstr *VALU,
1116 const MachineOperand *Dst,
1117 const SIRegisterInfo *TRI) {
1118 // We must consider implicit reads of the VALU. SDWA with dst_sel and
1119 // UNUSED_PRESERVE will implicitly read the result from forwarded dest,
1120 // and we must account for that hazard.
1121 // We also must account for WAW hazards. In particular, WAW with dest
1122 // preserve semantics (e.g. VOP3 with op_sel, VOP2 &&
1123 // !zeroesHigh16BitsOfDest) will read the forwarded dest for parity
1124 // check for ECC. Without accounting for this hazard, the ECC will be
1125 // wrong.
1126 // TODO: limit to RAW (including implicit reads) + problematic WAW (i.e.
1127 // complete zeroesHigh16BitsOfDest)
1128 for (auto &Operand : VALU->operands()) {
1129 if (Operand.isReg() && TRI->regsOverlap(RegA: Dst->getReg(), RegB: Operand.getReg())) {
1130 return true;
1131 }
1132 }
1133 return false;
1134}
1135
1136int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) const {
1137 int WaitStatesNeeded = 0;
1138
1139 if (ST.hasTransForwardingHazard() && !SIInstrInfo::isTRANS(MI: *VALU)) {
1140 const int TransDefWaitstates = 1;
1141
1142 auto IsTransDefFn = [this, VALU](const MachineInstr &MI) {
1143 if (!SIInstrInfo::isTRANS(MI))
1144 return false;
1145 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1146 const SIInstrInfo *TII = ST.getInstrInfo();
1147 Register Def = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::vdst)->getReg();
1148
1149 for (const MachineOperand &Use : VALU->explicit_uses()) {
1150 if (Use.isReg() && TRI->regsOverlap(RegA: Def, RegB: Use.getReg()))
1151 return true;
1152 }
1153
1154 return false;
1155 };
1156
1157 int WaitStatesNeededForDef =
1158 TransDefWaitstates -
1159 getWaitStatesSince(IsHazard: IsTransDefFn, Limit: TransDefWaitstates);
1160 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForDef);
1161 }
1162
1163 if (ST.hasDstSelForwardingHazard() || ST.hasCvtScaleForwardingHazard()) {
1164 const int Shift16DefWaitstates = 1;
1165
1166 auto IsShift16BitDefFn = [this, VALU](const MachineInstr &ProducerMI) {
1167 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1168 const MachineOperand *ForwardedDst =
1169 getDstSelForwardingOperand(MI: ProducerMI, ST);
1170 if (ForwardedDst) {
1171 return consumesDstSelForwardingOperand(VALU, Dst: ForwardedDst, TRI);
1172 }
1173
1174 if (ProducerMI.isInlineAsm()) {
1175 // Assume inline asm has dst forwarding hazard
1176 for (auto &Def : ProducerMI.all_defs()) {
1177 if (consumesDstSelForwardingOperand(VALU, Dst: &Def, TRI))
1178 return true;
1179 }
1180 }
1181
1182 return false;
1183 };
1184
1185 int WaitStatesNeededForDef =
1186 Shift16DefWaitstates -
1187 getWaitStatesSince(IsHazard: IsShift16BitDefFn, Limit: Shift16DefWaitstates);
1188 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForDef);
1189 }
1190
1191 if (ST.hasVDecCoExecHazard()) {
1192 const int VALUWriteSGPRVALUReadWaitstates = 2;
1193 const int VALUWriteEXECRWLane = 4;
1194 const int VALUWriteVGPRReadlaneRead = 1;
1195
1196 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1197 const MachineRegisterInfo &MRI = MF.getRegInfo();
1198 Register UseReg;
1199 auto IsVALUDefSGPRFn = [&UseReg, TRI](const MachineInstr &MI) {
1200 if (!SIInstrInfo::isVALU(MI, /*AllowLDSDMA=*/true))
1201 return false;
1202 return MI.modifiesRegister(Reg: UseReg, TRI);
1203 };
1204
1205 for (const MachineOperand &Use : VALU->explicit_uses()) {
1206 if (!Use.isReg())
1207 continue;
1208
1209 UseReg = Use.getReg();
1210 if (TRI->isSGPRReg(MRI, Reg: UseReg)) {
1211 int WaitStatesNeededForDef =
1212 VALUWriteSGPRVALUReadWaitstates -
1213 getWaitStatesSince(IsHazard: IsVALUDefSGPRFn,
1214 Limit: VALUWriteSGPRVALUReadWaitstates);
1215 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForDef);
1216 }
1217 }
1218
1219 if (VALU->readsRegister(Reg: AMDGPU::VCC, TRI)) {
1220 UseReg = AMDGPU::VCC;
1221 int WaitStatesNeededForDef =
1222 VALUWriteSGPRVALUReadWaitstates -
1223 getWaitStatesSince(IsHazard: IsVALUDefSGPRFn, Limit: VALUWriteSGPRVALUReadWaitstates);
1224 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForDef);
1225 }
1226
1227 switch (VALU->getOpcode()) {
1228 case AMDGPU::V_READLANE_B32:
1229 case AMDGPU::V_READFIRSTLANE_B32: {
1230 MachineOperand *Src = TII.getNamedOperand(MI&: *VALU, OperandName: AMDGPU::OpName::src0);
1231 UseReg = Src->getReg();
1232 int WaitStatesNeededForDef =
1233 VALUWriteVGPRReadlaneRead -
1234 getWaitStatesSince(IsHazard: IsVALUDefSGPRFn, Limit: VALUWriteVGPRReadlaneRead);
1235 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForDef);
1236 }
1237 [[fallthrough]];
1238 case AMDGPU::V_WRITELANE_B32: {
1239 UseReg = AMDGPU::EXEC;
1240 int WaitStatesNeededForDef =
1241 VALUWriteEXECRWLane -
1242 getWaitStatesSince(IsHazard: IsVALUDefSGPRFn, Limit: VALUWriteEXECRWLane);
1243 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForDef);
1244 break;
1245 }
1246 default:
1247 break;
1248 }
1249 }
1250
1251 // This checks for the hazard where VMEM instructions that store more than
1252 // 8 bytes can have there store data over written by the next instruction.
1253 if (!ST.has12DWordStoreHazard())
1254 return WaitStatesNeeded;
1255
1256 const MachineRegisterInfo &MRI = MF.getRegInfo();
1257
1258 for (const MachineOperand &Def : VALU->defs()) {
1259 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: checkVALUHazardsHelper(Def, MRI));
1260 }
1261
1262 return WaitStatesNeeded;
1263}
1264
1265int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) const {
1266 // This checks for hazards associated with inline asm statements.
1267 // Since inline asms can contain just about anything, we use this
1268 // to call/leverage other check*Hazard routines. Note that
1269 // this function doesn't attempt to address all possible inline asm
1270 // hazards (good luck), but is a collection of what has been
1271 // problematic thus far.
1272
1273 // see checkVALUHazards()
1274 if (!ST.has12DWordStoreHazard() && !ST.hasDstSelForwardingHazard() &&
1275 !ST.hasCvtScaleForwardingHazard())
1276 return 0;
1277
1278 const MachineRegisterInfo &MRI = MF.getRegInfo();
1279 int WaitStatesNeeded = 0;
1280
1281 for (const MachineOperand &Op :
1282 llvm::drop_begin(RangeOrContainer: IA->operands(), N: InlineAsm::MIOp_FirstOperand)) {
1283 if (Op.isReg() && Op.isDef()) {
1284 if (!TRI.isVectorRegister(MRI, Reg: Op.getReg()))
1285 continue;
1286
1287 if (ST.has12DWordStoreHazard()) {
1288 WaitStatesNeeded =
1289 std::max(a: WaitStatesNeeded, b: checkVALUHazardsHelper(Def: Op, MRI));
1290 }
1291 }
1292 }
1293
1294 if (ST.hasDstSelForwardingHazard()) {
1295 const int Shift16DefWaitstates = 1;
1296
1297 auto IsShift16BitDefFn = [this, &IA](const MachineInstr &ProducerMI) {
1298 const MachineOperand *Dst = getDstSelForwardingOperand(MI: ProducerMI, ST);
1299 // Assume inline asm reads the dst
1300 if (Dst)
1301 return IA->modifiesRegister(Reg: Dst->getReg(), TRI: &TRI) ||
1302 IA->readsRegister(Reg: Dst->getReg(), TRI: &TRI);
1303
1304 if (ProducerMI.isInlineAsm()) {
1305 // If MI is inline asm, assume it has dst forwarding hazard
1306 for (auto &Def : ProducerMI.all_defs()) {
1307 if (IA->modifiesRegister(Reg: Def.getReg(), TRI: &TRI) ||
1308 IA->readsRegister(Reg: Def.getReg(), TRI: &TRI)) {
1309 return true;
1310 }
1311 }
1312 }
1313
1314 return false;
1315 };
1316
1317 int WaitStatesNeededForDef =
1318 Shift16DefWaitstates -
1319 getWaitStatesSince(IsHazard: IsShift16BitDefFn, Limit: Shift16DefWaitstates);
1320 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForDef);
1321 }
1322
1323 return WaitStatesNeeded;
1324}
1325
1326int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) const {
1327 const SIInstrInfo *TII = ST.getInstrInfo();
1328 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1329 const MachineRegisterInfo &MRI = MF.getRegInfo();
1330
1331 const MachineOperand *LaneSelectOp =
1332 TII->getNamedOperand(MI&: *RWLane, OperandName: AMDGPU::OpName::src1);
1333
1334 if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, Reg: LaneSelectOp->getReg()))
1335 return 0;
1336
1337 Register LaneSelectReg = LaneSelectOp->getReg();
1338 auto IsHazardFn = [TII](const MachineInstr &MI) {
1339 return TII->isVALU(MI, /*AllowLDSDMA=*/true);
1340 };
1341
1342 const int RWLaneWaitStates = 4;
1343 int WaitStatesSince = getWaitStatesSinceDef(Reg: LaneSelectReg, IsHazardDef: IsHazardFn,
1344 Limit: RWLaneWaitStates);
1345 return RWLaneWaitStates - WaitStatesSince;
1346}
1347
1348int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) const {
1349 if (!ST.hasRFEHazards())
1350 return 0;
1351
1352 const SIInstrInfo *TII = ST.getInstrInfo();
1353
1354 const int RFEWaitStates = 1;
1355
1356 auto IsHazardFn = [TII](const MachineInstr &MI) {
1357 return getHWReg(TII, RegInstr: MI) == AMDGPU::Hwreg::ID_TRAPSTS;
1358 };
1359 int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazard: IsHazardFn, Limit: RFEWaitStates);
1360 return RFEWaitStates - WaitStatesNeeded;
1361}
1362
1363int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) const {
1364 const SIInstrInfo *TII = ST.getInstrInfo();
1365 const int ReadM0WaitStates = 1;
1366 auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isSALU(MI); };
1367 return ReadM0WaitStates -
1368 getWaitStatesSinceDef(Reg: AMDGPU::M0, IsHazardDef: IsHazardFn, Limit: ReadM0WaitStates);
1369}
1370
1371void GCNHazardRecognizer::emitVNops(MachineBasicBlock &MBB,
1372 MachineBasicBlock::iterator InsertPt,
1373 int WaitStatesNeeded, bool IsHoisting) {
1374 const DebugLoc &DL = IsHoisting ? DebugLoc() : InsertPt->getDebugLoc();
1375 for (int I = 0; I < WaitStatesNeeded; ++I)
1376 BuildMI(BB&: MBB, I: InsertPt, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_NOP_e32));
1377}
1378
1379void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
1380 fixVMEMtoScalarWriteHazards(MI);
1381 fixVcmpxPermlaneHazards(MI);
1382 fixSMEMtoVectorWriteHazards(MI);
1383 fixVcmpxExecWARHazard(MI);
1384 fixLdsBranchVmemWARHazard(MI);
1385 if (ST.hasLdsDirect()) {
1386 fixLdsDirectVALUHazard(MI);
1387 fixLdsDirectVMEMHazard(MI);
1388 }
1389 fixVALUPartialForwardingHazard(MI);
1390 fixVALUTransUseHazard(MI);
1391 fixVALUTransCoexecutionHazards(MI);
1392 fixWMMAHazards(MI); // fall-through if co-execution is enabled.
1393 fixWMMACoexecutionHazards(MI);
1394 fixShift64HighRegBug(MI);
1395 fixVALUMaskWriteHazard(MI);
1396 fixRequiredExportPriority(MI);
1397 if (ST.requiresWaitIdleBeforeGetReg())
1398 fixGetRegWaitIdle(MI);
1399 if (ST.hasDsAtomicAsyncBarrierArriveB64PipeBug())
1400 fixDsAtomicAsyncBarrierArriveB64(MI);
1401 if (ST.hasScratchBaseForwardingHazard())
1402 fixScratchBaseForwardingHazard(MI);
1403 if (ST.setRegModeNeedsVNOPs())
1404 fixSetRegMode(MI);
1405}
1406
1407static bool isVCmpXWritesExec(const SIInstrInfo &TII, const SIRegisterInfo &TRI,
1408 const MachineInstr &MI) {
1409 return (TII.isVOPC(MI) ||
1410 (MI.isCompare() && (TII.isVOP3(MI) || TII.isSDWA(MI)))) &&
1411 MI.modifiesRegister(Reg: AMDGPU::EXEC, TRI: &TRI);
1412}
1413
1414bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
1415 if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(MI: *MI))
1416 return false;
1417
1418 const SIInstrInfo *TII = ST.getInstrInfo();
1419 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1420 auto IsHazardFn = [TII, TRI](const MachineInstr &MI) {
1421 return isVCmpXWritesExec(TII: *TII, TRI: *TRI, MI);
1422 };
1423
1424 auto IsExpiredFn = [](const MachineInstr &MI, int) {
1425 unsigned Opc = MI.getOpcode();
1426 return SIInstrInfo::isVALU(MI, /*AllowLDSDMA=*/true) &&
1427 Opc != AMDGPU::V_NOP_e32 && Opc != AMDGPU::V_NOP_e64 &&
1428 Opc != AMDGPU::V_NOP_sdwa;
1429 };
1430
1431 if (::getWaitStatesSince(IsHazard: IsHazardFn, MI, IsExpired: IsExpiredFn) ==
1432 std::numeric_limits<int>::max())
1433 return false;
1434
1435 // V_NOP will be discarded by SQ.
1436 // Use V_MOV_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE*
1437 // which is always a VGPR and available.
1438 auto *Src0 = TII->getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::src0);
1439 Register Reg = Src0->getReg();
1440 bool IsUndef = Src0->isUndef();
1441 BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(),
1442 MCID: TII->get(Opcode: AMDGPU::V_MOV_B32_e32))
1443 .addReg(RegNo: Reg, Flags: RegState::Define | getDeadRegState(B: IsUndef))
1444 .addReg(RegNo: Reg, Flags: IsUndef ? RegState::Undef : RegState::Kill);
1445
1446 return true;
1447}
1448
1449bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
1450 if (!ST.hasVMEMtoScalarWriteHazard())
1451 return false;
1452 assert(!ST.hasExtendedWaitCounts());
1453
1454 if (!SIInstrInfo::isSALU(MI: *MI) && !SIInstrInfo::isSMRD(MI: *MI))
1455 return false;
1456
1457 if (MI->getNumDefs() == 0)
1458 return false;
1459
1460 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1461
1462 auto IsHazardFn = [TRI, MI](const MachineInstr &I) {
1463 if (!SIInstrInfo::isVMEM(MI: I) && !SIInstrInfo::isDS(MI: I))
1464 return false;
1465
1466 for (const MachineOperand &Def : MI->defs()) {
1467 const MachineOperand *Op =
1468 I.findRegisterUseOperand(Reg: Def.getReg(), TRI, isKill: false);
1469 if (!Op)
1470 continue;
1471 return true;
1472 }
1473 return false;
1474 };
1475
1476 auto IsExpiredFn = [](const MachineInstr &MI, int) {
1477 return SIInstrInfo::isVALU(MI, /*AllowLDSDMA=*/true) ||
1478 (MI.getOpcode() == AMDGPU::S_WAITCNT &&
1479 !MI.getOperand(i: 0).getImm()) ||
1480 (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1481 AMDGPU::DepCtr::decodeFieldVmVsrc(Encoded: MI.getOperand(i: 0).getImm()) == 0);
1482 };
1483
1484 if (::getWaitStatesSince(IsHazard: IsHazardFn, MI, IsExpired: IsExpiredFn) ==
1485 std::numeric_limits<int>::max())
1486 return false;
1487
1488 const SIInstrInfo *TII = ST.getInstrInfo();
1489 BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(),
1490 MCID: TII->get(Opcode: AMDGPU::S_WAITCNT_DEPCTR))
1491 .addImm(Val: AMDGPU::DepCtr::encodeFieldVmVsrc(VmVsrc: 0, STI: ST));
1492 return true;
1493}
1494
1495bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
1496 if (!ST.hasSMEMtoVectorWriteHazard())
1497 return false;
1498 assert(!ST.hasExtendedWaitCounts());
1499
1500 if (!SIInstrInfo::isVALU(MI: *MI, /*AllowLDSDMA=*/true))
1501 return false;
1502
1503 AMDGPU::OpName SDSTName;
1504 switch (MI->getOpcode()) {
1505 case AMDGPU::V_READLANE_B32:
1506 case AMDGPU::V_READFIRSTLANE_B32:
1507 SDSTName = AMDGPU::OpName::vdst;
1508 break;
1509 default:
1510 SDSTName = AMDGPU::OpName::sdst;
1511 break;
1512 }
1513
1514 const SIInstrInfo *TII = ST.getInstrInfo();
1515 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1516 const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(GPU: ST.getCPU());
1517 const MachineOperand *SDST = TII->getNamedOperand(MI&: *MI, OperandName: SDSTName);
1518 if (!SDST) {
1519 for (const auto &MO : MI->implicit_operands()) {
1520 if (MO.isDef() && TRI->isSGPRClass(RC: TRI->getPhysRegBaseClass(Reg: MO.getReg()))) {
1521 SDST = &MO;
1522 break;
1523 }
1524 }
1525 }
1526
1527 if (!SDST)
1528 return false;
1529
1530 const Register SDSTReg = SDST->getReg();
1531 auto IsHazardFn = [SDSTReg, TRI](const MachineInstr &I) {
1532 return SIInstrInfo::isSMRD(MI: I) && I.readsRegister(Reg: SDSTReg, TRI);
1533 };
1534
1535 auto IsExpiredFn = [TII, IV](const MachineInstr &MI, int) {
1536 if (TII->isSALU(MI)) {
1537 switch (MI.getOpcode()) {
1538 case AMDGPU::S_SETVSKIP:
1539 case AMDGPU::S_VERSION:
1540 case AMDGPU::S_WAITCNT_VSCNT:
1541 case AMDGPU::S_WAITCNT_VMCNT:
1542 case AMDGPU::S_WAITCNT_EXPCNT:
1543 // These instructions cannot not mitigate the hazard.
1544 return false;
1545 case AMDGPU::S_WAITCNT_LGKMCNT:
1546 // Reducing lgkmcnt count to 0 always mitigates the hazard.
1547 return (MI.getOperand(i: 1).getImm() == 0) &&
1548 (MI.getOperand(i: 0).getReg() == AMDGPU::SGPR_NULL);
1549 case AMDGPU::S_WAITCNT: {
1550 const int64_t Imm = MI.getOperand(i: 0).getImm();
1551 AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(Version: IV, Encoded: Imm);
1552 // DsCnt corresponds to LGKMCnt here.
1553 return Decoded.get(T: AMDGPU::DS_CNT) == 0;
1554 }
1555 default:
1556 assert((!SIInstrInfo::isWaitcnt(MI.getOpcode()) ||
1557 MI.getOpcode() == AMDGPU::S_WAIT_IDLE) &&
1558 "unexpected wait count instruction");
1559 // SOPP instructions cannot mitigate the hazard.
1560 if (TII->isSOPP(MI))
1561 return false;
1562 // At this point the SALU can be assumed to mitigate the hazard
1563 // because either:
1564 // (a) it is independent of the at risk SMEM (breaking chain),
1565 // or
1566 // (b) it is dependent on the SMEM, in which case an appropriate
1567 // s_waitcnt lgkmcnt _must_ exist between it and the at risk
1568 // SMEM instruction.
1569 return true;
1570 }
1571 }
1572 return false;
1573 };
1574
1575 if (::getWaitStatesSince(IsHazard: IsHazardFn, MI, IsExpired: IsExpiredFn) ==
1576 std::numeric_limits<int>::max())
1577 return false;
1578
1579 BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(),
1580 MCID: TII->get(Opcode: AMDGPU::S_MOV_B32), DestReg: AMDGPU::SGPR_NULL)
1581 .addImm(Val: 0);
1582 return true;
1583}
1584
1585bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
1586 if (!ST.hasVcmpxExecWARHazard())
1587 return false;
1588 assert(!ST.hasExtendedWaitCounts());
1589
1590 if (!SIInstrInfo::isVALU(MI: *MI, /*AllowLDSDMA=*/true))
1591 return false;
1592
1593 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1594 if (!MI->modifiesRegister(Reg: AMDGPU::EXEC, TRI))
1595 return false;
1596
1597 auto IsHazardFn = [TRI](const MachineInstr &I) {
1598 if (SIInstrInfo::isVALU(MI: I, /*AllowLDSDMA=*/true))
1599 return false;
1600 return I.readsRegister(Reg: AMDGPU::EXEC, TRI);
1601 };
1602
1603 const SIInstrInfo *TII = ST.getInstrInfo();
1604 auto IsExpiredFn = [TII, TRI](const MachineInstr &MI, int) {
1605 if (SIInstrInfo::isVALU(MI, /*AllowLDSDMA=*/true)) {
1606 if (TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::sdst))
1607 return true;
1608 for (auto MO : MI.implicit_operands())
1609 if (MO.isDef() && TRI->isSGPRClass(RC: TRI->getPhysRegBaseClass(Reg: MO.getReg())))
1610 return true;
1611 }
1612 if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1613 AMDGPU::DepCtr::decodeFieldSaSdst(Encoded: MI.getOperand(i: 0).getImm()) == 0)
1614 return true;
1615 return false;
1616 };
1617
1618 if (::getWaitStatesSince(IsHazard: IsHazardFn, MI, IsExpired: IsExpiredFn) ==
1619 std::numeric_limits<int>::max())
1620 return false;
1621
1622 BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(),
1623 MCID: TII->get(Opcode: AMDGPU::S_WAITCNT_DEPCTR))
1624 .addImm(Val: AMDGPU::DepCtr::encodeFieldSaSdst(SaSdst: 0, STI: ST));
1625 return true;
1626}
1627
1628static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
1629 const GCNSubtarget &ST) {
1630 if (!ST.hasLdsBranchVmemWARHazard())
1631 return false;
1632
1633 // Check if the necessary condition for the hazard is met: both LDS and VMEM
1634 // instructions need to appear in the same function.
1635 bool HasLds = false;
1636 bool HasVmem = false;
1637 for (auto &MBB : MF) {
1638 for (auto &MI : MBB) {
1639 HasLds |= SIInstrInfo::isDS(MI) || SIInstrInfo::isLDSDMA(MI);
1640 HasVmem |= SIInstrInfo::isVMEM(MI);
1641 if (HasLds && HasVmem)
1642 return true;
1643 }
1644 }
1645 return false;
1646}
1647
1648static bool isStoreCountWaitZero(const MachineInstr &I) {
1649 return I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
1650 I.getOperand(i: 0).getReg() == AMDGPU::SGPR_NULL &&
1651 !I.getOperand(i: 1).getImm();
1652}
1653
1654bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
1655 if (!RunLdsBranchVmemWARHazardFixup)
1656 return false;
1657
1658 assert(ST.hasLdsBranchVmemWARHazard());
1659 assert(!ST.hasExtendedWaitCounts());
1660
1661 auto IsHazardInst = [](const MachineInstr &MI) {
1662 if (SIInstrInfo::isDS(MI) || SIInstrInfo::isLDSDMA(MI))
1663 return 1;
1664 if (SIInstrInfo::isVMEM(MI))
1665 return 2;
1666 return 0;
1667 };
1668
1669 auto InstType = IsHazardInst(*MI);
1670 if (!InstType)
1671 return false;
1672
1673 auto IsExpiredFn = [&IsHazardInst](const MachineInstr &I, int) {
1674 return IsHazardInst(I) || isStoreCountWaitZero(I);
1675 };
1676
1677 auto IsHazardFn = [InstType, &IsHazardInst](const MachineInstr &I) {
1678 if (!I.isBranch())
1679 return false;
1680
1681 auto IsHazardFn = [InstType, IsHazardInst](const MachineInstr &I) {
1682 auto InstType2 = IsHazardInst(I);
1683 return InstType2 && InstType != InstType2;
1684 };
1685
1686 auto IsExpiredFn = [InstType, &IsHazardInst](const MachineInstr &I, int) {
1687 auto InstType2 = IsHazardInst(I);
1688 if (InstType == InstType2)
1689 return true;
1690
1691 return isStoreCountWaitZero(I);
1692 };
1693
1694 return ::getWaitStatesSince(IsHazard: IsHazardFn, MI: &I, IsExpired: IsExpiredFn) !=
1695 std::numeric_limits<int>::max();
1696 };
1697
1698 if (::getWaitStatesSince(IsHazard: IsHazardFn, MI, IsExpired: IsExpiredFn) ==
1699 std::numeric_limits<int>::max())
1700 return false;
1701
1702 const SIInstrInfo *TII = ST.getInstrInfo();
1703 BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(),
1704 MCID: TII->get(Opcode: AMDGPU::S_WAITCNT_VSCNT))
1705 .addReg(RegNo: AMDGPU::SGPR_NULL, Flags: RegState::Undef)
1706 .addImm(Val: 0);
1707
1708 return true;
1709}
1710
1711bool GCNHazardRecognizer::fixLdsDirectVALUHazard(MachineInstr *MI) {
1712 if (!SIInstrInfo::isLDSDIR(MI: *MI))
1713 return false;
1714
1715 const int NoHazardWaitStates = 15;
1716 const MachineOperand *VDST = TII.getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::vdst);
1717 const Register VDSTReg = VDST->getReg();
1718
1719 bool VisitedTrans = false;
1720 auto IsHazardFn = [this, VDSTReg, &VisitedTrans](const MachineInstr &I) {
1721 if (!SIInstrInfo::isVALU(MI: I, /*AllowLDSDMA=*/true))
1722 return false;
1723 VisitedTrans = VisitedTrans || SIInstrInfo::isTRANS(MI: I);
1724 // Cover both WAR and WAW
1725 return I.readsRegister(Reg: VDSTReg, TRI: &TRI) || I.modifiesRegister(Reg: VDSTReg, TRI: &TRI);
1726 };
1727 auto IsExpiredFn = [&](const MachineInstr &I, int WaitStates) {
1728 if (WaitStates >= NoHazardWaitStates)
1729 return true;
1730 // Instructions which cause va_vdst==0 expire hazard
1731 return SIInstrInfo::isVMEM(MI: I) || SIInstrInfo::isDS(MI: I) ||
1732 SIInstrInfo::isEXP(MI: I);
1733 };
1734 auto GetWaitStatesFn = [](const MachineInstr &MI) {
1735 return SIInstrInfo::isVALU(MI, /*AllowLDSDMA=*/true) ? 1 : 0;
1736 };
1737
1738 DenseSet<const MachineBasicBlock *> Visited;
1739 auto Count = ::getWaitStatesSince(IsHazard: IsHazardFn, MBB: MI->getParent(),
1740 I: std::next(x: MI->getReverseIterator()), WaitStates: 0,
1741 IsExpired: IsExpiredFn, Visited, GetNumWaitStates: GetWaitStatesFn);
1742
1743 // Transcendentals can execute in parallel to other VALUs.
1744 // This makes va_vdst count unusable with a mixture of VALU and TRANS.
1745 if (VisitedTrans)
1746 Count = 0;
1747
1748 MachineOperand *WaitVdstOp =
1749 TII.getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::waitvdst);
1750 WaitVdstOp->setImm(std::min(a: Count, b: NoHazardWaitStates));
1751
1752 return true;
1753}
1754
1755bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(MachineInstr *MI) {
1756 if (!SIInstrInfo::isLDSDIR(MI: *MI))
1757 return false;
1758
1759 const MachineOperand *VDST = TII.getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::vdst);
1760 const Register VDSTReg = VDST->getReg();
1761
1762 auto IsHazardFn = [this, VDSTReg](const MachineInstr &I) {
1763 if (!SIInstrInfo::isVMEM(MI: I) && !SIInstrInfo::isDS(MI: I))
1764 return false;
1765 return I.readsRegister(Reg: VDSTReg, TRI: &TRI) || I.modifiesRegister(Reg: VDSTReg, TRI: &TRI);
1766 };
1767 bool LdsdirCanWait = ST.hasLdsWaitVMSRC();
1768 // TODO: On GFX12 the hazard should expire on S_WAIT_LOADCNT/SAMPLECNT/BVHCNT
1769 // according to the type of VMEM instruction.
1770 auto IsExpiredFn = [this, LdsdirCanWait](const MachineInstr &I, int) {
1771 return SIInstrInfo::isVALU(MI: I, /*AllowLDSDMA=*/true) ||
1772 SIInstrInfo::isEXP(MI: I) ||
1773 (I.getOpcode() == AMDGPU::S_WAITCNT && !I.getOperand(i: 0).getImm()) ||
1774 (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1775 AMDGPU::DepCtr::decodeFieldVmVsrc(Encoded: I.getOperand(i: 0).getImm()) == 0) ||
1776 (LdsdirCanWait && SIInstrInfo::isLDSDIR(MI: I) &&
1777 !TII.getNamedOperand(MI: I, OperandName: AMDGPU::OpName::waitvsrc)->getImm());
1778 };
1779
1780 if (::getWaitStatesSince(IsHazard: IsHazardFn, MI, IsExpired: IsExpiredFn) ==
1781 std::numeric_limits<int>::max())
1782 return false;
1783
1784 if (LdsdirCanWait) {
1785 TII.getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::waitvsrc)->setImm(0);
1786 } else {
1787 BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(),
1788 MCID: TII.get(Opcode: AMDGPU::S_WAITCNT_DEPCTR))
1789 .addImm(Val: AMDGPU::DepCtr::encodeFieldVmVsrc(VmVsrc: 0, STI: ST));
1790 }
1791
1792 return true;
1793}
1794
// Work around the VALU partial forwarding hazard on subtargets that report
// hasVALUPartialForwardingHazard().
//
// Only wave64 VALU instructions with at least two distinct explicit VGPR
// sources are considered. The instructions preceding MI are searched (via
// hasHazard) for the def/EXEC-write/def pattern described in the body; when
// it is found an S_WAITCNT_DEPCTR with va_vdst == 0 is inserted before MI.
//
// Returns true if a wait instruction was inserted, false otherwise.
bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(MachineInstr *MI) {
  if (!ST.hasVALUPartialForwardingHazard())
    return false;
  assert(!ST.hasExtendedWaitCounts());

  if (!ST.isWave64() || !SIInstrInfo::isVALU(MI: *MI, /*AllowLDSDMA=*/true))
    return false;

  // Distinct VGPRs explicitly read by MI.
  SmallSetVector<Register, 4> SrcVGPRs;

  for (const MachineOperand &Use : MI->explicit_uses()) {
    if (Use.isReg() && TRI.isVGPR(MRI: MF.getRegInfo(), Reg: Use.getReg()))
      SrcVGPRs.insert(X: Use.getReg());
  }

  // Only applies with >= 2 unique VGPR sources
  if (SrcVGPRs.size() <= 1)
    return false;

  // Look for the following pattern:
  //   Va <- VALU [PreExecPos]
  //   intv1
  //   Exec <- SALU [ExecPos]
  //   intv2
  //   Vb <- VALU [PostExecPos]
  //   intv3
  //   MI Va, Vb (WaitState = 0)
  //
  // Where:
  //   intv1 + intv2 <= 2 VALUs
  //   intv3 <= 4 VALUs
  //
  // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.

  const int Intv1plus2MaxVALUs = 2;
  const int Intv3MaxVALUs = 4;
  // Numerically Intv1plus2MaxVALUs + Intv3MaxVALUs.
  const int IntvMaxVALUs = 6;
  // VALU count beyond which the search gives up unconditionally.
  const int NoHazardVALUWaitStates = IntvMaxVALUs + 2;

  // Search state carried along each path explored by hasHazard. The static
  // getHashValue/isEqual members are provided for hasHazard<StateType>
  // (presumably to de-duplicate already-visited states -- confirm against
  // hasHazard's definition).
  struct StateType {
    // For each source VGPR of MI: the value of VALUs at the moment the first
    // instruction modifying it was encountered during the search.
    SmallDenseMap<Register, int, 4> DefPos;
    // Value of VALUs when the first EXEC-modifying SALU was encountered after
    // at least one def had been recorded; INT_MAX while none has been seen.
    int ExecPos = std::numeric_limits<int>::max();
    // Number of VALU instructions counted so far by UpdateStateFn.
    int VALUs = 0;

    static unsigned getHashValue(const StateType &State) {
      return hash_combine(args: State.ExecPos, args: State.VALUs,
                          args: hash_combine_range(R: State.DefPos));
    }
    static bool isEqual(const StateType &LHS, const StateType &RHS) {
      return LHS.DefPos == RHS.DefPos && LHS.ExecPos == RHS.ExecPos &&
             LHS.VALUs == RHS.VALUs;
    }
  };

  StateType State;

  // This overloads expiry testing with all the hazard detection
  auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
    // Too many VALU states have passed
    if (State.VALUs > NoHazardVALUWaitStates)
      return HazardExpired;

    // Instructions which cause va_vdst==0 expire hazard
    if (SIInstrInfo::isVMEM(MI: I) || SIInstrInfo::isDS(MI: I) ||
        SIInstrInfo::isEXP(MI: I) ||
        (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
         AMDGPU::DepCtr::decodeFieldVaVdst(Encoded: I.getOperand(i: 0).getImm()) == 0))
      return HazardExpired;

    // Track register writes. Only the first def seen for each source is
    // recorded, and only the first EXEC write seen after some def.
    bool Changed = false;
    if (SIInstrInfo::isVALU(MI: I, /*AllowLDSDMA=*/true)) {
      for (Register Src : SrcVGPRs) {
        if (!State.DefPos.count(Val: Src) && I.modifiesRegister(Reg: Src, TRI: &TRI)) {
          State.DefPos[Src] = State.VALUs;
          Changed = true;
        }
      }
    } else if (SIInstrInfo::isSALU(MI: I)) {
      if (State.ExecPos == std::numeric_limits<int>::max()) {
        if (!State.DefPos.empty() && I.modifiesRegister(Reg: AMDGPU::EXEC, TRI: &TRI)) {
          State.ExecPos = State.VALUs;
          Changed = true;
        }
      }
    }

    // Early expiration: too many VALUs in intv3
    if (State.VALUs > Intv3MaxVALUs && State.DefPos.empty())
      return HazardExpired;

    // Only evaluate state if something changed
    if (!Changed)
      return NoHazardFound;

    // Determine positions of VALUs pre/post exec change
    if (State.ExecPos == std::numeric_limits<int>::max())
      return NoHazardFound;

    int PreExecPos = std::numeric_limits<int>::max();
    int PostExecPos = std::numeric_limits<int>::max();

    // Defs recorded at a count >= ExecPos are classified as "pre exec", the
    // others as "post exec"; the smallest count of each class is kept.
    // NOTE(review): this reading assumes the search walks backwards from MI,
    // so larger counts are further away from MI -- confirm against hasHazard.
    for (auto Entry : State.DefPos) {
      int DefVALUs = Entry.second;
      if (DefVALUs != std::numeric_limits<int>::max()) {
        if (DefVALUs >= State.ExecPos)
          PreExecPos = std::min(a: PreExecPos, b: DefVALUs);
        else
          PostExecPos = std::min(a: PostExecPos, b: DefVALUs);
      }
    }

    // Need a VALU post exec change
    if (PostExecPos == std::numeric_limits<int>::max())
      return NoHazardFound;

    // Too many VALUs in intv3?
    int Intv3VALUs = PostExecPos;
    if (Intv3VALUs > Intv3MaxVALUs)
      return HazardExpired;

    // Too many VALUs in intv2?
    int Intv2VALUs = (State.ExecPos - PostExecPos) - 1;
    if (Intv2VALUs > Intv1plus2MaxVALUs)
      return HazardExpired;

    // Need a VALU pre exec change
    if (PreExecPos == std::numeric_limits<int>::max())
      return NoHazardFound;

    // Too many VALUs in intv1?
    int Intv1VALUs = PreExecPos - State.ExecPos;
    if (Intv1VALUs > Intv1plus2MaxVALUs)
      return HazardExpired;

    // Too many VALUs in intv1 + intv2
    if (Intv1VALUs + Intv2VALUs > Intv1plus2MaxVALUs)
      return HazardExpired;

    return HazardFound;
  };
  // Every VALU instruction visited advances the VALU counter by one.
  auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
    if (SIInstrInfo::isVALU(MI, /*AllowLDSDMA=*/true))
      State.VALUs += 1;
  };

  // Start the search at the instruction immediately preceding MI.
  if (!hasHazard<StateType>(InitialState: State, IsHazard: IsHazardFn, UpdateState: UpdateStateFn, InitialMBB: MI->getParent(),
                            InitialI: std::next(x: MI->getReverseIterator())))
    return false;

  // Hazard found: wait for va_vdst == 0 right before MI.
  BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(),
          MCID: TII.get(Opcode: AMDGPU::S_WAITCNT_DEPCTR))
      .addImm(Val: AMDGPU::DepCtr::encodeFieldVaVdst(VaVdst: 0, STI: ST));

  return true;
}
1951
1952bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) {
1953 if (!ST.hasVALUTransUseHazard())
1954 return false;
1955 assert(!ST.hasExtendedWaitCounts());
1956
1957 if (!SIInstrInfo::isVALU(MI: *MI, /*AllowLDSDMA=*/true))
1958 return false;
1959
1960 SmallSet<Register, 4> SrcVGPRs;
1961
1962 for (const MachineOperand &Use : MI->explicit_uses()) {
1963 if (Use.isReg() && TRI.isVGPR(MRI: MF.getRegInfo(), Reg: Use.getReg()))
1964 SrcVGPRs.insert(V: Use.getReg());
1965 }
1966
1967 // Look for the following pattern:
1968 // Va <- TRANS VALU
1969 // intv
1970 // MI Va (WaitState = 0)
1971 //
1972 // Where:
1973 // intv <= 5 VALUs / 1 TRANS
1974 //
1975 // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.
1976
1977 const int IntvMaxVALUs = 5;
1978 const int IntvMaxTRANS = 1;
1979
1980 struct StateType {
1981 int VALUs = 0;
1982 int TRANS = 0;
1983
1984 static unsigned getHashValue(const StateType &State) {
1985 return hash_combine(args: State.VALUs, args: State.TRANS);
1986 }
1987 static bool isEqual(const StateType &LHS, const StateType &RHS) {
1988 return LHS.VALUs == RHS.VALUs && LHS.TRANS == RHS.TRANS;
1989 }
1990 };
1991
1992 StateType State;
1993
1994 // This overloads expiry testing with all the hazard detection
1995 auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
1996 // Too many VALU states have passed
1997 if (State.VALUs > IntvMaxVALUs || State.TRANS > IntvMaxTRANS)
1998 return HazardExpired;
1999
2000 // Instructions which cause va_vdst==0 expire hazard
2001 if (SIInstrInfo::isVMEM(MI: I) || SIInstrInfo::isDS(MI: I) ||
2002 SIInstrInfo::isEXP(MI: I) ||
2003 (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
2004 AMDGPU::DepCtr::decodeFieldVaVdst(Encoded: I.getOperand(i: 0).getImm()) == 0))
2005 return HazardExpired;
2006
2007 // Track registers writes
2008 if (SIInstrInfo::isTRANS(MI: I)) {
2009 for (Register Src : SrcVGPRs) {
2010 if (I.modifiesRegister(Reg: Src, TRI: &TRI)) {
2011 return HazardFound;
2012 }
2013 }
2014 }
2015
2016 return NoHazardFound;
2017 };
2018 auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
2019 if (SIInstrInfo::isVALU(MI, /*AllowLDSDMA=*/true))
2020 State.VALUs += 1;
2021 if (SIInstrInfo::isTRANS(MI))
2022 State.TRANS += 1;
2023 };
2024
2025 if (!hasHazard<StateType>(InitialState: State, IsHazard: IsHazardFn, UpdateState: UpdateStateFn, InitialMBB: MI->getParent(),
2026 InitialI: std::next(x: MI->getReverseIterator())))
2027 return false;
2028
2029 // Hazard is observed - insert a wait on va_dst counter to ensure hazard is
2030 // avoided.
2031 BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(),
2032 MCID: TII.get(Opcode: AMDGPU::S_WAITCNT_DEPCTR))
2033 .addImm(Val: AMDGPU::DepCtr::encodeFieldVaVdst(VaVdst: 0, STI: ST));
2034
2035 return true;
2036}
2037
2038bool GCNHazardRecognizer::fixVALUTransCoexecutionHazards(MachineInstr *MI) {
2039 if (!ST.hasTransCoexecutionHazard() || // Coexecution disabled.
2040 !SIInstrInfo::isVALU(MI: *MI, /*AllowLDSDMA=*/true) ||
2041 SIInstrInfo::isTRANS(MI: *MI))
2042 return false;
2043
2044 const SIInstrInfo *TII = ST.getInstrInfo();
2045 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2046
2047 auto IsTransHazardFn = [MI, TII, TRI](const MachineInstr &I) {
2048 if (!SIInstrInfo::isTRANS(MI: I))
2049 return false;
2050
2051 // RAW: Trans(I) writes, VALU(MI) reads.
2052 Register TransDef = TII->getNamedOperand(MI: I, OperandName: AMDGPU::OpName::vdst)->getReg();
2053 for (const MachineOperand &ValuUse : MI->explicit_uses()) {
2054 if (ValuUse.isReg() && TRI->regsOverlap(RegA: TransDef, RegB: ValuUse.getReg()))
2055 return true;
2056 }
2057
2058 auto *ValuDst = TII->getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::vdst);
2059 if (!ValuDst || !ValuDst->isReg())
2060 return false;
2061
2062 // WAR: Trans(I) reads, VALU(MI) writes.
2063 Register ValuDef = ValuDst->getReg();
2064 for (const MachineOperand &TransUse : I.explicit_uses()) {
2065 if (TransUse.isReg() && TRI->regsOverlap(RegA: ValuDef, RegB: TransUse.getReg()))
2066 return true;
2067 }
2068
2069 return false;
2070 };
2071
2072 auto IsExpiredFn = [](const MachineInstr &I, int) {
2073 return SIInstrInfo::isVALU(MI: I, /*AllowLDSDMA=*/true);
2074 };
2075
2076 const int HasVALU = std::numeric_limits<int>::max();
2077 if (::getWaitStatesSince(IsHazard: IsTransHazardFn, MI, IsExpired: IsExpiredFn) == HasVALU)
2078 return false;
2079
2080 BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(), MCID: TII->get(Opcode: AMDGPU::V_NOP_e32));
2081 return true;
2082}
2083
2084bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) {
2085 if (!SIInstrInfo::isWMMA(MI: *MI) && !SIInstrInfo::isSWMMAC(MI: *MI))
2086 return false;
2087
2088 const SIInstrInfo *TII = ST.getInstrInfo();
2089 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2090
2091 auto IsHazardFn = [MI, TII, TRI, this](const MachineInstr &I) {
2092 if (!SIInstrInfo::isWMMA(MI: I) && !SIInstrInfo::isSWMMAC(MI: I))
2093 return false;
2094
2095 // Src0(matrix A) or Src1(matrix B) of the current wmma instruction overlaps
2096 // with the dest(matrix D) of the previous wmma.
2097 const Register CurSrc0Reg =
2098 TII->getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::src0)->getReg();
2099 const Register CurSrc1Reg =
2100 TII->getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::src1)->getReg();
2101
2102 const Register PrevDstReg =
2103 TII->getNamedOperand(MI: I, OperandName: AMDGPU::OpName::vdst)->getReg();
2104
2105 if (TRI->regsOverlap(RegA: PrevDstReg, RegB: CurSrc0Reg) ||
2106 TRI->regsOverlap(RegA: PrevDstReg, RegB: CurSrc1Reg)) {
2107 return true;
2108 }
2109
2110 // GFX12+ allows overlap of matrix C with PrevDstReg (hardware will stall)
2111 // but Index can't overlap with PrevDstReg.
2112 if (AMDGPU::isGFX12Plus(STI: ST)) {
2113 if (SIInstrInfo::isSWMMAC(MI: *MI)) {
2114 const Register CurIndex =
2115 TII->getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::src2)->getReg();
2116 if (TRI->regsOverlap(RegA: PrevDstReg, RegB: CurIndex))
2117 return true;
2118 }
2119 return false;
2120 }
2121
2122 return false;
2123 };
2124
2125 auto IsExpiredFn = [](const MachineInstr &I, int) {
2126 return SIInstrInfo::isVALU(MI: I, /*AllowLDSDMA=*/true);
2127 };
2128
2129 if (::getWaitStatesSince(IsHazard: IsHazardFn, MI, IsExpired: IsExpiredFn) ==
2130 std::numeric_limits<int>::max())
2131 return false;
2132
2133 BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(), MCID: TII->get(Opcode: AMDGPU::V_NOP_e32));
2134
2135 return true;
2136}
2137
2138static bool isCoexecutableVALUInst(const MachineInstr &MI) {
2139 return SIInstrInfo::isVALU(MI, /*AllowLDSDMA=*/true) &&
2140 !SIInstrInfo::isWMMA(MI) && !SIInstrInfo::isSWMMAC(MI) &&
2141 !SIInstrInfo::isLDSDMA(MI);
2142}
2143
2144// Classify XDL WMMA instructions into co-execution hazard categories
2145// (Refer to SPG 4.6.12.1), mainly based on instruction latency.
2146//
2147// Category 0: WMMA with Latency 8
2148// WMMA_*F16, WMMA_*BF16
2149// WMMA_*FP8FP8
2150// WMMA_*FP8BF8
2151// WMMA_*BF8FP8
2152// WMMA_*BF8BF8
2153// WMMA_*F8F6F4 if SRCA & SRCB != F8
2154//
2155// Category 1: WMMA Latency 16
2156// WMMA_IU8
2157// WMMA_*F8F6F4 if SRCA OR SRCB == F8
2158//
2159// Category 2: SWMMAC with Latency 8
2160// SWMMAC_*F16, SWMMAC_*BF16,
2161// SWMMAC_*FP8FP8
2162// SWMMAC_*BF8FP8
2163// SWMMAC_*FP8BF8
2164// SWMMAC_*BF8BF8
2165//
2166// Category 3: SWMMAC with Latency 16
2167// SWMMAC_IU8
2168//
2169// Category 4: 16 Pass GFX1251 WMMA with latency 16
2170// V_WMMA_*_16X16X32_{F16,BF16}
2171// V_WMMA_{F32,F16}_16X16X64_{FP8,BF8}*
2172// V_WMMA_F32_16x16x128_F8F6F4 (F4 only)
2173// V_SWMMAC_*_16X16X64_{F16,BF16}
2174// V_SWMMAC_{F32,F16}_16X16X128_{FP8,BF8}*
2175//
2176// Category 5: 32 Pass GFX1251 WMMA with latency 32
2177// V_WMMA_F32_16x16x128_F8F6F4 (not all F4)
2178// V_WMMA_{F32,F16}_16X16X128_{FP8,BF8}*
2179// V_WMMA_F32_32X16X128_F4
2180// V_WMMA_I32_16X16X64_IU8
2181// V_WMMA_I32_16X16X64_IU8
2182static unsigned getWMMAHazardInstInCategory(const MachineInstr &MI,
2183 const SIInstrInfo *TII,
2184 const TargetSchedModel &SchedModel,
2185 const GCNSubtarget &ST) {
2186 assert(TII->isXDLWMMA(MI) && "must be xdl wmma");
2187 bool IsSWMMAC = SIInstrInfo::isSWMMAC(MI);
2188 bool IsLowestRateWMMA = ST.hasGFX125xLowestRateWMMA();
2189 unsigned Category = 0;
2190
2191 unsigned Latency = SchedModel.computeInstrLatency(MI: &MI);
2192 switch (Latency) {
2193 case 8:
2194 Category = IsSWMMAC ? 2 : 0;
2195 break;
2196 case 16:
2197 Category = IsLowestRateWMMA ? 4 : (IsSWMMAC ? 3 : 1);
2198 break;
2199 case 32:
2200 assert(IsLowestRateWMMA && "latency 32 is not expected");
2201 Category = 5;
2202 break;
2203 default:
2204 llvm_unreachable("unexpected xdl wmma latency");
2205 } // end switch.
2206
2207 return Category;
2208}
2209
2210int GCNHazardRecognizer::checkWMMACoexecutionHazards(MachineInstr *MI) const {
2211 if (!ST.hasWMMACoexecutionHazards())
2212 return 0;
2213
2214 const SIInstrInfo *TII = ST.getInstrInfo();
2215 if (!TII->isXDLWMMA(MI: *MI) && !isCoexecutableVALUInst(MI: *MI))
2216 return 0;
2217
2218 // WaitStates here is the number of V_NOPs or unrelated VALU instructions must
2219 // be in between the first WMMA and the second instruction to cover the hazard
2220 // (WMMAWaitStates if the second is also a WMMA, VALUWaitStates if the second
2221 // is a VALU). Refer to SPG 4.6.12.1. "Requirements for WMMA data hazards" for
2222 // numbers, which depends on the category of the first WMMA.
2223 const int WMMAWaitStates[] = {5, 9, 3, 5, 9, 17};
2224 const int VALUWaitStates[] = {4, 8, 2, 4, 8, 16};
2225 unsigned Category = 0;
2226
2227 auto IsWMMAHazardFn = [MI, TII, &Category, this](const MachineInstr &I) {
2228 if (!TII->isXDLWMMA(MI: I))
2229 return false;
2230
2231 Category = getWMMAHazardInstInCategory(MI: I, TII, SchedModel: TSchedModel, ST);
2232 return hasWMMAToWMMARegOverlap(WMMA: I, MI: *MI);
2233 };
2234
2235 auto IsVALUHazardFn = [MI, TII, &Category, this](const MachineInstr &I) {
2236 if (!TII->isXDLWMMA(MI: I))
2237 return false;
2238
2239 Category = getWMMAHazardInstInCategory(MI: I, TII, SchedModel: TSchedModel, ST);
2240 return hasWMMAToVALURegOverlap(WMMA: I, MI: *MI);
2241 };
2242
2243 int WaitStatesNeeded = -1;
2244 int ExistingVALUs = 0; // Existing number of VALU ops in between.
2245 bool IsLowestRateWMMA = ST.hasGFX125xLowestRateWMMA();
2246
2247 // getWaitStatesSinceVALU checks for a hazard between instruction 'I' and
2248 // 'MI':
2249 // - If a hazard exists: returns the number of VALUs in between and sets
2250 // 'Category' via IsWMMAHazardFn/IsVALUHazardFn for instruction 'I'.
2251 // - If no hazard exists: returns INT_MAX, making WaitStatesNeeded negative,
2252 // so no V_NOP insertion is needed.
2253 if (TII->isXDLWMMA(MI: *MI)) {
2254 // Maximum of MMAWaitStates.
2255 const int WMMAWaitsLimit = IsLowestRateWMMA ? 17 : 9;
2256 ExistingVALUs = getWaitStatesSinceVALU(IsHazard: IsWMMAHazardFn, Limit: WMMAWaitsLimit);
2257 WaitStatesNeeded = WMMAWaitStates[Category] - ExistingVALUs;
2258 } else { // Must be a co-executable VALU.
2259 // Maximum of VALUWaitStates.
2260 const int VALUWaitsLimit = IsLowestRateWMMA ? 16 : 8;
2261 ExistingVALUs = getWaitStatesSinceVALU(IsHazard: IsVALUHazardFn, Limit: VALUWaitsLimit);
2262 WaitStatesNeeded = VALUWaitStates[Category] - ExistingVALUs;
2263 }
2264
2265 return WaitStatesNeeded;
2266}
2267
2268bool GCNHazardRecognizer::hasWMMAToWMMARegOverlap(
2269 const MachineInstr &WMMA, const MachineInstr &MI) const {
2270 Register D0 = TII.getNamedOperand(MI: WMMA, OperandName: AMDGPU::OpName::vdst)->getReg();
2271 Register A1 = TII.getNamedOperand(MI, OperandName: AMDGPU::OpName::src0)->getReg();
2272 Register B1 = TII.getNamedOperand(MI, OperandName: AMDGPU::OpName::src1)->getReg();
2273
2274 // WMMA0 writes (D0), WMMA1 reads (A1/B1/Idx1).
2275 if (TRI.regsOverlap(RegA: D0, RegB: A1) || TRI.regsOverlap(RegA: D0, RegB: B1))
2276 return true;
2277
2278 if (SIInstrInfo::isSWMMAC(MI)) {
2279 Register Idx1 = TII.getNamedOperand(MI, OperandName: AMDGPU::OpName::src2)->getReg();
2280 if (TRI.regsOverlap(RegA: D0, RegB: Idx1))
2281 return true;
2282 }
2283 return false;
2284}
2285
2286bool GCNHazardRecognizer::hasWMMAToVALURegOverlap(
2287 const MachineInstr &WMMA, const MachineInstr &MI) const {
2288 // WMMA writes, VALU reads.
2289 Register D0 = TII.getNamedOperand(MI: WMMA, OperandName: AMDGPU::OpName::vdst)->getReg();
2290 for (const MachineOperand &ValuUse : MI.explicit_uses()) {
2291 if (ValuUse.isReg() && TRI.regsOverlap(RegA: D0, RegB: ValuUse.getReg()))
2292 return true;
2293 }
2294
2295 // WMMA reads or writes, VALU writes.
2296 Register A0 = TII.getNamedOperand(MI: WMMA, OperandName: AMDGPU::OpName::src0)->getReg();
2297 Register B0 = TII.getNamedOperand(MI: WMMA, OperandName: AMDGPU::OpName::src1)->getReg();
2298 SmallVector<Register, 4> WMMARegs({D0, A0, B0});
2299
2300 if (SIInstrInfo::isSWMMAC(MI: WMMA)) {
2301 Register Idx0 = TII.getNamedOperand(MI: WMMA, OperandName: AMDGPU::OpName::src2)->getReg();
2302 WMMARegs.push_back(Elt: Idx0);
2303 }
2304
2305 for (const MachineOperand &ValuDef : MI.defs()) {
2306 Register VDstReg = ValuDef.getReg();
2307 for (Register WMMAReg : WMMARegs) {
2308 if (TRI.regsOverlap(RegA: VDstReg, RegB: WMMAReg))
2309 return true;
2310 }
2311 }
2312 return false;
2313}
2314
2315bool GCNHazardRecognizer::isCoexecutionHazardFor(const MachineInstr &I,
2316 const MachineInstr &MI) const {
2317 // I is the potential WMMA hazard source, MI is the instruction being checked
2318 // for hazard.
2319 if (!TII.isXDLWMMA(MI: I))
2320 return false;
2321
2322 // Dispatch based on MI type
2323 if (TII.isXDLWMMA(MI))
2324 return hasWMMAToWMMARegOverlap(WMMA: I, MI);
2325 if (isCoexecutableVALUInst(MI))
2326 return hasWMMAToVALURegOverlap(WMMA: I, MI);
2327
2328 return false;
2329}
2330
2331bool GCNHazardRecognizer::hasWMMAHazardInLoop(MachineLoop *L, MachineInstr *MI,
2332 bool IncludeSubloops) {
2333 // Scan loop for any WMMA that hazards MI.
2334 // TODO: Avoid full loop scan when WMMA is beyond VALU distance.
2335 for (MachineBasicBlock *MBB : L->getBlocks()) {
2336 if (!IncludeSubloops && MLI->getLoopFor(BB: MBB) != L)
2337 continue;
2338 for (MachineInstr &I : *MBB) {
2339 if (&I == MI)
2340 continue;
2341 if (isCoexecutionHazardFor(I, MI: *MI))
2342 return true;
2343 }
2344 }
2345 return false;
2346}
2347
2348bool GCNHazardRecognizer::tryHoistWMMAVnopsFromLoop(MachineInstr *MI,
2349 int WaitStatesNeeded) {
2350 if (!MLI)
2351 return false;
2352
2353 MachineLoop *L = MLI->getLoopFor(BB: MI->getParent());
2354 if (!L) {
2355 ++NumWMMAHoistingBailed;
2356 return false;
2357 }
2358
2359 // If innermost loop has WMMA hazard, we can't hoist at all
2360 if (hasWMMAHazardInLoop(L, MI)) {
2361 ++NumWMMAHoistingBailed;
2362 return false;
2363 }
2364
2365 // Find outermost loop with no internal hazard
2366 MachineLoop *TargetLoop = L;
2367 while (MachineLoop *Parent = TargetLoop->getParentLoop()) {
2368 if (hasWMMAHazardInLoop(L: Parent, MI, IncludeSubloops: false))
2369 break; // Parent has hazard in its own blocks, stop here
2370 TargetLoop = Parent; // Safe to hoist further out
2371 }
2372
2373 // Need valid preheader to insert V_NOPs
2374 MachineBasicBlock *Preheader = TargetLoop->getLoopPreheader();
2375 if (!Preheader) {
2376 ++NumWMMAHoistingBailed;
2377 return false;
2378 }
2379
2380 LLVM_DEBUG(dbgs() << "WMMA V_NOP Hoisting: Moving " << WaitStatesNeeded
2381 << " V_NOPs from loop to " << printMBBReference(*Preheader)
2382 << "\n");
2383
2384 emitVNops(MBB&: *Preheader, InsertPt: Preheader->getFirstTerminator(), WaitStatesNeeded,
2385 /*IsHoisting=*/true);
2386 NumWMMANopsHoisted += WaitStatesNeeded;
2387 return true;
2388}
2389
2390bool GCNHazardRecognizer::fixWMMACoexecutionHazards(MachineInstr *MI) {
2391 int WaitStatesNeeded = checkWMMACoexecutionHazards(MI);
2392 if (WaitStatesNeeded <= 0)
2393 return false;
2394
2395 if (EnableWMMAVnopHoisting && tryHoistWMMAVnopsFromLoop(MI, WaitStatesNeeded))
2396 return true;
2397
2398 emitVNops(MBB&: *MI->getParent(), InsertPt: MI->getIterator(), WaitStatesNeeded);
2399 return true;
2400}
2401
// Work around the 64-bit shift bug on subtargets that report
// hasShift64HighRegBug().
//
// Applies to V_LSHLREV_B64_e64, V_LSHRREV_B64_e64 and V_ASHRREV_I64_e64 whose
// shift amount (src0) is a VGPR with index % 8 == 7, and only when that VGPR
// is VGPR255 or the following VGPR is not used in the function.
//
// Two rewrites are used:
//  - If src1 is not the destination register, the amount is first copied
//    into the low half of the destination and read from there.
//  - Otherwise the amount (and, if MI also writes the amount register, the
//    whole destination pair) is temporarily swapped into a register that MI
//    neither reads nor writes, and swapped back right after MI.
//
// Returns true if MI was rewritten.
bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) {
  if (!ST.hasShift64HighRegBug())
    return false;
  assert(!ST.hasExtendedWaitCounts());

  // Only the three 64-bit "rev" shift opcodes are affected.
  switch (MI->getOpcode()) {
  default:
    return false;
  case AMDGPU::V_LSHLREV_B64_e64:
  case AMDGPU::V_LSHRREV_B64_e64:
  case AMDGPU::V_ASHRREV_I64_e64:
    break;
  }

  // src0 holds the shift amount; a non-register amount needs no fix.
  MachineOperand *Amt = TII.getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::src0);
  if (!Amt->isReg())
    return false;

  Register AmtReg = Amt->getReg();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  // Check if this is a last VGPR in the allocation block.
  if (!TRI.isVGPR(MRI, Reg: AmtReg) || ((AmtReg - AMDGPU::VGPR0) & 7) != 7)
    return false;

  // No fix is applied when the next VGPR exists and is used in the function.
  if (AmtReg != AMDGPU::VGPR255 && MRI.isPhysRegUsed(PhysReg: AmtReg + 1))
    return false;

  assert(ST.needsAlignedVGPRs());
  // The "AmtReg + 1" / "AmtReg - 1" arithmetic relies on consecutive VGPR
  // enumerators.
  static_assert(AMDGPU::VGPR0 + 1 == AMDGPU::VGPR1);

  const DebugLoc &DL = MI->getDebugLoc();
  MachineBasicBlock *MBB = MI->getParent();
  MachineOperand *Src1 = TII.getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::src1);

  // In:
  //
  //   Dst = shiftrev64 Amt, Src1
  //
  // if Dst!=Src1 then avoid the bug with:
  //
  //   Dst.sub0 = Amt
  //   Dst = shift64 Dst.sub0, Src1

  Register DstReg = MI->getOperand(i: 0).getReg();
  if (!Src1->isReg() || Src1->getReg() != DstReg) {
    Register DstLo = TRI.getSubReg(Reg: DstReg, Idx: AMDGPU::sub0);
    // The inserted V_MOV is itself passed through the hazard recognizer.
    runOnInstruction(
        MI: BuildMI(BB&: *MBB, I: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_MOV_B32_e32), DestReg: DstLo).add(MO: *Amt));
    Amt->setReg(DstLo);
    Amt->setIsKill(true);
    return true;
  }

  // Dst == Src1: pick a replacement register that MI neither reads nor
  // writes. If MI also writes AmtReg a whole aligned 64-bit register is
  // needed, otherwise a single 32-bit VGPR is enough.
  // NOTE(review): NewReg stays invalid if the search finds nothing; the code
  // below does not check for that -- presumably it cannot happen in practice.
  bool Overlapped = MI->modifiesRegister(Reg: AmtReg, TRI: &TRI);
  Register NewReg;
  for (MCRegister Reg : Overlapped ? AMDGPU::VReg_64_Align2RegClass
                                   : AMDGPU::VGPR_32RegClass) {
    if (!MI->modifiesRegister(Reg, TRI: &TRI) && !MI->readsRegister(Reg, TRI: &TRI)) {
      NewReg = Reg;
      break;
    }
  }

  // In the overlapped case the amount moves to the high half (sub1) of the
  // new pair and the low half (sub0) takes the register below AmtReg.
  Register NewAmt = Overlapped ? (Register)TRI.getSubReg(Reg: NewReg, Idx: AMDGPU::sub1)
                               : NewReg;
  Register NewAmtLo;

  if (Overlapped)
    NewAmtLo = TRI.getSubReg(Reg: NewReg, Idx: AMDGPU::sub0);

  // Insert a full wait count because found register might be pending a wait.
  BuildMI(BB&: *MBB, I: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_WAITCNT))
      .addImm(Val: 0);

  // Insert V_SWAP_B32 instruction(s) and run hazard recognizer on them.
  if (Overlapped)
    runOnInstruction(
        MI: BuildMI(BB&: *MBB, I: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_SWAP_B32), DestReg: NewAmtLo)
            .addDef(RegNo: AmtReg - 1)
            .addReg(RegNo: AmtReg - 1, Flags: RegState::Undef)
            .addReg(RegNo: NewAmtLo, Flags: RegState::Undef));
  runOnInstruction(MI: BuildMI(BB&: *MBB, I: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_SWAP_B32), DestReg: NewAmt)
                       .addDef(RegNo: AmtReg)
                       .addReg(RegNo: AmtReg, Flags: RegState::Undef)
                       .addReg(RegNo: NewAmt, Flags: RegState::Undef));

  // Instructions emitted after the current instruction will be processed by the
  // parent loop of the hazard recognizer in a natural way.
  BuildMI(BB&: *MBB, I: std::next(x: MI->getIterator()), MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_SWAP_B32),
          DestReg: AmtReg)
      .addDef(RegNo: NewAmt)
      .addReg(RegNo: NewAmt)
      .addReg(RegNo: AmtReg);
  if (Overlapped)
    BuildMI(BB&: *MBB, I: std::next(x: MI->getIterator()), MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_SWAP_B32),
            DestReg: AmtReg - 1)
        .addDef(RegNo: NewAmtLo)
        .addReg(RegNo: NewAmtLo)
        .addReg(RegNo: AmtReg - 1);

  // Re-running hazard recognizer on the modified instruction is not necessary,
  // inserted V_SWAP_B32 has already both read and write new registers so
  // hazards related to these register has already been handled.
  Amt->setReg(NewAmt);
  Amt->setIsKill(false);
  // We do not update liveness, so verifier may see it as undef.
  Amt->setIsUndef();
  // In the overlapped case the destination and src1 move to the new pair too.
  if (Overlapped) {
    MI->getOperand(i: 0).setReg(NewReg);
    Src1->setReg(NewReg);
    Src1->setIsKill(false);
    Src1->setIsUndef();
  }

  return true;
}
2518
2519int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) const {
2520 int NSAtoVMEMWaitStates = 1;
2521
2522 if (!ST.hasNSAtoVMEMBug())
2523 return 0;
2524
2525 if (!SIInstrInfo::isMUBUF(MI: *MI) && !SIInstrInfo::isMTBUF(MI: *MI))
2526 return 0;
2527
2528 const SIInstrInfo *TII = ST.getInstrInfo();
2529 const auto *Offset = TII->getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::offset);
2530 if (!Offset || (Offset->getImm() & 6) == 0)
2531 return 0;
2532
2533 auto IsHazardFn = [TII](const MachineInstr &I) {
2534 if (!SIInstrInfo::isMIMG(MI: I))
2535 return false;
2536 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc: I.getOpcode());
2537 return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
2538 TII->getInstSizeInBytes(MI: I) >= 16;
2539 };
2540
2541 return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazard: IsHazardFn, Limit: 1);
2542}
2543
2544int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(
2545 MachineInstr *MI) const {
2546 int FPAtomicToDenormModeWaitStates = 3;
2547
2548 if (!ST.hasFPAtomicToDenormModeHazard())
2549 return 0;
2550 assert(!ST.hasExtendedWaitCounts());
2551
2552 if (MI->getOpcode() != AMDGPU::S_DENORM_MODE)
2553 return 0;
2554
2555 auto IsHazardFn = [](const MachineInstr &I) {
2556 if (!SIInstrInfo::isVMEM(MI: I))
2557 return false;
2558 return SIInstrInfo::isFPAtomic(MI: I);
2559 };
2560
2561 auto IsExpiredFn = [](const MachineInstr &MI, int WaitStates) {
2562 if (WaitStates >= 3 || SIInstrInfo::isVALU(MI, /*AllowLDSDMA=*/true))
2563 return true;
2564
2565 return SIInstrInfo::isWaitcnt(Opcode: MI.getOpcode());
2566 };
2567
2568 return FPAtomicToDenormModeWaitStates -
2569 ::getWaitStatesSince(IsHazard: IsHazardFn, MI, IsExpired: IsExpiredFn);
2570}
2571
2572int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) const {
2573 assert(SIInstrInfo::isMAI(*MI));
2574
2575 return ST.hasGFX90AInsts() ? checkMAIHazards90A(MI) : checkMAIHazards908(MI);
2576}
2577
2578int GCNHazardRecognizer::checkMFMAPadding(MachineInstr *MI) const {
2579 // Early exit if no padding is requested.
2580 if (MFMAPaddingRatio == 0)
2581 return 0;
2582
2583 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2584 if (!SIInstrInfo::isMFMA(MI: *MI) || MFI->getOccupancy() < 2)
2585 return 0;
2586
2587 int NeighborMFMALatency = 0;
2588 auto IsNeighboringMFMA = [&NeighborMFMALatency,
2589 this](const MachineInstr &MI) {
2590 if (!SIInstrInfo::isMFMA(MI))
2591 return false;
2592
2593 NeighborMFMALatency = this->getMFMAPipelineWaitStates(MI);
2594 return true;
2595 };
2596
2597 const int MaxMFMAPipelineWaitStates = 16;
2598 int WaitStatesSinceNeighborMFMA =
2599 getWaitStatesSince(IsHazard: IsNeighboringMFMA, Limit: MaxMFMAPipelineWaitStates);
2600
2601 int NeighborMFMAPaddingNeeded =
2602 (NeighborMFMALatency * MFMAPaddingRatio / 100) -
2603 WaitStatesSinceNeighborMFMA;
2604
2605 return std::max(a: 0, b: NeighborMFMAPaddingNeeded);
2606}
2607
2608int GCNHazardRecognizer::checkMAIHazards908(MachineInstr *MI) const {
2609 int WaitStatesNeeded = 0;
2610 unsigned Opc = MI->getOpcode();
2611
2612 auto IsVALUFn = [](const MachineInstr &MI) {
2613 return SIInstrInfo::isVALU(MI, /*AllowLDSDMA=*/true) || MI.isInlineAsm();
2614 };
2615
2616 if (Opc != AMDGPU::V_ACCVGPR_READ_B32_e64) { // MFMA or v_accvgpr_write
2617 const int LegacyVALUWritesVGPRWaitStates = 2;
2618 const int VALUWritesExecWaitStates = 4;
2619 const int MaxWaitStates = 4;
2620
2621 int WaitStatesNeededForUse = VALUWritesExecWaitStates -
2622 getWaitStatesSinceDef(Reg: AMDGPU::EXEC, IsHazardDef: IsVALUFn, Limit: MaxWaitStates);
2623 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
2624
2625 if (WaitStatesNeeded < MaxWaitStates) {
2626 for (const MachineOperand &Use : MI->explicit_uses()) {
2627 const int MaxWaitStates = 2;
2628
2629 if (!Use.isReg() || !TRI.isVGPR(MRI: MF.getRegInfo(), Reg: Use.getReg()))
2630 continue;
2631
2632 int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
2633 getWaitStatesSinceDef(Reg: Use.getReg(), IsHazardDef: IsVALUFn, Limit: MaxWaitStates);
2634 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
2635
2636 if (WaitStatesNeeded == MaxWaitStates)
2637 break;
2638 }
2639 }
2640 }
2641
2642 for (const MachineOperand &Op : MI->explicit_operands()) {
2643 if (!Op.isReg() || !TRI.isAGPR(MRI: MF.getRegInfo(), Reg: Op.getReg()))
2644 continue;
2645
2646 if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2647 continue;
2648
2649 const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
2650 const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
2651 const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
2652 const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
2653 const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
2654 const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
2655 const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
2656 const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
2657 const int MaxWaitStates = 18;
2658 Register Reg = Op.getReg();
2659 unsigned HazardDefLatency = 0;
2660
2661 auto IsOverlappedMFMAFn = [Reg, &HazardDefLatency,
2662 this](const MachineInstr &MI) {
2663 if (!SIInstrInfo::isMFMA(MI))
2664 return false;
2665 Register DstReg = MI.getOperand(i: 0).getReg();
2666 if (DstReg == Reg)
2667 return false;
2668 HazardDefLatency =
2669 std::max(a: HazardDefLatency, b: TSchedModel.computeInstrLatency(MI: &MI));
2670 return TRI.regsOverlap(RegA: DstReg, RegB: Reg);
2671 };
2672
2673 int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsHazardDef: IsOverlappedMFMAFn,
2674 Limit: MaxWaitStates);
2675 int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
2676 int SrcCIdx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::src2);
2677 int OpNo = Op.getOperandNo();
2678 if (OpNo == SrcCIdx) {
2679 NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
2680 } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) {
2681 switch (HazardDefLatency) {
2682 case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
2683 break;
2684 case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
2685 break;
2686 case 16: [[fallthrough]];
2687 default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
2688 break;
2689 }
2690 } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
2691 switch (HazardDefLatency) {
2692 case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
2693 break;
2694 case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
2695 break;
2696 case 16: [[fallthrough]];
2697 default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
2698 break;
2699 }
2700 }
2701
2702 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2703 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
2704
2705 if (WaitStatesNeeded == MaxWaitStates)
2706 return WaitStatesNeeded; // Early exit.
2707
2708 auto IsAccVgprWriteFn = [Reg, this](const MachineInstr &MI) {
2709 if (MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2710 return false;
2711 Register DstReg = MI.getOperand(i: 0).getReg();
2712 return TRI.regsOverlap(RegA: Reg, RegB: DstReg);
2713 };
2714
2715 const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
2716 const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
2717 const int AccVGPRWriteAccVgprReadWaitStates = 3;
2718 NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
2719 if (OpNo == SrcCIdx)
2720 NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
2721 else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64)
2722 NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;
2723
2724 WaitStatesNeededForUse = NeedWaitStates -
2725 getWaitStatesSinceDef(Reg, IsHazardDef: IsAccVgprWriteFn, Limit: MaxWaitStates);
2726 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
2727
2728 if (WaitStatesNeeded == MaxWaitStates)
2729 return WaitStatesNeeded; // Early exit.
2730 }
2731
2732 if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
2733 const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
2734 const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
2735 const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
2736 const int MaxWaitStates = 13;
2737 Register DstReg = MI->getOperand(i: 0).getReg();
2738 unsigned HazardDefLatency = 0;
2739
2740 auto IsSrcCMFMAFn = [DstReg, &HazardDefLatency,
2741 this](const MachineInstr &MI) {
2742 if (!SIInstrInfo::isMFMA(MI))
2743 return false;
2744 Register Reg = TII.getNamedOperand(MI, OperandName: AMDGPU::OpName::src2)->getReg();
2745 HazardDefLatency =
2746 std::max(a: HazardDefLatency, b: TSchedModel.computeInstrLatency(MI: &MI));
2747 return TRI.regsOverlap(RegA: Reg, RegB: DstReg);
2748 };
2749
2750 int WaitStatesSince = getWaitStatesSince(IsHazard: IsSrcCMFMAFn, Limit: MaxWaitStates);
2751 int NeedWaitStates;
2752 switch (HazardDefLatency) {
2753 case 2: NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
2754 break;
2755 case 8: NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
2756 break;
2757 case 16: [[fallthrough]];
2758 default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
2759 break;
2760 }
2761
2762 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
2763 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
2764 }
2765
2766 // Pad neighboring MFMA with noops for better inter-wave performance.
2767 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: checkMFMAPadding(MI));
2768
2769 return WaitStatesNeeded;
2770}
2771
/// Wait states for the case named by this helper: an N-pass XDL write whose
/// VGPR result overlaps SrcC of a following XDL or SMFMA.
///
///   xdl def cycles | gfx940 | gfx950
///   2 pass         |   3    |   4
///   4 pass         |   5    |   6
///   8 pass         |   9    |  10
///   16 pass        |  17    |  18
static int
GFX940_XDL_N_PassWritesVGPROverlappedXDLOrSMFMASrcCWaitStates(int NumPasses,
                                                              bool IsGFX950) {
  // gfx950 pays one additional wait state for every pass count.
  const int GFX950Extra = IsGFX950 ? 1 : 0;
  return NumPasses + 1 + GFX950Extra;
}
2782
/// Wait states for the case named by this helper: an N-pass XDL write whose
/// VGPR result overlaps SrcC of a following SGEMM or DGEMM.
///
///   xdl def cycles | gfx940 | gfx950
///   2 pass         |   3    |   3
///   4 pass         |   5    |   6
///   8 pass         |   9    |  10
///   16 pass        |  17    |  18
static int
GFX940_XDL_N_PassWritesVGPROverlappedSGEMMDGEMMSrcCWaitStates(int NumPasses,
                                                              bool IsGFX950) {
  int WaitStates = NumPasses + 1;
  // gfx950 adds one wait state, except for the 2-pass case.
  if (IsGFX950 && NumPasses != 2)
    ++WaitStates;
  return WaitStates;
}
2793
/// Wait states for the case named by this helper: an N-pass SMFMA write whose
/// VGPR result overlaps SrcC of a following SMFMA. The requirement equals the
/// pass count:
///   2 pass -> 2, 4 pass -> 4, 8 pass -> 8, 16 pass -> 16
static int
GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(int NumPasses) {
  const int WaitStates = NumPasses;
  return WaitStates;
}
2802
/// Wait states for the case named by this helper: an N-pass SMFMA write whose
/// VGPR result overlaps SrcA/SrcB of a following MFMA:
///   2 pass -> 4, 4 pass -> 6, 8 pass -> 10, 16 pass -> 18
static int
GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses) {
  constexpr int ExtraWaitStates = 2;
  return NumPasses + ExtraWaitStates;
}
2811
/// Wait states for the case named by this helper: an N-pass XDL write whose
/// VGPR result overlaps SrcA/SrcB of a following MFMA.
///
///   xdl def cycles | gfx942 | gfx950
///   2 pass         |   5    |   5
///   4 pass         |   7    |   8
///   8 pass         |  11    |  12
///   16 pass        |  19    |  20
static int GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses,
                                                                bool IsGFX950) {
  int WaitStates = NumPasses + 3;
  // gfx950 adds one wait state, except for the 2-pass case.
  if (IsGFX950 && NumPasses != 2)
    ++WaitStates;
  return WaitStates;
}
2821
/// Compute how many wait states must separate the MFMA instruction \p MI from
/// earlier instructions.
///
/// Returns 0 immediately when \p MI is not an MFMA. Otherwise the result is
/// the maximum of:
///  - the EXEC hazard: a non-MFMA VALU def of EXEC within the last 4 wait
///    states;
///  - per explicit register use: a non-MFMA, non-DOT VALU def of that register
///    (2 wait states), and an earlier MFMA whose destination overlaps that
///    register (table-driven; depends on whether the use is src2 or another
///    source, on the producing opcode, and on subtarget features);
///  - whatever checkMFMAPadding() requests.
///
/// NOTE(review): the name suggests this is the gfx90a-and-later variant; the
/// dispatch that selects it is outside this chunk.
2822int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) const {
2823 int WaitStatesNeeded = 0;
2824 unsigned Opc = MI->getOpcode();
2825
 // Any VALU (LDS-DMA counted as VALU) that is not an MFMA.
2826 auto IsLegacyVALUFn = [](const MachineInstr &MI) {
2827 return SIInstrInfo::isVALU(MI, /*AllowLDSDMA=*/true) &&
2828 !SIInstrInfo::isMFMA(MI);
2829 };
2830
 // Same as above, additionally excluding DOT instructions.
2831 auto IsLegacyVALUNotDotFn = [](const MachineInstr &MI) {
2832 return SIInstrInfo::isVALU(MI, /*AllowLDSDMA=*/true) &&
2833 !SIInstrInfo::isMFMA(MI) && !SIInstrInfo::isDOT(MI);
2834 };
2835
 // Everything below applies only to MFMA consumers.
2836 if (!SIInstrInfo::isMFMA(MI: *MI))
2837 return WaitStatesNeeded;
2838
 // EXEC written by a non-MFMA VALU; the search limit equals the requirement.
2839 const int VALUWritesExecWaitStates = 4;
2840 int WaitStatesNeededForUse = VALUWritesExecWaitStates -
2841 getWaitStatesSinceDef(Reg: AMDGPU::EXEC, IsHazardDef: IsLegacyVALUFn,
2842 Limit: VALUWritesExecWaitStates);
2843 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
2844
 // Operand index of src2 (SrcC) for this opcode; compared against each use's
 // operand number below.
2845 int SrcCIdx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::src2);
2846
2847 // Loop for both DGEMM and S/HGEMM 2nd instruction.
2848 for (const MachineOperand &Use : MI->explicit_uses()) {
 // Required distances between a producer and this MFMA's use. Names encode
 // producer kind/shape and which source operand overlaps the result.
2849 const int LegacyVALUNotDotWritesVGPRWaitStates = 2;
2850 const int SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates = 2;
2851 const int SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates = 8;
2852 const int SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates = 16;
2853 const int SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates = 3;
2854 const int SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates = 9;
2855 const int SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates = 17;
2856 const int DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 9;
2857 const int GFX950_DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 17;
2858 const int DMFMA4x4WritesVGPROverlappedSrcCWaitStates = 4;
2859 const int SMFMA4x4WritesVGPROverlappedSrcABWaitStates = 5;
2860 const int SMFMA16x16WritesVGPROverlappedSrcABWaitStates = 11;
2861 const int SMFMA32x32WritesVGPROverlappedSrcABWaitStates = 19;
2862 const int DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates = 6;
2863 const int DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 11;
2864 const int GFX950_DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 19;
2865 const int DMFMA4x4WritesVGPRFullSrcCWaitStates = 4;
2866 const int GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates = 2;
 // Search limit for the look-back queries and early-exit threshold below.
2867 const int MaxWaitStates = 19;
2868
2869 if (!Use.isReg())
2870 continue;
2871 Register Reg = Use.getReg();
 // Out-parameters of IsOverlappedMFMAFn. Left uninitialized here; they are
 // only read after the query below reports that a def was found.
2872 bool FullReg;
2873 const MachineInstr *MI1;
2874
 // Matches an MFMA whose destination (operand 0) overlaps Reg. FullReg and
 // MI1 are assigned for every MFMA inspected, before the overlap test.
2875 auto IsOverlappedMFMAFn = [Reg, &FullReg, &MI1,
2876 this](const MachineInstr &MI) {
2877 if (!SIInstrInfo::isMFMA(MI))
2878 return false;
2879 Register DstReg = MI.getOperand(i: 0).getReg();
2880 FullReg = (DstReg == Reg);
2881 MI1 = &MI;
2882 return TRI.regsOverlap(RegA: DstReg, RegB: Reg);
2883 };
2884
 // Hazard 1: Reg was defined by a plain (non-MFMA, non-DOT) VALU.
2885 WaitStatesNeededForUse = LegacyVALUNotDotWritesVGPRWaitStates -
2886 getWaitStatesSinceDef(Reg, IsHazardDef: IsLegacyVALUNotDotFn, Limit: MaxWaitStates);
2887 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
2888
 // Hazard 2: Reg was defined by an overlapping MFMA (MI1).
2889 int NumWaitStates =
2890 getWaitStatesSinceDef(Reg, IsHazardDef: IsOverlappedMFMAFn, Limit: MaxWaitStates);
 // numeric_limits<int>::max() is treated as "no such def"; skip so that
 // FullReg/MI1 are not read. (Sentinel is produced outside this chunk.)
2891 if (NumWaitStates == std::numeric_limits<int>::max())
2892 continue;
2893
2894 int OpNo = Use.getOperandNo();
2895 unsigned Opc1 = MI1->getOpcode();
2896 int NeedWaitStates = 0;
 // Case A: the overlapping use is src2 (SrcC).
2897 if (OpNo == SrcCIdx) {
 // Non-DGEMM consumer fed by a DGEMM on a subtarget without gfx940
 // instructions: no wait states are required by this table.
2898 if (!SIInstrInfo::isDGEMM(Opcode: Opc) &&
2899 (!ST.hasGFX940Insts() && SIInstrInfo::isDGEMM(Opcode: Opc1))) {
2900 NeedWaitStates = 0;
2901 } else if (FullReg) {
 // Producer's destination is exactly Reg. Only two combinations need
 // waits; anything else leaves NeedWaitStates at 0.
2902 if ((Opc == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
2903 Opc == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64) &&
2904 (Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
2905 Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64))
2906 NeedWaitStates = DMFMA4x4WritesVGPRFullSrcCWaitStates;
2907 else if (ST.hasGFX940Insts() &&
2908 TSchedModel.computeInstrLatency(MI: MI1) == 2)
2909 NeedWaitStates = GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates;
2910 } else {
 // Partial overlap: select by producer opcode.
2911 switch (Opc1) {
2912 case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
2913 case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
2914 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
2915 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
 // Applies only when the consumer is not XDL; otherwise stays 0.
2916 if (!TII.isXDL(MI: *MI))
2917 NeedWaitStates =
2918 ST.hasGFX950Insts()
2919 ? GFX950_DMFMA16x16WritesVGPROverlappedSrcCWaitStates
2920 : DMFMA16x16WritesVGPROverlappedSrcCWaitStates;
2921 break;
2922 case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
2923 case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
2924 if (!TII.isXDL(MI: *MI))
2925 NeedWaitStates = DMFMA4x4WritesVGPROverlappedSrcCWaitStates;
2926 break;
2927 default:
 // Any other producer: the scheduling-model latency is used as the
 // pass count.
2928 int NumPasses = TSchedModel.computeInstrLatency(MI: MI1);
2929 if (ST.hasGFX940Insts()) {
 // XDL consumer after a non-XDL producer: no waits from this table.
2930 if (TII.isXDL(MI: *MI) && !TII.isXDL(MI: *MI1))
2931 break;
2932
 // XDL producer: pick by consumer kind; non-XDL producer: SMFMA rule.
2933 NeedWaitStates =
2934 TII.isXDL(MI: *MI1)
2935 ? (TII.isXDL(MI: *MI)
2936 ? GFX940_XDL_N_PassWritesVGPROverlappedXDLOrSMFMASrcCWaitStates(
2937 NumPasses, IsGFX950: ST.hasGFX950Insts())
2938 : GFX940_XDL_N_PassWritesVGPROverlappedSGEMMDGEMMSrcCWaitStates(
2939 NumPasses, IsGFX950: ST.hasGFX950Insts()))
2940 : GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(
2941 NumPasses);
2942 break;
2943 }
2944
 // Without gfx940 instructions: fixed table keyed on pass count and on
 // whether the consumer is a DGEMM. Other pass counts are unexpected.
2945 switch (NumPasses) {
2946 case 2:
2947 NeedWaitStates =
2948 SIInstrInfo::isDGEMM(Opcode: Opc)
2949 ? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates
2950 : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates;
2951 break;
2952 case 8:
2953 NeedWaitStates =
2954 SIInstrInfo::isDGEMM(Opcode: Opc)
2955 ? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates
2956 : SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates;
2957 break;
2958 case 16:
2959 NeedWaitStates =
2960 SIInstrInfo::isDGEMM(Opcode: Opc)
2961 ? SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates
2962 : SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates;
2963 break;
2964 default:
2965 llvm_unreachable("unexpected number of passes");
2966 }
2967 }
2968 }
2969 } else {
 // Case B: the overlapping use is any operand other than src2 (the
 // constants call this SrcA/SrcB).
2970 switch (Opc1) {
2971 case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
2972 case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
2973 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
2974 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
2975 NeedWaitStates =
2976 ST.hasGFX950Insts()
2977 ? GFX950_DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates
2978 : DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates;
2979 break;
2980 case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
2981 case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
2982 NeedWaitStates = DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates;
2983 break;
2984 default:
2985 int NumPasses = TSchedModel.computeInstrLatency(MI: MI1);
2986
 // With gfx940 instructions the requirement is a formula of the pass
 // count, chosen by whether the producer is XDL.
2987 if (ST.hasGFX940Insts()) {
2988 NeedWaitStates =
2989 TII.isXDL(MI: *MI1)
2990 ? GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(
2991 NumPasses, IsGFX950: ST.hasGFX950Insts())
2992 : GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(
2993 NumPasses);
2994 break;
2995 }
2996
 // Otherwise a fixed table; a 4-pass producer is not expected here and
 // any unlisted pass count falls back to the 32x32 (largest) value.
2997 switch (NumPasses) {
2998 case 2:
2999 NeedWaitStates = SMFMA4x4WritesVGPROverlappedSrcABWaitStates;
3000 break;
3001 case 4:
3002 llvm_unreachable("unexpected number of passes for mfma");
3003 case 8:
3004 NeedWaitStates = SMFMA16x16WritesVGPROverlappedSrcABWaitStates;
3005 break;
3006 case 16:
3007 default:
3008 NeedWaitStates = SMFMA32x32WritesVGPROverlappedSrcABWaitStates;
3009 }
3010 }
3011 }
 // Already waiting at least as long as this hazard could ever require.
3012 if (WaitStatesNeeded >= NeedWaitStates)
3013 continue;
3014
3015 WaitStatesNeededForUse = NeedWaitStates - NumWaitStates;
3016 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
3017
 // Stop scanning uses once the result equals MaxWaitStates.
3018 if (WaitStatesNeeded == MaxWaitStates)
3019 break;
3020 }
3021
3022 // Pad neighboring MFMA with noops for better inter-wave performance.
3023 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: checkMFMAPadding(MI));
3024
3025 return WaitStatesNeeded;
3026}
3027
3028int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) const {
3029 // On gfx90a+ relevant hazards are checked in checkMAIVALUHazards()
3030 if (!ST.hasMAIInsts() || ST.hasGFX90AInsts())
3031 return 0;
3032
3033 int WaitStatesNeeded = 0;
3034
3035 auto IsAccVgprReadFn = [](const MachineInstr &MI) {
3036 return MI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64;
3037 };
3038
3039 for (const MachineOperand &Op : MI->explicit_uses()) {
3040 if (!Op.isReg() || !TRI.isVGPR(MRI: MF.getRegInfo(), Reg: Op.getReg()))
3041 continue;
3042
3043 Register Reg = Op.getReg();
3044
3045 const int AccVgprReadLdStWaitStates = 2;
3046 const int VALUWriteAccVgprRdWrLdStDepVALUWaitStates = 1;
3047 const int MaxWaitStates = 2;
3048
3049 int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
3050 getWaitStatesSinceDef(Reg, IsHazardDef: IsAccVgprReadFn, Limit: MaxWaitStates);
3051 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
3052
3053 if (WaitStatesNeeded == MaxWaitStates)
3054 return WaitStatesNeeded; // Early exit.
3055
3056 auto IsVALUAccVgprRdWrCheckFn = [Reg, this](const MachineInstr &MI) {
3057 if (MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64 &&
3058 MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
3059 return false;
3060 auto IsVALUFn = [](const MachineInstr &MI) {
3061 return SIInstrInfo::isVALU(MI, /*AllowLDSDMA=*/true) &&
3062 !SIInstrInfo::isMAI(MI);
3063 };
3064 return getWaitStatesSinceDef(Reg, IsHazardDef: IsVALUFn, Limit: 2 /*MaxWaitStates*/) <
3065 std::numeric_limits<int>::max();
3066 };
3067
3068 WaitStatesNeededForUse = VALUWriteAccVgprRdWrLdStDepVALUWaitStates -
3069 getWaitStatesSince(IsHazard: IsVALUAccVgprRdWrCheckFn, Limit: MaxWaitStates);
3070 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
3071 }
3072
3073 return WaitStatesNeeded;
3074}
3075
3076int GCNHazardRecognizer::checkPermlaneHazards(MachineInstr *MI) const {
3077 assert(!ST.hasVcmpxPermlaneHazard() &&
3078 "this is a different vcmpx+permlane hazard");
3079 const SIRegisterInfo *TRI = ST.getRegisterInfo();
3080 const SIInstrInfo *TII = ST.getInstrInfo();
3081
3082 auto IsVCmpXWritesExecFn = [TII, TRI](const MachineInstr &MI) {
3083 return isVCmpXWritesExec(TII: *TII, TRI: *TRI, MI);
3084 };
3085
3086 auto IsVALUFn = [](const MachineInstr &MI) {
3087 return SIInstrInfo::isVALU(MI, /*AllowLDSDMA=*/true);
3088 };
3089
3090 const int VCmpXWritesExecWaitStates = 4;
3091 const int VALUWritesVDstWaitStates = 2;
3092 int WaitStatesNeeded = 0;
3093
3094 for (const MachineOperand &Op : MI->explicit_uses()) {
3095 if (!Op.isReg() || !TRI->isVGPR(MRI: MF.getRegInfo(), Reg: Op.getReg()))
3096 continue;
3097 Register Reg = Op.getReg();
3098
3099 int WaitStatesSinceDef =
3100 VALUWritesVDstWaitStates -
3101 getWaitStatesSinceDef(Reg, IsHazardDef: IsVALUFn,
3102 /*MaxWaitStates=*/Limit: VALUWritesVDstWaitStates);
3103 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesSinceDef);
3104 if (WaitStatesNeeded >= VALUWritesVDstWaitStates)
3105 break;
3106 }
3107
3108 int VCmpXHazardWaits =
3109 VCmpXWritesExecWaitStates -
3110 getWaitStatesSince(IsHazard: IsVCmpXWritesExecFn, Limit: VCmpXWritesExecWaitStates);
3111
3112 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: VCmpXHazardWaits);
3113 return WaitStatesNeeded;
3114}
3115
/// Wait states for the case named by this helper: an N-pass SMFMA VGPR write
/// followed by a VALU write of the same register (WAW):
///   2 pass -> 4, 4 pass -> 6, 8 pass -> 10, 16 pass -> 18
static int GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(int NumPasses) {
  constexpr int ExtraWaitStates = 2;
  return NumPasses + ExtraWaitStates;
}
3123
/// Wait states for the case named by this helper: an N-pass XDL VGPR write
/// followed by a VALU write of the same register (WAW).
///
///   xdl def cycles | gfx942 | gfx950
///   2 pass         |   5    |   5
///   4 pass         |   7    |   8
///   8 pass         |  11    |  12
///   16 pass        |  19    |  20
static int GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(int NumPasses,
                                                       bool IsGFX950) {
  int WaitStates = NumPasses + 3;
  // gfx950 adds one wait state, except for the 2-pass case.
  if (IsGFX950 && NumPasses != 2)
    ++WaitStates;
  return WaitStates;
}
3133
/// Wait states for the case named by this helper: an N-pass XDL VGPR write
/// followed by a VALU, memory, or export read of that register.
///
///   xdl def cycles | gfx942 | gfx950
///   2 pass         |   5    |   5
///   4 pass         |   7    |   8
///   8 pass         |  11    |  12
///   16 pass        |  19    |  20
static int GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses,
                                                              bool IsGFX950) {
  int WaitStates = NumPasses + 3;
  // gfx950 adds one wait state, except for the 2-pass case.
  if (IsGFX950 && NumPasses != 2)
    ++WaitStates;
  return WaitStates;
}
3143
/// Wait states for the case named by this helper: an N-pass SMFMA VGPR write
/// followed by a VALU, memory, or export read of that register:
///   2 pass -> 4, 4 pass -> 6, 8 pass -> 10, 16 pass -> 18
static int GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses) {
  constexpr int ExtraWaitStates = 2;
  return NumPasses + ExtraWaitStates;
}
3151
/// Compute how many wait states the non-MFMA instruction \p MI needs because
/// of earlier MFMA, DGEMM, or DOT instructions.
///
/// Returns 0 when the subtarget lacks gfx90a instructions or when \p MI is
/// itself an MFMA (those are handled in checkMAIHazards90A()). Otherwise the
/// result is the maximum over three groups of checks:
///  1. If \p MI is VMEM/DS/EXP/VALU: for each explicit register use, a DOT
///     def, a DGEMM sitting between a VALU def and a memory read (gfx90a
///     without gfx940 instructions only), and an MFMA def of an overlapping
///     register.
///  2. If \p MI is one of the listed F64 FMA/FMAC opcodes: a DGEMM within the
///     last 2 wait states.
///  3. If \p MI is VALU/VMEM/DS/EXP: for each def, a DOT or MFMA def of an
///     overlapping register (WAW) and a non-DGEMM MFMA that reads the
///     register through src2 (WAR).
3152int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) const {
3153 if (!ST.hasGFX90AInsts())
3154 return 0;
3155
3156 auto IsDGEMMFn = [](const MachineInstr &MI) -> bool {
3157 return SIInstrInfo::isDGEMM(Opcode: MI.getOpcode());
3158 };
3159
3160 // This is checked in checkMAIHazards90A()
3161 if (SIInstrInfo::isMFMA(MI: *MI))
3162 return 0;
3163
3164 const MachineRegisterInfo &MRI = MF.getRegInfo();
3165
3166 int WaitStatesNeeded = 0;
3167
 // Classify MI once; these flags select which checks below apply.
3168 bool IsMem = SIInstrInfo::isVMEM(MI: *MI) || SIInstrInfo::isDS(MI: *MI);
3169 bool IsMemOrExport = IsMem || SIInstrInfo::isEXP(MI: *MI);
3170 bool IsVALU = SIInstrInfo::isVALU(MI: *MI, /*AllowLDSDMA=*/true);
3171
 // Reg, MFMA and DOT are shared mutable state: the predicates below capture
 // them by reference, so Reg must be assigned (and MFMA/DOT reset to nullptr)
 // before each look-back query.
3172 const MachineInstr *MFMA = nullptr;
3173 unsigned Reg;
 // Matches an MFMA whose destination (operand 0) overlaps Reg; records it in
 // MFMA.
3174 auto IsMFMAWriteFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
3175 if (!SIInstrInfo::isMFMA(MI) ||
3176 !TRI.regsOverlap(RegA: MI.getOperand(i: 0).getReg(), RegB: Reg))
3177 return false;
3178 MFMA = &MI;
3179 return true;
3180 };
3181
3182 const MachineInstr *DOT = nullptr;
 // Matches a DOT instruction whose destination (operand 0) overlaps Reg;
 // records it in DOT.
3183 auto IsDotWriteFn = [&Reg, &DOT, this](const MachineInstr &MI) {
3184 if (!SIInstrInfo::isDOT(MI) ||
3185 !TRI.regsOverlap(RegA: MI.getOperand(i: 0).getReg(), RegB: Reg))
3186 return false;
3187 DOT = &MI;
3188 return true;
3189 };
3190
 // Latches to true once a DGEMM has been seen by IsDGEMMHazard; the caller
 // resets it before each query.
3191 bool DGEMMAfterVALUWrite = false;
3192 auto IsDGEMMHazard = [&DGEMMAfterVALUWrite, this](const MachineInstr &MI) {
3193 // Found DGEMM on reverse traversal to def.
3194 if (SIInstrInfo::isDGEMM(Opcode: MI.getOpcode()))
3195 DGEMMAfterVALUWrite = true;
3196
3197 // Only hazard if register is defined by a VALU and a DGEMM is found after
3198 // the def.
3199 if (!TII.isVALU(MI, /*AllowLDSDMA=*/true) || !DGEMMAfterVALUWrite)
3200 return false;
3201
3202 return true;
3203 };
3204
 // Operand index of src2 for MI's opcode; used to tell a src2 use apart from
 // the other sources in the DOT check.
3205 int SrcCIdx = AMDGPU::getNamedOperandIdx(Opcode: MI->getOpcode(),
3206 Name: AMDGPU::OpName::src2);
3207
 // Group 1: read-after-write hazards on MI's explicit register uses.
3208 if (IsMemOrExport || IsVALU) {
3209 const int SMFMA4x4WriteVgprVALUMemExpReadWaitStates = 5;
3210 const int SMFMA16x16WriteVgprVALUMemExpReadWaitStates = 11;
3211 const int SMFMA32x32WriteVgprVALUMemExpReadWaitStates = 19;
3212 const int DMFMA4x4WriteVgprMemExpReadWaitStates = 9;
3213 const int DMFMA16x16WriteVgprMemExpReadWaitStates = 18;
3214 const int DMFMA4x4WriteVgprVALUReadWaitStates = 6;
3215 const int DMFMA16x16WriteVgprVALUReadWaitStates = 11;
3216 const int GFX950_DMFMA16x16WriteVgprVALUReadWaitStates = 19;
3217 const int DotWriteSameDotReadSrcAB = 3;
3218 const int DotWriteDifferentVALURead = 3;
3219 const int DMFMABetweenVALUWriteVMEMRead = 2;
 // Search limit for the look-back queries and early-exit threshold.
3220 const int MaxWaitStates = 19;
3221
3222 for (const MachineOperand &Use : MI->explicit_uses()) {
3223 if (!Use.isReg())
3224 continue;
3225 Reg = Use.getReg();
3226
 // DOT def of Reg. A DOT feeding the same opcode needs waits only when
 // the use is not src2; a DOT feeding a different opcode always does.
3227 DOT = nullptr;
3228 int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsHazardDef: IsDotWriteFn,
3229 Limit: MaxWaitStates);
3230 if (DOT) {
3231 int NeedWaitStates = 0;
3232 if (DOT->getOpcode() == MI->getOpcode()) {
 // Pointer difference from operand 0 yields this use's operand index.
3233 if (&Use - &MI->getOperand(i: 0) != SrcCIdx)
3234 NeedWaitStates = DotWriteSameDotReadSrcAB;
3235 } else {
3236 NeedWaitStates = DotWriteDifferentVALURead;
3237 }
3238
3239 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
3240 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
3241 }
3242
3243 // Workaround for HW data hazard bug observed only in GFX90A. When there
3244 // is a DGEMM instruction in-between a VALU and a VMEM instruction it
3245 // causes the SQ to incorrectly not insert two wait states between the two
3246 // instructions needed to avoid data hazard.
3247 if (IsMem && ST.hasGFX90AInsts() && !ST.hasGFX940Insts()) {
 // Reset the latch so this query starts without a DGEMM seen.
3248 DGEMMAfterVALUWrite = false;
3249 if (TRI.isVectorRegister(MRI, Reg)) {
3250 int WaitStatesNeededForUse =
3251 DMFMABetweenVALUWriteVMEMRead -
3252 getWaitStatesSinceDef(Reg, IsHazardDef: IsDGEMMHazard,
3253 Limit: DMFMABetweenVALUWriteVMEMRead);
3254
3255 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
3256 }
3257 }
3258
 // MFMA def of Reg; nothing more to do for this use if none was found.
3259 MFMA = nullptr;
3260 WaitStatesSinceDef =
3261 getWaitStatesSinceDef(Reg, IsHazardDef: IsMFMAWriteFn, Limit: MaxWaitStates);
3262 if (!MFMA)
3263 continue;
3264
 // The scheduling-model latency doubles as the producer's pass count.
3265 unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MI: MFMA);
3266 int NumPasses = HazardDefLatency;
3267 int NeedWaitStates = MaxWaitStates;
3268
 // DGEMM producer: fixed table keyed on latency and on consumer kind.
3269 if (SIInstrInfo::isDGEMM(Opcode: MFMA->getOpcode())) {
3270 switch (HazardDefLatency) {
3271 case 4:
3272 NeedWaitStates = IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates
3273 : DMFMA4x4WriteVgprVALUReadWaitStates;
3274 break;
3275 case 8:
3276 case 16:
3277 NeedWaitStates =
3278 IsMemOrExport
3279 ? DMFMA16x16WriteVgprMemExpReadWaitStates
3280 : (ST.hasGFX950Insts()
3281 ? GFX950_DMFMA16x16WriteVgprVALUReadWaitStates
3282 : DMFMA16x16WriteVgprVALUReadWaitStates);
3283 break;
3284 default:
3285 llvm_unreachable("unexpected dgemm");
3286 }
3287 } else if (ST.hasGFX940Insts()) {
 // Non-DGEMM producer with gfx940 instructions: formula of the pass
 // count, chosen by whether the producer is XDL.
3288 NeedWaitStates =
3289 TII.isXDL(MI: *MFMA)
3290 ? GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(
3291 NumPasses, IsGFX950: ST.hasGFX950Insts())
3292 : GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(
3293 NumPasses);
3294 } else {
 // Non-DGEMM producer without gfx940 instructions: fixed table.
3295 switch (HazardDefLatency) {
3296 case 2:
3297 NeedWaitStates = SMFMA4x4WriteVgprVALUMemExpReadWaitStates;
3298 break;
3299 case 8:
3300 NeedWaitStates = SMFMA16x16WriteVgprVALUMemExpReadWaitStates;
3301 break;
3302 case 16:
3303 NeedWaitStates = SMFMA32x32WriteVgprVALUMemExpReadWaitStates;
3304 break;
3305 default:
3306 llvm_unreachable("unexpected number of passes for mfma");
3307 }
3308 }
3309
3310 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
3311 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
3312
 // Stop scanning uses once the result equals MaxWaitStates.
3313 if (WaitStatesNeeded == MaxWaitStates)
3314 break;
3315 }
3316 }
3317
 // Group 2: a DGEMM shortly before an F64 FMA/FMAC. Skipped when the result
 // already meets this requirement.
3318 unsigned Opc = MI->getOpcode();
3319 const int DMFMAToFMA64WaitStates = 2;
3320 if ((Opc == AMDGPU::V_FMA_F64_e64 ||
3321 Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64 ||
3322 Opc == AMDGPU::V_FMAC_F64_dpp) &&
3323 WaitStatesNeeded < DMFMAToFMA64WaitStates) {
3324 int WaitStatesNeededForUse = DMFMAToFMA64WaitStates -
3325 getWaitStatesSince(IsHazard: IsDGEMMFn, Limit: DMFMAToFMA64WaitStates);
3326 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
3327 }
3328
 // The remaining def-side checks apply only to VALU, memory and export.
3329 if (!IsVALU && !IsMemOrExport)
3330 return WaitStatesNeeded;
3331
 // Group 3: write-after-write and write-after-read hazards on MI's defs.
3332 for (const MachineOperand &Def : MI->defs()) {
3333 const int SMFMA4x4WriteVgprVALUWawWaitStates = 5;
3334 const int SMFMA16x16WriteVgprVALUWawWaitStates = 11;
3335 const int SMFMA32x32WriteVgprVALUWawWaitStates = 19;
3336 const int SMFMA4x4ReadVgprVALUWarWaitStates = 1;
3337 const int GFX940_XDL4PassReadVgprVALUWarWaitStates = 3;
3338 const int SMFMA16x16ReadVgprVALUWarWaitStates = 7;
3339 const int SMFMA32x32ReadVgprVALUWarWaitStates = 15;
3340 const int DMFMA4x4WriteVgprVALUWriteWaitStates = 6;
3341 const int DMFMA16x16WriteVgprVALUWriteWaitStates = 11;
3342 const int DotWriteDifferentVALUWrite = 3;
 // Separate search limits for the WAW (def) and WAR (use) look-backs.
3343 const int MaxWaitStates = 19;
3344 const int MaxWarWaitStates = 15;
3345
3346 Reg = Def.getReg();
3347
 // WAW against a DOT of a different opcode.
3348 DOT = nullptr;
3349 int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsHazardDef: IsDotWriteFn,
3350 Limit: MaxWaitStates);
3351 if (DOT && DOT->getOpcode() != MI->getOpcode())
3352 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: DotWriteDifferentVALUWrite -
3353 WaitStatesSinceDef);
3354
 // WAW against an MFMA that wrote an overlapping register.
3355 MFMA = nullptr;
3356 WaitStatesSinceDef =
3357 getWaitStatesSinceDef(Reg, IsHazardDef: IsMFMAWriteFn, Limit: MaxWaitStates);
3358 if (MFMA) {
3359 int NeedWaitStates = MaxWaitStates;
3360 int NumPasses = TSchedModel.computeInstrLatency(MI: MFMA);
3361
3362 if (SIInstrInfo::isDGEMM(Opcode: MFMA->getOpcode())) {
3363 switch (NumPasses) {
3364 case 4:
3365 NeedWaitStates = DMFMA4x4WriteVgprVALUWriteWaitStates;
3366 break;
3367 case 8:
3368 case 16:
3369 NeedWaitStates = DMFMA16x16WriteVgprVALUWriteWaitStates;
3370 break;
3371 default:
3372 llvm_unreachable("unexpected number of cycles for dgemm");
3373 }
3374 } else if (ST.hasGFX940Insts()) {
3375 NeedWaitStates =
3376 TII.isXDL(MI: *MFMA)
3377 ? GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(
3378 NumPasses, IsGFX950: ST.hasGFX950Insts())
3379 : GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(NumPasses);
3380 } else {
3381 switch (NumPasses) {
3382 case 2:
3383 NeedWaitStates = SMFMA4x4WriteVgprVALUWawWaitStates;
3384 break;
3385 case 8:
3386 NeedWaitStates = SMFMA16x16WriteVgprVALUWawWaitStates;
3387 break;
3388 case 16:
3389 NeedWaitStates = SMFMA32x32WriteVgprVALUWawWaitStates;
3390 break;
3391 default:
3392 llvm_unreachable("Unexpected number of passes for mfma");
3393 }
3394 }
3395
3396 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
3397 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
3398
3399 if (WaitStatesNeeded == MaxWaitStates)
3400 break;
3401 }
3402
 // WAR: matches a non-DGEMM MFMA (XDL only when gfx940 instructions are
 // present) that reads Reg and whose src2 is a register overlapping Reg;
 // records it in MFMA.
3403 auto IsSMFMAReadAsCFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
3404 if (!SIInstrInfo::isMFMA(MI) || SIInstrInfo::isDGEMM(Opcode: MI.getOpcode()) ||
3405 !MI.readsRegister(Reg, TRI: &TRI))
3406 return false;
3407
3408 if (ST.hasGFX940Insts() && !TII.isXDL(MI))
3409 return false;
3410
3411 const MachineOperand *SrcC =
3412 TII.getNamedOperand(MI, OperandName: AMDGPU::OpName::src2);
3413 assert(SrcC);
3414 if (!SrcC->isReg() || !TRI.regsOverlap(RegA: SrcC->getReg(), RegB: Reg))
3415 return false;
3416
3417 MFMA = &MI;
3418 return true;
3419 };
3420
3421 MFMA = nullptr;
3422 int WaitStatesSinceUse = getWaitStatesSince(IsHazard: IsSMFMAReadAsCFn,
3423 Limit: MaxWarWaitStates);
3424 if (!MFMA)
3425 continue;
3426
 // WAR requirement keyed on the reader's latency; unlisted latencies use
 // the 32x32 value.
3427 unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MI: MFMA);
3428 int NeedWaitStates = MaxWaitStates;
3429 switch (HazardDefLatency) {
3430 case 2: NeedWaitStates = SMFMA4x4ReadVgprVALUWarWaitStates;
3431 break;
3432 case 4: assert(ST.hasGFX940Insts());
3433 NeedWaitStates = GFX940_XDL4PassReadVgprVALUWarWaitStates;
3434 break;
3435 case 8: NeedWaitStates = SMFMA16x16ReadVgprVALUWarWaitStates;
3436 break;
3437 case 16: [[fallthrough]];
3438 default: NeedWaitStates = SMFMA32x32ReadVgprVALUWarWaitStates;
3439 break;
3440 }
3441
3442 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceUse;
3443 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
3444 }
3445
3446 return WaitStatesNeeded;
3447}
3448
3449bool GCNHazardRecognizer::ShouldPreferAnother(SUnit *SU) const {
3450 if (!SU->isInstr())
3451 return false;
3452
3453 const MachineInstr *MAI = nullptr;
3454
3455 auto IsMFMAFn = [&MAI](const MachineInstr &MI) {
3456 MAI = nullptr;
3457 if (SIInstrInfo::isMFMA(MI))
3458 MAI = &MI;
3459 return MAI != nullptr;
3460 };
3461
3462 MachineInstr *MI = SU->getInstr();
3463 if (IsMFMAFn(*MI)) {
3464 int W = getWaitStatesSince(IsHazard: IsMFMAFn, Limit: 16);
3465 if (MAI)
3466 return W < (int)TSchedModel.computeInstrLatency(MI: MAI);
3467 }
3468
3469 return false;
3470}
3471
3472// Adjust global offsets for instructions bundled with S_GETPC_B64 after
3473// insertion of a new instruction.
3474static void updateGetPCBundle(MachineInstr *NewMI) {
3475 if (!NewMI->isBundled())
3476 return;
3477
3478 // Find start of bundle.
3479 auto I = NewMI->getIterator();
3480 while (I->isBundledWithPred())
3481 I--;
3482 if (I->isBundle())
3483 I++;
3484
3485 // Bail if this is not an S_GETPC bundle.
3486 if (I->getOpcode() != AMDGPU::S_GETPC_B64)
3487 return;
3488
3489 // Update offsets of any references in the bundle.
3490 const unsigned NewBytes = 4;
3491 assert(NewMI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
3492 "Unexpected instruction insertion in bundle");
3493 auto NextMI = std::next(x: NewMI->getIterator());
3494 auto End = NewMI->getParent()->end();
3495 while (NextMI != End && NextMI->isBundledWithPred()) {
3496 for (auto &Operand : NextMI->operands()) {
3497 if (Operand.isGlobal())
3498 Operand.setOffset(Operand.getOffset() + NewBytes);
3499 }
3500 NextMI++;
3501 }
3502}
3503
3504bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
3505 if (!ST.hasVALUMaskWriteHazard())
3506 return false;
3507 assert(!ST.hasExtendedWaitCounts());
3508
3509 if (!ST.isWave64())
3510 return false;
3511
3512 const bool IsSALU = SIInstrInfo::isSALU(MI: *MI);
3513 const bool IsVALU = SIInstrInfo::isVALU(MI: *MI, /*AllowLDSDMA=*/true);
3514 if (!IsSALU && !IsVALU)
3515 return false;
3516
3517 // The hazard sequence is three instructions:
3518 // 1. VALU reads SGPR as mask
3519 // 2. VALU/SALU writes SGPR
3520 // 3. VALU/SALU reads SGPR
3521 // The hazard can expire if the distance between 2 and 3 is sufficient,
3522 // or (2) is VALU and (3) is SALU.
3523 // In practice this happens <10% of the time, hence always assume the hazard
3524 // exists if (1) and (2) are present to avoid searching all SGPR reads.
3525
3526 const SIRegisterInfo *TRI = ST.getRegisterInfo();
3527 const MachineRegisterInfo &MRI = MF.getRegInfo();
3528
3529 auto IgnoreableSGPR = [](const Register Reg) {
3530 switch (Reg) {
3531 case AMDGPU::EXEC:
3532 case AMDGPU::EXEC_LO:
3533 case AMDGPU::EXEC_HI:
3534 case AMDGPU::M0:
3535 case AMDGPU::SGPR_NULL:
3536 case AMDGPU::SGPR_NULL64:
3537 case AMDGPU::SCC:
3538 return true;
3539 default:
3540 return false;
3541 }
3542 };
3543 auto IsVCC = [](const Register Reg) {
3544 return Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::VCC_HI;
3545 };
3546
3547 struct StateType {
3548 SmallSet<Register, 2> HazardSGPRs;
3549
3550 static unsigned getHashValue(const StateType &State) {
3551 return hash_combine_range(R: State.HazardSGPRs);
3552 }
3553 static bool isEqual(const StateType &LHS, const StateType &RHS) {
3554 return LHS.HazardSGPRs == RHS.HazardSGPRs;
3555 }
3556 };
3557
3558 SmallVector<const MachineInstr *> WaitInstrs;
3559 bool HasSGPRRead = false;
3560 StateType InitialState;
3561
3562 // Look for SGPR write.
3563 MachineOperand *HazardDef = nullptr;
3564 for (MachineOperand &Op : MI->operands()) {
3565 if (!Op.isReg())
3566 continue;
3567 if (Op.isDef() && HazardDef)
3568 continue;
3569
3570 Register Reg = Op.getReg();
3571 if (IgnoreableSGPR(Reg))
3572 continue;
3573 if (!IsVCC(Reg)) {
3574 if (Op.isImplicit())
3575 continue;
3576 if (!TRI->isSGPRReg(MRI, Reg))
3577 continue;
3578 }
3579 // Also check for SGPR reads.
3580 if (Op.isUse()) {
3581 HasSGPRRead = true;
3582 continue;
3583 }
3584
3585 assert(!HazardDef);
3586 HazardDef = &Op;
3587 }
3588
3589 if (!HazardDef)
3590 return false;
3591
3592 // Setup to track writes to individual SGPRs
3593 const Register HazardReg = HazardDef->getReg();
3594 if (AMDGPU::SReg_32RegClass.contains(Reg: HazardReg)) {
3595 InitialState.HazardSGPRs.insert(V: HazardReg);
3596 } else {
3597 assert(AMDGPU::SReg_64RegClass.contains(HazardReg));
3598 InitialState.HazardSGPRs.insert(V: TRI->getSubReg(Reg: HazardReg, Idx: AMDGPU::sub0));
3599 InitialState.HazardSGPRs.insert(V: TRI->getSubReg(Reg: HazardReg, Idx: AMDGPU::sub1));
3600 }
3601
3602 auto IsHazardFn = [&](StateType &State, const MachineInstr &I) {
3603 if (State.HazardSGPRs.empty())
3604 return HazardExpired;
3605
3606 switch (I.getOpcode()) {
3607 case AMDGPU::V_ADDC_U32_e32:
3608 case AMDGPU::V_ADDC_U32_dpp:
3609 case AMDGPU::V_CNDMASK_B16_t16_e32:
3610 case AMDGPU::V_CNDMASK_B16_fake16_e32:
3611 case AMDGPU::V_CNDMASK_B16_t16_dpp:
3612 case AMDGPU::V_CNDMASK_B16_fake16_dpp:
3613 case AMDGPU::V_CNDMASK_B32_e32:
3614 case AMDGPU::V_CNDMASK_B32_dpp:
3615 case AMDGPU::V_DIV_FMAS_F32_e64:
3616 case AMDGPU::V_DIV_FMAS_F64_e64:
3617 case AMDGPU::V_SUBB_U32_e32:
3618 case AMDGPU::V_SUBB_U32_dpp:
3619 case AMDGPU::V_SUBBREV_U32_e32:
3620 case AMDGPU::V_SUBBREV_U32_dpp: {
3621 // These implicitly read VCC as mask source.
3622 return IsVCC(HazardReg) ? HazardFound : NoHazardFound;
3623 }
3624 case AMDGPU::V_ADDC_U32_e64:
3625 case AMDGPU::V_ADDC_U32_e64_dpp:
3626 case AMDGPU::V_CNDMASK_B16_t16_e64:
3627 case AMDGPU::V_CNDMASK_B16_fake16_e64:
3628 case AMDGPU::V_CNDMASK_B16_t16_e64_dpp:
3629 case AMDGPU::V_CNDMASK_B16_fake16_e64_dpp:
3630 case AMDGPU::V_CNDMASK_B32_e64:
3631 case AMDGPU::V_CNDMASK_B32_e64_dpp:
3632 case AMDGPU::V_SUBB_U32_e64:
3633 case AMDGPU::V_SUBB_U32_e64_dpp:
3634 case AMDGPU::V_SUBBREV_U32_e64:
3635 case AMDGPU::V_SUBBREV_U32_e64_dpp: {
3636 // Only check mask register overlaps.
3637 const MachineOperand *SSRCOp = TII.getNamedOperand(MI: I, OperandName: AMDGPU::OpName::src2);
3638 assert(SSRCOp);
3639 bool Result = TRI->regsOverlap(RegA: SSRCOp->getReg(), RegB: HazardReg);
3640 return Result ? HazardFound : NoHazardFound;
3641 }
3642 default:
3643 return NoHazardFound;
3644 }
3645 };
3646
3647 const unsigned ConstantMaskBits = AMDGPU::DepCtr::encodeFieldSaSdst(
3648 Encoded: AMDGPU::DepCtr::encodeFieldVaSdst(Encoded: AMDGPU::DepCtr::encodeFieldVaVcc(VaVcc: 0, STI: ST),
3649 VaSdst: 0),
3650 SaSdst: 0);
3651 auto UpdateStateFn = [&](StateType &State, const MachineInstr &I) {
3652 switch (I.getOpcode()) {
3653 case AMDGPU::S_WAITCNT_DEPCTR:
3654 // Record mergable waits within region of instructions free of SGPR reads.
3655 if (!HasSGPRRead && I.getParent() == MI->getParent() && !I.isBundled() &&
3656 (I.getOperand(i: 0).getImm() & ConstantMaskBits) == ConstantMaskBits)
3657 WaitInstrs.push_back(Elt: &I);
3658 break;
3659 default:
3660 // Update tracking of SGPR reads and writes.
3661 for (auto &Op : I.operands()) {
3662 if (!Op.isReg())
3663 continue;
3664
3665 Register Reg = Op.getReg();
3666 if (IgnoreableSGPR(Reg))
3667 continue;
3668 if (!IsVCC(Reg)) {
3669 if (Op.isImplicit())
3670 continue;
3671 if (!TRI->isSGPRReg(MRI, Reg))
3672 continue;
3673 }
3674 if (Op.isUse()) {
3675 HasSGPRRead = true;
3676 continue;
3677 }
3678
3679 // Stop tracking any SGPRs with writes on the basis that they will
3680 // already have an appropriate wait inserted afterwards.
3681 SmallVector<Register, 2> Found;
3682 for (Register SGPR : State.HazardSGPRs) {
3683 if (Reg == SGPR || TRI->regsOverlap(RegA: Reg, RegB: SGPR))
3684 Found.push_back(Elt: SGPR);
3685 }
3686 for (Register SGPR : Found)
3687 State.HazardSGPRs.erase(V: SGPR);
3688 }
3689 break;
3690 }
3691 };
3692
3693 // Check for hazard
3694 if (!hasHazard<StateType>(InitialState, IsHazard: IsHazardFn, UpdateState: UpdateStateFn,
3695 InitialMBB: MI->getParent(),
3696 InitialI: std::next(x: MI->getReverseIterator())))
3697 return false;
3698
3699 // Compute counter mask
3700 unsigned DepCtr =
3701 IsVALU ? (IsVCC(HazardReg) ? AMDGPU::DepCtr::encodeFieldVaVcc(VaVcc: 0, STI: ST)
3702 : AMDGPU::DepCtr::encodeFieldVaSdst(VaSdst: 0, STI: ST))
3703 : AMDGPU::DepCtr::encodeFieldSaSdst(SaSdst: 0, STI: ST);
3704
3705 // Try to merge previous waits into this one for regions with no SGPR reads.
3706 if (!WaitInstrs.empty()) {
3707 // Note: WaitInstrs contains const pointers, so walk backward from MI to
3708 // obtain a mutable pointer to each instruction to be merged.
3709 // This is expected to be a very short walk within the same block.
3710 SmallVector<MachineInstr *> ToErase;
3711 unsigned Found = 0;
3712 for (MachineBasicBlock::reverse_iterator It = MI->getReverseIterator(),
3713 End = MI->getParent()->rend();
3714 Found < WaitInstrs.size() && It != End; ++It) {
3715 MachineInstr *WaitMI = &*It;
3716 // Find next wait instruction.
3717 if (std::as_const(t&: WaitMI) != WaitInstrs[Found])
3718 continue;
3719 Found++;
3720 unsigned WaitMask = WaitMI->getOperand(i: 0).getImm();
3721 assert((WaitMask & ConstantMaskBits) == ConstantMaskBits);
3722 DepCtr = AMDGPU::DepCtr::encodeFieldSaSdst(
3723 Encoded: DepCtr, SaSdst: std::min(a: AMDGPU::DepCtr::decodeFieldSaSdst(Encoded: WaitMask),
3724 b: AMDGPU::DepCtr::decodeFieldSaSdst(Encoded: DepCtr)));
3725 DepCtr = AMDGPU::DepCtr::encodeFieldVaSdst(
3726 Encoded: DepCtr, VaSdst: std::min(a: AMDGPU::DepCtr::decodeFieldVaSdst(Encoded: WaitMask),
3727 b: AMDGPU::DepCtr::decodeFieldVaSdst(Encoded: DepCtr)));
3728 DepCtr = AMDGPU::DepCtr::encodeFieldVaVcc(
3729 Encoded: DepCtr, VaVcc: std::min(a: AMDGPU::DepCtr::decodeFieldVaVcc(Encoded: WaitMask),
3730 b: AMDGPU::DepCtr::decodeFieldVaVcc(Encoded: DepCtr)));
3731 ToErase.push_back(Elt: WaitMI);
3732 }
3733 assert(Found == WaitInstrs.size());
3734 for (MachineInstr *WaitMI : ToErase)
3735 WaitMI->eraseFromParent();
3736 }
3737
3738 // Add s_waitcnt_depctr after SGPR write.
3739 auto NextMI = std::next(x: MI->getIterator());
3740 auto NewMI = BuildMI(BB&: *MI->getParent(), I: NextMI, MIMD: MI->getDebugLoc(),
3741 MCID: TII.get(Opcode: AMDGPU::S_WAITCNT_DEPCTR))
3742 .addImm(Val: DepCtr);
3743
3744 // SALU write may be s_getpc in a bundle.
3745 updateGetPCBundle(NewMI);
3746
3747 return true;
3748}
3749
3750static bool ensureEntrySetPrio(MachineFunction *MF, int Priority,
3751 const SIInstrInfo &TII) {
3752 MachineBasicBlock &EntryMBB = MF->front();
3753 if (EntryMBB.begin() != EntryMBB.end()) {
3754 auto &EntryMI = *EntryMBB.begin();
3755 if (EntryMI.getOpcode() == AMDGPU::S_SETPRIO &&
3756 EntryMI.getOperand(i: 0).getImm() >= Priority)
3757 return false;
3758 }
3759
3760 BuildMI(BB&: EntryMBB, I: EntryMBB.begin(), MIMD: DebugLoc(), MCID: TII.get(Opcode: AMDGPU::S_SETPRIO))
3761 .addImm(Val: Priority);
3762 return true;
3763}
3764
3765bool GCNHazardRecognizer::fixRequiredExportPriority(MachineInstr *MI) {
3766 if (!ST.hasRequiredExportPriority())
3767 return false;
3768
3769 // Assume the following shader types will never have exports,
3770 // and avoid adding or adjusting S_SETPRIO.
3771 MachineBasicBlock *MBB = MI->getParent();
3772 MachineFunction *MF = MBB->getParent();
3773 auto CC = MF->getFunction().getCallingConv();
3774 switch (CC) {
3775 case CallingConv::AMDGPU_CS:
3776 case CallingConv::AMDGPU_CS_Chain:
3777 case CallingConv::AMDGPU_CS_ChainPreserve:
3778 case CallingConv::AMDGPU_KERNEL:
3779 return false;
3780 default:
3781 break;
3782 }
3783
3784 const int MaxPriority = 3;
3785 const int NormalPriority = 2;
3786 const int PostExportPriority = 0;
3787
3788 auto It = MI->getIterator();
3789 switch (MI->getOpcode()) {
3790 case AMDGPU::S_ENDPGM:
3791 case AMDGPU::S_ENDPGM_SAVED:
3792 case AMDGPU::S_ENDPGM_ORDERED_PS_DONE:
3793 case AMDGPU::SI_RETURN_TO_EPILOG:
3794 // Ensure shader with calls raises priority at entry.
3795 // This ensures correct priority if exports exist in callee.
3796 if (MF->getFrameInfo().hasCalls())
3797 return ensureEntrySetPrio(MF, Priority: NormalPriority, TII);
3798 return false;
3799 case AMDGPU::S_SETPRIO: {
3800 // Raise minimum priority unless in workaround.
3801 auto &PrioOp = MI->getOperand(i: 0);
3802 int Prio = PrioOp.getImm();
3803 bool InWA = (Prio == PostExportPriority) &&
3804 (It != MBB->begin() && TII.isEXP(MI: *std::prev(x: It)));
3805 if (InWA || Prio >= NormalPriority)
3806 return false;
3807 PrioOp.setImm(std::min(a: Prio + NormalPriority, b: MaxPriority));
3808 return true;
3809 }
3810 default:
3811 if (!TII.isEXP(MI: *MI))
3812 return false;
3813 break;
3814 }
3815
3816 // Check entry priority at each export (as there will only be a few).
3817 // Note: amdgpu_gfx can only be a callee, so defer to caller setprio.
3818 bool Changed = false;
3819 if (CC != CallingConv::AMDGPU_Gfx && CC != CallingConv::AMDGPU_Gfx_WholeWave)
3820 Changed = ensureEntrySetPrio(MF, Priority: NormalPriority, TII);
3821
3822 auto NextMI = std::next(x: It);
3823 bool EndOfShader = false;
3824 if (NextMI != MBB->end()) {
3825 // Only need WA at end of sequence of exports.
3826 if (TII.isEXP(MI: *NextMI))
3827 return Changed;
3828 // Assume appropriate S_SETPRIO after export means WA already applied.
3829 if (NextMI->getOpcode() == AMDGPU::S_SETPRIO &&
3830 NextMI->getOperand(i: 0).getImm() == PostExportPriority)
3831 return Changed;
3832 EndOfShader = NextMI->getOpcode() == AMDGPU::S_ENDPGM;
3833 }
3834
3835 const DebugLoc &DL = MI->getDebugLoc();
3836
3837 // Lower priority.
3838 BuildMI(BB&: *MBB, I: NextMI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_SETPRIO))
3839 .addImm(Val: PostExportPriority);
3840
3841 if (!EndOfShader) {
3842 // Wait for exports to complete.
3843 BuildMI(BB&: *MBB, I: NextMI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_WAITCNT_EXPCNT))
3844 .addReg(RegNo: AMDGPU::SGPR_NULL)
3845 .addImm(Val: 0);
3846 }
3847
3848 BuildMI(BB&: *MBB, I: NextMI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_NOP)).addImm(Val: 0);
3849 BuildMI(BB&: *MBB, I: NextMI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_NOP)).addImm(Val: 0);
3850
3851 if (!EndOfShader) {
3852 // Return to normal (higher) priority.
3853 BuildMI(BB&: *MBB, I: NextMI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_SETPRIO))
3854 .addImm(Val: NormalPriority);
3855 }
3856
3857 return true;
3858}
3859
3860bool GCNHazardRecognizer::fixGetRegWaitIdle(MachineInstr *MI) {
3861 if (!isSGetReg(Opcode: MI->getOpcode()))
3862 return false;
3863
3864 const SIInstrInfo *TII = ST.getInstrInfo();
3865 switch (getHWReg(TII, RegInstr: *MI)) {
3866 default:
3867 return false;
3868 case AMDGPU::Hwreg::ID_STATUS:
3869 case AMDGPU::Hwreg::ID_STATE_PRIV:
3870 case AMDGPU::Hwreg::ID_EXCP_FLAG_PRIV:
3871 case AMDGPU::Hwreg::ID_EXCP_FLAG_USER:
3872 break;
3873 }
3874
3875 BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(),
3876 MCID: TII->get(Opcode: AMDGPU::S_WAITCNT_DEPCTR))
3877 .addImm(Val: 0);
3878 return true;
3879}
3880
3881bool GCNHazardRecognizer::fixDsAtomicAsyncBarrierArriveB64(MachineInstr *MI) {
3882 if (MI->getOpcode() != AMDGPU::DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64)
3883 return false;
3884
3885 const SIInstrInfo *TII = ST.getInstrInfo();
3886 BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(),
3887 MCID: TII->get(Opcode: AMDGPU::S_WAITCNT_DEPCTR))
3888 .addImm(Val: AMDGPU::DepCtr::encodeFieldVmVsrc(VmVsrc: 0, STI: ST));
3889 BuildMI(BB&: *MI->getParent(), I: std::next(x: MI->getIterator()), MIMD: MI->getDebugLoc(),
3890 MCID: TII->get(Opcode: AMDGPU::S_WAITCNT_DEPCTR))
3891 .addImm(Val: AMDGPU::DepCtr::encodeFieldVmVsrc(VmVsrc: 0, STI: ST));
3892
3893 return true;
3894}
3895
3896bool GCNHazardRecognizer::fixScratchBaseForwardingHazard(MachineInstr *MI) {
3897 // No reason to check this in pre-RA scheduling, SGPRs have to be allocated
3898 // for hazard to trigger.
3899 if (!IsHazardRecognizerMode)
3900 return false;
3901
3902 const SIRegisterInfo *TRI = ST.getRegisterInfo();
3903 const SIInstrInfo *TII = ST.getInstrInfo();
3904 // Hazard expires after 10 SGPR writes by SALU or 8 SGPR writes by VALU.
3905 const int FlatScrBaseWaitStates = 10;
3906
3907 bool ReadsFlatScrLo =
3908 MI->readsRegister(Reg: AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, TRI);
3909 bool ReadsFlatScrHi =
3910 MI->readsRegister(Reg: AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, TRI);
3911 if (isSGetReg(Opcode: MI->getOpcode())) {
3912 switch (getHWReg(TII, RegInstr: *MI)) {
3913 default:
3914 break;
3915 case AMDGPU::Hwreg::ID_FLAT_SCR_LO:
3916 ReadsFlatScrLo = true;
3917 break;
3918 case AMDGPU::Hwreg::ID_FLAT_SCR_HI:
3919 ReadsFlatScrHi = true;
3920 break;
3921 }
3922 }
3923
3924 const MachineRegisterInfo &MRI = MF.getRegInfo();
3925
3926 auto IsRegDefHazard = [&](Register Reg) -> bool {
3927 DenseSet<const MachineBasicBlock *> Visited;
3928 auto IsHazardFn = [TRI, Reg](const MachineInstr &MI) {
3929 return MI.modifiesRegister(Reg, TRI);
3930 };
3931
3932 // This literally abuses the idea of waitstates. Instead of waitstates it
3933 // returns 1 for SGPR written and 0 otherwise.
3934 auto IsSGPRDef = [TII, TRI, &MRI](const MachineInstr &MI) -> unsigned {
3935 if (!TII->isSALU(MI) && !TII->isVALU(MI, /*AllowLDSDMA=*/true))
3936 return 0;
3937 for (const MachineOperand &MO : MI.all_defs()) {
3938 if (TRI->isSGPRReg(MRI, Reg: MO.getReg()))
3939 return 1;
3940 }
3941 return 0;
3942 };
3943
3944 auto IsExpiredFn = [=](const MachineInstr &MI, int SgprWrites) {
3945 if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR) {
3946 unsigned Wait = MI.getOperand(i: 0).getImm();
3947 if (AMDGPU::DepCtr::decodeFieldSaSdst(Encoded: Wait) == 0 &&
3948 AMDGPU::DepCtr::decodeFieldVaSdst(Encoded: Wait) == 0)
3949 return true;
3950 }
3951 return SgprWrites >= FlatScrBaseWaitStates;
3952 };
3953
3954 return ::getWaitStatesSince(
3955 IsHazard: IsHazardFn, MBB: MI->getParent(), I: std::next(x: MI->getReverseIterator()),
3956 WaitStates: 0, IsExpired: IsExpiredFn, Visited, GetNumWaitStates: IsSGPRDef) < FlatScrBaseWaitStates;
3957 };
3958
3959 if ((!ReadsFlatScrLo || MRI.isConstantPhysReg(PhysReg: AMDGPU::SGPR102) ||
3960 !IsRegDefHazard(AMDGPU::SGPR102)) &&
3961 (!ReadsFlatScrHi || MRI.isConstantPhysReg(PhysReg: AMDGPU::SGPR103) ||
3962 !IsRegDefHazard(AMDGPU::SGPR103)))
3963 return false;
3964
3965 BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(),
3966 MCID: TII->get(Opcode: AMDGPU::S_WAITCNT_DEPCTR))
3967 .addImm(Val: AMDGPU::DepCtr::encodeFieldVaSdst(
3968 Encoded: AMDGPU::DepCtr::encodeFieldSaSdst(SaSdst: 0, STI: ST), VaSdst: 0));
3969 return true;
3970}
3971
3972bool GCNHazardRecognizer::fixSetRegMode(MachineInstr *MI) {
3973 if (!isSSetReg(Opcode: MI->getOpcode()) ||
3974 MI->getOperand(i: 1).getImm() != AMDGPU::Hwreg::ID_MODE)
3975 return false;
3976
3977 BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(), MCID: TII.get(Opcode: AMDGPU::V_NOP_e32));
3978 BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(), MCID: TII.get(Opcode: AMDGPU::V_NOP_e32));
3979 return true;
3980}
3981