//===-- GCNHazardRecognizers.cpp - GCN Hazard Recognizer Impls ------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements hazard recognizers for scheduling on GCN processors.
//
//===----------------------------------------------------------------------===//

#include "GCNHazardRecognizer.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/TargetParser/TargetParser.h"

using namespace llvm;
23
namespace {

struct MFMAPaddingRatioParser : public cl::parser<unsigned> {
  MFMAPaddingRatioParser(cl::Option &O) : cl::parser<unsigned>(O) {}

  bool parse(cl::Option &O, StringRef ArgName, StringRef Arg, unsigned &Value) {
    if (Arg.getAsInteger(0, Value))
      return O.error("'" + Arg + "' value invalid for uint argument!");

    if (Value > 100)
      return O.error("'" + Arg + "' value must be in the range [0, 100]!");

    return false;
  }
};

} // end anonymous namespace
41
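// Accepts a percentage in [0, 100]; e.g. -amdgpu-mfma-padding-ratio=50 fills
// roughly half of the latency between neighboring MFMAs with s_nops. Values
// outside the range are rejected by the parser above.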
static cl::opt<unsigned, false, MFMAPaddingRatioParser>
    MFMAPaddingRatio("amdgpu-mfma-padding-ratio", cl::init(0), cl::Hidden,
                     cl::desc("Fill a percentage of the latency between "
                              "neighboring MFMA with s_nops."));

// This is intended for debugging purposes only.
static cl::opt<unsigned>
    NopPadding("amdgpu-snop-padding", cl::init(0), cl::Hidden,
               cl::desc("Insert a s_nop x before every instruction"));
51
52//===----------------------------------------------------------------------===//
53// Hazard Recognizer Implementation
54//===----------------------------------------------------------------------===//
55
56static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
57 const GCNSubtarget &ST);
58
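// The look-ahead window is widened from 5 to 19 slots when any AGPR is in
// use, since MFMA-related hazards can require looking further back than the
// other hazards tracked here.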
59GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF)
60 : IsHazardRecognizerMode(false), CurrCycleInstr(nullptr), MF(MF),
61 ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()),
62 TRI(TII.getRegisterInfo()), TSchedModel(TII.getSchedModel()),
63 ClauseUses(TRI.getNumRegUnits()), ClauseDefs(TRI.getNumRegUnits()) {
64 MaxLookAhead = MF.getRegInfo().isPhysRegUsed(PhysReg: AMDGPU::AGPR0) ? 19 : 5;
65 RunLdsBranchVmemWARHazardFixup = shouldRunLdsBranchVmemWARHazardFixup(MF, ST);
66}
67
68void GCNHazardRecognizer::Reset() {
69 EmittedInstrs.clear();
70}
71
72void GCNHazardRecognizer::EmitInstruction(SUnit *SU) {
73 EmitInstruction(MI: SU->getInstr());
74}
75
76void GCNHazardRecognizer::EmitInstruction(MachineInstr *MI) {
77 CurrCycleInstr = MI;
78}
79
static bool isDivFMas(unsigned Opcode) {
  return Opcode == AMDGPU::V_DIV_FMAS_F32_e64 ||
         Opcode == AMDGPU::V_DIV_FMAS_F64_e64;
}
83
84static bool isSGetReg(unsigned Opcode) {
85 return Opcode == AMDGPU::S_GETREG_B32;
86}
87
88static bool isSSetReg(unsigned Opcode) {
89 switch (Opcode) {
90 case AMDGPU::S_SETREG_B32:
91 case AMDGPU::S_SETREG_B32_mode:
92 case AMDGPU::S_SETREG_IMM32_B32:
93 case AMDGPU::S_SETREG_IMM32_B32_mode:
94 return true;
95 }
96 return false;
97}
98
99static bool isRWLane(unsigned Opcode) {
100 return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;
101}
102
103static bool isRFE(unsigned Opcode) {
104 return Opcode == AMDGPU::S_RFE_B64;
105}
106
107static bool isSMovRel(unsigned Opcode) {
108 switch (Opcode) {
109 case AMDGPU::S_MOVRELS_B32:
110 case AMDGPU::S_MOVRELS_B64:
111 case AMDGPU::S_MOVRELD_B32:
112 case AMDGPU::S_MOVRELD_B64:
113 return true;
114 default:
115 return false;
116 }
117}
118
119static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII,
120 const MachineInstr &MI) {
121 if (TII.isAlwaysGDS(Opcode: MI.getOpcode()))
122 return true;
123
124 switch (MI.getOpcode()) {
125 case AMDGPU::S_SENDMSG:
126 case AMDGPU::S_SENDMSGHALT:
127 case AMDGPU::S_TTRACEDATA:
128 return true;
129 // These DS opcodes don't support GDS.
130 case AMDGPU::DS_NOP:
131 case AMDGPU::DS_PERMUTE_B32:
132 case AMDGPU::DS_BPERMUTE_B32:
133 return false;
134 default:
135 if (TII.isDS(Opcode: MI.getOpcode())) {
136 int GDS = AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(),
137 Name: AMDGPU::OpName::gds);
138 if (MI.getOperand(i: GDS).getImm())
139 return true;
140 }
141 return false;
142 }
143}
144
145static bool isPermlane(const MachineInstr &MI) {
146 unsigned Opcode = MI.getOpcode();
147 return Opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
148 Opcode == AMDGPU::V_PERMLANE64_B32 ||
149 Opcode == AMDGPU::V_PERMLANEX16_B32_e64 ||
150 Opcode == AMDGPU::V_PERMLANE16_VAR_B32_e64 ||
151 Opcode == AMDGPU::V_PERMLANEX16_VAR_B32_e64 ||
152 Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e32 ||
153 Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e64 ||
154 Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e32 ||
155 Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e64;
156}
157
158static bool isLdsDma(const MachineInstr &MI) {
159 return SIInstrInfo::isVALU(MI) &&
160 (SIInstrInfo::isMUBUF(MI) || SIInstrInfo::isFLAT(MI));
161}
162
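// Decode the hardware register id from the simm16 operand of an S_GETREG /
// S_SETREG style instruction.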
163static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
164 const MachineOperand *RegOp = TII->getNamedOperand(MI: RegInstr,
165 OperandName: AMDGPU::OpName::simm16);
166 return std::get<0>(t: AMDGPU::Hwreg::HwregEncoding::decode(Encoded: RegOp->getImm()));
167}
168
169ScheduleHazardRecognizer::HazardType
170GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
171 MachineInstr *MI = SU->getInstr();
172 // If we are not in "HazardRecognizerMode" and therefore not being run from
173 // the scheduler, track possible stalls from hazards but don't insert noops.
174 auto HazardType = IsHazardRecognizerMode ? NoopHazard : Hazard;
175
176 if (MI->isBundle())
177 return NoHazard;
178
179 if (SIInstrInfo::isSMRD(MI: *MI) && checkSMRDHazards(SMRD: MI) > 0)
180 return HazardType;
181
182 if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0)
183 return HazardType;
184
185 if (checkFPAtomicToDenormModeHazard(MI) > 0)
186 return HazardType;
187
188 if (ST.hasNoDataDepHazard())
189 return NoHazard;
190
191 if (SIInstrInfo::isVMEM(MI: *MI) && checkVMEMHazards(VMEM: MI) > 0)
192 return HazardType;
193
194 if (SIInstrInfo::isVALU(MI: *MI) && checkVALUHazards(VALU: MI) > 0)
195 return HazardType;
196
197 if (SIInstrInfo::isDPP(MI: *MI) && checkDPPHazards(DPP: MI) > 0)
198 return HazardType;
199
200 if (isDivFMas(Opcode: MI->getOpcode()) && checkDivFMasHazards(DivFMas: MI) > 0)
201 return HazardType;
202
203 if (isRWLane(Opcode: MI->getOpcode()) && checkRWLaneHazards(RWLane: MI) > 0)
204 return HazardType;
205
206 if ((SIInstrInfo::isVALU(MI: *MI) || SIInstrInfo::isVMEM(MI: *MI) ||
207 SIInstrInfo::isDS(MI: *MI) || SIInstrInfo::isEXP(MI: *MI)) &&
208 checkMAIVALUHazards(MI) > 0)
209 return HazardType;
210
211 if (isSGetReg(Opcode: MI->getOpcode()) && checkGetRegHazards(GetRegInstr: MI) > 0)
212 return HazardType;
213
214 if (isSSetReg(Opcode: MI->getOpcode()) && checkSetRegHazards(SetRegInstr: MI) > 0)
215 return HazardType;
216
217 if (isRFE(Opcode: MI->getOpcode()) && checkRFEHazards(RFE: MI) > 0)
218 return HazardType;
219
220 if (((ST.hasReadM0MovRelInterpHazard() &&
221 (TII.isVINTRP(MI: *MI) || isSMovRel(Opcode: MI->getOpcode()) ||
222 MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
223 MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
224 (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, MI: *MI)) ||
225 (ST.hasReadM0LdsDmaHazard() && isLdsDma(MI: *MI)) ||
226 (ST.hasReadM0LdsDirectHazard() &&
227 MI->readsRegister(Reg: AMDGPU::LDS_DIRECT, /*TRI=*/nullptr))) &&
228 checkReadM0Hazards(SMovRel: MI) > 0)
229 return HazardType;
230
231 if (SIInstrInfo::isMAI(MI: *MI) && checkMAIHazards(MI) > 0)
232 return HazardType;
233
234 if ((SIInstrInfo::isVMEM(MI: *MI) || SIInstrInfo::isDS(MI: *MI)) &&
235 checkMAILdStHazards(MI) > 0)
236 return HazardType;
237
238 if (MI->isInlineAsm() && checkInlineAsmHazards(IA: MI) > 0)
239 return HazardType;
240
241 return NoHazard;
242}
243
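// Emit \p Quantity wait states as s_nop instructions directly before \p MI.
// S_NOP's immediate encodes (cycles - 1), so the nops are emitted in chunks
// of at most 8 cycles each.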
244static void insertNoopsInBundle(MachineInstr *MI, const SIInstrInfo &TII,
245 unsigned Quantity) {
246 while (Quantity > 0) {
247 unsigned Arg = std::min(a: Quantity, b: 8u);
248 Quantity -= Arg;
249 BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(), MCID: TII.get(Opcode: AMDGPU::S_NOP))
250 .addImm(Val: Arg - 1);
251 }
252}
253
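// Number of cycles before the result of an MFMA is released, taken from the
// first write resource of the instruction's scheduling class.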
254unsigned
255GCNHazardRecognizer::getMFMAPipelineWaitStates(const MachineInstr &MI) const {
256 const MCSchedClassDesc *SC = TSchedModel.resolveSchedClass(MI: &MI);
257 assert(TSchedModel.getWriteProcResBegin(SC) !=
258 TSchedModel.getWriteProcResEnd(SC));
259 return TSchedModel.getWriteProcResBegin(SC)->ReleaseAtCycle;
260}
261
262void GCNHazardRecognizer::processBundle() {
263 MachineBasicBlock::instr_iterator MI = std::next(x: CurrCycleInstr->getIterator());
264 MachineBasicBlock::instr_iterator E = CurrCycleInstr->getParent()->instr_end();
265 // Check bundled MachineInstr's for hazards.
266 for (; MI != E && MI->isInsideBundle(); ++MI) {
267 CurrCycleInstr = &*MI;
268 unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr);
269
270 if (IsHazardRecognizerMode) {
271 fixHazards(MI: CurrCycleInstr);
272
273 insertNoopsInBundle(MI: CurrCycleInstr, TII, Quantity: WaitStates);
274 }
275
    // It's unnecessary to track more than MaxLookAhead instructions. Since we
277 // include the bundled MI directly after, only add a maximum of
278 // (MaxLookAhead - 1) noops to EmittedInstrs.
279 for (unsigned i = 0, e = std::min(a: WaitStates, b: MaxLookAhead - 1); i < e; ++i)
280 EmittedInstrs.push_front(x: nullptr);
281
282 EmittedInstrs.push_front(x: CurrCycleInstr);
283 EmittedInstrs.resize(new_size: MaxLookAhead);
284 }
285 CurrCycleInstr = nullptr;
286}
287
288void GCNHazardRecognizer::runOnInstruction(MachineInstr *MI) {
289 assert(IsHazardRecognizerMode);
290
291 unsigned NumPreNoops = PreEmitNoops(MI);
292 EmitNoops(Quantity: NumPreNoops);
293 if (MI->isInsideBundle())
294 insertNoopsInBundle(MI, TII, Quantity: NumPreNoops);
295 else
296 TII.insertNoops(MBB&: *MI->getParent(), MI: MachineBasicBlock::iterator(MI),
297 Quantity: NumPreNoops);
298 EmitInstruction(MI);
299 AdvanceCycle();
300}
301
302unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
303 IsHazardRecognizerMode = true;
304 CurrCycleInstr = MI;
305 unsigned W = PreEmitNoopsCommon(MI);
306 fixHazards(MI);
307 CurrCycleInstr = nullptr;
308 return std::max(a: W, b: NopPadding.getValue());
309}
310
311unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
312 if (MI->isBundle())
313 return 0;
314
315 int WaitStates = 0;
316
317 if (SIInstrInfo::isSMRD(MI: *MI))
318 return std::max(a: WaitStates, b: checkSMRDHazards(SMRD: MI));
319
320 if (ST.hasNSAtoVMEMBug())
321 WaitStates = std::max(a: WaitStates, b: checkNSAtoVMEMHazard(MI));
322
323 WaitStates = std::max(a: WaitStates, b: checkFPAtomicToDenormModeHazard(MI));
324
325 if (ST.hasNoDataDepHazard())
326 return WaitStates;
327
328 if (SIInstrInfo::isVMEM(MI: *MI))
329 WaitStates = std::max(a: WaitStates, b: checkVMEMHazards(VMEM: MI));
330
331 if (SIInstrInfo::isVALU(MI: *MI))
332 WaitStates = std::max(a: WaitStates, b: checkVALUHazards(VALU: MI));
333
334 if (SIInstrInfo::isDPP(MI: *MI))
335 WaitStates = std::max(a: WaitStates, b: checkDPPHazards(DPP: MI));
336
337 if (isDivFMas(Opcode: MI->getOpcode()))
338 WaitStates = std::max(a: WaitStates, b: checkDivFMasHazards(DivFMas: MI));
339
340 if (isRWLane(Opcode: MI->getOpcode()))
341 WaitStates = std::max(a: WaitStates, b: checkRWLaneHazards(RWLane: MI));
342
343 if ((SIInstrInfo::isVALU(MI: *MI) || SIInstrInfo::isVMEM(MI: *MI) ||
344 SIInstrInfo::isDS(MI: *MI) || SIInstrInfo::isEXP(MI: *MI)) &&
345 checkMAIVALUHazards(MI) > 0)
346 WaitStates = std::max(a: WaitStates, b: checkMAIVALUHazards(MI));
347
348 if (MI->isInlineAsm())
349 return std::max(a: WaitStates, b: checkInlineAsmHazards(IA: MI));
350
351 if (isSGetReg(Opcode: MI->getOpcode()))
352 return std::max(a: WaitStates, b: checkGetRegHazards(GetRegInstr: MI));
353
354 if (isSSetReg(Opcode: MI->getOpcode()))
355 return std::max(a: WaitStates, b: checkSetRegHazards(SetRegInstr: MI));
356
357 if (isRFE(Opcode: MI->getOpcode()))
358 return std::max(a: WaitStates, b: checkRFEHazards(RFE: MI));
359
360 if ((ST.hasReadM0MovRelInterpHazard() &&
361 (TII.isVINTRP(MI: *MI) || isSMovRel(Opcode: MI->getOpcode()) ||
362 MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
363 MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
364 (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, MI: *MI)) ||
365 (ST.hasReadM0LdsDmaHazard() && isLdsDma(MI: *MI)) ||
366 (ST.hasReadM0LdsDirectHazard() &&
367 MI->readsRegister(Reg: AMDGPU::LDS_DIRECT, /*TRI=*/nullptr)))
368 return std::max(a: WaitStates, b: checkReadM0Hazards(SMovRel: MI));
369
370 if (SIInstrInfo::isMAI(MI: *MI))
371 return std::max(a: WaitStates, b: checkMAIHazards(MI));
372
373 if (SIInstrInfo::isVMEM(MI: *MI) || SIInstrInfo::isDS(MI: *MI))
374 return std::max(a: WaitStates, b: checkMAILdStHazards(MI));
375
376 if (ST.hasGFX950Insts() && isPermlane(MI: *MI))
377 return std::max(a: WaitStates, b: checkPermlaneHazards(MI));
378
379 return WaitStates;
380}
381
382void GCNHazardRecognizer::EmitNoop() {
383 EmittedInstrs.push_front(x: nullptr);
384}
385
386void GCNHazardRecognizer::AdvanceCycle() {
387 // When the scheduler detects a stall, it will call AdvanceCycle() without
388 // emitting any instructions.
389 if (!CurrCycleInstr) {
390 EmittedInstrs.push_front(x: nullptr);
391 return;
392 }
393
394 if (CurrCycleInstr->isBundle()) {
395 processBundle();
396 return;
397 }
398
399 unsigned NumWaitStates = TII.getNumWaitStates(MI: *CurrCycleInstr);
400 if (!NumWaitStates) {
401 CurrCycleInstr = nullptr;
402 return;
403 }
404
405 // Keep track of emitted instructions
406 EmittedInstrs.push_front(x: CurrCycleInstr);
407
408 // Add a nullptr for each additional wait state after the first. Make sure
409 // not to add more than getMaxLookAhead() items to the list, since we
410 // truncate the list to that size right after this loop.
411 for (unsigned i = 1, e = std::min(a: NumWaitStates, b: getMaxLookAhead());
412 i < e; ++i) {
413 EmittedInstrs.push_front(x: nullptr);
414 }
415
416 // getMaxLookahead() is the largest number of wait states we will ever need
417 // to insert, so there is no point in keeping track of more than that many
418 // wait states.
419 EmittedInstrs.resize(new_size: getMaxLookAhead());
420
421 CurrCycleInstr = nullptr;
422}
423
424void GCNHazardRecognizer::RecedeCycle() {
425 assert(!IsHazardRecognizerMode &&
426 "Bottom-up scheduling shouldn't run in hazard recognizer mode");
427}
428
429//===----------------------------------------------------------------------===//
430// Helper Functions
431//===----------------------------------------------------------------------===//
432
433using HazardFnResult = enum { HazardFound, HazardExpired, NoHazardFound };
434
435using IsExpiredFn = function_ref<bool(const MachineInstr &, int WaitStates)>;
436using GetNumWaitStatesFn = function_ref<unsigned int(const MachineInstr &)>;
437
438// Search for a hazard in a block and its predecessors.
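// IsHazard classifies each instruction as found / expired / keep searching,
// while UpdateState folds every non-meta, non-asm instruction into the
// per-path StateT, which is copied for each predecessor that is walked.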
439template <typename StateT>
440static bool
441hasHazard(StateT State,
442 function_ref<HazardFnResult(StateT &, const MachineInstr &)> IsHazard,
443 function_ref<void(StateT &, const MachineInstr &)> UpdateState,
444 const MachineBasicBlock *MBB,
445 MachineBasicBlock::const_reverse_instr_iterator I,
446 DenseSet<const MachineBasicBlock *> &Visited) {
447 for (auto E = MBB->instr_rend(); I != E; ++I) {
448 // No need to look at parent BUNDLE instructions.
449 if (I->isBundle())
450 continue;
451
452 switch (IsHazard(State, *I)) {
453 case HazardFound:
454 return true;
455 case HazardExpired:
456 return false;
457 default:
458 // Continue search
459 break;
460 }
461
462 if (I->isInlineAsm() || I->isMetaInstruction())
463 continue;
464
465 UpdateState(State, *I);
466 }
467
468 for (MachineBasicBlock *Pred : MBB->predecessors()) {
469 if (!Visited.insert(V: Pred).second)
470 continue;
471
472 if (hasHazard(State, IsHazard, UpdateState, Pred, Pred->instr_rbegin(),
473 Visited))
474 return true;
475 }
476
477 return false;
478}
479
// Returns the minimum number of wait states since \p I, walking all
// predecessors. Only scans until \p IsExpired returns true.
// Can only be run in hazard recognizer mode.
483static int getWaitStatesSince(
484 GCNHazardRecognizer::IsHazardFn IsHazard, const MachineBasicBlock *MBB,
485 MachineBasicBlock::const_reverse_instr_iterator I, int WaitStates,
486 IsExpiredFn IsExpired, DenseSet<const MachineBasicBlock *> &Visited,
487 GetNumWaitStatesFn GetNumWaitStates = SIInstrInfo::getNumWaitStates) {
488 for (auto E = MBB->instr_rend(); I != E; ++I) {
489 // Don't add WaitStates for parent BUNDLE instructions.
490 if (I->isBundle())
491 continue;
492
493 if (IsHazard(*I))
494 return WaitStates;
495
496 if (I->isInlineAsm())
497 continue;
498
499 WaitStates += GetNumWaitStates(*I);
500
501 if (IsExpired(*I, WaitStates))
502 return std::numeric_limits<int>::max();
503 }
504
505 int MinWaitStates = std::numeric_limits<int>::max();
506 for (MachineBasicBlock *Pred : MBB->predecessors()) {
507 if (!Visited.insert(V: Pred).second)
508 continue;
509
510 int W = getWaitStatesSince(IsHazard, MBB: Pred, I: Pred->instr_rbegin(), WaitStates,
511 IsExpired, Visited, GetNumWaitStates);
512
513 MinWaitStates = std::min(a: MinWaitStates, b: W);
514 }
515
516 return MinWaitStates;
517}
518
519static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
520 const MachineInstr *MI, IsExpiredFn IsExpired) {
521 DenseSet<const MachineBasicBlock *> Visited;
522 return getWaitStatesSince(IsHazard, MBB: MI->getParent(),
523 I: std::next(x: MI->getReverseIterator()),
524 WaitStates: 0, IsExpired, Visited);
525}
526
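// In hazard recognizer mode, walk the CFG backwards from CurrCycleInstr;
// otherwise consult the EmittedInstrs queue maintained by AdvanceCycle().
// Either way the result is INT_MAX if no hazard is found within Limit.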
527int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
528 if (IsHazardRecognizerMode) {
529 auto IsExpiredFn = [Limit](const MachineInstr &, int WaitStates) {
530 return WaitStates >= Limit;
531 };
532 return ::getWaitStatesSince(IsHazard, MI: CurrCycleInstr, IsExpired: IsExpiredFn);
533 }
534
535 int WaitStates = 0;
536 for (MachineInstr *MI : EmittedInstrs) {
537 if (MI) {
538 if (IsHazard(*MI))
539 return WaitStates;
540
541 if (MI->isInlineAsm())
542 continue;
543 }
544 ++WaitStates;
545
546 if (WaitStates >= Limit)
547 break;
548 }
549 return std::numeric_limits<int>::max();
550}
551
552int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
553 IsHazardFn IsHazardDef,
554 int Limit) {
555 const SIRegisterInfo *TRI = ST.getRegisterInfo();
556
557 auto IsHazardFn = [IsHazardDef, TRI, Reg](const MachineInstr &MI) {
558 return IsHazardDef(MI) && MI.modifiesRegister(Reg, TRI);
559 };
560
561 return getWaitStatesSince(IsHazard: IsHazardFn, Limit);
562}
563
564int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
565 int Limit) {
566 auto IsHazardFn = [IsHazard](const MachineInstr &MI) {
567 return isSSetReg(Opcode: MI.getOpcode()) && IsHazard(MI);
568 };
569
570 return getWaitStatesSince(IsHazard: IsHazardFn, Limit);
571}
572
573//===----------------------------------------------------------------------===//
574// No-op Hazard Detection
575//===----------------------------------------------------------------------===//
576
577static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV,
578 MCRegister Reg) {
579 for (MCRegUnit Unit : TRI.regunits(Reg))
580 BV.set(Unit);
581}
582
583static void addRegsToSet(const SIRegisterInfo &TRI,
584 iterator_range<MachineInstr::const_mop_iterator> Ops,
585 BitVector &DefSet, BitVector &UseSet) {
586 for (const MachineOperand &Op : Ops) {
587 if (Op.isReg())
588 addRegUnits(TRI, BV&: Op.isDef() ? DefSet : UseSet, Reg: Op.getReg().asMCReg());
589 }
590}
591
592void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) {
593 addRegsToSet(TRI, Ops: MI.operands(), DefSet&: ClauseDefs, UseSet&: ClauseUses);
594}
595
596static bool breaksSMEMSoftClause(MachineInstr *MI) {
597 return !SIInstrInfo::isSMRD(MI: *MI);
598}
599
600static bool breaksVMEMSoftClause(MachineInstr *MI) {
601 return !SIInstrInfo::isVMEM(MI: *MI);
602}
603
604int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
  // SMEM soft clauses are only present on VI+, and only matter if xnack is
  // enabled.
607 if (!ST.isXNACKEnabled())
608 return 0;
609
610 bool IsSMRD = TII.isSMRD(MI: *MEM);
611
612 resetClause();
613
614 // A soft-clause is any group of consecutive SMEM instructions. The
615 // instructions in this group may return out of order and/or may be
616 // replayed (i.e. the same instruction issued more than once).
617 //
618 // In order to handle these situations correctly we need to make sure that
619 // when a clause has more than one instruction, no instruction in the clause
620 // writes to a register that is read by another instruction in the clause
  // (including itself). If we encounter this situation, we need to break the
  // clause by inserting a non-SMEM instruction.
623
624 for (MachineInstr *MI : EmittedInstrs) {
625 // When we hit a non-SMEM instruction then we have passed the start of the
626 // clause and we can stop.
627 if (!MI)
628 break;
629
630 if (IsSMRD ? breaksSMEMSoftClause(MI) : breaksVMEMSoftClause(MI))
631 break;
632
633 addClauseInst(MI: *MI);
634 }
635
636 if (ClauseDefs.none())
637 return 0;
638
639 // We need to make sure not to put loads and stores in the same clause if they
640 // use the same address. For now, just start a new clause whenever we see a
641 // store.
642 if (MEM->mayStore())
643 return 1;
644
645 addClauseInst(MI: *MEM);
646
647 // If the set of defs and uses intersect then we cannot add this instruction
648 // to the clause, so we have a hazard.
649 return ClauseDefs.anyCommon(RHS: ClauseUses) ? 1 : 0;
650}
651
652int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
653 int WaitStatesNeeded = 0;
654
655 WaitStatesNeeded = checkSoftClauseHazards(MEM: SMRD);
656
657 // This SMRD hazard only affects SI.
658 if (!ST.hasSMRDReadVALUDefHazard())
659 return WaitStatesNeeded;
660
  // A read of an SGPR by an SMRD instruction requires 4 wait states when the
  // SGPR was written by a VALU instruction.
663 int SmrdSgprWaitStates = 4;
664 auto IsHazardDefFn = [this](const MachineInstr &MI) {
665 return TII.isVALU(MI);
666 };
667 auto IsBufferHazardDefFn = [this](const MachineInstr &MI) {
668 return TII.isSALU(MI);
669 };
670
671 bool IsBufferSMRD = TII.isBufferSMRD(MI: *SMRD);
672
673 for (const MachineOperand &Use : SMRD->uses()) {
674 if (!Use.isReg())
675 continue;
676 int WaitStatesNeededForUse =
677 SmrdSgprWaitStates - getWaitStatesSinceDef(Reg: Use.getReg(), IsHazardDef: IsHazardDefFn,
678 Limit: SmrdSgprWaitStates);
679 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
680
    // This fixes what appears to be undocumented hardware behavior in SI where
    // an s_mov writing a descriptor and an s_buffer_load_dword reading that
    // descriptor need some number of nops in between. We don't know how many
    // we need, but let's use 4. This wasn't discovered before probably because
    // the only case where this happens is when we expand a 64-bit pointer into
    // a full descriptor and use s_buffer_load_dword instead of s_load_dword,
    // which was probably never encountered in closed-source land.
688 if (IsBufferSMRD) {
689 int WaitStatesNeededForUse =
690 SmrdSgprWaitStates - getWaitStatesSinceDef(Reg: Use.getReg(),
691 IsHazardDef: IsBufferHazardDefFn,
692 Limit: SmrdSgprWaitStates);
693 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
694 }
695 }
696
697 return WaitStatesNeeded;
698}
699
700int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) {
701 if (!ST.hasVMEMReadSGPRVALUDefHazard())
702 return 0;
703
704 int WaitStatesNeeded = checkSoftClauseHazards(MEM: VMEM);
705
706 // A read of an SGPR by a VMEM instruction requires 5 wait states when the
707 // SGPR was written by a VALU Instruction.
708 const int VmemSgprWaitStates = 5;
709 auto IsHazardDefFn = [this](const MachineInstr &MI) {
710 return TII.isVALU(MI);
711 };
712 for (const MachineOperand &Use : VMEM->uses()) {
713 if (!Use.isReg() || TRI.isVectorRegister(MRI: MF.getRegInfo(), Reg: Use.getReg()))
714 continue;
715
716 int WaitStatesNeededForUse =
717 VmemSgprWaitStates - getWaitStatesSinceDef(Reg: Use.getReg(), IsHazardDef: IsHazardDefFn,
718 Limit: VmemSgprWaitStates);
719 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
720 }
721 return WaitStatesNeeded;
722}
723
724int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) {
725 const SIRegisterInfo *TRI = ST.getRegisterInfo();
726 const SIInstrInfo *TII = ST.getInstrInfo();
727
728 // Check for DPP VGPR read after VALU VGPR write and EXEC write.
729 int DppVgprWaitStates = 2;
730 int DppExecWaitStates = 5;
731 int WaitStatesNeeded = 0;
732 auto IsHazardDefFn = [TII](const MachineInstr &MI) {
733 return TII->isVALU(MI);
734 };
735
736 for (const MachineOperand &Use : DPP->uses()) {
737 if (!Use.isReg() || !TRI->isVGPR(MRI: MF.getRegInfo(), Reg: Use.getReg()))
738 continue;
739 int WaitStatesNeededForUse =
740 DppVgprWaitStates - getWaitStatesSinceDef(
741 Reg: Use.getReg(),
742 IsHazardDef: [](const MachineInstr &) { return true; },
743 Limit: DppVgprWaitStates);
744 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
745 }
746
747 WaitStatesNeeded = std::max(
748 a: WaitStatesNeeded,
749 b: DppExecWaitStates - getWaitStatesSinceDef(Reg: AMDGPU::EXEC, IsHazardDef: IsHazardDefFn,
750 Limit: DppExecWaitStates));
751
752 return WaitStatesNeeded;
753}
754
755int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) {
756 const SIInstrInfo *TII = ST.getInstrInfo();
757
758 // v_div_fmas requires 4 wait states after a write to vcc from a VALU
759 // instruction.
760 const int DivFMasWaitStates = 4;
761 auto IsHazardDefFn = [TII](const MachineInstr &MI) {
762 return TII->isVALU(MI);
763 };
764 int WaitStatesNeeded = getWaitStatesSinceDef(Reg: AMDGPU::VCC, IsHazardDef: IsHazardDefFn,
765 Limit: DivFMasWaitStates);
766
767 return DivFMasWaitStates - WaitStatesNeeded;
768}
769
770int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) {
771 const SIInstrInfo *TII = ST.getInstrInfo();
772 unsigned GetRegHWReg = getHWReg(TII, RegInstr: *GetRegInstr);
773
774 const int GetRegWaitStates = 2;
775 auto IsHazardFn = [TII, GetRegHWReg](const MachineInstr &MI) {
776 return GetRegHWReg == getHWReg(TII, RegInstr: MI);
777 };
778 int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazard: IsHazardFn, Limit: GetRegWaitStates);
779
780 return GetRegWaitStates - WaitStatesNeeded;
781}
782
783int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) {
784 const SIInstrInfo *TII = ST.getInstrInfo();
785 unsigned HWReg = getHWReg(TII, RegInstr: *SetRegInstr);
786
787 const int SetRegWaitStates = ST.getSetRegWaitStates();
788 auto IsHazardFn = [TII, HWReg](const MachineInstr &MI) {
789 return HWReg == getHWReg(TII, RegInstr: MI);
790 };
791 int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazard: IsHazardFn, Limit: SetRegWaitStates);
792 return SetRegWaitStates - WaitStatesNeeded;
793}
794
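// Return the vdata operand index if \p MI is a store whose data operand is
// wider than 64 bits and can therefore have its store data overwritten by a
// following VALU def; return -1 otherwise.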
795int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {
796 if (!MI.mayStore())
797 return -1;
798
799 const SIInstrInfo *TII = ST.getInstrInfo();
800 unsigned Opcode = MI.getOpcode();
801 const MCInstrDesc &Desc = MI.getDesc();
802
803 int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, Name: AMDGPU::OpName::vdata);
804 int VDataRCID = -1;
805 if (VDataIdx != -1)
806 VDataRCID = Desc.operands()[VDataIdx].RegClass;
807
808 if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {
809 // There is no hazard if the instruction does not use vector regs
810 // (like wbinvl1)
811 if (VDataIdx == -1)
812 return -1;
813 // For MUBUF/MTBUF instructions this hazard only exists if the
814 // instruction is not using a register in the soffset field.
815 const MachineOperand *SOffset =
816 TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::soffset);
817 // If we have no soffset operand, then assume this field has been
818 // hardcoded to zero.
819 if (AMDGPU::getRegBitWidth(RCID: VDataRCID) > 64 &&
820 (!SOffset || !SOffset->isReg()))
821 return VDataIdx;
822 }
823
824 // MIMG instructions create a hazard if they don't use a 256-bit T# and
825 // the store size is greater than 8 bytes and they have more than two bits
826 // of their dmask set.
827 // All our MIMG definitions use a 256-bit T#, so we can skip checking for them.
828 if (TII->isMIMG(MI)) {
829 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, Name: AMDGPU::OpName::srsrc);
830 assert(SRsrcIdx != -1 &&
831 AMDGPU::getRegBitWidth(Desc.operands()[SRsrcIdx].RegClass) == 256);
832 (void)SRsrcIdx;
833 }
834
835 if (TII->isFLAT(MI)) {
836 // There is no hazard if the instruction does not use vector regs
837 if (VDataIdx == -1)
838 return -1;
839
840 if (AMDGPU::getRegBitWidth(RCID: VDataRCID) > 64)
841 return VDataIdx;
842 }
843
844 return -1;
845}
846
847int
848GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
849 const MachineRegisterInfo &MRI) {
  // Helper to check for the hazard where VMEM instructions that store more
  // than 8 bytes can have their store data overwritten by the next
  // instruction.
852 const SIRegisterInfo *TRI = ST.getRegisterInfo();
853
854 const int VALUWaitStates = ST.hasGFX940Insts() ? 2 : 1;
855 int WaitStatesNeeded = 0;
856
857 if (!TRI->isVectorRegister(MRI, Reg: Def.getReg()))
858 return WaitStatesNeeded;
859 Register Reg = Def.getReg();
860 auto IsHazardFn = [this, Reg, TRI](const MachineInstr &MI) {
861 int DataIdx = createsVALUHazard(MI);
862 return DataIdx >= 0 &&
863 TRI->regsOverlap(RegA: MI.getOperand(i: DataIdx).getReg(), RegB: Reg);
864 };
865
866 int WaitStatesNeededForDef =
867 VALUWaitStates - getWaitStatesSince(IsHazard: IsHazardFn, Limit: VALUWaitStates);
868 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForDef);
869
870 return WaitStatesNeeded;
871}
872
/// A dest sel forwarding issue occurs if additional logic is needed to
/// swizzle / pack the computed value into the correct bit position of the
/// dest register. This occurs if we have SDWA with dst_sel != DWORD or if we
/// have op_sel with a dst_sel that is not aligned to the register. This
/// function analyzes \p MI and \returns an operand with a dst forwarding
/// issue, or nullptr if none exists.
879static const MachineOperand *
880getDstSelForwardingOperand(const MachineInstr &MI, const GCNSubtarget &ST) {
881 if (!SIInstrInfo::isVALU(MI))
882 return nullptr;
883
884 const SIInstrInfo *TII = ST.getInstrInfo();
885
886 unsigned Opcode = MI.getOpcode();
887
  // There are three different types of instructions which produce a
  // forwarded dest: 1. SDWA with dst_sel != DWORD, 2. VOP3 which writes the
  // hi bits (e.g. op_sel[3] == 1), and 3. FP8DstSelInst (instructions with
  // dest byte sel, e.g. CVT_SR_BF8_F32) with op_sel[3:2] != 0.
894 if (SIInstrInfo::isSDWA(MI)) {
895 // Type 1: SDWA with dst_sel != DWORD
896 if (auto *DstSel = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::dst_sel))
897 if (DstSel->getImm() != AMDGPU::SDWA::DWORD)
898 return TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::vdst);
899 }
900
901 AMDGPU::FPType IsFP4OrFP8ConvOpc = AMDGPU::getFPDstSelType(Opc: Opcode);
902 if (AMDGPU::hasNamedOperand(Opcode, NamedIdx: AMDGPU::OpName::op_sel)) {
    // Type 2: VOP3 which writes the hi bits
904 if (TII->getNamedImmOperand(MI, OperandName: AMDGPU::OpName::src0_modifiers) &
905 SISrcMods::DST_OP_SEL)
906 return TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::vdst);
907
    // Type 3: FP8DstSelInst with op_sel[3:2] != 0
909 if (IsFP4OrFP8ConvOpc == AMDGPU::FPType::FP8 &&
910 (TII->getNamedImmOperand(MI, OperandName: AMDGPU::OpName::src2_modifiers) &
911 SISrcMods::OP_SEL_0))
912 return TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::vdst);
913 }
914
915 // Special case: nop is required for all the opsel values for fp4 sr variant
916 // cvt scale instructions
917 if (IsFP4OrFP8ConvOpc == AMDGPU::FPType::FP4)
918 return TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::vdst);
919
920 return nullptr;
921}
922
/// Checks whether the provided \p MI "consumes" the operand with a dest sel
/// forwarding issue \p Dst. We may "consume" the Dst via a standard explicit
/// RAW, or through irregular ways (e.g. implicit RAW, certain types of WAW).
926static bool consumesDstSelForwardingOperand(const MachineInstr *VALU,
927 const MachineOperand *Dst,
928 const SIRegisterInfo *TRI) {
929 // We must consider implicit reads of the VALU. SDWA with dst_sel and
930 // UNUSED_PRESERVE will implicitly read the result from forwarded dest,
931 // and we must account for that hazard.
932 // We also must account for WAW hazards. In particular, WAW with dest
933 // preserve semantics (e.g. VOP3 with op_sel, VOP2 &&
934 // !zeroesHigh16BitsOfDest) will read the forwarded dest for parity
935 // check for ECC. Without accounting for this hazard, the ECC will be
936 // wrong.
937 // TODO: limit to RAW (including implicit reads) + problematic WAW (i.e.
938 // complete zeroesHigh16BitsOfDest)
939 for (auto &Operand : VALU->operands()) {
940 if (Operand.isReg() && TRI->regsOverlap(RegA: Dst->getReg(), RegB: Operand.getReg())) {
941 return true;
942 }
943 }
944 return false;
945}
946
947int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
948 int WaitStatesNeeded = 0;
949
950 if (ST.hasTransForwardingHazard() && !SIInstrInfo::isTRANS(MI: *VALU)) {
951 const int TransDefWaitstates = 1;
952
953 auto IsTransDefFn = [this, VALU](const MachineInstr &MI) {
954 if (!SIInstrInfo::isTRANS(MI))
955 return false;
956 const SIRegisterInfo *TRI = ST.getRegisterInfo();
957 const SIInstrInfo *TII = ST.getInstrInfo();
958 Register Def = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::vdst)->getReg();
959
960 for (const MachineOperand &Use : VALU->explicit_uses()) {
961 if (Use.isReg() && TRI->regsOverlap(RegA: Def, RegB: Use.getReg()))
962 return true;
963 }
964
965 return false;
966 };
967
968 int WaitStatesNeededForDef =
969 TransDefWaitstates -
970 getWaitStatesSince(IsHazard: IsTransDefFn, Limit: TransDefWaitstates);
971 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForDef);
972 }
973
974 if (ST.hasDstSelForwardingHazard() || ST.hasCvtScaleForwardingHazard()) {
975 const int Shift16DefWaitstates = 1;
976
977 auto IsShift16BitDefFn = [this, VALU](const MachineInstr &ProducerMI) {
978 const SIRegisterInfo *TRI = ST.getRegisterInfo();
979 const MachineOperand *ForwardedDst =
980 getDstSelForwardingOperand(MI: ProducerMI, ST);
981 if (ForwardedDst) {
982 return consumesDstSelForwardingOperand(VALU, Dst: ForwardedDst, TRI);
983 }
984
985 if (ProducerMI.isInlineAsm()) {
986 // Assume inline asm has dst forwarding hazard
987 for (auto &Def : ProducerMI.all_defs()) {
988 if (consumesDstSelForwardingOperand(VALU, Dst: &Def, TRI))
989 return true;
990 }
991 }
992
993 return false;
994 };
995
996 int WaitStatesNeededForDef =
997 Shift16DefWaitstates -
998 getWaitStatesSince(IsHazard: IsShift16BitDefFn, Limit: Shift16DefWaitstates);
999 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForDef);
1000 }
1001
1002 if (ST.hasVDecCoExecHazard()) {
1003 const int VALUWriteSGPRVALUReadWaitstates = 2;
1004 const int VALUWriteEXECRWLane = 4;
1005 const int VALUWriteVGPRReadlaneRead = 1;
1006
1007 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1008 const MachineRegisterInfo &MRI = MF.getRegInfo();
1009 Register UseReg;
1010 auto IsVALUDefSGPRFn = [&UseReg, TRI](const MachineInstr &MI) {
1011 if (!SIInstrInfo::isVALU(MI))
1012 return false;
1013 return MI.modifiesRegister(Reg: UseReg, TRI);
1014 };
1015
1016 for (const MachineOperand &Use : VALU->explicit_uses()) {
1017 if (!Use.isReg())
1018 continue;
1019
1020 UseReg = Use.getReg();
1021 if (TRI->isSGPRReg(MRI, Reg: UseReg)) {
1022 int WaitStatesNeededForDef =
1023 VALUWriteSGPRVALUReadWaitstates -
1024 getWaitStatesSince(IsHazard: IsVALUDefSGPRFn,
1025 Limit: VALUWriteSGPRVALUReadWaitstates);
1026 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForDef);
1027 }
1028 }
1029
1030 if (VALU->readsRegister(Reg: AMDGPU::VCC, TRI)) {
1031 UseReg = AMDGPU::VCC;
1032 int WaitStatesNeededForDef =
1033 VALUWriteSGPRVALUReadWaitstates -
1034 getWaitStatesSince(IsHazard: IsVALUDefSGPRFn, Limit: VALUWriteSGPRVALUReadWaitstates);
1035 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForDef);
1036 }
1037
1038 switch (VALU->getOpcode()) {
1039 case AMDGPU::V_READLANE_B32:
1040 case AMDGPU::V_READFIRSTLANE_B32: {
1041 MachineOperand *Src = TII.getNamedOperand(MI&: *VALU, OperandName: AMDGPU::OpName::src0);
1042 UseReg = Src->getReg();
1043 int WaitStatesNeededForDef =
1044 VALUWriteVGPRReadlaneRead -
1045 getWaitStatesSince(IsHazard: IsVALUDefSGPRFn, Limit: VALUWriteVGPRReadlaneRead);
1046 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForDef);
1047 }
1048 [[fallthrough]];
1049 case AMDGPU::V_WRITELANE_B32: {
1050 UseReg = AMDGPU::EXEC;
1051 int WaitStatesNeededForDef =
1052 VALUWriteEXECRWLane -
1053 getWaitStatesSince(IsHazard: IsVALUDefSGPRFn, Limit: VALUWriteEXECRWLane);
1054 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForDef);
1055 break;
1056 }
1057 default:
1058 break;
1059 }
1060 }
1061
  // This checks for the hazard where VMEM instructions that store more than
  // 8 bytes can have their store data overwritten by the next instruction.
1064 if (!ST.has12DWordStoreHazard())
1065 return WaitStatesNeeded;
1066
1067 const MachineRegisterInfo &MRI = MF.getRegInfo();
1068
1069 for (const MachineOperand &Def : VALU->defs()) {
1070 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: checkVALUHazardsHelper(Def, MRI));
1071 }
1072
1073 return WaitStatesNeeded;
1074}
1075
1076int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
1077 // This checks for hazards associated with inline asm statements.
1078 // Since inline asms can contain just about anything, we use this
1079 // to call/leverage other check*Hazard routines. Note that
1080 // this function doesn't attempt to address all possible inline asm
1081 // hazards (good luck), but is a collection of what has been
1082 // problematic thus far.
1083
1084 // see checkVALUHazards()
1085 if (!ST.has12DWordStoreHazard() && !ST.hasDstSelForwardingHazard() &&
1086 !ST.hasCvtScaleForwardingHazard())
1087 return 0;
1088
1089 const MachineRegisterInfo &MRI = MF.getRegInfo();
1090 int WaitStatesNeeded = 0;
1091
1092 for (const MachineOperand &Op :
1093 llvm::drop_begin(RangeOrContainer: IA->operands(), N: InlineAsm::MIOp_FirstOperand)) {
1094 if (Op.isReg() && Op.isDef()) {
1095 if (!TRI.isVectorRegister(MRI, Reg: Op.getReg()))
1096 continue;
1097
1098 if (ST.has12DWordStoreHazard()) {
1099 WaitStatesNeeded =
1100 std::max(a: WaitStatesNeeded, b: checkVALUHazardsHelper(Def: Op, MRI));
1101 }
1102 }
1103 }
1104
1105 if (ST.hasDstSelForwardingHazard()) {
1106 const int Shift16DefWaitstates = 1;
1107
1108 auto IsShift16BitDefFn = [this, &IA](const MachineInstr &ProducerMI) {
1109 const MachineOperand *Dst = getDstSelForwardingOperand(MI: ProducerMI, ST);
1110 // Assume inline asm reads the dst
1111 if (Dst)
1112 return IA->modifiesRegister(Reg: Dst->getReg(), TRI: &TRI) ||
1113 IA->readsRegister(Reg: Dst->getReg(), TRI: &TRI);
1114
1115 if (ProducerMI.isInlineAsm()) {
1116 // If MI is inline asm, assume it has dst forwarding hazard
1117 for (auto &Def : ProducerMI.all_defs()) {
1118 if (IA->modifiesRegister(Reg: Def.getReg(), TRI: &TRI) ||
1119 IA->readsRegister(Reg: Def.getReg(), TRI: &TRI)) {
1120 return true;
1121 }
1122 }
1123 }
1124
1125 return false;
1126 };
1127
1128 int WaitStatesNeededForDef =
1129 Shift16DefWaitstates -
1130 getWaitStatesSince(IsHazard: IsShift16BitDefFn, Limit: Shift16DefWaitstates);
1131 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForDef);
1132 }
1133
1134 return WaitStatesNeeded;
1135}
1136
1137int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) {
1138 const SIInstrInfo *TII = ST.getInstrInfo();
1139 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1140 const MachineRegisterInfo &MRI = MF.getRegInfo();
1141
1142 const MachineOperand *LaneSelectOp =
1143 TII->getNamedOperand(MI&: *RWLane, OperandName: AMDGPU::OpName::src1);
1144
1145 if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, Reg: LaneSelectOp->getReg()))
1146 return 0;
1147
1148 Register LaneSelectReg = LaneSelectOp->getReg();
1149 auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isVALU(MI); };
1150
1151 const int RWLaneWaitStates = 4;
1152 int WaitStatesSince = getWaitStatesSinceDef(Reg: LaneSelectReg, IsHazardDef: IsHazardFn,
1153 Limit: RWLaneWaitStates);
1154 return RWLaneWaitStates - WaitStatesSince;
1155}
1156
1157int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
1158 if (!ST.hasRFEHazards())
1159 return 0;
1160
1161 const SIInstrInfo *TII = ST.getInstrInfo();
1162
1163 const int RFEWaitStates = 1;
1164
1165 auto IsHazardFn = [TII](const MachineInstr &MI) {
1166 return getHWReg(TII, RegInstr: MI) == AMDGPU::Hwreg::ID_TRAPSTS;
1167 };
1168 int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazard: IsHazardFn, Limit: RFEWaitStates);
1169 return RFEWaitStates - WaitStatesNeeded;
1170}
1171
1172int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) {
1173 const SIInstrInfo *TII = ST.getInstrInfo();
1174 const int ReadM0WaitStates = 1;
1175 auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isSALU(MI); };
1176 return ReadM0WaitStates -
1177 getWaitStatesSinceDef(Reg: AMDGPU::M0, IsHazardDef: IsHazardFn, Limit: ReadM0WaitStates);
1178}
1179
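// Run the MI-rewriting fixups. Each helper returns whether it changed the
// code around \p MI; the return values are unused here.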
1180void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
1181 fixVMEMtoScalarWriteHazards(MI);
1182 fixVcmpxPermlaneHazards(MI);
1183 fixSMEMtoVectorWriteHazards(MI);
1184 fixVcmpxExecWARHazard(MI);
1185 fixLdsBranchVmemWARHazard(MI);
1186 if (ST.hasLdsDirect()) {
1187 fixLdsDirectVALUHazard(MI);
1188 fixLdsDirectVMEMHazard(MI);
1189 }
1190 fixVALUPartialForwardingHazard(MI);
1191 fixVALUTransUseHazard(MI);
1192 fixWMMAHazards(MI);
1193 fixShift64HighRegBug(MI);
1194 fixVALUMaskWriteHazard(MI);
1195 fixRequiredExportPriority(MI);
1196}
1197
1198static bool isVCmpXWritesExec(const SIInstrInfo &TII, const SIRegisterInfo &TRI,
1199 const MachineInstr &MI) {
1200 return (TII.isVOPC(MI) ||
1201 (MI.isCompare() && (TII.isVOP3(MI) || TII.isSDWA(MI)))) &&
1202 MI.modifiesRegister(Reg: AMDGPU::EXEC, TRI: &TRI);
1203}
1204
1205bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
1206 if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(MI: *MI))
1207 return false;
1208
1209 const SIInstrInfo *TII = ST.getInstrInfo();
1210 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1211 auto IsHazardFn = [TII, TRI](const MachineInstr &MI) {
1212 return isVCmpXWritesExec(TII: *TII, TRI: *TRI, MI);
1213 };
1214
1215 auto IsExpiredFn = [](const MachineInstr &MI, int) {
1216 unsigned Opc = MI.getOpcode();
1217 return SIInstrInfo::isVALU(MI) && Opc != AMDGPU::V_NOP_e32 &&
1218 Opc != AMDGPU::V_NOP_e64 && Opc != AMDGPU::V_NOP_sdwa;
1219 };
1220
1221 if (::getWaitStatesSince(IsHazard: IsHazardFn, MI, IsExpired: IsExpiredFn) ==
1222 std::numeric_limits<int>::max())
1223 return false;
1224
1225 // V_NOP will be discarded by SQ.
1226 // Use V_MOV_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE*
1227 // which is always a VGPR and available.
1228 auto *Src0 = TII->getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::src0);
1229 Register Reg = Src0->getReg();
1230 bool IsUndef = Src0->isUndef();
1231 BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(),
1232 MCID: TII->get(Opcode: AMDGPU::V_MOV_B32_e32))
1233 .addReg(RegNo: Reg, flags: RegState::Define | (IsUndef ? RegState::Dead : 0))
1234 .addReg(RegNo: Reg, flags: IsUndef ? RegState::Undef : RegState::Kill);
1235
1236 return true;
1237}
1238
1239bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
1240 if (!ST.hasVMEMtoScalarWriteHazard())
1241 return false;
1242 assert(!ST.hasExtendedWaitCounts());
1243
1244 if (!SIInstrInfo::isSALU(MI: *MI) && !SIInstrInfo::isSMRD(MI: *MI))
1245 return false;
1246
1247 if (MI->getNumDefs() == 0)
1248 return false;
1249
1250 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1251
1252 auto IsHazardFn = [TRI, MI](const MachineInstr &I) {
1253 if (!SIInstrInfo::isVMEM(MI: I) && !SIInstrInfo::isDS(MI: I))
1254 return false;
1255
1256 for (const MachineOperand &Def : MI->defs()) {
1257 const MachineOperand *Op =
1258 I.findRegisterUseOperand(Reg: Def.getReg(), TRI, isKill: false);
1259 if (!Op)
1260 continue;
1261 return true;
1262 }
1263 return false;
1264 };
1265
1266 auto IsExpiredFn = [](const MachineInstr &MI, int) {
1267 return SIInstrInfo::isVALU(MI) ||
1268 (MI.getOpcode() == AMDGPU::S_WAITCNT &&
1269 !MI.getOperand(i: 0).getImm()) ||
1270 (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1271 AMDGPU::DepCtr::decodeFieldVmVsrc(Encoded: MI.getOperand(i: 0).getImm()) == 0);
1272 };
1273
1274 if (::getWaitStatesSince(IsHazard: IsHazardFn, MI, IsExpired: IsExpiredFn) ==
1275 std::numeric_limits<int>::max())
1276 return false;
1277
1278 const SIInstrInfo *TII = ST.getInstrInfo();
1279 BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(),
1280 MCID: TII->get(Opcode: AMDGPU::S_WAITCNT_DEPCTR))
1281 .addImm(Val: AMDGPU::DepCtr::encodeFieldVmVsrc(VmVsrc: 0));
1282 return true;
1283}
1284
1285bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
1286 if (!ST.hasSMEMtoVectorWriteHazard())
1287 return false;
1288 assert(!ST.hasExtendedWaitCounts());
1289
1290 if (!SIInstrInfo::isVALU(MI: *MI))
1291 return false;
1292
1293 AMDGPU::OpName SDSTName;
1294 switch (MI->getOpcode()) {
1295 case AMDGPU::V_READLANE_B32:
1296 case AMDGPU::V_READFIRSTLANE_B32:
1297 SDSTName = AMDGPU::OpName::vdst;
1298 break;
1299 default:
1300 SDSTName = AMDGPU::OpName::sdst;
1301 break;
1302 }
1303
1304 const SIInstrInfo *TII = ST.getInstrInfo();
1305 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1306 const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(GPU: ST.getCPU());
1307 const MachineOperand *SDST = TII->getNamedOperand(MI&: *MI, OperandName: SDSTName);
1308 if (!SDST) {
1309 for (const auto &MO : MI->implicit_operands()) {
1310 if (MO.isDef() && TRI->isSGPRClass(RC: TRI->getPhysRegBaseClass(Reg: MO.getReg()))) {
1311 SDST = &MO;
1312 break;
1313 }
1314 }
1315 }
1316
1317 if (!SDST)
1318 return false;
1319
1320 const Register SDSTReg = SDST->getReg();
1321 auto IsHazardFn = [SDSTReg, TRI](const MachineInstr &I) {
1322 return SIInstrInfo::isSMRD(MI: I) && I.readsRegister(Reg: SDSTReg, TRI);
1323 };
1324
1325 auto IsExpiredFn = [TII, IV](const MachineInstr &MI, int) {
1326 if (TII->isSALU(MI)) {
1327 switch (MI.getOpcode()) {
1328 case AMDGPU::S_SETVSKIP:
1329 case AMDGPU::S_VERSION:
1330 case AMDGPU::S_WAITCNT_VSCNT:
1331 case AMDGPU::S_WAITCNT_VMCNT:
1332 case AMDGPU::S_WAITCNT_EXPCNT:
        // These instructions cannot mitigate the hazard.
1334 return false;
1335 case AMDGPU::S_WAITCNT_LGKMCNT:
1336 // Reducing lgkmcnt count to 0 always mitigates the hazard.
1337 return (MI.getOperand(i: 1).getImm() == 0) &&
1338 (MI.getOperand(i: 0).getReg() == AMDGPU::SGPR_NULL);
1339 case AMDGPU::S_WAITCNT: {
1340 const int64_t Imm = MI.getOperand(i: 0).getImm();
1341 AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(Version: IV, Encoded: Imm);
1342 // DsCnt corresponds to LGKMCnt here.
1343 return (Decoded.DsCnt == 0);
1344 }
1345 default:
1346 // SOPP instructions cannot mitigate the hazard.
1347 if (TII->isSOPP(MI))
1348 return false;
1349 // At this point the SALU can be assumed to mitigate the hazard
1350 // because either:
1351 // (a) it is independent of the at risk SMEM (breaking chain),
1352 // or
1353 // (b) it is dependent on the SMEM, in which case an appropriate
1354 // s_waitcnt lgkmcnt _must_ exist between it and the at risk
1355 // SMEM instruction.
1356 return true;
1357 }
1358 }
1359 return false;
1360 };
1361
1362 if (::getWaitStatesSince(IsHazard: IsHazardFn, MI, IsExpired: IsExpiredFn) ==
1363 std::numeric_limits<int>::max())
1364 return false;
1365
1366 BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(),
1367 MCID: TII->get(Opcode: AMDGPU::S_MOV_B32), DestReg: AMDGPU::SGPR_NULL)
1368 .addImm(Val: 0);
1369 return true;
1370}
1371
1372bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
1373 if (!ST.hasVcmpxExecWARHazard())
1374 return false;
1375 assert(!ST.hasExtendedWaitCounts());
1376
1377 if (!SIInstrInfo::isVALU(MI: *MI))
1378 return false;
1379
1380 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1381 if (!MI->modifiesRegister(Reg: AMDGPU::EXEC, TRI))
1382 return false;
1383
1384 auto IsHazardFn = [TRI](const MachineInstr &I) {
1385 if (SIInstrInfo::isVALU(MI: I))
1386 return false;
1387 return I.readsRegister(Reg: AMDGPU::EXEC, TRI);
1388 };
1389
1390 const SIInstrInfo *TII = ST.getInstrInfo();
1391 auto IsExpiredFn = [TII, TRI](const MachineInstr &MI, int) {
1392 if (SIInstrInfo::isVALU(MI)) {
1393 if (TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::sdst))
1394 return true;
1395 for (auto MO : MI.implicit_operands())
1396 if (MO.isDef() && TRI->isSGPRClass(RC: TRI->getPhysRegBaseClass(Reg: MO.getReg())))
1397 return true;
1398 }
1399 if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1400 AMDGPU::DepCtr::decodeFieldSaSdst(Encoded: MI.getOperand(i: 0).getImm()) == 0)
1401 return true;
1402 return false;
1403 };
1404
1405 if (::getWaitStatesSince(IsHazard: IsHazardFn, MI, IsExpired: IsExpiredFn) ==
1406 std::numeric_limits<int>::max())
1407 return false;
1408
1409 BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(),
1410 MCID: TII->get(Opcode: AMDGPU::S_WAITCNT_DEPCTR))
1411 .addImm(Val: AMDGPU::DepCtr::encodeFieldSaSdst(SaSdst: 0));
1412 return true;
1413}
1414
1415static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
1416 const GCNSubtarget &ST) {
1417 if (!ST.hasLdsBranchVmemWARHazard())
1418 return false;
1419
1420 // Check if the necessary condition for the hazard is met: both LDS and VMEM
1421 // instructions need to appear in the same function.
1422 bool HasLds = false;
1423 bool HasVmem = false;
1424 for (auto &MBB : MF) {
1425 for (auto &MI : MBB) {
1426 HasLds |= SIInstrInfo::isDS(MI);
1427 HasVmem |= (SIInstrInfo::isVMEM(MI) && !SIInstrInfo::isFLAT(MI)) ||
1428 SIInstrInfo::isSegmentSpecificFLAT(MI);
1429 if (HasLds && HasVmem)
1430 return true;
1431 }
1432 }
1433 return false;
1434}
1435
1436static bool isStoreCountWaitZero(const MachineInstr &I) {
1437 return I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
1438 I.getOperand(i: 0).getReg() == AMDGPU::SGPR_NULL &&
1439 !I.getOperand(i: 1).getImm();
1440}
1441
1442bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
1443 if (!RunLdsBranchVmemWARHazardFixup)
1444 return false;
1445
1446 assert(ST.hasLdsBranchVmemWARHazard());
1447 assert(!ST.hasExtendedWaitCounts());
1448
1449 auto IsHazardInst = [](const MachineInstr &MI) {
1450 if (SIInstrInfo::isDS(MI))
1451 return 1;
1452 if ((SIInstrInfo::isVMEM(MI) && !SIInstrInfo::isFLAT(MI)) ||
1453 SIInstrInfo::isSegmentSpecificFLAT(MI))
1454 return 2;
1455 return 0;
1456 };
1457
1458 auto InstType = IsHazardInst(*MI);
1459 if (!InstType)
1460 return false;
1461
1462 auto IsExpiredFn = [&IsHazardInst](const MachineInstr &I, int) {
1463 return IsHazardInst(I) || isStoreCountWaitZero(I);
1464 };
1465
1466 auto IsHazardFn = [InstType, &IsHazardInst](const MachineInstr &I) {
1467 if (!I.isBranch())
1468 return false;
1469
1470 auto IsHazardFn = [InstType, IsHazardInst](const MachineInstr &I) {
1471 auto InstType2 = IsHazardInst(I);
1472 return InstType2 && InstType != InstType2;
1473 };
1474
1475 auto IsExpiredFn = [InstType, &IsHazardInst](const MachineInstr &I, int) {
1476 auto InstType2 = IsHazardInst(I);
1477 if (InstType == InstType2)
1478 return true;
1479
1480 return isStoreCountWaitZero(I);
1481 };
1482
1483 return ::getWaitStatesSince(IsHazard: IsHazardFn, MI: &I, IsExpired: IsExpiredFn) !=
1484 std::numeric_limits<int>::max();
1485 };
1486
1487 if (::getWaitStatesSince(IsHazard: IsHazardFn, MI, IsExpired: IsExpiredFn) ==
1488 std::numeric_limits<int>::max())
1489 return false;
1490
1491 const SIInstrInfo *TII = ST.getInstrInfo();
1492 BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(),
1493 MCID: TII->get(Opcode: AMDGPU::S_WAITCNT_VSCNT))
1494 .addReg(RegNo: AMDGPU::SGPR_NULL, flags: RegState::Undef)
1495 .addImm(Val: 0);
1496
1497 return true;
1498}
1499
1500bool GCNHazardRecognizer::fixLdsDirectVALUHazard(MachineInstr *MI) {
1501 if (!SIInstrInfo::isLDSDIR(MI: *MI))
1502 return false;
1503
1504 const int NoHazardWaitStates = 15;
1505 const MachineOperand *VDST = TII.getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::vdst);
1506 const Register VDSTReg = VDST->getReg();
1507
1508 bool VisitedTrans = false;
1509 auto IsHazardFn = [this, VDSTReg, &VisitedTrans](const MachineInstr &I) {
1510 if (!SIInstrInfo::isVALU(MI: I))
1511 return false;
1512 VisitedTrans = VisitedTrans || SIInstrInfo::isTRANS(MI: I);
1513 // Cover both WAR and WAW
1514 return I.readsRegister(Reg: VDSTReg, TRI: &TRI) || I.modifiesRegister(Reg: VDSTReg, TRI: &TRI);
1515 };
1516 auto IsExpiredFn = [&](const MachineInstr &I, int WaitStates) {
1517 if (WaitStates >= NoHazardWaitStates)
1518 return true;
1519 // Instructions which cause va_vdst==0 expire hazard
1520 return SIInstrInfo::isVMEM(MI: I) || SIInstrInfo::isDS(MI: I) ||
1521 SIInstrInfo::isEXP(MI: I);
1522 };
1523 auto GetWaitStatesFn = [](const MachineInstr &MI) {
1524 return SIInstrInfo::isVALU(MI) ? 1 : 0;
1525 };
1526
1527 DenseSet<const MachineBasicBlock *> Visited;
1528 auto Count = ::getWaitStatesSince(IsHazard: IsHazardFn, MBB: MI->getParent(),
1529 I: std::next(x: MI->getReverseIterator()), WaitStates: 0,
1530 IsExpired: IsExpiredFn, Visited, GetNumWaitStates: GetWaitStatesFn);
1531
1532 // Transcendentals can execute in parallel to other VALUs.
1533 // This makes va_vdst count unusable with a mixture of VALU and TRANS.
1534 if (VisitedTrans)
1535 Count = 0;
1536
1537 MachineOperand *WaitVdstOp =
1538 TII.getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::waitvdst);
1539 WaitVdstOp->setImm(std::min(a: Count, b: NoHazardWaitStates));
1540
1541 return true;
1542}
1543
1544bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(MachineInstr *MI) {
1545 if (!SIInstrInfo::isLDSDIR(MI: *MI))
1546 return false;
1547
1548 const MachineOperand *VDST = TII.getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::vdst);
1549 const Register VDSTReg = VDST->getReg();
1550
1551 auto IsHazardFn = [this, VDSTReg](const MachineInstr &I) {
1552 if (!SIInstrInfo::isVMEM(MI: I) && !SIInstrInfo::isDS(MI: I))
1553 return false;
1554 return I.readsRegister(Reg: VDSTReg, TRI: &TRI) || I.modifiesRegister(Reg: VDSTReg, TRI: &TRI);
1555 };
1556 bool LdsdirCanWait = ST.hasLdsWaitVMSRC();
1557 // TODO: On GFX12 the hazard should expire on S_WAIT_LOADCNT/SAMPLECNT/BVHCNT
1558 // according to the type of VMEM instruction.
1559 auto IsExpiredFn = [this, LdsdirCanWait](const MachineInstr &I, int) {
1560 return SIInstrInfo::isVALU(MI: I) || SIInstrInfo::isEXP(MI: I) ||
1561 (I.getOpcode() == AMDGPU::S_WAITCNT && !I.getOperand(i: 0).getImm()) ||
1562 (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1563 AMDGPU::DepCtr::decodeFieldVmVsrc(Encoded: I.getOperand(i: 0).getImm()) == 0) ||
1564 (LdsdirCanWait && SIInstrInfo::isLDSDIR(MI: I) &&
1565 !TII.getNamedOperand(MI: I, OperandName: AMDGPU::OpName::waitvsrc)->getImm());
1566 };
1567
1568 if (::getWaitStatesSince(IsHazard: IsHazardFn, MI, IsExpired: IsExpiredFn) ==
1569 std::numeric_limits<int>::max())
1570 return false;
1571
1572 if (LdsdirCanWait) {
1573 TII.getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::waitvsrc)->setImm(0);
1574 } else {
1575 BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(),
1576 MCID: TII.get(Opcode: AMDGPU::S_WAITCNT_DEPCTR))
1577 .addImm(Val: AMDGPU::DepCtr::encodeFieldVmVsrc(VmVsrc: 0));
1578 }
1579
1580 return true;
1581}
1582
1583bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(MachineInstr *MI) {
1584 if (!ST.hasVALUPartialForwardingHazard())
1585 return false;
1586 assert(!ST.hasExtendedWaitCounts());
1587
1588 if (!ST.isWave64() || !SIInstrInfo::isVALU(MI: *MI))
1589 return false;
1590
1591 SmallSetVector<Register, 4> SrcVGPRs;
1592
1593 for (const MachineOperand &Use : MI->explicit_uses()) {
1594 if (Use.isReg() && TRI.isVGPR(MRI: MF.getRegInfo(), Reg: Use.getReg()))
1595 SrcVGPRs.insert(X: Use.getReg());
1596 }
1597
1598 // Only applies with >= 2 unique VGPR sources
1599 if (SrcVGPRs.size() <= 1)
1600 return false;
1601
1602 // Look for the following pattern:
1603 // Va <- VALU [PreExecPos]
1604 // intv1
1605 // Exec <- SALU [ExecPos]
1606 // intv2
1607 // Vb <- VALU [PostExecPos]
1608 // intv3
1609 // MI Va, Vb (WaitState = 0)
1610 //
1611 // Where:
1612 // intv1 + intv2 <= 2 VALUs
1613 // intv3 <= 4 VALUs
1614 //
1615 // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.
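  // Illustrative shape of the hazard (registers are hypothetical, shown only
  // to make the pattern concrete):
  //   v_mov_b32 v0, v10          ; Va  <- VALU
  //   s_mov_b64 exec, s[0:1]     ; Exec <- SALU
  //   v_mov_b32 v1, v11          ; Vb  <- VALU
  //   v_add_f32 v2, v0, v1       ; MI reads both Va and Vb
  // When intv1/intv2/intv3 fit within the limits above, forwarding of Va/Vb
  // is unsafe and an S_WAITCNT_DEPCTR forcing va_vdst to 0 is inserted.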
1616
1617 const int Intv1plus2MaxVALUs = 2;
1618 const int Intv3MaxVALUs = 4;
1619 const int IntvMaxVALUs = 6;
1620 const int NoHazardVALUWaitStates = IntvMaxVALUs + 2;
1621
1622 struct StateType {
1623 SmallDenseMap<Register, int, 4> DefPos;
1624 int ExecPos = std::numeric_limits<int>::max();
1625 int VALUs = 0;
1626 };
1627
1628 StateType State;
1629
1630  // This single callback performs both the hazard detection and the expiry
1630  // testing.
1631 auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
1632 // Too many VALU states have passed
1633 if (State.VALUs > NoHazardVALUWaitStates)
1634 return HazardExpired;
1635
1636    // Instructions which cause va_vdst==0 expire the hazard
1637 if (SIInstrInfo::isVMEM(MI: I) || SIInstrInfo::isDS(MI: I) ||
1638 SIInstrInfo::isEXP(MI: I) ||
1639 (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1640 AMDGPU::DepCtr::decodeFieldVaVdst(Encoded: I.getOperand(i: 0).getImm()) == 0))
1641 return HazardExpired;
1642
1643    // Track register writes
1644 bool Changed = false;
1645 if (SIInstrInfo::isVALU(MI: I)) {
1646 for (Register Src : SrcVGPRs) {
1647 if (!State.DefPos.count(Val: Src) && I.modifiesRegister(Reg: Src, TRI: &TRI)) {
1648 State.DefPos[Src] = State.VALUs;
1649 Changed = true;
1650 }
1651 }
1652 } else if (SIInstrInfo::isSALU(MI: I)) {
1653 if (State.ExecPos == std::numeric_limits<int>::max()) {
1654 if (!State.DefPos.empty() && I.modifiesRegister(Reg: AMDGPU::EXEC, TRI: &TRI)) {
1655 State.ExecPos = State.VALUs;
1656 Changed = true;
1657 }
1658 }
1659 }
1660
1661 // Early expiration: too many VALUs in intv3
1662 if (State.VALUs > Intv3MaxVALUs && State.DefPos.empty())
1663 return HazardExpired;
1664
1665 // Only evaluate state if something changed
1666 if (!Changed)
1667 return NoHazardFound;
1668
1669 // Determine positions of VALUs pre/post exec change
1670 if (State.ExecPos == std::numeric_limits<int>::max())
1671 return NoHazardFound;
1672
1673 int PreExecPos = std::numeric_limits<int>::max();
1674 int PostExecPos = std::numeric_limits<int>::max();
1675
1676 for (auto Entry : State.DefPos) {
1677 int DefVALUs = Entry.second;
1678 if (DefVALUs != std::numeric_limits<int>::max()) {
1679 if (DefVALUs >= State.ExecPos)
1680 PreExecPos = std::min(a: PreExecPos, b: DefVALUs);
1681 else
1682 PostExecPos = std::min(a: PostExecPos, b: DefVALUs);
1683 }
1684 }
1685
1686    // Need a VALU def after the exec change
1687 if (PostExecPos == std::numeric_limits<int>::max())
1688 return NoHazardFound;
1689
1690 // Too many VALUs in intv3?
1691 int Intv3VALUs = PostExecPos;
1692 if (Intv3VALUs > Intv3MaxVALUs)
1693 return HazardExpired;
1694
1695 // Too many VALUs in intv2?
1696 int Intv2VALUs = (State.ExecPos - PostExecPos) - 1;
1697 if (Intv2VALUs > Intv1plus2MaxVALUs)
1698 return HazardExpired;
1699
1700    // Need a VALU def before the exec change
1701 if (PreExecPos == std::numeric_limits<int>::max())
1702 return NoHazardFound;
1703
1704 // Too many VALUs in intv1?
1705 int Intv1VALUs = PreExecPos - State.ExecPos;
1706 if (Intv1VALUs > Intv1plus2MaxVALUs)
1707 return HazardExpired;
1708
1709    // Too many VALUs in intv1 + intv2?
1710 if (Intv1VALUs + Intv2VALUs > Intv1plus2MaxVALUs)
1711 return HazardExpired;
1712
1713 return HazardFound;
1714 };
1715 auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
1716 if (SIInstrInfo::isVALU(MI))
1717 State.VALUs += 1;
1718 };
1719
1720 DenseSet<const MachineBasicBlock *> Visited;
1721 if (!hasHazard<StateType>(State, IsHazard: IsHazardFn, UpdateState: UpdateStateFn, MBB: MI->getParent(),
1722 I: std::next(x: MI->getReverseIterator()), Visited))
1723 return false;
1724
1725 BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(),
1726 MCID: TII.get(Opcode: AMDGPU::S_WAITCNT_DEPCTR))
1727 .addImm(Val: 0x0fff);
1728
1729 return true;
1730}
1731
1732bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) {
1733 if (!ST.hasVALUTransUseHazard())
1734 return false;
1735 assert(!ST.hasExtendedWaitCounts());
1736
1737 if (!SIInstrInfo::isVALU(MI: *MI))
1738 return false;
1739
1740 SmallSet<Register, 4> SrcVGPRs;
1741
1742 for (const MachineOperand &Use : MI->explicit_uses()) {
1743 if (Use.isReg() && TRI.isVGPR(MRI: MF.getRegInfo(), Reg: Use.getReg()))
1744 SrcVGPRs.insert(V: Use.getReg());
1745 }
1746
1747 // Look for the following pattern:
1748 // Va <- TRANS VALU
1749 // intv
1750 // MI Va (WaitState = 0)
1751 //
1752 // Where:
1753 // intv <= 5 VALUs / 1 TRANS
1754 //
1755 // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.
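  // Illustrative shape of the hazard (registers are hypothetical):
  //   v_exp_f32 v0, v1           ; Va <- TRANS VALU
  //   v_mov_b32 v2, v3           ; intv (within 5 VALUs / 1 TRANS)
  //   v_add_f32 v4, v0, v2       ; MI reads Va
  // MI consumes the TRANS result too early, so an S_WAITCNT_DEPCTR forcing
  // va_vdst to 0 is inserted before MI.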
1756
1757 const int IntvMaxVALUs = 5;
1758 const int IntvMaxTRANS = 1;
1759
1760 struct StateType {
1761 int VALUs = 0;
1762 int TRANS = 0;
1763 };
1764
1765 StateType State;
1766
1767  // This single callback performs both the hazard detection and the expiry
1767  // testing.
1768 auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
1769 // Too many VALU states have passed
1770 if (State.VALUs > IntvMaxVALUs || State.TRANS > IntvMaxTRANS)
1771 return HazardExpired;
1772
1773    // Instructions which cause va_vdst==0 expire the hazard
1774 if (SIInstrInfo::isVMEM(MI: I) || SIInstrInfo::isDS(MI: I) ||
1775 SIInstrInfo::isEXP(MI: I) ||
1776 (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1777 I.getOperand(i: 0).getImm() == 0x0fff))
1778 return HazardExpired;
1779
1780    // Track register writes
1781 if (SIInstrInfo::isTRANS(MI: I)) {
1782 for (Register Src : SrcVGPRs) {
1783 if (I.modifiesRegister(Reg: Src, TRI: &TRI)) {
1784 return HazardFound;
1785 }
1786 }
1787 }
1788
1789 return NoHazardFound;
1790 };
1791 auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
1792 if (SIInstrInfo::isVALU(MI))
1793 State.VALUs += 1;
1794 if (SIInstrInfo::isTRANS(MI))
1795 State.TRANS += 1;
1796 };
1797
1798 DenseSet<const MachineBasicBlock *> Visited;
1799 if (!hasHazard<StateType>(State, IsHazard: IsHazardFn, UpdateState: UpdateStateFn, MBB: MI->getParent(),
1800 I: std::next(x: MI->getReverseIterator()), Visited))
1801 return false;
1802
1803  // Hazard is observed - insert a wait on the va_vdst counter to ensure the
1804  // hazard is avoided.
1805 BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(),
1806 MCID: TII.get(Opcode: AMDGPU::S_WAITCNT_DEPCTR))
1807 .addImm(Val: AMDGPU::DepCtr::encodeFieldVaVdst(VaVdst: 0));
1808
1809 return true;
1810}
1811
1812bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) {
1813 if (!SIInstrInfo::isWMMA(MI: *MI) && !SIInstrInfo::isSWMMAC(MI: *MI))
1814 return false;
1815
1816 const SIInstrInfo *TII = ST.getInstrInfo();
1817 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1818
1819 auto IsHazardFn = [MI, TII, TRI, this](const MachineInstr &I) {
1820 if (!SIInstrInfo::isWMMA(MI: I) && !SIInstrInfo::isSWMMAC(MI: I))
1821 return false;
1822
1823    // Src0 (matrix A) or Src1 (matrix B) of the current WMMA instruction
1824    // overlaps with the dest (matrix D) of the previous WMMA.
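    // Schematic example (operands abbreviated, purely illustrative):
    //   WMMA D0, A0, B0, C0      ; previous WMMA writes D0
    //   WMMA D1, D0, B1, C1      ; current WMMA reads D0 as matrix A -> hazard
    // A V_NOP is inserted between the two to break the hazard.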
1825 const Register CurSrc0Reg =
1826 TII->getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::src0)->getReg();
1827 const Register CurSrc1Reg =
1828 TII->getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::src1)->getReg();
1829
1830 const Register PrevDstReg =
1831 TII->getNamedOperand(MI: I, OperandName: AMDGPU::OpName::vdst)->getReg();
1832
1833 if (TRI->regsOverlap(RegA: PrevDstReg, RegB: CurSrc0Reg) ||
1834 TRI->regsOverlap(RegA: PrevDstReg, RegB: CurSrc1Reg)) {
1835 return true;
1836 }
1837
1838 // GFX12+ allows overlap of matrix C with PrevDstReg (hardware will stall)
1839 // but Index can't overlap with PrevDstReg.
1840 if (AMDGPU::isGFX12Plus(STI: ST)) {
1841 if (SIInstrInfo::isSWMMAC(MI: *MI)) {
1842 const Register CurIndex =
1843 TII->getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::src2)->getReg();
1844 if (TRI->regsOverlap(RegA: PrevDstReg, RegB: CurIndex))
1845 return true;
1846 }
1847 return false;
1848 }
1849
1850 return false;
1851 };
1852
1853 auto IsExpiredFn = [](const MachineInstr &I, int) {
1854 return SIInstrInfo::isVALU(MI: I);
1855 };
1856
1857 if (::getWaitStatesSince(IsHazard: IsHazardFn, MI, IsExpired: IsExpiredFn) ==
1858 std::numeric_limits<int>::max())
1859 return false;
1860
1861 BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(), MCID: TII->get(Opcode: AMDGPU::V_NOP_e32));
1862
1863 return true;
1864}
1865
1866bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) {
1867 if (!ST.hasShift64HighRegBug())
1868 return false;
1869 assert(!ST.hasExtendedWaitCounts());
1870
1871 switch (MI->getOpcode()) {
1872 default:
1873 return false;
1874 case AMDGPU::V_LSHLREV_B64_e64:
1875 case AMDGPU::V_LSHRREV_B64_e64:
1876 case AMDGPU::V_ASHRREV_I64_e64:
1877 break;
1878 }
1879
1880 MachineOperand *Amt = TII.getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::src0);
1881 if (!Amt->isReg())
1882 return false;
1883
1884 Register AmtReg = Amt->getReg();
1885 const MachineRegisterInfo &MRI = MF.getRegInfo();
1886  // Check if this is the last VGPR in the allocation block.
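  // That is, only VGPR7, VGPR15, ..., VGPR255 (((AmtReg - VGPR0) & 7) == 7)
  // are affected.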
1887 if (!TRI.isVGPR(MRI, Reg: AmtReg) || ((AmtReg - AMDGPU::VGPR0) & 7) != 7)
1888 return false;
1889
1890 if (AmtReg != AMDGPU::VGPR255 && MRI.isPhysRegUsed(PhysReg: AmtReg + 1))
1891 return false;
1892
1893 MachineOperand *Src1 = TII.getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::src1);
1894 bool OverlappedSrc = Src1->isReg() && TRI.regsOverlap(RegA: Src1->getReg(), RegB: AmtReg);
1895 bool OverlappedDst = MI->modifiesRegister(Reg: AmtReg, TRI: &TRI);
1896 bool Overlapped = OverlappedSrc || OverlappedDst;
1897
1898 assert(!OverlappedDst || !OverlappedSrc ||
1899 Src1->getReg() == MI->getOperand(0).getReg());
1900 assert(ST.needsAlignedVGPRs());
1901 static_assert(AMDGPU::VGPR0 + 1 == AMDGPU::VGPR1);
1902
1903 Register NewReg;
1904 for (MCRegister Reg : Overlapped ? AMDGPU::VReg_64_Align2RegClass
1905 : AMDGPU::VGPR_32RegClass) {
1906 if (!MI->modifiesRegister(Reg, TRI: &TRI) && !MI->readsRegister(Reg, TRI: &TRI)) {
1907 NewReg = Reg;
1908 break;
1909 }
1910 }
1911
1912 Register NewAmt = Overlapped ? (Register)TRI.getSubReg(Reg: NewReg, Idx: AMDGPU::sub1)
1913 : NewReg;
1914 Register NewAmtLo;
1915
1916 if (Overlapped)
1917 NewAmtLo = TRI.getSubReg(Reg: NewReg, Idx: AMDGPU::sub0);
1918
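  // Illustrative rewrite for the non-overlapping case (register numbers are
  // hypothetical; v7 is the shift amount, v12 the free VGPR found above):
  //   v_lshlrev_b64 v[4:5], v7, v[2:3]
  // becomes
  //   s_waitcnt 0
  //   v_swap_b32 v12, v7
  //   v_lshlrev_b64 v[4:5], v12, v[2:3]
  //   v_swap_b32 v7, v12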
1919 DebugLoc DL = MI->getDebugLoc();
1920 MachineBasicBlock *MBB = MI->getParent();
1921  // Insert a full wait count because the found register might have a pending
1921  // wait.
1922 BuildMI(BB&: *MBB, I: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_WAITCNT))
1923 .addImm(Val: 0);
1924
1925  // Insert V_SWAP_B32 instruction(s) and run the hazard recognizer on them.
1926 if (Overlapped)
1927 runOnInstruction(
1928 MI: BuildMI(BB&: *MBB, I: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_SWAP_B32), DestReg: NewAmtLo)
1929 .addDef(RegNo: AmtReg - 1)
1930 .addReg(RegNo: AmtReg - 1, flags: RegState::Undef)
1931 .addReg(RegNo: NewAmtLo, flags: RegState::Undef));
1932 runOnInstruction(MI: BuildMI(BB&: *MBB, I: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_SWAP_B32), DestReg: NewAmt)
1933 .addDef(RegNo: AmtReg)
1934 .addReg(RegNo: AmtReg, flags: RegState::Undef)
1935 .addReg(RegNo: NewAmt, flags: RegState::Undef));
1936
1937  // Instructions emitted after the current instruction will be processed
1938  // naturally by the parent loop of the hazard recognizer.
1939 BuildMI(BB&: *MBB, I: std::next(x: MI->getIterator()), MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_SWAP_B32),
1940 DestReg: AmtReg)
1941 .addDef(RegNo: NewAmt)
1942 .addReg(RegNo: NewAmt)
1943 .addReg(RegNo: AmtReg);
1944 if (Overlapped)
1945 BuildMI(BB&: *MBB, I: std::next(x: MI->getIterator()), MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_SWAP_B32),
1946 DestReg: AmtReg - 1)
1947 .addDef(RegNo: NewAmtLo)
1948 .addReg(RegNo: NewAmtLo)
1949 .addReg(RegNo: AmtReg - 1);
1950
1951  // Re-running the hazard recognizer on the modified instruction is not needed:
1952  // the inserted V_SWAP_B32s have already both read and written the new
1953  // registers, so hazards related to these registers have already been handled.
1954 Amt->setReg(NewAmt);
1955 Amt->setIsKill(false);
1956  // We do not update liveness, so the verifier may see it as undef.
1957 Amt->setIsUndef();
1958 if (OverlappedDst)
1959 MI->getOperand(i: 0).setReg(NewReg);
1960 if (OverlappedSrc) {
1961 Src1->setReg(NewReg);
1962 Src1->setIsKill(false);
1963 Src1->setIsUndef();
1964 }
1965
1966 return true;
1967}
1968
1969int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) {
1970 int NSAtoVMEMWaitStates = 1;
1971
1972 if (!ST.hasNSAtoVMEMBug())
1973 return 0;
1974
1975 if (!SIInstrInfo::isMUBUF(MI: *MI) && !SIInstrInfo::isMTBUF(MI: *MI))
1976 return 0;
1977
1978 const SIInstrInfo *TII = ST.getInstrInfo();
1979 const auto *Offset = TII->getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::offset);
1980 if (!Offset || (Offset->getImm() & 6) == 0)
1981 return 0;
1982
1983 auto IsHazardFn = [TII](const MachineInstr &I) {
1984 if (!SIInstrInfo::isMIMG(MI: I))
1985 return false;
1986 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc: I.getOpcode());
1987 return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
1988 TII->getInstSizeInBytes(MI: I) >= 16;
1989 };
1990
1991 return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazard: IsHazardFn, Limit: 1);
1992}
1993
1994int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) {
1995 int FPAtomicToDenormModeWaitStates = 3;
1996
1997 if (!ST.hasFPAtomicToDenormModeHazard())
1998 return 0;
1999 assert(!ST.hasExtendedWaitCounts());
2000
2001 if (MI->getOpcode() != AMDGPU::S_DENORM_MODE)
2002 return 0;
2003
2004 auto IsHazardFn = [](const MachineInstr &I) {
2005 if (!SIInstrInfo::isVMEM(MI: I))
2006 return false;
2007 return SIInstrInfo::isFPAtomic(MI: I);
2008 };
2009
2010 auto IsExpiredFn = [](const MachineInstr &MI, int WaitStates) {
2011 if (WaitStates >= 3 || SIInstrInfo::isVALU(MI))
2012 return true;
2013
2014 switch (MI.getOpcode()) {
2015 case AMDGPU::S_WAITCNT:
2016 case AMDGPU::S_WAITCNT_VSCNT:
2017 case AMDGPU::S_WAITCNT_VMCNT:
2018 case AMDGPU::S_WAITCNT_EXPCNT:
2019 case AMDGPU::S_WAITCNT_LGKMCNT:
2020 case AMDGPU::S_WAIT_IDLE:
2021 return true;
2022 default:
2023 break;
2024 }
2025
2026 return false;
2027 };
2028
2029 return FPAtomicToDenormModeWaitStates -
2030 ::getWaitStatesSince(IsHazard: IsHazardFn, MI, IsExpired: IsExpiredFn);
2031}
2032
2033int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
2034 assert(SIInstrInfo::isMAI(*MI));
2035
2036 return ST.hasGFX90AInsts() ? checkMAIHazards90A(MI) : checkMAIHazards908(MI);
2037}
2038
2039int GCNHazardRecognizer::checkMFMAPadding(MachineInstr *MI) {
2040 // Early exit if no padding is requested.
2041 if (MFMAPaddingRatio == 0)
2042 return 0;
2043
2044 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2045 if (!SIInstrInfo::isMFMA(MI: *MI) || MFI->getOccupancy() < 2)
2046 return 0;
2047
2048 int NeighborMFMALatency = 0;
2049 auto IsNeighboringMFMA = [&NeighborMFMALatency,
2050 this](const MachineInstr &MI) {
2051 if (!SIInstrInfo::isMFMA(MI))
2052 return false;
2053
2054 NeighborMFMALatency = this->getMFMAPipelineWaitStates(MI);
2055 return true;
2056 };
2057
2058 const int MaxMFMAPipelineWaitStates = 16;
2059 int WaitStatesSinceNeighborMFMA =
2060 getWaitStatesSince(IsHazard: IsNeighboringMFMA, Limit: MaxMFMAPipelineWaitStates);
2061
2062 int NeighborMFMAPaddingNeeded =
2063 (NeighborMFMALatency * MFMAPaddingRatio / 100) -
2064 WaitStatesSinceNeighborMFMA;
2065
2066 return std::max(a: 0, b: NeighborMFMAPaddingNeeded);
2067}
2068
2069int GCNHazardRecognizer::checkMAIHazards908(MachineInstr *MI) {
2070 int WaitStatesNeeded = 0;
2071 unsigned Opc = MI->getOpcode();
2072
2073 auto IsVALUFn = [](const MachineInstr &MI) {
2074 return SIInstrInfo::isVALU(MI) || MI.isInlineAsm();
2075 };
2076
2077 if (Opc != AMDGPU::V_ACCVGPR_READ_B32_e64) { // MFMA or v_accvgpr_write
2078 const int LegacyVALUWritesVGPRWaitStates = 2;
2079 const int VALUWritesExecWaitStates = 4;
2080 const int MaxWaitStates = 4;
2081
2082 int WaitStatesNeededForUse = VALUWritesExecWaitStates -
2083 getWaitStatesSinceDef(Reg: AMDGPU::EXEC, IsHazardDef: IsVALUFn, Limit: MaxWaitStates);
2084 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
2085
2086 if (WaitStatesNeeded < MaxWaitStates) {
2087 for (const MachineOperand &Use : MI->explicit_uses()) {
2088 const int MaxWaitStates = 2;
2089
2090 if (!Use.isReg() || !TRI.isVGPR(MRI: MF.getRegInfo(), Reg: Use.getReg()))
2091 continue;
2092
2093 int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
2094 getWaitStatesSinceDef(Reg: Use.getReg(), IsHazardDef: IsVALUFn, Limit: MaxWaitStates);
2095 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
2096
2097 if (WaitStatesNeeded == MaxWaitStates)
2098 break;
2099 }
2100 }
2101 }
2102
2103 for (const MachineOperand &Op : MI->explicit_operands()) {
2104 if (!Op.isReg() || !TRI.isAGPR(MRI: MF.getRegInfo(), Reg: Op.getReg()))
2105 continue;
2106
2107 if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2108 continue;
2109
2110 const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
2111 const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
2112 const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
2113 const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
2114 const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
2115 const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
2116 const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
2117 const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
2118 const int MaxWaitStates = 18;
2119 Register Reg = Op.getReg();
2120 unsigned HazardDefLatency = 0;
2121
2122 auto IsOverlappedMFMAFn = [Reg, &HazardDefLatency,
2123 this](const MachineInstr &MI) {
2124 if (!SIInstrInfo::isMFMA(MI))
2125 return false;
2126 Register DstReg = MI.getOperand(i: 0).getReg();
2127 if (DstReg == Reg)
2128 return false;
2129 HazardDefLatency =
2130 std::max(a: HazardDefLatency, b: TSchedModel.computeInstrLatency(MI: &MI));
2131 return TRI.regsOverlap(RegA: DstReg, RegB: Reg);
2132 };
2133
2134 int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsHazardDef: IsOverlappedMFMAFn,
2135 Limit: MaxWaitStates);
2136 int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
2137 int SrcCIdx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::src2);
2138 int OpNo = Op.getOperandNo();
2139 if (OpNo == SrcCIdx) {
2140 NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
2141 } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) {
2142 switch (HazardDefLatency) {
2143 case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
2144 break;
2145 case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
2146 break;
2147 case 16: [[fallthrough]];
2148 default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
2149 break;
2150 }
2151 } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
2152 switch (HazardDefLatency) {
2153 case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
2154 break;
2155 case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
2156 break;
2157 case 16: [[fallthrough]];
2158 default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
2159 break;
2160 }
2161 }
2162
2163 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2164 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
2165
2166 if (WaitStatesNeeded == MaxWaitStates)
2167 return WaitStatesNeeded; // Early exit.
2168
2169 auto IsAccVgprWriteFn = [Reg, this](const MachineInstr &MI) {
2170 if (MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2171 return false;
2172 Register DstReg = MI.getOperand(i: 0).getReg();
2173 return TRI.regsOverlap(RegA: Reg, RegB: DstReg);
2174 };
2175
2176 const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
2177 const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
2178 const int AccVGPRWriteAccVgprReadWaitStates = 3;
2179 NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
2180 if (OpNo == SrcCIdx)
2181 NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
2182 else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64)
2183 NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;
2184
2185 WaitStatesNeededForUse = NeedWaitStates -
2186 getWaitStatesSinceDef(Reg, IsHazardDef: IsAccVgprWriteFn, Limit: MaxWaitStates);
2187 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
2188
2189 if (WaitStatesNeeded == MaxWaitStates)
2190 return WaitStatesNeeded; // Early exit.
2191 }
2192
2193 if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
2194 const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
2195 const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
2196 const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
2197 const int MaxWaitStates = 13;
2198 Register DstReg = MI->getOperand(i: 0).getReg();
2199 unsigned HazardDefLatency = 0;
2200
2201 auto IsSrcCMFMAFn = [DstReg, &HazardDefLatency,
2202 this](const MachineInstr &MI) {
2203 if (!SIInstrInfo::isMFMA(MI))
2204 return false;
2205 Register Reg = TII.getNamedOperand(MI, OperandName: AMDGPU::OpName::src2)->getReg();
2206 HazardDefLatency =
2207 std::max(a: HazardDefLatency, b: TSchedModel.computeInstrLatency(MI: &MI));
2208 return TRI.regsOverlap(RegA: Reg, RegB: DstReg);
2209 };
2210
2211 int WaitStatesSince = getWaitStatesSince(IsHazard: IsSrcCMFMAFn, Limit: MaxWaitStates);
2212 int NeedWaitStates;
2213 switch (HazardDefLatency) {
2214 case 2: NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
2215 break;
2216 case 8: NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
2217 break;
2218 case 16: [[fallthrough]];
2219 default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
2220 break;
2221 }
2222
2223 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
2224 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
2225 }
2226
2227 // Pad neighboring MFMA with noops for better inter-wave performance.
2228 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: checkMFMAPadding(MI));
2229
2230 return WaitStatesNeeded;
2231}
2232
2233static int
2234GFX940_XDL_N_PassWritesVGPROverlappedXDLOrSMFMASrcCWaitStates(int NumPasses,
2235 bool IsGFX950) {
2236 // xdl def cycles | gfx940 | gfx950
2237 // 2 pass | 3 4
2238 // 4 pass | 5 6
2239 // 8 pass | 9 10
2240 // 16 pass | 17 18
2241 return NumPasses + 1 + IsGFX950;
2242}
2243
2244static int
2245GFX940_XDL_N_PassWritesVGPROverlappedSGEMMDGEMMSrcCWaitStates(int NumPasses,
2246 bool IsGFX950) {
2247 // xdl def cycles | gfx940 | gfx950
2248 // 2 pass | 3 3
2249 // 4 pass | 5 6
2250 // 8 pass | 9 10
2251 // 16 pass | 17 18
2252 return NumPasses + 1 + (NumPasses != 2 && IsGFX950);
2253}
2254
2255static int
2256GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(int NumPasses) {
2257 // 2 pass -> 2
2258 // 4 pass -> 4
2259 // 8 pass -> 8
2260 // 16 pass -> 16
2261 return NumPasses;
2262}
2263
2264static int
2265GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses) {
2266 // 2 pass -> 4
2267 // 4 pass -> 6
2268 // 8 pass -> 10
2269 // 16 pass -> 18
2270 return NumPasses + 2;
2271}
2272
2273static int GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses,
2274 bool IsGFX950) {
2275 // xdl def cycles | gfx942 | gfx950
2276 // 2 pass | 5 5
2277 // 4 pass | 7 8
2278 // 8 pass | 11 12
2279 // 16 pass | 19 20
2280 return NumPasses + 3 + (NumPasses != 2 && IsGFX950);
2281}
2282
2283int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) {
2284 int WaitStatesNeeded = 0;
2285 unsigned Opc = MI->getOpcode();
2286
2287 auto IsLegacyVALUFn = [](const MachineInstr &MI) {
2288 return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI);
2289 };
2290
2291 auto IsLegacyVALUNotDotFn = [](const MachineInstr &MI) {
2292 return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI) &&
2293 !SIInstrInfo::isDOT(MI);
2294 };
2295
2296 if (!SIInstrInfo::isMFMA(MI: *MI))
2297 return WaitStatesNeeded;
2298
2299 const int VALUWritesExecWaitStates = 4;
2300 int WaitStatesNeededForUse = VALUWritesExecWaitStates -
2301 getWaitStatesSinceDef(Reg: AMDGPU::EXEC, IsHazardDef: IsLegacyVALUFn,
2302 Limit: VALUWritesExecWaitStates);
2303 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
2304
2305 int SrcCIdx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::src2);
2306
2307  // This loop handles both DGEMM and S/HGEMM as the 2nd instruction.
2308 for (const MachineOperand &Use : MI->explicit_uses()) {
2309 const int LegacyVALUNotDotWritesVGPRWaitStates = 2;
2310 const int SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates = 2;
2311 const int SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates = 8;
2312 const int SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates = 16;
2313 const int SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates = 3;
2314 const int SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates = 9;
2315 const int SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates = 17;
2316 const int DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 9;
2317 const int GFX950_DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 17;
2318 const int DMFMA4x4WritesVGPROverlappedSrcCWaitStates = 4;
2319 const int SMFMA4x4WritesVGPROverlappedSrcABWaitStates = 5;
2320 const int SMFMA16x16WritesVGPROverlappedSrcABWaitStates = 11;
2321 const int SMFMA32x32WritesVGPROverlappedSrcABWaitStates = 19;
2322 const int DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates = 6;
2323 const int DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 11;
2324 const int GFX950_DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 19;
2325 const int DMFMA4x4WritesVGPRFullSrcCWaitStates = 4;
2326 const int GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates = 2;
2327 const int MaxWaitStates = 19;
2328
2329 if (!Use.isReg())
2330 continue;
2331 Register Reg = Use.getReg();
2332 bool FullReg;
2333 const MachineInstr *MI1;
2334
2335 auto IsOverlappedMFMAFn = [Reg, &FullReg, &MI1,
2336 this](const MachineInstr &MI) {
2337 if (!SIInstrInfo::isMFMA(MI))
2338 return false;
2339 Register DstReg = MI.getOperand(i: 0).getReg();
2340 FullReg = (DstReg == Reg);
2341 MI1 = &MI;
2342 return TRI.regsOverlap(RegA: DstReg, RegB: Reg);
2343 };
2344
2345 WaitStatesNeededForUse = LegacyVALUNotDotWritesVGPRWaitStates -
2346 getWaitStatesSinceDef(Reg, IsHazardDef: IsLegacyVALUNotDotFn, Limit: MaxWaitStates);
2347 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
2348
2349 int NumWaitStates =
2350 getWaitStatesSinceDef(Reg, IsHazardDef: IsOverlappedMFMAFn, Limit: MaxWaitStates);
2351 if (NumWaitStates == std::numeric_limits<int>::max())
2352 continue;
2353
2354 int OpNo = Use.getOperandNo();
2355 unsigned Opc1 = MI1->getOpcode();
2356 int NeedWaitStates = 0;
2357 if (OpNo == SrcCIdx) {
2358 if (!SIInstrInfo::isDGEMM(Opcode: Opc) &&
2359 (!ST.hasGFX940Insts() && SIInstrInfo::isDGEMM(Opcode: Opc1))) {
2360 NeedWaitStates = 0;
2361 } else if (FullReg) {
2362 if ((Opc == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
2363 Opc == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64) &&
2364 (Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
2365 Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64))
2366 NeedWaitStates = DMFMA4x4WritesVGPRFullSrcCWaitStates;
2367 else if (ST.hasGFX940Insts() &&
2368 TSchedModel.computeInstrLatency(MI: MI1) == 2)
2369 NeedWaitStates = GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates;
2370 } else {
2371 switch (Opc1) {
2372 case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
2373 case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
2374 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
2375 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
2376 if (!TII.isXDL(MI: *MI))
2377 NeedWaitStates =
2378 ST.hasGFX950Insts()
2379 ? GFX950_DMFMA16x16WritesVGPROverlappedSrcCWaitStates
2380 : DMFMA16x16WritesVGPROverlappedSrcCWaitStates;
2381 break;
2382 case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
2383 case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
2384 if (!TII.isXDL(MI: *MI))
2385 NeedWaitStates = DMFMA4x4WritesVGPROverlappedSrcCWaitStates;
2386 break;
2387 default:
2388 int NumPasses = TSchedModel.computeInstrLatency(MI: MI1);
2389 if (ST.hasGFX940Insts()) {
2390 if (TII.isXDL(MI: *MI) && !TII.isXDL(MI: *MI1))
2391 break;
2392
2393 NeedWaitStates =
2394 TII.isXDL(MI: *MI1)
2395 ? (TII.isXDL(MI: *MI)
2396 ? GFX940_XDL_N_PassWritesVGPROverlappedXDLOrSMFMASrcCWaitStates(
2397 NumPasses, IsGFX950: ST.hasGFX950Insts())
2398 : GFX940_XDL_N_PassWritesVGPROverlappedSGEMMDGEMMSrcCWaitStates(
2399 NumPasses, IsGFX950: ST.hasGFX950Insts()))
2400 : GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(
2401 NumPasses);
2402 break;
2403 }
2404
2405 switch (NumPasses) {
2406 case 2:
2407 NeedWaitStates =
2408 SIInstrInfo::isDGEMM(Opcode: Opc)
2409 ? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates
2410 : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates;
2411 break;
2412 case 8:
2413 NeedWaitStates =
2414 SIInstrInfo::isDGEMM(Opcode: Opc)
2415 ? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates
2416 : SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates;
2417 break;
2418 case 16:
2419 NeedWaitStates =
2420 SIInstrInfo::isDGEMM(Opcode: Opc)
2421 ? SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates
2422 : SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates;
2423 break;
2424 default:
2425 llvm_unreachable("unexpected number of passes");
2426 }
2427 }
2428 }
2429 } else {
2430 switch (Opc1) {
2431 case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
2432 case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
2433 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
2434 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
2435 NeedWaitStates =
2436 ST.hasGFX950Insts()
2437 ? GFX950_DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates
2438 : DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates;
2439 break;
2440 case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
2441 case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
2442 NeedWaitStates = DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates;
2443 break;
2444 default:
2445 int NumPasses = TSchedModel.computeInstrLatency(MI: MI1);
2446
2447 if (ST.hasGFX940Insts()) {
2448 NeedWaitStates =
2449 TII.isXDL(MI: *MI1)
2450 ? GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(
2451 NumPasses, IsGFX950: ST.hasGFX950Insts())
2452 : GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(
2453 NumPasses);
2454 break;
2455 }
2456
2457 switch (NumPasses) {
2458 case 2:
2459 NeedWaitStates = SMFMA4x4WritesVGPROverlappedSrcABWaitStates;
2460 break;
2461 case 4:
2462 llvm_unreachable("unexpected number of passes for mfma");
2463 case 8:
2464 NeedWaitStates = SMFMA16x16WritesVGPROverlappedSrcABWaitStates;
2465 break;
2466 case 16:
2467 default:
2468 NeedWaitStates = SMFMA32x32WritesVGPROverlappedSrcABWaitStates;
2469 }
2470 }
2471 }
2472 if (WaitStatesNeeded >= NeedWaitStates)
2473 continue;
2474
2475 WaitStatesNeededForUse = NeedWaitStates - NumWaitStates;
2476 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
2477
2478 if (WaitStatesNeeded == MaxWaitStates)
2479 break;
2480 }
2481
2482 // Pad neighboring MFMA with noops for better inter-wave performance.
2483 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: checkMFMAPadding(MI));
2484
2485 return WaitStatesNeeded;
2486}
2487
2488int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) {
2489 // On gfx90a+ relevant hazards are checked in checkMAIVALUHazards()
2490 if (!ST.hasMAIInsts() || ST.hasGFX90AInsts())
2491 return 0;
2492
2493 int WaitStatesNeeded = 0;
2494
2495 auto IsAccVgprReadFn = [](const MachineInstr &MI) {
2496 return MI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64;
2497 };
2498
2499 for (const MachineOperand &Op : MI->explicit_uses()) {
2500 if (!Op.isReg() || !TRI.isVGPR(MRI: MF.getRegInfo(), Reg: Op.getReg()))
2501 continue;
2502
2503 Register Reg = Op.getReg();
2504
2505 const int AccVgprReadLdStWaitStates = 2;
2506 const int VALUWriteAccVgprRdWrLdStDepVALUWaitStates = 1;
2507 const int MaxWaitStates = 2;
2508
2509 int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
2510 getWaitStatesSinceDef(Reg, IsHazardDef: IsAccVgprReadFn, Limit: MaxWaitStates);
2511 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
2512
2513 if (WaitStatesNeeded == MaxWaitStates)
2514 return WaitStatesNeeded; // Early exit.
2515
2516 auto IsVALUAccVgprRdWrCheckFn = [Reg, this](const MachineInstr &MI) {
2517 if (MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64 &&
2518 MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2519 return false;
2520 auto IsVALUFn = [](const MachineInstr &MI) {
2521 return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMAI(MI);
2522 };
2523 return getWaitStatesSinceDef(Reg, IsHazardDef: IsVALUFn, Limit: 2 /*MaxWaitStates*/) <
2524 std::numeric_limits<int>::max();
2525 };
2526
2527 WaitStatesNeededForUse = VALUWriteAccVgprRdWrLdStDepVALUWaitStates -
2528 getWaitStatesSince(IsHazard: IsVALUAccVgprRdWrCheckFn, Limit: MaxWaitStates);
2529 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
2530 }
2531
2532 return WaitStatesNeeded;
2533}
2534
2535int GCNHazardRecognizer::checkPermlaneHazards(MachineInstr *MI) {
2536 assert(!ST.hasVcmpxPermlaneHazard() &&
2537 "this is a different vcmpx+permlane hazard");
2538 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2539 const SIInstrInfo *TII = ST.getInstrInfo();
2540
2541 auto IsVCmpXWritesExecFn = [TII, TRI](const MachineInstr &MI) {
2542 return isVCmpXWritesExec(TII: *TII, TRI: *TRI, MI);
2543 };
2544
2545 auto IsVALUFn = [](const MachineInstr &MI) {
2546 return SIInstrInfo::isVALU(MI);
2547 };
2548
2549 const int VCmpXWritesExecWaitStates = 4;
2550 const int VALUWritesVDstWaitStates = 2;
2551 int WaitStatesNeeded = 0;
2552
2553 for (const MachineOperand &Op : MI->explicit_uses()) {
2554 if (!Op.isReg() || !TRI->isVGPR(MRI: MF.getRegInfo(), Reg: Op.getReg()))
2555 continue;
2556 Register Reg = Op.getReg();
2557
2558 int WaitStatesSinceDef =
2559 VALUWritesVDstWaitStates -
2560 getWaitStatesSinceDef(Reg, IsHazardDef: IsVALUFn,
2561 /*MaxWaitStates=*/Limit: VALUWritesVDstWaitStates);
2562 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesSinceDef);
2563 if (WaitStatesNeeded >= VALUWritesVDstWaitStates)
2564 break;
2565 }
2566
2567 int VCmpXHazardWaits =
2568 VCmpXWritesExecWaitStates -
2569 getWaitStatesSince(IsHazard: IsVCmpXWritesExecFn, Limit: VCmpXWritesExecWaitStates);
2570
2571 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: VCmpXHazardWaits);
2572 return WaitStatesNeeded;
2573}
2574
2575static int GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(int NumPasses) {
2576 // 2 pass -> 4
2577 // 4 pass -> 6
2578 // 8 pass -> 10
2579 // 16 pass -> 18
2580 return NumPasses + 2;
2581}
2582
2583static int GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(int NumPasses,
2584 bool IsGFX950) {
2585 // xdl def cycles | gfx942 | gfx950
2586 // 2 pass | 5 5
2587 // 4 pass | 7 8
2588 // 8 pass | 11 12
2589 // 16 pass | 19 20
2590 return NumPasses + 3 + (NumPasses != 2 && IsGFX950);
2591}
2592
2593static int GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses,
2594 bool IsGFX950) {
2595 // xdl def cycles | gfx942 | gfx950
2596 // 2 pass | 5 5
2597 // 4 pass | 7 8
2598 // 8 pass | 11 12
2599 // 16 pass | 19 20
2600 return NumPasses + 3 + (NumPasses != 2 && IsGFX950);
2601}
2602
2603static int GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses) {
2604 // 2 pass -> 4
2605 // 4 pass -> 6
2606 // 8 pass -> 10
2607 // 16 pass -> 18
2608 return NumPasses + 2;
2609}
2610
2611int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) {
2612 if (!ST.hasGFX90AInsts())
2613 return 0;
2614
2615 auto IsDGEMMFn = [](const MachineInstr &MI) -> bool {
2616 return SIInstrInfo::isDGEMM(Opcode: MI.getOpcode());
2617 };
2618
2619 // This is checked in checkMAIHazards90A()
2620 if (SIInstrInfo::isMFMA(MI: *MI))
2621 return 0;
2622
2623 const MachineRegisterInfo &MRI = MF.getRegInfo();
2624
2625 int WaitStatesNeeded = 0;
2626
2627 bool IsMem = SIInstrInfo::isVMEM(MI: *MI) || SIInstrInfo::isDS(MI: *MI);
2628 bool IsMemOrExport = IsMem || SIInstrInfo::isEXP(MI: *MI);
2629 bool IsVALU = SIInstrInfo::isVALU(MI: *MI);
2630
2631 const MachineInstr *MFMA = nullptr;
2632 unsigned Reg;
2633 auto IsMFMAWriteFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
2634 if (!SIInstrInfo::isMFMA(MI) ||
2635 !TRI.regsOverlap(RegA: MI.getOperand(i: 0).getReg(), RegB: Reg))
2636 return false;
2637 MFMA = &MI;
2638 return true;
2639 };
2640
2641 const MachineInstr *DOT = nullptr;
2642 auto IsDotWriteFn = [&Reg, &DOT, this](const MachineInstr &MI) {
2643 if (!SIInstrInfo::isDOT(MI) ||
2644 !TRI.regsOverlap(RegA: MI.getOperand(i: 0).getReg(), RegB: Reg))
2645 return false;
2646 DOT = &MI;
2647 return true;
2648 };
2649
2650 bool DGEMMAfterVALUWrite = false;
2651 auto IsDGEMMHazard = [&DGEMMAfterVALUWrite, this](const MachineInstr &MI) {
2652 // Found DGEMM on reverse traversal to def.
2653 if (SIInstrInfo::isDGEMM(Opcode: MI.getOpcode()))
2654 DGEMMAfterVALUWrite = true;
2655
2656    // Only a hazard if the register is defined by a VALU and a DGEMM is
2657    // found after the def.
2658 if (!TII.isVALU(MI) || !DGEMMAfterVALUWrite)
2659 return false;
2660
2661 return true;
2662 };
2663
2664 int SrcCIdx = AMDGPU::getNamedOperandIdx(Opcode: MI->getOpcode(),
2665 Name: AMDGPU::OpName::src2);
2666
2667 if (IsMemOrExport || IsVALU) {
2668 const int SMFMA4x4WriteVgprVALUMemExpReadWaitStates = 5;
2669 const int SMFMA16x16WriteVgprVALUMemExpReadWaitStates = 11;
2670 const int SMFMA32x32WriteVgprVALUMemExpReadWaitStates = 19;
2671 const int DMFMA4x4WriteVgprMemExpReadWaitStates = 9;
2672 const int DMFMA16x16WriteVgprMemExpReadWaitStates = 18;
2673 const int DMFMA4x4WriteVgprVALUReadWaitStates = 6;
2674 const int DMFMA16x16WriteVgprVALUReadWaitStates = 11;
2675 const int GFX950_DMFMA16x16WriteVgprVALUReadWaitStates = 19;
2676 const int DotWriteSameDotReadSrcAB = 3;
2677 const int DotWriteDifferentVALURead = 3;
2678 const int DMFMABetweenVALUWriteVMEMRead = 2;
2679 const int MaxWaitStates = 19;
2680
2681 for (const MachineOperand &Use : MI->explicit_uses()) {
2682 if (!Use.isReg())
2683 continue;
2684 Reg = Use.getReg();
2685
2686 DOT = nullptr;
2687 int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsHazardDef: IsDotWriteFn,
2688 Limit: MaxWaitStates);
2689 if (DOT) {
2690 int NeedWaitStates = 0;
2691 if (DOT->getOpcode() == MI->getOpcode()) {
2692 if (&Use - &MI->getOperand(i: 0) != SrcCIdx)
2693 NeedWaitStates = DotWriteSameDotReadSrcAB;
2694 } else {
2695 NeedWaitStates = DotWriteDifferentVALURead;
2696 }
2697
2698 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2699 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
2700 }
2701
2702      // Workaround for a HW data hazard bug observed only on GFX90A. When there
2703      // is a DGEMM instruction in-between a VALU and a VMEM instruction, it
2704      // causes the SQ to incorrectly omit the two wait states between the two
2705      // instructions that are needed to avoid the data hazard.
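      // Illustrative sequence (hypothetical registers):
      //   v_mov_b32 v0, v1                 ; VALU writes v0
      //   v_mfma_f64_16x16x4f64 ...        ; DGEMM in between
      //   flat_store_dword v[2:3], v0      ; VMEM reads v0
      // The DGEMM suppresses the wait states the SQ would otherwise insert,
      // so they are inserted explicitly here.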
2706 if (IsMem && ST.hasGFX90AInsts() && !ST.hasGFX940Insts()) {
2707 DGEMMAfterVALUWrite = false;
2708 if (TRI.isVectorRegister(MRI, Reg)) {
2709 int WaitStatesNeededForUse =
2710 DMFMABetweenVALUWriteVMEMRead -
2711 getWaitStatesSinceDef(Reg, IsHazardDef: IsDGEMMHazard,
2712 Limit: DMFMABetweenVALUWriteVMEMRead);
2713
2714 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
2715 }
2716 }
2717
2718 MFMA = nullptr;
2719 WaitStatesSinceDef =
2720 getWaitStatesSinceDef(Reg, IsHazardDef: IsMFMAWriteFn, Limit: MaxWaitStates);
2721 if (!MFMA)
2722 continue;
2723
2724 unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MI: MFMA);
2725 int NumPasses = HazardDefLatency;
2726 int NeedWaitStates = MaxWaitStates;
2727
2728 if (SIInstrInfo::isDGEMM(Opcode: MFMA->getOpcode())) {
2729 switch (HazardDefLatency) {
2730 case 4:
2731 NeedWaitStates = IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates
2732 : DMFMA4x4WriteVgprVALUReadWaitStates;
2733 break;
2734 case 8:
2735 case 16:
2736 NeedWaitStates =
2737 IsMemOrExport
2738 ? DMFMA16x16WriteVgprMemExpReadWaitStates
2739 : (ST.hasGFX950Insts()
2740 ? GFX950_DMFMA16x16WriteVgprVALUReadWaitStates
2741 : DMFMA16x16WriteVgprVALUReadWaitStates);
2742 break;
2743 default:
2744 llvm_unreachable("unexpected dgemm");
2745 }
2746 } else if (ST.hasGFX940Insts()) {
2747 NeedWaitStates =
2748 TII.isXDL(MI: *MFMA)
2749 ? GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(
2750 NumPasses, IsGFX950: ST.hasGFX950Insts())
2751 : GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(
2752 NumPasses);
2753 } else {
2754 switch (HazardDefLatency) {
2755 case 2:
2756 NeedWaitStates = SMFMA4x4WriteVgprVALUMemExpReadWaitStates;
2757 break;
2758 case 8:
2759 NeedWaitStates = SMFMA16x16WriteVgprVALUMemExpReadWaitStates;
2760 break;
2761 case 16:
2762 NeedWaitStates = SMFMA32x32WriteVgprVALUMemExpReadWaitStates;
2763 break;
2764 default:
2765 llvm_unreachable("unexpected number of passes for mfma");
2766 }
2767 }
2768
2769 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2770 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
2771
2772 if (WaitStatesNeeded == MaxWaitStates)
2773 break;
2774 }
2775 }
2776
2777 unsigned Opc = MI->getOpcode();
2778 const int DMFMAToFMA64WaitStates = 2;
2779 if ((Opc == AMDGPU::V_FMA_F64_e64 ||
2780 Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64 ||
2781 Opc == AMDGPU::V_FMAC_F64_dpp) &&
2782 WaitStatesNeeded < DMFMAToFMA64WaitStates) {
2783 int WaitStatesNeededForUse = DMFMAToFMA64WaitStates -
2784 getWaitStatesSince(IsHazard: IsDGEMMFn, Limit: DMFMAToFMA64WaitStates);
2785 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
2786 }
2787
2788 if (!IsVALU && !IsMemOrExport)
2789 return WaitStatesNeeded;
2790
2791 for (const MachineOperand &Def : MI->defs()) {
2792 const int SMFMA4x4WriteVgprVALUWawWaitStates = 5;
2793 const int SMFMA16x16WriteVgprVALUWawWaitStates = 11;
2794 const int SMFMA32x32WriteVgprVALUWawWaitStates = 19;
2795 const int SMFMA4x4ReadVgprVALUWarWaitStates = 1;
2796 const int GFX940_XDL4PassReadVgprVALUWarWaitStates = 3;
2797 const int SMFMA16x16ReadVgprVALUWarWaitStates = 7;
2798 const int SMFMA32x32ReadVgprVALUWarWaitStates = 15;
2799 const int DMFMA4x4WriteVgprVALUWriteWaitStates = 6;
2800 const int DMFMA16x16WriteVgprVALUWriteWaitStates = 11;
2801 const int DotWriteDifferentVALUWrite = 3;
2802 const int MaxWaitStates = 19;
2803 const int MaxWarWaitStates = 15;
2804
2805 Reg = Def.getReg();
2806
2807 DOT = nullptr;
2808 int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsHazardDef: IsDotWriteFn,
2809 Limit: MaxWaitStates);
2810 if (DOT && DOT->getOpcode() != MI->getOpcode())
2811 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: DotWriteDifferentVALUWrite -
2812 WaitStatesSinceDef);
2813
2814 MFMA = nullptr;
2815 WaitStatesSinceDef =
2816 getWaitStatesSinceDef(Reg, IsHazardDef: IsMFMAWriteFn, Limit: MaxWaitStates);
2817 if (MFMA) {
2818 int NeedWaitStates = MaxWaitStates;
2819 int NumPasses = TSchedModel.computeInstrLatency(MI: MFMA);
2820
2821 if (SIInstrInfo::isDGEMM(Opcode: MFMA->getOpcode())) {
2822 switch (NumPasses) {
2823 case 4:
2824 NeedWaitStates = DMFMA4x4WriteVgprVALUWriteWaitStates;
2825 break;
2826 case 8:
2827 case 16:
2828 NeedWaitStates = DMFMA16x16WriteVgprVALUWriteWaitStates;
2829 break;
2830 default:
2831 llvm_unreachable("unexpected number of cycles for dgemm");
2832 }
2833 } else if (ST.hasGFX940Insts()) {
2834 NeedWaitStates =
2835 TII.isXDL(MI: *MFMA)
2836 ? GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(
2837 NumPasses, IsGFX950: ST.hasGFX950Insts())
2838 : GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(NumPasses);
2839 } else {
2840 switch (NumPasses) {
2841 case 2:
2842 NeedWaitStates = SMFMA4x4WriteVgprVALUWawWaitStates;
2843 break;
2844 case 8:
2845 NeedWaitStates = SMFMA16x16WriteVgprVALUWawWaitStates;
2846 break;
2847 case 16:
2848 NeedWaitStates = SMFMA32x32WriteVgprVALUWawWaitStates;
2849 break;
2850 default:
2851 llvm_unreachable("Unexpected number of passes for mfma");
2852 }
2853 }
2854
2855 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2856 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
2857
2858 if (WaitStatesNeeded == MaxWaitStates)
2859 break;
2860 }
2861
2862 auto IsSMFMAReadAsCFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
2863 if (!SIInstrInfo::isMFMA(MI) || SIInstrInfo::isDGEMM(Opcode: MI.getOpcode()) ||
2864 !MI.readsRegister(Reg, TRI: &TRI))
2865 return false;
2866
2867 if (ST.hasGFX940Insts() && !TII.isXDL(MI))
2868 return false;
2869
2870 const MachineOperand *SrcC =
2871 TII.getNamedOperand(MI, OperandName: AMDGPU::OpName::src2);
2872 assert(SrcC);
2873 if (!SrcC->isReg() || !TRI.regsOverlap(RegA: SrcC->getReg(), RegB: Reg))
2874 return false;
2875
2876 MFMA = &MI;
2877 return true;
2878 };
2879
2880 MFMA = nullptr;
2881 int WaitStatesSinceUse = getWaitStatesSince(IsHazard: IsSMFMAReadAsCFn,
2882 Limit: MaxWarWaitStates);
2883 if (!MFMA)
2884 continue;
2885
2886 unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MI: MFMA);
2887 int NeedWaitStates = MaxWaitStates;
2888 switch (HazardDefLatency) {
2889 case 2: NeedWaitStates = SMFMA4x4ReadVgprVALUWarWaitStates;
2890 break;
2891 case 4: assert(ST.hasGFX940Insts());
2892 NeedWaitStates = GFX940_XDL4PassReadVgprVALUWarWaitStates;
2893 break;
2894 case 8: NeedWaitStates = SMFMA16x16ReadVgprVALUWarWaitStates;
2895 break;
2896 case 16: [[fallthrough]];
2897 default: NeedWaitStates = SMFMA32x32ReadVgprVALUWarWaitStates;
2898 break;
2899 }
2900
2901 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceUse;
2902 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
2903 }
2904
2905 return WaitStatesNeeded;
2906}
2907
2908bool GCNHazardRecognizer::ShouldPreferAnother(SUnit *SU) {
2909 if (!SU->isInstr())
2910 return false;
2911
2912 const MachineInstr *MAI = nullptr;
2913
2914 auto IsMFMAFn = [&MAI](const MachineInstr &MI) {
2915 MAI = nullptr;
2916 if (SIInstrInfo::isMFMA(MI))
2917 MAI = &MI;
2918 return MAI != nullptr;
2919 };
2920
2921 MachineInstr *MI = SU->getInstr();
2922 if (IsMFMAFn(*MI)) {
2923 int W = getWaitStatesSince(IsHazard: IsMFMAFn, Limit: 16);
2924 if (MAI)
2925 return W < (int)TSchedModel.computeInstrLatency(MI: MAI);
2926 }
2927
2928 return false;
2929}
2930
2931// Adjust global offsets for instructions bundled with S_GETPC_B64 after
2932// insertion of a new instruction.
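// Schematic example of such a bundle (operand syntax abbreviated):
//   s_getpc_b64 s[0:1]
//   s_waitcnt_depctr ...                 ; newly inserted, 4 bytes
//   s_add_u32  s0, s0, sym@rel32@lo+...
//   s_addc_u32 s1, s1, sym@rel32@hi+...
// The pc-relative references now sit 4 bytes further from the captured PC,
// so their global offsets are bumped by 4 (NewBytes) below.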
2933static void updateGetPCBundle(MachineInstr *NewMI) {
2934 if (!NewMI->isBundled())
2935 return;
2936
2937 // Find start of bundle.
2938 auto I = NewMI->getIterator();
2939 while (I->isBundledWithPred())
2940 I--;
2941 if (I->isBundle())
2942 I++;
2943
2944 // Bail if this is not an S_GETPC bundle.
2945 if (I->getOpcode() != AMDGPU::S_GETPC_B64)
2946 return;
2947
2948 // Update offsets of any references in the bundle.
2949 const unsigned NewBytes = 4;
2950 assert(NewMI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
2951 "Unexpected instruction insertion in bundle");
2952 auto NextMI = std::next(x: NewMI->getIterator());
2953 auto End = NewMI->getParent()->end();
2954 while (NextMI != End && NextMI->isBundledWithPred()) {
2955 for (auto &Operand : NextMI->operands()) {
2956 if (Operand.isGlobal())
2957 Operand.setOffset(Operand.getOffset() + NewBytes);
2958 }
2959 NextMI++;
2960 }
2961}
2962
2963bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
2964 if (!ST.hasVALUMaskWriteHazard())
2965 return false;
2966 assert(!ST.hasExtendedWaitCounts());
2967
2968 if (!ST.isWave64() || !SIInstrInfo::isSALU(MI: *MI))
2969 return false;
2970
2971 // The hazard sequence is three instructions:
2972 // 1. VALU reads SGPR as mask
2973 // 2. SALU writes SGPR
2974 // 3. SALU reads SGPR
2975  // The hazard can expire if the distance between 2 and 3 is sufficient.
2976  // In practice this happens <10% of the time, hence we always assume the
2977  // hazard exists if 1 and 2 are present, to avoid the search.
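  // Illustrative sequence (hypothetical registers):
  //   v_cndmask_b32 v0, v1, v2, s[4:5]   ; 1. VALU reads s[4:5] as mask
  //   s_mov_b64 s[4:5], s[6:7]           ; 2. SALU writes s[4:5]  (this MI)
  //   s_and_b64 s[8:9], s[4:5], s[6:7]   ; 3. SALU reads s[4:5]
  // The fix appends s_waitcnt_depctr sa_sdst(0) immediately after 2.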
2978
2979 const MachineOperand *SDSTOp = TII.getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::sdst);
2980 if (!SDSTOp || !SDSTOp->isReg())
2981 return false;
2982
2983 const Register HazardReg = SDSTOp->getReg();
2984 if (HazardReg == AMDGPU::EXEC ||
2985 HazardReg == AMDGPU::EXEC_LO ||
2986 HazardReg == AMDGPU::EXEC_HI ||
2987 HazardReg == AMDGPU::M0)
2988 return false;
2989
2990 auto IsHazardFn = [HazardReg, this](const MachineInstr &I) {
2991 switch (I.getOpcode()) {
2992 case AMDGPU::V_ADDC_U32_e32:
2993 case AMDGPU::V_ADDC_U32_dpp:
2994 case AMDGPU::V_CNDMASK_B16_t16_e32:
2995 case AMDGPU::V_CNDMASK_B16_fake16_e32:
2996 case AMDGPU::V_CNDMASK_B16_t16_dpp:
2997 case AMDGPU::V_CNDMASK_B16_fake16_dpp:
2998 case AMDGPU::V_CNDMASK_B32_e32:
2999 case AMDGPU::V_CNDMASK_B32_dpp:
3000 case AMDGPU::V_DIV_FMAS_F32_e64:
3001 case AMDGPU::V_DIV_FMAS_F64_e64:
3002 case AMDGPU::V_SUBB_U32_e32:
3003 case AMDGPU::V_SUBB_U32_dpp:
3004 case AMDGPU::V_SUBBREV_U32_e32:
3005 case AMDGPU::V_SUBBREV_U32_dpp:
3006 // These implicitly read VCC as mask source.
3007 return HazardReg == AMDGPU::VCC ||
3008 HazardReg == AMDGPU::VCC_LO ||
3009 HazardReg == AMDGPU::VCC_HI;
3010 case AMDGPU::V_ADDC_U32_e64:
3011 case AMDGPU::V_ADDC_U32_e64_dpp:
3012 case AMDGPU::V_CNDMASK_B16_t16_e64:
3013 case AMDGPU::V_CNDMASK_B16_fake16_e64:
3014 case AMDGPU::V_CNDMASK_B16_t16_e64_dpp:
3015 case AMDGPU::V_CNDMASK_B16_fake16_e64_dpp:
3016 case AMDGPU::V_CNDMASK_B32_e64:
3017 case AMDGPU::V_CNDMASK_B32_e64_dpp:
3018 case AMDGPU::V_SUBB_U32_e64:
3019 case AMDGPU::V_SUBB_U32_e64_dpp:
3020 case AMDGPU::V_SUBBREV_U32_e64:
3021 case AMDGPU::V_SUBBREV_U32_e64_dpp: {
3022 // Only check mask register overlaps.
3023 const MachineOperand *SSRCOp = TII.getNamedOperand(MI: I, OperandName: AMDGPU::OpName::src2);
3024 assert(SSRCOp);
3025 return TRI.regsOverlap(RegA: SSRCOp->getReg(), RegB: HazardReg);
3026 }
3027 default:
3028 return false;
3029 }
3030 };
3031
3032 const MachineRegisterInfo &MRI = MF.getRegInfo();
3033 auto IsExpiredFn = [&MRI, this](const MachineInstr &I, int) {
3034    // s_waitcnt_depctr sa_sdst(0) mitigates the hazard.
3035 if (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
3036 AMDGPU::DepCtr::decodeFieldSaSdst(Encoded: I.getOperand(i: 0).getImm()) == 0)
3037 return true;
3038
3039    // A VALU access to any SGPR or literal constant other than HazardReg
3040    // mitigates the hazard. There is no need to check HazardReg here as this
3041    // will only be called when !IsHazardFn.
3042 if (!SIInstrInfo::isVALU(MI: I))
3043 return false;
3044 for (int OpNo = 0, End = I.getNumOperands(); OpNo < End; ++OpNo) {
3045 const MachineOperand &Op = I.getOperand(i: OpNo);
3046 if (Op.isReg()) {
3047 Register OpReg = Op.getReg();
3048 // Only consider uses
3049 if (!Op.isUse())
3050 continue;
3051 // Ignore EXEC
3052 if (OpReg == AMDGPU::EXEC ||
3053 OpReg == AMDGPU::EXEC_LO ||
3054 OpReg == AMDGPU::EXEC_HI)
3055 continue;
3056 // Ignore all implicit uses except VCC
3057 if (Op.isImplicit()) {
3058 if (OpReg == AMDGPU::VCC ||
3059 OpReg == AMDGPU::VCC_LO ||
3060 OpReg == AMDGPU::VCC_HI)
3061 return true;
3062 continue;
3063 }
3064 if (TRI.isSGPRReg(MRI, Reg: OpReg))
3065 return true;
3066 } else {
3067 const MCInstrDesc &InstDesc = I.getDesc();
3068 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
3069 if (!TII.isInlineConstant(MO: Op, OpInfo))
3070 return true;
3071 }
3072 }
3073 return false;
3074 };
3075
3076 // Check for hazard
3077 if (::getWaitStatesSince(IsHazard: IsHazardFn, MI, IsExpired: IsExpiredFn) ==
3078 std::numeric_limits<int>::max())
3079 return false;
3080
3081 auto NextMI = std::next(x: MI->getIterator());
3082
3083 // Add s_waitcnt_depctr sa_sdst(0) after SALU write.
3084 auto NewMI = BuildMI(BB&: *MI->getParent(), I: NextMI, MIMD: MI->getDebugLoc(),
3085 MCID: TII.get(Opcode: AMDGPU::S_WAITCNT_DEPCTR))
3086 .addImm(Val: AMDGPU::DepCtr::encodeFieldSaSdst(SaSdst: 0));
3087
3088 // SALU write may be s_getpc in a bundle.
3089 updateGetPCBundle(NewMI);
3090
3091 return true;
3092}
3093
3094static bool ensureEntrySetPrio(MachineFunction *MF, int Priority,
3095 const SIInstrInfo &TII) {
3096 MachineBasicBlock &EntryMBB = MF->front();
3097 if (EntryMBB.begin() != EntryMBB.end()) {
3098 auto &EntryMI = *EntryMBB.begin();
3099 if (EntryMI.getOpcode() == AMDGPU::S_SETPRIO &&
3100 EntryMI.getOperand(i: 0).getImm() >= Priority)
3101 return false;
3102 }
3103
3104 BuildMI(BB&: EntryMBB, I: EntryMBB.begin(), MIMD: DebugLoc(), MCID: TII.get(Opcode: AMDGPU::S_SETPRIO))
3105 .addImm(Val: Priority);
3106 return true;
3107}
3108
3109bool GCNHazardRecognizer::fixRequiredExportPriority(MachineInstr *MI) {
3110 if (!ST.hasRequiredExportPriority())
3111 return false;
3112
3113 // Assume the following shader types will never have exports,
3114 // and avoid adding or adjusting S_SETPRIO.
3115 MachineBasicBlock *MBB = MI->getParent();
3116 MachineFunction *MF = MBB->getParent();
3117 auto CC = MF->getFunction().getCallingConv();
3118 switch (CC) {
3119 case CallingConv::AMDGPU_CS:
3120 case CallingConv::AMDGPU_CS_Chain:
3121 case CallingConv::AMDGPU_CS_ChainPreserve:
3122 case CallingConv::AMDGPU_KERNEL:
3123 return false;
3124 default:
3125 break;
3126 }
3127
3128 const int MaxPriority = 3;
3129 const int NormalPriority = 2;
3130 const int PostExportPriority = 0;
3131
3132 auto It = MI->getIterator();
3133 switch (MI->getOpcode()) {
3134 case AMDGPU::S_ENDPGM:
3135 case AMDGPU::S_ENDPGM_SAVED:
3136 case AMDGPU::S_ENDPGM_ORDERED_PS_DONE:
3137 case AMDGPU::SI_RETURN_TO_EPILOG:
3138    // Ensure a shader with calls raises its priority at entry.
3139    // This guarantees the correct priority if exports exist in a callee.
3140 if (MF->getFrameInfo().hasCalls())
3141 return ensureEntrySetPrio(MF, Priority: NormalPriority, TII);
3142 return false;
3143 case AMDGPU::S_SETPRIO: {
3144    // Raise to the minimum priority unless already inside the workaround
3144    // sequence.
3145 auto &PrioOp = MI->getOperand(i: 0);
3146 int Prio = PrioOp.getImm();
3147 bool InWA = (Prio == PostExportPriority) &&
3148 (It != MBB->begin() && TII.isEXP(MI: *std::prev(x: It)));
3149 if (InWA || Prio >= NormalPriority)
3150 return false;
3151 PrioOp.setImm(std::min(a: Prio + NormalPriority, b: MaxPriority));
3152 return true;
3153 }
3154 default:
3155 if (!TII.isEXP(MI: *MI))
3156 return false;
3157 break;
3158 }
3159
3160 // Check entry priority at each export (as there will only be a few).
3161 // Note: amdgpu_gfx can only be a callee, so defer to caller setprio.
3162 bool Changed = false;
3163 if (CC != CallingConv::AMDGPU_Gfx)
3164 Changed = ensureEntrySetPrio(MF, Priority: NormalPriority, TII);
3165
3166 auto NextMI = std::next(x: It);
3167 bool EndOfShader = false;
3168 if (NextMI != MBB->end()) {
3169 // Only need WA at end of sequence of exports.
3170 if (TII.isEXP(MI: *NextMI))
3171 return Changed;
3172 // Assume appropriate S_SETPRIO after export means WA already applied.
3173 if (NextMI->getOpcode() == AMDGPU::S_SETPRIO &&
3174 NextMI->getOperand(i: 0).getImm() == PostExportPriority)
3175 return Changed;
3176 EndOfShader = NextMI->getOpcode() == AMDGPU::S_ENDPGM;
3177 }
3178
3179 const DebugLoc &DL = MI->getDebugLoc();
3180
3181 // Lower priority.
3182 BuildMI(BB&: *MBB, I: NextMI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_SETPRIO))
3183 .addImm(Val: PostExportPriority);
3184
3185 if (!EndOfShader) {
3186 // Wait for exports to complete.
3187 BuildMI(BB&: *MBB, I: NextMI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_WAITCNT_EXPCNT))
3188 .addReg(RegNo: AMDGPU::SGPR_NULL)
3189 .addImm(Val: 0);
3190 }
3191
3192 BuildMI(BB&: *MBB, I: NextMI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_NOP)).addImm(Val: 0);
3193 BuildMI(BB&: *MBB, I: NextMI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_NOP)).addImm(Val: 0);
3194
3195 if (!EndOfShader) {
3196 // Return to normal (higher) priority.
3197 BuildMI(BB&: *MBB, I: NextMI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_SETPRIO))
3198 .addImm(Val: NormalPriority);
3199 }
3200
3201 return true;
3202}
3203