1//===-- GCNHazardRecognizers.cpp - GCN Hazard Recognizer Impls ------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements hazard recognizers for scheduling on GCN processors.
10//
11//===----------------------------------------------------------------------===//
12
13#include "GCNHazardRecognizer.h"
14#include "GCNSubtarget.h"
15#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
16#include "SIMachineFunctionInfo.h"
17#include "llvm/CodeGen/MachineFrameInfo.h"
18#include "llvm/CodeGen/MachineFunction.h"
19#include "llvm/CodeGen/MachineInstrBuilder.h"
20#include "llvm/CodeGen/ScheduleDAG.h"
21#include "llvm/TargetParser/TargetParser.h"
22
23using namespace llvm;
24
25namespace {
26
27struct MFMAPaddingRatioParser : public cl::parser<unsigned> {
28 MFMAPaddingRatioParser(cl::Option &O) : cl::parser<unsigned>(O) {}
29
30 bool parse(cl::Option &O, StringRef ArgName, StringRef Arg, unsigned &Value) {
31 if (Arg.getAsInteger(Radix: 0, Result&: Value))
32 return O.error(Message: "'" + Arg + "' value invalid for uint argument!");
33
34 if (Value > 100)
35 return O.error(Message: "'" + Arg + "' value must be in the range [0, 100]!");
36
37 return false;
38 }
39};
40
41} // end anonymous namespace
42
43static cl::opt<unsigned, false, MFMAPaddingRatioParser>
44 MFMAPaddingRatio("amdgpu-mfma-padding-ratio", cl::init(Val: 0), cl::Hidden,
45 cl::desc("Fill a percentage of the latency between "
46 "neighboring MFMA with s_nops."));
47
48// This is intended for debugging purposes only.
49static cl::opt<unsigned>
50 NopPadding("amdgpu-snop-padding", cl::init(Val: 0), cl::Hidden,
               cl::desc("Insert an s_nop x before every instruction"));
52
53//===----------------------------------------------------------------------===//
54// Hazard Recognizer Implementation
55//===----------------------------------------------------------------------===//
56
57static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
58 const GCNSubtarget &ST);
59
60GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF)
61 : IsHazardRecognizerMode(false), CurrCycleInstr(nullptr), MF(MF),
62 ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()),
63 TRI(TII.getRegisterInfo()), TSchedModel(TII.getSchedModel()),
64 ClauseUses(TRI.getNumRegUnits()), ClauseDefs(TRI.getNumRegUnits()) {
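  // If AGPRs are used, the function may contain MFMA instructions, whose
  // hazards need a much longer lookahead window than other hazards.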
65 MaxLookAhead = MF.getRegInfo().isPhysRegUsed(PhysReg: AMDGPU::AGPR0) ? 19 : 5;
66 RunLdsBranchVmemWARHazardFixup = shouldRunLdsBranchVmemWARHazardFixup(MF, ST);
67}
68
69void GCNHazardRecognizer::Reset() {
70 EmittedInstrs.clear();
71}
72
73void GCNHazardRecognizer::EmitInstruction(SUnit *SU) {
74 EmitInstruction(MI: SU->getInstr());
75}
76
77void GCNHazardRecognizer::EmitInstruction(MachineInstr *MI) {
78 CurrCycleInstr = MI;
79}
80
81static bool isDivFMas(unsigned Opcode) {
82 return Opcode == AMDGPU::V_DIV_FMAS_F32_e64 || Opcode == AMDGPU::V_DIV_FMAS_F64_e64;
83}
84
85static bool isSGetReg(unsigned Opcode) {
86 return Opcode == AMDGPU::S_GETREG_B32 || Opcode == AMDGPU::S_GETREG_B32_const;
87}
88
89static bool isSSetReg(unsigned Opcode) {
90 switch (Opcode) {
91 case AMDGPU::S_SETREG_B32:
92 case AMDGPU::S_SETREG_B32_mode:
93 case AMDGPU::S_SETREG_IMM32_B32:
94 case AMDGPU::S_SETREG_IMM32_B32_mode:
95 return true;
96 }
97 return false;
98}
99
100static bool isRWLane(unsigned Opcode) {
101 return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;
102}
103
104static bool isRFE(unsigned Opcode) {
105 return Opcode == AMDGPU::S_RFE_B64;
106}
107
108static bool isSMovRel(unsigned Opcode) {
109 switch (Opcode) {
110 case AMDGPU::S_MOVRELS_B32:
111 case AMDGPU::S_MOVRELS_B64:
112 case AMDGPU::S_MOVRELD_B32:
113 case AMDGPU::S_MOVRELD_B64:
114 return true;
115 default:
116 return false;
117 }
118}
119
120static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII,
121 const MachineInstr &MI) {
122 if (TII.isAlwaysGDS(Opcode: MI.getOpcode()))
123 return true;
124
125 switch (MI.getOpcode()) {
126 case AMDGPU::S_SENDMSG:
127 case AMDGPU::S_SENDMSGHALT:
128 case AMDGPU::S_TTRACEDATA:
129 return true;
130 // These DS opcodes don't support GDS.
131 case AMDGPU::DS_NOP:
132 case AMDGPU::DS_PERMUTE_B32:
133 case AMDGPU::DS_BPERMUTE_B32:
134 return false;
135 default:
136 if (TII.isDS(Opcode: MI.getOpcode())) {
137 int GDS = AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(),
138 Name: AMDGPU::OpName::gds);
139 if (MI.getOperand(i: GDS).getImm())
140 return true;
141 }
142 return false;
143 }
144}
145
146static bool isPermlane(const MachineInstr &MI) {
147 unsigned Opcode = MI.getOpcode();
148 return Opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
149 Opcode == AMDGPU::V_PERMLANE64_B32 ||
150 Opcode == AMDGPU::V_PERMLANEX16_B32_e64 ||
151 Opcode == AMDGPU::V_PERMLANE16_VAR_B32_e64 ||
152 Opcode == AMDGPU::V_PERMLANEX16_VAR_B32_e64 ||
153 Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e32 ||
154 Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e64 ||
155 Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e32 ||
156 Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e64 ||
157 Opcode == AMDGPU::V_PERMLANE_BCAST_B32_e64 ||
158 Opcode == AMDGPU::V_PERMLANE_UP_B32_e64 ||
159 Opcode == AMDGPU::V_PERMLANE_DOWN_B32_e64 ||
160 Opcode == AMDGPU::V_PERMLANE_XOR_B32_e64 ||
161 Opcode == AMDGPU::V_PERMLANE_IDX_GEN_B32_e64;
162}
163
164static bool isLdsDma(const MachineInstr &MI) {
165 return SIInstrInfo::isVALU(MI) &&
166 (SIInstrInfo::isMUBUF(MI) || SIInstrInfo::isFLAT(MI));
167}
168
169static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
170 const MachineOperand *RegOp = TII->getNamedOperand(MI: RegInstr,
171 OperandName: AMDGPU::OpName::simm16);
172 return std::get<0>(t: AMDGPU::Hwreg::HwregEncoding::decode(Encoded: RegOp->getImm()));
173}
174
175ScheduleHazardRecognizer::HazardType
176GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
177 MachineInstr *MI = SU->getInstr();
178 // If we are not in "HazardRecognizerMode" and therefore not being run from
179 // the scheduler, track possible stalls from hazards but don't insert noops.
180 auto HazardType = IsHazardRecognizerMode ? NoopHazard : Hazard;
181
182 if (MI->isBundle())
183 return NoHazard;
184
185 if (SIInstrInfo::isSMRD(MI: *MI) && checkSMRDHazards(SMRD: MI) > 0)
186 return HazardType;
187
188 if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0)
189 return HazardType;
190
191 if (checkFPAtomicToDenormModeHazard(MI) > 0)
192 return HazardType;
193
194 // Hazards which cannot be mitigated with S_NOPs.
195 if (!IsHazardRecognizerMode) {
196 if (checkWMMACoexecutionHazards(MI) > 0)
197 return Hazard;
198 }
199
200 if (ST.hasNoDataDepHazard())
201 return NoHazard;
202
203 if (SIInstrInfo::isVMEM(MI: *MI) && checkVMEMHazards(VMEM: MI) > 0)
204 return HazardType;
205
206 if (SIInstrInfo::isVALU(MI: *MI) && checkVALUHazards(VALU: MI) > 0)
207 return HazardType;
208
209 if (SIInstrInfo::isDPP(MI: *MI) && checkDPPHazards(DPP: MI) > 0)
210 return HazardType;
211
212 if (isDivFMas(Opcode: MI->getOpcode()) && checkDivFMasHazards(DivFMas: MI) > 0)
213 return HazardType;
214
215 if (isRWLane(Opcode: MI->getOpcode()) && checkRWLaneHazards(RWLane: MI) > 0)
216 return HazardType;
217
218 if ((SIInstrInfo::isVALU(MI: *MI) || SIInstrInfo::isVMEM(MI: *MI) ||
219 SIInstrInfo::isDS(MI: *MI) || SIInstrInfo::isEXP(MI: *MI)) &&
220 checkMAIVALUHazards(MI) > 0)
221 return HazardType;
222
223 if (isSGetReg(Opcode: MI->getOpcode()) && checkGetRegHazards(GetRegInstr: MI) > 0)
224 return HazardType;
225
226 if (isSSetReg(Opcode: MI->getOpcode()) && checkSetRegHazards(SetRegInstr: MI) > 0)
227 return HazardType;
228
229 if (isRFE(Opcode: MI->getOpcode()) && checkRFEHazards(RFE: MI) > 0)
230 return HazardType;
231
232 if (((ST.hasReadM0MovRelInterpHazard() &&
233 (TII.isVINTRP(MI: *MI) || isSMovRel(Opcode: MI->getOpcode()) ||
234 MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
235 MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
236 (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, MI: *MI)) ||
237 (ST.hasReadM0LdsDmaHazard() && isLdsDma(MI: *MI)) ||
238 (ST.hasReadM0LdsDirectHazard() &&
239 MI->readsRegister(Reg: AMDGPU::LDS_DIRECT, /*TRI=*/nullptr))) &&
240 checkReadM0Hazards(SMovRel: MI) > 0)
241 return HazardType;
242
243 if (SIInstrInfo::isMAI(MI: *MI) && checkMAIHazards(MI) > 0)
244 return HazardType;
245
246 if ((SIInstrInfo::isVMEM(MI: *MI) || SIInstrInfo::isDS(MI: *MI)) &&
247 checkMAILdStHazards(MI) > 0)
248 return HazardType;
249
250 if (MI->isInlineAsm() && checkInlineAsmHazards(IA: MI) > 0)
251 return HazardType;
252
253 return NoHazard;
254}
255
256static void insertNoopsInBundle(MachineInstr *MI, const SIInstrInfo &TII,
257 unsigned Quantity) {
258 while (Quantity > 0) {
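    // S_NOP's immediate encodes one fewer than the number of wait states, and
    // a single S_NOP covers at most 8 wait states.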
259 unsigned Arg = std::min(a: Quantity, b: 8u);
260 Quantity -= Arg;
261 BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(), MCID: TII.get(Opcode: AMDGPU::S_NOP))
262 .addImm(Val: Arg - 1);
263 }
264}
265
266unsigned
267GCNHazardRecognizer::getMFMAPipelineWaitStates(const MachineInstr &MI) const {
268 const MCSchedClassDesc *SC = TSchedModel.resolveSchedClass(MI: &MI);
269 assert(TSchedModel.getWriteProcResBegin(SC) !=
270 TSchedModel.getWriteProcResEnd(SC));
271 return TSchedModel.getWriteProcResBegin(SC)->ReleaseAtCycle;
272}
273
274void GCNHazardRecognizer::processBundle() {
275 MachineBasicBlock::instr_iterator MI = std::next(x: CurrCycleInstr->getIterator());
276 MachineBasicBlock::instr_iterator E = CurrCycleInstr->getParent()->instr_end();
  // Check bundled MachineInstrs for hazards.
278 for (; MI != E && MI->isInsideBundle(); ++MI) {
279 CurrCycleInstr = &*MI;
280 unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr);
281
282 if (IsHazardRecognizerMode) {
283 fixHazards(MI: CurrCycleInstr);
284
285 insertNoopsInBundle(MI: CurrCycleInstr, TII, Quantity: WaitStates);
286 }
287
288 // It’s unnecessary to track more than MaxLookAhead instructions. Since we
289 // include the bundled MI directly after, only add a maximum of
290 // (MaxLookAhead - 1) noops to EmittedInstrs.
291 for (unsigned i = 0, e = std::min(a: WaitStates, b: MaxLookAhead - 1); i < e; ++i)
292 EmittedInstrs.push_front(x: nullptr);
293
294 EmittedInstrs.push_front(x: CurrCycleInstr);
295 EmittedInstrs.resize(new_size: MaxLookAhead);
296 }
297 CurrCycleInstr = nullptr;
298}
299
300void GCNHazardRecognizer::runOnInstruction(MachineInstr *MI) {
301 assert(IsHazardRecognizerMode);
302
303 unsigned NumPreNoops = PreEmitNoops(MI);
304 EmitNoops(Quantity: NumPreNoops);
305 if (MI->isInsideBundle())
306 insertNoopsInBundle(MI, TII, Quantity: NumPreNoops);
307 else
308 TII.insertNoops(MBB&: *MI->getParent(), MI: MachineBasicBlock::iterator(MI),
309 Quantity: NumPreNoops);
310 EmitInstruction(MI);
311 AdvanceCycle();
312}
313
314unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
315 IsHazardRecognizerMode = true;
316 CurrCycleInstr = MI;
317 unsigned W = PreEmitNoopsCommon(MI);
318 fixHazards(MI);
319 CurrCycleInstr = nullptr;
320 return std::max(a: W, b: NopPadding.getValue());
321}
322
323unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
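  // The bundle header itself never needs noops; the instructions inside the
  // bundle are checked individually (see processBundle).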
324 if (MI->isBundle())
325 return 0;
326
327 int WaitStates = 0;
328
329 if (SIInstrInfo::isSMRD(MI: *MI))
330 return std::max(a: WaitStates, b: checkSMRDHazards(SMRD: MI));
331
332 if (ST.hasNSAtoVMEMBug())
333 WaitStates = std::max(a: WaitStates, b: checkNSAtoVMEMHazard(MI));
334
335 WaitStates = std::max(a: WaitStates, b: checkFPAtomicToDenormModeHazard(MI));
336
337 if (ST.hasNoDataDepHazard())
338 return WaitStates;
339
340 if (SIInstrInfo::isVMEM(MI: *MI))
341 WaitStates = std::max(a: WaitStates, b: checkVMEMHazards(VMEM: MI));
342
343 if (SIInstrInfo::isVALU(MI: *MI))
344 WaitStates = std::max(a: WaitStates, b: checkVALUHazards(VALU: MI));
345
346 if (SIInstrInfo::isDPP(MI: *MI))
347 WaitStates = std::max(a: WaitStates, b: checkDPPHazards(DPP: MI));
348
349 if (isDivFMas(Opcode: MI->getOpcode()))
350 WaitStates = std::max(a: WaitStates, b: checkDivFMasHazards(DivFMas: MI));
351
352 if (isRWLane(Opcode: MI->getOpcode()))
353 WaitStates = std::max(a: WaitStates, b: checkRWLaneHazards(RWLane: MI));
354
355 if ((SIInstrInfo::isVALU(MI: *MI) || SIInstrInfo::isVMEM(MI: *MI) ||
356 SIInstrInfo::isDS(MI: *MI) || SIInstrInfo::isEXP(MI: *MI)) &&
357 checkMAIVALUHazards(MI) > 0)
358 WaitStates = std::max(a: WaitStates, b: checkMAIVALUHazards(MI));
359
360 if (MI->isInlineAsm())
361 return std::max(a: WaitStates, b: checkInlineAsmHazards(IA: MI));
362
363 if (isSGetReg(Opcode: MI->getOpcode()))
364 return std::max(a: WaitStates, b: checkGetRegHazards(GetRegInstr: MI));
365
366 if (isSSetReg(Opcode: MI->getOpcode()))
367 return std::max(a: WaitStates, b: checkSetRegHazards(SetRegInstr: MI));
368
369 if (isRFE(Opcode: MI->getOpcode()))
370 return std::max(a: WaitStates, b: checkRFEHazards(RFE: MI));
371
372 if ((ST.hasReadM0MovRelInterpHazard() &&
373 (TII.isVINTRP(MI: *MI) || isSMovRel(Opcode: MI->getOpcode()) ||
374 MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
375 MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
376 (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, MI: *MI)) ||
377 (ST.hasReadM0LdsDmaHazard() && isLdsDma(MI: *MI)) ||
378 (ST.hasReadM0LdsDirectHazard() &&
379 MI->readsRegister(Reg: AMDGPU::LDS_DIRECT, /*TRI=*/nullptr)))
380 return std::max(a: WaitStates, b: checkReadM0Hazards(SMovRel: MI));
381
382 if (SIInstrInfo::isMAI(MI: *MI))
383 return std::max(a: WaitStates, b: checkMAIHazards(MI));
384
385 if (SIInstrInfo::isVMEM(MI: *MI) || SIInstrInfo::isDS(MI: *MI))
386 return std::max(a: WaitStates, b: checkMAILdStHazards(MI));
387
388 if (ST.hasGFX950Insts() && isPermlane(MI: *MI))
389 return std::max(a: WaitStates, b: checkPermlaneHazards(MI));
390
391 return WaitStates;
392}
393
394void GCNHazardRecognizer::EmitNoop() {
395 EmittedInstrs.push_front(x: nullptr);
396}
397
398void GCNHazardRecognizer::AdvanceCycle() {
399 // When the scheduler detects a stall, it will call AdvanceCycle() without
400 // emitting any instructions.
401 if (!CurrCycleInstr) {
402 EmittedInstrs.push_front(x: nullptr);
403 return;
404 }
405
406 if (CurrCycleInstr->isBundle()) {
407 processBundle();
408 return;
409 }
410
411 unsigned NumWaitStates = TII.getNumWaitStates(MI: *CurrCycleInstr);
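  // Instructions that take no wait states (e.g. meta instructions) do not
  // occupy a cycle, so there is nothing to track for them.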
412 if (!NumWaitStates) {
413 CurrCycleInstr = nullptr;
414 return;
415 }
416
417 // Keep track of emitted instructions
418 EmittedInstrs.push_front(x: CurrCycleInstr);
419
420 // Add a nullptr for each additional wait state after the first. Make sure
421 // not to add more than getMaxLookAhead() items to the list, since we
422 // truncate the list to that size right after this loop.
423 for (unsigned i = 1, e = std::min(a: NumWaitStates, b: getMaxLookAhead());
424 i < e; ++i) {
425 EmittedInstrs.push_front(x: nullptr);
426 }
427
  // getMaxLookAhead() is the largest number of wait states we will ever need
  // to insert, so there is no point in keeping track of more than that many
  // wait states.
431 EmittedInstrs.resize(new_size: getMaxLookAhead());
432
433 CurrCycleInstr = nullptr;
434}
435
436void GCNHazardRecognizer::RecedeCycle() {
437 assert(!IsHazardRecognizerMode &&
438 "Bottom-up scheduling shouldn't run in hazard recognizer mode");
439}
440
441//===----------------------------------------------------------------------===//
442// Helper Functions
443//===----------------------------------------------------------------------===//
444
445enum HazardFnResult { HazardFound, HazardExpired, NoHazardFound };
446
447// Search for a hazard in a block and its predecessors.
448template <typename StateT>
449static bool
450hasHazard(StateT InitialState,
451 function_ref<HazardFnResult(StateT &, const MachineInstr &)> IsHazard,
452 function_ref<void(StateT &, const MachineInstr &)> UpdateState,
453 const MachineBasicBlock *InitialMBB,
454 MachineBasicBlock::const_reverse_instr_iterator InitialI) {
455 struct StateMapKey {
456 SmallVectorImpl<StateT> *States;
457 unsigned Idx;
458 static bool isEqual(const StateMapKey &LHS, const StateMapKey &RHS) {
459 return LHS.States == RHS.States && LHS.Idx == RHS.Idx;
460 }
461 };
462 struct StateMapKeyTraits : DenseMapInfo<StateMapKey> {
463 static inline StateMapKey getEmptyKey() {
464 return {static_cast<SmallVectorImpl<StateT> *>(
465 DenseMapInfo<void *>::getEmptyKey()),
466 DenseMapInfo<unsigned>::getEmptyKey()};
467 }
468 static inline StateMapKey getTombstoneKey() {
469 return {static_cast<SmallVectorImpl<StateT> *>(
470 DenseMapInfo<void *>::getTombstoneKey()),
471 DenseMapInfo<unsigned>::getTombstoneKey()};
472 }
473 static unsigned getHashValue(const StateMapKey &Key) {
474 return StateT::getHashValue((*Key.States)[Key.Idx]);
475 }
476 static unsigned getHashValue(const StateT &State) {
477 return StateT::getHashValue(State);
478 }
479 static bool isEqual(const StateMapKey &LHS, const StateMapKey &RHS) {
480 const auto EKey = getEmptyKey();
481 const auto TKey = getTombstoneKey();
482 if (StateMapKey::isEqual(LHS, EKey) || StateMapKey::isEqual(RHS, EKey) ||
483 StateMapKey::isEqual(LHS, TKey) || StateMapKey::isEqual(RHS, TKey))
484 return StateMapKey::isEqual(LHS, RHS);
485 return StateT::isEqual((*LHS.States)[LHS.Idx], (*RHS.States)[RHS.Idx]);
486 }
487 static bool isEqual(const StateT &LHS, const StateMapKey &RHS) {
488 if (StateMapKey::isEqual(RHS, getEmptyKey()) ||
489 StateMapKey::isEqual(RHS, getTombstoneKey()))
490 return false;
491 return StateT::isEqual(LHS, (*RHS.States)[RHS.Idx]);
492 }
493 };
494
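  // Deduplicate (block, state) pairs so that each predecessor block is
  // explored at most once per distinct incoming state.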
495 SmallDenseMap<StateMapKey, unsigned, 8, StateMapKeyTraits> StateMap;
496 SmallVector<StateT, 8> States;
497
498 MachineBasicBlock::const_reverse_instr_iterator I = InitialI;
499 const MachineBasicBlock *MBB = InitialMBB;
500 StateT State = InitialState;
501
502 SmallSetVector<std::pair<const MachineBasicBlock *, unsigned>, 16> Worklist;
503 unsigned WorkIdx = 0;
504 for (;;) {
505 bool Expired = false;
506 for (auto E = MBB->instr_rend(); I != E; ++I) {
507 // No need to look at parent BUNDLE instructions.
508 if (I->isBundle())
509 continue;
510
511 auto Result = IsHazard(State, *I);
512 if (Result == HazardFound)
513 return true;
514 if (Result == HazardExpired) {
515 Expired = true;
516 break;
517 }
518
519 if (I->isInlineAsm() || I->isMetaInstruction())
520 continue;
521
522 UpdateState(State, *I);
523 }
524
525 if (!Expired) {
526 unsigned StateIdx = States.size();
527 StateMapKey Key = {&States, StateIdx};
528 auto Insertion = StateMap.insert_as(std::pair(Key, StateIdx), State);
529 if (Insertion.second) {
530 States.emplace_back(State);
531 } else {
532 StateIdx = Insertion.first->second;
533 }
534 for (MachineBasicBlock *Pred : MBB->predecessors())
535 Worklist.insert(X: std::pair(Pred, StateIdx));
536 }
537
538 if (WorkIdx == Worklist.size())
539 break;
540
541 unsigned StateIdx;
542 std::tie(args&: MBB, args&: StateIdx) = Worklist[WorkIdx++];
543 State = States[StateIdx];
544 I = MBB->instr_rbegin();
545 }
546
547 return false;
548}
549
// Returns the minimum number of wait states since \p I, walking all
// predecessors. Only scans until \p IsExpired returns true.
// Can only be run in hazard recognizer mode.
553static int
554getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
555 const MachineBasicBlock *MBB,
556 MachineBasicBlock::const_reverse_instr_iterator I,
557 int WaitStates, GCNHazardRecognizer::IsExpiredFn IsExpired,
558 DenseSet<const MachineBasicBlock *> &Visited,
559 GCNHazardRecognizer::GetNumWaitStatesFn GetNumWaitStates =
560 SIInstrInfo::getNumWaitStates) {
561 for (auto E = MBB->instr_rend(); I != E; ++I) {
562 // Don't add WaitStates for parent BUNDLE instructions.
563 if (I->isBundle())
564 continue;
565
566 if (IsHazard(*I))
567 return WaitStates;
568
569 if (I->isInlineAsm())
570 continue;
571
572 WaitStates += GetNumWaitStates(*I);
573
574 if (IsExpired(*I, WaitStates))
575 return std::numeric_limits<int>::max();
576 }
577
578 int MinWaitStates = std::numeric_limits<int>::max();
579 for (MachineBasicBlock *Pred : MBB->predecessors()) {
580 if (!Visited.insert(V: Pred).second)
581 continue;
582
583 int W = getWaitStatesSince(IsHazard, MBB: Pred, I: Pred->instr_rbegin(), WaitStates,
584 IsExpired, Visited, GetNumWaitStates);
585
586 MinWaitStates = std::min(a: MinWaitStates, b: W);
587 }
588
589 return MinWaitStates;
590}
591
592static int
593getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
594 const MachineInstr *MI,
595 GCNHazardRecognizer::IsExpiredFn IsExpired,
596 GCNHazardRecognizer::GetNumWaitStatesFn GetNumWaitStates =
597 SIInstrInfo::getNumWaitStates) {
598 DenseSet<const MachineBasicBlock *> Visited;
599 return getWaitStatesSince(IsHazard, MBB: MI->getParent(),
600 I: std::next(x: MI->getReverseIterator()), WaitStates: 0, IsExpired,
601 Visited, GetNumWaitStates);
602}
603
604int GCNHazardRecognizer::getWaitStatesSince(
605 IsHazardFn IsHazard, int Limit, GetNumWaitStatesFn GetNumWaitStates) {
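  // In hazard recognizer mode, walk the real instruction stream (including
  // predecessor blocks); otherwise consult the EmittedInstrs history that the
  // scheduler has advanced through.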
606 if (IsHazardRecognizerMode) {
607 auto IsExpiredFn = [Limit](const MachineInstr &, int WaitStates) {
608 return WaitStates >= Limit;
609 };
610 return ::getWaitStatesSince(IsHazard, MI: CurrCycleInstr, IsExpired: IsExpiredFn,
611 GetNumWaitStates);
612 }
613
614 int WaitStates = 0;
615 for (MachineInstr *MI : EmittedInstrs) {
616 if (MI) {
617 if (IsHazard(*MI))
618 return WaitStates;
619
620 if (MI->isInlineAsm())
621 continue;
622 }
623 WaitStates += MI ? GetNumWaitStates(*MI) : 1;
624
625 if (WaitStates >= Limit)
626 break;
627 }
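  // No hazard was found within the lookahead window.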
628 return std::numeric_limits<int>::max();
629}
630
631int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
632 return getWaitStatesSince(IsHazard, Limit, GetNumWaitStates: SIInstrInfo::getNumWaitStates);
633}
634
635int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
636 IsHazardFn IsHazardDef,
637 int Limit) {
638 const SIRegisterInfo *TRI = ST.getRegisterInfo();
639
640 auto IsHazardFn = [IsHazardDef, TRI, Reg](const MachineInstr &MI) {
641 return IsHazardDef(MI) && MI.modifiesRegister(Reg, TRI);
642 };
643
644 return getWaitStatesSince(IsHazard: IsHazardFn, Limit);
645}
646
647int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
648 int Limit) {
649 auto IsHazardFn = [IsHazard](const MachineInstr &MI) {
650 return isSSetReg(Opcode: MI.getOpcode()) && IsHazard(MI);
651 };
652
653 return getWaitStatesSince(IsHazard: IsHazardFn, Limit);
654}
655
656//===----------------------------------------------------------------------===//
657// No-op Hazard Detection
658//===----------------------------------------------------------------------===//
659
660static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV,
661 MCRegister Reg) {
662 for (MCRegUnit Unit : TRI.regunits(Reg))
663 BV.set(static_cast<unsigned>(Unit));
664}
665
666static void addRegsToSet(const SIRegisterInfo &TRI,
667 iterator_range<MachineInstr::const_mop_iterator> Ops,
668 BitVector &DefSet, BitVector &UseSet) {
669 for (const MachineOperand &Op : Ops) {
670 if (Op.isReg())
671 addRegUnits(TRI, BV&: Op.isDef() ? DefSet : UseSet, Reg: Op.getReg().asMCReg());
672 }
673}
674
675void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) {
676 addRegsToSet(TRI, Ops: MI.operands(), DefSet&: ClauseDefs, UseSet&: ClauseUses);
677}
678
679static bool breaksSMEMSoftClause(MachineInstr *MI) {
680 return !SIInstrInfo::isSMRD(MI: *MI);
681}
682
683static bool breaksVMEMSoftClause(MachineInstr *MI) {
684 return !SIInstrInfo::isVMEM(MI: *MI);
685}
686
687int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
  // SMEM soft clauses are only present on VI+, and only matter if XNACK is
  // enabled.
690 if (!ST.isXNACKEnabled())
691 return 0;
692
693 bool IsSMRD = TII.isSMRD(MI: *MEM);
694
695 resetClause();
696
697 // A soft-clause is any group of consecutive SMEM instructions. The
698 // instructions in this group may return out of order and/or may be
699 // replayed (i.e. the same instruction issued more than once).
700 //
701 // In order to handle these situations correctly we need to make sure that
702 // when a clause has more than one instruction, no instruction in the clause
703 // writes to a register that is read by another instruction in the clause
  // (including itself). If we encounter this situation, we need to break the
  // clause by inserting a non-SMEM instruction.
706
707 for (MachineInstr *MI : EmittedInstrs) {
708 // When we hit a non-SMEM instruction then we have passed the start of the
709 // clause and we can stop.
710 if (!MI)
711 break;
712
713 if (IsSMRD ? breaksSMEMSoftClause(MI) : breaksVMEMSoftClause(MI))
714 break;
715
716 addClauseInst(MI: *MI);
717 }
718
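  // If no defs were recorded, the clause cannot have a def/use conflict.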
719 if (ClauseDefs.none())
720 return 0;
721
722 // We need to make sure not to put loads and stores in the same clause if they
723 // use the same address. For now, just start a new clause whenever we see a
724 // store.
725 if (MEM->mayStore())
726 return 1;
727
728 addClauseInst(MI: *MEM);
729
730 // If the set of defs and uses intersect then we cannot add this instruction
731 // to the clause, so we have a hazard.
732 return ClauseDefs.anyCommon(RHS: ClauseUses) ? 1 : 0;
733}
734
735int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
736 int WaitStatesNeeded = 0;
737
738 WaitStatesNeeded = checkSoftClauseHazards(MEM: SMRD);
739
740 // This SMRD hazard only affects SI.
741 if (!ST.hasSMRDReadVALUDefHazard())
742 return WaitStatesNeeded;
743
  // A read of an SGPR by an SMRD instruction requires 4 wait states when the
  // SGPR was written by a VALU instruction.
746 int SmrdSgprWaitStates = 4;
747 auto IsHazardDefFn = [this](const MachineInstr &MI) {
748 return TII.isVALU(MI);
749 };
750 auto IsBufferHazardDefFn = [this](const MachineInstr &MI) {
751 return TII.isSALU(MI);
752 };
753
754 bool IsBufferSMRD = TII.isBufferSMRD(MI: *SMRD);
755
756 for (const MachineOperand &Use : SMRD->uses()) {
757 if (!Use.isReg())
758 continue;
759 int WaitStatesNeededForUse =
760 SmrdSgprWaitStates - getWaitStatesSinceDef(Reg: Use.getReg(), IsHazardDef: IsHazardDefFn,
761 Limit: SmrdSgprWaitStates);
762 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
763
    // This fixes what appears to be undocumented hardware behavior in SI where
    // an s_mov writing a descriptor and an s_buffer_load_dword reading that
    // descriptor need some number of nops in between. We don't know how many
    // are needed, but let's use 4. This wasn't discovered before probably
    // because the only case in which this happens is when we expand a 64-bit
    // pointer into a full descriptor and use s_buffer_load_dword instead of
    // s_load_dword, which was probably never encountered in closed-source
    // land.
771 if (IsBufferSMRD) {
772 int WaitStatesNeededForUse =
773 SmrdSgprWaitStates - getWaitStatesSinceDef(Reg: Use.getReg(),
774 IsHazardDef: IsBufferHazardDefFn,
775 Limit: SmrdSgprWaitStates);
776 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
777 }
778 }
779
780 return WaitStatesNeeded;
781}
782
783int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) {
784 if (!ST.hasVMEMReadSGPRVALUDefHazard())
785 return 0;
786
787 int WaitStatesNeeded = checkSoftClauseHazards(MEM: VMEM);
788
  // A read of an SGPR by a VMEM instruction requires 5 wait states when the
  // SGPR was written by a VALU instruction.
791 const int VmemSgprWaitStates = 5;
792 auto IsHazardDefFn = [this](const MachineInstr &MI) {
793 return TII.isVALU(MI);
794 };
795 for (const MachineOperand &Use : VMEM->uses()) {
796 if (!Use.isReg() || TRI.isVectorRegister(MRI: MF.getRegInfo(), Reg: Use.getReg()))
797 continue;
798
799 int WaitStatesNeededForUse =
800 VmemSgprWaitStates - getWaitStatesSinceDef(Reg: Use.getReg(), IsHazardDef: IsHazardDefFn,
801 Limit: VmemSgprWaitStates);
802 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
803 }
804 return WaitStatesNeeded;
805}
806
807int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) {
808 const SIRegisterInfo *TRI = ST.getRegisterInfo();
809 const SIInstrInfo *TII = ST.getInstrInfo();
810
811 // Check for DPP VGPR read after VALU VGPR write and EXEC write.
812 int DppVgprWaitStates = 2;
813 int DppExecWaitStates = 5;
814 int WaitStatesNeeded = 0;
815 auto IsHazardDefFn = [TII](const MachineInstr &MI) {
816 return TII->isVALU(MI);
817 };
818
819 for (const MachineOperand &Use : DPP->uses()) {
820 if (!Use.isReg() || !TRI->isVGPR(MRI: MF.getRegInfo(), Reg: Use.getReg()))
821 continue;
822 int WaitStatesNeededForUse =
823 DppVgprWaitStates - getWaitStatesSinceDef(
824 Reg: Use.getReg(),
825 IsHazardDef: [](const MachineInstr &) { return true; },
826 Limit: DppVgprWaitStates);
827 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
828 }
829
830 WaitStatesNeeded = std::max(
831 a: WaitStatesNeeded,
832 b: DppExecWaitStates - getWaitStatesSinceDef(Reg: AMDGPU::EXEC, IsHazardDef: IsHazardDefFn,
833 Limit: DppExecWaitStates));
834
835 return WaitStatesNeeded;
836}
837
838int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) {
839 const SIInstrInfo *TII = ST.getInstrInfo();
840
841 // v_div_fmas requires 4 wait states after a write to vcc from a VALU
842 // instruction.
843 const int DivFMasWaitStates = 4;
844 auto IsHazardDefFn = [TII](const MachineInstr &MI) {
845 return TII->isVALU(MI);
846 };
847 int WaitStatesNeeded = getWaitStatesSinceDef(Reg: AMDGPU::VCC, IsHazardDef: IsHazardDefFn,
848 Limit: DivFMasWaitStates);
849
850 return DivFMasWaitStates - WaitStatesNeeded;
851}
852
853int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) {
854 const SIInstrInfo *TII = ST.getInstrInfo();
855 unsigned GetRegHWReg = getHWReg(TII, RegInstr: *GetRegInstr);
856
857 const int GetRegWaitStates = 2;
858 auto IsHazardFn = [TII, GetRegHWReg](const MachineInstr &MI) {
859 return GetRegHWReg == getHWReg(TII, RegInstr: MI);
860 };
861 int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazard: IsHazardFn, Limit: GetRegWaitStates);
862
863 return GetRegWaitStates - WaitStatesNeeded;
864}
865
866int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) {
867 const SIInstrInfo *TII = ST.getInstrInfo();
868 unsigned HWReg = getHWReg(TII, RegInstr: *SetRegInstr);
869
870 const int SetRegWaitStates = ST.getSetRegWaitStates();
871 auto IsHazardFn = [TII, HWReg](const MachineInstr &MI) {
872 return HWReg == getHWReg(TII, RegInstr: MI);
873 };
874 int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazard: IsHazardFn, Limit: SetRegWaitStates);
875 return SetRegWaitStates - WaitStatesNeeded;
876}
877
878int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {
879 if (!MI.mayStore())
880 return -1;
881
882 const SIInstrInfo *TII = ST.getInstrInfo();
883 unsigned Opcode = MI.getOpcode();
884 const MCInstrDesc &Desc = MI.getDesc();
885
886 int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, Name: AMDGPU::OpName::vdata);
887 int VDataRCID = -1;
888 if (VDataIdx != -1)
889 VDataRCID = TII->getOpRegClassID(OpInfo: Desc.operands()[VDataIdx]);
890
891 if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {
892 // There is no hazard if the instruction does not use vector regs
893 // (like wbinvl1)
894 if (VDataIdx == -1)
895 return -1;
896 // For MUBUF/MTBUF instructions this hazard only exists if the
897 // instruction is not using a register in the soffset field.
898 const MachineOperand *SOffset =
899 TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::soffset);
900 // If we have no soffset operand, then assume this field has been
901 // hardcoded to zero.
902 if (AMDGPU::getRegBitWidth(RCID: VDataRCID) > 64 &&
903 (!SOffset || !SOffset->isReg()))
904 return VDataIdx;
905 }
906
907 // MIMG instructions create a hazard if they don't use a 256-bit T# and
908 // the store size is greater than 8 bytes and they have more than two bits
909 // of their dmask set.
910 // All our MIMG definitions use a 256-bit T#, so we can skip checking for them.
911 if (TII->isMIMG(MI)) {
912 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, Name: AMDGPU::OpName::srsrc);
913 assert(SRsrcIdx != -1 && AMDGPU::getRegBitWidth(TII->getOpRegClassID(
914 Desc.operands()[SRsrcIdx])) == 256);
915 (void)SRsrcIdx;
916 }
917
918 if (TII->isFLAT(MI)) {
919 // There is no hazard if the instruction does not use vector regs
920 if (VDataIdx == -1)
921 return -1;
922
923 if (AMDGPU::getRegBitWidth(RCID: VDataRCID) > 64)
924 return VDataIdx;
925 }
926
927 return -1;
928}
929
930int
931GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
932 const MachineRegisterInfo &MRI) {
  // Helper to check for the hazard where VMEM instructions that store more
  // than 8 bytes can have their store data overwritten by the next
  // instruction.
935 const SIRegisterInfo *TRI = ST.getRegisterInfo();
936
937 const int VALUWaitStates = ST.hasGFX940Insts() ? 2 : 1;
938 int WaitStatesNeeded = 0;
939
940 if (!TRI->isVectorRegister(MRI, Reg: Def.getReg()))
941 return WaitStatesNeeded;
942 Register Reg = Def.getReg();
943 auto IsHazardFn = [this, Reg, TRI](const MachineInstr &MI) {
944 int DataIdx = createsVALUHazard(MI);
945 return DataIdx >= 0 &&
946 TRI->regsOverlap(RegA: MI.getOperand(i: DataIdx).getReg(), RegB: Reg);
947 };
948
949 int WaitStatesNeededForDef =
950 VALUWaitStates - getWaitStatesSince(IsHazard: IsHazardFn, Limit: VALUWaitStates);
951 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForDef);
952
953 return WaitStatesNeeded;
954}
955
/// A dest-sel forwarding issue occurs if additional logic is needed to
/// swizzle / pack the computed value into the correct bit position of the
/// dest register. This occurs if we have SDWA with dst_sel != DWORD or if we
/// have op_sel with dst_sel that is not aligned to the register. This
/// function analyzes \p MI and \returns an operand with a dst forwarding
/// issue, or nullptr if none exists.
962static const MachineOperand *
963getDstSelForwardingOperand(const MachineInstr &MI, const GCNSubtarget &ST) {
964 if (!SIInstrInfo::isVALU(MI))
965 return nullptr;
966
967 const SIInstrInfo *TII = ST.getInstrInfo();
968
969 unsigned Opcode = MI.getOpcode();
970
  // There are three different types of instructions which produce a
  // forwarded dest: 1. SDWA with dst_sel != DWORD, 2. VOP3 which writes the
  // hi bits (e.g. op_sel[3] == 1), and 3. FP8DstSelInst (instructions with a
  // dest byte sel, e.g. CVT_SR_BF8_F32) with op_sel[3:2] != 0.
977 if (SIInstrInfo::isSDWA(MI)) {
978 // Type 1: SDWA with dst_sel != DWORD
979 if (auto *DstSel = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::dst_sel))
980 if (DstSel->getImm() != AMDGPU::SDWA::DWORD)
981 return TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::vdst);
982 }
983
984 AMDGPU::FPType IsFP4OrFP8ConvOpc = AMDGPU::getFPDstSelType(Opc: Opcode);
985 if (AMDGPU::hasNamedOperand(Opcode, NamedIdx: AMDGPU::OpName::op_sel)) {
    // Type 2: VOP3 which writes the hi bits
987 if (TII->getNamedImmOperand(MI, OperandName: AMDGPU::OpName::src0_modifiers) &
988 SISrcMods::DST_OP_SEL)
989 return TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::vdst);
990
    // Type 3: FP8DstSelInst with op_sel[3:2] != 0
992 if (IsFP4OrFP8ConvOpc == AMDGPU::FPType::FP8 &&
993 (TII->getNamedImmOperand(MI, OperandName: AMDGPU::OpName::src2_modifiers) &
994 SISrcMods::OP_SEL_0))
995 return TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::vdst);
996 }
997
998 // Special case: nop is required for all the opsel values for fp4 sr variant
999 // cvt scale instructions
1000 if (IsFP4OrFP8ConvOpc == AMDGPU::FPType::FP4)
1001 return TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::vdst);
1002
1003 return nullptr;
1004}
1005
/// Checks whether the provided \p VALU "consumes" the operand with a dest-sel
/// forwarding issue \p Dst. We may "consume" the Dst via a standard explicit
/// RAW, or through irregular ways (e.g. implicit RAW, certain types of WAW).
1009static bool consumesDstSelForwardingOperand(const MachineInstr *VALU,
1010 const MachineOperand *Dst,
1011 const SIRegisterInfo *TRI) {
1012 // We must consider implicit reads of the VALU. SDWA with dst_sel and
1013 // UNUSED_PRESERVE will implicitly read the result from forwarded dest,
1014 // and we must account for that hazard.
1015 // We also must account for WAW hazards. In particular, WAW with dest
1016 // preserve semantics (e.g. VOP3 with op_sel, VOP2 &&
1017 // !zeroesHigh16BitsOfDest) will read the forwarded dest for parity
1018 // check for ECC. Without accounting for this hazard, the ECC will be
1019 // wrong.
1020 // TODO: limit to RAW (including implicit reads) + problematic WAW (i.e.
1021 // complete zeroesHigh16BitsOfDest)
1022 for (auto &Operand : VALU->operands()) {
1023 if (Operand.isReg() && TRI->regsOverlap(RegA: Dst->getReg(), RegB: Operand.getReg())) {
1024 return true;
1025 }
1026 }
1027 return false;
1028}
1029
1030int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
1031 int WaitStatesNeeded = 0;
1032
1033 if (ST.hasTransForwardingHazard() && !SIInstrInfo::isTRANS(MI: *VALU)) {
1034 const int TransDefWaitstates = 1;
1035
1036 auto IsTransDefFn = [this, VALU](const MachineInstr &MI) {
1037 if (!SIInstrInfo::isTRANS(MI))
1038 return false;
1039 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1040 const SIInstrInfo *TII = ST.getInstrInfo();
1041 Register Def = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::vdst)->getReg();
1042
1043 for (const MachineOperand &Use : VALU->explicit_uses()) {
1044 if (Use.isReg() && TRI->regsOverlap(RegA: Def, RegB: Use.getReg()))
1045 return true;
1046 }
1047
1048 return false;
1049 };
1050
1051 int WaitStatesNeededForDef =
1052 TransDefWaitstates -
1053 getWaitStatesSince(IsHazard: IsTransDefFn, Limit: TransDefWaitstates);
1054 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForDef);
1055 }
1056
1057 if (ST.hasDstSelForwardingHazard() || ST.hasCvtScaleForwardingHazard()) {
1058 const int Shift16DefWaitstates = 1;
1059
1060 auto IsShift16BitDefFn = [this, VALU](const MachineInstr &ProducerMI) {
1061 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1062 const MachineOperand *ForwardedDst =
1063 getDstSelForwardingOperand(MI: ProducerMI, ST);
1064 if (ForwardedDst) {
1065 return consumesDstSelForwardingOperand(VALU, Dst: ForwardedDst, TRI);
1066 }
1067
1068 if (ProducerMI.isInlineAsm()) {
1069 // Assume inline asm has dst forwarding hazard
1070 for (auto &Def : ProducerMI.all_defs()) {
1071 if (consumesDstSelForwardingOperand(VALU, Dst: &Def, TRI))
1072 return true;
1073 }
1074 }
1075
1076 return false;
1077 };
1078
1079 int WaitStatesNeededForDef =
1080 Shift16DefWaitstates -
1081 getWaitStatesSince(IsHazard: IsShift16BitDefFn, Limit: Shift16DefWaitstates);
1082 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForDef);
1083 }
1084
1085 if (ST.hasVDecCoExecHazard()) {
1086 const int VALUWriteSGPRVALUReadWaitstates = 2;
1087 const int VALUWriteEXECRWLane = 4;
1088 const int VALUWriteVGPRReadlaneRead = 1;
1089
1090 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1091 const MachineRegisterInfo &MRI = MF.getRegInfo();
1092 Register UseReg;
1093 auto IsVALUDefSGPRFn = [&UseReg, TRI](const MachineInstr &MI) {
1094 if (!SIInstrInfo::isVALU(MI))
1095 return false;
1096 return MI.modifiesRegister(Reg: UseReg, TRI);
1097 };
1098
1099 for (const MachineOperand &Use : VALU->explicit_uses()) {
1100 if (!Use.isReg())
1101 continue;
1102
1103 UseReg = Use.getReg();
1104 if (TRI->isSGPRReg(MRI, Reg: UseReg)) {
1105 int WaitStatesNeededForDef =
1106 VALUWriteSGPRVALUReadWaitstates -
1107 getWaitStatesSince(IsHazard: IsVALUDefSGPRFn,
1108 Limit: VALUWriteSGPRVALUReadWaitstates);
1109 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForDef);
1110 }
1111 }
1112
1113 if (VALU->readsRegister(Reg: AMDGPU::VCC, TRI)) {
1114 UseReg = AMDGPU::VCC;
1115 int WaitStatesNeededForDef =
1116 VALUWriteSGPRVALUReadWaitstates -
1117 getWaitStatesSince(IsHazard: IsVALUDefSGPRFn, Limit: VALUWriteSGPRVALUReadWaitstates);
1118 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForDef);
1119 }
1120
1121 switch (VALU->getOpcode()) {
1122 case AMDGPU::V_READLANE_B32:
1123 case AMDGPU::V_READFIRSTLANE_B32: {
1124 MachineOperand *Src = TII.getNamedOperand(MI&: *VALU, OperandName: AMDGPU::OpName::src0);
1125 UseReg = Src->getReg();
1126 int WaitStatesNeededForDef =
1127 VALUWriteVGPRReadlaneRead -
1128 getWaitStatesSince(IsHazard: IsVALUDefSGPRFn, Limit: VALUWriteVGPRReadlaneRead);
1129 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForDef);
1130 }
1131 [[fallthrough]];
1132 case AMDGPU::V_WRITELANE_B32: {
1133 UseReg = AMDGPU::EXEC;
1134 int WaitStatesNeededForDef =
1135 VALUWriteEXECRWLane -
1136 getWaitStatesSince(IsHazard: IsVALUDefSGPRFn, Limit: VALUWriteEXECRWLane);
1137 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForDef);
1138 break;
1139 }
1140 default:
1141 break;
1142 }
1143 }
1144
  // This checks for the hazard where VMEM instructions that store more than
  // 8 bytes can have their store data overwritten by the next instruction.
1147 if (!ST.has12DWordStoreHazard())
1148 return WaitStatesNeeded;
1149
1150 const MachineRegisterInfo &MRI = MF.getRegInfo();
1151
1152 for (const MachineOperand &Def : VALU->defs()) {
1153 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: checkVALUHazardsHelper(Def, MRI));
1154 }
1155
1156 return WaitStatesNeeded;
1157}
1158
1159int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
1160 // This checks for hazards associated with inline asm statements.
1161 // Since inline asms can contain just about anything, we use this
1162 // to call/leverage other check*Hazard routines. Note that
1163 // this function doesn't attempt to address all possible inline asm
1164 // hazards (good luck), but is a collection of what has been
1165 // problematic thus far.
1166
1167 // see checkVALUHazards()
1168 if (!ST.has12DWordStoreHazard() && !ST.hasDstSelForwardingHazard() &&
1169 !ST.hasCvtScaleForwardingHazard())
1170 return 0;
1171
1172 const MachineRegisterInfo &MRI = MF.getRegInfo();
1173 int WaitStatesNeeded = 0;
1174
1175 for (const MachineOperand &Op :
1176 llvm::drop_begin(RangeOrContainer: IA->operands(), N: InlineAsm::MIOp_FirstOperand)) {
1177 if (Op.isReg() && Op.isDef()) {
1178 if (!TRI.isVectorRegister(MRI, Reg: Op.getReg()))
1179 continue;
1180
1181 if (ST.has12DWordStoreHazard()) {
1182 WaitStatesNeeded =
1183 std::max(a: WaitStatesNeeded, b: checkVALUHazardsHelper(Def: Op, MRI));
1184 }
1185 }
1186 }
1187
1188 if (ST.hasDstSelForwardingHazard()) {
1189 const int Shift16DefWaitstates = 1;
1190
1191 auto IsShift16BitDefFn = [this, &IA](const MachineInstr &ProducerMI) {
1192 const MachineOperand *Dst = getDstSelForwardingOperand(MI: ProducerMI, ST);
1193 // Assume inline asm reads the dst
1194 if (Dst)
1195 return IA->modifiesRegister(Reg: Dst->getReg(), TRI: &TRI) ||
1196 IA->readsRegister(Reg: Dst->getReg(), TRI: &TRI);
1197
1198 if (ProducerMI.isInlineAsm()) {
1199 // If MI is inline asm, assume it has dst forwarding hazard
1200 for (auto &Def : ProducerMI.all_defs()) {
1201 if (IA->modifiesRegister(Reg: Def.getReg(), TRI: &TRI) ||
1202 IA->readsRegister(Reg: Def.getReg(), TRI: &TRI)) {
1203 return true;
1204 }
1205 }
1206 }
1207
1208 return false;
1209 };
1210
1211 int WaitStatesNeededForDef =
1212 Shift16DefWaitstates -
1213 getWaitStatesSince(IsHazard: IsShift16BitDefFn, Limit: Shift16DefWaitstates);
1214 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForDef);
1215 }
1216
1217 return WaitStatesNeeded;
1218}
1219
1220int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) {
1221 const SIInstrInfo *TII = ST.getInstrInfo();
1222 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1223 const MachineRegisterInfo &MRI = MF.getRegInfo();
1224
1225 const MachineOperand *LaneSelectOp =
1226 TII->getNamedOperand(MI&: *RWLane, OperandName: AMDGPU::OpName::src1);
1227
1228 if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, Reg: LaneSelectOp->getReg()))
1229 return 0;
1230
1231 Register LaneSelectReg = LaneSelectOp->getReg();
1232 auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isVALU(MI); };
1233
1234 const int RWLaneWaitStates = 4;
1235 int WaitStatesSince = getWaitStatesSinceDef(Reg: LaneSelectReg, IsHazardDef: IsHazardFn,
1236 Limit: RWLaneWaitStates);
1237 return RWLaneWaitStates - WaitStatesSince;
1238}
1239
1240int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
1241 if (!ST.hasRFEHazards())
1242 return 0;
1243
1244 const SIInstrInfo *TII = ST.getInstrInfo();
1245
1246 const int RFEWaitStates = 1;
1247
1248 auto IsHazardFn = [TII](const MachineInstr &MI) {
1249 return getHWReg(TII, RegInstr: MI) == AMDGPU::Hwreg::ID_TRAPSTS;
1250 };
1251 int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazard: IsHazardFn, Limit: RFEWaitStates);
1252 return RFEWaitStates - WaitStatesNeeded;
1253}
1254
1255int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) {
1256 const SIInstrInfo *TII = ST.getInstrInfo();
1257 const int ReadM0WaitStates = 1;
1258 auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isSALU(MI); };
1259 return ReadM0WaitStates -
1260 getWaitStatesSinceDef(Reg: AMDGPU::M0, IsHazardDef: IsHazardFn, Limit: ReadM0WaitStates);
1261}
1262
// Emit V_NOP instructions. \p WaitStatesNeeded is the number of V_NOPs we need
// to insert; a value of zero or less means none are needed.
1265bool GCNHazardRecognizer::emitVNops(MachineInstr *MI, int WaitStatesNeeded) {
1266 if (WaitStatesNeeded <= 0)
1267 return false;
1268
1269 const SIInstrInfo *TII = ST.getInstrInfo();
1270 for (int I = 0; I < WaitStatesNeeded; ++I)
1271 BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(),
1272 MCID: TII->get(Opcode: AMDGPU::V_NOP_e32));
1273
1274 return true;
1275}
1276
1277void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
1278 fixVMEMtoScalarWriteHazards(MI);
1279 fixVcmpxPermlaneHazards(MI);
1280 fixSMEMtoVectorWriteHazards(MI);
1281 fixVcmpxExecWARHazard(MI);
1282 fixLdsBranchVmemWARHazard(MI);
1283 if (ST.hasLdsDirect()) {
1284 fixLdsDirectVALUHazard(MI);
1285 fixLdsDirectVMEMHazard(MI);
1286 }
1287 fixVALUPartialForwardingHazard(MI);
1288 fixVALUTransUseHazard(MI);
1289 fixVALUTransCoexecutionHazards(MI);
1290 fixWMMAHazards(MI); // fall-through if co-execution is enabled.
1291 emitVNops(MI, WaitStatesNeeded: checkWMMACoexecutionHazards(MI));
1292 fixShift64HighRegBug(MI);
1293 fixVALUMaskWriteHazard(MI);
1294 fixRequiredExportPriority(MI);
1295 if (ST.requiresWaitIdleBeforeGetReg())
1296 fixGetRegWaitIdle(MI);
1297 if (ST.hasDsAtomicAsyncBarrierArriveB64PipeBug())
1298 fixDsAtomicAsyncBarrierArriveB64(MI);
1299 if (ST.hasScratchBaseForwardingHazard())
1300 fixScratchBaseForwardingHazard(MI);
1301 if (ST.setRegModeNeedsVNOPs())
1302 fixSetRegMode(MI);
1303}
1304
1305static bool isVCmpXWritesExec(const SIInstrInfo &TII, const SIRegisterInfo &TRI,
1306 const MachineInstr &MI) {
1307 return (TII.isVOPC(MI) ||
1308 (MI.isCompare() && (TII.isVOP3(MI) || TII.isSDWA(MI)))) &&
1309 MI.modifiesRegister(Reg: AMDGPU::EXEC, TRI: &TRI);
1310}
1311
1312bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
1313 if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(MI: *MI))
1314 return false;
1315
1316 const SIInstrInfo *TII = ST.getInstrInfo();
1317 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1318 auto IsHazardFn = [TII, TRI](const MachineInstr &MI) {
1319 return isVCmpXWritesExec(TII: *TII, TRI: *TRI, MI);
1320 };
1321
1322 auto IsExpiredFn = [](const MachineInstr &MI, int) {
1323 unsigned Opc = MI.getOpcode();
1324 return SIInstrInfo::isVALU(MI) && Opc != AMDGPU::V_NOP_e32 &&
1325 Opc != AMDGPU::V_NOP_e64 && Opc != AMDGPU::V_NOP_sdwa;
1326 };
1327
1328 if (::getWaitStatesSince(IsHazard: IsHazardFn, MI, IsExpired: IsExpiredFn) ==
1329 std::numeric_limits<int>::max())
1330 return false;
1331
1332 // V_NOP will be discarded by SQ.
1333 // Use V_MOV_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE*
1334 // which is always a VGPR and available.
1335 auto *Src0 = TII->getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::src0);
1336 Register Reg = Src0->getReg();
1337 bool IsUndef = Src0->isUndef();
1338 BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(),
1339 MCID: TII->get(Opcode: AMDGPU::V_MOV_B32_e32))
1340 .addReg(RegNo: Reg, Flags: RegState::Define | getDeadRegState(B: IsUndef))
1341 .addReg(RegNo: Reg, Flags: IsUndef ? RegState::Undef : RegState::Kill);
1342
1343 return true;
1344}
1345
1346bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
1347 if (!ST.hasVMEMtoScalarWriteHazard())
1348 return false;
1349 assert(!ST.hasExtendedWaitCounts());
1350
1351 if (!SIInstrInfo::isSALU(MI: *MI) && !SIInstrInfo::isSMRD(MI: *MI))
1352 return false;
1353
1354 if (MI->getNumDefs() == 0)
1355 return false;
1356
1357 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1358
1359 auto IsHazardFn = [TRI, MI](const MachineInstr &I) {
1360 if (!SIInstrInfo::isVMEM(MI: I) && !SIInstrInfo::isDS(MI: I))
1361 return false;
1362
1363 for (const MachineOperand &Def : MI->defs()) {
1364 const MachineOperand *Op =
1365 I.findRegisterUseOperand(Reg: Def.getReg(), TRI, isKill: false);
1366 if (!Op)
1367 continue;
1368 return true;
1369 }
1370 return false;
1371 };
1372
1373 auto IsExpiredFn = [](const MachineInstr &MI, int) {
1374 return SIInstrInfo::isVALU(MI) ||
1375 (MI.getOpcode() == AMDGPU::S_WAITCNT &&
1376 !MI.getOperand(i: 0).getImm()) ||
1377 (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1378 AMDGPU::DepCtr::decodeFieldVmVsrc(Encoded: MI.getOperand(i: 0).getImm()) == 0);
1379 };
1380
1381 if (::getWaitStatesSince(IsHazard: IsHazardFn, MI, IsExpired: IsExpiredFn) ==
1382 std::numeric_limits<int>::max())
1383 return false;
1384
1385 const SIInstrInfo *TII = ST.getInstrInfo();
1386 BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(),
1387 MCID: TII->get(Opcode: AMDGPU::S_WAITCNT_DEPCTR))
1388 .addImm(Val: AMDGPU::DepCtr::encodeFieldVmVsrc(VmVsrc: 0, STI: ST));
1389 return true;
1390}
1391
1392bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
1393 if (!ST.hasSMEMtoVectorWriteHazard())
1394 return false;
1395 assert(!ST.hasExtendedWaitCounts());
1396
1397 if (!SIInstrInfo::isVALU(MI: *MI))
1398 return false;
1399
1400 AMDGPU::OpName SDSTName;
1401 switch (MI->getOpcode()) {
1402 case AMDGPU::V_READLANE_B32:
1403 case AMDGPU::V_READFIRSTLANE_B32:
1404 SDSTName = AMDGPU::OpName::vdst;
1405 break;
1406 default:
1407 SDSTName = AMDGPU::OpName::sdst;
1408 break;
1409 }
1410
1411 const SIInstrInfo *TII = ST.getInstrInfo();
1412 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1413 const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(GPU: ST.getCPU());
1414 const MachineOperand *SDST = TII->getNamedOperand(MI&: *MI, OperandName: SDSTName);
1415 if (!SDST) {
1416 for (const auto &MO : MI->implicit_operands()) {
1417 if (MO.isDef() && TRI->isSGPRClass(RC: TRI->getPhysRegBaseClass(Reg: MO.getReg()))) {
1418 SDST = &MO;
1419 break;
1420 }
1421 }
1422 }
1423
1424 if (!SDST)
1425 return false;
1426
1427 const Register SDSTReg = SDST->getReg();
1428 auto IsHazardFn = [SDSTReg, TRI](const MachineInstr &I) {
1429 return SIInstrInfo::isSMRD(MI: I) && I.readsRegister(Reg: SDSTReg, TRI);
1430 };
1431
1432 auto IsExpiredFn = [TII, IV](const MachineInstr &MI, int) {
1433 if (TII->isSALU(MI)) {
1434 switch (MI.getOpcode()) {
1435 case AMDGPU::S_SETVSKIP:
1436 case AMDGPU::S_VERSION:
1437 case AMDGPU::S_WAITCNT_VSCNT:
1438 case AMDGPU::S_WAITCNT_VMCNT:
1439 case AMDGPU::S_WAITCNT_EXPCNT:
        // These instructions cannot mitigate the hazard.
1441 return false;
1442 case AMDGPU::S_WAITCNT_LGKMCNT:
        // Reducing lgkmcnt to 0 always mitigates the hazard.
1444 return (MI.getOperand(i: 1).getImm() == 0) &&
1445 (MI.getOperand(i: 0).getReg() == AMDGPU::SGPR_NULL);
1446 case AMDGPU::S_WAITCNT: {
1447 const int64_t Imm = MI.getOperand(i: 0).getImm();
1448 AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(Version: IV, Encoded: Imm);
1449 // DsCnt corresponds to LGKMCnt here.
1450 return (Decoded.DsCnt == 0);
1451 }
1452 default:
1453 assert((!SIInstrInfo::isWaitcnt(MI.getOpcode()) ||
1454 MI.getOpcode() == AMDGPU::S_WAIT_IDLE) &&
1455 "unexpected wait count instruction");
1456 // SOPP instructions cannot mitigate the hazard.
1457 if (TII->isSOPP(MI))
1458 return false;
1459 // At this point the SALU can be assumed to mitigate the hazard
1460 // because either:
1461 // (a) it is independent of the at risk SMEM (breaking chain),
1462 // or
1463 // (b) it is dependent on the SMEM, in which case an appropriate
1464 // s_waitcnt lgkmcnt _must_ exist between it and the at risk
1465 // SMEM instruction.
1466 return true;
1467 }
1468 }
1469 return false;
1470 };
1471
1472 if (::getWaitStatesSince(IsHazard: IsHazardFn, MI, IsExpired: IsExpiredFn) ==
1473 std::numeric_limits<int>::max())
1474 return false;
1475
1476 BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(),
1477 MCID: TII->get(Opcode: AMDGPU::S_MOV_B32), DestReg: AMDGPU::SGPR_NULL)
1478 .addImm(Val: 0);
1479 return true;
1480}
1481
1482bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
1483 if (!ST.hasVcmpxExecWARHazard())
1484 return false;
1485 assert(!ST.hasExtendedWaitCounts());
1486
1487 if (!SIInstrInfo::isVALU(MI: *MI))
1488 return false;
1489
1490 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1491 if (!MI->modifiesRegister(Reg: AMDGPU::EXEC, TRI))
1492 return false;
1493
1494 auto IsHazardFn = [TRI](const MachineInstr &I) {
1495 if (SIInstrInfo::isVALU(MI: I))
1496 return false;
1497 return I.readsRegister(Reg: AMDGPU::EXEC, TRI);
1498 };
1499
1500 const SIInstrInfo *TII = ST.getInstrInfo();
1501 auto IsExpiredFn = [TII, TRI](const MachineInstr &MI, int) {
1502 if (SIInstrInfo::isVALU(MI)) {
1503 if (TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::sdst))
1504 return true;
1505 for (auto MO : MI.implicit_operands())
1506 if (MO.isDef() && TRI->isSGPRClass(RC: TRI->getPhysRegBaseClass(Reg: MO.getReg())))
1507 return true;
1508 }
1509 if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1510 AMDGPU::DepCtr::decodeFieldSaSdst(Encoded: MI.getOperand(i: 0).getImm()) == 0)
1511 return true;
1512 return false;
1513 };
1514
1515 if (::getWaitStatesSince(IsHazard: IsHazardFn, MI, IsExpired: IsExpiredFn) ==
1516 std::numeric_limits<int>::max())
1517 return false;
1518
1519 BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(),
1520 MCID: TII->get(Opcode: AMDGPU::S_WAITCNT_DEPCTR))
1521 .addImm(Val: AMDGPU::DepCtr::encodeFieldSaSdst(SaSdst: 0, STI: ST));
1522 return true;
1523}
1524
1525static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
1526 const GCNSubtarget &ST) {
1527 if (!ST.hasLdsBranchVmemWARHazard())
1528 return false;
1529
1530 // Check if the necessary condition for the hazard is met: both LDS and VMEM
1531 // instructions need to appear in the same function.
1532 bool HasLds = false;
1533 bool HasVmem = false;
1534 for (auto &MBB : MF) {
1535 for (auto &MI : MBB) {
1536 HasLds |= SIInstrInfo::isDS(MI) || SIInstrInfo::isLDSDMA(MI);
1537 HasVmem |= SIInstrInfo::isVMEM(MI);
1538 if (HasLds && HasVmem)
1539 return true;
1540 }
1541 }
1542 return false;
1543}
1544
1545static bool isStoreCountWaitZero(const MachineInstr &I) {
1546 return I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
1547 I.getOperand(i: 0).getReg() == AMDGPU::SGPR_NULL &&
1548 !I.getOperand(i: 1).getImm();
1549}
1550
1551bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
1552 if (!RunLdsBranchVmemWARHazardFixup)
1553 return false;
1554
1555 assert(ST.hasLdsBranchVmemWARHazard());
1556 assert(!ST.hasExtendedWaitCounts());
1557
1558 auto IsHazardInst = [](const MachineInstr &MI) {
1559 if (SIInstrInfo::isDS(MI) || SIInstrInfo::isLDSDMA(MI))
1560 return 1;
1561 if (SIInstrInfo::isVMEM(MI))
1562 return 2;
1563 return 0;
1564 };
1565
1566 auto InstType = IsHazardInst(*MI);
1567 if (!InstType)
1568 return false;
1569
1570 auto IsExpiredFn = [&IsHazardInst](const MachineInstr &I, int) {
1571 return IsHazardInst(I) || isStoreCountWaitZero(I);
1572 };
1573
1574 auto IsHazardFn = [InstType, &IsHazardInst](const MachineInstr &I) {
1575 if (!I.isBranch())
1576 return false;
1577
1578 auto IsHazardFn = [InstType, IsHazardInst](const MachineInstr &I) {
1579 auto InstType2 = IsHazardInst(I);
1580 return InstType2 && InstType != InstType2;
1581 };
1582
1583 auto IsExpiredFn = [InstType, &IsHazardInst](const MachineInstr &I, int) {
1584 auto InstType2 = IsHazardInst(I);
1585 if (InstType == InstType2)
1586 return true;
1587
1588 return isStoreCountWaitZero(I);
1589 };
1590
1591 return ::getWaitStatesSince(IsHazard: IsHazardFn, MI: &I, IsExpired: IsExpiredFn) !=
1592 std::numeric_limits<int>::max();
1593 };
1594
1595 if (::getWaitStatesSince(IsHazard: IsHazardFn, MI, IsExpired: IsExpiredFn) ==
1596 std::numeric_limits<int>::max())
1597 return false;
1598
1599 const SIInstrInfo *TII = ST.getInstrInfo();
1600 BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(),
1601 MCID: TII->get(Opcode: AMDGPU::S_WAITCNT_VSCNT))
1602 .addReg(RegNo: AMDGPU::SGPR_NULL, Flags: RegState::Undef)
1603 .addImm(Val: 0);
1604
1605 return true;
1606}
1607
1608bool GCNHazardRecognizer::fixLdsDirectVALUHazard(MachineInstr *MI) {
1609 if (!SIInstrInfo::isLDSDIR(MI: *MI))
1610 return false;
1611
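  // Count the VALU instructions between MI and the closest preceding VALU that
  // reads or writes MI's vdst; that distance, clamped to NoHazardWaitStates,
  // is encoded into the LDSDIR's waitvdst operand below.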
1612 const int NoHazardWaitStates = 15;
1613 const MachineOperand *VDST = TII.getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::vdst);
1614 const Register VDSTReg = VDST->getReg();
1615
1616 bool VisitedTrans = false;
1617 auto IsHazardFn = [this, VDSTReg, &VisitedTrans](const MachineInstr &I) {
1618 if (!SIInstrInfo::isVALU(MI: I))
1619 return false;
1620 VisitedTrans = VisitedTrans || SIInstrInfo::isTRANS(MI: I);
1621 // Cover both WAR and WAW
1622 return I.readsRegister(Reg: VDSTReg, TRI: &TRI) || I.modifiesRegister(Reg: VDSTReg, TRI: &TRI);
1623 };
1624 auto IsExpiredFn = [&](const MachineInstr &I, int WaitStates) {
1625 if (WaitStates >= NoHazardWaitStates)
1626 return true;
1627 // Instructions which cause va_vdst==0 expire the hazard
1628 return SIInstrInfo::isVMEM(MI: I) || SIInstrInfo::isDS(MI: I) ||
1629 SIInstrInfo::isEXP(MI: I);
1630 };
1631 auto GetWaitStatesFn = [](const MachineInstr &MI) {
1632 return SIInstrInfo::isVALU(MI) ? 1 : 0;
1633 };
1634
1635 DenseSet<const MachineBasicBlock *> Visited;
1636 auto Count = ::getWaitStatesSince(IsHazard: IsHazardFn, MBB: MI->getParent(),
1637 I: std::next(x: MI->getReverseIterator()), WaitStates: 0,
1638 IsExpired: IsExpiredFn, Visited, GetNumWaitStates: GetWaitStatesFn);
1639
1640 // Transcendentals can execute in parallel with other VALUs.
1641 // This makes the va_vdst count unusable with a mixture of VALU and TRANS.
1642 if (VisitedTrans)
1643 Count = 0;
1644
1645 MachineOperand *WaitVdstOp =
1646 TII.getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::waitvdst);
1647 WaitVdstOp->setImm(std::min(a: Count, b: NoHazardWaitStates));
1648
1649 return true;
1650}
1651
1652bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(MachineInstr *MI) {
1653 if (!SIInstrInfo::isLDSDIR(MI: *MI))
1654 return false;
1655
1656 const MachineOperand *VDST = TII.getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::vdst);
1657 const Register VDSTReg = VDST->getReg();
1658
1659 auto IsHazardFn = [this, VDSTReg](const MachineInstr &I) {
1660 if (!SIInstrInfo::isVMEM(MI: I) && !SIInstrInfo::isDS(MI: I))
1661 return false;
1662 return I.readsRegister(Reg: VDSTReg, TRI: &TRI) || I.modifiesRegister(Reg: VDSTReg, TRI: &TRI);
1663 };
1664 bool LdsdirCanWait = ST.hasLdsWaitVMSRC();
1665 // TODO: On GFX12 the hazard should expire on S_WAIT_LOADCNT/SAMPLECNT/BVHCNT
1666 // according to the type of VMEM instruction.
1667 auto IsExpiredFn = [this, LdsdirCanWait](const MachineInstr &I, int) {
1668 return SIInstrInfo::isVALU(MI: I) || SIInstrInfo::isEXP(MI: I) ||
1669 (I.getOpcode() == AMDGPU::S_WAITCNT && !I.getOperand(i: 0).getImm()) ||
1670 (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1671 AMDGPU::DepCtr::decodeFieldVmVsrc(Encoded: I.getOperand(i: 0).getImm()) == 0) ||
1672 (LdsdirCanWait && SIInstrInfo::isLDSDIR(MI: I) &&
1673 !TII.getNamedOperand(MI: I, OperandName: AMDGPU::OpName::waitvsrc)->getImm());
1674 };
1675
1676 if (::getWaitStatesSince(IsHazard: IsHazardFn, MI, IsExpired: IsExpiredFn) ==
1677 std::numeric_limits<int>::max())
1678 return false;
1679
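  // Mitigate either by clearing the LDSDIR's own waitvsrc field (when
  // ST.hasLdsWaitVMSRC()) or by inserting an explicit S_WAITCNT_DEPCTR with
  // vm_vsrc=0.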
1680 if (LdsdirCanWait) {
1681 TII.getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::waitvsrc)->setImm(0);
1682 } else {
1683 BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(),
1684 MCID: TII.get(Opcode: AMDGPU::S_WAITCNT_DEPCTR))
1685 .addImm(Val: AMDGPU::DepCtr::encodeFieldVmVsrc(VmVsrc: 0, STI: ST));
1686 }
1687
1688 return true;
1689}
1690
1691bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(MachineInstr *MI) {
1692 if (!ST.hasVALUPartialForwardingHazard())
1693 return false;
1694 assert(!ST.hasExtendedWaitCounts());
1695
1696 if (!ST.isWave64() || !SIInstrInfo::isVALU(MI: *MI))
1697 return false;
1698
1699 SmallSetVector<Register, 4> SrcVGPRs;
1700
1701 for (const MachineOperand &Use : MI->explicit_uses()) {
1702 if (Use.isReg() && TRI.isVGPR(MRI: MF.getRegInfo(), Reg: Use.getReg()))
1703 SrcVGPRs.insert(X: Use.getReg());
1704 }
1705
1706 // Only applies with >= 2 unique VGPR sources
1707 if (SrcVGPRs.size() <= 1)
1708 return false;
1709
1710 // Look for the following pattern:
1711 // Va <- VALU [PreExecPos]
1712 // intv1
1713 // Exec <- SALU [ExecPos]
1714 // intv2
1715 // Vb <- VALU [PostExecPos]
1716 // intv3
1717 // MI Va, Vb (WaitState = 0)
1718 //
1719 // Where:
1720 // intv1 + intv2 <= 2 VALUs
1721 // intv3 <= 4 VALUs
1722 //
1723 // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.
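  //
  // Purely illustrative (not from the ISA docs) instance of the pattern, with
  // hypothetical registers:
  //   v_cndmask_b32 v0, ...     ; Va write
  //   s_mov_b64 exec, s[2:3]    ; EXEC write
  //   v_mov_b32 v1, ...         ; Vb write
  //   v_add_f32 v2, v0, v1      ; MI reads both Va and Vb -> hazard, so an
  //                             ;   S_WAITCNT_DEPCTR with va_vdst=0 is added.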
1724
1725 const int Intv1plus2MaxVALUs = 2;
1726 const int Intv3MaxVALUs = 4;
1727 const int IntvMaxVALUs = 6;
1728 const int NoHazardVALUWaitStates = IntvMaxVALUs + 2;
1729
1730 struct StateType {
1731 SmallDenseMap<Register, int, 4> DefPos;
1732 int ExecPos = std::numeric_limits<int>::max();
1733 int VALUs = 0;
1734
1735 static unsigned getHashValue(const StateType &State) {
1736 return hash_combine(args: State.ExecPos, args: State.VALUs,
1737 args: hash_combine_range(R: State.DefPos));
1738 }
1739 static bool isEqual(const StateType &LHS, const StateType &RHS) {
1740 return LHS.DefPos == RHS.DefPos && LHS.ExecPos == RHS.ExecPos &&
1741 LHS.VALUs == RHS.VALUs;
1742 }
1743 };
1744
1745 StateType State;
1746
1747 // This lambda combines the expiry test with all of the hazard detection.
1748 auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
1749 // Too many VALU states have passed
1750 if (State.VALUs > NoHazardVALUWaitStates)
1751 return HazardExpired;
1752
1753 // Instructions which cause va_vdst==0 expire the hazard
1754 if (SIInstrInfo::isVMEM(MI: I) || SIInstrInfo::isDS(MI: I) ||
1755 SIInstrInfo::isEXP(MI: I) ||
1756 (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1757 AMDGPU::DepCtr::decodeFieldVaVdst(Encoded: I.getOperand(i: 0).getImm()) == 0))
1758 return HazardExpired;
1759
1760 // Track register writes
1761 bool Changed = false;
1762 if (SIInstrInfo::isVALU(MI: I)) {
1763 for (Register Src : SrcVGPRs) {
1764 if (!State.DefPos.count(Val: Src) && I.modifiesRegister(Reg: Src, TRI: &TRI)) {
1765 State.DefPos[Src] = State.VALUs;
1766 Changed = true;
1767 }
1768 }
1769 } else if (SIInstrInfo::isSALU(MI: I)) {
1770 if (State.ExecPos == std::numeric_limits<int>::max()) {
1771 if (!State.DefPos.empty() && I.modifiesRegister(Reg: AMDGPU::EXEC, TRI: &TRI)) {
1772 State.ExecPos = State.VALUs;
1773 Changed = true;
1774 }
1775 }
1776 }
1777
1778 // Early expiration: too many VALUs in intv3
1779 if (State.VALUs > Intv3MaxVALUs && State.DefPos.empty())
1780 return HazardExpired;
1781
1782 // Only evaluate state if something changed
1783 if (!Changed)
1784 return NoHazardFound;
1785
1786 // Determine positions of VALUs pre/post exec change
1787 if (State.ExecPos == std::numeric_limits<int>::max())
1788 return NoHazardFound;
1789
1790 int PreExecPos = std::numeric_limits<int>::max();
1791 int PostExecPos = std::numeric_limits<int>::max();
1792
1793 for (auto Entry : State.DefPos) {
1794 int DefVALUs = Entry.second;
1795 if (DefVALUs != std::numeric_limits<int>::max()) {
1796 if (DefVALUs >= State.ExecPos)
1797 PreExecPos = std::min(a: PreExecPos, b: DefVALUs);
1798 else
1799 PostExecPos = std::min(a: PostExecPos, b: DefVALUs);
1800 }
1801 }
1802
1803 // Need a VALU def after the exec change (PostExecPos)
1804 if (PostExecPos == std::numeric_limits<int>::max())
1805 return NoHazardFound;
1806
1807 // Too many VALUs in intv3?
1808 int Intv3VALUs = PostExecPos;
1809 if (Intv3VALUs > Intv3MaxVALUs)
1810 return HazardExpired;
1811
1812 // Too many VALUs in intv2?
1813 int Intv2VALUs = (State.ExecPos - PostExecPos) - 1;
1814 if (Intv2VALUs > Intv1plus2MaxVALUs)
1815 return HazardExpired;
1816
1817 // Need a VALU def before the exec change (PreExecPos)
1818 if (PreExecPos == std::numeric_limits<int>::max())
1819 return NoHazardFound;
1820
1821 // Too many VALUs in intv1?
1822 int Intv1VALUs = PreExecPos - State.ExecPos;
1823 if (Intv1VALUs > Intv1plus2MaxVALUs)
1824 return HazardExpired;
1825
1826 // Too many VALUs in intv1 + intv2
1827 if (Intv1VALUs + Intv2VALUs > Intv1plus2MaxVALUs)
1828 return HazardExpired;
1829
1830 return HazardFound;
1831 };
1832 auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
1833 if (SIInstrInfo::isVALU(MI))
1834 State.VALUs += 1;
1835 };
1836
1837 if (!hasHazard<StateType>(InitialState: State, IsHazard: IsHazardFn, UpdateState: UpdateStateFn, InitialMBB: MI->getParent(),
1838 InitialI: std::next(x: MI->getReverseIterator())))
1839 return false;
1840
1841 BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(),
1842 MCID: TII.get(Opcode: AMDGPU::S_WAITCNT_DEPCTR))
1843 .addImm(Val: AMDGPU::DepCtr::encodeFieldVaVdst(VaVdst: 0, STI: ST));
1844
1845 return true;
1846}
1847
1848bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) {
1849 if (!ST.hasVALUTransUseHazard())
1850 return false;
1851 assert(!ST.hasExtendedWaitCounts());
1852
1853 if (!SIInstrInfo::isVALU(MI: *MI))
1854 return false;
1855
1856 SmallSet<Register, 4> SrcVGPRs;
1857
1858 for (const MachineOperand &Use : MI->explicit_uses()) {
1859 if (Use.isReg() && TRI.isVGPR(MRI: MF.getRegInfo(), Reg: Use.getReg()))
1860 SrcVGPRs.insert(V: Use.getReg());
1861 }
1862
1863 // Look for the following pattern:
1864 // Va <- TRANS VALU
1865 // intv
1866 // MI Va (WaitState = 0)
1867 //
1868 // Where:
1869 // intv <= 5 VALUs / 1 TRANS
1870 //
1871 // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.
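  //
  // Purely illustrative (not from the ISA docs) instance, with hypothetical
  // registers:
  //   v_exp_f32 v0, v1          ; TRANS writes Va
  //   v_add_f32 v2, v0, v3      ; MI reads Va within the interval -> hazard,
  //                             ;   so an S_WAITCNT_DEPCTR va_vdst=0 is added.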
1872
1873 const int IntvMaxVALUs = 5;
1874 const int IntvMaxTRANS = 1;
1875
1876 struct StateType {
1877 int VALUs = 0;
1878 int TRANS = 0;
1879
1880 static unsigned getHashValue(const StateType &State) {
1881 return hash_combine(args: State.VALUs, args: State.TRANS);
1882 }
1883 static bool isEqual(const StateType &LHS, const StateType &RHS) {
1884 return LHS.VALUs == RHS.VALUs && LHS.TRANS == RHS.TRANS;
1885 }
1886 };
1887
1888 StateType State;
1889
1890 // This lambda combines the expiry test with all of the hazard detection.
1891 auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
1892 // Too many VALU states have passed
1893 if (State.VALUs > IntvMaxVALUs || State.TRANS > IntvMaxTRANS)
1894 return HazardExpired;
1895
1896 // Instructions which cause va_vdst==0 expire the hazard
1897 if (SIInstrInfo::isVMEM(MI: I) || SIInstrInfo::isDS(MI: I) ||
1898 SIInstrInfo::isEXP(MI: I) ||
1899 (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1900 AMDGPU::DepCtr::decodeFieldVaVdst(Encoded: I.getOperand(i: 0).getImm()) == 0))
1901 return HazardExpired;
1902
1903 // Track register writes
1904 if (SIInstrInfo::isTRANS(MI: I)) {
1905 for (Register Src : SrcVGPRs) {
1906 if (I.modifiesRegister(Reg: Src, TRI: &TRI)) {
1907 return HazardFound;
1908 }
1909 }
1910 }
1911
1912 return NoHazardFound;
1913 };
1914 auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
1915 if (SIInstrInfo::isVALU(MI))
1916 State.VALUs += 1;
1917 if (SIInstrInfo::isTRANS(MI))
1918 State.TRANS += 1;
1919 };
1920
1921 if (!hasHazard<StateType>(InitialState: State, IsHazard: IsHazardFn, UpdateState: UpdateStateFn, InitialMBB: MI->getParent(),
1922 InitialI: std::next(x: MI->getReverseIterator())))
1923 return false;
1924
1925 // Hazard is observed - insert a wait on the va_vdst counter to ensure the
1926 // hazard is avoided.
1927 BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(),
1928 MCID: TII.get(Opcode: AMDGPU::S_WAITCNT_DEPCTR))
1929 .addImm(Val: AMDGPU::DepCtr::encodeFieldVaVdst(VaVdst: 0, STI: ST));
1930
1931 return true;
1932}
1933
1934bool GCNHazardRecognizer::fixVALUTransCoexecutionHazards(MachineInstr *MI) {
1935 if (!ST.hasGFX1250Insts() || // Coexecution disabled.
1936 !SIInstrInfo::isVALU(MI: *MI) || SIInstrInfo::isTRANS(MI: *MI))
1937 return false;
1938
1939 const SIInstrInfo *TII = ST.getInstrInfo();
1940 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1941
1942 auto IsTransHazardFn = [MI, TII, TRI](const MachineInstr &I) {
1943 if (!SIInstrInfo::isTRANS(MI: I))
1944 return false;
1945
1946 // RAW: Trans(I) writes, VALU(MI) reads.
1947 Register TransDef = TII->getNamedOperand(MI: I, OperandName: AMDGPU::OpName::vdst)->getReg();
1948 for (const MachineOperand &ValuUse : MI->explicit_uses()) {
1949 if (ValuUse.isReg() && TRI->regsOverlap(RegA: TransDef, RegB: ValuUse.getReg()))
1950 return true;
1951 }
1952
1953 auto *ValuDst = TII->getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::vdst);
1954 if (!ValuDst || !ValuDst->isReg())
1955 return false;
1956
1957 // WAR: Trans(I) reads, VALU(MI) writes.
1958 Register ValuDef = ValuDst->getReg();
1959 for (const MachineOperand &TransUse : I.explicit_uses()) {
1960 if (TransUse.isReg() && TRI->regsOverlap(RegA: ValuDef, RegB: TransUse.getReg()))
1961 return true;
1962 }
1963
1964 return false;
1965 };
1966
1967 auto IsExpiredFn = [](const MachineInstr &I, int) {
1968 return SIInstrInfo::isVALU(MI: I);
1969 };
1970
1971 const int HasVALU = std::numeric_limits<int>::max();
1972 if (::getWaitStatesSince(IsHazard: IsTransHazardFn, MI, IsExpired: IsExpiredFn) == HasVALU)
1973 return false;
1974
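  // Any VALU between the TRANS and the dependent VALU expires the hazard (see
  // IsExpiredFn); insert a V_NOP to guarantee one is present.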
1975 BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(), MCID: TII->get(Opcode: AMDGPU::V_NOP_e32));
1976 return true;
1977}
1978
1979bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) {
1980 if (!SIInstrInfo::isWMMA(MI: *MI) && !SIInstrInfo::isSWMMAC(MI: *MI))
1981 return false;
1982
1983 const SIInstrInfo *TII = ST.getInstrInfo();
1984 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1985
1986 auto IsHazardFn = [MI, TII, TRI, this](const MachineInstr &I) {
1987 if (!SIInstrInfo::isWMMA(MI: I) && !SIInstrInfo::isSWMMAC(MI: I))
1988 return false;
1989
1990 // Src0 (matrix A) or Src1 (matrix B) of the current wmma instruction overlaps
1991 // with the dest (matrix D) of the previous wmma.
1992 const Register CurSrc0Reg =
1993 TII->getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::src0)->getReg();
1994 const Register CurSrc1Reg =
1995 TII->getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::src1)->getReg();
1996
1997 const Register PrevDstReg =
1998 TII->getNamedOperand(MI: I, OperandName: AMDGPU::OpName::vdst)->getReg();
1999
2000 if (TRI->regsOverlap(RegA: PrevDstReg, RegB: CurSrc0Reg) ||
2001 TRI->regsOverlap(RegA: PrevDstReg, RegB: CurSrc1Reg)) {
2002 return true;
2003 }
2004
2005 // GFX12+ allows overlap of matrix C with PrevDstReg (hardware will stall)
2006 // but Index can't overlap with PrevDstReg.
2007 if (AMDGPU::isGFX12Plus(STI: ST)) {
2008 if (SIInstrInfo::isSWMMAC(MI: *MI)) {
2009 const Register CurIndex =
2010 TII->getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::src2)->getReg();
2011 if (TRI->regsOverlap(RegA: PrevDstReg, RegB: CurIndex))
2012 return true;
2013 }
2014 return false;
2015 }
2016
2017 return false;
2018 };
2019
2020 auto IsExpiredFn = [](const MachineInstr &I, int) {
2021 return SIInstrInfo::isVALU(MI: I);
2022 };
2023
2024 if (::getWaitStatesSince(IsHazard: IsHazardFn, MI, IsExpired: IsExpiredFn) ==
2025 std::numeric_limits<int>::max())
2026 return false;
2027
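  // Any VALU between the two WMMA/SWMMAC instructions expires the hazard (see
  // IsExpiredFn); insert a V_NOP to guarantee one is present.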
2028 BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(), MCID: TII->get(Opcode: AMDGPU::V_NOP_e32));
2029
2030 return true;
2031}
2032
2033static bool isCoexecutableVALUInst(const MachineInstr &MI) {
2034 return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isTRANS(MI) &&
2035 !SIInstrInfo::isWMMA(MI) && !SIInstrInfo::isSWMMAC(MI); // What else?
2036}
2037
2038static bool IsWMMAHazardInstInCategory(const MachineInstr &MI,
2039 const SIInstrInfo *TII, unsigned Latency,
2040 unsigned Category) {
2041 assert(TII->isXDLWMMA(MI) && (Latency == 8 || Latency == 16) &&
2042 "Handle me if the xdl wmma instruction latency changes");
2043
2044 switch (Category) {
2045 case 0: // Dense WMMA Instructions:
2046 // WMMA_*F16, WMMA_*BF16
2047 // WMMA_*FP8FP8
2048 // WMMA_*FP8BF8
2049 // WMMA_*BF8FP8
2050 // WMMA_*BF8BF8
2051 // WMMA_*F8F6F4 if SRCA & SRCB != F8
2052 return Latency == 8 && SIInstrInfo::isWMMA(MI);
2053
2054 case 1: // Dense WMMA Instructions:
2055 // WMMA_IU8
2056 // WMMA_IU4
2057 // WMMA_*F8F6F4 if SRCA OR SRCB == F8
2058 return Latency == 16 && SIInstrInfo::isWMMA(MI);
2059
2060 case 2: // Dense SWMMAC Instructions
2061 // SWMMAC_*F16, SWMMAC_*BF16,
2062 // SWMMAC_*FP8FP8
2063 // SWMMAC_*BF8FP8
2064 // SWMMAC_*FP8BF8
2065 // SWMMAC_*BF8BF8
2066 return Latency == 8 && SIInstrInfo::isSWMMAC(MI);
2067
2068 case 3: // Sparse WMMA Instructions:
2069 // SWMMAC_IU8
2070 // SWMMAC_IU4
2071 return Latency == 16 && SIInstrInfo::isSWMMAC(MI);
2072 default:
2073 break;
2074 } // end switch.
2075
2076 return false;
2077}
2078
2079int GCNHazardRecognizer::checkWMMACoexecutionHazards(MachineInstr *MI) {
2080 if (!ST.hasGFX1250Insts())
2081 return 0;
2082
2083 const SIInstrInfo *TII = ST.getInstrInfo();
2084 if (!TII->isXDLWMMA(MI: *MI) && !isCoexecutableVALUInst(MI: *MI))
2085 return 0;
2086
2087 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2088
2089 // WaitStates here is the number of V_NOPs or unrelated VALU instructions that
2090 // must be in between the first WMMA and the second instruction to cover the
2091 // hazard (WMMAWaitStates if the second is also a WMMA, VALUWaitStates if the
2092 // second is a VALU). Refer to SPG 4.6.12.1 "Requirements for WMMA data hazards"
2093 // for the numbers, which depend on the category of the first WMMA.
2094 const int WMMAWaitStates[] = {5, 9, 3, 5};
2095 const int VALUWaitStates[] = {4, 8, 2, 4};
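  // For example (illustrative reading of the tables above): if the first WMMA
  // falls into category 1, a dependent WMMA needs 9 intervening VALUs/V_NOPs
  // and a dependent co-executable VALU needs 8.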
2096 unsigned Category = 0;
2097
2098 auto IsWMMAHazardFn = [MI, TII, TRI, &Category, this](const MachineInstr &I) {
2099 if (!TII->isXDLWMMA(MI: I))
2100 return false;
2101
2102 unsigned Latency = TSchedModel.computeInstrLatency(MI: &I);
2103 if (!IsWMMAHazardInstInCategory(MI: I, TII, Latency, Category))
2104 return false;
2105
2106 Register D0 = TII->getNamedOperand(MI: I, OperandName: AMDGPU::OpName::vdst)->getReg();
2107 Register A1 = TII->getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::src0)->getReg();
2108 Register B1 = TII->getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::src1)->getReg();
2109
2110 // WMMA0 writes (D0), WMMA1 reads (A1/B1/Idx1).
2111 if (TRI->regsOverlap(RegA: D0, RegB: A1) || TRI->regsOverlap(RegA: D0, RegB: B1))
2112 return true;
2113
2114 if (SIInstrInfo::isSWMMAC(MI: *MI)) {
2115 Register Idx1 = TII->getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::src2)->getReg();
2116 if (TRI->regsOverlap(RegA: D0, RegB: Idx1))
2117 return true;
2118 }
2119
2120 return false;
2121 };
2122
2123 auto IsVALUHazardFn = [MI, TII, TRI, &Category, this](const MachineInstr &I) {
2124 if (!TII->isXDLWMMA(MI: I))
2125 return false;
2126
2127 unsigned Latency = TSchedModel.computeInstrLatency(MI: &I);
2128 if (!IsWMMAHazardInstInCategory(MI: I, TII, Latency, Category))
2129 return false;
2130
2131 // WMMA writes, VALU reads.
2132 Register D0 = TII->getNamedOperand(MI: I, OperandName: AMDGPU::OpName::vdst)->getReg();
2133 for (const MachineOperand &ValuUse : MI->explicit_uses()) {
2134 if (ValuUse.isReg() && TRI->regsOverlap(RegA: D0, RegB: ValuUse.getReg()))
2135 return true;
2136 }
2137
2138 auto *ValuDst = TII->getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::vdst);
2139 if (!ValuDst || !ValuDst->isReg())
2140 return false;
2141 Register D1 = ValuDst->getReg();
2142
2143 // WMMA writes, VALU writes.
2144 if (TRI->regsOverlap(RegA: D0, RegB: D1))
2145 return true;
2146
2147 // WMMA reads, VALU writes.
2148 Register A0 = TII->getNamedOperand(MI: I, OperandName: AMDGPU::OpName::src0)->getReg();
2149 Register B0 = TII->getNamedOperand(MI: I, OperandName: AMDGPU::OpName::src1)->getReg();
2150 if (TRI->regsOverlap(RegA: A0, RegB: D1) || TRI->regsOverlap(RegA: B0, RegB: D1))
2151 return true;
2152
2153 if (SIInstrInfo::isSWMMAC(MI: I)) {
2154 Register Idx0 = TII->getNamedOperand(MI: I, OperandName: AMDGPU::OpName::src2)->getReg();
2155 if (TRI->regsOverlap(RegA: D1, RegB: Idx0))
2156 return true;
2157 }
2158
2159 return false;
2160 };
2161
2162 int Limit = 0;
2163
2164 auto GetWaitStatesFn = [](const MachineInstr &I) {
2165 return SIInstrInfo::isVALU(MI: I) ? 1 : 0;
2166 };
2167
2168 int WaitStatesNeeded = -1;
2169 if (TII->isXDLWMMA(MI: *MI)) {
2170 for (Category = 0; WaitStatesNeeded < 0 && Category < 4; Category++) {
2171 Limit = WMMAWaitStates[Category]; // for IsExpiredFn.
2172 // 'getWaitStatesSince' returns the number of VALUs in between if hazard
2173 // exists, and INT_MAX if there is no hazard. As a result, a negative
2174 // WaitStatesNeeded here means no hazard, and we will continue to search
2175 // for other categories.
2176 WaitStatesNeeded =
2177 Limit - getWaitStatesSince(IsHazard: IsWMMAHazardFn, Limit, GetNumWaitStates: GetWaitStatesFn);
2178 }
2179 } else { // Must be a co-executable VALU.
2180 for (Category = 0; WaitStatesNeeded < 0 && Category < 4; Category++) {
2181 Limit = VALUWaitStates[Category]; // for IsExpiredFn.
2182 // 'getWaitStatesSince' returns the number of VALUs in between if hazard
2183 // exists, and INT_MAX if there is no hazard. As a result, a negative
2184 // WaitStatesNeeded here means no hazard, and we will continue to search
2185 // for other categories.
2186 WaitStatesNeeded =
2187 Limit - getWaitStatesSince(IsHazard: IsVALUHazardFn, Limit, GetNumWaitStates: GetWaitStatesFn);
2188 }
2189 }
2190
2191 return WaitStatesNeeded;
2192}
2193
2194bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) {
2195 if (!ST.hasShift64HighRegBug())
2196 return false;
2197 assert(!ST.hasExtendedWaitCounts());
2198
2199 switch (MI->getOpcode()) {
2200 default:
2201 return false;
2202 case AMDGPU::V_LSHLREV_B64_e64:
2203 case AMDGPU::V_LSHRREV_B64_e64:
2204 case AMDGPU::V_ASHRREV_I64_e64:
2205 break;
2206 }
2207
2208 MachineOperand *Amt = TII.getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::src0);
2209 if (!Amt->isReg())
2210 return false;
2211
2212 Register AmtReg = Amt->getReg();
2213 const MachineRegisterInfo &MRI = MF.getRegInfo();
2214 // Check if this is the last VGPR in the allocation block.
2215 if (!TRI.isVGPR(MRI, Reg: AmtReg) || ((AmtReg - AMDGPU::VGPR0) & 7) != 7)
2216 return false;
2217
2218 if (AmtReg != AMDGPU::VGPR255 && MRI.isPhysRegUsed(PhysReg: AmtReg + 1))
2219 return false;
2220
2221 assert(ST.needsAlignedVGPRs());
2222 static_assert(AMDGPU::VGPR0 + 1 == AMDGPU::VGPR1);
2223
2224 const DebugLoc &DL = MI->getDebugLoc();
2225 MachineBasicBlock *MBB = MI->getParent();
2226 MachineOperand *Src1 = TII.getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::src1);
2227
2228 // In:
2229 //
2230 // Dst = shiftrev64 Amt, Src1
2231 //
2232 // if Dst!=Src1 then avoid the bug with:
2233 //
2234 // Dst.sub0 = Amt
2235 // Dst = shift64 Dst.sub0, Src1
2236
2237 Register DstReg = MI->getOperand(i: 0).getReg();
2238 if (!Src1->isReg() || Src1->getReg() != DstReg) {
2239 Register DstLo = TRI.getSubReg(Reg: DstReg, Idx: AMDGPU::sub0);
2240 runOnInstruction(
2241 MI: BuildMI(BB&: *MBB, I: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_MOV_B32_e32), DestReg: DstLo).add(MO: *Amt));
2242 Amt->setReg(DstLo);
2243 Amt->setIsKill(true);
2244 return true;
2245 }
2246
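  // Otherwise Dst aliases Src1: swap the shift amount (and, if it overlaps the
  // destination, the whole 64-bit pair) into a free VGPR with V_SWAP_B32,
  // perform the shift on the swapped registers, and swap back after MI.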
2247 bool Overlapped = MI->modifiesRegister(Reg: AmtReg, TRI: &TRI);
2248 Register NewReg;
2249 for (MCRegister Reg : Overlapped ? AMDGPU::VReg_64_Align2RegClass
2250 : AMDGPU::VGPR_32RegClass) {
2251 if (!MI->modifiesRegister(Reg, TRI: &TRI) && !MI->readsRegister(Reg, TRI: &TRI)) {
2252 NewReg = Reg;
2253 break;
2254 }
2255 }
2256
2257 Register NewAmt = Overlapped ? (Register)TRI.getSubReg(Reg: NewReg, Idx: AMDGPU::sub1)
2258 : NewReg;
2259 Register NewAmtLo;
2260
2261 if (Overlapped)
2262 NewAmtLo = TRI.getSubReg(Reg: NewReg, Idx: AMDGPU::sub0);
2263
2264 // Insert a full wait count because the found register might be pending a wait.
2265 BuildMI(BB&: *MBB, I: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_WAITCNT))
2266 .addImm(Val: 0);
2267
2268 // Insert V_SWAP_B32 instruction(s) and run hazard recognizer on them.
2269 if (Overlapped)
2270 runOnInstruction(
2271 MI: BuildMI(BB&: *MBB, I: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_SWAP_B32), DestReg: NewAmtLo)
2272 .addDef(RegNo: AmtReg - 1)
2273 .addReg(RegNo: AmtReg - 1, Flags: RegState::Undef)
2274 .addReg(RegNo: NewAmtLo, Flags: RegState::Undef));
2275 runOnInstruction(MI: BuildMI(BB&: *MBB, I: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_SWAP_B32), DestReg: NewAmt)
2276 .addDef(RegNo: AmtReg)
2277 .addReg(RegNo: AmtReg, Flags: RegState::Undef)
2278 .addReg(RegNo: NewAmt, Flags: RegState::Undef));
2279
2280 // Instructions emitted after the current instruction will be processed by the
2281 // parent loop of the hazard recognizer in a natural way.
2282 BuildMI(BB&: *MBB, I: std::next(x: MI->getIterator()), MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_SWAP_B32),
2283 DestReg: AmtReg)
2284 .addDef(RegNo: NewAmt)
2285 .addReg(RegNo: NewAmt)
2286 .addReg(RegNo: AmtReg);
2287 if (Overlapped)
2288 BuildMI(BB&: *MBB, I: std::next(x: MI->getIterator()), MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_SWAP_B32),
2289 DestReg: AmtReg - 1)
2290 .addDef(RegNo: NewAmtLo)
2291 .addReg(RegNo: NewAmtLo)
2292 .addReg(RegNo: AmtReg - 1);
2293
2294 // Re-running the hazard recognizer on the modified instruction is not
2295 // necessary; the inserted V_SWAP_B32 has already both read and written the new
2296 // registers, so hazards related to these registers have already been handled.
2297 Amt->setReg(NewAmt);
2298 Amt->setIsKill(false);
2299 // We do not update liveness, so the verifier may see it as undef.
2300 Amt->setIsUndef();
2301 if (Overlapped) {
2302 MI->getOperand(i: 0).setReg(NewReg);
2303 Src1->setReg(NewReg);
2304 Src1->setIsKill(false);
2305 Src1->setIsUndef();
2306 }
2307
2308 return true;
2309}
2310
2311int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) {
2312 int NSAtoVMEMWaitStates = 1;
2313
2314 if (!ST.hasNSAtoVMEMBug())
2315 return 0;
2316
2317 if (!SIInstrInfo::isMUBUF(MI: *MI) && !SIInstrInfo::isMTBUF(MI: *MI))
2318 return 0;
2319
2320 const SIInstrInfo *TII = ST.getInstrInfo();
2321 const auto *Offset = TII->getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::offset);
2322 if (!Offset || (Offset->getImm() & 6) == 0)
2323 return 0;
2324
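  // Hazard: a MUBUF/MTBUF whose immediate offset has bit 1 or bit 2 set, issued
  // within one wait state of an NSA-encoded MIMG whose encoded size is at least
  // 16 bytes.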
2325 auto IsHazardFn = [TII](const MachineInstr &I) {
2326 if (!SIInstrInfo::isMIMG(MI: I))
2327 return false;
2328 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc: I.getOpcode());
2329 return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
2330 TII->getInstSizeInBytes(MI: I) >= 16;
2331 };
2332
2333 return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazard: IsHazardFn, Limit: 1);
2334}
2335
2336int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) {
2337 int FPAtomicToDenormModeWaitStates = 3;
2338
2339 if (!ST.hasFPAtomicToDenormModeHazard())
2340 return 0;
2341 assert(!ST.hasExtendedWaitCounts());
2342
2343 if (MI->getOpcode() != AMDGPU::S_DENORM_MODE)
2344 return 0;
2345
2346 auto IsHazardFn = [](const MachineInstr &I) {
2347 if (!SIInstrInfo::isVMEM(MI: I))
2348 return false;
2349 return SIInstrInfo::isFPAtomic(MI: I);
2350 };
2351
2352 auto IsExpiredFn = [](const MachineInstr &MI, int WaitStates) {
2353 if (WaitStates >= 3 || SIInstrInfo::isVALU(MI))
2354 return true;
2355
2356 return SIInstrInfo::isWaitcnt(Opcode: MI.getOpcode());
2357 };
2358
2359 return FPAtomicToDenormModeWaitStates -
2360 ::getWaitStatesSince(IsHazard: IsHazardFn, MI, IsExpired: IsExpiredFn);
2361}
2362
2363int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
2364 assert(SIInstrInfo::isMAI(*MI));
2365
2366 return ST.hasGFX90AInsts() ? checkMAIHazards90A(MI) : checkMAIHazards908(MI);
2367}
2368
2369int GCNHazardRecognizer::checkMFMAPadding(MachineInstr *MI) {
2370 // Early exit if no padding is requested.
2371 if (MFMAPaddingRatio == 0)
2372 return 0;
2373
2374 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2375 if (!SIInstrInfo::isMFMA(MI: *MI) || MFI->getOccupancy() < 2)
2376 return 0;
2377
2378 int NeighborMFMALatency = 0;
2379 auto IsNeighboringMFMA = [&NeighborMFMALatency,
2380 this](const MachineInstr &MI) {
2381 if (!SIInstrInfo::isMFMA(MI))
2382 return false;
2383
2384 NeighborMFMALatency = this->getMFMAPipelineWaitStates(MI);
2385 return true;
2386 };
2387
2388 const int MaxMFMAPipelineWaitStates = 16;
2389 int WaitStatesSinceNeighborMFMA =
2390 getWaitStatesSince(IsHazard: IsNeighboringMFMA, Limit: MaxMFMAPipelineWaitStates);
2391
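  // For example, with MFMAPaddingRatio set to 50 and a neighboring MFMA of
  // latency 16, up to 16 * 50 / 100 = 8 wait states are requested, minus any
  // that have already elapsed since that MFMA.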
2392 int NeighborMFMAPaddingNeeded =
2393 (NeighborMFMALatency * MFMAPaddingRatio / 100) -
2394 WaitStatesSinceNeighborMFMA;
2395
2396 return std::max(a: 0, b: NeighborMFMAPaddingNeeded);
2397}
2398
2399int GCNHazardRecognizer::checkMAIHazards908(MachineInstr *MI) {
2400 int WaitStatesNeeded = 0;
2401 unsigned Opc = MI->getOpcode();
2402
2403 auto IsVALUFn = [](const MachineInstr &MI) {
2404 return SIInstrInfo::isVALU(MI) || MI.isInlineAsm();
2405 };
2406
2407 if (Opc != AMDGPU::V_ACCVGPR_READ_B32_e64) { // MFMA or v_accvgpr_write
2408 const int LegacyVALUWritesVGPRWaitStates = 2;
2409 const int VALUWritesExecWaitStates = 4;
2410 const int MaxWaitStates = 4;
2411
2412 int WaitStatesNeededForUse = VALUWritesExecWaitStates -
2413 getWaitStatesSinceDef(Reg: AMDGPU::EXEC, IsHazardDef: IsVALUFn, Limit: MaxWaitStates);
2414 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
2415
2416 if (WaitStatesNeeded < MaxWaitStates) {
2417 for (const MachineOperand &Use : MI->explicit_uses()) {
2418 const int MaxWaitStates = 2;
2419
2420 if (!Use.isReg() || !TRI.isVGPR(MRI: MF.getRegInfo(), Reg: Use.getReg()))
2421 continue;
2422
2423 int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
2424 getWaitStatesSinceDef(Reg: Use.getReg(), IsHazardDef: IsVALUFn, Limit: MaxWaitStates);
2425 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
2426
2427 if (WaitStatesNeeded == MaxWaitStates)
2428 break;
2429 }
2430 }
2431 }
2432
2433 for (const MachineOperand &Op : MI->explicit_operands()) {
2434 if (!Op.isReg() || !TRI.isAGPR(MRI: MF.getRegInfo(), Reg: Op.getReg()))
2435 continue;
2436
2437 if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2438 continue;
2439
2440 const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
2441 const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
2442 const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
2443 const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
2444 const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
2445 const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
2446 const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
2447 const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
2448 const int MaxWaitStates = 18;
2449 Register Reg = Op.getReg();
2450 unsigned HazardDefLatency = 0;
2451
2452 auto IsOverlappedMFMAFn = [Reg, &HazardDefLatency,
2453 this](const MachineInstr &MI) {
2454 if (!SIInstrInfo::isMFMA(MI))
2455 return false;
2456 Register DstReg = MI.getOperand(i: 0).getReg();
2457 if (DstReg == Reg)
2458 return false;
2459 HazardDefLatency =
2460 std::max(a: HazardDefLatency, b: TSchedModel.computeInstrLatency(MI: &MI));
2461 return TRI.regsOverlap(RegA: DstReg, RegB: Reg);
2462 };
2463
2464 int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsHazardDef: IsOverlappedMFMAFn,
2465 Limit: MaxWaitStates);
2466 int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
2467 int SrcCIdx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::src2);
2468 int OpNo = Op.getOperandNo();
2469 if (OpNo == SrcCIdx) {
2470 NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
2471 } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) {
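      // The scheduling-model latency distinguishes the producing MFMA's shape:
      // 2 passes -> 4x4, 8 -> 16x16, 16 (and the default) -> 32x32.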
2472 switch (HazardDefLatency) {
2473 case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
2474 break;
2475 case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
2476 break;
2477 case 16: [[fallthrough]];
2478 default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
2479 break;
2480 }
2481 } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
2482 switch (HazardDefLatency) {
2483 case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
2484 break;
2485 case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
2486 break;
2487 case 16: [[fallthrough]];
2488 default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
2489 break;
2490 }
2491 }
2492
2493 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2494 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
2495
2496 if (WaitStatesNeeded == MaxWaitStates)
2497 return WaitStatesNeeded; // Early exit.
2498
2499 auto IsAccVgprWriteFn = [Reg, this](const MachineInstr &MI) {
2500 if (MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2501 return false;
2502 Register DstReg = MI.getOperand(i: 0).getReg();
2503 return TRI.regsOverlap(RegA: Reg, RegB: DstReg);
2504 };
2505
2506 const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
2507 const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
2508 const int AccVGPRWriteAccVgprReadWaitStates = 3;
2509 NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
2510 if (OpNo == SrcCIdx)
2511 NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
2512 else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64)
2513 NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;
2514
2515 WaitStatesNeededForUse = NeedWaitStates -
2516 getWaitStatesSinceDef(Reg, IsHazardDef: IsAccVgprWriteFn, Limit: MaxWaitStates);
2517 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
2518
2519 if (WaitStatesNeeded == MaxWaitStates)
2520 return WaitStatesNeeded; // Early exit.
2521 }
2522
2523 if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
2524 const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
2525 const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
2526 const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
2527 const int MaxWaitStates = 13;
2528 Register DstReg = MI->getOperand(i: 0).getReg();
2529 unsigned HazardDefLatency = 0;
2530
2531 auto IsSrcCMFMAFn = [DstReg, &HazardDefLatency,
2532 this](const MachineInstr &MI) {
2533 if (!SIInstrInfo::isMFMA(MI))
2534 return false;
2535 Register Reg = TII.getNamedOperand(MI, OperandName: AMDGPU::OpName::src2)->getReg();
2536 HazardDefLatency =
2537 std::max(a: HazardDefLatency, b: TSchedModel.computeInstrLatency(MI: &MI));
2538 return TRI.regsOverlap(RegA: Reg, RegB: DstReg);
2539 };
2540
2541 int WaitStatesSince = getWaitStatesSince(IsHazard: IsSrcCMFMAFn, Limit: MaxWaitStates);
2542 int NeedWaitStates;
2543 switch (HazardDefLatency) {
2544 case 2: NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
2545 break;
2546 case 8: NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
2547 break;
2548 case 16: [[fallthrough]];
2549 default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
2550 break;
2551 }
2552
2553 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
2554 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
2555 }
2556
2557 // Pad neighboring MFMA with noops for better inter-wave performance.
2558 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: checkMFMAPadding(MI));
2559
2560 return WaitStatesNeeded;
2561}
2562
2563static int
2564GFX940_XDL_N_PassWritesVGPROverlappedXDLOrSMFMASrcCWaitStates(int NumPasses,
2565 bool IsGFX950) {
2566 // xdl def cycles | gfx940 | gfx950
2567 //  2 pass        |      3 |      4
2568 //  4 pass        |      5 |      6
2569 //  8 pass        |      9 |     10
2570 // 16 pass        |     17 |     18
2571 return NumPasses + 1 + IsGFX950;
2572}
2573
2574static int
2575GFX940_XDL_N_PassWritesVGPROverlappedSGEMMDGEMMSrcCWaitStates(int NumPasses,
2576 bool IsGFX950) {
2577 // xdl def cycles | gfx940 | gfx950
2578 //  2 pass        |      3 |      3
2579 //  4 pass        |      5 |      6
2580 //  8 pass        |      9 |     10
2581 // 16 pass        |     17 |     18
2582 return NumPasses + 1 + (NumPasses != 2 && IsGFX950);
2583}
2584
2585static int
2586GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(int NumPasses) {
2587 // 2 pass -> 2
2588 // 4 pass -> 4
2589 // 8 pass -> 8
2590 // 16 pass -> 16
2591 return NumPasses;
2592}
2593
2594static int
2595GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses) {
2596 // 2 pass -> 4
2597 // 4 pass -> 6
2598 // 8 pass -> 10
2599 // 16 pass -> 18
2600 return NumPasses + 2;
2601}
2602
2603static int GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses,
2604 bool IsGFX950) {
2605 // xdl def cycles | gfx942 | gfx950
2606 //  2 pass        |      5 |      5
2607 //  4 pass        |      7 |      8
2608 //  8 pass        |     11 |     12
2609 // 16 pass        |     19 |     20
2610 return NumPasses + 3 + (NumPasses != 2 && IsGFX950);
2611}
2612
2613int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) {
2614 int WaitStatesNeeded = 0;
2615 unsigned Opc = MI->getOpcode();
2616
2617 auto IsLegacyVALUFn = [](const MachineInstr &MI) {
2618 return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI);
2619 };
2620
2621 auto IsLegacyVALUNotDotFn = [](const MachineInstr &MI) {
2622 return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI) &&
2623 !SIInstrInfo::isDOT(MI);
2624 };
2625
2626 if (!SIInstrInfo::isMFMA(MI: *MI))
2627 return WaitStatesNeeded;
2628
2629 const int VALUWritesExecWaitStates = 4;
2630 int WaitStatesNeededForUse = VALUWritesExecWaitStates -
2631 getWaitStatesSinceDef(Reg: AMDGPU::EXEC, IsHazardDef: IsLegacyVALUFn,
2632 Limit: VALUWritesExecWaitStates);
2633 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
2634
2635 int SrcCIdx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::src2);
2636
2637 // Loop over the uses; handles both DGEMM and S/HGEMM as the 2nd instruction.
2638 for (const MachineOperand &Use : MI->explicit_uses()) {
2639 const int LegacyVALUNotDotWritesVGPRWaitStates = 2;
2640 const int SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates = 2;
2641 const int SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates = 8;
2642 const int SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates = 16;
2643 const int SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates = 3;
2644 const int SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates = 9;
2645 const int SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates = 17;
2646 const int DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 9;
2647 const int GFX950_DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 17;
2648 const int DMFMA4x4WritesVGPROverlappedSrcCWaitStates = 4;
2649 const int SMFMA4x4WritesVGPROverlappedSrcABWaitStates = 5;
2650 const int SMFMA16x16WritesVGPROverlappedSrcABWaitStates = 11;
2651 const int SMFMA32x32WritesVGPROverlappedSrcABWaitStates = 19;
2652 const int DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates = 6;
2653 const int DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 11;
2654 const int GFX950_DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 19;
2655 const int DMFMA4x4WritesVGPRFullSrcCWaitStates = 4;
2656 const int GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates = 2;
2657 const int MaxWaitStates = 19;
2658
2659 if (!Use.isReg())
2660 continue;
2661 Register Reg = Use.getReg();
2662 bool FullReg;
2663 const MachineInstr *MI1;
2664
2665 auto IsOverlappedMFMAFn = [Reg, &FullReg, &MI1,
2666 this](const MachineInstr &MI) {
2667 if (!SIInstrInfo::isMFMA(MI))
2668 return false;
2669 Register DstReg = MI.getOperand(i: 0).getReg();
2670 FullReg = (DstReg == Reg);
2671 MI1 = &MI;
2672 return TRI.regsOverlap(RegA: DstReg, RegB: Reg);
2673 };
2674
2675 WaitStatesNeededForUse = LegacyVALUNotDotWritesVGPRWaitStates -
2676 getWaitStatesSinceDef(Reg, IsHazardDef: IsLegacyVALUNotDotFn, Limit: MaxWaitStates);
2677 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
2678
2679 int NumWaitStates =
2680 getWaitStatesSinceDef(Reg, IsHazardDef: IsOverlappedMFMAFn, Limit: MaxWaitStates);
2681 if (NumWaitStates == std::numeric_limits<int>::max())
2682 continue;
2683
2684 int OpNo = Use.getOperandNo();
2685 unsigned Opc1 = MI1->getOpcode();
2686 int NeedWaitStates = 0;
2687 if (OpNo == SrcCIdx) {
2688 if (!SIInstrInfo::isDGEMM(Opcode: Opc) &&
2689 (!ST.hasGFX940Insts() && SIInstrInfo::isDGEMM(Opcode: Opc1))) {
2690 NeedWaitStates = 0;
2691 } else if (FullReg) {
2692 if ((Opc == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
2693 Opc == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64) &&
2694 (Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
2695 Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64))
2696 NeedWaitStates = DMFMA4x4WritesVGPRFullSrcCWaitStates;
2697 else if (ST.hasGFX940Insts() &&
2698 TSchedModel.computeInstrLatency(MI: MI1) == 2)
2699 NeedWaitStates = GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates;
2700 } else {
2701 switch (Opc1) {
2702 case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
2703 case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
2704 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
2705 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
2706 if (!TII.isXDL(MI: *MI))
2707 NeedWaitStates =
2708 ST.hasGFX950Insts()
2709 ? GFX950_DMFMA16x16WritesVGPROverlappedSrcCWaitStates
2710 : DMFMA16x16WritesVGPROverlappedSrcCWaitStates;
2711 break;
2712 case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
2713 case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
2714 if (!TII.isXDL(MI: *MI))
2715 NeedWaitStates = DMFMA4x4WritesVGPROverlappedSrcCWaitStates;
2716 break;
2717 default:
2718 int NumPasses = TSchedModel.computeInstrLatency(MI: MI1);
2719 if (ST.hasGFX940Insts()) {
2720 if (TII.isXDL(MI: *MI) && !TII.isXDL(MI: *MI1))
2721 break;
2722
2723 NeedWaitStates =
2724 TII.isXDL(MI: *MI1)
2725 ? (TII.isXDL(MI: *MI)
2726 ? GFX940_XDL_N_PassWritesVGPROverlappedXDLOrSMFMASrcCWaitStates(
2727 NumPasses, IsGFX950: ST.hasGFX950Insts())
2728 : GFX940_XDL_N_PassWritesVGPROverlappedSGEMMDGEMMSrcCWaitStates(
2729 NumPasses, IsGFX950: ST.hasGFX950Insts()))
2730 : GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(
2731 NumPasses);
2732 break;
2733 }
2734
2735 switch (NumPasses) {
2736 case 2:
2737 NeedWaitStates =
2738 SIInstrInfo::isDGEMM(Opcode: Opc)
2739 ? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates
2740 : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates;
2741 break;
2742 case 8:
2743 NeedWaitStates =
2744 SIInstrInfo::isDGEMM(Opcode: Opc)
2745 ? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates
2746 : SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates;
2747 break;
2748 case 16:
2749 NeedWaitStates =
2750 SIInstrInfo::isDGEMM(Opcode: Opc)
2751 ? SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates
2752 : SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates;
2753 break;
2754 default:
2755 llvm_unreachable("unexpected number of passes");
2756 }
2757 }
2758 }
2759 } else {
2760 switch (Opc1) {
2761 case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
2762 case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
2763 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
2764 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
2765 NeedWaitStates =
2766 ST.hasGFX950Insts()
2767 ? GFX950_DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates
2768 : DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates;
2769 break;
2770 case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
2771 case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
2772 NeedWaitStates = DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates;
2773 break;
2774 default:
2775 int NumPasses = TSchedModel.computeInstrLatency(MI: MI1);
2776
2777 if (ST.hasGFX940Insts()) {
2778 NeedWaitStates =
2779 TII.isXDL(MI: *MI1)
2780 ? GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(
2781 NumPasses, IsGFX950: ST.hasGFX950Insts())
2782 : GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(
2783 NumPasses);
2784 break;
2785 }
2786
2787 switch (NumPasses) {
2788 case 2:
2789 NeedWaitStates = SMFMA4x4WritesVGPROverlappedSrcABWaitStates;
2790 break;
2791 case 4:
2792 llvm_unreachable("unexpected number of passes for mfma");
2793 case 8:
2794 NeedWaitStates = SMFMA16x16WritesVGPROverlappedSrcABWaitStates;
2795 break;
2796 case 16:
2797 default:
2798 NeedWaitStates = SMFMA32x32WritesVGPROverlappedSrcABWaitStates;
2799 }
2800 }
2801 }
2802 if (WaitStatesNeeded >= NeedWaitStates)
2803 continue;
2804
2805 WaitStatesNeededForUse = NeedWaitStates - NumWaitStates;
2806 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
2807
2808 if (WaitStatesNeeded == MaxWaitStates)
2809 break;
2810 }
2811
2812 // Pad neighboring MFMA with noops for better inter-wave performance.
2813 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: checkMFMAPadding(MI));
2814
2815 return WaitStatesNeeded;
2816}
2817
2818int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) {
2819 // On gfx90a+ relevant hazards are checked in checkMAIVALUHazards()
2820 if (!ST.hasMAIInsts() || ST.hasGFX90AInsts())
2821 return 0;
2822
2823 int WaitStatesNeeded = 0;
2824
2825 auto IsAccVgprReadFn = [](const MachineInstr &MI) {
2826 return MI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64;
2827 };
2828
2829 for (const MachineOperand &Op : MI->explicit_uses()) {
2830 if (!Op.isReg() || !TRI.isVGPR(MRI: MF.getRegInfo(), Reg: Op.getReg()))
2831 continue;
2832
2833 Register Reg = Op.getReg();
2834
2835 const int AccVgprReadLdStWaitStates = 2;
2836 const int VALUWriteAccVgprRdWrLdStDepVALUWaitStates = 1;
2837 const int MaxWaitStates = 2;
2838
2839 int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
2840 getWaitStatesSinceDef(Reg, IsHazardDef: IsAccVgprReadFn, Limit: MaxWaitStates);
2841 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
2842
2843 if (WaitStatesNeeded == MaxWaitStates)
2844 return WaitStatesNeeded; // Early exit.
2845
2846 auto IsVALUAccVgprRdWrCheckFn = [Reg, this](const MachineInstr &MI) {
2847 if (MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64 &&
2848 MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2849 return false;
2850 auto IsVALUFn = [](const MachineInstr &MI) {
2851 return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMAI(MI);
2852 };
2853 return getWaitStatesSinceDef(Reg, IsHazardDef: IsVALUFn, Limit: 2 /*MaxWaitStates*/) <
2854 std::numeric_limits<int>::max();
2855 };
2856
2857 WaitStatesNeededForUse = VALUWriteAccVgprRdWrLdStDepVALUWaitStates -
2858 getWaitStatesSince(IsHazard: IsVALUAccVgprRdWrCheckFn, Limit: MaxWaitStates);
2859 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
2860 }
2861
2862 return WaitStatesNeeded;
2863}
2864
2865int GCNHazardRecognizer::checkPermlaneHazards(MachineInstr *MI) {
2866 assert(!ST.hasVcmpxPermlaneHazard() &&
2867 "this is a different vcmpx+permlane hazard");
2868 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2869 const SIInstrInfo *TII = ST.getInstrInfo();
2870
2871 auto IsVCmpXWritesExecFn = [TII, TRI](const MachineInstr &MI) {
2872 return isVCmpXWritesExec(TII: *TII, TRI: *TRI, MI);
2873 };
2874
2875 auto IsVALUFn = [](const MachineInstr &MI) {
2876 return SIInstrInfo::isVALU(MI);
2877 };
2878
2879 const int VCmpXWritesExecWaitStates = 4;
2880 const int VALUWritesVDstWaitStates = 2;
2881 int WaitStatesNeeded = 0;
2882
2883 for (const MachineOperand &Op : MI->explicit_uses()) {
2884 if (!Op.isReg() || !TRI->isVGPR(MRI: MF.getRegInfo(), Reg: Op.getReg()))
2885 continue;
2886 Register Reg = Op.getReg();
2887
2888 int WaitStatesSinceDef =
2889 VALUWritesVDstWaitStates -
2890 getWaitStatesSinceDef(Reg, IsHazardDef: IsVALUFn,
2891 /*MaxWaitStates=*/Limit: VALUWritesVDstWaitStates);
2892 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesSinceDef);
2893 if (WaitStatesNeeded >= VALUWritesVDstWaitStates)
2894 break;
2895 }
2896
2897 int VCmpXHazardWaits =
2898 VCmpXWritesExecWaitStates -
2899 getWaitStatesSince(IsHazard: IsVCmpXWritesExecFn, Limit: VCmpXWritesExecWaitStates);
2900
2901 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: VCmpXHazardWaits);
2902 return WaitStatesNeeded;
2903}
2904
2905static int GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(int NumPasses) {
2906 // 2 pass -> 4
2907 // 4 pass -> 6
2908 // 8 pass -> 10
2909 // 16 pass -> 18
2910 return NumPasses + 2;
2911}
2912
2913static int GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(int NumPasses,
2914 bool IsGFX950) {
2915 // xdl def cycles | gfx942 | gfx950
2916 //  2 pass        |      5 |      5
2917 //  4 pass        |      7 |      8
2918 //  8 pass        |     11 |     12
2919 // 16 pass        |     19 |     20
2920 return NumPasses + 3 + (NumPasses != 2 && IsGFX950);
2921}
2922
2923static int GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses,
2924 bool IsGFX950) {
2925 // xdl def cycles | gfx942 | gfx950
2926 //  2 pass        |      5 |      5
2927 //  4 pass        |      7 |      8
2928 //  8 pass        |     11 |     12
2929 // 16 pass        |     19 |     20
2930 return NumPasses + 3 + (NumPasses != 2 && IsGFX950);
2931}
2932
2933static int GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses) {
2934 // 2 pass -> 4
2935 // 4 pass -> 6
2936 // 8 pass -> 10
2937 // 16 pass -> 18
2938 return NumPasses + 2;
2939}
2940
2941int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) {
2942 if (!ST.hasGFX90AInsts())
2943 return 0;
2944
2945 auto IsDGEMMFn = [](const MachineInstr &MI) -> bool {
2946 return SIInstrInfo::isDGEMM(Opcode: MI.getOpcode());
2947 };
2948
2949 // This is checked in checkMAIHazards90A()
2950 if (SIInstrInfo::isMFMA(MI: *MI))
2951 return 0;
2952
2953 const MachineRegisterInfo &MRI = MF.getRegInfo();
2954
2955 int WaitStatesNeeded = 0;
2956
2957 bool IsMem = SIInstrInfo::isVMEM(MI: *MI) || SIInstrInfo::isDS(MI: *MI);
2958 bool IsMemOrExport = IsMem || SIInstrInfo::isEXP(MI: *MI);
2959 bool IsVALU = SIInstrInfo::isVALU(MI: *MI);
2960
2961 const MachineInstr *MFMA = nullptr;
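  // Reg is captured by reference by the lambdas below and is updated for each
  // operand examined in the use/def loops.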
2962 unsigned Reg;
2963 auto IsMFMAWriteFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
2964 if (!SIInstrInfo::isMFMA(MI) ||
2965 !TRI.regsOverlap(RegA: MI.getOperand(i: 0).getReg(), RegB: Reg))
2966 return false;
2967 MFMA = &MI;
2968 return true;
2969 };
2970
2971 const MachineInstr *DOT = nullptr;
2972 auto IsDotWriteFn = [&Reg, &DOT, this](const MachineInstr &MI) {
2973 if (!SIInstrInfo::isDOT(MI) ||
2974 !TRI.regsOverlap(RegA: MI.getOperand(i: 0).getReg(), RegB: Reg))
2975 return false;
2976 DOT = &MI;
2977 return true;
2978 };
2979
2980 bool DGEMMAfterVALUWrite = false;
2981 auto IsDGEMMHazard = [&DGEMMAfterVALUWrite, this](const MachineInstr &MI) {
2982 // Found DGEMM on reverse traversal to def.
2983 if (SIInstrInfo::isDGEMM(Opcode: MI.getOpcode()))
2984 DGEMMAfterVALUWrite = true;
2985
2986 // Only hazard if the register is defined by a VALU and a DGEMM is found
2987 // after the def.
2988 if (!TII.isVALU(MI) || !DGEMMAfterVALUWrite)
2989 return false;
2990
2991 return true;
2992 };
2993
2994 int SrcCIdx = AMDGPU::getNamedOperandIdx(Opcode: MI->getOpcode(),
2995 Name: AMDGPU::OpName::src2);
2996
2997 if (IsMemOrExport || IsVALU) {
2998 const int SMFMA4x4WriteVgprVALUMemExpReadWaitStates = 5;
2999 const int SMFMA16x16WriteVgprVALUMemExpReadWaitStates = 11;
3000 const int SMFMA32x32WriteVgprVALUMemExpReadWaitStates = 19;
3001 const int DMFMA4x4WriteVgprMemExpReadWaitStates = 9;
3002 const int DMFMA16x16WriteVgprMemExpReadWaitStates = 18;
3003 const int DMFMA4x4WriteVgprVALUReadWaitStates = 6;
3004 const int DMFMA16x16WriteVgprVALUReadWaitStates = 11;
3005 const int GFX950_DMFMA16x16WriteVgprVALUReadWaitStates = 19;
3006 const int DotWriteSameDotReadSrcAB = 3;
3007 const int DotWriteDifferentVALURead = 3;
3008 const int DMFMABetweenVALUWriteVMEMRead = 2;
3009 const int MaxWaitStates = 19;
3010
3011 for (const MachineOperand &Use : MI->explicit_uses()) {
3012 if (!Use.isReg())
3013 continue;
3014 Reg = Use.getReg();
3015
3016 DOT = nullptr;
3017 int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsHazardDef: IsDotWriteFn,
3018 Limit: MaxWaitStates);
3019 if (DOT) {
3020 int NeedWaitStates = 0;
3021 if (DOT->getOpcode() == MI->getOpcode()) {
3022 if (&Use - &MI->getOperand(i: 0) != SrcCIdx)
3023 NeedWaitStates = DotWriteSameDotReadSrcAB;
3024 } else {
3025 NeedWaitStates = DotWriteDifferentVALURead;
3026 }
3027
3028 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
3029 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
3030 }
3031
3032 // Workaround for a HW data hazard bug observed only in GFX90A. When there
3033 // is a DGEMM instruction in-between a VALU and a VMEM instruction, it
3034 // causes the SQ to incorrectly omit the two wait states between the two
3035 // instructions that are needed to avoid the data hazard.
3036 if (IsMem && ST.hasGFX90AInsts() && !ST.hasGFX940Insts()) {
3037 DGEMMAfterVALUWrite = false;
3038 if (TRI.isVectorRegister(MRI, Reg)) {
3039 int WaitStatesNeededForUse =
3040 DMFMABetweenVALUWriteVMEMRead -
3041 getWaitStatesSinceDef(Reg, IsHazardDef: IsDGEMMHazard,
3042 Limit: DMFMABetweenVALUWriteVMEMRead);
3043
3044 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
3045 }
3046 }
3047
3048 MFMA = nullptr;
3049 WaitStatesSinceDef =
3050 getWaitStatesSinceDef(Reg, IsHazardDef: IsMFMAWriteFn, Limit: MaxWaitStates);
3051 if (!MFMA)
3052 continue;
3053
3054 unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MI: MFMA);
3055 int NumPasses = HazardDefLatency;
3056 int NeedWaitStates = MaxWaitStates;
3057
3058 if (SIInstrInfo::isDGEMM(Opcode: MFMA->getOpcode())) {
3059 switch (HazardDefLatency) {
3060 case 4:
3061 NeedWaitStates = IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates
3062 : DMFMA4x4WriteVgprVALUReadWaitStates;
3063 break;
3064 case 8:
3065 case 16:
3066 NeedWaitStates =
3067 IsMemOrExport
3068 ? DMFMA16x16WriteVgprMemExpReadWaitStates
3069 : (ST.hasGFX950Insts()
3070 ? GFX950_DMFMA16x16WriteVgprVALUReadWaitStates
3071 : DMFMA16x16WriteVgprVALUReadWaitStates);
3072 break;
3073 default:
3074 llvm_unreachable("unexpected dgemm");
3075 }
3076 } else if (ST.hasGFX940Insts()) {
3077 NeedWaitStates =
3078 TII.isXDL(MI: *MFMA)
3079 ? GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(
3080 NumPasses, IsGFX950: ST.hasGFX950Insts())
3081 : GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(
3082 NumPasses);
3083 } else {
3084 switch (HazardDefLatency) {
3085 case 2:
3086 NeedWaitStates = SMFMA4x4WriteVgprVALUMemExpReadWaitStates;
3087 break;
3088 case 8:
3089 NeedWaitStates = SMFMA16x16WriteVgprVALUMemExpReadWaitStates;
3090 break;
3091 case 16:
3092 NeedWaitStates = SMFMA32x32WriteVgprVALUMemExpReadWaitStates;
3093 break;
3094 default:
3095 llvm_unreachable("unexpected number of passes for mfma");
3096 }
3097 }
3098
3099 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
3100 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
3101
3102 if (WaitStatesNeeded == MaxWaitStates)
3103 break;
3104 }
3105 }
3106
3107 unsigned Opc = MI->getOpcode();
3108 const int DMFMAToFMA64WaitStates = 2;
3109 if ((Opc == AMDGPU::V_FMA_F64_e64 ||
3110 Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64 ||
3111 Opc == AMDGPU::V_FMAC_F64_dpp) &&
3112 WaitStatesNeeded < DMFMAToFMA64WaitStates) {
3113 int WaitStatesNeededForUse = DMFMAToFMA64WaitStates -
3114 getWaitStatesSince(IsHazard: IsDGEMMFn, Limit: DMFMAToFMA64WaitStates);
3115 WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse);
3116 }
3117
3118 if (!IsVALU && !IsMemOrExport)
3119 return WaitStatesNeeded;
3120
  for (const MachineOperand &Def : MI->defs()) {
    const int SMFMA4x4WriteVgprVALUWawWaitStates = 5;
    const int SMFMA16x16WriteVgprVALUWawWaitStates = 11;
    const int SMFMA32x32WriteVgprVALUWawWaitStates = 19;
    const int SMFMA4x4ReadVgprVALUWarWaitStates = 1;
    const int GFX940_XDL4PassReadVgprVALUWarWaitStates = 3;
    const int SMFMA16x16ReadVgprVALUWarWaitStates = 7;
    const int SMFMA32x32ReadVgprVALUWarWaitStates = 15;
    const int DMFMA4x4WriteVgprVALUWriteWaitStates = 6;
    const int DMFMA16x16WriteVgprVALUWriteWaitStates = 11;
    const int DotWriteDifferentVALUWrite = 3;
    const int MaxWaitStates = 19;
    const int MaxWarWaitStates = 15;

    Reg = Def.getReg();

    DOT = nullptr;
    int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
                                                   MaxWaitStates);
    if (DOT && DOT->getOpcode() != MI->getOpcode())
      WaitStatesNeeded = std::max(WaitStatesNeeded, DotWriteDifferentVALUWrite -
                                                    WaitStatesSinceDef);

    MFMA = nullptr;
    WaitStatesSinceDef =
        getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
    if (MFMA) {
      int NeedWaitStates = MaxWaitStates;
      int NumPasses = TSchedModel.computeInstrLatency(MFMA);

      if (SIInstrInfo::isDGEMM(MFMA->getOpcode())) {
        switch (NumPasses) {
        case 4:
          NeedWaitStates = DMFMA4x4WriteVgprVALUWriteWaitStates;
          break;
        case 8:
        case 16:
          NeedWaitStates = DMFMA16x16WriteVgprVALUWriteWaitStates;
          break;
        default:
          llvm_unreachable("unexpected number of cycles for dgemm");
        }
      } else if (ST.hasGFX940Insts()) {
        NeedWaitStates =
            TII.isXDL(*MFMA)
                ? GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(
                      NumPasses, ST.hasGFX950Insts())
                : GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(NumPasses);
      } else {
        switch (NumPasses) {
        case 2:
          NeedWaitStates = SMFMA4x4WriteVgprVALUWawWaitStates;
          break;
        case 8:
          NeedWaitStates = SMFMA16x16WriteVgprVALUWawWaitStates;
          break;
        case 16:
          NeedWaitStates = SMFMA32x32WriteVgprVALUWawWaitStates;
          break;
        default:
          llvm_unreachable("Unexpected number of passes for mfma");
        }
      }

      int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

      if (WaitStatesNeeded == MaxWaitStates)
        break;
    }

    auto IsSMFMAReadAsCFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
      if (!SIInstrInfo::isMFMA(MI) || SIInstrInfo::isDGEMM(MI.getOpcode()) ||
          !MI.readsRegister(Reg, &TRI))
        return false;

      if (ST.hasGFX940Insts() && !TII.isXDL(MI))
        return false;

      const MachineOperand *SrcC =
          TII.getNamedOperand(MI, AMDGPU::OpName::src2);
      assert(SrcC);
      if (!SrcC->isReg() || !TRI.regsOverlap(SrcC->getReg(), Reg))
        return false;

      MFMA = &MI;
      return true;
    };

    MFMA = nullptr;
    int WaitStatesSinceUse = getWaitStatesSince(IsSMFMAReadAsCFn,
                                                MaxWarWaitStates);
    if (!MFMA)
      continue;

    unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
    int NeedWaitStates = MaxWaitStates;
    switch (HazardDefLatency) {
    case 2:  NeedWaitStates = SMFMA4x4ReadVgprVALUWarWaitStates;
             break;
    case 4:  assert(ST.hasGFX940Insts());
             NeedWaitStates = GFX940_XDL4PassReadVgprVALUWarWaitStates;
             break;
    case 8:  NeedWaitStates = SMFMA16x16ReadVgprVALUWarWaitStates;
             break;
    case 16: [[fallthrough]];
    default: NeedWaitStates = SMFMA32x32ReadVgprVALUWarWaitStates;
             break;
    }

    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceUse;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  return WaitStatesNeeded;
}

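// Scheduler query: returns true when the scheduler should prefer issuing a
// different SUnit first. Here that is the case when the candidate is an MFMA
// and a previously issued MFMA seen within the lookback window has not yet
// covered its own latency, so issuing the two back to back is likely to stall.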
bool GCNHazardRecognizer::ShouldPreferAnother(SUnit *SU) {
  if (!SU->isInstr())
    return false;

  const MachineInstr *MAI = nullptr;

  auto IsMFMAFn = [&MAI](const MachineInstr &MI) {
    MAI = nullptr;
    if (SIInstrInfo::isMFMA(MI))
      MAI = &MI;
    return MAI != nullptr;
  };

  MachineInstr *MI = SU->getInstr();
  if (IsMFMAFn(*MI)) {
    int W = getWaitStatesSince(IsMFMAFn, 16);
    if (MAI)
      return W < (int)TSchedModel.computeInstrLatency(MAI);
  }

  return false;
}

// Adjust global offsets for instructions bundled with S_GETPC_B64 after
// insertion of a new instruction.
static void updateGetPCBundle(MachineInstr *NewMI) {
  if (!NewMI->isBundled())
    return;

  // Find start of bundle.
  auto I = NewMI->getIterator();
  while (I->isBundledWithPred())
    I--;
  if (I->isBundle())
    I++;

  // Bail if this is not an S_GETPC bundle.
  if (I->getOpcode() != AMDGPU::S_GETPC_B64)
    return;

  // Update offsets of any references in the bundle.
  const unsigned NewBytes = 4;
  assert(NewMI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
         "Unexpected instruction insertion in bundle");
  auto NextMI = std::next(NewMI->getIterator());
  auto End = NewMI->getParent()->end();
  while (NextMI != End && NextMI->isBundledWithPred()) {
    for (auto &Operand : NextMI->operands()) {
      if (Operand.isGlobal())
        Operand.setOffset(Operand.getOffset() + NewBytes);
    }
    NextMI++;
  }
}

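// Mitigate the wave64 hazard where a VALU instruction consumes an SGPR (or
// VCC) as a lane mask and the same register is then rewritten: insert (or
// merge into) an s_waitcnt_depctr after the rewriting instruction so the
// hazard sequence is broken before any later SGPR read.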
bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
  if (!ST.hasVALUMaskWriteHazard())
    return false;
  assert(!ST.hasExtendedWaitCounts());

  if (!ST.isWave64())
    return false;

  const bool IsSALU = SIInstrInfo::isSALU(*MI);
  const bool IsVALU = SIInstrInfo::isVALU(*MI);
  if (!IsSALU && !IsVALU)
    return false;

  // The hazard sequence is three instructions:
  //   1. VALU reads SGPR as mask
  //   2. VALU/SALU writes SGPR
  //   3. VALU/SALU reads SGPR
  // The hazard can expire if the distance between (2) and (3) is sufficient,
  // or if (2) is a VALU and (3) is an SALU.
  // In practice this happens <10% of the time, hence always assume the hazard
  // exists if (1) and (2) are present to avoid searching all SGPR reads.
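  //
  // Illustrative wave64 sequence (not taken from any particular shader; the
  // register numbers are arbitrary) of the pattern the backward search below
  // is looking for:
  //   v_cndmask_b32 v0, v1, v2, s[4:5]   ; (1) VALU reads SGPR pair as mask
  //   s_mov_b64     s[4:5], exec         ; (2) SALU rewrites the same SGPRs
  //   s_and_b64     s[6:7], s[4:5], vcc  ; (3) later SGPR read
  // The fix is the s_waitcnt_depctr inserted after (2).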

  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const MachineRegisterInfo &MRI = MF.getRegInfo();

  auto IgnoreableSGPR = [](const Register Reg) {
    switch (Reg) {
    case AMDGPU::EXEC:
    case AMDGPU::EXEC_LO:
    case AMDGPU::EXEC_HI:
    case AMDGPU::M0:
    case AMDGPU::SGPR_NULL:
    case AMDGPU::SGPR_NULL64:
    case AMDGPU::SCC:
      return true;
    default:
      return false;
    }
  };
  auto IsVCC = [](const Register Reg) {
    return Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::VCC_HI;
  };

  struct StateType {
    SmallSet<Register, 2> HazardSGPRs;

    static unsigned getHashValue(const StateType &State) {
      return hash_combine_range(State.HazardSGPRs);
    }
    static bool isEqual(const StateType &LHS, const StateType &RHS) {
      return LHS.HazardSGPRs == RHS.HazardSGPRs;
    }
  };

  SmallVector<const MachineInstr *> WaitInstrs;
  bool HasSGPRRead = false;
  StateType InitialState;

  // Look for the SGPR write.
  MachineOperand *HazardDef = nullptr;
  for (MachineOperand &Op : MI->operands()) {
    if (!Op.isReg())
      continue;
    if (Op.isDef() && HazardDef)
      continue;

    Register Reg = Op.getReg();
    if (IgnoreableSGPR(Reg))
      continue;
    if (!IsVCC(Reg)) {
      if (Op.isImplicit())
        continue;
      if (!TRI->isSGPRReg(MRI, Reg))
        continue;
    }
    // Also check for SGPR reads.
    if (Op.isUse()) {
      HasSGPRRead = true;
      continue;
    }

    assert(!HazardDef);
    HazardDef = &Op;
  }

  if (!HazardDef)
    return false;

  // Set up tracking of writes to the individual SGPRs.
  const Register HazardReg = HazardDef->getReg();
  if (AMDGPU::SReg_32RegClass.contains(HazardReg)) {
    InitialState.HazardSGPRs.insert(HazardReg);
  } else {
    assert(AMDGPU::SReg_64RegClass.contains(HazardReg));
    InitialState.HazardSGPRs.insert(TRI->getSubReg(HazardReg, AMDGPU::sub0));
    InitialState.HazardSGPRs.insert(TRI->getSubReg(HazardReg, AMDGPU::sub1));
  }

  auto IsHazardFn = [&](StateType &State, const MachineInstr &I) {
    if (State.HazardSGPRs.empty())
      return HazardExpired;

    switch (I.getOpcode()) {
    case AMDGPU::V_ADDC_U32_e32:
    case AMDGPU::V_ADDC_U32_dpp:
    case AMDGPU::V_CNDMASK_B16_t16_e32:
    case AMDGPU::V_CNDMASK_B16_fake16_e32:
    case AMDGPU::V_CNDMASK_B16_t16_dpp:
    case AMDGPU::V_CNDMASK_B16_fake16_dpp:
    case AMDGPU::V_CNDMASK_B32_e32:
    case AMDGPU::V_CNDMASK_B32_dpp:
    case AMDGPU::V_DIV_FMAS_F32_e64:
    case AMDGPU::V_DIV_FMAS_F64_e64:
    case AMDGPU::V_SUBB_U32_e32:
    case AMDGPU::V_SUBB_U32_dpp:
    case AMDGPU::V_SUBBREV_U32_e32:
    case AMDGPU::V_SUBBREV_U32_dpp: {
      // These implicitly read VCC as mask source.
      return IsVCC(HazardReg) ? HazardFound : NoHazardFound;
    }
    case AMDGPU::V_ADDC_U32_e64:
    case AMDGPU::V_ADDC_U32_e64_dpp:
    case AMDGPU::V_CNDMASK_B16_t16_e64:
    case AMDGPU::V_CNDMASK_B16_fake16_e64:
    case AMDGPU::V_CNDMASK_B16_t16_e64_dpp:
    case AMDGPU::V_CNDMASK_B16_fake16_e64_dpp:
    case AMDGPU::V_CNDMASK_B32_e64:
    case AMDGPU::V_CNDMASK_B32_e64_dpp:
    case AMDGPU::V_SUBB_U32_e64:
    case AMDGPU::V_SUBB_U32_e64_dpp:
    case AMDGPU::V_SUBBREV_U32_e64:
    case AMDGPU::V_SUBBREV_U32_e64_dpp: {
      // Only check mask register overlaps.
      const MachineOperand *SSRCOp =
          TII.getNamedOperand(I, AMDGPU::OpName::src2);
      assert(SSRCOp);
      bool Result = TRI->regsOverlap(SSRCOp->getReg(), HazardReg);
      return Result ? HazardFound : NoHazardFound;
    }
    default:
      return NoHazardFound;
    }
  };

  const unsigned ConstantMaskBits = AMDGPU::DepCtr::encodeFieldSaSdst(
      AMDGPU::DepCtr::encodeFieldVaSdst(AMDGPU::DepCtr::encodeFieldVaVcc(0, ST),
                                        0),
      0);
  auto UpdateStateFn = [&](StateType &State, const MachineInstr &I) {
    switch (I.getOpcode()) {
    case AMDGPU::S_WAITCNT_DEPCTR:
      // Record mergeable waits within a region of instructions free of SGPR
      // reads.
      if (!HasSGPRRead && I.getParent() == MI->getParent() && !I.isBundled() &&
          (I.getOperand(0).getImm() & ConstantMaskBits) == ConstantMaskBits)
        WaitInstrs.push_back(&I);
      break;
    default:
      // Update tracking of SGPR reads and writes.
      for (auto &Op : I.operands()) {
        if (!Op.isReg())
          continue;

        Register Reg = Op.getReg();
        if (IgnoreableSGPR(Reg))
          continue;
        if (!IsVCC(Reg)) {
          if (Op.isImplicit())
            continue;
          if (!TRI->isSGPRReg(MRI, Reg))
            continue;
        }
        if (Op.isUse()) {
          HasSGPRRead = true;
          continue;
        }

        // Stop tracking any SGPRs with writes on the basis that they will
        // already have an appropriate wait inserted afterwards.
        SmallVector<Register, 2> Found;
        for (Register SGPR : State.HazardSGPRs) {
          if (Reg == SGPR || TRI->regsOverlap(Reg, SGPR))
            Found.push_back(SGPR);
        }
        for (Register SGPR : Found)
          State.HazardSGPRs.erase(SGPR);
      }
      break;
    }
  };

  // Check for the hazard.
  if (!hasHazard<StateType>(InitialState, IsHazardFn, UpdateStateFn,
                            MI->getParent(),
                            std::next(MI->getReverseIterator())))
    return false;

  // Compute the counter mask.
  unsigned DepCtr =
      IsVALU ? (IsVCC(HazardReg) ? AMDGPU::DepCtr::encodeFieldVaVcc(0, ST)
                                 : AMDGPU::DepCtr::encodeFieldVaSdst(0, ST))
             : AMDGPU::DepCtr::encodeFieldSaSdst(0, ST);

  // Try to merge previous waits into this one for regions with no SGPR reads.
  if (!WaitInstrs.empty()) {
    // Note: WaitInstrs contains const pointers, so walk backward from MI to
    // obtain a mutable pointer to each instruction to be merged.
    // This is expected to be a very short walk within the same block.
    SmallVector<MachineInstr *> ToErase;
    unsigned Found = 0;
    for (MachineBasicBlock::reverse_iterator It = MI->getReverseIterator(),
                                             End = MI->getParent()->rend();
         Found < WaitInstrs.size() && It != End; ++It) {
      MachineInstr *WaitMI = &*It;
      // Find the next wait instruction.
      if (std::as_const(WaitMI) != WaitInstrs[Found])
        continue;
      Found++;
      unsigned WaitMask = WaitMI->getOperand(0).getImm();
      assert((WaitMask & ConstantMaskBits) == ConstantMaskBits);
      DepCtr = AMDGPU::DepCtr::encodeFieldSaSdst(
          DepCtr, std::min(AMDGPU::DepCtr::decodeFieldSaSdst(WaitMask),
                           AMDGPU::DepCtr::decodeFieldSaSdst(DepCtr)));
      DepCtr = AMDGPU::DepCtr::encodeFieldVaSdst(
          DepCtr, std::min(AMDGPU::DepCtr::decodeFieldVaSdst(WaitMask),
                           AMDGPU::DepCtr::decodeFieldVaSdst(DepCtr)));
      DepCtr = AMDGPU::DepCtr::encodeFieldVaVcc(
          DepCtr, std::min(AMDGPU::DepCtr::decodeFieldVaVcc(WaitMask),
                           AMDGPU::DepCtr::decodeFieldVaVcc(DepCtr)));
      ToErase.push_back(WaitMI);
    }
    assert(Found == WaitInstrs.size());
    for (MachineInstr *WaitMI : ToErase)
      WaitMI->eraseFromParent();
  }

  // Add an s_waitcnt_depctr after the SGPR write.
  auto NextMI = std::next(MI->getIterator());
  auto NewMI = BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(),
                       TII.get(AMDGPU::S_WAITCNT_DEPCTR))
                   .addImm(DepCtr);

  // The SALU write may be an s_getpc in a bundle.
  updateGetPCBundle(NewMI);

  return true;
}

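// Ensure the entry block starts with an S_SETPRIO of at least \p Priority.
// Returns true if an S_SETPRIO was inserted, false if an adequate one was
// already present.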
static bool ensureEntrySetPrio(MachineFunction *MF, int Priority,
                               const SIInstrInfo &TII) {
  MachineBasicBlock &EntryMBB = MF->front();
  if (EntryMBB.begin() != EntryMBB.end()) {
    auto &EntryMI = *EntryMBB.begin();
    if (EntryMI.getOpcode() == AMDGPU::S_SETPRIO &&
        EntryMI.getOperand(0).getImm() >= Priority)
      return false;
  }

  BuildMI(EntryMBB, EntryMBB.begin(), DebugLoc(), TII.get(AMDGPU::S_SETPRIO))
      .addImm(Priority);
  return true;
}

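// Workaround for subtargets that require a raised wave priority around
// exports: keep non-export code at normal priority and, at the end of a
// sequence of exports, drop the priority, wait for expcnt, pad with two
// s_nop's, then restore normal priority (the wait and restore are skipped at
// the end of the shader).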
bool GCNHazardRecognizer::fixRequiredExportPriority(MachineInstr *MI) {
  if (!ST.hasRequiredExportPriority())
    return false;

  // Assume the following shader types will never have exports,
  // and avoid adding or adjusting S_SETPRIO.
  MachineBasicBlock *MBB = MI->getParent();
  MachineFunction *MF = MBB->getParent();
  auto CC = MF->getFunction().getCallingConv();
  switch (CC) {
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_CS_Chain:
  case CallingConv::AMDGPU_CS_ChainPreserve:
  case CallingConv::AMDGPU_KERNEL:
    return false;
  default:
    break;
  }

  const int MaxPriority = 3;
  const int NormalPriority = 2;
  const int PostExportPriority = 0;

  auto It = MI->getIterator();
  switch (MI->getOpcode()) {
  case AMDGPU::S_ENDPGM:
  case AMDGPU::S_ENDPGM_SAVED:
  case AMDGPU::S_ENDPGM_ORDERED_PS_DONE:
  case AMDGPU::SI_RETURN_TO_EPILOG:
    // Ensure a shader with calls raises priority at entry.
    // This ensures correct priority if exports exist in a callee.
    if (MF->getFrameInfo().hasCalls())
      return ensureEntrySetPrio(MF, NormalPriority, TII);
    return false;
  case AMDGPU::S_SETPRIO: {
    // Raise the minimum priority unless in the workaround sequence.
    auto &PrioOp = MI->getOperand(0);
    int Prio = PrioOp.getImm();
    bool InWA = (Prio == PostExportPriority) &&
                (It != MBB->begin() && TII.isEXP(*std::prev(It)));
    if (InWA || Prio >= NormalPriority)
      return false;
    PrioOp.setImm(std::min(Prio + NormalPriority, MaxPriority));
    return true;
  }
  default:
    if (!TII.isEXP(*MI))
      return false;
    break;
  }

  // Check entry priority at each export (as there will only be a few).
  // Note: amdgpu_gfx can only be a callee, so defer to the caller's setprio.
  bool Changed = false;
  if (CC != CallingConv::AMDGPU_Gfx && CC != CallingConv::AMDGPU_Gfx_WholeWave)
    Changed = ensureEntrySetPrio(MF, NormalPriority, TII);

  auto NextMI = std::next(It);
  bool EndOfShader = false;
  if (NextMI != MBB->end()) {
    // Only need the workaround at the end of a sequence of exports.
    if (TII.isEXP(*NextMI))
      return Changed;
    // Assume an appropriate S_SETPRIO after an export means the workaround
    // has already been applied.
    if (NextMI->getOpcode() == AMDGPU::S_SETPRIO &&
        NextMI->getOperand(0).getImm() == PostExportPriority)
      return Changed;
    EndOfShader = NextMI->getOpcode() == AMDGPU::S_ENDPGM;
  }

  const DebugLoc &DL = MI->getDebugLoc();

  // Lower the priority.
  BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_SETPRIO))
      .addImm(PostExportPriority);

  if (!EndOfShader) {
    // Wait for exports to complete.
    BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_WAITCNT_EXPCNT))
        .addReg(AMDGPU::SGPR_NULL)
        .addImm(0);
  }

  BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_NOP)).addImm(0);
  BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_NOP)).addImm(0);

  if (!EndOfShader) {
    // Return to normal (higher) priority.
    BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_SETPRIO))
        .addImm(NormalPriority);
  }

  return true;
}

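// Before an s_getreg of certain status/exception hardware registers, insert
// an s_waitcnt_depctr with an all-zero mask so outstanding dependency
// counters are drained before the register is sampled.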
bool GCNHazardRecognizer::fixGetRegWaitIdle(MachineInstr *MI) {
  if (!isSGetReg(MI->getOpcode()))
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  switch (getHWReg(TII, *MI)) {
  default:
    return false;
  case AMDGPU::Hwreg::ID_STATUS:
  case AMDGPU::Hwreg::ID_STATE_PRIV:
  case AMDGPU::Hwreg::ID_EXCP_FLAG_PRIV:
  case AMDGPU::Hwreg::ID_EXCP_FLAG_USER:
    break;
  }

  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
      .addImm(0);
  return true;
}

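// Workaround for DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64: bracket the instruction
// with s_waitcnt_depctr vm_vsrc(0) immediately before and after it, so that
// pending VMEM operations have read their VGPR sources at both points.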
bool GCNHazardRecognizer::fixDsAtomicAsyncBarrierArriveB64(MachineInstr *MI) {
  if (MI->getOpcode() != AMDGPU::DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64)
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
      .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0, ST));
  BuildMI(*MI->getParent(), std::next(MI->getIterator()), MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
      .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0, ST));

  return true;
}

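// Hazard between recent writes to SGPR102/SGPR103 and reads of the flat
// scratch base (SRC_FLAT_SCRATCH_BASE_LO/HI, or s_getreg of FLAT_SCR_LO/HI):
// if such a write is found within the hazard window, insert an
// s_waitcnt_depctr that waits for both SALU and VALU SGPR writes
// (sa_sdst(0), va_sdst(0)).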
bool GCNHazardRecognizer::fixScratchBaseForwardingHazard(MachineInstr *MI) {
  // No reason to check this in pre-RA scheduling; SGPRs have to be allocated
  // for the hazard to trigger.
  if (!IsHazardRecognizerMode)
    return false;

  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();
  // The hazard expires after 10 SGPR writes by the SALU or 8 SGPR writes by
  // the VALU.
  const int FlatScrBaseWaitStates = 10;

  bool ReadsFlatScrLo =
      MI->readsRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, TRI);
  bool ReadsFlatScrHi =
      MI->readsRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, TRI);
  if (isSGetReg(MI->getOpcode())) {
    switch (getHWReg(TII, *MI)) {
    default:
      break;
    case AMDGPU::Hwreg::ID_FLAT_SCR_LO:
      ReadsFlatScrLo = true;
      break;
    case AMDGPU::Hwreg::ID_FLAT_SCR_HI:
      ReadsFlatScrHi = true;
      break;
    }
  }

  const MachineRegisterInfo &MRI = MF.getRegInfo();

  auto IsRegDefHazard = [&](Register Reg) -> bool {
    DenseSet<const MachineBasicBlock *> Visited;
    auto IsHazardFn = [TRI, Reg](const MachineInstr &MI) {
      return MI.modifiesRegister(Reg, TRI);
    };

    // This deliberately repurposes the waitstate count: instead of waitstates
    // it returns 1 if the instruction writes an SGPR and 0 otherwise.
    auto IsSGPRDef = [TII, TRI, &MRI](const MachineInstr &MI) -> unsigned {
      if (!TII->isSALU(MI) && !TII->isVALU(MI))
        return 0;
      for (const MachineOperand &MO : MI.all_defs()) {
        if (TRI->isSGPRReg(MRI, MO.getReg()))
          return 1;
      }
      return 0;
    };

    auto IsExpiredFn = [=](const MachineInstr &MI, int SgprWrites) {
      if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR) {
        unsigned Wait = MI.getOperand(0).getImm();
        if (AMDGPU::DepCtr::decodeFieldSaSdst(Wait) == 0 &&
            AMDGPU::DepCtr::decodeFieldVaSdst(Wait) == 0)
          return true;
      }
      return SgprWrites >= FlatScrBaseWaitStates;
    };

    return ::getWaitStatesSince(
               IsHazardFn, MI->getParent(), std::next(MI->getReverseIterator()),
               0, IsExpiredFn, Visited, IsSGPRDef) < FlatScrBaseWaitStates;
  };

  if ((!ReadsFlatScrLo || MRI.isConstantPhysReg(AMDGPU::SGPR102) ||
       !IsRegDefHazard(AMDGPU::SGPR102)) &&
      (!ReadsFlatScrHi || MRI.isConstantPhysReg(AMDGPU::SGPR103) ||
       !IsRegDefHazard(AMDGPU::SGPR103)))
    return false;

  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
      .addImm(AMDGPU::DepCtr::encodeFieldVaSdst(
          AMDGPU::DepCtr::encodeFieldSaSdst(0, ST), 0));
  return true;
}

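// Pad an s_setreg write to the MODE register with two v_nop's in front of it,
// separating it from any immediately preceding VALU work.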
bool GCNHazardRecognizer::fixSetRegMode(MachineInstr *MI) {
  if (!isSSetReg(MI->getOpcode()) ||
      MI->getOperand(1).getImm() != AMDGPU::Hwreg::ID_MODE)
    return false;

  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::V_NOP_e32));
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::V_NOP_e32));
  return true;
}
