1 | //===-- GCNHazardRecognizers.cpp - GCN Hazard Recognizer Impls ------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This file implements hazard recognizers for scheduling on GCN processors. |
10 | // |
11 | //===----------------------------------------------------------------------===// |
12 | |
13 | #include "GCNHazardRecognizer.h" |
14 | #include "GCNSubtarget.h" |
15 | #include "MCTargetDesc/AMDGPUMCTargetDesc.h" |
16 | #include "SIMachineFunctionInfo.h" |
17 | #include "llvm/CodeGen/MachineFrameInfo.h" |
18 | #include "llvm/CodeGen/MachineFunction.h" |
19 | #include "llvm/CodeGen/ScheduleDAG.h" |
20 | #include "llvm/TargetParser/TargetParser.h" |
21 | |
22 | using namespace llvm; |
23 | |
24 | namespace { |
25 | |
26 | struct MFMAPaddingRatioParser : public cl::parser<unsigned> { |
27 | MFMAPaddingRatioParser(cl::Option &O) : cl::parser<unsigned>(O) {} |
28 | |
29 | bool parse(cl::Option &O, StringRef ArgName, StringRef Arg, unsigned &Value) { |
    if (Arg.getAsInteger(0, Value))
      return O.error("'" + Arg + "' value invalid for uint argument!");

    if (Value > 100)
      return O.error("'" + Arg + "' value must be in the range [0, 100]!");
35 | |
36 | return false; |
37 | } |
38 | }; |
39 | |
40 | } // end anonymous namespace |
41 | |
42 | static cl::opt<unsigned, false, MFMAPaddingRatioParser> |
    MFMAPaddingRatio("amdgpu-mfma-padding-ratio", cl::init(0), cl::Hidden,
                     cl::desc("Fill a percentage of the latency between "
                              "neighboring MFMA with s_nops."));
46 | |
47 | //===----------------------------------------------------------------------===// |
48 | // Hazard Recognizer Implementation |
49 | //===----------------------------------------------------------------------===// |
50 | |
51 | static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF, |
52 | const GCNSubtarget &ST); |
53 | |
54 | GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) : |
55 | IsHazardRecognizerMode(false), |
56 | CurrCycleInstr(nullptr), |
57 | MF(MF), |
58 | ST(MF.getSubtarget<GCNSubtarget>()), |
59 | TII(*ST.getInstrInfo()), |
60 | TRI(TII.getRegisterInfo()), |
61 | ClauseUses(TRI.getNumRegUnits()), |
62 | ClauseDefs(TRI.getNumRegUnits()) { |
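  // Functions that use AGPRs contain MFMA instructions; their hazards need a
  // much deeper look-ahead window than any other hazard, hence the larger
  // value when AGPR0 is in use.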
63 | MaxLookAhead = MF.getRegInfo().isPhysRegUsed(PhysReg: AMDGPU::AGPR0) ? 19 : 5; |
64 | TSchedModel.init(TSInfo: &ST); |
65 | RunLdsBranchVmemWARHazardFixup = shouldRunLdsBranchVmemWARHazardFixup(MF, ST); |
66 | } |
67 | |
68 | void GCNHazardRecognizer::Reset() { |
69 | EmittedInstrs.clear(); |
70 | } |
71 | |
72 | void GCNHazardRecognizer::EmitInstruction(SUnit *SU) { |
73 | EmitInstruction(MI: SU->getInstr()); |
74 | } |
75 | |
76 | void GCNHazardRecognizer::EmitInstruction(MachineInstr *MI) { |
77 | CurrCycleInstr = MI; |
78 | } |
79 | |
80 | static bool isDivFMas(unsigned Opcode) { |
81 | return Opcode == AMDGPU::V_DIV_FMAS_F32_e64 || Opcode == AMDGPU::V_DIV_FMAS_F64_e64; |
82 | } |
83 | |
84 | static bool isSGetReg(unsigned Opcode) { |
85 | return Opcode == AMDGPU::S_GETREG_B32; |
86 | } |
87 | |
88 | static bool isSSetReg(unsigned Opcode) { |
89 | switch (Opcode) { |
90 | case AMDGPU::S_SETREG_B32: |
91 | case AMDGPU::S_SETREG_B32_mode: |
92 | case AMDGPU::S_SETREG_IMM32_B32: |
93 | case AMDGPU::S_SETREG_IMM32_B32_mode: |
94 | return true; |
95 | } |
96 | return false; |
97 | } |
98 | |
99 | static bool isRWLane(unsigned Opcode) { |
100 | return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32; |
101 | } |
102 | |
103 | static bool isRFE(unsigned Opcode) { |
104 | return Opcode == AMDGPU::S_RFE_B64; |
105 | } |
106 | |
107 | static bool isSMovRel(unsigned Opcode) { |
108 | switch (Opcode) { |
109 | case AMDGPU::S_MOVRELS_B32: |
110 | case AMDGPU::S_MOVRELS_B64: |
111 | case AMDGPU::S_MOVRELD_B32: |
112 | case AMDGPU::S_MOVRELD_B64: |
113 | return true; |
114 | default: |
115 | return false; |
116 | } |
117 | } |
118 | |
119 | static bool isDGEMM(unsigned Opcode) { |
120 | return AMDGPU::getMAIIsDGEMM(Opc: Opcode); |
121 | } |
122 | |
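// An instruction is XDL if it is an MAI instruction other than a DGEMM or an
// ACCVGPR read/write. On pre-gfx940 subtargets every such MAI instruction is
// XDL; on gfx940+ the per-opcode table decides.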
123 | static bool isXDL(const GCNSubtarget &ST, const MachineInstr &MI) { |
124 | unsigned Opcode = MI.getOpcode(); |
125 | |
126 | if (!SIInstrInfo::isMAI(MI) || |
127 | isDGEMM(Opcode) || |
128 | Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 || |
129 | Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64) |
130 | return false; |
131 | |
132 | if (!ST.hasGFX940Insts()) |
133 | return true; |
134 | |
135 | return AMDGPU::getMAIIsGFX940XDL(Opc: Opcode); |
136 | } |
137 | |
138 | static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII, |
139 | const MachineInstr &MI) { |
140 | if (TII.isAlwaysGDS(Opcode: MI.getOpcode())) |
141 | return true; |
142 | |
143 | switch (MI.getOpcode()) { |
144 | case AMDGPU::S_SENDMSG: |
145 | case AMDGPU::S_SENDMSGHALT: |
146 | case AMDGPU::S_TTRACEDATA: |
147 | return true; |
148 | // These DS opcodes don't support GDS. |
149 | case AMDGPU::DS_NOP: |
150 | case AMDGPU::DS_PERMUTE_B32: |
151 | case AMDGPU::DS_BPERMUTE_B32: |
152 | return false; |
153 | default: |
154 | if (TII.isDS(Opcode: MI.getOpcode())) { |
155 | int GDS = AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), |
156 | NamedIdx: AMDGPU::OpName::gds); |
157 | if (MI.getOperand(i: GDS).getImm()) |
158 | return true; |
159 | } |
160 | return false; |
161 | } |
162 | } |
163 | |
164 | static bool isPermlane(const MachineInstr &MI) { |
165 | unsigned Opcode = MI.getOpcode(); |
166 | return Opcode == AMDGPU::V_PERMLANE16_B32_e64 || |
167 | Opcode == AMDGPU::V_PERMLANE64_B32 || |
168 | Opcode == AMDGPU::V_PERMLANEX16_B32_e64 || |
169 | Opcode == AMDGPU::V_PERMLANE16_VAR_B32_e64 || |
170 | Opcode == AMDGPU::V_PERMLANEX16_VAR_B32_e64; |
171 | } |
172 | |
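// LDS DMA operations are encoded as VALU instructions that also carry a
// MUBUF or FLAT flag, which is what this predicate keys on.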
173 | static bool isLdsDma(const MachineInstr &MI) { |
174 | return SIInstrInfo::isVALU(MI) && |
175 | (SIInstrInfo::isMUBUF(MI) || SIInstrInfo::isFLAT(MI)); |
176 | } |
177 | |
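// Returns the hardware register id encoded in the simm16 operand of an
// S_GETREG/S_SETREG-style instruction.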
178 | static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) { |
179 | const MachineOperand *RegOp = TII->getNamedOperand(MI: RegInstr, |
180 | OpName: AMDGPU::OpName::simm16); |
181 | return std::get<0>(t: AMDGPU::Hwreg::HwregEncoding::decode(Encoded: RegOp->getImm())); |
182 | } |
183 | |
184 | ScheduleHazardRecognizer::HazardType |
185 | GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) { |
186 | MachineInstr *MI = SU->getInstr(); |
187 | // If we are not in "HazardRecognizerMode" and therefore not being run from |
188 | // the scheduler, track possible stalls from hazards but don't insert noops. |
189 | auto HazardType = IsHazardRecognizerMode ? NoopHazard : Hazard; |
190 | |
191 | if (MI->isBundle()) |
192 | return NoHazard; |
193 | |
194 | if (SIInstrInfo::isSMRD(MI: *MI) && checkSMRDHazards(SMRD: MI) > 0) |
195 | return HazardType; |
196 | |
197 | if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0) |
198 | return HazardType; |
199 | |
200 | if (checkFPAtomicToDenormModeHazard(MI) > 0) |
201 | return HazardType; |
202 | |
203 | if (ST.hasNoDataDepHazard()) |
204 | return NoHazard; |
205 | |
206 | // FIXME: Should flat be considered vmem? |
207 | if ((SIInstrInfo::isVMEM(MI: *MI) || |
208 | SIInstrInfo::isFLAT(MI: *MI)) |
209 | && checkVMEMHazards(VMEM: MI) > 0) |
210 | return HazardType; |
211 | |
212 | if (SIInstrInfo::isVALU(MI: *MI) && checkVALUHazards(VALU: MI) > 0) |
213 | return HazardType; |
214 | |
215 | if (SIInstrInfo::isDPP(MI: *MI) && checkDPPHazards(DPP: MI) > 0) |
216 | return HazardType; |
217 | |
218 | if (isDivFMas(Opcode: MI->getOpcode()) && checkDivFMasHazards(DivFMas: MI) > 0) |
219 | return HazardType; |
220 | |
221 | if (isRWLane(Opcode: MI->getOpcode()) && checkRWLaneHazards(RWLane: MI) > 0) |
222 | return HazardType; |
223 | |
224 | if ((SIInstrInfo::isVALU(MI: *MI) || SIInstrInfo::isVMEM(MI: *MI) || |
225 | SIInstrInfo::isFLAT(MI: *MI) || SIInstrInfo::isDS(MI: *MI) || |
226 | SIInstrInfo::isEXP(MI: *MI)) && checkMAIVALUHazards(MI) > 0) |
227 | return HazardType; |
228 | |
229 | if (isSGetReg(Opcode: MI->getOpcode()) && checkGetRegHazards(GetRegInstr: MI) > 0) |
230 | return HazardType; |
231 | |
232 | if (isSSetReg(Opcode: MI->getOpcode()) && checkSetRegHazards(SetRegInstr: MI) > 0) |
233 | return HazardType; |
234 | |
235 | if (isRFE(Opcode: MI->getOpcode()) && checkRFEHazards(RFE: MI) > 0) |
236 | return HazardType; |
237 | |
238 | if (((ST.hasReadM0MovRelInterpHazard() && |
239 | (TII.isVINTRP(MI: *MI) || isSMovRel(Opcode: MI->getOpcode()) || |
240 | MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 || |
241 | MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) || |
242 | (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, MI: *MI)) || |
243 | (ST.hasReadM0LdsDmaHazard() && isLdsDma(MI: *MI)) || |
244 | (ST.hasReadM0LdsDirectHazard() && |
245 | MI->readsRegister(Reg: AMDGPU::LDS_DIRECT, /*TRI=*/nullptr))) && |
246 | checkReadM0Hazards(SMovRel: MI) > 0) |
247 | return HazardType; |
248 | |
249 | if (SIInstrInfo::isMAI(MI: *MI) && checkMAIHazards(MI) > 0) |
250 | return HazardType; |
251 | |
252 | if ((SIInstrInfo::isVMEM(MI: *MI) || |
253 | SIInstrInfo::isFLAT(MI: *MI) || |
254 | SIInstrInfo::isDS(MI: *MI)) && checkMAILdStHazards(MI) > 0) |
255 | return HazardType; |
256 | |
257 | if (MI->isInlineAsm() && checkInlineAsmHazards(IA: MI) > 0) |
258 | return HazardType; |
259 | |
260 | return NoHazard; |
261 | } |
262 | |
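// Emit S_NOPs covering \p Quantity wait states inside a bundle; a single
// S_NOP can encode at most 8 wait states.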
263 | static void insertNoopsInBundle(MachineInstr *MI, const SIInstrInfo &TII, |
264 | unsigned Quantity) { |
265 | while (Quantity > 0) { |
266 | unsigned Arg = std::min(a: Quantity, b: 8u); |
267 | Quantity -= Arg; |
268 | BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(), MCID: TII.get(Opcode: AMDGPU::S_NOP)) |
269 | .addImm(Val: Arg - 1); |
270 | } |
271 | } |
272 | |
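// Number of cycles \p MI occupies the MFMA pipeline, taken from the
// ReleaseAtCycle of its first write resource in the scheduling model.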
273 | unsigned |
274 | GCNHazardRecognizer::getMFMAPipelineWaitStates(const MachineInstr &MI) const { |
275 | const MCSchedClassDesc *SC = TSchedModel.resolveSchedClass(MI: &MI); |
276 | assert(TSchedModel.getWriteProcResBegin(SC) != |
277 | TSchedModel.getWriteProcResEnd(SC)); |
278 | return TSchedModel.getWriteProcResBegin(SC)->ReleaseAtCycle; |
279 | } |
280 | |
281 | void GCNHazardRecognizer::processBundle() { |
282 | MachineBasicBlock::instr_iterator MI = std::next(x: CurrCycleInstr->getIterator()); |
283 | MachineBasicBlock::instr_iterator E = CurrCycleInstr->getParent()->instr_end(); |
284 | // Check bundled MachineInstr's for hazards. |
285 | for (; MI != E && MI->isInsideBundle(); ++MI) { |
286 | CurrCycleInstr = &*MI; |
287 | unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr); |
288 | |
289 | if (IsHazardRecognizerMode) { |
290 | fixHazards(MI: CurrCycleInstr); |
291 | |
292 | insertNoopsInBundle(MI: CurrCycleInstr, TII, Quantity: WaitStates); |
293 | } |
294 | |
    // It's unnecessary to track more than MaxLookAhead instructions. Since we
    // include the bundled MI directly after, only add a maximum of
    // (MaxLookAhead - 1) noops to EmittedInstrs.
298 | for (unsigned i = 0, e = std::min(a: WaitStates, b: MaxLookAhead - 1); i < e; ++i) |
299 | EmittedInstrs.push_front(x: nullptr); |
300 | |
301 | EmittedInstrs.push_front(x: CurrCycleInstr); |
302 | EmittedInstrs.resize(new_size: MaxLookAhead); |
303 | } |
304 | CurrCycleInstr = nullptr; |
305 | } |
306 | |
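// Process a single instruction in hazard recognizer mode: insert the required
// no-ops in front of \p MI (inside its bundle if necessary) and advance the
// recognizer state.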
307 | void GCNHazardRecognizer::runOnInstruction(MachineInstr *MI) { |
308 | assert(IsHazardRecognizerMode); |
309 | |
310 | unsigned NumPreNoops = PreEmitNoops(MI); |
311 | EmitNoops(Quantity: NumPreNoops); |
312 | if (MI->isInsideBundle()) |
313 | insertNoopsInBundle(MI, TII, Quantity: NumPreNoops); |
314 | else |
315 | TII.insertNoops(MBB&: *MI->getParent(), MI: MachineBasicBlock::iterator(MI), |
316 | Quantity: NumPreNoops); |
317 | EmitInstruction(MI); |
318 | AdvanceCycle(); |
319 | } |
320 | |
321 | unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) { |
322 | IsHazardRecognizerMode = true; |
323 | CurrCycleInstr = MI; |
324 | unsigned W = PreEmitNoopsCommon(MI); |
325 | fixHazards(MI); |
326 | CurrCycleInstr = nullptr; |
327 | return W; |
328 | } |
329 | |
330 | unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) { |
331 | if (MI->isBundle()) |
332 | return 0; |
333 | |
334 | int WaitStates = 0; |
335 | |
336 | if (SIInstrInfo::isSMRD(MI: *MI)) |
337 | return std::max(a: WaitStates, b: checkSMRDHazards(SMRD: MI)); |
338 | |
339 | if (ST.hasNSAtoVMEMBug()) |
340 | WaitStates = std::max(a: WaitStates, b: checkNSAtoVMEMHazard(MI)); |
341 | |
342 | WaitStates = std::max(a: WaitStates, b: checkFPAtomicToDenormModeHazard(MI)); |
343 | |
344 | if (ST.hasNoDataDepHazard()) |
345 | return WaitStates; |
346 | |
347 | if (SIInstrInfo::isVMEM(MI: *MI) || SIInstrInfo::isFLAT(MI: *MI)) |
348 | WaitStates = std::max(a: WaitStates, b: checkVMEMHazards(VMEM: MI)); |
349 | |
350 | if (SIInstrInfo::isVALU(MI: *MI)) |
351 | WaitStates = std::max(a: WaitStates, b: checkVALUHazards(VALU: MI)); |
352 | |
353 | if (SIInstrInfo::isDPP(MI: *MI)) |
354 | WaitStates = std::max(a: WaitStates, b: checkDPPHazards(DPP: MI)); |
355 | |
356 | if (isDivFMas(Opcode: MI->getOpcode())) |
357 | WaitStates = std::max(a: WaitStates, b: checkDivFMasHazards(DivFMas: MI)); |
358 | |
359 | if (isRWLane(Opcode: MI->getOpcode())) |
360 | WaitStates = std::max(a: WaitStates, b: checkRWLaneHazards(RWLane: MI)); |
361 | |
362 | if ((SIInstrInfo::isVALU(MI: *MI) || SIInstrInfo::isVMEM(MI: *MI) || |
363 | SIInstrInfo::isFLAT(MI: *MI) || SIInstrInfo::isDS(MI: *MI) || |
364 | SIInstrInfo::isEXP(MI: *MI)) && checkMAIVALUHazards(MI) > 0) |
365 | WaitStates = std::max(a: WaitStates, b: checkMAIVALUHazards(MI)); |
366 | |
367 | if (MI->isInlineAsm()) |
368 | return std::max(a: WaitStates, b: checkInlineAsmHazards(IA: MI)); |
369 | |
370 | if (isSGetReg(Opcode: MI->getOpcode())) |
371 | return std::max(a: WaitStates, b: checkGetRegHazards(GetRegInstr: MI)); |
372 | |
373 | if (isSSetReg(Opcode: MI->getOpcode())) |
374 | return std::max(a: WaitStates, b: checkSetRegHazards(SetRegInstr: MI)); |
375 | |
376 | if (isRFE(Opcode: MI->getOpcode())) |
377 | return std::max(a: WaitStates, b: checkRFEHazards(RFE: MI)); |
378 | |
379 | if ((ST.hasReadM0MovRelInterpHazard() && |
380 | (TII.isVINTRP(MI: *MI) || isSMovRel(Opcode: MI->getOpcode()) || |
381 | MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 || |
382 | MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) || |
383 | (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, MI: *MI)) || |
384 | (ST.hasReadM0LdsDmaHazard() && isLdsDma(MI: *MI)) || |
385 | (ST.hasReadM0LdsDirectHazard() && |
386 | MI->readsRegister(Reg: AMDGPU::LDS_DIRECT, /*TRI=*/nullptr))) |
387 | return std::max(a: WaitStates, b: checkReadM0Hazards(SMovRel: MI)); |
388 | |
389 | if (SIInstrInfo::isMAI(MI: *MI)) |
390 | return std::max(a: WaitStates, b: checkMAIHazards(MI)); |
391 | |
392 | if (SIInstrInfo::isVMEM(MI: *MI) || |
393 | SIInstrInfo::isFLAT(MI: *MI) || |
394 | SIInstrInfo::isDS(MI: *MI)) |
395 | return std::max(a: WaitStates, b: checkMAILdStHazards(MI)); |
396 | |
397 | return WaitStates; |
398 | } |
399 | |
400 | void GCNHazardRecognizer::EmitNoop() { |
401 | EmittedInstrs.push_front(x: nullptr); |
402 | } |
403 | |
404 | void GCNHazardRecognizer::AdvanceCycle() { |
405 | // When the scheduler detects a stall, it will call AdvanceCycle() without |
406 | // emitting any instructions. |
407 | if (!CurrCycleInstr) { |
408 | EmittedInstrs.push_front(x: nullptr); |
409 | return; |
410 | } |
411 | |
412 | if (CurrCycleInstr->isBundle()) { |
413 | processBundle(); |
414 | return; |
415 | } |
416 | |
417 | unsigned NumWaitStates = TII.getNumWaitStates(MI: *CurrCycleInstr); |
418 | if (!NumWaitStates) { |
419 | CurrCycleInstr = nullptr; |
420 | return; |
421 | } |
422 | |
423 | // Keep track of emitted instructions |
424 | EmittedInstrs.push_front(x: CurrCycleInstr); |
425 | |
426 | // Add a nullptr for each additional wait state after the first. Make sure |
427 | // not to add more than getMaxLookAhead() items to the list, since we |
428 | // truncate the list to that size right after this loop. |
429 | for (unsigned i = 1, e = std::min(a: NumWaitStates, b: getMaxLookAhead()); |
430 | i < e; ++i) { |
431 | EmittedInstrs.push_front(x: nullptr); |
432 | } |
433 | |
434 | // getMaxLookahead() is the largest number of wait states we will ever need |
435 | // to insert, so there is no point in keeping track of more than that many |
436 | // wait states. |
437 | EmittedInstrs.resize(new_size: getMaxLookAhead()); |
438 | |
439 | CurrCycleInstr = nullptr; |
440 | } |
441 | |
442 | void GCNHazardRecognizer::RecedeCycle() { |
  llvm_unreachable("hazard recognizer does not support bottom-up scheduling.");
444 | } |
445 | |
446 | //===----------------------------------------------------------------------===// |
447 | // Helper Functions |
448 | //===----------------------------------------------------------------------===// |
449 | |
450 | using HazardFnResult = enum { HazardFound, HazardExpired, NoHazardFound }; |
451 | |
452 | using IsExpiredFn = function_ref<bool(const MachineInstr &, int WaitStates)>; |
453 | using GetNumWaitStatesFn = function_ref<unsigned int(const MachineInstr &)>; |
454 | |
455 | // Search for a hazard in a block and its predecessors. |
456 | template <typename StateT> |
457 | static bool |
458 | hasHazard(StateT State, |
459 | function_ref<HazardFnResult(StateT &, const MachineInstr &)> IsHazard, |
460 | function_ref<void(StateT &, const MachineInstr &)> UpdateState, |
461 | const MachineBasicBlock *MBB, |
462 | MachineBasicBlock::const_reverse_instr_iterator I, |
463 | DenseSet<const MachineBasicBlock *> &Visited) { |
464 | for (auto E = MBB->instr_rend(); I != E; ++I) { |
465 | // No need to look at parent BUNDLE instructions. |
466 | if (I->isBundle()) |
467 | continue; |
468 | |
469 | switch (IsHazard(State, *I)) { |
470 | case HazardFound: |
471 | return true; |
472 | case HazardExpired: |
473 | return false; |
474 | default: |
475 | // Continue search |
476 | break; |
477 | } |
478 | |
479 | if (I->isInlineAsm() || I->isMetaInstruction()) |
480 | continue; |
481 | |
482 | UpdateState(State, *I); |
483 | } |
484 | |
485 | for (MachineBasicBlock *Pred : MBB->predecessors()) { |
486 | if (!Visited.insert(V: Pred).second) |
487 | continue; |
488 | |
489 | if (hasHazard(State, IsHazard, UpdateState, Pred, Pred->instr_rbegin(), |
490 | Visited)) |
491 | return true; |
492 | } |
493 | |
494 | return false; |
495 | } |
496 | |
// Returns the minimum number of wait states since \p I, walking all
// predecessors. Scans only until \p IsExpired returns true.
// Can only be run in hazard recognizer mode.
500 | static int getWaitStatesSince( |
501 | GCNHazardRecognizer::IsHazardFn IsHazard, const MachineBasicBlock *MBB, |
502 | MachineBasicBlock::const_reverse_instr_iterator I, int WaitStates, |
503 | IsExpiredFn IsExpired, DenseSet<const MachineBasicBlock *> &Visited, |
504 | GetNumWaitStatesFn GetNumWaitStates = SIInstrInfo::getNumWaitStates) { |
505 | for (auto E = MBB->instr_rend(); I != E; ++I) { |
506 | // Don't add WaitStates for parent BUNDLE instructions. |
507 | if (I->isBundle()) |
508 | continue; |
509 | |
510 | if (IsHazard(*I)) |
511 | return WaitStates; |
512 | |
513 | if (I->isInlineAsm()) |
514 | continue; |
515 | |
516 | WaitStates += GetNumWaitStates(*I); |
517 | |
518 | if (IsExpired(*I, WaitStates)) |
519 | return std::numeric_limits<int>::max(); |
520 | } |
521 | |
522 | int MinWaitStates = std::numeric_limits<int>::max(); |
523 | for (MachineBasicBlock *Pred : MBB->predecessors()) { |
524 | if (!Visited.insert(V: Pred).second) |
525 | continue; |
526 | |
527 | int W = getWaitStatesSince(IsHazard, MBB: Pred, I: Pred->instr_rbegin(), WaitStates, |
528 | IsExpired, Visited, GetNumWaitStates); |
529 | |
530 | MinWaitStates = std::min(a: MinWaitStates, b: W); |
531 | } |
532 | |
533 | return MinWaitStates; |
534 | } |
535 | |
536 | static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard, |
537 | const MachineInstr *MI, IsExpiredFn IsExpired) { |
538 | DenseSet<const MachineBasicBlock *> Visited; |
539 | return getWaitStatesSince(IsHazard, MBB: MI->getParent(), |
540 | I: std::next(x: MI->getReverseIterator()), |
541 | WaitStates: 0, IsExpired, Visited); |
542 | } |
543 | |
544 | int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) { |
545 | if (IsHazardRecognizerMode) { |
546 | auto IsExpiredFn = [Limit](const MachineInstr &, int WaitStates) { |
547 | return WaitStates >= Limit; |
548 | }; |
549 | return ::getWaitStatesSince(IsHazard, MI: CurrCycleInstr, IsExpired: IsExpiredFn); |
550 | } |
551 | |
552 | int WaitStates = 0; |
553 | for (MachineInstr *MI : EmittedInstrs) { |
554 | if (MI) { |
555 | if (IsHazard(*MI)) |
556 | return WaitStates; |
557 | |
558 | if (MI->isInlineAsm()) |
559 | continue; |
560 | } |
561 | ++WaitStates; |
562 | |
563 | if (WaitStates >= Limit) |
564 | break; |
565 | } |
566 | return std::numeric_limits<int>::max(); |
567 | } |
568 | |
569 | int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg, |
570 | IsHazardFn IsHazardDef, |
571 | int Limit) { |
572 | const SIRegisterInfo *TRI = ST.getRegisterInfo(); |
573 | |
574 | auto IsHazardFn = [IsHazardDef, TRI, Reg](const MachineInstr &MI) { |
575 | return IsHazardDef(MI) && MI.modifiesRegister(Reg, TRI); |
576 | }; |
577 | |
578 | return getWaitStatesSince(IsHazard: IsHazardFn, Limit); |
579 | } |
580 | |
581 | int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard, |
582 | int Limit) { |
583 | auto IsHazardFn = [IsHazard](const MachineInstr &MI) { |
584 | return isSSetReg(Opcode: MI.getOpcode()) && IsHazard(MI); |
585 | }; |
586 | |
587 | return getWaitStatesSince(IsHazard: IsHazardFn, Limit); |
588 | } |
589 | |
590 | //===----------------------------------------------------------------------===// |
591 | // No-op Hazard Detection |
592 | //===----------------------------------------------------------------------===// |
593 | |
594 | static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV, |
595 | MCRegister Reg) { |
596 | for (MCRegUnit Unit : TRI.regunits(Reg)) |
597 | BV.set(Unit); |
598 | } |
599 | |
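// Add the register units of all register operands in \p Ops to the clause
// def/use sets.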
600 | static void addRegsToSet(const SIRegisterInfo &TRI, |
601 | iterator_range<MachineInstr::const_mop_iterator> Ops, |
602 | BitVector &DefSet, BitVector &UseSet) { |
603 | for (const MachineOperand &Op : Ops) { |
604 | if (Op.isReg()) |
605 | addRegUnits(TRI, BV&: Op.isDef() ? DefSet : UseSet, Reg: Op.getReg().asMCReg()); |
606 | } |
607 | } |
608 | |
609 | void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) { |
610 | addRegsToSet(TRI, Ops: MI.operands(), DefSet&: ClauseDefs, UseSet&: ClauseUses); |
611 | } |
612 | |
613 | static bool breaksSMEMSoftClause(MachineInstr *MI) { |
614 | return !SIInstrInfo::isSMRD(MI: *MI); |
615 | } |
616 | |
617 | static bool breaksVMEMSoftClause(MachineInstr *MI) { |
618 | return !SIInstrInfo::isVMEM(MI: *MI) && !SIInstrInfo::isFLAT(MI: *MI); |
619 | } |
620 | |
621 | int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) { |
  // SMEM soft clauses are only present on VI+, and only matter if xnack is
  // enabled.
624 | if (!ST.isXNACKEnabled()) |
625 | return 0; |
626 | |
627 | bool IsSMRD = TII.isSMRD(MI: *MEM); |
628 | |
629 | resetClause(); |
630 | |
631 | // A soft-clause is any group of consecutive SMEM instructions. The |
632 | // instructions in this group may return out of order and/or may be |
633 | // replayed (i.e. the same instruction issued more than once). |
634 | // |
635 | // In order to handle these situations correctly we need to make sure that |
636 | // when a clause has more than one instruction, no instruction in the clause |
637 | // writes to a register that is read by another instruction in the clause |
638 | // (including itself). If we encounter this situation, we need to break the |
639 | // clause by inserting a non SMEM instruction. |
640 | |
641 | for (MachineInstr *MI : EmittedInstrs) { |
642 | // When we hit a non-SMEM instruction then we have passed the start of the |
643 | // clause and we can stop. |
644 | if (!MI) |
645 | break; |
646 | |
647 | if (IsSMRD ? breaksSMEMSoftClause(MI) : breaksVMEMSoftClause(MI)) |
648 | break; |
649 | |
650 | addClauseInst(MI: *MI); |
651 | } |
652 | |
653 | if (ClauseDefs.none()) |
654 | return 0; |
655 | |
656 | // We need to make sure not to put loads and stores in the same clause if they |
657 | // use the same address. For now, just start a new clause whenever we see a |
658 | // store. |
659 | if (MEM->mayStore()) |
660 | return 1; |
661 | |
662 | addClauseInst(MI: *MEM); |
663 | |
664 | // If the set of defs and uses intersect then we cannot add this instruction |
665 | // to the clause, so we have a hazard. |
666 | return ClauseDefs.anyCommon(RHS: ClauseUses) ? 1 : 0; |
667 | } |
668 | |
669 | int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) { |
670 | int WaitStatesNeeded = 0; |
671 | |
672 | WaitStatesNeeded = checkSoftClauseHazards(MEM: SMRD); |
673 | |
674 | // This SMRD hazard only affects SI. |
675 | if (!ST.hasSMRDReadVALUDefHazard()) |
676 | return WaitStatesNeeded; |
677 | |
678 | // A read of an SGPR by SMRD instruction requires 4 wait states when the |
679 | // SGPR was written by a VALU instruction. |
680 | int SmrdSgprWaitStates = 4; |
681 | auto IsHazardDefFn = [this](const MachineInstr &MI) { |
682 | return TII.isVALU(MI); |
683 | }; |
684 | auto IsBufferHazardDefFn = [this](const MachineInstr &MI) { |
685 | return TII.isSALU(MI); |
686 | }; |
687 | |
688 | bool IsBufferSMRD = TII.isBufferSMRD(MI: *SMRD); |
689 | |
690 | for (const MachineOperand &Use : SMRD->uses()) { |
691 | if (!Use.isReg()) |
692 | continue; |
693 | int WaitStatesNeededForUse = |
694 | SmrdSgprWaitStates - getWaitStatesSinceDef(Reg: Use.getReg(), IsHazardDef: IsHazardDefFn, |
695 | Limit: SmrdSgprWaitStates); |
696 | WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse); |
697 | |
    // This fixes what appears to be undocumented hardware behavior in SI where
    // an s_mov writing a descriptor and an s_buffer_load_dword reading the
    // descriptor need some number of nops in between. We don't know how many
    // are required, but let's use 4. This wasn't discovered before, probably
    // because the only case where it happens is when we expand a 64-bit
    // pointer into a full descriptor and use s_buffer_load_dword instead of
    // s_load_dword, which was probably never encountered in closed-source
    // land.
705 | if (IsBufferSMRD) { |
706 | int WaitStatesNeededForUse = |
707 | SmrdSgprWaitStates - getWaitStatesSinceDef(Reg: Use.getReg(), |
708 | IsHazardDef: IsBufferHazardDefFn, |
709 | Limit: SmrdSgprWaitStates); |
710 | WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse); |
711 | } |
712 | } |
713 | |
714 | return WaitStatesNeeded; |
715 | } |
716 | |
717 | int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) { |
718 | if (!ST.hasVMEMReadSGPRVALUDefHazard()) |
719 | return 0; |
720 | |
721 | int WaitStatesNeeded = checkSoftClauseHazards(MEM: VMEM); |
722 | |
723 | // A read of an SGPR by a VMEM instruction requires 5 wait states when the |
724 | // SGPR was written by a VALU Instruction. |
725 | const int VmemSgprWaitStates = 5; |
726 | auto IsHazardDefFn = [this](const MachineInstr &MI) { |
727 | return TII.isVALU(MI); |
728 | }; |
729 | for (const MachineOperand &Use : VMEM->uses()) { |
730 | if (!Use.isReg() || TRI.isVectorRegister(MRI: MF.getRegInfo(), Reg: Use.getReg())) |
731 | continue; |
732 | |
733 | int WaitStatesNeededForUse = |
734 | VmemSgprWaitStates - getWaitStatesSinceDef(Reg: Use.getReg(), IsHazardDef: IsHazardDefFn, |
735 | Limit: VmemSgprWaitStates); |
736 | WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse); |
737 | } |
738 | return WaitStatesNeeded; |
739 | } |
740 | |
741 | int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) { |
742 | const SIRegisterInfo *TRI = ST.getRegisterInfo(); |
743 | const SIInstrInfo *TII = ST.getInstrInfo(); |
744 | |
745 | // Check for DPP VGPR read after VALU VGPR write and EXEC write. |
746 | int DppVgprWaitStates = 2; |
747 | int DppExecWaitStates = 5; |
748 | int WaitStatesNeeded = 0; |
749 | auto IsHazardDefFn = [TII](const MachineInstr &MI) { |
750 | return TII->isVALU(MI); |
751 | }; |
752 | |
753 | for (const MachineOperand &Use : DPP->uses()) { |
754 | if (!Use.isReg() || !TRI->isVGPR(MRI: MF.getRegInfo(), Reg: Use.getReg())) |
755 | continue; |
756 | int WaitStatesNeededForUse = |
757 | DppVgprWaitStates - getWaitStatesSinceDef( |
758 | Reg: Use.getReg(), |
759 | IsHazardDef: [](const MachineInstr &) { return true; }, |
760 | Limit: DppVgprWaitStates); |
761 | WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse); |
762 | } |
763 | |
764 | WaitStatesNeeded = std::max( |
765 | a: WaitStatesNeeded, |
766 | b: DppExecWaitStates - getWaitStatesSinceDef(Reg: AMDGPU::EXEC, IsHazardDef: IsHazardDefFn, |
767 | Limit: DppExecWaitStates)); |
768 | |
769 | return WaitStatesNeeded; |
770 | } |
771 | |
772 | int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) { |
773 | const SIInstrInfo *TII = ST.getInstrInfo(); |
774 | |
775 | // v_div_fmas requires 4 wait states after a write to vcc from a VALU |
776 | // instruction. |
777 | const int DivFMasWaitStates = 4; |
778 | auto IsHazardDefFn = [TII](const MachineInstr &MI) { |
779 | return TII->isVALU(MI); |
780 | }; |
781 | int WaitStatesNeeded = getWaitStatesSinceDef(Reg: AMDGPU::VCC, IsHazardDef: IsHazardDefFn, |
782 | Limit: DivFMasWaitStates); |
783 | |
784 | return DivFMasWaitStates - WaitStatesNeeded; |
785 | } |
786 | |
787 | int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) { |
788 | const SIInstrInfo *TII = ST.getInstrInfo(); |
789 | unsigned GetRegHWReg = getHWReg(TII, RegInstr: *GetRegInstr); |
790 | |
791 | const int GetRegWaitStates = 2; |
792 | auto IsHazardFn = [TII, GetRegHWReg](const MachineInstr &MI) { |
793 | return GetRegHWReg == getHWReg(TII, RegInstr: MI); |
794 | }; |
795 | int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazard: IsHazardFn, Limit: GetRegWaitStates); |
796 | |
797 | return GetRegWaitStates - WaitStatesNeeded; |
798 | } |
799 | |
800 | int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) { |
801 | const SIInstrInfo *TII = ST.getInstrInfo(); |
802 | unsigned HWReg = getHWReg(TII, RegInstr: *SetRegInstr); |
803 | |
804 | const int SetRegWaitStates = ST.getSetRegWaitStates(); |
805 | auto IsHazardFn = [TII, HWReg](const MachineInstr &MI) { |
806 | return HWReg == getHWReg(TII, RegInstr: MI); |
807 | }; |
808 | int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazard: IsHazardFn, Limit: SetRegWaitStates); |
809 | return SetRegWaitStates - WaitStatesNeeded; |
810 | } |
811 | |
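// If \p MI is a store whose data operand can be overwritten by a following
// VALU (see checkVALUHazardsHelper), return the index of that data operand;
// otherwise return -1.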
812 | int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) { |
813 | if (!MI.mayStore()) |
814 | return -1; |
815 | |
816 | const SIInstrInfo *TII = ST.getInstrInfo(); |
817 | unsigned Opcode = MI.getOpcode(); |
818 | const MCInstrDesc &Desc = MI.getDesc(); |
819 | |
820 | int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, NamedIdx: AMDGPU::OpName::vdata); |
821 | int VDataRCID = -1; |
822 | if (VDataIdx != -1) |
823 | VDataRCID = Desc.operands()[VDataIdx].RegClass; |
824 | |
825 | if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) { |
826 | // There is no hazard if the instruction does not use vector regs |
827 | // (like wbinvl1) |
828 | if (VDataIdx == -1) |
829 | return -1; |
830 | // For MUBUF/MTBUF instructions this hazard only exists if the |
831 | // instruction is not using a register in the soffset field. |
832 | const MachineOperand *SOffset = |
833 | TII->getNamedOperand(MI, OpName: AMDGPU::OpName::soffset); |
834 | // If we have no soffset operand, then assume this field has been |
835 | // hardcoded to zero. |
836 | if (AMDGPU::getRegBitWidth(RCID: VDataRCID) > 64 && |
837 | (!SOffset || !SOffset->isReg())) |
838 | return VDataIdx; |
839 | } |
840 | |
841 | // MIMG instructions create a hazard if they don't use a 256-bit T# and |
842 | // the store size is greater than 8 bytes and they have more than two bits |
843 | // of their dmask set. |
844 | // All our MIMG definitions use a 256-bit T#, so we can skip checking for them. |
845 | if (TII->isMIMG(MI)) { |
846 | int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, NamedIdx: AMDGPU::OpName::srsrc); |
847 | assert(SRsrcIdx != -1 && |
848 | AMDGPU::getRegBitWidth(Desc.operands()[SRsrcIdx].RegClass) == 256); |
849 | (void)SRsrcIdx; |
850 | } |
851 | |
852 | if (TII->isFLAT(MI)) { |
853 | int DataIdx = AMDGPU::getNamedOperandIdx(Opcode, NamedIdx: AMDGPU::OpName::vdata); |
854 | if (AMDGPU::getRegBitWidth(RCID: Desc.operands()[DataIdx].RegClass) > 64) |
855 | return DataIdx; |
856 | } |
857 | |
858 | return -1; |
859 | } |
860 | |
861 | int |
862 | GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def, |
863 | const MachineRegisterInfo &MRI) { |
  // Helper to check for the hazard where VMEM instructions that store more
  // than 8 bytes can have their store data overwritten by the next
  // instruction.
866 | const SIRegisterInfo *TRI = ST.getRegisterInfo(); |
867 | |
868 | const int VALUWaitStates = ST.hasGFX940Insts() ? 2 : 1; |
869 | int WaitStatesNeeded = 0; |
870 | |
871 | if (!TRI->isVectorRegister(MRI, Reg: Def.getReg())) |
872 | return WaitStatesNeeded; |
873 | Register Reg = Def.getReg(); |
874 | auto IsHazardFn = [this, Reg, TRI](const MachineInstr &MI) { |
875 | int DataIdx = createsVALUHazard(MI); |
876 | return DataIdx >= 0 && |
877 | TRI->regsOverlap(RegA: MI.getOperand(i: DataIdx).getReg(), RegB: Reg); |
878 | }; |
879 | int WaitStatesNeededForDef = |
880 | VALUWaitStates - getWaitStatesSince(IsHazard: IsHazardFn, Limit: VALUWaitStates); |
881 | WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForDef); |
882 | |
883 | return WaitStatesNeeded; |
884 | } |
885 | |
886 | int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) { |
887 | int WaitStatesNeeded = 0; |
888 | |
889 | if (ST.hasTransForwardingHazard() && !SIInstrInfo::isTRANS(MI: *VALU)) { |
890 | const int TransDefWaitstates = 1; |
891 | |
892 | auto IsTransDefFn = [this, VALU](const MachineInstr &MI) { |
893 | if (!SIInstrInfo::isTRANS(MI)) |
894 | return false; |
895 | const SIRegisterInfo *TRI = ST.getRegisterInfo(); |
896 | const SIInstrInfo *TII = ST.getInstrInfo(); |
897 | Register Def = TII->getNamedOperand(MI, OpName: AMDGPU::OpName::vdst)->getReg(); |
898 | |
899 | for (const MachineOperand &Use : VALU->explicit_uses()) { |
900 | if (Use.isReg() && TRI->regsOverlap(RegA: Def, RegB: Use.getReg())) |
901 | return true; |
902 | } |
903 | |
904 | return false; |
905 | }; |
906 | |
907 | int WaitStatesNeededForDef = |
908 | TransDefWaitstates - |
909 | getWaitStatesSince(IsHazard: IsTransDefFn, Limit: TransDefWaitstates); |
910 | WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForDef); |
911 | } |
912 | |
913 | if (ST.hasDstSelForwardingHazard()) { |
914 | const int Shift16DefWaitstates = 1; |
915 | |
916 | auto IsShift16BitDefFn = [this, VALU](const MachineInstr &MI) { |
917 | if (!SIInstrInfo::isVALU(MI)) |
918 | return false; |
919 | const SIInstrInfo *TII = ST.getInstrInfo(); |
920 | if (SIInstrInfo::isSDWA(MI)) { |
921 | if (auto *DstSel = TII->getNamedOperand(MI, OpName: AMDGPU::OpName::dst_sel)) |
922 | if (DstSel->getImm() == AMDGPU::SDWA::DWORD) |
923 | return false; |
924 | } else { |
925 | if (!AMDGPU::hasNamedOperand(Opcode: MI.getOpcode(), NamedIdx: AMDGPU::OpName::op_sel) || |
926 | !(TII->getNamedOperand(MI, OpName: AMDGPU::OpName::src0_modifiers) |
927 | ->getImm() & |
928 | SISrcMods::DST_OP_SEL)) |
929 | return false; |
930 | } |
931 | const SIRegisterInfo *TRI = ST.getRegisterInfo(); |
932 | if (auto *Dst = TII->getNamedOperand(MI, OpName: AMDGPU::OpName::vdst)) { |
933 | Register Def = Dst->getReg(); |
934 | |
935 | for (const MachineOperand &Use : VALU->explicit_uses()) { |
936 | if (Use.isReg() && TRI->regsOverlap(RegA: Def, RegB: Use.getReg())) |
937 | return true; |
938 | } |
939 | } |
940 | |
941 | return false; |
942 | }; |
943 | |
944 | int WaitStatesNeededForDef = |
945 | Shift16DefWaitstates - |
946 | getWaitStatesSince(IsHazard: IsShift16BitDefFn, Limit: Shift16DefWaitstates); |
947 | WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForDef); |
948 | } |
949 | |
950 | if (ST.hasVDecCoExecHazard()) { |
951 | const int VALUWriteSGPRVALUReadWaitstates = 2; |
952 | const int VALUWriteEXECRWLane = 4; |
953 | const int VALUWriteVGPRReadlaneRead = 1; |
954 | |
955 | const SIRegisterInfo *TRI = ST.getRegisterInfo(); |
956 | const MachineRegisterInfo &MRI = MF.getRegInfo(); |
957 | Register UseReg; |
958 | auto IsVALUDefSGPRFn = [&UseReg, TRI](const MachineInstr &MI) { |
959 | if (!SIInstrInfo::isVALU(MI)) |
960 | return false; |
961 | return MI.modifiesRegister(Reg: UseReg, TRI); |
962 | }; |
963 | |
964 | for (const MachineOperand &Use : VALU->explicit_uses()) { |
965 | if (!Use.isReg()) |
966 | continue; |
967 | |
968 | UseReg = Use.getReg(); |
969 | if (TRI->isSGPRReg(MRI, Reg: UseReg)) { |
970 | int WaitStatesNeededForDef = |
971 | VALUWriteSGPRVALUReadWaitstates - |
972 | getWaitStatesSince(IsHazard: IsVALUDefSGPRFn, |
973 | Limit: VALUWriteSGPRVALUReadWaitstates); |
974 | WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForDef); |
975 | } |
976 | } |
977 | |
978 | if (VALU->readsRegister(Reg: AMDGPU::VCC, TRI)) { |
979 | UseReg = AMDGPU::VCC; |
980 | int WaitStatesNeededForDef = |
981 | VALUWriteSGPRVALUReadWaitstates - |
982 | getWaitStatesSince(IsHazard: IsVALUDefSGPRFn, Limit: VALUWriteSGPRVALUReadWaitstates); |
983 | WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForDef); |
984 | } |
985 | |
986 | switch (VALU->getOpcode()) { |
987 | case AMDGPU::V_READLANE_B32: |
988 | case AMDGPU::V_READFIRSTLANE_B32: { |
989 | MachineOperand *Src = TII.getNamedOperand(MI&: *VALU, OperandName: AMDGPU::OpName::src0); |
990 | UseReg = Src->getReg(); |
991 | int WaitStatesNeededForDef = |
992 | VALUWriteVGPRReadlaneRead - |
993 | getWaitStatesSince(IsHazard: IsVALUDefSGPRFn, Limit: VALUWriteVGPRReadlaneRead); |
994 | WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForDef); |
995 | } |
996 | [[fallthrough]]; |
997 | case AMDGPU::V_WRITELANE_B32: { |
998 | UseReg = AMDGPU::EXEC; |
999 | int WaitStatesNeededForDef = |
1000 | VALUWriteEXECRWLane - |
1001 | getWaitStatesSince(IsHazard: IsVALUDefSGPRFn, Limit: VALUWriteEXECRWLane); |
1002 | WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForDef); |
1003 | break; |
1004 | } |
1005 | default: |
1006 | break; |
1007 | } |
1008 | } |
1009 | |
  // This checks for the hazard where VMEM instructions that store more than
  // 8 bytes can have their store data overwritten by the next instruction.
1012 | if (!ST.has12DWordStoreHazard()) |
1013 | return WaitStatesNeeded; |
1014 | |
1015 | const MachineRegisterInfo &MRI = MF.getRegInfo(); |
1016 | |
1017 | for (const MachineOperand &Def : VALU->defs()) { |
1018 | WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: checkVALUHazardsHelper(Def, MRI)); |
1019 | } |
1020 | |
1021 | return WaitStatesNeeded; |
1022 | } |
1023 | |
1024 | int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) { |
1025 | // This checks for hazards associated with inline asm statements. |
1026 | // Since inline asms can contain just about anything, we use this |
1027 | // to call/leverage other check*Hazard routines. Note that |
1028 | // this function doesn't attempt to address all possible inline asm |
1029 | // hazards (good luck), but is a collection of what has been |
1030 | // problematic thus far. |
1031 | |
1032 | // see checkVALUHazards() |
1033 | if (!ST.has12DWordStoreHazard()) |
1034 | return 0; |
1035 | |
1036 | const MachineRegisterInfo &MRI = MF.getRegInfo(); |
1037 | int WaitStatesNeeded = 0; |
1038 | |
1039 | for (const MachineOperand &Op : |
1040 | llvm::drop_begin(RangeOrContainer: IA->operands(), N: InlineAsm::MIOp_FirstOperand)) { |
1041 | if (Op.isReg() && Op.isDef()) { |
1042 | WaitStatesNeeded = |
1043 | std::max(a: WaitStatesNeeded, b: checkVALUHazardsHelper(Def: Op, MRI)); |
1044 | } |
1045 | } |
1046 | |
1047 | return WaitStatesNeeded; |
1048 | } |
1049 | |
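// v_readlane/v_writelane with an SGPR lane select that was just written by a
// VALU instruction needs wait states before the lane select can be read.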
1050 | int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) { |
1051 | const SIInstrInfo *TII = ST.getInstrInfo(); |
1052 | const SIRegisterInfo *TRI = ST.getRegisterInfo(); |
1053 | const MachineRegisterInfo &MRI = MF.getRegInfo(); |
1054 | |
1055 | const MachineOperand *LaneSelectOp = |
1056 | TII->getNamedOperand(MI&: *RWLane, OperandName: AMDGPU::OpName::src1); |
1057 | |
1058 | if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, Reg: LaneSelectOp->getReg())) |
1059 | return 0; |
1060 | |
1061 | Register LaneSelectReg = LaneSelectOp->getReg(); |
1062 | auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isVALU(MI); }; |
1063 | |
1064 | const int RWLaneWaitStates = 4; |
1065 | int WaitStatesSince = getWaitStatesSinceDef(Reg: LaneSelectReg, IsHazardDef: IsHazardFn, |
1066 | Limit: RWLaneWaitStates); |
1067 | return RWLaneWaitStates - WaitStatesSince; |
1068 | } |
1069 | |
1070 | int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) { |
1071 | if (!ST.hasRFEHazards()) |
1072 | return 0; |
1073 | |
1074 | const SIInstrInfo *TII = ST.getInstrInfo(); |
1075 | |
1076 | const int RFEWaitStates = 1; |
1077 | |
1078 | auto IsHazardFn = [TII](const MachineInstr &MI) { |
1079 | return getHWReg(TII, RegInstr: MI) == AMDGPU::Hwreg::ID_TRAPSTS; |
1080 | }; |
1081 | int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazard: IsHazardFn, Limit: RFEWaitStates); |
1082 | return RFEWaitStates - WaitStatesNeeded; |
1083 | } |
1084 | |
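// Instructions that read M0 in the affected instruction classes need one wait
// state after an SALU writes M0.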
1085 | int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) { |
1086 | const SIInstrInfo *TII = ST.getInstrInfo(); |
1087 | const int ReadM0WaitStates = 1; |
1088 | auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isSALU(MI); }; |
1089 | return ReadM0WaitStates - |
1090 | getWaitStatesSinceDef(Reg: AMDGPU::M0, IsHazardDef: IsHazardFn, Limit: ReadM0WaitStates); |
1091 | } |
1092 | |
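// Top-level dispatch for hazard fixups that rewrite or pad the code around
// \p MI; only used in hazard recognizer mode.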
1093 | void GCNHazardRecognizer::fixHazards(MachineInstr *MI) { |
1094 | fixVMEMtoScalarWriteHazards(MI); |
1095 | fixVcmpxPermlaneHazards(MI); |
1096 | fixSMEMtoVectorWriteHazards(MI); |
1097 | fixVcmpxExecWARHazard(MI); |
1098 | fixLdsBranchVmemWARHazard(MI); |
1099 | if (ST.hasLdsDirect()) { |
1100 | fixLdsDirectVALUHazard(MI); |
1101 | fixLdsDirectVMEMHazard(MI); |
1102 | } |
1103 | fixVALUPartialForwardingHazard(MI); |
1104 | fixVALUTransUseHazard(MI); |
1105 | fixWMMAHazards(MI); |
1106 | fixShift64HighRegBug(MI); |
1107 | fixVALUMaskWriteHazard(MI); |
1108 | fixRequiredExportPriority(MI); |
1109 | } |
1110 | |
1111 | bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) { |
1112 | if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(MI: *MI)) |
1113 | return false; |
1114 | |
1115 | const SIInstrInfo *TII = ST.getInstrInfo(); |
1116 | const SIRegisterInfo *TRI = ST.getRegisterInfo(); |
1117 | auto IsHazardFn = [TII, TRI](const MachineInstr &MI) { |
1118 | return (TII->isVOPC(MI) || |
1119 | ((TII->isVOP3(MI) || TII->isSDWA(MI)) && MI.isCompare())) && |
1120 | MI.modifiesRegister(Reg: AMDGPU::EXEC, TRI); |
1121 | }; |
1122 | |
1123 | auto IsExpiredFn = [](const MachineInstr &MI, int) { |
1124 | unsigned Opc = MI.getOpcode(); |
1125 | return SIInstrInfo::isVALU(MI) && Opc != AMDGPU::V_NOP_e32 && |
1126 | Opc != AMDGPU::V_NOP_e64 && Opc != AMDGPU::V_NOP_sdwa; |
1127 | }; |
1128 | |
1129 | if (::getWaitStatesSince(IsHazard: IsHazardFn, MI, IsExpired: IsExpiredFn) == |
1130 | std::numeric_limits<int>::max()) |
1131 | return false; |
1132 | |
1133 | // V_NOP will be discarded by SQ. |
1134 | // Use V_MOV_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE* |
1135 | // which is always a VGPR and available. |
1136 | auto *Src0 = TII->getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::src0); |
1137 | Register Reg = Src0->getReg(); |
1138 | bool IsUndef = Src0->isUndef(); |
1139 | BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(), |
1140 | MCID: TII->get(Opcode: AMDGPU::V_MOV_B32_e32)) |
1141 | .addReg(RegNo: Reg, flags: RegState::Define | (IsUndef ? RegState::Dead : 0)) |
1142 | .addReg(RegNo: Reg, flags: IsUndef ? RegState::Undef : RegState::Kill); |
1143 | |
1144 | return true; |
1145 | } |
1146 | |
1147 | bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) { |
1148 | if (!ST.hasVMEMtoScalarWriteHazard()) |
1149 | return false; |
1150 | assert(!ST.hasExtendedWaitCounts()); |
1151 | |
1152 | if (!SIInstrInfo::isSALU(MI: *MI) && !SIInstrInfo::isSMRD(MI: *MI)) |
1153 | return false; |
1154 | |
1155 | if (MI->getNumDefs() == 0) |
1156 | return false; |
1157 | |
1158 | const SIRegisterInfo *TRI = ST.getRegisterInfo(); |
1159 | |
1160 | auto IsHazardFn = [TRI, MI](const MachineInstr &I) { |
1161 | if (!SIInstrInfo::isVMEM(MI: I) && !SIInstrInfo::isDS(MI: I) && |
1162 | !SIInstrInfo::isFLAT(MI: I)) |
1163 | return false; |
1164 | |
1165 | for (const MachineOperand &Def : MI->defs()) { |
1166 | const MachineOperand *Op = |
1167 | I.findRegisterUseOperand(Reg: Def.getReg(), TRI, isKill: false); |
1168 | if (!Op) |
1169 | continue; |
1170 | return true; |
1171 | } |
1172 | return false; |
1173 | }; |
1174 | |
1175 | auto IsExpiredFn = [](const MachineInstr &MI, int) { |
1176 | return SIInstrInfo::isVALU(MI) || |
1177 | (MI.getOpcode() == AMDGPU::S_WAITCNT && |
1178 | !MI.getOperand(i: 0).getImm()) || |
1179 | (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR && |
1180 | AMDGPU::DepCtr::decodeFieldVmVsrc(Encoded: MI.getOperand(i: 0).getImm()) == 0); |
1181 | }; |
1182 | |
1183 | if (::getWaitStatesSince(IsHazard: IsHazardFn, MI, IsExpired: IsExpiredFn) == |
1184 | std::numeric_limits<int>::max()) |
1185 | return false; |
1186 | |
1187 | const SIInstrInfo *TII = ST.getInstrInfo(); |
1188 | BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(), |
1189 | MCID: TII->get(Opcode: AMDGPU::S_WAITCNT_DEPCTR)) |
1190 | .addImm(Val: AMDGPU::DepCtr::encodeFieldVmVsrc(VmVsrc: 0)); |
1191 | return true; |
1192 | } |
1193 | |
1194 | bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) { |
1195 | if (!ST.hasSMEMtoVectorWriteHazard()) |
1196 | return false; |
1197 | assert(!ST.hasExtendedWaitCounts()); |
1198 | |
1199 | if (!SIInstrInfo::isVALU(MI: *MI)) |
1200 | return false; |
1201 | |
1202 | unsigned SDSTName; |
1203 | switch (MI->getOpcode()) { |
1204 | case AMDGPU::V_READLANE_B32: |
1205 | case AMDGPU::V_READFIRSTLANE_B32: |
1206 | SDSTName = AMDGPU::OpName::vdst; |
1207 | break; |
1208 | default: |
1209 | SDSTName = AMDGPU::OpName::sdst; |
1210 | break; |
1211 | } |
1212 | |
1213 | const SIInstrInfo *TII = ST.getInstrInfo(); |
1214 | const SIRegisterInfo *TRI = ST.getRegisterInfo(); |
1215 | const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(GPU: ST.getCPU()); |
1216 | const MachineOperand *SDST = TII->getNamedOperand(MI&: *MI, OperandName: SDSTName); |
1217 | if (!SDST) { |
1218 | for (const auto &MO : MI->implicit_operands()) { |
1219 | if (MO.isDef() && TRI->isSGPRClass(RC: TRI->getPhysRegBaseClass(Reg: MO.getReg()))) { |
1220 | SDST = &MO; |
1221 | break; |
1222 | } |
1223 | } |
1224 | } |
1225 | |
1226 | if (!SDST) |
1227 | return false; |
1228 | |
1229 | const Register SDSTReg = SDST->getReg(); |
1230 | auto IsHazardFn = [SDSTReg, TRI](const MachineInstr &I) { |
1231 | return SIInstrInfo::isSMRD(MI: I) && I.readsRegister(Reg: SDSTReg, TRI); |
1232 | }; |
1233 | |
1234 | auto IsExpiredFn = [TII, IV](const MachineInstr &MI, int) { |
1235 | if (TII->isSALU(MI)) { |
1236 | switch (MI.getOpcode()) { |
1237 | case AMDGPU::S_SETVSKIP: |
1238 | case AMDGPU::S_VERSION: |
1239 | case AMDGPU::S_WAITCNT_VSCNT: |
1240 | case AMDGPU::S_WAITCNT_VMCNT: |
1241 | case AMDGPU::S_WAITCNT_EXPCNT: |
        // These instructions cannot mitigate the hazard.
1243 | return false; |
1244 | case AMDGPU::S_WAITCNT_LGKMCNT: |
1245 | // Reducing lgkmcnt count to 0 always mitigates the hazard. |
1246 | return (MI.getOperand(i: 1).getImm() == 0) && |
1247 | (MI.getOperand(i: 0).getReg() == AMDGPU::SGPR_NULL); |
1248 | case AMDGPU::S_WAITCNT: { |
1249 | const int64_t Imm = MI.getOperand(i: 0).getImm(); |
1250 | AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(Version: IV, Encoded: Imm); |
1251 | // DsCnt corresponds to LGKMCnt here. |
1252 | return (Decoded.DsCnt == 0); |
1253 | } |
1254 | default: |
1255 | // SOPP instructions cannot mitigate the hazard. |
1256 | if (TII->isSOPP(MI)) |
1257 | return false; |
1258 | // At this point the SALU can be assumed to mitigate the hazard |
1259 | // because either: |
1260 | // (a) it is independent of the at risk SMEM (breaking chain), |
1261 | // or |
1262 | // (b) it is dependent on the SMEM, in which case an appropriate |
1263 | // s_waitcnt lgkmcnt _must_ exist between it and the at risk |
1264 | // SMEM instruction. |
1265 | return true; |
1266 | } |
1267 | } |
1268 | return false; |
1269 | }; |
1270 | |
1271 | if (::getWaitStatesSince(IsHazard: IsHazardFn, MI, IsExpired: IsExpiredFn) == |
1272 | std::numeric_limits<int>::max()) |
1273 | return false; |
1274 | |
1275 | BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(), |
1276 | MCID: TII->get(Opcode: AMDGPU::S_MOV_B32), DestReg: AMDGPU::SGPR_NULL) |
1277 | .addImm(Val: 0); |
1278 | return true; |
1279 | } |
1280 | |
1281 | bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) { |
1282 | if (!ST.hasVcmpxExecWARHazard()) |
1283 | return false; |
1284 | assert(!ST.hasExtendedWaitCounts()); |
1285 | |
1286 | if (!SIInstrInfo::isVALU(MI: *MI)) |
1287 | return false; |
1288 | |
1289 | const SIRegisterInfo *TRI = ST.getRegisterInfo(); |
1290 | if (!MI->modifiesRegister(Reg: AMDGPU::EXEC, TRI)) |
1291 | return false; |
1292 | |
1293 | auto IsHazardFn = [TRI](const MachineInstr &I) { |
1294 | if (SIInstrInfo::isVALU(MI: I)) |
1295 | return false; |
1296 | return I.readsRegister(Reg: AMDGPU::EXEC, TRI); |
1297 | }; |
1298 | |
1299 | const SIInstrInfo *TII = ST.getInstrInfo(); |
1300 | auto IsExpiredFn = [TII, TRI](const MachineInstr &MI, int) { |
1301 | if (SIInstrInfo::isVALU(MI)) { |
1302 | if (TII->getNamedOperand(MI, OpName: AMDGPU::OpName::sdst)) |
1303 | return true; |
1304 | for (auto MO : MI.implicit_operands()) |
1305 | if (MO.isDef() && TRI->isSGPRClass(RC: TRI->getPhysRegBaseClass(Reg: MO.getReg()))) |
1306 | return true; |
1307 | } |
1308 | if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR && |
1309 | AMDGPU::DepCtr::decodeFieldSaSdst(Encoded: MI.getOperand(i: 0).getImm()) == 0) |
1310 | return true; |
1311 | return false; |
1312 | }; |
1313 | |
1314 | if (::getWaitStatesSince(IsHazard: IsHazardFn, MI, IsExpired: IsExpiredFn) == |
1315 | std::numeric_limits<int>::max()) |
1316 | return false; |
1317 | |
1318 | BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(), |
1319 | MCID: TII->get(Opcode: AMDGPU::S_WAITCNT_DEPCTR)) |
1320 | .addImm(Val: AMDGPU::DepCtr::encodeFieldSaSdst(SaSdst: 0)); |
1321 | return true; |
1322 | } |
1323 | |
1324 | static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF, |
1325 | const GCNSubtarget &ST) { |
1326 | if (!ST.hasLdsBranchVmemWARHazard()) |
1327 | return false; |
1328 | |
1329 | // Check if the necessary condition for the hazard is met: both LDS and VMEM |
1330 | // instructions need to appear in the same function. |
1331 | bool HasLds = false; |
1332 | bool HasVmem = false; |
1333 | for (auto &MBB : MF) { |
1334 | for (auto &MI : MBB) { |
1335 | HasLds |= SIInstrInfo::isDS(MI); |
1336 | HasVmem |= |
1337 | SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI); |
1338 | if (HasLds && HasVmem) |
1339 | return true; |
1340 | } |
1341 | } |
1342 | return false; |
1343 | } |
1344 | |
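// Matches "s_waitcnt_vscnt null, 0", i.e. a wait for all outstanding VMEM
// stores to complete.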
1345 | static bool isStoreCountWaitZero(const MachineInstr &I) { |
1346 | return I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT && |
1347 | I.getOperand(i: 0).getReg() == AMDGPU::SGPR_NULL && |
1348 | !I.getOperand(i: 1).getImm(); |
1349 | } |
1350 | |
1351 | bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) { |
1352 | if (!RunLdsBranchVmemWARHazardFixup) |
1353 | return false; |
1354 | |
1355 | assert(ST.hasLdsBranchVmemWARHazard()); |
1356 | assert(!ST.hasExtendedWaitCounts()); |
1357 | |
1358 | auto IsHazardInst = [](const MachineInstr &MI) { |
1359 | if (SIInstrInfo::isDS(MI)) |
1360 | return 1; |
1361 | if (SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI)) |
1362 | return 2; |
1363 | return 0; |
1364 | }; |
1365 | |
1366 | auto InstType = IsHazardInst(*MI); |
1367 | if (!InstType) |
1368 | return false; |
1369 | |
1370 | auto IsExpiredFn = [&IsHazardInst](const MachineInstr &I, int) { |
1371 | return IsHazardInst(I) || isStoreCountWaitZero(I); |
1372 | }; |
1373 | |
1374 | auto IsHazardFn = [InstType, &IsHazardInst](const MachineInstr &I) { |
1375 | if (!I.isBranch()) |
1376 | return false; |
1377 | |
1378 | auto IsHazardFn = [InstType, IsHazardInst](const MachineInstr &I) { |
1379 | auto InstType2 = IsHazardInst(I); |
1380 | return InstType2 && InstType != InstType2; |
1381 | }; |
1382 | |
1383 | auto IsExpiredFn = [InstType, &IsHazardInst](const MachineInstr &I, int) { |
1384 | auto InstType2 = IsHazardInst(I); |
1385 | if (InstType == InstType2) |
1386 | return true; |
1387 | |
1388 | return isStoreCountWaitZero(I); |
1389 | }; |
1390 | |
1391 | return ::getWaitStatesSince(IsHazard: IsHazardFn, MI: &I, IsExpired: IsExpiredFn) != |
1392 | std::numeric_limits<int>::max(); |
1393 | }; |
1394 | |
1395 | if (::getWaitStatesSince(IsHazard: IsHazardFn, MI, IsExpired: IsExpiredFn) == |
1396 | std::numeric_limits<int>::max()) |
1397 | return false; |
1398 | |
1399 | const SIInstrInfo *TII = ST.getInstrInfo(); |
1400 | BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(), |
1401 | MCID: TII->get(Opcode: AMDGPU::S_WAITCNT_VSCNT)) |
1402 | .addReg(RegNo: AMDGPU::SGPR_NULL, flags: RegState::Undef) |
1403 | .addImm(Val: 0); |
1404 | |
1405 | return true; |
1406 | } |
1407 | |
1408 | bool GCNHazardRecognizer::fixLdsDirectVALUHazard(MachineInstr *MI) { |
1409 | if (!SIInstrInfo::isLDSDIR(MI: *MI)) |
1410 | return false; |
1411 | |
1412 | const int NoHazardWaitStates = 15; |
1413 | const MachineOperand *VDST = TII.getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::vdst); |
1414 | const Register VDSTReg = VDST->getReg(); |
1415 | |
1416 | bool VisitedTrans = false; |
1417 | auto IsHazardFn = [this, VDSTReg, &VisitedTrans](const MachineInstr &I) { |
1418 | if (!SIInstrInfo::isVALU(MI: I)) |
1419 | return false; |
1420 | VisitedTrans = VisitedTrans || SIInstrInfo::isTRANS(MI: I); |
1421 | // Cover both WAR and WAW |
1422 | return I.readsRegister(Reg: VDSTReg, TRI: &TRI) || I.modifiesRegister(Reg: VDSTReg, TRI: &TRI); |
1423 | }; |
1424 | auto IsExpiredFn = [&](const MachineInstr &I, int WaitStates) { |
1425 | if (WaitStates >= NoHazardWaitStates) |
1426 | return true; |
    // Instructions which cause va_vdst==0 expire the hazard.
1428 | return SIInstrInfo::isVMEM(MI: I) || SIInstrInfo::isFLAT(MI: I) || |
1429 | SIInstrInfo::isDS(MI: I) || SIInstrInfo::isEXP(MI: I); |
1430 | }; |
1431 | auto GetWaitStatesFn = [](const MachineInstr &MI) { |
1432 | return SIInstrInfo::isVALU(MI) ? 1 : 0; |
1433 | }; |
1434 | |
1435 | DenseSet<const MachineBasicBlock *> Visited; |
1436 | auto Count = ::getWaitStatesSince(IsHazard: IsHazardFn, MBB: MI->getParent(), |
1437 | I: std::next(x: MI->getReverseIterator()), WaitStates: 0, |
1438 | IsExpired: IsExpiredFn, Visited, GetNumWaitStates: GetWaitStatesFn); |
1439 | |
1440 | // Transcendentals can execute in parallel to other VALUs. |
1441 | // This makes va_vdst count unusable with a mixture of VALU and TRANS. |
1442 | if (VisitedTrans) |
1443 | Count = 0; |
1444 | |
1445 | MachineOperand *WaitVdstOp = |
1446 | TII.getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::waitvdst); |
1447 | WaitVdstOp->setImm(std::min(a: Count, b: NoHazardWaitStates)); |
1448 | |
1449 | return true; |
1450 | } |
1451 | |
1452 | bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(MachineInstr *MI) { |
1453 | if (!SIInstrInfo::isLDSDIR(MI: *MI)) |
1454 | return false; |
1455 | |
1456 | const MachineOperand *VDST = TII.getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::vdst); |
1457 | const Register VDSTReg = VDST->getReg(); |
1458 | |
1459 | auto IsHazardFn = [this, VDSTReg](const MachineInstr &I) { |
1460 | if (!SIInstrInfo::isVMEM(MI: I) && !SIInstrInfo::isFLAT(MI: I) && |
1461 | !SIInstrInfo::isDS(MI: I)) |
1462 | return false; |
1463 | return I.readsRegister(Reg: VDSTReg, TRI: &TRI) || I.modifiesRegister(Reg: VDSTReg, TRI: &TRI); |
1464 | }; |
1465 | bool LdsdirCanWait = ST.hasLdsWaitVMSRC(); |
1466 | // TODO: On GFX12 the hazard should expire on S_WAIT_LOADCNT/SAMPLECNT/BVHCNT |
1467 | // according to the type of VMEM instruction. |
1468 | auto IsExpiredFn = [this, LdsdirCanWait](const MachineInstr &I, int) { |
1469 | return SIInstrInfo::isVALU(MI: I) || SIInstrInfo::isEXP(MI: I) || |
1470 | (I.getOpcode() == AMDGPU::S_WAITCNT && !I.getOperand(i: 0).getImm()) || |
1471 | (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR && |
1472 | AMDGPU::DepCtr::decodeFieldVmVsrc(Encoded: I.getOperand(i: 0).getImm()) == 0) || |
1473 | (LdsdirCanWait && SIInstrInfo::isLDSDIR(MI: I) && |
1474 | !TII.getNamedOperand(MI: I, OpName: AMDGPU::OpName::waitvsrc)->getImm()); |
1475 | }; |
1476 | |
1477 | if (::getWaitStatesSince(IsHazard: IsHazardFn, MI, IsExpired: IsExpiredFn) == |
1478 | std::numeric_limits<int>::max()) |
1479 | return false; |
1480 | |
1481 | if (LdsdirCanWait) { |
1482 | TII.getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::waitvsrc)->setImm(0); |
1483 | } else { |
1484 | BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(), |
1485 | MCID: TII.get(Opcode: AMDGPU::S_WAITCNT_DEPCTR)) |
1486 | .addImm(Val: AMDGPU::DepCtr::encodeFieldVmVsrc(VmVsrc: 0)); |
1487 | } |
1488 | |
1489 | return true; |
1490 | } |
1491 | |
1492 | bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(MachineInstr *MI) { |
1493 | if (!ST.hasVALUPartialForwardingHazard()) |
1494 | return false; |
1495 | assert(!ST.hasExtendedWaitCounts()); |
1496 | |
1497 | if (!ST.isWave64() || !SIInstrInfo::isVALU(MI: *MI)) |
1498 | return false; |
1499 | |
1500 | SmallSetVector<Register, 4> SrcVGPRs; |
1501 | |
1502 | for (const MachineOperand &Use : MI->explicit_uses()) { |
1503 | if (Use.isReg() && TRI.isVGPR(MRI: MF.getRegInfo(), Reg: Use.getReg())) |
1504 | SrcVGPRs.insert(X: Use.getReg()); |
1505 | } |
1506 | |
1507 | // Only applies with >= 2 unique VGPR sources |
1508 | if (SrcVGPRs.size() <= 1) |
1509 | return false; |
1510 | |
1511 | // Look for the following pattern: |
1512 | // Va <- VALU [PreExecPos] |
1513 | // intv1 |
1514 | // Exec <- SALU [ExecPos] |
1515 | // intv2 |
1516 | // Vb <- VALU [PostExecPos] |
1517 | // intv3 |
1518 | // MI Va, Vb (WaitState = 0) |
1519 | // |
1520 | // Where: |
1521 | // intv1 + intv2 <= 2 VALUs |
1522 | // intv3 <= 4 VALUs |
1523 | // |
1524 | // If found, insert an appropriate S_WAITCNT_DEPCTR before MI. |
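  //
  // An illustrative (hypothetical) wave64 sequence matching the pattern:
  //   v_mov_b32 v0, 1.0        ; Va <- VALU
  //   s_mov_b64 exec, s[2:3]   ; Exec <- SALU
  //   v_mov_b32 v1, 2.0        ; Vb <- VALU
  //   v_add_f32 v2, v0, v1     ; MI reads both Va and Vb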
1525 | |
1526 | const int Intv1plus2MaxVALUs = 2; |
1527 | const int Intv3MaxVALUs = 4; |
1528 | const int IntvMaxVALUs = 6; |
1529 | const int NoHazardVALUWaitStates = IntvMaxVALUs + 2; |
1530 | |
1531 | struct StateType { |
1532 | SmallDenseMap<Register, int, 4> DefPos; |
1533 | int ExecPos = std::numeric_limits<int>::max(); |
1534 | int VALUs = 0; |
1535 | }; |
1536 | |
1537 | StateType State; |
1538 | |
1539 | // This overloads expiry testing with all the hazard detection |
1540 | auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) { |
1541 | // Too many VALU states have passed |
1542 | if (State.VALUs > NoHazardVALUWaitStates) |
1543 | return HazardExpired; |
1544 | |
1545 | // Instructions which cause va_vdst==0 expire hazard |
1546 | if (SIInstrInfo::isVMEM(MI: I) || SIInstrInfo::isFLAT(MI: I) || |
1547 | SIInstrInfo::isDS(MI: I) || SIInstrInfo::isEXP(MI: I) || |
1548 | (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR && |
1549 | AMDGPU::DepCtr::decodeFieldVaVdst(Encoded: I.getOperand(i: 0).getImm()) == 0)) |
1550 | return HazardExpired; |
1551 | |
    // Track register writes
1553 | bool Changed = false; |
1554 | if (SIInstrInfo::isVALU(MI: I)) { |
1555 | for (Register Src : SrcVGPRs) { |
1556 | if (!State.DefPos.count(Val: Src) && I.modifiesRegister(Reg: Src, TRI: &TRI)) { |
1557 | State.DefPos[Src] = State.VALUs; |
1558 | Changed = true; |
1559 | } |
1560 | } |
1561 | } else if (SIInstrInfo::isSALU(MI: I)) { |
1562 | if (State.ExecPos == std::numeric_limits<int>::max()) { |
1563 | if (!State.DefPos.empty() && I.modifiesRegister(Reg: AMDGPU::EXEC, TRI: &TRI)) { |
1564 | State.ExecPos = State.VALUs; |
1565 | Changed = true; |
1566 | } |
1567 | } |
1568 | } |
1569 | |
1570 | // Early expiration: too many VALUs in intv3 |
1571 | if (State.VALUs > Intv3MaxVALUs && State.DefPos.empty()) |
1572 | return HazardExpired; |
1573 | |
1574 | // Only evaluate state if something changed |
1575 | if (!Changed) |
1576 | return NoHazardFound; |
1577 | |
1578 | // Determine positions of VALUs pre/post exec change |
1579 | if (State.ExecPos == std::numeric_limits<int>::max()) |
1580 | return NoHazardFound; |
1581 | |
1582 | int PreExecPos = std::numeric_limits<int>::max(); |
1583 | int PostExecPos = std::numeric_limits<int>::max(); |
1584 | |
1585 | for (auto Entry : State.DefPos) { |
1586 | int DefVALUs = Entry.second; |
1587 | if (DefVALUs != std::numeric_limits<int>::max()) { |
1588 | if (DefVALUs >= State.ExecPos) |
1589 | PreExecPos = std::min(a: PreExecPos, b: DefVALUs); |
1590 | else |
1591 | PostExecPos = std::min(a: PostExecPos, b: DefVALUs); |
1592 | } |
1593 | } |
1594 | |
    // Need a VALU def post exec change
1596 | if (PostExecPos == std::numeric_limits<int>::max()) |
1597 | return NoHazardFound; |
1598 | |
1599 | // Too many VALUs in intv3? |
1600 | int Intv3VALUs = PostExecPos; |
1601 | if (Intv3VALUs > Intv3MaxVALUs) |
1602 | return HazardExpired; |
1603 | |
1604 | // Too many VALUs in intv2? |
1605 | int Intv2VALUs = (State.ExecPos - PostExecPos) - 1; |
1606 | if (Intv2VALUs > Intv1plus2MaxVALUs) |
1607 | return HazardExpired; |
1608 | |
    // Need a VALU def pre exec change
1610 | if (PreExecPos == std::numeric_limits<int>::max()) |
1611 | return NoHazardFound; |
1612 | |
1613 | // Too many VALUs in intv1? |
1614 | int Intv1VALUs = PreExecPos - State.ExecPos; |
1615 | if (Intv1VALUs > Intv1plus2MaxVALUs) |
1616 | return HazardExpired; |
1617 | |
    // Too many VALUs in intv1 + intv2?
1619 | if (Intv1VALUs + Intv2VALUs > Intv1plus2MaxVALUs) |
1620 | return HazardExpired; |
1621 | |
1622 | return HazardFound; |
1623 | }; |
1624 | auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) { |
1625 | if (SIInstrInfo::isVALU(MI)) |
1626 | State.VALUs += 1; |
1627 | }; |
1628 | |
1629 | DenseSet<const MachineBasicBlock *> Visited; |
1630 | if (!hasHazard<StateType>(State, IsHazard: IsHazardFn, UpdateState: UpdateStateFn, MBB: MI->getParent(), |
1631 | I: std::next(x: MI->getReverseIterator()), Visited)) |
1632 | return false; |
1633 | |
1634 | BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(), |
1635 | MCID: TII.get(Opcode: AMDGPU::S_WAITCNT_DEPCTR)) |
1636 | .addImm(Val: 0x0fff); |
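  // Note: an immediate of 0x0fff sets va_vdst to 0 while leaving the other
  // depctr fields at their no-wait values - the condition the expiry checks
  // above treat as clearing the hazard.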
1637 | |
1638 | return true; |
1639 | } |
1640 | |
1641 | bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) { |
1642 | if (!ST.hasVALUTransUseHazard()) |
1643 | return false; |
1644 | assert(!ST.hasExtendedWaitCounts()); |
1645 | |
1646 | if (!SIInstrInfo::isVALU(MI: *MI)) |
1647 | return false; |
1648 | |
1649 | SmallSet<Register, 4> SrcVGPRs; |
1650 | |
1651 | for (const MachineOperand &Use : MI->explicit_uses()) { |
1652 | if (Use.isReg() && TRI.isVGPR(MRI: MF.getRegInfo(), Reg: Use.getReg())) |
1653 | SrcVGPRs.insert(V: Use.getReg()); |
1654 | } |
1655 | |
1656 | // Look for the following pattern: |
1657 | // Va <- TRANS VALU |
1658 | // intv |
1659 | // MI Va (WaitState = 0) |
1660 | // |
1661 | // Where: |
1662 | // intv <= 5 VALUs / 1 TRANS |
1663 | // |
1664 | // If found, insert an appropriate S_WAITCNT_DEPCTR before MI. |
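  //
  // An illustrative (hypothetical) sequence matching the pattern:
  //   v_exp_f32 v0, v1        ; Va <- TRANS VALU
  //   ...                     ; at most 5 VALUs / 1 TRANS in between
  //   v_add_f32 v2, v0, v3    ; MI reads Va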
1665 | |
1666 | const int IntvMaxVALUs = 5; |
1667 | const int IntvMaxTRANS = 1; |
1668 | |
1669 | struct StateType { |
1670 | int VALUs = 0; |
1671 | int TRANS = 0; |
1672 | }; |
1673 | |
1674 | StateType State; |
1675 | |
1676 | // This overloads expiry testing with all the hazard detection |
1677 | auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) { |
1678 | // Too many VALU states have passed |
1679 | if (State.VALUs > IntvMaxVALUs || State.TRANS > IntvMaxTRANS) |
1680 | return HazardExpired; |
1681 | |
1682 | // Instructions which cause va_vdst==0 expire hazard |
1683 | if (SIInstrInfo::isVMEM(MI: I) || SIInstrInfo::isFLAT(MI: I) || |
1684 | SIInstrInfo::isDS(MI: I) || SIInstrInfo::isEXP(MI: I) || |
1685 | (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR && |
1686 | I.getOperand(i: 0).getImm() == 0x0fff)) |
1687 | return HazardExpired; |
1688 | |
    // Track register writes
1690 | if (SIInstrInfo::isTRANS(MI: I)) { |
1691 | for (Register Src : SrcVGPRs) { |
1692 | if (I.modifiesRegister(Reg: Src, TRI: &TRI)) { |
1693 | return HazardFound; |
1694 | } |
1695 | } |
1696 | } |
1697 | |
1698 | return NoHazardFound; |
1699 | }; |
1700 | auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) { |
1701 | if (SIInstrInfo::isVALU(MI)) |
1702 | State.VALUs += 1; |
1703 | if (SIInstrInfo::isTRANS(MI)) |
1704 | State.TRANS += 1; |
1705 | }; |
1706 | |
1707 | DenseSet<const MachineBasicBlock *> Visited; |
1708 | if (!hasHazard<StateType>(State, IsHazard: IsHazardFn, UpdateState: UpdateStateFn, MBB: MI->getParent(), |
1709 | I: std::next(x: MI->getReverseIterator()), Visited)) |
1710 | return false; |
1711 | |
  // Hazard is observed - insert a wait on the va_vdst counter to ensure the
  // hazard is avoided.
1714 | BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(), |
1715 | MCID: TII.get(Opcode: AMDGPU::S_WAITCNT_DEPCTR)) |
1716 | .addImm(Val: AMDGPU::DepCtr::encodeFieldVaVdst(VaVdst: 0)); |
1717 | |
1718 | return true; |
1719 | } |
1720 | |
1721 | bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) { |
1722 | if (!SIInstrInfo::isWMMA(MI: *MI) && !SIInstrInfo::isSWMMAC(MI: *MI)) |
1723 | return false; |
1724 | |
1725 | const SIInstrInfo *TII = ST.getInstrInfo(); |
1726 | const SIRegisterInfo *TRI = ST.getRegisterInfo(); |
1727 | |
1728 | auto IsHazardFn = [MI, TII, TRI, this](const MachineInstr &I) { |
1729 | if (!SIInstrInfo::isWMMA(MI: I) && !SIInstrInfo::isSWMMAC(MI: I)) |
1730 | return false; |
1731 | |
    // Src0 (matrix A) or Src1 (matrix B) of the current wmma instruction
    // overlaps with the dest (matrix D) of the previous wmma.
1734 | const Register CurSrc0Reg = |
1735 | TII->getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::src0)->getReg(); |
1736 | const Register CurSrc1Reg = |
1737 | TII->getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::src1)->getReg(); |
1738 | |
1739 | const Register PrevDstReg = |
1740 | TII->getNamedOperand(MI: I, OpName: AMDGPU::OpName::vdst)->getReg(); |
1741 | |
1742 | if (TRI->regsOverlap(RegA: PrevDstReg, RegB: CurSrc0Reg) || |
1743 | TRI->regsOverlap(RegA: PrevDstReg, RegB: CurSrc1Reg)) { |
1744 | return true; |
1745 | } |
1746 | |
1747 | // GFX12+ allows overlap of matrix C with PrevDstReg (hardware will stall) |
1748 | // but Index can't overlap with PrevDstReg. |
1749 | if (AMDGPU::isGFX12Plus(STI: ST)) { |
1750 | if (SIInstrInfo::isSWMMAC(MI: *MI)) { |
1751 | const Register CurIndex = |
1752 | TII->getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::src2)->getReg(); |
1753 | if (TRI->regsOverlap(RegA: PrevDstReg, RegB: CurIndex)) |
1754 | return true; |
1755 | } |
1756 | return false; |
1757 | } |
1758 | |
1759 | return false; |
1760 | }; |
1761 | |
1762 | auto IsExpiredFn = [](const MachineInstr &I, int) { |
1763 | return SIInstrInfo::isVALU(MI: I); |
1764 | }; |
1765 | |
1766 | if (::getWaitStatesSince(IsHazard: IsHazardFn, MI, IsExpired: IsExpiredFn) == |
1767 | std::numeric_limits<int>::max()) |
1768 | return false; |
1769 | |
1770 | BuildMI(BB&: *MI->getParent(), I: MI, MIMD: MI->getDebugLoc(), MCID: TII->get(Opcode: AMDGPU::V_NOP_e32)); |
1771 | |
1772 | return true; |
1773 | } |
1774 | |
1775 | bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) { |
1776 | if (!ST.hasShift64HighRegBug()) |
1777 | return false; |
1778 | assert(!ST.hasExtendedWaitCounts()); |
1779 | |
1780 | switch (MI->getOpcode()) { |
1781 | default: |
1782 | return false; |
1783 | case AMDGPU::V_LSHLREV_B64_e64: |
1784 | case AMDGPU::V_LSHRREV_B64_e64: |
1785 | case AMDGPU::V_ASHRREV_I64_e64: |
1786 | break; |
1787 | } |
1788 | |
1789 | MachineOperand *Amt = TII.getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::src0); |
1790 | if (!Amt->isReg()) |
1791 | return false; |
1792 | |
1793 | Register AmtReg = Amt->getReg(); |
1794 | const MachineRegisterInfo &MRI = MF.getRegInfo(); |
  // Check if this is the last VGPR in an 8-VGPR allocation block (v7, v15, ...).
1796 | if (!TRI.isVGPR(MRI, Reg: AmtReg) || ((AmtReg - AMDGPU::VGPR0) & 7) != 7) |
1797 | return false; |
1798 | |
1799 | if (AmtReg != AMDGPU::VGPR255 && MRI.isPhysRegUsed(PhysReg: AmtReg + 1)) |
1800 | return false; |
1801 | |
1802 | MachineOperand *Src1 = TII.getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::src1); |
1803 | bool OverlappedSrc = Src1->isReg() && TRI.regsOverlap(RegA: Src1->getReg(), RegB: AmtReg); |
1804 | bool OverlappedDst = MI->modifiesRegister(Reg: AmtReg, TRI: &TRI); |
1805 | bool Overlapped = OverlappedSrc || OverlappedDst; |
1806 | |
1807 | assert(!OverlappedDst || !OverlappedSrc || |
1808 | Src1->getReg() == MI->getOperand(0).getReg()); |
1809 | assert(ST.needsAlignedVGPRs()); |
1810 | static_assert(AMDGPU::VGPR0 + 1 == AMDGPU::VGPR1); |
1811 | |
1812 | Register NewReg; |
1813 | for (MCRegister Reg : Overlapped ? AMDGPU::VReg_64_Align2RegClass |
1814 | : AMDGPU::VGPR_32RegClass) { |
1815 | if (!MI->modifiesRegister(Reg, TRI: &TRI) && !MI->readsRegister(Reg, TRI: &TRI)) { |
1816 | NewReg = Reg; |
1817 | break; |
1818 | } |
1819 | } |
1820 | |
1821 | Register NewAmt = Overlapped ? (Register)TRI.getSubReg(Reg: NewReg, Idx: AMDGPU::sub1) |
1822 | : NewReg; |
1823 | Register NewAmtLo; |
1824 | |
1825 | if (Overlapped) |
1826 | NewAmtLo = TRI.getSubReg(Reg: NewReg, Idx: AMDGPU::sub0); |
1827 | |
1828 | DebugLoc DL = MI->getDebugLoc(); |
1829 | MachineBasicBlock *MBB = MI->getParent(); |
  // Insert a full wait count because the found register might have a pending wait.
1831 | BuildMI(BB&: *MBB, I: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_WAITCNT)) |
1832 | .addImm(Val: 0); |
1833 | |
1834 | // Insert V_SWAP_B32 instruction(s) and run hazard recognizer on them. |
1835 | if (Overlapped) |
1836 | runOnInstruction( |
1837 | MI: BuildMI(BB&: *MBB, I: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_SWAP_B32), DestReg: NewAmtLo) |
1838 | .addDef(RegNo: AmtReg - 1) |
1839 | .addReg(RegNo: AmtReg - 1, flags: RegState::Undef) |
1840 | .addReg(RegNo: NewAmtLo, flags: RegState::Undef)); |
1841 | runOnInstruction(MI: BuildMI(BB&: *MBB, I: MI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_SWAP_B32), DestReg: NewAmt) |
1842 | .addDef(RegNo: AmtReg) |
1843 | .addReg(RegNo: AmtReg, flags: RegState::Undef) |
1844 | .addReg(RegNo: NewAmt, flags: RegState::Undef)); |
1845 | |
1846 | // Instructions emitted after the current instruction will be processed by the |
1847 | // parent loop of the hazard recognizer in a natural way. |
1848 | BuildMI(BB&: *MBB, I: std::next(x: MI->getIterator()), MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_SWAP_B32), |
1849 | DestReg: AmtReg) |
1850 | .addDef(RegNo: NewAmt) |
1851 | .addReg(RegNo: NewAmt) |
1852 | .addReg(RegNo: AmtReg); |
1853 | if (Overlapped) |
1854 | BuildMI(BB&: *MBB, I: std::next(x: MI->getIterator()), MIMD: DL, MCID: TII.get(Opcode: AMDGPU::V_SWAP_B32), |
1855 | DestReg: AmtReg - 1) |
1856 | .addDef(RegNo: NewAmtLo) |
1857 | .addReg(RegNo: NewAmtLo) |
1858 | .addReg(RegNo: AmtReg - 1); |
1859 | |
  // Re-running the hazard recognizer on the modified instruction is not
  // necessary: the inserted V_SWAP_B32 already both reads and writes the new
  // registers, so hazards related to these registers have already been
  // handled.
1863 | Amt->setReg(NewAmt); |
1864 | Amt->setIsKill(false); |
1865 | // We do not update liveness, so verifier may see it as undef. |
1866 | Amt->setIsUndef(); |
1867 | if (OverlappedDst) |
1868 | MI->getOperand(i: 0).setReg(NewReg); |
1869 | if (OverlappedSrc) { |
1870 | Src1->setReg(NewReg); |
1871 | Src1->setIsKill(false); |
1872 | Src1->setIsUndef(); |
1873 | } |
1874 | |
1875 | return true; |
1876 | } |
1877 | |
1878 | int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) { |
1879 | int NSAtoVMEMWaitStates = 1; |
1880 | |
1881 | if (!ST.hasNSAtoVMEMBug()) |
1882 | return 0; |
1883 | |
1884 | if (!SIInstrInfo::isMUBUF(MI: *MI) && !SIInstrInfo::isMTBUF(MI: *MI)) |
1885 | return 0; |
1886 | |
1887 | const SIInstrInfo *TII = ST.getInstrInfo(); |
1888 | const auto *Offset = TII->getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::offset); |
1889 | if (!Offset || (Offset->getImm() & 6) == 0) |
1890 | return 0; |
1891 | |
1892 | auto IsHazardFn = [TII](const MachineInstr &I) { |
1893 | if (!SIInstrInfo::isMIMG(MI: I)) |
1894 | return false; |
1895 | const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc: I.getOpcode()); |
1896 | return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA && |
1897 | TII->getInstSizeInBytes(MI: I) >= 16; |
1898 | }; |
1899 | |
1900 | return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazard: IsHazardFn, Limit: 1); |
1901 | } |
1902 | |
1903 | int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) { |
1904 | int FPAtomicToDenormModeWaitStates = 3; |
1905 | |
1906 | if (!ST.hasFPAtomicToDenormModeHazard()) |
1907 | return 0; |
1908 | assert(!ST.hasExtendedWaitCounts()); |
1909 | |
1910 | if (MI->getOpcode() != AMDGPU::S_DENORM_MODE) |
1911 | return 0; |
1912 | |
1913 | auto IsHazardFn = [](const MachineInstr &I) { |
1914 | if (!SIInstrInfo::isVMEM(MI: I) && !SIInstrInfo::isFLAT(MI: I)) |
1915 | return false; |
1916 | return SIInstrInfo::isFPAtomic(MI: I); |
1917 | }; |
1918 | |
1919 | auto IsExpiredFn = [](const MachineInstr &MI, int WaitStates) { |
1920 | if (WaitStates >= 3 || SIInstrInfo::isVALU(MI)) |
1921 | return true; |
1922 | |
1923 | switch (MI.getOpcode()) { |
1924 | case AMDGPU::S_WAITCNT: |
1925 | case AMDGPU::S_WAITCNT_VSCNT: |
1926 | case AMDGPU::S_WAITCNT_VMCNT: |
1927 | case AMDGPU::S_WAITCNT_EXPCNT: |
1928 | case AMDGPU::S_WAITCNT_LGKMCNT: |
1929 | case AMDGPU::S_WAIT_IDLE: |
1930 | return true; |
1931 | default: |
1932 | break; |
1933 | } |
1934 | |
1935 | return false; |
1936 | }; |
1937 | |
1938 | return FPAtomicToDenormModeWaitStates - |
1939 | ::getWaitStatesSince(IsHazard: IsHazardFn, MI, IsExpired: IsExpiredFn); |
1940 | } |
1941 | |
1942 | int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) { |
1943 | assert(SIInstrInfo::isMAI(*MI)); |
1944 | |
1945 | return ST.hasGFX90AInsts() ? checkMAIHazards90A(MI) : checkMAIHazards908(MI); |
1946 | } |
1947 | |
1948 | int GCNHazardRecognizer::checkMFMAPadding(MachineInstr *MI) { |
1949 | // Early exit if no padding is requested. |
1950 | if (MFMAPaddingRatio == 0) |
1951 | return 0; |
1952 | |
1953 | const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); |
1954 | if (!SIInstrInfo::isMFMA(MI: *MI) || MFI->getOccupancy() < 2) |
1955 | return 0; |
1956 | |
1957 | int NeighborMFMALatency = 0; |
1958 | auto IsNeighboringMFMA = [&NeighborMFMALatency, |
1959 | this](const MachineInstr &MI) { |
1960 | if (!SIInstrInfo::isMFMA(MI)) |
1961 | return false; |
1962 | |
1963 | NeighborMFMALatency = this->getMFMAPipelineWaitStates(MI); |
1964 | return true; |
1965 | }; |
1966 | |
1967 | const int MaxMFMAPipelineWaitStates = 16; |
1968 | int WaitStatesSinceNeighborMFMA = |
1969 | getWaitStatesSince(IsHazard: IsNeighboringMFMA, Limit: MaxMFMAPipelineWaitStates); |
1970 | |
1971 | int NeighborMFMAPaddingNeeded = |
1972 | (NeighborMFMALatency * MFMAPaddingRatio / 100) - |
1973 | WaitStatesSinceNeighborMFMA; |
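  // For example (illustrative numbers): with a neighboring MFMA latency of 16,
  // a padding ratio of 50, and 3 wait states already elapsed, this requests
  // 16 * 50 / 100 - 3 = 5 additional wait states of padding.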
1974 | |
1975 | return std::max(a: 0, b: NeighborMFMAPaddingNeeded); |
1976 | } |
1977 | |
1978 | int GCNHazardRecognizer::checkMAIHazards908(MachineInstr *MI) { |
1979 | int WaitStatesNeeded = 0; |
1980 | unsigned Opc = MI->getOpcode(); |
1981 | |
1982 | auto IsVALUFn = [](const MachineInstr &MI) { |
1983 | return SIInstrInfo::isVALU(MI) || MI.isInlineAsm(); |
1984 | }; |
1985 | |
1986 | if (Opc != AMDGPU::V_ACCVGPR_READ_B32_e64) { // MFMA or v_accvgpr_write |
1987 | const int LegacyVALUWritesVGPRWaitStates = 2; |
1988 | const int VALUWritesExecWaitStates = 4; |
1989 | const int MaxWaitStates = 4; |
1990 | |
1991 | int WaitStatesNeededForUse = VALUWritesExecWaitStates - |
1992 | getWaitStatesSinceDef(Reg: AMDGPU::EXEC, IsHazardDef: IsVALUFn, Limit: MaxWaitStates); |
1993 | WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse); |
1994 | |
1995 | if (WaitStatesNeeded < MaxWaitStates) { |
1996 | for (const MachineOperand &Use : MI->explicit_uses()) { |
1997 | const int MaxWaitStates = 2; |
1998 | |
1999 | if (!Use.isReg() || !TRI.isVGPR(MRI: MF.getRegInfo(), Reg: Use.getReg())) |
2000 | continue; |
2001 | |
2002 | int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates - |
2003 | getWaitStatesSinceDef(Reg: Use.getReg(), IsHazardDef: IsVALUFn, Limit: MaxWaitStates); |
2004 | WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse); |
2005 | |
2006 | if (WaitStatesNeeded == MaxWaitStates) |
2007 | break; |
2008 | } |
2009 | } |
2010 | } |
2011 | |
2012 | for (const MachineOperand &Op : MI->explicit_operands()) { |
2013 | if (!Op.isReg() || !TRI.isAGPR(MRI: MF.getRegInfo(), Reg: Op.getReg())) |
2014 | continue; |
2015 | |
2016 | if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32_e64) |
2017 | continue; |
2018 | |
2019 | const int MFMAWritesAGPROverlappedSrcABWaitStates = 4; |
2020 | const int MFMAWritesAGPROverlappedSrcCWaitStates = 2; |
2021 | const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4; |
2022 | const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10; |
2023 | const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18; |
2024 | const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1; |
2025 | const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7; |
2026 | const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15; |
2027 | const int MaxWaitStates = 18; |
2028 | Register Reg = Op.getReg(); |
2029 | unsigned HazardDefLatency = 0; |
2030 | |
2031 | auto IsOverlappedMFMAFn = [Reg, &HazardDefLatency, |
2032 | this](const MachineInstr &MI) { |
2033 | if (!SIInstrInfo::isMFMA(MI)) |
2034 | return false; |
2035 | Register DstReg = MI.getOperand(i: 0).getReg(); |
2036 | if (DstReg == Reg) |
2037 | return false; |
2038 | HazardDefLatency = |
2039 | std::max(a: HazardDefLatency, b: TSchedModel.computeInstrLatency(MI: &MI)); |
2040 | return TRI.regsOverlap(RegA: DstReg, RegB: Reg); |
2041 | }; |
2042 | |
2043 | int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsHazardDef: IsOverlappedMFMAFn, |
2044 | Limit: MaxWaitStates); |
2045 | int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates; |
2046 | int SrcCIdx = AMDGPU::getNamedOperandIdx(Opcode: Opc, NamedIdx: AMDGPU::OpName::src2); |
2047 | int OpNo = Op.getOperandNo(); |
2048 | if (OpNo == SrcCIdx) { |
2049 | NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates; |
2050 | } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) { |
2051 | switch (HazardDefLatency) { |
2052 | case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates; |
2053 | break; |
2054 | case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates; |
2055 | break; |
2056 | case 16: [[fallthrough]]; |
2057 | default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates; |
2058 | break; |
2059 | } |
2060 | } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) { |
2061 | switch (HazardDefLatency) { |
2062 | case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates; |
2063 | break; |
2064 | case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates; |
2065 | break; |
2066 | case 16: [[fallthrough]]; |
2067 | default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates; |
2068 | break; |
2069 | } |
2070 | } |
2071 | |
2072 | int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef; |
2073 | WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse); |
2074 | |
2075 | if (WaitStatesNeeded == MaxWaitStates) |
2076 | return WaitStatesNeeded; // Early exit. |
2077 | |
2078 | auto IsAccVgprWriteFn = [Reg, this](const MachineInstr &MI) { |
2079 | if (MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64) |
2080 | return false; |
2081 | Register DstReg = MI.getOperand(i: 0).getReg(); |
2082 | return TRI.regsOverlap(RegA: Reg, RegB: DstReg); |
2083 | }; |
2084 | |
2085 | const int AccVGPRWriteMFMAReadSrcCWaitStates = 1; |
2086 | const int AccVGPRWriteMFMAReadSrcABWaitStates = 3; |
2087 | const int AccVGPRWriteAccVgprReadWaitStates = 3; |
2088 | NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates; |
2089 | if (OpNo == SrcCIdx) |
2090 | NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates; |
2091 | else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) |
2092 | NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates; |
2093 | |
2094 | WaitStatesNeededForUse = NeedWaitStates - |
2095 | getWaitStatesSinceDef(Reg, IsHazardDef: IsAccVgprWriteFn, Limit: MaxWaitStates); |
2096 | WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse); |
2097 | |
2098 | if (WaitStatesNeeded == MaxWaitStates) |
2099 | return WaitStatesNeeded; // Early exit. |
2100 | } |
2101 | |
2102 | if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) { |
2103 | const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0; |
2104 | const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5; |
2105 | const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13; |
2106 | const int MaxWaitStates = 13; |
2107 | Register DstReg = MI->getOperand(i: 0).getReg(); |
2108 | unsigned HazardDefLatency = 0; |
2109 | |
2110 | auto IsSrcCMFMAFn = [DstReg, &HazardDefLatency, |
2111 | this](const MachineInstr &MI) { |
2112 | if (!SIInstrInfo::isMFMA(MI)) |
2113 | return false; |
2114 | Register Reg = TII.getNamedOperand(MI, OpName: AMDGPU::OpName::src2)->getReg(); |
2115 | HazardDefLatency = |
2116 | std::max(a: HazardDefLatency, b: TSchedModel.computeInstrLatency(MI: &MI)); |
2117 | return TRI.regsOverlap(RegA: Reg, RegB: DstReg); |
2118 | }; |
2119 | |
2120 | int WaitStatesSince = getWaitStatesSince(IsHazard: IsSrcCMFMAFn, Limit: MaxWaitStates); |
2121 | int NeedWaitStates; |
2122 | switch (HazardDefLatency) { |
2123 | case 2: NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates; |
2124 | break; |
2125 | case 8: NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates; |
2126 | break; |
2127 | case 16: [[fallthrough]]; |
2128 | default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates; |
2129 | break; |
2130 | } |
2131 | |
2132 | int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince; |
2133 | WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse); |
2134 | } |
2135 | |
2136 | // Pad neighboring MFMA with noops for better inter-wave performance. |
2137 | WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: checkMFMAPadding(MI)); |
2138 | |
2139 | return WaitStatesNeeded; |
2140 | } |
2141 | |
2142 | static int |
2143 | GFX940_XDL_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(int NumPasses) { |
2144 | // 2 pass -> 3 |
2145 | // 4 pass -> 5 |
2146 | // 8 pass -> 9 |
2147 | // 16 pass -> 17 |
2148 | return NumPasses + 1; |
2149 | } |
2150 | |
2151 | static int |
2152 | GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(int NumPasses) { |
2153 | // 2 pass -> 2 |
2154 | // 4 pass -> 4 |
2155 | // 8 pass -> 8 |
2156 | // 16 pass -> 16 |
2157 | return NumPasses; |
2158 | } |
2159 | |
2160 | static int |
2161 | GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses) { |
2162 | // 2 pass -> 4 |
2163 | // 4 pass -> 6 |
2164 | // 8 pass -> 10 |
2165 | // 16 pass -> 18 |
2166 | return NumPasses + 2; |
2167 | } |
2168 | |
2169 | static int GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses) { |
2170 | // 2 pass -> 5 |
2171 | // 4 pass -> 7 |
2172 | // 8 pass -> 11 |
2173 | // 16 pass -> 19 |
2174 | return NumPasses + 3; |
2175 | } |
2176 | |
2177 | int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) { |
2178 | int WaitStatesNeeded = 0; |
2179 | unsigned Opc = MI->getOpcode(); |
2180 | |
2181 | auto IsLegacyVALUFn = [](const MachineInstr &MI) { |
2182 | return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI); |
2183 | }; |
2184 | |
2185 | auto IsLegacyVALUNotDotFn = [](const MachineInstr &MI) { |
2186 | return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI) && |
2187 | !SIInstrInfo::isDOT(MI); |
2188 | }; |
2189 | |
2190 | if (!SIInstrInfo::isMFMA(MI: *MI)) |
2191 | return WaitStatesNeeded; |
2192 | |
2193 | const int VALUWritesExecWaitStates = 4; |
2194 | int WaitStatesNeededForUse = VALUWritesExecWaitStates - |
2195 | getWaitStatesSinceDef(Reg: AMDGPU::EXEC, IsHazardDef: IsLegacyVALUFn, |
2196 | Limit: VALUWritesExecWaitStates); |
2197 | WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse); |
2198 | |
2199 | int SrcCIdx = AMDGPU::getNamedOperandIdx(Opcode: Opc, NamedIdx: AMDGPU::OpName::src2); |
2200 | |
2201 | // Loop for both DGEMM and S/HGEMM 2nd instruction. |
2202 | for (const MachineOperand &Use : MI->explicit_uses()) { |
2203 | const int LegacyVALUNotDotWritesVGPRWaitStates = 2; |
2204 | const int SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates = 2; |
2205 | const int SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates = 8; |
2206 | const int SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates = 16; |
2207 | const int SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates = 3; |
2208 | const int SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates = 9; |
2209 | const int SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates = 17; |
2210 | const int DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 9; |
2211 | const int DMFMA4x4WritesVGPROverlappedSrcCWaitStates = 4; |
2212 | const int SMFMA4x4WritesVGPROverlappedSrcABWaitStates = 5; |
2213 | const int SMFMA16x16WritesVGPROverlappedSrcABWaitStates = 11; |
2214 | const int SMFMA32x32WritesVGPROverlappedSrcABWaitStates = 19; |
2215 | const int DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates = 6; |
2216 | const int DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 11; |
2217 | const int DMFMA4x4WritesVGPRFullSrcCWaitStates = 4; |
2218 | const int GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates = 2; |
2219 | const int MaxWaitStates = 19; |
2220 | |
2221 | if (!Use.isReg()) |
2222 | continue; |
2223 | Register Reg = Use.getReg(); |
2224 | bool FullReg; |
2225 | const MachineInstr *MI1; |
2226 | |
2227 | auto IsOverlappedMFMAFn = [Reg, &FullReg, &MI1, |
2228 | this](const MachineInstr &MI) { |
2229 | if (!SIInstrInfo::isMFMA(MI)) |
2230 | return false; |
2231 | Register DstReg = MI.getOperand(i: 0).getReg(); |
2232 | FullReg = (DstReg == Reg); |
2233 | MI1 = &MI; |
2234 | return TRI.regsOverlap(RegA: DstReg, RegB: Reg); |
2235 | }; |
2236 | |
2237 | WaitStatesNeededForUse = LegacyVALUNotDotWritesVGPRWaitStates - |
2238 | getWaitStatesSinceDef(Reg, IsHazardDef: IsLegacyVALUNotDotFn, Limit: MaxWaitStates); |
2239 | WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse); |
2240 | |
2241 | int NumWaitStates = |
2242 | getWaitStatesSinceDef(Reg, IsHazardDef: IsOverlappedMFMAFn, Limit: MaxWaitStates); |
2243 | if (NumWaitStates == std::numeric_limits<int>::max()) |
2244 | continue; |
2245 | |
2246 | int OpNo = Use.getOperandNo(); |
2247 | unsigned Opc1 = MI1->getOpcode(); |
2248 | int NeedWaitStates = 0; |
2249 | if (OpNo == SrcCIdx) { |
2250 | if (!isDGEMM(Opcode: Opc) && (!ST.hasGFX940Insts() && isDGEMM(Opcode: Opc1))) { |
2251 | NeedWaitStates = 0; |
2252 | } else if (FullReg) { |
2253 | if ((Opc == AMDGPU::V_MFMA_F64_4X4X4F64_e64 || |
2254 | Opc == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64) && |
2255 | (Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_e64 || |
2256 | Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64)) |
2257 | NeedWaitStates = DMFMA4x4WritesVGPRFullSrcCWaitStates; |
2258 | else if (ST.hasGFX940Insts() && |
2259 | TSchedModel.computeInstrLatency(MI: MI1) == 2) |
2260 | NeedWaitStates = GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates; |
2261 | } else { |
2262 | switch (Opc1) { |
2263 | case AMDGPU::V_MFMA_F64_16X16X4F64_e64: |
2264 | case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64: |
2265 | case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64: |
2266 | case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64: |
2267 | if (!isXDL(ST, MI: *MI)) |
2268 | NeedWaitStates = DMFMA16x16WritesVGPROverlappedSrcCWaitStates; |
2269 | break; |
2270 | case AMDGPU::V_MFMA_F64_4X4X4F64_e64: |
2271 | case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64: |
2272 | if (!isXDL(ST, MI: *MI)) |
2273 | NeedWaitStates = DMFMA4x4WritesVGPROverlappedSrcCWaitStates; |
2274 | break; |
2275 | default: |
2276 | int NumPasses = TSchedModel.computeInstrLatency(MI: MI1); |
2277 | if (ST.hasGFX940Insts()) { |
2278 | if (isXDL(ST, MI: *MI) && !isXDL(ST, MI: *MI1)) |
2279 | break; |
2280 | |
2281 | NeedWaitStates = |
2282 | isXDL(ST, MI: *MI1) |
2283 | ? GFX940_XDL_N_PassWritesVGPROverlappedSMFMASrcCWaitStates( |
2284 | NumPasses) |
2285 | : GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates( |
2286 | NumPasses); |
2287 | break; |
2288 | } |
2289 | |
2290 | switch (NumPasses) { |
2291 | case 2: |
2292 | NeedWaitStates = |
2293 | isDGEMM(Opcode: Opc) ? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates |
2294 | : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates; |
2295 | break; |
2296 | case 8: |
2297 | NeedWaitStates = |
2298 | isDGEMM(Opcode: Opc) |
2299 | ? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates |
2300 | : SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates; |
2301 | break; |
2302 | case 16: |
2303 | NeedWaitStates = |
2304 | isDGEMM(Opcode: Opc) |
2305 | ? SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates |
2306 | : SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates; |
2307 | break; |
2308 | default: |
2309 | llvm_unreachable("unexpected number of passes" ); |
2310 | } |
2311 | } |
2312 | } |
2313 | } else { |
2314 | switch (Opc1) { |
2315 | case AMDGPU::V_MFMA_F64_16X16X4F64_e64: |
2316 | case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64: |
2317 | case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64: |
2318 | case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64: |
2319 | NeedWaitStates = DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates; |
2320 | break; |
2321 | case AMDGPU::V_MFMA_F64_4X4X4F64_e64: |
2322 | case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64: |
2323 | NeedWaitStates = DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates; |
2324 | break; |
2325 | default: |
2326 | int NumPasses = TSchedModel.computeInstrLatency(MI: MI1); |
2327 | |
2328 | if (ST.hasGFX940Insts()) { |
2329 | NeedWaitStates = |
2330 | isXDL(ST, MI: *MI1) |
2331 | ? GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates( |
2332 | NumPasses) |
2333 | : GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates( |
2334 | NumPasses); |
2335 | break; |
2336 | } |
2337 | |
2338 | switch (NumPasses) { |
2339 | case 2: |
2340 | NeedWaitStates = SMFMA4x4WritesVGPROverlappedSrcABWaitStates; |
2341 | break; |
2342 | case 4: |
2343 | llvm_unreachable("unexpected number of passes for mfma" ); |
2344 | case 8: |
2345 | NeedWaitStates = SMFMA16x16WritesVGPROverlappedSrcABWaitStates; |
2346 | break; |
2347 | case 16: |
2348 | default: |
2349 | NeedWaitStates = SMFMA32x32WritesVGPROverlappedSrcABWaitStates; |
2350 | } |
2351 | } |
2352 | } |
2353 | if (WaitStatesNeeded >= NeedWaitStates) |
2354 | continue; |
2355 | |
2356 | WaitStatesNeededForUse = NeedWaitStates - NumWaitStates; |
2357 | WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse); |
2358 | |
2359 | if (WaitStatesNeeded == MaxWaitStates) |
2360 | break; |
2361 | } |
2362 | |
2363 | // Pad neighboring MFMA with noops for better inter-wave performance. |
2364 | WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: checkMFMAPadding(MI)); |
2365 | |
2366 | return WaitStatesNeeded; |
2367 | } |
2368 | |
2369 | int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) { |
2370 | // On gfx90a+ relevant hazards are checked in checkMAIVALUHazards() |
2371 | if (!ST.hasMAIInsts() || ST.hasGFX90AInsts()) |
2372 | return 0; |
2373 | |
2374 | int WaitStatesNeeded = 0; |
2375 | |
2376 | auto IsAccVgprReadFn = [](const MachineInstr &MI) { |
2377 | return MI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64; |
2378 | }; |
2379 | |
2380 | for (const MachineOperand &Op : MI->explicit_uses()) { |
2381 | if (!Op.isReg() || !TRI.isVGPR(MRI: MF.getRegInfo(), Reg: Op.getReg())) |
2382 | continue; |
2383 | |
2384 | Register Reg = Op.getReg(); |
2385 | |
2386 | const int AccVgprReadLdStWaitStates = 2; |
2387 | const int VALUWriteAccVgprRdWrLdStDepVALUWaitStates = 1; |
2388 | const int MaxWaitStates = 2; |
2389 | |
2390 | int WaitStatesNeededForUse = AccVgprReadLdStWaitStates - |
2391 | getWaitStatesSinceDef(Reg, IsHazardDef: IsAccVgprReadFn, Limit: MaxWaitStates); |
2392 | WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse); |
2393 | |
2394 | if (WaitStatesNeeded == MaxWaitStates) |
2395 | return WaitStatesNeeded; // Early exit. |
2396 | |
2397 | auto IsVALUAccVgprRdWrCheckFn = [Reg, this](const MachineInstr &MI) { |
2398 | if (MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64 && |
2399 | MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64) |
2400 | return false; |
2401 | auto IsVALUFn = [](const MachineInstr &MI) { |
2402 | return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMAI(MI); |
2403 | }; |
2404 | return getWaitStatesSinceDef(Reg, IsHazardDef: IsVALUFn, Limit: 2 /*MaxWaitStates*/) < |
2405 | std::numeric_limits<int>::max(); |
2406 | }; |
2407 | |
2408 | WaitStatesNeededForUse = VALUWriteAccVgprRdWrLdStDepVALUWaitStates - |
2409 | getWaitStatesSince(IsHazard: IsVALUAccVgprRdWrCheckFn, Limit: MaxWaitStates); |
2410 | WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse); |
2411 | } |
2412 | |
2413 | return WaitStatesNeeded; |
2414 | } |
2415 | |
2416 | static int GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(int NumPasses) { |
2417 | // 2 pass -> 4 |
2418 | // 4 pass -> 6 |
2419 | // 8 pass -> 10 |
2420 | // 16 pass -> 18 |
2421 | return NumPasses + 2; |
2422 | } |
2423 | |
2424 | static int GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(int NumPasses) { |
2425 | // 2 pass -> 5 |
2426 | // 4 pass -> 7 |
2427 | // 8 pass -> 11 |
2428 | // 16 pass -> 19 |
2429 | return NumPasses + 3; |
2430 | } |
2431 | |
2432 | static int GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses) { |
2433 | // 2 pass -> 5 |
2434 | // 4 pass -> 7 |
2435 | // 8 pass -> 11 |
2436 | // 16 pass -> 19 |
2437 | return NumPasses + 3; |
2438 | } |
2439 | |
2440 | static int GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses) { |
2441 | // 2 pass -> 4 |
2442 | // 4 pass -> 6 |
2443 | // 8 pass -> 10 |
2444 | // 16 pass -> 18 |
2445 | return NumPasses + 2; |
2446 | } |
2447 | |
2448 | int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) { |
2449 | if (!ST.hasGFX90AInsts()) |
2450 | return 0; |
2451 | |
2452 | auto IsDGEMMFn = [](const MachineInstr &MI) -> bool { |
2453 | return isDGEMM(Opcode: MI.getOpcode()); |
2454 | }; |
2455 | |
2456 | // This is checked in checkMAIHazards90A() |
2457 | if (SIInstrInfo::isMFMA(MI: *MI)) |
2458 | return 0; |
2459 | |
2460 | const MachineRegisterInfo &MRI = MF.getRegInfo(); |
2461 | |
2462 | int WaitStatesNeeded = 0; |
2463 | |
2464 | bool IsMem = SIInstrInfo::isVMEM(MI: *MI) || |
2465 | SIInstrInfo::isFLAT(MI: *MI) || |
2466 | SIInstrInfo::isDS(MI: *MI); |
2467 | bool IsMemOrExport = IsMem || SIInstrInfo::isEXP(MI: *MI); |
2468 | bool IsVALU = SIInstrInfo::isVALU(MI: *MI); |
2469 | |
2470 | const MachineInstr *MFMA = nullptr; |
2471 | unsigned Reg; |
2472 | auto IsMFMAWriteFn = [&Reg, &MFMA, this](const MachineInstr &MI) { |
2473 | if (!SIInstrInfo::isMFMA(MI) || |
2474 | !TRI.regsOverlap(RegA: MI.getOperand(i: 0).getReg(), RegB: Reg)) |
2475 | return false; |
2476 | MFMA = &MI; |
2477 | return true; |
2478 | }; |
2479 | |
2480 | const MachineInstr *DOT = nullptr; |
2481 | auto IsDotWriteFn = [&Reg, &DOT, this](const MachineInstr &MI) { |
2482 | if (!SIInstrInfo::isDOT(MI) || |
2483 | !TRI.regsOverlap(RegA: MI.getOperand(i: 0).getReg(), RegB: Reg)) |
2484 | return false; |
2485 | DOT = &MI; |
2486 | return true; |
2487 | }; |
2488 | |
2489 | bool DGEMMAfterVALUWrite = false; |
2490 | auto IsDGEMMHazard = [&DGEMMAfterVALUWrite, this](const MachineInstr &MI) { |
2491 | // Found DGEMM on reverse traversal to def. |
2492 | if (isDGEMM(Opcode: MI.getOpcode())) |
2493 | DGEMMAfterVALUWrite = true; |
2494 | |
    // Only a hazard if the register is defined by a VALU and a DGEMM is found
    // after the def.
2497 | if (!TII.isVALU(MI) || !DGEMMAfterVALUWrite) |
2498 | return false; |
2499 | |
2500 | return true; |
2501 | }; |
2502 | |
2503 | int SrcCIdx = AMDGPU::getNamedOperandIdx(Opcode: MI->getOpcode(), |
2504 | NamedIdx: AMDGPU::OpName::src2); |
2505 | |
2506 | if (IsMemOrExport || IsVALU) { |
2507 | const int SMFMA4x4WriteVgprVALUMemExpReadWaitStates = 5; |
2508 | const int SMFMA16x16WriteVgprVALUMemExpReadWaitStates = 11; |
2509 | const int SMFMA32x32WriteVgprVALUMemExpReadWaitStates = 19; |
2510 | const int DMFMA4x4WriteVgprMemExpReadWaitStates = 9; |
2511 | const int DMFMA16x16WriteVgprMemExpReadWaitStates = 18; |
2512 | const int DMFMA4x4WriteVgprVALUReadWaitStates = 6; |
2513 | const int DMFMA16x16WriteVgprVALUReadWaitStates = 11; |
2514 | const int DotWriteSameDotReadSrcAB = 3; |
2515 | const int DotWriteDifferentVALURead = 3; |
2516 | const int DMFMABetweenVALUWriteVMEMRead = 2; |
2517 | const int MaxWaitStates = 19; |
2518 | |
2519 | for (const MachineOperand &Use : MI->explicit_uses()) { |
2520 | if (!Use.isReg()) |
2521 | continue; |
2522 | Reg = Use.getReg(); |
2523 | |
2524 | DOT = nullptr; |
2525 | int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsHazardDef: IsDotWriteFn, |
2526 | Limit: MaxWaitStates); |
2527 | if (DOT) { |
2528 | int NeedWaitStates = 0; |
2529 | if (DOT->getOpcode() == MI->getOpcode()) { |
2530 | if (&Use - &MI->getOperand(i: 0) != SrcCIdx) |
2531 | NeedWaitStates = DotWriteSameDotReadSrcAB; |
2532 | } else { |
2533 | NeedWaitStates = DotWriteDifferentVALURead; |
2534 | } |
2535 | |
2536 | int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef; |
2537 | WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse); |
2538 | } |
2539 | |
      // Workaround for a HW data hazard bug observed only in GFX90A. When
      // there is a DGEMM instruction in-between a VALU and a VMEM instruction,
      // the SQ incorrectly fails to insert the two wait states needed between
      // the two instructions to avoid the data hazard.
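      // Illustrative (hypothetical) sequence:
      //   v_mov_b32 v0, ...              ; VALU writes v0
      //   v_mfma_f64_16x16x4f64 ...      ; DGEMM issued in between
      //   flat_store_dword v[2:3], v0    ; VMEM reads v0 (the current MI)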
2544 | if (IsMem && ST.hasGFX90AInsts() && !ST.hasGFX940Insts()) { |
2545 | DGEMMAfterVALUWrite = false; |
2546 | if (TRI.isVectorRegister(MRI, Reg)) { |
2547 | int WaitStatesNeededForUse = |
2548 | DMFMABetweenVALUWriteVMEMRead - |
2549 | getWaitStatesSinceDef(Reg, IsHazardDef: IsDGEMMHazard, |
2550 | Limit: DMFMABetweenVALUWriteVMEMRead); |
2551 | |
2552 | WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse); |
2553 | } |
2554 | } |
2555 | |
2556 | MFMA = nullptr; |
2557 | WaitStatesSinceDef = |
2558 | getWaitStatesSinceDef(Reg, IsHazardDef: IsMFMAWriteFn, Limit: MaxWaitStates); |
2559 | if (!MFMA) |
2560 | continue; |
2561 | |
2562 | unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MI: MFMA); |
2563 | int NumPasses = HazardDefLatency; |
2564 | int NeedWaitStates = MaxWaitStates; |
2565 | |
2566 | if (isDGEMM(Opcode: MFMA->getOpcode())) { |
2567 | switch (HazardDefLatency) { |
2568 | case 4: |
2569 | NeedWaitStates = IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates |
2570 | : DMFMA4x4WriteVgprVALUReadWaitStates; |
2571 | break; |
2572 | case 8: |
2573 | case 16: |
2574 | NeedWaitStates = IsMemOrExport |
2575 | ? DMFMA16x16WriteVgprMemExpReadWaitStates |
2576 | : DMFMA16x16WriteVgprVALUReadWaitStates; |
2577 | break; |
2578 | default: |
2579 | llvm_unreachable("unexpected dgemm" ); |
2580 | } |
2581 | } else if (ST.hasGFX940Insts()) { |
2582 | NeedWaitStates = |
2583 | isXDL(ST, MI: *MFMA) |
2584 | ? GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(NumPasses) |
2585 | : GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates( |
2586 | NumPasses); |
2587 | } else { |
2588 | switch (HazardDefLatency) { |
2589 | case 2: |
2590 | NeedWaitStates = SMFMA4x4WriteVgprVALUMemExpReadWaitStates; |
2591 | break; |
2592 | case 8: |
2593 | NeedWaitStates = SMFMA16x16WriteVgprVALUMemExpReadWaitStates; |
2594 | break; |
2595 | case 16: |
2596 | NeedWaitStates = SMFMA32x32WriteVgprVALUMemExpReadWaitStates; |
2597 | break; |
2598 | default: |
2599 | llvm_unreachable("unexpected number of passes for mfma" ); |
2600 | } |
2601 | } |
2602 | |
2603 | int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef; |
2604 | WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse); |
2605 | |
2606 | if (WaitStatesNeeded == MaxWaitStates) |
2607 | break; |
2608 | } |
2609 | } |
2610 | |
2611 | unsigned Opc = MI->getOpcode(); |
2612 | const int DMFMAToFMA64WaitStates = 2; |
2613 | if ((Opc == AMDGPU::V_FMA_F64_e64 || |
2614 | Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64 || |
2615 | Opc == AMDGPU::V_FMAC_F64_dpp) && |
2616 | WaitStatesNeeded < DMFMAToFMA64WaitStates) { |
2617 | int WaitStatesNeededForUse = DMFMAToFMA64WaitStates - |
2618 | getWaitStatesSince(IsHazard: IsDGEMMFn, Limit: DMFMAToFMA64WaitStates); |
2619 | WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse); |
2620 | } |
2621 | |
2622 | if (!IsVALU && !IsMemOrExport) |
2623 | return WaitStatesNeeded; |
2624 | |
2625 | for (const MachineOperand &Def : MI->defs()) { |
2626 | const int SMFMA4x4WriteVgprVALUWawWaitStates = 5; |
2627 | const int SMFMA16x16WriteVgprVALUWawWaitStates = 11; |
2628 | const int SMFMA32x32WriteVgprVALUWawWaitStates = 19; |
2629 | const int SMFMA4x4ReadVgprVALUWarWaitStates = 1; |
2630 | const int GFX940_XDL4PassReadVgprVALUWarWaitStates = 3; |
2631 | const int SMFMA16x16ReadVgprVALUWarWaitStates = 7; |
2632 | const int SMFMA32x32ReadVgprVALUWarWaitStates = 15; |
2633 | const int DMFMA4x4WriteVgprVALUWriteWaitStates = 6; |
2634 | const int DMFMA16x16WriteVgprVALUWriteWaitStates = 11; |
2635 | const int DotWriteDifferentVALUWrite = 3; |
2636 | const int MaxWaitStates = 19; |
2637 | const int MaxWarWaitStates = 15; |
2638 | |
2639 | Reg = Def.getReg(); |
2640 | |
2641 | DOT = nullptr; |
2642 | int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsHazardDef: IsDotWriteFn, |
2643 | Limit: MaxWaitStates); |
2644 | if (DOT && DOT->getOpcode() != MI->getOpcode()) |
2645 | WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: DotWriteDifferentVALUWrite - |
2646 | WaitStatesSinceDef); |
2647 | |
2648 | MFMA = nullptr; |
2649 | WaitStatesSinceDef = |
2650 | getWaitStatesSinceDef(Reg, IsHazardDef: IsMFMAWriteFn, Limit: MaxWaitStates); |
2651 | if (MFMA) { |
2652 | int NeedWaitStates = MaxWaitStates; |
2653 | int NumPasses = TSchedModel.computeInstrLatency(MI: MFMA); |
2654 | |
2655 | if (isDGEMM(Opcode: MFMA->getOpcode())) { |
2656 | switch (NumPasses) { |
2657 | case 4: |
2658 | NeedWaitStates = DMFMA4x4WriteVgprVALUWriteWaitStates; |
2659 | break; |
2660 | case 8: |
2661 | case 16: |
2662 | NeedWaitStates = DMFMA16x16WriteVgprVALUWriteWaitStates; |
2663 | break; |
2664 | default: |
2665 | llvm_unreachable("unexpected number of cycles for dgemm" ); |
2666 | } |
2667 | } else if (ST.hasGFX940Insts()) { |
2668 | NeedWaitStates = |
2669 | isXDL(ST, MI: *MFMA) |
2670 | ? GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(NumPasses) |
2671 | : GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(NumPasses); |
2672 | } else { |
2673 | switch (NumPasses) { |
2674 | case 2: |
2675 | NeedWaitStates = SMFMA4x4WriteVgprVALUWawWaitStates; |
2676 | break; |
2677 | case 8: |
2678 | NeedWaitStates = SMFMA16x16WriteVgprVALUWawWaitStates; |
2679 | break; |
2680 | case 16: |
2681 | NeedWaitStates = SMFMA32x32WriteVgprVALUWawWaitStates; |
2682 | break; |
2683 | default: |
2684 | llvm_unreachable("Unexpected number of passes for mfma" ); |
2685 | } |
2686 | } |
2687 | |
2688 | int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef; |
2689 | WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse); |
2690 | |
2691 | if (WaitStatesNeeded == MaxWaitStates) |
2692 | break; |
2693 | } |
2694 | |
2695 | auto IsSMFMAReadAsCFn = [&Reg, &MFMA, this](const MachineInstr &MI) { |
2696 | if (!SIInstrInfo::isMFMA(MI) || isDGEMM(Opcode: MI.getOpcode()) || |
2697 | !MI.readsRegister(Reg, TRI: &TRI)) |
2698 | return false; |
2699 | |
2700 | if (ST.hasGFX940Insts() && !isXDL(ST, MI)) |
2701 | return false; |
2702 | |
2703 | const MachineOperand *SrcC = |
2704 | TII.getNamedOperand(MI, OpName: AMDGPU::OpName::src2); |
2705 | assert(SrcC); |
2706 | if (!SrcC->isReg() || !TRI.regsOverlap(RegA: SrcC->getReg(), RegB: Reg)) |
2707 | return false; |
2708 | |
2709 | MFMA = &MI; |
2710 | return true; |
2711 | }; |
2712 | |
2713 | MFMA = nullptr; |
2714 | int WaitStatesSinceUse = getWaitStatesSince(IsHazard: IsSMFMAReadAsCFn, |
2715 | Limit: MaxWarWaitStates); |
2716 | if (!MFMA) |
2717 | continue; |
2718 | |
2719 | unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MI: MFMA); |
2720 | int NeedWaitStates = MaxWaitStates; |
2721 | switch (HazardDefLatency) { |
2722 | case 2: NeedWaitStates = SMFMA4x4ReadVgprVALUWarWaitStates; |
2723 | break; |
2724 | case 4: assert(ST.hasGFX940Insts()); |
2725 | NeedWaitStates = GFX940_XDL4PassReadVgprVALUWarWaitStates; |
2726 | break; |
2727 | case 8: NeedWaitStates = SMFMA16x16ReadVgprVALUWarWaitStates; |
2728 | break; |
2729 | case 16: [[fallthrough]]; |
2730 | default: NeedWaitStates = SMFMA32x32ReadVgprVALUWarWaitStates; |
2731 | break; |
2732 | } |
2733 | |
2734 | int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceUse; |
2735 | WaitStatesNeeded = std::max(a: WaitStatesNeeded, b: WaitStatesNeededForUse); |
2736 | } |
2737 | |
2738 | return WaitStatesNeeded; |
2739 | } |
2740 | |
2741 | bool GCNHazardRecognizer::ShouldPreferAnother(SUnit *SU) { |
2742 | if (!SU->isInstr()) |
2743 | return false; |
2744 | |
2745 | const MachineInstr *MAI = nullptr; |
2746 | |
2747 | auto IsMFMAFn = [&MAI](const MachineInstr &MI) { |
2748 | MAI = nullptr; |
2749 | if (SIInstrInfo::isMFMA(MI)) |
2750 | MAI = &MI; |
2751 | return MAI != nullptr; |
2752 | }; |
2753 | |
2754 | MachineInstr *MI = SU->getInstr(); |
2755 | if (IsMFMAFn(*MI)) { |
2756 | int W = getWaitStatesSince(IsHazard: IsMFMAFn, Limit: 16); |
2757 | if (MAI) |
2758 | return W < (int)TSchedModel.computeInstrLatency(MI: MAI); |
2759 | } |
2760 | |
2761 | return false; |
2762 | } |
2763 | |
2764 | bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) { |
2765 | if (!ST.hasVALUMaskWriteHazard()) |
2766 | return false; |
2767 | assert(!ST.hasExtendedWaitCounts()); |
2768 | |
2769 | if (!ST.isWave64() || !SIInstrInfo::isSALU(MI: *MI)) |
2770 | return false; |
2771 | |
2772 | // The hazard sequence is three instructions: |
2773 | // 1. VALU reads SGPR as mask |
2774 | // 2. SALU writes SGPR |
2775 | // 3. SALU reads SGPR |
2776 | // The hazard can expire if the distance between 2 and 3 is sufficient. |
  // In practice this happens <10% of the time, hence we always assume the
  // hazard exists if 1 and 2 are present, to avoid searching further.
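  //
  // An illustrative (hypothetical) sequence:
  //   v_cndmask_b32_e64 v0, v1, v2, s[0:1]   ; (1) VALU reads s[0:1] as mask
  //   s_mov_b64 s[0:1], exec                 ; (2) SALU writes s[0:1]
  //   s_and_b64 s[2:3], s[0:1], s[4:5]       ; (3) SALU reads s[0:1]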
2779 | |
2780 | const MachineOperand *SDSTOp = TII.getNamedOperand(MI&: *MI, OperandName: AMDGPU::OpName::sdst); |
2781 | if (!SDSTOp || !SDSTOp->isReg()) |
2782 | return false; |
2783 | |
2784 | const Register HazardReg = SDSTOp->getReg(); |
2785 | if (HazardReg == AMDGPU::EXEC || |
2786 | HazardReg == AMDGPU::EXEC_LO || |
2787 | HazardReg == AMDGPU::EXEC_HI || |
2788 | HazardReg == AMDGPU::M0) |
2789 | return false; |
2790 | |
2791 | auto IsHazardFn = [HazardReg, this](const MachineInstr &I) { |
2792 | switch (I.getOpcode()) { |
2793 | case AMDGPU::V_ADDC_U32_e32: |
2794 | case AMDGPU::V_ADDC_U32_dpp: |
2795 | case AMDGPU::V_CNDMASK_B16_e32: |
2796 | case AMDGPU::V_CNDMASK_B16_dpp: |
2797 | case AMDGPU::V_CNDMASK_B32_e32: |
2798 | case AMDGPU::V_CNDMASK_B32_dpp: |
2799 | case AMDGPU::V_DIV_FMAS_F32_e64: |
2800 | case AMDGPU::V_DIV_FMAS_F64_e64: |
2801 | case AMDGPU::V_SUBB_U32_e32: |
2802 | case AMDGPU::V_SUBB_U32_dpp: |
2803 | case AMDGPU::V_SUBBREV_U32_e32: |
2804 | case AMDGPU::V_SUBBREV_U32_dpp: |
2805 | // These implicitly read VCC as mask source. |
2806 | return HazardReg == AMDGPU::VCC || |
2807 | HazardReg == AMDGPU::VCC_LO || |
2808 | HazardReg == AMDGPU::VCC_HI; |
2809 | case AMDGPU::V_ADDC_U32_e64: |
2810 | case AMDGPU::V_ADDC_U32_e64_dpp: |
2811 | case AMDGPU::V_CNDMASK_B16_e64: |
2812 | case AMDGPU::V_CNDMASK_B16_e64_dpp: |
2813 | case AMDGPU::V_CNDMASK_B32_e64: |
2814 | case AMDGPU::V_CNDMASK_B32_e64_dpp: |
2815 | case AMDGPU::V_SUBB_U32_e64: |
2816 | case AMDGPU::V_SUBB_U32_e64_dpp: |
2817 | case AMDGPU::V_SUBBREV_U32_e64: |
2818 | case AMDGPU::V_SUBBREV_U32_e64_dpp: { |
2819 | // Only check mask register overlaps. |
2820 | const MachineOperand *SSRCOp = TII.getNamedOperand(MI: I, OpName: AMDGPU::OpName::src2); |
2821 | assert(SSRCOp); |
2822 | return TRI.regsOverlap(RegA: SSRCOp->getReg(), RegB: HazardReg); |
2823 | } |
2824 | default: |
2825 | return false; |
2826 | } |
2827 | }; |
2828 | |
2829 | const MachineRegisterInfo &MRI = MF.getRegInfo(); |
2830 | auto IsExpiredFn = [&MRI, this](const MachineInstr &I, int) { |
2831 | // s_waitcnt_depctr sa_sdst(0) mitigates hazard. |
2832 | if (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR && |
2833 | AMDGPU::DepCtr::decodeFieldSaSdst(Encoded: I.getOperand(i: 0).getImm()) == 0) |
2834 | return true; |
2835 | |
2836 | // VALU access to any SGPR or literal constant other than HazardReg |
2837 | // mitigates hazard. No need to check HazardReg here as this will |
2838 | // only be called when !IsHazardFn. |
2839 | if (!SIInstrInfo::isVALU(MI: I)) |
2840 | return false; |
2841 | for (int OpNo = 0, End = I.getNumOperands(); OpNo < End; ++OpNo) { |
2842 | const MachineOperand &Op = I.getOperand(i: OpNo); |
2843 | if (Op.isReg()) { |
2844 | Register OpReg = Op.getReg(); |
2845 | // Only consider uses |
2846 | if (!Op.isUse()) |
2847 | continue; |
2848 | // Ignore EXEC |
2849 | if (OpReg == AMDGPU::EXEC || |
2850 | OpReg == AMDGPU::EXEC_LO || |
2851 | OpReg == AMDGPU::EXEC_HI) |
2852 | continue; |
2853 | // Ignore all implicit uses except VCC |
2854 | if (Op.isImplicit()) { |
2855 | if (OpReg == AMDGPU::VCC || |
2856 | OpReg == AMDGPU::VCC_LO || |
2857 | OpReg == AMDGPU::VCC_HI) |
2858 | return true; |
2859 | continue; |
2860 | } |
2861 | if (TRI.isSGPRReg(MRI, Reg: OpReg)) |
2862 | return true; |
2863 | } else { |
2864 | const MCInstrDesc &InstDesc = I.getDesc(); |
2865 | const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo]; |
2866 | if (!TII.isInlineConstant(MO: Op, OpInfo)) |
2867 | return true; |
2868 | } |
2869 | } |
2870 | return false; |
2871 | }; |
2872 | |
2873 | // Check for hazard |
2874 | if (::getWaitStatesSince(IsHazard: IsHazardFn, MI, IsExpired: IsExpiredFn) == |
2875 | std::numeric_limits<int>::max()) |
2876 | return false; |
2877 | |
2878 | auto NextMI = std::next(x: MI->getIterator()); |
2879 | |
2880 | // Add s_waitcnt_depctr sa_sdst(0) after SALU write. |
2881 | BuildMI(BB&: *MI->getParent(), I: NextMI, MIMD: MI->getDebugLoc(), |
2882 | MCID: TII.get(Opcode: AMDGPU::S_WAITCNT_DEPCTR)) |
2883 | .addImm(Val: AMDGPU::DepCtr::encodeFieldSaSdst(SaSdst: 0)); |
2884 | |
2885 | // SALU write may be s_getpc in a bundle. |
2886 | if (MI->getOpcode() == AMDGPU::S_GETPC_B64) { |
2887 | // Update offsets of any references in the bundle. |
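    // (The inserted s_waitcnt_depctr is a 4-byte instruction, so PC-relative
    // offsets computed from the s_getpc result must be advanced by 4.)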
2888 | while (NextMI != MI->getParent()->end() && |
2889 | NextMI->isBundledWithPred()) { |
2890 | for (auto &Operand : NextMI->operands()) { |
2891 | if (Operand.isGlobal()) |
2892 | Operand.setOffset(Operand.getOffset() + 4); |
2893 | } |
2894 | NextMI++; |
2895 | } |
2896 | } |
2897 | |
2898 | return true; |
2899 | } |
2900 | |
2901 | static bool ensureEntrySetPrio(MachineFunction *MF, int Priority, |
2902 | const SIInstrInfo &TII) { |
2903 | MachineBasicBlock &EntryMBB = MF->front(); |
2904 | if (EntryMBB.begin() != EntryMBB.end()) { |
2905 | auto &EntryMI = *EntryMBB.begin(); |
2906 | if (EntryMI.getOpcode() == AMDGPU::S_SETPRIO && |
2907 | EntryMI.getOperand(i: 0).getImm() >= Priority) |
2908 | return false; |
2909 | } |
2910 | |
2911 | BuildMI(BB&: EntryMBB, I: EntryMBB.begin(), MIMD: DebugLoc(), MCID: TII.get(Opcode: AMDGPU::S_SETPRIO)) |
2912 | .addImm(Val: Priority); |
2913 | return true; |
2914 | } |
2915 | |
2916 | bool GCNHazardRecognizer::fixRequiredExportPriority(MachineInstr *MI) { |
2917 | if (!ST.hasRequiredExportPriority()) |
2918 | return false; |
2919 | |
2920 | // Assume the following shader types will never have exports, |
2921 | // and avoid adding or adjusting S_SETPRIO. |
2922 | MachineBasicBlock *MBB = MI->getParent(); |
2923 | MachineFunction *MF = MBB->getParent(); |
2924 | auto CC = MF->getFunction().getCallingConv(); |
2925 | switch (CC) { |
2926 | case CallingConv::AMDGPU_CS: |
2927 | case CallingConv::AMDGPU_CS_Chain: |
2928 | case CallingConv::AMDGPU_CS_ChainPreserve: |
2929 | case CallingConv::AMDGPU_KERNEL: |
2930 | return false; |
2931 | default: |
2932 | break; |
2933 | } |
2934 | |
2935 | const int MaxPriority = 3; |
2936 | const int NormalPriority = 2; |
2937 | const int PostExportPriority = 0; |
2938 | |
2939 | auto It = MI->getIterator(); |
2940 | switch (MI->getOpcode()) { |
2941 | case AMDGPU::S_ENDPGM: |
2942 | case AMDGPU::S_ENDPGM_SAVED: |
2943 | case AMDGPU::S_ENDPGM_ORDERED_PS_DONE: |
2944 | case AMDGPU::SI_RETURN_TO_EPILOG: |
2945 | // Ensure shader with calls raises priority at entry. |
2946 | // This ensures correct priority if exports exist in callee. |
2947 | if (MF->getFrameInfo().hasCalls()) |
2948 | return ensureEntrySetPrio(MF, Priority: NormalPriority, TII); |
2949 | return false; |
2950 | case AMDGPU::S_SETPRIO: { |
2951 | // Raise minimum priority unless in workaround. |
2952 | auto &PrioOp = MI->getOperand(i: 0); |
2953 | int Prio = PrioOp.getImm(); |
2954 | bool InWA = (Prio == PostExportPriority) && |
2955 | (It != MBB->begin() && TII.isEXP(MI: *std::prev(x: It))); |
2956 | if (InWA || Prio >= NormalPriority) |
2957 | return false; |
2958 | PrioOp.setImm(std::min(a: Prio + NormalPriority, b: MaxPriority)); |
2959 | return true; |
2960 | } |
2961 | default: |
2962 | if (!TII.isEXP(MI: *MI)) |
2963 | return false; |
2964 | break; |
2965 | } |
2966 | |
2967 | // Check entry priority at each export (as there will only be a few). |
2968 | // Note: amdgpu_gfx can only be a callee, so defer to caller setprio. |
2969 | bool Changed = false; |
2970 | if (CC != CallingConv::AMDGPU_Gfx) |
2971 | Changed = ensureEntrySetPrio(MF, Priority: NormalPriority, TII); |
2972 | |
2973 | auto NextMI = std::next(x: It); |
2974 | bool EndOfShader = false; |
2975 | if (NextMI != MBB->end()) { |
2976 | // Only need WA at end of sequence of exports. |
2977 | if (TII.isEXP(MI: *NextMI)) |
2978 | return Changed; |
2979 | // Assume appropriate S_SETPRIO after export means WA already applied. |
2980 | if (NextMI->getOpcode() == AMDGPU::S_SETPRIO && |
2981 | NextMI->getOperand(i: 0).getImm() == PostExportPriority) |
2982 | return Changed; |
2983 | EndOfShader = NextMI->getOpcode() == AMDGPU::S_ENDPGM; |
2984 | } |
2985 | |
2986 | const DebugLoc &DL = MI->getDebugLoc(); |
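  // The workaround emitted below is, in order:
  //   s_setprio 0               ; lower priority
  //   s_waitcnt_expcnt null, 0  ; wait for exports (skipped at end of shader)
  //   s_nop 0
  //   s_nop 0
  //   s_setprio 2               ; restore priority (skipped at end of shader)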
2987 | |
2988 | // Lower priority. |
2989 | BuildMI(BB&: *MBB, I: NextMI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_SETPRIO)) |
2990 | .addImm(Val: PostExportPriority); |
2991 | |
2992 | if (!EndOfShader) { |
2993 | // Wait for exports to complete. |
2994 | BuildMI(BB&: *MBB, I: NextMI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_WAITCNT_EXPCNT)) |
2995 | .addReg(RegNo: AMDGPU::SGPR_NULL) |
2996 | .addImm(Val: 0); |
2997 | } |
2998 | |
2999 | BuildMI(BB&: *MBB, I: NextMI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_NOP)).addImm(Val: 0); |
3000 | BuildMI(BB&: *MBB, I: NextMI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_NOP)).addImm(Val: 0); |
3001 | |
3002 | if (!EndOfShader) { |
3003 | // Return to normal (higher) priority. |
3004 | BuildMI(BB&: *MBB, I: NextMI, MIMD: DL, MCID: TII.get(Opcode: AMDGPU::S_SETPRIO)) |
3005 | .addImm(Val: NormalPriority); |
3006 | } |
3007 | |
3008 | return true; |
3009 | } |
3010 | |