1//===- AMDGPUInsertDelayAlu.cpp - Insert s_delay_alu instructions ---------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Insert s_delay_alu instructions to avoid stalls on GFX11+.
11//
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPU.h"
15#include "GCNSubtarget.h"
16#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
17#include "SIInstrInfo.h"
18#include "SIMachineFunctionInfo.h"
19#include "llvm/ADT/SetVector.h"
20
21using namespace llvm;
22
23#define DEBUG_TYPE "amdgpu-insert-delay-alu"
24
25namespace {
26
27class AMDGPUInsertDelayAlu {
28public:
29 const GCNSubtarget *ST;
30 const SIInstrInfo *SII;
31 const TargetRegisterInfo *TRI;
32
33 const TargetSchedModel *SchedModel;
34
35 // Return true if MI waits for all outstanding VALU instructions to complete.
36 static bool instructionWaitsForVALU(const MachineInstr &MI) {
37 // These instruction types wait for VA_VDST==0 before issuing.
38 const uint64_t VA_VDST_0 = SIInstrFlags::DS | SIInstrFlags::EXP |
39 SIInstrFlags::FLAT | SIInstrFlags::MIMG |
40 SIInstrFlags::MTBUF | SIInstrFlags::MUBUF;
41 if (MI.getDesc().TSFlags & VA_VDST_0)
42 return true;
43 if (MI.getOpcode() == AMDGPU::S_SENDMSG_RTN_B32 ||
44 MI.getOpcode() == AMDGPU::S_SENDMSG_RTN_B64)
45 return true;
46 if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
47 AMDGPU::DepCtr::decodeFieldVaVdst(Encoded: MI.getOperand(i: 0).getImm()) == 0)
48 return true;
49 return false;
50 }
51
52 static bool instructionWaitsForSGPRWrites(const MachineInstr &MI) {
53 // These instruction types wait for VA_SDST==0 before issuing.
54 uint64_t MIFlags = MI.getDesc().TSFlags;
55 if (MIFlags & SIInstrFlags::SMRD)
56 return true;
57
58 if (MIFlags & SIInstrFlags::SALU) {
59 for (auto &Op : MI.operands()) {
60 if (Op.isReg())
61 return true;
62 }
63 }
64 return false;
65 }
66
67 // Types of delay that can be encoded in an s_delay_alu instruction.
68 enum DelayType { VALU, TRANS, SALU, OTHER };
69
70 // Get the delay type for a MachineInstr.
71 DelayType getDelayType(const MachineInstr &MI) {
72 // Non-F64 TRANS instructions use a separate delay type.
73 if (SIInstrInfo::isTRANS(MI) &&
74 !AMDGPU::isDPMACCInstruction(Opc: MI.getOpcode()))
75 return TRANS;
76 // WMMA XDL ops are treated the same as TRANS.
77 if (ST->hasGFX1250Insts() && SII->isXDLWMMA(MI))
78 return TRANS;
79 if (SIInstrInfo::isVALU(MI, /*AllowLDSDMA=*/true))
80 return VALU;
81 if (SIInstrInfo::isSALU(MI))
82 return SALU;
83 return OTHER;
84 }
85
86 // Information about the last instruction(s) that wrote to a particular
87 // regunit. In straight-line code there will only be one such instruction, but
88 // when control flow converges we merge the delay information from each path
89 // to represent the union of the worst-case delays of each type.
90 struct DelayInfo {
91 // One larger than the maximum number of (non-TRANS) VALU instructions we
92 // can encode in an s_delay_alu instruction.
93 static constexpr unsigned VALU_MAX = 5;
94
95 // One larger than the maximum number of TRANS instructions we can encode in
96 // an s_delay_alu instruction.
97 static constexpr unsigned TRANS_MAX = 4;
98
99 // One larger than the maximum number of SALU cycles we can encode in an
100 // s_delay_alu instruction.
101 static constexpr unsigned SALU_CYCLES_MAX = 4;
102
103 // If it was written by a (non-TRANS) VALU, remember how many clock cycles
104 // are left until it completes, and how many other (non-TRANS) VALU we have
105 // seen since it was issued.
106 uint8_t VALUCycles = 0;
107 uint8_t VALUNum = VALU_MAX;
108
109 // If it was written by a TRANS, remember how many clock cycles are left
110 // until it completes, and how many other TRANS we have seen since it was
111 // issued.
112 uint8_t TRANSCycles = 0;
113 uint8_t TRANSNum = TRANS_MAX;
114 // Also remember how many other (non-TRANS) VALU we have seen since it was
115 // issued. When an instruction depends on both a prior TRANS and a prior
116 // non-TRANS VALU, this is used to decide whether to encode a wait for just
117 // one or both of them.
118 uint8_t TRANSNumVALU = VALU_MAX;
119
120 // If it was written by an SALU, remember how many clock cycles are left
121 // until it completes.
122 uint8_t SALUCycles = 0;
123
124 DelayInfo() = default;
125
126 DelayInfo(DelayType Type, unsigned Cycles) {
127 switch (Type) {
128 default:
129 llvm_unreachable("unexpected type");
130 case VALU:
131 VALUCycles = Cycles;
132 VALUNum = 0;
133 break;
134 case TRANS:
135 TRANSCycles = Cycles;
136 TRANSNum = 0;
137 TRANSNumVALU = 0;
138 break;
139 case SALU:
140 // Guard against pseudo-instructions like SI_CALL which are marked as
141 // SALU but with a very high latency.
142 SALUCycles = std::min(a: Cycles, b: SALU_CYCLES_MAX);
143 break;
144 }
145 }
146
147 bool operator==(const DelayInfo &RHS) const {
148 return VALUCycles == RHS.VALUCycles && VALUNum == RHS.VALUNum &&
149 TRANSCycles == RHS.TRANSCycles && TRANSNum == RHS.TRANSNum &&
150 TRANSNumVALU == RHS.TRANSNumVALU && SALUCycles == RHS.SALUCycles;
151 }
152
153 bool operator!=(const DelayInfo &RHS) const { return !(*this == RHS); }
154
155 // Merge another DelayInfo into this one, to represent the union of the
156 // worst-case delays of each type.
157 void merge(const DelayInfo &RHS) {
158 VALUCycles = std::max(a: VALUCycles, b: RHS.VALUCycles);
159 VALUNum = std::min(a: VALUNum, b: RHS.VALUNum);
160 TRANSCycles = std::max(a: TRANSCycles, b: RHS.TRANSCycles);
161 TRANSNum = std::min(a: TRANSNum, b: RHS.TRANSNum);
162 TRANSNumVALU = std::min(a: TRANSNumVALU, b: RHS.TRANSNumVALU);
163 SALUCycles = std::max(a: SALUCycles, b: RHS.SALUCycles);
164 }
165
166 // Update this DelayInfo after issuing an instruction of the specified type.
167 // Cycles is the number of cycles it takes to issue the instruction. Return
168 // true if there is no longer any useful delay info.
169 bool advance(DelayType Type, unsigned Cycles) {
170 bool Erase = true;
171
172 VALUNum += (Type == VALU);
173 if (VALUNum >= VALU_MAX || VALUCycles <= Cycles) {
174 // Forget about the VALU instruction. It was too far back or has
175 // definitely completed by now.
176 VALUNum = VALU_MAX;
177 VALUCycles = 0;
178 } else {
179 VALUCycles -= Cycles;
180 Erase = false;
181 }
182
183 TRANSNum += (Type == TRANS);
184 TRANSNumVALU += (Type == VALU);
185 if (TRANSNum >= TRANS_MAX || TRANSCycles <= Cycles) {
186 // Forget about any TRANS instruction. It was too far back or has
187 // definitely completed by now.
188 TRANSNum = TRANS_MAX;
189 TRANSNumVALU = VALU_MAX;
190 TRANSCycles = 0;
191 } else {
192 TRANSCycles -= Cycles;
193 Erase = false;
194 }
195
196 if (SALUCycles <= Cycles) {
197 // Forget about any SALU instruction. It has definitely completed by
198 // now.
199 SALUCycles = 0;
200 } else {
201 SALUCycles -= Cycles;
202 Erase = false;
203 }
204
205 return Erase;
206 }
207
208#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
209 void dump() const {
210 if (VALUCycles)
211 dbgs() << " VALUCycles=" << (int)VALUCycles;
212 if (VALUNum < VALU_MAX)
213 dbgs() << " VALUNum=" << (int)VALUNum;
214 if (TRANSCycles)
215 dbgs() << " TRANSCycles=" << (int)TRANSCycles;
216 if (TRANSNum < TRANS_MAX)
217 dbgs() << " TRANSNum=" << (int)TRANSNum;
218 if (TRANSNumVALU < VALU_MAX)
219 dbgs() << " TRANSNumVALU=" << (int)TRANSNumVALU;
220 if (SALUCycles)
221 dbgs() << " SALUCycles=" << (int)SALUCycles;
222 }
223#endif
224 };
225
226 // A map from regunits to the delay info for that regunit.
227 struct DelayState : DenseMap<MCRegUnit, DelayInfo> {
228 // Merge another DelayState into this one by merging the delay info for each
229 // regunit.
230 void merge(const DelayState &RHS) {
231 for (const auto &KV : RHS) {
232 iterator It;
233 bool Inserted;
234 std::tie(args&: It, args&: Inserted) = insert(KV);
235 if (!Inserted)
236 It->second.merge(RHS: KV.second);
237 }
238 }
239
240 // Advance the delay info for each regunit, erasing any that are no longer
241 // useful.
242 void advance(DelayType Type, unsigned Cycles) {
243 remove_if(Pred: [&](auto &P) { return P.second.advance(Type, Cycles); });
244 }
245
246 void advanceByVALUNum(unsigned VALUNum) {
247 remove_if(Pred: [&](auto &P) {
248 return P.second.VALUNum >= VALUNum && P.second.VALUCycles > 0;
249 });
250 }
251
252#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
253 void dump(const TargetRegisterInfo *TRI) const {
254 if (empty()) {
255 dbgs() << " empty\n";
256 return;
257 }
258
259 // Dump DelayInfo for each RegUnit in numerical order.
260 SmallVector<const_iterator, 8> Order;
261 Order.reserve(size());
262 for (const_iterator I = begin(), E = end(); I != E; ++I)
263 Order.push_back(I);
264 llvm::sort(Order, [](const const_iterator &A, const const_iterator &B) {
265 return A->first < B->first;
266 });
267 for (const_iterator I : Order) {
268 dbgs() << " " << printRegUnit(I->first, TRI);
269 I->second.dump();
270 dbgs() << "\n";
271 }
272 }
273#endif
274 };
275
276 // The saved delay state at the end of each basic block.
277 DenseMap<MachineBasicBlock *, DelayState> BlockState;
278
279 // Emit an s_delay_alu instruction if necessary before MI.
280 MachineInstr *emitDelayAlu(MachineInstr &MI, DelayInfo Delay,
281 MachineInstr *LastDelayAlu) {
282 unsigned Imm = 0;
283
284 // Wait for a TRANS instruction.
285 if (Delay.TRANSNum < DelayInfo::TRANS_MAX)
286 Imm |= 4 + Delay.TRANSNum;
287
288 // Wait for a VALU instruction (if it's more recent than any TRANS
289 // instruction that we're also waiting for).
290 if (Delay.VALUNum < DelayInfo::VALU_MAX &&
291 Delay.VALUNum <= Delay.TRANSNumVALU) {
292 if (Imm & 0xf)
293 Imm |= Delay.VALUNum << 7;
294 else
295 Imm |= Delay.VALUNum;
296 }
297
298 // Wait for an SALU instruction.
299 if (Delay.SALUCycles) {
300 assert(Delay.SALUCycles < DelayInfo::SALU_CYCLES_MAX);
301 if (Imm & 0x780) {
302 // We have already encoded a VALU and a TRANS delay. There's no room in
303 // the encoding for an SALU delay as well, so just drop it.
304 } else if (Imm & 0xf) {
305 Imm |= (Delay.SALUCycles + 8) << 7;
306 } else {
307 Imm |= Delay.SALUCycles + 8;
308 }
309 }
310
311 // Don't emit the s_delay_alu instruction if there's nothing to wait for.
312 if (!Imm)
313 return LastDelayAlu;
314
315 // If we only need to wait for one instruction, try encoding it in the last
316 // s_delay_alu that we emitted.
317 if (!(Imm & 0x780) && LastDelayAlu) {
318 unsigned Skip = 0;
319 for (auto I = MachineBasicBlock::instr_iterator(LastDelayAlu),
320 E = MachineBasicBlock::instr_iterator(MI);
321 ++I != E;) {
322 if (I->getOpcode() == AMDGPU::S_SET_VGPR_MSB) {
323 // It is not deterministic whether the skip count counts
324 // S_SET_VGPR_MSB instructions or not, so do not include them in a
325 // skip region.
326 Skip = 6;
327 break;
328 }
329 if (!I->isBundle() && !I->isMetaInstruction())
330 ++Skip;
331 }
332 if (Skip < 6) {
333 MachineOperand &Op = LastDelayAlu->getOperand(i: 0);
334 unsigned LastImm = Op.getImm();
335 assert((LastImm & ~0xf) == 0 &&
336 "Remembered an s_delay_alu with no room for another delay!");
337 LastImm |= Imm << 7 | Skip << 4;
338 Op.setImm(LastImm);
339 return nullptr;
340 }
341 }
342
343 auto &MBB = *MI.getParent();
344 MachineInstr *DelayAlu =
345 BuildMI(BB&: MBB, I&: MI, MIMD: DebugLoc(), MCID: SII->get(Opcode: AMDGPU::S_DELAY_ALU)).addImm(Val: Imm);
346 // Remember the s_delay_alu for next time if there is still room in it to
347 // encode another delay.
348 return (Imm & 0x780) ? nullptr : DelayAlu;
349 }
350
351 bool runOnMachineBasicBlock(MachineBasicBlock &MBB, bool Emit) {
352 DelayState State;
353 for (auto *Pred : MBB.predecessors())
354 State.merge(RHS: BlockState[Pred]);
355
356 LLVM_DEBUG(dbgs() << " State at start of " << printMBBReference(MBB)
357 << "\n";
358 State.dump(TRI););
359
360 bool Changed = false;
361 MachineInstr *LastDelayAlu = nullptr;
362
363 // FIXME: 0 is a valid register unit.
364 MCRegUnit LastSGPRFromVALU = static_cast<MCRegUnit>(0);
365 // Iterate over the contents of bundles, but don't emit any instructions
366 // inside a bundle.
367 for (auto &MI : MBB.instrs()) {
368 if (MI.isBundle() || MI.isMetaInstruction())
369 continue;
370
371 // Ignore some more instructions that do not generate any code.
372 switch (MI.getOpcode()) {
373 case AMDGPU::SI_RETURN_TO_EPILOG:
374 continue;
375 }
376
377 DelayType Type = getDelayType(MI);
378
379 if (instructionWaitsForSGPRWrites(MI)) {
380 auto It = State.find(Val: LastSGPRFromVALU);
381 if (It != State.end()) {
382 DelayInfo Info = It->getSecond();
383 State.advanceByVALUNum(VALUNum: Info.VALUNum);
384 // FIXME: 0 is a valid register unit.
385 LastSGPRFromVALU = static_cast<MCRegUnit>(0);
386 }
387 }
388
389 if (instructionWaitsForVALU(MI)) {
390 // Forget about all outstanding VALU delays.
391 // TODO: This is overkill since it also forgets about SALU delays.
392 State = DelayState();
393 } else if (Type != OTHER) {
394 DelayInfo Delay;
395 // TODO: Scan implicit uses too?
396 for (const auto &Op : MI.explicit_uses()) {
397 if (Op.isReg()) {
398 // One of the operands of the writelane is also the output operand.
399 // This creates the insertion of redundant delays. Hence, we have to
400 // ignore this operand.
401 if (MI.getOpcode() == AMDGPU::V_WRITELANE_B32 && Op.isTied())
402 continue;
403 for (MCRegUnit Unit : TRI->regunits(Reg: Op.getReg())) {
404 auto It = State.find(Val: Unit);
405 if (It != State.end()) {
406 Delay.merge(RHS: It->second);
407 State.erase(Val: Unit);
408 }
409 }
410 }
411 }
412
413 if (SII->isVALU(Opcode: MI.getOpcode(), /*AllowLDSDMA=*/true)) {
414 for (const auto &Op : MI.defs()) {
415 Register Reg = Op.getReg();
416 if (AMDGPU::isSGPR(Reg, TRI)) {
417 LastSGPRFromVALU = *TRI->regunits(Reg).begin();
418 break;
419 }
420 }
421 }
422
423 if (Emit && !MI.isBundledWithPred()) {
424 // TODO: For VALU->SALU delays should we use s_delay_alu or s_nop or
425 // just ignore them?
426 LastDelayAlu = emitDelayAlu(MI, Delay, LastDelayAlu);
427 }
428 }
429
430 if (Type != OTHER) {
431 // TODO: Scan implicit defs too?
432 for (const auto &Op : MI.defs()) {
433 unsigned Latency = SchedModel->computeOperandLatency(
434 DefMI: &MI, DefOperIdx: Op.getOperandNo(), UseMI: nullptr, UseOperIdx: 0);
435 for (MCRegUnit Unit : TRI->regunits(Reg: Op.getReg()))
436 State[Unit] = DelayInfo(Type, Latency);
437 }
438 }
439
440 // Advance by the number of cycles it takes to issue this instruction.
441 // TODO: Use a more advanced model that accounts for instructions that
442 // take multiple cycles to issue on a particular pipeline.
443 unsigned Cycles = SIInstrInfo::getNumWaitStates(MI);
444 // TODO: In wave64 mode, double the number of cycles for VALU and VMEM
445 // instructions on the assumption that they will usually have to be issued
446 // twice?
447 State.advance(Type, Cycles);
448
449 LLVM_DEBUG(dbgs() << " State after " << MI; State.dump(TRI););
450 }
451
452 if (Emit) {
453 assert(State == BlockState[&MBB] &&
454 "Basic block state should not have changed on final pass!");
455 } else if (DelayState &BS = BlockState[&MBB]; State != BS) {
456 BS = std::move(State);
457 Changed = true;
458 }
459 return Changed;
460 }
461
462 bool run(MachineFunction &MF) {
463 LLVM_DEBUG(dbgs() << "AMDGPUInsertDelayAlu running on " << MF.getName()
464 << "\n");
465
466 ST = &MF.getSubtarget<GCNSubtarget>();
467 if (!ST->hasDelayAlu())
468 return false;
469
470 SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
471
472 if (MFI.getMaxWavesPerEU() == 1)
473 return false;
474
475 SII = ST->getInstrInfo();
476 TRI = ST->getRegisterInfo();
477 SchedModel = &SII->getSchedModel();
478
479 // Calculate the delay state for each basic block, iterating until we reach
480 // a fixed point.
481 SetVector<MachineBasicBlock *> WorkList;
482 for (auto &MBB : reverse(C&: MF))
483 WorkList.insert(X: &MBB);
484 while (!WorkList.empty()) {
485 auto &MBB = *WorkList.pop_back_val();
486 bool Changed = runOnMachineBasicBlock(MBB, Emit: false);
487 if (Changed)
488 WorkList.insert_range(R: MBB.successors());
489 }
490
491 LLVM_DEBUG(dbgs() << "Final pass over all BBs\n");
492
493 // Make one last pass over all basic blocks to emit s_delay_alu
494 // instructions.
495 bool Changed = false;
496 for (auto &MBB : MF)
497 Changed |= runOnMachineBasicBlock(MBB, Emit: true);
498 return Changed;
499 }
500};
501
502class AMDGPUInsertDelayAluLegacy : public MachineFunctionPass {
503public:
504 static char ID;
505
506 AMDGPUInsertDelayAluLegacy() : MachineFunctionPass(ID) {}
507
508 void getAnalysisUsage(AnalysisUsage &AU) const override {
509 AU.setPreservesCFG();
510 MachineFunctionPass::getAnalysisUsage(AU);
511 }
512
513 bool runOnMachineFunction(MachineFunction &MF) override {
514 if (skipFunction(F: MF.getFunction()))
515 return false;
516 AMDGPUInsertDelayAlu Impl;
517 return Impl.run(MF);
518 }
519};
520} // namespace
521
522PreservedAnalyses
523AMDGPUInsertDelayAluPass::run(MachineFunction &MF,
524 MachineFunctionAnalysisManager &MFAM) {
525 if (!AMDGPUInsertDelayAlu().run(MF))
526 return PreservedAnalyses::all();
527 auto PA = getMachineFunctionPassPreservedAnalyses();
528 PA.preserveSet<CFGAnalyses>();
529 return PA;
530} // end namespace llvm
531
532char AMDGPUInsertDelayAluLegacy::ID = 0;
533
534char &llvm::AMDGPUInsertDelayAluID = AMDGPUInsertDelayAluLegacy::ID;
535
536INITIALIZE_PASS(AMDGPUInsertDelayAluLegacy, DEBUG_TYPE,
537 "AMDGPU Insert Delay ALU", false, false)
538