1//===- AMDGPUInsertDelayAlu.cpp - Insert s_delay_alu instructions ---------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Insert s_delay_alu instructions to avoid stalls on GFX11+.
11//
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPU.h"
15#include "GCNSubtarget.h"
16#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
17#include "SIInstrInfo.h"
18#include "llvm/ADT/SetVector.h"
19
20using namespace llvm;
21
22#define DEBUG_TYPE "amdgpu-insert-delay-alu"
23
24namespace {
25
// Worker that computes where s_delay_alu instructions are needed in a machine
// function and inserts them. Shared by the legacy and new-PM pass wrappers
// below.
class AMDGPUInsertDelayAlu {
public:
  const GCNSubtarget *ST;
  const SIInstrInfo *SII;
  const TargetRegisterInfo *TRI;

  // Scheduling model used to compute def-operand latencies.
  const TargetSchedModel *SchedModel;

  // Return true if MI waits for all outstanding VALU instructions to complete.
  static bool instructionWaitsForVALU(const MachineInstr &MI) {
    // These instruction types wait for VA_VDST==0 before issuing.
    const uint64_t VA_VDST_0 = SIInstrFlags::DS | SIInstrFlags::EXP |
                               SIInstrFlags::FLAT | SIInstrFlags::MIMG |
                               SIInstrFlags::MTBUF | SIInstrFlags::MUBUF;
    if (MI.getDesc().TSFlags & VA_VDST_0)
      return true;
    if (MI.getOpcode() == AMDGPU::S_SENDMSG_RTN_B32 ||
        MI.getOpcode() == AMDGPU::S_SENDMSG_RTN_B64)
      return true;
    // An explicit s_waitcnt_depctr whose VA_VDST field is zero also drains
    // all outstanding VALU writes.
    if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
        AMDGPU::DepCtr::decodeFieldVaVdst(MI.getOperand(0).getImm()) == 0)
      return true;
    return false;
  }

  // Return true if MI waits for all outstanding VALU writes to SGPRs to
  // complete before issuing.
  static bool instructionWaitsForSGPRWrites(const MachineInstr &MI) {
    // These instruction types wait for VA_SDST==0 before issuing.
    uint64_t MIFlags = MI.getDesc().TSFlags;
    if (MIFlags & SIInstrFlags::SMRD)
      return true;

    // Any SALU instruction that touches a register also waits.
    if (MIFlags & SIInstrFlags::SALU) {
      for (auto &Op : MI.operands()) {
        if (Op.isReg())
          return true;
      }
    }
    return false;
  }

  // Types of delay that can be encoded in an s_delay_alu instruction.
  enum DelayType { VALU, TRANS, SALU, OTHER };

  // Get the delay type for a MachineInstr.
  DelayType getDelayType(const MachineInstr &MI) {
    // Non-F64 TRANS instructions use a separate delay type.
    if (SIInstrInfo::isTRANS(MI) &&
        !AMDGPU::isDPMACCInstruction(MI.getOpcode()))
      return TRANS;
    // WMMA XDL ops are treated the same as TRANS.
    if (ST->hasGFX1250Insts() && SII->isXDLWMMA(MI))
      return TRANS;
    if (SIInstrInfo::isVALU(MI))
      return VALU;
    if (SIInstrInfo::isSALU(MI))
      return SALU;
    return OTHER;
  }

  // Information about the last instruction(s) that wrote to a particular
  // regunit. In straight-line code there will only be one such instruction, but
  // when control flow converges we merge the delay information from each path
  // to represent the union of the worst-case delays of each type.
  struct DelayInfo {
    // One larger than the maximum number of (non-TRANS) VALU instructions we
    // can encode in an s_delay_alu instruction.
    static constexpr unsigned VALU_MAX = 5;

    // One larger than the maximum number of TRANS instructions we can encode in
    // an s_delay_alu instruction.
    static constexpr unsigned TRANS_MAX = 4;

    // One larger than the maximum number of SALU cycles we can encode in an
    // s_delay_alu instruction.
    static constexpr unsigned SALU_CYCLES_MAX = 4;

    // If it was written by a (non-TRANS) VALU, remember how many clock cycles
    // are left until it completes, and how many other (non-TRANS) VALU we have
    // seen since it was issued.
    uint8_t VALUCycles = 0;
    uint8_t VALUNum = VALU_MAX;

    // If it was written by a TRANS, remember how many clock cycles are left
    // until it completes, and how many other TRANS we have seen since it was
    // issued.
    uint8_t TRANSCycles = 0;
    uint8_t TRANSNum = TRANS_MAX;
    // Also remember how many other (non-TRANS) VALU we have seen since it was
    // issued. When an instruction depends on both a prior TRANS and a prior
    // non-TRANS VALU, this is used to decide whether to encode a wait for just
    // one or both of them.
    uint8_t TRANSNumVALU = VALU_MAX;

    // If it was written by an SALU, remember how many clock cycles are left
    // until it completes.
    uint8_t SALUCycles = 0;

    DelayInfo() = default;

    // Record a single write of the given type with the given latency.
    DelayInfo(DelayType Type, unsigned Cycles) {
      switch (Type) {
      default:
        llvm_unreachable("unexpected type");
      case VALU:
        VALUCycles = Cycles;
        VALUNum = 0;
        break;
      case TRANS:
        TRANSCycles = Cycles;
        TRANSNum = 0;
        TRANSNumVALU = 0;
        break;
      case SALU:
        // Guard against pseudo-instructions like SI_CALL which are marked as
        // SALU but with a very high latency.
        SALUCycles = std::min(Cycles, SALU_CYCLES_MAX);
        break;
      }
    }

    bool operator==(const DelayInfo &RHS) const {
      return VALUCycles == RHS.VALUCycles && VALUNum == RHS.VALUNum &&
             TRANSCycles == RHS.TRANSCycles && TRANSNum == RHS.TRANSNum &&
             TRANSNumVALU == RHS.TRANSNumVALU && SALUCycles == RHS.SALUCycles;
    }

    bool operator!=(const DelayInfo &RHS) const { return !(*this == RHS); }

    // Merge another DelayInfo into this one, to represent the union of the
    // worst-case delays of each type.
    void merge(const DelayInfo &RHS) {
      // Cycles-remaining counters take the maximum (slowest path); the
      // instructions-since-issue counters take the minimum (most recent).
      VALUCycles = std::max(VALUCycles, RHS.VALUCycles);
      VALUNum = std::min(VALUNum, RHS.VALUNum);
      TRANSCycles = std::max(TRANSCycles, RHS.TRANSCycles);
      TRANSNum = std::min(TRANSNum, RHS.TRANSNum);
      TRANSNumVALU = std::min(TRANSNumVALU, RHS.TRANSNumVALU);
      SALUCycles = std::max(SALUCycles, RHS.SALUCycles);
    }

    // Update this DelayInfo after issuing an instruction of the specified type.
    // Cycles is the number of cycles it takes to issue the instruction. Return
    // true if there is no longer any useful delay info.
    bool advance(DelayType Type, unsigned Cycles) {
      bool Erase = true;

      VALUNum += (Type == VALU);
      if (VALUNum >= VALU_MAX || VALUCycles <= Cycles) {
        // Forget about the VALU instruction. It was too far back or has
        // definitely completed by now.
        VALUNum = VALU_MAX;
        VALUCycles = 0;
      } else {
        VALUCycles -= Cycles;
        Erase = false;
      }

      TRANSNum += (Type == TRANS);
      TRANSNumVALU += (Type == VALU);
      if (TRANSNum >= TRANS_MAX || TRANSCycles <= Cycles) {
        // Forget about any TRANS instruction. It was too far back or has
        // definitely completed by now.
        TRANSNum = TRANS_MAX;
        TRANSNumVALU = VALU_MAX;
        TRANSCycles = 0;
      } else {
        TRANSCycles -= Cycles;
        Erase = false;
      }

      if (SALUCycles <= Cycles) {
        // Forget about any SALU instruction. It has definitely completed by
        // now.
        SALUCycles = 0;
      } else {
        SALUCycles -= Cycles;
        Erase = false;
      }

      return Erase;
    }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    // Print only the fields that still carry useful information.
    void dump() const {
      if (VALUCycles)
        dbgs() << " VALUCycles=" << (int)VALUCycles;
      if (VALUNum < VALU_MAX)
        dbgs() << " VALUNum=" << (int)VALUNum;
      if (TRANSCycles)
        dbgs() << " TRANSCycles=" << (int)TRANSCycles;
      if (TRANSNum < TRANS_MAX)
        dbgs() << " TRANSNum=" << (int)TRANSNum;
      if (TRANSNumVALU < VALU_MAX)
        dbgs() << " TRANSNumVALU=" << (int)TRANSNumVALU;
      if (SALUCycles)
        dbgs() << " SALUCycles=" << (int)SALUCycles;
    }
#endif
  };

  // A map from regunits to the delay info for that regunit.
  struct DelayState : DenseMap<MCRegUnit, DelayInfo> {
    // Merge another DelayState into this one by merging the delay info for each
    // regunit.
    void merge(const DelayState &RHS) {
      for (const auto &KV : RHS) {
        iterator It;
        bool Inserted;
        std::tie(It, Inserted) = insert(KV);
        if (!Inserted)
          It->second.merge(KV.second);
      }
    }

    // Advance the delay info for each regunit, erasing any that are no longer
    // useful.
    void advance(DelayType Type, unsigned Cycles) {
      // Grab the successor up front because erase(I) invalidates I.
      iterator Next;
      for (auto I = begin(), E = end(); I != E; I = Next) {
        Next = std::next(I);
        if (I->second.advance(Type, Cycles))
          erase(I);
      }
    }

    // Drop every (non-TRANS) VALU entry that is at least VALUNum instructions
    // old: the implicit wait that triggered this call already covers them.
    void advanceByVALUNum(unsigned VALUNum) {
      iterator Next;
      for (auto I = begin(), E = end(); I != E; I = Next) {
        Next = std::next(I);
        if (I->second.VALUNum >= VALUNum && I->second.VALUCycles > 0) {
          erase(I);
        }
      }
    }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    void dump(const TargetRegisterInfo *TRI) const {
      if (empty()) {
        dbgs() << "    empty\n";
        return;
      }

      // Dump DelayInfo for each RegUnit in numerical order.
      SmallVector<const_iterator, 8> Order;
      Order.reserve(size());
      for (const_iterator I = begin(), E = end(); I != E; ++I)
        Order.push_back(I);
      llvm::sort(Order, [](const const_iterator &A, const const_iterator &B) {
        return A->first < B->first;
      });
      for (const_iterator I : Order) {
        dbgs() << "    " << printRegUnit(I->first, TRI);
        I->second.dump();
        dbgs() << "\n";
      }
    }
#endif
  };

  // The saved delay state at the end of each basic block.
  DenseMap<MachineBasicBlock *, DelayState> BlockState;

  // Emit an s_delay_alu instruction if necessary before MI.
  //
  // The immediate is assembled as: first instruction-id in bits [3:0] (mask
  // 0xf), a skip count in bits [6:4], and a second instruction-id in bits
  // [10:7] (mask 0x780) -- hence the shifts by 4 and 7 below. Returns the
  // emitted instruction when it still has a free second slot that a later
  // call may fill in, otherwise nullptr (or LastDelayAlu unchanged when
  // nothing was emitted).
  MachineInstr *emitDelayAlu(MachineInstr &MI, DelayInfo Delay,
                             MachineInstr *LastDelayAlu) {
    unsigned Imm = 0;

    // Wait for a TRANS instruction. (TRANS dependencies encode as 4 + N.)
    if (Delay.TRANSNum < DelayInfo::TRANS_MAX)
      Imm |= 4 + Delay.TRANSNum;

    // Wait for a VALU instruction (if it's more recent than any TRANS
    // instruction that we're also waiting for).
    if (Delay.VALUNum < DelayInfo::VALU_MAX &&
        Delay.VALUNum <= Delay.TRANSNumVALU) {
      if (Imm & 0xf)
        Imm |= Delay.VALUNum << 7;
      else
        Imm |= Delay.VALUNum;
    }

    // Wait for an SALU instruction. (SALU cycle waits encode as 8 + N.)
    if (Delay.SALUCycles) {
      assert(Delay.SALUCycles < DelayInfo::SALU_CYCLES_MAX);
      if (Imm & 0x780) {
        // We have already encoded a VALU and a TRANS delay. There's no room in
        // the encoding for an SALU delay as well, so just drop it.
      } else if (Imm & 0xf) {
        Imm |= (Delay.SALUCycles + 8) << 7;
      } else {
        Imm |= Delay.SALUCycles + 8;
      }
    }

    // Don't emit the s_delay_alu instruction if there's nothing to wait for.
    if (!Imm)
      return LastDelayAlu;

    // If we only need to wait for one instruction, try encoding it in the last
    // s_delay_alu that we emitted.
    if (!(Imm & 0x780) && LastDelayAlu) {
      // Count the real instructions between the previous s_delay_alu and MI
      // to see whether MI is close enough for the skip field.
      unsigned Skip = 0;
      for (auto I = MachineBasicBlock::instr_iterator(LastDelayAlu),
                E = MachineBasicBlock::instr_iterator(MI);
           ++I != E;) {
        if (I->getOpcode() == AMDGPU::S_SET_VGPR_MSB) {
          // It is not deterministic whether the skip count counts
          // S_SET_VGPR_MSB instructions or not, so do not include them in a
          // skip region.
          Skip = 6;
          break;
        }
        if (!I->isBundle() && !I->isMetaInstruction())
          ++Skip;
      }
      if (Skip < 6) {
        // Splice this delay into the previous instruction's free second slot.
        MachineOperand &Op = LastDelayAlu->getOperand(0);
        unsigned LastImm = Op.getImm();
        assert((LastImm & ~0xf) == 0 &&
               "Remembered an s_delay_alu with no room for another delay!");
        LastImm |= Imm << 7 | Skip << 4;
        Op.setImm(LastImm);
        return nullptr;
      }
    }

    auto &MBB = *MI.getParent();
    MachineInstr *DelayAlu =
        BuildMI(MBB, MI, DebugLoc(), SII->get(AMDGPU::S_DELAY_ALU)).addImm(Imm);
    // Remember the s_delay_alu for next time if there is still room in it to
    // encode another delay.
    return (Imm & 0x780) ? nullptr : DelayAlu;
  }

  // Process one basic block: with Emit=false just recompute its end-of-block
  // delay state; with Emit=true also insert the s_delay_alu instructions.
  // Returns true if the recorded end-of-block state changed.
  //
  // NOTE(review): in Emit mode Changed is never set, even when emitDelayAlu
  // inserts instructions, so the final pass always reports "no change" to its
  // caller -- confirm this is intended before relying on the return value.
  bool runOnMachineBasicBlock(MachineBasicBlock &MBB, bool Emit) {
    // Start from the union of the predecessors' end-of-block states.
    DelayState State;
    for (auto *Pred : MBB.predecessors())
      State.merge(BlockState[Pred]);

    LLVM_DEBUG(dbgs() << "  State at start of " << printMBBReference(MBB)
                      << "\n";
               State.dump(TRI););

    bool Changed = false;
    MachineInstr *LastDelayAlu = nullptr;

    // FIXME: 0 is a valid register unit.
    MCRegUnit LastSGPRFromVALU = static_cast<MCRegUnit>(0);
    // Iterate over the contents of bundles, but don't emit any instructions
    // inside a bundle.
    for (auto &MI : MBB.instrs()) {
      if (MI.isBundle() || MI.isMetaInstruction())
        continue;

      // Ignore some more instructions that do not generate any code.
      switch (MI.getOpcode()) {
      case AMDGPU::SI_RETURN_TO_EPILOG:
        continue;
      }

      DelayType Type = getDelayType(MI);

      if (instructionWaitsForSGPRWrites(MI)) {
        // MI implicitly waits for the VALU that last wrote an SGPR, which
        // also covers every at-least-as-old VALU write we are tracking.
        auto It = State.find(LastSGPRFromVALU);
        if (It != State.end()) {
          DelayInfo Info = It->getSecond();
          State.advanceByVALUNum(Info.VALUNum);
          // FIXME: 0 is a valid register unit.
          LastSGPRFromVALU = static_cast<MCRegUnit>(0);
        }
      }

      if (instructionWaitsForVALU(MI)) {
        // Forget about all outstanding VALU delays.
        // TODO: This is overkill since it also forgets about SALU delays.
        State = DelayState();
      } else if (Type != OTHER) {
        DelayInfo Delay;
        // TODO: Scan implicit uses too?
        for (const auto &Op : MI.explicit_uses()) {
          if (Op.isReg()) {
            // One of the operands of the writelane is also the output operand.
            // This creates the insertion of redundant delays. Hence, we have to
            // ignore this operand.
            if (MI.getOpcode() == AMDGPU::V_WRITELANE_B32 && Op.isTied())
              continue;
            for (MCRegUnit Unit : TRI->regunits(Op.getReg())) {
              auto It = State.find(Unit);
              if (It != State.end()) {
                // Accumulate the worst-case delay for this use and stop
                // tracking the regunit: the wait we emit satisfies it.
                Delay.merge(It->second);
                State.erase(Unit);
              }
            }
          }
        }

        // Remember the first SGPR def of a VALU so that a later implicit
        // VA_SDST wait (see above) can prune the state.
        if (SII->isVALU(MI.getOpcode())) {
          for (const auto &Op : MI.defs()) {
            Register Reg = Op.getReg();
            if (AMDGPU::isSGPR(Reg, TRI)) {
              LastSGPRFromVALU = *TRI->regunits(Reg).begin();
              break;
            }
          }
        }

        if (Emit && !MI.isBundledWithPred()) {
          // TODO: For VALU->SALU delays should we use s_delay_alu or s_nop or
          // just ignore them?
          LastDelayAlu = emitDelayAlu(MI, Delay, LastDelayAlu);
        }
      }

      if (Type != OTHER) {
        // Record this instruction's writes so later readers can wait on them.
        // TODO: Scan implicit defs too?
        for (const auto &Op : MI.defs()) {
          unsigned Latency = SchedModel->computeOperandLatency(
              &MI, Op.getOperandNo(), nullptr, 0);
          for (MCRegUnit Unit : TRI->regunits(Op.getReg()))
            State[Unit] = DelayInfo(Type, Latency);
        }
      }

      // Advance by the number of cycles it takes to issue this instruction.
      // TODO: Use a more advanced model that accounts for instructions that
      // take multiple cycles to issue on a particular pipeline.
      unsigned Cycles = SIInstrInfo::getNumWaitStates(MI);
      // TODO: In wave64 mode, double the number of cycles for VALU and VMEM
      // instructions on the assumption that they will usually have to be issued
      // twice?
      State.advance(Type, Cycles);

      LLVM_DEBUG(dbgs() << "  State after " << MI; State.dump(TRI););
    }

    if (Emit) {
      assert(State == BlockState[&MBB] &&
             "Basic block state should not have changed on final pass!");
    } else if (DelayState &BS = BlockState[&MBB]; State != BS) {
      BS = std::move(State);
      Changed = true;
    }
    return Changed;
  }

  // Entry point: compute the per-block delay states to a fixed point, then
  // make a final pass that actually inserts the s_delay_alu instructions.
  bool run(MachineFunction &MF) {
    LLVM_DEBUG(dbgs() << "AMDGPUInsertDelayAlu running on " << MF.getName()
                      << "\n");

    ST = &MF.getSubtarget<GCNSubtarget>();
    // Only GFX11+ subtargets have s_delay_alu.
    if (!ST->hasDelayAlu())
      return false;

    SII = ST->getInstrInfo();
    TRI = ST->getRegisterInfo();
    SchedModel = &SII->getSchedModel();

    // Calculate the delay state for each basic block, iterating until we reach
    // a fixed point.
    SetVector<MachineBasicBlock *> WorkList;
    for (auto &MBB : reverse(MF))
      WorkList.insert(&MBB);
    while (!WorkList.empty()) {
      auto &MBB = *WorkList.pop_back_val();
      bool Changed = runOnMachineBasicBlock(MBB, false);
      if (Changed)
        WorkList.insert_range(MBB.successors());
    }

    LLVM_DEBUG(dbgs() << "Final pass over all BBs\n");

    // Make one last pass over all basic blocks to emit s_delay_alu
    // instructions.
    bool Changed = false;
    for (auto &MBB : MF)
      Changed |= runOnMachineBasicBlock(MBB, true);
    return Changed;
  }
};
504
505class AMDGPUInsertDelayAluLegacy : public MachineFunctionPass {
506public:
507 static char ID;
508
509 AMDGPUInsertDelayAluLegacy() : MachineFunctionPass(ID) {}
510
511 void getAnalysisUsage(AnalysisUsage &AU) const override {
512 AU.setPreservesCFG();
513 MachineFunctionPass::getAnalysisUsage(AU);
514 }
515
516 bool runOnMachineFunction(MachineFunction &MF) override {
517 if (skipFunction(F: MF.getFunction()))
518 return false;
519 AMDGPUInsertDelayAlu Impl;
520 return Impl.run(MF);
521 }
522};
523} // namespace
524
// New pass manager entry point: delegate to the shared worker and report
// which analyses survive.
PreservedAnalyses
AMDGPUInsertDelayAluPass::run(MachineFunction &MF,
                              MachineFunctionAnalysisManager &MFAM) {
  if (!AMDGPUInsertDelayAlu().run(MF))
    return PreservedAnalyses::all();
  // Instructions were inserted, but the CFG shape is unchanged.
  auto PA = getMachineFunctionPassPreservedAnalyses();
  PA.preserveSet<CFGAnalyses>();
  return PA;
}
534
char AMDGPUInsertDelayAluLegacy::ID = 0;

// Opaque pass identifier other code can use to reference this pass.
char &llvm::AMDGPUInsertDelayAluID = AMDGPUInsertDelayAluLegacy::ID;

// Register the legacy pass with the pass registry.
INITIALIZE_PASS(AMDGPUInsertDelayAluLegacy, DEBUG_TYPE,
                "AMDGPU Insert Delay ALU", false, false)
541