1//===- AMDGPUInsertDelayAlu.cpp - Insert s_delay_alu instructions ---------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Insert s_delay_alu instructions to avoid stalls on GFX11+.
11//
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPU.h"
15#include "GCNSubtarget.h"
16#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
17#include "SIInstrInfo.h"
18#include "llvm/ADT/SetVector.h"
19
20using namespace llvm;
21
22#define DEBUG_TYPE "amdgpu-insert-delay-alu"
23
24namespace {
25
26class AMDGPUInsertDelayAlu : public MachineFunctionPass {
27public:
28 static char ID;
29
30 const SIInstrInfo *SII;
31 const TargetRegisterInfo *TRI;
32
33 TargetSchedModel SchedModel;
34
35 AMDGPUInsertDelayAlu() : MachineFunctionPass(ID) {}
36
37 void getAnalysisUsage(AnalysisUsage &AU) const override {
38 AU.setPreservesCFG();
39 MachineFunctionPass::getAnalysisUsage(AU);
40 }
41
42 // Return true if MI waits for all outstanding VALU instructions to complete.
43 static bool instructionWaitsForVALU(const MachineInstr &MI) {
44 // These instruction types wait for VA_VDST==0 before issuing.
45 const uint64_t VA_VDST_0 = SIInstrFlags::DS | SIInstrFlags::EXP |
46 SIInstrFlags::FLAT | SIInstrFlags::MIMG |
47 SIInstrFlags::MTBUF | SIInstrFlags::MUBUF;
48 if (MI.getDesc().TSFlags & VA_VDST_0)
49 return true;
50 if (MI.getOpcode() == AMDGPU::S_SENDMSG_RTN_B32 ||
51 MI.getOpcode() == AMDGPU::S_SENDMSG_RTN_B64)
52 return true;
53 if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
54 AMDGPU::DepCtr::decodeFieldVaVdst(Encoded: MI.getOperand(i: 0).getImm()) == 0)
55 return true;
56 return false;
57 }
58
59 // Types of delay that can be encoded in an s_delay_alu instruction.
60 enum DelayType { VALU, TRANS, SALU, OTHER };
61
62 // Get the delay type for an instruction with the specified TSFlags.
63 static DelayType getDelayType(uint64_t TSFlags) {
64 if (TSFlags & SIInstrFlags::TRANS)
65 return TRANS;
66 if (TSFlags & SIInstrFlags::VALU)
67 return VALU;
68 if (TSFlags & SIInstrFlags::SALU)
69 return SALU;
70 return OTHER;
71 }
72
73 // Information about the last instruction(s) that wrote to a particular
74 // regunit. In straight-line code there will only be one such instruction, but
75 // when control flow converges we merge the delay information from each path
76 // to represent the union of the worst-case delays of each type.
77 struct DelayInfo {
78 // One larger than the maximum number of (non-TRANS) VALU instructions we
79 // can encode in an s_delay_alu instruction.
80 static constexpr unsigned VALU_MAX = 5;
81
82 // One larger than the maximum number of TRANS instructions we can encode in
83 // an s_delay_alu instruction.
84 static constexpr unsigned TRANS_MAX = 4;
85
86 // One larger than the maximum number of SALU cycles we can encode in an
87 // s_delay_alu instruction.
88 static constexpr unsigned SALU_CYCLES_MAX = 4;
89
90 // If it was written by a (non-TRANS) VALU, remember how many clock cycles
91 // are left until it completes, and how many other (non-TRANS) VALU we have
92 // seen since it was issued.
93 uint8_t VALUCycles = 0;
94 uint8_t VALUNum = VALU_MAX;
95
96 // If it was written by a TRANS, remember how many clock cycles are left
97 // until it completes, and how many other TRANS we have seen since it was
98 // issued.
99 uint8_t TRANSCycles = 0;
100 uint8_t TRANSNum = TRANS_MAX;
101 // Also remember how many other (non-TRANS) VALU we have seen since it was
102 // issued. When an instruction depends on both a prior TRANS and a prior
103 // non-TRANS VALU, this is used to decide whether to encode a wait for just
104 // one or both of them.
105 uint8_t TRANSNumVALU = VALU_MAX;
106
107 // If it was written by an SALU, remember how many clock cycles are left
108 // until it completes.
109 uint8_t SALUCycles = 0;
110
111 DelayInfo() = default;
112
113 DelayInfo(DelayType Type, unsigned Cycles) {
114 switch (Type) {
115 default:
116 llvm_unreachable("unexpected type");
117 case VALU:
118 VALUCycles = Cycles;
119 VALUNum = 0;
120 break;
121 case TRANS:
122 TRANSCycles = Cycles;
123 TRANSNum = 0;
124 TRANSNumVALU = 0;
125 break;
126 case SALU:
127 // Guard against pseudo-instructions like SI_CALL which are marked as
128 // SALU but with a very high latency.
129 SALUCycles = std::min(a: Cycles, b: SALU_CYCLES_MAX);
130 break;
131 }
132 }
133
134 bool operator==(const DelayInfo &RHS) const {
135 return VALUCycles == RHS.VALUCycles && VALUNum == RHS.VALUNum &&
136 TRANSCycles == RHS.TRANSCycles && TRANSNum == RHS.TRANSNum &&
137 TRANSNumVALU == RHS.TRANSNumVALU && SALUCycles == RHS.SALUCycles;
138 }
139
140 bool operator!=(const DelayInfo &RHS) const { return !(*this == RHS); }
141
142 // Merge another DelayInfo into this one, to represent the union of the
143 // worst-case delays of each type.
144 void merge(const DelayInfo &RHS) {
145 VALUCycles = std::max(a: VALUCycles, b: RHS.VALUCycles);
146 VALUNum = std::min(a: VALUNum, b: RHS.VALUNum);
147 TRANSCycles = std::max(a: TRANSCycles, b: RHS.TRANSCycles);
148 TRANSNum = std::min(a: TRANSNum, b: RHS.TRANSNum);
149 TRANSNumVALU = std::min(a: TRANSNumVALU, b: RHS.TRANSNumVALU);
150 SALUCycles = std::max(a: SALUCycles, b: RHS.SALUCycles);
151 }
152
153 // Update this DelayInfo after issuing an instruction. IsVALU should be 1
154 // when issuing a (non-TRANS) VALU, else 0. IsTRANS should be 1 when issuing
155 // a TRANS, else 0. Cycles is the number of cycles it takes to issue the
156 // instruction. Return true if there is no longer any useful delay info.
157 bool advance(DelayType Type, unsigned Cycles) {
158 bool Erase = true;
159
160 VALUNum += (Type == VALU);
161 if (VALUNum >= VALU_MAX || VALUCycles <= Cycles) {
162 // Forget about the VALU instruction. It was too far back or has
163 // definitely completed by now.
164 VALUNum = VALU_MAX;
165 VALUCycles = 0;
166 } else {
167 VALUCycles -= Cycles;
168 Erase = false;
169 }
170
171 TRANSNum += (Type == TRANS);
172 TRANSNumVALU += (Type == VALU);
173 if (TRANSNum >= TRANS_MAX || TRANSCycles <= Cycles) {
174 // Forget about any TRANS instruction. It was too far back or has
175 // definitely completed by now.
176 TRANSNum = TRANS_MAX;
177 TRANSNumVALU = VALU_MAX;
178 TRANSCycles = 0;
179 } else {
180 TRANSCycles -= Cycles;
181 Erase = false;
182 }
183
184 if (SALUCycles <= Cycles) {
185 // Forget about any SALU instruction. It has definitely completed by
186 // now.
187 SALUCycles = 0;
188 } else {
189 SALUCycles -= Cycles;
190 Erase = false;
191 }
192
193 return Erase;
194 }
195
196#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
197 void dump() const {
198 if (VALUCycles)
199 dbgs() << " VALUCycles=" << (int)VALUCycles;
200 if (VALUNum < VALU_MAX)
201 dbgs() << " VALUNum=" << (int)VALUNum;
202 if (TRANSCycles)
203 dbgs() << " TRANSCycles=" << (int)TRANSCycles;
204 if (TRANSNum < TRANS_MAX)
205 dbgs() << " TRANSNum=" << (int)TRANSNum;
206 if (TRANSNumVALU < VALU_MAX)
207 dbgs() << " TRANSNumVALU=" << (int)TRANSNumVALU;
208 if (SALUCycles)
209 dbgs() << " SALUCycles=" << (int)SALUCycles;
210 }
211#endif
212 };
213
214 // A map from regunits to the delay info for that regunit.
215 struct DelayState : DenseMap<unsigned, DelayInfo> {
216 // Merge another DelayState into this one by merging the delay info for each
217 // regunit.
218 void merge(const DelayState &RHS) {
219 for (const auto &KV : RHS) {
220 iterator It;
221 bool Inserted;
222 std::tie(args&: It, args&: Inserted) = insert(KV);
223 if (!Inserted)
224 It->second.merge(RHS: KV.second);
225 }
226 }
227
228 // Advance the delay info for each regunit, erasing any that are no longer
229 // useful.
230 void advance(DelayType Type, unsigned Cycles) {
231 iterator Next;
232 for (auto I = begin(), E = end(); I != E; I = Next) {
233 Next = std::next(x: I);
234 if (I->second.advance(Type, Cycles))
235 erase(I);
236 }
237 }
238
239#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
240 void dump(const TargetRegisterInfo *TRI) const {
241 if (empty()) {
242 dbgs() << " empty\n";
243 return;
244 }
245
246 // Dump DelayInfo for each RegUnit in numerical order.
247 SmallVector<const_iterator, 8> Order;
248 Order.reserve(size());
249 for (const_iterator I = begin(), E = end(); I != E; ++I)
250 Order.push_back(I);
251 llvm::sort(Order, [](const const_iterator &A, const const_iterator &B) {
252 return A->first < B->first;
253 });
254 for (const_iterator I : Order) {
255 dbgs() << " " << printRegUnit(I->first, TRI);
256 I->second.dump();
257 dbgs() << "\n";
258 }
259 }
260#endif
261 };
262
263 // The saved delay state at the end of each basic block.
264 DenseMap<MachineBasicBlock *, DelayState> BlockState;
265
266 // Emit an s_delay_alu instruction if necessary before MI.
267 MachineInstr *emitDelayAlu(MachineInstr &MI, DelayInfo Delay,
268 MachineInstr *LastDelayAlu) {
269 unsigned Imm = 0;
270
271 // Wait for a TRANS instruction.
272 if (Delay.TRANSNum < DelayInfo::TRANS_MAX)
273 Imm |= 4 + Delay.TRANSNum;
274
275 // Wait for a VALU instruction (if it's more recent than any TRANS
276 // instruction that we're also waiting for).
277 if (Delay.VALUNum < DelayInfo::VALU_MAX &&
278 Delay.VALUNum <= Delay.TRANSNumVALU) {
279 if (Imm & 0xf)
280 Imm |= Delay.VALUNum << 7;
281 else
282 Imm |= Delay.VALUNum;
283 }
284
285 // Wait for an SALU instruction.
286 if (Delay.SALUCycles) {
287 assert(Delay.SALUCycles < DelayInfo::SALU_CYCLES_MAX);
288 if (Imm & 0x780) {
289 // We have already encoded a VALU and a TRANS delay. There's no room in
290 // the encoding for an SALU delay as well, so just drop it.
291 } else if (Imm & 0xf) {
292 Imm |= (Delay.SALUCycles + 8) << 7;
293 } else {
294 Imm |= Delay.SALUCycles + 8;
295 }
296 }
297
298 // Don't emit the s_delay_alu instruction if there's nothing to wait for.
299 if (!Imm)
300 return LastDelayAlu;
301
302 // If we only need to wait for one instruction, try encoding it in the last
303 // s_delay_alu that we emitted.
304 if (!(Imm & 0x780) && LastDelayAlu) {
305 unsigned Skip = 0;
306 for (auto I = MachineBasicBlock::instr_iterator(LastDelayAlu),
307 E = MachineBasicBlock::instr_iterator(MI);
308 ++I != E;) {
309 if (!I->isBundle() && !I->isMetaInstruction())
310 ++Skip;
311 }
312 if (Skip < 6) {
313 MachineOperand &Op = LastDelayAlu->getOperand(i: 0);
314 unsigned LastImm = Op.getImm();
315 assert((LastImm & ~0xf) == 0 &&
316 "Remembered an s_delay_alu with no room for another delay!");
317 LastImm |= Imm << 7 | Skip << 4;
318 Op.setImm(LastImm);
319 return nullptr;
320 }
321 }
322
323 auto &MBB = *MI.getParent();
324 MachineInstr *DelayAlu =
325 BuildMI(BB&: MBB, I&: MI, MIMD: DebugLoc(), MCID: SII->get(Opcode: AMDGPU::S_DELAY_ALU)).addImm(Val: Imm);
326 // Remember the s_delay_alu for next time if there is still room in it to
327 // encode another delay.
328 return (Imm & 0x780) ? nullptr : DelayAlu;
329 }
330
331 bool runOnMachineBasicBlock(MachineBasicBlock &MBB, bool Emit) {
332 DelayState State;
333 for (auto *Pred : MBB.predecessors())
334 State.merge(RHS: BlockState[Pred]);
335
336 LLVM_DEBUG(dbgs() << " State at start of " << printMBBReference(MBB)
337 << "\n";
338 State.dump(TRI););
339
340 bool Changed = false;
341 MachineInstr *LastDelayAlu = nullptr;
342
343 // Iterate over the contents of bundles, but don't emit any instructions
344 // inside a bundle.
345 for (auto &MI : MBB.instrs()) {
346 if (MI.isBundle() || MI.isMetaInstruction())
347 continue;
348
349 // Ignore some more instructions that do not generate any code.
350 switch (MI.getOpcode()) {
351 case AMDGPU::SI_RETURN_TO_EPILOG:
352 continue;
353 }
354
355 DelayType Type = getDelayType(TSFlags: MI.getDesc().TSFlags);
356
357 if (instructionWaitsForVALU(MI)) {
358 // Forget about all outstanding VALU delays.
359 // TODO: This is overkill since it also forgets about SALU delays.
360 State = DelayState();
361 } else if (Type != OTHER) {
362 DelayInfo Delay;
363 // TODO: Scan implicit uses too?
364 for (const auto &Op : MI.explicit_uses()) {
365 if (Op.isReg()) {
366 // One of the operands of the writelane is also the output operand.
367 // This creates the insertion of redundant delays. Hence, we have to
368 // ignore this operand.
369 if (MI.getOpcode() == AMDGPU::V_WRITELANE_B32 && Op.isTied())
370 continue;
371 for (MCRegUnit Unit : TRI->regunits(Reg: Op.getReg())) {
372 auto It = State.find(Val: Unit);
373 if (It != State.end()) {
374 Delay.merge(RHS: It->second);
375 State.erase(Val: Unit);
376 }
377 }
378 }
379 }
380 if (Emit && !MI.isBundledWithPred()) {
381 // TODO: For VALU->SALU delays should we use s_delay_alu or s_nop or
382 // just ignore them?
383 LastDelayAlu = emitDelayAlu(MI, Delay, LastDelayAlu);
384 }
385 }
386
387 if (Type != OTHER) {
388 // TODO: Scan implicit defs too?
389 for (const auto &Op : MI.defs()) {
390 unsigned Latency = SchedModel.computeOperandLatency(
391 DefMI: &MI, DefOperIdx: Op.getOperandNo(), UseMI: nullptr, UseOperIdx: 0);
392 for (MCRegUnit Unit : TRI->regunits(Reg: Op.getReg()))
393 State[Unit] = DelayInfo(Type, Latency);
394 }
395 }
396
397 // Advance by the number of cycles it takes to issue this instruction.
398 // TODO: Use a more advanced model that accounts for instructions that
399 // take multiple cycles to issue on a particular pipeline.
400 unsigned Cycles = SIInstrInfo::getNumWaitStates(MI);
401 // TODO: In wave64 mode, double the number of cycles for VALU and VMEM
402 // instructions on the assumption that they will usually have to be issued
403 // twice?
404 State.advance(Type, Cycles);
405
406 LLVM_DEBUG(dbgs() << " State after " << MI; State.dump(TRI););
407 }
408
409 if (Emit) {
410 assert(State == BlockState[&MBB] &&
411 "Basic block state should not have changed on final pass!");
412 } else if (State != BlockState[&MBB]) {
413 BlockState[&MBB] = std::move(State);
414 Changed = true;
415 }
416 return Changed;
417 }
418
419 bool runOnMachineFunction(MachineFunction &MF) override {
420 if (skipFunction(F: MF.getFunction()))
421 return false;
422
423 LLVM_DEBUG(dbgs() << "AMDGPUInsertDelayAlu running on " << MF.getName()
424 << "\n");
425
426 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
427 if (!ST.hasDelayAlu())
428 return false;
429
430 SII = ST.getInstrInfo();
431 TRI = ST.getRegisterInfo();
432
433 SchedModel.init(TSInfo: &ST);
434
435 // Calculate the delay state for each basic block, iterating until we reach
436 // a fixed point.
437 SetVector<MachineBasicBlock *> WorkList;
438 for (auto &MBB : reverse(C&: MF))
439 WorkList.insert(X: &MBB);
440 while (!WorkList.empty()) {
441 auto &MBB = *WorkList.pop_back_val();
442 bool Changed = runOnMachineBasicBlock(MBB, Emit: false);
443 if (Changed)
444 WorkList.insert(Start: MBB.succ_begin(), End: MBB.succ_end());
445 }
446
447 LLVM_DEBUG(dbgs() << "Final pass over all BBs\n");
448
449 // Make one last pass over all basic blocks to emit s_delay_alu
450 // instructions.
451 bool Changed = false;
452 for (auto &MBB : MF)
453 Changed |= runOnMachineBasicBlock(MBB, Emit: true);
454 return Changed;
455 }
456};
457
458} // namespace
459
460char AMDGPUInsertDelayAlu::ID = 0;
461
462char &llvm::AMDGPUInsertDelayAluID = AMDGPUInsertDelayAlu::ID;
463
464INITIALIZE_PASS(AMDGPUInsertDelayAlu, DEBUG_TYPE, "AMDGPU Insert Delay ALU",
465 false, false)
466