//===- AMDGPUInsertDelayAlu.cpp - Insert s_delay_alu instructions ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Insert s_delay_alu instructions to avoid stalls on GFX11+.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "llvm/ADT/SetVector.h"
#include <algorithm>
#include <cstdint>
#include <iterator>
#include <tuple>

20using namespace llvm;
21
22#define DEBUG_TYPE "amdgpu-insert-delay-alu"
23
24namespace {
25
26class AMDGPUInsertDelayAlu {
27public:
28 const SIInstrInfo *SII;
29 const TargetRegisterInfo *TRI;
30
31 const TargetSchedModel *SchedModel;
32
33 // Return true if MI waits for all outstanding VALU instructions to complete.
34 static bool instructionWaitsForVALU(const MachineInstr &MI) {
35 // These instruction types wait for VA_VDST==0 before issuing.
36 const uint64_t VA_VDST_0 = SIInstrFlags::DS | SIInstrFlags::EXP |
37 SIInstrFlags::FLAT | SIInstrFlags::MIMG |
38 SIInstrFlags::MTBUF | SIInstrFlags::MUBUF;
39 if (MI.getDesc().TSFlags & VA_VDST_0)
40 return true;
41 if (MI.getOpcode() == AMDGPU::S_SENDMSG_RTN_B32 ||
42 MI.getOpcode() == AMDGPU::S_SENDMSG_RTN_B64)
43 return true;
44 if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
45 AMDGPU::DepCtr::decodeFieldVaVdst(Encoded: MI.getOperand(i: 0).getImm()) == 0)
46 return true;
47 return false;
48 }
49
50 static bool instructionWaitsForSGPRWrites(const MachineInstr &MI) {
51 // These instruction types wait for VA_SDST==0 before issuing.
52 uint64_t MIFlags = MI.getDesc().TSFlags;
53 if (MIFlags & SIInstrFlags::SMRD)
54 return true;
55
56 if (MIFlags & SIInstrFlags::SALU) {
57 for (auto &Op : MI.operands()) {
58 if (Op.isReg())
59 return true;
60 }
61 }
62 return false;
63 }
64
65 // Types of delay that can be encoded in an s_delay_alu instruction.
66 enum DelayType { VALU, TRANS, SALU, OTHER };
67
68 // Get the delay type for an instruction with the specified TSFlags.
69 static DelayType getDelayType(uint64_t TSFlags) {
70 if (TSFlags & SIInstrFlags::TRANS)
71 return TRANS;
72 if (TSFlags & SIInstrFlags::VALU)
73 return VALU;
74 if (TSFlags & SIInstrFlags::SALU)
75 return SALU;
76 return OTHER;
77 }
78
79 // Information about the last instruction(s) that wrote to a particular
80 // regunit. In straight-line code there will only be one such instruction, but
81 // when control flow converges we merge the delay information from each path
82 // to represent the union of the worst-case delays of each type.
83 struct DelayInfo {
84 // One larger than the maximum number of (non-TRANS) VALU instructions we
85 // can encode in an s_delay_alu instruction.
86 static constexpr unsigned VALU_MAX = 5;
87
88 // One larger than the maximum number of TRANS instructions we can encode in
89 // an s_delay_alu instruction.
90 static constexpr unsigned TRANS_MAX = 4;
91
92 // One larger than the maximum number of SALU cycles we can encode in an
93 // s_delay_alu instruction.
94 static constexpr unsigned SALU_CYCLES_MAX = 4;
95
96 // If it was written by a (non-TRANS) VALU, remember how many clock cycles
97 // are left until it completes, and how many other (non-TRANS) VALU we have
98 // seen since it was issued.
99 uint8_t VALUCycles = 0;
100 uint8_t VALUNum = VALU_MAX;
101
102 // If it was written by a TRANS, remember how many clock cycles are left
103 // until it completes, and how many other TRANS we have seen since it was
104 // issued.
105 uint8_t TRANSCycles = 0;
106 uint8_t TRANSNum = TRANS_MAX;
107 // Also remember how many other (non-TRANS) VALU we have seen since it was
108 // issued. When an instruction depends on both a prior TRANS and a prior
109 // non-TRANS VALU, this is used to decide whether to encode a wait for just
110 // one or both of them.
111 uint8_t TRANSNumVALU = VALU_MAX;
112
113 // If it was written by an SALU, remember how many clock cycles are left
114 // until it completes.
115 uint8_t SALUCycles = 0;
116
117 DelayInfo() = default;
118
119 DelayInfo(DelayType Type, unsigned Cycles) {
120 switch (Type) {
121 default:
122 llvm_unreachable("unexpected type");
123 case VALU:
124 VALUCycles = Cycles;
125 VALUNum = 0;
126 break;
127 case TRANS:
128 TRANSCycles = Cycles;
129 TRANSNum = 0;
130 TRANSNumVALU = 0;
131 break;
132 case SALU:
133 // Guard against pseudo-instructions like SI_CALL which are marked as
134 // SALU but with a very high latency.
135 SALUCycles = std::min(a: Cycles, b: SALU_CYCLES_MAX);
136 break;
137 }
138 }
139
140 bool operator==(const DelayInfo &RHS) const {
141 return VALUCycles == RHS.VALUCycles && VALUNum == RHS.VALUNum &&
142 TRANSCycles == RHS.TRANSCycles && TRANSNum == RHS.TRANSNum &&
143 TRANSNumVALU == RHS.TRANSNumVALU && SALUCycles == RHS.SALUCycles;
144 }
145
146 bool operator!=(const DelayInfo &RHS) const { return !(*this == RHS); }
147
148 // Merge another DelayInfo into this one, to represent the union of the
149 // worst-case delays of each type.
150 void merge(const DelayInfo &RHS) {
151 VALUCycles = std::max(a: VALUCycles, b: RHS.VALUCycles);
152 VALUNum = std::min(a: VALUNum, b: RHS.VALUNum);
153 TRANSCycles = std::max(a: TRANSCycles, b: RHS.TRANSCycles);
154 TRANSNum = std::min(a: TRANSNum, b: RHS.TRANSNum);
155 TRANSNumVALU = std::min(a: TRANSNumVALU, b: RHS.TRANSNumVALU);
156 SALUCycles = std::max(a: SALUCycles, b: RHS.SALUCycles);
157 }
158
159 // Update this DelayInfo after issuing an instruction of the specified type.
160 // Cycles is the number of cycles it takes to issue the instruction. Return
161 // true if there is no longer any useful delay info.
162 bool advance(DelayType Type, unsigned Cycles) {
163 bool Erase = true;
164
165 VALUNum += (Type == VALU);
166 if (VALUNum >= VALU_MAX || VALUCycles <= Cycles) {
167 // Forget about the VALU instruction. It was too far back or has
168 // definitely completed by now.
169 VALUNum = VALU_MAX;
170 VALUCycles = 0;
171 } else {
172 VALUCycles -= Cycles;
173 Erase = false;
174 }
175
176 TRANSNum += (Type == TRANS);
177 TRANSNumVALU += (Type == VALU);
178 if (TRANSNum >= TRANS_MAX || TRANSCycles <= Cycles) {
179 // Forget about any TRANS instruction. It was too far back or has
180 // definitely completed by now.
181 TRANSNum = TRANS_MAX;
182 TRANSNumVALU = VALU_MAX;
183 TRANSCycles = 0;
184 } else {
185 TRANSCycles -= Cycles;
186 Erase = false;
187 }
188
189 if (SALUCycles <= Cycles) {
190 // Forget about any SALU instruction. It has definitely completed by
191 // now.
192 SALUCycles = 0;
193 } else {
194 SALUCycles -= Cycles;
195 Erase = false;
196 }
197
198 return Erase;
199 }
200
201#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
202 void dump() const {
203 if (VALUCycles)
204 dbgs() << " VALUCycles=" << (int)VALUCycles;
205 if (VALUNum < VALU_MAX)
206 dbgs() << " VALUNum=" << (int)VALUNum;
207 if (TRANSCycles)
208 dbgs() << " TRANSCycles=" << (int)TRANSCycles;
209 if (TRANSNum < TRANS_MAX)
210 dbgs() << " TRANSNum=" << (int)TRANSNum;
211 if (TRANSNumVALU < VALU_MAX)
212 dbgs() << " TRANSNumVALU=" << (int)TRANSNumVALU;
213 if (SALUCycles)
214 dbgs() << " SALUCycles=" << (int)SALUCycles;
215 }
216#endif
217 };
218
219 // A map from regunits to the delay info for that regunit.
220 struct DelayState : DenseMap<unsigned, DelayInfo> {
221 // Merge another DelayState into this one by merging the delay info for each
222 // regunit.
223 void merge(const DelayState &RHS) {
224 for (const auto &KV : RHS) {
225 iterator It;
226 bool Inserted;
227 std::tie(args&: It, args&: Inserted) = insert(KV);
228 if (!Inserted)
229 It->second.merge(RHS: KV.second);
230 }
231 }
232
233 // Advance the delay info for each regunit, erasing any that are no longer
234 // useful.
235 void advance(DelayType Type, unsigned Cycles) {
236 iterator Next;
237 for (auto I = begin(), E = end(); I != E; I = Next) {
238 Next = std::next(x: I);
239 if (I->second.advance(Type, Cycles))
240 erase(I);
241 }
242 }
243
244 void advanceByVALUNum(unsigned VALUNum) {
245 iterator Next;
246 for (auto I = begin(), E = end(); I != E; I = Next) {
247 Next = std::next(x: I);
248 if (I->second.VALUNum >= VALUNum && I->second.VALUCycles > 0) {
249 erase(I);
250 }
251 }
252 }
253
254#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
255 void dump(const TargetRegisterInfo *TRI) const {
256 if (empty()) {
257 dbgs() << " empty\n";
258 return;
259 }
260
261 // Dump DelayInfo for each RegUnit in numerical order.
262 SmallVector<const_iterator, 8> Order;
263 Order.reserve(size());
264 for (const_iterator I = begin(), E = end(); I != E; ++I)
265 Order.push_back(I);
266 llvm::sort(Order, [](const const_iterator &A, const const_iterator &B) {
267 return A->first < B->first;
268 });
269 for (const_iterator I : Order) {
270 dbgs() << " " << printRegUnit(I->first, TRI);
271 I->second.dump();
272 dbgs() << "\n";
273 }
274 }
275#endif
276 };
277
278 // The saved delay state at the end of each basic block.
279 DenseMap<MachineBasicBlock *, DelayState> BlockState;
280
281 // Emit an s_delay_alu instruction if necessary before MI.
282 MachineInstr *emitDelayAlu(MachineInstr &MI, DelayInfo Delay,
283 MachineInstr *LastDelayAlu) {
284 unsigned Imm = 0;
285
286 // Wait for a TRANS instruction.
287 if (Delay.TRANSNum < DelayInfo::TRANS_MAX)
288 Imm |= 4 + Delay.TRANSNum;
289
290 // Wait for a VALU instruction (if it's more recent than any TRANS
291 // instruction that we're also waiting for).
292 if (Delay.VALUNum < DelayInfo::VALU_MAX &&
293 Delay.VALUNum <= Delay.TRANSNumVALU) {
294 if (Imm & 0xf)
295 Imm |= Delay.VALUNum << 7;
296 else
297 Imm |= Delay.VALUNum;
298 }
299
300 // Wait for an SALU instruction.
301 if (Delay.SALUCycles) {
302 assert(Delay.SALUCycles < DelayInfo::SALU_CYCLES_MAX);
303 if (Imm & 0x780) {
304 // We have already encoded a VALU and a TRANS delay. There's no room in
305 // the encoding for an SALU delay as well, so just drop it.
306 } else if (Imm & 0xf) {
307 Imm |= (Delay.SALUCycles + 8) << 7;
308 } else {
309 Imm |= Delay.SALUCycles + 8;
310 }
311 }
312
313 // Don't emit the s_delay_alu instruction if there's nothing to wait for.
314 if (!Imm)
315 return LastDelayAlu;
316
317 // If we only need to wait for one instruction, try encoding it in the last
318 // s_delay_alu that we emitted.
319 if (!(Imm & 0x780) && LastDelayAlu) {
320 unsigned Skip = 0;
321 for (auto I = MachineBasicBlock::instr_iterator(LastDelayAlu),
322 E = MachineBasicBlock::instr_iterator(MI);
323 ++I != E;) {
324 if (!I->isBundle() && !I->isMetaInstruction())
325 ++Skip;
326 }
327 if (Skip < 6) {
328 MachineOperand &Op = LastDelayAlu->getOperand(i: 0);
329 unsigned LastImm = Op.getImm();
330 assert((LastImm & ~0xf) == 0 &&
331 "Remembered an s_delay_alu with no room for another delay!");
332 LastImm |= Imm << 7 | Skip << 4;
333 Op.setImm(LastImm);
334 return nullptr;
335 }
336 }
337
338 auto &MBB = *MI.getParent();
339 MachineInstr *DelayAlu =
340 BuildMI(BB&: MBB, I&: MI, MIMD: DebugLoc(), MCID: SII->get(Opcode: AMDGPU::S_DELAY_ALU)).addImm(Val: Imm);
341 // Remember the s_delay_alu for next time if there is still room in it to
342 // encode another delay.
343 return (Imm & 0x780) ? nullptr : DelayAlu;
344 }
345
346 bool runOnMachineBasicBlock(MachineBasicBlock &MBB, bool Emit) {
347 DelayState State;
348 for (auto *Pred : MBB.predecessors())
349 State.merge(RHS: BlockState[Pred]);
350
351 LLVM_DEBUG(dbgs() << " State at start of " << printMBBReference(MBB)
352 << "\n";
353 State.dump(TRI););
354
355 bool Changed = false;
356 MachineInstr *LastDelayAlu = nullptr;
357
358 MCRegUnit LastSGPRFromVALU = 0;
359 // Iterate over the contents of bundles, but don't emit any instructions
360 // inside a bundle.
361 for (auto &MI : MBB.instrs()) {
362 if (MI.isBundle() || MI.isMetaInstruction())
363 continue;
364
365 // Ignore some more instructions that do not generate any code.
366 switch (MI.getOpcode()) {
367 case AMDGPU::SI_RETURN_TO_EPILOG:
368 continue;
369 }
370
371 DelayType Type = getDelayType(TSFlags: MI.getDesc().TSFlags);
372
373 if (instructionWaitsForSGPRWrites(MI)) {
374 auto It = State.find(Val: LastSGPRFromVALU);
375 if (It != State.end()) {
376 DelayInfo Info = It->getSecond();
377 State.advanceByVALUNum(VALUNum: Info.VALUNum);
378 LastSGPRFromVALU = 0;
379 }
380 }
381
382 if (instructionWaitsForVALU(MI)) {
383 // Forget about all outstanding VALU delays.
384 // TODO: This is overkill since it also forgets about SALU delays.
385 State = DelayState();
386 } else if (Type != OTHER) {
387 DelayInfo Delay;
388 // TODO: Scan implicit uses too?
389 for (const auto &Op : MI.explicit_uses()) {
390 if (Op.isReg()) {
391 // One of the operands of the writelane is also the output operand.
392 // This creates the insertion of redundant delays. Hence, we have to
393 // ignore this operand.
394 if (MI.getOpcode() == AMDGPU::V_WRITELANE_B32 && Op.isTied())
395 continue;
396 for (MCRegUnit Unit : TRI->regunits(Reg: Op.getReg())) {
397 auto It = State.find(Val: Unit);
398 if (It != State.end()) {
399 Delay.merge(RHS: It->second);
400 State.erase(Val: Unit);
401 }
402 }
403 }
404 }
405
406 if (SII->isVALU(Opcode: MI.getOpcode())) {
407 for (const auto &Op : MI.defs()) {
408 Register Reg = Op.getReg();
409 if (AMDGPU::isSGPR(Reg, TRI)) {
410 LastSGPRFromVALU = *TRI->regunits(Reg).begin();
411 break;
412 }
413 }
414 }
415
416 if (Emit && !MI.isBundledWithPred()) {
417 // TODO: For VALU->SALU delays should we use s_delay_alu or s_nop or
418 // just ignore them?
419 LastDelayAlu = emitDelayAlu(MI, Delay, LastDelayAlu);
420 }
421 }
422
423 if (Type != OTHER) {
424 // TODO: Scan implicit defs too?
425 for (const auto &Op : MI.defs()) {
426 unsigned Latency = SchedModel->computeOperandLatency(
427 DefMI: &MI, DefOperIdx: Op.getOperandNo(), UseMI: nullptr, UseOperIdx: 0);
428 for (MCRegUnit Unit : TRI->regunits(Reg: Op.getReg()))
429 State[Unit] = DelayInfo(Type, Latency);
430 }
431 }
432
433 // Advance by the number of cycles it takes to issue this instruction.
434 // TODO: Use a more advanced model that accounts for instructions that
435 // take multiple cycles to issue on a particular pipeline.
436 unsigned Cycles = SIInstrInfo::getNumWaitStates(MI);
437 // TODO: In wave64 mode, double the number of cycles for VALU and VMEM
438 // instructions on the assumption that they will usually have to be issued
439 // twice?
440 State.advance(Type, Cycles);
441
442 LLVM_DEBUG(dbgs() << " State after " << MI; State.dump(TRI););
443 }
444
445 if (Emit) {
446 assert(State == BlockState[&MBB] &&
447 "Basic block state should not have changed on final pass!");
448 } else if (DelayState &BS = BlockState[&MBB]; State != BS) {
449 BS = std::move(State);
450 Changed = true;
451 }
452 return Changed;
453 }
454
455 bool run(MachineFunction &MF) {
456 LLVM_DEBUG(dbgs() << "AMDGPUInsertDelayAlu running on " << MF.getName()
457 << "\n");
458
459 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
460 if (!ST.hasDelayAlu())
461 return false;
462
463 SII = ST.getInstrInfo();
464 TRI = ST.getRegisterInfo();
465 SchedModel = &SII->getSchedModel();
466
467 // Calculate the delay state for each basic block, iterating until we reach
468 // a fixed point.
469 SetVector<MachineBasicBlock *> WorkList;
470 for (auto &MBB : reverse(C&: MF))
471 WorkList.insert(X: &MBB);
472 while (!WorkList.empty()) {
473 auto &MBB = *WorkList.pop_back_val();
474 bool Changed = runOnMachineBasicBlock(MBB, Emit: false);
475 if (Changed)
476 WorkList.insert_range(R: MBB.successors());
477 }
478
479 LLVM_DEBUG(dbgs() << "Final pass over all BBs\n");
480
481 // Make one last pass over all basic blocks to emit s_delay_alu
482 // instructions.
483 bool Changed = false;
484 for (auto &MBB : MF)
485 Changed |= runOnMachineBasicBlock(MBB, Emit: true);
486 return Changed;
487 }
488};
489
490class AMDGPUInsertDelayAluLegacy : public MachineFunctionPass {
491public:
492 static char ID;
493
494 AMDGPUInsertDelayAluLegacy() : MachineFunctionPass(ID) {}
495
496 void getAnalysisUsage(AnalysisUsage &AU) const override {
497 AU.setPreservesCFG();
498 MachineFunctionPass::getAnalysisUsage(AU);
499 }
500
501 bool runOnMachineFunction(MachineFunction &MF) override {
502 if (skipFunction(F: MF.getFunction()))
503 return false;
504 AMDGPUInsertDelayAlu Impl;
505 return Impl.run(MF);
506 }
507};
508} // namespace
509
510PreservedAnalyses
511AMDGPUInsertDelayAluPass::run(MachineFunction &MF,
512 MachineFunctionAnalysisManager &MFAM) {
513 if (!AMDGPUInsertDelayAlu().run(MF))
514 return PreservedAnalyses::all();
515 auto PA = getMachineFunctionPassPreservedAnalyses();
516 PA.preserveSet<CFGAnalyses>();
517 return PA;
518} // end namespace llvm
519
520char AMDGPUInsertDelayAluLegacy::ID = 0;
521
522char &llvm::AMDGPUInsertDelayAluID = AMDGPUInsertDelayAluLegacy::ID;
523
524INITIALIZE_PASS(AMDGPUInsertDelayAluLegacy, DEBUG_TYPE,
525 "AMDGPU Insert Delay ALU", false, false)
526