1 | //===- AMDGPUInsertDelayAlu.cpp - Insert s_delay_alu instructions ---------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | /// \file |
10 | /// Insert s_delay_alu instructions to avoid stalls on GFX11+. |
11 | // |
12 | //===----------------------------------------------------------------------===// |
13 | |
14 | #include "AMDGPU.h" |
15 | #include "GCNSubtarget.h" |
16 | #include "MCTargetDesc/AMDGPUMCTargetDesc.h" |
17 | #include "SIInstrInfo.h" |
18 | #include "llvm/ADT/SetVector.h" |
19 | |
20 | using namespace llvm; |
21 | |
22 | #define DEBUG_TYPE "amdgpu-insert-delay-alu" |
23 | |
24 | namespace { |
25 | |
class AMDGPUInsertDelayAlu {
public:
  // Target hooks, initialized in run().
  const SIInstrInfo *SII;
  const TargetRegisterInfo *TRI;

  // Scheduling model used to compute operand latencies.
  const TargetSchedModel *SchedModel;

  // Return true if MI waits for all outstanding VALU instructions to complete.
  static bool instructionWaitsForVALU(const MachineInstr &MI) {
    // These instruction types wait for VA_VDST==0 before issuing.
    const uint64_t VA_VDST_0 = SIInstrFlags::DS | SIInstrFlags::EXP |
                               SIInstrFlags::FLAT | SIInstrFlags::MIMG |
                               SIInstrFlags::MTBUF | SIInstrFlags::MUBUF;
    if (MI.getDesc().TSFlags & VA_VDST_0)
      return true;
    // S_SENDMSG_RTN_* also waits for outstanding VALU instructions.
    if (MI.getOpcode() == AMDGPU::S_SENDMSG_RTN_B32 ||
        MI.getOpcode() == AMDGPU::S_SENDMSG_RTN_B64)
      return true;
    // An explicit wait on VA_VDST==0 via s_waitcnt_depctr.
    if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
        AMDGPU::DepCtr::decodeFieldVaVdst(MI.getOperand(0).getImm()) == 0)
      return true;
    return false;
  }
49 | |
50 | static bool instructionWaitsForSGPRWrites(const MachineInstr &MI) { |
51 | // These instruction types wait for VA_SDST==0 before issuing. |
52 | uint64_t MIFlags = MI.getDesc().TSFlags; |
53 | if (MIFlags & SIInstrFlags::SMRD) |
54 | return true; |
55 | |
56 | if (MIFlags & SIInstrFlags::SALU) { |
57 | for (auto &Op : MI.operands()) { |
58 | if (Op.isReg()) |
59 | return true; |
60 | } |
61 | } |
62 | return false; |
63 | } |
64 | |
65 | // Types of delay that can be encoded in an s_delay_alu instruction. |
66 | enum DelayType { VALU, TRANS, SALU, OTHER }; |
67 | |
68 | // Get the delay type for an instruction with the specified TSFlags. |
69 | static DelayType getDelayType(uint64_t TSFlags) { |
70 | if (TSFlags & SIInstrFlags::TRANS) |
71 | return TRANS; |
72 | if (TSFlags & SIInstrFlags::VALU) |
73 | return VALU; |
74 | if (TSFlags & SIInstrFlags::SALU) |
75 | return SALU; |
76 | return OTHER; |
77 | } |
78 | |
  // Information about the last instruction(s) that wrote to a particular
  // regunit. In straight-line code there will only be one such instruction, but
  // when control flow converges we merge the delay information from each path
  // to represent the union of the worst-case delays of each type.
  struct DelayInfo {
    // One larger than the maximum number of (non-TRANS) VALU instructions we
    // can encode in an s_delay_alu instruction.
    static constexpr unsigned VALU_MAX = 5;

    // One larger than the maximum number of TRANS instructions we can encode in
    // an s_delay_alu instruction.
    static constexpr unsigned TRANS_MAX = 4;

    // One larger than the maximum number of SALU cycles we can encode in an
    // s_delay_alu instruction.
    static constexpr unsigned SALU_CYCLES_MAX = 4;

    // If it was written by a (non-TRANS) VALU, remember how many clock cycles
    // are left until it completes, and how many other (non-TRANS) VALU we have
    // seen since it was issued.
    uint8_t VALUCycles = 0;
    uint8_t VALUNum = VALU_MAX; // VALU_MAX means "no outstanding VALU write".

    // If it was written by a TRANS, remember how many clock cycles are left
    // until it completes, and how many other TRANS we have seen since it was
    // issued.
    uint8_t TRANSCycles = 0;
    uint8_t TRANSNum = TRANS_MAX; // TRANS_MAX means "no outstanding TRANS".
    // Also remember how many other (non-TRANS) VALU we have seen since it was
    // issued. When an instruction depends on both a prior TRANS and a prior
    // non-TRANS VALU, this is used to decide whether to encode a wait for just
    // one or both of them.
    uint8_t TRANSNumVALU = VALU_MAX;

    // If it was written by an SALU, remember how many clock cycles are left
    // until it completes.
    uint8_t SALUCycles = 0;

    DelayInfo() = default;

    // Construct the delay info for a single write by an instruction of the
    // given delay type with the given latency in cycles.
    DelayInfo(DelayType Type, unsigned Cycles) {
      switch (Type) {
      default:
        llvm_unreachable("unexpected type");
      case VALU:
        VALUCycles = Cycles;
        VALUNum = 0;
        break;
      case TRANS:
        TRANSCycles = Cycles;
        TRANSNum = 0;
        TRANSNumVALU = 0;
        break;
      case SALU:
        // Guard against pseudo-instructions like SI_CALL which are marked as
        // SALU but with a very high latency.
        SALUCycles = std::min(Cycles, SALU_CYCLES_MAX);
        break;
      }
    }

    bool operator==(const DelayInfo &RHS) const {
      return VALUCycles == RHS.VALUCycles && VALUNum == RHS.VALUNum &&
             TRANSCycles == RHS.TRANSCycles && TRANSNum == RHS.TRANSNum &&
             TRANSNumVALU == RHS.TRANSNumVALU && SALUCycles == RHS.SALUCycles;
    }

    bool operator!=(const DelayInfo &RHS) const { return !(*this == RHS); }

    // Merge another DelayInfo into this one, to represent the union of the
    // worst-case delays of each type. Worst case means the most cycles still
    // outstanding (max) and the fewest instructions seen since issue (min).
    void merge(const DelayInfo &RHS) {
      VALUCycles = std::max(VALUCycles, RHS.VALUCycles);
      VALUNum = std::min(VALUNum, RHS.VALUNum);
      TRANSCycles = std::max(TRANSCycles, RHS.TRANSCycles);
      TRANSNum = std::min(TRANSNum, RHS.TRANSNum);
      TRANSNumVALU = std::min(TRANSNumVALU, RHS.TRANSNumVALU);
      SALUCycles = std::max(SALUCycles, RHS.SALUCycles);
    }

    // Update this DelayInfo after issuing an instruction of the specified type.
    // Cycles is the number of cycles it takes to issue the instruction. Return
    // true if there is no longer any useful delay info.
    bool advance(DelayType Type, unsigned Cycles) {
      bool Erase = true;

      VALUNum += (Type == VALU);
      if (VALUNum >= VALU_MAX || VALUCycles <= Cycles) {
        // Forget about the VALU instruction. It was too far back or has
        // definitely completed by now.
        VALUNum = VALU_MAX;
        VALUCycles = 0;
      } else {
        VALUCycles -= Cycles;
        Erase = false;
      }

      TRANSNum += (Type == TRANS);
      TRANSNumVALU += (Type == VALU);
      if (TRANSNum >= TRANS_MAX || TRANSCycles <= Cycles) {
        // Forget about any TRANS instruction. It was too far back or has
        // definitely completed by now.
        TRANSNum = TRANS_MAX;
        TRANSNumVALU = VALU_MAX;
        TRANSCycles = 0;
      } else {
        TRANSCycles -= Cycles;
        Erase = false;
      }

      if (SALUCycles <= Cycles) {
        // Forget about any SALU instruction. It has definitely completed by
        // now.
        SALUCycles = 0;
      } else {
        SALUCycles -= Cycles;
        Erase = false;
      }

      return Erase;
    }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    // Print only the fields that still carry useful delay information.
    void dump() const {
      if (VALUCycles)
        dbgs() << " VALUCycles=" << (int)VALUCycles;
      if (VALUNum < VALU_MAX)
        dbgs() << " VALUNum=" << (int)VALUNum;
      if (TRANSCycles)
        dbgs() << " TRANSCycles=" << (int)TRANSCycles;
      if (TRANSNum < TRANS_MAX)
        dbgs() << " TRANSNum=" << (int)TRANSNum;
      if (TRANSNumVALU < VALU_MAX)
        dbgs() << " TRANSNumVALU=" << (int)TRANSNumVALU;
      if (SALUCycles)
        dbgs() << " SALUCycles=" << (int)SALUCycles;
    }
#endif
  };
218 | |
  // A map from regunits to the delay info for that regunit.
  struct DelayState : DenseMap<unsigned, DelayInfo> {
    // Merge another DelayState into this one by merging the delay info for each
    // regunit.
    void merge(const DelayState &RHS) {
      for (const auto &KV : RHS) {
        iterator It;
        bool Inserted;
        std::tie(It, Inserted) = insert(KV);
        // If the regunit was already tracked, combine worst-case delays.
        if (!Inserted)
          It->second.merge(KV.second);
      }
    }

    // Advance the delay info for each regunit, erasing any that are no longer
    // useful.
    void advance(DelayType Type, unsigned Cycles) {
      iterator Next;
      for (auto I = begin(), E = end(); I != E; I = Next) {
        // Grab the successor first: erase(I) invalidates I.
        Next = std::next(I);
        if (I->second.advance(Type, Cycles))
          erase(I);
      }
    }

    // Erase the delay info for every regunit whose VALU write is at least
    // VALUNum instructions old (i.e. at least as old as the given entry).
    void advanceByVALUNum(unsigned VALUNum) {
      iterator Next;
      for (auto I = begin(), E = end(); I != E; I = Next) {
        // Grab the successor first: erase(I) invalidates I.
        Next = std::next(I);
        if (I->second.VALUNum >= VALUNum && I->second.VALUCycles > 0) {
          erase(I);
        }
      }
    }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    void dump(const TargetRegisterInfo *TRI) const {
      if (empty()) {
        dbgs() << " empty\n";
        return;
      }

      // Dump DelayInfo for each RegUnit in numerical order.
      SmallVector<const_iterator, 8> Order;
      Order.reserve(size());
      for (const_iterator I = begin(), E = end(); I != E; ++I)
        Order.push_back(I);
      llvm::sort(Order, [](const const_iterator &A, const const_iterator &B) {
        return A->first < B->first;
      });
      for (const_iterator I : Order) {
        dbgs() << " " << printRegUnit(I->first, TRI);
        I->second.dump();
        dbgs() << "\n";
      }
    }
#endif
  };
277 | |
  // The saved delay state at the end of each basic block.
  DenseMap<MachineBasicBlock *, DelayState> BlockState;

  // Emit an s_delay_alu instruction if necessary before MI, encoding the
  // delays described by Delay. LastDelayAlu is the most recently emitted
  // s_delay_alu that still has a free second slot (or nullptr); returns the
  // value the caller should use as LastDelayAlu from now on.
  //
  // Immediate layout used below: bits [3:0] hold the first dependency,
  // bits [6:4] a "skip" instruction count, bits [10:7] a second dependency.
  MachineInstr *emitDelayAlu(MachineInstr &MI, DelayInfo Delay,
                             MachineInstr *LastDelayAlu) {
    unsigned Imm = 0;

    // Wait for a TRANS instruction. The TRANS dependency encodings start at 4.
    if (Delay.TRANSNum < DelayInfo::TRANS_MAX)
      Imm |= 4 + Delay.TRANSNum;

    // Wait for a VALU instruction (if it's more recent than any TRANS
    // instruction that we're also waiting for).
    if (Delay.VALUNum < DelayInfo::VALU_MAX &&
        Delay.VALUNum <= Delay.TRANSNumVALU) {
      if (Imm & 0xf)
        // First slot already holds the TRANS delay; use the second slot.
        Imm |= Delay.VALUNum << 7;
      else
        Imm |= Delay.VALUNum;
    }

    // Wait for an SALU instruction. The SALU cycle encodings start at 9.
    if (Delay.SALUCycles) {
      assert(Delay.SALUCycles < DelayInfo::SALU_CYCLES_MAX);
      if (Imm & 0x780) {
        // We have already encoded a VALU and a TRANS delay. There's no room in
        // the encoding for an SALU delay as well, so just drop it.
      } else if (Imm & 0xf) {
        Imm |= (Delay.SALUCycles + 8) << 7;
      } else {
        Imm |= Delay.SALUCycles + 8;
      }
    }

    // Don't emit the s_delay_alu instruction if there's nothing to wait for.
    if (!Imm)
      return LastDelayAlu;

    // If we only need to wait for one instruction, try encoding it in the last
    // s_delay_alu that we emitted.
    if (!(Imm & 0x780) && LastDelayAlu) {
      // Count the real (non-bundle, non-meta) instructions between the old
      // s_delay_alu and MI, to fill in its skip field.
      unsigned Skip = 0;
      for (auto I = MachineBasicBlock::instr_iterator(LastDelayAlu),
                E = MachineBasicBlock::instr_iterator(MI);
           ++I != E;) {
        if (!I->isBundle() && !I->isMetaInstruction())
          ++Skip;
      }
      // The skip field can only represent up to 5 intervening instructions.
      if (Skip < 6) {
        MachineOperand &Op = LastDelayAlu->getOperand(0);
        unsigned LastImm = Op.getImm();
        assert((LastImm & ~0xf) == 0 &&
               "Remembered an s_delay_alu with no room for another delay!");
        LastImm |= Imm << 7 | Skip << 4;
        Op.setImm(LastImm);
        return nullptr;
      }
    }

    auto &MBB = *MI.getParent();
    MachineInstr *DelayAlu =
        BuildMI(MBB, MI, DebugLoc(), SII->get(AMDGPU::S_DELAY_ALU)).addImm(Imm);
    // Remember the s_delay_alu for next time if there is still room in it to
    // encode another delay.
    return (Imm & 0x780) ? nullptr : DelayAlu;
  }
345 | |
  // Propagate the delay state through MBB, starting from the merge of all
  // predecessors' saved exit states. When Emit is false this only updates
  // BlockState and returns whether MBB's saved exit state changed (used to
  // drive the fixed-point iteration); when Emit is true it also inserts
  // s_delay_alu instructions and asserts the state is already stable.
  bool runOnMachineBasicBlock(MachineBasicBlock &MBB, bool Emit) {
    DelayState State;
    for (auto *Pred : MBB.predecessors())
      State.merge(BlockState[Pred]);

    LLVM_DEBUG(dbgs() << " State at start of " << printMBBReference(MBB)
                      << "\n";
               State.dump(TRI););

    bool Changed = false;
    // The last s_delay_alu we emitted that still has a free second slot.
    MachineInstr *LastDelayAlu = nullptr;

    // First regunit of the most recent SGPR written by a VALU, if any.
    MCRegUnit LastSGPRFromVALU = 0;
    // Iterate over the contents of bundles, but don't emit any instructions
    // inside a bundle.
    for (auto &MI : MBB.instrs()) {
      if (MI.isBundle() || MI.isMetaInstruction())
        continue;

      // Ignore some more instructions that do not generate any code.
      switch (MI.getOpcode()) {
      case AMDGPU::SI_RETURN_TO_EPILOG:
        continue;
      }

      DelayType Type = getDelayType(MI.getDesc().TSFlags);

      // If MI waits for outstanding SGPR writes from VALUs, then any VALU
      // write at least as old as the remembered one has landed — drop the
      // corresponding delay entries so we don't emit redundant waits.
      if (instructionWaitsForSGPRWrites(MI)) {
        auto It = State.find(LastSGPRFromVALU);
        if (It != State.end()) {
          DelayInfo Info = It->getSecond();
          State.advanceByVALUNum(Info.VALUNum);
          LastSGPRFromVALU = 0;
        }
      }

      if (instructionWaitsForVALU(MI)) {
        // Forget about all outstanding VALU delays.
        // TODO: This is overkill since it also forgets about SALU delays.
        State = DelayState();
      } else if (Type != OTHER) {
        // Accumulate the worst-case delay over all regunits MI reads.
        DelayInfo Delay;
        // TODO: Scan implicit uses too?
        for (const auto &Op : MI.explicit_uses()) {
          if (Op.isReg()) {
            // One of the operands of the writelane is also the output operand.
            // This creates the insertion of redundant delays. Hence, we have to
            // ignore this operand.
            if (MI.getOpcode() == AMDGPU::V_WRITELANE_B32 && Op.isTied())
              continue;
            for (MCRegUnit Unit : TRI->regunits(Op.getReg())) {
              auto It = State.find(Unit);
              if (It != State.end()) {
                Delay.merge(It->second);
                // The wait we emit satisfies this entry; drop it.
                State.erase(Unit);
              }
            }
          }
        }

        // Remember the first SGPR def of a VALU so that a later instruction
        // waiting for SGPR writes can be modelled above.
        if (SII->isVALU(MI.getOpcode())) {
          for (const auto &Op : MI.defs()) {
            Register Reg = Op.getReg();
            if (AMDGPU::isSGPR(Reg, TRI)) {
              LastSGPRFromVALU = *TRI->regunits(Reg).begin();
              break;
            }
          }
        }

        if (Emit && !MI.isBundledWithPred()) {
          // TODO: For VALU->SALU delays should we use s_delay_alu or s_nop or
          // just ignore them?
          LastDelayAlu = emitDelayAlu(MI, Delay, LastDelayAlu);
        }
      }

      if (Type != OTHER) {
        // Record a delay entry for every regunit MI defines.
        // TODO: Scan implicit defs too?
        for (const auto &Op : MI.defs()) {
          unsigned Latency = SchedModel->computeOperandLatency(
              &MI, Op.getOperandNo(), nullptr, 0);
          for (MCRegUnit Unit : TRI->regunits(Op.getReg()))
            State[Unit] = DelayInfo(Type, Latency);
        }
      }

      // Advance by the number of cycles it takes to issue this instruction.
      // TODO: Use a more advanced model that accounts for instructions that
      // take multiple cycles to issue on a particular pipeline.
      unsigned Cycles = SIInstrInfo::getNumWaitStates(MI);
      // TODO: In wave64 mode, double the number of cycles for VALU and VMEM
      // instructions on the assumption that they will usually have to be issued
      // twice?
      State.advance(Type, Cycles);

      LLVM_DEBUG(dbgs() << " State after " << MI; State.dump(TRI););
    }

    if (Emit) {
      assert(State == BlockState[&MBB] &&
             "Basic block state should not have changed on final pass!");
    } else if (DelayState &BS = BlockState[&MBB]; State != BS) {
      BS = std::move(State);
      Changed = true;
    }
    return Changed;
  }
454 | |
455 | bool run(MachineFunction &MF) { |
456 | LLVM_DEBUG(dbgs() << "AMDGPUInsertDelayAlu running on " << MF.getName() |
457 | << "\n" ); |
458 | |
459 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); |
460 | if (!ST.hasDelayAlu()) |
461 | return false; |
462 | |
463 | SII = ST.getInstrInfo(); |
464 | TRI = ST.getRegisterInfo(); |
465 | SchedModel = &SII->getSchedModel(); |
466 | |
467 | // Calculate the delay state for each basic block, iterating until we reach |
468 | // a fixed point. |
469 | SetVector<MachineBasicBlock *> WorkList; |
470 | for (auto &MBB : reverse(C&: MF)) |
471 | WorkList.insert(X: &MBB); |
472 | while (!WorkList.empty()) { |
473 | auto &MBB = *WorkList.pop_back_val(); |
474 | bool Changed = runOnMachineBasicBlock(MBB, Emit: false); |
475 | if (Changed) |
476 | WorkList.insert_range(R: MBB.successors()); |
477 | } |
478 | |
479 | LLVM_DEBUG(dbgs() << "Final pass over all BBs\n" ); |
480 | |
481 | // Make one last pass over all basic blocks to emit s_delay_alu |
482 | // instructions. |
483 | bool Changed = false; |
484 | for (auto &MBB : MF) |
485 | Changed |= runOnMachineBasicBlock(MBB, Emit: true); |
486 | return Changed; |
487 | } |
488 | }; |
489 | |
490 | class AMDGPUInsertDelayAluLegacy : public MachineFunctionPass { |
491 | public: |
492 | static char ID; |
493 | |
494 | AMDGPUInsertDelayAluLegacy() : MachineFunctionPass(ID) {} |
495 | |
496 | void getAnalysisUsage(AnalysisUsage &AU) const override { |
497 | AU.setPreservesCFG(); |
498 | MachineFunctionPass::getAnalysisUsage(AU); |
499 | } |
500 | |
501 | bool runOnMachineFunction(MachineFunction &MF) override { |
502 | if (skipFunction(F: MF.getFunction())) |
503 | return false; |
504 | AMDGPUInsertDelayAlu Impl; |
505 | return Impl.run(MF); |
506 | } |
507 | }; |
508 | } // namespace |
509 | |
// New pass-manager entry point: run the shared implementation and report
// which analyses survive.
PreservedAnalyses
AMDGPUInsertDelayAluPass::run(MachineFunction &MF,
                              MachineFunctionAnalysisManager &MFAM) {
  if (!AMDGPUInsertDelayAlu().run(MF))
    return PreservedAnalyses::all();
  // Instructions were inserted, but the CFG is unchanged.
  auto PA = getMachineFunctionPassPreservedAnalyses();
  PA.preserveSet<CFGAnalyses>();
  return PA;
}
519 | |
char AMDGPUInsertDelayAluLegacy::ID = 0;

// Externally visible handle used to refer to this pass by ID.
char &llvm::AMDGPUInsertDelayAluID = AMDGPUInsertDelayAluLegacy::ID;

INITIALIZE_PASS(AMDGPUInsertDelayAluLegacy, DEBUG_TYPE,
                "AMDGPU Insert Delay ALU", false, false)
526 | |