1 | //===- AMDGPUInsertDelayAlu.cpp - Insert s_delay_alu instructions ---------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | /// \file |
10 | /// Insert s_delay_alu instructions to avoid stalls on GFX11+. |
11 | // |
12 | //===----------------------------------------------------------------------===// |
13 | |
14 | #include "AMDGPU.h" |
15 | #include "GCNSubtarget.h" |
16 | #include "MCTargetDesc/AMDGPUMCTargetDesc.h" |
17 | #include "SIInstrInfo.h" |
18 | #include "llvm/ADT/SetVector.h" |
19 | |
20 | using namespace llvm; |
21 | |
22 | #define DEBUG_TYPE "amdgpu-insert-delay-alu" |
23 | |
24 | namespace { |
25 | |
26 | class AMDGPUInsertDelayAlu : public MachineFunctionPass { |
27 | public: |
28 | static char ID; |
29 | |
30 | const SIInstrInfo *SII; |
31 | const TargetRegisterInfo *TRI; |
32 | |
33 | TargetSchedModel SchedModel; |
34 | |
35 | AMDGPUInsertDelayAlu() : MachineFunctionPass(ID) {} |
36 | |
37 | void getAnalysisUsage(AnalysisUsage &AU) const override { |
38 | AU.setPreservesCFG(); |
39 | MachineFunctionPass::getAnalysisUsage(AU); |
40 | } |
41 | |
42 | // Return true if MI waits for all outstanding VALU instructions to complete. |
43 | static bool instructionWaitsForVALU(const MachineInstr &MI) { |
44 | // These instruction types wait for VA_VDST==0 before issuing. |
45 | const uint64_t VA_VDST_0 = SIInstrFlags::DS | SIInstrFlags::EXP | |
46 | SIInstrFlags::FLAT | SIInstrFlags::MIMG | |
47 | SIInstrFlags::MTBUF | SIInstrFlags::MUBUF; |
48 | if (MI.getDesc().TSFlags & VA_VDST_0) |
49 | return true; |
50 | if (MI.getOpcode() == AMDGPU::S_SENDMSG_RTN_B32 || |
51 | MI.getOpcode() == AMDGPU::S_SENDMSG_RTN_B64) |
52 | return true; |
53 | if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR && |
54 | AMDGPU::DepCtr::decodeFieldVaVdst(Encoded: MI.getOperand(i: 0).getImm()) == 0) |
55 | return true; |
56 | return false; |
57 | } |
58 | |
59 | // Types of delay that can be encoded in an s_delay_alu instruction. |
60 | enum DelayType { VALU, TRANS, SALU, OTHER }; |
61 | |
62 | // Get the delay type for an instruction with the specified TSFlags. |
63 | static DelayType getDelayType(uint64_t TSFlags) { |
64 | if (TSFlags & SIInstrFlags::TRANS) |
65 | return TRANS; |
66 | if (TSFlags & SIInstrFlags::VALU) |
67 | return VALU; |
68 | if (TSFlags & SIInstrFlags::SALU) |
69 | return SALU; |
70 | return OTHER; |
71 | } |
72 | |
73 | // Information about the last instruction(s) that wrote to a particular |
74 | // regunit. In straight-line code there will only be one such instruction, but |
75 | // when control flow converges we merge the delay information from each path |
76 | // to represent the union of the worst-case delays of each type. |
77 | struct DelayInfo { |
78 | // One larger than the maximum number of (non-TRANS) VALU instructions we |
79 | // can encode in an s_delay_alu instruction. |
80 | static constexpr unsigned VALU_MAX = 5; |
81 | |
82 | // One larger than the maximum number of TRANS instructions we can encode in |
83 | // an s_delay_alu instruction. |
84 | static constexpr unsigned TRANS_MAX = 4; |
85 | |
86 | // One larger than the maximum number of SALU cycles we can encode in an |
87 | // s_delay_alu instruction. |
88 | static constexpr unsigned SALU_CYCLES_MAX = 4; |
89 | |
90 | // If it was written by a (non-TRANS) VALU, remember how many clock cycles |
91 | // are left until it completes, and how many other (non-TRANS) VALU we have |
92 | // seen since it was issued. |
93 | uint8_t VALUCycles = 0; |
94 | uint8_t VALUNum = VALU_MAX; |
95 | |
96 | // If it was written by a TRANS, remember how many clock cycles are left |
97 | // until it completes, and how many other TRANS we have seen since it was |
98 | // issued. |
99 | uint8_t TRANSCycles = 0; |
100 | uint8_t TRANSNum = TRANS_MAX; |
101 | // Also remember how many other (non-TRANS) VALU we have seen since it was |
102 | // issued. When an instruction depends on both a prior TRANS and a prior |
103 | // non-TRANS VALU, this is used to decide whether to encode a wait for just |
104 | // one or both of them. |
105 | uint8_t TRANSNumVALU = VALU_MAX; |
106 | |
107 | // If it was written by an SALU, remember how many clock cycles are left |
108 | // until it completes. |
109 | uint8_t SALUCycles = 0; |
110 | |
111 | DelayInfo() = default; |
112 | |
113 | DelayInfo(DelayType Type, unsigned Cycles) { |
114 | switch (Type) { |
115 | default: |
116 | llvm_unreachable("unexpected type" ); |
117 | case VALU: |
118 | VALUCycles = Cycles; |
119 | VALUNum = 0; |
120 | break; |
121 | case TRANS: |
122 | TRANSCycles = Cycles; |
123 | TRANSNum = 0; |
124 | TRANSNumVALU = 0; |
125 | break; |
126 | case SALU: |
127 | // Guard against pseudo-instructions like SI_CALL which are marked as |
128 | // SALU but with a very high latency. |
129 | SALUCycles = std::min(a: Cycles, b: SALU_CYCLES_MAX); |
130 | break; |
131 | } |
132 | } |
133 | |
134 | bool operator==(const DelayInfo &RHS) const { |
135 | return VALUCycles == RHS.VALUCycles && VALUNum == RHS.VALUNum && |
136 | TRANSCycles == RHS.TRANSCycles && TRANSNum == RHS.TRANSNum && |
137 | TRANSNumVALU == RHS.TRANSNumVALU && SALUCycles == RHS.SALUCycles; |
138 | } |
139 | |
140 | bool operator!=(const DelayInfo &RHS) const { return !(*this == RHS); } |
141 | |
142 | // Merge another DelayInfo into this one, to represent the union of the |
143 | // worst-case delays of each type. |
144 | void merge(const DelayInfo &RHS) { |
145 | VALUCycles = std::max(a: VALUCycles, b: RHS.VALUCycles); |
146 | VALUNum = std::min(a: VALUNum, b: RHS.VALUNum); |
147 | TRANSCycles = std::max(a: TRANSCycles, b: RHS.TRANSCycles); |
148 | TRANSNum = std::min(a: TRANSNum, b: RHS.TRANSNum); |
149 | TRANSNumVALU = std::min(a: TRANSNumVALU, b: RHS.TRANSNumVALU); |
150 | SALUCycles = std::max(a: SALUCycles, b: RHS.SALUCycles); |
151 | } |
152 | |
153 | // Update this DelayInfo after issuing an instruction. IsVALU should be 1 |
154 | // when issuing a (non-TRANS) VALU, else 0. IsTRANS should be 1 when issuing |
155 | // a TRANS, else 0. Cycles is the number of cycles it takes to issue the |
156 | // instruction. Return true if there is no longer any useful delay info. |
157 | bool advance(DelayType Type, unsigned Cycles) { |
158 | bool Erase = true; |
159 | |
160 | VALUNum += (Type == VALU); |
161 | if (VALUNum >= VALU_MAX || VALUCycles <= Cycles) { |
162 | // Forget about the VALU instruction. It was too far back or has |
163 | // definitely completed by now. |
164 | VALUNum = VALU_MAX; |
165 | VALUCycles = 0; |
166 | } else { |
167 | VALUCycles -= Cycles; |
168 | Erase = false; |
169 | } |
170 | |
171 | TRANSNum += (Type == TRANS); |
172 | TRANSNumVALU += (Type == VALU); |
173 | if (TRANSNum >= TRANS_MAX || TRANSCycles <= Cycles) { |
174 | // Forget about any TRANS instruction. It was too far back or has |
175 | // definitely completed by now. |
176 | TRANSNum = TRANS_MAX; |
177 | TRANSNumVALU = VALU_MAX; |
178 | TRANSCycles = 0; |
179 | } else { |
180 | TRANSCycles -= Cycles; |
181 | Erase = false; |
182 | } |
183 | |
184 | if (SALUCycles <= Cycles) { |
185 | // Forget about any SALU instruction. It has definitely completed by |
186 | // now. |
187 | SALUCycles = 0; |
188 | } else { |
189 | SALUCycles -= Cycles; |
190 | Erase = false; |
191 | } |
192 | |
193 | return Erase; |
194 | } |
195 | |
196 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
197 | void dump() const { |
198 | if (VALUCycles) |
199 | dbgs() << " VALUCycles=" << (int)VALUCycles; |
200 | if (VALUNum < VALU_MAX) |
201 | dbgs() << " VALUNum=" << (int)VALUNum; |
202 | if (TRANSCycles) |
203 | dbgs() << " TRANSCycles=" << (int)TRANSCycles; |
204 | if (TRANSNum < TRANS_MAX) |
205 | dbgs() << " TRANSNum=" << (int)TRANSNum; |
206 | if (TRANSNumVALU < VALU_MAX) |
207 | dbgs() << " TRANSNumVALU=" << (int)TRANSNumVALU; |
208 | if (SALUCycles) |
209 | dbgs() << " SALUCycles=" << (int)SALUCycles; |
210 | } |
211 | #endif |
212 | }; |
213 | |
214 | // A map from regunits to the delay info for that regunit. |
215 | struct DelayState : DenseMap<unsigned, DelayInfo> { |
216 | // Merge another DelayState into this one by merging the delay info for each |
217 | // regunit. |
218 | void merge(const DelayState &RHS) { |
219 | for (const auto &KV : RHS) { |
220 | iterator It; |
221 | bool Inserted; |
222 | std::tie(args&: It, args&: Inserted) = insert(KV); |
223 | if (!Inserted) |
224 | It->second.merge(RHS: KV.second); |
225 | } |
226 | } |
227 | |
228 | // Advance the delay info for each regunit, erasing any that are no longer |
229 | // useful. |
230 | void advance(DelayType Type, unsigned Cycles) { |
231 | iterator Next; |
232 | for (auto I = begin(), E = end(); I != E; I = Next) { |
233 | Next = std::next(x: I); |
234 | if (I->second.advance(Type, Cycles)) |
235 | erase(I); |
236 | } |
237 | } |
238 | |
239 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
240 | void dump(const TargetRegisterInfo *TRI) const { |
241 | if (empty()) { |
242 | dbgs() << " empty\n" ; |
243 | return; |
244 | } |
245 | |
246 | // Dump DelayInfo for each RegUnit in numerical order. |
247 | SmallVector<const_iterator, 8> Order; |
248 | Order.reserve(size()); |
249 | for (const_iterator I = begin(), E = end(); I != E; ++I) |
250 | Order.push_back(I); |
251 | llvm::sort(Order, [](const const_iterator &A, const const_iterator &B) { |
252 | return A->first < B->first; |
253 | }); |
254 | for (const_iterator I : Order) { |
255 | dbgs() << " " << printRegUnit(I->first, TRI); |
256 | I->second.dump(); |
257 | dbgs() << "\n" ; |
258 | } |
259 | } |
260 | #endif |
261 | }; |
262 | |
263 | // The saved delay state at the end of each basic block. |
264 | DenseMap<MachineBasicBlock *, DelayState> BlockState; |
265 | |
266 | // Emit an s_delay_alu instruction if necessary before MI. |
267 | MachineInstr *emitDelayAlu(MachineInstr &MI, DelayInfo Delay, |
268 | MachineInstr *LastDelayAlu) { |
269 | unsigned Imm = 0; |
270 | |
271 | // Wait for a TRANS instruction. |
272 | if (Delay.TRANSNum < DelayInfo::TRANS_MAX) |
273 | Imm |= 4 + Delay.TRANSNum; |
274 | |
275 | // Wait for a VALU instruction (if it's more recent than any TRANS |
276 | // instruction that we're also waiting for). |
277 | if (Delay.VALUNum < DelayInfo::VALU_MAX && |
278 | Delay.VALUNum <= Delay.TRANSNumVALU) { |
279 | if (Imm & 0xf) |
280 | Imm |= Delay.VALUNum << 7; |
281 | else |
282 | Imm |= Delay.VALUNum; |
283 | } |
284 | |
285 | // Wait for an SALU instruction. |
286 | if (Delay.SALUCycles) { |
287 | assert(Delay.SALUCycles < DelayInfo::SALU_CYCLES_MAX); |
288 | if (Imm & 0x780) { |
289 | // We have already encoded a VALU and a TRANS delay. There's no room in |
290 | // the encoding for an SALU delay as well, so just drop it. |
291 | } else if (Imm & 0xf) { |
292 | Imm |= (Delay.SALUCycles + 8) << 7; |
293 | } else { |
294 | Imm |= Delay.SALUCycles + 8; |
295 | } |
296 | } |
297 | |
298 | // Don't emit the s_delay_alu instruction if there's nothing to wait for. |
299 | if (!Imm) |
300 | return LastDelayAlu; |
301 | |
302 | // If we only need to wait for one instruction, try encoding it in the last |
303 | // s_delay_alu that we emitted. |
304 | if (!(Imm & 0x780) && LastDelayAlu) { |
305 | unsigned Skip = 0; |
306 | for (auto I = MachineBasicBlock::instr_iterator(LastDelayAlu), |
307 | E = MachineBasicBlock::instr_iterator(MI); |
308 | ++I != E;) { |
309 | if (!I->isBundle() && !I->isMetaInstruction()) |
310 | ++Skip; |
311 | } |
312 | if (Skip < 6) { |
313 | MachineOperand &Op = LastDelayAlu->getOperand(i: 0); |
314 | unsigned LastImm = Op.getImm(); |
315 | assert((LastImm & ~0xf) == 0 && |
316 | "Remembered an s_delay_alu with no room for another delay!" ); |
317 | LastImm |= Imm << 7 | Skip << 4; |
318 | Op.setImm(LastImm); |
319 | return nullptr; |
320 | } |
321 | } |
322 | |
323 | auto &MBB = *MI.getParent(); |
324 | MachineInstr *DelayAlu = |
325 | BuildMI(BB&: MBB, I&: MI, MIMD: DebugLoc(), MCID: SII->get(Opcode: AMDGPU::S_DELAY_ALU)).addImm(Val: Imm); |
326 | // Remember the s_delay_alu for next time if there is still room in it to |
327 | // encode another delay. |
328 | return (Imm & 0x780) ? nullptr : DelayAlu; |
329 | } |
330 | |
331 | bool runOnMachineBasicBlock(MachineBasicBlock &MBB, bool Emit) { |
332 | DelayState State; |
333 | for (auto *Pred : MBB.predecessors()) |
334 | State.merge(RHS: BlockState[Pred]); |
335 | |
336 | LLVM_DEBUG(dbgs() << " State at start of " << printMBBReference(MBB) |
337 | << "\n" ; |
338 | State.dump(TRI);); |
339 | |
340 | bool Changed = false; |
341 | MachineInstr *LastDelayAlu = nullptr; |
342 | |
343 | // Iterate over the contents of bundles, but don't emit any instructions |
344 | // inside a bundle. |
345 | for (auto &MI : MBB.instrs()) { |
346 | if (MI.isBundle() || MI.isMetaInstruction()) |
347 | continue; |
348 | |
349 | // Ignore some more instructions that do not generate any code. |
350 | switch (MI.getOpcode()) { |
351 | case AMDGPU::SI_RETURN_TO_EPILOG: |
352 | continue; |
353 | } |
354 | |
355 | DelayType Type = getDelayType(TSFlags: MI.getDesc().TSFlags); |
356 | |
357 | if (instructionWaitsForVALU(MI)) { |
358 | // Forget about all outstanding VALU delays. |
359 | // TODO: This is overkill since it also forgets about SALU delays. |
360 | State = DelayState(); |
361 | } else if (Type != OTHER) { |
362 | DelayInfo Delay; |
363 | // TODO: Scan implicit uses too? |
364 | for (const auto &Op : MI.explicit_uses()) { |
365 | if (Op.isReg()) { |
366 | // One of the operands of the writelane is also the output operand. |
367 | // This creates the insertion of redundant delays. Hence, we have to |
368 | // ignore this operand. |
369 | if (MI.getOpcode() == AMDGPU::V_WRITELANE_B32 && Op.isTied()) |
370 | continue; |
371 | for (MCRegUnit Unit : TRI->regunits(Reg: Op.getReg())) { |
372 | auto It = State.find(Val: Unit); |
373 | if (It != State.end()) { |
374 | Delay.merge(RHS: It->second); |
375 | State.erase(Val: Unit); |
376 | } |
377 | } |
378 | } |
379 | } |
380 | if (Emit && !MI.isBundledWithPred()) { |
381 | // TODO: For VALU->SALU delays should we use s_delay_alu or s_nop or |
382 | // just ignore them? |
383 | LastDelayAlu = emitDelayAlu(MI, Delay, LastDelayAlu); |
384 | } |
385 | } |
386 | |
387 | if (Type != OTHER) { |
388 | // TODO: Scan implicit defs too? |
389 | for (const auto &Op : MI.defs()) { |
390 | unsigned Latency = SchedModel.computeOperandLatency( |
391 | DefMI: &MI, DefOperIdx: Op.getOperandNo(), UseMI: nullptr, UseOperIdx: 0); |
392 | for (MCRegUnit Unit : TRI->regunits(Reg: Op.getReg())) |
393 | State[Unit] = DelayInfo(Type, Latency); |
394 | } |
395 | } |
396 | |
397 | // Advance by the number of cycles it takes to issue this instruction. |
398 | // TODO: Use a more advanced model that accounts for instructions that |
399 | // take multiple cycles to issue on a particular pipeline. |
400 | unsigned Cycles = SIInstrInfo::getNumWaitStates(MI); |
401 | // TODO: In wave64 mode, double the number of cycles for VALU and VMEM |
402 | // instructions on the assumption that they will usually have to be issued |
403 | // twice? |
404 | State.advance(Type, Cycles); |
405 | |
406 | LLVM_DEBUG(dbgs() << " State after " << MI; State.dump(TRI);); |
407 | } |
408 | |
409 | if (Emit) { |
410 | assert(State == BlockState[&MBB] && |
411 | "Basic block state should not have changed on final pass!" ); |
412 | } else if (State != BlockState[&MBB]) { |
413 | BlockState[&MBB] = std::move(State); |
414 | Changed = true; |
415 | } |
416 | return Changed; |
417 | } |
418 | |
419 | bool runOnMachineFunction(MachineFunction &MF) override { |
420 | if (skipFunction(F: MF.getFunction())) |
421 | return false; |
422 | |
423 | LLVM_DEBUG(dbgs() << "AMDGPUInsertDelayAlu running on " << MF.getName() |
424 | << "\n" ); |
425 | |
426 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); |
427 | if (!ST.hasDelayAlu()) |
428 | return false; |
429 | |
430 | SII = ST.getInstrInfo(); |
431 | TRI = ST.getRegisterInfo(); |
432 | |
433 | SchedModel.init(TSInfo: &ST); |
434 | |
435 | // Calculate the delay state for each basic block, iterating until we reach |
436 | // a fixed point. |
437 | SetVector<MachineBasicBlock *> WorkList; |
438 | for (auto &MBB : reverse(C&: MF)) |
439 | WorkList.insert(X: &MBB); |
440 | while (!WorkList.empty()) { |
441 | auto &MBB = *WorkList.pop_back_val(); |
442 | bool Changed = runOnMachineBasicBlock(MBB, Emit: false); |
443 | if (Changed) |
444 | WorkList.insert(Start: MBB.succ_begin(), End: MBB.succ_end()); |
445 | } |
446 | |
447 | LLVM_DEBUG(dbgs() << "Final pass over all BBs\n" ); |
448 | |
449 | // Make one last pass over all basic blocks to emit s_delay_alu |
450 | // instructions. |
451 | bool Changed = false; |
452 | for (auto &MBB : MF) |
453 | Changed |= runOnMachineBasicBlock(MBB, Emit: true); |
454 | return Changed; |
455 | } |
456 | }; |
457 | |
458 | } // namespace |
459 | |
460 | char AMDGPUInsertDelayAlu::ID = 0; |
461 | |
462 | char &llvm::AMDGPUInsertDelayAluID = AMDGPUInsertDelayAlu::ID; |
463 | |
464 | INITIALIZE_PASS(AMDGPUInsertDelayAlu, DEBUG_TYPE, "AMDGPU Insert Delay ALU" , |
465 | false, false) |
466 | |