//===- AMDGPUWaitSGPRHazards.cpp - Insert waits for SGPR read hazards -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Insert s_wait_alu instructions to mitigate SGPR read hazards on GFX12.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUWaitSGPRHazards.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "llvm/ADT/SetVector.h"

using namespace llvm;

#define DEBUG_TYPE "amdgpu-wait-sgpr-hazards"

static cl::opt<bool> GlobalEnableSGPRHazardWaits(
    "amdgpu-sgpr-hazard-wait", cl::init(true), cl::Hidden,
    cl::desc("Enable required s_wait_alu on SGPR hazards"));

static cl::opt<bool> GlobalCullSGPRHazardsOnFunctionBoundary(
    "amdgpu-sgpr-hazard-boundary-cull", cl::init(false), cl::Hidden,
    cl::desc("Cull hazards on function boundaries"));

static cl::opt<bool>
    GlobalCullSGPRHazardsAtMemWait("amdgpu-sgpr-hazard-mem-wait-cull",
                                   cl::init(false), cl::Hidden,
                                   cl::desc("Cull hazards on memory waits"));

static cl::opt<unsigned> GlobalCullSGPRHazardsMemWaitThreshold(
    "amdgpu-sgpr-hazard-mem-wait-cull-threshold", cl::init(8), cl::Hidden,
    cl::desc("Number of tracked SGPRs before initiating hazard cull on memory "
             "wait"));

namespace {

class AMDGPUWaitSGPRHazards {
public:
  const SIInstrInfo *TII;
  const SIRegisterInfo *TRI;
  const MachineRegisterInfo *MRI;
  unsigned DsNopCount;

  bool EnableSGPRHazardWaits;
  bool CullSGPRHazardsOnFunctionBoundary;
  bool CullSGPRHazardsAtMemWait;
  unsigned CullSGPRHazardsMemWaitThreshold;

  AMDGPUWaitSGPRHazards() {}

  // Return the numeric ID 0-127 for a given SGPR.
  static std::optional<unsigned> sgprNumber(Register Reg,
                                            const SIRegisterInfo &TRI) {
    switch (Reg) {
    case AMDGPU::M0:
    case AMDGPU::EXEC:
    case AMDGPU::EXEC_LO:
    case AMDGPU::EXEC_HI:
    case AMDGPU::SGPR_NULL:
    case AMDGPU::SGPR_NULL64:
      return {};
    default:
      break;
    }
    unsigned RegN = TRI.getHWRegIndex(Reg);
    if (RegN > 127)
      return {};
    return RegN;
  }

  static inline bool isVCC(Register Reg) {
    return Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::VCC_HI;
  }

  // Adjust global offsets for instructions bundled with S_GETPC_B64 after
  // insertion of a new instruction.
  static void updateGetPCBundle(MachineInstr *NewMI) {
    if (!NewMI->isBundled())
      return;

    // Find start of bundle.
    auto I = NewMI->getIterator();
    while (I->isBundledWithPred())
      I--;
    if (I->isBundle())
      I++;

    // Bail if this is not an S_GETPC bundle.
    if (I->getOpcode() != AMDGPU::S_GETPC_B64)
      return;

    // Update offsets of any references in the bundle.
    const unsigned NewBytes = 4;
    assert(NewMI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
           "Unexpected instruction insertion in bundle");
    auto NextMI = std::next(NewMI->getIterator());
    auto End = NewMI->getParent()->end();
    while (NextMI != End && NextMI->isBundledWithPred()) {
      for (auto &Operand : NextMI->operands()) {
        if (Operand.isGlobal())
          Operand.setOffset(Operand.getOffset() + NewBytes);
      }
      NextMI++;
    }
  }

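  // Dataflow facts tracked at a program point. Merging (operator|=) only ever
  // grows the hazard sets, so the fixed-point iteration in run() is
  // guaranteed to terminate.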
  struct HazardState {
    static constexpr unsigned None = 0;
    static constexpr unsigned SALU = (1 << 0);
    static constexpr unsigned VALU = (1 << 1);

    std::bitset<64> Tracked;      // SGPR banks ever read by VALU
    std::bitset<128> SALUHazards; // SGPRs with uncommitted values from SALU
    std::bitset<128> VALUHazards; // SGPRs with uncommitted values from VALU
    unsigned VCCHazard = None;    // Source of current VCC writes
    bool ActiveFlat = false;      // Has unwaited flat instructions

    bool merge(const HazardState &RHS) {
      HazardState Orig(*this);
      *this |= RHS;
      return (*this != Orig);
    }

    bool operator==(const HazardState &RHS) const {
      return Tracked == RHS.Tracked && SALUHazards == RHS.SALUHazards &&
             VALUHazards == RHS.VALUHazards && VCCHazard == RHS.VCCHazard &&
             ActiveFlat == RHS.ActiveFlat;
    }

    bool operator!=(const HazardState &RHS) const { return !(*this == RHS); }

    void operator|=(const HazardState &RHS) {
      Tracked |= RHS.Tracked;
      SALUHazards |= RHS.SALUHazards;
      VALUHazards |= RHS.VALUHazards;
      VCCHazard |= RHS.VCCHazard;
      ActiveFlat |= RHS.ActiveFlat;
    }
  };

  struct BlockHazardState {
    HazardState In;
    HazardState Out;
  };

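  // Per-block In/Out hazard state computed by the analysis phase and reused,
  // unchanged, by the emit phase.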
  DenseMap<const MachineBasicBlock *, BlockHazardState> BlockState;

  static constexpr unsigned WAVE32_NOPS = 4;
  static constexpr unsigned WAVE64_NOPS = 8;

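  // A hazard "cull" is DsNopCount consecutive DS_NOPs; the block scan treats
  // such a run as clearing all tracked SGPR reads.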
  void insertHazardCull(MachineBasicBlock &MBB,
                        MachineBasicBlock::instr_iterator &MI) {
    assert(!MI->isBundled());
    unsigned Count = DsNopCount;
    while (Count--)
      BuildMI(MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::DS_NOP));
  }

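  // Combine two DEPCTR immediates field by field, keeping the smaller
  // (stricter) wait value of each counter.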
  unsigned mergeMasks(unsigned Mask1, unsigned Mask2) {
    unsigned Mask = 0xffff;
    Mask = AMDGPU::DepCtr::encodeFieldSaSdst(
        Mask, std::min(AMDGPU::DepCtr::decodeFieldSaSdst(Mask1),
                       AMDGPU::DepCtr::decodeFieldSaSdst(Mask2)));
    Mask = AMDGPU::DepCtr::encodeFieldVaVcc(
        Mask, std::min(AMDGPU::DepCtr::decodeFieldVaVcc(Mask1),
                       AMDGPU::DepCtr::decodeFieldVaVcc(Mask2)));
    Mask = AMDGPU::DepCtr::encodeFieldVmVsrc(
        Mask, std::min(AMDGPU::DepCtr::decodeFieldVmVsrc(Mask1),
                       AMDGPU::DepCtr::decodeFieldVmVsrc(Mask2)));
    Mask = AMDGPU::DepCtr::encodeFieldVaSdst(
        Mask, std::min(AMDGPU::DepCtr::decodeFieldVaSdst(Mask1),
                       AMDGPU::DepCtr::decodeFieldVaSdst(Mask2)));
    Mask = AMDGPU::DepCtr::encodeFieldVaVdst(
        Mask, std::min(AMDGPU::DepCtr::decodeFieldVaVdst(Mask1),
                       AMDGPU::DepCtr::decodeFieldVaVdst(Mask2)));
    Mask = AMDGPU::DepCtr::encodeFieldHoldCnt(
        Mask, std::min(AMDGPU::DepCtr::decodeFieldHoldCnt(Mask1),
                       AMDGPU::DepCtr::decodeFieldHoldCnt(Mask2)));
    Mask = AMDGPU::DepCtr::encodeFieldVaSsrc(
        Mask, std::min(AMDGPU::DepCtr::decodeFieldVaSsrc(Mask1),
                       AMDGPU::DepCtr::decodeFieldVaSsrc(Mask2)));
    return Mask;
  }

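  // If the instruction before MI (skipping debug instructions) is already an
  // S_WAITCNT_DEPCTR, fold Mask into its immediate rather than emitting a new
  // wait.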
  bool mergeConsecutiveWaitAlus(MachineBasicBlock::instr_iterator &MI,
                                unsigned Mask) {
    auto MBB = MI->getParent();
    if (MI == MBB->instr_begin())
      return false;

    auto It = prev_nodbg(MI, MBB->instr_begin());
    if (It->getOpcode() != AMDGPU::S_WAITCNT_DEPCTR)
      return false;

    It->getOperand(0).setImm(mergeMasks(Mask, It->getOperand(0).getImm()));
    return true;
  }

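  // Walk a single block. With Emit == false this only propagates hazard
  // state, returning true if the block's Out state changed; with Emit == true
  // it inserts the required s_wait_alu / DS_NOP instructions, returning true
  // if anything was emitted.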
  bool runOnMachineBasicBlock(MachineBasicBlock &MBB, bool Emit) {
    enum { WA_VALU = 0x1, WA_SALU = 0x2, WA_VCC = 0x4 };

    HazardState State = BlockState[&MBB].In;
    SmallSet<Register, 8> SeenRegs;
    bool Emitted = false;
    unsigned DsNops = 0;

    for (MachineBasicBlock::instr_iterator MI = MBB.instr_begin(),
                                           E = MBB.instr_end();
         MI != E; ++MI) {
      if (MI->isMetaInstruction())
        continue;

      // Clear tracked SGPRs if sufficient DS_NOPs occur
      if (MI->getOpcode() == AMDGPU::DS_NOP) {
        if (++DsNops >= DsNopCount)
          State.Tracked.reset();
        continue;
      }
      DsNops = 0;

      // Snoop FLAT instructions to avoid adding culls before scratch/lds loads.
      // Culls could be disproportionate in cost to load time.
      if (SIInstrInfo::isFLAT(*MI) && !SIInstrInfo::isFLATGlobal(*MI))
        State.ActiveFlat = true;

      // SMEM or VMEM clears hazards
      // FIXME: adapt to add FLAT without VALU (so !isLDSDMA())?
      if ((SIInstrInfo::isVMEM(*MI) && !SIInstrInfo::isFLAT(*MI)) ||
          SIInstrInfo::isSMRD(*MI)) {
        State.VCCHazard = HazardState::None;
        State.SALUHazards.reset();
        State.VALUHazards.reset();
        continue;
      }

      // Existing S_WAITALU can clear hazards
      if (MI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR) {
        unsigned int Mask = MI->getOperand(0).getImm();
        if (AMDGPU::DepCtr::decodeFieldVaVcc(Mask) == 0)
          State.VCCHazard &= ~HazardState::VALU;
        if (AMDGPU::DepCtr::decodeFieldSaSdst(Mask) == 0) {
          State.SALUHazards.reset();
          State.VCCHazard &= ~HazardState::SALU;
        }
        if (AMDGPU::DepCtr::decodeFieldVaSdst(Mask) == 0)
          State.VALUHazards.reset();
        continue;
      }

      // Snoop counter waits to insert culls
      if (CullSGPRHazardsAtMemWait &&
          (MI->getOpcode() == AMDGPU::S_WAIT_LOADCNT ||
           MI->getOpcode() == AMDGPU::S_WAIT_SAMPLECNT ||
           MI->getOpcode() == AMDGPU::S_WAIT_BVHCNT) &&
          (MI->getOperand(0).isImm() && MI->getOperand(0).getImm() == 0) &&
          (State.Tracked.count() >= CullSGPRHazardsMemWaitThreshold)) {
        if (MI->getOpcode() == AMDGPU::S_WAIT_LOADCNT && State.ActiveFlat) {
          State.ActiveFlat = false;
        } else {
          State.Tracked.reset();
          if (Emit)
            insertHazardCull(MBB, MI);
          continue;
        }
      }

      // Process only VALUs and SALUs
      bool IsVALU = SIInstrInfo::isVALU(*MI);
      bool IsSALU = SIInstrInfo::isSALU(*MI);
      if (!IsVALU && !IsSALU)
        continue;

      unsigned Wait = 0;

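      // Examine one SGPR operand: record new VALU reads in Tracked,
      // accumulate the waits required before uses of hazardous SGPRs, and
      // mark defs as new hazards.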
      auto processOperand = [&](const MachineOperand &Op, bool IsUse) {
        if (!Op.isReg())
          return;
        Register Reg = Op.getReg();
        assert(!Op.getSubReg());
        if (!TRI->isSGPRReg(*MRI, Reg))
          return;

        // Only visit each register once
        if (!SeenRegs.insert(Reg).second)
          return;

        auto RegNumber = sgprNumber(Reg, *TRI);
        if (!RegNumber)
          return;

        // Track SGPRs by pair -- numeric ID of a 64b SGPR pair,
        // i.e. SGPR0 = SGPR0_SGPR1 = 0, SGPR3 = SGPR2_SGPR3 = 1, etc.
        unsigned RegN = *RegNumber;
        unsigned PairN = (RegN >> 1) & 0x3f;

        // Read/write of an untracked register is safe, but any new reads
        // must be recorded.
        if (!State.Tracked[PairN]) {
          if (IsVALU && IsUse)
            State.Tracked.set(PairN);
          return;
        }

        uint8_t SGPRCount =
            AMDGPU::getRegBitWidth(*TRI->getRegClassForReg(*MRI, Reg)) / 32;

        if (IsUse) {
          // SALU reading SGPR clears VALU hazards
          if (IsSALU) {
            if (isVCC(Reg)) {
              if (State.VCCHazard & HazardState::VALU)
                State.VCCHazard = HazardState::None;
            } else {
              State.VALUHazards.reset();
            }
          }
          // Compute required waits
          for (uint8_t RegIdx = 0; RegIdx < SGPRCount; ++RegIdx) {
            Wait |= State.SALUHazards[RegN + RegIdx] ? WA_SALU : 0;
            Wait |= IsVALU && State.VALUHazards[RegN + RegIdx] ? WA_VALU : 0;
          }
          if (isVCC(Reg) && State.VCCHazard) {
            // Note: it's possible for both SALU and VALU to exist if VCC
            // was updated differently by merged predecessors.
            if (State.VCCHazard & HazardState::SALU)
              Wait |= WA_SALU;
            if (State.VCCHazard & HazardState::VALU)
              Wait |= WA_VCC;
          }
        } else {
          // Update hazards
          if (isVCC(Reg)) {
            State.VCCHazard = IsSALU ? HazardState::SALU : HazardState::VALU;
          } else {
            for (uint8_t RegIdx = 0; RegIdx < SGPRCount; ++RegIdx) {
              if (IsSALU)
                State.SALUHazards.set(RegN + RegIdx);
              else
                State.VALUHazards.set(RegN + RegIdx);
            }
          }
        }
      };

352 | |
353 | const bool IsSetPC = |
354 | (MI->isCall() || MI->isReturn() || MI->isIndirectBranch()) && |
355 | MI->getOpcode() != AMDGPU::S_ENDPGM && |
356 | MI->getOpcode() != AMDGPU::S_ENDPGM_SAVED; |
357 | |
358 | // Only consider implicit VCC specified by instruction descriptor. |
359 | const bool HasImplicitVCC = |
360 | llvm::any_of(Range: MI->getDesc().implicit_uses(), P: isVCC) || |
361 | llvm::any_of(Range: MI->getDesc().implicit_defs(), P: isVCC); |
362 | |
363 | if (IsSetPC) { |
364 | // All SGPR writes before a call/return must be flushed as the |
365 | // callee/caller will not will not see the hazard chain. |
366 | if (State.VCCHazard & HazardState::VALU) |
367 | Wait |= WA_VCC; |
368 | if (State.SALUHazards.any() || (State.VCCHazard & HazardState::SALU)) |
369 | Wait |= WA_SALU; |
370 | if (State.VALUHazards.any()) |
371 | Wait |= WA_VALU; |
372 | if (CullSGPRHazardsOnFunctionBoundary && State.Tracked.any()) { |
373 | State.Tracked.reset(); |
374 | if (Emit) |
375 | insertHazardCull(MBB, MI); |
376 | } |
377 | } else { |
378 | // Process uses to determine required wait. |
379 | SeenRegs.clear(); |
380 | for (const MachineOperand &Op : MI->all_uses()) { |
381 | if (Op.isImplicit() && |
382 | (!HasImplicitVCC || !Op.isReg() || !isVCC(Reg: Op.getReg()))) |
383 | continue; |
384 | processOperand(Op, true); |
385 | } |
386 | } |
387 | |
388 | // Apply wait |
389 | if (Wait) { |
390 | unsigned Mask = 0xffff; |
391 | if (Wait & WA_VCC) { |
392 | State.VCCHazard &= ~HazardState::VALU; |
393 | Mask = AMDGPU::DepCtr::encodeFieldVaVcc(Encoded: Mask, VaVcc: 0); |
394 | } |
395 | if (Wait & WA_SALU) { |
396 | State.SALUHazards.reset(); |
397 | State.VCCHazard &= ~HazardState::SALU; |
398 | Mask = AMDGPU::DepCtr::encodeFieldSaSdst(Encoded: Mask, SaSdst: 0); |
399 | } |
400 | if (Wait & WA_VALU) { |
401 | State.VALUHazards.reset(); |
402 | Mask = AMDGPU::DepCtr::encodeFieldVaSdst(Encoded: Mask, VaSdst: 0); |
403 | } |
404 | if (Emit) { |
405 | if (!mergeConsecutiveWaitAlus(MI, Mask)) { |
406 | auto NewMI = BuildMI(BB&: MBB, I: MI, MIMD: MI->getDebugLoc(), |
407 | MCID: TII->get(Opcode: AMDGPU::S_WAITCNT_DEPCTR)) |
408 | .addImm(Val: Mask); |
409 | updateGetPCBundle(NewMI); |
410 | } |
411 | Emitted = true; |
412 | } |
413 | } |
414 | |
415 | // On return from a call SGPR state is unknown, so all potential hazards. |
416 | if (MI->isCall() && !CullSGPRHazardsOnFunctionBoundary) |
417 | State.Tracked.set(); |
418 | |
419 | // Update hazards based on defs. |
420 | SeenRegs.clear(); |
421 | for (const MachineOperand &Op : MI->all_defs()) { |
422 | if (Op.isImplicit() && |
423 | (!HasImplicitVCC || !Op.isReg() || !isVCC(Reg: Op.getReg()))) |
424 | continue; |
425 | processOperand(Op, false); |
426 | } |
427 | } |
428 | |
429 | BlockHazardState &BS = BlockState[&MBB]; |
430 | bool Changed = State != BS.Out; |
431 | if (Emit) { |
432 | assert(!Changed && "Hazard state should not change on emit pass" ); |
433 | return Emitted; |
434 | } |
435 | if (Changed) |
436 | BS.Out = State; |
437 | return Changed; |
438 | } |
439 | |
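  // Driver: iterate the per-block hazard analysis to a fixed point, then make
  // a final pass over all blocks to emit the required waits.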
  bool run(MachineFunction &MF) {
    const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
    if (!ST.hasVALUReadSGPRHazard())
      return false;

    // Parse settings
    EnableSGPRHazardWaits = GlobalEnableSGPRHazardWaits;
    CullSGPRHazardsOnFunctionBoundary = GlobalCullSGPRHazardsOnFunctionBoundary;
    CullSGPRHazardsAtMemWait = GlobalCullSGPRHazardsAtMemWait;
    CullSGPRHazardsMemWaitThreshold = GlobalCullSGPRHazardsMemWaitThreshold;

    if (!GlobalEnableSGPRHazardWaits.getNumOccurrences())
      EnableSGPRHazardWaits = MF.getFunction().getFnAttributeAsParsedInteger(
          "amdgpu-sgpr-hazard-wait", EnableSGPRHazardWaits);
    if (!GlobalCullSGPRHazardsOnFunctionBoundary.getNumOccurrences())
      CullSGPRHazardsOnFunctionBoundary =
          MF.getFunction().hasFnAttribute("amdgpu-sgpr-hazard-boundary-cull");
    if (!GlobalCullSGPRHazardsAtMemWait.getNumOccurrences())
      CullSGPRHazardsAtMemWait =
          MF.getFunction().hasFnAttribute("amdgpu-sgpr-hazard-mem-wait-cull");
    if (!GlobalCullSGPRHazardsMemWaitThreshold.getNumOccurrences())
      CullSGPRHazardsMemWaitThreshold =
          MF.getFunction().getFnAttributeAsParsedInteger(
              "amdgpu-sgpr-hazard-mem-wait-cull-threshold",
              CullSGPRHazardsMemWaitThreshold);

    // Bail if disabled
    if (!EnableSGPRHazardWaits)
      return false;

    TII = ST.getInstrInfo();
    TRI = ST.getRegisterInfo();
    MRI = &MF.getRegInfo();
    DsNopCount = ST.isWave64() ? WAVE64_NOPS : WAVE32_NOPS;

    auto CallingConv = MF.getFunction().getCallingConv();
    if (!AMDGPU::isEntryFunctionCC(CallingConv) &&
        !CullSGPRHazardsOnFunctionBoundary) {
      // Callee must consider all SGPRs as tracked.
      LLVM_DEBUG(dbgs() << "Is called function, track all SGPRs.\n");
      MachineBasicBlock &EntryBlock = MF.front();
      BlockState[&EntryBlock].In.Tracked.set();
    }

    // Calculate the hazard state for each basic block.
    // Iterate until a fixed point is reached.
    // The fixed point is guaranteed because the merge function only ever
    // increases the hazard set, and all backedges will cause a merge.
    //
    // Note: we have to take care of the entry block as this technically
    // has an edge from outside the function. Failure to treat this as
    // a merge could prevent the fixed point being reached.
    SetVector<MachineBasicBlock *> Worklist;
    for (auto &MBB : reverse(MF))
      Worklist.insert(&MBB);
    while (!Worklist.empty()) {
      auto &MBB = *Worklist.pop_back_val();
      bool Changed = runOnMachineBasicBlock(MBB, /*Emit=*/false);
      if (Changed) {
        // Note: take a copy of the state here in case it is reallocated by
        // the map.
        HazardState NewState = BlockState[&MBB].Out;
        // Propagate to all successor blocks
        for (auto Succ : MBB.successors()) {
          // We only need to merge hazards at CFG merge points.
          auto &SuccState = BlockState[Succ];
          if (Succ->getSinglePredecessor() && !Succ->isEntryBlock()) {
            if (SuccState.In != NewState) {
              SuccState.In = NewState;
              Worklist.insert(Succ);
            }
          } else if (SuccState.In.merge(NewState)) {
            Worklist.insert(Succ);
          }
        }
      }
    }

    LLVM_DEBUG(dbgs() << "Emit s_wait_alu instructions\n");

    // Final pass to emit the wait instructions.
    bool Changed = false;
    for (auto &MBB : MF)
      Changed |= runOnMachineBasicBlock(MBB, /*Emit=*/true);

    BlockState.clear();
    return Changed;
  }
};

class AMDGPUWaitSGPRHazardsLegacy : public MachineFunctionPass {
public:
  static char ID;

  AMDGPUWaitSGPRHazardsLegacy() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &MF) override {
    return AMDGPUWaitSGPRHazards().run(MF);
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // namespace

char AMDGPUWaitSGPRHazardsLegacy::ID = 0;

char &llvm::AMDGPUWaitSGPRHazardsLegacyID = AMDGPUWaitSGPRHazardsLegacy::ID;

INITIALIZE_PASS(AMDGPUWaitSGPRHazardsLegacy, DEBUG_TYPE,
                "AMDGPU Insert waits for SGPR read hazards", false, false)

PreservedAnalyses
AMDGPUWaitSGPRHazardsPass::run(MachineFunction &MF,
                               MachineFunctionAnalysisManager &MFAM) {
  if (AMDGPUWaitSGPRHazards().run(MF))
    return PreservedAnalyses::none();
  return PreservedAnalyses::all();
}