//===------------------ AMDGPUCustomBehaviour.cpp ---------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
///
/// This file implements methods from the AMDGPUCustomBehaviour class.
///
//===----------------------------------------------------------------------===//

#include "AMDGPUCustomBehaviour.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "TargetInfo/AMDGPUTargetInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/WithColor.h"

namespace llvm::mca {

void AMDGPUInstrPostProcess::postProcessInstruction(
    std::unique_ptr<Instruction> &Inst, const MCInst &MCI) {
  switch (MCI.getOpcode()) {
  case AMDGPU::S_WAITCNT:
  case AMDGPU::S_WAITCNT_soft:
  case AMDGPU::S_WAITCNT_EXPCNT:
  case AMDGPU::S_WAITCNT_LGKMCNT:
  case AMDGPU::S_WAITCNT_VMCNT:
  case AMDGPU::S_WAITCNT_VSCNT:
  case AMDGPU::S_WAITCNT_VSCNT_soft:
  case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
  case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VSCNT_gfx10:
  case AMDGPU::S_WAITCNT_gfx10:
  case AMDGPU::S_WAITCNT_gfx6_gfx7:
  case AMDGPU::S_WAITCNT_vi:
    return processWaitCnt(Inst, MCI);
  }
}

// s_waitcnt instructions encode important information as immediate operands
// which are lost during the MCInst -> mca::Instruction lowering.
void AMDGPUInstrPostProcess::processWaitCnt(std::unique_ptr<Instruction> &Inst,
                                            const MCInst &MCI) {
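  // Forward every register and immediate operand from the MCInst into the
  // mca::Instruction so that AMDGPUCustomBehaviour::computeWaitCnt() can read
  // the waitcnt payload after lowering.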
  for (int Idx = 0, N = MCI.size(); Idx < N; Idx++) {
    MCAOperand Op;
    const MCOperand &MCOp = MCI.getOperand(Idx);
    if (MCOp.isReg()) {
      Op = MCAOperand::createReg(MCOp.getReg());
    } else if (MCOp.isImm()) {
      Op = MCAOperand::createImm(MCOp.getImm());
    }
    Op.setIndex(Idx);
    Inst->addOperand(Op);
  }
}

AMDGPUCustomBehaviour::AMDGPUCustomBehaviour(const MCSubtargetInfo &STI,
                                             const mca::SourceMgr &SrcMgr,
                                             const MCInstrInfo &MCII)
    : CustomBehaviour(STI, SrcMgr, MCII) {
  generateWaitCntInfo();
}

unsigned AMDGPUCustomBehaviour::checkCustomHazard(ArrayRef<InstRef> IssuedInst,
                                                  const InstRef &IR) {
  const Instruction &Inst = *IR.getInstruction();
  unsigned Opcode = Inst.getOpcode();

  // llvm-mca is generally run on fully compiled assembly so we wouldn't see
  // any pseudo instructions here. However, there are plans for the future to
  // make it possible to use mca within backend passes. As such, I have left
  // the pseudo version of s_waitcnt within this switch statement.
  switch (Opcode) {
  default:
    return 0;
  case AMDGPU::S_WAITCNT: // This instruction
  case AMDGPU::S_WAITCNT_soft:
  case AMDGPU::S_WAITCNT_EXPCNT:
  case AMDGPU::S_WAITCNT_LGKMCNT:
  case AMDGPU::S_WAITCNT_VMCNT:
  case AMDGPU::S_WAITCNT_VSCNT:
  case AMDGPU::S_WAITCNT_VSCNT_soft: // to this instruction are all pseudo.
  case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
  case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VSCNT_gfx10:
  case AMDGPU::S_WAITCNT_gfx10:
  case AMDGPU::S_WAITCNT_gfx6_gfx7:
  case AMDGPU::S_WAITCNT_vi:
    // s_endpgm also behaves as if there is an implicit
    // s_waitcnt 0, but I'm not sure if it would be appropriate
    // to model this in llvm-mca based on how the iterations work
    // while simulating the pipeline over and over.
    return handleWaitCnt(IssuedInst, IR);
  }

  return 0;
}

unsigned AMDGPUCustomBehaviour::handleWaitCnt(ArrayRef<InstRef> IssuedInst,
                                              const InstRef &IR) {
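  // The value returned to checkCustomHazard() is the number of cycles this
  // s_waitcnt should keep stalling; returning 0 lets it issue immediately.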
  // Currently, all s_waitcnt instructions are handled except s_waitcnt_depctr.
  // I do not know how that instruction works so I did not attempt to model it.
  // Start each counter at its maximum value.
  unsigned Vmcnt = 63;
  unsigned Expcnt = 7;
  unsigned Lgkmcnt = 31;
  unsigned Vscnt = 63;
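  // A counter that is still at its maximum after computeWaitCnt() was not
  // named by this s_waitcnt, so the checks below effectively ignore it.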
  unsigned CurrVmcnt = 0;
  unsigned CurrExpcnt = 0;
  unsigned CurrLgkmcnt = 0;
  unsigned CurrVscnt = 0;
  unsigned CyclesToWaitVm = ~0U;
  unsigned CyclesToWaitExp = ~0U;
  unsigned CyclesToWaitLgkm = ~0U;
  unsigned CyclesToWaitVs = ~0U;

  computeWaitCnt(IR, Vmcnt, Expcnt, Lgkmcnt, Vscnt);

  // We will now look at each of the currently executing instructions
  // to find out if this wait instruction still needs to wait.
  for (const InstRef &PrevIR : IssuedInst) {
    const Instruction &PrevInst = *PrevIR.getInstruction();
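    // Source indices keep growing as llvm-mca replays the input region for
    // each iteration, so wrap them to index into InstrWaitCntInfo, which holds
    // one entry per instruction in the source region.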
    const unsigned PrevInstIndex = PrevIR.getSourceIndex() % SrcMgr.size();
    const WaitCntInfo &PrevInstWaitInfo = InstrWaitCntInfo[PrevInstIndex];
    const int CyclesLeft = PrevInst.getCyclesLeft();
    assert(CyclesLeft != UNKNOWN_CYCLES &&
           "We should know how many cycles are left for this instruction");
    if (PrevInstWaitInfo.VmCnt) {
      CurrVmcnt++;
      if ((unsigned)CyclesLeft < CyclesToWaitVm)
        CyclesToWaitVm = CyclesLeft;
    }
    if (PrevInstWaitInfo.ExpCnt) {
      CurrExpcnt++;
      if ((unsigned)CyclesLeft < CyclesToWaitExp)
        CyclesToWaitExp = CyclesLeft;
    }
    if (PrevInstWaitInfo.LgkmCnt) {
      CurrLgkmcnt++;
      if ((unsigned)CyclesLeft < CyclesToWaitLgkm)
        CyclesToWaitLgkm = CyclesLeft;
    }
    if (PrevInstWaitInfo.VsCnt) {
      CurrVscnt++;
      if ((unsigned)CyclesLeft < CyclesToWaitVs)
        CyclesToWaitVs = CyclesLeft;
    }
  }

  unsigned CyclesToWait = ~0U;
  if (CurrVmcnt > Vmcnt && CyclesToWaitVm < CyclesToWait)
    CyclesToWait = CyclesToWaitVm;
  if (CurrExpcnt > Expcnt && CyclesToWaitExp < CyclesToWait)
    CyclesToWait = CyclesToWaitExp;
  if (CurrLgkmcnt > Lgkmcnt && CyclesToWaitLgkm < CyclesToWait)
    CyclesToWait = CyclesToWaitLgkm;
  if (CurrVscnt > Vscnt && CyclesToWaitVs < CyclesToWait)
    CyclesToWait = CyclesToWaitVs;

  // We may underestimate how many cycles we need to wait, but this
  // isn't a big deal. Our return value is just how many cycles until
  // this function gets run again. So as long as we don't overestimate
  // the wait time, we'll still end up stalling at this instruction
  // for the correct number of cycles.

  if (CyclesToWait == ~0U)
    return 0;
  return CyclesToWait;
}

void AMDGPUCustomBehaviour::computeWaitCnt(const InstRef &IR, unsigned &Vmcnt,
                                           unsigned &Expcnt, unsigned &Lgkmcnt,
                                           unsigned &Vscnt) {
  AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(STI.getCPU());
  const Instruction &Inst = *IR.getInstruction();
  unsigned Opcode = Inst.getOpcode();

  switch (Opcode) {
  case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
  case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VSCNT_gfx10: {
    // Should probably be checking for nullptr
    // here, but I'm not sure how I should handle the case
    // where we see a nullptr.
    const MCAOperand *OpReg = Inst.getOperand(0);
    const MCAOperand *OpImm = Inst.getOperand(1);
    assert(OpReg && OpReg->isReg() && "First operand should be a register.");
    assert(OpImm && OpImm->isImm() && "Second operand should be an immediate.");
    if (OpReg->getReg() != AMDGPU::SGPR_NULL) {
      // Instruction is using a real register.
      // Since we can't know what value this register will have,
      // we can't compute what the value of this wait should be.
      WithColor::warning() << "The register component of "
                           << MCII.getName(Opcode) << " will be completely "
                           << "ignored. So the wait may not be accurate.\n";
    }
    switch (Opcode) {
    // Redundant switch so I don't have to repeat the code above
    // for each case. There are more clever ways to avoid this
    // extra switch and anyone can feel free to implement one of them.
    case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
      Expcnt = OpImm->getImm();
      break;
    case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
      Lgkmcnt = OpImm->getImm();
      break;
    case AMDGPU::S_WAITCNT_VMCNT_gfx10:
      Vmcnt = OpImm->getImm();
      break;
    case AMDGPU::S_WAITCNT_VSCNT_gfx10:
      Vscnt = OpImm->getImm();
      break;
    }
    return;
  }
  case AMDGPU::S_WAITCNT_gfx10:
  case AMDGPU::S_WAITCNT_gfx6_gfx7:
  case AMDGPU::S_WAITCNT_vi:
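    // These forms encode all counters in a single packed simm16 immediate.
    // decodeWaitcnt() splits that payload into vmcnt/expcnt/lgkmcnt according
    // to the ISA version; the exact bit layout differs between generations.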
    unsigned WaitCnt = Inst.getOperand(0)->getImm();
    AMDGPU::decodeWaitcnt(IV, WaitCnt, Vmcnt, Expcnt, Lgkmcnt);
    return;
  }
}

void AMDGPUCustomBehaviour::generateWaitCntInfo() {
  // The core logic from this function is taken from
  // SIInsertWaitcnts::updateEventWaitcntAfter(). In that pass, the
  // instructions being looked at are in the MachineInstr format, whereas we
  // only have access to the MCInst format. A side effect of this is that we
  // can't use the mayAccessVMEMThroughFlat(Inst) or
  // mayAccessLDSThroughFlat(Inst) functions. Therefore, we conservatively
  // assume that these functions will return true. This may cause a few
  // instructions to be incorrectly tagged with an extra CNT. However, these
  // are instructions that do interact with at least one CNT, so giving them
  // an extra CNT shouldn't cause issues in most scenarios.
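  // For example, under this conservative approximation a FLAT load such as
  // flat_load_dword is tagged below as waiting on both lgkmcnt and vmcnt.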
  AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(STI.getCPU());
  InstrWaitCntInfo.resize(SrcMgr.size());

  for (const auto &EN : llvm::enumerate(SrcMgr.getInstructions())) {
    const std::unique_ptr<Instruction> &Inst = EN.value();
    unsigned Index = EN.index();
    unsigned Opcode = Inst->getOpcode();
    const MCInstrDesc &MCID = MCII.get(Opcode);
    if ((MCID.TSFlags & SIInstrFlags::DS) &&
        (MCID.TSFlags & SIInstrFlags::LGKM_CNT)) {
      InstrWaitCntInfo[Index].LgkmCnt = true;
      if (isAlwaysGDS(Opcode) || hasModifiersSet(Inst, AMDGPU::OpName::gds))
        InstrWaitCntInfo[Index].ExpCnt = true;
    } else if (MCID.TSFlags & SIInstrFlags::FLAT) {
      // We conservatively assume that mayAccessVMEMThroughFlat(Inst)
      // and mayAccessLDSThroughFlat(Inst) would both return true for this
      // instruction. We have to do this because those functions use
      // information about the memory operands that we don't have access to.
      InstrWaitCntInfo[Index].LgkmCnt = true;
      if (!STI.hasFeature(AMDGPU::FeatureVscnt))
        InstrWaitCntInfo[Index].VmCnt = true;
      else if (MCID.mayLoad() && !(MCID.TSFlags & SIInstrFlags::IsAtomicNoRet))
        InstrWaitCntInfo[Index].VmCnt = true;
      else
        InstrWaitCntInfo[Index].VsCnt = true;
    } else if (isVMEM(MCID) && !AMDGPU::getMUBUFIsBufferInv(Opcode)) {
      if (!STI.hasFeature(AMDGPU::FeatureVscnt))
        InstrWaitCntInfo[Index].VmCnt = true;
      else if ((MCID.mayLoad() &&
                !(MCID.TSFlags & SIInstrFlags::IsAtomicNoRet)) ||
               ((MCID.TSFlags & SIInstrFlags::MIMG) && !MCID.mayLoad() &&
                !MCID.mayStore()))
        InstrWaitCntInfo[Index].VmCnt = true;
      else if (MCID.mayStore())
        InstrWaitCntInfo[Index].VsCnt = true;

      // (IV.Major < 7) is meant to represent
      // GCNTarget.vmemWriteNeedsExpWaitcnt()
      // which is defined as
      // { return getGeneration() < SEA_ISLANDS; }
      if (IV.Major < 7 &&
          (MCID.mayStore() || (MCID.TSFlags & SIInstrFlags::IsAtomicRet)))
        InstrWaitCntInfo[Index].ExpCnt = true;
    } else if (MCID.TSFlags & SIInstrFlags::SMRD) {
      InstrWaitCntInfo[Index].LgkmCnt = true;
    } else if (MCID.TSFlags & SIInstrFlags::EXP) {
      InstrWaitCntInfo[Index].ExpCnt = true;
    } else {
      switch (Opcode) {
      case AMDGPU::S_SENDMSG:
      case AMDGPU::S_SENDMSGHALT:
      case AMDGPU::S_MEMTIME:
      case AMDGPU::S_MEMREALTIME:
        InstrWaitCntInfo[Index].LgkmCnt = true;
        break;
      }
    }
  }
}

// taken from SIInstrInfo::isVMEM()
bool AMDGPUCustomBehaviour::isVMEM(const MCInstrDesc &MCID) {
  return MCID.TSFlags & SIInstrFlags::MUBUF ||
         MCID.TSFlags & SIInstrFlags::MTBUF ||
         MCID.TSFlags & SIInstrFlags::MIMG || MCID.TSFlags & SIInstrFlags::FLAT;
}

// taken from SIInstrInfo::hasModifiersSet()
bool AMDGPUCustomBehaviour::hasModifiersSet(
    const std::unique_ptr<Instruction> &Inst, AMDGPU::OpName OpName) const {
  int Idx = AMDGPU::getNamedOperandIdx(Inst->getOpcode(), OpName);
  if (Idx == -1)
    return false;

  const MCAOperand *Op = Inst->getOperand(Idx);
  if (Op == nullptr || !Op->isImm() || !Op->getImm())
    return false;

  return true;
}
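// hasModifiersSet() is used above to detect the 'gds' operand bit on DS
// instructions, which makes them additionally count against expcnt.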

// taken from SIInstrInfo::isGWS()
bool AMDGPUCustomBehaviour::isGWS(uint16_t Opcode) const {
  const MCInstrDesc &MCID = MCII.get(Opcode);
  return MCID.TSFlags & SIInstrFlags::GWS;
}

// taken from SIInstrInfo::isAlwaysGDS()
bool AMDGPUCustomBehaviour::isAlwaysGDS(uint16_t Opcode) const {
  return Opcode == AMDGPU::DS_ORDERED_COUNT ||
         Opcode == AMDGPU::DS_ADD_GS_REG_RTN ||
         Opcode == AMDGPU::DS_SUB_GS_REG_RTN || isGWS(Opcode);
}

} // namespace llvm::mca

using namespace llvm;
using namespace mca;

static CustomBehaviour *
createAMDGPUCustomBehaviour(const MCSubtargetInfo &STI,
                            const mca::SourceMgr &SrcMgr,
                            const MCInstrInfo &MCII) {
  return new AMDGPUCustomBehaviour(STI, SrcMgr, MCII);
}

static InstrPostProcess *
createAMDGPUInstrPostProcess(const MCSubtargetInfo &STI,
                             const MCInstrInfo &MCII) {
  return new AMDGPUInstrPostProcess(STI, MCII);
}

/// Extern function to initialize the targets for the AMDGPU backend
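/// These hooks are registered for both the R600 and GCN targets and are
/// picked up automatically when llvm-mca is invoked with an AMDGPU triple,
/// e.g. `llvm-mca -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 input.s`
/// (illustrative invocation).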
extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
LLVMInitializeAMDGPUTargetMCA() {
  TargetRegistry::RegisterCustomBehaviour(getTheR600Target(),
                                          createAMDGPUCustomBehaviour);
  TargetRegistry::RegisterInstrPostProcess(getTheR600Target(),
                                           createAMDGPUInstrPostProcess);

  TargetRegistry::RegisterCustomBehaviour(getTheGCNTarget(),
                                          createAMDGPUCustomBehaviour);
  TargetRegistry::RegisterInstrPostProcess(getTheGCNTarget(),
                                           createAMDGPUInstrPostProcess);
}