//===------------------ AMDGPUCustomBehaviour.cpp ---------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
///
/// This file implements methods from the AMDGPUCustomBehaviour class.
///
//===----------------------------------------------------------------------===//

#include "AMDGPUCustomBehaviour.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "TargetInfo/AMDGPUTargetInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/WithColor.h"

namespace llvm::mca {

void AMDGPUInstrPostProcess::postProcessInstruction(
    std::unique_ptr<Instruction> &Inst, const MCInst &MCI) {
  switch (MCI.getOpcode()) {
  case AMDGPU::S_WAITCNT:
  case AMDGPU::S_WAITCNT_soft:
  case AMDGPU::S_WAITCNT_EXPCNT:
  case AMDGPU::S_WAITCNT_LGKMCNT:
  case AMDGPU::S_WAITCNT_VMCNT:
  case AMDGPU::S_WAITCNT_VSCNT:
  case AMDGPU::S_WAITCNT_VSCNT_soft:
  case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
  case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VSCNT_gfx10:
  case AMDGPU::S_WAITCNT_gfx10:
  case AMDGPU::S_WAITCNT_gfx6_gfx7:
  case AMDGPU::S_WAITCNT_vi:
    return processWaitCnt(Inst, MCI);
  }
}

// s_waitcnt instructions encode important information as immediate operands
// which are lost during the MCInst -> mca::Instruction lowering.
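// For example, 's_waitcnt vmcnt(0) lgkmcnt(0)' carries its counter thresholds
// in an immediate operand, so we copy every MCInst operand back onto the
// mca::Instruction so checkCustomHazard()/computeWaitCnt() can read them.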
void AMDGPUInstrPostProcess::processWaitCnt(std::unique_ptr<Instruction> &Inst,
                                            const MCInst &MCI) {
  for (int Idx = 0, N = MCI.size(); Idx < N; Idx++) {
    MCAOperand Op;
    const MCOperand &MCOp = MCI.getOperand(Idx);
    if (MCOp.isReg()) {
      Op = MCAOperand::createReg(MCOp.getReg());
    } else if (MCOp.isImm()) {
      Op = MCAOperand::createImm(MCOp.getImm());
    }
    Op.setIndex(Idx);
    Inst->addOperand(Op);
  }
}

AMDGPUCustomBehaviour::AMDGPUCustomBehaviour(const MCSubtargetInfo &STI,
                                             const mca::SourceMgr &SrcMgr,
                                             const MCInstrInfo &MCII)
    : CustomBehaviour(STI, SrcMgr, MCII) {
  generateWaitCntInfo();
}

unsigned AMDGPUCustomBehaviour::checkCustomHazard(ArrayRef<InstRef> IssuedInst,
                                                  const InstRef &IR) {
  const Instruction &Inst = *IR.getInstruction();
  unsigned Opcode = Inst.getOpcode();

  // llvm-mca is generally run on fully compiled assembly, so we wouldn't
  // expect to see any pseudo instructions here. However, there are plans to
  // make it possible to use mca within backend passes in the future. As such,
  // I have left the pseudo version of s_waitcnt within this switch statement.
  switch (Opcode) {
  default:
    return 0;
  case AMDGPU::S_WAITCNT: // This instruction
  case AMDGPU::S_WAITCNT_soft:
  case AMDGPU::S_WAITCNT_EXPCNT:
  case AMDGPU::S_WAITCNT_LGKMCNT:
  case AMDGPU::S_WAITCNT_VMCNT:
  case AMDGPU::S_WAITCNT_VSCNT:
  case AMDGPU::S_WAITCNT_VSCNT_soft: // to this instruction are all pseudo.
  case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
  case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VSCNT_gfx10:
  case AMDGPU::S_WAITCNT_gfx10:
  case AMDGPU::S_WAITCNT_gfx6_gfx7:
  case AMDGPU::S_WAITCNT_vi:
    // s_endpgm also behaves as if there is an implicit
    // s_waitcnt 0, but I'm not sure if it would be appropriate
    // to model this in llvm-mca based on how the iterations work
    // while simulating the pipeline over and over.
    return handleWaitCnt(IssuedInst, IR);
  }

  return 0;
}

unsigned AMDGPUCustomBehaviour::handleWaitCnt(ArrayRef<InstRef> IssuedInst,
                                              const InstRef &IR) {
  // Currently, all s_waitcnt instructions are handled except s_waitcnt_depctr.
  // I do not know how that instruction works, so I did not attempt to model it.
  // Set each counter to its maximum value to begin with (i.e. no wait needed).
  unsigned Vmcnt = 63;
  unsigned Expcnt = 7;
  unsigned Lgkmcnt = 31;
  unsigned Vscnt = 63;
  unsigned CurrVmcnt = 0;
  unsigned CurrExpcnt = 0;
  unsigned CurrLgkmcnt = 0;
  unsigned CurrVscnt = 0;
  unsigned CyclesToWaitVm = ~0U;
  unsigned CyclesToWaitExp = ~0U;
  unsigned CyclesToWaitLgkm = ~0U;
  unsigned CyclesToWaitVs = ~0U;

  computeWaitCnt(IR, Vmcnt, Expcnt, Lgkmcnt, Vscnt);

  // We will now look at each of the currently executing instructions
  // to find out if this wait instruction still needs to wait.
  for (const InstRef &PrevIR : IssuedInst) {
    const Instruction &PrevInst = *PrevIR.getInstruction();
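    // getSourceIndex() keeps increasing across simulated iterations of the
    // source, so reduce it modulo the source size to index InstrWaitCntInfo.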
    const unsigned PrevInstIndex = PrevIR.getSourceIndex() % SrcMgr.size();
    const WaitCntInfo &PrevInstWaitInfo = InstrWaitCntInfo[PrevInstIndex];
    const int CyclesLeft = PrevInst.getCyclesLeft();
    assert(CyclesLeft != UNKNOWN_CYCLES &&
           "We should know how many cycles are left for this instruction");
    if (PrevInstWaitInfo.VmCnt) {
      CurrVmcnt++;
      if ((unsigned)CyclesLeft < CyclesToWaitVm)
        CyclesToWaitVm = CyclesLeft;
    }
    if (PrevInstWaitInfo.ExpCnt) {
      CurrExpcnt++;
      if ((unsigned)CyclesLeft < CyclesToWaitExp)
        CyclesToWaitExp = CyclesLeft;
    }
    if (PrevInstWaitInfo.LgkmCnt) {
      CurrLgkmcnt++;
      if ((unsigned)CyclesLeft < CyclesToWaitLgkm)
        CyclesToWaitLgkm = CyclesLeft;
    }
    if (PrevInstWaitInfo.VsCnt) {
      CurrVscnt++;
      if ((unsigned)CyclesLeft < CyclesToWaitVs)
        CyclesToWaitVs = CyclesLeft;
    }
  }

  unsigned CyclesToWait = ~0U;
  if (CurrVmcnt > Vmcnt && CyclesToWaitVm < CyclesToWait)
    CyclesToWait = CyclesToWaitVm;
  if (CurrExpcnt > Expcnt && CyclesToWaitExp < CyclesToWait)
    CyclesToWait = CyclesToWaitExp;
  if (CurrLgkmcnt > Lgkmcnt && CyclesToWaitLgkm < CyclesToWait)
    CyclesToWait = CyclesToWaitLgkm;
  if (CurrVscnt > Vscnt && CyclesToWaitVs < CyclesToWait)
    CyclesToWait = CyclesToWaitVs;

  // We may underestimate how many cycles we need to wait, but this
  // isn't a big deal. Our return value is just how many cycles until
  // this function gets run again. So as long as we don't overestimate
  // the wait time, we'll still end up stalling at this instruction
  // for the correct number of cycles.

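  // A result of 0 tells checkCustomHazard() that there is no outstanding
  // hazard; any other value is the number of cycles to stall before this
  // hazard is evaluated again.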
  if (CyclesToWait == ~0U)
    return 0;
  return CyclesToWait;
}

void AMDGPUCustomBehaviour::computeWaitCnt(const InstRef &IR, unsigned &Vmcnt,
                                           unsigned &Expcnt, unsigned &Lgkmcnt,
                                           unsigned &Vscnt) {
  AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(STI.getCPU());
  const Instruction &Inst = *IR.getInstruction();
  unsigned Opcode = Inst.getOpcode();

  switch (Opcode) {
  case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
  case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VSCNT_gfx10: {
    // Should probably be checking for nullptr
    // here, but I'm not sure how I should handle the case
    // where we see a nullptr.
    const MCAOperand *OpReg = Inst.getOperand(0);
    const MCAOperand *OpImm = Inst.getOperand(1);
    assert(OpReg && OpReg->isReg() && "First operand should be a register.");
    assert(OpImm && OpImm->isImm() && "Second operand should be an immediate.");
    if (OpReg->getReg() != AMDGPU::SGPR_NULL) {
      // Instruction is using a real register.
      // Since we can't know what value this register will have,
      // we can't compute what the value of this wait should be.
      WithColor::warning() << "The register component of "
                           << MCII.getName(Opcode) << " will be completely "
                           << "ignored. So the wait may not be accurate.\n";
    }
    switch (Opcode) {
    // Redundant switch so I don't have to repeat the code above
    // for each case. There are more clever ways to avoid this
    // extra switch and anyone can feel free to implement one of them.
    case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
      Expcnt = OpImm->getImm();
      break;
    case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
      Lgkmcnt = OpImm->getImm();
      break;
    case AMDGPU::S_WAITCNT_VMCNT_gfx10:
      Vmcnt = OpImm->getImm();
      break;
    case AMDGPU::S_WAITCNT_VSCNT_gfx10:
      Vscnt = OpImm->getImm();
      break;
    }
    return;
  }
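  // The older s_waitcnt forms pack vmcnt, expcnt, and lgkmcnt into a single
  // immediate whose field layout depends on the ISA version, so we rely on
  // AMDGPU::decodeWaitcnt() to extract the individual counters for us.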
  case AMDGPU::S_WAITCNT_gfx10:
  case AMDGPU::S_WAITCNT_gfx6_gfx7:
  case AMDGPU::S_WAITCNT_vi:
    unsigned WaitCnt = Inst.getOperand(0)->getImm();
    AMDGPU::decodeWaitcnt(IV, WaitCnt, Vmcnt, Expcnt, Lgkmcnt);
    return;
  }
}

void AMDGPUCustomBehaviour::generateWaitCntInfo() {
  // The core logic from this function is taken from
  // SIInsertWaitcnts::updateEventWaitcntAfter(). In that pass, the
  // instructions that are being looked at are in the MachineInstr format,
  // whereas we have access to the MCInst format. The side effects of this are
  // that we can't use the mayAccessVMEMThroughFlat(Inst) or
  // mayAccessLDSThroughFlat(Inst) functions. Therefore, we conservatively
  // assume that these functions will return true. This may cause a few
  // instructions to be incorrectly tagged with an extra CNT. However, these
  // are instructions that do interact with at least one CNT, so giving them an
  // extra CNT shouldn't cause issues in most scenarios.
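  // Roughly speaking: VmCnt tracks outstanding VMEM reads, VsCnt tracks VMEM
  // writes on targets with FeatureVscnt, LgkmCnt tracks LDS/GDS/scalar-memory
  // (and message) operations, and ExpCnt tracks exports and GDS operations.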
  AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(STI.getCPU());
  InstrWaitCntInfo.resize(SrcMgr.size());

  for (const auto &EN : llvm::enumerate(SrcMgr.getInstructions())) {
    const std::unique_ptr<Instruction> &Inst = EN.value();
    unsigned Index = EN.index();
    unsigned Opcode = Inst->getOpcode();
    const MCInstrDesc &MCID = MCII.get(Opcode);
    if ((MCID.TSFlags & SIInstrFlags::DS) &&
        (MCID.TSFlags & SIInstrFlags::LGKM_CNT)) {
      InstrWaitCntInfo[Index].LgkmCnt = true;
      if (isAlwaysGDS(Opcode) || hasModifiersSet(Inst, AMDGPU::OpName::gds))
        InstrWaitCntInfo[Index].ExpCnt = true;
    } else if (MCID.TSFlags & SIInstrFlags::FLAT) {
      // We conservatively assume that mayAccessVMEMThroughFlat(Inst)
      // and mayAccessLDSThroughFlat(Inst) would both return true for this
      // instruction. We have to do this because those functions use
      // information about the memory operands that we don't have access to.
      InstrWaitCntInfo[Index].LgkmCnt = true;
      if (!STI.hasFeature(AMDGPU::FeatureVscnt))
        InstrWaitCntInfo[Index].VmCnt = true;
      else if (MCID.mayLoad() && !(MCID.TSFlags & SIInstrFlags::IsAtomicNoRet))
        InstrWaitCntInfo[Index].VmCnt = true;
      else
        InstrWaitCntInfo[Index].VsCnt = true;
    } else if (isVMEM(MCID) && !AMDGPU::getMUBUFIsBufferInv(Opcode)) {
      if (!STI.hasFeature(AMDGPU::FeatureVscnt))
        InstrWaitCntInfo[Index].VmCnt = true;
      else if ((MCID.mayLoad() &&
                !(MCID.TSFlags & SIInstrFlags::IsAtomicNoRet)) ||
               ((MCID.TSFlags & SIInstrFlags::MIMG) && !MCID.mayLoad() &&
                !MCID.mayStore()))
        InstrWaitCntInfo[Index].VmCnt = true;
      else if (MCID.mayStore())
        InstrWaitCntInfo[Index].VsCnt = true;

      // (IV.Major < 7) is meant to represent
      // GCNTarget.vmemWriteNeedsExpWaitcnt()
      // which is defined as
      // { return getGeneration() < SEA_ISLANDS; }
      if (IV.Major < 7 &&
          (MCID.mayStore() || (MCID.TSFlags & SIInstrFlags::IsAtomicRet)))
        InstrWaitCntInfo[Index].ExpCnt = true;
    } else if (MCID.TSFlags & SIInstrFlags::SMRD) {
      InstrWaitCntInfo[Index].LgkmCnt = true;
    } else if (MCID.TSFlags & SIInstrFlags::EXP) {
      InstrWaitCntInfo[Index].ExpCnt = true;
    } else {
      switch (Opcode) {
      case AMDGPU::S_SENDMSG:
      case AMDGPU::S_SENDMSGHALT:
      case AMDGPU::S_MEMTIME:
      case AMDGPU::S_MEMREALTIME:
        InstrWaitCntInfo[Index].LgkmCnt = true;
        break;
      }
    }
  }
}

// taken from SIInstrInfo::isVMEM()
bool AMDGPUCustomBehaviour::isVMEM(const MCInstrDesc &MCID) {
  return MCID.TSFlags & SIInstrFlags::MUBUF ||
         MCID.TSFlags & SIInstrFlags::MTBUF ||
         MCID.TSFlags & SIInstrFlags::MIMG;
}

// taken from SIInstrInfo::hasModifiersSet()
bool AMDGPUCustomBehaviour::hasModifiersSet(
    const std::unique_ptr<Instruction> &Inst, unsigned OpName) const {
  int Idx = AMDGPU::getNamedOperandIdx(Inst->getOpcode(), OpName);
  if (Idx == -1)
    return false;

  const MCAOperand *Op = Inst->getOperand(Idx);
  if (Op == nullptr || !Op->isImm() || !Op->getImm())
    return false;

  return true;
}

// taken from SIInstrInfo::isGWS()
bool AMDGPUCustomBehaviour::isGWS(uint16_t Opcode) const {
  const MCInstrDesc &MCID = MCII.get(Opcode);
  return MCID.TSFlags & SIInstrFlags::GWS;
}

// taken from SIInstrInfo::isAlwaysGDS()
bool AMDGPUCustomBehaviour::isAlwaysGDS(uint16_t Opcode) const {
  return Opcode == AMDGPU::DS_ORDERED_COUNT || isGWS(Opcode);
}

} // namespace llvm::mca

using namespace llvm;
using namespace mca;

static CustomBehaviour *
createAMDGPUCustomBehaviour(const MCSubtargetInfo &STI,
                            const mca::SourceMgr &SrcMgr,
                            const MCInstrInfo &MCII) {
  return new AMDGPUCustomBehaviour(STI, SrcMgr, MCII);
}

static InstrPostProcess *
createAMDGPUInstrPostProcess(const MCSubtargetInfo &STI,
                             const MCInstrInfo &MCII) {
  return new AMDGPUInstrPostProcess(STI, MCII);
}

/// Extern function to initialize the targets for the AMDGPU backend
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTargetMCA() {
  TargetRegistry::RegisterCustomBehaviour(getTheR600Target(),
                                          createAMDGPUCustomBehaviour);
  TargetRegistry::RegisterInstrPostProcess(getTheR600Target(),
                                           createAMDGPUInstrPostProcess);

  TargetRegistry::RegisterCustomBehaviour(getTheGCNTarget(),
                                          createAMDGPUCustomBehaviour);
  TargetRegistry::RegisterInstrPostProcess(getTheGCNTarget(),
                                           createAMDGPUInstrPostProcess);
}