1//===------------------ AMDGPUCustomBehaviour.cpp ---------------*-C++ -* -===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9///
10/// This file implements methods from the AMDGPUCustomBehaviour class.
11///
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPUCustomBehaviour.h"
15#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
16#include "TargetInfo/AMDGPUTargetInfo.h"
17#include "Utils/AMDGPUBaseInfo.h"
18#include "llvm/MC/TargetRegistry.h"
19#include "llvm/Support/Compiler.h"
20#include "llvm/Support/WithColor.h"
21#include "llvm/TargetParser/AMDGPUTargetParser.h"
22
23namespace llvm::mca {
24
25void AMDGPUInstrPostProcess::postProcessInstruction(Instruction &Inst,
26 const MCInst &MCI) {
27 switch (MCI.getOpcode()) {
28 case AMDGPU::S_WAITCNT:
29 case AMDGPU::S_WAITCNT_soft:
30 case AMDGPU::S_WAITCNT_EXPCNT:
31 case AMDGPU::S_WAITCNT_LGKMCNT:
32 case AMDGPU::S_WAITCNT_VMCNT:
33 case AMDGPU::S_WAITCNT_VSCNT:
34 case AMDGPU::S_WAITCNT_VSCNT_soft:
35 case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
36 case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
37 case AMDGPU::S_WAITCNT_VMCNT_gfx10:
38 case AMDGPU::S_WAITCNT_VSCNT_gfx10:
39 case AMDGPU::S_WAITCNT_gfx10:
40 case AMDGPU::S_WAITCNT_gfx6_gfx7:
41 case AMDGPU::S_WAITCNT_vi:
42 return processWaitCnt(Inst, MCI);
43 }
44}
45
46// s_waitcnt instructions encode important information as immediate operands
47// which are lost during the MCInst -> mca::Instruction lowering.
48void AMDGPUInstrPostProcess::processWaitCnt(Instruction &Inst,
49 const MCInst &MCI) {
50 for (int Idx = 0, N = MCI.size(); Idx < N; Idx++) {
51 MCAOperand Op;
52 const MCOperand &MCOp = MCI.getOperand(i: Idx);
53 if (MCOp.isReg()) {
54 Op = MCAOperand::createReg(Reg: MCOp.getReg());
55 } else if (MCOp.isImm()) {
56 Op = MCAOperand::createImm(Val: MCOp.getImm());
57 }
58 Op.setIndex(Idx);
59 Inst.addOperand(Op);
60 }
61}
62
63AMDGPUCustomBehaviour::AMDGPUCustomBehaviour(const MCSubtargetInfo &STI,
64 const mca::SourceMgr &SrcMgr,
65 const MCInstrInfo &MCII)
66 : CustomBehaviour(STI, SrcMgr, MCII) {
67 generateWaitCntInfo();
68}
69
70unsigned AMDGPUCustomBehaviour::checkCustomHazard(ArrayRef<InstRef> IssuedInst,
71 const InstRef &IR) {
72 const Instruction &Inst = *IR.getInstruction();
73 unsigned Opcode = Inst.getOpcode();
74
75 // llvm-mca is generally run on fully compiled assembly so we wouldn't see any
76 // pseudo instructions here. However, there are plans for the future to make
77 // it possible to use mca within backend passes. As such, I have left the
78 // pseudo version of s_waitcnt within this switch statement.
79 switch (Opcode) {
80 default:
81 return 0;
82 case AMDGPU::S_WAITCNT: // This instruction
83 case AMDGPU::S_WAITCNT_soft:
84 case AMDGPU::S_WAITCNT_EXPCNT:
85 case AMDGPU::S_WAITCNT_LGKMCNT:
86 case AMDGPU::S_WAITCNT_VMCNT:
87 case AMDGPU::S_WAITCNT_VSCNT:
88 case AMDGPU::S_WAITCNT_VSCNT_soft: // to this instruction are all pseudo.
89 case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
90 case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
91 case AMDGPU::S_WAITCNT_VMCNT_gfx10:
92 case AMDGPU::S_WAITCNT_VSCNT_gfx10:
93 case AMDGPU::S_WAITCNT_gfx10:
94 case AMDGPU::S_WAITCNT_gfx6_gfx7:
95 case AMDGPU::S_WAITCNT_vi:
96 // s_endpgm also behaves as if there is an implicit
97 // s_waitcnt 0, but I'm not sure if it would be appropriate
98 // to model this in llvm-mca based on how the iterations work
99 // while simulating the pipeline over and over.
100 return handleWaitCnt(IssuedInst, IR);
101 }
102
103 return 0;
104}
105
106unsigned AMDGPUCustomBehaviour::handleWaitCnt(ArrayRef<InstRef> IssuedInst,
107 const InstRef &IR) {
108 // Currently, all s_waitcnt instructions are handled except s_waitcnt_depctr.
109 // I do not know how that instruction works so I did not attempt to model it.
110 // set the max values to begin
111 unsigned Vmcnt = 63;
112 unsigned Expcnt = 7;
113 unsigned Lgkmcnt = 31;
114 unsigned Vscnt = 63;
115 unsigned CurrVmcnt = 0;
116 unsigned CurrExpcnt = 0;
117 unsigned CurrLgkmcnt = 0;
118 unsigned CurrVscnt = 0;
119 unsigned CyclesToWaitVm = ~0U;
120 unsigned CyclesToWaitExp = ~0U;
121 unsigned CyclesToWaitLgkm = ~0U;
122 unsigned CyclesToWaitVs = ~0U;
123
124 computeWaitCnt(IR, Vmcnt, Expcnt, Lgkmcnt, Vscnt);
125
126 // We will now look at each of the currently executing instructions
127 // to find out if this wait instruction still needs to wait.
128 for (const InstRef &PrevIR : IssuedInst) {
129 const Instruction &PrevInst = *PrevIR.getInstruction();
130 const unsigned PrevInstIndex = PrevIR.getSourceIndex() % SrcMgr.size();
131 const WaitCntInfo &PrevInstWaitInfo = InstrWaitCntInfo[PrevInstIndex];
132 const int CyclesLeft = PrevInst.getCyclesLeft();
133 assert(CyclesLeft != UNKNOWN_CYCLES &&
134 "We should know how many cycles are left for this instruction");
135 if (PrevInstWaitInfo.VmCnt) {
136 CurrVmcnt++;
137 if ((unsigned)CyclesLeft < CyclesToWaitVm)
138 CyclesToWaitVm = CyclesLeft;
139 }
140 if (PrevInstWaitInfo.ExpCnt) {
141 CurrExpcnt++;
142 if ((unsigned)CyclesLeft < CyclesToWaitExp)
143 CyclesToWaitExp = CyclesLeft;
144 }
145 if (PrevInstWaitInfo.LgkmCnt) {
146 CurrLgkmcnt++;
147 if ((unsigned)CyclesLeft < CyclesToWaitLgkm)
148 CyclesToWaitLgkm = CyclesLeft;
149 }
150 if (PrevInstWaitInfo.VsCnt) {
151 CurrVscnt++;
152 if ((unsigned)CyclesLeft < CyclesToWaitVs)
153 CyclesToWaitVs = CyclesLeft;
154 }
155 }
156
157 unsigned CyclesToWait = ~0U;
158 if (CurrVmcnt > Vmcnt && CyclesToWaitVm < CyclesToWait)
159 CyclesToWait = CyclesToWaitVm;
160 if (CurrExpcnt > Expcnt && CyclesToWaitExp < CyclesToWait)
161 CyclesToWait = CyclesToWaitExp;
162 if (CurrLgkmcnt > Lgkmcnt && CyclesToWaitLgkm < CyclesToWait)
163 CyclesToWait = CyclesToWaitLgkm;
164 if (CurrVscnt > Vscnt && CyclesToWaitVs < CyclesToWait)
165 CyclesToWait = CyclesToWaitVs;
166
167 // We may underestimate how many cycles we need to wait, but this
168 // isn't a big deal. Our return value is just how many cycles until
169 // this function gets run again. So as long as we don't overestimate
170 // the wait time, we'll still end up stalling at this instruction
171 // for the correct number of cycles.
172
173 if (CyclesToWait == ~0U)
174 return 0;
175 return CyclesToWait;
176}
177
178void AMDGPUCustomBehaviour::computeWaitCnt(const InstRef &IR, unsigned &Vmcnt,
179 unsigned &Expcnt, unsigned &Lgkmcnt,
180 unsigned &Vscnt) {
181 AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(GPU: STI.getCPU());
182 const Instruction &Inst = *IR.getInstruction();
183 unsigned Opcode = Inst.getOpcode();
184
185 switch (Opcode) {
186 case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
187 case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
188 case AMDGPU::S_WAITCNT_VMCNT_gfx10:
189 case AMDGPU::S_WAITCNT_VSCNT_gfx10: {
190 // Should probably be checking for nullptr
191 // here, but I'm not sure how I should handle the case
192 // where we see a nullptr.
193 const MCAOperand *OpReg = Inst.getOperand(Idx: 0);
194 const MCAOperand *OpImm = Inst.getOperand(Idx: 1);
195 assert(OpReg && OpReg->isReg() && "First operand should be a register.");
196 assert(OpImm && OpImm->isImm() && "Second operand should be an immediate.");
197 if (OpReg->getReg() != AMDGPU::SGPR_NULL) {
198 // Instruction is using a real register.
199 // Since we can't know what value this register will have,
200 // we can't compute what the value of this wait should be.
201 WithColor::warning() << "The register component of "
202 << MCII.getName(Opcode) << " will be completely "
203 << "ignored. So the wait may not be accurate.\n";
204 }
205 switch (Opcode) {
206 // Redundant switch so I don't have to repeat the code above
207 // for each case. There are more clever ways to avoid this
208 // extra switch and anyone can feel free to implement one of them.
209 case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
210 Expcnt = OpImm->getImm();
211 break;
212 case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
213 Lgkmcnt = OpImm->getImm();
214 break;
215 case AMDGPU::S_WAITCNT_VMCNT_gfx10:
216 Vmcnt = OpImm->getImm();
217 break;
218 case AMDGPU::S_WAITCNT_VSCNT_gfx10:
219 Vscnt = OpImm->getImm();
220 break;
221 }
222 return;
223 }
224 case AMDGPU::S_WAITCNT_gfx10:
225 case AMDGPU::S_WAITCNT_gfx6_gfx7:
226 case AMDGPU::S_WAITCNT_vi:
227 unsigned WaitCnt = Inst.getOperand(Idx: 0)->getImm();
228 AMDGPU::decodeWaitcnt(Version: IV, Waitcnt: WaitCnt, Vmcnt, Expcnt, Lgkmcnt);
229 return;
230 }
231}
232
233void AMDGPUCustomBehaviour::generateWaitCntInfo() {
234 // The core logic from this function is taken from
235 // SIInsertWaitcnts::updateEventWaitcntAfter() In that pass, the instructions
236 // that are being looked at are in the MachineInstr format, whereas we have
237 // access to the MCInst format. The side effects of this are that we can't use
238 // the mayAccessVMEMThroughFlat(Inst) or mayAccessLDSThroughFlat(Inst)
239 // functions. Therefore, we conservatively assume that these functions will
240 // return true. This may cause a few instructions to be incorrectly tagged
241 // with an extra CNT. However, these are instructions that do interact with at
242 // least one CNT so giving them an extra CNT shouldn't cause issues in most
243 // scenarios.
244 AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(GPU: STI.getCPU());
245 InstrWaitCntInfo.resize(new_size: SrcMgr.size());
246
247 for (const auto &EN : llvm::enumerate(First: SrcMgr.getInstructions())) {
248 const std::unique_ptr<Instruction> &Inst = EN.value();
249 unsigned Index = EN.index();
250 unsigned Opcode = Inst->getOpcode();
251 const MCInstrDesc &MCID = MCII.get(Opcode);
252 if ((MCID.TSFlags & SIInstrFlags::DS) &&
253 (MCID.TSFlags & SIInstrFlags::LGKM_CNT)) {
254 InstrWaitCntInfo[Index].LgkmCnt = true;
255 if (isAlwaysGDS(Opcode) || hasModifiersSet(Inst, OpName: AMDGPU::OpName::gds))
256 InstrWaitCntInfo[Index].ExpCnt = true;
257 } else if (MCID.TSFlags & SIInstrFlags::FLAT) {
258 // We conservatively assume that mayAccessVMEMThroughFlat(Inst)
259 // and mayAccessLDSThroughFlat(Inst) would both return true for this
260 // instruction. We have to do this because those functions use
261 // information about the memory operands that we don't have access to.
262 InstrWaitCntInfo[Index].LgkmCnt = true;
263 if (!STI.hasFeature(Feature: AMDGPU::FeatureVscnt))
264 InstrWaitCntInfo[Index].VmCnt = true;
265 else if (MCID.mayLoad() && !(MCID.TSFlags & SIInstrFlags::IsAtomicNoRet))
266 InstrWaitCntInfo[Index].VmCnt = true;
267 else
268 InstrWaitCntInfo[Index].VsCnt = true;
269 } else if (isVMEM(MCID) && !AMDGPU::getMUBUFIsBufferInv(Opc: Opcode)) {
270 if (!STI.hasFeature(Feature: AMDGPU::FeatureVscnt))
271 InstrWaitCntInfo[Index].VmCnt = true;
272 else if ((MCID.mayLoad() &&
273 !(MCID.TSFlags & SIInstrFlags::IsAtomicNoRet)) ||
274 ((MCID.TSFlags & SIInstrFlags::MIMG) && !MCID.mayLoad() &&
275 !MCID.mayStore()))
276 InstrWaitCntInfo[Index].VmCnt = true;
277 else if (MCID.mayStore())
278 InstrWaitCntInfo[Index].VsCnt = true;
279
280 // (IV.Major < 7) is meant to represent
281 // GCNTarget.vmemWriteNeedsExpWaitcnt()
282 // which is defined as
283 // { return getGeneration() < SEA_ISLANDS; }
284 if (IV.Major < 7 &&
285 (MCID.mayStore() || (MCID.TSFlags & SIInstrFlags::IsAtomicRet)))
286 InstrWaitCntInfo[Index].ExpCnt = true;
287 } else if (MCID.TSFlags & SIInstrFlags::SMRD) {
288 InstrWaitCntInfo[Index].LgkmCnt = true;
289 } else if (MCID.TSFlags & SIInstrFlags::EXP) {
290 InstrWaitCntInfo[Index].ExpCnt = true;
291 } else {
292 switch (Opcode) {
293 case AMDGPU::S_SENDMSG:
294 case AMDGPU::S_SENDMSGHALT:
295 case AMDGPU::S_MEMTIME:
296 case AMDGPU::S_MEMREALTIME:
297 InstrWaitCntInfo[Index].LgkmCnt = true;
298 break;
299 }
300 }
301 }
302}
303
304// taken from SIInstrInfo::isVMEM()
305bool AMDGPUCustomBehaviour::isVMEM(const MCInstrDesc &MCID) {
306 return MCID.TSFlags & SIInstrFlags::MUBUF ||
307 MCID.TSFlags & SIInstrFlags::MTBUF ||
308 MCID.TSFlags & SIInstrFlags::MIMG || MCID.TSFlags & SIInstrFlags::FLAT;
309}
310
311// taken from SIInstrInfo::hasModifiersSet()
312bool AMDGPUCustomBehaviour::hasModifiersSet(
313 const std::unique_ptr<Instruction> &Inst, AMDGPU::OpName OpName) const {
314 int Idx = AMDGPU::getNamedOperandIdx(Opcode: Inst->getOpcode(), Name: OpName);
315 if (Idx == -1)
316 return false;
317
318 const MCAOperand *Op = Inst->getOperand(Idx);
319 if (Op == nullptr || !Op->isImm() || !Op->getImm())
320 return false;
321
322 return true;
323}
324
325// taken from SIInstrInfo::isGWS()
326bool AMDGPUCustomBehaviour::isGWS(uint32_t Opcode) const {
327 const MCInstrDesc &MCID = MCII.get(Opcode);
328 return MCID.TSFlags & SIInstrFlags::GWS;
329}
330
331// taken from SIInstrInfo::isAlwaysGDS()
332bool AMDGPUCustomBehaviour::isAlwaysGDS(uint32_t Opcode) const {
333 return Opcode == AMDGPU::DS_ORDERED_COUNT ||
334 Opcode == AMDGPU::DS_ADD_GS_REG_RTN ||
335 Opcode == AMDGPU::DS_SUB_GS_REG_RTN || isGWS(Opcode);
336}
337
338} // namespace llvm::mca
339
340using namespace llvm;
341using namespace mca;
342
343static CustomBehaviour *
344createAMDGPUCustomBehaviour(const MCSubtargetInfo &STI,
345 const mca::SourceMgr &SrcMgr,
346 const MCInstrInfo &MCII) {
347 return new AMDGPUCustomBehaviour(STI, SrcMgr, MCII);
348}
349
350static InstrPostProcess *
351createAMDGPUInstrPostProcess(const MCSubtargetInfo &STI,
352 const MCInstrInfo &MCII) {
353 return new AMDGPUInstrPostProcess(STI, MCII);
354}
355
356/// Extern function to initialize the targets for the AMDGPU backend
357
358extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
359LLVMInitializeAMDGPUTargetMCA() {
360 TargetRegistry::RegisterCustomBehaviour(T&: getTheR600Target(),
361 Fn: createAMDGPUCustomBehaviour);
362 TargetRegistry::RegisterInstrPostProcess(T&: getTheR600Target(),
363 Fn: createAMDGPUInstrPostProcess);
364
365 TargetRegistry::RegisterCustomBehaviour(T&: getTheGCNTarget(),
366 Fn: createAMDGPUCustomBehaviour);
367 TargetRegistry::RegisterInstrPostProcess(T&: getTheGCNTarget(),
368 Fn: createAMDGPUInstrPostProcess);
369}
370