//===----------------- AMDGPUCustomBehaviour.cpp ----------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
///
/// This file implements methods from the AMDGPUCustomBehaviour class.
///
//===----------------------------------------------------------------------===//

#include "AMDGPUCustomBehaviour.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "TargetInfo/AMDGPUTargetInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/WithColor.h"

namespace llvm::mca {

void AMDGPUInstrPostProcess::postProcessInstruction(
    std::unique_ptr<Instruction> &Inst, const MCInst &MCI) {
  switch (MCI.getOpcode()) {
  case AMDGPU::S_WAITCNT:
  case AMDGPU::S_WAITCNT_soft:
  case AMDGPU::S_WAITCNT_EXPCNT:
  case AMDGPU::S_WAITCNT_LGKMCNT:
  case AMDGPU::S_WAITCNT_VMCNT:
  case AMDGPU::S_WAITCNT_VSCNT:
  case AMDGPU::S_WAITCNT_VSCNT_soft:
  case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
  case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VSCNT_gfx10:
  case AMDGPU::S_WAITCNT_gfx10:
  case AMDGPU::S_WAITCNT_gfx6_gfx7:
  case AMDGPU::S_WAITCNT_vi:
    return processWaitCnt(Inst, MCI);
  }
}

// s_waitcnt instructions encode important information (their counter limits)
// as immediate operands, which are lost during the MCInst -> mca::Instruction
// lowering; this hook re-attaches them so they can be inspected later.
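// For example, "s_waitcnt vmcnt(0) lgkmcnt(0)" carries its counter limits
// entirely in immediates; without these operands re-attached,
// handleWaitCnt() below would have nothing to decode.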
void AMDGPUInstrPostProcess::processWaitCnt(std::unique_ptr<Instruction> &Inst,
                                            const MCInst &MCI) {
  for (int Idx = 0, N = MCI.size(); Idx < N; Idx++) {
    MCAOperand Op;
    const MCOperand &MCOp = MCI.getOperand(Idx);
    if (MCOp.isReg()) {
      Op = MCAOperand::createReg(MCOp.getReg());
    } else if (MCOp.isImm()) {
      Op = MCAOperand::createImm(MCOp.getImm());
    }
    Op.setIndex(Idx);
    Inst->addOperand(Op);
  }
}

AMDGPUCustomBehaviour::AMDGPUCustomBehaviour(const MCSubtargetInfo &STI,
                                             const mca::SourceMgr &SrcMgr,
                                             const MCInstrInfo &MCII)
    : CustomBehaviour(STI, SrcMgr, MCII) {
  generateWaitCntInfo();
}

unsigned AMDGPUCustomBehaviour::checkCustomHazard(ArrayRef<InstRef> IssuedInst,
                                                  const InstRef &IR) {
  const Instruction &Inst = *IR.getInstruction();
  unsigned Opcode = Inst.getOpcode();

  // llvm-mca is generally run on fully compiled assembly, so we wouldn't see
  // any pseudo instructions here. However, there are plans for the future to
  // make it possible to use mca within backend passes. As such, I have left
  // the pseudo versions of s_waitcnt within this switch statement.
  switch (Opcode) {
  default:
    return 0;
  case AMDGPU::S_WAITCNT: // This instruction
  case AMDGPU::S_WAITCNT_soft:
  case AMDGPU::S_WAITCNT_EXPCNT:
  case AMDGPU::S_WAITCNT_LGKMCNT:
  case AMDGPU::S_WAITCNT_VMCNT:
  case AMDGPU::S_WAITCNT_VSCNT:
  case AMDGPU::S_WAITCNT_VSCNT_soft: // to this instruction are all pseudo.
  case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
  case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VSCNT_gfx10:
  case AMDGPU::S_WAITCNT_gfx10:
  case AMDGPU::S_WAITCNT_gfx6_gfx7:
  case AMDGPU::S_WAITCNT_vi:
    // s_endpgm also behaves as if there is an implicit
    // s_waitcnt 0, but I'm not sure if it would be appropriate
    // to model this in llvm-mca based on how the iterations work
    // while simulating the pipeline over and over.
    return handleWaitCnt(IssuedInst, IR);
  }

  return 0;
}

unsigned AMDGPUCustomBehaviour::handleWaitCnt(ArrayRef<InstRef> IssuedInst,
                                              const InstRef &IR) {
  // Currently, all s_waitcnt instructions are handled except s_waitcnt_depctr.
  // I do not know how that instruction works so I did not attempt to model it.
  // Begin with each counter at its maximum value; a counter only forces a
  // wait if the s_waitcnt instruction requests a smaller value.
  unsigned Vmcnt = 63;
  unsigned Expcnt = 7;
  unsigned Lgkmcnt = 31;
  unsigned Vscnt = 63;
  unsigned CurrVmcnt = 0;
  unsigned CurrExpcnt = 0;
  unsigned CurrLgkmcnt = 0;
  unsigned CurrVscnt = 0;
  // ~0U is a sentinel meaning that no in-flight instruction affects this
  // counter yet.
  unsigned CyclesToWaitVm = ~0U;
  unsigned CyclesToWaitExp = ~0U;
  unsigned CyclesToWaitLgkm = ~0U;
  unsigned CyclesToWaitVs = ~0U;

  computeWaitCnt(IR, Vmcnt, Expcnt, Lgkmcnt, Vscnt);

  // We will now look at each of the currently executing instructions
  // to find out if this wait instruction still needs to wait.
  for (const InstRef &PrevIR : IssuedInst) {
    const Instruction &PrevInst = *PrevIR.getInstruction();
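    // Source indices keep growing as llvm-mca re-runs the input region for
    // each iteration, so take the index modulo the region size to map back
    // to an entry in InstrWaitCntInfo.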
    const unsigned PrevInstIndex = PrevIR.getSourceIndex() % SrcMgr.size();
    const WaitCntInfo &PrevInstWaitInfo = InstrWaitCntInfo[PrevInstIndex];
    const int CyclesLeft = PrevInst.getCyclesLeft();
    assert(CyclesLeft != UNKNOWN_CYCLES &&
           "We should know how many cycles are left for this instruction");
    if (PrevInstWaitInfo.VmCnt) {
      CurrVmcnt++;
      if ((unsigned)CyclesLeft < CyclesToWaitVm)
        CyclesToWaitVm = CyclesLeft;
    }
    if (PrevInstWaitInfo.ExpCnt) {
      CurrExpcnt++;
      if ((unsigned)CyclesLeft < CyclesToWaitExp)
        CyclesToWaitExp = CyclesLeft;
    }
    if (PrevInstWaitInfo.LgkmCnt) {
      CurrLgkmcnt++;
      if ((unsigned)CyclesLeft < CyclesToWaitLgkm)
        CyclesToWaitLgkm = CyclesLeft;
    }
    if (PrevInstWaitInfo.VsCnt) {
      CurrVscnt++;
      if ((unsigned)CyclesLeft < CyclesToWaitVs)
        CyclesToWaitVs = CyclesLeft;
    }
  }

  unsigned CyclesToWait = ~0U;
  if (CurrVmcnt > Vmcnt && CyclesToWaitVm < CyclesToWait)
    CyclesToWait = CyclesToWaitVm;
  if (CurrExpcnt > Expcnt && CyclesToWaitExp < CyclesToWait)
    CyclesToWait = CyclesToWaitExp;
  if (CurrLgkmcnt > Lgkmcnt && CyclesToWaitLgkm < CyclesToWait)
    CyclesToWait = CyclesToWaitLgkm;
  if (CurrVscnt > Vscnt && CyclesToWaitVs < CyclesToWait)
    CyclesToWait = CyclesToWaitVs;

  // We may underestimate how many cycles we need to wait, but this
  // isn't a big deal. Our return value is just how many cycles until
  // this function gets run again. So as long as we don't overestimate
  // the wait time, we'll still end up stalling at this instruction
  // for the correct number of cycles.

  if (CyclesToWait == ~0U)
    return 0;
  return CyclesToWait;
}

void AMDGPUCustomBehaviour::computeWaitCnt(const InstRef &IR, unsigned &Vmcnt,
                                           unsigned &Expcnt, unsigned &Lgkmcnt,
                                           unsigned &Vscnt) {
  AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(STI.getCPU());
  const Instruction &Inst = *IR.getInstruction();
  unsigned Opcode = Inst.getOpcode();

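  // Two encodings need to be handled here: the split gfx10 forms (e.g.
  // "s_waitcnt_vscnt null, 0x0") take a register plus an immediate for a
  // single counter, while the combined forms pack every counter into one
  // immediate that decodeWaitcnt() knows how to unpack.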
  switch (Opcode) {
  case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
  case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VSCNT_gfx10: {
    // Should probably be checking for nullptr here, but I'm not sure how I
    // should handle the case where we see a nullptr.
    const MCAOperand *OpReg = Inst.getOperand(0);
    const MCAOperand *OpImm = Inst.getOperand(1);
    assert(OpReg && OpReg->isReg() && "First operand should be a register.");
    assert(OpImm && OpImm->isImm() && "Second operand should be an immediate.");
    if (OpReg->getReg() != AMDGPU::SGPR_NULL) {
      // Instruction is using a real register.
      // Since we can't know what value this register will have,
      // we can't compute what the value of this wait should be.
      WithColor::warning() << "The register component of "
                           << MCII.getName(Opcode) << " will be completely "
                           << "ignored, so the wait may not be accurate.\n";
    }
    switch (Opcode) {
    // Redundant switch so I don't have to repeat the code above
    // for each case. There are more clever ways to avoid this
    // extra switch and anyone can feel free to implement one of them.
    case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
      Expcnt = OpImm->getImm();
      break;
    case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
      Lgkmcnt = OpImm->getImm();
      break;
    case AMDGPU::S_WAITCNT_VMCNT_gfx10:
      Vmcnt = OpImm->getImm();
      break;
    case AMDGPU::S_WAITCNT_VSCNT_gfx10:
      Vscnt = OpImm->getImm();
      break;
    }
    return;
  }
  case AMDGPU::S_WAITCNT_gfx10:
  case AMDGPU::S_WAITCNT_gfx6_gfx7:
  case AMDGPU::S_WAITCNT_vi:
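    // The combined form packs every counter into a single immediate;
    // decodeWaitcnt() unpacks it using the field layout for this subtarget's
    // ISA version.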
    unsigned WaitCnt = Inst.getOperand(0)->getImm();
    AMDGPU::decodeWaitcnt(IV, WaitCnt, Vmcnt, Expcnt, Lgkmcnt);
    return;
  }
}

void AMDGPUCustomBehaviour::generateWaitCntInfo() {
  // The core logic of this function is taken from
  // SIInsertWaitcnts::updateEventWaitcntAfter(). In that pass, the
  // instructions being looked at are in MachineInstr form, whereas here we
  // only have access to the MCInst form. As a side effect, we can't use the
  // mayAccessVMEMThroughFlat(Inst) or mayAccessLDSThroughFlat(Inst)
  // functions, so we conservatively assume that they would return true. This
  // may cause a few instructions to be incorrectly tagged with an extra CNT;
  // however, since those instructions interact with at least one CNT anyway,
  // the extra tag shouldn't cause issues in most scenarios.
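  // For each instruction in the source region we record which counters it
  // increments; handleWaitCnt() later compares those flags against the
  // limits requested by an s_waitcnt to decide how many cycles to stall.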
  AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(STI.getCPU());
  InstrWaitCntInfo.resize(SrcMgr.size());

  for (const auto &EN : llvm::enumerate(SrcMgr.getInstructions())) {
    const std::unique_ptr<Instruction> &Inst = EN.value();
    unsigned Index = EN.index();
    unsigned Opcode = Inst->getOpcode();
    const MCInstrDesc &MCID = MCII.get(Opcode);
    if ((MCID.TSFlags & SIInstrFlags::DS) &&
        (MCID.TSFlags & SIInstrFlags::LGKM_CNT)) {
      InstrWaitCntInfo[Index].LgkmCnt = true;
      if (isAlwaysGDS(Opcode) || hasModifiersSet(Inst, AMDGPU::OpName::gds))
        InstrWaitCntInfo[Index].ExpCnt = true;
    } else if (MCID.TSFlags & SIInstrFlags::FLAT) {
      // We conservatively assume that mayAccessVMEMThroughFlat(Inst)
      // and mayAccessLDSThroughFlat(Inst) would both return true for this
      // instruction. We have to do this because those functions use
      // information about the memory operands that we don't have access to.
      InstrWaitCntInfo[Index].LgkmCnt = true;
      if (!STI.hasFeature(AMDGPU::FeatureVscnt))
        InstrWaitCntInfo[Index].VmCnt = true;
      else if (MCID.mayLoad() && !(MCID.TSFlags & SIInstrFlags::IsAtomicNoRet))
        InstrWaitCntInfo[Index].VmCnt = true;
      else
        InstrWaitCntInfo[Index].VsCnt = true;
    } else if (isVMEM(MCID) && !AMDGPU::getMUBUFIsBufferInv(Opcode)) {
      if (!STI.hasFeature(AMDGPU::FeatureVscnt))
        InstrWaitCntInfo[Index].VmCnt = true;
      else if ((MCID.mayLoad() &&
                !(MCID.TSFlags & SIInstrFlags::IsAtomicNoRet)) ||
               ((MCID.TSFlags & SIInstrFlags::MIMG) && !MCID.mayLoad() &&
                !MCID.mayStore()))
        InstrWaitCntInfo[Index].VmCnt = true;
      else if (MCID.mayStore())
        InstrWaitCntInfo[Index].VsCnt = true;

      // (IV.Major < 7) is meant to represent
      // GCNTarget.vmemWriteNeedsExpWaitcnt()
      // which is defined as
      // { return getGeneration() < SEA_ISLANDS; }
      if (IV.Major < 7 &&
          (MCID.mayStore() || (MCID.TSFlags & SIInstrFlags::IsAtomicRet)))
        InstrWaitCntInfo[Index].ExpCnt = true;
    } else if (MCID.TSFlags & SIInstrFlags::SMRD) {
      InstrWaitCntInfo[Index].LgkmCnt = true;
    } else if (MCID.TSFlags & SIInstrFlags::EXP) {
      InstrWaitCntInfo[Index].ExpCnt = true;
    } else {
      switch (Opcode) {
      case AMDGPU::S_SENDMSG:
      case AMDGPU::S_SENDMSGHALT:
      case AMDGPU::S_MEMTIME:
      case AMDGPU::S_MEMREALTIME:
        InstrWaitCntInfo[Index].LgkmCnt = true;
        break;
      }
    }
  }
}

// taken from SIInstrInfo::isVMEM()
bool AMDGPUCustomBehaviour::isVMEM(const MCInstrDesc &MCID) {
  return MCID.TSFlags & SIInstrFlags::MUBUF ||
         MCID.TSFlags & SIInstrFlags::MTBUF ||
         MCID.TSFlags & SIInstrFlags::MIMG || MCID.TSFlags & SIInstrFlags::FLAT;
}

// taken from SIInstrInfo::hasModifiersSet()
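// For example, a DS instruction assembled with the "gds" modifier carries a
// non-zero "gds" immediate operand, so a query for AMDGPU::OpName::gds
// returns true.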
bool AMDGPUCustomBehaviour::hasModifiersSet(
    const std::unique_ptr<Instruction> &Inst, AMDGPU::OpName OpName) const {
  int Idx = AMDGPU::getNamedOperandIdx(Inst->getOpcode(), OpName);
  if (Idx == -1)
    return false;

  const MCAOperand *Op = Inst->getOperand(Idx);
  if (Op == nullptr || !Op->isImm() || !Op->getImm())
    return false;

  return true;
}

// taken from SIInstrInfo::isGWS()
bool AMDGPUCustomBehaviour::isGWS(uint16_t Opcode) const {
  const MCInstrDesc &MCID = MCII.get(Opcode);
  return MCID.TSFlags & SIInstrFlags::GWS;
}

// taken from SIInstrInfo::isAlwaysGDS()
bool AMDGPUCustomBehaviour::isAlwaysGDS(uint16_t Opcode) const {
  return Opcode == AMDGPU::DS_ORDERED_COUNT ||
         Opcode == AMDGPU::DS_ADD_GS_REG_RTN ||
         Opcode == AMDGPU::DS_SUB_GS_REG_RTN || isGWS(Opcode);
}

} // namespace llvm::mca

using namespace llvm;
using namespace mca;

static CustomBehaviour *
createAMDGPUCustomBehaviour(const MCSubtargetInfo &STI,
                            const mca::SourceMgr &SrcMgr,
                            const MCInstrInfo &MCII) {
  return new AMDGPUCustomBehaviour(STI, SrcMgr, MCII);
}

static InstrPostProcess *
createAMDGPUInstrPostProcess(const MCSubtargetInfo &STI,
                             const MCInstrInfo &MCII) {
  return new AMDGPUInstrPostProcess(STI, MCII);
}

/// Extern function to initialize the targets for the AMDGPU backend.
extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
LLVMInitializeAMDGPUTargetMCA() {
  TargetRegistry::RegisterCustomBehaviour(getTheR600Target(),
                                          createAMDGPUCustomBehaviour);
  TargetRegistry::RegisterInstrPostProcess(getTheR600Target(),
                                           createAMDGPUInstrPostProcess);

  TargetRegistry::RegisterCustomBehaviour(getTheGCNTarget(),
                                          createAMDGPUCustomBehaviour);
  TargetRegistry::RegisterInstrPostProcess(getTheGCNTarget(),
                                           createAMDGPUInstrPostProcess);
}
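
// Both targets share the same hooks, which llvm-mca picks up automatically
// when it is run with an AMDGPU triple. (llvm-mca's -disable-cb flag should
// fall back to the default custom behaviour if that is ever needed.)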