//===------------------ AMDGPUCustomBehaviour.cpp ---------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
///
/// This file implements methods from the AMDGPUCustomBehaviour class.
///
//===----------------------------------------------------------------------===//

#include "AMDGPUCustomBehaviour.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "TargetInfo/AMDGPUTargetInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/WithColor.h"

namespace llvm::mca {

void AMDGPUInstrPostProcess::postProcessInstruction(
    std::unique_ptr<Instruction> &Inst, const MCInst &MCI) {
  switch (MCI.getOpcode()) {
  case AMDGPU::S_WAITCNT:
  case AMDGPU::S_WAITCNT_soft:
  case AMDGPU::S_WAITCNT_EXPCNT:
  case AMDGPU::S_WAITCNT_LGKMCNT:
  case AMDGPU::S_WAITCNT_VMCNT:
  case AMDGPU::S_WAITCNT_VSCNT:
  case AMDGPU::S_WAITCNT_VSCNT_soft:
  case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
  case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VSCNT_gfx10:
  case AMDGPU::S_WAITCNT_gfx10:
  case AMDGPU::S_WAITCNT_gfx6_gfx7:
  case AMDGPU::S_WAITCNT_vi:
    return processWaitCnt(Inst, MCI);
  }
}

// s_waitcnt instructions encode important information as immediate operands
// which are lost during the MCInst -> mca::Instruction lowering.
void AMDGPUInstrPostProcess::processWaitCnt(std::unique_ptr<Instruction> &Inst,
                                            const MCInst &MCI) {
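  // Forward every register and immediate operand from the MCInst into the
  // mca::Instruction so that AMDGPUCustomBehaviour::computeWaitCnt() can read
  // the waitcnt payload after lowering.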
  for (int Idx = 0, N = MCI.size(); Idx < N; Idx++) {
    MCAOperand Op;
    const MCOperand &MCOp = MCI.getOperand(Idx);
    if (MCOp.isReg()) {
      Op = MCAOperand::createReg(MCOp.getReg());
    } else if (MCOp.isImm()) {
      Op = MCAOperand::createImm(MCOp.getImm());
    }
    Op.setIndex(Idx);
    Inst->addOperand(Op);
  }
}

AMDGPUCustomBehaviour::AMDGPUCustomBehaviour(const MCSubtargetInfo &STI,
                                             const mca::SourceMgr &SrcMgr,
                                             const MCInstrInfo &MCII)
    : CustomBehaviour(STI, SrcMgr, MCII) {
  generateWaitCntInfo();
}

unsigned AMDGPUCustomBehaviour::checkCustomHazard(ArrayRef<InstRef> IssuedInst,
                                                  const InstRef &IR) {
  const Instruction &Inst = *IR.getInstruction();
  unsigned Opcode = Inst.getOpcode();

  // llvm-mca is generally run on fully compiled assembly so we wouldn't see
  // any pseudo instructions here. However, there are plans for the future to
  // make it possible to use mca within backend passes. As such, I have left
  // the pseudo version of s_waitcnt within this switch statement.
  switch (Opcode) {
  default:
    return 0;
  case AMDGPU::S_WAITCNT: // This instruction
  case AMDGPU::S_WAITCNT_soft:
  case AMDGPU::S_WAITCNT_EXPCNT:
  case AMDGPU::S_WAITCNT_LGKMCNT:
  case AMDGPU::S_WAITCNT_VMCNT:
  case AMDGPU::S_WAITCNT_VSCNT:
  case AMDGPU::S_WAITCNT_VSCNT_soft: // to this instruction are all pseudo.
  case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
  case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VSCNT_gfx10:
  case AMDGPU::S_WAITCNT_gfx10:
  case AMDGPU::S_WAITCNT_gfx6_gfx7:
  case AMDGPU::S_WAITCNT_vi:
    // s_endpgm also behaves as if there is an implicit
    // s_waitcnt 0, but I'm not sure if it would be appropriate
    // to model this in llvm-mca based on how the iterations work
    // while simulating the pipeline over and over.
    return handleWaitCnt(IssuedInst, IR);
  }

  return 0;
}

unsigned AMDGPUCustomBehaviour::handleWaitCnt(ArrayRef<InstRef> IssuedInst,
                                              const InstRef &IR) {
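  // The value returned to checkCustomHazard() is the number of cycles this
  // s_waitcnt should keep stalling; returning 0 lets it issue immediately.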
  // Currently, all s_waitcnt instructions are handled except s_waitcnt_depctr.
  // I do not know how that instruction works so I did not attempt to model it.
  // Start each counter at its maximum value.
  unsigned Vmcnt = 63;
  unsigned Expcnt = 7;
  unsigned Lgkmcnt = 31;
  unsigned Vscnt = 63;
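  // A counter that is still at its maximum after computeWaitCnt() was not
  // named by this s_waitcnt, so the checks below effectively ignore it.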
  unsigned CurrVmcnt = 0;
  unsigned CurrExpcnt = 0;
  unsigned CurrLgkmcnt = 0;
  unsigned CurrVscnt = 0;
  unsigned CyclesToWaitVm = ~0U;
  unsigned CyclesToWaitExp = ~0U;
  unsigned CyclesToWaitLgkm = ~0U;
  unsigned CyclesToWaitVs = ~0U;

  computeWaitCnt(IR, Vmcnt, Expcnt, Lgkmcnt, Vscnt);

  // We will now look at each of the currently executing instructions
  // to find out if this wait instruction still needs to wait.
  for (const InstRef &PrevIR : IssuedInst) {
    const Instruction &PrevInst = *PrevIR.getInstruction();
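    // Source indices keep growing as llvm-mca replays the input region for
    // each iteration, so wrap them to index into InstrWaitCntInfo, which holds
    // one entry per instruction in the source region.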
    const unsigned PrevInstIndex = PrevIR.getSourceIndex() % SrcMgr.size();
    const WaitCntInfo &PrevInstWaitInfo = InstrWaitCntInfo[PrevInstIndex];
    const int CyclesLeft = PrevInst.getCyclesLeft();
    assert(CyclesLeft != UNKNOWN_CYCLES &&
           "We should know how many cycles are left for this instruction");
    if (PrevInstWaitInfo.VmCnt) {
      CurrVmcnt++;
      if ((unsigned)CyclesLeft < CyclesToWaitVm)
        CyclesToWaitVm = CyclesLeft;
    }
    if (PrevInstWaitInfo.ExpCnt) {
      CurrExpcnt++;
      if ((unsigned)CyclesLeft < CyclesToWaitExp)
        CyclesToWaitExp = CyclesLeft;
    }
    if (PrevInstWaitInfo.LgkmCnt) {
      CurrLgkmcnt++;
      if ((unsigned)CyclesLeft < CyclesToWaitLgkm)
        CyclesToWaitLgkm = CyclesLeft;
    }
    if (PrevInstWaitInfo.VsCnt) {
      CurrVscnt++;
      if ((unsigned)CyclesLeft < CyclesToWaitVs)
        CyclesToWaitVs = CyclesLeft;
    }
  }

  unsigned CyclesToWait = ~0U;
  if (CurrVmcnt > Vmcnt && CyclesToWaitVm < CyclesToWait)
    CyclesToWait = CyclesToWaitVm;
  if (CurrExpcnt > Expcnt && CyclesToWaitExp < CyclesToWait)
    CyclesToWait = CyclesToWaitExp;
  if (CurrLgkmcnt > Lgkmcnt && CyclesToWaitLgkm < CyclesToWait)
    CyclesToWait = CyclesToWaitLgkm;
  if (CurrVscnt > Vscnt && CyclesToWaitVs < CyclesToWait)
    CyclesToWait = CyclesToWaitVs;

  // We may underestimate how many cycles we need to wait, but this
  // isn't a big deal. Our return value is just how many cycles until
  // this function gets run again. So as long as we don't overestimate
  // the wait time, we'll still end up stalling at this instruction
  // for the correct number of cycles.

  if (CyclesToWait == ~0U)
    return 0;
  return CyclesToWait;
}

void AMDGPUCustomBehaviour::computeWaitCnt(const InstRef &IR, unsigned &Vmcnt,
                                           unsigned &Expcnt, unsigned &Lgkmcnt,
                                           unsigned &Vscnt) {
  AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(STI.getCPU());
  const Instruction &Inst = *IR.getInstruction();
  unsigned Opcode = Inst.getOpcode();

  switch (Opcode) {
  case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
  case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VSCNT_gfx10: {
    // Should probably be checking for nullptr
    // here, but I'm not sure how I should handle the case
    // where we see a nullptr.
    const MCAOperand *OpReg = Inst.getOperand(0);
    const MCAOperand *OpImm = Inst.getOperand(1);
    assert(OpReg && OpReg->isReg() && "First operand should be a register.");
    assert(OpImm && OpImm->isImm() && "Second operand should be an immediate.");
    if (OpReg->getReg() != AMDGPU::SGPR_NULL) {
      // Instruction is using a real register.
      // Since we can't know what value this register will have,
      // we can't compute what the value of this wait should be.
      WithColor::warning() << "The register component of "
                           << MCII.getName(Opcode) << " will be completely "
                           << "ignored. So the wait may not be accurate.\n";
    }
    switch (Opcode) {
    // Redundant switch so I don't have to repeat the code above
    // for each case. There are more clever ways to avoid this
    // extra switch and anyone can feel free to implement one of them.
    case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
      Expcnt = OpImm->getImm();
      break;
    case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
      Lgkmcnt = OpImm->getImm();
      break;
    case AMDGPU::S_WAITCNT_VMCNT_gfx10:
      Vmcnt = OpImm->getImm();
      break;
    case AMDGPU::S_WAITCNT_VSCNT_gfx10:
      Vscnt = OpImm->getImm();
      break;
    }
    return;
  }
  case AMDGPU::S_WAITCNT_gfx10:
  case AMDGPU::S_WAITCNT_gfx6_gfx7:
  case AMDGPU::S_WAITCNT_vi:
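    // These forms encode all counters in a single packed simm16 immediate.
    // decodeWaitcnt() splits that payload into vmcnt/expcnt/lgkmcnt according
    // to the ISA version; the exact bit layout differs between generations.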
    unsigned WaitCnt = Inst.getOperand(0)->getImm();
    AMDGPU::decodeWaitcnt(IV, WaitCnt, Vmcnt, Expcnt, Lgkmcnt);
    return;
  }
}

void AMDGPUCustomBehaviour::generateWaitCntInfo() {
  // The core logic from this function is taken from
  // SIInsertWaitcnts::updateEventWaitcntAfter(). In that pass, the
  // instructions being looked at are in the MachineInstr format, whereas we
  // only have access to the MCInst format. A side effect of this is that we
  // can't use the mayAccessVMEMThroughFlat(Inst) or
  // mayAccessLDSThroughFlat(Inst) functions. Therefore, we conservatively
  // assume that these functions will return true. This may cause a few
  // instructions to be incorrectly tagged with an extra CNT. However, these
  // are instructions that do interact with at least one CNT, so giving them
  // an extra CNT shouldn't cause issues in most scenarios.
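  // For example, under this conservative approximation a FLAT load such as
  // flat_load_dword is tagged below as waiting on both lgkmcnt and vmcnt.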
  AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(STI.getCPU());
  InstrWaitCntInfo.resize(SrcMgr.size());

  for (const auto &EN : llvm::enumerate(SrcMgr.getInstructions())) {
    const std::unique_ptr<Instruction> &Inst = EN.value();
    unsigned Index = EN.index();
    unsigned Opcode = Inst->getOpcode();
    const MCInstrDesc &MCID = MCII.get(Opcode);
    if ((MCID.TSFlags & SIInstrFlags::DS) &&
        (MCID.TSFlags & SIInstrFlags::LGKM_CNT)) {
      InstrWaitCntInfo[Index].LgkmCnt = true;
      if (isAlwaysGDS(Opcode) || hasModifiersSet(Inst, AMDGPU::OpName::gds))
        InstrWaitCntInfo[Index].ExpCnt = true;
    } else if (MCID.TSFlags & SIInstrFlags::FLAT) {
      // We conservatively assume that mayAccessVMEMThroughFlat(Inst)
      // and mayAccessLDSThroughFlat(Inst) would both return true for this
      // instruction. We have to do this because those functions use
      // information about the memory operands that we don't have access to.
      InstrWaitCntInfo[Index].LgkmCnt = true;
      if (!STI.hasFeature(AMDGPU::FeatureVscnt))
        InstrWaitCntInfo[Index].VmCnt = true;
      else if (MCID.mayLoad() && !(MCID.TSFlags & SIInstrFlags::IsAtomicNoRet))
        InstrWaitCntInfo[Index].VmCnt = true;
      else
        InstrWaitCntInfo[Index].VsCnt = true;
    } else if (isVMEM(MCID) && !AMDGPU::getMUBUFIsBufferInv(Opcode)) {
      if (!STI.hasFeature(AMDGPU::FeatureVscnt))
        InstrWaitCntInfo[Index].VmCnt = true;
      else if ((MCID.mayLoad() &&
                !(MCID.TSFlags & SIInstrFlags::IsAtomicNoRet)) ||
               ((MCID.TSFlags & SIInstrFlags::MIMG) && !MCID.mayLoad() &&
                !MCID.mayStore()))
        InstrWaitCntInfo[Index].VmCnt = true;
      else if (MCID.mayStore())
        InstrWaitCntInfo[Index].VsCnt = true;

      // (IV.Major < 7) is meant to represent
      // GCNTarget.vmemWriteNeedsExpWaitcnt()
      // which is defined as
      // { return getGeneration() < SEA_ISLANDS; }
      if (IV.Major < 7 &&
          (MCID.mayStore() || (MCID.TSFlags & SIInstrFlags::IsAtomicRet)))
        InstrWaitCntInfo[Index].ExpCnt = true;
    } else if (MCID.TSFlags & SIInstrFlags::SMRD) {
      InstrWaitCntInfo[Index].LgkmCnt = true;
    } else if (MCID.TSFlags & SIInstrFlags::EXP) {
      InstrWaitCntInfo[Index].ExpCnt = true;
    } else {
      switch (Opcode) {
      case AMDGPU::S_SENDMSG:
      case AMDGPU::S_SENDMSGHALT:
      case AMDGPU::S_MEMTIME:
      case AMDGPU::S_MEMREALTIME:
        InstrWaitCntInfo[Index].LgkmCnt = true;
        break;
      }
    }
  }
}

// taken from SIInstrInfo::isVMEM()
bool AMDGPUCustomBehaviour::isVMEM(const MCInstrDesc &MCID) {
  return MCID.TSFlags & SIInstrFlags::MUBUF ||
         MCID.TSFlags & SIInstrFlags::MTBUF ||
         MCID.TSFlags & SIInstrFlags::MIMG || MCID.TSFlags & SIInstrFlags::FLAT;
}

// taken from SIInstrInfo::hasModifiersSet()
bool AMDGPUCustomBehaviour::hasModifiersSet(
    const std::unique_ptr<Instruction> &Inst, AMDGPU::OpName OpName) const {
  int Idx = AMDGPU::getNamedOperandIdx(Inst->getOpcode(), OpName);
  if (Idx == -1)
    return false;

  const MCAOperand *Op = Inst->getOperand(Idx);
  if (Op == nullptr || !Op->isImm() || !Op->getImm())
    return false;

  return true;
}
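// hasModifiersSet() is used above to detect the 'gds' operand bit on DS
// instructions, which makes them additionally count against expcnt.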

// taken from SIInstrInfo::isGWS()
bool AMDGPUCustomBehaviour::isGWS(uint16_t Opcode) const {
  const MCInstrDesc &MCID = MCII.get(Opcode);
  return MCID.TSFlags & SIInstrFlags::GWS;
}

// taken from SIInstrInfo::isAlwaysGDS()
bool AMDGPUCustomBehaviour::isAlwaysGDS(uint16_t Opcode) const {
  return Opcode == AMDGPU::DS_ORDERED_COUNT ||
         Opcode == AMDGPU::DS_ADD_GS_REG_RTN ||
         Opcode == AMDGPU::DS_SUB_GS_REG_RTN || isGWS(Opcode);
}

} // namespace llvm::mca

using namespace llvm;
using namespace mca;

static CustomBehaviour *
createAMDGPUCustomBehaviour(const MCSubtargetInfo &STI,
                            const mca::SourceMgr &SrcMgr,
                            const MCInstrInfo &MCII) {
  return new AMDGPUCustomBehaviour(STI, SrcMgr, MCII);
}

static InstrPostProcess *
createAMDGPUInstrPostProcess(const MCSubtargetInfo &STI,
                             const MCInstrInfo &MCII) {
  return new AMDGPUInstrPostProcess(STI, MCII);
}

/// Extern function to initialize the targets for the AMDGPU backend
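/// These hooks are registered for both the R600 and GCN targets and are
/// picked up automatically when llvm-mca is invoked with an AMDGPU triple,
/// e.g. `llvm-mca -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 input.s`
/// (illustrative invocation).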
extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
LLVMInitializeAMDGPUTargetMCA() {
  TargetRegistry::RegisterCustomBehaviour(getTheR600Target(),
                                          createAMDGPUCustomBehaviour);
  TargetRegistry::RegisterInstrPostProcess(getTheR600Target(),
                                           createAMDGPUInstrPostProcess);

  TargetRegistry::RegisterCustomBehaviour(getTheGCNTarget(),
                                          createAMDGPUCustomBehaviour);
  TargetRegistry::RegisterInstrPostProcess(getTheGCNTarget(),
                                           createAMDGPUInstrPostProcess);
}