AMDGPUCustomBehaviour.cpp source code [llvm_projects/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp]

1	//===------------------ AMDGPUCustomBehaviour.cpp ---------------*- C++ -*-===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	/// \file
9	///
10	/// This file implements methods from the AMDGPUCustomBehaviour class.
11	///
12	//===----------------------------------------------------------------------===//
13
14	#include "AMDGPUCustomBehaviour.h"
15	#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
16	#include "TargetInfo/AMDGPUTargetInfo.h"
17	#include "Utils/AMDGPUBaseInfo.h"
18	#include "llvm/MC/TargetRegistry.h"
19	#include "llvm/Support/Compiler.h"
20	#include "llvm/Support/WithColor.h"
21	#include "llvm/TargetParser/AMDGPUTargetParser.h"
22
23	namespace llvm::mca {
24
25	void AMDGPUInstrPostProcess::postProcessInstruction(Instruction &Inst,
26	const MCInst &MCI) {
27	switch (MCI.getOpcode()) {
28	case AMDGPU::S_WAITCNT:
29	case AMDGPU::S_WAITCNT_soft:
30	case AMDGPU::S_WAITCNT_EXPCNT:
31	case AMDGPU::S_WAITCNT_LGKMCNT:
32	case AMDGPU::S_WAITCNT_VMCNT:
33	case AMDGPU::S_WAITCNT_VSCNT:
34	case AMDGPU::S_WAITCNT_VSCNT_soft:
35	case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
36	case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
37	case AMDGPU::S_WAITCNT_VMCNT_gfx10:
38	case AMDGPU::S_WAITCNT_VSCNT_gfx10:
39	case AMDGPU::S_WAITCNT_gfx10:
40	case AMDGPU::S_WAITCNT_gfx6_gfx7:
41	case AMDGPU::S_WAITCNT_vi:
42	return processWaitCnt(Inst, MCI);
43	}
44	}
45
46	// s_waitcnt instructions encode important information as immediate operands
47	// which are lost during the MCInst -> mca::Instruction lowering.
48	void AMDGPUInstrPostProcess::processWaitCnt(Instruction &Inst,
49	const MCInst &MCI) {
50	for (int Idx = `0`, N = MCI.size(); Idx < N; Idx++) {
51	MCAOperand Op;
52	const MCOperand &MCOp = MCI.getOperand(i: Idx);
53	if (MCOp.isReg()) {
54	Op = MCAOperand::createReg(Reg: MCOp.getReg());
55	} else if (MCOp.isImm()) {
56	Op = MCAOperand::createImm(Val: MCOp.getImm());
57	}
58	Op.setIndex(Idx);
59	Inst.addOperand(Op);
60	}
61	}
62
63	AMDGPUCustomBehaviour::AMDGPUCustomBehaviour(const MCSubtargetInfo &STI,
64	const mca::SourceMgr &SrcMgr,
65	const MCInstrInfo &MCII)
66	: CustomBehaviour (STI, SrcMgr, MCII) {
67	generateWaitCntInfo();
68	}
69
70	unsigned AMDGPUCustomBehaviour::checkCustomHazard(ArrayRef<InstRef> IssuedInst,
71	const InstRef &IR) {
72	const Instruction &Inst = *IR.getInstruction();
73	unsigned Opcode = Inst.getOpcode();
74
75	// llvm-mca is generally run on fully compiled assembly so we wouldn't see any
76	// pseudo instructions here. However, there are plans for the future to make
77	// it possible to use mca within backend passes. As such, I have left the
78	// pseudo version of s_waitcnt within this switch statement.
79	switch (Opcode) {
80	default:
81	return `0`;
82	case AMDGPU::S_WAITCNT: // This instruction
83	case AMDGPU::S_WAITCNT_soft:
84	case AMDGPU::S_WAITCNT_EXPCNT:
85	case AMDGPU::S_WAITCNT_LGKMCNT:
86	case AMDGPU::S_WAITCNT_VMCNT:
87	case AMDGPU::S_WAITCNT_VSCNT:
88	case AMDGPU::S_WAITCNT_VSCNT_soft: // to this instruction are all pseudo.
89	case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
90	case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
91	case AMDGPU::S_WAITCNT_VMCNT_gfx10:
92	case AMDGPU::S_WAITCNT_VSCNT_gfx10:
93	case AMDGPU::S_WAITCNT_gfx10:
94	case AMDGPU::S_WAITCNT_gfx6_gfx7:
95	case AMDGPU::S_WAITCNT_vi:
96	// s_endpgm also behaves as if there is an implicit
97	// s_waitcnt 0, but I'm not sure if it would be appropriate
98	// to model this in llvm-mca based on how the iterations work
99	// while simulating the pipeline over and over.
100	return handleWaitCnt(IssuedInst, IR);
101	}
102
103	return `0`;
104	}
105
106	unsigned AMDGPUCustomBehaviour::handleWaitCnt(ArrayRef<InstRef> IssuedInst,
107	const InstRef &IR) {
108	// Currently, all s_waitcnt instructions are handled except s_waitcnt_depctr.
109	// I do not know how that instruction works so I did not attempt to model it.
110	// set the max values to begin
111	unsigned Vmcnt = `63`;
112	unsigned Expcnt = `7`;
113	unsigned Lgkmcnt = `31`;
114	unsigned Vscnt = `63`;
115	unsigned CurrVmcnt = `0`;
116	unsigned CurrExpcnt = `0`;
117	unsigned CurrLgkmcnt = `0`;
118	unsigned CurrVscnt = `0`;
119	unsigned CyclesToWaitVm = ~`0U`;
120	unsigned CyclesToWaitExp = ~`0U`;
121	unsigned CyclesToWaitLgkm = ~`0U`;
122	unsigned CyclesToWaitVs = ~`0U`;
123
124	computeWaitCnt(IR, Vmcnt, Expcnt, Lgkmcnt, Vscnt);
125
126	// We will now look at each of the currently executing instructions
127	// to find out if this wait instruction still needs to wait.
128	for (const InstRef &PrevIR : IssuedInst) {
129	const Instruction &PrevInst = *PrevIR.getInstruction();
130	const unsigned PrevInstIndex = PrevIR.getSourceIndex() % SrcMgr.size();
131	const WaitCntInfo &PrevInstWaitInfo = InstrWaitCntInfo [PrevInstIndex];
132	const int CyclesLeft = PrevInst.getCyclesLeft();
133	assert(CyclesLeft != UNKNOWN_CYCLES &&
134	"We should know how many cycles are left for this instruction");
135	if (PrevInstWaitInfo.VmCnt) {
136	CurrVmcnt++;
137	if ((unsigned)CyclesLeft < CyclesToWaitVm)
138	CyclesToWaitVm = CyclesLeft;
139	}
140	if (PrevInstWaitInfo.ExpCnt) {
141	CurrExpcnt++;
142	if ((unsigned)CyclesLeft < CyclesToWaitExp)
143	CyclesToWaitExp = CyclesLeft;
144	}
145	if (PrevInstWaitInfo.LgkmCnt) {
146	CurrLgkmcnt++;
147	if ((unsigned)CyclesLeft < CyclesToWaitLgkm)
148	CyclesToWaitLgkm = CyclesLeft;
149	}
150	if (PrevInstWaitInfo.VsCnt) {
151	CurrVscnt++;
152	if ((unsigned)CyclesLeft < CyclesToWaitVs)
153	CyclesToWaitVs = CyclesLeft;
154	}
155	}
156
157	unsigned CyclesToWait = ~`0U`;
158	if (CurrVmcnt > Vmcnt && CyclesToWaitVm < CyclesToWait)
159	CyclesToWait = CyclesToWaitVm;
160	if (CurrExpcnt > Expcnt && CyclesToWaitExp < CyclesToWait)
161	CyclesToWait = CyclesToWaitExp;
162	if (CurrLgkmcnt > Lgkmcnt && CyclesToWaitLgkm < CyclesToWait)
163	CyclesToWait = CyclesToWaitLgkm;
164	if (CurrVscnt > Vscnt && CyclesToWaitVs < CyclesToWait)
165	CyclesToWait = CyclesToWaitVs;
166
167	// We may underestimate how many cycles we need to wait, but this
168	// isn't a big deal. Our return value is just how many cycles until
169	// this function gets run again. So as long as we don't overestimate
170	// the wait time, we'll still end up stalling at this instruction
171	// for the correct number of cycles.
172
173	if (CyclesToWait == ~`0U`)
174	return `0`;
175	return CyclesToWait;
176	}
177
178	void AMDGPUCustomBehaviour::computeWaitCnt(const InstRef &IR, unsigned &Vmcnt,
179	unsigned &Expcnt, unsigned &Lgkmcnt,
180	unsigned &Vscnt) {
181	AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(GPU: STI.getCPU());
182	const Instruction &Inst = *IR.getInstruction();
183	unsigned Opcode = Inst.getOpcode();
184
185	switch (Opcode) {
186	case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
187	case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
188	case AMDGPU::S_WAITCNT_VMCNT_gfx10:
189	case AMDGPU::S_WAITCNT_VSCNT_gfx10: {
190	// Should probably be checking for nullptr
191	// here, but I'm not sure how I should handle the case
192	// where we see a nullptr.
193	const MCAOperand *OpReg = Inst.getOperand(Idx: `0`);
194	const MCAOperand *OpImm = Inst.getOperand(Idx: `1`);
195	assert(OpReg && OpReg->isReg() && "First operand should be a register.");
196	assert(OpImm && OpImm->isImm() && "Second operand should be an immediate.");
197	if (OpReg->getReg() != AMDGPU::SGPR_NULL) {
198	// Instruction is using a real register.
199	// Since we can't know what value this register will have,
200	// we can't compute what the value of this wait should be.
201	WithColor::warning() << "The register component of "
202	<< MCII.getName(Opcode) << " will be completely "
203	<< "ignored. So the wait may not be accurate.\n";
204	}
205	switch (Opcode) {
206	// Redundant switch so I don't have to repeat the code above
207	// for each case. There are more clever ways to avoid this
208	// extra switch and anyone can feel free to implement one of them.
209	case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
210	Expcnt = OpImm->getImm();
211	break;
212	case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
213	Lgkmcnt = OpImm->getImm();
214	break;
215	case AMDGPU::S_WAITCNT_VMCNT_gfx10:
216	Vmcnt = OpImm->getImm();
217	break;
218	case AMDGPU::S_WAITCNT_VSCNT_gfx10:
219	Vscnt = OpImm->getImm();
220	break;
221	}
222	return;
223	}
224	case AMDGPU::S_WAITCNT_gfx10:
225	case AMDGPU::S_WAITCNT_gfx6_gfx7:
226	case AMDGPU::S_WAITCNT_vi:
227	unsigned WaitCnt = Inst.getOperand(Idx: `0`)->getImm();
228	AMDGPU::decodeWaitcnt(Version: IV, Waitcnt: WaitCnt, Vmcnt, Expcnt, Lgkmcnt);
229	return;
230	}
231	}
232
233	void AMDGPUCustomBehaviour::generateWaitCntInfo() {
234	// The core logic from this function is taken from
235	// SIInsertWaitcnts::updateEventWaitcntAfter() In that pass, the instructions
236	// that are being looked at are in the MachineInstr format, whereas we have
237	// access to the MCInst format. The side effects of this are that we can't use
238	// the mayAccessVMEMThroughFlat(Inst) or mayAccessLDSThroughFlat(Inst)
239	// functions. Therefore, we conservatively assume that these functions will
240	// return true. This may cause a few instructions to be incorrectly tagged
241	// with an extra CNT. However, these are instructions that do interact with at
242	// least one CNT so giving them an extra CNT shouldn't cause issues in most
243	// scenarios.
244	AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(GPU: STI.getCPU());
245	InstrWaitCntInfo.resize(new_size: SrcMgr.size());
246
247	for (const auto &EN : llvm::enumerate(First: SrcMgr.getInstructions())) {
248	const std::unique_ptr<Instruction> &Inst = EN.value();
249	unsigned Index = EN.index();
250	unsigned Opcode = Inst ->getOpcode();
251	const MCInstrDesc &MCID = MCII.get(Opcode);
252	if (SIInstrFlags::isDS(O: MCID) && SIInstrFlags::usesLGKM_CNT(O: MCID)) {
253	InstrWaitCntInfo [Index].LgkmCnt = true;
254	if (isAlwaysGDS(Opcode) \|\| hasModifiersSet(Inst, OpName: AMDGPU::OpName::gds))
255	InstrWaitCntInfo [Index].ExpCnt = true;
256	} else if (SIInstrFlags::isFLAT(O: MCID)) {
257	// We conservatively assume that mayAccessVMEMThroughFlat(Inst)
258	// and mayAccessLDSThroughFlat(Inst) would both return true for this
259	// instruction. We have to do this because those functions use
260	// information about the memory operands that we don't have access to.
261	InstrWaitCntInfo [Index].LgkmCnt = true;
262	if (!STI.hasFeature(Feature: AMDGPU::FeatureVscnt))
263	InstrWaitCntInfo [Index].VmCnt = true;
264	else if (MCID.mayLoad() && !SIInstrFlags::isAtomicNoRet(O: MCID))
265	InstrWaitCntInfo [Index].VmCnt = true;
266	else
267	InstrWaitCntInfo [Index].VsCnt = true;
268	} else if (SIInstrFlags::isVMEM(O: MCID) &&
269	!AMDGPU::getMUBUFIsBufferInv(Opc: Opcode)) {
270	if (!STI.hasFeature(Feature: AMDGPU::FeatureVscnt))
271	InstrWaitCntInfo [Index].VmCnt = true;
272	else if ((MCID.mayLoad() && !SIInstrFlags::isAtomicNoRet(O: MCID)) \|\|
273	(SIInstrFlags::isMIMG(O: MCID) && !MCID.mayLoad() &&
274	!MCID.mayStore()))
275	InstrWaitCntInfo [Index].VmCnt = true;
276	else if (MCID.mayStore())
277	InstrWaitCntInfo [Index].VsCnt = true;
278
279	// (IV.Major < 7) is meant to represent
280	// GCNTarget.vmemWriteNeedsExpWaitcnt()
281	// which is defined as
282	// { return getGeneration() < SEA_ISLANDS; }
283	if (IV.Major < `7` && (MCID.mayStore() \|\| SIInstrFlags::isAtomicRet(O: MCID)))
284	InstrWaitCntInfo [Index].ExpCnt = true;
285	} else if (SIInstrFlags::isSMRD(O: MCID)) {
286	InstrWaitCntInfo [Index].LgkmCnt = true;
287	} else if (SIInstrFlags::isEXP(O: MCID)) {
288	InstrWaitCntInfo [Index].ExpCnt = true;
289	} else {
290	switch (Opcode) {
291	case AMDGPU::S_SENDMSG:
292	case AMDGPU::S_SENDMSGHALT:
293	case AMDGPU::S_MEMTIME:
294	case AMDGPU::S_MEMREALTIME:
295	InstrWaitCntInfo [Index].LgkmCnt = true;
296	break;
297	}
298	}
299	}
300	}
301
302	// taken from SIInstrInfo::hasModifiersSet()
303	bool AMDGPUCustomBehaviour::hasModifiersSet(
304	const std::unique_ptr<Instruction> &Inst, AMDGPU::OpName OpName) const {
305	int Idx = AMDGPU::getNamedOperandIdx(Opcode: Inst ->getOpcode(), Name: OpName);
306	if (Idx == -`1`)
307	return false;
308
309	const MCAOperand *Op = Inst ->getOperand(Idx);
310	if (Op == nullptr \|\| !Op->isImm() \|\| !Op->getImm())
311	return false;
312
313	return true;
314	}
315
316	// taken from SIInstrInfo::isAlwaysGDS()
317	bool AMDGPUCustomBehaviour::isAlwaysGDS(uint32_t Opcode) const {
318	return Opcode == AMDGPU::DS_ORDERED_COUNT \|\|
319	Opcode == AMDGPU::DS_ADD_GS_REG_RTN \|\|
320	Opcode == AMDGPU::DS_SUB_GS_REG_RTN \|\|
321	SIInstrFlags::isGWS(O: MCII, O: Opcode);
322	}
323
324	} // namespace llvm::mca
325
326	using namespace llvm;
327	using namespace mca;
328
329	static CustomBehaviour *
330	createAMDGPUCustomBehaviour(const MCSubtargetInfo &STI,
331	const mca::SourceMgr &SrcMgr,
332	const MCInstrInfo &MCII) {
333	return new AMDGPUCustomBehaviour (STI, SrcMgr, MCII);
334	}
335
336	static InstrPostProcess *
337	createAMDGPUInstrPostProcess(const MCSubtargetInfo &STI,
338	const MCInstrInfo &MCII) {
339	return new AMDGPUInstrPostProcess (STI, MCII);
340	}
341
342	/// Extern function to initialize the targets for the AMDGPU backend
343
344	extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
345	LLVMInitializeAMDGPUTargetMCA() {
346	TargetRegistry::RegisterCustomBehaviour(T&: getTheR600Target(),
347	Fn: createAMDGPUCustomBehaviour);
348	TargetRegistry::RegisterInstrPostProcess(T&: getTheR600Target(),
349	Fn: createAMDGPUInstrPostProcess);
350
351	TargetRegistry::RegisterCustomBehaviour(T&: getTheGCNTarget(),
352	Fn: createAMDGPUCustomBehaviour);
353	TargetRegistry::RegisterInstrPostProcess(T&: getTheGCNTarget(),
354	Fn: createAMDGPUInstrPostProcess);
355
356	TargetRegistry::RegisterCustomBehaviour(T&: getTheGCNLegacyTarget(),
357	Fn: createAMDGPUCustomBehaviour);
358	TargetRegistry::RegisterInstrPostProcess(T&: getTheGCNLegacyTarget(),
359	Fn: createAMDGPUInstrPostProcess);
360	}
361

Browse the source code of llvm_projects/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp