//===------------------ AMDGPUCustomBehaviour.cpp ---------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
///
/// This file implements methods from the AMDGPUCustomBehaviour class.
///
//===----------------------------------------------------------------------===//

#include "AMDGPUCustomBehaviour.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "TargetInfo/AMDGPUTargetInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/WithColor.h"

namespace llvm::mca {

void AMDGPUInstrPostProcess::postProcessInstruction(
    std::unique_ptr<Instruction> &Inst, const MCInst &MCI) {
  switch (MCI.getOpcode()) {
  case AMDGPU::S_WAITCNT:
  case AMDGPU::S_WAITCNT_soft:
  case AMDGPU::S_WAITCNT_EXPCNT:
  case AMDGPU::S_WAITCNT_LGKMCNT:
  case AMDGPU::S_WAITCNT_VMCNT:
  case AMDGPU::S_WAITCNT_VSCNT:
  case AMDGPU::S_WAITCNT_VSCNT_soft:
  case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
  case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VSCNT_gfx10:
  case AMDGPU::S_WAITCNT_gfx10:
  case AMDGPU::S_WAITCNT_gfx6_gfx7:
  case AMDGPU::S_WAITCNT_vi:
    return processWaitCnt(Inst, MCI);
  }
}

// s_waitcnt instructions encode important information as immediate operands
// which are lost during the MCInst -> mca::Instruction lowering.
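// For example, 's_waitcnt vmcnt(0) lgkmcnt(0)' carries its counter thresholds
// in an immediate operand, so we copy every MCInst operand back onto the
// mca::Instruction so checkCustomHazard()/computeWaitCnt() can read them.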
void AMDGPUInstrPostProcess::processWaitCnt(std::unique_ptr<Instruction> &Inst,
                                            const MCInst &MCI) {
  for (int Idx = 0, N = MCI.size(); Idx < N; Idx++) {
    MCAOperand Op;
    const MCOperand &MCOp = MCI.getOperand(Idx);
    if (MCOp.isReg()) {
      Op = MCAOperand::createReg(MCOp.getReg());
    } else if (MCOp.isImm()) {
      Op = MCAOperand::createImm(MCOp.getImm());
    }
    Op.setIndex(Idx);
    Inst->addOperand(Op);
  }
}

AMDGPUCustomBehaviour::AMDGPUCustomBehaviour(const MCSubtargetInfo &STI,
                                             const mca::SourceMgr &SrcMgr,
                                             const MCInstrInfo &MCII)
    : CustomBehaviour(STI, SrcMgr, MCII) {
  generateWaitCntInfo();
}

unsigned AMDGPUCustomBehaviour::checkCustomHazard(ArrayRef<InstRef> IssuedInst,
                                                  const InstRef &IR) {
  const Instruction &Inst = *IR.getInstruction();
  unsigned Opcode = Inst.getOpcode();

  // llvm-mca is generally run on fully compiled assembly, so we wouldn't
  // expect to see any pseudo instructions here. However, there are plans to
  // make it possible to use mca within backend passes in the future. As such,
  // I have left the pseudo version of s_waitcnt within this switch statement.
  switch (Opcode) {
  default:
    return 0;
  case AMDGPU::S_WAITCNT: // This instruction
  case AMDGPU::S_WAITCNT_soft:
  case AMDGPU::S_WAITCNT_EXPCNT:
  case AMDGPU::S_WAITCNT_LGKMCNT:
  case AMDGPU::S_WAITCNT_VMCNT:
  case AMDGPU::S_WAITCNT_VSCNT:
  case AMDGPU::S_WAITCNT_VSCNT_soft: // to this instruction are all pseudo.
  case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
  case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VSCNT_gfx10:
  case AMDGPU::S_WAITCNT_gfx10:
  case AMDGPU::S_WAITCNT_gfx6_gfx7:
  case AMDGPU::S_WAITCNT_vi:
    // s_endpgm also behaves as if there is an implicit
    // s_waitcnt 0, but I'm not sure if it would be appropriate
    // to model this in llvm-mca based on how the iterations work
    // while simulating the pipeline over and over.
    return handleWaitCnt(IssuedInst, IR);
  }

  return 0;
}

unsigned AMDGPUCustomBehaviour::handleWaitCnt(ArrayRef<InstRef> IssuedInst,
                                              const InstRef &IR) {
  // Currently, all s_waitcnt instructions are handled except s_waitcnt_depctr.
  // I do not know how that instruction works, so I did not attempt to model it.
  // Set each counter to its maximum value to begin with (i.e. no wait needed).
  unsigned Vmcnt = 63;
  unsigned Expcnt = 7;
  unsigned Lgkmcnt = 31;
  unsigned Vscnt = 63;
  unsigned CurrVmcnt = 0;
  unsigned CurrExpcnt = 0;
  unsigned CurrLgkmcnt = 0;
  unsigned CurrVscnt = 0;
  unsigned CyclesToWaitVm = ~0U;
  unsigned CyclesToWaitExp = ~0U;
  unsigned CyclesToWaitLgkm = ~0U;
  unsigned CyclesToWaitVs = ~0U;

  computeWaitCnt(IR, Vmcnt, Expcnt, Lgkmcnt, Vscnt);

  // We will now look at each of the currently executing instructions
  // to find out if this wait instruction still needs to wait.
  for (const InstRef &PrevIR : IssuedInst) {
    const Instruction &PrevInst = *PrevIR.getInstruction();
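    // getSourceIndex() keeps increasing across simulated iterations of the
    // source, so reduce it modulo the source size to index InstrWaitCntInfo.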
    const unsigned PrevInstIndex = PrevIR.getSourceIndex() % SrcMgr.size();
    const WaitCntInfo &PrevInstWaitInfo = InstrWaitCntInfo[PrevInstIndex];
    const int CyclesLeft = PrevInst.getCyclesLeft();
    assert(CyclesLeft != UNKNOWN_CYCLES &&
           "We should know how many cycles are left for this instruction");
    if (PrevInstWaitInfo.VmCnt) {
      CurrVmcnt++;
      if ((unsigned)CyclesLeft < CyclesToWaitVm)
        CyclesToWaitVm = CyclesLeft;
    }
    if (PrevInstWaitInfo.ExpCnt) {
      CurrExpcnt++;
      if ((unsigned)CyclesLeft < CyclesToWaitExp)
        CyclesToWaitExp = CyclesLeft;
    }
    if (PrevInstWaitInfo.LgkmCnt) {
      CurrLgkmcnt++;
      if ((unsigned)CyclesLeft < CyclesToWaitLgkm)
        CyclesToWaitLgkm = CyclesLeft;
    }
    if (PrevInstWaitInfo.VsCnt) {
      CurrVscnt++;
      if ((unsigned)CyclesLeft < CyclesToWaitVs)
        CyclesToWaitVs = CyclesLeft;
    }
  }

  unsigned CyclesToWait = ~0U;
  if (CurrVmcnt > Vmcnt && CyclesToWaitVm < CyclesToWait)
    CyclesToWait = CyclesToWaitVm;
  if (CurrExpcnt > Expcnt && CyclesToWaitExp < CyclesToWait)
    CyclesToWait = CyclesToWaitExp;
  if (CurrLgkmcnt > Lgkmcnt && CyclesToWaitLgkm < CyclesToWait)
    CyclesToWait = CyclesToWaitLgkm;
  if (CurrVscnt > Vscnt && CyclesToWaitVs < CyclesToWait)
    CyclesToWait = CyclesToWaitVs;

  // We may underestimate how many cycles we need to wait, but this
  // isn't a big deal. Our return value is just how many cycles until
  // this function gets run again. So as long as we don't overestimate
  // the wait time, we'll still end up stalling at this instruction
  // for the correct number of cycles.

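  // A result of 0 tells checkCustomHazard() that there is no outstanding
  // hazard; any other value is the number of cycles to stall before this
  // hazard is evaluated again.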
  if (CyclesToWait == ~0U)
    return 0;
  return CyclesToWait;
}

void AMDGPUCustomBehaviour::computeWaitCnt(const InstRef &IR, unsigned &Vmcnt,
                                           unsigned &Expcnt, unsigned &Lgkmcnt,
                                           unsigned &Vscnt) {
  AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(STI.getCPU());
  const Instruction &Inst = *IR.getInstruction();
  unsigned Opcode = Inst.getOpcode();

  switch (Opcode) {
  case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
  case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VSCNT_gfx10: {
    // Should probably be checking for nullptr
    // here, but I'm not sure how I should handle the case
    // where we see a nullptr.
    const MCAOperand *OpReg = Inst.getOperand(0);
    const MCAOperand *OpImm = Inst.getOperand(1);
    assert(OpReg && OpReg->isReg() && "First operand should be a register.");
    assert(OpImm && OpImm->isImm() && "Second operand should be an immediate.");
    if (OpReg->getReg() != AMDGPU::SGPR_NULL) {
      // Instruction is using a real register.
      // Since we can't know what value this register will have,
      // we can't compute what the value of this wait should be.
      WithColor::warning() << "The register component of "
                           << MCII.getName(Opcode) << " will be completely "
                           << "ignored. So the wait may not be accurate.\n";
    }
    switch (Opcode) {
    // Redundant switch so I don't have to repeat the code above
    // for each case. There are more clever ways to avoid this
    // extra switch and anyone can feel free to implement one of them.
    case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
      Expcnt = OpImm->getImm();
      break;
    case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
      Lgkmcnt = OpImm->getImm();
      break;
    case AMDGPU::S_WAITCNT_VMCNT_gfx10:
      Vmcnt = OpImm->getImm();
      break;
    case AMDGPU::S_WAITCNT_VSCNT_gfx10:
      Vscnt = OpImm->getImm();
      break;
    }
    return;
  }
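  // The older s_waitcnt forms pack vmcnt, expcnt, and lgkmcnt into a single
  // immediate whose field layout depends on the ISA version, so we rely on
  // AMDGPU::decodeWaitcnt() to extract the individual counters for us.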
  case AMDGPU::S_WAITCNT_gfx10:
  case AMDGPU::S_WAITCNT_gfx6_gfx7:
  case AMDGPU::S_WAITCNT_vi:
    unsigned WaitCnt = Inst.getOperand(0)->getImm();
    AMDGPU::decodeWaitcnt(IV, WaitCnt, Vmcnt, Expcnt, Lgkmcnt);
    return;
  }
}

void AMDGPUCustomBehaviour::generateWaitCntInfo() {
  // The core logic from this function is taken from
  // SIInsertWaitcnts::updateEventWaitcntAfter(). In that pass, the
  // instructions that are being looked at are in the MachineInstr format,
  // whereas we have access to the MCInst format. The side effects of this are
  // that we can't use the mayAccessVMEMThroughFlat(Inst) or
  // mayAccessLDSThroughFlat(Inst) functions. Therefore, we conservatively
  // assume that these functions will return true. This may cause a few
  // instructions to be incorrectly tagged with an extra CNT. However, these
  // are instructions that do interact with at least one CNT, so giving them an
  // extra CNT shouldn't cause issues in most scenarios.
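  // Roughly speaking: VmCnt tracks outstanding VMEM reads, VsCnt tracks VMEM
  // writes on targets with FeatureVscnt, LgkmCnt tracks LDS/GDS/scalar-memory
  // (and message) operations, and ExpCnt tracks exports and GDS operations.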
  AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(STI.getCPU());
  InstrWaitCntInfo.resize(SrcMgr.size());

  for (const auto &EN : llvm::enumerate(SrcMgr.getInstructions())) {
    const std::unique_ptr<Instruction> &Inst = EN.value();
    unsigned Index = EN.index();
    unsigned Opcode = Inst->getOpcode();
    const MCInstrDesc &MCID = MCII.get(Opcode);
    if ((MCID.TSFlags & SIInstrFlags::DS) &&
        (MCID.TSFlags & SIInstrFlags::LGKM_CNT)) {
      InstrWaitCntInfo[Index].LgkmCnt = true;
      if (isAlwaysGDS(Opcode) || hasModifiersSet(Inst, AMDGPU::OpName::gds))
        InstrWaitCntInfo[Index].ExpCnt = true;
    } else if (MCID.TSFlags & SIInstrFlags::FLAT) {
      // We conservatively assume that mayAccessVMEMThroughFlat(Inst)
      // and mayAccessLDSThroughFlat(Inst) would both return true for this
      // instruction. We have to do this because those functions use
      // information about the memory operands that we don't have access to.
      InstrWaitCntInfo[Index].LgkmCnt = true;
      if (!STI.hasFeature(AMDGPU::FeatureVscnt))
        InstrWaitCntInfo[Index].VmCnt = true;
      else if (MCID.mayLoad() && !(MCID.TSFlags & SIInstrFlags::IsAtomicNoRet))
        InstrWaitCntInfo[Index].VmCnt = true;
      else
        InstrWaitCntInfo[Index].VsCnt = true;
    } else if (isVMEM(MCID) && !AMDGPU::getMUBUFIsBufferInv(Opcode)) {
      if (!STI.hasFeature(AMDGPU::FeatureVscnt))
        InstrWaitCntInfo[Index].VmCnt = true;
      else if ((MCID.mayLoad() &&
                !(MCID.TSFlags & SIInstrFlags::IsAtomicNoRet)) ||
               ((MCID.TSFlags & SIInstrFlags::MIMG) && !MCID.mayLoad() &&
                !MCID.mayStore()))
        InstrWaitCntInfo[Index].VmCnt = true;
      else if (MCID.mayStore())
        InstrWaitCntInfo[Index].VsCnt = true;

      // (IV.Major < 7) is meant to represent
      // GCNTarget.vmemWriteNeedsExpWaitcnt()
      // which is defined as
      // { return getGeneration() < SEA_ISLANDS; }
      if (IV.Major < 7 &&
          (MCID.mayStore() || (MCID.TSFlags & SIInstrFlags::IsAtomicRet)))
        InstrWaitCntInfo[Index].ExpCnt = true;
    } else if (MCID.TSFlags & SIInstrFlags::SMRD) {
      InstrWaitCntInfo[Index].LgkmCnt = true;
    } else if (MCID.TSFlags & SIInstrFlags::EXP) {
      InstrWaitCntInfo[Index].ExpCnt = true;
    } else {
      switch (Opcode) {
      case AMDGPU::S_SENDMSG:
      case AMDGPU::S_SENDMSGHALT:
      case AMDGPU::S_MEMTIME:
      case AMDGPU::S_MEMREALTIME:
        InstrWaitCntInfo[Index].LgkmCnt = true;
        break;
      }
    }
  }
}

// taken from SIInstrInfo::isVMEM()
bool AMDGPUCustomBehaviour::isVMEM(const MCInstrDesc &MCID) {
  return MCID.TSFlags & SIInstrFlags::MUBUF ||
         MCID.TSFlags & SIInstrFlags::MTBUF ||
         MCID.TSFlags & SIInstrFlags::MIMG;
}

// taken from SIInstrInfo::hasModifiersSet()
bool AMDGPUCustomBehaviour::hasModifiersSet(
    const std::unique_ptr<Instruction> &Inst, unsigned OpName) const {
  int Idx = AMDGPU::getNamedOperandIdx(Inst->getOpcode(), OpName);
  if (Idx == -1)
    return false;

  const MCAOperand *Op = Inst->getOperand(Idx);
  if (Op == nullptr || !Op->isImm() || !Op->getImm())
    return false;

  return true;
}

// taken from SIInstrInfo::isGWS()
bool AMDGPUCustomBehaviour::isGWS(uint16_t Opcode) const {
  const MCInstrDesc &MCID = MCII.get(Opcode);
  return MCID.TSFlags & SIInstrFlags::GWS;
}

// taken from SIInstrInfo::isAlwaysGDS()
bool AMDGPUCustomBehaviour::isAlwaysGDS(uint16_t Opcode) const {
  return Opcode == AMDGPU::DS_ORDERED_COUNT || isGWS(Opcode);
}

} // namespace llvm::mca

using namespace llvm;
using namespace mca;

static CustomBehaviour *
createAMDGPUCustomBehaviour(const MCSubtargetInfo &STI,
                            const mca::SourceMgr &SrcMgr,
                            const MCInstrInfo &MCII) {
  return new AMDGPUCustomBehaviour(STI, SrcMgr, MCII);
}

static InstrPostProcess *
createAMDGPUInstrPostProcess(const MCSubtargetInfo &STI,
                             const MCInstrInfo &MCII) {
  return new AMDGPUInstrPostProcess(STI, MCII);
}

/// Extern function to initialize the targets for the AMDGPU backend
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTargetMCA() {
  TargetRegistry::RegisterCustomBehaviour(getTheR600Target(),
                                          createAMDGPUCustomBehaviour);
  TargetRegistry::RegisterInstrPostProcess(getTheR600Target(),
                                           createAMDGPUInstrPostProcess);

  TargetRegistry::RegisterCustomBehaviour(getTheGCNTarget(),
                                          createAMDGPUCustomBehaviour);
  TargetRegistry::RegisterInstrPostProcess(getTheGCNTarget(),
                                           createAMDGPUInstrPostProcess);
}