SIMachineFunctionInfo.cpp source code [llvm_projects/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp]

1	//===- SIMachineFunctionInfo.cpp - SI Machine Function Info ---------------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8
9	#include "SIMachineFunctionInfo.h"
10	#include "AMDGPUSubtarget.h"
11	#include "GCNSubtarget.h"
12	#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
13	#include "SIRegisterInfo.h"
14	#include "Utils/AMDGPUBaseInfo.h"
15	#include "llvm/CodeGen/LiveIntervals.h"
16	#include "llvm/CodeGen/MIRParser/MIParser.h"
17	#include "llvm/CodeGen/MachineBasicBlock.h"
18	#include "llvm/CodeGen/MachineFrameInfo.h"
19	#include "llvm/CodeGen/MachineFunction.h"
20	#include "llvm/CodeGen/MachineRegisterInfo.h"
21	#include "llvm/IR/CallingConv.h"
22	#include "llvm/IR/DiagnosticInfo.h"
23	#include "llvm/IR/Function.h"
24	#include <cassert>
25	#include <optional>
26	#include <vector>
27
// Lane-count constant.
// NOTE(review): not referenced anywhere in the visible part of this file;
// presumably the wave64 lane limit -- confirm before relying on it.
enum { MAX_LANES = 64 };
29
30	using namespace llvm;
31
32	// TODO -- delete this flag once we have more robust mechanisms to allocate the
33	// optimal RC for Opc and Dest of MFMA. In particular, there are high RP cases
34	// where it is better to produce the VGPR form (e.g. if there are VGPR users
35	// of the MFMA result).
36	static cl::opt<bool, true> MFMAVGPRFormOpt(
37	"amdgpu-mfma-vgpr-form",
38	cl::desc ("Whether to force use VGPR for Opc and Dest of MFMA. If "
39	"unspecified, default to compiler heuristics"),
40	cl::location(L&: SIMachineFunctionInfo::MFMAVGPRForm), cl::init(Val: true),
41	cl::Hidden);
42
43	const GCNTargetMachine &getTM(const GCNSubtarget *STI) {
44	const SITargetLowering *TLI = STI->getTargetLowering();
45	return static_cast<const GCNTargetMachine &>(TLI->getTargetMachine());
46	}
47
48	bool SIMachineFunctionInfo::MFMAVGPRForm = false;
49
50	SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
51	const GCNSubtarget *STI)
52	: AMDGPUMachineFunction (F, STI), Mode (F, STI), GWSResourcePSV (getTM(STI)),
53	UserSGPRInfo (F, STI), WorkGroupIDX(false), WorkGroupIDY(false*),
54	WorkGroupIDZ(false), WorkGroupInfo(false), LDSKernelId(false),
55	PrivateSegmentWaveByteOffset(false), WorkItemIDX(false),
56	WorkItemIDY(false), WorkItemIDZ(false), ImplicitArgPtr(false),
57	GITPtrHigh(`0xffffffff`), HighBitsOf32BitAddress(`0`),
58	IsWholeWaveFunction(F.getCallingConv() ==
59	CallingConv::AMDGPU_Gfx_WholeWave) {
60	const GCNSubtarget &ST = *STI;
61	FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(F);
62	WavesPerEU = ST.getWavesPerEU(F);
63	MaxNumWorkGroups = ST.getMaxNumWorkGroups(F);
64	assert(MaxNumWorkGroups.size() == `3`);
65
66	// Temporarily check both the attribute and the subtarget feature, until the
67	// latter is completely removed.
68	DynamicVGPRBlockSize = AMDGPU::getDynamicVGPRBlockSize(F);
69	if (DynamicVGPRBlockSize == `0` && ST.isDynamicVGPREnabled())
70	DynamicVGPRBlockSize = ST.getDynamicVGPRBlockSize();
71
72	Occupancy = ST.computeOccupancy(F, LDSSize: getLDSSize()).second;
73	CallingConv::ID CC = F.getCallingConv();
74
75	VRegFlags.reserve(S: `1024`);
76
77	const bool IsKernel = CC == CallingConv::AMDGPU_KERNEL \|\|
78	CC == CallingConv::SPIR_KERNEL;
79
80	if (IsKernel) {
81	WorkGroupIDX = true;
82	WorkItemIDX = true;
83	} else if (CC == CallingConv::AMDGPU_PS) {
84	PSInputAddr = AMDGPU::getInitialPSInputAddr(F);
85	}
86
87	if (ST.hasGFX90AInsts()) {
88	// FIXME: Extract logic out of getMaxNumVectorRegs; we need to apply the
89	// allocation granule and clamping.
90	auto [MinNumAGPRAttr, MaxNumAGPRAttr] =
91	AMDGPU::getIntegerPairAttribute(F, Name: "amdgpu-agpr-alloc", Default: {~`0u`, ~`0u`},
92	/OnlyFirstRequired=/true);
93	MinNumAGPRs = MinNumAGPRAttr;
94	}
95
96	if (!isEntryFunction()) {
97	if (CC != CallingConv::AMDGPU_Gfx &&
98	CC != CallingConv::AMDGPU_Gfx_WholeWave)
99	ArgInfo = AMDGPUFunctionArgInfo::FixedABIFunctionInfo;
100
101	FrameOffsetReg = AMDGPU::SGPR33;
102	StackPtrOffsetReg = AMDGPU::SGPR32;
103
104	if (!ST.hasFlatScratchEnabled()) {
105	// Non-entry functions have no special inputs for now, other registers
106	// required for scratch access.
107	ScratchRSrcReg = AMDGPU::isChainCC(CC)
108	? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
109	: ScratchRSrcReg = AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3;
110
111	ArgInfo.PrivateSegmentBuffer =
112	ArgDescriptor::createRegister(Reg: ScratchRSrcReg);
113	}
114
115	if (!F.hasFnAttribute(Kind: "amdgpu-no-implicitarg-ptr") &&
116	!AMDGPU::isChainCC(CC))
117	ImplicitArgPtr = true;
118	} else {
119	ImplicitArgPtr = false;
120	MaxKernArgAlign =
121	std::max(a: ST.getAlignmentForImplicitArgPtr(), b: MaxKernArgAlign);
122	}
123
124	if (!AMDGPU::isGraphics(CC) \|\|
125	((CC == CallingConv::AMDGPU_CS \|\| CC == CallingConv::AMDGPU_Gfx) &&
126	ST.hasArchitectedSGPRs())) {
127	if (IsKernel \|\| !F.hasFnAttribute(Kind: "amdgpu-no-workgroup-id-x") \|\|
128	!F.hasFnAttribute(Kind: "amdgpu-no-cluster-id-x"))
129	WorkGroupIDX = true;
130
131	if (!F.hasFnAttribute(Kind: "amdgpu-no-workgroup-id-y") \|\|
132	!F.hasFnAttribute(Kind: "amdgpu-no-cluster-id-y"))
133	WorkGroupIDY = true;
134
135	if (!F.hasFnAttribute(Kind: "amdgpu-no-workgroup-id-z") \|\|
136	!F.hasFnAttribute(Kind: "amdgpu-no-cluster-id-z"))
137	WorkGroupIDZ = true;
138	}
139
140	if (!AMDGPU::isGraphics(CC)) {
141	if (IsKernel \|\| !F.hasFnAttribute(Kind: "amdgpu-no-workitem-id-x"))
142	WorkItemIDX = true;
143
144	if (!F.hasFnAttribute(Kind: "amdgpu-no-workitem-id-y") &&
145	ST.getMaxWorkitemID(Kernel: F, Dimension: `1`) != `0`)
146	WorkItemIDY = true;
147
148	if (!F.hasFnAttribute(Kind: "amdgpu-no-workitem-id-z") &&
149	ST.getMaxWorkitemID(Kernel: F, Dimension: `2`) != `0`)
150	WorkItemIDZ = true;
151
152	if (!IsKernel && !F.hasFnAttribute(Kind: "amdgpu-no-lds-kernel-id"))
153	LDSKernelId = true;
154	}
155
156	if (isEntryFunction()) {
157	// X, XY, and XYZ are the only supported combinations, so make sure Y is
158	// enabled if Z is.
159	if (WorkItemIDZ)
160	WorkItemIDY = true;
161
162	if (!ST.hasArchitectedFlatScratch()) {
163	PrivateSegmentWaveByteOffset = true;
164
165	// HS and GS always have the scratch wave offset in SGPR5 on GFX9.
166	if (ST.getGeneration() >= AMDGPUSubtarget::GFX9 &&
167	(CC == CallingConv::AMDGPU_HS \|\| CC == CallingConv::AMDGPU_GS))
168	ArgInfo.PrivateSegmentWaveByteOffset =
169	ArgDescriptor::createRegister(Reg: AMDGPU::SGPR5);
170	}
171	}
172
173	Attribute A = F.getFnAttribute(Kind: "amdgpu-git-ptr-high");
174	StringRef S = A.getValueAsString();
175	if (!S.empty())
176	S.consumeInteger(Radix: `0`, Result&: GITPtrHigh);
177
178	A = F.getFnAttribute(Kind: "amdgpu-32bit-address-high-bits");
179	S = A.getValueAsString();
180	if (!S.empty())
181	S.consumeInteger(Radix: `0`, Result&: HighBitsOf32BitAddress);
182
183	MaxMemoryClusterDWords = F.getFnAttributeAsParsedInteger(
184	Kind: "amdgpu-max-memory-cluster-dwords", Default: DefaultMemoryClusterDWordsLimit);
185
186	// On GFX908, in order to guarantee copying between AGPRs, we need a scratch
187	// VGPR available at all times. For now, reserve highest available VGPR. After
188	// RA, shift it to the lowest available unused VGPR if the one exist.
189	if (ST.hasMAIInsts() && !ST.hasGFX90AInsts()) {
190	VGPRForAGPRCopy =
191	AMDGPU::VGPR_32RegClass.getRegister(i: ST.getMaxNumVGPRs(F) - `1`);
192	}
193
194	ClusterDims = AMDGPU::ClusterDimsAttr::get(F);
195	}
196
197	MachineFunctionInfo *SIMachineFunctionInfo::clone(
198	BumpPtrAllocator &Allocator, MachineFunction &DestMF,
199	const DenseMap<MachineBasicBlock , MachineBasicBlock > &Src2DstMBB)
200	const {
201	return DestMF.cloneInfo<SIMachineFunctionInfo>(Old: *this);
202	}
203
204	void SIMachineFunctionInfo::limitOccupancy(const MachineFunction &MF) {
205	limitOccupancy(Limit: getMaxWavesPerEU());
206	const GCNSubtarget& ST = MF.getSubtarget<GCNSubtarget>();
207	limitOccupancy(Limit: ST.getOccupancyWithWorkGroupSizes(MF).second);
208	}
209
210	Register SIMachineFunctionInfo::addPrivateSegmentBuffer(
211	const SIRegisterInfo &TRI) {
212	ArgInfo.PrivateSegmentBuffer =
213	ArgDescriptor::createRegister(Reg: TRI.getMatchingSuperReg(
214	Reg: getNextUserSGPR(), SubIdx: AMDGPU::sub0, RC: &AMDGPU::SGPR_128RegClass));
215	NumUserSGPRs += `4`;
216	return ArgInfo.PrivateSegmentBuffer.getRegister();
217	}
218
219	Register SIMachineFunctionInfo::addDispatchPtr(const SIRegisterInfo &TRI) {
220	ArgInfo.DispatchPtr = ArgDescriptor::createRegister(Reg: TRI.getMatchingSuperReg(
221	Reg: getNextUserSGPR(), SubIdx: AMDGPU::sub0, RC: &AMDGPU::SReg_64RegClass));
222	NumUserSGPRs += `2`;
223	return ArgInfo.DispatchPtr.getRegister();
224	}
225
226	Register SIMachineFunctionInfo::addQueuePtr(const SIRegisterInfo &TRI) {
227	ArgInfo.QueuePtr = ArgDescriptor::createRegister(Reg: TRI.getMatchingSuperReg(
228	Reg: getNextUserSGPR(), SubIdx: AMDGPU::sub0, RC: &AMDGPU::SReg_64RegClass));
229	NumUserSGPRs += `2`;
230	return ArgInfo.QueuePtr.getRegister();
231	}
232
233	Register SIMachineFunctionInfo::addKernargSegmentPtr(const SIRegisterInfo &TRI) {
234	ArgInfo.KernargSegmentPtr
235	= ArgDescriptor::createRegister(Reg: TRI.getMatchingSuperReg(
236	Reg: getNextUserSGPR(), SubIdx: AMDGPU::sub0, RC: &AMDGPU::SReg_64RegClass));
237	NumUserSGPRs += `2`;
238	return ArgInfo.KernargSegmentPtr.getRegister();
239	}
240
241	Register SIMachineFunctionInfo::addDispatchID(const SIRegisterInfo &TRI) {
242	ArgInfo.DispatchID = ArgDescriptor::createRegister(Reg: TRI.getMatchingSuperReg(
243	Reg: getNextUserSGPR(), SubIdx: AMDGPU::sub0, RC: &AMDGPU::SReg_64RegClass));
244	NumUserSGPRs += `2`;
245	return ArgInfo.DispatchID.getRegister();
246	}
247
248	Register SIMachineFunctionInfo::addFlatScratchInit(const SIRegisterInfo &TRI) {
249	ArgInfo.FlatScratchInit = ArgDescriptor::createRegister(Reg: TRI.getMatchingSuperReg(
250	Reg: getNextUserSGPR(), SubIdx: AMDGPU::sub0, RC: &AMDGPU::SReg_64RegClass));
251	NumUserSGPRs += `2`;
252	return ArgInfo.FlatScratchInit.getRegister();
253	}
254
255	Register SIMachineFunctionInfo::addPrivateSegmentSize(const SIRegisterInfo &TRI) {
256	ArgInfo.PrivateSegmentSize = ArgDescriptor::createRegister(Reg: getNextUserSGPR());
257	NumUserSGPRs += `1`;
258	return ArgInfo.PrivateSegmentSize.getRegister();
259	}
260
261	Register SIMachineFunctionInfo::addImplicitBufferPtr(const SIRegisterInfo &TRI) {
262	ArgInfo.ImplicitBufferPtr = ArgDescriptor::createRegister(Reg: TRI.getMatchingSuperReg(
263	Reg: getNextUserSGPR(), SubIdx: AMDGPU::sub0, RC: &AMDGPU::SReg_64RegClass));
264	NumUserSGPRs += `2`;
265	return ArgInfo.ImplicitBufferPtr.getRegister();
266	}
267
268	Register SIMachineFunctionInfo::addLDSKernelId() {
269	ArgInfo.LDSKernelId = ArgDescriptor::createRegister(Reg: getNextUserSGPR());
270	NumUserSGPRs += `1`;
271	return ArgInfo.LDSKernelId.getRegister();
272	}
273
274	SmallVectorImpl<MCRegister> *SIMachineFunctionInfo::addPreloadedKernArg(
275	const SIRegisterInfo &TRI, const TargetRegisterClass *RC,
276	unsigned AllocSizeDWord, int KernArgIdx, int PaddingSGPRs) {
277	auto [It, Inserted] = ArgInfo.PreloadKernArgs.try_emplace(Key: KernArgIdx);
278	assert(Inserted && "Preload kernel argument allocated twice.");
279	NumUserSGPRs += PaddingSGPRs;
280	// If the available register tuples are aligned with the kernarg to be
281	// preloaded use that register, otherwise we need to use a set of SGPRs and
282	// merge them.
283	if (!ArgInfo.FirstKernArgPreloadReg)
284	ArgInfo.FirstKernArgPreloadReg = getNextUserSGPR();
285	Register PreloadReg =
286	TRI.getMatchingSuperReg(Reg: getNextUserSGPR(), SubIdx: AMDGPU::sub0, RC);
287	auto &Regs = It ->second.Regs;
288	if (PreloadReg &&
289	(RC == &AMDGPU::SReg_32RegClass \|\| RC == &AMDGPU::SReg_64RegClass)) {
290	Regs.push_back(Elt: PreloadReg);
291	NumUserSGPRs += AllocSizeDWord;
292	} else {
293	Regs.reserve(N: AllocSizeDWord);
294	for (unsigned I = `0`; I < AllocSizeDWord; ++I) {
295	Regs.push_back(Elt: getNextUserSGPR());
296	NumUserSGPRs++;
297	}
298	}
299
300	// Track the actual number of SGPRs that HW will preload to.
301	UserSGPRInfo.allocKernargPreloadSGPRs(NumSGPRs: AllocSizeDWord + PaddingSGPRs);
302	return &Regs;
303	}
304
305	void SIMachineFunctionInfo::allocateWWMSpill(MachineFunction &MF, Register VGPR,
306	uint64_t Size, Align Alignment) {
307	// Skip if it is an entry function or the register is already added.
308	if (isEntryFunction() \|\| WWMSpills.count(Key: VGPR))
309	return;
310
311	// Skip if this is a function with the amdgpu_cs_chain or
312	// amdgpu_cs_chain_preserve calling convention and this is a scratch register.
313	// We never need to allocate a spill for these because we don't even need to
314	// restore the inactive lanes for them (they're scratchier than the usual
315	// scratch registers). We only need to do this if we have calls to
316	// llvm.amdgcn.cs.chain (otherwise there's no one to save them for, since
317	// chain functions do not return) and the function did not contain a call to
318	// llvm.amdgcn.init.whole.wave (since in that case there are no inactive lanes
319	// when entering the function).
320	if (isChainFunction() &&
321	(SIRegisterInfo::isChainScratchRegister(VGPR) \|\|
322	!MF.getFrameInfo().hasTailCall() \|\| hasInitWholeWave()))
323	return;
324
325	WWMSpills.insert(KV: std::make_pair(
326	x&: VGPR, y: MF.getFrameInfo().CreateSpillStackObject(Size, Alignment)));
327	}
328
329	// Separate out the callee-saved and scratch registers.
330	void SIMachineFunctionInfo::splitWWMSpillRegisters(
331	MachineFunction &MF,
332	SmallVectorImpl<std::pair<Register, int>> &CalleeSavedRegs,
333	SmallVectorImpl<std::pair<Register, int>> &ScratchRegs) const {
334	const MCPhysReg *CSRegs = MF.getRegInfo().getCalleeSavedRegs();
335	for (auto &Reg : WWMSpills) {
336	if (isCalleeSavedReg(CSRegs, Reg: Reg.first))
337	CalleeSavedRegs.push_back(Elt: Reg);
338	else
339	ScratchRegs.push_back(Elt: Reg);
340	}
341	}
342
343	bool SIMachineFunctionInfo::isCalleeSavedReg(const MCPhysReg *CSRegs,
344	MCPhysReg Reg) const {
345	for (unsigned I = `0`; CSRegs[I]; ++I) {
346	if (CSRegs[I] == Reg)
347	return true;
348	}
349
350	return false;
351	}
352
353	void SIMachineFunctionInfo::shiftWwmVGPRsToLowestRange(
354	MachineFunction &MF, SmallVectorImpl<Register> &WWMVGPRs,
355	BitVector &SavedVGPRs) {
356	const SIRegisterInfo *TRI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
357	MachineRegisterInfo &MRI = MF.getRegInfo();
358	for (unsigned I = `0`, E = WWMVGPRs.size(); I < E; ++I) {
359	Register Reg = WWMVGPRs [I];
360	Register NewReg =
361	TRI->findUnusedRegister(MRI, RC: &AMDGPU::VGPR_32RegClass, MF);
362	if (!NewReg \|\| NewReg >= Reg)
363	break;
364
365	MRI.replaceRegWith(FromReg: Reg, ToReg: NewReg);
366
367	// Update various tables with the new VGPR.
368	WWMVGPRs [I] = NewReg;
369	WWMReservedRegs.remove(X: Reg);
370	WWMReservedRegs.insert(X: NewReg);
371	MRI.reserveReg(PhysReg: NewReg, TRI);
372
373	// Replace the register in SpillPhysVGPRs. This is needed to look for free
374	// lanes while spilling special SGPRs like FP, BP, etc. during PEI.
375	auto *RegItr = llvm::find(Range&: SpillPhysVGPRs, Val: Reg);
376	if (RegItr != SpillPhysVGPRs.end()) {
377	unsigned Idx = std::distance(first: SpillPhysVGPRs.begin(), last: RegItr);
378	SpillPhysVGPRs [Idx] = NewReg;
379	}
380
381	// The generic `determineCalleeSaves` might have set the old register if it
382	// is in the CSR range.
383	SavedVGPRs.reset(Idx: Reg);
384
385	for (MachineBasicBlock &MBB : MF) {
386	MBB.removeLiveIn(Reg);
387	MBB.sortUniqueLiveIns();
388	}
389
390	Reg = NewReg;
391	}
392	}
393
394	bool SIMachineFunctionInfo::allocateVirtualVGPRForSGPRSpills(
395	MachineFunction &MF, int FI, unsigned LaneIndex) {
396	MachineRegisterInfo &MRI = MF.getRegInfo();
397	Register LaneVGPR;
398	if (!LaneIndex) {
399	LaneVGPR = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
400	SpillVGPRs.push_back(Elt: LaneVGPR);
401	} else {
402	LaneVGPR = SpillVGPRs.back();
403	}
404
405	SGPRSpillsToVirtualVGPRLanes [FI].emplace_back(args&: LaneVGPR, args&: LaneIndex);
406	return true;
407	}
408
409	bool SIMachineFunctionInfo::allocatePhysicalVGPRForSGPRSpills(
410	MachineFunction &MF, int FI, unsigned LaneIndex, bool IsPrologEpilog) {
411	const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
412	const SIRegisterInfo *TRI = ST.getRegisterInfo();
413	MachineRegisterInfo &MRI = MF.getRegInfo();
414	Register LaneVGPR;
415	if (!LaneIndex) {
416	// Find the highest available register if called before RA to ensure the
417	// lowest registers are available for allocation. The LaneVGPR, in that
418	// case, will be shifted back to the lowest range after VGPR allocation.
419	LaneVGPR = TRI->findUnusedRegister(MRI, RC: &AMDGPU::VGPR_32RegClass, MF,
420	ReserveHighestVGPR: !IsPrologEpilog);
421	if (LaneVGPR == AMDGPU::NoRegister) {
422	// We have no VGPRs left for spilling SGPRs. Reset because we will not
423	// partially spill the SGPR to VGPRs.
424	SGPRSpillsToPhysicalVGPRLanes.erase(Val: FI);
425	return false;
426	}
427
428	if (IsPrologEpilog)
429	allocateWWMSpill(MF, VGPR: LaneVGPR);
430
431	reserveWWMRegister(Reg: LaneVGPR);
432	for (MachineBasicBlock &MBB : MF) {
433	MBB.addLiveIn(PhysReg: LaneVGPR);
434	MBB.sortUniqueLiveIns();
435	}
436	SpillPhysVGPRs.push_back(Elt: LaneVGPR);
437	} else {
438	LaneVGPR = SpillPhysVGPRs.back();
439	}
440
441	SGPRSpillsToPhysicalVGPRLanes [FI].emplace_back(args&: LaneVGPR, args&: LaneIndex);
442	return true;
443	}
444
445	bool SIMachineFunctionInfo::allocateSGPRSpillToVGPRLane(
446	MachineFunction &MF, int FI, bool SpillToPhysVGPRLane,
447	bool IsPrologEpilog) {
448	std::vector<SIRegisterInfo::SpilledReg> &SpillLanes =
449	SpillToPhysVGPRLane ? SGPRSpillsToPhysicalVGPRLanes [FI]
450	: SGPRSpillsToVirtualVGPRLanes [FI];
451
452	// This has already been allocated.
453	if (!SpillLanes.empty())
454	return true;
455
456	const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
457	MachineFrameInfo &FrameInfo = MF.getFrameInfo();
458	unsigned WaveSize = ST.getWavefrontSize();
459
460	unsigned Size = FrameInfo.getObjectSize(ObjectIdx: FI);
461	unsigned NumLanes = Size / `4`;
462
463	if (NumLanes > WaveSize)
464	return false;
465
466	assert(Size >= `4` && "invalid sgpr spill size");
467	assert(ST.getRegisterInfo()->spillSGPRToVGPR() &&
468	"not spilling SGPRs to VGPRs");
469
470	unsigned &NumSpillLanes = SpillToPhysVGPRLane ? NumPhysicalVGPRSpillLanes
471	: NumVirtualVGPRSpillLanes;
472
473	for (unsigned I = `0`; I < NumLanes; ++I, ++NumSpillLanes) {
474	unsigned LaneIndex = (NumSpillLanes % WaveSize);
475
476	bool Allocated = SpillToPhysVGPRLane
477	? allocatePhysicalVGPRForSGPRSpills(MF, FI, LaneIndex,
478	IsPrologEpilog)
479	: allocateVirtualVGPRForSGPRSpills(MF, FI, LaneIndex);
480	if (!Allocated) {
481	NumSpillLanes -= I;
482	return false;
483	}
484	}
485
486	return true;
487	}
488
489	/// Reserve AGPRs or VGPRs to support spilling for FrameIndex \p FI.
490	/// Either AGPR is spilled to VGPR to vice versa.
491	/// Returns true if a \p FI can be eliminated completely.
492	bool SIMachineFunctionInfo::allocateVGPRSpillToAGPR(MachineFunction &MF,
493	int FI,
494	bool isAGPRtoVGPR) {
495	MachineRegisterInfo &MRI = MF.getRegInfo();
496	MachineFrameInfo &FrameInfo = MF.getFrameInfo();
497	const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
498
499	assert(ST.hasMAIInsts() && FrameInfo.isSpillSlotObjectIndex(FI));
500
501	auto &Spill = VGPRToAGPRSpills [FI];
502
503	// This has already been allocated.
504	if (!Spill.Lanes.empty())
505	return Spill.FullyAllocated;
506
507	unsigned Size = FrameInfo.getObjectSize(ObjectIdx: FI);
508	unsigned NumLanes = Size / `4`;
509	Spill.Lanes.resize(N: NumLanes, NV: AMDGPU::NoRegister);
510
511	const TargetRegisterClass &RC =
512	isAGPRtoVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::AGPR_32RegClass;
513	auto Regs = RC.getRegisters();
514
515	auto &SpillRegs = isAGPRtoVGPR ? SpillAGPR : SpillVGPR;
516	const SIRegisterInfo *TRI = ST.getRegisterInfo();
517	Spill.FullyAllocated = true;
518
519	// FIXME: Move allocation logic out of MachineFunctionInfo and initialize
520	// once.
521	BitVector OtherUsedRegs;
522	OtherUsedRegs.resize(N: TRI->getNumRegs());
523
524	const uint32_t *CSRMask =
525	TRI->getCallPreservedMask(MF, MF.getFunction().getCallingConv());
526	if (CSRMask)
527	OtherUsedRegs.setBitsInMask(Mask: CSRMask);
528
529	// TODO: Should include register tuples, but doesn't matter with current
530	// usage.
531	for (MCPhysReg Reg : SpillAGPR)
532	OtherUsedRegs.set(Reg);
533	for (MCPhysReg Reg : SpillVGPR)
534	OtherUsedRegs.set(Reg);
535
536	SmallVectorImpl<MCPhysReg>::const_iterator NextSpillReg = Regs.begin();
537	for (int I = NumLanes - `1`; I >= `0`; --I) {
538	NextSpillReg = std::find_if(
539	first: NextSpillReg, last: Regs.end(), pred: [&MRI, &OtherUsedRegs](MCPhysReg Reg) {
540	return MRI.isAllocatable(PhysReg: Reg) && !MRI.isPhysRegUsed(PhysReg: Reg) &&
541	!OtherUsedRegs [Reg];
542	});
543
544	if (NextSpillReg == Regs.end()) { // Registers exhausted
545	Spill.FullyAllocated = false;
546	break;
547	}
548
549	OtherUsedRegs.set(*NextSpillReg);
550	SpillRegs.push_back(Elt: *NextSpillReg);
551	MRI.reserveReg(PhysReg: *NextSpillReg, TRI);
552	Spill.Lanes [I] = *NextSpillReg++;
553	}
554
555	return Spill.FullyAllocated;
556	}
557
558	bool SIMachineFunctionInfo::removeDeadFrameIndices(
559	MachineFrameInfo &MFI, bool ResetSGPRSpillStackIDs) {
560	// Remove dead frame indices from function frame, however keep FP & BP since
561	// spills for them haven't been inserted yet. And also make sure to remove the
562	// frame indices from `SGPRSpillsToVirtualVGPRLanes` data structure,
563	// otherwise, it could result in an unexpected side effect and bug, in case of
564	// any re-mapping of freed frame indices by later pass(es) like "stack slot
565	// coloring".
566	for (auto &R : make_early_inc_range(Range&: SGPRSpillsToVirtualVGPRLanes)) {
567	MFI.RemoveStackObject(ObjectIdx: R.first);
568	SGPRSpillsToVirtualVGPRLanes.erase(Val: R.first);
569	}
570
571	// Remove the dead frame indices of CSR SGPRs which are spilled to physical
572	// VGPR lanes during SILowerSGPRSpills pass.
573	if (!ResetSGPRSpillStackIDs) {
574	for (auto &R : make_early_inc_range(Range&: SGPRSpillsToPhysicalVGPRLanes)) {
575	MFI.RemoveStackObject(ObjectIdx: R.first);
576	SGPRSpillsToPhysicalVGPRLanes.erase(Val: R.first);
577	}
578	}
579	bool HaveSGPRToMemory = false;
580
581	if (ResetSGPRSpillStackIDs) {
582	// All other SGPRs must be allocated on the default stack, so reset the
583	// stack ID.
584	for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd(); I != E;
585	++I) {
586	if (!checkIndexInPrologEpilogSGPRSpills(FI: I)) {
587	if (MFI.getStackID(ObjectIdx: I) == TargetStackID::SGPRSpill) {
588	MFI.setStackID(ObjectIdx: I, ID: TargetStackID::Default);
589	HaveSGPRToMemory = true;
590	}
591	}
592	}
593	}
594
595	for (auto &R : VGPRToAGPRSpills) {
596	if (R.second.IsDead)
597	MFI.RemoveStackObject(ObjectIdx: R.first);
598	}
599
600	return HaveSGPRToMemory;
601	}
602
603	int SIMachineFunctionInfo::getScavengeFI(MachineFrameInfo &MFI,
604	const SIRegisterInfo &TRI) {
605	if (ScavengeFI)
606	return *ScavengeFI;
607
608	ScavengeFI =
609	MFI.CreateStackObject(Size: TRI.getSpillSize(RC: AMDGPU::SGPR_32RegClass),
610	Alignment: TRI.getSpillAlign(RC: AMDGPU::SGPR_32RegClass), isSpillSlot: false);
611	return *ScavengeFI;
612	}
613
614	MCPhysReg SIMachineFunctionInfo::getNextUserSGPR() const {
615	assert(NumSystemSGPRs == `0` && "System SGPRs must be added after user SGPRs");
616	return AMDGPU::SGPR0 + NumUserSGPRs;
617	}
618
619	MCPhysReg SIMachineFunctionInfo::getNextSystemSGPR() const {
620	return AMDGPU::SGPR0 + NumUserSGPRs + NumSystemSGPRs;
621	}
622
623	void SIMachineFunctionInfo::MRI_NoteNewVirtualRegister(Register Reg) {
624	VRegFlags.grow(N: Reg);
625	}
626
627	void SIMachineFunctionInfo::MRI_NoteCloneVirtualRegister(Register NewReg,
628	Register SrcReg) {
629	VRegFlags.grow(N: NewReg);
630	VRegFlags [NewReg] = VRegFlags [SrcReg];
631	}
632
633	Register
634	SIMachineFunctionInfo::getGITPtrLoReg(const MachineFunction &MF) const {
635	const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
636	if (!ST.isAmdPalOS())
637	return Register ();
638	Register GitPtrLo = AMDGPU::SGPR0; // Low GIT address passed in
639	if (ST.hasMergedShaders()) {
640	switch (MF.getFunction().getCallingConv()) {
641	case CallingConv::AMDGPU_HS:
642	case CallingConv::AMDGPU_GS:
643	// Low GIT address is passed in s8 rather than s0 for an LS+HS or
644	// ES+GS merged shader on gfx9+.
645	GitPtrLo = AMDGPU::SGPR8;
646	return GitPtrLo;
647	default:
648	return GitPtrLo;
649	}
650	}
651	return GitPtrLo;
652	}
653
654	static yaml::StringValue regToString(Register Reg,
655	const TargetRegisterInfo &TRI) {
656	yaml::StringValue Dest;
657	{
658	raw_string_ostream OS(Dest.Value);
659	OS << printReg(Reg, TRI: &TRI);
660	}
661	return Dest;
662	}
663
664	static std::optional<yaml::SIArgumentInfo>
665	convertArgumentInfo(const AMDGPUFunctionArgInfo &ArgInfo,
666	const TargetRegisterInfo &TRI) {
667	yaml::SIArgumentInfo AI;
668
669	auto convertArg = [&](std::optional<yaml::SIArgument> &A,
670	const ArgDescriptor &Arg) {
671	if (!Arg)
672	return false;
673
674	// Create a register or stack argument.
675	yaml::SIArgument SA = yaml::SIArgument::createArgument(IsReg: Arg.isRegister());
676	if (Arg.isRegister()) {
677	raw_string_ostream OS(SA.RegisterName.Value);
678	OS << printReg(Reg: Arg.getRegister(), TRI: &TRI);
679	} else
680	SA.StackOffset = Arg.getStackOffset();
681	// Check and update the optional mask.
682	if (Arg.isMasked())
683	SA.Mask = Arg.getMask();
684
685	A = std::move(SA);
686	return true;
687	};
688
689	bool Any = false;
690	Any \|= convertArg (AI.PrivateSegmentBuffer, ArgInfo.PrivateSegmentBuffer);
691	Any \|= convertArg (AI.DispatchPtr, ArgInfo.DispatchPtr);
692	Any \|= convertArg (AI.QueuePtr, ArgInfo.QueuePtr);
693	Any \|= convertArg (AI.KernargSegmentPtr, ArgInfo.KernargSegmentPtr);
694	Any \|= convertArg (AI.DispatchID, ArgInfo.DispatchID);
695	Any \|= convertArg (AI.FlatScratchInit, ArgInfo.FlatScratchInit);
696	Any \|= convertArg (AI.LDSKernelId, ArgInfo.LDSKernelId);
697	Any \|= convertArg (AI.PrivateSegmentSize, ArgInfo.PrivateSegmentSize);
698	Any \|= convertArg (AI.WorkGroupIDX, ArgInfo.WorkGroupIDX);
699	Any \|= convertArg (AI.WorkGroupIDY, ArgInfo.WorkGroupIDY);
700	Any \|= convertArg (AI.WorkGroupIDZ, ArgInfo.WorkGroupIDZ);
701	Any \|= convertArg (AI.WorkGroupInfo, ArgInfo.WorkGroupInfo);
702	Any \|= convertArg (AI.PrivateSegmentWaveByteOffset,
703	ArgInfo.PrivateSegmentWaveByteOffset);
704	Any \|= convertArg (AI.ImplicitArgPtr, ArgInfo.ImplicitArgPtr);
705	Any \|= convertArg (AI.ImplicitBufferPtr, ArgInfo.ImplicitBufferPtr);
706	Any \|= convertArg (AI.WorkItemIDX, ArgInfo.WorkItemIDX);
707	Any \|= convertArg (AI.WorkItemIDY, ArgInfo.WorkItemIDY);
708	Any \|= convertArg (AI.WorkItemIDZ, ArgInfo.WorkItemIDZ);
709
710	// Write FirstKernArgPreloadReg separately, since it's a Register,
711	// not ArgDescriptor.
712	if (ArgInfo.FirstKernArgPreloadReg) {
713	Register Reg = ArgInfo.FirstKernArgPreloadReg;
714	assert(Reg.isPhysical() &&
715	"FirstKernArgPreloadReg must be a physical register");
716
717	yaml::SIArgument SA = yaml::SIArgument::createArgument(IsReg: true);
718	raw_string_ostream OS(SA.RegisterName.Value);
719	OS << printReg(Reg, TRI: &TRI);
720
721	AI.FirstKernArgPreloadReg = SA;
722	Any = true;
723	}
724
725	if (Any)
726	return AI;
727
728	return std::nullopt;
729	}
730
731	yaml::SIMachineFunctionInfo::SIMachineFunctionInfo(
732	const llvm::SIMachineFunctionInfo &MFI, const TargetRegisterInfo &TRI,
733	const llvm::MachineFunction &MF)
734	: ExplicitKernArgSize(MFI.getExplicitKernArgSize()),
735	MaxKernArgAlign(MFI.getMaxKernArgAlign()), LDSSize(MFI.getLDSSize()),
736	GDSSize(MFI.getGDSSize()), DynLDSAlign(MFI.getDynLDSAlign()),
737	IsEntryFunction(MFI.isEntryFunction()), MemoryBound(MFI.isMemoryBound()),
738	WaveLimiter(MFI.needsWaveLimiter()),
739	HasSpilledSGPRs(MFI.hasSpilledSGPRs()),
740	HasSpilledVGPRs(MFI.hasSpilledVGPRs()),
741	NumWaveDispatchSGPRs(MFI.getNumWaveDispatchSGPRs()),
742	NumWaveDispatchVGPRs(MFI.getNumWaveDispatchVGPRs()),
743	HighBitsOf32BitAddress(MFI.get32BitAddressHighBits()),
744	Occupancy(MFI.getOccupancy()),
745	ScratchRSrcReg(regToString(Reg: MFI.getScratchRSrcReg(), TRI)),
746	FrameOffsetReg(regToString(Reg: MFI.getFrameOffsetReg(), TRI)),
747	StackPtrOffsetReg(regToString(Reg: MFI.getStackPtrOffsetReg(), TRI)),
748	BytesInStackArgArea(MFI.getBytesInStackArgArea()),
749	ReturnsVoid(MFI.returnsVoid()),
750	ArgInfo(convertArgumentInfo(ArgInfo: MFI.getArgInfo(), TRI)),
751	PSInputAddr(MFI.getPSInputAddr()), PSInputEnable(MFI.getPSInputEnable()),
752	MaxMemoryClusterDWords(MFI.getMaxMemoryClusterDWords()),
753	Mode (MFI.getMode()), HasInitWholeWave(MFI.hasInitWholeWave()),
754	IsWholeWaveFunction(MFI.isWholeWaveFunction()),
755	DynamicVGPRBlockSize(MFI.getDynamicVGPRBlockSize()),
756	ScratchReservedForDynamicVGPRs(MFI.getScratchReservedForDynamicVGPRs()),
757	NumKernargPreloadSGPRs(MFI.getNumKernargPreloadedSGPRs()) {
758	for (Register Reg : MFI.getSGPRSpillPhysVGPRs())
759	SpillPhysVGPRS.push_back(Elt: regToString(Reg, TRI));
760
761	for (Register Reg : MFI.getWWMReservedRegs())
762	WWMReservedRegs.push_back(Elt: regToString(Reg, TRI));
763
764	if (MFI.getLongBranchReservedReg())
765	LongBranchReservedReg = regToString(Reg: MFI.getLongBranchReservedReg(), TRI);
766	if (MFI.getVGPRForAGPRCopy())
767	VGPRForAGPRCopy = regToString(Reg: MFI.getVGPRForAGPRCopy(), TRI);
768
769	if (MFI.getSGPRForEXECCopy())
770	SGPRForEXECCopy = regToString(Reg: MFI.getSGPRForEXECCopy(), TRI);
771
772	auto SFI = MFI.getOptionalScavengeFI();
773	if (SFI)
774	ScavengeFI = yaml::FrameIndex (*SFI, MF.getFrameInfo());
775	}
776
777	void yaml::SIMachineFunctionInfo::mappingImpl(yaml::IO &YamlIO) {
778	MappingTraits<SIMachineFunctionInfo>::mapping(YamlIO, MFI&: *this);
779	}
780
781	bool SIMachineFunctionInfo::initializeBaseYamlFields(
782	const yaml::SIMachineFunctionInfo &YamlMFI, const MachineFunction &MF,
783	PerFunctionMIParsingState &PFS, SMDiagnostic &Error, SMRange &SourceRange) {
784	ExplicitKernArgSize = YamlMFI.ExplicitKernArgSize;
785	MaxKernArgAlign = YamlMFI.MaxKernArgAlign;
786	LDSSize = YamlMFI.LDSSize;
787	GDSSize = YamlMFI.GDSSize;
788	DynLDSAlign = YamlMFI.DynLDSAlign;
789	PSInputAddr = YamlMFI.PSInputAddr;
790	PSInputEnable = YamlMFI.PSInputEnable;
791	MaxMemoryClusterDWords = YamlMFI.MaxMemoryClusterDWords;
792	HighBitsOf32BitAddress = YamlMFI.HighBitsOf32BitAddress;
793	Occupancy = YamlMFI.Occupancy;
794	IsEntryFunction = YamlMFI.IsEntryFunction;
795	MemoryBound = YamlMFI.MemoryBound;
796	WaveLimiter = YamlMFI.WaveLimiter;
797	HasSpilledSGPRs = YamlMFI.HasSpilledSGPRs;
798	HasSpilledVGPRs = YamlMFI.HasSpilledVGPRs;
799	NumWaveDispatchSGPRs = YamlMFI.NumWaveDispatchSGPRs;
800	NumWaveDispatchVGPRs = YamlMFI.NumWaveDispatchVGPRs;
801	BytesInStackArgArea = YamlMFI.BytesInStackArgArea;
802	ReturnsVoid = YamlMFI.ReturnsVoid;
803	IsWholeWaveFunction = YamlMFI.IsWholeWaveFunction;
804
805	UserSGPRInfo.allocKernargPreloadSGPRs(NumSGPRs: YamlMFI.NumKernargPreloadSGPRs);
806
807	if (YamlMFI.ScavengeFI) {
808	auto FIOrErr = YamlMFI.ScavengeFI ->getFI(MFI: MF.getFrameInfo());
809	if (!FIOrErr) {
810	// Create a diagnostic for a the frame index.
811	const MemoryBuffer &Buffer =
812	*PFS.SM->getMemoryBuffer(i: PFS.SM->getMainFileID());
813
814	Error = SMDiagnostic (*PFS.SM, SMLoc (), Buffer.getBufferIdentifier(), `1`, `1`,
815	SourceMgr::DK_Error, toString(E: FIOrErr.takeError()),
816	"", {}, {});
817	SourceRange = YamlMFI.ScavengeFI ->SourceRange;
818	return true;
819	}
820	ScavengeFI = *FIOrErr;
821	} else {
822	ScavengeFI = std::nullopt;
823	}
824	return false;
825	}
826
827	bool SIMachineFunctionInfo::mayUseAGPRs(const Function &F) const {
828	auto [MinNumAGPR, MaxNumAGPR] =
829	AMDGPU::getIntegerPairAttribute(F, Name: "amdgpu-agpr-alloc", Default: {~`0u`, ~`0u`},
830	/OnlyFirstRequired=/true);
831	return MinNumAGPR != `0u`;
832	}
833

Browse the source code of llvm_projects/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp