//===- SIMachineFunctionInfo.cpp - SI Machine Function Info ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "SIMachineFunctionInfo.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIRegisterInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MIRParser/MIParser.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h"
#include <cassert>
#include <optional>
#include <vector>

enum { MAX_LANES = 64 };

using namespace llvm;

const GCNTargetMachine &getTM(const GCNSubtarget *STI) {
  const SITargetLowering *TLI = STI->getTargetLowering();
  return static_cast<const GCNTargetMachine &>(TLI->getTargetMachine());
}

SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
                                             const GCNSubtarget *STI)
    : AMDGPUMachineFunction(F, *STI), Mode(F, *STI), GWSResourcePSV(getTM(STI)),
      UserSGPRInfo(F, *STI), WorkGroupIDX(false), WorkGroupIDY(false),
      WorkGroupIDZ(false), WorkGroupInfo(false), LDSKernelId(false),
      PrivateSegmentWaveByteOffset(false), WorkItemIDX(false),
      WorkItemIDY(false), WorkItemIDZ(false), ImplicitArgPtr(false),
      GITPtrHigh(0xffffffff), HighBitsOf32BitAddress(0) {
  const GCNSubtarget &ST = *static_cast<const GCNSubtarget *>(STI);
  FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(F);
  WavesPerEU = ST.getWavesPerEU(F);
  MaxNumWorkGroups = ST.getMaxNumWorkGroups(F);
  assert(MaxNumWorkGroups.size() == 3);

  Occupancy = ST.computeOccupancy(F, getLDSSize());
  CallingConv::ID CC = F.getCallingConv();

  VRegFlags.reserve(1024);

  const bool IsKernel = CC == CallingConv::AMDGPU_KERNEL ||
                        CC == CallingConv::SPIR_KERNEL;

  if (IsKernel) {
    WorkGroupIDX = true;
    WorkItemIDX = true;
  } else if (CC == CallingConv::AMDGPU_PS) {
    PSInputAddr = AMDGPU::getInitialPSInputAddr(F);
  }

  MayNeedAGPRs = ST.hasMAIInsts();

  if (AMDGPU::isChainCC(CC)) {
    // Chain functions don't receive an SP from their caller, but are free to
    // set one up. For now, we can use s32 to match what amdgpu_gfx functions
    // would use if called, but this can be revisited.
    // FIXME: Only reserve this if we actually need it.
    StackPtrOffsetReg = AMDGPU::SGPR32;

    ScratchRSrcReg = AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51;

    ArgInfo.PrivateSegmentBuffer =
        ArgDescriptor::createRegister(ScratchRSrcReg);

    ImplicitArgPtr = false;
  } else if (!isEntryFunction()) {
    if (CC != CallingConv::AMDGPU_Gfx)
      ArgInfo = AMDGPUArgumentUsageInfo::FixedABIFunctionInfo;

    FrameOffsetReg = AMDGPU::SGPR33;
    StackPtrOffsetReg = AMDGPU::SGPR32;

    if (!ST.enableFlatScratch()) {
      // Non-entry functions have no special inputs for now; other registers
      // are required for scratch access.
      ScratchRSrcReg = AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3;

      ArgInfo.PrivateSegmentBuffer =
          ArgDescriptor::createRegister(ScratchRSrcReg);
    }

    if (!F.hasFnAttribute("amdgpu-no-implicitarg-ptr"))
      ImplicitArgPtr = true;
  } else {
    ImplicitArgPtr = false;
    MaxKernArgAlign = std::max(ST.getAlignmentForImplicitArgPtr(),
                               MaxKernArgAlign);

    if (ST.hasGFX90AInsts() &&
        ST.getMaxNumVGPRs(F) <= AMDGPU::VGPR_32RegClass.getNumRegs() &&
        !mayUseAGPRs(F))
      MayNeedAGPRs = false; // We will select all MAI with VGPR operands.
  }

  if (!AMDGPU::isGraphics(CC) ||
      ((CC == CallingConv::AMDGPU_CS || CC == CallingConv::AMDGPU_Gfx) &&
       ST.hasArchitectedSGPRs())) {
    if (IsKernel || !F.hasFnAttribute("amdgpu-no-workgroup-id-x"))
      WorkGroupIDX = true;

    if (!F.hasFnAttribute("amdgpu-no-workgroup-id-y"))
      WorkGroupIDY = true;

    if (!F.hasFnAttribute("amdgpu-no-workgroup-id-z"))
      WorkGroupIDZ = true;
  }

  if (!AMDGPU::isGraphics(CC)) {
    if (IsKernel || !F.hasFnAttribute("amdgpu-no-workitem-id-x"))
      WorkItemIDX = true;

    if (!F.hasFnAttribute("amdgpu-no-workitem-id-y") &&
        ST.getMaxWorkitemID(F, 1) != 0)
      WorkItemIDY = true;

    if (!F.hasFnAttribute("amdgpu-no-workitem-id-z") &&
        ST.getMaxWorkitemID(F, 2) != 0)
      WorkItemIDZ = true;

    if (!IsKernel && !F.hasFnAttribute("amdgpu-no-lds-kernel-id"))
      LDSKernelId = true;
  }

  if (isEntryFunction()) {
    // X, XY, and XYZ are the only supported combinations, so make sure Y is
    // enabled if Z is.
    if (WorkItemIDZ)
      WorkItemIDY = true;

    if (!ST.flatScratchIsArchitected()) {
      PrivateSegmentWaveByteOffset = true;

      // HS and GS always have the scratch wave offset in SGPR5 on GFX9.
      if (ST.getGeneration() >= AMDGPUSubtarget::GFX9 &&
          (CC == CallingConv::AMDGPU_HS || CC == CallingConv::AMDGPU_GS))
        ArgInfo.PrivateSegmentWaveByteOffset =
            ArgDescriptor::createRegister(AMDGPU::SGPR5);
    }
  }

  Attribute A = F.getFnAttribute("amdgpu-git-ptr-high");
  StringRef S = A.getValueAsString();
  if (!S.empty())
    S.consumeInteger(0, GITPtrHigh);

  A = F.getFnAttribute("amdgpu-32bit-address-high-bits");
  S = A.getValueAsString();
  if (!S.empty())
    S.consumeInteger(0, HighBitsOf32BitAddress);

  // On GFX908, in order to guarantee copying between AGPRs, we need a scratch
  // VGPR available at all times. For now, reserve the highest available VGPR.
  // After RA, shift it to the lowest available unused VGPR if one exists.
  if (ST.hasMAIInsts() && !ST.hasGFX90AInsts()) {
    VGPRForAGPRCopy =
        AMDGPU::VGPR_32RegClass.getRegister(ST.getMaxNumVGPRs(F) - 1);
  }
}

MachineFunctionInfo *SIMachineFunctionInfo::clone(
    BumpPtrAllocator &Allocator, MachineFunction &DestMF,
    const DenseMap<MachineBasicBlock *, MachineBasicBlock *> &Src2DstMBB)
    const {
  return DestMF.cloneInfo<SIMachineFunctionInfo>(*this);
}

void SIMachineFunctionInfo::limitOccupancy(const MachineFunction &MF) {
  limitOccupancy(getMaxWavesPerEU());
  const GCNSubtarget& ST = MF.getSubtarget<GCNSubtarget>();
  limitOccupancy(ST.getOccupancyWithLocalMemSize(getLDSSize(),
                                                 MF.getFunction()));
}

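// Each of the following helpers assigns the next free user SGPR(s) to a
// preloaded kernel argument, records the assignment in ArgInfo, and advances
// NumUserSGPRs by the width of the argument in dwords.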
Register SIMachineFunctionInfo::addPrivateSegmentBuffer(
    const SIRegisterInfo &TRI) {
  ArgInfo.PrivateSegmentBuffer =
      ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
          getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SGPR_128RegClass));
  NumUserSGPRs += 4;
  return ArgInfo.PrivateSegmentBuffer.getRegister();
}

Register SIMachineFunctionInfo::addDispatchPtr(const SIRegisterInfo &TRI) {
  ArgInfo.DispatchPtr = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
      getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
  NumUserSGPRs += 2;
  return ArgInfo.DispatchPtr.getRegister();
}

Register SIMachineFunctionInfo::addQueuePtr(const SIRegisterInfo &TRI) {
  ArgInfo.QueuePtr = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
      getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
  NumUserSGPRs += 2;
  return ArgInfo.QueuePtr.getRegister();
}

Register SIMachineFunctionInfo::addKernargSegmentPtr(const SIRegisterInfo &TRI) {
  ArgInfo.KernargSegmentPtr
    = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
        getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
  NumUserSGPRs += 2;
  return ArgInfo.KernargSegmentPtr.getRegister();
}

Register SIMachineFunctionInfo::addDispatchID(const SIRegisterInfo &TRI) {
  ArgInfo.DispatchID = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
      getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
  NumUserSGPRs += 2;
  return ArgInfo.DispatchID.getRegister();
}

Register SIMachineFunctionInfo::addFlatScratchInit(const SIRegisterInfo &TRI) {
  ArgInfo.FlatScratchInit = ArgDescriptor::createRegister(
      TRI.getMatchingSuperReg(getNextUserSGPR(), AMDGPU::sub0,
                              &AMDGPU::SReg_64RegClass));
  NumUserSGPRs += 2;
  return ArgInfo.FlatScratchInit.getRegister();
}

Register SIMachineFunctionInfo::addPrivateSegmentSize(const SIRegisterInfo &TRI) {
  ArgInfo.PrivateSegmentSize = ArgDescriptor::createRegister(getNextUserSGPR());
  NumUserSGPRs += 1;
  return ArgInfo.PrivateSegmentSize.getRegister();
}

Register SIMachineFunctionInfo::addImplicitBufferPtr(const SIRegisterInfo &TRI) {
  ArgInfo.ImplicitBufferPtr = ArgDescriptor::createRegister(
      TRI.getMatchingSuperReg(getNextUserSGPR(), AMDGPU::sub0,
                              &AMDGPU::SReg_64RegClass));
  NumUserSGPRs += 2;
  return ArgInfo.ImplicitBufferPtr.getRegister();
}

Register SIMachineFunctionInfo::addLDSKernelId() {
  ArgInfo.LDSKernelId = ArgDescriptor::createRegister(getNextUserSGPR());
  NumUserSGPRs += 1;
  return ArgInfo.LDSKernelId.getRegister();
}

SmallVectorImpl<MCRegister> *SIMachineFunctionInfo::addPreloadedKernArg(
    const SIRegisterInfo &TRI, const TargetRegisterClass *RC,
    unsigned AllocSizeDWord, int KernArgIdx, int PaddingSGPRs) {
  assert(!ArgInfo.PreloadKernArgs.count(KernArgIdx) &&
         "Preload kernel argument allocated twice.");
  NumUserSGPRs += PaddingSGPRs;
  // If the available register tuples are aligned with the kernarg to be
  // preloaded, use that register; otherwise we need to use a set of SGPRs
  // and merge them.
  Register PreloadReg =
      TRI.getMatchingSuperReg(getNextUserSGPR(), AMDGPU::sub0, RC);
  if (PreloadReg &&
      (RC == &AMDGPU::SReg_32RegClass || RC == &AMDGPU::SReg_64RegClass)) {
    ArgInfo.PreloadKernArgs[KernArgIdx].Regs.push_back(PreloadReg);
    NumUserSGPRs += AllocSizeDWord;
  } else {
    for (unsigned I = 0; I < AllocSizeDWord; ++I) {
      ArgInfo.PreloadKernArgs[KernArgIdx].Regs.push_back(getNextUserSGPR());
      NumUserSGPRs++;
    }
  }

  // Track the actual number of SGPRs that HW will preload to.
  UserSGPRInfo.allocKernargPreloadSGPRs(AllocSizeDWord + PaddingSGPRs);
  return &ArgInfo.PreloadKernArgs[KernArgIdx].Regs;
}

void SIMachineFunctionInfo::allocateWWMSpill(MachineFunction &MF, Register VGPR,
                                             uint64_t Size, Align Alignment) {
  // Skip if it is an entry function or the register is already added.
  if (isEntryFunction() || WWMSpills.count(VGPR))
    return;

  // Skip if this is a function with the amdgpu_cs_chain or
  // amdgpu_cs_chain_preserve calling convention and this is a scratch register.
  // We never need to allocate a spill for these because we don't even need to
  // restore the inactive lanes for them (they're scratchier than the usual
  // scratch registers).
  if (isChainFunction() && SIRegisterInfo::isChainScratchRegister(VGPR))
    return;

  WWMSpills.insert(std::make_pair(
      VGPR, MF.getFrameInfo().CreateSpillStackObject(Size, Alignment)));
}

// Separate out the callee-saved and scratch registers.
void SIMachineFunctionInfo::splitWWMSpillRegisters(
    MachineFunction &MF,
    SmallVectorImpl<std::pair<Register, int>> &CalleeSavedRegs,
    SmallVectorImpl<std::pair<Register, int>> &ScratchRegs) const {
  const MCPhysReg *CSRegs = MF.getRegInfo().getCalleeSavedRegs();
  for (auto &Reg : WWMSpills) {
    if (isCalleeSavedReg(CSRegs, Reg.first))
      CalleeSavedRegs.push_back(Reg);
    else
      ScratchRegs.push_back(Reg);
  }
}

bool SIMachineFunctionInfo::isCalleeSavedReg(const MCPhysReg *CSRegs,
                                             MCPhysReg Reg) const {
  for (unsigned I = 0; CSRegs[I]; ++I) {
    if (CSRegs[I] == Reg)
      return true;
  }

  return false;
}

void SIMachineFunctionInfo::shiftSpillPhysVGPRsToLowestRange(
    MachineFunction &MF) {
  const SIRegisterInfo *TRI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  for (Register &Reg : SpillPhysVGPRs) {
    Register NewReg =
        TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF);
    if (!NewReg || NewReg >= Reg)
      break;

    MRI.replaceRegWith(Reg, NewReg);

    // Update various tables with the new VGPR.
    WWMReservedRegs.remove(Reg);
    WWMReservedRegs.insert(NewReg);
    WWMSpills.insert(std::make_pair(NewReg, WWMSpills[Reg]));
    WWMSpills.erase(Reg);

    for (MachineBasicBlock &MBB : MF) {
      MBB.removeLiveIn(Reg);
      MBB.sortUniqueLiveIns();
    }

    Reg = NewReg;
  }
}

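// SGPR spills are placed into individual lanes of a VGPR: each spilled 32-bit
// SGPR occupies one lane. The two helpers below pick the VGPR to hold the
// lanes, either a virtual VGPR that the register allocator assigns later, or a
// reserved physical WWM VGPR (used for CSR and prolog/epilog SGPR spills).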
bool SIMachineFunctionInfo::allocateVirtualVGPRForSGPRSpills(
    MachineFunction &MF, int FI, unsigned LaneIndex) {
  MachineRegisterInfo &MRI = MF.getRegInfo();
  Register LaneVGPR;
  if (!LaneIndex) {
    LaneVGPR = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    SpillVGPRs.push_back(LaneVGPR);
  } else {
    LaneVGPR = SpillVGPRs.back();
  }

  SGPRSpillsToVirtualVGPRLanes[FI].emplace_back(LaneVGPR, LaneIndex);
  return true;
}

bool SIMachineFunctionInfo::allocatePhysicalVGPRForSGPRSpills(
    MachineFunction &MF, int FI, unsigned LaneIndex, bool IsPrologEpilog) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  Register LaneVGPR;
  if (!LaneIndex) {
    // Find the highest available register if called before RA to ensure the
    // lowest registers are available for allocation. The LaneVGPR, in that
    // case, will be shifted back to the lowest range after VGPR allocation.
    LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF,
                                       !IsPrologEpilog);
    if (LaneVGPR == AMDGPU::NoRegister) {
      // We have no VGPRs left for spilling SGPRs. Reset because we will not
      // partially spill the SGPR to VGPRs.
      SGPRSpillsToPhysicalVGPRLanes.erase(FI);
      return false;
    }

    allocateWWMSpill(MF, LaneVGPR);
    reserveWWMRegister(LaneVGPR);
    for (MachineBasicBlock &MBB : MF) {
      MBB.addLiveIn(LaneVGPR);
      MBB.sortUniqueLiveIns();
    }
    SpillPhysVGPRs.push_back(LaneVGPR);
  } else {
    LaneVGPR = SpillPhysVGPRs.back();
  }

  SGPRSpillsToPhysicalVGPRLanes[FI].emplace_back(LaneVGPR, LaneIndex);
  return true;
}

bool SIMachineFunctionInfo::allocateSGPRSpillToVGPRLane(
    MachineFunction &MF, int FI, bool SpillToPhysVGPRLane,
    bool IsPrologEpilog) {
  std::vector<SIRegisterInfo::SpilledReg> &SpillLanes =
      SpillToPhysVGPRLane ? SGPRSpillsToPhysicalVGPRLanes[FI]
                          : SGPRSpillsToVirtualVGPRLanes[FI];

  // This has already been allocated.
  if (!SpillLanes.empty())
    return true;

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  unsigned WaveSize = ST.getWavefrontSize();

  unsigned Size = FrameInfo.getObjectSize(FI);
  unsigned NumLanes = Size / 4;

  if (NumLanes > WaveSize)
    return false;

  assert(Size >= 4 && "invalid sgpr spill size");
  assert(ST.getRegisterInfo()->spillSGPRToVGPR() &&
         "not spilling SGPRs to VGPRs");

  unsigned &NumSpillLanes = SpillToPhysVGPRLane ? NumPhysicalVGPRSpillLanes
                                                : NumVirtualVGPRSpillLanes;

  for (unsigned I = 0; I < NumLanes; ++I, ++NumSpillLanes) {
    unsigned LaneIndex = (NumSpillLanes % WaveSize);

    bool Allocated = SpillToPhysVGPRLane
                         ? allocatePhysicalVGPRForSGPRSpills(MF, FI, LaneIndex,
                                                             IsPrologEpilog)
                         : allocateVirtualVGPRForSGPRSpills(MF, FI, LaneIndex);
    if (!Allocated) {
      NumSpillLanes -= I;
      return false;
    }
  }

  return true;
}

/// Reserve AGPRs or VGPRs to support spilling for FrameIndex \p FI.
/// Either an AGPR is spilled to a VGPR or vice versa.
/// Returns true if \p FI can be eliminated completely.
bool SIMachineFunctionInfo::allocateVGPRSpillToAGPR(MachineFunction &MF,
                                                    int FI,
                                                    bool isAGPRtoVGPR) {
  MachineRegisterInfo &MRI = MF.getRegInfo();
  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();

  assert(ST.hasMAIInsts() && FrameInfo.isSpillSlotObjectIndex(FI));

  auto &Spill = VGPRToAGPRSpills[FI];

  // This has already been allocated.
  if (!Spill.Lanes.empty())
    return Spill.FullyAllocated;

  unsigned Size = FrameInfo.getObjectSize(FI);
  unsigned NumLanes = Size / 4;
  Spill.Lanes.resize(NumLanes, AMDGPU::NoRegister);

  const TargetRegisterClass &RC =
      isAGPRtoVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::AGPR_32RegClass;
  auto Regs = RC.getRegisters();

  auto &SpillRegs = isAGPRtoVGPR ? SpillAGPR : SpillVGPR;
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  Spill.FullyAllocated = true;

  // FIXME: Move allocation logic out of MachineFunctionInfo and initialize
  // once.
  BitVector OtherUsedRegs;
  OtherUsedRegs.resize(TRI->getNumRegs());

  const uint32_t *CSRMask =
      TRI->getCallPreservedMask(MF, MF.getFunction().getCallingConv());
  if (CSRMask)
    OtherUsedRegs.setBitsInMask(CSRMask);

  // TODO: Should include register tuples, but doesn't matter with current
  // usage.
  for (MCPhysReg Reg : SpillAGPR)
    OtherUsedRegs.set(Reg);
  for (MCPhysReg Reg : SpillVGPR)
    OtherUsedRegs.set(Reg);

  SmallVectorImpl<MCPhysReg>::const_iterator NextSpillReg = Regs.begin();
  for (int I = NumLanes - 1; I >= 0; --I) {
    NextSpillReg = std::find_if(
        NextSpillReg, Regs.end(), [&MRI, &OtherUsedRegs](MCPhysReg Reg) {
          return MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg) &&
                 !OtherUsedRegs[Reg];
        });

    if (NextSpillReg == Regs.end()) { // Registers exhausted
      Spill.FullyAllocated = false;
      break;
    }

    OtherUsedRegs.set(*NextSpillReg);
    SpillRegs.push_back(*NextSpillReg);
    MRI.reserveReg(*NextSpillReg, TRI);
    Spill.Lanes[I] = *NextSpillReg++;
  }

  return Spill.FullyAllocated;
}

bool SIMachineFunctionInfo::removeDeadFrameIndices(
    MachineFrameInfo &MFI, bool ResetSGPRSpillStackIDs) {
  // Remove dead frame indices from the function frame, but keep FP & BP since
  // their spills haven't been inserted yet. Also remove the frame indices from
  // the `SGPRSpillsToVirtualVGPRLanes` data structure; otherwise, later
  // pass(es) like "stack slot coloring" could re-map the freed frame indices
  // and cause unexpected side effects and bugs.
  for (auto &R : make_early_inc_range(SGPRSpillsToVirtualVGPRLanes)) {
    MFI.RemoveStackObject(R.first);
    SGPRSpillsToVirtualVGPRLanes.erase(R.first);
  }

  // Remove the dead frame indices of CSR SGPRs which are spilled to physical
  // VGPR lanes during the SILowerSGPRSpills pass.
  if (!ResetSGPRSpillStackIDs) {
    for (auto &R : make_early_inc_range(SGPRSpillsToPhysicalVGPRLanes)) {
      MFI.RemoveStackObject(R.first);
      SGPRSpillsToPhysicalVGPRLanes.erase(R.first);
    }
  }
  bool HaveSGPRToMemory = false;

  if (ResetSGPRSpillStackIDs) {
    // All other SGPRs must be allocated on the default stack, so reset the
    // stack ID.
    for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd(); I != E;
         ++I) {
      if (!checkIndexInPrologEpilogSGPRSpills(I)) {
        if (MFI.getStackID(I) == TargetStackID::SGPRSpill) {
          MFI.setStackID(I, TargetStackID::Default);
          HaveSGPRToMemory = true;
        }
      }
    }
  }

  for (auto &R : VGPRToAGPRSpills) {
    if (R.second.IsDead)
      MFI.RemoveStackObject(R.first);
  }

  return HaveSGPRToMemory;
}

int SIMachineFunctionInfo::getScavengeFI(MachineFrameInfo &MFI,
                                         const SIRegisterInfo &TRI) {
  if (ScavengeFI)
    return *ScavengeFI;

  ScavengeFI =
      MFI.CreateStackObject(TRI.getSpillSize(AMDGPU::SGPR_32RegClass),
                            TRI.getSpillAlign(AMDGPU::SGPR_32RegClass), false);
  return *ScavengeFI;
}

MCPhysReg SIMachineFunctionInfo::getNextUserSGPR() const {
  assert(NumSystemSGPRs == 0 && "System SGPRs must be added after user SGPRs");
  return AMDGPU::SGPR0 + NumUserSGPRs;
}

MCPhysReg SIMachineFunctionInfo::getNextSystemSGPR() const {
  return AMDGPU::SGPR0 + NumUserSGPRs + NumSystemSGPRs;
}

void SIMachineFunctionInfo::MRI_NoteNewVirtualRegister(Register Reg) {
  VRegFlags.grow(Reg);
}

void SIMachineFunctionInfo::MRI_NoteCloneVirtualRegister(Register NewReg,
                                                         Register SrcReg) {
  VRegFlags.grow(NewReg);
  VRegFlags[NewReg] = VRegFlags[SrcReg];
}

Register
SIMachineFunctionInfo::getGITPtrLoReg(const MachineFunction &MF) const {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  if (!ST.isAmdPalOS())
    return Register();
  Register GitPtrLo = AMDGPU::SGPR0; // Low GIT address passed in
  if (ST.hasMergedShaders()) {
    switch (MF.getFunction().getCallingConv()) {
    case CallingConv::AMDGPU_HS:
    case CallingConv::AMDGPU_GS:
      // Low GIT address is passed in s8 rather than s0 for an LS+HS or
      // ES+GS merged shader on gfx9+.
      GitPtrLo = AMDGPU::SGPR8;
      return GitPtrLo;
    default:
      return GitPtrLo;
    }
  }
  return GitPtrLo;
}

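// MIR serialization support: convert the target-specific machine function info
// to and from its YAML representation (yaml::SIMachineFunctionInfo).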
static yaml::StringValue regToString(Register Reg,
                                     const TargetRegisterInfo &TRI) {
  yaml::StringValue Dest;
  {
    raw_string_ostream OS(Dest.Value);
    OS << printReg(Reg, &TRI);
  }
  return Dest;
}

static std::optional<yaml::SIArgumentInfo>
convertArgumentInfo(const AMDGPUFunctionArgInfo &ArgInfo,
                    const TargetRegisterInfo &TRI) {
  yaml::SIArgumentInfo AI;

  auto convertArg = [&](std::optional<yaml::SIArgument> &A,
                        const ArgDescriptor &Arg) {
    if (!Arg)
      return false;

    // Create a register or stack argument.
    yaml::SIArgument SA = yaml::SIArgument::createArgument(Arg.isRegister());
    if (Arg.isRegister()) {
      raw_string_ostream OS(SA.RegisterName.Value);
      OS << printReg(Arg.getRegister(), &TRI);
    } else
      SA.StackOffset = Arg.getStackOffset();
    // Check and update the optional mask.
    if (Arg.isMasked())
      SA.Mask = Arg.getMask();

    A = SA;
    return true;
  };

  // TODO: Need to serialize kernarg preloads.
  bool Any = false;
  Any |= convertArg(AI.PrivateSegmentBuffer, ArgInfo.PrivateSegmentBuffer);
  Any |= convertArg(AI.DispatchPtr, ArgInfo.DispatchPtr);
  Any |= convertArg(AI.QueuePtr, ArgInfo.QueuePtr);
  Any |= convertArg(AI.KernargSegmentPtr, ArgInfo.KernargSegmentPtr);
  Any |= convertArg(AI.DispatchID, ArgInfo.DispatchID);
  Any |= convertArg(AI.FlatScratchInit, ArgInfo.FlatScratchInit);
  Any |= convertArg(AI.LDSKernelId, ArgInfo.LDSKernelId);
  Any |= convertArg(AI.PrivateSegmentSize, ArgInfo.PrivateSegmentSize);
  Any |= convertArg(AI.WorkGroupIDX, ArgInfo.WorkGroupIDX);
  Any |= convertArg(AI.WorkGroupIDY, ArgInfo.WorkGroupIDY);
  Any |= convertArg(AI.WorkGroupIDZ, ArgInfo.WorkGroupIDZ);
  Any |= convertArg(AI.WorkGroupInfo, ArgInfo.WorkGroupInfo);
  Any |= convertArg(AI.PrivateSegmentWaveByteOffset,
                    ArgInfo.PrivateSegmentWaveByteOffset);
  Any |= convertArg(AI.ImplicitArgPtr, ArgInfo.ImplicitArgPtr);
  Any |= convertArg(AI.ImplicitBufferPtr, ArgInfo.ImplicitBufferPtr);
  Any |= convertArg(AI.WorkItemIDX, ArgInfo.WorkItemIDX);
  Any |= convertArg(AI.WorkItemIDY, ArgInfo.WorkItemIDY);
  Any |= convertArg(AI.WorkItemIDZ, ArgInfo.WorkItemIDZ);

  if (Any)
    return AI;

  return std::nullopt;
}

yaml::SIMachineFunctionInfo::SIMachineFunctionInfo(
    const llvm::SIMachineFunctionInfo &MFI, const TargetRegisterInfo &TRI,
    const llvm::MachineFunction &MF)
    : ExplicitKernArgSize(MFI.getExplicitKernArgSize()),
      MaxKernArgAlign(MFI.getMaxKernArgAlign()), LDSSize(MFI.getLDSSize()),
      GDSSize(MFI.getGDSSize()),
      DynLDSAlign(MFI.getDynLDSAlign()), IsEntryFunction(MFI.isEntryFunction()),
      NoSignedZerosFPMath(MFI.hasNoSignedZerosFPMath()),
      MemoryBound(MFI.isMemoryBound()), WaveLimiter(MFI.needsWaveLimiter()),
      HasSpilledSGPRs(MFI.hasSpilledSGPRs()),
      HasSpilledVGPRs(MFI.hasSpilledVGPRs()),
      HighBitsOf32BitAddress(MFI.get32BitAddressHighBits()),
      Occupancy(MFI.getOccupancy()),
      ScratchRSrcReg(regToString(MFI.getScratchRSrcReg(), TRI)),
      FrameOffsetReg(regToString(MFI.getFrameOffsetReg(), TRI)),
      StackPtrOffsetReg(regToString(MFI.getStackPtrOffsetReg(), TRI)),
      BytesInStackArgArea(MFI.getBytesInStackArgArea()),
      ReturnsVoid(MFI.returnsVoid()),
      ArgInfo(convertArgumentInfo(MFI.getArgInfo(), TRI)),
      PSInputAddr(MFI.getPSInputAddr()),
      PSInputEnable(MFI.getPSInputEnable()),
      Mode(MFI.getMode()) {
  for (Register Reg : MFI.getWWMReservedRegs())
    WWMReservedRegs.push_back(regToString(Reg, TRI));

  if (MFI.getLongBranchReservedReg())
    LongBranchReservedReg = regToString(MFI.getLongBranchReservedReg(), TRI);
  if (MFI.getVGPRForAGPRCopy())
    VGPRForAGPRCopy = regToString(MFI.getVGPRForAGPRCopy(), TRI);

  if (MFI.getSGPRForEXECCopy())
    SGPRForEXECCopy = regToString(MFI.getSGPRForEXECCopy(), TRI);

  auto SFI = MFI.getOptionalScavengeFI();
  if (SFI)
    ScavengeFI = yaml::FrameIndex(*SFI, MF.getFrameInfo());
}

void yaml::SIMachineFunctionInfo::mappingImpl(yaml::IO &YamlIO) {
  MappingTraits<SIMachineFunctionInfo>::mapping(YamlIO, *this);
}

bool SIMachineFunctionInfo::initializeBaseYamlFields(
    const yaml::SIMachineFunctionInfo &YamlMFI, const MachineFunction &MF,
    PerFunctionMIParsingState &PFS, SMDiagnostic &Error, SMRange &SourceRange) {
  ExplicitKernArgSize = YamlMFI.ExplicitKernArgSize;
  MaxKernArgAlign = YamlMFI.MaxKernArgAlign;
  LDSSize = YamlMFI.LDSSize;
  GDSSize = YamlMFI.GDSSize;
  DynLDSAlign = YamlMFI.DynLDSAlign;
  PSInputAddr = YamlMFI.PSInputAddr;
  PSInputEnable = YamlMFI.PSInputEnable;
  HighBitsOf32BitAddress = YamlMFI.HighBitsOf32BitAddress;
  Occupancy = YamlMFI.Occupancy;
  IsEntryFunction = YamlMFI.IsEntryFunction;
  NoSignedZerosFPMath = YamlMFI.NoSignedZerosFPMath;
  MemoryBound = YamlMFI.MemoryBound;
  WaveLimiter = YamlMFI.WaveLimiter;
  HasSpilledSGPRs = YamlMFI.HasSpilledSGPRs;
  HasSpilledVGPRs = YamlMFI.HasSpilledVGPRs;
  BytesInStackArgArea = YamlMFI.BytesInStackArgArea;
  ReturnsVoid = YamlMFI.ReturnsVoid;

  if (YamlMFI.ScavengeFI) {
    auto FIOrErr = YamlMFI.ScavengeFI->getFI(MF.getFrameInfo());
    if (!FIOrErr) {
      // Create a diagnostic for the frame index.
      const MemoryBuffer &Buffer =
          *PFS.SM->getMemoryBuffer(PFS.SM->getMainFileID());

      Error = SMDiagnostic(*PFS.SM, SMLoc(), Buffer.getBufferIdentifier(), 1, 1,
                           SourceMgr::DK_Error, toString(FIOrErr.takeError()),
                           "", std::nullopt, std::nullopt);
      SourceRange = YamlMFI.ScavengeFI->SourceRange;
      return true;
    }
    ScavengeFI = *FIOrErr;
  } else {
    ScavengeFI = std::nullopt;
  }
  return false;
}

bool SIMachineFunctionInfo::mayUseAGPRs(const Function &F) const {
  return !F.hasFnAttribute("amdgpu-no-agpr");
}

bool SIMachineFunctionInfo::usesAGPRs(const MachineFunction &MF) const {
  if (UsesAGPRs)
    return *UsesAGPRs;

  if (!mayNeedAGPRs()) {
    UsesAGPRs = false;
    return false;
  }

  if (!AMDGPU::isEntryFunctionCC(MF.getFunction().getCallingConv()) ||
      MF.getFrameInfo().hasCalls()) {
    UsesAGPRs = true;
    return true;
  }

  const MachineRegisterInfo &MRI = MF.getRegInfo();

  for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
    const Register Reg = Register::index2VirtReg(I);
    const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
    if (RC && SIRegisterInfo::isAGPRClass(RC)) {
      UsesAGPRs = true;
      return true;
    }
    if (!RC && !MRI.use_empty(Reg) && MRI.getType(Reg).isValid()) {
      // Defer caching UsesAGPRs; the function might not have been regbank
      // selected yet.
      return true;
    }
  }

  for (MCRegister Reg : AMDGPU::AGPR_32RegClass) {
    if (MRI.isPhysRegUsed(Reg)) {
      UsesAGPRs = true;
      return true;
    }
  }

  UsesAGPRs = false;
  return false;
}