1//===- SIMachineFunctionInfo.cpp - SI Machine Function Info ---------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#include "SIMachineFunctionInfo.h"
10#include "AMDGPUSubtarget.h"
11#include "GCNSubtarget.h"
12#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
13#include "SIRegisterInfo.h"
14#include "Utils/AMDGPUBaseInfo.h"
15#include "llvm/CodeGen/LiveIntervals.h"
16#include "llvm/CodeGen/MIRParser/MIParser.h"
17#include "llvm/CodeGen/MachineBasicBlock.h"
18#include "llvm/CodeGen/MachineFrameInfo.h"
19#include "llvm/CodeGen/MachineFunction.h"
20#include "llvm/CodeGen/MachineRegisterInfo.h"
21#include "llvm/IR/CallingConv.h"
22#include "llvm/IR/DiagnosticInfo.h"
23#include "llvm/IR/Function.h"
24#include <cassert>
25#include <optional>
26#include <vector>
27
// NOTE(review): MAX_LANES is not referenced anywhere in the portion of this
// file visible here; presumably the maximum number of lanes in a wavefront --
// confirm before relying on it or removing it.
enum { MAX_LANES = 64 };
29
30using namespace llvm;
31
32// TODO -- delete this flag once we have more robust mechanisms to allocate the
33// optimal RC for Opc and Dest of MFMA. In particular, there are high RP cases
34// where it is better to produce the VGPR form (e.g. if there are VGPR users
35// of the MFMA result).
36static cl::opt<bool, true> MFMAVGPRFormOpt(
37 "amdgpu-mfma-vgpr-form",
38 cl::desc("Whether to force use VGPR for Opc and Dest of MFMA. If "
39 "unspecified, default to compiler heuristics"),
40 cl::location(L&: SIMachineFunctionInfo::MFMAVGPRForm), cl::init(Val: true),
41 cl::Hidden);
42
43const GCNTargetMachine &getTM(const GCNSubtarget *STI) {
44 const SITargetLowering *TLI = STI->getTargetLowering();
45 return static_cast<const GCNTargetMachine &>(TLI->getTargetMachine());
46}
47
// Definition of the static flag that backs the "amdgpu-mfma-vgpr-form"
// command-line option above (the option writes here through cl::location).
bool SIMachineFunctionInfo::MFMAVGPRForm = false;
49
50SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
51 const GCNSubtarget *STI)
52 : AMDGPUMachineFunctionInfo(F, *STI), Mode(F, *STI),
53 GWSResourcePSV(getTM(STI)), UserSGPRInfo(F, *STI), WorkGroupIDX(false),
54 WorkGroupIDY(false), WorkGroupIDZ(false), WorkGroupInfo(false),
55 LDSKernelId(false), PrivateSegmentWaveByteOffset(false),
56 WorkItemIDX(false), WorkItemIDY(false), WorkItemIDZ(false),
57 ImplicitArgPtr(false), GITPtrHigh(0xffffffff), HighBitsOf32BitAddress(0),
58 IsWholeWaveFunction(F.getCallingConv() ==
59 CallingConv::AMDGPU_Gfx_WholeWave) {
60 const GCNSubtarget &ST = *STI;
61 FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(F);
62 WavesPerEU = ST.getWavesPerEU(F);
63 MaxNumWorkGroups = ST.getMaxNumWorkGroups(F);
64 assert(MaxNumWorkGroups.size() == 3);
65
66 // Temporarily check both the attribute and the subtarget feature, until the
67 // latter is completely removed.
68 DynamicVGPRBlockSize = AMDGPU::getDynamicVGPRBlockSize(F);
69 if (DynamicVGPRBlockSize == 0 && ST.isDynamicVGPREnabled())
70 DynamicVGPRBlockSize = ST.getDynamicVGPRBlockSize();
71
72 Occupancy = ST.computeOccupancy(F, LDSSize: getLDSSize()).second;
73 CallingConv::ID CC = F.getCallingConv();
74
75 VRegFlags.reserve(S: 1024);
76
77 const bool IsKernel = CC == CallingConv::AMDGPU_KERNEL ||
78 CC == CallingConv::SPIR_KERNEL;
79
80 if (IsKernel) {
81 WorkGroupIDX = true;
82 WorkItemIDX = true;
83 } else if (CC == CallingConv::AMDGPU_PS) {
84 PSInputAddr = AMDGPU::getInitialPSInputAddr(F);
85 }
86
87 if (ST.hasGFX90AInsts()) {
88 // FIXME: Extract logic out of getMaxNumVectorRegs; we need to apply the
89 // allocation granule and clamping.
90 auto [MinNumAGPRAttr, MaxNumAGPRAttr] =
91 AMDGPU::getIntegerPairAttribute(F, Name: "amdgpu-agpr-alloc", Default: {~0u, ~0u},
92 /*OnlyFirstRequired=*/true);
93 MinNumAGPRs = MinNumAGPRAttr;
94 }
95
96 if (!isEntryFunction()) {
97 if (CC != CallingConv::AMDGPU_Gfx &&
98 CC != CallingConv::AMDGPU_Gfx_WholeWave)
99 ArgInfo = AMDGPUFunctionArgInfo::FixedABIFunctionInfo;
100
101 FrameOffsetReg = AMDGPU::SGPR33;
102 StackPtrOffsetReg = AMDGPU::SGPR32;
103
104 if (!ST.hasFlatScratchEnabled()) {
105 // Non-entry functions have no special inputs for now, other registers
106 // required for scratch access.
107 ScratchRSrcReg = AMDGPU::isChainCC(CC)
108 ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
109 : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3;
110
111 ArgInfo.PrivateSegmentBuffer =
112 ArgDescriptor::createRegister(Reg: ScratchRSrcReg);
113 }
114
115 if (!F.hasFnAttribute(Kind: "amdgpu-no-implicitarg-ptr") &&
116 !AMDGPU::isChainCC(CC))
117 ImplicitArgPtr = true;
118 } else {
119 ImplicitArgPtr = false;
120 MaxKernArgAlign =
121 std::max(a: ST.getAlignmentForImplicitArgPtr(), b: MaxKernArgAlign);
122 }
123
124 if (!AMDGPU::isGraphics(CC) ||
125 ((CC == CallingConv::AMDGPU_CS || CC == CallingConv::AMDGPU_Gfx) &&
126 ST.hasArchitectedSGPRs())) {
127 if (IsKernel || !F.hasFnAttribute(Kind: "amdgpu-no-workgroup-id-x") ||
128 !F.hasFnAttribute(Kind: "amdgpu-no-cluster-id-x"))
129 WorkGroupIDX = true;
130
131 if (!F.hasFnAttribute(Kind: "amdgpu-no-workgroup-id-y") ||
132 !F.hasFnAttribute(Kind: "amdgpu-no-cluster-id-y"))
133 WorkGroupIDY = true;
134
135 if (!F.hasFnAttribute(Kind: "amdgpu-no-workgroup-id-z") ||
136 !F.hasFnAttribute(Kind: "amdgpu-no-cluster-id-z"))
137 WorkGroupIDZ = true;
138 }
139
140 if (!AMDGPU::isGraphics(CC)) {
141 if (IsKernel || !F.hasFnAttribute(Kind: "amdgpu-no-workitem-id-x"))
142 WorkItemIDX = true;
143
144 if (!F.hasFnAttribute(Kind: "amdgpu-no-workitem-id-y") &&
145 ST.getMaxWorkitemID(Kernel: F, Dimension: 1) != 0)
146 WorkItemIDY = true;
147
148 if (!F.hasFnAttribute(Kind: "amdgpu-no-workitem-id-z") &&
149 ST.getMaxWorkitemID(Kernel: F, Dimension: 2) != 0)
150 WorkItemIDZ = true;
151
152 if (!IsKernel && !F.hasFnAttribute(Kind: "amdgpu-no-lds-kernel-id"))
153 LDSKernelId = true;
154 }
155
156 if (isEntryFunction()) {
157 // X, XY, and XYZ are the only supported combinations, so make sure Y is
158 // enabled if Z is.
159 if (WorkItemIDZ)
160 WorkItemIDY = true;
161
162 if (!ST.hasArchitectedFlatScratch()) {
163 PrivateSegmentWaveByteOffset = true;
164
165 // HS and GS always have the scratch wave offset in SGPR5 on GFX9.
166 if (ST.getGeneration() >= AMDGPUSubtarget::GFX9 &&
167 (CC == CallingConv::AMDGPU_HS || CC == CallingConv::AMDGPU_GS))
168 ArgInfo.PrivateSegmentWaveByteOffset =
169 ArgDescriptor::createRegister(Reg: AMDGPU::SGPR5);
170 }
171 }
172
173 Attribute A = F.getFnAttribute(Kind: "amdgpu-git-ptr-high");
174 StringRef S = A.getValueAsString();
175 if (!S.empty())
176 S.consumeInteger(Radix: 0, Result&: GITPtrHigh);
177
178 A = F.getFnAttribute(Kind: "amdgpu-32bit-address-high-bits");
179 S = A.getValueAsString();
180 if (!S.empty())
181 S.consumeInteger(Radix: 0, Result&: HighBitsOf32BitAddress);
182
183 MaxMemoryClusterDWords = F.getFnAttributeAsParsedInteger(
184 Kind: "amdgpu-max-memory-cluster-dwords", Default: DefaultMemoryClusterDWordsLimit);
185
186 // On GFX908, in order to guarantee copying between AGPRs, we need a scratch
187 // VGPR available at all times. For now, reserve highest available VGPR. After
188 // RA, shift it to the lowest available unused VGPR if the one exist.
189 if (ST.hasMAIInsts() && !ST.hasGFX90AInsts()) {
190 VGPRForAGPRCopy =
191 AMDGPU::VGPR_32RegClass.getRegister(i: ST.getMaxNumVGPRs(F) - 1);
192 }
193
194 ClusterDims = AMDGPU::ClusterDimsAttr::get(F);
195}
196
197MachineFunctionInfo *SIMachineFunctionInfo::clone(
198 BumpPtrAllocator &Allocator, MachineFunction &DestMF,
199 const DenseMap<MachineBasicBlock *, MachineBasicBlock *> &Src2DstMBB)
200 const {
201 return DestMF.cloneInfo<SIMachineFunctionInfo>(Old: *this);
202}
203
204void SIMachineFunctionInfo::limitOccupancy(const MachineFunction &MF) {
205 limitOccupancy(Limit: getMaxWavesPerEU());
206 const GCNSubtarget& ST = MF.getSubtarget<GCNSubtarget>();
207 limitOccupancy(Limit: ST.getOccupancyWithWorkGroupSizes(MF).second);
208}
209
210Register SIMachineFunctionInfo::addPrivateSegmentBuffer(
211 const SIRegisterInfo &TRI) {
212 ArgInfo.PrivateSegmentBuffer =
213 ArgDescriptor::createRegister(Reg: TRI.getMatchingSuperReg(
214 Reg: getNextUserSGPR(), SubIdx: AMDGPU::sub0, RC: &AMDGPU::SGPR_128RegClass));
215 NumUserSGPRs += 4;
216 return ArgInfo.PrivateSegmentBuffer.getRegister();
217}
218
219Register SIMachineFunctionInfo::addDispatchPtr(const SIRegisterInfo &TRI) {
220 ArgInfo.DispatchPtr = ArgDescriptor::createRegister(Reg: TRI.getMatchingSuperReg(
221 Reg: getNextUserSGPR(), SubIdx: AMDGPU::sub0, RC: &AMDGPU::SReg_64RegClass));
222 NumUserSGPRs += 2;
223 return ArgInfo.DispatchPtr.getRegister();
224}
225
226Register SIMachineFunctionInfo::addQueuePtr(const SIRegisterInfo &TRI) {
227 ArgInfo.QueuePtr = ArgDescriptor::createRegister(Reg: TRI.getMatchingSuperReg(
228 Reg: getNextUserSGPR(), SubIdx: AMDGPU::sub0, RC: &AMDGPU::SReg_64RegClass));
229 NumUserSGPRs += 2;
230 return ArgInfo.QueuePtr.getRegister();
231}
232
233Register SIMachineFunctionInfo::addKernargSegmentPtr(const SIRegisterInfo &TRI) {
234 ArgInfo.KernargSegmentPtr
235 = ArgDescriptor::createRegister(Reg: TRI.getMatchingSuperReg(
236 Reg: getNextUserSGPR(), SubIdx: AMDGPU::sub0, RC: &AMDGPU::SReg_64RegClass));
237 NumUserSGPRs += 2;
238 return ArgInfo.KernargSegmentPtr.getRegister();
239}
240
241Register SIMachineFunctionInfo::addDispatchID(const SIRegisterInfo &TRI) {
242 ArgInfo.DispatchID = ArgDescriptor::createRegister(Reg: TRI.getMatchingSuperReg(
243 Reg: getNextUserSGPR(), SubIdx: AMDGPU::sub0, RC: &AMDGPU::SReg_64RegClass));
244 NumUserSGPRs += 2;
245 return ArgInfo.DispatchID.getRegister();
246}
247
248Register SIMachineFunctionInfo::addFlatScratchInit(const SIRegisterInfo &TRI) {
249 ArgInfo.FlatScratchInit = ArgDescriptor::createRegister(Reg: TRI.getMatchingSuperReg(
250 Reg: getNextUserSGPR(), SubIdx: AMDGPU::sub0, RC: &AMDGPU::SReg_64RegClass));
251 NumUserSGPRs += 2;
252 return ArgInfo.FlatScratchInit.getRegister();
253}
254
255Register SIMachineFunctionInfo::addPrivateSegmentSize(const SIRegisterInfo &TRI) {
256 ArgInfo.PrivateSegmentSize = ArgDescriptor::createRegister(Reg: getNextUserSGPR());
257 NumUserSGPRs += 1;
258 return ArgInfo.PrivateSegmentSize.getRegister();
259}
260
261Register SIMachineFunctionInfo::addImplicitBufferPtr(const SIRegisterInfo &TRI) {
262 ArgInfo.ImplicitBufferPtr = ArgDescriptor::createRegister(Reg: TRI.getMatchingSuperReg(
263 Reg: getNextUserSGPR(), SubIdx: AMDGPU::sub0, RC: &AMDGPU::SReg_64RegClass));
264 NumUserSGPRs += 2;
265 return ArgInfo.ImplicitBufferPtr.getRegister();
266}
267
268Register SIMachineFunctionInfo::addLDSKernelId() {
269 ArgInfo.LDSKernelId = ArgDescriptor::createRegister(Reg: getNextUserSGPR());
270 NumUserSGPRs += 1;
271 return ArgInfo.LDSKernelId.getRegister();
272}
273
274SmallVectorImpl<MCRegister> *SIMachineFunctionInfo::addPreloadedKernArg(
275 const SIRegisterInfo &TRI, const TargetRegisterClass *RC,
276 unsigned AllocSizeDWord, int KernArgIdx, int PaddingSGPRs) {
277 auto [It, Inserted] = ArgInfo.PreloadKernArgs.try_emplace(Key: KernArgIdx);
278 assert(Inserted && "Preload kernel argument allocated twice.");
279 NumUserSGPRs += PaddingSGPRs;
280 // If the available register tuples are aligned with the kernarg to be
281 // preloaded use that register, otherwise we need to use a set of SGPRs and
282 // merge them.
283 if (!ArgInfo.FirstKernArgPreloadReg)
284 ArgInfo.FirstKernArgPreloadReg = getNextUserSGPR();
285 Register PreloadReg =
286 TRI.getMatchingSuperReg(Reg: getNextUserSGPR(), SubIdx: AMDGPU::sub0, RC);
287 auto &Regs = It->second.Regs;
288 if (PreloadReg &&
289 (RC == &AMDGPU::SReg_32RegClass || RC == &AMDGPU::SReg_64RegClass)) {
290 Regs.push_back(Elt: PreloadReg);
291 NumUserSGPRs += AllocSizeDWord;
292 } else {
293 Regs.reserve(N: AllocSizeDWord);
294 for (unsigned I = 0; I < AllocSizeDWord; ++I) {
295 Regs.push_back(Elt: getNextUserSGPR());
296 NumUserSGPRs++;
297 }
298 }
299
300 // Track the actual number of SGPRs that HW will preload to.
301 UserSGPRInfo.allocKernargPreloadSGPRs(NumSGPRs: AllocSizeDWord + PaddingSGPRs);
302 return &Regs;
303}
304
305void SIMachineFunctionInfo::allocateWWMSpill(MachineFunction &MF, Register VGPR,
306 uint64_t Size, Align Alignment) {
307 // Skip if it is an entry function or the register is already added.
308 if (isEntryFunction() || WWMSpills.count(Key: VGPR))
309 return;
310
311 // Skip if this is a function with the amdgpu_cs_chain or
312 // amdgpu_cs_chain_preserve calling convention and this is a scratch register.
313 // We never need to allocate a spill for these because we don't even need to
314 // restore the inactive lanes for them (they're scratchier than the usual
315 // scratch registers). We only need to do this if we have calls to
316 // llvm.amdgcn.cs.chain (otherwise there's no one to save them for, since
317 // chain functions do not return) and the function did not contain a call to
318 // llvm.amdgcn.init.whole.wave (since in that case there are no inactive lanes
319 // when entering the function).
320 if (isChainFunction() &&
321 (SIRegisterInfo::isChainScratchRegister(VGPR) ||
322 !MF.getFrameInfo().hasTailCall() || hasInitWholeWave()))
323 return;
324
325 WWMSpills.insert(KV: std::make_pair(
326 x&: VGPR, y: MF.getFrameInfo().CreateSpillStackObject(Size, Alignment)));
327}
328
329// Separate out the callee-saved and scratch registers.
330void SIMachineFunctionInfo::splitWWMSpillRegisters(
331 MachineFunction &MF,
332 SmallVectorImpl<std::pair<Register, int>> &CalleeSavedRegs,
333 SmallVectorImpl<std::pair<Register, int>> &ScratchRegs) const {
334 const MCPhysReg *CSRegs = MF.getRegInfo().getCalleeSavedRegs();
335 for (auto &Reg : WWMSpills) {
336 if (isCalleeSavedReg(CSRegs, Reg: Reg.first))
337 CalleeSavedRegs.push_back(Elt: Reg);
338 else
339 ScratchRegs.push_back(Elt: Reg);
340 }
341}
342
343bool SIMachineFunctionInfo::isCalleeSavedReg(const MCPhysReg *CSRegs,
344 MCPhysReg Reg) const {
345 for (unsigned I = 0; CSRegs[I]; ++I) {
346 if (CSRegs[I] == Reg)
347 return true;
348 }
349
350 return false;
351}
352
353void SIMachineFunctionInfo::shiftWwmVGPRsToLowestRange(
354 MachineFunction &MF, SmallVectorImpl<Register> &WWMVGPRs,
355 BitVector &SavedVGPRs) {
356 const SIRegisterInfo *TRI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
357 MachineRegisterInfo &MRI = MF.getRegInfo();
358 for (unsigned I = 0, E = WWMVGPRs.size(); I < E; ++I) {
359 Register Reg = WWMVGPRs[I];
360 Register NewReg =
361 TRI->findUnusedRegister(MRI, RC: &AMDGPU::VGPR_32RegClass, MF);
362 if (!NewReg || NewReg >= Reg)
363 break;
364
365 MRI.replaceRegWith(FromReg: Reg, ToReg: NewReg);
366
367 // Update various tables with the new VGPR.
368 WWMVGPRs[I] = NewReg;
369 WWMReservedRegs.remove(X: Reg);
370 WWMReservedRegs.insert(X: NewReg);
371 MRI.reserveReg(PhysReg: NewReg, TRI);
372
373 // Replace the register in SpillPhysVGPRs. This is needed to look for free
374 // lanes while spilling special SGPRs like FP, BP, etc. during PEI.
375 auto *RegItr = llvm::find(Range&: SpillPhysVGPRs, Val: Reg);
376 if (RegItr != SpillPhysVGPRs.end()) {
377 unsigned Idx = std::distance(first: SpillPhysVGPRs.begin(), last: RegItr);
378 SpillPhysVGPRs[Idx] = NewReg;
379
380 // For replacing registers used in the CFI instructions.
381 MF.replaceFrameInstRegister(From: Reg, To: NewReg);
382 }
383
384 // The generic `determineCalleeSaves` might have set the old register if it
385 // is in the CSR range.
386 SavedVGPRs.reset(Idx: Reg);
387
388 for (MachineBasicBlock &MBB : MF) {
389 MBB.removeLiveIn(Reg);
390 MBB.sortUniqueLiveIns();
391 }
392
393 Reg = NewReg;
394 }
395}
396
397bool SIMachineFunctionInfo::allocateVirtualVGPRForSGPRSpills(
398 MachineFunction &MF, int FI, unsigned LaneIndex) {
399 MachineRegisterInfo &MRI = MF.getRegInfo();
400 Register LaneVGPR;
401 if (!LaneIndex) {
402 LaneVGPR = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
403 SpillVGPRs.push_back(Elt: LaneVGPR);
404 } else {
405 LaneVGPR = SpillVGPRs.back();
406 }
407
408 SGPRSpillsToVirtualVGPRLanes[FI].emplace_back(args&: LaneVGPR, args&: LaneIndex);
409 return true;
410}
411
412bool SIMachineFunctionInfo::allocatePhysicalVGPRForSGPRSpills(
413 MachineFunction &MF, int FI, unsigned LaneIndex, bool IsPrologEpilog) {
414 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
415 const SIRegisterInfo *TRI = ST.getRegisterInfo();
416 MachineRegisterInfo &MRI = MF.getRegInfo();
417 Register LaneVGPR;
418 if (!LaneIndex) {
419 // Find the highest available register if called before RA to ensure the
420 // lowest registers are available for allocation. The LaneVGPR, in that
421 // case, will be shifted back to the lowest range after VGPR allocation.
422 LaneVGPR = TRI->findUnusedRegister(MRI, RC: &AMDGPU::VGPR_32RegClass, MF,
423 ReserveHighestVGPR: !IsPrologEpilog);
424 if (LaneVGPR == AMDGPU::NoRegister) {
425 // We have no VGPRs left for spilling SGPRs. Reset because we will not
426 // partially spill the SGPR to VGPRs.
427 SGPRSpillsToPhysicalVGPRLanes.erase(Val: FI);
428 return false;
429 }
430
431 if (IsPrologEpilog)
432 allocateWWMSpill(MF, VGPR: LaneVGPR);
433
434 reserveWWMRegister(Reg: LaneVGPR);
435 for (MachineBasicBlock &MBB : MF) {
436 MBB.addLiveIn(PhysReg: LaneVGPR);
437 MBB.sortUniqueLiveIns();
438 }
439 SpillPhysVGPRs.push_back(Elt: LaneVGPR);
440 } else {
441 LaneVGPR = SpillPhysVGPRs.back();
442 }
443
444 SGPRSpillsToPhysicalVGPRLanes[FI].emplace_back(args&: LaneVGPR, args&: LaneIndex);
445 return true;
446}
447
448bool SIMachineFunctionInfo::allocateSGPRSpillToVGPRLane(
449 MachineFunction &MF, int FI, bool SpillToPhysVGPRLane,
450 bool IsPrologEpilog) {
451 std::vector<SIRegisterInfo::SpilledReg> &SpillLanes =
452 SpillToPhysVGPRLane ? SGPRSpillsToPhysicalVGPRLanes[FI]
453 : SGPRSpillsToVirtualVGPRLanes[FI];
454
455 // This has already been allocated.
456 if (!SpillLanes.empty())
457 return true;
458
459 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
460 MachineFrameInfo &FrameInfo = MF.getFrameInfo();
461 unsigned WaveSize = ST.getWavefrontSize();
462
463 unsigned Size = FrameInfo.getObjectSize(ObjectIdx: FI);
464 unsigned NumLanes = Size / 4;
465
466 if (NumLanes > WaveSize)
467 return false;
468
469 assert(Size >= 4 && "invalid sgpr spill size");
470 assert(ST.getRegisterInfo()->spillSGPRToVGPR() &&
471 "not spilling SGPRs to VGPRs");
472
473 unsigned &NumSpillLanes = SpillToPhysVGPRLane ? NumPhysicalVGPRSpillLanes
474 : NumVirtualVGPRSpillLanes;
475
476 for (unsigned I = 0; I < NumLanes; ++I, ++NumSpillLanes) {
477 unsigned LaneIndex = (NumSpillLanes % WaveSize);
478
479 bool Allocated = SpillToPhysVGPRLane
480 ? allocatePhysicalVGPRForSGPRSpills(MF, FI, LaneIndex,
481 IsPrologEpilog)
482 : allocateVirtualVGPRForSGPRSpills(MF, FI, LaneIndex);
483 if (!Allocated) {
484 NumSpillLanes -= I;
485 return false;
486 }
487 }
488
489 return true;
490}
491
492/// Reserve AGPRs or VGPRs to support spilling for FrameIndex \p FI.
493/// Either AGPR is spilled to VGPR to vice versa.
494/// Returns true if a \p FI can be eliminated completely.
495bool SIMachineFunctionInfo::allocateVGPRSpillToAGPR(MachineFunction &MF,
496 int FI,
497 bool isAGPRtoVGPR) {
498 MachineRegisterInfo &MRI = MF.getRegInfo();
499 MachineFrameInfo &FrameInfo = MF.getFrameInfo();
500 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
501
502 assert(ST.hasMAIInsts() && FrameInfo.isSpillSlotObjectIndex(FI));
503
504 auto &Spill = VGPRToAGPRSpills[FI];
505
506 // This has already been allocated.
507 if (!Spill.Lanes.empty())
508 return Spill.FullyAllocated;
509
510 unsigned Size = FrameInfo.getObjectSize(ObjectIdx: FI);
511 unsigned NumLanes = Size / 4;
512 Spill.Lanes.resize(N: NumLanes, NV: AMDGPU::NoRegister);
513
514 const TargetRegisterClass &RC =
515 isAGPRtoVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::AGPR_32RegClass;
516 auto Regs = RC.getRegisters();
517
518 auto &SpillRegs = isAGPRtoVGPR ? SpillAGPR : SpillVGPR;
519 const SIRegisterInfo *TRI = ST.getRegisterInfo();
520 Spill.FullyAllocated = true;
521
522 // FIXME: Move allocation logic out of MachineFunctionInfo and initialize
523 // once.
524 BitVector OtherUsedRegs;
525 OtherUsedRegs.resize(N: TRI->getNumRegs());
526
527 const uint32_t *CSRMask =
528 TRI->getCallPreservedMask(MF, MF.getFunction().getCallingConv());
529 if (CSRMask)
530 OtherUsedRegs.setBitsInMask(Mask: CSRMask);
531
532 // TODO: Should include register tuples, but doesn't matter with current
533 // usage.
534 for (MCPhysReg Reg : SpillAGPR)
535 OtherUsedRegs.set(Reg);
536 for (MCPhysReg Reg : SpillVGPR)
537 OtherUsedRegs.set(Reg);
538
539 SmallVectorImpl<MCPhysReg>::const_iterator NextSpillReg = Regs.begin();
540 for (int I = NumLanes - 1; I >= 0; --I) {
541 NextSpillReg = std::find_if(
542 first: NextSpillReg, last: Regs.end(), pred: [&MRI, &OtherUsedRegs](MCPhysReg Reg) {
543 return MRI.isAllocatable(PhysReg: Reg) && !MRI.isPhysRegUsed(PhysReg: Reg) &&
544 !OtherUsedRegs[Reg];
545 });
546
547 if (NextSpillReg == Regs.end()) { // Registers exhausted
548 Spill.FullyAllocated = false;
549 break;
550 }
551
552 OtherUsedRegs.set(*NextSpillReg);
553 SpillRegs.push_back(Elt: *NextSpillReg);
554 MRI.reserveReg(PhysReg: *NextSpillReg, TRI);
555 Spill.Lanes[I] = *NextSpillReg++;
556 }
557
558 return Spill.FullyAllocated;
559}
560
561bool SIMachineFunctionInfo::removeDeadFrameIndices(
562 MachineFrameInfo &MFI, bool ResetSGPRSpillStackIDs) {
563 // Remove dead frame indices from function frame, however keep FP & BP since
564 // spills for them haven't been inserted yet. And also make sure to remove the
565 // frame indices from `SGPRSpillsToVirtualVGPRLanes` data structure,
566 // otherwise, it could result in an unexpected side effect and bug, in case of
567 // any re-mapping of freed frame indices by later pass(es) like "stack slot
568 // coloring".
569 for (auto &R : SGPRSpillsToVirtualVGPRLanes)
570 MFI.RemoveStackObject(ObjectIdx: R.first);
571 SGPRSpillsToVirtualVGPRLanes.clear();
572
573 // Remove the dead frame indices of CSR SGPRs which are spilled to physical
574 // VGPR lanes during SILowerSGPRSpills pass.
575 if (!ResetSGPRSpillStackIDs) {
576 for (auto &R : SGPRSpillsToPhysicalVGPRLanes)
577 MFI.RemoveStackObject(ObjectIdx: R.first);
578 SGPRSpillsToPhysicalVGPRLanes.clear();
579 }
580 bool HaveSGPRToMemory = false;
581
582 if (ResetSGPRSpillStackIDs) {
583 // All other SGPRs must be allocated on the default stack, so reset the
584 // stack ID.
585 for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd(); I != E;
586 ++I) {
587 if (!checkIndexInPrologEpilogSGPRSpills(FI: I)) {
588 if (MFI.getStackID(ObjectIdx: I) == TargetStackID::SGPRSpill) {
589 MFI.setStackID(ObjectIdx: I, ID: TargetStackID::Default);
590 HaveSGPRToMemory = true;
591 }
592 }
593 }
594 }
595
596 for (auto &R : VGPRToAGPRSpills) {
597 if (R.second.IsDead)
598 MFI.RemoveStackObject(ObjectIdx: R.first);
599 }
600
601 return HaveSGPRToMemory;
602}
603
604int SIMachineFunctionInfo::getScavengeFI(MachineFrameInfo &MFI,
605 const SIRegisterInfo &TRI) {
606 if (ScavengeFI)
607 return *ScavengeFI;
608
609 ScavengeFI =
610 MFI.CreateStackObject(Size: TRI.getSpillSize(RC: AMDGPU::SGPR_32RegClass),
611 Alignment: TRI.getSpillAlign(RC: AMDGPU::SGPR_32RegClass), isSpillSlot: false);
612 return *ScavengeFI;
613}
614
615MCPhysReg SIMachineFunctionInfo::getNextUserSGPR() const {
616 assert(NumSystemSGPRs == 0 && "System SGPRs must be added after user SGPRs");
617 return AMDGPU::SGPR0 + NumUserSGPRs;
618}
619
620MCPhysReg SIMachineFunctionInfo::getNextSystemSGPR() const {
621 return AMDGPU::SGPR0 + NumUserSGPRs + NumSystemSGPRs;
622}
623
624void SIMachineFunctionInfo::MRI_NoteNewVirtualRegister(Register Reg) {
625 VRegFlags.grow(N: Reg);
626}
627
628void SIMachineFunctionInfo::MRI_NoteCloneVirtualRegister(Register NewReg,
629 Register SrcReg) {
630 VRegFlags.grow(N: NewReg);
631 VRegFlags[NewReg] = VRegFlags[SrcReg];
632}
633
634Register
635SIMachineFunctionInfo::getGITPtrLoReg(const MachineFunction &MF) const {
636 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
637 if (!ST.isAmdPalOS())
638 return Register();
639 Register GitPtrLo = AMDGPU::SGPR0; // Low GIT address passed in
640 if (ST.hasMergedShaders()) {
641 switch (MF.getFunction().getCallingConv()) {
642 case CallingConv::AMDGPU_HS:
643 case CallingConv::AMDGPU_GS:
644 // Low GIT address is passed in s8 rather than s0 for an LS+HS or
645 // ES+GS merged shader on gfx9+.
646 GitPtrLo = AMDGPU::SGPR8;
647 return GitPtrLo;
648 default:
649 return GitPtrLo;
650 }
651 }
652 return GitPtrLo;
653}
654
655static yaml::StringValue regToString(Register Reg,
656 const TargetRegisterInfo &TRI) {
657 yaml::StringValue Dest;
658 {
659 raw_string_ostream OS(Dest.Value);
660 OS << printReg(Reg, TRI: &TRI);
661 }
662 return Dest;
663}
664
665static std::optional<yaml::SIArgumentInfo>
666convertArgumentInfo(const AMDGPUFunctionArgInfo &ArgInfo,
667 const TargetRegisterInfo &TRI) {
668 yaml::SIArgumentInfo AI;
669
670 auto convertArg = [&](std::optional<yaml::SIArgument> &A,
671 const ArgDescriptor &Arg) {
672 if (!Arg)
673 return false;
674
675 // Create a register or stack argument.
676 yaml::SIArgument SA = yaml::SIArgument::createArgument(IsReg: Arg.isRegister());
677 if (Arg.isRegister()) {
678 raw_string_ostream OS(SA.RegisterName.Value);
679 OS << printReg(Reg: Arg.getRegister(), TRI: &TRI);
680 } else
681 SA.StackOffset = Arg.getStackOffset();
682 // Check and update the optional mask.
683 if (Arg.isMasked())
684 SA.Mask = Arg.getMask();
685
686 A = std::move(SA);
687 return true;
688 };
689
690 bool Any = false;
691 Any |= convertArg(AI.PrivateSegmentBuffer, ArgInfo.PrivateSegmentBuffer);
692 Any |= convertArg(AI.DispatchPtr, ArgInfo.DispatchPtr);
693 Any |= convertArg(AI.QueuePtr, ArgInfo.QueuePtr);
694 Any |= convertArg(AI.KernargSegmentPtr, ArgInfo.KernargSegmentPtr);
695 Any |= convertArg(AI.DispatchID, ArgInfo.DispatchID);
696 Any |= convertArg(AI.FlatScratchInit, ArgInfo.FlatScratchInit);
697 Any |= convertArg(AI.LDSKernelId, ArgInfo.LDSKernelId);
698 Any |= convertArg(AI.PrivateSegmentSize, ArgInfo.PrivateSegmentSize);
699 Any |= convertArg(AI.WorkGroupIDX, ArgInfo.WorkGroupIDX);
700 Any |= convertArg(AI.WorkGroupIDY, ArgInfo.WorkGroupIDY);
701 Any |= convertArg(AI.WorkGroupIDZ, ArgInfo.WorkGroupIDZ);
702 Any |= convertArg(AI.WorkGroupInfo, ArgInfo.WorkGroupInfo);
703 Any |= convertArg(AI.PrivateSegmentWaveByteOffset,
704 ArgInfo.PrivateSegmentWaveByteOffset);
705 Any |= convertArg(AI.ImplicitArgPtr, ArgInfo.ImplicitArgPtr);
706 Any |= convertArg(AI.ImplicitBufferPtr, ArgInfo.ImplicitBufferPtr);
707 Any |= convertArg(AI.WorkItemIDX, ArgInfo.WorkItemIDX);
708 Any |= convertArg(AI.WorkItemIDY, ArgInfo.WorkItemIDY);
709 Any |= convertArg(AI.WorkItemIDZ, ArgInfo.WorkItemIDZ);
710
711 // Write FirstKernArgPreloadReg separately, since it's a Register,
712 // not ArgDescriptor.
713 if (ArgInfo.FirstKernArgPreloadReg) {
714 Register Reg = ArgInfo.FirstKernArgPreloadReg;
715 assert(Reg.isPhysical() &&
716 "FirstKernArgPreloadReg must be a physical register");
717
718 yaml::SIArgument SA = yaml::SIArgument::createArgument(IsReg: true);
719 raw_string_ostream OS(SA.RegisterName.Value);
720 OS << printReg(Reg, TRI: &TRI);
721
722 AI.FirstKernArgPreloadReg = SA;
723 Any = true;
724 }
725
726 if (Any)
727 return AI;
728
729 return std::nullopt;
730}
731
732yaml::SIMachineFunctionInfo::SIMachineFunctionInfo(
733 const llvm::SIMachineFunctionInfo &MFI, const TargetRegisterInfo &TRI,
734 const llvm::MachineFunction &MF)
735 : ExplicitKernArgSize(MFI.getExplicitKernArgSize()),
736 MaxKernArgAlign(MFI.getMaxKernArgAlign()), LDSSize(MFI.getLDSSize()),
737 GDSSize(MFI.getGDSSize()), DynLDSAlign(MFI.getDynLDSAlign()),
738 IsEntryFunction(MFI.isEntryFunction()), MemoryBound(MFI.isMemoryBound()),
739 WaveLimiter(MFI.needsWaveLimiter()),
740 HasSpilledSGPRs(MFI.hasSpilledSGPRs()),
741 HasSpilledVGPRs(MFI.hasSpilledVGPRs()),
742 NumWaveDispatchSGPRs(MFI.getNumWaveDispatchSGPRs()),
743 NumWaveDispatchVGPRs(MFI.getNumWaveDispatchVGPRs()),
744 HighBitsOf32BitAddress(MFI.get32BitAddressHighBits()),
745 Occupancy(MFI.getOccupancy()),
746 ScratchRSrcReg(regToString(Reg: MFI.getScratchRSrcReg(), TRI)),
747 FrameOffsetReg(regToString(Reg: MFI.getFrameOffsetReg(), TRI)),
748 StackPtrOffsetReg(regToString(Reg: MFI.getStackPtrOffsetReg(), TRI)),
749 BytesInStackArgArea(MFI.getBytesInStackArgArea()),
750 ReturnsVoid(MFI.returnsVoid()),
751 ArgInfo(convertArgumentInfo(ArgInfo: MFI.getArgInfo(), TRI)),
752 PSInputAddr(MFI.getPSInputAddr()), PSInputEnable(MFI.getPSInputEnable()),
753 MaxMemoryClusterDWords(MFI.getMaxMemoryClusterDWords()),
754 Mode(MFI.getMode()), HasInitWholeWave(MFI.hasInitWholeWave()),
755 IsWholeWaveFunction(MFI.isWholeWaveFunction()),
756 DynamicVGPRBlockSize(MFI.getDynamicVGPRBlockSize()),
757 ScratchReservedForDynamicVGPRs(MFI.getScratchReservedForDynamicVGPRs()),
758 NumKernargPreloadSGPRs(MFI.getNumKernargPreloadedSGPRs()),
759 MinNumAGPRs(MFI.getMinNumAGPRs()) {
760 for (Register Reg : MFI.getSGPRSpillPhysVGPRs())
761 SpillPhysVGPRS.push_back(Elt: regToString(Reg, TRI));
762
763 for (Register Reg : MFI.getWWMReservedRegs())
764 WWMReservedRegs.push_back(Elt: regToString(Reg, TRI));
765
766 if (MFI.getLongBranchReservedReg())
767 LongBranchReservedReg = regToString(Reg: MFI.getLongBranchReservedReg(), TRI);
768 if (MFI.getVGPRForAGPRCopy())
769 VGPRForAGPRCopy = regToString(Reg: MFI.getVGPRForAGPRCopy(), TRI);
770
771 if (MFI.getSGPRForEXECCopy())
772 SGPRForEXECCopy = regToString(Reg: MFI.getSGPRForEXECCopy(), TRI);
773
774 auto SFI = MFI.getOptionalScavengeFI();
775 if (SFI)
776 ScavengeFI = yaml::FrameIndex(*SFI, MF.getFrameInfo());
777}
778
779void yaml::SIMachineFunctionInfo::mappingImpl(yaml::IO &YamlIO) {
780 MappingTraits<SIMachineFunctionInfo>::mapping(YamlIO, MFI&: *this);
781}
782
783bool SIMachineFunctionInfo::initializeBaseYamlFields(
784 const yaml::SIMachineFunctionInfo &YamlMFI, const MachineFunction &MF,
785 PerFunctionMIParsingState &PFS, SMDiagnostic &Error, SMRange &SourceRange) {
786 ExplicitKernArgSize = YamlMFI.ExplicitKernArgSize;
787 MaxKernArgAlign = YamlMFI.MaxKernArgAlign;
788 LDSSize = YamlMFI.LDSSize;
789 GDSSize = YamlMFI.GDSSize;
790 DynLDSAlign = YamlMFI.DynLDSAlign;
791 PSInputAddr = YamlMFI.PSInputAddr;
792 PSInputEnable = YamlMFI.PSInputEnable;
793 MaxMemoryClusterDWords = YamlMFI.MaxMemoryClusterDWords;
794 HighBitsOf32BitAddress = YamlMFI.HighBitsOf32BitAddress;
795 Occupancy = YamlMFI.Occupancy;
796 IsEntryFunction = YamlMFI.IsEntryFunction;
797 MemoryBound = YamlMFI.MemoryBound;
798 WaveLimiter = YamlMFI.WaveLimiter;
799 HasSpilledSGPRs = YamlMFI.HasSpilledSGPRs;
800 HasSpilledVGPRs = YamlMFI.HasSpilledVGPRs;
801 NumWaveDispatchSGPRs = YamlMFI.NumWaveDispatchSGPRs;
802 NumWaveDispatchVGPRs = YamlMFI.NumWaveDispatchVGPRs;
803 BytesInStackArgArea = YamlMFI.BytesInStackArgArea;
804 ReturnsVoid = YamlMFI.ReturnsVoid;
805 IsWholeWaveFunction = YamlMFI.IsWholeWaveFunction;
806 MinNumAGPRs = YamlMFI.MinNumAGPRs;
807 // This can also be set by the function attribute, MFI has higher precedence
808 // though.
809 if (YamlMFI.DynamicVGPRBlockSize != std::nullopt)
810 DynamicVGPRBlockSize = *YamlMFI.DynamicVGPRBlockSize;
811
812 UserSGPRInfo.allocKernargPreloadSGPRs(NumSGPRs: YamlMFI.NumKernargPreloadSGPRs);
813
814 if (YamlMFI.ScavengeFI) {
815 auto FIOrErr = YamlMFI.ScavengeFI->getFI(MFI: MF.getFrameInfo());
816 if (!FIOrErr) {
817 // Create a diagnostic for a the frame index.
818 const MemoryBuffer &Buffer =
819 *PFS.SM->getMemoryBuffer(i: PFS.SM->getMainFileID());
820
821 Error = SMDiagnostic(*PFS.SM, SMLoc(), Buffer.getBufferIdentifier(), 1, 1,
822 SourceMgr::DK_Error, toString(E: FIOrErr.takeError()),
823 "", {}, {});
824 SourceRange = YamlMFI.ScavengeFI->SourceRange;
825 return true;
826 }
827 ScavengeFI = *FIOrErr;
828 } else {
829 ScavengeFI = std::nullopt;
830 }
831 return false;
832}
833
834bool SIMachineFunctionInfo::mayUseAGPRs(const Function &F) const {
835 auto [MinNumAGPR, MaxNumAGPR] =
836 AMDGPU::getIntegerPairAttribute(F, Name: "amdgpu-agpr-alloc", Default: {~0u, ~0u},
837 /*OnlyFirstRequired=*/true);
838 return MinNumAGPR != 0u;
839}
840