1//===- SIMachineFunctionInfo.cpp - SI Machine Function Info ---------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#include "SIMachineFunctionInfo.h"
10#include "AMDGPUSubtarget.h"
11#include "GCNSubtarget.h"
12#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
13#include "SIRegisterInfo.h"
14#include "Utils/AMDGPUBaseInfo.h"
15#include "llvm/CodeGen/LiveIntervals.h"
16#include "llvm/CodeGen/MIRParser/MIParser.h"
17#include "llvm/CodeGen/MachineBasicBlock.h"
18#include "llvm/CodeGen/MachineFrameInfo.h"
19#include "llvm/CodeGen/MachineFunction.h"
20#include "llvm/CodeGen/MachineRegisterInfo.h"
21#include "llvm/IR/CallingConv.h"
22#include "llvm/IR/DiagnosticInfo.h"
23#include "llvm/IR/Function.h"
24#include <cassert>
25#include <optional>
26#include <vector>
27
28enum { MAX_LANES = 64 };
29
30using namespace llvm;
31
32// TODO -- delete this flag once we have more robust mechanisms to allocate the
33// optimal RC for Opc and Dest of MFMA. In particular, there are high RP cases
34// where it is better to produce the VGPR form (e.g. if there are VGPR users
35// of the MFMA result).
36static cl::opt<bool, true> MFMAVGPRFormOpt(
37 "amdgpu-mfma-vgpr-form",
38 cl::desc("Whether to force use VGPR for Opc and Dest of MFMA. If "
39 "unspecified, default to compiler heuristics"),
40 cl::location(L&: SIMachineFunctionInfo::MFMAVGPRForm), cl::init(Val: true),
41 cl::Hidden);
42
43const GCNTargetMachine &getTM(const GCNSubtarget *STI) {
44 const SITargetLowering *TLI = STI->getTargetLowering();
45 return static_cast<const GCNTargetMachine &>(TLI->getTargetMachine());
46}
47
48bool SIMachineFunctionInfo::MFMAVGPRForm = false;
49
50SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
51 const GCNSubtarget *STI)
52 : AMDGPUMachineFunction(F, *STI), Mode(F, *STI), GWSResourcePSV(getTM(STI)),
53 UserSGPRInfo(F, *STI), WorkGroupIDX(false), WorkGroupIDY(false),
54 WorkGroupIDZ(false), WorkGroupInfo(false), LDSKernelId(false),
55 PrivateSegmentWaveByteOffset(false), WorkItemIDX(false),
56 WorkItemIDY(false), WorkItemIDZ(false), ImplicitArgPtr(false),
57 GITPtrHigh(0xffffffff), HighBitsOf32BitAddress(0),
58 IsWholeWaveFunction(F.getCallingConv() ==
59 CallingConv::AMDGPU_Gfx_WholeWave) {
60 const GCNSubtarget &ST = *STI;
61 FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(F);
62 WavesPerEU = ST.getWavesPerEU(F);
63 MaxNumWorkGroups = ST.getMaxNumWorkGroups(F);
64 assert(MaxNumWorkGroups.size() == 3);
65
66 // Temporarily check both the attribute and the subtarget feature, until the
67 // latter is completely removed.
68 DynamicVGPRBlockSize = AMDGPU::getDynamicVGPRBlockSize(F);
69 if (DynamicVGPRBlockSize == 0 && ST.isDynamicVGPREnabled())
70 DynamicVGPRBlockSize = ST.getDynamicVGPRBlockSize();
71
72 Occupancy = ST.computeOccupancy(F, LDSSize: getLDSSize()).second;
73 CallingConv::ID CC = F.getCallingConv();
74
75 VRegFlags.reserve(S: 1024);
76
77 const bool IsKernel = CC == CallingConv::AMDGPU_KERNEL ||
78 CC == CallingConv::SPIR_KERNEL;
79
80 if (IsKernel) {
81 WorkGroupIDX = true;
82 WorkItemIDX = true;
83 } else if (CC == CallingConv::AMDGPU_PS) {
84 PSInputAddr = AMDGPU::getInitialPSInputAddr(F);
85 }
86
87 if (ST.hasGFX90AInsts()) {
88 // FIXME: Extract logic out of getMaxNumVectorRegs; we need to apply the
89 // allocation granule and clamping.
90 auto [MinNumAGPRAttr, MaxNumAGPRAttr] =
91 AMDGPU::getIntegerPairAttribute(F, Name: "amdgpu-agpr-alloc", Default: {~0u, ~0u},
92 /*OnlyFirstRequired=*/true);
93 MinNumAGPRs = MinNumAGPRAttr;
94 }
95
96 if (!isEntryFunction()) {
97 if (CC != CallingConv::AMDGPU_Gfx &&
98 CC != CallingConv::AMDGPU_Gfx_WholeWave)
99 ArgInfo = AMDGPUFunctionArgInfo::FixedABIFunctionInfo;
100
101 FrameOffsetReg = AMDGPU::SGPR33;
102 StackPtrOffsetReg = AMDGPU::SGPR32;
103
104 if (!ST.hasFlatScratchEnabled()) {
105 // Non-entry functions have no special inputs for now, other registers
106 // required for scratch access.
107 ScratchRSrcReg = AMDGPU::isChainCC(CC)
108 ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
109 : ScratchRSrcReg = AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3;
110
111 ArgInfo.PrivateSegmentBuffer =
112 ArgDescriptor::createRegister(Reg: ScratchRSrcReg);
113 }
114
115 if (!F.hasFnAttribute(Kind: "amdgpu-no-implicitarg-ptr") &&
116 !AMDGPU::isChainCC(CC))
117 ImplicitArgPtr = true;
118 } else {
119 ImplicitArgPtr = false;
120 MaxKernArgAlign =
121 std::max(a: ST.getAlignmentForImplicitArgPtr(), b: MaxKernArgAlign);
122 }
123
124 if (!AMDGPU::isGraphics(CC) ||
125 ((CC == CallingConv::AMDGPU_CS || CC == CallingConv::AMDGPU_Gfx) &&
126 ST.hasArchitectedSGPRs())) {
127 if (IsKernel || !F.hasFnAttribute(Kind: "amdgpu-no-workgroup-id-x") ||
128 !F.hasFnAttribute(Kind: "amdgpu-no-cluster-id-x"))
129 WorkGroupIDX = true;
130
131 if (!F.hasFnAttribute(Kind: "amdgpu-no-workgroup-id-y") ||
132 !F.hasFnAttribute(Kind: "amdgpu-no-cluster-id-y"))
133 WorkGroupIDY = true;
134
135 if (!F.hasFnAttribute(Kind: "amdgpu-no-workgroup-id-z") ||
136 !F.hasFnAttribute(Kind: "amdgpu-no-cluster-id-z"))
137 WorkGroupIDZ = true;
138 }
139
140 if (!AMDGPU::isGraphics(CC)) {
141 if (IsKernel || !F.hasFnAttribute(Kind: "amdgpu-no-workitem-id-x"))
142 WorkItemIDX = true;
143
144 if (!F.hasFnAttribute(Kind: "amdgpu-no-workitem-id-y") &&
145 ST.getMaxWorkitemID(Kernel: F, Dimension: 1) != 0)
146 WorkItemIDY = true;
147
148 if (!F.hasFnAttribute(Kind: "amdgpu-no-workitem-id-z") &&
149 ST.getMaxWorkitemID(Kernel: F, Dimension: 2) != 0)
150 WorkItemIDZ = true;
151
152 if (!IsKernel && !F.hasFnAttribute(Kind: "amdgpu-no-lds-kernel-id"))
153 LDSKernelId = true;
154 }
155
156 if (isEntryFunction()) {
157 // X, XY, and XYZ are the only supported combinations, so make sure Y is
158 // enabled if Z is.
159 if (WorkItemIDZ)
160 WorkItemIDY = true;
161
162 if (!ST.hasArchitectedFlatScratch()) {
163 PrivateSegmentWaveByteOffset = true;
164
165 // HS and GS always have the scratch wave offset in SGPR5 on GFX9.
166 if (ST.getGeneration() >= AMDGPUSubtarget::GFX9 &&
167 (CC == CallingConv::AMDGPU_HS || CC == CallingConv::AMDGPU_GS))
168 ArgInfo.PrivateSegmentWaveByteOffset =
169 ArgDescriptor::createRegister(Reg: AMDGPU::SGPR5);
170 }
171 }
172
173 Attribute A = F.getFnAttribute(Kind: "amdgpu-git-ptr-high");
174 StringRef S = A.getValueAsString();
175 if (!S.empty())
176 S.consumeInteger(Radix: 0, Result&: GITPtrHigh);
177
178 A = F.getFnAttribute(Kind: "amdgpu-32bit-address-high-bits");
179 S = A.getValueAsString();
180 if (!S.empty())
181 S.consumeInteger(Radix: 0, Result&: HighBitsOf32BitAddress);
182
183 MaxMemoryClusterDWords = F.getFnAttributeAsParsedInteger(
184 Kind: "amdgpu-max-memory-cluster-dwords", Default: DefaultMemoryClusterDWordsLimit);
185
186 // On GFX908, in order to guarantee copying between AGPRs, we need a scratch
187 // VGPR available at all times. For now, reserve highest available VGPR. After
188 // RA, shift it to the lowest available unused VGPR if the one exist.
189 if (ST.hasMAIInsts() && !ST.hasGFX90AInsts()) {
190 VGPRForAGPRCopy =
191 AMDGPU::VGPR_32RegClass.getRegister(i: ST.getMaxNumVGPRs(F) - 1);
192 }
193
194 ClusterDims = AMDGPU::ClusterDimsAttr::get(F);
195}
196
197MachineFunctionInfo *SIMachineFunctionInfo::clone(
198 BumpPtrAllocator &Allocator, MachineFunction &DestMF,
199 const DenseMap<MachineBasicBlock *, MachineBasicBlock *> &Src2DstMBB)
200 const {
201 return DestMF.cloneInfo<SIMachineFunctionInfo>(Old: *this);
202}
203
204void SIMachineFunctionInfo::limitOccupancy(const MachineFunction &MF) {
205 limitOccupancy(Limit: getMaxWavesPerEU());
206 const GCNSubtarget& ST = MF.getSubtarget<GCNSubtarget>();
207 limitOccupancy(Limit: ST.getOccupancyWithWorkGroupSizes(MF).second);
208}
209
210Register SIMachineFunctionInfo::addPrivateSegmentBuffer(
211 const SIRegisterInfo &TRI) {
212 ArgInfo.PrivateSegmentBuffer =
213 ArgDescriptor::createRegister(Reg: TRI.getMatchingSuperReg(
214 Reg: getNextUserSGPR(), SubIdx: AMDGPU::sub0, RC: &AMDGPU::SGPR_128RegClass));
215 NumUserSGPRs += 4;
216 return ArgInfo.PrivateSegmentBuffer.getRegister();
217}
218
219Register SIMachineFunctionInfo::addDispatchPtr(const SIRegisterInfo &TRI) {
220 ArgInfo.DispatchPtr = ArgDescriptor::createRegister(Reg: TRI.getMatchingSuperReg(
221 Reg: getNextUserSGPR(), SubIdx: AMDGPU::sub0, RC: &AMDGPU::SReg_64RegClass));
222 NumUserSGPRs += 2;
223 return ArgInfo.DispatchPtr.getRegister();
224}
225
226Register SIMachineFunctionInfo::addQueuePtr(const SIRegisterInfo &TRI) {
227 ArgInfo.QueuePtr = ArgDescriptor::createRegister(Reg: TRI.getMatchingSuperReg(
228 Reg: getNextUserSGPR(), SubIdx: AMDGPU::sub0, RC: &AMDGPU::SReg_64RegClass));
229 NumUserSGPRs += 2;
230 return ArgInfo.QueuePtr.getRegister();
231}
232
233Register SIMachineFunctionInfo::addKernargSegmentPtr(const SIRegisterInfo &TRI) {
234 ArgInfo.KernargSegmentPtr
235 = ArgDescriptor::createRegister(Reg: TRI.getMatchingSuperReg(
236 Reg: getNextUserSGPR(), SubIdx: AMDGPU::sub0, RC: &AMDGPU::SReg_64RegClass));
237 NumUserSGPRs += 2;
238 return ArgInfo.KernargSegmentPtr.getRegister();
239}
240
241Register SIMachineFunctionInfo::addDispatchID(const SIRegisterInfo &TRI) {
242 ArgInfo.DispatchID = ArgDescriptor::createRegister(Reg: TRI.getMatchingSuperReg(
243 Reg: getNextUserSGPR(), SubIdx: AMDGPU::sub0, RC: &AMDGPU::SReg_64RegClass));
244 NumUserSGPRs += 2;
245 return ArgInfo.DispatchID.getRegister();
246}
247
248Register SIMachineFunctionInfo::addFlatScratchInit(const SIRegisterInfo &TRI) {
249 ArgInfo.FlatScratchInit = ArgDescriptor::createRegister(Reg: TRI.getMatchingSuperReg(
250 Reg: getNextUserSGPR(), SubIdx: AMDGPU::sub0, RC: &AMDGPU::SReg_64RegClass));
251 NumUserSGPRs += 2;
252 return ArgInfo.FlatScratchInit.getRegister();
253}
254
255Register SIMachineFunctionInfo::addPrivateSegmentSize(const SIRegisterInfo &TRI) {
256 ArgInfo.PrivateSegmentSize = ArgDescriptor::createRegister(Reg: getNextUserSGPR());
257 NumUserSGPRs += 1;
258 return ArgInfo.PrivateSegmentSize.getRegister();
259}
260
261Register SIMachineFunctionInfo::addImplicitBufferPtr(const SIRegisterInfo &TRI) {
262 ArgInfo.ImplicitBufferPtr = ArgDescriptor::createRegister(Reg: TRI.getMatchingSuperReg(
263 Reg: getNextUserSGPR(), SubIdx: AMDGPU::sub0, RC: &AMDGPU::SReg_64RegClass));
264 NumUserSGPRs += 2;
265 return ArgInfo.ImplicitBufferPtr.getRegister();
266}
267
268Register SIMachineFunctionInfo::addLDSKernelId() {
269 ArgInfo.LDSKernelId = ArgDescriptor::createRegister(Reg: getNextUserSGPR());
270 NumUserSGPRs += 1;
271 return ArgInfo.LDSKernelId.getRegister();
272}
273
/// Reserve user SGPRs for preloading the kernel argument at \p KernArgIdx.
///
/// \p AllocSizeDWord SGPRs (after \p PaddingSGPRs of alignment padding) are
/// claimed starting at the next free user SGPR. If a register tuple of class
/// \p RC lines up with the current SGPR position, the whole tuple is recorded;
/// otherwise one SGPR per dword is recorded and must be merged later.
/// \returns the list of registers chosen for this argument.
SmallVectorImpl<MCRegister> *SIMachineFunctionInfo::addPreloadedKernArg(
    const SIRegisterInfo &TRI, const TargetRegisterClass *RC,
    unsigned AllocSizeDWord, int KernArgIdx, int PaddingSGPRs) {
  auto [It, Inserted] = ArgInfo.PreloadKernArgs.try_emplace(Key: KernArgIdx);
  assert(Inserted && "Preload kernel argument allocated twice.");
  NumUserSGPRs += PaddingSGPRs;
  // If the available register tuples are aligned with the kernarg to be
  // preloaded use that register, otherwise we need to use a set of SGPRs and
  // merge them.
  if (!ArgInfo.FirstKernArgPreloadReg)
    ArgInfo.FirstKernArgPreloadReg = getNextUserSGPR();
  Register PreloadReg =
      TRI.getMatchingSuperReg(Reg: getNextUserSGPR(), SubIdx: AMDGPU::sub0, RC);
  auto &Regs = It->second.Regs;
  if (PreloadReg &&
      (RC == &AMDGPU::SReg_32RegClass || RC == &AMDGPU::SReg_64RegClass)) {
    // Aligned 32/64-bit case: record the tuple itself.
    Regs.push_back(Elt: PreloadReg);
    NumUserSGPRs += AllocSizeDWord;
  } else {
    // Fallback: record the individual SGPRs, one per dword.
    Regs.reserve(N: AllocSizeDWord);
    for (unsigned I = 0; I < AllocSizeDWord; ++I) {
      Regs.push_back(Elt: getNextUserSGPR());
      NumUserSGPRs++;
    }
  }

  // Track the actual number of SGPRs that HW will preload to.
  UserSGPRInfo.allocKernargPreloadSGPRs(NumSGPRs: AllocSizeDWord + PaddingSGPRs);
  return &Regs;
}
304
/// Create (once per register) a spill stack slot of \p Size bytes for the WWM
/// register \p VGPR so its inactive lanes can be preserved. No-op for entry
/// functions and for chain functions that never need inactive lanes restored.
void SIMachineFunctionInfo::allocateWWMSpill(MachineFunction &MF, Register VGPR,
                                             uint64_t Size, Align Alignment) {
  // Skip if it is an entry function or the register is already added.
  if (isEntryFunction() || WWMSpills.count(Key: VGPR))
    return;

  // Skip if this is a function with the amdgpu_cs_chain or
  // amdgpu_cs_chain_preserve calling convention and this is a scratch register.
  // We never need to allocate a spill for these because we don't even need to
  // restore the inactive lanes for them (they're scratchier than the usual
  // scratch registers). We only need to do this if we have calls to
  // llvm.amdgcn.cs.chain (otherwise there's no one to save them for, since
  // chain functions do not return) and the function did not contain a call to
  // llvm.amdgcn.init.whole.wave (since in that case there are no inactive lanes
  // when entering the function).
  if (isChainFunction() &&
      (SIRegisterInfo::isChainScratchRegister(VGPR) ||
       !MF.getFrameInfo().hasTailCall() || hasInitWholeWave()))
    return;

  WWMSpills.insert(KV: std::make_pair(
      x&: VGPR, y: MF.getFrameInfo().CreateSpillStackObject(Size, Alignment)));
}
328
329// Separate out the callee-saved and scratch registers.
330void SIMachineFunctionInfo::splitWWMSpillRegisters(
331 MachineFunction &MF,
332 SmallVectorImpl<std::pair<Register, int>> &CalleeSavedRegs,
333 SmallVectorImpl<std::pair<Register, int>> &ScratchRegs) const {
334 const MCPhysReg *CSRegs = MF.getRegInfo().getCalleeSavedRegs();
335 for (auto &Reg : WWMSpills) {
336 if (isCalleeSavedReg(CSRegs, Reg: Reg.first))
337 CalleeSavedRegs.push_back(Elt: Reg);
338 else
339 ScratchRegs.push_back(Elt: Reg);
340 }
341}
342
343bool SIMachineFunctionInfo::isCalleeSavedReg(const MCPhysReg *CSRegs,
344 MCPhysReg Reg) const {
345 for (unsigned I = 0; CSRegs[I]; ++I) {
346 if (CSRegs[I] == Reg)
347 return true;
348 }
349
350 return false;
351}
352
/// After register allocation, try to move each WWM VGPR in \p WWMVGPRs down to
/// the lowest-numbered unused VGPR, updating all bookkeeping (reserved set,
/// spill tables, saved-register bits, block live-ins) to match. Stops as soon
/// as no strictly lower register is available.
void SIMachineFunctionInfo::shiftWwmVGPRsToLowestRange(
    MachineFunction &MF, SmallVectorImpl<Register> &WWMVGPRs,
    BitVector &SavedVGPRs) {
  const SIRegisterInfo *TRI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  for (unsigned I = 0, E = WWMVGPRs.size(); I < E; ++I) {
    Register Reg = WWMVGPRs[I];
    Register NewReg =
        TRI->findUnusedRegister(MRI, RC: &AMDGPU::VGPR_32RegClass, MF);
    // Stop once no unused register lower than the current one is available.
    if (!NewReg || NewReg >= Reg)
      break;

    MRI.replaceRegWith(FromReg: Reg, ToReg: NewReg);

    // Update various tables with the new VGPR.
    WWMVGPRs[I] = NewReg;
    WWMReservedRegs.remove(X: Reg);
    WWMReservedRegs.insert(X: NewReg);
    MRI.reserveReg(PhysReg: NewReg, TRI);

    // Replace the register in SpillPhysVGPRs. This is needed to look for free
    // lanes while spilling special SGPRs like FP, BP, etc. during PEI.
    auto *RegItr = llvm::find(Range&: SpillPhysVGPRs, Val: Reg);
    if (RegItr != SpillPhysVGPRs.end()) {
      unsigned Idx = std::distance(first: SpillPhysVGPRs.begin(), last: RegItr);
      SpillPhysVGPRs[Idx] = NewReg;
    }

    // The generic `determineCalleeSaves` might have set the old register if it
    // is in the CSR range.
    SavedVGPRs.reset(Idx: Reg);

    // The old register is no longer live into any block; re-canonicalize the
    // live-in lists after removing it.
    for (MachineBasicBlock &MBB : MF) {
      MBB.removeLiveIn(Reg);
      MBB.sortUniqueLiveIns();
    }

    // NOTE(review): dead store -- Reg is a local copy and is not read again
    // before the next iteration reloads WWMVGPRs[I].
    Reg = NewReg;
  }
}
393
394bool SIMachineFunctionInfo::allocateVirtualVGPRForSGPRSpills(
395 MachineFunction &MF, int FI, unsigned LaneIndex) {
396 MachineRegisterInfo &MRI = MF.getRegInfo();
397 Register LaneVGPR;
398 if (!LaneIndex) {
399 LaneVGPR = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
400 SpillVGPRs.push_back(Elt: LaneVGPR);
401 } else {
402 LaneVGPR = SpillVGPRs.back();
403 }
404
405 SGPRSpillsToVirtualVGPRLanes[FI].emplace_back(args&: LaneVGPR, args&: LaneIndex);
406 return true;
407}
408
/// Pick (for lane 0) or reuse (for later lanes) a physical VGPR whose lanes
/// hold SGPR spills for frame index \p FI. Returns false -- after dropping the
/// FI's partial state -- if no VGPR is available.
bool SIMachineFunctionInfo::allocatePhysicalVGPRForSGPRSpills(
    MachineFunction &MF, int FI, unsigned LaneIndex, bool IsPrologEpilog) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  Register LaneVGPR;
  if (!LaneIndex) {
    // Find the highest available register if called before RA to ensure the
    // lowest registers are available for allocation. The LaneVGPR, in that
    // case, will be shifted back to the lowest range after VGPR allocation.
    LaneVGPR = TRI->findUnusedRegister(MRI, RC: &AMDGPU::VGPR_32RegClass, MF,
                                       ReserveHighestVGPR: !IsPrologEpilog);
    if (LaneVGPR == AMDGPU::NoRegister) {
      // We have no VGPRs left for spilling SGPRs. Reset because we will not
      // partially spill the SGPR to VGPRs.
      SGPRSpillsToPhysicalVGPRLanes.erase(Val: FI);
      return false;
    }

    if (IsPrologEpilog)
      allocateWWMSpill(MF, VGPR: LaneVGPR);

    // Reserve the chosen VGPR as a WWM register and make it live into every
    // block of the function.
    reserveWWMRegister(Reg: LaneVGPR);
    for (MachineBasicBlock &MBB : MF) {
      MBB.addLiveIn(PhysReg: LaneVGPR);
      MBB.sortUniqueLiveIns();
    }
    SpillPhysVGPRs.push_back(Elt: LaneVGPR);
  } else {
    // Later lanes share the most recently allocated physical VGPR.
    LaneVGPR = SpillPhysVGPRs.back();
  }

  SGPRSpillsToPhysicalVGPRLanes[FI].emplace_back(args&: LaneVGPR, args&: LaneIndex);
  return true;
}
444
/// Allocate VGPR lanes to hold the SGPR spill at frame index \p FI, one lane
/// per 4-byte dword of the spill. Uses physical VGPRs when
/// \p SpillToPhysVGPRLane is set, virtual ones otherwise. Returns true when
/// every lane was allocated; on failure the lane counter is rewound so no
/// partial spill remains.
bool SIMachineFunctionInfo::allocateSGPRSpillToVGPRLane(
    MachineFunction &MF, int FI, bool SpillToPhysVGPRLane,
    bool IsPrologEpilog) {
  std::vector<SIRegisterInfo::SpilledReg> &SpillLanes =
      SpillToPhysVGPRLane ? SGPRSpillsToPhysicalVGPRLanes[FI]
                          : SGPRSpillsToVirtualVGPRLanes[FI];

  // This has already been allocated.
  if (!SpillLanes.empty())
    return true;

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  unsigned WaveSize = ST.getWavefrontSize();

  unsigned Size = FrameInfo.getObjectSize(ObjectIdx: FI);
  unsigned NumLanes = Size / 4;

  // A single spill object cannot span more lanes than a wave has.
  if (NumLanes > WaveSize)
    return false;

  assert(Size >= 4 && "invalid sgpr spill size");
  assert(ST.getRegisterInfo()->spillSGPRToVGPR() &&
         "not spilling SGPRs to VGPRs");

  // Lane numbering continues across frame indices; the modulo below starts a
  // new VGPR (LaneIndex 0) whenever the running count wraps the wave size.
  unsigned &NumSpillLanes = SpillToPhysVGPRLane ? NumPhysicalVGPRSpillLanes
                                                : NumVirtualVGPRSpillLanes;

  for (unsigned I = 0; I < NumLanes; ++I, ++NumSpillLanes) {
    unsigned LaneIndex = (NumSpillLanes % WaveSize);

    bool Allocated = SpillToPhysVGPRLane
                         ? allocatePhysicalVGPRForSGPRSpills(MF, FI, LaneIndex,
                                                             IsPrologEpilog)
                         : allocateVirtualVGPRForSGPRSpills(MF, FI, LaneIndex);
    if (!Allocated) {
      // Roll back the lanes this FI consumed so far.
      NumSpillLanes -= I;
      return false;
    }
  }

  return true;
}
488
/// Reserve AGPRs or VGPRs to support spilling for FrameIndex \p FI.
/// Either AGPR is spilled to VGPR or vice versa.
/// Returns true if a \p FI can be eliminated completely.
bool SIMachineFunctionInfo::allocateVGPRSpillToAGPR(MachineFunction &MF,
                                                    int FI,
                                                    bool isAGPRtoVGPR) {
  MachineRegisterInfo &MRI = MF.getRegInfo();
  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();

  assert(ST.hasMAIInsts() && FrameInfo.isSpillSlotObjectIndex(FI));

  auto &Spill = VGPRToAGPRSpills[FI];

  // This has already been allocated.
  if (!Spill.Lanes.empty())
    return Spill.FullyAllocated;

  unsigned Size = FrameInfo.getObjectSize(ObjectIdx: FI);
  unsigned NumLanes = Size / 4;
  Spill.Lanes.resize(N: NumLanes, NV: AMDGPU::NoRegister);

  // Spill into registers of the opposite file: AGPR spills land in VGPRs and
  // vice versa.
  const TargetRegisterClass &RC =
      isAGPRtoVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::AGPR_32RegClass;
  auto Regs = RC.getRegisters();

  auto &SpillRegs = isAGPRtoVGPR ? SpillAGPR : SpillVGPR;
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  Spill.FullyAllocated = true;

  // FIXME: Move allocation logic out of MachineFunctionInfo and initialize
  // once.
  BitVector OtherUsedRegs;
  OtherUsedRegs.resize(N: TRI->getNumRegs());

  // Registers preserved across calls must not be repurposed as spill lanes.
  const uint32_t *CSRMask =
      TRI->getCallPreservedMask(MF, MF.getFunction().getCallingConv());
  if (CSRMask)
    OtherUsedRegs.setBitsInMask(Mask: CSRMask);

  // TODO: Should include register tuples, but doesn't matter with current
  // usage.
  for (MCPhysReg Reg : SpillAGPR)
    OtherUsedRegs.set(Reg);
  for (MCPhysReg Reg : SpillVGPR)
    OtherUsedRegs.set(Reg);

  // Walk the register class once, assigning lanes from the highest index down
  // to the next free, allocatable, otherwise-unused register.
  SmallVectorImpl<MCPhysReg>::const_iterator NextSpillReg = Regs.begin();
  for (int I = NumLanes - 1; I >= 0; --I) {
    NextSpillReg = std::find_if(
        first: NextSpillReg, last: Regs.end(), pred: [&MRI, &OtherUsedRegs](MCPhysReg Reg) {
          return MRI.isAllocatable(PhysReg: Reg) && !MRI.isPhysRegUsed(PhysReg: Reg) &&
                 !OtherUsedRegs[Reg];
        });

    if (NextSpillReg == Regs.end()) { // Registers exhausted
      Spill.FullyAllocated = false;
      break;
    }

    OtherUsedRegs.set(*NextSpillReg);
    SpillRegs.push_back(Elt: *NextSpillReg);
    MRI.reserveReg(PhysReg: *NextSpillReg, TRI);
    Spill.Lanes[I] = *NextSpillReg++;
  }

  return Spill.FullyAllocated;
}
557
/// Remove the stack objects of SGPR spills that were lowered to VGPR lanes,
/// and of dead VGPR->AGPR spills. When \p ResetSGPRSpillStackIDs is set, any
/// remaining SGPR-spill stack object is moved back to the default stack.
/// \returns true if some SGPR spill remains in memory (a stack ID was reset).
bool SIMachineFunctionInfo::removeDeadFrameIndices(
    MachineFrameInfo &MFI, bool ResetSGPRSpillStackIDs) {
  // Remove dead frame indices from function frame, however keep FP & BP since
  // spills for them haven't been inserted yet. And also make sure to remove the
  // frame indices from `SGPRSpillsToVirtualVGPRLanes` data structure,
  // otherwise, it could result in an unexpected side effect and bug, in case of
  // any re-mapping of freed frame indices by later pass(es) like "stack slot
  // coloring".
  for (auto &R : make_early_inc_range(Range&: SGPRSpillsToVirtualVGPRLanes)) {
    MFI.RemoveStackObject(ObjectIdx: R.first);
    SGPRSpillsToVirtualVGPRLanes.erase(Val: R.first);
  }

  // Remove the dead frame indices of CSR SGPRs which are spilled to physical
  // VGPR lanes during SILowerSGPRSpills pass.
  if (!ResetSGPRSpillStackIDs) {
    for (auto &R : make_early_inc_range(Range&: SGPRSpillsToPhysicalVGPRLanes)) {
      MFI.RemoveStackObject(ObjectIdx: R.first);
      SGPRSpillsToPhysicalVGPRLanes.erase(Val: R.first);
    }
  }
  bool HaveSGPRToMemory = false;

  if (ResetSGPRSpillStackIDs) {
    // All other SGPRs must be allocated on the default stack, so reset the
    // stack ID.
    for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd(); I != E;
         ++I) {
      // Prolog/epilog SGPR spills keep their dedicated stack ID.
      if (!checkIndexInPrologEpilogSGPRSpills(FI: I)) {
        if (MFI.getStackID(ObjectIdx: I) == TargetStackID::SGPRSpill) {
          MFI.setStackID(ObjectIdx: I, ID: TargetStackID::Default);
          HaveSGPRToMemory = true;
        }
      }
    }
  }

  // Drop stack objects for VGPR->AGPR spills that are marked dead.
  for (auto &R : VGPRToAGPRSpills) {
    if (R.second.IsDead)
      MFI.RemoveStackObject(ObjectIdx: R.first);
  }

  return HaveSGPRToMemory;
}
602
603int SIMachineFunctionInfo::getScavengeFI(MachineFrameInfo &MFI,
604 const SIRegisterInfo &TRI) {
605 if (ScavengeFI)
606 return *ScavengeFI;
607
608 ScavengeFI =
609 MFI.CreateStackObject(Size: TRI.getSpillSize(RC: AMDGPU::SGPR_32RegClass),
610 Alignment: TRI.getSpillAlign(RC: AMDGPU::SGPR_32RegClass), isSpillSlot: false);
611 return *ScavengeFI;
612}
613
// Next unallocated user SGPR. User SGPRs are assigned contiguously from SGPR0
// and must all be allocated before any system SGPRs.
MCPhysReg SIMachineFunctionInfo::getNextUserSGPR() const {
  assert(NumSystemSGPRs == 0 && "System SGPRs must be added after user SGPRs");
  return AMDGPU::SGPR0 + NumUserSGPRs;
}
618
// Next unallocated system SGPR; system SGPRs follow directly after all user
// SGPRs.
MCPhysReg SIMachineFunctionInfo::getNextSystemSGPR() const {
  return AMDGPU::SGPR0 + NumUserSGPRs + NumSystemSGPRs;
}
622
623void SIMachineFunctionInfo::MRI_NoteNewVirtualRegister(Register Reg) {
624 VRegFlags.grow(N: Reg);
625}
626
627void SIMachineFunctionInfo::MRI_NoteCloneVirtualRegister(Register NewReg,
628 Register SrcReg) {
629 VRegFlags.grow(N: NewReg);
630 VRegFlags[NewReg] = VRegFlags[SrcReg];
631}
632
633Register
634SIMachineFunctionInfo::getGITPtrLoReg(const MachineFunction &MF) const {
635 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
636 if (!ST.isAmdPalOS())
637 return Register();
638 Register GitPtrLo = AMDGPU::SGPR0; // Low GIT address passed in
639 if (ST.hasMergedShaders()) {
640 switch (MF.getFunction().getCallingConv()) {
641 case CallingConv::AMDGPU_HS:
642 case CallingConv::AMDGPU_GS:
643 // Low GIT address is passed in s8 rather than s0 for an LS+HS or
644 // ES+GS merged shader on gfx9+.
645 GitPtrLo = AMDGPU::SGPR8;
646 return GitPtrLo;
647 default:
648 return GitPtrLo;
649 }
650 }
651 return GitPtrLo;
652}
653
654static yaml::StringValue regToString(Register Reg,
655 const TargetRegisterInfo &TRI) {
656 yaml::StringValue Dest;
657 {
658 raw_string_ostream OS(Dest.Value);
659 OS << printReg(Reg, TRI: &TRI);
660 }
661 return Dest;
662}
663
/// Convert the in-memory argument layout to its YAML form.
/// \returns std::nullopt when no argument is set, so the whole field can be
/// omitted from the serialized output.
static std::optional<yaml::SIArgumentInfo>
convertArgumentInfo(const AMDGPUFunctionArgInfo &ArgInfo,
                    const TargetRegisterInfo &TRI) {
  yaml::SIArgumentInfo AI;

  // Serialize one ArgDescriptor into its YAML slot; returns whether the
  // descriptor was set at all.
  auto convertArg = [&](std::optional<yaml::SIArgument> &A,
                        const ArgDescriptor &Arg) {
    if (!Arg)
      return false;

    // Create a register or stack argument.
    yaml::SIArgument SA = yaml::SIArgument::createArgument(IsReg: Arg.isRegister());
    if (Arg.isRegister()) {
      raw_string_ostream OS(SA.RegisterName.Value);
      OS << printReg(Reg: Arg.getRegister(), TRI: &TRI);
    } else
      SA.StackOffset = Arg.getStackOffset();
    // Check and update the optional mask.
    if (Arg.isMasked())
      SA.Mask = Arg.getMask();

    A = std::move(SA);
    return true;
  };

  // Any becomes true if at least one argument was serialized.
  bool Any = false;
  Any |= convertArg(AI.PrivateSegmentBuffer, ArgInfo.PrivateSegmentBuffer);
  Any |= convertArg(AI.DispatchPtr, ArgInfo.DispatchPtr);
  Any |= convertArg(AI.QueuePtr, ArgInfo.QueuePtr);
  Any |= convertArg(AI.KernargSegmentPtr, ArgInfo.KernargSegmentPtr);
  Any |= convertArg(AI.DispatchID, ArgInfo.DispatchID);
  Any |= convertArg(AI.FlatScratchInit, ArgInfo.FlatScratchInit);
  Any |= convertArg(AI.LDSKernelId, ArgInfo.LDSKernelId);
  Any |= convertArg(AI.PrivateSegmentSize, ArgInfo.PrivateSegmentSize);
  Any |= convertArg(AI.WorkGroupIDX, ArgInfo.WorkGroupIDX);
  Any |= convertArg(AI.WorkGroupIDY, ArgInfo.WorkGroupIDY);
  Any |= convertArg(AI.WorkGroupIDZ, ArgInfo.WorkGroupIDZ);
  Any |= convertArg(AI.WorkGroupInfo, ArgInfo.WorkGroupInfo);
  Any |= convertArg(AI.PrivateSegmentWaveByteOffset,
                    ArgInfo.PrivateSegmentWaveByteOffset);
  Any |= convertArg(AI.ImplicitArgPtr, ArgInfo.ImplicitArgPtr);
  Any |= convertArg(AI.ImplicitBufferPtr, ArgInfo.ImplicitBufferPtr);
  Any |= convertArg(AI.WorkItemIDX, ArgInfo.WorkItemIDX);
  Any |= convertArg(AI.WorkItemIDY, ArgInfo.WorkItemIDY);
  Any |= convertArg(AI.WorkItemIDZ, ArgInfo.WorkItemIDZ);

  // Write FirstKernArgPreloadReg separately, since it's a Register,
  // not ArgDescriptor.
  if (ArgInfo.FirstKernArgPreloadReg) {
    Register Reg = ArgInfo.FirstKernArgPreloadReg;
    assert(Reg.isPhysical() &&
           "FirstKernArgPreloadReg must be a physical register");

    yaml::SIArgument SA = yaml::SIArgument::createArgument(IsReg: true);
    raw_string_ostream OS(SA.RegisterName.Value);
    OS << printReg(Reg, TRI: &TRI);

    AI.FirstKernArgPreloadReg = SA;
    Any = true;
  }

  if (Any)
    return AI;

  return std::nullopt;
}
730
/// Serialize the target-specific MachineFunctionInfo \p MFI into its YAML
/// representation. Registers are rendered as strings; optional fields are only
/// set when the corresponding register/frame index is valid.
yaml::SIMachineFunctionInfo::SIMachineFunctionInfo(
    const llvm::SIMachineFunctionInfo &MFI, const TargetRegisterInfo &TRI,
    const llvm::MachineFunction &MF)
    : ExplicitKernArgSize(MFI.getExplicitKernArgSize()),
      MaxKernArgAlign(MFI.getMaxKernArgAlign()), LDSSize(MFI.getLDSSize()),
      GDSSize(MFI.getGDSSize()), DynLDSAlign(MFI.getDynLDSAlign()),
      IsEntryFunction(MFI.isEntryFunction()), MemoryBound(MFI.isMemoryBound()),
      WaveLimiter(MFI.needsWaveLimiter()),
      HasSpilledSGPRs(MFI.hasSpilledSGPRs()),
      HasSpilledVGPRs(MFI.hasSpilledVGPRs()),
      NumWaveDispatchSGPRs(MFI.getNumWaveDispatchSGPRs()),
      NumWaveDispatchVGPRs(MFI.getNumWaveDispatchVGPRs()),
      HighBitsOf32BitAddress(MFI.get32BitAddressHighBits()),
      Occupancy(MFI.getOccupancy()),
      ScratchRSrcReg(regToString(Reg: MFI.getScratchRSrcReg(), TRI)),
      FrameOffsetReg(regToString(Reg: MFI.getFrameOffsetReg(), TRI)),
      StackPtrOffsetReg(regToString(Reg: MFI.getStackPtrOffsetReg(), TRI)),
      BytesInStackArgArea(MFI.getBytesInStackArgArea()),
      ReturnsVoid(MFI.returnsVoid()),
      ArgInfo(convertArgumentInfo(ArgInfo: MFI.getArgInfo(), TRI)),
      PSInputAddr(MFI.getPSInputAddr()), PSInputEnable(MFI.getPSInputEnable()),
      MaxMemoryClusterDWords(MFI.getMaxMemoryClusterDWords()),
      Mode(MFI.getMode()), HasInitWholeWave(MFI.hasInitWholeWave()),
      IsWholeWaveFunction(MFI.isWholeWaveFunction()),
      DynamicVGPRBlockSize(MFI.getDynamicVGPRBlockSize()),
      ScratchReservedForDynamicVGPRs(MFI.getScratchReservedForDynamicVGPRs()),
      NumKernargPreloadSGPRs(MFI.getNumKernargPreloadedSGPRs()) {
  // Register lists are serialized as string vectors.
  for (Register Reg : MFI.getSGPRSpillPhysVGPRs())
    SpillPhysVGPRS.push_back(Elt: regToString(Reg, TRI));

  for (Register Reg : MFI.getWWMReservedRegs())
    WWMReservedRegs.push_back(Elt: regToString(Reg, TRI));

  // Only emit the optional registers when they are actually set.
  if (MFI.getLongBranchReservedReg())
    LongBranchReservedReg = regToString(Reg: MFI.getLongBranchReservedReg(), TRI);
  if (MFI.getVGPRForAGPRCopy())
    VGPRForAGPRCopy = regToString(Reg: MFI.getVGPRForAGPRCopy(), TRI);

  if (MFI.getSGPRForEXECCopy())
    SGPRForEXECCopy = regToString(Reg: MFI.getSGPRForEXECCopy(), TRI);

  auto SFI = MFI.getOptionalScavengeFI();
  if (SFI)
    ScavengeFI = yaml::FrameIndex(*SFI, MF.getFrameInfo());
}
776
777void yaml::SIMachineFunctionInfo::mappingImpl(yaml::IO &YamlIO) {
778 MappingTraits<SIMachineFunctionInfo>::mapping(YamlIO, MFI&: *this);
779}
780
/// Restore the base (scalar) fields of this MachineFunctionInfo from parsed
/// YAML. \returns true on error, filling \p Error and \p SourceRange with a
/// diagnostic; false on success.
bool SIMachineFunctionInfo::initializeBaseYamlFields(
    const yaml::SIMachineFunctionInfo &YamlMFI, const MachineFunction &MF,
    PerFunctionMIParsingState &PFS, SMDiagnostic &Error, SMRange &SourceRange) {
  ExplicitKernArgSize = YamlMFI.ExplicitKernArgSize;
  MaxKernArgAlign = YamlMFI.MaxKernArgAlign;
  LDSSize = YamlMFI.LDSSize;
  GDSSize = YamlMFI.GDSSize;
  DynLDSAlign = YamlMFI.DynLDSAlign;
  PSInputAddr = YamlMFI.PSInputAddr;
  PSInputEnable = YamlMFI.PSInputEnable;
  MaxMemoryClusterDWords = YamlMFI.MaxMemoryClusterDWords;
  HighBitsOf32BitAddress = YamlMFI.HighBitsOf32BitAddress;
  Occupancy = YamlMFI.Occupancy;
  IsEntryFunction = YamlMFI.IsEntryFunction;
  MemoryBound = YamlMFI.MemoryBound;
  WaveLimiter = YamlMFI.WaveLimiter;
  HasSpilledSGPRs = YamlMFI.HasSpilledSGPRs;
  HasSpilledVGPRs = YamlMFI.HasSpilledVGPRs;
  NumWaveDispatchSGPRs = YamlMFI.NumWaveDispatchSGPRs;
  NumWaveDispatchVGPRs = YamlMFI.NumWaveDispatchVGPRs;
  BytesInStackArgArea = YamlMFI.BytesInStackArgArea;
  ReturnsVoid = YamlMFI.ReturnsVoid;
  IsWholeWaveFunction = YamlMFI.IsWholeWaveFunction;

  UserSGPRInfo.allocKernargPreloadSGPRs(NumSGPRs: YamlMFI.NumKernargPreloadSGPRs);

  if (YamlMFI.ScavengeFI) {
    auto FIOrErr = YamlMFI.ScavengeFI->getFI(MFI: MF.getFrameInfo());
    if (!FIOrErr) {
      // Create a diagnostic for the frame index.
      const MemoryBuffer &Buffer =
          *PFS.SM->getMemoryBuffer(i: PFS.SM->getMainFileID());

      Error = SMDiagnostic(*PFS.SM, SMLoc(), Buffer.getBufferIdentifier(), 1, 1,
                           SourceMgr::DK_Error, toString(E: FIOrErr.takeError()),
                           "", {}, {});
      SourceRange = YamlMFI.ScavengeFI->SourceRange;
      return true;
    }
    ScavengeFI = *FIOrErr;
  } else {
    ScavengeFI = std::nullopt;
  }
  return false;
}
826
827bool SIMachineFunctionInfo::mayUseAGPRs(const Function &F) const {
828 auto [MinNumAGPR, MaxNumAGPR] =
829 AMDGPU::getIntegerPairAttribute(F, Name: "amdgpu-agpr-alloc", Default: {~0u, ~0u},
830 /*OnlyFirstRequired=*/true);
831 return MinNumAGPR != 0u;
832}
833