//===- SIMachineFunctionInfo.cpp - SI Machine Function Info ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "SIMachineFunctionInfo.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIRegisterInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MIRParser/MIParser.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h"
#include <cassert>
#include <optional>
#include <vector>

enum { MAX_LANES = 64 };

using namespace llvm;

const GCNTargetMachine &getTM(const GCNSubtarget *STI) {
  const SITargetLowering *TLI = STI->getTargetLowering();
  return static_cast<const GCNTargetMachine &>(TLI->getTargetMachine());
}

SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
                                             const GCNSubtarget *STI)
    : AMDGPUMachineFunction(F, *STI), Mode(F, *STI), GWSResourcePSV(getTM(STI)),
      UserSGPRInfo(F, *STI), WorkGroupIDX(false), WorkGroupIDY(false),
      WorkGroupIDZ(false), WorkGroupInfo(false), LDSKernelId(false),
      PrivateSegmentWaveByteOffset(false), WorkItemIDX(false),
      WorkItemIDY(false), WorkItemIDZ(false), ImplicitArgPtr(false),
      GITPtrHigh(0xffffffff), HighBitsOf32BitAddress(0) {
  const GCNSubtarget &ST = *static_cast<const GCNSubtarget *>(STI);
  FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(F);
  WavesPerEU = ST.getWavesPerEU(F);
  MaxNumWorkGroups = ST.getMaxNumWorkGroups(F);
  assert(MaxNumWorkGroups.size() == 3);

  Occupancy = ST.computeOccupancy(F, getLDSSize());
  CallingConv::ID CC = F.getCallingConv();

  VRegFlags.reserve(1024);

  const bool IsKernel = CC == CallingConv::AMDGPU_KERNEL ||
                        CC == CallingConv::SPIR_KERNEL;

  if (IsKernel) {
    WorkGroupIDX = true;
    WorkItemIDX = true;
  } else if (CC == CallingConv::AMDGPU_PS) {
    PSInputAddr = AMDGPU::getInitialPSInputAddr(F);
  }

  MayNeedAGPRs = ST.hasMAIInsts();

  if (AMDGPU::isChainCC(CC)) {
    // Chain functions don't receive an SP from their caller, but are free to
    // set one up. For now, we can use s32 to match what amdgpu_gfx functions
    // would use if called, but this can be revisited.
    // FIXME: Only reserve this if we actually need it.
    StackPtrOffsetReg = AMDGPU::SGPR32;

    ScratchRSrcReg = AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51;

    ArgInfo.PrivateSegmentBuffer =
        ArgDescriptor::createRegister(ScratchRSrcReg);

    ImplicitArgPtr = false;
  } else if (!isEntryFunction()) {
    if (CC != CallingConv::AMDGPU_Gfx)
      ArgInfo = AMDGPUArgumentUsageInfo::FixedABIFunctionInfo;

    FrameOffsetReg = AMDGPU::SGPR33;
    StackPtrOffsetReg = AMDGPU::SGPR32;

    if (!ST.enableFlatScratch()) {
      // Non-entry functions have no special inputs for now; other registers
      // are required for scratch access.
      ScratchRSrcReg = AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3;

      ArgInfo.PrivateSegmentBuffer =
          ArgDescriptor::createRegister(ScratchRSrcReg);
    }

    if (!F.hasFnAttribute("amdgpu-no-implicitarg-ptr"))
      ImplicitArgPtr = true;
  } else {
    ImplicitArgPtr = false;
    MaxKernArgAlign = std::max(ST.getAlignmentForImplicitArgPtr(),
                               MaxKernArgAlign);

    if (ST.hasGFX90AInsts() &&
        ST.getMaxNumVGPRs(F) <= AMDGPU::VGPR_32RegClass.getNumRegs() &&
        !mayUseAGPRs(F))
      MayNeedAGPRs = false; // We will select all MAI with VGPR operands.
  }

  if (!AMDGPU::isGraphics(CC) ||
      ((CC == CallingConv::AMDGPU_CS || CC == CallingConv::AMDGPU_Gfx) &&
       ST.hasArchitectedSGPRs())) {
    if (IsKernel || !F.hasFnAttribute("amdgpu-no-workgroup-id-x"))
      WorkGroupIDX = true;

    if (!F.hasFnAttribute("amdgpu-no-workgroup-id-y"))
      WorkGroupIDY = true;

    if (!F.hasFnAttribute("amdgpu-no-workgroup-id-z"))
      WorkGroupIDZ = true;
  }

  if (!AMDGPU::isGraphics(CC)) {
    if (IsKernel || !F.hasFnAttribute("amdgpu-no-workitem-id-x"))
      WorkItemIDX = true;

    if (!F.hasFnAttribute("amdgpu-no-workitem-id-y") &&
        ST.getMaxWorkitemID(F, 1) != 0)
      WorkItemIDY = true;

    if (!F.hasFnAttribute("amdgpu-no-workitem-id-z") &&
        ST.getMaxWorkitemID(F, 2) != 0)
      WorkItemIDZ = true;

    if (!IsKernel && !F.hasFnAttribute("amdgpu-no-lds-kernel-id"))
      LDSKernelId = true;
  }

  if (isEntryFunction()) {
    // X, XY, and XYZ are the only supported combinations, so make sure Y is
    // enabled if Z is.
    if (WorkItemIDZ)
      WorkItemIDY = true;

    if (!ST.flatScratchIsArchitected()) {
      PrivateSegmentWaveByteOffset = true;

      // HS and GS always have the scratch wave offset in SGPR5 on GFX9.
      if (ST.getGeneration() >= AMDGPUSubtarget::GFX9 &&
          (CC == CallingConv::AMDGPU_HS || CC == CallingConv::AMDGPU_GS))
        ArgInfo.PrivateSegmentWaveByteOffset =
            ArgDescriptor::createRegister(AMDGPU::SGPR5);
    }
  }

  Attribute A = F.getFnAttribute("amdgpu-git-ptr-high");
  StringRef S = A.getValueAsString();
  if (!S.empty())
    S.consumeInteger(0, GITPtrHigh);

  A = F.getFnAttribute("amdgpu-32bit-address-high-bits");
  S = A.getValueAsString();
  if (!S.empty())
    S.consumeInteger(0, HighBitsOf32BitAddress);

  // On GFX908, in order to guarantee copying between AGPRs, we need a scratch
  // VGPR available at all times. For now, reserve the highest available VGPR.
  // After RA, shift it to the lowest available unused VGPR if one exists.
  if (ST.hasMAIInsts() && !ST.hasGFX90AInsts()) {
    VGPRForAGPRCopy =
        AMDGPU::VGPR_32RegClass.getRegister(ST.getMaxNumVGPRs(F) - 1);
  }
}

MachineFunctionInfo *SIMachineFunctionInfo::clone(
    BumpPtrAllocator &Allocator, MachineFunction &DestMF,
    const DenseMap<MachineBasicBlock *, MachineBasicBlock *> &Src2DstMBB)
    const {
  return DestMF.cloneInfo<SIMachineFunctionInfo>(*this);
}

void SIMachineFunctionInfo::limitOccupancy(const MachineFunction &MF) {
  limitOccupancy(getMaxWavesPerEU());
  const GCNSubtarget& ST = MF.getSubtarget<GCNSubtarget>();
  limitOccupancy(ST.getOccupancyWithLocalMemSize(getLDSSize(),
                                                 MF.getFunction()));
}

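// Each of the following helpers assigns the next free user SGPR(s) to a
// preloaded kernel argument, records the assignment in ArgInfo, and advances
// NumUserSGPRs by the width of the argument in dwords.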
Register SIMachineFunctionInfo::addPrivateSegmentBuffer(
    const SIRegisterInfo &TRI) {
  ArgInfo.PrivateSegmentBuffer =
      ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
          getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SGPR_128RegClass));
  NumUserSGPRs += 4;
  return ArgInfo.PrivateSegmentBuffer.getRegister();
}

Register SIMachineFunctionInfo::addDispatchPtr(const SIRegisterInfo &TRI) {
  ArgInfo.DispatchPtr = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
      getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
  NumUserSGPRs += 2;
  return ArgInfo.DispatchPtr.getRegister();
}

Register SIMachineFunctionInfo::addQueuePtr(const SIRegisterInfo &TRI) {
  ArgInfo.QueuePtr = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
      getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
  NumUserSGPRs += 2;
  return ArgInfo.QueuePtr.getRegister();
}

Register SIMachineFunctionInfo::addKernargSegmentPtr(const SIRegisterInfo &TRI) {
  ArgInfo.KernargSegmentPtr
    = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
        getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
  NumUserSGPRs += 2;
  return ArgInfo.KernargSegmentPtr.getRegister();
}

Register SIMachineFunctionInfo::addDispatchID(const SIRegisterInfo &TRI) {
  ArgInfo.DispatchID = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
      getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
  NumUserSGPRs += 2;
  return ArgInfo.DispatchID.getRegister();
}

Register SIMachineFunctionInfo::addFlatScratchInit(const SIRegisterInfo &TRI) {
  ArgInfo.FlatScratchInit = ArgDescriptor::createRegister(
      TRI.getMatchingSuperReg(getNextUserSGPR(), AMDGPU::sub0,
                              &AMDGPU::SReg_64RegClass));
  NumUserSGPRs += 2;
  return ArgInfo.FlatScratchInit.getRegister();
}

Register SIMachineFunctionInfo::addPrivateSegmentSize(const SIRegisterInfo &TRI) {
  ArgInfo.PrivateSegmentSize = ArgDescriptor::createRegister(getNextUserSGPR());
  NumUserSGPRs += 1;
  return ArgInfo.PrivateSegmentSize.getRegister();
}

Register SIMachineFunctionInfo::addImplicitBufferPtr(const SIRegisterInfo &TRI) {
  ArgInfo.ImplicitBufferPtr = ArgDescriptor::createRegister(
      TRI.getMatchingSuperReg(getNextUserSGPR(), AMDGPU::sub0,
                              &AMDGPU::SReg_64RegClass));
  NumUserSGPRs += 2;
  return ArgInfo.ImplicitBufferPtr.getRegister();
}

Register SIMachineFunctionInfo::addLDSKernelId() {
  ArgInfo.LDSKernelId = ArgDescriptor::createRegister(getNextUserSGPR());
  NumUserSGPRs += 1;
  return ArgInfo.LDSKernelId.getRegister();
}

SmallVectorImpl<MCRegister> *SIMachineFunctionInfo::addPreloadedKernArg(
    const SIRegisterInfo &TRI, const TargetRegisterClass *RC,
    unsigned AllocSizeDWord, int KernArgIdx, int PaddingSGPRs) {
  assert(!ArgInfo.PreloadKernArgs.count(KernArgIdx) &&
         "Preload kernel argument allocated twice.");
  NumUserSGPRs += PaddingSGPRs;
  // If the available register tuples are aligned with the kernarg to be
  // preloaded, use that register; otherwise we need to use a set of SGPRs
  // and merge them.
  Register PreloadReg =
      TRI.getMatchingSuperReg(getNextUserSGPR(), AMDGPU::sub0, RC);
  if (PreloadReg &&
      (RC == &AMDGPU::SReg_32RegClass || RC == &AMDGPU::SReg_64RegClass)) {
    ArgInfo.PreloadKernArgs[KernArgIdx].Regs.push_back(PreloadReg);
    NumUserSGPRs += AllocSizeDWord;
  } else {
    for (unsigned I = 0; I < AllocSizeDWord; ++I) {
      ArgInfo.PreloadKernArgs[KernArgIdx].Regs.push_back(getNextUserSGPR());
      NumUserSGPRs++;
    }
  }

  // Track the actual number of SGPRs that HW will preload to.
  UserSGPRInfo.allocKernargPreloadSGPRs(AllocSizeDWord + PaddingSGPRs);
  return &ArgInfo.PreloadKernArgs[KernArgIdx].Regs;
}

void SIMachineFunctionInfo::allocateWWMSpill(MachineFunction &MF, Register VGPR,
                                             uint64_t Size, Align Alignment) {
  // Skip if it is an entry function or the register is already added.
  if (isEntryFunction() || WWMSpills.count(VGPR))
    return;

  // Skip if this is a function with the amdgpu_cs_chain or
  // amdgpu_cs_chain_preserve calling convention and this is a scratch register.
  // We never need to allocate a spill for these because we don't even need to
  // restore the inactive lanes for them (they're scratchier than the usual
  // scratch registers).
  if (isChainFunction() && SIRegisterInfo::isChainScratchRegister(VGPR))
    return;

  WWMSpills.insert(std::make_pair(
      VGPR, MF.getFrameInfo().CreateSpillStackObject(Size, Alignment)));
}

// Separate out the callee-saved and scratch registers.
void SIMachineFunctionInfo::splitWWMSpillRegisters(
    MachineFunction &MF,
    SmallVectorImpl<std::pair<Register, int>> &CalleeSavedRegs,
    SmallVectorImpl<std::pair<Register, int>> &ScratchRegs) const {
  const MCPhysReg *CSRegs = MF.getRegInfo().getCalleeSavedRegs();
  for (auto &Reg : WWMSpills) {
    if (isCalleeSavedReg(CSRegs, Reg.first))
      CalleeSavedRegs.push_back(Reg);
    else
      ScratchRegs.push_back(Reg);
  }
}

bool SIMachineFunctionInfo::isCalleeSavedReg(const MCPhysReg *CSRegs,
                                             MCPhysReg Reg) const {
  for (unsigned I = 0; CSRegs[I]; ++I) {
    if (CSRegs[I] == Reg)
      return true;
  }

  return false;
}

void SIMachineFunctionInfo::shiftSpillPhysVGPRsToLowestRange(
    MachineFunction &MF) {
  const SIRegisterInfo *TRI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  for (Register &Reg : SpillPhysVGPRs) {
    Register NewReg =
        TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF);
    if (!NewReg || NewReg >= Reg)
      break;

    MRI.replaceRegWith(Reg, NewReg);

    // Update various tables with the new VGPR.
    WWMReservedRegs.remove(Reg);
    WWMReservedRegs.insert(NewReg);
    WWMSpills.insert(std::make_pair(NewReg, WWMSpills[Reg]));
    WWMSpills.erase(Reg);

    for (MachineBasicBlock &MBB : MF) {
      MBB.removeLiveIn(Reg);
      MBB.sortUniqueLiveIns();
    }

    Reg = NewReg;
  }
}

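// SGPR spills are placed into individual lanes of a VGPR: each spilled 32-bit
// SGPR occupies one lane. The two helpers below pick the VGPR to hold the
// lanes, either a virtual VGPR that the register allocator assigns later, or a
// reserved physical WWM VGPR (used for CSR and prolog/epilog SGPR spills).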
bool SIMachineFunctionInfo::allocateVirtualVGPRForSGPRSpills(
    MachineFunction &MF, int FI, unsigned LaneIndex) {
  MachineRegisterInfo &MRI = MF.getRegInfo();
  Register LaneVGPR;
  if (!LaneIndex) {
    LaneVGPR = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    SpillVGPRs.push_back(LaneVGPR);
  } else {
    LaneVGPR = SpillVGPRs.back();
  }

  SGPRSpillsToVirtualVGPRLanes[FI].emplace_back(LaneVGPR, LaneIndex);
  return true;
}

bool SIMachineFunctionInfo::allocatePhysicalVGPRForSGPRSpills(
    MachineFunction &MF, int FI, unsigned LaneIndex, bool IsPrologEpilog) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  Register LaneVGPR;
  if (!LaneIndex) {
    // Find the highest available register if called before RA to ensure the
    // lowest registers are available for allocation. The LaneVGPR, in that
    // case, will be shifted back to the lowest range after VGPR allocation.
    LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF,
                                       !IsPrologEpilog);
    if (LaneVGPR == AMDGPU::NoRegister) {
      // We have no VGPRs left for spilling SGPRs. Reset because we will not
      // partially spill the SGPR to VGPRs.
      SGPRSpillsToPhysicalVGPRLanes.erase(FI);
      return false;
    }

    allocateWWMSpill(MF, LaneVGPR);
    reserveWWMRegister(LaneVGPR);
    for (MachineBasicBlock &MBB : MF) {
      MBB.addLiveIn(LaneVGPR);
      MBB.sortUniqueLiveIns();
    }
    SpillPhysVGPRs.push_back(LaneVGPR);
  } else {
    LaneVGPR = SpillPhysVGPRs.back();
  }

  SGPRSpillsToPhysicalVGPRLanes[FI].emplace_back(LaneVGPR, LaneIndex);
  return true;
}

bool SIMachineFunctionInfo::allocateSGPRSpillToVGPRLane(
    MachineFunction &MF, int FI, bool SpillToPhysVGPRLane,
    bool IsPrologEpilog) {
  std::vector<SIRegisterInfo::SpilledReg> &SpillLanes =
      SpillToPhysVGPRLane ? SGPRSpillsToPhysicalVGPRLanes[FI]
                          : SGPRSpillsToVirtualVGPRLanes[FI];

  // This has already been allocated.
  if (!SpillLanes.empty())
    return true;

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  unsigned WaveSize = ST.getWavefrontSize();

  unsigned Size = FrameInfo.getObjectSize(FI);
  unsigned NumLanes = Size / 4;

  if (NumLanes > WaveSize)
    return false;

  assert(Size >= 4 && "invalid sgpr spill size");
  assert(ST.getRegisterInfo()->spillSGPRToVGPR() &&
         "not spilling SGPRs to VGPRs");

  unsigned &NumSpillLanes = SpillToPhysVGPRLane ? NumPhysicalVGPRSpillLanes
                                                : NumVirtualVGPRSpillLanes;

  for (unsigned I = 0; I < NumLanes; ++I, ++NumSpillLanes) {
    unsigned LaneIndex = (NumSpillLanes % WaveSize);

    bool Allocated = SpillToPhysVGPRLane
                         ? allocatePhysicalVGPRForSGPRSpills(MF, FI, LaneIndex,
                                                             IsPrologEpilog)
                         : allocateVirtualVGPRForSGPRSpills(MF, FI, LaneIndex);
    if (!Allocated) {
      NumSpillLanes -= I;
      return false;
    }
  }

  return true;
}

/// Reserve AGPRs or VGPRs to support spilling for FrameIndex \p FI.
/// Either an AGPR is spilled to a VGPR or vice versa.
/// Returns true if \p FI can be eliminated completely.
bool SIMachineFunctionInfo::allocateVGPRSpillToAGPR(MachineFunction &MF,
                                                    int FI,
                                                    bool isAGPRtoVGPR) {
  MachineRegisterInfo &MRI = MF.getRegInfo();
  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();

  assert(ST.hasMAIInsts() && FrameInfo.isSpillSlotObjectIndex(FI));

  auto &Spill = VGPRToAGPRSpills[FI];

  // This has already been allocated.
  if (!Spill.Lanes.empty())
    return Spill.FullyAllocated;

  unsigned Size = FrameInfo.getObjectSize(FI);
  unsigned NumLanes = Size / 4;
  Spill.Lanes.resize(NumLanes, AMDGPU::NoRegister);

  const TargetRegisterClass &RC =
      isAGPRtoVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::AGPR_32RegClass;
  auto Regs = RC.getRegisters();

  auto &SpillRegs = isAGPRtoVGPR ? SpillAGPR : SpillVGPR;
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  Spill.FullyAllocated = true;

  // FIXME: Move allocation logic out of MachineFunctionInfo and initialize
  // once.
  BitVector OtherUsedRegs;
  OtherUsedRegs.resize(TRI->getNumRegs());

  const uint32_t *CSRMask =
      TRI->getCallPreservedMask(MF, MF.getFunction().getCallingConv());
  if (CSRMask)
    OtherUsedRegs.setBitsInMask(CSRMask);

  // TODO: Should include register tuples, but doesn't matter with current
  // usage.
  for (MCPhysReg Reg : SpillAGPR)
    OtherUsedRegs.set(Reg);
  for (MCPhysReg Reg : SpillVGPR)
    OtherUsedRegs.set(Reg);

  SmallVectorImpl<MCPhysReg>::const_iterator NextSpillReg = Regs.begin();
  for (int I = NumLanes - 1; I >= 0; --I) {
    NextSpillReg = std::find_if(
        NextSpillReg, Regs.end(), [&MRI, &OtherUsedRegs](MCPhysReg Reg) {
          return MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg) &&
                 !OtherUsedRegs[Reg];
        });

    if (NextSpillReg == Regs.end()) { // Registers exhausted
      Spill.FullyAllocated = false;
      break;
    }

    OtherUsedRegs.set(*NextSpillReg);
    SpillRegs.push_back(*NextSpillReg);
    MRI.reserveReg(*NextSpillReg, TRI);
    Spill.Lanes[I] = *NextSpillReg++;
  }

  return Spill.FullyAllocated;
}

bool SIMachineFunctionInfo::removeDeadFrameIndices(
    MachineFrameInfo &MFI, bool ResetSGPRSpillStackIDs) {
  // Remove dead frame indices from the function frame, but keep FP & BP since
  // their spills haven't been inserted yet. Also remove the frame indices from
  // the `SGPRSpillsToVirtualVGPRLanes` data structure; otherwise, later
  // pass(es) like "stack slot coloring" could re-map the freed frame indices
  // and cause unexpected side effects and bugs.
  for (auto &R : make_early_inc_range(SGPRSpillsToVirtualVGPRLanes)) {
    MFI.RemoveStackObject(R.first);
    SGPRSpillsToVirtualVGPRLanes.erase(R.first);
  }

  // Remove the dead frame indices of CSR SGPRs which are spilled to physical
  // VGPR lanes during the SILowerSGPRSpills pass.
  if (!ResetSGPRSpillStackIDs) {
    for (auto &R : make_early_inc_range(SGPRSpillsToPhysicalVGPRLanes)) {
      MFI.RemoveStackObject(R.first);
      SGPRSpillsToPhysicalVGPRLanes.erase(R.first);
    }
  }
  bool HaveSGPRToMemory = false;

  if (ResetSGPRSpillStackIDs) {
    // All other SGPRs must be allocated on the default stack, so reset the
    // stack ID.
    for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd(); I != E;
         ++I) {
      if (!checkIndexInPrologEpilogSGPRSpills(I)) {
        if (MFI.getStackID(I) == TargetStackID::SGPRSpill) {
          MFI.setStackID(I, TargetStackID::Default);
          HaveSGPRToMemory = true;
        }
      }
    }
  }

  for (auto &R : VGPRToAGPRSpills) {
    if (R.second.IsDead)
      MFI.RemoveStackObject(R.first);
  }

  return HaveSGPRToMemory;
}

int SIMachineFunctionInfo::getScavengeFI(MachineFrameInfo &MFI,
                                         const SIRegisterInfo &TRI) {
  if (ScavengeFI)
    return *ScavengeFI;

  ScavengeFI =
      MFI.CreateStackObject(TRI.getSpillSize(AMDGPU::SGPR_32RegClass),
                            TRI.getSpillAlign(AMDGPU::SGPR_32RegClass), false);
  return *ScavengeFI;
}

MCPhysReg SIMachineFunctionInfo::getNextUserSGPR() const {
  assert(NumSystemSGPRs == 0 && "System SGPRs must be added after user SGPRs");
  return AMDGPU::SGPR0 + NumUserSGPRs;
}

MCPhysReg SIMachineFunctionInfo::getNextSystemSGPR() const {
  return AMDGPU::SGPR0 + NumUserSGPRs + NumSystemSGPRs;
}

void SIMachineFunctionInfo::MRI_NoteNewVirtualRegister(Register Reg) {
  VRegFlags.grow(Reg);
}

void SIMachineFunctionInfo::MRI_NoteCloneVirtualRegister(Register NewReg,
                                                         Register SrcReg) {
  VRegFlags.grow(NewReg);
  VRegFlags[NewReg] = VRegFlags[SrcReg];
}

Register
SIMachineFunctionInfo::getGITPtrLoReg(const MachineFunction &MF) const {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  if (!ST.isAmdPalOS())
    return Register();
  Register GitPtrLo = AMDGPU::SGPR0; // Low GIT address passed in
  if (ST.hasMergedShaders()) {
    switch (MF.getFunction().getCallingConv()) {
    case CallingConv::AMDGPU_HS:
    case CallingConv::AMDGPU_GS:
      // Low GIT address is passed in s8 rather than s0 for an LS+HS or
      // ES+GS merged shader on gfx9+.
      GitPtrLo = AMDGPU::SGPR8;
      return GitPtrLo;
    default:
      return GitPtrLo;
    }
  }
  return GitPtrLo;
}

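// MIR serialization support: convert the target-specific machine function info
// to and from its YAML representation (yaml::SIMachineFunctionInfo).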
static yaml::StringValue regToString(Register Reg,
                                     const TargetRegisterInfo &TRI) {
  yaml::StringValue Dest;
  {
    raw_string_ostream OS(Dest.Value);
    OS << printReg(Reg, &TRI);
  }
  return Dest;
}

static std::optional<yaml::SIArgumentInfo>
convertArgumentInfo(const AMDGPUFunctionArgInfo &ArgInfo,
                    const TargetRegisterInfo &TRI) {
  yaml::SIArgumentInfo AI;

  auto convertArg = [&](std::optional<yaml::SIArgument> &A,
                        const ArgDescriptor &Arg) {
    if (!Arg)
      return false;

    // Create a register or stack argument.
    yaml::SIArgument SA = yaml::SIArgument::createArgument(Arg.isRegister());
    if (Arg.isRegister()) {
      raw_string_ostream OS(SA.RegisterName.Value);
      OS << printReg(Arg.getRegister(), &TRI);
    } else
      SA.StackOffset = Arg.getStackOffset();
    // Check and update the optional mask.
    if (Arg.isMasked())
      SA.Mask = Arg.getMask();

    A = SA;
    return true;
  };

  // TODO: Need to serialize kernarg preloads.
  bool Any = false;
  Any |= convertArg(AI.PrivateSegmentBuffer, ArgInfo.PrivateSegmentBuffer);
  Any |= convertArg(AI.DispatchPtr, ArgInfo.DispatchPtr);
  Any |= convertArg(AI.QueuePtr, ArgInfo.QueuePtr);
  Any |= convertArg(AI.KernargSegmentPtr, ArgInfo.KernargSegmentPtr);
  Any |= convertArg(AI.DispatchID, ArgInfo.DispatchID);
  Any |= convertArg(AI.FlatScratchInit, ArgInfo.FlatScratchInit);
  Any |= convertArg(AI.LDSKernelId, ArgInfo.LDSKernelId);
  Any |= convertArg(AI.PrivateSegmentSize, ArgInfo.PrivateSegmentSize);
  Any |= convertArg(AI.WorkGroupIDX, ArgInfo.WorkGroupIDX);
  Any |= convertArg(AI.WorkGroupIDY, ArgInfo.WorkGroupIDY);
  Any |= convertArg(AI.WorkGroupIDZ, ArgInfo.WorkGroupIDZ);
  Any |= convertArg(AI.WorkGroupInfo, ArgInfo.WorkGroupInfo);
  Any |= convertArg(AI.PrivateSegmentWaveByteOffset,
                    ArgInfo.PrivateSegmentWaveByteOffset);
  Any |= convertArg(AI.ImplicitArgPtr, ArgInfo.ImplicitArgPtr);
  Any |= convertArg(AI.ImplicitBufferPtr, ArgInfo.ImplicitBufferPtr);
  Any |= convertArg(AI.WorkItemIDX, ArgInfo.WorkItemIDX);
  Any |= convertArg(AI.WorkItemIDY, ArgInfo.WorkItemIDY);
  Any |= convertArg(AI.WorkItemIDZ, ArgInfo.WorkItemIDZ);

  if (Any)
    return AI;

  return std::nullopt;
}

yaml::SIMachineFunctionInfo::SIMachineFunctionInfo(
    const llvm::SIMachineFunctionInfo &MFI, const TargetRegisterInfo &TRI,
    const llvm::MachineFunction &MF)
    : ExplicitKernArgSize(MFI.getExplicitKernArgSize()),
      MaxKernArgAlign(MFI.getMaxKernArgAlign()), LDSSize(MFI.getLDSSize()),
      GDSSize(MFI.getGDSSize()),
      DynLDSAlign(MFI.getDynLDSAlign()), IsEntryFunction(MFI.isEntryFunction()),
      NoSignedZerosFPMath(MFI.hasNoSignedZerosFPMath()),
      MemoryBound(MFI.isMemoryBound()), WaveLimiter(MFI.needsWaveLimiter()),
      HasSpilledSGPRs(MFI.hasSpilledSGPRs()),
      HasSpilledVGPRs(MFI.hasSpilledVGPRs()),
      HighBitsOf32BitAddress(MFI.get32BitAddressHighBits()),
      Occupancy(MFI.getOccupancy()),
      ScratchRSrcReg(regToString(MFI.getScratchRSrcReg(), TRI)),
      FrameOffsetReg(regToString(MFI.getFrameOffsetReg(), TRI)),
      StackPtrOffsetReg(regToString(MFI.getStackPtrOffsetReg(), TRI)),
      BytesInStackArgArea(MFI.getBytesInStackArgArea()),
      ReturnsVoid(MFI.returnsVoid()),
      ArgInfo(convertArgumentInfo(MFI.getArgInfo(), TRI)),
      PSInputAddr(MFI.getPSInputAddr()),
      PSInputEnable(MFI.getPSInputEnable()),
      Mode(MFI.getMode()) {
  for (Register Reg : MFI.getWWMReservedRegs())
    WWMReservedRegs.push_back(regToString(Reg, TRI));

  if (MFI.getLongBranchReservedReg())
    LongBranchReservedReg = regToString(MFI.getLongBranchReservedReg(), TRI);
  if (MFI.getVGPRForAGPRCopy())
    VGPRForAGPRCopy = regToString(MFI.getVGPRForAGPRCopy(), TRI);

  if (MFI.getSGPRForEXECCopy())
    SGPRForEXECCopy = regToString(MFI.getSGPRForEXECCopy(), TRI);

  auto SFI = MFI.getOptionalScavengeFI();
  if (SFI)
    ScavengeFI = yaml::FrameIndex(*SFI, MF.getFrameInfo());
}

void yaml::SIMachineFunctionInfo::mappingImpl(yaml::IO &YamlIO) {
  MappingTraits<SIMachineFunctionInfo>::mapping(YamlIO, *this);
}

bool SIMachineFunctionInfo::initializeBaseYamlFields(
    const yaml::SIMachineFunctionInfo &YamlMFI, const MachineFunction &MF,
    PerFunctionMIParsingState &PFS, SMDiagnostic &Error, SMRange &SourceRange) {
  ExplicitKernArgSize = YamlMFI.ExplicitKernArgSize;
  MaxKernArgAlign = YamlMFI.MaxKernArgAlign;
  LDSSize = YamlMFI.LDSSize;
  GDSSize = YamlMFI.GDSSize;
  DynLDSAlign = YamlMFI.DynLDSAlign;
  PSInputAddr = YamlMFI.PSInputAddr;
  PSInputEnable = YamlMFI.PSInputEnable;
  HighBitsOf32BitAddress = YamlMFI.HighBitsOf32BitAddress;
  Occupancy = YamlMFI.Occupancy;
  IsEntryFunction = YamlMFI.IsEntryFunction;
  NoSignedZerosFPMath = YamlMFI.NoSignedZerosFPMath;
  MemoryBound = YamlMFI.MemoryBound;
  WaveLimiter = YamlMFI.WaveLimiter;
  HasSpilledSGPRs = YamlMFI.HasSpilledSGPRs;
  HasSpilledVGPRs = YamlMFI.HasSpilledVGPRs;
  BytesInStackArgArea = YamlMFI.BytesInStackArgArea;
  ReturnsVoid = YamlMFI.ReturnsVoid;

  if (YamlMFI.ScavengeFI) {
    auto FIOrErr = YamlMFI.ScavengeFI->getFI(MF.getFrameInfo());
    if (!FIOrErr) {
      // Create a diagnostic for the frame index.
      const MemoryBuffer &Buffer =
          *PFS.SM->getMemoryBuffer(PFS.SM->getMainFileID());

      Error = SMDiagnostic(*PFS.SM, SMLoc(), Buffer.getBufferIdentifier(), 1, 1,
                           SourceMgr::DK_Error, toString(FIOrErr.takeError()),
                           "", std::nullopt, std::nullopt);
      SourceRange = YamlMFI.ScavengeFI->SourceRange;
      return true;
    }
    ScavengeFI = *FIOrErr;
  } else {
    ScavengeFI = std::nullopt;
  }
  return false;
}

bool SIMachineFunctionInfo::mayUseAGPRs(const Function &F) const {
  return !F.hasFnAttribute("amdgpu-no-agpr");
}

bool SIMachineFunctionInfo::usesAGPRs(const MachineFunction &MF) const {
  if (UsesAGPRs)
    return *UsesAGPRs;

  if (!mayNeedAGPRs()) {
    UsesAGPRs = false;
    return false;
  }

  if (!AMDGPU::isEntryFunctionCC(MF.getFunction().getCallingConv()) ||
      MF.getFrameInfo().hasCalls()) {
    UsesAGPRs = true;
    return true;
  }

  const MachineRegisterInfo &MRI = MF.getRegInfo();

  for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
    const Register Reg = Register::index2VirtReg(I);
    const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
    if (RC && SIRegisterInfo::isAGPRClass(RC)) {
      UsesAGPRs = true;
      return true;
    }
    if (!RC && !MRI.use_empty(Reg) && MRI.getType(Reg).isValid()) {
      // Defer caching UsesAGPRs; the function might not have been regbank
      // selected yet.
      return true;
    }
  }

  for (MCRegister Reg : AMDGPU::AGPR_32RegClass) {
    if (MRI.isPhysRegUsed(Reg)) {
      UsesAGPRs = true;
      return true;
    }
  }

  UsesAGPRs = false;
  return false;
}