1//===-- llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp - Call lowering -----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file implements the lowering of LLVM calls to machine code calls for
11/// GlobalISel.
12///
13//===----------------------------------------------------------------------===//
14
15#include "AMDGPUCallLowering.h"
16#include "AMDGPU.h"
17#include "AMDGPULegalizerInfo.h"
18#include "SIMachineFunctionInfo.h"
19#include "SIRegisterInfo.h"
20#include "llvm/CodeGen/Analysis.h"
21#include "llvm/CodeGen/FunctionLoweringInfo.h"
22#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
23#include "llvm/CodeGen/MachineFrameInfo.h"
24#include "llvm/CodeGen/PseudoSourceValueManager.h"
25#include "llvm/IR/IntrinsicsAMDGPU.h"
26
27#define DEBUG_TYPE "amdgpu-call-lowering"
28
29using namespace llvm;
30
31namespace {
32
33/// Wrapper around extendRegister to ensure we extend to a full 32-bit register.
34static Register extendRegisterMin32(CallLowering::ValueHandler &Handler,
35 Register ValVReg, const CCValAssign &VA) {
36 if (VA.getLocVT().getSizeInBits() < 32) {
37 // 16-bit types are reported as legal for 32-bit registers. We need to
38 // extend and do a 32-bit copy to avoid the verifier complaining about it.
39 return Handler.MIRBuilder.buildAnyExt(Res: LLT::scalar(SizeInBits: 32), Op: ValVReg).getReg(Idx: 0);
40 }
41
42 return Handler.extendRegister(ValReg: ValVReg, VA);
43}
44
45struct AMDGPUOutgoingValueHandler : public CallLowering::OutgoingValueHandler {
46 AMDGPUOutgoingValueHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI,
47 MachineInstrBuilder MIB)
48 : OutgoingValueHandler(B, MRI), MIB(MIB) {}
49
50 MachineInstrBuilder MIB;
51
52 Register getStackAddress(uint64_t Size, int64_t Offset,
53 MachinePointerInfo &MPO,
54 ISD::ArgFlagsTy Flags) override {
55 llvm_unreachable("not implemented");
56 }
57
58 void assignValueToAddress(Register ValVReg, Register Addr, LLT MemTy,
59 const MachinePointerInfo &MPO,
60 const CCValAssign &VA) override {
61 llvm_unreachable("not implemented");
62 }
63
64 void assignValueToReg(Register ValVReg, Register PhysReg,
65 const CCValAssign &VA,
66 ISD::ArgFlagsTy Flags = {}) override {
67 Register ExtReg = extendRegisterMin32(Handler&: *this, ValVReg, VA);
68
69 // If this is a scalar return, insert a readfirstlane just in case the value
70 // ends up in a VGPR.
71 // FIXME: Assert this is a shader return.
72 const SIRegisterInfo *TRI
73 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
74 if (TRI->isSGPRReg(MRI, Reg: PhysReg)) {
75 LLT Ty = MRI.getType(Reg: ExtReg);
76 LLT S32 = LLT::scalar(SizeInBits: 32);
77 if (Ty != S32) {
78 // FIXME: We should probably support readfirstlane intrinsics with all
79 // legal 32-bit types.
80 assert(Ty.getSizeInBits() == 32);
81 if (Ty.isPointer())
82 ExtReg = MIRBuilder.buildPtrToInt(Dst: S32, Src: ExtReg).getReg(Idx: 0);
83 else
84 ExtReg = MIRBuilder.buildBitcast(Dst: S32, Src: ExtReg).getReg(Idx: 0);
85 }
86
87 auto ToSGPR = MIRBuilder
88 .buildIntrinsic(ID: Intrinsic::amdgcn_readfirstlane,
89 Res: {MRI.getType(Reg: ExtReg)})
90 .addReg(RegNo: ExtReg);
91 ExtReg = ToSGPR.getReg(Idx: 0);
92 }
93
94 MIRBuilder.buildCopy(Res: PhysReg, Op: ExtReg);
95 MIB.addUse(RegNo: PhysReg, Flags: RegState::Implicit);
96 }
97};
98
99struct AMDGPUIncomingArgHandler : public CallLowering::IncomingValueHandler {
100 uint64_t StackUsed = 0;
101
102 AMDGPUIncomingArgHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI)
103 : IncomingValueHandler(B, MRI) {}
104
105 Register getStackAddress(uint64_t Size, int64_t Offset,
106 MachinePointerInfo &MPO,
107 ISD::ArgFlagsTy Flags) override {
108 auto &MFI = MIRBuilder.getMF().getFrameInfo();
109
110 // Byval is assumed to be writable memory, but other stack passed arguments
111 // are not.
112 const bool IsImmutable = !Flags.isByVal();
113 int FI = MFI.CreateFixedObject(Size, SPOffset: Offset, IsImmutable);
114 MPO = MachinePointerInfo::getFixedStack(MF&: MIRBuilder.getMF(), FI);
115 auto AddrReg = MIRBuilder.buildFrameIndex(
116 Res: LLT::pointer(AddressSpace: AMDGPUAS::PRIVATE_ADDRESS, SizeInBits: 32), Idx: FI);
117 StackUsed = std::max(a: StackUsed, b: Size + Offset);
118 return AddrReg.getReg(Idx: 0);
119 }
120
121 void copyToReg(Register ValVReg, Register PhysReg, const CCValAssign &VA) {
122 if (VA.getLocVT().getSizeInBits() < 32) {
123 // 16-bit types are reported as legal for 32-bit registers. We need to
124 // do a 32-bit copy, and truncate to avoid the verifier complaining
125 // about it.
126 auto Copy = MIRBuilder.buildCopy(Res: LLT::scalar(SizeInBits: 32), Op: PhysReg);
127
128 // If we have signext/zeroext, it applies to the whole 32-bit register
129 // before truncation.
130 auto Extended =
131 buildExtensionHint(VA, SrcReg: Copy.getReg(Idx: 0), NarrowTy: LLT(VA.getLocVT()));
132 MIRBuilder.buildTrunc(Res: ValVReg, Op: Extended);
133 return;
134 }
135
136 IncomingValueHandler::assignValueToReg(ValVReg, PhysReg, VA);
137 }
138
139 void readLaneToSGPR(Register ValVReg, Register PhysReg,
140 const CCValAssign &VA) {
141 // Handle inreg parameters passed through VGPRs due to SGPR exhaustion.
142 // When SGPRs are exhausted, the calling convention may allocate inreg
143 // parameters to VGPRs. We insert readfirstlane to move the value from
144 // VGPR to SGPR, as required by the inreg ABI.
145 //
146 // FIXME: This may increase instruction count in some cases. If the
147 // readfirstlane result is subsequently copied back to a VGPR, we cannot
148 // optimize away the unnecessary VGPR->SGPR->VGPR sequence in later passes
149 // because the inreg attribute information is not preserved in MIR. We could
150 // use WWM_COPY (or similar instructions) and mark it as foldable to enable
151 // later optimization passes to eliminate the redundant readfirstlane.
152 auto Copy = MIRBuilder.buildCopy(Res: LLT::scalar(SizeInBits: 32), Op: PhysReg);
153 if (VA.getLocVT().getSizeInBits() < 32) {
154 auto ToSGPR = MIRBuilder
155 .buildIntrinsic(ID: Intrinsic::amdgcn_readfirstlane,
156 Res: {MRI.getType(Reg: Copy.getReg(Idx: 0))})
157 .addReg(RegNo: Copy.getReg(Idx: 0));
158 auto Extended =
159 buildExtensionHint(VA, SrcReg: ToSGPR.getReg(Idx: 0), NarrowTy: LLT(VA.getLocVT()));
160 MIRBuilder.buildTrunc(Res: ValVReg, Op: Extended);
161 return;
162 }
163
164 MIRBuilder.buildIntrinsic(ID: Intrinsic::amdgcn_readfirstlane, Res: ValVReg)
165 .addReg(RegNo: Copy.getReg(Idx: 0));
166 }
167
168 void assignValueToReg(Register ValVReg, Register PhysReg,
169 const CCValAssign &VA,
170 ISD::ArgFlagsTy Flags = {}) override {
171 markPhysRegUsed(PhysReg);
172
173 const SIRegisterInfo *TRI =
174 static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
175
176 // Inreg flag should be the same across SplitArg[i]
177 if (Flags.isInReg() && TRI->isVGPR(MRI, Reg: PhysReg))
178 readLaneToSGPR(ValVReg, PhysReg, VA);
179 else
180 copyToReg(ValVReg, PhysReg, VA);
181 }
182
183 void assignValueToAddress(Register ValVReg, Register Addr, LLT MemTy,
184 const MachinePointerInfo &MPO,
185 const CCValAssign &VA) override {
186 MachineFunction &MF = MIRBuilder.getMF();
187
188 auto *MMO = MF.getMachineMemOperand(
189 PtrInfo: MPO, f: MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant, MemTy,
190 base_alignment: inferAlignFromPtrInfo(MF, MPO));
191 MIRBuilder.buildLoad(Res: ValVReg, Addr, MMO&: *MMO);
192 }
193
194 /// How the physical register gets marked varies between formal
195 /// parameters (it's a basic-block live-in), and a call instruction
196 /// (it's an implicit-def of the BL).
197 virtual void markPhysRegUsed(unsigned PhysReg) = 0;
198};
199
200struct FormalArgHandler : public AMDGPUIncomingArgHandler {
201 FormalArgHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI)
202 : AMDGPUIncomingArgHandler(B, MRI) {}
203
204 void markPhysRegUsed(unsigned PhysReg) override {
205 MIRBuilder.getMBB().addLiveIn(PhysReg);
206 }
207};
208
209struct CallReturnHandler : public AMDGPUIncomingArgHandler {
210 CallReturnHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
211 MachineInstrBuilder MIB)
212 : AMDGPUIncomingArgHandler(MIRBuilder, MRI), MIB(MIB) {}
213
214 void markPhysRegUsed(unsigned PhysReg) override {
215 MIB.addDef(RegNo: PhysReg, Flags: RegState::Implicit);
216 }
217
218 MachineInstrBuilder MIB;
219};
220
221struct AMDGPUOutgoingArgHandler : public AMDGPUOutgoingValueHandler {
222 /// For tail calls, the byte offset of the call's argument area from the
223 /// callee's. Unused elsewhere.
224 int FPDiff;
225
226 // Cache the SP register vreg if we need it more than once in this call site.
227 Register SPReg;
228
229 bool IsTailCall;
230
231 AMDGPUOutgoingArgHandler(MachineIRBuilder &MIRBuilder,
232 MachineRegisterInfo &MRI, MachineInstrBuilder MIB,
233 bool IsTailCall = false, int FPDiff = 0)
234 : AMDGPUOutgoingValueHandler(MIRBuilder, MRI, MIB), FPDiff(FPDiff),
235 IsTailCall(IsTailCall) {}
236
237 Register getStackAddress(uint64_t Size, int64_t Offset,
238 MachinePointerInfo &MPO,
239 ISD::ArgFlagsTy Flags) override {
240 MachineFunction &MF = MIRBuilder.getMF();
241 const LLT PtrTy = LLT::pointer(AddressSpace: AMDGPUAS::PRIVATE_ADDRESS, SizeInBits: 32);
242 const LLT S32 = LLT::scalar(SizeInBits: 32);
243
244 if (IsTailCall) {
245 Offset += FPDiff;
246 int FI = MF.getFrameInfo().CreateFixedObject(Size, SPOffset: Offset, IsImmutable: true);
247 auto FIReg = MIRBuilder.buildFrameIndex(Res: PtrTy, Idx: FI);
248 MPO = MachinePointerInfo::getFixedStack(MF, FI);
249 return FIReg.getReg(Idx: 0);
250 }
251
252 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
253
254 if (!SPReg) {
255 const GCNSubtarget &ST = MIRBuilder.getMF().getSubtarget<GCNSubtarget>();
256 if (ST.hasFlatScratchEnabled()) {
257 // The stack is accessed unswizzled, so we can use a regular copy.
258 SPReg = MIRBuilder.buildCopy(Res: PtrTy,
259 Op: MFI->getStackPtrOffsetReg()).getReg(Idx: 0);
260 } else {
261 // The address we produce here, without knowing the use context, is going
262 // to be interpreted as a vector address, so we need to convert to a
263 // swizzled address.
264 SPReg = MIRBuilder.buildInstr(Opc: AMDGPU::G_AMDGPU_WAVE_ADDRESS, DstOps: {PtrTy},
265 SrcOps: {MFI->getStackPtrOffsetReg()}).getReg(Idx: 0);
266 }
267 }
268
269 auto OffsetReg = MIRBuilder.buildConstant(Res: S32, Val: Offset);
270
271 auto AddrReg = MIRBuilder.buildPtrAdd(Res: PtrTy, Op0: SPReg, Op1: OffsetReg);
272 MPO = MachinePointerInfo::getStack(MF, Offset);
273 return AddrReg.getReg(Idx: 0);
274 }
275
276 void assignValueToAddress(Register ValVReg, Register Addr, LLT MemTy,
277 const MachinePointerInfo &MPO,
278 const CCValAssign &VA) override {
279 MachineFunction &MF = MIRBuilder.getMF();
280 uint64_t LocMemOffset = VA.getLocMemOffset();
281 const auto &ST = MF.getSubtarget<GCNSubtarget>();
282
283 auto *MMO = MF.getMachineMemOperand(
284 PtrInfo: MPO, f: MachineMemOperand::MOStore, MemTy,
285 base_alignment: commonAlignment(A: ST.getStackAlignment(), Offset: LocMemOffset));
286 MIRBuilder.buildStore(Val: ValVReg, Addr, MMO&: *MMO);
287 }
288
289 void assignValueToAddress(const CallLowering::ArgInfo &Arg,
290 unsigned ValRegIndex, Register Addr, LLT MemTy,
291 const MachinePointerInfo &MPO,
292 const CCValAssign &VA) override {
293 Register ValVReg = VA.getLocInfo() != CCValAssign::LocInfo::FPExt
294 ? extendRegister(ValReg: Arg.Regs[ValRegIndex], VA)
295 : Arg.Regs[ValRegIndex];
296 assignValueToAddress(ValVReg, Addr, MemTy, MPO, VA);
297 }
298};
299} // anonymous namespace
300
301AMDGPUCallLowering::AMDGPUCallLowering(const TargetLowering &TLI)
302 : CallLowering(&TLI) {}
303
304// FIXME: Compatibility shim
305static ISD::NodeType extOpcodeToISDExtOpcode(unsigned MIOpc) {
306 switch (MIOpc) {
307 case TargetOpcode::G_SEXT:
308 return ISD::SIGN_EXTEND;
309 case TargetOpcode::G_ZEXT:
310 return ISD::ZERO_EXTEND;
311 case TargetOpcode::G_ANYEXT:
312 return ISD::ANY_EXTEND;
313 default:
314 llvm_unreachable("not an extend opcode");
315 }
316}
317
318bool AMDGPUCallLowering::canLowerReturn(MachineFunction &MF,
319 CallingConv::ID CallConv,
320 SmallVectorImpl<BaseArgInfo> &Outs,
321 bool IsVarArg) const {
322 // For shaders. Vector types should be explicitly handled by CC.
323 if (AMDGPU::isEntryFunctionCC(CC: CallConv))
324 return true;
325
326 SmallVector<CCValAssign, 16> ArgLocs;
327 const SITargetLowering &TLI = *getTLI<SITargetLowering>();
328 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs,
329 MF.getFunction().getContext());
330
331 return checkReturn(CCInfo, Outs, Fn: TLI.CCAssignFnForReturn(CC: CallConv, IsVarArg));
332}
333
334/// Lower the return value for the already existing \p Ret. This assumes that
335/// \p B's insertion point is correct.
336bool AMDGPUCallLowering::lowerReturnVal(MachineIRBuilder &B,
337 const Value *Val, ArrayRef<Register> VRegs,
338 MachineInstrBuilder &Ret) const {
339 if (!Val)
340 return true;
341
342 auto &MF = B.getMF();
343 const auto &F = MF.getFunction();
344 const DataLayout &DL = MF.getDataLayout();
345 MachineRegisterInfo *MRI = B.getMRI();
346 LLVMContext &Ctx = F.getContext();
347
348 CallingConv::ID CC = F.getCallingConv();
349 const SITargetLowering &TLI = *getTLI<SITargetLowering>();
350
351 SmallVector<EVT, 8> SplitEVTs;
352 ComputeValueVTs(TLI, DL, Ty: Val->getType(), ValueVTs&: SplitEVTs);
353 assert(VRegs.size() == SplitEVTs.size() &&
354 "For each split Type there should be exactly one VReg.");
355
356 SmallVector<ArgInfo, 8> SplitRetInfos;
357
358 for (unsigned i = 0; i < SplitEVTs.size(); ++i) {
359 EVT VT = SplitEVTs[i];
360 Register Reg = VRegs[i];
361 ArgInfo RetInfo(Reg, VT.getTypeForEVT(Context&: Ctx), 0);
362 setArgFlags(Arg&: RetInfo, OpIdx: AttributeList::ReturnIndex, DL, FuncInfo: F);
363
364 if (VT.isScalarInteger()) {
365 unsigned ExtendOp = TargetOpcode::G_ANYEXT;
366 if (RetInfo.Flags[0].isSExt()) {
367 assert(RetInfo.Regs.size() == 1 && "expect only simple return values");
368 ExtendOp = TargetOpcode::G_SEXT;
369 } else if (RetInfo.Flags[0].isZExt()) {
370 assert(RetInfo.Regs.size() == 1 && "expect only simple return values");
371 ExtendOp = TargetOpcode::G_ZEXT;
372 }
373
374 EVT ExtVT = TLI.getTypeForExtReturn(Context&: Ctx, VT,
375 ExtendKind: extOpcodeToISDExtOpcode(MIOpc: ExtendOp));
376 if (ExtVT != VT) {
377 RetInfo.Ty = ExtVT.getTypeForEVT(Context&: Ctx);
378 LLT ExtTy = getLLTForType(Ty&: *RetInfo.Ty, DL);
379 Reg = B.buildInstr(Opc: ExtendOp, DstOps: {ExtTy}, SrcOps: {Reg}).getReg(Idx: 0);
380 }
381 }
382
383 if (Reg != RetInfo.Regs[0]) {
384 RetInfo.Regs[0] = Reg;
385 // Reset the arg flags after modifying Reg.
386 setArgFlags(Arg&: RetInfo, OpIdx: AttributeList::ReturnIndex, DL, FuncInfo: F);
387 }
388
389 splitToValueTypes(OrigArgInfo: RetInfo, SplitArgs&: SplitRetInfos, DL, CallConv: CC);
390 }
391
392 CCAssignFn *AssignFn = TLI.CCAssignFnForReturn(CC, IsVarArg: F.isVarArg());
393
394 OutgoingValueAssigner Assigner(AssignFn);
395 AMDGPUOutgoingValueHandler RetHandler(B, *MRI, Ret);
396 return determineAndHandleAssignments(Handler&: RetHandler, Assigner, Args&: SplitRetInfos, MIRBuilder&: B,
397 CallConv: CC, IsVarArg: F.isVarArg());
398}
399
400bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &B, const Value *Val,
401 ArrayRef<Register> VRegs,
402 FunctionLoweringInfo &FLI) const {
403
404 MachineFunction &MF = B.getMF();
405 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
406 MFI->setIfReturnsVoid(!Val);
407
408 assert(!Val == VRegs.empty() && "Return value without a vreg");
409
410 CallingConv::ID CC = B.getMF().getFunction().getCallingConv();
411 const bool IsShader = AMDGPU::isShader(CC);
412 const bool IsWaveEnd =
413 (IsShader && MFI->returnsVoid()) || AMDGPU::isKernel(CC);
414 if (IsWaveEnd) {
415 B.buildInstr(Opcode: AMDGPU::S_ENDPGM)
416 .addImm(Val: 0);
417 return true;
418 }
419
420 const bool IsWholeWave = MFI->isWholeWaveFunction();
421 unsigned ReturnOpc = IsWholeWave ? AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_RETURN
422 : IsShader ? AMDGPU::SI_RETURN_TO_EPILOG
423 : AMDGPU::SI_RETURN;
424 auto Ret = B.buildInstrNoInsert(Opcode: ReturnOpc);
425
426 if (!FLI.CanLowerReturn)
427 insertSRetStores(MIRBuilder&: B, RetTy: Val->getType(), VRegs, DemoteReg: FLI.DemoteRegister);
428 else if (!lowerReturnVal(B, Val, VRegs, Ret))
429 return false;
430
431 if (IsWholeWave)
432 addOriginalExecToReturn(MF&: B.getMF(), Ret);
433
434 // TODO: Handle CalleeSavedRegsViaCopy.
435
436 B.insertInstr(MIB: Ret);
437 return true;
438}
439
440void AMDGPUCallLowering::lowerParameterPtr(Register DstReg, MachineIRBuilder &B,
441 uint64_t Offset) const {
442 MachineFunction &MF = B.getMF();
443 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
444 MachineRegisterInfo &MRI = MF.getRegInfo();
445 Register KernArgSegmentPtr =
446 MFI->getPreloadedReg(Value: AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
447 Register KernArgSegmentVReg = MRI.getLiveInVirtReg(PReg: KernArgSegmentPtr);
448
449 auto OffsetReg = B.buildConstant(Res: LLT::scalar(SizeInBits: 64), Val: Offset);
450
451 B.buildPtrAdd(Res: DstReg, Op0: KernArgSegmentVReg, Op1: OffsetReg);
452}
453
454void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &B, ArgInfo &OrigArg,
455 uint64_t Offset,
456 Align Alignment) const {
457 MachineFunction &MF = B.getMF();
458 const Function &F = MF.getFunction();
459 const DataLayout &DL = F.getDataLayout();
460 const SITargetLowering &TLI = *getTLI<SITargetLowering>();
461 MachinePointerInfo PtrInfo = TLI.getKernargSegmentPtrInfo(MF);
462
463 LLT PtrTy = LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64);
464
465 SmallVector<ArgInfo, 32> SplitArgs;
466 SmallVector<TypeSize> FieldOffsets;
467 splitToValueTypes(OrigArgInfo: OrigArg, SplitArgs, DL, CallConv: F.getCallingConv(), Offsets: &FieldOffsets);
468
469 unsigned Idx = 0;
470 for (ArgInfo &SplitArg : SplitArgs) {
471 Register PtrReg = B.getMRI()->createGenericVirtualRegister(Ty: PtrTy);
472 lowerParameterPtr(DstReg: PtrReg, B, Offset: Offset + FieldOffsets[Idx]);
473
474 LLT ArgTy = getLLTForType(Ty&: *SplitArg.Ty, DL);
475 if (SplitArg.Flags[0].isPointer()) {
476 // Compensate for losing pointeriness in splitValueTypes.
477 LLT PtrTy = LLT::pointer(AddressSpace: SplitArg.Flags[0].getPointerAddrSpace(),
478 SizeInBits: ArgTy.getScalarSizeInBits());
479 ArgTy = ArgTy.isVector() ? LLT::vector(EC: ArgTy.getElementCount(), ScalarTy: PtrTy)
480 : PtrTy;
481 }
482
483 MachineMemOperand *MMO = MF.getMachineMemOperand(
484 PtrInfo,
485 f: MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
486 MachineMemOperand::MOInvariant,
487 MemTy: ArgTy, base_alignment: commonAlignment(A: Alignment, Offset: FieldOffsets[Idx]));
488
489 assert(SplitArg.Regs.size() == 1);
490
491 B.buildLoad(Res: SplitArg.Regs[0], Addr: PtrReg, MMO&: *MMO);
492 ++Idx;
493 }
494}
495
496// Allocate special inputs passed in user SGPRs.
497static void allocateHSAUserSGPRs(CCState &CCInfo,
498 MachineIRBuilder &B,
499 MachineFunction &MF,
500 const SIRegisterInfo &TRI,
501 SIMachineFunctionInfo &Info) {
502 // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
503 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
504 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
505 Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
506 MF.addLiveIn(PReg: PrivateSegmentBufferReg, RC: &AMDGPU::SGPR_128RegClass);
507 CCInfo.AllocateReg(Reg: PrivateSegmentBufferReg);
508 }
509
510 if (UserSGPRInfo.hasDispatchPtr()) {
511 Register DispatchPtrReg = Info.addDispatchPtr(TRI);
512 MF.addLiveIn(PReg: DispatchPtrReg, RC: &AMDGPU::SGPR_64RegClass);
513 CCInfo.AllocateReg(Reg: DispatchPtrReg);
514 }
515
516 if (UserSGPRInfo.hasQueuePtr()) {
517 Register QueuePtrReg = Info.addQueuePtr(TRI);
518 MF.addLiveIn(PReg: QueuePtrReg, RC: &AMDGPU::SGPR_64RegClass);
519 CCInfo.AllocateReg(Reg: QueuePtrReg);
520 }
521
522 if (UserSGPRInfo.hasKernargSegmentPtr()) {
523 MachineRegisterInfo &MRI = MF.getRegInfo();
524 Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
525 const LLT P4 = LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64);
526 Register VReg = MRI.createGenericVirtualRegister(Ty: P4);
527 MRI.addLiveIn(Reg: InputPtrReg, vreg: VReg);
528 B.getMBB().addLiveIn(PhysReg: InputPtrReg);
529 B.buildCopy(Res: VReg, Op: InputPtrReg);
530 CCInfo.AllocateReg(Reg: InputPtrReg);
531 }
532
533 if (UserSGPRInfo.hasDispatchID()) {
534 Register DispatchIDReg = Info.addDispatchID(TRI);
535 MF.addLiveIn(PReg: DispatchIDReg, RC: &AMDGPU::SGPR_64RegClass);
536 CCInfo.AllocateReg(Reg: DispatchIDReg);
537 }
538
539 if (UserSGPRInfo.hasFlatScratchInit()) {
540 Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
541 MF.addLiveIn(PReg: FlatScratchInitReg, RC: &AMDGPU::SGPR_64RegClass);
542 CCInfo.AllocateReg(Reg: FlatScratchInitReg);
543 }
544
545 if (UserSGPRInfo.hasPrivateSegmentSize()) {
546 Register PrivateSegmentSizeReg = Info.addPrivateSegmentSize(TRI);
547 MF.addLiveIn(PReg: PrivateSegmentSizeReg, RC: &AMDGPU::SGPR_32RegClass);
548 CCInfo.AllocateReg(Reg: PrivateSegmentSizeReg);
549 }
550
551 // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
552 // these from the dispatch pointer.
553}
554
555bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
556 MachineIRBuilder &B, const Function &F,
557 ArrayRef<ArrayRef<Register>> VRegs) const {
558 MachineFunction &MF = B.getMF();
559 const GCNSubtarget *Subtarget = &MF.getSubtarget<GCNSubtarget>();
560 MachineRegisterInfo &MRI = MF.getRegInfo();
561 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
562 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
563 const SITargetLowering &TLI = *getTLI<SITargetLowering>();
564 const DataLayout &DL = F.getDataLayout();
565
566 SmallVector<CCValAssign, 16> ArgLocs;
567 CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext());
568
569 allocateHSAUserSGPRs(CCInfo, B, MF, TRI: *TRI, Info&: *Info);
570
571 unsigned i = 0;
572 const Align KernArgBaseAlign(16);
573 const unsigned BaseOffset = Subtarget->getExplicitKernelArgOffset();
574 uint64_t ExplicitArgOffset = 0;
575
576 // TODO: Align down to dword alignment and extract bits for extending loads.
577 for (auto &Arg : F.args()) {
578 // TODO: Add support for kernarg preload.
579 if (Arg.hasAttribute(Kind: "amdgpu-hidden-argument")) {
580 LLVM_DEBUG(dbgs() << "Preloading hidden arguments is not supported\n");
581 return false;
582 }
583
584 const bool IsByRef = Arg.hasByRefAttr();
585 Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
586 unsigned AllocSize = DL.getTypeAllocSize(Ty: ArgTy);
587 if (AllocSize == 0)
588 continue;
589
590 MaybeAlign ParamAlign = IsByRef ? Arg.getParamAlign() : std::nullopt;
591 Align ABIAlign = DL.getValueOrABITypeAlignment(Alignment: ParamAlign, Ty: ArgTy);
592
593 uint64_t ArgOffset = alignTo(Size: ExplicitArgOffset, A: ABIAlign) + BaseOffset;
594 ExplicitArgOffset = alignTo(Size: ExplicitArgOffset, A: ABIAlign) + AllocSize;
595
596 if (Arg.use_empty()) {
597 ++i;
598 continue;
599 }
600
601 Align Alignment = commonAlignment(A: KernArgBaseAlign, Offset: ArgOffset);
602
603 if (IsByRef) {
604 unsigned ByRefAS = cast<PointerType>(Val: Arg.getType())->getAddressSpace();
605
606 assert(VRegs[i].size() == 1 &&
607 "expected only one register for byval pointers");
608 if (ByRefAS == AMDGPUAS::CONSTANT_ADDRESS) {
609 lowerParameterPtr(DstReg: VRegs[i][0], B, Offset: ArgOffset);
610 } else {
611 const LLT ConstPtrTy = LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64);
612 Register PtrReg = MRI.createGenericVirtualRegister(Ty: ConstPtrTy);
613 lowerParameterPtr(DstReg: PtrReg, B, Offset: ArgOffset);
614
615 B.buildAddrSpaceCast(Dst: VRegs[i][0], Src: PtrReg);
616 }
617 } else {
618 ArgInfo OrigArg(VRegs[i], Arg, i);
619 const unsigned OrigArgIdx = i + AttributeList::FirstArgIndex;
620 setArgFlags(Arg&: OrigArg, OpIdx: OrigArgIdx, DL, FuncInfo: F);
621 lowerParameter(B, OrigArg, Offset: ArgOffset, Alignment);
622 }
623
624 ++i;
625 }
626
627 if (Info->getNumKernargPreloadedSGPRs())
628 Info->setNumWaveDispatchSGPRs(Info->getNumUserSGPRs());
629
630 TLI.allocateSpecialEntryInputVGPRs(CCInfo, MF, TRI: *TRI, Info&: *Info);
631 TLI.allocateSystemSGPRs(CCInfo, MF, Info&: *Info, CallConv: F.getCallingConv(), IsShader: false);
632 return true;
633}
634
635bool AMDGPUCallLowering::lowerFormalArguments(
636 MachineIRBuilder &B, const Function &F, ArrayRef<ArrayRef<Register>> VRegs,
637 FunctionLoweringInfo &FLI) const {
638 CallingConv::ID CC = F.getCallingConv();
639
640 // The infrastructure for normal calling convention lowering is essentially
641 // useless for kernels. We want to avoid any kind of legalization or argument
642 // splitting.
643 if (CC == CallingConv::AMDGPU_KERNEL)
644 return lowerFormalArgumentsKernel(B, F, VRegs);
645
646 const bool IsGraphics = AMDGPU::isGraphics(CC);
647 const bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CC);
648
649 MachineFunction &MF = B.getMF();
650 MachineBasicBlock &MBB = B.getMBB();
651 MachineRegisterInfo &MRI = MF.getRegInfo();
652 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
653 const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
654 const SIRegisterInfo *TRI = Subtarget.getRegisterInfo();
655 const DataLayout &DL = F.getDataLayout();
656
657 SmallVector<CCValAssign, 16> ArgLocs;
658 CCState CCInfo(CC, F.isVarArg(), MF, ArgLocs, F.getContext());
659 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info->getUserSGPRInfo();
660
661 if (UserSGPRInfo.hasImplicitBufferPtr()) {
662 Register ImplicitBufferPtrReg = Info->addImplicitBufferPtr(TRI: *TRI);
663 MF.addLiveIn(PReg: ImplicitBufferPtrReg, RC: &AMDGPU::SGPR_64RegClass);
664 CCInfo.AllocateReg(Reg: ImplicitBufferPtrReg);
665 }
666
667 // FIXME: This probably isn't defined for mesa
668 if (UserSGPRInfo.hasFlatScratchInit() && !Subtarget.isAmdPalOS()) {
669 Register FlatScratchInitReg = Info->addFlatScratchInit(TRI: *TRI);
670 MF.addLiveIn(PReg: FlatScratchInitReg, RC: &AMDGPU::SGPR_64RegClass);
671 CCInfo.AllocateReg(Reg: FlatScratchInitReg);
672 }
673
674 SmallVector<ArgInfo, 32> SplitArgs;
675 unsigned Idx = 0;
676 unsigned PSInputNum = 0;
677
678 // Insert the hidden sret parameter if the return value won't fit in the
679 // return registers.
680 if (!FLI.CanLowerReturn)
681 insertSRetIncomingArgument(F, SplitArgs, DemoteReg&: FLI.DemoteRegister, MRI, DL);
682
683 for (auto &Arg : F.args()) {
684 if (DL.getTypeStoreSize(Ty: Arg.getType()) == 0)
685 continue;
686
687 if (Info->isWholeWaveFunction() && Idx == 0) {
688 assert(VRegs[Idx].size() == 1 && "Expected only one register");
689
690 // The first argument for whole wave functions is the original EXEC value.
691 B.buildInstr(Opcode: AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_SETUP)
692 .addDef(RegNo: VRegs[Idx][0]);
693
694 ++Idx;
695 continue;
696 }
697
698 const bool InReg = Arg.hasAttribute(Kind: Attribute::InReg);
699
700 if (Arg.hasAttribute(Kind: Attribute::SwiftSelf) ||
701 Arg.hasAttribute(Kind: Attribute::SwiftError) ||
702 Arg.hasAttribute(Kind: Attribute::Nest))
703 return false;
704
705 if (CC == CallingConv::AMDGPU_PS && !InReg && PSInputNum <= 15) {
706 const bool ArgUsed = !Arg.use_empty();
707 bool SkipArg = !ArgUsed && !Info->isPSInputAllocated(Index: PSInputNum);
708
709 if (!SkipArg) {
710 Info->markPSInputAllocated(Index: PSInputNum);
711 if (ArgUsed)
712 Info->markPSInputEnabled(Index: PSInputNum);
713 }
714
715 ++PSInputNum;
716
717 if (SkipArg) {
718 for (Register R : VRegs[Idx])
719 B.buildUndef(Res: R);
720
721 ++Idx;
722 continue;
723 }
724 }
725
726 ArgInfo OrigArg(VRegs[Idx], Arg, Idx);
727 const unsigned OrigArgIdx = Idx + AttributeList::FirstArgIndex;
728 setArgFlags(Arg&: OrigArg, OpIdx: OrigArgIdx, DL, FuncInfo: F);
729
730 splitToValueTypes(OrigArgInfo: OrigArg, SplitArgs, DL, CallConv: CC);
731 ++Idx;
732 }
733
734 // At least one interpolation mode must be enabled or else the GPU will
735 // hang.
736 //
737 // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
738 // set PSInputAddr, the user wants to enable some bits after the compilation
739 // based on run-time states. Since we can't know what the final PSInputEna
740 // will look like, so we shouldn't do anything here and the user should take
741 // responsibility for the correct programming.
742 //
743 // Otherwise, the following restrictions apply:
744 // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
745 // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
746 // enabled too.
747 if (CC == CallingConv::AMDGPU_PS) {
748 if ((Info->getPSInputAddr() & 0x7F) == 0 ||
749 ((Info->getPSInputAddr() & 0xF) == 0 &&
750 Info->isPSInputAllocated(Index: 11))) {
751 CCInfo.AllocateReg(Reg: AMDGPU::VGPR0);
752 CCInfo.AllocateReg(Reg: AMDGPU::VGPR1);
753 Info->markPSInputAllocated(Index: 0);
754 Info->markPSInputEnabled(Index: 0);
755 }
756
757 if (Subtarget.isAmdPalOS()) {
758 // For isAmdPalOS, the user does not enable some bits after compilation
759 // based on run-time states; the register values being generated here are
760 // the final ones set in hardware. Therefore we need to apply the
761 // workaround to PSInputAddr and PSInputEnable together. (The case where
762 // a bit is set in PSInputAddr but not PSInputEnable is where the frontend
763 // set up an input arg for a particular interpolation mode, but nothing
764 // uses that input arg. Really we should have an earlier pass that removes
765 // such an arg.)
766 unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
767 if ((PsInputBits & 0x7F) == 0 ||
768 ((PsInputBits & 0xF) == 0 &&
769 (PsInputBits >> 11 & 1)))
770 Info->markPSInputEnabled(Index: llvm::countr_zero(Val: Info->getPSInputAddr()));
771 }
772 }
773
774 const SITargetLowering &TLI = *getTLI<SITargetLowering>();
775 CCAssignFn *AssignFn = TLI.CCAssignFnForCall(CC, IsVarArg: F.isVarArg());
776
777 if (!MBB.empty())
778 B.setInstr(*MBB.begin());
779
780 if (!IsEntryFunc && !IsGraphics) {
781 // For the fixed ABI, pass workitem IDs in the last argument register.
782 TLI.allocateSpecialInputVGPRsFixed(CCInfo, MF, TRI: *TRI, Info&: *Info);
783
784 if (!Subtarget.hasFlatScratchEnabled())
785 CCInfo.AllocateReg(Reg: Info->getScratchRSrcReg());
786 TLI.allocateSpecialInputSGPRs(CCInfo, MF, TRI: *TRI, Info&: *Info);
787 }
788
789 IncomingValueAssigner Assigner(AssignFn);
790 if (!determineAssignments(Assigner, Args&: SplitArgs, CCInfo))
791 return false;
792
793 if (IsEntryFunc) {
794 // This assumes the registers are allocated by CCInfo in ascending order
795 // with no gaps.
796 Info->setNumWaveDispatchSGPRs(
797 CCInfo.getFirstUnallocated(Regs: AMDGPU::SGPR_32RegClass.getRegisters()));
798 Info->setNumWaveDispatchVGPRs(
799 CCInfo.getFirstUnallocated(Regs: AMDGPU::VGPR_32RegClass.getRegisters()));
800 }
801
802 FormalArgHandler Handler(B, MRI);
803 if (!handleAssignments(Handler, Args&: SplitArgs, CCState&: CCInfo, ArgLocs, MIRBuilder&: B))
804 return false;
805
806 uint64_t StackSize = Assigner.StackSize;
807
808 // Start adding system SGPRs.
809 if (IsEntryFunc)
810 TLI.allocateSystemSGPRs(CCInfo, MF, Info&: *Info, CallConv: CC, IsShader: IsGraphics);
811
812 // When we tail call, we need to check if the callee's arguments will fit on
813 // the caller's stack. So, whenever we lower formal arguments, we should keep
814 // track of this information, since we might lower a tail call in this
815 // function later.
816 Info->setBytesInStackArgArea(StackSize);
817
818 // Move back to the end of the basic block.
819 B.setMBB(MBB);
820
821 return true;
822}
823
824bool AMDGPUCallLowering::passSpecialInputs(MachineIRBuilder &MIRBuilder,
825 CCState &CCInfo,
826 SmallVectorImpl<std::pair<MCRegister, Register>> &ArgRegs,
827 CallLoweringInfo &Info) const {
828 MachineFunction &MF = MIRBuilder.getMF();
829
830 // If there's no call site, this doesn't correspond to a call from the IR and
831 // doesn't need implicit inputs.
832 if (!Info.CB)
833 return true;
834
835 const AMDGPUFunctionArgInfo &CalleeArgInfo =
836 AMDGPUFunctionArgInfo::FixedABIFunctionInfo;
837
838 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
839 const AMDGPUFunctionArgInfo &CallerArgInfo = MFI->getArgInfo();
840
841
842 // TODO: Unify with private memory register handling. This is complicated by
843 // the fact that at least in kernels, the input argument is not necessarily
844 // in the same location as the input.
845 AMDGPUFunctionArgInfo::PreloadedValue InputRegs[] = {
846 AMDGPUFunctionArgInfo::DISPATCH_PTR,
847 AMDGPUFunctionArgInfo::QUEUE_PTR,
848 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR,
849 AMDGPUFunctionArgInfo::DISPATCH_ID,
850 AMDGPUFunctionArgInfo::WORKGROUP_ID_X,
851 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,
852 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,
853 AMDGPUFunctionArgInfo::LDS_KERNEL_ID,
854 };
855
856 static constexpr StringLiteral ImplicitAttrNames[][2] = {
857 {"amdgpu-no-dispatch-ptr", ""},
858 {"amdgpu-no-queue-ptr", ""},
859 {"amdgpu-no-implicitarg-ptr", ""},
860 {"amdgpu-no-dispatch-id", ""},
861 {"amdgpu-no-workgroup-id-x", "amdgpu-no-cluster-id-x"},
862 {"amdgpu-no-workgroup-id-y", "amdgpu-no-cluster-id-y"},
863 {"amdgpu-no-workgroup-id-z", "amdgpu-no-cluster-id-z"},
864 {"amdgpu-no-lds-kernel-id", ""},
865 };
866
867 MachineRegisterInfo &MRI = MF.getRegInfo();
868
869 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
870 const AMDGPULegalizerInfo *LI
871 = static_cast<const AMDGPULegalizerInfo*>(ST.getLegalizerInfo());
872
873 unsigned I = 0;
874 for (auto InputID : InputRegs) {
875 const ArgDescriptor *OutgoingArg;
876 const TargetRegisterClass *ArgRC;
877 LLT ArgTy;
878
879 // If the callee does not use the attribute value, skip copying the value.
880 if (all_of(Range: ImplicitAttrNames[I++], P: [&](StringRef AttrName) {
881 return AttrName.empty() || Info.CB->hasFnAttr(Kind: AttrName);
882 }))
883 continue;
884
885 std::tie(args&: OutgoingArg, args&: ArgRC, args&: ArgTy) =
886 CalleeArgInfo.getPreloadedValue(Value: InputID);
887 if (!OutgoingArg)
888 continue;
889
890 const ArgDescriptor *IncomingArg;
891 const TargetRegisterClass *IncomingArgRC;
892 std::tie(args&: IncomingArg, args&: IncomingArgRC, args&: ArgTy) =
893 CallerArgInfo.getPreloadedValue(Value: InputID);
894 assert(IncomingArgRC == ArgRC);
895
896 Register InputReg = MRI.createGenericVirtualRegister(Ty: ArgTy);
897
898 if (IncomingArg) {
899 LI->buildLoadInputValue(DstReg: InputReg, B&: MIRBuilder, Arg: IncomingArg, ArgRC, ArgTy);
900 } else if (InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) {
901 LI->getImplicitArgPtr(DstReg: InputReg, MRI, B&: MIRBuilder);
902 } else if (InputID == AMDGPUFunctionArgInfo::LDS_KERNEL_ID) {
903 std::optional<uint32_t> Id =
904 AMDGPUMachineFunction::getLDSKernelIdMetadata(F: MF.getFunction());
905 if (Id) {
906 MIRBuilder.buildConstant(Res: InputReg, Val: *Id);
907 } else {
908 MIRBuilder.buildUndef(Res: InputReg);
909 }
910 } else {
911 // We may have proven the input wasn't needed, although the ABI is
912 // requiring it. We just need to allocate the register appropriately.
913 MIRBuilder.buildUndef(Res: InputReg);
914 }
915
916 if (OutgoingArg->isRegister()) {
917 ArgRegs.emplace_back(Args: OutgoingArg->getRegister(), Args&: InputReg);
918 if (!CCInfo.AllocateReg(Reg: OutgoingArg->getRegister()))
919 report_fatal_error(reason: "failed to allocate implicit input argument");
920 } else {
921 LLVM_DEBUG(dbgs() << "Unhandled stack passed implicit input argument\n");
922 return false;
923 }
924 }
925
926 // Pack workitem IDs into a single register or pass it as is if already
927 // packed.
928 const ArgDescriptor *OutgoingArg;
929 const TargetRegisterClass *ArgRC;
930 LLT ArgTy;
931
932 std::tie(args&: OutgoingArg, args&: ArgRC, args&: ArgTy) =
933 CalleeArgInfo.getPreloadedValue(Value: AMDGPUFunctionArgInfo::WORKITEM_ID_X);
934 if (!OutgoingArg)
935 std::tie(args&: OutgoingArg, args&: ArgRC, args&: ArgTy) =
936 CalleeArgInfo.getPreloadedValue(Value: AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
937 if (!OutgoingArg)
938 std::tie(args&: OutgoingArg, args&: ArgRC, args&: ArgTy) =
939 CalleeArgInfo.getPreloadedValue(Value: AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
940 if (!OutgoingArg)
941 return false;
942
943 auto WorkitemIDX =
944 CallerArgInfo.getPreloadedValue(Value: AMDGPUFunctionArgInfo::WORKITEM_ID_X);
945 auto WorkitemIDY =
946 CallerArgInfo.getPreloadedValue(Value: AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
947 auto WorkitemIDZ =
948 CallerArgInfo.getPreloadedValue(Value: AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
949
950 const ArgDescriptor *IncomingArgX = std::get<0>(t&: WorkitemIDX);
951 const ArgDescriptor *IncomingArgY = std::get<0>(t&: WorkitemIDY);
952 const ArgDescriptor *IncomingArgZ = std::get<0>(t&: WorkitemIDZ);
953 const LLT S32 = LLT::scalar(SizeInBits: 32);
954
955 const bool NeedWorkItemIDX = !Info.CB->hasFnAttr(Kind: "amdgpu-no-workitem-id-x");
956 const bool NeedWorkItemIDY = !Info.CB->hasFnAttr(Kind: "amdgpu-no-workitem-id-y");
957 const bool NeedWorkItemIDZ = !Info.CB->hasFnAttr(Kind: "amdgpu-no-workitem-id-z");
958
959 // If incoming ids are not packed we need to pack them.
960 // FIXME: Should consider known workgroup size to eliminate known 0 cases.
961 Register InputReg;
962 if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo.WorkItemIDX &&
963 NeedWorkItemIDX) {
964 if (ST.getMaxWorkitemID(Kernel: MF.getFunction(), Dimension: 0) != 0) {
965 InputReg = MRI.createGenericVirtualRegister(Ty: S32);
966 LI->buildLoadInputValue(DstReg: InputReg, B&: MIRBuilder, Arg: IncomingArgX,
967 ArgRC: std::get<1>(t&: WorkitemIDX),
968 ArgTy: std::get<2>(t&: WorkitemIDX));
969 } else {
970 InputReg = MIRBuilder.buildConstant(Res: S32, Val: 0).getReg(Idx: 0);
971 }
972 }
973
974 if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo.WorkItemIDY &&
975 NeedWorkItemIDY && ST.getMaxWorkitemID(Kernel: MF.getFunction(), Dimension: 1) != 0) {
976 Register Y = MRI.createGenericVirtualRegister(Ty: S32);
977 LI->buildLoadInputValue(DstReg: Y, B&: MIRBuilder, Arg: IncomingArgY,
978 ArgRC: std::get<1>(t&: WorkitemIDY), ArgTy: std::get<2>(t&: WorkitemIDY));
979
980 Y = MIRBuilder.buildShl(Dst: S32, Src0: Y, Src1: MIRBuilder.buildConstant(Res: S32, Val: 10)).getReg(Idx: 0);
981 InputReg = InputReg ? MIRBuilder.buildOr(Dst: S32, Src0: InputReg, Src1: Y).getReg(Idx: 0) : Y;
982 }
983
984 if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo.WorkItemIDZ &&
985 NeedWorkItemIDZ && ST.getMaxWorkitemID(Kernel: MF.getFunction(), Dimension: 2) != 0) {
986 Register Z = MRI.createGenericVirtualRegister(Ty: S32);
987 LI->buildLoadInputValue(DstReg: Z, B&: MIRBuilder, Arg: IncomingArgZ,
988 ArgRC: std::get<1>(t&: WorkitemIDZ), ArgTy: std::get<2>(t&: WorkitemIDZ));
989
990 Z = MIRBuilder.buildShl(Dst: S32, Src0: Z, Src1: MIRBuilder.buildConstant(Res: S32, Val: 20)).getReg(Idx: 0);
991 InputReg = InputReg ? MIRBuilder.buildOr(Dst: S32, Src0: InputReg, Src1: Z).getReg(Idx: 0) : Z;
992 }
993
994 if (!InputReg &&
995 (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
996 InputReg = MRI.createGenericVirtualRegister(Ty: S32);
997 if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
998 // We're in a situation where the outgoing function requires the workitem
999 // ID, but the calling function does not have it (e.g a graphics function
1000 // calling a C calling convention function). This is illegal, but we need
1001 // to produce something.
1002 MIRBuilder.buildUndef(Res: InputReg);
1003 } else {
1004 // Workitem ids are already packed, any of present incoming arguments will
1005 // carry all required fields.
1006 ArgDescriptor IncomingArg = ArgDescriptor::createArg(
1007 Arg: IncomingArgX ? *IncomingArgX :
1008 IncomingArgY ? *IncomingArgY : *IncomingArgZ, Mask: ~0u);
1009 LI->buildLoadInputValue(DstReg: InputReg, B&: MIRBuilder, Arg: &IncomingArg,
1010 ArgRC: &AMDGPU::VGPR_32RegClass, ArgTy: S32);
1011 }
1012 }
1013
1014 if (OutgoingArg->isRegister()) {
1015 if (InputReg)
1016 ArgRegs.emplace_back(Args: OutgoingArg->getRegister(), Args&: InputReg);
1017
1018 if (!CCInfo.AllocateReg(Reg: OutgoingArg->getRegister()))
1019 report_fatal_error(reason: "failed to allocate implicit input argument");
1020 } else {
1021 LLVM_DEBUG(dbgs() << "Unhandled stack passed implicit input argument\n");
1022 return false;
1023 }
1024
1025 return true;
1026}
1027
1028/// Returns a pair containing the fixed CCAssignFn and the vararg CCAssignFn for
1029/// CC.
1030static std::pair<CCAssignFn *, CCAssignFn *>
1031getAssignFnsForCC(CallingConv::ID CC, const SITargetLowering &TLI) {
1032 return {TLI.CCAssignFnForCall(CC, IsVarArg: false), TLI.CCAssignFnForCall(CC, IsVarArg: true)};
1033}
1034
1035static unsigned getCallOpcode(const MachineFunction &CallerF, bool IsIndirect,
1036 bool IsTailCall, bool IsWave32,
1037 CallingConv::ID CC,
1038 bool IsDynamicVGPRChainCall = false) {
1039 // For calls to amdgpu_cs_chain functions, the address is known to be uniform.
1040 assert((AMDGPU::isChainCC(CC) || !IsIndirect || !IsTailCall) &&
1041 "Indirect calls can't be tail calls, "
1042 "because the address can be divergent");
1043 if (!IsTailCall)
1044 return AMDGPU::G_SI_CALL;
1045
1046 if (AMDGPU::isChainCC(CC)) {
1047 if (IsDynamicVGPRChainCall)
1048 return IsWave32 ? AMDGPU::SI_CS_CHAIN_TC_W32_DVGPR
1049 : AMDGPU::SI_CS_CHAIN_TC_W64_DVGPR;
1050 return IsWave32 ? AMDGPU::SI_CS_CHAIN_TC_W32 : AMDGPU::SI_CS_CHAIN_TC_W64;
1051 }
1052
1053 if (CallerF.getFunction().getCallingConv() ==
1054 CallingConv::AMDGPU_Gfx_WholeWave)
1055 return AMDGPU::SI_TCRETURN_GFX_WholeWave;
1056
1057 if (CC == CallingConv::AMDGPU_Gfx || CC == CallingConv::AMDGPU_Gfx_WholeWave)
1058 return AMDGPU::SI_TCRETURN_GFX;
1059
1060 return AMDGPU::SI_TCRETURN;
1061}
1062
1063// Add operands to call instruction to track the callee.
1064static bool addCallTargetOperands(MachineInstrBuilder &CallInst,
1065 MachineIRBuilder &MIRBuilder,
1066 AMDGPUCallLowering::CallLoweringInfo &Info,
1067 bool IsDynamicVGPRChainCall = false) {
1068 if (Info.Callee.isReg()) {
1069 CallInst.addReg(RegNo: Info.Callee.getReg());
1070 CallInst.addImm(Val: 0);
1071 } else if (Info.Callee.isGlobal() && Info.Callee.getOffset() == 0) {
1072 // The call lowering lightly assumed we can directly encode a call target in
1073 // the instruction, which is not the case. Materialize the address here.
1074 const GlobalValue *GV = Info.Callee.getGlobal();
1075 auto Ptr = MIRBuilder.buildGlobalValue(
1076 Res: LLT::pointer(AddressSpace: GV->getAddressSpace(), SizeInBits: 64), GV);
1077 CallInst.addReg(RegNo: Ptr.getReg(Idx: 0));
1078
1079 if (IsDynamicVGPRChainCall) {
1080 // DynamicVGPR chain calls are always indirect.
1081 CallInst.addImm(Val: 0);
1082 } else
1083 CallInst.add(MO: Info.Callee);
1084 } else
1085 return false;
1086
1087 return true;
1088}
1089
1090bool AMDGPUCallLowering::doCallerAndCalleePassArgsTheSameWay(
1091 CallLoweringInfo &Info, MachineFunction &MF,
1092 SmallVectorImpl<ArgInfo> &InArgs) const {
1093 const Function &CallerF = MF.getFunction();
1094 CallingConv::ID CalleeCC = Info.CallConv;
1095 CallingConv::ID CallerCC = CallerF.getCallingConv();
1096
1097 // If the calling conventions match, then everything must be the same.
1098 if (CalleeCC == CallerCC)
1099 return true;
1100
1101 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1102
1103 // Make sure that the caller and callee preserve all of the same registers.
1104 const auto *TRI = ST.getRegisterInfo();
1105
1106 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
1107 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
1108 if (!TRI->regmaskSubsetEqual(mask0: CallerPreserved, mask1: CalleePreserved))
1109 return false;
1110
1111 // Check if the caller and callee will handle arguments in the same way.
1112 const SITargetLowering &TLI = *getTLI<SITargetLowering>();
1113 CCAssignFn *CalleeAssignFnFixed;
1114 CCAssignFn *CalleeAssignFnVarArg;
1115 std::tie(args&: CalleeAssignFnFixed, args&: CalleeAssignFnVarArg) =
1116 getAssignFnsForCC(CC: CalleeCC, TLI);
1117
1118 CCAssignFn *CallerAssignFnFixed;
1119 CCAssignFn *CallerAssignFnVarArg;
1120 std::tie(args&: CallerAssignFnFixed, args&: CallerAssignFnVarArg) =
1121 getAssignFnsForCC(CC: CallerCC, TLI);
1122
1123 // FIXME: We are not accounting for potential differences in implicitly passed
1124 // inputs, but only the fixed ABI is supported now anyway.
1125 IncomingValueAssigner CalleeAssigner(CalleeAssignFnFixed,
1126 CalleeAssignFnVarArg);
1127 IncomingValueAssigner CallerAssigner(CallerAssignFnFixed,
1128 CallerAssignFnVarArg);
1129 return resultsCompatible(Info, MF, InArgs, CalleeAssigner, CallerAssigner);
1130}
1131
1132bool AMDGPUCallLowering::areCalleeOutgoingArgsTailCallable(
1133 CallLoweringInfo &Info, MachineFunction &MF,
1134 SmallVectorImpl<ArgInfo> &OutArgs) const {
1135 // If there are no outgoing arguments, then we are done.
1136 if (OutArgs.empty())
1137 return true;
1138
1139 const Function &CallerF = MF.getFunction();
1140 CallingConv::ID CalleeCC = Info.CallConv;
1141 CallingConv::ID CallerCC = CallerF.getCallingConv();
1142 const SITargetLowering &TLI = *getTLI<SITargetLowering>();
1143
1144 CCAssignFn *AssignFnFixed;
1145 CCAssignFn *AssignFnVarArg;
1146 std::tie(args&: AssignFnFixed, args&: AssignFnVarArg) = getAssignFnsForCC(CC: CalleeCC, TLI);
1147
1148 // We have outgoing arguments. Make sure that we can tail call with them.
1149 SmallVector<CCValAssign, 16> OutLocs;
1150 CCState OutInfo(CalleeCC, false, MF, OutLocs, CallerF.getContext());
1151 OutgoingValueAssigner Assigner(AssignFnFixed, AssignFnVarArg);
1152
1153 if (!determineAssignments(Assigner, Args&: OutArgs, CCInfo&: OutInfo)) {
1154 LLVM_DEBUG(dbgs() << "... Could not analyze call operands.\n");
1155 return false;
1156 }
1157
1158 // Make sure that they can fit on the caller's stack.
1159 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1160 if (OutInfo.getStackSize() > FuncInfo->getBytesInStackArgArea()) {
1161 LLVM_DEBUG(dbgs() << "... Cannot fit call operands on caller's stack.\n");
1162 return false;
1163 }
1164
1165 // Verify that the parameters in callee-saved registers match.
1166 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1167 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1168 const uint32_t *CallerPreservedMask = TRI->getCallPreservedMask(MF, CallerCC);
1169 MachineRegisterInfo &MRI = MF.getRegInfo();
1170 return parametersInCSRMatch(MRI, CallerPreservedMask, ArgLocs: OutLocs, OutVals: OutArgs);
1171}
1172
1173bool AMDGPUCallLowering::isEligibleForTailCallOptimization(
1174 MachineIRBuilder &B, CallLoweringInfo &Info,
1175 SmallVectorImpl<ArgInfo> &InArgs, SmallVectorImpl<ArgInfo> &OutArgs) const {
1176 // Must pass all target-independent checks in order to tail call optimize.
1177 if (!Info.IsTailCall)
1178 return false;
1179
1180 // Indirect calls can't be tail calls, because the address can be divergent.
1181 // TODO Check divergence info if the call really is divergent.
1182 if (Info.Callee.isReg())
1183 return false;
1184
1185 MachineFunction &MF = B.getMF();
1186 const Function &CallerF = MF.getFunction();
1187 CallingConv::ID CalleeCC = Info.CallConv;
1188 CallingConv::ID CallerCC = CallerF.getCallingConv();
1189
1190 const SIRegisterInfo *TRI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
1191 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
1192 // Kernels aren't callable, and don't have a live in return address so it
1193 // doesn't make sense to do a tail call with entry functions.
1194 if (!CallerPreserved)
1195 return false;
1196
1197 if (!AMDGPU::mayTailCallThisCC(CC: CalleeCC)) {
1198 LLVM_DEBUG(dbgs() << "... Calling convention cannot be tail called.\n");
1199 return false;
1200 }
1201
1202 if (any_of(Range: CallerF.args(), P: [](const Argument &A) {
1203 return A.hasByValAttr() || A.hasSwiftErrorAttr();
1204 })) {
1205 LLVM_DEBUG(dbgs() << "... Cannot tail call from callers with byval "
1206 "or swifterror arguments\n");
1207 return false;
1208 }
1209
1210 // If we have -tailcallopt, then we're done.
1211 if (MF.getTarget().Options.GuaranteedTailCallOpt) {
1212 return AMDGPU::canGuaranteeTCO(CC: CalleeCC) &&
1213 CalleeCC == CallerF.getCallingConv();
1214 }
1215
1216 // Verify that the incoming and outgoing arguments from the callee are
1217 // safe to tail call.
1218 if (!doCallerAndCalleePassArgsTheSameWay(Info, MF, InArgs)) {
1219 LLVM_DEBUG(
1220 dbgs()
1221 << "... Caller and callee have incompatible calling conventions.\n");
1222 return false;
1223 }
1224
1225 // FIXME: We need to check if any arguments passed in SGPR are uniform. If
1226 // they are not, this cannot be a tail call. If they are uniform, but may be
1227 // VGPR, we need to insert readfirstlanes.
1228 if (!areCalleeOutgoingArgsTailCallable(Info, MF, OutArgs))
1229 return false;
1230
1231 LLVM_DEBUG(dbgs() << "... Call is eligible for tail call optimization.\n");
1232 return true;
1233}
1234
1235// Insert outgoing implicit arguments for a call, by inserting copies to the
1236// implicit argument registers and adding the necessary implicit uses to the
1237// call instruction.
1238void AMDGPUCallLowering::handleImplicitCallArguments(
1239 MachineIRBuilder &MIRBuilder, MachineInstrBuilder &CallInst,
1240 const GCNSubtarget &ST, const SIMachineFunctionInfo &FuncInfo,
1241 CallingConv::ID CalleeCC,
1242 ArrayRef<std::pair<MCRegister, Register>> ImplicitArgRegs) const {
1243 if (!ST.hasFlatScratchEnabled()) {
1244 // Insert copies for the SRD. In the HSA case, this should be an identity
1245 // copy.
1246 auto ScratchRSrcReg = MIRBuilder.buildCopy(Res: LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 32),
1247 Op: FuncInfo.getScratchRSrcReg());
1248
1249 auto CalleeRSrcReg = AMDGPU::isChainCC(CC: CalleeCC)
1250 ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
1251 : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3;
1252
1253 MIRBuilder.buildCopy(Res: CalleeRSrcReg, Op: ScratchRSrcReg);
1254 CallInst.addReg(RegNo: CalleeRSrcReg, Flags: RegState::Implicit);
1255 }
1256
1257 for (std::pair<MCRegister, Register> ArgReg : ImplicitArgRegs) {
1258 MIRBuilder.buildCopy(Res: (Register)ArgReg.first, Op: ArgReg.second);
1259 CallInst.addReg(RegNo: ArgReg.first, Flags: RegState::Implicit);
1260 }
1261}
1262
namespace {
// Chain calls have special arguments that we need to handle. These have the
// same index as they do in the llvm.amdgcn.cs.chain intrinsic.
//
// The enumerators are used to index CallLoweringInfo::OrigArgs when lowering a
// chain call. Indices 0, 2 and 3 (callee, SGPR arguments, VGPR arguments) are
// accessed directly by lowerChainCall and have no enumerator here.
enum ChainCallArgIdx {
  // EXEC mask passed to the callee; must be an integer of wavefront size.
  Exec = 1,
  // Constant flags. Zero means no further arguments follow; when only bit 0
  // is set the call is a dynamic VGPR chain call with three more arguments.
  Flags = 4,
  // First of the additional dynamic VGPR arguments; everything from this index
  // onwards is appended to the call instruction.
  NumVGPRs = 5,
  // Fallback EXEC mask; must be an integer of wavefront size.
  FallbackExec = 6,
  // Last additional argument. NOTE(review): presumably the callee used when
  // the fallback is taken; not referenced by name in this part of the file.
  FallbackCallee = 7,
};
} // anonymous namespace
1274
1275bool AMDGPUCallLowering::lowerTailCall(
1276 MachineIRBuilder &MIRBuilder, CallLoweringInfo &Info,
1277 SmallVectorImpl<ArgInfo> &OutArgs) const {
1278 MachineFunction &MF = MIRBuilder.getMF();
1279 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1280 SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1281 const Function &F = MF.getFunction();
1282 MachineRegisterInfo &MRI = MF.getRegInfo();
1283 const SIInstrInfo *TII = ST.getInstrInfo();
1284 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1285 const SITargetLowering &TLI = *getTLI<SITargetLowering>();
1286
1287 // True when we're tail calling, but without -tailcallopt.
1288 bool IsSibCall = !MF.getTarget().Options.GuaranteedTailCallOpt;
1289
1290 // Find out which ABI gets to decide where things go.
1291 CallingConv::ID CalleeCC = Info.CallConv;
1292 CCAssignFn *AssignFnFixed;
1293 CCAssignFn *AssignFnVarArg;
1294 std::tie(args&: AssignFnFixed, args&: AssignFnVarArg) = getAssignFnsForCC(CC: CalleeCC, TLI);
1295
1296 MachineInstrBuilder CallSeqStart;
1297 if (!IsSibCall)
1298 CallSeqStart = MIRBuilder.buildInstr(Opcode: AMDGPU::ADJCALLSTACKUP);
1299
1300 bool IsChainCall = AMDGPU::isChainCC(CC: Info.CallConv);
1301 bool IsDynamicVGPRChainCall = false;
1302
1303 if (IsChainCall) {
1304 ArgInfo FlagsArg = Info.OrigArgs[ChainCallArgIdx::Flags];
1305 const APInt &FlagsValue = cast<ConstantInt>(Val: FlagsArg.OrigValue)->getValue();
1306 if (FlagsValue.isZero()) {
1307 if (Info.OrigArgs.size() != 5) {
1308 LLVM_DEBUG(dbgs() << "No additional args allowed if flags == 0\n");
1309 return false;
1310 }
1311 } else if (FlagsValue.isOneBitSet(BitNo: 0)) {
1312 IsDynamicVGPRChainCall = true;
1313
1314 if (Info.OrigArgs.size() != 8) {
1315 LLVM_DEBUG(dbgs() << "Expected 3 additional args\n");
1316 return false;
1317 }
1318
1319 // On GFX12, we can only change the VGPR allocation for wave32.
1320 if (!ST.isWave32()) {
1321 F.getContext().diagnose(DI: DiagnosticInfoUnsupported(
1322 F, "dynamic VGPR mode is only supported for wave32"));
1323 return false;
1324 }
1325
1326 ArgInfo FallbackExecArg = Info.OrigArgs[ChainCallArgIdx::FallbackExec];
1327 assert(FallbackExecArg.Regs.size() == 1 &&
1328 "Expected single register for fallback EXEC");
1329 if (!FallbackExecArg.Ty->isIntegerTy(Bitwidth: ST.getWavefrontSize())) {
1330 LLVM_DEBUG(dbgs() << "Bad type for fallback EXEC\n");
1331 return false;
1332 }
1333 }
1334 }
1335
1336 unsigned Opc = getCallOpcode(CallerF: MF, IsIndirect: Info.Callee.isReg(), /*IsTailCall*/ true,
1337 IsWave32: ST.isWave32(), CC: CalleeCC, IsDynamicVGPRChainCall);
1338 auto MIB = MIRBuilder.buildInstrNoInsert(Opcode: Opc);
1339
1340 if (FuncInfo->isWholeWaveFunction())
1341 addOriginalExecToReturn(MF, Ret&: MIB);
1342
1343 // Keep track of the index of the next operand to be added to the call
1344 unsigned CalleeIdx = MIB->getNumOperands();
1345
1346 if (!addCallTargetOperands(CallInst&: MIB, MIRBuilder, Info, IsDynamicVGPRChainCall))
1347 return false;
1348
1349 // Byte offset for the tail call. When we are sibcalling, this will always
1350 // be 0.
1351 MIB.addImm(Val: 0);
1352
1353 // If this is a chain call, we need to pass in the EXEC mask as well as any
1354 // other special args.
1355 if (IsChainCall) {
1356 auto AddRegOrImm = [&](const ArgInfo &Arg) {
1357 if (auto CI = dyn_cast<ConstantInt>(Val: Arg.OrigValue)) {
1358 MIB.addImm(Val: CI->getSExtValue());
1359 } else {
1360 MIB.addReg(RegNo: Arg.Regs[0]);
1361 unsigned Idx = MIB->getNumOperands() - 1;
1362 MIB->getOperand(i: Idx).setReg(constrainOperandRegClass(
1363 MF, TRI: *TRI, MRI, TII: *TII, RBI: *ST.getRegBankInfo(), InsertPt&: *MIB, II: MIB->getDesc(),
1364 RegMO&: MIB->getOperand(i: Idx), OpIdx: Idx));
1365 }
1366 };
1367
1368 ArgInfo ExecArg = Info.OrigArgs[ChainCallArgIdx::Exec];
1369 assert(ExecArg.Regs.size() == 1 && "Too many regs for EXEC");
1370
1371 if (!ExecArg.Ty->isIntegerTy(Bitwidth: ST.getWavefrontSize())) {
1372 LLVM_DEBUG(dbgs() << "Bad type for EXEC");
1373 return false;
1374 }
1375
1376 AddRegOrImm(ExecArg);
1377 if (IsDynamicVGPRChainCall)
1378 std::for_each(first: Info.OrigArgs.begin() + ChainCallArgIdx::NumVGPRs,
1379 last: Info.OrigArgs.end(), f: AddRegOrImm);
1380 }
1381
1382 // Tell the call which registers are clobbered.
1383 const uint32_t *Mask = TRI->getCallPreservedMask(MF, CalleeCC);
1384 MIB.addRegMask(Mask);
1385
1386 // FPDiff is the byte offset of the call's argument area from the callee's.
1387 // Stores to callee stack arguments will be placed in FixedStackSlots offset
1388 // by this amount for a tail call. In a sibling call it must be 0 because the
1389 // caller will deallocate the entire stack and the callee still expects its
1390 // arguments to begin at SP+0.
1391 int FPDiff = 0;
1392
1393 // This will be 0 for sibcalls, potentially nonzero for tail calls produced
1394 // by -tailcallopt. For sibcalls, the memory operands for the call are
1395 // already available in the caller's incoming argument space.
1396 unsigned NumBytes = 0;
1397 if (!IsSibCall) {
1398 // We aren't sibcalling, so we need to compute FPDiff. We need to do this
1399 // before handling assignments, because FPDiff must be known for memory
1400 // arguments.
1401 unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
1402 SmallVector<CCValAssign, 16> OutLocs;
1403 CCState OutInfo(CalleeCC, false, MF, OutLocs, F.getContext());
1404
1405 // FIXME: Not accounting for callee implicit inputs
1406 OutgoingValueAssigner CalleeAssigner(AssignFnFixed, AssignFnVarArg);
1407 if (!determineAssignments(Assigner&: CalleeAssigner, Args&: OutArgs, CCInfo&: OutInfo))
1408 return false;
1409
1410 // The callee will pop the argument stack as a tail call. Thus, we must
1411 // keep it 16-byte aligned.
1412 NumBytes = alignTo(Size: OutInfo.getStackSize(), A: ST.getStackAlignment());
1413
1414 // FPDiff will be negative if this tail call requires more space than we
1415 // would automatically have in our incoming argument space. Positive if we
1416 // actually shrink the stack.
1417 FPDiff = NumReusableBytes - NumBytes;
1418
1419 // The stack pointer must be 16-byte aligned at all times it's used for a
1420 // memory operation, which in practice means at *all* times and in
1421 // particular across call boundaries. Therefore our own arguments started at
1422 // a 16-byte aligned SP and the delta applied for the tail call should
1423 // satisfy the same constraint.
1424 assert(isAligned(ST.getStackAlignment(), FPDiff) &&
1425 "unaligned stack on tail call");
1426 }
1427
1428 SmallVector<CCValAssign, 16> ArgLocs;
1429 CCState CCInfo(Info.CallConv, Info.IsVarArg, MF, ArgLocs, F.getContext());
1430
1431 // We could pass MIB and directly add the implicit uses to the call
1432 // now. However, as an aesthetic choice, place implicit argument operands
1433 // after the ordinary user argument registers.
1434 SmallVector<std::pair<MCRegister, Register>, 12> ImplicitArgRegs;
1435
1436 if (Info.CallConv != CallingConv::AMDGPU_Gfx &&
1437 Info.CallConv != CallingConv::AMDGPU_Gfx_WholeWave &&
1438 !AMDGPU::isChainCC(CC: Info.CallConv)) {
1439 // With a fixed ABI, allocate fixed registers before user arguments.
1440 if (!passSpecialInputs(MIRBuilder, CCInfo, ArgRegs&: ImplicitArgRegs, Info))
1441 return false;
1442 }
1443
1444 // Mark the scratch resource descriptor as allocated so the CC analysis
1445 // does not assign user arguments to these registers, matching the callee.
1446 if (!ST.hasFlatScratchEnabled())
1447 CCInfo.AllocateReg(Reg: FuncInfo->getScratchRSrcReg());
1448
1449 OutgoingValueAssigner Assigner(AssignFnFixed, AssignFnVarArg);
1450
1451 if (!determineAssignments(Assigner, Args&: OutArgs, CCInfo))
1452 return false;
1453
1454 // Do the actual argument marshalling.
1455 AMDGPUOutgoingArgHandler Handler(MIRBuilder, MRI, MIB, true, FPDiff);
1456 if (!handleAssignments(Handler, Args&: OutArgs, CCState&: CCInfo, ArgLocs, MIRBuilder))
1457 return false;
1458
1459 if (Info.ConvergenceCtrlToken) {
1460 MIB.addUse(RegNo: Info.ConvergenceCtrlToken, Flags: RegState::Implicit);
1461 }
1462 handleImplicitCallArguments(MIRBuilder, CallInst&: MIB, ST, FuncInfo: *FuncInfo, CalleeCC,
1463 ImplicitArgRegs);
1464
1465 // If we have -tailcallopt, we need to adjust the stack. We'll do the call
1466 // sequence start and end here.
1467 if (!IsSibCall) {
1468 MIB->getOperand(i: CalleeIdx + 1).setImm(FPDiff);
1469 CallSeqStart.addImm(Val: NumBytes).addImm(Val: 0);
1470 // End the call sequence *before* emitting the call. Normally, we would
1471 // tidy the frame up after the call. However, here, we've laid out the
1472 // parameters so that when SP is reset, they will be in the correct
1473 // location.
1474 MIRBuilder.buildInstr(Opcode: AMDGPU::ADJCALLSTACKDOWN).addImm(Val: NumBytes).addImm(Val: 0);
1475 }
1476
1477 // Now we can add the actual call instruction to the correct basic block.
1478 MIRBuilder.insertInstr(MIB);
1479
1480 // If this is a whole wave tail call, we need to constrain the register for
1481 // the original EXEC.
1482 if (MIB->getOpcode() == AMDGPU::SI_TCRETURN_GFX_WholeWave) {
1483 MIB->getOperand(i: 0).setReg(
1484 constrainOperandRegClass(MF, TRI: *TRI, MRI, TII: *TII, RBI: *ST.getRegBankInfo(),
1485 InsertPt&: *MIB, II: MIB->getDesc(), RegMO&: MIB->getOperand(i: 0), OpIdx: 0));
1486 }
1487
1488 // If Callee is a reg, since it is used by a target specific
1489 // instruction, it must have a register class matching the
1490 // constraint of that instruction.
1491
1492 // FIXME: We should define regbankselectable call instructions to handle
1493 // divergent call targets.
1494 if (MIB->getOperand(i: CalleeIdx).isReg()) {
1495 MIB->getOperand(i: CalleeIdx).setReg(constrainOperandRegClass(
1496 MF, TRI: *TRI, MRI, TII: *TII, RBI: *ST.getRegBankInfo(), InsertPt&: *MIB, II: MIB->getDesc(),
1497 RegMO&: MIB->getOperand(i: CalleeIdx), OpIdx: CalleeIdx));
1498 }
1499
1500 MF.getFrameInfo().setHasTailCall();
1501 Info.LoweredTailCall = true;
1502 return true;
1503}
1504
1505/// Lower a call to the @llvm.amdgcn.cs.chain intrinsic.
1506bool AMDGPUCallLowering::lowerChainCall(MachineIRBuilder &MIRBuilder,
1507 CallLoweringInfo &Info) const {
1508 ArgInfo Callee = Info.OrigArgs[0];
1509 ArgInfo SGPRArgs = Info.OrigArgs[2];
1510 ArgInfo VGPRArgs = Info.OrigArgs[3];
1511
1512 MachineFunction &MF = MIRBuilder.getMF();
1513 const Function &F = MF.getFunction();
1514 const DataLayout &DL = F.getDataLayout();
1515
1516 // The function to jump to is actually the first argument, so we'll change the
1517 // Callee and other info to match that before using our existing helper.
1518 const Value *CalleeV = Callee.OrigValue->stripPointerCasts();
1519 if (const Function *F = dyn_cast<Function>(Val: CalleeV)) {
1520 Info.Callee = MachineOperand::CreateGA(GV: F, Offset: 0);
1521 Info.CallConv = F->getCallingConv();
1522 } else {
1523 assert(Callee.Regs.size() == 1 && "Too many regs for the callee");
1524 Info.Callee = MachineOperand::CreateReg(Reg: Callee.Regs[0], isDef: false);
1525 Info.CallConv = CallingConv::AMDGPU_CS_Chain; // amdgpu_cs_chain_preserve
1526 // behaves the same here.
1527 }
1528
1529 // The function that we're calling cannot be vararg (only the intrinsic is).
1530 Info.IsVarArg = false;
1531
1532 assert(
1533 all_of(SGPRArgs.Flags, [](ISD::ArgFlagsTy F) { return F.isInReg(); }) &&
1534 "SGPR arguments should be marked inreg");
1535 assert(
1536 none_of(VGPRArgs.Flags, [](ISD::ArgFlagsTy F) { return F.isInReg(); }) &&
1537 "VGPR arguments should not be marked inreg");
1538
1539 SmallVector<ArgInfo, 8> OutArgs;
1540 splitToValueTypes(OrigArgInfo: SGPRArgs, SplitArgs&: OutArgs, DL, CallConv: Info.CallConv);
1541 splitToValueTypes(OrigArgInfo: VGPRArgs, SplitArgs&: OutArgs, DL, CallConv: Info.CallConv);
1542
1543 Info.IsMustTailCall = true;
1544 return lowerTailCall(MIRBuilder, Info, OutArgs);
1545}
1546
1547bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
1548 CallLoweringInfo &Info) const {
1549 if (Function *F = Info.CB->getCalledFunction())
1550 if (F->isIntrinsic()) {
1551 switch (F->getIntrinsicID()) {
1552 case Intrinsic::amdgcn_cs_chain:
1553 return lowerChainCall(MIRBuilder, Info);
1554 case Intrinsic::amdgcn_call_whole_wave:
1555 Info.CallConv = CallingConv::AMDGPU_Gfx_WholeWave;
1556
1557 // Get the callee from the original instruction, so it doesn't look like
1558 // this is an indirect call.
1559 Info.Callee = MachineOperand::CreateGA(
1560 GV: cast<GlobalValue>(Val: Info.CB->getOperand(i_nocapture: 0)), /*Offset=*/0);
1561 Info.OrigArgs.erase(CI: Info.OrigArgs.begin());
1562 Info.IsVarArg = false;
1563 break;
1564 default:
1565 llvm_unreachable("Unexpected intrinsic call");
1566 }
1567 }
1568
1569 if (Info.IsVarArg) {
1570 LLVM_DEBUG(dbgs() << "Variadic functions not implemented\n");
1571 return false;
1572 }
1573
1574 MachineFunction &MF = MIRBuilder.getMF();
1575 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1576 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1577
1578 const Function &F = MF.getFunction();
1579 MachineRegisterInfo &MRI = MF.getRegInfo();
1580 const SITargetLowering &TLI = *getTLI<SITargetLowering>();
1581 const DataLayout &DL = F.getDataLayout();
1582
1583 SmallVector<ArgInfo, 8> OutArgs;
1584 for (auto &OrigArg : Info.OrigArgs)
1585 splitToValueTypes(OrigArgInfo: OrigArg, SplitArgs&: OutArgs, DL, CallConv: Info.CallConv);
1586
1587 SmallVector<ArgInfo, 8> InArgs;
1588 if (Info.CanLowerReturn && !Info.OrigRet.Ty->isVoidTy())
1589 splitToValueTypes(OrigArgInfo: Info.OrigRet, SplitArgs&: InArgs, DL, CallConv: Info.CallConv);
1590
1591 // If we can lower as a tail call, do that instead.
1592 bool CanTailCallOpt =
1593 isEligibleForTailCallOptimization(B&: MIRBuilder, Info, InArgs, OutArgs);
1594
1595 // We must emit a tail call if we have musttail.
1596 if (Info.IsMustTailCall && !CanTailCallOpt) {
1597 LLVM_DEBUG(dbgs() << "Failed to lower musttail call as tail call\n");
1598 return false;
1599 }
1600
1601 Info.IsTailCall = CanTailCallOpt;
1602 if (CanTailCallOpt)
1603 return lowerTailCall(MIRBuilder, Info, OutArgs);
1604
1605 // Find out which ABI gets to decide where things go.
1606 CCAssignFn *AssignFnFixed;
1607 CCAssignFn *AssignFnVarArg;
1608 std::tie(args&: AssignFnFixed, args&: AssignFnVarArg) =
1609 getAssignFnsForCC(CC: Info.CallConv, TLI);
1610
1611 MIRBuilder.buildInstr(Opcode: AMDGPU::ADJCALLSTACKUP)
1612 .addImm(Val: 0)
1613 .addImm(Val: 0);
1614
1615 // Create a temporarily-floating call instruction so we can add the implicit
1616 // uses of arg registers.
1617 unsigned Opc = getCallOpcode(CallerF: MF, IsIndirect: Info.Callee.isReg(), IsTailCall: false, IsWave32: ST.isWave32(),
1618 CC: Info.CallConv);
1619
1620 auto MIB = MIRBuilder.buildInstrNoInsert(Opcode: Opc);
1621 MIB.addDef(RegNo: TRI->getReturnAddressReg(MF));
1622
1623 if (!Info.IsConvergent)
1624 MIB.setMIFlag(MachineInstr::NoConvergent);
1625
1626 if (!addCallTargetOperands(CallInst&: MIB, MIRBuilder, Info))
1627 return false;
1628
1629 // Tell the call which registers are clobbered.
1630 const uint32_t *Mask = TRI->getCallPreservedMask(MF, Info.CallConv);
1631 MIB.addRegMask(Mask);
1632
1633 SmallVector<CCValAssign, 16> ArgLocs;
1634 CCState CCInfo(Info.CallConv, Info.IsVarArg, MF, ArgLocs, F.getContext());
1635
1636 // We could pass MIB and directly add the implicit uses to the call
1637 // now. However, as an aesthetic choice, place implicit argument operands
1638 // after the ordinary user argument registers.
1639 SmallVector<std::pair<MCRegister, Register>, 12> ImplicitArgRegs;
1640
1641 if (Info.CallConv != CallingConv::AMDGPU_Gfx &&
1642 Info.CallConv != CallingConv::AMDGPU_Gfx_WholeWave) {
1643 // With a fixed ABI, allocate fixed registers before user arguments.
1644 if (!passSpecialInputs(MIRBuilder, CCInfo, ArgRegs&: ImplicitArgRegs, Info))
1645 return false;
1646 }
1647
1648 // Mark the scratch resource descriptor as allocated so the CC analysis
1649 // does not assign user arguments to these registers, matching the callee.
1650 if (!ST.hasFlatScratchEnabled()) {
1651 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1652 CCInfo.AllocateReg(Reg: FuncInfo->getScratchRSrcReg());
1653 }
1654
1655 // Do the actual argument marshalling.
1656 OutgoingValueAssigner Assigner(AssignFnFixed, AssignFnVarArg);
1657 if (!determineAssignments(Assigner, Args&: OutArgs, CCInfo))
1658 return false;
1659
1660 AMDGPUOutgoingArgHandler Handler(MIRBuilder, MRI, MIB, false);
1661 if (!handleAssignments(Handler, Args&: OutArgs, CCState&: CCInfo, ArgLocs, MIRBuilder))
1662 return false;
1663
1664 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1665
1666 if (Info.ConvergenceCtrlToken) {
1667 MIB.addUse(RegNo: Info.ConvergenceCtrlToken, Flags: RegState::Implicit);
1668 }
1669 handleImplicitCallArguments(MIRBuilder, CallInst&: MIB, ST, FuncInfo: *MFI, CalleeCC: Info.CallConv,
1670 ImplicitArgRegs);
1671
1672 // Get a count of how many bytes are to be pushed on the stack.
1673 unsigned NumBytes = CCInfo.getStackSize();
1674
1675 // If Callee is a reg, since it is used by a target specific
1676 // instruction, it must have a register class matching the
1677 // constraint of that instruction.
1678
1679 // FIXME: We should define regbankselectable call instructions to handle
1680 // divergent call targets.
1681 if (MIB->getOperand(i: 1).isReg()) {
1682 MIB->getOperand(i: 1).setReg(constrainOperandRegClass(
1683 MF, TRI: *TRI, MRI, TII: *ST.getInstrInfo(),
1684 RBI: *ST.getRegBankInfo(), InsertPt&: *MIB, II: MIB->getDesc(), RegMO&: MIB->getOperand(i: 1),
1685 OpIdx: 1));
1686 }
1687
1688 // Now we can add the actual call instruction to the correct position.
1689 MIRBuilder.insertInstr(MIB);
1690
1691 // Finally we can copy the returned value back into its virtual-register. In
1692 // symmetry with the arguments, the physical register must be an
1693 // implicit-define of the call instruction.
1694 if (Info.CanLowerReturn && !Info.OrigRet.Ty->isVoidTy()) {
1695 CCAssignFn *RetAssignFn = TLI.CCAssignFnForReturn(CC: Info.CallConv,
1696 IsVarArg: Info.IsVarArg);
1697 IncomingValueAssigner Assigner(RetAssignFn);
1698 CallReturnHandler Handler(MIRBuilder, MRI, MIB);
1699 if (!determineAndHandleAssignments(Handler, Assigner, Args&: InArgs, MIRBuilder,
1700 CallConv: Info.CallConv, IsVarArg: Info.IsVarArg))
1701 return false;
1702 }
1703
1704 uint64_t CalleePopBytes = NumBytes;
1705
1706 MIRBuilder.buildInstr(Opcode: AMDGPU::ADJCALLSTACKDOWN)
1707 .addImm(Val: 0)
1708 .addImm(Val: CalleePopBytes);
1709
1710 if (!Info.CanLowerReturn) {
1711 insertSRetLoads(MIRBuilder, RetTy: Info.OrigRet.Ty, VRegs: Info.OrigRet.Regs,
1712 DemoteReg: Info.DemoteRegister, FI: Info.DemoteStackIndex);
1713 }
1714
1715 return true;
1716}
1717
1718void AMDGPUCallLowering::addOriginalExecToReturn(
1719 MachineFunction &MF, MachineInstrBuilder &Ret) const {
1720 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1721 const SIInstrInfo *TII = ST.getInstrInfo();
1722 const MachineInstr *Setup = TII->getWholeWaveFunctionSetup(MF);
1723 Ret.addReg(RegNo: Setup->getOperand(i: 0).getReg());
1724}
1725