//===-- llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp - Call lowering -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file implements the lowering of LLVM calls to machine code calls for
/// GlobalISel.
///
//===----------------------------------------------------------------------===//

#include "AMDGPUCallLowering.h"
#include "AMDGPU.h"
#include "AMDGPULegalizerInfo.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/PseudoSourceValueManager.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"

#define DEBUG_TYPE "amdgpu-call-lowering"

using namespace llvm;

namespace {

/// Wrapper around extendRegister to ensure we extend to a full 32-bit register.
static Register extendRegisterMin32(CallLowering::ValueHandler &Handler,
                                    Register ValVReg, const CCValAssign &VA) {
  if (VA.getLocVT().getSizeInBits() < 32) {
    // 16-bit types are reported as legal for 32-bit registers. We need to
    // extend and do a 32-bit copy to avoid the verifier complaining about it.
    return Handler.MIRBuilder.buildAnyExt(LLT::scalar(32), ValVReg).getReg(0);
  }

  return Handler.extendRegister(ValVReg, VA);
}

struct AMDGPUOutgoingValueHandler : public CallLowering::OutgoingValueHandler {
  AMDGPUOutgoingValueHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                             MachineInstrBuilder MIB)
      : OutgoingValueHandler(B, MRI), MIB(MIB) {}

  MachineInstrBuilder MIB;

  Register getStackAddress(uint64_t Size, int64_t Offset,
                           MachinePointerInfo &MPO,
                           ISD::ArgFlagsTy Flags) override {
    llvm_unreachable("not implemented");
  }

  void assignValueToAddress(Register ValVReg, Register Addr, LLT MemTy,
                            const MachinePointerInfo &MPO,
                            const CCValAssign &VA) override {
    llvm_unreachable("not implemented");
  }

  void assignValueToReg(Register ValVReg, Register PhysReg,
                        const CCValAssign &VA) override {
    Register ExtReg = extendRegisterMin32(*this, ValVReg, VA);

    // If this is a scalar return, insert a readfirstlane just in case the value
    // ends up in a VGPR.
    // FIXME: Assert this is a shader return.
    const SIRegisterInfo *TRI
      = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
    if (TRI->isSGPRReg(MRI, PhysReg)) {
      LLT Ty = MRI.getType(ExtReg);
      LLT S32 = LLT::scalar(32);
      if (Ty != S32) {
        // FIXME: We should probably support readfirstlane intrinsics with all
        // legal 32-bit types.
        assert(Ty.getSizeInBits() == 32);
        if (Ty.isPointer())
          ExtReg = MIRBuilder.buildPtrToInt(S32, ExtReg).getReg(0);
        else
          ExtReg = MIRBuilder.buildBitcast(S32, ExtReg).getReg(0);
      }

      auto ToSGPR = MIRBuilder
                        .buildIntrinsic(Intrinsic::amdgcn_readfirstlane,
                                        {MRI.getType(ExtReg)})
                        .addReg(ExtReg);
      ExtReg = ToSGPR.getReg(0);
    }

    MIRBuilder.buildCopy(PhysReg, ExtReg);
    MIB.addUse(PhysReg, RegState::Implicit);
  }
};

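    // The readfirstlane below operates on a 32-bit scalar, so pointers are
    // first converted with ptrtoint and other 32-bit types are bitcast to s32.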
struct AMDGPUIncomingArgHandler : public CallLowering::IncomingValueHandler {
  uint64_t StackUsed = 0;

  AMDGPUIncomingArgHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI)
      : IncomingValueHandler(B, MRI) {}

  Register getStackAddress(uint64_t Size, int64_t Offset,
                           MachinePointerInfo &MPO,
                           ISD::ArgFlagsTy Flags) override {
    auto &MFI = MIRBuilder.getMF().getFrameInfo();

    // Byval is assumed to be writable memory, but other stack passed arguments
    // are not.
    const bool IsImmutable = !Flags.isByVal();
    int FI = MFI.CreateFixedObject(Size, Offset, IsImmutable);
    MPO = MachinePointerInfo::getFixedStack(MIRBuilder.getMF(), FI);
    auto AddrReg = MIRBuilder.buildFrameIndex(
        LLT::pointer(AMDGPUAS::PRIVATE_ADDRESS, 32), FI);
    StackUsed = std::max(StackUsed, Size + Offset);
    return AddrReg.getReg(0);
  }

  void assignValueToReg(Register ValVReg, Register PhysReg,
                        const CCValAssign &VA) override {
    markPhysRegUsed(PhysReg);

    if (VA.getLocVT().getSizeInBits() < 32) {
      // 16-bit types are reported as legal for 32-bit registers. We need to do
      // a 32-bit copy, and truncate to avoid the verifier complaining about it.
      auto Copy = MIRBuilder.buildCopy(LLT::scalar(32), PhysReg);

      // If we have signext/zeroext, it applies to the whole 32-bit register
      // before truncation.
      auto Extended =
          buildExtensionHint(VA, Copy.getReg(0), LLT(VA.getLocVT()));
      MIRBuilder.buildTrunc(ValVReg, Extended);
      return;
    }

    IncomingValueHandler::assignValueToReg(ValVReg, PhysReg, VA);
  }

  void assignValueToAddress(Register ValVReg, Register Addr, LLT MemTy,
                            const MachinePointerInfo &MPO,
                            const CCValAssign &VA) override {
    MachineFunction &MF = MIRBuilder.getMF();

    auto *MMO = MF.getMachineMemOperand(
        MPO, MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant, MemTy,
        inferAlignFromPtrInfo(MF, MPO));
    MIRBuilder.buildLoad(ValVReg, Addr, *MMO);
  }

  /// How the physical register gets marked varies between formal
  /// parameters (it's a basic-block live-in), and a call instruction
  /// (it's an implicit-def of the call instruction).
  virtual void markPhysRegUsed(unsigned PhysReg) = 0;
};

struct FormalArgHandler : public AMDGPUIncomingArgHandler {
  FormalArgHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI)
      : AMDGPUIncomingArgHandler(B, MRI) {}

  void markPhysRegUsed(unsigned PhysReg) override {
    MIRBuilder.getMBB().addLiveIn(PhysReg);
  }
};

struct CallReturnHandler : public AMDGPUIncomingArgHandler {
  CallReturnHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
                    MachineInstrBuilder MIB)
      : AMDGPUIncomingArgHandler(MIRBuilder, MRI), MIB(MIB) {}

  void markPhysRegUsed(unsigned PhysReg) override {
    MIB.addDef(PhysReg, RegState::Implicit);
  }

  MachineInstrBuilder MIB;
};

struct AMDGPUOutgoingArgHandler : public AMDGPUOutgoingValueHandler {
  /// For tail calls, the byte offset of the call's argument area from the
  /// callee's. Unused elsewhere.
  int FPDiff;

  // Cache the SP register vreg if we need it more than once in this call site.
  Register SPReg;

  bool IsTailCall;

  AMDGPUOutgoingArgHandler(MachineIRBuilder &MIRBuilder,
                           MachineRegisterInfo &MRI, MachineInstrBuilder MIB,
                           bool IsTailCall = false, int FPDiff = 0)
      : AMDGPUOutgoingValueHandler(MIRBuilder, MRI, MIB), FPDiff(FPDiff),
        IsTailCall(IsTailCall) {}

  Register getStackAddress(uint64_t Size, int64_t Offset,
                           MachinePointerInfo &MPO,
                           ISD::ArgFlagsTy Flags) override {
    MachineFunction &MF = MIRBuilder.getMF();
    const LLT PtrTy = LLT::pointer(AMDGPUAS::PRIVATE_ADDRESS, 32);
    const LLT S32 = LLT::scalar(32);

    if (IsTailCall) {
      Offset += FPDiff;
      int FI = MF.getFrameInfo().CreateFixedObject(Size, Offset, true);
      auto FIReg = MIRBuilder.buildFrameIndex(PtrTy, FI);
      MPO = MachinePointerInfo::getFixedStack(MF, FI);
      return FIReg.getReg(0);
    }

    const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

    if (!SPReg) {
      const GCNSubtarget &ST = MIRBuilder.getMF().getSubtarget<GCNSubtarget>();
      if (ST.hasFlatScratchEnabled()) {
        // The stack is accessed unswizzled, so we can use a regular copy.
        SPReg = MIRBuilder.buildCopy(PtrTy,
                                     MFI->getStackPtrOffsetReg()).getReg(0);
      } else {
        // The address we produce here, without knowing the use context, is
        // going to be interpreted as a vector address, so we need to convert
        // to a swizzled address.
        SPReg = MIRBuilder.buildInstr(AMDGPU::G_AMDGPU_WAVE_ADDRESS, {PtrTy},
                                      {MFI->getStackPtrOffsetReg()}).getReg(0);
      }
    }

    auto OffsetReg = MIRBuilder.buildConstant(S32, Offset);

    auto AddrReg = MIRBuilder.buildPtrAdd(PtrTy, SPReg, OffsetReg);
    MPO = MachinePointerInfo::getStack(MF, Offset);
    return AddrReg.getReg(0);
  }

  void assignValueToAddress(Register ValVReg, Register Addr, LLT MemTy,
                            const MachinePointerInfo &MPO,
                            const CCValAssign &VA) override {
    MachineFunction &MF = MIRBuilder.getMF();
    uint64_t LocMemOffset = VA.getLocMemOffset();
    const auto &ST = MF.getSubtarget<GCNSubtarget>();

    auto *MMO = MF.getMachineMemOperand(
        MPO, MachineMemOperand::MOStore, MemTy,
        commonAlignment(ST.getStackAlignment(), LocMemOffset));
    MIRBuilder.buildStore(ValVReg, Addr, *MMO);
  }

  void assignValueToAddress(const CallLowering::ArgInfo &Arg,
                            unsigned ValRegIndex, Register Addr, LLT MemTy,
                            const MachinePointerInfo &MPO,
                            const CCValAssign &VA) override {
    Register ValVReg = VA.getLocInfo() != CCValAssign::LocInfo::FPExt
                           ? extendRegister(Arg.Regs[ValRegIndex], VA)
                           : Arg.Regs[ValRegIndex];
    assignValueToAddress(ValVReg, Addr, MemTy, MPO, VA);
  }
};
} // anonymous namespace

AMDGPUCallLowering::AMDGPUCallLowering(const AMDGPUTargetLowering &TLI)
  : CallLowering(&TLI) {
}

// FIXME: Compatibility shim
static ISD::NodeType extOpcodeToISDExtOpcode(unsigned MIOpc) {
  switch (MIOpc) {
  case TargetOpcode::G_SEXT:
    return ISD::SIGN_EXTEND;
  case TargetOpcode::G_ZEXT:
    return ISD::ZERO_EXTEND;
  case TargetOpcode::G_ANYEXT:
    return ISD::ANY_EXTEND;
  default:
    llvm_unreachable("not an extend opcode");
  }
}

bool AMDGPUCallLowering::canLowerReturn(MachineFunction &MF,
                                        CallingConv::ID CallConv,
                                        SmallVectorImpl<BaseArgInfo> &Outs,
                                        bool IsVarArg) const {
  // For shaders. Vector types should be explicitly handled by CC.
  if (AMDGPU::isEntryFunctionCC(CallConv))
    return true;

  SmallVector<CCValAssign, 16> ArgLocs;
  const SITargetLowering &TLI = *getTLI<SITargetLowering>();
  CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs,
                 MF.getFunction().getContext());

  return checkReturn(CCInfo, Outs, TLI.CCAssignFnForReturn(CallConv, IsVarArg));
}

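// Maps GlobalISel extend opcodes onto the equivalent ISD extension kinds so
// that SelectionDAG-oriented TLI hooks such as getTypeForExtReturn can be
// reused from GlobalISel.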
/// Lower the return value for the already existing \p Ret. This assumes that
/// \p B's insertion point is correct.
bool AMDGPUCallLowering::lowerReturnVal(MachineIRBuilder &B,
                                        const Value *Val,
                                        ArrayRef<Register> VRegs,
                                        MachineInstrBuilder &Ret) const {
  if (!Val)
    return true;

  auto &MF = B.getMF();
  const auto &F = MF.getFunction();
  const DataLayout &DL = MF.getDataLayout();
  MachineRegisterInfo *MRI = B.getMRI();
  LLVMContext &Ctx = F.getContext();

  CallingConv::ID CC = F.getCallingConv();
  const SITargetLowering &TLI = *getTLI<SITargetLowering>();

  SmallVector<EVT, 8> SplitEVTs;
  ComputeValueVTs(TLI, DL, Val->getType(), SplitEVTs);
  assert(VRegs.size() == SplitEVTs.size() &&
         "For each split Type there should be exactly one VReg.");

  SmallVector<ArgInfo, 8> SplitRetInfos;

  for (unsigned i = 0; i < SplitEVTs.size(); ++i) {
    EVT VT = SplitEVTs[i];
    Register Reg = VRegs[i];
    ArgInfo RetInfo(Reg, VT.getTypeForEVT(Ctx), 0);
    setArgFlags(RetInfo, AttributeList::ReturnIndex, DL, F);

    if (VT.isScalarInteger()) {
      unsigned ExtendOp = TargetOpcode::G_ANYEXT;
      if (RetInfo.Flags[0].isSExt()) {
        assert(RetInfo.Regs.size() == 1 && "expect only simple return values");
        ExtendOp = TargetOpcode::G_SEXT;
      } else if (RetInfo.Flags[0].isZExt()) {
        assert(RetInfo.Regs.size() == 1 && "expect only simple return values");
        ExtendOp = TargetOpcode::G_ZEXT;
      }

      EVT ExtVT = TLI.getTypeForExtReturn(Ctx, VT,
                                          extOpcodeToISDExtOpcode(ExtendOp));
      if (ExtVT != VT) {
        RetInfo.Ty = ExtVT.getTypeForEVT(Ctx);
        LLT ExtTy = getLLTForType(*RetInfo.Ty, DL);
        Reg = B.buildInstr(ExtendOp, {ExtTy}, {Reg}).getReg(0);
      }
    }

    if (Reg != RetInfo.Regs[0]) {
      RetInfo.Regs[0] = Reg;
      // Reset the arg flags after modifying Reg.
      setArgFlags(RetInfo, AttributeList::ReturnIndex, DL, F);
    }

    splitToValueTypes(RetInfo, SplitRetInfos, DL, CC);
  }

  CCAssignFn *AssignFn = TLI.CCAssignFnForReturn(CC, F.isVarArg());

  OutgoingValueAssigner Assigner(AssignFn);
  AMDGPUOutgoingValueHandler RetHandler(B, *MRI, Ret);
  return determineAndHandleAssignments(RetHandler, Assigner, SplitRetInfos, B,
                                       CC, F.isVarArg());
}

bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &B, const Value *Val,
                                     ArrayRef<Register> VRegs,
                                     FunctionLoweringInfo &FLI) const {

  MachineFunction &MF = B.getMF();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MFI->setIfReturnsVoid(!Val);

  assert(!Val == VRegs.empty() && "Return value without a vreg");

  CallingConv::ID CC = B.getMF().getFunction().getCallingConv();
  const bool IsShader = AMDGPU::isShader(CC);
  const bool IsWaveEnd =
      (IsShader && MFI->returnsVoid()) || AMDGPU::isKernel(CC);
  if (IsWaveEnd) {
    B.buildInstr(AMDGPU::S_ENDPGM)
      .addImm(0);
    return true;
  }

  const bool IsWholeWave = MFI->isWholeWaveFunction();
  unsigned ReturnOpc = IsWholeWave ? AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_RETURN
                       : IsShader  ? AMDGPU::SI_RETURN_TO_EPILOG
                                   : AMDGPU::SI_RETURN;
  auto Ret = B.buildInstrNoInsert(ReturnOpc);

  if (!FLI.CanLowerReturn)
    insertSRetStores(B, Val->getType(), VRegs, FLI.DemoteRegister);
  else if (!lowerReturnVal(B, Val, VRegs, Ret))
    return false;

  if (IsWholeWave)
    addOriginalExecToReturn(B.getMF(), Ret);

  // TODO: Handle CalleeSavedRegsViaCopy.

  B.insertInstr(Ret);
  return true;
}

void AMDGPUCallLowering::lowerParameterPtr(Register DstReg, MachineIRBuilder &B,
                                           uint64_t Offset) const {
  MachineFunction &MF = B.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  Register KernArgSegmentPtr =
    MFI->getPreloadedReg(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  Register KernArgSegmentVReg = MRI.getLiveInVirtReg(KernArgSegmentPtr);

  auto OffsetReg = B.buildConstant(LLT::scalar(64), Offset);

  B.buildPtrAdd(DstReg, KernArgSegmentVReg, OffsetReg);
}

void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &B, ArgInfo &OrigArg,
                                        uint64_t Offset,
                                        Align Alignment) const {
  MachineFunction &MF = B.getMF();
  const Function &F = MF.getFunction();
  const DataLayout &DL = F.getDataLayout();
  const SITargetLowering &TLI = *getTLI<SITargetLowering>();
  MachinePointerInfo PtrInfo = TLI.getKernargSegmentPtrInfo(MF);

  LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);

  SmallVector<ArgInfo, 32> SplitArgs;
  SmallVector<uint64_t> FieldOffsets;
  splitToValueTypes(OrigArg, SplitArgs, DL, F.getCallingConv(), &FieldOffsets);

  unsigned Idx = 0;
  for (ArgInfo &SplitArg : SplitArgs) {
    Register PtrReg = B.getMRI()->createGenericVirtualRegister(PtrTy);
    lowerParameterPtr(PtrReg, B, Offset + FieldOffsets[Idx]);

    LLT ArgTy = getLLTForType(*SplitArg.Ty, DL);
    if (SplitArg.Flags[0].isPointer()) {
      // Compensate for losing pointeriness in splitValueTypes.
      LLT PtrTy = LLT::pointer(SplitArg.Flags[0].getPointerAddrSpace(),
                               ArgTy.getScalarSizeInBits());
      ArgTy = ArgTy.isVector() ? LLT::vector(ArgTy.getElementCount(), PtrTy)
                               : PtrTy;
    }

    MachineMemOperand *MMO = MF.getMachineMemOperand(
        PtrInfo,
        MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
            MachineMemOperand::MOInvariant,
        ArgTy, commonAlignment(Alignment, FieldOffsets[Idx]));

    assert(SplitArg.Regs.size() == 1);

    B.buildLoad(SplitArg.Regs[0], PtrReg, *MMO);
    ++Idx;
  }
}

// Allocate special inputs passed in user SGPRs.
static void allocateHSAUserSGPRs(CCState &CCInfo,
                                 MachineIRBuilder &B,
                                 MachineFunction &MF,
                                 const SIRegisterInfo &TRI,
                                 SIMachineFunctionInfo &Info) {
  // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
  const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
  if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
    Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
    MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
    CCInfo.AllocateReg(PrivateSegmentBufferReg);
  }

  if (UserSGPRInfo.hasDispatchPtr()) {
    Register DispatchPtrReg = Info.addDispatchPtr(TRI);
    MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(DispatchPtrReg);
  }

  if (UserSGPRInfo.hasQueuePtr()) {
    Register QueuePtrReg = Info.addQueuePtr(TRI);
    MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(QueuePtrReg);
  }

  if (UserSGPRInfo.hasKernargSegmentPtr()) {
    MachineRegisterInfo &MRI = MF.getRegInfo();
    Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
    const LLT P4 = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
    Register VReg = MRI.createGenericVirtualRegister(P4);
    MRI.addLiveIn(InputPtrReg, VReg);
    B.getMBB().addLiveIn(InputPtrReg);
    B.buildCopy(VReg, InputPtrReg);
    CCInfo.AllocateReg(InputPtrReg);
  }

  if (UserSGPRInfo.hasDispatchID()) {
    Register DispatchIDReg = Info.addDispatchID(TRI);
    MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(DispatchIDReg);
  }

  if (UserSGPRInfo.hasFlatScratchInit()) {
    Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
    MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(FlatScratchInitReg);
  }

  if (UserSGPRInfo.hasPrivateSegmentSize()) {
    Register PrivateSegmentSizeReg = Info.addPrivateSegmentSize(TRI);
    MF.addLiveIn(PrivateSegmentSizeReg, &AMDGPU::SGPR_32RegClass);
    CCInfo.AllocateReg(PrivateSegmentSizeReg);
  }

  // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
  // these from the dispatch pointer.
}

bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
    MachineIRBuilder &B, const Function &F,
    ArrayRef<ArrayRef<Register>> VRegs) const {
  MachineFunction &MF = B.getMF();
  const GCNSubtarget *Subtarget = &MF.getSubtarget<GCNSubtarget>();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
  const SITargetLowering &TLI = *getTLI<SITargetLowering>();
  const DataLayout &DL = F.getDataLayout();

  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext());

  allocateHSAUserSGPRs(CCInfo, B, MF, *TRI, *Info);

  unsigned i = 0;
  const Align KernArgBaseAlign(16);
  const unsigned BaseOffset = Subtarget->getExplicitKernelArgOffset();
  uint64_t ExplicitArgOffset = 0;

  // TODO: Align down to dword alignment and extract bits for extending loads.
  for (auto &Arg : F.args()) {
    // TODO: Add support for kernarg preload.
    if (Arg.hasAttribute("amdgpu-hidden-argument")) {
      LLVM_DEBUG(dbgs() << "Preloading hidden arguments is not supported\n");
      return false;
    }

    const bool IsByRef = Arg.hasByRefAttr();
    Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
    unsigned AllocSize = DL.getTypeAllocSize(ArgTy);
    if (AllocSize == 0)
      continue;

    MaybeAlign ParamAlign = IsByRef ? Arg.getParamAlign() : std::nullopt;
    Align ABIAlign = DL.getValueOrABITypeAlignment(ParamAlign, ArgTy);

    uint64_t ArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + BaseOffset;
    ExplicitArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + AllocSize;

    if (Arg.use_empty()) {
      ++i;
      continue;
    }

    Align Alignment = commonAlignment(KernArgBaseAlign, ArgOffset);

    if (IsByRef) {
      unsigned ByRefAS = cast<PointerType>(Arg.getType())->getAddressSpace();

      assert(VRegs[i].size() == 1 &&
             "expected only one register for byval pointers");
      if (ByRefAS == AMDGPUAS::CONSTANT_ADDRESS) {
        lowerParameterPtr(VRegs[i][0], B, ArgOffset);
      } else {
        const LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
        Register PtrReg = MRI.createGenericVirtualRegister(ConstPtrTy);
        lowerParameterPtr(PtrReg, B, ArgOffset);

        B.buildAddrSpaceCast(VRegs[i][0], PtrReg);
      }
    } else {
      ArgInfo OrigArg(VRegs[i], Arg, i);
      const unsigned OrigArgIdx = i + AttributeList::FirstArgIndex;
      setArgFlags(OrigArg, OrigArgIdx, DL, F);
      lowerParameter(B, OrigArg, ArgOffset, Alignment);
    }

    ++i;
  }

  if (Info->getNumKernargPreloadedSGPRs())
    Info->setNumWaveDispatchSGPRs(Info->getNumUserSGPRs());

  TLI.allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
  TLI.allocateSystemSGPRs(CCInfo, MF, *Info, F.getCallingConv(), false);
  return true;
}

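  // ExplicitArgOffset tracks the running byte offset within the explicit
  // kernel argument block; ArgOffset below additionally includes BaseOffset,
  // the subtarget's offset to the first explicit argument within the kernarg
  // segment.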
bool AMDGPUCallLowering::lowerFormalArguments(
    MachineIRBuilder &B, const Function &F, ArrayRef<ArrayRef<Register>> VRegs,
    FunctionLoweringInfo &FLI) const {
  CallingConv::ID CC = F.getCallingConv();

  // The infrastructure for normal calling convention lowering is essentially
  // useless for kernels. We want to avoid any kind of legalization or argument
  // splitting.
  if (CC == CallingConv::AMDGPU_KERNEL)
    return lowerFormalArgumentsKernel(B, F, VRegs);

  const bool IsGraphics = AMDGPU::isGraphics(CC);
  const bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CC);

  MachineFunction &MF = B.getMF();
  MachineBasicBlock &MBB = B.getMBB();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = Subtarget.getRegisterInfo();
  const DataLayout &DL = F.getDataLayout();

  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CC, F.isVarArg(), MF, ArgLocs, F.getContext());
  const GCNUserSGPRUsageInfo &UserSGPRInfo = Info->getUserSGPRInfo();

  if (UserSGPRInfo.hasImplicitBufferPtr()) {
    Register ImplicitBufferPtrReg = Info->addImplicitBufferPtr(*TRI);
    MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(ImplicitBufferPtrReg);
  }

  // FIXME: This probably isn't defined for mesa
  if (UserSGPRInfo.hasFlatScratchInit() && !Subtarget.isAmdPalOS()) {
    Register FlatScratchInitReg = Info->addFlatScratchInit(*TRI);
    MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(FlatScratchInitReg);
  }

  SmallVector<ArgInfo, 32> SplitArgs;
  unsigned Idx = 0;
  unsigned PSInputNum = 0;

  // Insert the hidden sret parameter if the return value won't fit in the
  // return registers.
  if (!FLI.CanLowerReturn)
    insertSRetIncomingArgument(F, SplitArgs, FLI.DemoteRegister, MRI, DL);

  for (auto &Arg : F.args()) {
    if (DL.getTypeStoreSize(Arg.getType()) == 0)
      continue;

    if (Info->isWholeWaveFunction() && Idx == 0) {
      assert(VRegs[Idx].size() == 1 && "Expected only one register");

      // The first argument for whole wave functions is the original EXEC
      // value.
      B.buildInstr(AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_SETUP)
          .addDef(VRegs[Idx][0]);

      ++Idx;
      continue;
    }

    const bool InReg = Arg.hasAttribute(Attribute::InReg);

    if (Arg.hasAttribute(Attribute::SwiftSelf) ||
        Arg.hasAttribute(Attribute::SwiftError) ||
        Arg.hasAttribute(Attribute::Nest))
      return false;

    if (CC == CallingConv::AMDGPU_PS && !InReg && PSInputNum <= 15) {
      const bool ArgUsed = !Arg.use_empty();
      bool SkipArg = !ArgUsed && !Info->isPSInputAllocated(PSInputNum);

      if (!SkipArg) {
        Info->markPSInputAllocated(PSInputNum);
        if (ArgUsed)
          Info->markPSInputEnabled(PSInputNum);
      }

      ++PSInputNum;

      if (SkipArg) {
        for (Register R : VRegs[Idx])
          B.buildUndef(R);

        ++Idx;
        continue;
      }
    }

    ArgInfo OrigArg(VRegs[Idx], Arg, Idx);
    const unsigned OrigArgIdx = Idx + AttributeList::FirstArgIndex;
    setArgFlags(OrigArg, OrigArgIdx, DL, F);

    splitToValueTypes(OrigArg, SplitArgs, DL, CC);
    ++Idx;
  }

  // At least one interpolation mode must be enabled or else the GPU will
  // hang.
  //
  // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
  // set PSInputAddr, the user wants to enable some bits after the compilation
  // based on run-time states. Since we can't know what the final PSInputEna
  // will look like, we shouldn't do anything here and the user should take
  // responsibility for the correct programming.
  //
  // Otherwise, the following restrictions apply:
  // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
  // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
  //   enabled too.
  if (CC == CallingConv::AMDGPU_PS) {
    if ((Info->getPSInputAddr() & 0x7F) == 0 ||
        ((Info->getPSInputAddr() & 0xF) == 0 &&
         Info->isPSInputAllocated(11))) {
      CCInfo.AllocateReg(AMDGPU::VGPR0);
      CCInfo.AllocateReg(AMDGPU::VGPR1);
      Info->markPSInputAllocated(0);
      Info->markPSInputEnabled(0);
    }

    if (Subtarget.isAmdPalOS()) {
      // For isAmdPalOS, the user does not enable some bits after compilation
      // based on run-time states; the register values being generated here are
      // the final ones set in hardware. Therefore we need to apply the
      // workaround to PSInputAddr and PSInputEnable together. (The case where
      // a bit is set in PSInputAddr but not PSInputEnable is where the frontend
      // set up an input arg for a particular interpolation mode, but nothing
      // uses that input arg. Really we should have an earlier pass that removes
      // such an arg.)
      unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
      if ((PsInputBits & 0x7F) == 0 ||
          ((PsInputBits & 0xF) == 0 &&
           (PsInputBits >> 11 & 1)))
        Info->markPSInputEnabled(llvm::countr_zero(Info->getPSInputAddr()));
    }
  }

  const SITargetLowering &TLI = *getTLI<SITargetLowering>();
  CCAssignFn *AssignFn = TLI.CCAssignFnForCall(CC, F.isVarArg());

  if (!MBB.empty())
    B.setInstr(*MBB.begin());

  if (!IsEntryFunc && !IsGraphics) {
    // For the fixed ABI, pass workitem IDs in the last argument register.
    TLI.allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);

    if (!Subtarget.hasFlatScratchEnabled())
      CCInfo.AllocateReg(Info->getScratchRSrcReg());
    TLI.allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
  }

  IncomingValueAssigner Assigner(AssignFn);
  if (!determineAssignments(Assigner, SplitArgs, CCInfo))
    return false;

  if (IsEntryFunc) {
    // This assumes the registers are allocated by CCInfo in ascending order
    // with no gaps.
    Info->setNumWaveDispatchSGPRs(
        CCInfo.getFirstUnallocated(AMDGPU::SGPR_32RegClass.getRegisters()));
    Info->setNumWaveDispatchVGPRs(
        CCInfo.getFirstUnallocated(AMDGPU::VGPR_32RegClass.getRegisters()));
  }

  FormalArgHandler Handler(B, MRI);
  if (!handleAssignments(Handler, SplitArgs, CCInfo, ArgLocs, B))
    return false;

  uint64_t StackSize = Assigner.StackSize;

  // Start adding system SGPRs.
  if (IsEntryFunc)
    TLI.allocateSystemSGPRs(CCInfo, MF, *Info, CC, IsGraphics);

  // When we tail call, we need to check if the callee's arguments will fit on
  // the caller's stack. So, whenever we lower formal arguments, we should keep
  // track of this information, since we might lower a tail call in this
  // function later.
  Info->setBytesInStackArgArea(StackSize);

  // Move back to the end of the basic block.
  B.setMBB(MBB);

  return true;
}

bool AMDGPUCallLowering::passSpecialInputs(MachineIRBuilder &MIRBuilder,
                                           CCState &CCInfo,
                                           SmallVectorImpl<std::pair<MCRegister, Register>> &ArgRegs,
                                           CallLoweringInfo &Info) const {
  MachineFunction &MF = MIRBuilder.getMF();

  // If there's no call site, this doesn't correspond to a call from the IR and
  // doesn't need implicit inputs.
  if (!Info.CB)
    return true;

  const AMDGPUFunctionArgInfo *CalleeArgInfo
    = &AMDGPUArgumentUsageInfo::FixedABIFunctionInfo;

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const AMDGPUFunctionArgInfo &CallerArgInfo = MFI->getArgInfo();

  // TODO: Unify with private memory register handling. This is complicated by
  // the fact that at least in kernels, the input argument is not necessarily
  // in the same location as the input.
  AMDGPUFunctionArgInfo::PreloadedValue InputRegs[] = {
    AMDGPUFunctionArgInfo::DISPATCH_PTR,
    AMDGPUFunctionArgInfo::QUEUE_PTR,
    AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR,
    AMDGPUFunctionArgInfo::DISPATCH_ID,
    AMDGPUFunctionArgInfo::WORKGROUP_ID_X,
    AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,
    AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,
    AMDGPUFunctionArgInfo::LDS_KERNEL_ID,
  };

  static constexpr StringLiteral ImplicitAttrNames[][2] = {
    {"amdgpu-no-dispatch-ptr", ""},
    {"amdgpu-no-queue-ptr", ""},
    {"amdgpu-no-implicitarg-ptr", ""},
    {"amdgpu-no-dispatch-id", ""},
    {"amdgpu-no-workgroup-id-x", "amdgpu-no-cluster-id-x"},
    {"amdgpu-no-workgroup-id-y", "amdgpu-no-cluster-id-y"},
    {"amdgpu-no-workgroup-id-z", "amdgpu-no-cluster-id-z"},
    {"amdgpu-no-lds-kernel-id", ""},
  };

  MachineRegisterInfo &MRI = MF.getRegInfo();

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const AMDGPULegalizerInfo *LI
    = static_cast<const AMDGPULegalizerInfo*>(ST.getLegalizerInfo());

  unsigned I = 0;
  for (auto InputID : InputRegs) {
    const ArgDescriptor *OutgoingArg;
    const TargetRegisterClass *ArgRC;
    LLT ArgTy;

    // If the callee does not use the attribute value, skip copying the value.
    if (all_of(ImplicitAttrNames[I++], [&](StringRef AttrName) {
          return AttrName.empty() || Info.CB->hasFnAttr(AttrName);
        }))
      continue;

    std::tie(OutgoingArg, ArgRC, ArgTy) =
        CalleeArgInfo->getPreloadedValue(InputID);
    if (!OutgoingArg)
      continue;

    const ArgDescriptor *IncomingArg;
    const TargetRegisterClass *IncomingArgRC;
    std::tie(IncomingArg, IncomingArgRC, ArgTy) =
        CallerArgInfo.getPreloadedValue(InputID);
    assert(IncomingArgRC == ArgRC);

    Register InputReg = MRI.createGenericVirtualRegister(ArgTy);

    if (IncomingArg) {
      LI->buildLoadInputValue(InputReg, MIRBuilder, IncomingArg, ArgRC, ArgTy);
    } else if (InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) {
      LI->getImplicitArgPtr(InputReg, MRI, MIRBuilder);
    } else if (InputID == AMDGPUFunctionArgInfo::LDS_KERNEL_ID) {
      std::optional<uint32_t> Id =
          AMDGPUMachineFunction::getLDSKernelIdMetadata(MF.getFunction());
      if (Id) {
        MIRBuilder.buildConstant(InputReg, *Id);
      } else {
        MIRBuilder.buildUndef(InputReg);
      }
    } else {
      // We may have proven the input wasn't needed, although the ABI requires
      // it. We just need to allocate the register appropriately.
      MIRBuilder.buildUndef(InputReg);
    }

    if (OutgoingArg->isRegister()) {
      ArgRegs.emplace_back(OutgoingArg->getRegister(), InputReg);
      if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
        report_fatal_error("failed to allocate implicit input argument");
    } else {
      LLVM_DEBUG(dbgs() << "Unhandled stack passed implicit input argument\n");
      return false;
    }
  }

  // Pack workitem IDs into a single register or pass it as is if already
  // packed.
  const ArgDescriptor *OutgoingArg;
  const TargetRegisterClass *ArgRC;
  LLT ArgTy;

  std::tie(OutgoingArg, ArgRC, ArgTy) =
      CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  if (!OutgoingArg)
    std::tie(OutgoingArg, ArgRC, ArgTy) =
        CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  if (!OutgoingArg)
    std::tie(OutgoingArg, ArgRC, ArgTy) =
        CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  if (!OutgoingArg)
    return false;

  auto WorkitemIDX =
      CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  auto WorkitemIDY =
      CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  auto WorkitemIDZ =
      CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z);

  const ArgDescriptor *IncomingArgX = std::get<0>(WorkitemIDX);
  const ArgDescriptor *IncomingArgY = std::get<0>(WorkitemIDY);
  const ArgDescriptor *IncomingArgZ = std::get<0>(WorkitemIDZ);
  const LLT S32 = LLT::scalar(32);

  const bool NeedWorkItemIDX = !Info.CB->hasFnAttr("amdgpu-no-workitem-id-x");
  const bool NeedWorkItemIDY = !Info.CB->hasFnAttr("amdgpu-no-workitem-id-y");
  const bool NeedWorkItemIDZ = !Info.CB->hasFnAttr("amdgpu-no-workitem-id-z");

  // If incoming ids are not packed we need to pack them.
  // FIXME: Should consider known workgroup size to eliminate known 0 cases.
  Register InputReg;
  if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX &&
      NeedWorkItemIDX) {
    if (ST.getMaxWorkitemID(MF.getFunction(), 0) != 0) {
      InputReg = MRI.createGenericVirtualRegister(S32);
      LI->buildLoadInputValue(InputReg, MIRBuilder, IncomingArgX,
                              std::get<1>(WorkitemIDX),
                              std::get<2>(WorkitemIDX));
    } else {
      InputReg = MIRBuilder.buildConstant(S32, 0).getReg(0);
    }
  }

  if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY &&
      NeedWorkItemIDY && ST.getMaxWorkitemID(MF.getFunction(), 1) != 0) {
    Register Y = MRI.createGenericVirtualRegister(S32);
    LI->buildLoadInputValue(Y, MIRBuilder, IncomingArgY,
                            std::get<1>(WorkitemIDY), std::get<2>(WorkitemIDY));

    Y = MIRBuilder.buildShl(S32, Y, MIRBuilder.buildConstant(S32, 10)).getReg(0);
    InputReg = InputReg ? MIRBuilder.buildOr(S32, InputReg, Y).getReg(0) : Y;
  }

  if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ &&
      NeedWorkItemIDZ && ST.getMaxWorkitemID(MF.getFunction(), 2) != 0) {
    Register Z = MRI.createGenericVirtualRegister(S32);
    LI->buildLoadInputValue(Z, MIRBuilder, IncomingArgZ,
                            std::get<1>(WorkitemIDZ), std::get<2>(WorkitemIDZ));

    Z = MIRBuilder.buildShl(S32, Z, MIRBuilder.buildConstant(S32, 20)).getReg(0);
    InputReg = InputReg ? MIRBuilder.buildOr(S32, InputReg, Z).getReg(0) : Z;
  }

  if (!InputReg &&
      (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
    InputReg = MRI.createGenericVirtualRegister(S32);
    if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
      // We're in a situation where the outgoing function requires the workitem
      // ID, but the calling function does not have it (e.g. a graphics function
      // calling a C calling convention function). This is illegal, but we need
      // to produce something.
      MIRBuilder.buildUndef(InputReg);
    } else {
      // Workitem ids are already packed, and any of the present incoming
      // arguments will carry all required fields.
      ArgDescriptor IncomingArg = ArgDescriptor::createArg(
          IncomingArgX ? *IncomingArgX :
          IncomingArgY ? *IncomingArgY : *IncomingArgZ, ~0u);
      LI->buildLoadInputValue(InputReg, MIRBuilder, &IncomingArg,
                              &AMDGPU::VGPR_32RegClass, S32);
    }
  }

  if (OutgoingArg->isRegister()) {
    if (InputReg)
      ArgRegs.emplace_back(OutgoingArg->getRegister(), InputReg);

    if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
      report_fatal_error("failed to allocate implicit input argument");
  } else {
    LLVM_DEBUG(dbgs() << "Unhandled stack passed implicit input argument\n");
    return false;
  }

  return true;
}

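  // When packing is required, X remains in the low bits while Y and Z are
  // shifted up to bit 10 and bit 20 respectively before being OR'd into the
  // single packed register.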
/// Returns a pair containing the fixed CCAssignFn and the vararg CCAssignFn for
/// CC.
static std::pair<CCAssignFn *, CCAssignFn *>
getAssignFnsForCC(CallingConv::ID CC, const SITargetLowering &TLI) {
  return {TLI.CCAssignFnForCall(CC, false), TLI.CCAssignFnForCall(CC, true)};
}

static unsigned getCallOpcode(const MachineFunction &CallerF, bool IsIndirect,
                              bool IsTailCall, bool IsWave32,
                              CallingConv::ID CC,
                              bool IsDynamicVGPRChainCall = false) {
  // For calls to amdgpu_cs_chain functions, the address is known to be uniform.
  assert((AMDGPU::isChainCC(CC) || !IsIndirect || !IsTailCall) &&
         "Indirect calls can't be tail calls, "
         "because the address can be divergent");
  if (!IsTailCall)
    return AMDGPU::G_SI_CALL;

  if (AMDGPU::isChainCC(CC)) {
    if (IsDynamicVGPRChainCall)
      return IsWave32 ? AMDGPU::SI_CS_CHAIN_TC_W32_DVGPR
                      : AMDGPU::SI_CS_CHAIN_TC_W64_DVGPR;
    return IsWave32 ? AMDGPU::SI_CS_CHAIN_TC_W32 : AMDGPU::SI_CS_CHAIN_TC_W64;
  }

  if (CallerF.getFunction().getCallingConv() ==
      CallingConv::AMDGPU_Gfx_WholeWave)
    return AMDGPU::SI_TCRETURN_GFX_WholeWave;

  if (CC == CallingConv::AMDGPU_Gfx || CC == CallingConv::AMDGPU_Gfx_WholeWave)
    return AMDGPU::SI_TCRETURN_GFX;

  return AMDGPU::SI_TCRETURN;
}

// Add operands to call instruction to track the callee.
static bool addCallTargetOperands(MachineInstrBuilder &CallInst,
                                  MachineIRBuilder &MIRBuilder,
                                  AMDGPUCallLowering::CallLoweringInfo &Info,
                                  bool IsDynamicVGPRChainCall = false) {
  if (Info.Callee.isReg()) {
    CallInst.addReg(Info.Callee.getReg());
    CallInst.addImm(0);
  } else if (Info.Callee.isGlobal() && Info.Callee.getOffset() == 0) {
    // The call lowering lightly assumed we can directly encode a call target in
    // the instruction, which is not the case. Materialize the address here.
    const GlobalValue *GV = Info.Callee.getGlobal();
    auto Ptr = MIRBuilder.buildGlobalValue(
        LLT::pointer(GV->getAddressSpace(), 64), GV);
    CallInst.addReg(Ptr.getReg(0));

    if (IsDynamicVGPRChainCall) {
      // DynamicVGPR chain calls are always indirect.
      CallInst.addImm(0);
    } else
      CallInst.add(Info.Callee);
  } else
    return false;

  return true;
}

bool AMDGPUCallLowering::doCallerAndCalleePassArgsTheSameWay(
    CallLoweringInfo &Info, MachineFunction &MF,
    SmallVectorImpl<ArgInfo> &InArgs) const {
  const Function &CallerF = MF.getFunction();
  CallingConv::ID CalleeCC = Info.CallConv;
  CallingConv::ID CallerCC = CallerF.getCallingConv();

  // If the calling conventions match, then everything must be the same.
  if (CalleeCC == CallerCC)
    return true;

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();

  // Make sure that the caller and callee preserve all of the same registers.
  const auto *TRI = ST.getRegisterInfo();

  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
  const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
  if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
    return false;

  // Check if the caller and callee will handle arguments in the same way.
  const SITargetLowering &TLI = *getTLI<SITargetLowering>();
  CCAssignFn *CalleeAssignFnFixed;
  CCAssignFn *CalleeAssignFnVarArg;
  std::tie(CalleeAssignFnFixed, CalleeAssignFnVarArg) =
      getAssignFnsForCC(CalleeCC, TLI);

  CCAssignFn *CallerAssignFnFixed;
  CCAssignFn *CallerAssignFnVarArg;
  std::tie(CallerAssignFnFixed, CallerAssignFnVarArg) =
      getAssignFnsForCC(CallerCC, TLI);

  // FIXME: We are not accounting for potential differences in implicitly passed
  // inputs, but only the fixed ABI is supported now anyway.
  IncomingValueAssigner CalleeAssigner(CalleeAssignFnFixed,
                                       CalleeAssignFnVarArg);
  IncomingValueAssigner CallerAssigner(CallerAssignFnFixed,
                                       CallerAssignFnVarArg);
  return resultsCompatible(Info, MF, InArgs, CalleeAssigner, CallerAssigner);
}

bool AMDGPUCallLowering::areCalleeOutgoingArgsTailCallable(
    CallLoweringInfo &Info, MachineFunction &MF,
    SmallVectorImpl<ArgInfo> &OutArgs) const {
  // If there are no outgoing arguments, then we are done.
  if (OutArgs.empty())
    return true;

  const Function &CallerF = MF.getFunction();
  CallingConv::ID CalleeCC = Info.CallConv;
  CallingConv::ID CallerCC = CallerF.getCallingConv();
  const SITargetLowering &TLI = *getTLI<SITargetLowering>();

  CCAssignFn *AssignFnFixed;
  CCAssignFn *AssignFnVarArg;
  std::tie(AssignFnFixed, AssignFnVarArg) = getAssignFnsForCC(CalleeCC, TLI);

  // We have outgoing arguments. Make sure that we can tail call with them.
  SmallVector<CCValAssign, 16> OutLocs;
  CCState OutInfo(CalleeCC, false, MF, OutLocs, CallerF.getContext());
  OutgoingValueAssigner Assigner(AssignFnFixed, AssignFnVarArg);

  if (!determineAssignments(Assigner, OutArgs, OutInfo)) {
    LLVM_DEBUG(dbgs() << "... Could not analyze call operands.\n");
    return false;
  }

  // Make sure that they can fit on the caller's stack.
  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  if (OutInfo.getStackSize() > FuncInfo->getBytesInStackArgArea()) {
    LLVM_DEBUG(dbgs() << "... Cannot fit call operands on caller's stack.\n");
    return false;
  }

  // Verify that the parameters in callee-saved registers match.
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const uint32_t *CallerPreservedMask = TRI->getCallPreservedMask(MF, CallerCC);
  MachineRegisterInfo &MRI = MF.getRegInfo();
  return parametersInCSRMatch(MRI, CallerPreservedMask, OutLocs, OutArgs);
}

bool AMDGPUCallLowering::isEligibleForTailCallOptimization(
    MachineIRBuilder &B, CallLoweringInfo &Info,
    SmallVectorImpl<ArgInfo> &InArgs, SmallVectorImpl<ArgInfo> &OutArgs) const {
  // Must pass all target-independent checks in order to tail call optimize.
  if (!Info.IsTailCall)
    return false;

  // Indirect calls can't be tail calls, because the address can be divergent.
  // TODO Check divergence info if the call really is divergent.
  if (Info.Callee.isReg())
    return false;

  MachineFunction &MF = B.getMF();
  const Function &CallerF = MF.getFunction();
  CallingConv::ID CalleeCC = Info.CallConv;
  CallingConv::ID CallerCC = CallerF.getCallingConv();

  const SIRegisterInfo *TRI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
  // Kernels aren't callable, and don't have a live-in return address, so it
  // doesn't make sense to do a tail call with entry functions.
  if (!CallerPreserved)
    return false;

  if (!AMDGPU::mayTailCallThisCC(CalleeCC)) {
    LLVM_DEBUG(dbgs() << "... Calling convention cannot be tail called.\n");
    return false;
  }

  if (any_of(CallerF.args(), [](const Argument &A) {
        return A.hasByValAttr() || A.hasSwiftErrorAttr();
      })) {
    LLVM_DEBUG(dbgs() << "... Cannot tail call from callers with byval "
                         "or swifterror arguments\n");
    return false;
  }

  // If we have -tailcallopt, then we're done.
  if (MF.getTarget().Options.GuaranteedTailCallOpt) {
    return AMDGPU::canGuaranteeTCO(CalleeCC) &&
           CalleeCC == CallerF.getCallingConv();
  }

  // Verify that the incoming and outgoing arguments from the callee are
  // safe to tail call.
  if (!doCallerAndCalleePassArgsTheSameWay(Info, MF, InArgs)) {
    LLVM_DEBUG(
        dbgs()
        << "... Caller and callee have incompatible calling conventions.\n");
    return false;
  }

  // FIXME: We need to check if any arguments passed in SGPR are uniform. If
  // they are not, this cannot be a tail call. If they are uniform, but may be
  // VGPR, we need to insert readfirstlanes.
  if (!areCalleeOutgoingArgsTailCallable(Info, MF, OutArgs))
    return false;

  LLVM_DEBUG(dbgs() << "... Call is eligible for tail call optimization.\n");
  return true;
}

// Insert outgoing implicit arguments for a call, by inserting copies to the
// implicit argument registers and adding the necessary implicit uses to the
// call instruction.
void AMDGPUCallLowering::handleImplicitCallArguments(
    MachineIRBuilder &MIRBuilder, MachineInstrBuilder &CallInst,
    const GCNSubtarget &ST, const SIMachineFunctionInfo &FuncInfo,
    CallingConv::ID CalleeCC,
    ArrayRef<std::pair<MCRegister, Register>> ImplicitArgRegs) const {
  if (!ST.hasFlatScratchEnabled()) {
    // Insert copies for the SRD. In the HSA case, this should be an identity
    // copy.
    auto ScratchRSrcReg = MIRBuilder.buildCopy(LLT::fixed_vector(4, 32),
                                               FuncInfo.getScratchRSrcReg());

    auto CalleeRSrcReg = AMDGPU::isChainCC(CalleeCC)
                             ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
                             : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3;

    MIRBuilder.buildCopy(CalleeRSrcReg, ScratchRSrcReg);
    CallInst.addReg(CalleeRSrcReg, RegState::Implicit);
  }

  for (std::pair<MCRegister, Register> ArgReg : ImplicitArgRegs) {
    MIRBuilder.buildCopy((Register)ArgReg.first, ArgReg.second);
    CallInst.addReg(ArgReg.first, RegState::Implicit);
  }
}

namespace {
// Chain calls have special arguments that we need to handle. These have the
// same index as they do in the llvm.amdgcn.cs.chain intrinsic.
enum ChainCallArgIdx {
  Exec = 1,
  Flags = 4,
  NumVGPRs = 5,
  FallbackExec = 6,
  FallbackCallee = 7,
};
} // anonymous namespace

1233bool AMDGPUCallLowering::lowerTailCall(
1234 MachineIRBuilder &MIRBuilder, CallLoweringInfo &Info,
1235 SmallVectorImpl<ArgInfo> &OutArgs) const {
1236 MachineFunction &MF = MIRBuilder.getMF();
1237 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1238 SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1239 const Function &F = MF.getFunction();
1240 MachineRegisterInfo &MRI = MF.getRegInfo();
1241 const SIInstrInfo *TII = ST.getInstrInfo();
1242 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1243 const SITargetLowering &TLI = *getTLI<SITargetLowering>();
1244
1245 // True when we're tail calling, but without -tailcallopt.
1246 bool IsSibCall = !MF.getTarget().Options.GuaranteedTailCallOpt;
1247
1248 // Find out which ABI gets to decide where things go.
1249 CallingConv::ID CalleeCC = Info.CallConv;
1250 CCAssignFn *AssignFnFixed;
1251 CCAssignFn *AssignFnVarArg;
1252 std::tie(args&: AssignFnFixed, args&: AssignFnVarArg) = getAssignFnsForCC(CC: CalleeCC, TLI);
1253
1254 MachineInstrBuilder CallSeqStart;
1255 if (!IsSibCall)
1256 CallSeqStart = MIRBuilder.buildInstr(Opcode: AMDGPU::ADJCALLSTACKUP);
1257
1258 bool IsChainCall = AMDGPU::isChainCC(CC: Info.CallConv);
1259 bool IsDynamicVGPRChainCall = false;
1260
1261 if (IsChainCall) {
1262 ArgInfo FlagsArg = Info.OrigArgs[ChainCallArgIdx::Flags];
1263 const APInt &FlagsValue = cast<ConstantInt>(Val: FlagsArg.OrigValue)->getValue();
1264 if (FlagsValue.isZero()) {
1265 if (Info.OrigArgs.size() != 5) {
1266 LLVM_DEBUG(dbgs() << "No additional args allowed if flags == 0\n");
1267 return false;
1268 }
1269 } else if (FlagsValue.isOneBitSet(BitNo: 0)) {
1270 IsDynamicVGPRChainCall = true;
1271
1272 if (Info.OrigArgs.size() != 8) {
1273 LLVM_DEBUG(dbgs() << "Expected 3 additional args\n");
1274 return false;
1275 }
1276
1277 // On GFX12, we can only change the VGPR allocation for wave32.
1278 if (!ST.isWave32()) {
1279 F.getContext().diagnose(DI: DiagnosticInfoUnsupported(
1280 F, "dynamic VGPR mode is only supported for wave32"));
1281 return false;
1282 }
1283
1284 ArgInfo FallbackExecArg = Info.OrigArgs[ChainCallArgIdx::FallbackExec];
1285 assert(FallbackExecArg.Regs.size() == 1 &&
1286 "Expected single register for fallback EXEC");
1287 if (!FallbackExecArg.Ty->isIntegerTy(Bitwidth: ST.getWavefrontSize())) {
1288 LLVM_DEBUG(dbgs() << "Bad type for fallback EXEC\n");
1289 return false;
1290 }
1291 }
1292 }
1293
1294 unsigned Opc = getCallOpcode(CallerF: MF, IsIndirect: Info.Callee.isReg(), /*IsTailCall*/ true,
1295 IsWave32: ST.isWave32(), CC: CalleeCC, IsDynamicVGPRChainCall);
1296 auto MIB = MIRBuilder.buildInstrNoInsert(Opcode: Opc);
1297
1298 if (FuncInfo->isWholeWaveFunction())
1299 addOriginalExecToReturn(MF, Ret&: MIB);
1300
1301 // Keep track of the index of the next operand to be added to the call
1302 unsigned CalleeIdx = MIB->getNumOperands();
1303
1304 if (!addCallTargetOperands(CallInst&: MIB, MIRBuilder, Info, IsDynamicVGPRChainCall))
1305 return false;
1306
1307 // Byte offset for the tail call. When we are sibcalling, this will always
1308 // be 0.
1309 MIB.addImm(Val: 0);
1310
1311 // If this is a chain call, we need to pass in the EXEC mask as well as any
1312 // other special args.
1313 if (IsChainCall) {
1314 auto AddRegOrImm = [&](const ArgInfo &Arg) {
1315 if (auto CI = dyn_cast<ConstantInt>(Val: Arg.OrigValue)) {
1316 MIB.addImm(Val: CI->getSExtValue());
1317 } else {
1318 MIB.addReg(RegNo: Arg.Regs[0]);
1319 unsigned Idx = MIB->getNumOperands() - 1;
1320 MIB->getOperand(i: Idx).setReg(constrainOperandRegClass(
1321 MF, TRI: *TRI, MRI, TII: *TII, RBI: *ST.getRegBankInfo(), InsertPt&: *MIB, II: MIB->getDesc(),
1322 RegMO&: MIB->getOperand(i: Idx), OpIdx: Idx));
1323 }
1324 };
1325
1326 ArgInfo ExecArg = Info.OrigArgs[ChainCallArgIdx::Exec];
1327 assert(ExecArg.Regs.size() == 1 && "Too many regs for EXEC");
1328
1329 if (!ExecArg.Ty->isIntegerTy(Bitwidth: ST.getWavefrontSize())) {
1330 LLVM_DEBUG(dbgs() << "Bad type for EXEC");
1331 return false;
1332 }
1333
1334 AddRegOrImm(ExecArg);
1335 if (IsDynamicVGPRChainCall)
1336 std::for_each(first: Info.OrigArgs.begin() + ChainCallArgIdx::NumVGPRs,
1337 last: Info.OrigArgs.end(), f: AddRegOrImm);
1338 }
1339
1340 // Tell the call which registers are clobbered.
1341 const uint32_t *Mask = TRI->getCallPreservedMask(MF, CalleeCC);
1342 MIB.addRegMask(Mask);
1343
1344 // FPDiff is the byte offset of the call's argument area from the callee's.
1345 // Stores to callee stack arguments will be placed in FixedStackSlots offset
1346 // by this amount for a tail call. In a sibling call it must be 0 because the
1347 // caller will deallocate the entire stack and the callee still expects its
1348 // arguments to begin at SP+0.
1349 int FPDiff = 0;
1350
1351 // This will be 0 for sibcalls, potentially nonzero for tail calls produced
1352 // by -tailcallopt. For sibcalls, the memory operands for the call are
1353 // already available in the caller's incoming argument space.
1354 unsigned NumBytes = 0;
1355 if (!IsSibCall) {
1356 // We aren't sibcalling, so we need to compute FPDiff. We need to do this
1357 // before handling assignments, because FPDiff must be known for memory
1358 // arguments.
1359 unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
1360 SmallVector<CCValAssign, 16> OutLocs;
1361 CCState OutInfo(CalleeCC, false, MF, OutLocs, F.getContext());
1362
1363 // FIXME: Not accounting for callee implicit inputs
1364 OutgoingValueAssigner CalleeAssigner(AssignFnFixed, AssignFnVarArg);
1365 if (!determineAssignments(Assigner&: CalleeAssigner, Args&: OutArgs, CCInfo&: OutInfo))
1366 return false;
1367
1368 // The callee will pop the argument stack as a tail call. Thus, we must
1369 // keep it 16-byte aligned.
1370 NumBytes = alignTo(Size: OutInfo.getStackSize(), A: ST.getStackAlignment());
1371
1372 // FPDiff will be negative if this tail call requires more space than we
1373 // would automatically have in our incoming argument space. Positive if we
1374 // actually shrink the stack.
1375 FPDiff = NumReusableBytes - NumBytes;

    // The stack pointer must be 16-byte aligned at all times it's used for a
    // memory operation, which in practice means at *all* times and in
    // particular across call boundaries. Therefore our own arguments started at
    // a 16-byte aligned SP and the delta applied for the tail call should
    // satisfy the same constraint.
    assert(isAligned(ST.getStackAlignment(), FPDiff) &&
           "unaligned stack on tail call");
  }

  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(Info.CallConv, Info.IsVarArg, MF, ArgLocs, F.getContext());

  // We could pass MIB and directly add the implicit uses to the call
  // now. However, as an aesthetic choice, place implicit argument operands
  // after the ordinary user argument registers.
  SmallVector<std::pair<MCRegister, Register>, 12> ImplicitArgRegs;

  if (Info.CallConv != CallingConv::AMDGPU_Gfx &&
      Info.CallConv != CallingConv::AMDGPU_Gfx_WholeWave &&
      !AMDGPU::isChainCC(Info.CallConv)) {
    // With a fixed ABI, allocate fixed registers before user arguments.
    if (!passSpecialInputs(MIRBuilder, CCInfo, ImplicitArgRegs, Info))
      return false;
  }

  OutgoingValueAssigner Assigner(AssignFnFixed, AssignFnVarArg);

  if (!determineAssignments(Assigner, OutArgs, CCInfo))
    return false;

  // Do the actual argument marshalling.
  AMDGPUOutgoingArgHandler Handler(MIRBuilder, MRI, MIB, true, FPDiff);
  if (!handleAssignments(Handler, OutArgs, CCInfo, ArgLocs, MIRBuilder))
    return false;

  if (Info.ConvergenceCtrlToken) {
    MIB.addUse(Info.ConvergenceCtrlToken, RegState::Implicit);
  }
  handleImplicitCallArguments(MIRBuilder, MIB, ST, *FuncInfo, CalleeCC,
                              ImplicitArgRegs);

  // If we have -tailcallopt, we need to adjust the stack. We'll do the call
  // sequence start and end here.
  if (!IsSibCall) {
    MIB->getOperand(CalleeIdx + 1).setImm(FPDiff);
    CallSeqStart.addImm(NumBytes).addImm(0);
    // End the call sequence *before* emitting the call. Normally, we would
    // tidy the frame up after the call. However, here, we've laid out the
    // parameters so that when SP is reset, they will be in the correct
    // location.
    MIRBuilder.buildInstr(AMDGPU::ADJCALLSTACKDOWN).addImm(NumBytes).addImm(0);
  }

  // Now we can add the actual call instruction to the correct basic block.
  MIRBuilder.insertInstr(MIB);

  // If this is a whole wave tail call, we need to constrain the register for
  // the original EXEC.
  if (MIB->getOpcode() == AMDGPU::SI_TCRETURN_GFX_WholeWave) {
    MIB->getOperand(0).setReg(
        constrainOperandRegClass(MF, *TRI, MRI, *TII, *ST.getRegBankInfo(),
                                 *MIB, MIB->getDesc(), MIB->getOperand(0), 0));
  }

  // If the callee is a register, it is used by a target-specific instruction
  // and therefore must have a register class matching that instruction's
  // constraint.

  // FIXME: We should define regbankselectable call instructions to handle
  // divergent call targets.
  if (MIB->getOperand(CalleeIdx).isReg()) {
    MIB->getOperand(CalleeIdx).setReg(constrainOperandRegClass(
        MF, *TRI, MRI, *TII, *ST.getRegBankInfo(), *MIB, MIB->getDesc(),
        MIB->getOperand(CalleeIdx), CalleeIdx));
  }

  MF.getFrameInfo().setHasTailCall();
  Info.LoweredTailCall = true;
  return true;
}

/// Lower a call to the @llvm.amdgcn.cs.chain intrinsic.
bool AMDGPUCallLowering::lowerChainCall(MachineIRBuilder &MIRBuilder,
                                        CallLoweringInfo &Info) const {
  ArgInfo Callee = Info.OrigArgs[0];
  ArgInfo SGPRArgs = Info.OrigArgs[2];
  ArgInfo VGPRArgs = Info.OrigArgs[3];
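  // Only the callee (OrigArgs[0]) and the SGPR/VGPR argument structs
  // (OrigArgs[2] and OrigArgs[3]) feed the tail-call lowering below; the
  // intrinsic's other operands (e.g. the EXEC mask at index 1) are not
  // arguments of the callee and are not forwarded here.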

  MachineFunction &MF = MIRBuilder.getMF();
  const Function &F = MF.getFunction();
  const DataLayout &DL = F.getDataLayout();

  // The function to jump to is actually the first argument, so we'll change the
  // Callee and other info to match that before using our existing helper.
  const Value *CalleeV = Callee.OrigValue->stripPointerCasts();
  if (const Function *F = dyn_cast<Function>(CalleeV)) {
    Info.Callee = MachineOperand::CreateGA(F, 0);
    Info.CallConv = F->getCallingConv();
  } else {
    assert(Callee.Regs.size() == 1 && "Too many regs for the callee");
    Info.Callee = MachineOperand::CreateReg(Callee.Regs[0], false);
    Info.CallConv = CallingConv::AMDGPU_CS_Chain; // amdgpu_cs_chain_preserve
                                                  // behaves the same here.
  }

  // The function that we're calling cannot be vararg (only the intrinsic is).
  Info.IsVarArg = false;

  assert(
      all_of(SGPRArgs.Flags, [](ISD::ArgFlagsTy F) { return F.isInReg(); }) &&
      "SGPR arguments should be marked inreg");
  assert(
      none_of(VGPRArgs.Flags, [](ISD::ArgFlagsTy F) { return F.isInReg(); }) &&
      "VGPR arguments should not be marked inreg");

  SmallVector<ArgInfo, 8> OutArgs;
  splitToValueTypes(SGPRArgs, OutArgs, DL, Info.CallConv);
  splitToValueTypes(VGPRArgs, OutArgs, DL, Info.CallConv);

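  // Chain calls never return to the caller, so they are always lowered as
  // (must-)tail calls.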
  Info.IsMustTailCall = true;
  return lowerTailCall(MIRBuilder, Info, OutArgs);
}

bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
                                   CallLoweringInfo &Info) const {
  if (Function *F = Info.CB->getCalledFunction())
    if (F->isIntrinsic()) {
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_cs_chain:
        return lowerChainCall(MIRBuilder, Info);
      case Intrinsic::amdgcn_call_whole_wave:
        Info.CallConv = CallingConv::AMDGPU_Gfx_WholeWave;

        // Get the callee from the original instruction, so it doesn't look like
        // this is an indirect call.
        Info.Callee = MachineOperand::CreateGA(
            cast<GlobalValue>(Info.CB->getOperand(0)), /*Offset=*/0);
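        // OrigArgs[0] is the callee itself (already consumed above to form
        // Info.Callee), so drop it and keep only the actual call arguments.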
        Info.OrigArgs.erase(Info.OrigArgs.begin());
        Info.IsVarArg = false;
        break;
      default:
        llvm_unreachable("Unexpected intrinsic call");
      }
    }

  if (Info.IsVarArg) {
    LLVM_DEBUG(dbgs() << "Variadic functions not implemented\n");
    return false;
  }

  MachineFunction &MF = MIRBuilder.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  const Function &F = MF.getFunction();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const SITargetLowering &TLI = *getTLI<SITargetLowering>();
  const DataLayout &DL = F.getDataLayout();

  SmallVector<ArgInfo, 8> OutArgs;
  for (auto &OrigArg : Info.OrigArgs)
    splitToValueTypes(OrigArg, OutArgs, DL, Info.CallConv);

  SmallVector<ArgInfo, 8> InArgs;
  if (Info.CanLowerReturn && !Info.OrigRet.Ty->isVoidTy())
    splitToValueTypes(Info.OrigRet, InArgs, DL, Info.CallConv);

  // If we can lower as a tail call, do that instead.
  bool CanTailCallOpt =
      isEligibleForTailCallOptimization(MIRBuilder, Info, InArgs, OutArgs);

  // We must emit a tail call if we have musttail.
  if (Info.IsMustTailCall && !CanTailCallOpt) {
    LLVM_DEBUG(dbgs() << "Failed to lower musttail call as tail call\n");
    return false;
  }

  Info.IsTailCall = CanTailCallOpt;
  if (CanTailCallOpt)
    return lowerTailCall(MIRBuilder, Info, OutArgs);

  // Find out which ABI gets to decide where things go.
  CCAssignFn *AssignFnFixed;
  CCAssignFn *AssignFnVarArg;
  std::tie(AssignFnFixed, AssignFnVarArg) =
      getAssignFnsForCC(Info.CallConv, TLI);

  MIRBuilder.buildInstr(AMDGPU::ADJCALLSTACKUP)
    .addImm(0)
    .addImm(0);

  // Create a temporarily-floating call instruction so we can add the implicit
  // uses of arg registers.
  unsigned Opc = getCallOpcode(MF, Info.Callee.isReg(), false, ST.isWave32(),
                               Info.CallConv);

  auto MIB = MIRBuilder.buildInstrNoInsert(Opc);
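  // The call defines the return address register; the return PC is written
  // there as part of the call sequence.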
  MIB.addDef(TRI->getReturnAddressReg(MF));

  if (!Info.IsConvergent)
    MIB.setMIFlag(MachineInstr::NoConvergent);

  if (!addCallTargetOperands(MIB, MIRBuilder, Info))
    return false;

  // Tell the call which registers are clobbered.
  const uint32_t *Mask = TRI->getCallPreservedMask(MF, Info.CallConv);
  MIB.addRegMask(Mask);

  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(Info.CallConv, Info.IsVarArg, MF, ArgLocs, F.getContext());

  // We could pass MIB and directly add the implicit uses to the call
  // now. However, as an aesthetic choice, place implicit argument operands
  // after the ordinary user argument registers.
  SmallVector<std::pair<MCRegister, Register>, 12> ImplicitArgRegs;

  if (Info.CallConv != CallingConv::AMDGPU_Gfx &&
      Info.CallConv != CallingConv::AMDGPU_Gfx_WholeWave) {
    // With a fixed ABI, allocate fixed registers before user arguments.
    if (!passSpecialInputs(MIRBuilder, CCInfo, ImplicitArgRegs, Info))
      return false;
  }

  // Do the actual argument marshalling.
  OutgoingValueAssigner Assigner(AssignFnFixed, AssignFnVarArg);
  if (!determineAssignments(Assigner, OutArgs, CCInfo))
    return false;

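  // Unlike the tail-call path above, no FPDiff applies here; the trailing
  // 'false' is assumed to be the handler's tail-call flag (cf. the
  // 'true, FPDiff' arguments used in lowerTailCall).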
  AMDGPUOutgoingArgHandler Handler(MIRBuilder, MRI, MIB, false);
  if (!handleAssignments(Handler, OutArgs, CCInfo, ArgLocs, MIRBuilder))
    return false;

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  if (Info.ConvergenceCtrlToken) {
    MIB.addUse(Info.ConvergenceCtrlToken, RegState::Implicit);
  }
  handleImplicitCallArguments(MIRBuilder, MIB, ST, *MFI, Info.CallConv,
                              ImplicitArgRegs);

  // Get a count of how many bytes are to be pushed on the stack.
  unsigned NumBytes = CCInfo.getStackSize();

  // If the callee is a register, it is used by a target-specific instruction
  // and therefore must have a register class matching that instruction's
  // constraint.

  // FIXME: We should define regbankselectable call instructions to handle
  // divergent call targets.
  if (MIB->getOperand(1).isReg()) {
    MIB->getOperand(1).setReg(constrainOperandRegClass(
        MF, *TRI, MRI, *ST.getInstrInfo(),
        *ST.getRegBankInfo(), *MIB, MIB->getDesc(), MIB->getOperand(1),
        1));
  }

  // Now we can add the actual call instruction to the correct position.
  MIRBuilder.insertInstr(MIB);

  // Finally we can copy the returned value back into its virtual register. In
  // symmetry with the arguments, the physical register must be implicitly
  // defined by the call instruction.
  if (Info.CanLowerReturn && !Info.OrigRet.Ty->isVoidTy()) {
    CCAssignFn *RetAssignFn = TLI.CCAssignFnForReturn(Info.CallConv,
                                                      Info.IsVarArg);
    IncomingValueAssigner Assigner(RetAssignFn);
    CallReturnHandler Handler(MIRBuilder, MRI, MIB);
    if (!determineAndHandleAssignments(Handler, Assigner, InArgs, MIRBuilder,
                                       Info.CallConv, Info.IsVarArg))
      return false;
  }

  uint64_t CalleePopBytes = NumBytes;

  MIRBuilder.buildInstr(AMDGPU::ADJCALLSTACKDOWN)
    .addImm(0)
    .addImm(CalleePopBytes);

  if (!Info.CanLowerReturn) {
    insertSRetLoads(MIRBuilder, Info.OrigRet.Ty, Info.OrigRet.Regs,
                    Info.DemoteRegister, Info.DemoteStackIndex);
  }

  return true;
}

void AMDGPUCallLowering::addOriginalExecToReturn(
    MachineFunction &MF, MachineInstrBuilder &Ret) const {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const MachineInstr *Setup = TII->getWholeWaveFunctionSetup(MF);
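  // The setup instruction's first def is assumed to hold the original EXEC
  // mask saved on entry to the whole-wave function; forward it to the return
  // so it can be restored.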
  Ret.addReg(Setup->getOperand(0).getReg());
}