AMDGPUCallLowering.cpp source code [llvm_projects/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp]

1	//===-- llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp - Call lowering -----===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	///
9	/// \file
10	/// This file implements the lowering of LLVM calls to machine code calls for
11	/// GlobalISel.
12	///
13	//===----------------------------------------------------------------------===//
14
15	#include "AMDGPUCallLowering.h"
16	#include "AMDGPU.h"
17	#include "AMDGPULegalizerInfo.h"
18	#include "SIMachineFunctionInfo.h"
19	#include "SIRegisterInfo.h"
20	#include "llvm/CodeGen/Analysis.h"
21	#include "llvm/CodeGen/FunctionLoweringInfo.h"
22	#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
23	#include "llvm/CodeGen/MachineFrameInfo.h"
24	#include "llvm/IR/IntrinsicsAMDGPU.h"
25
26	#define DEBUG_TYPE "amdgpu-call-lowering"
27
28	using namespace llvm;
29
30	namespace {
31
32	/// Wrapper around extendRegister to ensure we extend to a full 32-bit register.
33	static Register extendRegisterMin32(CallLowering::ValueHandler &Handler,
34	Register ValVReg, const CCValAssign &VA) {
35	if (VA.getLocVT().getSizeInBits() < `32`) {
36	// 16-bit types are reported as legal for 32-bit registers. We need to
37	// extend and do a 32-bit copy to avoid the verifier complaining about it.
38	return Handler.MIRBuilder.buildAnyExt(Res: LLT::scalar(SizeInBits: `32`), Op: ValVReg).getReg(Idx: `0`);
39	}
40
41	return Handler.extendRegister(ValReg: ValVReg, VA);
42	}
43
44	struct AMDGPUOutgoingValueHandler : public CallLowering::OutgoingValueHandler {
45	AMDGPUOutgoingValueHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI,
46	MachineInstrBuilder MIB)
47	: OutgoingValueHandler (B, MRI), MIB (MIB) {}
48
49	MachineInstrBuilder MIB;
50
51	Register getStackAddress(uint64_t Size, int64_t Offset,
52	MachinePointerInfo &MPO,
53	ISD::ArgFlagsTy Flags) override {
54	llvm_unreachable("not implemented");
55	}
56
57	void assignValueToAddress(Register ValVReg, Register Addr, LLT MemTy,
58	const MachinePointerInfo &MPO,
59	const CCValAssign &VA) override {
60	llvm_unreachable("not implemented");
61	}
62
63	void assignValueToReg(Register ValVReg, Register PhysReg,
64	const CCValAssign &VA) override {
65	Register ExtReg = extendRegisterMin32(Handler&: *this, ValVReg, VA);
66
67	// If this is a scalar return, insert a readfirstlane just in case the value
68	// ends up in a VGPR.
69	// FIXME: Assert this is a shader return.
70	const SIRegisterInfo *TRI
71	= static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
72	if (TRI->isSGPRReg(MRI, Reg: PhysReg)) {
73	LLT Ty = MRI.getType(Reg: ExtReg);
74	LLT S32 = LLT::scalar(SizeInBits: `32`);
75	if (Ty != S32) {
76	// FIXME: We should probably support readfirstlane intrinsics with all
77	// legal 32-bit types.
78	assert(Ty.getSizeInBits() == `32`);
79	if (Ty.isPointer())
80	ExtReg = MIRBuilder.buildPtrToInt(Dst: S32, Src: ExtReg).getReg(Idx: `0`);
81	else
82	ExtReg = MIRBuilder.buildBitcast(Dst: S32, Src: ExtReg).getReg(Idx: `0`);
83	}
84
85	auto ToSGPR = MIRBuilder
86	.buildIntrinsic(ID: Intrinsic::amdgcn_readfirstlane,
87	Res: {MRI.getType(Reg: ExtReg)})
88	.addReg(RegNo: ExtReg);
89	ExtReg = ToSGPR.getReg(Idx: `0`);
90	}
91
92	MIRBuilder.buildCopy(Res: PhysReg, Op: ExtReg);
93	MIB.addUse(RegNo: PhysReg, Flags: RegState::Implicit);
94	}
95	};
96
97	struct AMDGPUIncomingArgHandler : public CallLowering::IncomingValueHandler {
98	uint64_t StackUsed = `0`;
99
100	AMDGPUIncomingArgHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI)
101	: IncomingValueHandler (B, MRI) {}
102
103	Register getStackAddress(uint64_t Size, int64_t Offset,
104	MachinePointerInfo &MPO,
105	ISD::ArgFlagsTy Flags) override {
106	auto &MFI = MIRBuilder.getMF().getFrameInfo();
107
108	// Byval is assumed to be writable memory, but other stack passed arguments
109	// are not.
110	const bool IsImmutable = !Flags.isByVal();
111	int FI = MFI.CreateFixedObject(Size, SPOffset: Offset, IsImmutable);
112	MPO = MachinePointerInfo::getFixedStack(MF&: MIRBuilder.getMF(), FI);
113	auto AddrReg = MIRBuilder.buildFrameIndex(
114	Res: LLT::pointer(AddressSpace: AMDGPUAS::PRIVATE_ADDRESS, SizeInBits: `32`), Idx: FI);
115	StackUsed = std::max(a: StackUsed, b: Size + Offset);
116	return AddrReg.getReg(Idx: `0`);
117	}
118
119	void assignValueToReg(Register ValVReg, Register PhysReg,
120	const CCValAssign &VA) override {
121	markPhysRegUsed(PhysReg);
122
123	if (VA.getLocVT().getSizeInBits() < `32`) {
124	// 16-bit types are reported as legal for 32-bit registers. We need to do
125	// a 32-bit copy, and truncate to avoid the verifier complaining about it.
126	auto Copy = MIRBuilder.buildCopy(Res: LLT::scalar(SizeInBits: `32`), Op: PhysReg);
127
128	// If we have signext/zeroext, it applies to the whole 32-bit register
129	// before truncation.
130	auto Extended =
131	buildExtensionHint(VA, SrcReg: Copy.getReg(Idx: `0`), NarrowTy: LLT (VA.getLocVT()));
132	MIRBuilder.buildTrunc(Res: ValVReg, Op: Extended);
133	return;
134	}
135
136	IncomingValueHandler::assignValueToReg(ValVReg, PhysReg, VA);
137	}
138
139	void assignValueToAddress(Register ValVReg, Register Addr, LLT MemTy,
140	const MachinePointerInfo &MPO,
141	const CCValAssign &VA) override {
142	MachineFunction &MF = MIRBuilder.getMF();
143
144	auto *MMO = MF.getMachineMemOperand(
145	PtrInfo: MPO, f: MachineMemOperand::MOLoad \| MachineMemOperand::MOInvariant, MemTy,
146	base_alignment: inferAlignFromPtrInfo(MF, MPO));
147	MIRBuilder.buildLoad(Res: ValVReg, Addr, MMO&: *MMO);
148	}
149
150	/// How the physical register gets marked varies between formal
151	/// parameters (it's a basic-block live-in), and a call instruction
152	/// (it's an implicit-def of the BL).
153	virtual void markPhysRegUsed(unsigned PhysReg) = `0`;
154	};
155
156	struct FormalArgHandler : public AMDGPUIncomingArgHandler {
157	FormalArgHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI)
158	: AMDGPUIncomingArgHandler (B, MRI) {}
159
160	void markPhysRegUsed(unsigned PhysReg) override {
161	MIRBuilder.getMBB().addLiveIn(PhysReg);
162	}
163	};
164
165	struct CallReturnHandler : public AMDGPUIncomingArgHandler {
166	CallReturnHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
167	MachineInstrBuilder MIB)
168	: AMDGPUIncomingArgHandler (MIRBuilder, MRI), MIB (MIB) {}
169
170	void markPhysRegUsed(unsigned PhysReg) override {
171	MIB.addDef(RegNo: PhysReg, Flags: RegState::Implicit);
172	}
173
174	MachineInstrBuilder MIB;
175	};
176
177	struct AMDGPUOutgoingArgHandler : public AMDGPUOutgoingValueHandler {
178	/// For tail calls, the byte offset of the call's argument area from the
179	/// callee's. Unused elsewhere.
180	int FPDiff;
181
182	// Cache the SP register vreg if we need it more than once in this call site.
183	Register SPReg;
184
185	bool IsTailCall;
186
187	AMDGPUOutgoingArgHandler(MachineIRBuilder &MIRBuilder,
188	MachineRegisterInfo &MRI, MachineInstrBuilder MIB,
189	bool IsTailCall = false, int FPDiff = `0`)
190	: AMDGPUOutgoingValueHandler (MIRBuilder, MRI, MIB), FPDiff(FPDiff),
191	IsTailCall(IsTailCall) {}
192
193	Register getStackAddress(uint64_t Size, int64_t Offset,
194	MachinePointerInfo &MPO,
195	ISD::ArgFlagsTy Flags) override {
196	MachineFunction &MF = MIRBuilder.getMF();
197	const LLT PtrTy = LLT::pointer(AddressSpace: AMDGPUAS::PRIVATE_ADDRESS, SizeInBits: `32`);
198	const LLT S32 = LLT::scalar(SizeInBits: `32`);
199
200	if (IsTailCall) {
201	Offset += FPDiff;
202	int FI = MF.getFrameInfo().CreateFixedObject(Size, SPOffset: Offset, IsImmutable: true);
203	auto FIReg = MIRBuilder.buildFrameIndex(Res: PtrTy, Idx: FI);
204	MPO = MachinePointerInfo::getFixedStack(MF, FI);
205	return FIReg.getReg(Idx: `0`);
206	}
207
208	const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
209
210	if (!SPReg) {
211	const GCNSubtarget &ST = MIRBuilder.getMF().getSubtarget<GCNSubtarget>();
212	if (ST.enableFlatScratch()) {
213	// The stack is accessed unswizzled, so we can use a regular copy.
214	SPReg = MIRBuilder.buildCopy(Res: PtrTy,
215	Op: MFI->getStackPtrOffsetReg()).getReg(Idx: `0`);
216	} else {
217	// The address we produce here, without knowing the use context, is going
218	// to be interpreted as a vector address, so we need to convert to a
219	// swizzled address.
220	SPReg = MIRBuilder.buildInstr(Opc: AMDGPU::G_AMDGPU_WAVE_ADDRESS, DstOps: {PtrTy},
221	SrcOps: {MFI->getStackPtrOffsetReg()}).getReg(Idx: `0`);
222	}
223	}
224
225	auto OffsetReg = MIRBuilder.buildConstant(Res: S32, Val: Offset);
226
227	auto AddrReg = MIRBuilder.buildPtrAdd(Res: PtrTy, Op0: SPReg, Op1: OffsetReg);
228	MPO = MachinePointerInfo::getStack(MF, Offset);
229	return AddrReg.getReg(Idx: `0`);
230	}
231
232	void assignValueToAddress(Register ValVReg, Register Addr, LLT MemTy,
233	const MachinePointerInfo &MPO,
234	const CCValAssign &VA) override {
235	MachineFunction &MF = MIRBuilder.getMF();
236	uint64_t LocMemOffset = VA.getLocMemOffset();
237	const auto &ST = MF.getSubtarget<GCNSubtarget>();
238
239	auto *MMO = MF.getMachineMemOperand(
240	PtrInfo: MPO, f: MachineMemOperand::MOStore, MemTy,
241	base_alignment: commonAlignment(A: ST.getStackAlignment(), Offset: LocMemOffset));
242	MIRBuilder.buildStore(Val: ValVReg, Addr, MMO&: *MMO);
243	}
244
245	void assignValueToAddress(const CallLowering::ArgInfo &Arg,
246	unsigned ValRegIndex, Register Addr, LLT MemTy,
247	const MachinePointerInfo &MPO,
248	const CCValAssign &VA) override {
249	Register ValVReg = VA.getLocInfo() != CCValAssign::LocInfo::FPExt
250	? extendRegister(ValReg: Arg.Regs [ValRegIndex], VA)
251	: Arg.Regs [ValRegIndex];
252	assignValueToAddress(ValVReg, Addr, MemTy, MPO, VA);
253	}
254	};
255	} // anonymous namespace
256
257	AMDGPUCallLowering::AMDGPUCallLowering(const AMDGPUTargetLowering &TLI)
258	: CallLowering (&TLI) {
259	}
260
261	// FIXME: Compatibility shim
262	static ISD::NodeType extOpcodeToISDExtOpcode(unsigned MIOpc) {
263	switch (MIOpc) {
264	case TargetOpcode::G_SEXT:
265	return ISD::SIGN_EXTEND;
266	case TargetOpcode::G_ZEXT:
267	return ISD::ZERO_EXTEND;
268	case TargetOpcode::G_ANYEXT:
269	return ISD::ANY_EXTEND;
270	default:
271	llvm_unreachable("not an extend opcode");
272	}
273	}
274
275	bool AMDGPUCallLowering::canLowerReturn(MachineFunction &MF,
276	CallingConv::ID CallConv,
277	SmallVectorImpl<BaseArgInfo> &Outs,
278	bool IsVarArg) const {
279	// For shaders. Vector types should be explicitly handled by CC.
280	if (AMDGPU::isEntryFunctionCC(CC: CallConv))
281	return true;
282
283	SmallVector<CCValAssign, `16`> ArgLocs;
284	const SITargetLowering &TLI = *getTLI<SITargetLowering>();
285	CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs,
286	MF.getFunction().getContext());
287
288	return checkReturn(CCInfo, Outs, Fn: TLI.CCAssignFnForReturn(CC: CallConv, IsVarArg));
289	}
290
291	/// Lower the return value for the already existing \p Ret. This assumes that
292	/// \p B's insertion point is correct.
293	bool AMDGPUCallLowering::lowerReturnVal(MachineIRBuilder &B,
294	const Value *Val, ArrayRef<Register> VRegs,
295	MachineInstrBuilder &Ret) const {
296	if (!Val)
297	return true;
298
299	auto &MF = B.getMF();
300	const auto &F = MF.getFunction();
301	const DataLayout &DL = MF.getDataLayout();
302	MachineRegisterInfo *MRI = B.getMRI();
303	LLVMContext &Ctx = F.getContext();
304
305	CallingConv::ID CC = F.getCallingConv();
306	const SITargetLowering &TLI = *getTLI<SITargetLowering>();
307
308	SmallVector<EVT, `8`> SplitEVTs;
309	ComputeValueVTs(TLI, DL, Ty: Val->getType(), ValueVTs&: SplitEVTs);
310	assert(VRegs.size() == SplitEVTs.size() &&
311	"For each split Type there should be exactly one VReg.");
312
313	SmallVector<ArgInfo, `8`> SplitRetInfos;
314
315	for (unsigned i = `0`; i < SplitEVTs.size(); ++i) {
316	EVT VT = SplitEVTs [i];
317	Register Reg = VRegs [i];
318	ArgInfo RetInfo(Reg, VT.getTypeForEVT(Context&: Ctx), `0`);
319	setArgFlags(Arg&: RetInfo, OpIdx: AttributeList::ReturnIndex, DL, FuncInfo: F);
320
321	if (VT.isScalarInteger()) {
322	unsigned ExtendOp = TargetOpcode::G_ANYEXT;
323	if (RetInfo.Flags [`0`].isSExt()) {
324	assert(RetInfo.Regs.size() == `1` && "expect only simple return values");
325	ExtendOp = TargetOpcode::G_SEXT;
326	} else if (RetInfo.Flags [`0`].isZExt()) {
327	assert(RetInfo.Regs.size() == `1` && "expect only simple return values");
328	ExtendOp = TargetOpcode::G_ZEXT;
329	}
330
331	EVT ExtVT = TLI.getTypeForExtReturn(Context&: Ctx, VT,
332	ExtendKind: extOpcodeToISDExtOpcode(MIOpc: ExtendOp));
333	if (ExtVT != VT) {
334	RetInfo.Ty = ExtVT.getTypeForEVT(Context&: Ctx);
335	LLT ExtTy = getLLTForType(Ty&: *RetInfo.Ty, DL);
336	Reg = B.buildInstr(Opc: ExtendOp, DstOps: {ExtTy}, SrcOps: {Reg}).getReg(Idx: `0`);
337	}
338	}
339
340	if (Reg != RetInfo.Regs [`0`]) {
341	RetInfo.Regs [`0`] = Reg;
342	// Reset the arg flags after modifying Reg.
343	setArgFlags(Arg&: RetInfo, OpIdx: AttributeList::ReturnIndex, DL, FuncInfo: F);
344	}
345
346	splitToValueTypes(OrigArgInfo: RetInfo, SplitArgs&: SplitRetInfos, DL, CallConv: CC);
347	}
348
349	CCAssignFn *AssignFn = TLI.CCAssignFnForReturn(CC, IsVarArg: F.isVarArg());
350
351	OutgoingValueAssigner Assigner(AssignFn);
352	AMDGPUOutgoingValueHandler RetHandler(B, *MRI, Ret);
353	return determineAndHandleAssignments(Handler&: RetHandler, Assigner, Args&: SplitRetInfos, MIRBuilder&: B,
354	CallConv: CC, IsVarArg: F.isVarArg());
355	}
356
357	bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &B, const Value *Val,
358	ArrayRef<Register> VRegs,
359	FunctionLoweringInfo &FLI) const {
360
361	MachineFunction &MF = B.getMF();
362	SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
363	MFI->setIfReturnsVoid(!Val);
364
365	assert(!Val == VRegs.empty() && "Return value without a vreg");
366
367	CallingConv::ID CC = B.getMF().getFunction().getCallingConv();
368	const bool IsShader = AMDGPU::isShader(CC);
369	const bool IsWaveEnd =
370	(IsShader && MFI->returnsVoid()) \|\| AMDGPU::isKernel(CC);
371	if (IsWaveEnd) {
372	B.buildInstr(Opcode: AMDGPU::S_ENDPGM)
373	.addImm(Val: `0`);
374	return true;
375	}
376
377	unsigned ReturnOpc =
378	IsShader ? AMDGPU::SI_RETURN_TO_EPILOG : AMDGPU::SI_RETURN;
379	auto Ret = B.buildInstrNoInsert(Opcode: ReturnOpc);
380
381	if (!FLI.CanLowerReturn)
382	insertSRetStores(MIRBuilder&: B, RetTy: Val->getType(), VRegs, DemoteReg: FLI.DemoteRegister);
383	else if (!lowerReturnVal(B, Val, VRegs, Ret))
384	return false;
385
386	// TODO: Handle CalleeSavedRegsViaCopy.
387
388	B.insertInstr(MIB: Ret);
389	return true;
390	}
391
392	void AMDGPUCallLowering::lowerParameterPtr(Register DstReg, MachineIRBuilder &B,
393	uint64_t Offset) const {
394	MachineFunction &MF = B.getMF();
395	const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
396	MachineRegisterInfo &MRI = MF.getRegInfo();
397	Register KernArgSegmentPtr =
398	MFI->getPreloadedReg(Value: AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
399	Register KernArgSegmentVReg = MRI.getLiveInVirtReg(PReg: KernArgSegmentPtr);
400
401	auto OffsetReg = B.buildConstant(Res: LLT::scalar(SizeInBits: `64`), Val: Offset);
402
403	B.buildPtrAdd(Res: DstReg, Op0: KernArgSegmentVReg, Op1: OffsetReg);
404	}
405
406	void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &B, ArgInfo &OrigArg,
407	uint64_t Offset,
408	Align Alignment) const {
409	MachineFunction &MF = B.getMF();
410	const Function &F = MF.getFunction();
411	const DataLayout &DL = F.getDataLayout();
412	MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
413
414	LLT PtrTy = LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: `64`);
415
416	SmallVector<ArgInfo, `32`> SplitArgs;
417	SmallVector<uint64_t> FieldOffsets;
418	splitToValueTypes(OrigArgInfo: OrigArg, SplitArgs, DL, CallConv: F.getCallingConv(), Offsets: &FieldOffsets);
419
420	unsigned Idx = `0`;
421	for (ArgInfo &SplitArg : SplitArgs) {
422	Register PtrReg = B.getMRI()->createGenericVirtualRegister(Ty: PtrTy);
423	lowerParameterPtr(DstReg: PtrReg, B, Offset: Offset + FieldOffsets [Idx]);
424
425	LLT ArgTy = getLLTForType(Ty&: *SplitArg.Ty, DL);
426	if (SplitArg.Flags [`0`].isPointer()) {
427	// Compensate for losing pointeriness in splitValueTypes.
428	LLT PtrTy = LLT::pointer(AddressSpace: SplitArg.Flags [`0`].getPointerAddrSpace(),
429	SizeInBits: ArgTy.getScalarSizeInBits());
430	ArgTy = ArgTy.isVector() ? LLT::vector(EC: ArgTy.getElementCount(), ScalarTy: PtrTy)
431	: PtrTy;
432	}
433
434	MachineMemOperand *MMO = MF.getMachineMemOperand(
435	PtrInfo,
436	f: MachineMemOperand::MOLoad \| MachineMemOperand::MODereferenceable \|
437	MachineMemOperand::MOInvariant,
438	MemTy: ArgTy, base_alignment: commonAlignment(A: Alignment, Offset: FieldOffsets [Idx]));
439
440	assert(SplitArg.Regs.size() == `1`);
441
442	B.buildLoad(Res: SplitArg.Regs [`0`], Addr: PtrReg, MMO&: *MMO);
443	++Idx;
444	}
445	}
446
447	// Allocate special inputs passed in user SGPRs.
448	static void allocateHSAUserSGPRs(CCState &CCInfo,
449	MachineIRBuilder &B,
450	MachineFunction &MF,
451	const SIRegisterInfo &TRI,
452	SIMachineFunctionInfo &Info) {
453	// FIXME: How should these inputs interact with inreg / custom SGPR inputs?
454	const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
455	if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
456	Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
457	MF.addLiveIn(PReg: PrivateSegmentBufferReg, RC: &AMDGPU::SGPR_128RegClass);
458	CCInfo.AllocateReg(Reg: PrivateSegmentBufferReg);
459	}
460
461	if (UserSGPRInfo.hasDispatchPtr()) {
462	Register DispatchPtrReg = Info.addDispatchPtr(TRI);
463	MF.addLiveIn(PReg: DispatchPtrReg, RC: &AMDGPU::SGPR_64RegClass);
464	CCInfo.AllocateReg(Reg: DispatchPtrReg);
465	}
466
467	if (UserSGPRInfo.hasQueuePtr()) {
468	Register QueuePtrReg = Info.addQueuePtr(TRI);
469	MF.addLiveIn(PReg: QueuePtrReg, RC: &AMDGPU::SGPR_64RegClass);
470	CCInfo.AllocateReg(Reg: QueuePtrReg);
471	}
472
473	if (UserSGPRInfo.hasKernargSegmentPtr()) {
474	MachineRegisterInfo &MRI = MF.getRegInfo();
475	Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
476	const LLT P4 = LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: `64`);
477	Register VReg = MRI.createGenericVirtualRegister(Ty: P4);
478	MRI.addLiveIn(Reg: InputPtrReg, vreg: VReg);
479	B.getMBB().addLiveIn(PhysReg: InputPtrReg);
480	B.buildCopy(Res: VReg, Op: InputPtrReg);
481	CCInfo.AllocateReg(Reg: InputPtrReg);
482	}
483
484	if (UserSGPRInfo.hasDispatchID()) {
485	Register DispatchIDReg = Info.addDispatchID(TRI);
486	MF.addLiveIn(PReg: DispatchIDReg, RC: &AMDGPU::SGPR_64RegClass);
487	CCInfo.AllocateReg(Reg: DispatchIDReg);
488	}
489
490	if (UserSGPRInfo.hasFlatScratchInit()) {
491	Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
492	MF.addLiveIn(PReg: FlatScratchInitReg, RC: &AMDGPU::SGPR_64RegClass);
493	CCInfo.AllocateReg(Reg: FlatScratchInitReg);
494	}
495
496	if (UserSGPRInfo.hasPrivateSegmentSize()) {
497	Register PrivateSegmentSizeReg = Info.addPrivateSegmentSize(TRI);
498	MF.addLiveIn(PReg: PrivateSegmentSizeReg, RC: &AMDGPU::SGPR_32RegClass);
499	CCInfo.AllocateReg(Reg: PrivateSegmentSizeReg);
500	}
501
502	// TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
503	// these from the dispatch pointer.
504	}
505
506	bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
507	MachineIRBuilder &B, const Function &F,
508	ArrayRef<ArrayRef<Register>> VRegs) const {
509	MachineFunction &MF = B.getMF();
510	const GCNSubtarget *Subtarget = &MF.getSubtarget<GCNSubtarget>();
511	MachineRegisterInfo &MRI = MF.getRegInfo();
512	SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
513	const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
514	const SITargetLowering &TLI = *getTLI<SITargetLowering>();
515	const DataLayout &DL = F.getDataLayout();
516
517	SmallVector<CCValAssign, `16`> ArgLocs;
518	CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext());
519
520	allocateHSAUserSGPRs(CCInfo, B, MF, TRI: TRI, Info&: Info);
521
522	unsigned i = `0`;
523	const Align KernArgBaseAlign(`16`);
524	const unsigned BaseOffset = Subtarget->getExplicitKernelArgOffset();
525	uint64_t ExplicitArgOffset = `0`;
526
527	// TODO: Align down to dword alignment and extract bits for extending loads.
528	for (auto &Arg : F.args()) {
529	// TODO: Add support for kernarg preload.
530	if (Arg.hasAttribute(Kind: "amdgpu-hidden-argument")) {
531	LLVM_DEBUG(dbgs() << "Preloading hidden arguments is not supported\n");
532	return false;
533	}
534
535	const bool IsByRef = Arg.hasByRefAttr();
536	Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
537	unsigned AllocSize = DL.getTypeAllocSize(Ty: ArgTy);
538	if (AllocSize == `0`)
539	continue;
540
541	MaybeAlign ParamAlign = IsByRef ? Arg.getParamAlign() : std::nullopt;
542	Align ABIAlign = DL.getValueOrABITypeAlignment(Alignment: ParamAlign, Ty: ArgTy);
543
544	uint64_t ArgOffset = alignTo(Size: ExplicitArgOffset, A: ABIAlign) + BaseOffset;
545	ExplicitArgOffset = alignTo(Size: ExplicitArgOffset, A: ABIAlign) + AllocSize;
546
547	if (Arg.use_empty()) {
548	++i;
549	continue;
550	}
551
552	Align Alignment = commonAlignment(A: KernArgBaseAlign, Offset: ArgOffset);
553
554	if (IsByRef) {
555	unsigned ByRefAS = cast<PointerType>(Val: Arg.getType())->getAddressSpace();
556
557	assert(VRegs[i].size() == `1` &&
558	"expected only one register for byval pointers");
559	if (ByRefAS == AMDGPUAS::CONSTANT_ADDRESS) {
560	lowerParameterPtr(DstReg: VRegs [i][`0`], B, Offset: ArgOffset);
561	} else {
562	const LLT ConstPtrTy = LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: `64`);
563	Register PtrReg = MRI.createGenericVirtualRegister(Ty: ConstPtrTy);
564	lowerParameterPtr(DstReg: PtrReg, B, Offset: ArgOffset);
565
566	B.buildAddrSpaceCast(Dst: VRegs [i][`0`], Src: PtrReg);
567	}
568	} else {
569	ArgInfo OrigArg(VRegs [i], Arg, i);
570	const unsigned OrigArgIdx = i + AttributeList::FirstArgIndex;
571	setArgFlags(Arg&: OrigArg, OpIdx: OrigArgIdx, DL, FuncInfo: F);
572	lowerParameter(B, OrigArg, Offset: ArgOffset, Alignment);
573	}
574
575	++i;
576	}
577
578	TLI.allocateSpecialEntryInputVGPRs(CCInfo, MF, TRI: TRI, Info&: Info);
579	TLI.allocateSystemSGPRs(CCInfo, MF, Info&: Info, CallConv: F.getCallingConv(), IsShader: false*);
580	return true;
581	}
582
583	bool AMDGPUCallLowering::lowerFormalArguments(
584	MachineIRBuilder &B, const Function &F, ArrayRef<ArrayRef<Register>> VRegs,
585	FunctionLoweringInfo &FLI) const {
586	CallingConv::ID CC = F.getCallingConv();
587
588	// The infrastructure for normal calling convention lowering is essentially
589	// useless for kernels. We want to avoid any kind of legalization or argument
590	// splitting.
591	if (CC == CallingConv::AMDGPU_KERNEL)
592	return lowerFormalArgumentsKernel(B, F, VRegs);
593
594	const bool IsGraphics = AMDGPU::isGraphics(CC);
595	const bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CC);
596
597	MachineFunction &MF = B.getMF();
598	MachineBasicBlock &MBB = B.getMBB();
599	MachineRegisterInfo &MRI = MF.getRegInfo();
600	SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
601	const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
602	const SIRegisterInfo *TRI = Subtarget.getRegisterInfo();
603	const DataLayout &DL = F.getDataLayout();
604
605	SmallVector<CCValAssign, `16`> ArgLocs;
606	CCState CCInfo(CC, F.isVarArg(), MF, ArgLocs, F.getContext());
607	const GCNUserSGPRUsageInfo &UserSGPRInfo = Info->getUserSGPRInfo();
608
609	if (UserSGPRInfo.hasImplicitBufferPtr()) {
610	Register ImplicitBufferPtrReg = Info->addImplicitBufferPtr(TRI: *TRI);
611	MF.addLiveIn(PReg: ImplicitBufferPtrReg, RC: &AMDGPU::SGPR_64RegClass);
612	CCInfo.AllocateReg(Reg: ImplicitBufferPtrReg);
613	}
614
615	// FIXME: This probably isn't defined for mesa
616	if (UserSGPRInfo.hasFlatScratchInit() && !Subtarget.isAmdPalOS()) {
617	Register FlatScratchInitReg = Info->addFlatScratchInit(TRI: *TRI);
618	MF.addLiveIn(PReg: FlatScratchInitReg, RC: &AMDGPU::SGPR_64RegClass);
619	CCInfo.AllocateReg(Reg: FlatScratchInitReg);
620	}
621
622	SmallVector<ArgInfo, `32`> SplitArgs;
623	unsigned Idx = `0`;
624	unsigned PSInputNum = `0`;
625
626	// Insert the hidden sret parameter if the return value won't fit in the
627	// return registers.
628	if (!FLI.CanLowerReturn)
629	insertSRetIncomingArgument(F, SplitArgs, DemoteReg&: FLI.DemoteRegister, MRI, DL);
630
631	for (auto &Arg : F.args()) {
632	if (DL.getTypeStoreSize(Ty: Arg.getType()) == `0`)
633	continue;
634
635	const bool InReg = Arg.hasAttribute(Kind: Attribute::InReg);
636
637	if (Arg.hasAttribute(Kind: Attribute::SwiftSelf) \|\|
638	Arg.hasAttribute(Kind: Attribute::SwiftError) \|\|
639	Arg.hasAttribute(Kind: Attribute::Nest))
640	return false;
641
642	if (CC == CallingConv::AMDGPU_PS && !InReg && PSInputNum <= `15`) {
643	const bool ArgUsed = !Arg.use_empty();
644	bool SkipArg = !ArgUsed && !Info->isPSInputAllocated(Index: PSInputNum);
645
646	if (!SkipArg) {
647	Info->markPSInputAllocated(Index: PSInputNum);
648	if (ArgUsed)
649	Info->markPSInputEnabled(Index: PSInputNum);
650	}
651
652	++PSInputNum;
653
654	if (SkipArg) {
655	for (Register R : VRegs [Idx])
656	B.buildUndef(Res: R);
657
658	++Idx;
659	continue;
660	}
661	}
662
663	ArgInfo OrigArg(VRegs [Idx], Arg, Idx);
664	const unsigned OrigArgIdx = Idx + AttributeList::FirstArgIndex;
665	setArgFlags(Arg&: OrigArg, OpIdx: OrigArgIdx, DL, FuncInfo: F);
666
667	splitToValueTypes(OrigArgInfo: OrigArg, SplitArgs, DL, CallConv: CC);
668	++Idx;
669	}
670
671	// At least one interpolation mode must be enabled or else the GPU will
672	// hang.
673	//
674	// Check PSInputAddr instead of PSInputEnable. The idea is that if the user
675	// set PSInputAddr, the user wants to enable some bits after the compilation
676	// based on run-time states. Since we can't know what the final PSInputEna
677	// will look like, so we shouldn't do anything here and the user should take
678	// responsibility for the correct programming.
679	//
680	// Otherwise, the following restrictions apply:
681	// - At least one of PERSP_ (0xF) or LINEAR_* (0x70) must be enabled.*
682	// - If POS_W_FLOAT (11) is enabled, at least one of PERSP_ must be*
683	// enabled too.
684	if (CC == CallingConv::AMDGPU_PS) {
685	if ((Info->getPSInputAddr() & `0x7F`) == `0` \|\|
686	((Info->getPSInputAddr() & `0xF`) == `0` &&
687	Info->isPSInputAllocated(Index: `11`))) {
688	CCInfo.AllocateReg(Reg: AMDGPU::VGPR0);
689	CCInfo.AllocateReg(Reg: AMDGPU::VGPR1);
690	Info->markPSInputAllocated(Index: `0`);
691	Info->markPSInputEnabled(Index: `0`);
692	}
693
694	if (Subtarget.isAmdPalOS()) {
695	// For isAmdPalOS, the user does not enable some bits after compilation
696	// based on run-time states; the register values being generated here are
697	// the final ones set in hardware. Therefore we need to apply the
698	// workaround to PSInputAddr and PSInputEnable together. (The case where
699	// a bit is set in PSInputAddr but not PSInputEnable is where the frontend
700	// set up an input arg for a particular interpolation mode, but nothing
701	// uses that input arg. Really we should have an earlier pass that removes
702	// such an arg.)
703	unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
704	if ((PsInputBits & `0x7F`) == `0` \|\|
705	((PsInputBits & `0xF`) == `0` &&
706	(PsInputBits >> `11` & `1`)))
707	Info->markPSInputEnabled(Index: llvm::countr_zero(Val: Info->getPSInputAddr()));
708	}
709	}
710
711	const SITargetLowering &TLI = *getTLI<SITargetLowering>();
712	CCAssignFn *AssignFn = TLI.CCAssignFnForCall(CC, IsVarArg: F.isVarArg());
713
714	if (!MBB.empty())
715	B.setInstr(*MBB.begin());
716
717	if (!IsEntryFunc && !IsGraphics) {
718	// For the fixed ABI, pass workitem IDs in the last argument register.
719	TLI.allocateSpecialInputVGPRsFixed(CCInfo, MF, TRI: TRI, Info&: Info);
720
721	if (!Subtarget.enableFlatScratch())
722	CCInfo.AllocateReg(Reg: Info->getScratchRSrcReg());
723	TLI.allocateSpecialInputSGPRs(CCInfo, MF, TRI: TRI, Info&: Info);
724	}
725
726	IncomingValueAssigner Assigner(AssignFn);
727	if (!determineAssignments(Assigner, Args&: SplitArgs, CCInfo))
728	return false;
729
730	FormalArgHandler Handler(B, MRI);
731	if (!handleAssignments(Handler, Args&: SplitArgs, CCState&: CCInfo, ArgLocs, MIRBuilder&: B))
732	return false;
733
734	uint64_t StackSize = Assigner.StackSize;
735
736	// Start adding system SGPRs.
737	if (IsEntryFunc)
738	TLI.allocateSystemSGPRs(CCInfo, MF, Info&: *Info, CallConv: CC, IsShader: IsGraphics);
739
740	// When we tail call, we need to check if the callee's arguments will fit on
741	// the caller's stack. So, whenever we lower formal arguments, we should keep
742	// track of this information, since we might lower a tail call in this
743	// function later.
744	Info->setBytesInStackArgArea(StackSize);
745
746	// Move back to the end of the basic block.
747	B.setMBB(MBB);
748
749	return true;
750	}
751
752	bool AMDGPUCallLowering::passSpecialInputs(MachineIRBuilder &MIRBuilder,
753	CCState &CCInfo,
754	SmallVectorImpl<std::pair<MCRegister, Register>> &ArgRegs,
755	CallLoweringInfo &Info) const {
756	MachineFunction &MF = MIRBuilder.getMF();
757
758	// If there's no call site, this doesn't correspond to a call from the IR and
759	// doesn't need implicit inputs.
760	if (!Info.CB)
761	return true;
762
763	const AMDGPUFunctionArgInfo *CalleeArgInfo
764	= &AMDGPUArgumentUsageInfo::FixedABIFunctionInfo;
765
766	const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
767	const AMDGPUFunctionArgInfo &CallerArgInfo = MFI->getArgInfo();
768
769
770	// TODO: Unify with private memory register handling. This is complicated by
771	// the fact that at least in kernels, the input argument is not necessarily
772	// in the same location as the input.
773	AMDGPUFunctionArgInfo::PreloadedValue InputRegs[] = {
774	AMDGPUFunctionArgInfo::DISPATCH_PTR,
775	AMDGPUFunctionArgInfo::QUEUE_PTR,
776	AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR,
777	AMDGPUFunctionArgInfo::DISPATCH_ID,
778	AMDGPUFunctionArgInfo::WORKGROUP_ID_X,
779	AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,
780	AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,
781	AMDGPUFunctionArgInfo::LDS_KERNEL_ID,
782	};
783
784	static constexpr StringLiteral ImplicitAttrNames[] = {
785	"amdgpu-no-dispatch-ptr",
786	"amdgpu-no-queue-ptr",
787	"amdgpu-no-implicitarg-ptr",
788	"amdgpu-no-dispatch-id",
789	"amdgpu-no-workgroup-id-x",
790	"amdgpu-no-workgroup-id-y",
791	"amdgpu-no-workgroup-id-z",
792	"amdgpu-no-lds-kernel-id",
793	};
794
795	MachineRegisterInfo &MRI = MF.getRegInfo();
796
797	const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
798	const AMDGPULegalizerInfo *LI
799	= static_cast<const AMDGPULegalizerInfo*>(ST.getLegalizerInfo());
800
801	unsigned I = `0`;
802	for (auto InputID : InputRegs) {
803	const ArgDescriptor *OutgoingArg;
804	const TargetRegisterClass *ArgRC;
805	LLT ArgTy;
806
807	// If the callee does not use the attribute value, skip copying the value.
808	if (Info.CB->hasFnAttr(Kind: ImplicitAttrNames[I++]))
809	continue;
810
811	std::tie(args&: OutgoingArg, args&: ArgRC, args&: ArgTy) =
812	CalleeArgInfo->getPreloadedValue(Value: InputID);
813	if (!OutgoingArg)
814	continue;
815
816	const ArgDescriptor *IncomingArg;
817	const TargetRegisterClass *IncomingArgRC;
818	std::tie(args&: IncomingArg, args&: IncomingArgRC, args&: ArgTy) =
819	CallerArgInfo.getPreloadedValue(Value: InputID);
820	assert(IncomingArgRC == ArgRC);
821
822	Register InputReg = MRI.createGenericVirtualRegister(Ty: ArgTy);
823
824	if (IncomingArg) {
825	LI->buildLoadInputValue(DstReg: InputReg, B&: MIRBuilder, Arg: IncomingArg, ArgRC, ArgTy);
826	} else if (InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) {
827	LI->getImplicitArgPtr(DstReg: InputReg, MRI, B&: MIRBuilder);
828	} else if (InputID == AMDGPUFunctionArgInfo::LDS_KERNEL_ID) {
829	std::optional<uint32_t> Id =
830	AMDGPUMachineFunction::getLDSKernelIdMetadata(F: MF.getFunction());
831	if (Id) {
832	MIRBuilder.buildConstant(Res: InputReg, Val: *Id);
833	} else {
834	MIRBuilder.buildUndef(Res: InputReg);
835	}
836	} else {
837	// We may have proven the input wasn't needed, although the ABI is
838	// requiring it. We just need to allocate the register appropriately.
839	MIRBuilder.buildUndef(Res: InputReg);
840	}
841
842	if (OutgoingArg->isRegister()) {
843	ArgRegs.emplace_back(Args: OutgoingArg->getRegister(), Args&: InputReg);
844	if (!CCInfo.AllocateReg(Reg: OutgoingArg->getRegister()))
845	report_fatal_error(reason: "failed to allocate implicit input argument");
846	} else {
847	LLVM_DEBUG(dbgs() << "Unhandled stack passed implicit input argument\n");
848	return false;
849	}
850	}
851
852	// Pack workitem IDs into a single register or pass it as is if already
853	// packed.
854	const ArgDescriptor *OutgoingArg;
855	const TargetRegisterClass *ArgRC;
856	LLT ArgTy;
857
858	std::tie(args&: OutgoingArg, args&: ArgRC, args&: ArgTy) =
859	CalleeArgInfo->getPreloadedValue(Value: AMDGPUFunctionArgInfo::WORKITEM_ID_X);
860	if (!OutgoingArg)
861	std::tie(args&: OutgoingArg, args&: ArgRC, args&: ArgTy) =
862	CalleeArgInfo->getPreloadedValue(Value: AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
863	if (!OutgoingArg)
864	std::tie(args&: OutgoingArg, args&: ArgRC, args&: ArgTy) =
865	CalleeArgInfo->getPreloadedValue(Value: AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
866	if (!OutgoingArg)
867	return false;
868
869	auto WorkitemIDX =
870	CallerArgInfo.getPreloadedValue(Value: AMDGPUFunctionArgInfo::WORKITEM_ID_X);
871	auto WorkitemIDY =
872	CallerArgInfo.getPreloadedValue(Value: AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
873	auto WorkitemIDZ =
874	CallerArgInfo.getPreloadedValue(Value: AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
875
876	const ArgDescriptor *IncomingArgX = std::get<`0`>(t&: WorkitemIDX);
877	const ArgDescriptor *IncomingArgY = std::get<`0`>(t&: WorkitemIDY);
878	const ArgDescriptor *IncomingArgZ = std::get<`0`>(t&: WorkitemIDZ);
879	const LLT S32 = LLT::scalar(SizeInBits: `32`);
880
881	const bool NeedWorkItemIDX = !Info.CB->hasFnAttr(Kind: "amdgpu-no-workitem-id-x");
882	const bool NeedWorkItemIDY = !Info.CB->hasFnAttr(Kind: "amdgpu-no-workitem-id-y");
883	const bool NeedWorkItemIDZ = !Info.CB->hasFnAttr(Kind: "amdgpu-no-workitem-id-z");
884
885	// If incoming ids are not packed we need to pack them.
886	// FIXME: Should consider known workgroup size to eliminate known 0 cases.
887	Register InputReg;
888	if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX &&
889	NeedWorkItemIDX) {
890	if (ST.getMaxWorkitemID(Kernel: MF.getFunction(), Dimension: `0`) != `0`) {
891	InputReg = MRI.createGenericVirtualRegister(Ty: S32);
892	LI->buildLoadInputValue(DstReg: InputReg, B&: MIRBuilder, Arg: IncomingArgX,
893	ArgRC: std::get<`1`>(t&: WorkitemIDX),
894	ArgTy: std::get<`2`>(t&: WorkitemIDX));
895	} else {
896	InputReg = MIRBuilder.buildConstant(Res: S32, Val: `0`).getReg(Idx: `0`);
897	}
898	}
899
900	if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY &&
901	NeedWorkItemIDY && ST.getMaxWorkitemID(Kernel: MF.getFunction(), Dimension: `1`) != `0`) {
902	Register Y = MRI.createGenericVirtualRegister(Ty: S32);
903	LI->buildLoadInputValue(DstReg: Y, B&: MIRBuilder, Arg: IncomingArgY,
904	ArgRC: std::get<`1`>(t&: WorkitemIDY), ArgTy: std::get<`2`>(t&: WorkitemIDY));
905
906	Y = MIRBuilder.buildShl(Dst: S32, Src0: Y, Src1: MIRBuilder.buildConstant(Res: S32, Val: `10`)).getReg(Idx: `0`);
907	InputReg = InputReg ? MIRBuilder.buildOr(Dst: S32, Src0: InputReg, Src1: Y).getReg(Idx: `0`) : Y;
908	}
909
910	if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ &&
911	NeedWorkItemIDZ && ST.getMaxWorkitemID(Kernel: MF.getFunction(), Dimension: `2`) != `0`) {
912	Register Z = MRI.createGenericVirtualRegister(Ty: S32);
913	LI->buildLoadInputValue(DstReg: Z, B&: MIRBuilder, Arg: IncomingArgZ,
914	ArgRC: std::get<`1`>(t&: WorkitemIDZ), ArgTy: std::get<`2`>(t&: WorkitemIDZ));
915
916	Z = MIRBuilder.buildShl(Dst: S32, Src0: Z, Src1: MIRBuilder.buildConstant(Res: S32, Val: `20`)).getReg(Idx: `0`);
917	InputReg = InputReg ? MIRBuilder.buildOr(Dst: S32, Src0: InputReg, Src1: Z).getReg(Idx: `0`) : Z;
918	}
919
920	if (!InputReg &&
921	(NeedWorkItemIDX \|\| NeedWorkItemIDY \|\| NeedWorkItemIDZ)) {
922	InputReg = MRI.createGenericVirtualRegister(Ty: S32);
923	if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
924	// We're in a situation where the outgoing function requires the workitem
925	// ID, but the calling function does not have it (e.g a graphics function
926	// calling a C calling convention function). This is illegal, but we need
927	// to produce something.
928	MIRBuilder.buildUndef(Res: InputReg);
929	} else {
930	// Workitem ids are already packed, any of present incoming arguments will
931	// carry all required fields.
932	ArgDescriptor IncomingArg = ArgDescriptor::createArg(
933	Arg: IncomingArgX ? *IncomingArgX :
934	IncomingArgY ? IncomingArgY : IncomingArgZ, Mask: ~`0u`);
935	LI->buildLoadInputValue(DstReg: InputReg, B&: MIRBuilder, Arg: &IncomingArg,
936	ArgRC: &AMDGPU::VGPR_32RegClass, ArgTy: S32);
937	}
938	}
939
940	if (OutgoingArg->isRegister()) {
941	if (InputReg)
942	ArgRegs.emplace_back(Args: OutgoingArg->getRegister(), Args&: InputReg);
943
944	if (!CCInfo.AllocateReg(Reg: OutgoingArg->getRegister()))
945	report_fatal_error(reason: "failed to allocate implicit input argument");
946	} else {
947	LLVM_DEBUG(dbgs() << "Unhandled stack passed implicit input argument\n");
948	return false;
949	}
950
951	return true;
952	}
953
954	/// Returns a pair containing the fixed CCAssignFn and the vararg CCAssignFn for
955	/// CC.
956	static std::pair<CCAssignFn , CCAssignFn >
957	getAssignFnsForCC(CallingConv::ID CC, const SITargetLowering &TLI) {
958	return {TLI.CCAssignFnForCall(CC, IsVarArg: false), TLI.CCAssignFnForCall(CC, IsVarArg: true)};
959	}
960
961	static unsigned getCallOpcode(const MachineFunction &CallerF, bool IsIndirect,
962	bool IsTailCall, bool IsWave32,
963	CallingConv::ID CC,
964	bool IsDynamicVGPRChainCall = false) {
965	// For calls to amdgpu_cs_chain functions, the address is known to be uniform.
966	assert((AMDGPU::isChainCC(CC) \|\| !IsIndirect \|\| !IsTailCall) &&
967	"Indirect calls can't be tail calls, "
968	"because the address can be divergent");
969	if (!IsTailCall)
970	return AMDGPU::G_SI_CALL;
971
972	if (AMDGPU::isChainCC(CC)) {
973	if (IsDynamicVGPRChainCall)
974	return IsWave32 ? AMDGPU::SI_CS_CHAIN_TC_W32_DVGPR
975	: AMDGPU::SI_CS_CHAIN_TC_W64_DVGPR;
976	return IsWave32 ? AMDGPU::SI_CS_CHAIN_TC_W32 : AMDGPU::SI_CS_CHAIN_TC_W64;
977	}
978
979	return CC == CallingConv::AMDGPU_Gfx ? AMDGPU::SI_TCRETURN_GFX :
980	AMDGPU::SI_TCRETURN;
981	}
982
983	// Add operands to call instruction to track the callee.
984	static bool addCallTargetOperands(MachineInstrBuilder &CallInst,
985	MachineIRBuilder &MIRBuilder,
986	AMDGPUCallLowering::CallLoweringInfo &Info,
987	bool IsDynamicVGPRChainCall = false) {
988	if (Info.Callee.isReg()) {
989	CallInst.addReg(RegNo: Info.Callee.getReg());
990	CallInst.addImm(Val: `0`);
991	} else if (Info.Callee.isGlobal() && Info.Callee.getOffset() == `0`) {
992	// The call lowering lightly assumed we can directly encode a call target in
993	// the instruction, which is not the case. Materialize the address here.
994	const GlobalValue *GV = Info.Callee.getGlobal();
995	auto Ptr = MIRBuilder.buildGlobalValue(
996	Res: LLT::pointer(AddressSpace: GV->getAddressSpace(), SizeInBits: `64`), GV);
997	CallInst.addReg(RegNo: Ptr.getReg(Idx: `0`));
998
999	if (IsDynamicVGPRChainCall) {
1000	// DynamicVGPR chain calls are always indirect.
1001	CallInst.addImm(Val: `0`);
1002	} else
1003	CallInst.add(MO: Info.Callee);
1004	} else
1005	return false;
1006
1007	return true;
1008	}
1009
1010	bool AMDGPUCallLowering::doCallerAndCalleePassArgsTheSameWay(
1011	CallLoweringInfo &Info, MachineFunction &MF,
1012	SmallVectorImpl<ArgInfo> &InArgs) const {
1013	const Function &CallerF = MF.getFunction();
1014	CallingConv::ID CalleeCC = Info.CallConv;
1015	CallingConv::ID CallerCC = CallerF.getCallingConv();
1016
1017	// If the calling conventions match, then everything must be the same.
1018	if (CalleeCC == CallerCC)
1019	return true;
1020
1021	const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1022
1023	// Make sure that the caller and callee preserve all of the same registers.
1024	const auto *TRI = ST.getRegisterInfo();
1025
1026	const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
1027	const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
1028	if (!TRI->regmaskSubsetEqual(mask0: CallerPreserved, mask1: CalleePreserved))
1029	return false;
1030
1031	// Check if the caller and callee will handle arguments in the same way.
1032	const SITargetLowering &TLI = *getTLI<SITargetLowering>();
1033	CCAssignFn *CalleeAssignFnFixed;
1034	CCAssignFn *CalleeAssignFnVarArg;
1035	std::tie(args&: CalleeAssignFnFixed, args&: CalleeAssignFnVarArg) =
1036	getAssignFnsForCC(CC: CalleeCC, TLI);
1037
1038	CCAssignFn *CallerAssignFnFixed;
1039	CCAssignFn *CallerAssignFnVarArg;
1040	std::tie(args&: CallerAssignFnFixed, args&: CallerAssignFnVarArg) =
1041	getAssignFnsForCC(CC: CallerCC, TLI);
1042
1043	// FIXME: We are not accounting for potential differences in implicitly passed
1044	// inputs, but only the fixed ABI is supported now anyway.
1045	IncomingValueAssigner CalleeAssigner(CalleeAssignFnFixed,
1046	CalleeAssignFnVarArg);
1047	IncomingValueAssigner CallerAssigner(CallerAssignFnFixed,
1048	CallerAssignFnVarArg);
1049	return resultsCompatible(Info, MF, InArgs, CalleeAssigner, CallerAssigner);
1050	}
1051
1052	bool AMDGPUCallLowering::areCalleeOutgoingArgsTailCallable(
1053	CallLoweringInfo &Info, MachineFunction &MF,
1054	SmallVectorImpl<ArgInfo> &OutArgs) const {
1055	// If there are no outgoing arguments, then we are done.
1056	if (OutArgs.empty())
1057	return true;
1058
1059	const Function &CallerF = MF.getFunction();
1060	CallingConv::ID CalleeCC = Info.CallConv;
1061	CallingConv::ID CallerCC = CallerF.getCallingConv();
1062	const SITargetLowering &TLI = *getTLI<SITargetLowering>();
1063
1064	CCAssignFn *AssignFnFixed;
1065	CCAssignFn *AssignFnVarArg;
1066	std::tie(args&: AssignFnFixed, args&: AssignFnVarArg) = getAssignFnsForCC(CC: CalleeCC, TLI);
1067
1068	// We have outgoing arguments. Make sure that we can tail call with them.
1069	SmallVector<CCValAssign, `16`> OutLocs;
1070	CCState OutInfo(CalleeCC, false, MF, OutLocs, CallerF.getContext());
1071	OutgoingValueAssigner Assigner(AssignFnFixed, AssignFnVarArg);
1072
1073	if (!determineAssignments(Assigner, Args&: OutArgs, CCInfo&: OutInfo)) {
1074	LLVM_DEBUG(dbgs() << "... Could not analyze call operands.\n");
1075	return false;
1076	}
1077
1078	// Make sure that they can fit on the caller's stack.
1079	const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1080	if (OutInfo.getStackSize() > FuncInfo->getBytesInStackArgArea()) {
1081	LLVM_DEBUG(dbgs() << "... Cannot fit call operands on caller's stack.\n");
1082	return false;
1083	}
1084
1085	// Verify that the parameters in callee-saved registers match.
1086	const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1087	const SIRegisterInfo *TRI = ST.getRegisterInfo();
1088	const uint32_t *CallerPreservedMask = TRI->getCallPreservedMask(MF, CallerCC);
1089	MachineRegisterInfo &MRI = MF.getRegInfo();
1090	return parametersInCSRMatch(MRI, CallerPreservedMask, ArgLocs: OutLocs, OutVals: OutArgs);
1091	}
1092
1093	bool AMDGPUCallLowering::isEligibleForTailCallOptimization(
1094	MachineIRBuilder &B, CallLoweringInfo &Info,
1095	SmallVectorImpl<ArgInfo> &InArgs, SmallVectorImpl<ArgInfo> &OutArgs) const {
1096	// Must pass all target-independent checks in order to tail call optimize.
1097	if (!Info.IsTailCall)
1098	return false;
1099
1100	// Indirect calls can't be tail calls, because the address can be divergent.
1101	// TODO Check divergence info if the call really is divergent.
1102	if (Info.Callee.isReg())
1103	return false;
1104
1105	MachineFunction &MF = B.getMF();
1106	const Function &CallerF = MF.getFunction();
1107	CallingConv::ID CalleeCC = Info.CallConv;
1108	CallingConv::ID CallerCC = CallerF.getCallingConv();
1109
1110	const SIRegisterInfo *TRI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
1111	const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
1112	// Kernels aren't callable, and don't have a live in return address so it
1113	// doesn't make sense to do a tail call with entry functions.
1114	if (!CallerPreserved)
1115	return false;
1116
1117	if (!AMDGPU::mayTailCallThisCC(CC: CalleeCC)) {
1118	LLVM_DEBUG(dbgs() << "... Calling convention cannot be tail called.\n");
1119	return false;
1120	}
1121
1122	if (any_of(Range: CallerF.args(), P: [](const Argument &A) {
1123	return A.hasByValAttr() \|\| A.hasSwiftErrorAttr();
1124	})) {
1125	LLVM_DEBUG(dbgs() << "... Cannot tail call from callers with byval "
1126	"or swifterror arguments\n");
1127	return false;
1128	}
1129
1130	// If we have -tailcallopt, then we're done.
1131	if (MF.getTarget().Options.GuaranteedTailCallOpt) {
1132	return AMDGPU::canGuaranteeTCO(CC: CalleeCC) &&
1133	CalleeCC == CallerF.getCallingConv();
1134	}
1135
1136	// Verify that the incoming and outgoing arguments from the callee are
1137	// safe to tail call.
1138	if (!doCallerAndCalleePassArgsTheSameWay(Info, MF, InArgs)) {
1139	LLVM_DEBUG(
1140	dbgs()
1141	<< "... Caller and callee have incompatible calling conventions.\n");
1142	return false;
1143	}
1144
1145	// FIXME: We need to check if any arguments passed in SGPR are uniform. If
1146	// they are not, this cannot be a tail call. If they are uniform, but may be
1147	// VGPR, we need to insert readfirstlanes.
1148	if (!areCalleeOutgoingArgsTailCallable(Info, MF, OutArgs))
1149	return false;
1150
1151	LLVM_DEBUG(dbgs() << "... Call is eligible for tail call optimization.\n");
1152	return true;
1153	}
1154
1155	// Insert outgoing implicit arguments for a call, by inserting copies to the
1156	// implicit argument registers and adding the necessary implicit uses to the
1157	// call instruction.
1158	void AMDGPUCallLowering::handleImplicitCallArguments(
1159	MachineIRBuilder &MIRBuilder, MachineInstrBuilder &CallInst,
1160	const GCNSubtarget &ST, const SIMachineFunctionInfo &FuncInfo,
1161	CallingConv::ID CalleeCC,
1162	ArrayRef<std::pair<MCRegister, Register>> ImplicitArgRegs) const {
1163	if (!ST.enableFlatScratch()) {
1164	// Insert copies for the SRD. In the HSA case, this should be an identity
1165	// copy.
1166	auto ScratchRSrcReg = MIRBuilder.buildCopy(Res: LLT::fixed_vector(NumElements: `4`, ScalarSizeInBits: `32`),
1167	Op: FuncInfo.getScratchRSrcReg());
1168
1169	auto CalleeRSrcReg = AMDGPU::isChainCC(CC: CalleeCC)
1170	? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
1171	: AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3;
1172
1173	MIRBuilder.buildCopy(Res: CalleeRSrcReg, Op: ScratchRSrcReg);
1174	CallInst.addReg(RegNo: CalleeRSrcReg, flags: RegState::Implicit);
1175	}
1176
1177	for (std::pair<MCRegister, Register> ArgReg : ImplicitArgRegs) {
1178	MIRBuilder.buildCopy(Res: (Register)ArgReg.first, Op: ArgReg.second);
1179	CallInst.addReg(RegNo: ArgReg.first, flags: RegState::Implicit);
1180	}
1181	}
1182
namespace {
/// Positions of the special arguments of a chain call. These have the same
/// index as they do in the llvm.amdgcn.cs.chain intrinsic, so they can be used
/// directly to index CallLoweringInfo::OrigArgs.
///
/// This is deliberately an unscoped enum: the enumerators are used as plain
/// integer indices and iterator offsets.
enum ChainCallArgIdx {
  /// EXEC mask to pass to the callee.
  Exec = 1,
  /// Constant flags word; 0 means no additional arguments follow, bit 0
  /// selects the dynamic VGPR form with three additional arguments.
  Flags = 4,
  /// First of the additional arguments present only with dynamic VGPRs.
  NumVGPRs = 5,
  /// Fallback EXEC mask (dynamic VGPR form only).
  FallbackExec = 6,
  /// Fallback callee (dynamic VGPR form only).
  FallbackCallee = 7,
};
} // anonymous namespace
1194
1195	bool AMDGPUCallLowering::lowerTailCall(
1196	MachineIRBuilder &MIRBuilder, CallLoweringInfo &Info,
1197	SmallVectorImpl<ArgInfo> &OutArgs) const {
1198	MachineFunction &MF = MIRBuilder.getMF();
1199	const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1200	SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1201	const Function &F = MF.getFunction();
1202	MachineRegisterInfo &MRI = MF.getRegInfo();
1203	const SIInstrInfo *TII = ST.getInstrInfo();
1204	const SIRegisterInfo *TRI = ST.getRegisterInfo();
1205	const SITargetLowering &TLI = *getTLI<SITargetLowering>();
1206
1207	// True when we're tail calling, but without -tailcallopt.
1208	bool IsSibCall = !MF.getTarget().Options.GuaranteedTailCallOpt;
1209
1210	// Find out which ABI gets to decide where things go.
1211	CallingConv::ID CalleeCC = Info.CallConv;
1212	CCAssignFn *AssignFnFixed;
1213	CCAssignFn *AssignFnVarArg;
1214	std::tie(args&: AssignFnFixed, args&: AssignFnVarArg) = getAssignFnsForCC(CC: CalleeCC, TLI);
1215
1216	MachineInstrBuilder CallSeqStart;
1217	if (!IsSibCall)
1218	CallSeqStart = MIRBuilder.buildInstr(Opcode: AMDGPU::ADJCALLSTACKUP);
1219
1220	bool IsChainCall = AMDGPU::isChainCC(CC: Info.CallConv);
1221	bool IsDynamicVGPRChainCall = false;
1222
1223	if (IsChainCall) {
1224	ArgInfo FlagsArg = Info.OrigArgs [ChainCallArgIdx::Flags];
1225	const APInt &FlagsValue = cast<ConstantInt>(Val: FlagsArg.OrigValue)->getValue();
1226	if (FlagsValue.isZero()) {
1227	if (Info.OrigArgs.size() != `5`) {
1228	LLVM_DEBUG(dbgs() << "No additional args allowed if flags == 0\n");
1229	return false;
1230	}
1231	} else if (FlagsValue.isOneBitSet(BitNo: `0`)) {
1232	IsDynamicVGPRChainCall = true;
1233
1234	if (Info.OrigArgs.size() != `8`) {
1235	LLVM_DEBUG(dbgs() << "Expected 3 additional args\n");
1236	return false;
1237	}
1238
1239	// On GFX12, we can only change the VGPR allocation for wave32.
1240	if (!ST.isWave32()) {
1241	F.getContext().diagnose(DI: DiagnosticInfoUnsupported (
1242	F, "dynamic VGPR mode is only supported for wave32"));
1243	return false;
1244	}
1245
1246	ArgInfo FallbackExecArg = Info.OrigArgs [ChainCallArgIdx::FallbackExec];
1247	assert(FallbackExecArg.Regs.size() == `1` &&
1248	"Expected single register for fallback EXEC");
1249	if (!FallbackExecArg.Ty->isIntegerTy(Bitwidth: ST.getWavefrontSize())) {
1250	LLVM_DEBUG(dbgs() << "Bad type for fallback EXEC\n");
1251	return false;
1252	}
1253	}
1254	}
1255
1256	unsigned Opc = getCallOpcode(CallerF: MF, IsIndirect: Info.Callee.isReg(), /IsTailCall/ true,
1257	IsWave32: ST.isWave32(), CC: CalleeCC, IsDynamicVGPRChainCall);
1258	auto MIB = MIRBuilder.buildInstrNoInsert(Opcode: Opc);
1259	if (!addCallTargetOperands(CallInst&: MIB, MIRBuilder, Info, IsDynamicVGPRChainCall))
1260	return false;
1261
1262	// Byte offset for the tail call. When we are sibcalling, this will always
1263	// be 0.
1264	MIB.addImm(Val: `0`);
1265
1266	// If this is a chain call, we need to pass in the EXEC mask as well as any
1267	// other special args.
1268	if (IsChainCall) {
1269	auto AddRegOrImm = [&](const ArgInfo &Arg) {
1270	if (auto CI = dyn_cast<ConstantInt>(Val: Arg.OrigValue)) {
1271	MIB.addImm(Val: CI->getSExtValue());
1272	} else {
1273	MIB.addReg(RegNo: Arg.Regs [`0`]);
1274	unsigned Idx = MIB ->getNumOperands() - `1`;
1275	MIB ->getOperand(i: Idx).setReg(constrainOperandRegClass(
1276	MF, TRI: TRI, MRI, TII: TII, RBI: ST.getRegBankInfo(), InsertPt&: MIB, II: MIB ->getDesc(),
1277	RegMO&: MIB ->getOperand(i: Idx), OpIdx: Idx));
1278	}
1279	};
1280
1281	ArgInfo ExecArg = Info.OrigArgs [ChainCallArgIdx::Exec];
1282	assert(ExecArg.Regs.size() == `1` && "Too many regs for EXEC");
1283
1284	if (!ExecArg.Ty->isIntegerTy(Bitwidth: ST.getWavefrontSize())) {
1285	LLVM_DEBUG(dbgs() << "Bad type for EXEC");
1286	return false;
1287	}
1288
1289	AddRegOrImm (ExecArg);
1290	if (IsDynamicVGPRChainCall)
1291	std::for_each(first: Info.OrigArgs.begin() + ChainCallArgIdx::NumVGPRs,
1292	last: Info.OrigArgs.end(), f: AddRegOrImm);
1293	}
1294
1295	// Tell the call which registers are clobbered.
1296	const uint32_t *Mask = TRI->getCallPreservedMask(MF, CalleeCC);
1297	MIB.addRegMask(Mask);
1298
1299	// FPDiff is the byte offset of the call's argument area from the callee's.
1300	// Stores to callee stack arguments will be placed in FixedStackSlots offset
1301	// by this amount for a tail call. In a sibling call it must be 0 because the
1302	// caller will deallocate the entire stack and the callee still expects its
1303	// arguments to begin at SP+0.
1304	int FPDiff = `0`;
1305
1306	// This will be 0 for sibcalls, potentially nonzero for tail calls produced
1307	// by -tailcallopt. For sibcalls, the memory operands for the call are
1308	// already available in the caller's incoming argument space.
1309	unsigned NumBytes = `0`;
1310	if (!IsSibCall) {
1311	// We aren't sibcalling, so we need to compute FPDiff. We need to do this
1312	// before handling assignments, because FPDiff must be known for memory
1313	// arguments.
1314	unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
1315	SmallVector<CCValAssign, `16`> OutLocs;
1316	CCState OutInfo(CalleeCC, false, MF, OutLocs, F.getContext());
1317
1318	// FIXME: Not accounting for callee implicit inputs
1319	OutgoingValueAssigner CalleeAssigner(AssignFnFixed, AssignFnVarArg);
1320	if (!determineAssignments(Assigner&: CalleeAssigner, Args&: OutArgs, CCInfo&: OutInfo))
1321	return false;
1322
1323	// The callee will pop the argument stack as a tail call. Thus, we must
1324	// keep it 16-byte aligned.
1325	NumBytes = alignTo(Size: OutInfo.getStackSize(), A: ST.getStackAlignment());
1326
1327	// FPDiff will be negative if this tail call requires more space than we
1328	// would automatically have in our incoming argument space. Positive if we
1329	// actually shrink the stack.
1330	FPDiff = NumReusableBytes - NumBytes;
1331
1332	// The stack pointer must be 16-byte aligned at all times it's used for a
1333	// memory operation, which in practice means at all* times and in*
1334	// particular across call boundaries. Therefore our own arguments started at
1335	// a 16-byte aligned SP and the delta applied for the tail call should
1336	// satisfy the same constraint.
1337	assert(isAligned(ST.getStackAlignment(), FPDiff) &&
1338	"unaligned stack on tail call");
1339	}
1340
1341	SmallVector<CCValAssign, `16`> ArgLocs;
1342	CCState CCInfo(Info.CallConv, Info.IsVarArg, MF, ArgLocs, F.getContext());
1343
1344	// We could pass MIB and directly add the implicit uses to the call
1345	// now. However, as an aesthetic choice, place implicit argument operands
1346	// after the ordinary user argument registers.
1347	SmallVector<std::pair<MCRegister, Register>, `12`> ImplicitArgRegs;
1348
1349	if (Info.CallConv != CallingConv::AMDGPU_Gfx &&
1350	!AMDGPU::isChainCC(CC: Info.CallConv)) {
1351	// With a fixed ABI, allocate fixed registers before user arguments.
1352	if (!passSpecialInputs(MIRBuilder, CCInfo, ArgRegs&: ImplicitArgRegs, Info))
1353	return false;
1354	}
1355
1356	OutgoingValueAssigner Assigner(AssignFnFixed, AssignFnVarArg);
1357
1358	if (!determineAssignments(Assigner, Args&: OutArgs, CCInfo))
1359	return false;
1360
1361	// Do the actual argument marshalling.
1362	AMDGPUOutgoingArgHandler Handler(MIRBuilder, MRI, MIB, true, FPDiff);
1363	if (!handleAssignments(Handler, Args&: OutArgs, CCState&: CCInfo, ArgLocs, MIRBuilder))
1364	return false;
1365
1366	if (Info.ConvergenceCtrlToken) {
1367	MIB.addUse(RegNo: Info.ConvergenceCtrlToken, Flags: RegState::Implicit);
1368	}
1369	handleImplicitCallArguments(MIRBuilder, CallInst&: MIB, ST, FuncInfo: *FuncInfo, CalleeCC,
1370	ImplicitArgRegs);
1371
1372	// If we have -tailcallopt, we need to adjust the stack. We'll do the call
1373	// sequence start and end here.
1374	if (!IsSibCall) {
1375	MIB ->getOperand(i: `1`).setImm(FPDiff);
1376	CallSeqStart.addImm(Val: NumBytes).addImm(Val: `0`);
1377	// End the call sequence before* emitting the call. Normally, we would*
1378	// tidy the frame up after the call. However, here, we've laid out the
1379	// parameters so that when SP is reset, they will be in the correct
1380	// location.
1381	MIRBuilder.buildInstr(Opcode: AMDGPU::ADJCALLSTACKDOWN).addImm(Val: NumBytes).addImm(Val: `0`);
1382	}
1383
1384	// Now we can add the actual call instruction to the correct basic block.
1385	MIRBuilder.insertInstr(MIB);
1386
1387	// If Callee is a reg, since it is used by a target specific
1388	// instruction, it must have a register class matching the
1389	// constraint of that instruction.
1390
1391	// FIXME: We should define regbankselectable call instructions to handle
1392	// divergent call targets.
1393	if (MIB ->getOperand(i: `0`).isReg()) {
1394	MIB ->getOperand(i: `0`).setReg(
1395	constrainOperandRegClass(MF, TRI: TRI, MRI, TII: TII, RBI: *ST.getRegBankInfo(),
1396	InsertPt&: *MIB, II: MIB ->getDesc(), RegMO&: MIB ->getOperand(i: `0`), OpIdx: `0`));
1397	}
1398
1399	MF.getFrameInfo().setHasTailCall();
1400	Info.LoweredTailCall = true;
1401	return true;
1402	}
1403
1404	/// Lower a call to the @llvm.amdgcn.cs.chain intrinsic.
1405	bool AMDGPUCallLowering::lowerChainCall(MachineIRBuilder &MIRBuilder,
1406	CallLoweringInfo &Info) const {
1407	ArgInfo Callee = Info.OrigArgs [`0`];
1408	ArgInfo SGPRArgs = Info.OrigArgs [`2`];
1409	ArgInfo VGPRArgs = Info.OrigArgs [`3`];
1410
1411	MachineFunction &MF = MIRBuilder.getMF();
1412	const Function &F = MF.getFunction();
1413	const DataLayout &DL = F.getDataLayout();
1414
1415	// The function to jump to is actually the first argument, so we'll change the
1416	// Callee and other info to match that before using our existing helper.
1417	const Value *CalleeV = Callee.OrigValue->stripPointerCasts();
1418	if (const Function *F = dyn_cast<Function>(Val: CalleeV)) {
1419	Info.Callee = MachineOperand::CreateGA(GV: F, Offset: `0`);
1420	Info.CallConv = F->getCallingConv();
1421	} else {
1422	assert(Callee.Regs.size() == `1` && "Too many regs for the callee");
1423	Info.Callee = MachineOperand::CreateReg(Reg: Callee.Regs [`0`], isDef: false);
1424	Info.CallConv = CallingConv::AMDGPU_CS_Chain; // amdgpu_cs_chain_preserve
1425	// behaves the same here.
1426	}
1427
1428	// The function that we're calling cannot be vararg (only the intrinsic is).
1429	Info.IsVarArg = false;
1430
1431	assert(
1432	all_of(SGPRArgs.Flags, [](ISD::ArgFlagsTy F) { return F.isInReg(); }) &&
1433	"SGPR arguments should be marked inreg");
1434	assert(
1435	none_of(VGPRArgs.Flags, [](ISD::ArgFlagsTy F) { return F.isInReg(); }) &&
1436	"VGPR arguments should not be marked inreg");
1437
1438	SmallVector<ArgInfo, `8`> OutArgs;
1439	splitToValueTypes(OrigArgInfo: SGPRArgs, SplitArgs&: OutArgs, DL, CallConv: Info.CallConv);
1440	splitToValueTypes(OrigArgInfo: VGPRArgs, SplitArgs&: OutArgs, DL, CallConv: Info.CallConv);
1441
1442	Info.IsMustTailCall = true;
1443	return lowerTailCall(MIRBuilder, Info, OutArgs);
1444	}
1445
1446	bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
1447	CallLoweringInfo &Info) const {
1448	if (Function *F = Info.CB->getCalledFunction())
1449	if (F->isIntrinsic()) {
1450	assert(F->getIntrinsicID() == Intrinsic::amdgcn_cs_chain &&
1451	"Unexpected intrinsic");
1452	return lowerChainCall(MIRBuilder, Info);
1453	}
1454
1455	if (Info.IsVarArg) {
1456	LLVM_DEBUG(dbgs() << "Variadic functions not implemented\n");
1457	return false;
1458	}
1459
1460	MachineFunction &MF = MIRBuilder.getMF();
1461	const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1462	const SIRegisterInfo *TRI = ST.getRegisterInfo();
1463
1464	const Function &F = MF.getFunction();
1465	MachineRegisterInfo &MRI = MF.getRegInfo();
1466	const SITargetLowering &TLI = *getTLI<SITargetLowering>();
1467	const DataLayout &DL = F.getDataLayout();
1468
1469	SmallVector<ArgInfo, `8`> OutArgs;
1470	for (auto &OrigArg : Info.OrigArgs)
1471	splitToValueTypes(OrigArgInfo: OrigArg, SplitArgs&: OutArgs, DL, CallConv: Info.CallConv);
1472
1473	SmallVector<ArgInfo, `8`> InArgs;
1474	if (Info.CanLowerReturn && !Info.OrigRet.Ty->isVoidTy())
1475	splitToValueTypes(OrigArgInfo: Info.OrigRet, SplitArgs&: InArgs, DL, CallConv: Info.CallConv);
1476
1477	// If we can lower as a tail call, do that instead.
1478	bool CanTailCallOpt =
1479	isEligibleForTailCallOptimization(B&: MIRBuilder, Info, InArgs, OutArgs);
1480
1481	// We must emit a tail call if we have musttail.
1482	if (Info.IsMustTailCall && !CanTailCallOpt) {
1483	LLVM_DEBUG(dbgs() << "Failed to lower musttail call as tail call\n");
1484	return false;
1485	}
1486
1487	Info.IsTailCall = CanTailCallOpt;
1488	if (CanTailCallOpt)
1489	return lowerTailCall(MIRBuilder, Info, OutArgs);
1490
1491	// Find out which ABI gets to decide where things go.
1492	CCAssignFn *AssignFnFixed;
1493	CCAssignFn *AssignFnVarArg;
1494	std::tie(args&: AssignFnFixed, args&: AssignFnVarArg) =
1495	getAssignFnsForCC(CC: Info.CallConv, TLI);
1496
1497	MIRBuilder.buildInstr(Opcode: AMDGPU::ADJCALLSTACKUP)
1498	.addImm(Val: `0`)
1499	.addImm(Val: `0`);
1500
1501	// Create a temporarily-floating call instruction so we can add the implicit
1502	// uses of arg registers.
1503	unsigned Opc = getCallOpcode(CallerF: MF, IsIndirect: Info.Callee.isReg(), IsTailCall: false, IsWave32: ST.isWave32(),
1504	CC: Info.CallConv);
1505
1506	auto MIB = MIRBuilder.buildInstrNoInsert(Opcode: Opc);
1507	MIB.addDef(RegNo: TRI->getReturnAddressReg(MF));
1508
1509	if (!Info.IsConvergent)
1510	MIB.setMIFlag(MachineInstr::NoConvergent);
1511
1512	if (!addCallTargetOperands(CallInst&: MIB, MIRBuilder, Info))
1513	return false;
1514
1515	// Tell the call which registers are clobbered.
1516	const uint32_t *Mask = TRI->getCallPreservedMask(MF, Info.CallConv);
1517	MIB.addRegMask(Mask);
1518
1519	SmallVector<CCValAssign, `16`> ArgLocs;
1520	CCState CCInfo(Info.CallConv, Info.IsVarArg, MF, ArgLocs, F.getContext());
1521
1522	// We could pass MIB and directly add the implicit uses to the call
1523	// now. However, as an aesthetic choice, place implicit argument operands
1524	// after the ordinary user argument registers.
1525	SmallVector<std::pair<MCRegister, Register>, `12`> ImplicitArgRegs;
1526
1527	if (Info.CallConv != CallingConv::AMDGPU_Gfx) {
1528	// With a fixed ABI, allocate fixed registers before user arguments.
1529	if (!passSpecialInputs(MIRBuilder, CCInfo, ArgRegs&: ImplicitArgRegs, Info))
1530	return false;
1531	}
1532
1533	// Do the actual argument marshalling.
1534	OutgoingValueAssigner Assigner(AssignFnFixed, AssignFnVarArg);
1535	if (!determineAssignments(Assigner, Args&: OutArgs, CCInfo))
1536	return false;
1537
1538	AMDGPUOutgoingArgHandler Handler(MIRBuilder, MRI, MIB, false);
1539	if (!handleAssignments(Handler, Args&: OutArgs, CCState&: CCInfo, ArgLocs, MIRBuilder))
1540	return false;
1541
1542	const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1543
1544	if (Info.ConvergenceCtrlToken) {
1545	MIB.addUse(RegNo: Info.ConvergenceCtrlToken, Flags: RegState::Implicit);
1546	}
1547	handleImplicitCallArguments(MIRBuilder, CallInst&: MIB, ST, FuncInfo: *MFI, CalleeCC: Info.CallConv,
1548	ImplicitArgRegs);
1549
1550	// Get a count of how many bytes are to be pushed on the stack.
1551	unsigned NumBytes = CCInfo.getStackSize();
1552
1553	// If Callee is a reg, since it is used by a target specific
1554	// instruction, it must have a register class matching the
1555	// constraint of that instruction.
1556
1557	// FIXME: We should define regbankselectable call instructions to handle
1558	// divergent call targets.
1559	if (MIB ->getOperand(i: `1`).isReg()) {
1560	MIB ->getOperand(i: `1`).setReg(constrainOperandRegClass(
1561	MF, TRI: TRI, MRI, TII: ST.getInstrInfo(),
1562	RBI: ST.getRegBankInfo(), InsertPt&: MIB, II: MIB ->getDesc(), RegMO&: MIB ->getOperand(i: `1`),
1563	OpIdx: `1`));
1564	}
1565
1566	// Now we can add the actual call instruction to the correct position.
1567	MIRBuilder.insertInstr(MIB);
1568
1569	// Finally we can copy the returned value back into its virtual-register. In
1570	// symmetry with the arguments, the physical register must be an
1571	// implicit-define of the call instruction.
1572	if (Info.CanLowerReturn && !Info.OrigRet.Ty->isVoidTy()) {
1573	CCAssignFn *RetAssignFn = TLI.CCAssignFnForReturn(CC: Info.CallConv,
1574	IsVarArg: Info.IsVarArg);
1575	IncomingValueAssigner Assigner(RetAssignFn);
1576	CallReturnHandler Handler(MIRBuilder, MRI, MIB);
1577	if (!determineAndHandleAssignments(Handler, Assigner, Args&: InArgs, MIRBuilder,
1578	CallConv: Info.CallConv, IsVarArg: Info.IsVarArg))
1579	return false;
1580	}
1581
1582	uint64_t CalleePopBytes = NumBytes;
1583
1584	MIRBuilder.buildInstr(Opcode: AMDGPU::ADJCALLSTACKDOWN)
1585	.addImm(Val: `0`)
1586	.addImm(Val: CalleePopBytes);
1587
1588	if (!Info.CanLowerReturn) {
1589	insertSRetLoads(MIRBuilder, RetTy: Info.OrigRet.Ty, VRegs: Info.OrigRet.Regs,
1590	DemoteReg: Info.DemoteRegister, FI: Info.DemoteStackIndex);
1591	}
1592
1593	return true;
1594	}
1595

Browse the source code of llvm_projects/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp