AMDGPUCallLowering.cpp source code [llvm_projects/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp]

1	//===-- llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp - Call lowering -----===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	///
9	/// \file
10	/// This file implements the lowering of LLVM calls to machine code calls for
11	/// GlobalISel.
12	///
13	//===----------------------------------------------------------------------===//
14
15	#include "AMDGPUCallLowering.h"
16	#include "AMDGPU.h"
17	#include "AMDGPULegalizerInfo.h"
18	#include "SIMachineFunctionInfo.h"
19	#include "SIRegisterInfo.h"
20	#include "llvm/CodeGen/Analysis.h"
21	#include "llvm/CodeGen/FunctionLoweringInfo.h"
22	#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
23	#include "llvm/CodeGen/MachineFrameInfo.h"
24	#include "llvm/CodeGen/PseudoSourceValueManager.h"
25	#include "llvm/IR/IntrinsicsAMDGPU.h"
26
27	#define DEBUG_TYPE "amdgpu-call-lowering"
28
29	using namespace llvm;
30
31	namespace {
32
33	/// Wrapper around extendRegister to ensure we extend to a full 32-bit register.
34	static Register extendRegisterMin32(CallLowering::ValueHandler &Handler,
35	Register ValVReg, const CCValAssign &VA) {
36	if (VA.getLocVT().getSizeInBits() < `32`) {
37	// 16-bit types are reported as legal for 32-bit registers. We need to
38	// extend and do a 32-bit copy to avoid the verifier complaining about it.
39	return Handler.MIRBuilder.buildAnyExt(Res: LLT::scalar(SizeInBits: `32`), Op: ValVReg).getReg(Idx: `0`);
40	}
41
42	return Handler.extendRegister(ValReg: ValVReg, VA);
43	}
44
45	struct AMDGPUOutgoingValueHandler : public CallLowering::OutgoingValueHandler {
46	AMDGPUOutgoingValueHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI,
47	MachineInstrBuilder MIB)
48	: OutgoingValueHandler (B, MRI), MIB (MIB) {}
49
50	MachineInstrBuilder MIB;
51
52	Register getStackAddress(uint64_t Size, int64_t Offset,
53	MachinePointerInfo &MPO,
54	ISD::ArgFlagsTy Flags) override {
55	llvm_unreachable("not implemented");
56	}
57
58	void assignValueToAddress(Register ValVReg, Register Addr, LLT MemTy,
59	const MachinePointerInfo &MPO,
60	const CCValAssign &VA) override {
61	llvm_unreachable("not implemented");
62	}
63
64	void assignValueToReg(Register ValVReg, Register PhysReg,
65	const CCValAssign &VA,
66	ISD::ArgFlagsTy Flags = {}) override {
67	Register ExtReg = extendRegisterMin32(Handler&: *this, ValVReg, VA);
68
69	// If this is a scalar return, insert a readfirstlane just in case the value
70	// ends up in a VGPR.
71	// FIXME: Assert this is a shader return.
72	const SIRegisterInfo *TRI
73	= static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
74	if (TRI->isSGPRReg(MRI, Reg: PhysReg)) {
75	LLT Ty = MRI.getType(Reg: ExtReg);
76	LLT S32 = LLT::scalar(SizeInBits: `32`);
77	if (Ty != S32) {
78	// FIXME: We should probably support readfirstlane intrinsics with all
79	// legal 32-bit types.
80	assert(Ty.getSizeInBits() == `32`);
81	if (Ty.isPointer())
82	ExtReg = MIRBuilder.buildPtrToInt(Dst: S32, Src: ExtReg).getReg(Idx: `0`);
83	else
84	ExtReg = MIRBuilder.buildBitcast(Dst: S32, Src: ExtReg).getReg(Idx: `0`);
85	}
86
87	auto ToSGPR = MIRBuilder
88	.buildIntrinsic(ID: Intrinsic::amdgcn_readfirstlane,
89	Res: {MRI.getType(Reg: ExtReg)})
90	.addReg(RegNo: ExtReg);
91	ExtReg = ToSGPR.getReg(Idx: `0`);
92	}
93
94	MIRBuilder.buildCopy(Res: PhysReg, Op: ExtReg);
95	MIB.addUse(RegNo: PhysReg, Flags: RegState::Implicit);
96	}
97	};
98
99	struct AMDGPUIncomingArgHandler : public CallLowering::IncomingValueHandler {
100	uint64_t StackUsed = `0`;
101
102	AMDGPUIncomingArgHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI)
103	: IncomingValueHandler (B, MRI) {}
104
105	Register getStackAddress(uint64_t Size, int64_t Offset,
106	MachinePointerInfo &MPO,
107	ISD::ArgFlagsTy Flags) override {
108	auto &MFI = MIRBuilder.getMF().getFrameInfo();
109
110	// Byval is assumed to be writable memory, but other stack passed arguments
111	// are not.
112	const bool IsImmutable = !Flags.isByVal();
113	int FI = MFI.CreateFixedObject(Size, SPOffset: Offset, IsImmutable);
114	MPO = MachinePointerInfo::getFixedStack(MF&: MIRBuilder.getMF(), FI);
115	auto AddrReg = MIRBuilder.buildFrameIndex(
116	Res: LLT::pointer(AddressSpace: AMDGPUAS::PRIVATE_ADDRESS, SizeInBits: `32`), Idx: FI);
117	StackUsed = std::max(a: StackUsed, b: Size + Offset);
118	return AddrReg.getReg(Idx: `0`);
119	}
120
121	void copyToReg(Register ValVReg, Register PhysReg, const CCValAssign &VA) {
122	if (VA.getLocVT().getSizeInBits() < `32`) {
123	// 16-bit types are reported as legal for 32-bit registers. We need to
124	// do a 32-bit copy, and truncate to avoid the verifier complaining
125	// about it.
126	auto Copy = MIRBuilder.buildCopy(Res: LLT::scalar(SizeInBits: `32`), Op: PhysReg);
127
128	// If we have signext/zeroext, it applies to the whole 32-bit register
129	// before truncation.
130	auto Extended =
131	buildExtensionHint(VA, SrcReg: Copy.getReg(Idx: `0`), NarrowTy: LLT (VA.getLocVT()));
132	MIRBuilder.buildTrunc(Res: ValVReg, Op: Extended);
133	return;
134	}
135
136	IncomingValueHandler::assignValueToReg(ValVReg, PhysReg, VA);
137	}
138
139	void readLaneToSGPR(Register ValVReg, Register PhysReg,
140	const CCValAssign &VA) {
141	// Handle inreg parameters passed through VGPRs due to SGPR exhaustion.
142	// When SGPRs are exhausted, the calling convention may allocate inreg
143	// parameters to VGPRs. We insert readfirstlane to move the value from
144	// VGPR to SGPR, as required by the inreg ABI.
145	//
146	// FIXME: This may increase instruction count in some cases. If the
147	// readfirstlane result is subsequently copied back to a VGPR, we cannot
148	// optimize away the unnecessary VGPR->SGPR->VGPR sequence in later passes
149	// because the inreg attribute information is not preserved in MIR. We could
150	// use WWM_COPY (or similar instructions) and mark it as foldable to enable
151	// later optimization passes to eliminate the redundant readfirstlane.
152	auto Copy = MIRBuilder.buildCopy(Res: LLT::scalar(SizeInBits: `32`), Op: PhysReg);
153	if (VA.getLocVT().getSizeInBits() < `32`) {
154	auto ToSGPR = MIRBuilder
155	.buildIntrinsic(ID: Intrinsic::amdgcn_readfirstlane,
156	Res: {MRI.getType(Reg: Copy.getReg(Idx: `0`))})
157	.addReg(RegNo: Copy.getReg(Idx: `0`));
158	auto Extended =
159	buildExtensionHint(VA, SrcReg: ToSGPR.getReg(Idx: `0`), NarrowTy: LLT (VA.getLocVT()));
160	MIRBuilder.buildTrunc(Res: ValVReg, Op: Extended);
161	return;
162	}
163
164	MIRBuilder.buildIntrinsic(ID: Intrinsic::amdgcn_readfirstlane, Res: ValVReg)
165	.addReg(RegNo: Copy.getReg(Idx: `0`));
166	}
167
168	void assignValueToReg(Register ValVReg, Register PhysReg,
169	const CCValAssign &VA,
170	ISD::ArgFlagsTy Flags = {}) override {
171	markPhysRegUsed(PhysReg);
172
173	const SIRegisterInfo *TRI =
174	static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
175
176	// Inreg flag should be the same across SplitArg[i]
177	if (Flags.isInReg() && TRI->isVGPR(MRI, Reg: PhysReg))
178	readLaneToSGPR(ValVReg, PhysReg, VA);
179	else
180	copyToReg(ValVReg, PhysReg, VA);
181	}
182
183	void assignValueToAddress(Register ValVReg, Register Addr, LLT MemTy,
184	const MachinePointerInfo &MPO,
185	const CCValAssign &VA) override {
186	MachineFunction &MF = MIRBuilder.getMF();
187
188	auto *MMO = MF.getMachineMemOperand(
189	PtrInfo: MPO, f: MachineMemOperand::MOLoad \| MachineMemOperand::MOInvariant, MemTy,
190	base_alignment: inferAlignFromPtrInfo(MF, MPO));
191	MIRBuilder.buildLoad(Res: ValVReg, Addr, MMO&: *MMO);
192	}
193
194	/// How the physical register gets marked varies between formal
195	/// parameters (it's a basic-block live-in), and a call instruction
196	/// (it's an implicit-def of the BL).
197	virtual void markPhysRegUsed(unsigned PhysReg) = `0`;
198	};
199
200	struct FormalArgHandler : public AMDGPUIncomingArgHandler {
201	FormalArgHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI)
202	: AMDGPUIncomingArgHandler (B, MRI) {}
203
204	void markPhysRegUsed(unsigned PhysReg) override {
205	MIRBuilder.getMBB().addLiveIn(PhysReg);
206	}
207	};
208
209	struct CallReturnHandler : public AMDGPUIncomingArgHandler {
210	CallReturnHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
211	MachineInstrBuilder MIB)
212	: AMDGPUIncomingArgHandler (MIRBuilder, MRI), MIB (MIB) {}
213
214	void markPhysRegUsed(unsigned PhysReg) override {
215	MIB.addDef(RegNo: PhysReg, Flags: RegState::Implicit);
216	}
217
218	MachineInstrBuilder MIB;
219	};
220
221	struct AMDGPUOutgoingArgHandler : public AMDGPUOutgoingValueHandler {
222	/// For tail calls, the byte offset of the call's argument area from the
223	/// callee's. Unused elsewhere.
224	int FPDiff;
225
226	// Cache the SP register vreg if we need it more than once in this call site.
227	Register SPReg;
228
229	bool IsTailCall;
230
231	AMDGPUOutgoingArgHandler(MachineIRBuilder &MIRBuilder,
232	MachineRegisterInfo &MRI, MachineInstrBuilder MIB,
233	bool IsTailCall = false, int FPDiff = `0`)
234	: AMDGPUOutgoingValueHandler (MIRBuilder, MRI, MIB), FPDiff(FPDiff),
235	IsTailCall(IsTailCall) {}
236
237	Register getStackAddress(uint64_t Size, int64_t Offset,
238	MachinePointerInfo &MPO,
239	ISD::ArgFlagsTy Flags) override {
240	MachineFunction &MF = MIRBuilder.getMF();
241	const LLT PtrTy = LLT::pointer(AddressSpace: AMDGPUAS::PRIVATE_ADDRESS, SizeInBits: `32`);
242	const LLT S32 = LLT::scalar(SizeInBits: `32`);
243
244	if (IsTailCall) {
245	Offset += FPDiff;
246	int FI = MF.getFrameInfo().CreateFixedObject(Size, SPOffset: Offset, IsImmutable: true);
247	auto FIReg = MIRBuilder.buildFrameIndex(Res: PtrTy, Idx: FI);
248	MPO = MachinePointerInfo::getFixedStack(MF, FI);
249	return FIReg.getReg(Idx: `0`);
250	}
251
252	const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
253
254	if (!SPReg) {
255	const GCNSubtarget &ST = MIRBuilder.getMF().getSubtarget<GCNSubtarget>();
256	if (ST.hasFlatScratchEnabled()) {
257	// The stack is accessed unswizzled, so we can use a regular copy.
258	SPReg = MIRBuilder.buildCopy(Res: PtrTy,
259	Op: MFI->getStackPtrOffsetReg()).getReg(Idx: `0`);
260	} else {
261	// The address we produce here, without knowing the use context, is going
262	// to be interpreted as a vector address, so we need to convert to a
263	// swizzled address.
264	SPReg = MIRBuilder.buildInstr(Opc: AMDGPU::G_AMDGPU_WAVE_ADDRESS, DstOps: {PtrTy},
265	SrcOps: {MFI->getStackPtrOffsetReg()}).getReg(Idx: `0`);
266	}
267	}
268
269	auto OffsetReg = MIRBuilder.buildConstant(Res: S32, Val: Offset);
270
271	auto AddrReg = MIRBuilder.buildPtrAdd(Res: PtrTy, Op0: SPReg, Op1: OffsetReg);
272	MPO = MachinePointerInfo::getStack(MF, Offset);
273	return AddrReg.getReg(Idx: `0`);
274	}
275
276	void assignValueToAddress(Register ValVReg, Register Addr, LLT MemTy,
277	const MachinePointerInfo &MPO,
278	const CCValAssign &VA) override {
279	MachineFunction &MF = MIRBuilder.getMF();
280	uint64_t LocMemOffset = VA.getLocMemOffset();
281	const auto &ST = MF.getSubtarget<GCNSubtarget>();
282
283	auto *MMO = MF.getMachineMemOperand(
284	PtrInfo: MPO, f: MachineMemOperand::MOStore, MemTy,
285	base_alignment: commonAlignment(A: ST.getStackAlignment(), Offset: LocMemOffset));
286	MIRBuilder.buildStore(Val: ValVReg, Addr, MMO&: *MMO);
287	}
288
289	void assignValueToAddress(const CallLowering::ArgInfo &Arg,
290	unsigned ValRegIndex, Register Addr, LLT MemTy,
291	const MachinePointerInfo &MPO,
292	const CCValAssign &VA) override {
293	Register ValVReg = VA.getLocInfo() != CCValAssign::LocInfo::FPExt
294	? extendRegister(ValReg: Arg.Regs [ValRegIndex], VA)
295	: Arg.Regs [ValRegIndex];
296	assignValueToAddress(ValVReg, Addr, MemTy, MPO, VA);
297	}
298	};
299	} // anonymous namespace
300
301	AMDGPUCallLowering::AMDGPUCallLowering(const TargetLowering &TLI)
302	: CallLowering (&TLI) {}
303
304	// FIXME: Compatibility shim
305	static ISD::NodeType extOpcodeToISDExtOpcode(unsigned MIOpc) {
306	switch (MIOpc) {
307	case TargetOpcode::G_SEXT:
308	return ISD::SIGN_EXTEND;
309	case TargetOpcode::G_ZEXT:
310	return ISD::ZERO_EXTEND;
311	case TargetOpcode::G_ANYEXT:
312	return ISD::ANY_EXTEND;
313	default:
314	llvm_unreachable("not an extend opcode");
315	}
316	}
317
318	bool AMDGPUCallLowering::canLowerReturn(MachineFunction &MF,
319	CallingConv::ID CallConv,
320	SmallVectorImpl<BaseArgInfo> &Outs,
321	bool IsVarArg) const {
322	// For shaders. Vector types should be explicitly handled by CC.
323	if (AMDGPU::isEntryFunctionCC(CC: CallConv))
324	return true;
325
326	SmallVector<CCValAssign, `16`> ArgLocs;
327	const SITargetLowering &TLI = *getTLI<SITargetLowering>();
328	CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs,
329	MF.getFunction().getContext());
330
331	return checkReturn(CCInfo, Outs, Fn: TLI.CCAssignFnForReturn(CC: CallConv, IsVarArg));
332	}
333
334	/// Lower the return value for the already existing \p Ret. This assumes that
335	/// \p B's insertion point is correct.
336	bool AMDGPUCallLowering::lowerReturnVal(MachineIRBuilder &B,
337	const Value *Val, ArrayRef<Register> VRegs,
338	MachineInstrBuilder &Ret) const {
339	if (!Val)
340	return true;
341
342	auto &MF = B.getMF();
343	const auto &F = MF.getFunction();
344	const DataLayout &DL = MF.getDataLayout();
345	MachineRegisterInfo *MRI = B.getMRI();
346	LLVMContext &Ctx = F.getContext();
347
348	CallingConv::ID CC = F.getCallingConv();
349	const SITargetLowering &TLI = *getTLI<SITargetLowering>();
350
351	SmallVector<EVT, `8`> SplitEVTs;
352	ComputeValueVTs(TLI, DL, Ty: Val->getType(), ValueVTs&: SplitEVTs);
353	assert(VRegs.size() == SplitEVTs.size() &&
354	"For each split Type there should be exactly one VReg.");
355
356	SmallVector<ArgInfo, `8`> SplitRetInfos;
357
358	for (unsigned i = `0`; i < SplitEVTs.size(); ++i) {
359	EVT VT = SplitEVTs [i];
360	Register Reg = VRegs [i];
361	ArgInfo RetInfo(Reg, VT.getTypeForEVT(Context&: Ctx), `0`);
362	setArgFlags(Arg&: RetInfo, OpIdx: AttributeList::ReturnIndex, DL, FuncInfo: F);
363
364	if (VT.isScalarInteger()) {
365	unsigned ExtendOp = TargetOpcode::G_ANYEXT;
366	if (RetInfo.Flags [`0`].isSExt()) {
367	assert(RetInfo.Regs.size() == `1` && "expect only simple return values");
368	ExtendOp = TargetOpcode::G_SEXT;
369	} else if (RetInfo.Flags [`0`].isZExt()) {
370	assert(RetInfo.Regs.size() == `1` && "expect only simple return values");
371	ExtendOp = TargetOpcode::G_ZEXT;
372	}
373
374	EVT ExtVT = TLI.getTypeForExtReturn(Context&: Ctx, VT,
375	ExtendKind: extOpcodeToISDExtOpcode(MIOpc: ExtendOp));
376	if (ExtVT != VT) {
377	RetInfo.Ty = ExtVT.getTypeForEVT(Context&: Ctx);
378	LLT ExtTy = getLLTForType(Ty&: *RetInfo.Ty, DL);
379	Reg = B.buildInstr(Opc: ExtendOp, DstOps: {ExtTy}, SrcOps: {Reg}).getReg(Idx: `0`);
380	}
381	}
382
383	if (Reg != RetInfo.Regs [`0`]) {
384	RetInfo.Regs [`0`] = Reg;
385	// Reset the arg flags after modifying Reg.
386	setArgFlags(Arg&: RetInfo, OpIdx: AttributeList::ReturnIndex, DL, FuncInfo: F);
387	}
388
389	splitToValueTypes(OrigArgInfo: RetInfo, SplitArgs&: SplitRetInfos, DL, CallConv: CC);
390	}
391
392	CCAssignFn *AssignFn = TLI.CCAssignFnForReturn(CC, IsVarArg: F.isVarArg());
393
394	OutgoingValueAssigner Assigner(AssignFn);
395	AMDGPUOutgoingValueHandler RetHandler(B, *MRI, Ret);
396	return determineAndHandleAssignments(Handler&: RetHandler, Assigner, Args&: SplitRetInfos, MIRBuilder&: B,
397	CallConv: CC, IsVarArg: F.isVarArg());
398	}
399
400	bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &B, const Value *Val,
401	ArrayRef<Register> VRegs,
402	FunctionLoweringInfo &FLI) const {
403
404	MachineFunction &MF = B.getMF();
405	SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
406	MFI->setIfReturnsVoid(!Val);
407
408	assert(!Val == VRegs.empty() && "Return value without a vreg");
409
410	CallingConv::ID CC = B.getMF().getFunction().getCallingConv();
411	const bool IsShader = AMDGPU::isShader(CC);
412	const bool IsWaveEnd =
413	(IsShader && MFI->returnsVoid()) \|\| AMDGPU::isKernel(CC);
414	if (IsWaveEnd) {
415	B.buildInstr(Opcode: AMDGPU::S_ENDPGM)
416	.addImm(Val: `0`);
417	return true;
418	}
419
420	const bool IsWholeWave = MFI->isWholeWaveFunction();
421	unsigned ReturnOpc = IsWholeWave ? AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_RETURN
422	: IsShader ? AMDGPU::SI_RETURN_TO_EPILOG
423	: AMDGPU::SI_RETURN;
424	auto Ret = B.buildInstrNoInsert(Opcode: ReturnOpc);
425
426	if (!FLI.CanLowerReturn)
427	insertSRetStores(MIRBuilder&: B, RetTy: Val->getType(), VRegs, DemoteReg: FLI.DemoteRegister);
428	else if (!lowerReturnVal(B, Val, VRegs, Ret))
429	return false;
430
431	if (IsWholeWave)
432	addOriginalExecToReturn(MF&: B.getMF(), Ret);
433
434	// TODO: Handle CalleeSavedRegsViaCopy.
435
436	B.insertInstr(MIB: Ret);
437	return true;
438	}
439
440	void AMDGPUCallLowering::lowerParameterPtr(Register DstReg, MachineIRBuilder &B,
441	uint64_t Offset) const {
442	MachineFunction &MF = B.getMF();
443	const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
444	MachineRegisterInfo &MRI = MF.getRegInfo();
445	Register KernArgSegmentPtr =
446	MFI->getPreloadedReg(Value: AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
447	Register KernArgSegmentVReg = MRI.getLiveInVirtReg(PReg: KernArgSegmentPtr);
448
449	auto OffsetReg = B.buildConstant(Res: LLT::scalar(SizeInBits: `64`), Val: Offset);
450
451	B.buildPtrAdd(Res: DstReg, Op0: KernArgSegmentVReg, Op1: OffsetReg);
452	}
453
454	void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &B, ArgInfo &OrigArg,
455	uint64_t Offset,
456	Align Alignment) const {
457	MachineFunction &MF = B.getMF();
458	const Function &F = MF.getFunction();
459	const DataLayout &DL = F.getDataLayout();
460	const SITargetLowering &TLI = *getTLI<SITargetLowering>();
461	MachinePointerInfo PtrInfo = TLI.getKernargSegmentPtrInfo(MF);
462
463	LLT PtrTy = LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: `64`);
464
465	SmallVector<ArgInfo, `32`> SplitArgs;
466	SmallVector<TypeSize> FieldOffsets;
467	splitToValueTypes(OrigArgInfo: OrigArg, SplitArgs, DL, CallConv: F.getCallingConv(), Offsets: &FieldOffsets);
468
469	unsigned Idx = `0`;
470	for (ArgInfo &SplitArg : SplitArgs) {
471	Register PtrReg = B.getMRI()->createGenericVirtualRegister(Ty: PtrTy);
472	lowerParameterPtr(DstReg: PtrReg, B, Offset: Offset + FieldOffsets [Idx]);
473
474	LLT ArgTy = getLLTForType(Ty&: *SplitArg.Ty, DL);
475	if (SplitArg.Flags [`0`].isPointer()) {
476	// Compensate for losing pointeriness in splitValueTypes.
477	LLT PtrTy = LLT::pointer(AddressSpace: SplitArg.Flags [`0`].getPointerAddrSpace(),
478	SizeInBits: ArgTy.getScalarSizeInBits());
479	ArgTy = ArgTy.isVector() ? LLT::vector(EC: ArgTy.getElementCount(), ScalarTy: PtrTy)
480	: PtrTy;
481	}
482
483	MachineMemOperand *MMO = MF.getMachineMemOperand(
484	PtrInfo,
485	f: MachineMemOperand::MOLoad \| MachineMemOperand::MODereferenceable \|
486	MachineMemOperand::MOInvariant,
487	MemTy: ArgTy, base_alignment: commonAlignment(A: Alignment, Offset: FieldOffsets [Idx]));
488
489	assert(SplitArg.Regs.size() == `1`);
490
491	B.buildLoad(Res: SplitArg.Regs [`0`], Addr: PtrReg, MMO&: *MMO);
492	++Idx;
493	}
494	}
495
496	// Allocate special inputs passed in user SGPRs.
497	static void allocateHSAUserSGPRs(CCState &CCInfo,
498	MachineIRBuilder &B,
499	MachineFunction &MF,
500	const SIRegisterInfo &TRI,
501	SIMachineFunctionInfo &Info) {
502	// FIXME: How should these inputs interact with inreg / custom SGPR inputs?
503	const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
504	if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
505	Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
506	MF.addLiveIn(PReg: PrivateSegmentBufferReg, RC: &AMDGPU::SGPR_128RegClass);
507	CCInfo.AllocateReg(Reg: PrivateSegmentBufferReg);
508	}
509
510	if (UserSGPRInfo.hasDispatchPtr()) {
511	Register DispatchPtrReg = Info.addDispatchPtr(TRI);
512	MF.addLiveIn(PReg: DispatchPtrReg, RC: &AMDGPU::SGPR_64RegClass);
513	CCInfo.AllocateReg(Reg: DispatchPtrReg);
514	}
515
516	if (UserSGPRInfo.hasQueuePtr()) {
517	Register QueuePtrReg = Info.addQueuePtr(TRI);
518	MF.addLiveIn(PReg: QueuePtrReg, RC: &AMDGPU::SGPR_64RegClass);
519	CCInfo.AllocateReg(Reg: QueuePtrReg);
520	}
521
522	if (UserSGPRInfo.hasKernargSegmentPtr()) {
523	MachineRegisterInfo &MRI = MF.getRegInfo();
524	Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
525	const LLT P4 = LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: `64`);
526	Register VReg = MRI.createGenericVirtualRegister(Ty: P4);
527	MRI.addLiveIn(Reg: InputPtrReg, vreg: VReg);
528	B.getMBB().addLiveIn(PhysReg: InputPtrReg);
529	B.buildCopy(Res: VReg, Op: InputPtrReg);
530	CCInfo.AllocateReg(Reg: InputPtrReg);
531	}
532
533	if (UserSGPRInfo.hasDispatchID()) {
534	Register DispatchIDReg = Info.addDispatchID(TRI);
535	MF.addLiveIn(PReg: DispatchIDReg, RC: &AMDGPU::SGPR_64RegClass);
536	CCInfo.AllocateReg(Reg: DispatchIDReg);
537	}
538
539	if (UserSGPRInfo.hasFlatScratchInit()) {
540	Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
541	MF.addLiveIn(PReg: FlatScratchInitReg, RC: &AMDGPU::SGPR_64RegClass);
542	CCInfo.AllocateReg(Reg: FlatScratchInitReg);
543	}
544
545	if (UserSGPRInfo.hasPrivateSegmentSize()) {
546	Register PrivateSegmentSizeReg = Info.addPrivateSegmentSize(TRI);
547	MF.addLiveIn(PReg: PrivateSegmentSizeReg, RC: &AMDGPU::SGPR_32RegClass);
548	CCInfo.AllocateReg(Reg: PrivateSegmentSizeReg);
549	}
550
551	// TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
552	// these from the dispatch pointer.
553	}
554
555	bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
556	MachineIRBuilder &B, const Function &F,
557	ArrayRef<ArrayRef<Register>> VRegs) const {
558	MachineFunction &MF = B.getMF();
559	const GCNSubtarget *Subtarget = &MF.getSubtarget<GCNSubtarget>();
560	MachineRegisterInfo &MRI = MF.getRegInfo();
561	SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
562	const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
563	const SITargetLowering &TLI = *getTLI<SITargetLowering>();
564	const DataLayout &DL = F.getDataLayout();
565
566	SmallVector<CCValAssign, `16`> ArgLocs;
567	CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext());
568
569	allocateHSAUserSGPRs(CCInfo, B, MF, TRI: TRI, Info&: Info);
570
571	unsigned i = `0`;
572	const Align KernArgBaseAlign(`16`);
573	const unsigned BaseOffset = Subtarget->getExplicitKernelArgOffset();
574	uint64_t ExplicitArgOffset = `0`;
575
576	// TODO: Align down to dword alignment and extract bits for extending loads.
577	for (auto &Arg : F.args()) {
578	// TODO: Add support for kernarg preload.
579	if (Arg.hasAttribute(Kind: "amdgpu-hidden-argument")) {
580	LLVM_DEBUG(dbgs() << "Preloading hidden arguments is not supported\n");
581	return false;
582	}
583
584	const bool IsByRef = Arg.hasByRefAttr();
585	Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
586	unsigned AllocSize = DL.getTypeAllocSize(Ty: ArgTy);
587	if (AllocSize == `0`)
588	continue;
589
590	MaybeAlign ParamAlign = IsByRef ? Arg.getParamAlign() : std::nullopt;
591	Align ABIAlign = DL.getValueOrABITypeAlignment(Alignment: ParamAlign, Ty: ArgTy);
592
593	uint64_t ArgOffset = alignTo(Size: ExplicitArgOffset, A: ABIAlign) + BaseOffset;
594	ExplicitArgOffset = alignTo(Size: ExplicitArgOffset, A: ABIAlign) + AllocSize;
595
596	if (Arg.use_empty()) {
597	++i;
598	continue;
599	}
600
601	Align Alignment = commonAlignment(A: KernArgBaseAlign, Offset: ArgOffset);
602
603	if (IsByRef) {
604	unsigned ByRefAS = cast<PointerType>(Val: Arg.getType())->getAddressSpace();
605
606	assert(VRegs[i].size() == `1` &&
607	"expected only one register for byval pointers");
608	if (ByRefAS == AMDGPUAS::CONSTANT_ADDRESS) {
609	lowerParameterPtr(DstReg: VRegs [i][`0`], B, Offset: ArgOffset);
610	} else {
611	const LLT ConstPtrTy = LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: `64`);
612	Register PtrReg = MRI.createGenericVirtualRegister(Ty: ConstPtrTy);
613	lowerParameterPtr(DstReg: PtrReg, B, Offset: ArgOffset);
614
615	B.buildAddrSpaceCast(Dst: VRegs [i][`0`], Src: PtrReg);
616	}
617	} else {
618	ArgInfo OrigArg(VRegs [i], Arg, i);
619	const unsigned OrigArgIdx = i + AttributeList::FirstArgIndex;
620	setArgFlags(Arg&: OrigArg, OpIdx: OrigArgIdx, DL, FuncInfo: F);
621	lowerParameter(B, OrigArg, Offset: ArgOffset, Alignment);
622	}
623
624	++i;
625	}
626
627	if (Info->getNumKernargPreloadedSGPRs())
628	Info->setNumWaveDispatchSGPRs(Info->getNumUserSGPRs());
629
630	TLI.allocateSpecialEntryInputVGPRs(CCInfo, MF, TRI: TRI, Info&: Info);
631	TLI.allocateSystemSGPRs(CCInfo, MF, Info&: Info, CallConv: F.getCallingConv(), IsShader: false*);
632	return true;
633	}
634
635	bool AMDGPUCallLowering::lowerFormalArguments(
636	MachineIRBuilder &B, const Function &F, ArrayRef<ArrayRef<Register>> VRegs,
637	FunctionLoweringInfo &FLI) const {
638	CallingConv::ID CC = F.getCallingConv();
639
640	// The infrastructure for normal calling convention lowering is essentially
641	// useless for kernels. We want to avoid any kind of legalization or argument
642	// splitting.
643	if (CC == CallingConv::AMDGPU_KERNEL)
644	return lowerFormalArgumentsKernel(B, F, VRegs);
645
646	const bool IsGraphics = AMDGPU::isGraphics(CC);
647	const bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CC);
648
649	MachineFunction &MF = B.getMF();
650	MachineBasicBlock &MBB = B.getMBB();
651	MachineRegisterInfo &MRI = MF.getRegInfo();
652	SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
653	const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
654	const SIRegisterInfo *TRI = Subtarget.getRegisterInfo();
655	const DataLayout &DL = F.getDataLayout();
656
657	SmallVector<CCValAssign, `16`> ArgLocs;
658	CCState CCInfo(CC, F.isVarArg(), MF, ArgLocs, F.getContext());
659	const GCNUserSGPRUsageInfo &UserSGPRInfo = Info->getUserSGPRInfo();
660
661	if (UserSGPRInfo.hasImplicitBufferPtr()) {
662	Register ImplicitBufferPtrReg = Info->addImplicitBufferPtr(TRI: *TRI);
663	MF.addLiveIn(PReg: ImplicitBufferPtrReg, RC: &AMDGPU::SGPR_64RegClass);
664	CCInfo.AllocateReg(Reg: ImplicitBufferPtrReg);
665	}
666
667	// FIXME: This probably isn't defined for mesa
668	if (UserSGPRInfo.hasFlatScratchInit() && !Subtarget.isAmdPalOS()) {
669	Register FlatScratchInitReg = Info->addFlatScratchInit(TRI: *TRI);
670	MF.addLiveIn(PReg: FlatScratchInitReg, RC: &AMDGPU::SGPR_64RegClass);
671	CCInfo.AllocateReg(Reg: FlatScratchInitReg);
672	}
673
674	SmallVector<ArgInfo, `32`> SplitArgs;
675	unsigned Idx = `0`;
676	unsigned PSInputNum = `0`;
677
678	// Insert the hidden sret parameter if the return value won't fit in the
679	// return registers.
680	if (!FLI.CanLowerReturn)
681	insertSRetIncomingArgument(F, SplitArgs, DemoteReg&: FLI.DemoteRegister, MRI, DL);
682
683	for (auto &Arg : F.args()) {
684	if (DL.getTypeStoreSize(Ty: Arg.getType()) == `0`)
685	continue;
686
687	if (Info->isWholeWaveFunction() && Idx == `0`) {
688	assert(VRegs[Idx].size() == `1` && "Expected only one register");
689
690	// The first argument for whole wave functions is the original EXEC value.
691	B.buildInstr(Opcode: AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_SETUP)
692	.addDef(RegNo: VRegs [Idx][`0`]);
693
694	++Idx;
695	continue;
696	}
697
698	const bool InReg = Arg.hasAttribute(Kind: Attribute::InReg);
699
700	if (Arg.hasAttribute(Kind: Attribute::SwiftSelf) \|\|
701	Arg.hasAttribute(Kind: Attribute::SwiftError) \|\|
702	Arg.hasAttribute(Kind: Attribute::Nest))
703	return false;
704
705	if (CC == CallingConv::AMDGPU_PS && !InReg && PSInputNum <= `15`) {
706	const bool ArgUsed = !Arg.use_empty();
707	bool SkipArg = !ArgUsed && !Info->isPSInputAllocated(Index: PSInputNum);
708
709	if (!SkipArg) {
710	Info->markPSInputAllocated(Index: PSInputNum);
711	if (ArgUsed)
712	Info->markPSInputEnabled(Index: PSInputNum);
713	}
714
715	++PSInputNum;
716
717	if (SkipArg) {
718	for (Register R : VRegs [Idx])
719	B.buildUndef(Res: R);
720
721	++Idx;
722	continue;
723	}
724	}
725
726	ArgInfo OrigArg(VRegs [Idx], Arg, Idx);
727	const unsigned OrigArgIdx = Idx + AttributeList::FirstArgIndex;
728	setArgFlags(Arg&: OrigArg, OpIdx: OrigArgIdx, DL, FuncInfo: F);
729
730	splitToValueTypes(OrigArgInfo: OrigArg, SplitArgs, DL, CallConv: CC);
731	++Idx;
732	}
733
734	// At least one interpolation mode must be enabled or else the GPU will
735	// hang.
736	//
737	// Check PSInputAddr instead of PSInputEnable. The idea is that if the user
738	// set PSInputAddr, the user wants to enable some bits after the compilation
739	// based on run-time states. Since we can't know what the final PSInputEna
740	// will look like, so we shouldn't do anything here and the user should take
741	// responsibility for the correct programming.
742	//
743	// Otherwise, the following restrictions apply:
744	// - At least one of PERSP_ (0xF) or LINEAR_* (0x70) must be enabled.*
745	// - If POS_W_FLOAT (11) is enabled, at least one of PERSP_ must be*
746	// enabled too.
747	if (CC == CallingConv::AMDGPU_PS) {
748	if ((Info->getPSInputAddr() & `0x7F`) == `0` \|\|
749	((Info->getPSInputAddr() & `0xF`) == `0` &&
750	Info->isPSInputAllocated(Index: `11`))) {
751	CCInfo.AllocateReg(Reg: AMDGPU::VGPR0);
752	CCInfo.AllocateReg(Reg: AMDGPU::VGPR1);
753	Info->markPSInputAllocated(Index: `0`);
754	Info->markPSInputEnabled(Index: `0`);
755	}
756
757	if (Subtarget.isAmdPalOS()) {
758	// For isAmdPalOS, the user does not enable some bits after compilation
759	// based on run-time states; the register values being generated here are
760	// the final ones set in hardware. Therefore we need to apply the
761	// workaround to PSInputAddr and PSInputEnable together. (The case where
762	// a bit is set in PSInputAddr but not PSInputEnable is where the frontend
763	// set up an input arg for a particular interpolation mode, but nothing
764	// uses that input arg. Really we should have an earlier pass that removes
765	// such an arg.)
766	unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
767	if ((PsInputBits & `0x7F`) == `0` \|\|
768	((PsInputBits & `0xF`) == `0` &&
769	(PsInputBits >> `11` & `1`)))
770	Info->markPSInputEnabled(Index: llvm::countr_zero(Val: Info->getPSInputAddr()));
771	}
772	}
773
774	const SITargetLowering &TLI = *getTLI<SITargetLowering>();
775	CCAssignFn *AssignFn = TLI.CCAssignFnForCall(CC, IsVarArg: F.isVarArg());
776
777	if (!MBB.empty())
778	B.setInstr(*MBB.begin());
779
780	if (!IsEntryFunc && !IsGraphics) {
781	// For the fixed ABI, pass workitem IDs in the last argument register.
782	TLI.allocateSpecialInputVGPRsFixed(CCInfo, MF, TRI: TRI, Info&: Info);
783
784	if (!Subtarget.hasFlatScratchEnabled())
785	CCInfo.AllocateReg(Reg: Info->getScratchRSrcReg());
786	TLI.allocateSpecialInputSGPRs(CCInfo, MF, TRI: TRI, Info&: Info);
787	}
788
789	IncomingValueAssigner Assigner(AssignFn);
790	if (!determineAssignments(Assigner, Args&: SplitArgs, CCInfo))
791	return false;
792
793	if (IsEntryFunc) {
794	// This assumes the registers are allocated by CCInfo in ascending order
795	// with no gaps.
796	Info->setNumWaveDispatchSGPRs(
797	CCInfo.getFirstUnallocated(Regs: AMDGPU::SGPR_32RegClass.getRegisters()));
798	Info->setNumWaveDispatchVGPRs(
799	CCInfo.getFirstUnallocated(Regs: AMDGPU::VGPR_32RegClass.getRegisters()));
800	}
801
802	FormalArgHandler Handler(B, MRI);
803	if (!handleAssignments(Handler, Args&: SplitArgs, CCState&: CCInfo, ArgLocs, MIRBuilder&: B))
804	return false;
805
806	uint64_t StackSize = Assigner.StackSize;
807
808	// Start adding system SGPRs.
809	if (IsEntryFunc)
810	TLI.allocateSystemSGPRs(CCInfo, MF, Info&: *Info, CallConv: CC, IsShader: IsGraphics);
811
812	// When we tail call, we need to check if the callee's arguments will fit on
813	// the caller's stack. So, whenever we lower formal arguments, we should keep
814	// track of this information, since we might lower a tail call in this
815	// function later.
816	Info->setBytesInStackArgArea(StackSize);
817
818	// Move back to the end of the basic block.
819	B.setMBB(MBB);
820
821	return true;
822	}
823
824	bool AMDGPUCallLowering::passSpecialInputs(MachineIRBuilder &MIRBuilder,
825	CCState &CCInfo,
826	SmallVectorImpl<std::pair<MCRegister, Register>> &ArgRegs,
827	CallLoweringInfo &Info) const {
828	MachineFunction &MF = MIRBuilder.getMF();
829
830	// If there's no call site, this doesn't correspond to a call from the IR and
831	// doesn't need implicit inputs.
832	if (!Info.CB)
833	return true;
834
835	const AMDGPUFunctionArgInfo &CalleeArgInfo =
836	AMDGPUFunctionArgInfo::FixedABIFunctionInfo;
837
838	const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
839	const AMDGPUFunctionArgInfo &CallerArgInfo = MFI->getArgInfo();
840
841
842	// TODO: Unify with private memory register handling. This is complicated by
843	// the fact that at least in kernels, the input argument is not necessarily
844	// in the same location as the input.
845	AMDGPUFunctionArgInfo::PreloadedValue InputRegs[] = {
846	AMDGPUFunctionArgInfo::DISPATCH_PTR,
847	AMDGPUFunctionArgInfo::QUEUE_PTR,
848	AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR,
849	AMDGPUFunctionArgInfo::DISPATCH_ID,
850	AMDGPUFunctionArgInfo::WORKGROUP_ID_X,
851	AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,
852	AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,
853	AMDGPUFunctionArgInfo::LDS_KERNEL_ID,
854	};
855
856	static constexpr StringLiteral ImplicitAttrNames[][`2`] = {
857	{"amdgpu-no-dispatch-ptr", ""},
858	{"amdgpu-no-queue-ptr", ""},
859	{"amdgpu-no-implicitarg-ptr", ""},
860	{"amdgpu-no-dispatch-id", ""},
861	{"amdgpu-no-workgroup-id-x", "amdgpu-no-cluster-id-x"},
862	{"amdgpu-no-workgroup-id-y", "amdgpu-no-cluster-id-y"},
863	{"amdgpu-no-workgroup-id-z", "amdgpu-no-cluster-id-z"},
864	{"amdgpu-no-lds-kernel-id", ""},
865	};
866
867	MachineRegisterInfo &MRI = MF.getRegInfo();
868
869	const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
870	const AMDGPULegalizerInfo *LI
871	= static_cast<const AMDGPULegalizerInfo*>(ST.getLegalizerInfo());
872
873	unsigned I = `0`;
874	for (auto InputID : InputRegs) {
875	const ArgDescriptor *OutgoingArg;
876	const TargetRegisterClass *ArgRC;
877	LLT ArgTy;
878
879	// If the callee does not use the attribute value, skip copying the value.
880	if (all_of(Range: ImplicitAttrNames[I++], P: [&](StringRef AttrName) {
881	return AttrName.empty() \|\| Info.CB->hasFnAttr(Kind: AttrName);
882	}))
883	continue;
884
885	std::tie(args&: OutgoingArg, args&: ArgRC, args&: ArgTy) =
886	CalleeArgInfo.getPreloadedValue(Value: InputID);
887	if (!OutgoingArg)
888	continue;
889
890	const ArgDescriptor *IncomingArg;
891	const TargetRegisterClass *IncomingArgRC;
892	std::tie(args&: IncomingArg, args&: IncomingArgRC, args&: ArgTy) =
893	CallerArgInfo.getPreloadedValue(Value: InputID);
894	assert(IncomingArgRC == ArgRC);
895
896	Register InputReg = MRI.createGenericVirtualRegister(Ty: ArgTy);
897
898	if (IncomingArg) {
899	LI->buildLoadInputValue(DstReg: InputReg, B&: MIRBuilder, Arg: IncomingArg, ArgRC, ArgTy);
900	} else if (InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) {
901	LI->getImplicitArgPtr(DstReg: InputReg, MRI, B&: MIRBuilder);
902	} else if (InputID == AMDGPUFunctionArgInfo::LDS_KERNEL_ID) {
903	std::optional<uint32_t> Id =
904	AMDGPUMachineFunction::getLDSKernelIdMetadata(F: MF.getFunction());
905	if (Id) {
906	MIRBuilder.buildConstant(Res: InputReg, Val: *Id);
907	} else {
908	MIRBuilder.buildUndef(Res: InputReg);
909	}
910	} else {
911	// We may have proven the input wasn't needed, although the ABI is
912	// requiring it. We just need to allocate the register appropriately.
913	MIRBuilder.buildUndef(Res: InputReg);
914	}
915
916	if (OutgoingArg->isRegister()) {
917	ArgRegs.emplace_back(Args: OutgoingArg->getRegister(), Args&: InputReg);
918	if (!CCInfo.AllocateReg(Reg: OutgoingArg->getRegister()))
919	report_fatal_error(reason: "failed to allocate implicit input argument");
920	} else {
921	LLVM_DEBUG(dbgs() << "Unhandled stack passed implicit input argument\n");
922	return false;
923	}
924	}
925
926	// Pack workitem IDs into a single register or pass it as is if already
927	// packed.
928	const ArgDescriptor *OutgoingArg;
929	const TargetRegisterClass *ArgRC;
930	LLT ArgTy;
931
932	std::tie(args&: OutgoingArg, args&: ArgRC, args&: ArgTy) =
933	CalleeArgInfo.getPreloadedValue(Value: AMDGPUFunctionArgInfo::WORKITEM_ID_X);
934	if (!OutgoingArg)
935	std::tie(args&: OutgoingArg, args&: ArgRC, args&: ArgTy) =
936	CalleeArgInfo.getPreloadedValue(Value: AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
937	if (!OutgoingArg)
938	std::tie(args&: OutgoingArg, args&: ArgRC, args&: ArgTy) =
939	CalleeArgInfo.getPreloadedValue(Value: AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
940	if (!OutgoingArg)
941	return false;
942
943	auto WorkitemIDX =
944	CallerArgInfo.getPreloadedValue(Value: AMDGPUFunctionArgInfo::WORKITEM_ID_X);
945	auto WorkitemIDY =
946	CallerArgInfo.getPreloadedValue(Value: AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
947	auto WorkitemIDZ =
948	CallerArgInfo.getPreloadedValue(Value: AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
949
950	const ArgDescriptor *IncomingArgX = std::get<`0`>(t&: WorkitemIDX);
951	const ArgDescriptor *IncomingArgY = std::get<`0`>(t&: WorkitemIDY);
952	const ArgDescriptor *IncomingArgZ = std::get<`0`>(t&: WorkitemIDZ);
953	const LLT S32 = LLT::scalar(SizeInBits: `32`);
954
955	const bool NeedWorkItemIDX = !Info.CB->hasFnAttr(Kind: "amdgpu-no-workitem-id-x");
956	const bool NeedWorkItemIDY = !Info.CB->hasFnAttr(Kind: "amdgpu-no-workitem-id-y");
957	const bool NeedWorkItemIDZ = !Info.CB->hasFnAttr(Kind: "amdgpu-no-workitem-id-z");
958
959	// If incoming ids are not packed we need to pack them.
960	// FIXME: Should consider known workgroup size to eliminate known 0 cases.
961	Register InputReg;
962	if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo.WorkItemIDX &&
963	NeedWorkItemIDX) {
964	if (ST.getMaxWorkitemID(Kernel: MF.getFunction(), Dimension: `0`) != `0`) {
965	InputReg = MRI.createGenericVirtualRegister(Ty: S32);
966	LI->buildLoadInputValue(DstReg: InputReg, B&: MIRBuilder, Arg: IncomingArgX,
967	ArgRC: std::get<`1`>(t&: WorkitemIDX),
968	ArgTy: std::get<`2`>(t&: WorkitemIDX));
969	} else {
970	InputReg = MIRBuilder.buildConstant(Res: S32, Val: `0`).getReg(Idx: `0`);
971	}
972	}
973
974	if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo.WorkItemIDY &&
975	NeedWorkItemIDY && ST.getMaxWorkitemID(Kernel: MF.getFunction(), Dimension: `1`) != `0`) {
976	Register Y = MRI.createGenericVirtualRegister(Ty: S32);
977	LI->buildLoadInputValue(DstReg: Y, B&: MIRBuilder, Arg: IncomingArgY,
978	ArgRC: std::get<`1`>(t&: WorkitemIDY), ArgTy: std::get<`2`>(t&: WorkitemIDY));
979
980	Y = MIRBuilder.buildShl(Dst: S32, Src0: Y, Src1: MIRBuilder.buildConstant(Res: S32, Val: `10`)).getReg(Idx: `0`);
981	InputReg = InputReg ? MIRBuilder.buildOr(Dst: S32, Src0: InputReg, Src1: Y).getReg(Idx: `0`) : Y;
982	}
983
984	if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo.WorkItemIDZ &&
985	NeedWorkItemIDZ && ST.getMaxWorkitemID(Kernel: MF.getFunction(), Dimension: `2`) != `0`) {
986	Register Z = MRI.createGenericVirtualRegister(Ty: S32);
987	LI->buildLoadInputValue(DstReg: Z, B&: MIRBuilder, Arg: IncomingArgZ,
988	ArgRC: std::get<`1`>(t&: WorkitemIDZ), ArgTy: std::get<`2`>(t&: WorkitemIDZ));
989
990	Z = MIRBuilder.buildShl(Dst: S32, Src0: Z, Src1: MIRBuilder.buildConstant(Res: S32, Val: `20`)).getReg(Idx: `0`);
991	InputReg = InputReg ? MIRBuilder.buildOr(Dst: S32, Src0: InputReg, Src1: Z).getReg(Idx: `0`) : Z;
992	}
993
994	if (!InputReg &&
995	(NeedWorkItemIDX \|\| NeedWorkItemIDY \|\| NeedWorkItemIDZ)) {
996	InputReg = MRI.createGenericVirtualRegister(Ty: S32);
997	if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
998	// We're in a situation where the outgoing function requires the workitem
999	// ID, but the calling function does not have it (e.g a graphics function
1000	// calling a C calling convention function). This is illegal, but we need
1001	// to produce something.
1002	MIRBuilder.buildUndef(Res: InputReg);
1003	} else {
1004	// Workitem ids are already packed, any of present incoming arguments will
1005	// carry all required fields.
1006	ArgDescriptor IncomingArg = ArgDescriptor::createArg(
1007	Arg: IncomingArgX ? *IncomingArgX :
1008	IncomingArgY ? IncomingArgY : IncomingArgZ, Mask: ~`0u`);
1009	LI->buildLoadInputValue(DstReg: InputReg, B&: MIRBuilder, Arg: &IncomingArg,
1010	ArgRC: &AMDGPU::VGPR_32RegClass, ArgTy: S32);
1011	}
1012	}
1013
1014	if (OutgoingArg->isRegister()) {
1015	if (InputReg)
1016	ArgRegs.emplace_back(Args: OutgoingArg->getRegister(), Args&: InputReg);
1017
1018	if (!CCInfo.AllocateReg(Reg: OutgoingArg->getRegister()))
1019	report_fatal_error(reason: "failed to allocate implicit input argument");
1020	} else {
1021	LLVM_DEBUG(dbgs() << "Unhandled stack passed implicit input argument\n");
1022	return false;
1023	}
1024
1025	return true;
1026	}
1027
1028	/// Returns a pair containing the fixed CCAssignFn and the vararg CCAssignFn for
1029	/// CC.
1030	static std::pair<CCAssignFn , CCAssignFn >
1031	getAssignFnsForCC(CallingConv::ID CC, const SITargetLowering &TLI) {
1032	return {TLI.CCAssignFnForCall(CC, IsVarArg: false), TLI.CCAssignFnForCall(CC, IsVarArg: true)};
1033	}
1034
1035	static unsigned getCallOpcode(const MachineFunction &CallerF, bool IsIndirect,
1036	bool IsTailCall, bool IsWave32,
1037	CallingConv::ID CC,
1038	bool IsDynamicVGPRChainCall = false) {
1039	// For calls to amdgpu_cs_chain functions, the address is known to be uniform.
1040	assert((AMDGPU::isChainCC(CC) \|\| !IsIndirect \|\| !IsTailCall) &&
1041	"Indirect calls can't be tail calls, "
1042	"because the address can be divergent");
1043	if (!IsTailCall)
1044	return AMDGPU::G_SI_CALL;
1045
1046	if (AMDGPU::isChainCC(CC)) {
1047	if (IsDynamicVGPRChainCall)
1048	return IsWave32 ? AMDGPU::SI_CS_CHAIN_TC_W32_DVGPR
1049	: AMDGPU::SI_CS_CHAIN_TC_W64_DVGPR;
1050	return IsWave32 ? AMDGPU::SI_CS_CHAIN_TC_W32 : AMDGPU::SI_CS_CHAIN_TC_W64;
1051	}
1052
1053	if (CallerF.getFunction().getCallingConv() ==
1054	CallingConv::AMDGPU_Gfx_WholeWave)
1055	return AMDGPU::SI_TCRETURN_GFX_WholeWave;
1056
1057	if (CC == CallingConv::AMDGPU_Gfx \|\| CC == CallingConv::AMDGPU_Gfx_WholeWave)
1058	return AMDGPU::SI_TCRETURN_GFX;
1059
1060	return AMDGPU::SI_TCRETURN;
1061	}
1062
1063	// Add operands to call instruction to track the callee.
1064	static bool addCallTargetOperands(MachineInstrBuilder &CallInst,
1065	MachineIRBuilder &MIRBuilder,
1066	AMDGPUCallLowering::CallLoweringInfo &Info,
1067	bool IsDynamicVGPRChainCall = false) {
1068	if (Info.Callee.isReg()) {
1069	CallInst.addReg(RegNo: Info.Callee.getReg());
1070	CallInst.addImm(Val: `0`);
1071	} else if (Info.Callee.isGlobal() && Info.Callee.getOffset() == `0`) {
1072	// The call lowering lightly assumed we can directly encode a call target in
1073	// the instruction, which is not the case. Materialize the address here.
1074	const GlobalValue *GV = Info.Callee.getGlobal();
1075	auto Ptr = MIRBuilder.buildGlobalValue(
1076	Res: LLT::pointer(AddressSpace: GV->getAddressSpace(), SizeInBits: `64`), GV);
1077	CallInst.addReg(RegNo: Ptr.getReg(Idx: `0`));
1078
1079	if (IsDynamicVGPRChainCall) {
1080	// DynamicVGPR chain calls are always indirect.
1081	CallInst.addImm(Val: `0`);
1082	} else
1083	CallInst.add(MO: Info.Callee);
1084	} else
1085	return false;
1086
1087	return true;
1088	}
1089
1090	bool AMDGPUCallLowering::doCallerAndCalleePassArgsTheSameWay(
1091	CallLoweringInfo &Info, MachineFunction &MF,
1092	SmallVectorImpl<ArgInfo> &InArgs) const {
1093	const Function &CallerF = MF.getFunction();
1094	CallingConv::ID CalleeCC = Info.CallConv;
1095	CallingConv::ID CallerCC = CallerF.getCallingConv();
1096
1097	// If the calling conventions match, then everything must be the same.
1098	if (CalleeCC == CallerCC)
1099	return true;
1100
1101	const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1102
1103	// Make sure that the caller and callee preserve all of the same registers.
1104	const auto *TRI = ST.getRegisterInfo();
1105
1106	const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
1107	const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
1108	if (!TRI->regmaskSubsetEqual(mask0: CallerPreserved, mask1: CalleePreserved))
1109	return false;
1110
1111	// Check if the caller and callee will handle arguments in the same way.
1112	const SITargetLowering &TLI = *getTLI<SITargetLowering>();
1113	CCAssignFn *CalleeAssignFnFixed;
1114	CCAssignFn *CalleeAssignFnVarArg;
1115	std::tie(args&: CalleeAssignFnFixed, args&: CalleeAssignFnVarArg) =
1116	getAssignFnsForCC(CC: CalleeCC, TLI);
1117
1118	CCAssignFn *CallerAssignFnFixed;
1119	CCAssignFn *CallerAssignFnVarArg;
1120	std::tie(args&: CallerAssignFnFixed, args&: CallerAssignFnVarArg) =
1121	getAssignFnsForCC(CC: CallerCC, TLI);
1122
1123	// FIXME: We are not accounting for potential differences in implicitly passed
1124	// inputs, but only the fixed ABI is supported now anyway.
1125	IncomingValueAssigner CalleeAssigner(CalleeAssignFnFixed,
1126	CalleeAssignFnVarArg);
1127	IncomingValueAssigner CallerAssigner(CallerAssignFnFixed,
1128	CallerAssignFnVarArg);
1129	return resultsCompatible(Info, MF, InArgs, CalleeAssigner, CallerAssigner);
1130	}
1131
1132	bool AMDGPUCallLowering::areCalleeOutgoingArgsTailCallable(
1133	CallLoweringInfo &Info, MachineFunction &MF,
1134	SmallVectorImpl<ArgInfo> &OutArgs) const {
1135	// If there are no outgoing arguments, then we are done.
1136	if (OutArgs.empty())
1137	return true;
1138
1139	const Function &CallerF = MF.getFunction();
1140	CallingConv::ID CalleeCC = Info.CallConv;
1141	CallingConv::ID CallerCC = CallerF.getCallingConv();
1142	const SITargetLowering &TLI = *getTLI<SITargetLowering>();
1143
1144	CCAssignFn *AssignFnFixed;
1145	CCAssignFn *AssignFnVarArg;
1146	std::tie(args&: AssignFnFixed, args&: AssignFnVarArg) = getAssignFnsForCC(CC: CalleeCC, TLI);
1147
1148	// We have outgoing arguments. Make sure that we can tail call with them.
1149	SmallVector<CCValAssign, `16`> OutLocs;
1150	CCState OutInfo(CalleeCC, false, MF, OutLocs, CallerF.getContext());
1151	OutgoingValueAssigner Assigner(AssignFnFixed, AssignFnVarArg);
1152
1153	if (!determineAssignments(Assigner, Args&: OutArgs, CCInfo&: OutInfo)) {
1154	LLVM_DEBUG(dbgs() << "... Could not analyze call operands.\n");
1155	return false;
1156	}
1157
1158	// Make sure that they can fit on the caller's stack.
1159	const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1160	if (OutInfo.getStackSize() > FuncInfo->getBytesInStackArgArea()) {
1161	LLVM_DEBUG(dbgs() << "... Cannot fit call operands on caller's stack.\n");
1162	return false;
1163	}
1164
1165	// Verify that the parameters in callee-saved registers match.
1166	const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1167	const SIRegisterInfo *TRI = ST.getRegisterInfo();
1168	const uint32_t *CallerPreservedMask = TRI->getCallPreservedMask(MF, CallerCC);
1169	MachineRegisterInfo &MRI = MF.getRegInfo();
1170	return parametersInCSRMatch(MRI, CallerPreservedMask, ArgLocs: OutLocs, OutVals: OutArgs);
1171	}
1172
1173	bool AMDGPUCallLowering::isEligibleForTailCallOptimization(
1174	MachineIRBuilder &B, CallLoweringInfo &Info,
1175	SmallVectorImpl<ArgInfo> &InArgs, SmallVectorImpl<ArgInfo> &OutArgs) const {
1176	// Must pass all target-independent checks in order to tail call optimize.
1177	if (!Info.IsTailCall)
1178	return false;
1179
1180	// Indirect calls can't be tail calls, because the address can be divergent.
1181	// TODO Check divergence info if the call really is divergent.
1182	if (Info.Callee.isReg())
1183	return false;
1184
1185	MachineFunction &MF = B.getMF();
1186	const Function &CallerF = MF.getFunction();
1187	CallingConv::ID CalleeCC = Info.CallConv;
1188	CallingConv::ID CallerCC = CallerF.getCallingConv();
1189
1190	const SIRegisterInfo *TRI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
1191	const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
1192	// Kernels aren't callable, and don't have a live in return address so it
1193	// doesn't make sense to do a tail call with entry functions.
1194	if (!CallerPreserved)
1195	return false;
1196
1197	if (!AMDGPU::mayTailCallThisCC(CC: CalleeCC)) {
1198	LLVM_DEBUG(dbgs() << "... Calling convention cannot be tail called.\n");
1199	return false;
1200	}
1201
1202	if (any_of(Range: CallerF.args(), P: [](const Argument &A) {
1203	return A.hasByValAttr() \|\| A.hasSwiftErrorAttr();
1204	})) {
1205	LLVM_DEBUG(dbgs() << "... Cannot tail call from callers with byval "
1206	"or swifterror arguments\n");
1207	return false;
1208	}
1209
1210	// If we have -tailcallopt, then we're done.
1211	if (MF.getTarget().Options.GuaranteedTailCallOpt) {
1212	return AMDGPU::canGuaranteeTCO(CC: CalleeCC) &&
1213	CalleeCC == CallerF.getCallingConv();
1214	}
1215
1216	// Verify that the incoming and outgoing arguments from the callee are
1217	// safe to tail call.
1218	if (!doCallerAndCalleePassArgsTheSameWay(Info, MF, InArgs)) {
1219	LLVM_DEBUG(
1220	dbgs()
1221	<< "... Caller and callee have incompatible calling conventions.\n");
1222	return false;
1223	}
1224
1225	// FIXME: We need to check if any arguments passed in SGPR are uniform. If
1226	// they are not, this cannot be a tail call. If they are uniform, but may be
1227	// VGPR, we need to insert readfirstlanes.
1228	if (!areCalleeOutgoingArgsTailCallable(Info, MF, OutArgs))
1229	return false;
1230
1231	LLVM_DEBUG(dbgs() << "... Call is eligible for tail call optimization.\n");
1232	return true;
1233	}
1234
1235	// Insert outgoing implicit arguments for a call, by inserting copies to the
1236	// implicit argument registers and adding the necessary implicit uses to the
1237	// call instruction.
1238	void AMDGPUCallLowering::handleImplicitCallArguments(
1239	MachineIRBuilder &MIRBuilder, MachineInstrBuilder &CallInst,
1240	const GCNSubtarget &ST, const SIMachineFunctionInfo &FuncInfo,
1241	CallingConv::ID CalleeCC,
1242	ArrayRef<std::pair<MCRegister, Register>> ImplicitArgRegs) const {
1243	if (!ST.hasFlatScratchEnabled()) {
1244	// Insert copies for the SRD. In the HSA case, this should be an identity
1245	// copy.
1246	auto ScratchRSrcReg = MIRBuilder.buildCopy(Res: LLT::fixed_vector(NumElements: `4`, ScalarSizeInBits: `32`),
1247	Op: FuncInfo.getScratchRSrcReg());
1248
1249	auto CalleeRSrcReg = AMDGPU::isChainCC(CC: CalleeCC)
1250	? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
1251	: AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3;
1252
1253	MIRBuilder.buildCopy(Res: CalleeRSrcReg, Op: ScratchRSrcReg);
1254	CallInst.addReg(RegNo: CalleeRSrcReg, Flags: RegState::Implicit);
1255	}
1256
1257	for (std::pair<MCRegister, Register> ArgReg : ImplicitArgRegs) {
1258	MIRBuilder.buildCopy(Res: (Register)ArgReg.first, Op: ArgReg.second);
1259	CallInst.addReg(RegNo: ArgReg.first, Flags: RegState::Implicit);
1260	}
1261	}
1262
namespace {
// Chain calls have special arguments that we need to handle. These have the
// same index as they do in the llvm.amdgcn.cs.chain intrinsic.
//
// Kept as an unscoped enum because the enumerators are used directly as
// indices into CallLoweringInfo::OrigArgs.
enum ChainCallArgIdx {
  Exec = 1,           // EXEC mask operand.
  Flags = 4,          // Constant flags operand; bit 0 selects dynamic VGPR mode.
  NumVGPRs = 5,       // First of the extra operands of a dynamic VGPR call.
  FallbackExec = 6,   // Fallback EXEC operand of a dynamic VGPR call.
  FallbackCallee = 7, // Presumably the fallback call target — TODO confirm.
};
} // anonymous namespace
1274
1275	bool AMDGPUCallLowering::lowerTailCall(
1276	MachineIRBuilder &MIRBuilder, CallLoweringInfo &Info,
1277	SmallVectorImpl<ArgInfo> &OutArgs) const {
1278	MachineFunction &MF = MIRBuilder.getMF();
1279	const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1280	SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1281	const Function &F = MF.getFunction();
1282	MachineRegisterInfo &MRI = MF.getRegInfo();
1283	const SIInstrInfo *TII = ST.getInstrInfo();
1284	const SIRegisterInfo *TRI = ST.getRegisterInfo();
1285	const SITargetLowering &TLI = *getTLI<SITargetLowering>();
1286
1287	// True when we're tail calling, but without -tailcallopt.
1288	bool IsSibCall = !MF.getTarget().Options.GuaranteedTailCallOpt;
1289
1290	// Find out which ABI gets to decide where things go.
1291	CallingConv::ID CalleeCC = Info.CallConv;
1292	CCAssignFn *AssignFnFixed;
1293	CCAssignFn *AssignFnVarArg;
1294	std::tie(args&: AssignFnFixed, args&: AssignFnVarArg) = getAssignFnsForCC(CC: CalleeCC, TLI);
1295
1296	MachineInstrBuilder CallSeqStart;
1297	if (!IsSibCall)
1298	CallSeqStart = MIRBuilder.buildInstr(Opcode: AMDGPU::ADJCALLSTACKUP);
1299
1300	bool IsChainCall = AMDGPU::isChainCC(CC: Info.CallConv);
1301	bool IsDynamicVGPRChainCall = false;
1302
1303	if (IsChainCall) {
1304	ArgInfo FlagsArg = Info.OrigArgs [ChainCallArgIdx::Flags];
1305	const APInt &FlagsValue = cast<ConstantInt>(Val: FlagsArg.OrigValue)->getValue();
1306	if (FlagsValue.isZero()) {
1307	if (Info.OrigArgs.size() != `5`) {
1308	LLVM_DEBUG(dbgs() << "No additional args allowed if flags == 0\n");
1309	return false;
1310	}
1311	} else if (FlagsValue.isOneBitSet(BitNo: `0`)) {
1312	IsDynamicVGPRChainCall = true;
1313
1314	if (Info.OrigArgs.size() != `8`) {
1315	LLVM_DEBUG(dbgs() << "Expected 3 additional args\n");
1316	return false;
1317	}
1318
1319	// On GFX12, we can only change the VGPR allocation for wave32.
1320	if (!ST.isWave32()) {
1321	F.getContext().diagnose(DI: DiagnosticInfoUnsupported (
1322	F, "dynamic VGPR mode is only supported for wave32"));
1323	return false;
1324	}
1325
1326	ArgInfo FallbackExecArg = Info.OrigArgs [ChainCallArgIdx::FallbackExec];
1327	assert(FallbackExecArg.Regs.size() == `1` &&
1328	"Expected single register for fallback EXEC");
1329	if (!FallbackExecArg.Ty->isIntegerTy(Bitwidth: ST.getWavefrontSize())) {
1330	LLVM_DEBUG(dbgs() << "Bad type for fallback EXEC\n");
1331	return false;
1332	}
1333	}
1334	}
1335
1336	unsigned Opc = getCallOpcode(CallerF: MF, IsIndirect: Info.Callee.isReg(), /IsTailCall/ true,
1337	IsWave32: ST.isWave32(), CC: CalleeCC, IsDynamicVGPRChainCall);
1338	auto MIB = MIRBuilder.buildInstrNoInsert(Opcode: Opc);
1339
1340	if (FuncInfo->isWholeWaveFunction())
1341	addOriginalExecToReturn(MF, Ret&: MIB);
1342
1343	// Keep track of the index of the next operand to be added to the call
1344	unsigned CalleeIdx = MIB ->getNumOperands();
1345
1346	if (!addCallTargetOperands(CallInst&: MIB, MIRBuilder, Info, IsDynamicVGPRChainCall))
1347	return false;
1348
1349	// Byte offset for the tail call. When we are sibcalling, this will always
1350	// be 0.
1351	MIB.addImm(Val: `0`);
1352
1353	// If this is a chain call, we need to pass in the EXEC mask as well as any
1354	// other special args.
1355	if (IsChainCall) {
1356	auto AddRegOrImm = [&](const ArgInfo &Arg) {
1357	if (auto CI = dyn_cast<ConstantInt>(Val: Arg.OrigValue)) {
1358	MIB.addImm(Val: CI->getSExtValue());
1359	} else {
1360	MIB.addReg(RegNo: Arg.Regs [`0`]);
1361	unsigned Idx = MIB ->getNumOperands() - `1`;
1362	MIB ->getOperand(i: Idx).setReg(constrainOperandRegClass(
1363	MF, TRI: TRI, MRI, TII: TII, RBI: ST.getRegBankInfo(), InsertPt&: MIB, II: MIB ->getDesc(),
1364	RegMO&: MIB ->getOperand(i: Idx), OpIdx: Idx));
1365	}
1366	};
1367
1368	ArgInfo ExecArg = Info.OrigArgs [ChainCallArgIdx::Exec];
1369	assert(ExecArg.Regs.size() == `1` && "Too many regs for EXEC");
1370
1371	if (!ExecArg.Ty->isIntegerTy(Bitwidth: ST.getWavefrontSize())) {
1372	LLVM_DEBUG(dbgs() << "Bad type for EXEC");
1373	return false;
1374	}
1375
1376	AddRegOrImm (ExecArg);
1377	if (IsDynamicVGPRChainCall)
1378	std::for_each(first: Info.OrigArgs.begin() + ChainCallArgIdx::NumVGPRs,
1379	last: Info.OrigArgs.end(), f: AddRegOrImm);
1380	}
1381
1382	// Tell the call which registers are clobbered.
1383	const uint32_t *Mask = TRI->getCallPreservedMask(MF, CalleeCC);
1384	MIB.addRegMask(Mask);
1385
1386	// FPDiff is the byte offset of the call's argument area from the callee's.
1387	// Stores to callee stack arguments will be placed in FixedStackSlots offset
1388	// by this amount for a tail call. In a sibling call it must be 0 because the
1389	// caller will deallocate the entire stack and the callee still expects its
1390	// arguments to begin at SP+0.
1391	int FPDiff = `0`;
1392
1393	// This will be 0 for sibcalls, potentially nonzero for tail calls produced
1394	// by -tailcallopt. For sibcalls, the memory operands for the call are
1395	// already available in the caller's incoming argument space.
1396	unsigned NumBytes = `0`;
1397	if (!IsSibCall) {
1398	// We aren't sibcalling, so we need to compute FPDiff. We need to do this
1399	// before handling assignments, because FPDiff must be known for memory
1400	// arguments.
1401	unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
1402	SmallVector<CCValAssign, `16`> OutLocs;
1403	CCState OutInfo(CalleeCC, false, MF, OutLocs, F.getContext());
1404
1405	// FIXME: Not accounting for callee implicit inputs
1406	OutgoingValueAssigner CalleeAssigner(AssignFnFixed, AssignFnVarArg);
1407	if (!determineAssignments(Assigner&: CalleeAssigner, Args&: OutArgs, CCInfo&: OutInfo))
1408	return false;
1409
1410	// The callee will pop the argument stack as a tail call. Thus, we must
1411	// keep it 16-byte aligned.
1412	NumBytes = alignTo(Size: OutInfo.getStackSize(), A: ST.getStackAlignment());
1413
1414	// FPDiff will be negative if this tail call requires more space than we
1415	// would automatically have in our incoming argument space. Positive if we
1416	// actually shrink the stack.
1417	FPDiff = NumReusableBytes - NumBytes;
1418
1419	// The stack pointer must be 16-byte aligned at all times it's used for a
1420	// memory operation, which in practice means at all* times and in*
1421	// particular across call boundaries. Therefore our own arguments started at
1422	// a 16-byte aligned SP and the delta applied for the tail call should
1423	// satisfy the same constraint.
1424	assert(isAligned(ST.getStackAlignment(), FPDiff) &&
1425	"unaligned stack on tail call");
1426	}
1427
1428	SmallVector<CCValAssign, `16`> ArgLocs;
1429	CCState CCInfo(Info.CallConv, Info.IsVarArg, MF, ArgLocs, F.getContext());
1430
1431	// We could pass MIB and directly add the implicit uses to the call
1432	// now. However, as an aesthetic choice, place implicit argument operands
1433	// after the ordinary user argument registers.
1434	SmallVector<std::pair<MCRegister, Register>, `12`> ImplicitArgRegs;
1435
1436	if (Info.CallConv != CallingConv::AMDGPU_Gfx &&
1437	Info.CallConv != CallingConv::AMDGPU_Gfx_WholeWave &&
1438	!AMDGPU::isChainCC(CC: Info.CallConv)) {
1439	// With a fixed ABI, allocate fixed registers before user arguments.
1440	if (!passSpecialInputs(MIRBuilder, CCInfo, ArgRegs&: ImplicitArgRegs, Info))
1441	return false;
1442	}
1443
1444	// Mark the scratch resource descriptor as allocated so the CC analysis
1445	// does not assign user arguments to these registers, matching the callee.
1446	if (!ST.hasFlatScratchEnabled())
1447	CCInfo.AllocateReg(Reg: FuncInfo->getScratchRSrcReg());
1448
1449	OutgoingValueAssigner Assigner(AssignFnFixed, AssignFnVarArg);
1450
1451	if (!determineAssignments(Assigner, Args&: OutArgs, CCInfo))
1452	return false;
1453
1454	// Do the actual argument marshalling.
1455	AMDGPUOutgoingArgHandler Handler(MIRBuilder, MRI, MIB, true, FPDiff);
1456	if (!handleAssignments(Handler, Args&: OutArgs, CCState&: CCInfo, ArgLocs, MIRBuilder))
1457	return false;
1458
1459	if (Info.ConvergenceCtrlToken) {
1460	MIB.addUse(RegNo: Info.ConvergenceCtrlToken, Flags: RegState::Implicit);
1461	}
1462	handleImplicitCallArguments(MIRBuilder, CallInst&: MIB, ST, FuncInfo: *FuncInfo, CalleeCC,
1463	ImplicitArgRegs);
1464
1465	// If we have -tailcallopt, we need to adjust the stack. We'll do the call
1466	// sequence start and end here.
1467	if (!IsSibCall) {
1468	MIB ->getOperand(i: CalleeIdx + `1`).setImm(FPDiff);
1469	CallSeqStart.addImm(Val: NumBytes).addImm(Val: `0`);
1470	// End the call sequence before* emitting the call. Normally, we would*
1471	// tidy the frame up after the call. However, here, we've laid out the
1472	// parameters so that when SP is reset, they will be in the correct
1473	// location.
1474	MIRBuilder.buildInstr(Opcode: AMDGPU::ADJCALLSTACKDOWN).addImm(Val: NumBytes).addImm(Val: `0`);
1475	}
1476
1477	// Now we can add the actual call instruction to the correct basic block.
1478	MIRBuilder.insertInstr(MIB);
1479
1480	// If this is a whole wave tail call, we need to constrain the register for
1481	// the original EXEC.
1482	if (MIB ->getOpcode() == AMDGPU::SI_TCRETURN_GFX_WholeWave) {
1483	MIB ->getOperand(i: `0`).setReg(
1484	constrainOperandRegClass(MF, TRI: TRI, MRI, TII: TII, RBI: *ST.getRegBankInfo(),
1485	InsertPt&: *MIB, II: MIB ->getDesc(), RegMO&: MIB ->getOperand(i: `0`), OpIdx: `0`));
1486	}
1487
1488	// If Callee is a reg, since it is used by a target specific
1489	// instruction, it must have a register class matching the
1490	// constraint of that instruction.
1491
1492	// FIXME: We should define regbankselectable call instructions to handle
1493	// divergent call targets.
1494	if (MIB ->getOperand(i: CalleeIdx).isReg()) {
1495	MIB ->getOperand(i: CalleeIdx).setReg(constrainOperandRegClass(
1496	MF, TRI: TRI, MRI, TII: TII, RBI: ST.getRegBankInfo(), InsertPt&: MIB, II: MIB ->getDesc(),
1497	RegMO&: MIB ->getOperand(i: CalleeIdx), OpIdx: CalleeIdx));
1498	}
1499
1500	MF.getFrameInfo().setHasTailCall();
1501	Info.LoweredTailCall = true;
1502	return true;
1503	}
1504
1505	/// Lower a call to the @llvm.amdgcn.cs.chain intrinsic.
1506	bool AMDGPUCallLowering::lowerChainCall(MachineIRBuilder &MIRBuilder,
1507	CallLoweringInfo &Info) const {
1508	ArgInfo Callee = Info.OrigArgs [`0`];
1509	ArgInfo SGPRArgs = Info.OrigArgs [`2`];
1510	ArgInfo VGPRArgs = Info.OrigArgs [`3`];
1511
1512	MachineFunction &MF = MIRBuilder.getMF();
1513	const Function &F = MF.getFunction();
1514	const DataLayout &DL = F.getDataLayout();
1515
1516	// The function to jump to is actually the first argument, so we'll change the
1517	// Callee and other info to match that before using our existing helper.
1518	const Value *CalleeV = Callee.OrigValue->stripPointerCasts();
1519	if (const Function *F = dyn_cast<Function>(Val: CalleeV)) {
1520	Info.Callee = MachineOperand::CreateGA(GV: F, Offset: `0`);
1521	Info.CallConv = F->getCallingConv();
1522	} else {
1523	assert(Callee.Regs.size() == `1` && "Too many regs for the callee");
1524	Info.Callee = MachineOperand::CreateReg(Reg: Callee.Regs [`0`], isDef: false);
1525	Info.CallConv = CallingConv::AMDGPU_CS_Chain; // amdgpu_cs_chain_preserve
1526	// behaves the same here.
1527	}
1528
1529	// The function that we're calling cannot be vararg (only the intrinsic is).
1530	Info.IsVarArg = false;
1531
1532	assert(
1533	all_of(SGPRArgs.Flags, [](ISD::ArgFlagsTy F) { return F.isInReg(); }) &&
1534	"SGPR arguments should be marked inreg");
1535	assert(
1536	none_of(VGPRArgs.Flags, [](ISD::ArgFlagsTy F) { return F.isInReg(); }) &&
1537	"VGPR arguments should not be marked inreg");
1538
1539	SmallVector<ArgInfo, `8`> OutArgs;
1540	splitToValueTypes(OrigArgInfo: SGPRArgs, SplitArgs&: OutArgs, DL, CallConv: Info.CallConv);
1541	splitToValueTypes(OrigArgInfo: VGPRArgs, SplitArgs&: OutArgs, DL, CallConv: Info.CallConv);
1542
1543	Info.IsMustTailCall = true;
1544	return lowerTailCall(MIRBuilder, Info, OutArgs);
1545	}
1546
1547	bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
1548	CallLoweringInfo &Info) const {
1549	if (Function *F = Info.CB->getCalledFunction())
1550	if (F->isIntrinsic()) {
1551	switch (F->getIntrinsicID()) {
1552	case Intrinsic::amdgcn_cs_chain:
1553	return lowerChainCall(MIRBuilder, Info);
1554	case Intrinsic::amdgcn_call_whole_wave:
1555	Info.CallConv = CallingConv::AMDGPU_Gfx_WholeWave;
1556
1557	// Get the callee from the original instruction, so it doesn't look like
1558	// this is an indirect call.
1559	Info.Callee = MachineOperand::CreateGA(
1560	GV: cast<GlobalValue>(Val: Info.CB->getOperand(i_nocapture: `0`)), /Offset=/`0`);
1561	Info.OrigArgs.erase(CI: Info.OrigArgs.begin());
1562	Info.IsVarArg = false;
1563	break;
1564	default:
1565	llvm_unreachable("Unexpected intrinsic call");
1566	}
1567	}
1568
1569	if (Info.IsVarArg) {
1570	LLVM_DEBUG(dbgs() << "Variadic functions not implemented\n");
1571	return false;
1572	}
1573
1574	MachineFunction &MF = MIRBuilder.getMF();
1575	const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1576	const SIRegisterInfo *TRI = ST.getRegisterInfo();
1577
1578	const Function &F = MF.getFunction();
1579	MachineRegisterInfo &MRI = MF.getRegInfo();
1580	const SITargetLowering &TLI = *getTLI<SITargetLowering>();
1581	const DataLayout &DL = F.getDataLayout();
1582
1583	SmallVector<ArgInfo, `8`> OutArgs;
1584	for (auto &OrigArg : Info.OrigArgs)
1585	splitToValueTypes(OrigArgInfo: OrigArg, SplitArgs&: OutArgs, DL, CallConv: Info.CallConv);
1586
1587	SmallVector<ArgInfo, `8`> InArgs;
1588	if (Info.CanLowerReturn && !Info.OrigRet.Ty->isVoidTy())
1589	splitToValueTypes(OrigArgInfo: Info.OrigRet, SplitArgs&: InArgs, DL, CallConv: Info.CallConv);
1590
1591	// If we can lower as a tail call, do that instead.
1592	bool CanTailCallOpt =
1593	isEligibleForTailCallOptimization(B&: MIRBuilder, Info, InArgs, OutArgs);
1594
1595	// We must emit a tail call if we have musttail.
1596	if (Info.IsMustTailCall && !CanTailCallOpt) {
1597	LLVM_DEBUG(dbgs() << "Failed to lower musttail call as tail call\n");
1598	return false;
1599	}
1600
1601	Info.IsTailCall = CanTailCallOpt;
1602	if (CanTailCallOpt)
1603	return lowerTailCall(MIRBuilder, Info, OutArgs);
1604
1605	// Find out which ABI gets to decide where things go.
1606	CCAssignFn *AssignFnFixed;
1607	CCAssignFn *AssignFnVarArg;
1608	std::tie(args&: AssignFnFixed, args&: AssignFnVarArg) =
1609	getAssignFnsForCC(CC: Info.CallConv, TLI);
1610
1611	MIRBuilder.buildInstr(Opcode: AMDGPU::ADJCALLSTACKUP)
1612	.addImm(Val: `0`)
1613	.addImm(Val: `0`);
1614
1615	// Create a temporarily-floating call instruction so we can add the implicit
1616	// uses of arg registers.
1617	unsigned Opc = getCallOpcode(CallerF: MF, IsIndirect: Info.Callee.isReg(), IsTailCall: false, IsWave32: ST.isWave32(),
1618	CC: Info.CallConv);
1619
1620	auto MIB = MIRBuilder.buildInstrNoInsert(Opcode: Opc);
1621	MIB.addDef(RegNo: TRI->getReturnAddressReg(MF));
1622
1623	if (!Info.IsConvergent)
1624	MIB.setMIFlag(MachineInstr::NoConvergent);
1625
1626	if (!addCallTargetOperands(CallInst&: MIB, MIRBuilder, Info))
1627	return false;
1628
1629	// Tell the call which registers are clobbered.
1630	const uint32_t *Mask = TRI->getCallPreservedMask(MF, Info.CallConv);
1631	MIB.addRegMask(Mask);
1632
1633	SmallVector<CCValAssign, `16`> ArgLocs;
1634	CCState CCInfo(Info.CallConv, Info.IsVarArg, MF, ArgLocs, F.getContext());
1635
1636	// We could pass MIB and directly add the implicit uses to the call
1637	// now. However, as an aesthetic choice, place implicit argument operands
1638	// after the ordinary user argument registers.
1639	SmallVector<std::pair<MCRegister, Register>, `12`> ImplicitArgRegs;
1640
1641	if (Info.CallConv != CallingConv::AMDGPU_Gfx &&
1642	Info.CallConv != CallingConv::AMDGPU_Gfx_WholeWave) {
1643	// With a fixed ABI, allocate fixed registers before user arguments.
1644	if (!passSpecialInputs(MIRBuilder, CCInfo, ArgRegs&: ImplicitArgRegs, Info))
1645	return false;
1646	}
1647
1648	// Mark the scratch resource descriptor as allocated so the CC analysis
1649	// does not assign user arguments to these registers, matching the callee.
1650	if (!ST.hasFlatScratchEnabled()) {
1651	const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1652	CCInfo.AllocateReg(Reg: FuncInfo->getScratchRSrcReg());
1653	}
1654
1655	// Do the actual argument marshalling.
1656	OutgoingValueAssigner Assigner(AssignFnFixed, AssignFnVarArg);
1657	if (!determineAssignments(Assigner, Args&: OutArgs, CCInfo))
1658	return false;
1659
1660	AMDGPUOutgoingArgHandler Handler(MIRBuilder, MRI, MIB, false);
1661	if (!handleAssignments(Handler, Args&: OutArgs, CCState&: CCInfo, ArgLocs, MIRBuilder))
1662	return false;
1663
1664	const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1665
1666	if (Info.ConvergenceCtrlToken) {
1667	MIB.addUse(RegNo: Info.ConvergenceCtrlToken, Flags: RegState::Implicit);
1668	}
1669	handleImplicitCallArguments(MIRBuilder, CallInst&: MIB, ST, FuncInfo: *MFI, CalleeCC: Info.CallConv,
1670	ImplicitArgRegs);
1671
1672	// Get a count of how many bytes are to be pushed on the stack.
1673	unsigned NumBytes = CCInfo.getStackSize();
1674
1675	// If Callee is a reg, since it is used by a target specific
1676	// instruction, it must have a register class matching the
1677	// constraint of that instruction.
1678
1679	// FIXME: We should define regbankselectable call instructions to handle
1680	// divergent call targets.
1681	if (MIB ->getOperand(i: `1`).isReg()) {
1682	MIB ->getOperand(i: `1`).setReg(constrainOperandRegClass(
1683	MF, TRI: TRI, MRI, TII: ST.getInstrInfo(),
1684	RBI: ST.getRegBankInfo(), InsertPt&: MIB, II: MIB ->getDesc(), RegMO&: MIB ->getOperand(i: `1`),
1685	OpIdx: `1`));
1686	}
1687
1688	// Now we can add the actual call instruction to the correct position.
1689	MIRBuilder.insertInstr(MIB);
1690
1691	// Finally we can copy the returned value back into its virtual-register. In
1692	// symmetry with the arguments, the physical register must be an
1693	// implicit-define of the call instruction.
1694	if (Info.CanLowerReturn && !Info.OrigRet.Ty->isVoidTy()) {
1695	CCAssignFn *RetAssignFn = TLI.CCAssignFnForReturn(CC: Info.CallConv,
1696	IsVarArg: Info.IsVarArg);
1697	IncomingValueAssigner Assigner(RetAssignFn);
1698	CallReturnHandler Handler(MIRBuilder, MRI, MIB);
1699	if (!determineAndHandleAssignments(Handler, Assigner, Args&: InArgs, MIRBuilder,
1700	CallConv: Info.CallConv, IsVarArg: Info.IsVarArg))
1701	return false;
1702	}
1703
1704	uint64_t CalleePopBytes = NumBytes;
1705
1706	MIRBuilder.buildInstr(Opcode: AMDGPU::ADJCALLSTACKDOWN)
1707	.addImm(Val: `0`)
1708	.addImm(Val: CalleePopBytes);
1709
1710	if (!Info.CanLowerReturn) {
1711	insertSRetLoads(MIRBuilder, RetTy: Info.OrigRet.Ty, VRegs: Info.OrigRet.Regs,
1712	DemoteReg: Info.DemoteRegister, FI: Info.DemoteStackIndex);
1713	}
1714
1715	return true;
1716	}
1717
1718	void AMDGPUCallLowering::addOriginalExecToReturn(
1719	MachineFunction &MF, MachineInstrBuilder &Ret) const {
1720	const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1721	const SIInstrInfo *TII = ST.getInstrInfo();
1722	const MachineInstr *Setup = TII->getWholeWaveFunctionSetup(MF);
1723	Ret.addReg(RegNo: Setup->getOperand(i: `0`).getReg());
1724	}
1725

Browse the source code of llvm_projects/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp