1//===- llvm/lib/Target/X86/X86ISelCallLowering.cpp - Call lowering --------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This file implements the lowering of LLVM calls to DAG nodes.
11//
12//===----------------------------------------------------------------------===//
13
14#include "MCTargetDesc/X86MCAsmInfo.h"
15#include "X86.h"
16#include "X86CallingConv.h"
17#include "X86FrameLowering.h"
18#include "X86ISelLowering.h"
19#include "X86InstrBuilder.h"
20#include "X86MachineFunctionInfo.h"
21#include "X86TargetMachine.h"
22#include "llvm/ADT/Statistic.h"
23#include "llvm/Analysis/ObjCARCUtil.h"
24#include "llvm/CodeGen/MachineJumpTableInfo.h"
25#include "llvm/CodeGen/MachineModuleInfo.h"
26#include "llvm/CodeGen/WinEHFuncInfo.h"
27#include "llvm/IR/DiagnosticInfo.h"
28#include "llvm/IR/IRBuilder.h"
29#include "llvm/IR/Module.h"
30#include "llvm/Transforms/CFGuard.h"
31
32#define DEBUG_TYPE "x86-isel"
33
34using namespace llvm;
35
36STATISTIC(NumTailCalls, "Number of tail calls");
37
/// Call this when the user attempts to do something unsupported, like
/// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike
/// report_fatal_error, so calling code should attempt to recover without
/// crashing.
///
/// \param DAG  the DAG being lowered (supplies MachineFunction and context).
/// \param dl   location whose DebugLoc is attached to the diagnostic.
/// \param Msg  human-readable description of the unsupported construct.
static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl,
                             const char *Msg) {
  MachineFunction &MF = DAG.getMachineFunction();
  // Emit a non-fatal diagnostic tied to the current function; compilation
  // continues so the caller can substitute a fallback lowering.
  DAG.getContext()->diagnose(
      DI: DiagnosticInfoUnsupported(MF.getFunction(), Msg, dl.getDebugLoc()));
}
48
49/// Returns true if a CC can dynamically exclude a register from the list of
50/// callee-saved-registers (TargetRegistryInfo::getCalleeSavedRegs()) based on
51/// the return registers.
52static bool shouldDisableRetRegFromCSR(CallingConv::ID CC) {
53 switch (CC) {
54 default:
55 return false;
56 case CallingConv::X86_RegCall:
57 case CallingConv::PreserveMost:
58 case CallingConv::PreserveAll:
59 return true;
60 }
61}
62
63/// Returns true if a CC can dynamically exclude a register from the list of
64/// callee-saved-registers (TargetRegistryInfo::getCalleeSavedRegs()) based on
65/// the parameters.
66static bool shouldDisableArgRegFromCSR(CallingConv::ID CC) {
67 return CC == CallingConv::X86_RegCall;
68}
69
/// Decide which register type (and how many registers) to use for a vXi1
/// mask argument/return of \p NumElts elements under calling convention
/// \p CC on an AVX512 subtarget.
/// \returns {MVT::INVALID_SIMPLE_VALUE_TYPE, 0} when no special handling
/// applies and the generic breakdown should be used instead.
static std::pair<MVT, unsigned>
handleMaskRegisterForCallingConv(unsigned NumElts, CallingConv::ID CC,
                                 const X86Subtarget &Subtarget) {
  // v2i1/v4i1/v8i1/v16i1 all pass in xmm registers unless the calling
  // convention is one that uses k registers.
  if (NumElts == 2)
    return {MVT::v2i64, 1};
  if (NumElts == 4)
    return {MVT::v4i32, 1};
  if (NumElts == 8 && CC != CallingConv::X86_RegCall &&
      CC != CallingConv::Intel_OCL_BI)
    return {MVT::v8i16, 1};
  if (NumElts == 16 && CC != CallingConv::X86_RegCall &&
      CC != CallingConv::Intel_OCL_BI)
    return {MVT::v16i8, 1};
  // v32i1 passes in ymm unless we have BWI and the calling convention is
  // regcall.
  if (NumElts == 32 && (!Subtarget.hasBWI() || CC != CallingConv::X86_RegCall))
    return {MVT::v32i8, 1};
  // Split v64i1 vectors if we don't have v64i8 available.
  if (NumElts == 64 && Subtarget.hasBWI() && CC != CallingConv::X86_RegCall) {
    if (Subtarget.useAVX512Regs())
      return {MVT::v64i8, 1};
    return {MVT::v32i8, 2};
  }

  // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
  if (!isPowerOf2_32(Value: NumElts) || (NumElts == 64 && !Subtarget.hasBWI()) ||
      NumElts > 64)
    return {MVT::i8, NumElts};

  // No special case matched; defer to target-independent logic.
  return {MVT::INVALID_SIMPLE_VALUE_TYPE, 0};
}
103
/// Return the register type used to pass/return \p VT for calling convention
/// \p CC. X86 overrides the generic answer for AVX512 vXi1 masks, narrow f16
/// vectors, f64/f80 on 32-bit targets without x87, and bf16 when f16 is legal.
MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
                                                     CallingConv::ID CC,
                                                     EVT VT) const {
  if (VT.isVector()) {
    if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512()) {
      unsigned NumElts = VT.getVectorNumElements();

      MVT RegisterVT;
      unsigned NumRegisters;
      std::tie(args&: RegisterVT, args&: NumRegisters) =
          handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
      if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
        return RegisterVT;
    }

    // Narrow f16 vectors are widened to a full 128-bit v8f16 register.
    if (VT.getVectorElementType() == MVT::f16 && VT.getVectorNumElements() < 8)
      return MVT::v8f16;
  }

  // We will use more GPRs for f64 and f80 on 32 bits when x87 is disabled.
  if ((VT == MVT::f64 || VT == MVT::f80) && !Subtarget.is64Bit() &&
      !Subtarget.hasX87())
    return MVT::i32;

  // When f16 is legal, bf16 travels in the same registers as f16.
  if (isTypeLegal(VT: MVT::f16)) {
    if (VT.isVector() && VT.getVectorElementType() == MVT::bf16)
      return getRegisterTypeForCallingConv(
          Context, CC, VT: VT.changeVectorElementType(Context, EltVT: MVT::f16));

    if (VT == MVT::bf16)
      return MVT::f16;
  }

  return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
}
139
/// Return how many registers \p VT occupies for calling convention \p CC.
/// Must mirror the special cases in getRegisterTypeForCallingConv above so
/// the (type, count) pair stays consistent.
unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
                                                          CallingConv::ID CC,
                                                          EVT VT) const {
  if (VT.isVector()) {
    if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512()) {
      unsigned NumElts = VT.getVectorNumElements();

      MVT RegisterVT;
      unsigned NumRegisters;
      std::tie(args&: RegisterVT, args&: NumRegisters) =
          handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
      if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
        return NumRegisters;
    }

    // Narrow f16 vectors fit in a single (widened) v8f16 register.
    if (VT.getVectorElementType() == MVT::f16 && VT.getVectorNumElements() < 8)
      return 1;
  }

  // We have to split f64 to 2 registers and f80 to 3 registers on 32 bits if
  // x87 is disabled.
  if (!Subtarget.is64Bit() && !Subtarget.hasX87()) {
    if (VT == MVT::f64)
      return 2;
    if (VT == MVT::f80)
      return 3;
  }

  // bf16 vectors use the same register count as the matching f16 vectors.
  if (VT.isVector() && VT.getVectorElementType() == MVT::bf16 &&
      isTypeLegal(VT: MVT::f16))
    return getNumRegistersForCallingConv(
        Context, CC, VT: VT.changeVectorElementType(Context, EltVT: MVT::f16));

  return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
}
175
/// Compute the intermediate-value/register breakdown for a vector argument,
/// handling the X86-specific vXi1 and bf16 cases before deferring to the
/// generic TargetLowering implementation.
unsigned X86TargetLowering::getVectorTypeBreakdownForCallingConv(
    LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
    unsigned &NumIntermediates, MVT &RegisterVT) const {
  // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
  if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
      Subtarget.hasAVX512() &&
      (!isPowerOf2_32(Value: VT.getVectorNumElements()) ||
       (VT.getVectorNumElements() == 64 && !Subtarget.hasBWI()) ||
       VT.getVectorNumElements() > 64)) {
    RegisterVT = MVT::i8;
    IntermediateVT = MVT::i1;
    NumIntermediates = VT.getVectorNumElements();
    return NumIntermediates;
  }

  // Split v64i1 vectors if we don't have v64i8 available.
  if (VT == MVT::v64i1 && Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
      CC != CallingConv::X86_RegCall) {
    RegisterVT = MVT::v32i8;
    IntermediateVT = MVT::v32i1;
    NumIntermediates = 2;
    return 2;
  }

  // Split vNbf16 vectors according to vNf16.
  if (VT.isVector() && VT.getVectorElementType() == MVT::bf16 &&
      isTypeLegal(VT: MVT::f16))
    VT = VT.changeVectorElementType(Context, EltVT: MVT::f16);

  return TargetLowering::getVectorTypeBreakdownForCallingConv(Context, CC, VT, IntermediateVT,
                                                              NumIntermediates, RegisterVT);
}
208
/// Return the value type produced by a setcc of \p VT: i8 for scalars;
/// a vXi1 mask where AVX512(+VLX/BWI) supports it; otherwise an integer
/// vector of matching element width.
EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
                                          LLVMContext& Context,
                                          EVT VT) const {
  if (!VT.isVector())
    return MVT::i8;

  if (Subtarget.hasAVX512()) {
    // Figure out what this type will be legalized to.
    EVT LegalVT = VT;
    while (getTypeAction(Context, VT: LegalVT) != TypeLegal)
      LegalVT = getTypeToTransformTo(Context, VT: LegalVT);

    // If we got a 512-bit vector then we'll definitely have a vXi1 compare.
    if (LegalVT.getSimpleVT().is512BitVector())
      return EVT::getVectorVT(Context, VT: MVT::i1, EC: VT.getVectorElementCount());

    if (LegalVT.getSimpleVT().isVector() && Subtarget.hasVLX()) {
      // If we legalized to less than a 512-bit vector, then we will use a vXi1
      // compare for vXi32/vXi64 for sure. If we have BWI we will also support
      // vXi16/vXi8.
      MVT EltVT = LegalVT.getSimpleVT().getVectorElementType();
      if (Subtarget.hasBWI() || EltVT.getSizeInBits() >= 32)
        return EVT::getVectorVT(Context, VT: MVT::i1, EC: VT.getVectorElementCount());
    }
  }

  // Fall back to an integer vector with the same element count as VT.
  return VT.changeVectorElementTypeToInteger();
}
237
238bool X86TargetLowering::functionArgumentNeedsConsecutiveRegisters(
239 Type *Ty, CallingConv::ID CallConv, bool isVarArg,
240 const DataLayout &DL) const {
241 // On x86-64 i128 is split into two i64s and needs to be allocated to two
242 // consecutive registers, or spilled to the stack as a whole. On x86-32 i128
243 // is split to four i32s and never actually passed in registers, but we use
244 // the consecutive register mark to match it in TableGen.
245 if (Ty->isIntegerTy(Bitwidth: 128))
246 return true;
247
248 // On x86-32, fp128 acts the same as i128.
249 if (Subtarget.is32Bit() && Ty->isFP128Ty())
250 return true;
251
252 return false;
253}
254
/// Helper for getByValTypeAlignment to determine
/// the desired ByVal argument alignment.
/// Recurses through arrays and structs looking for a 128-bit vector; 16 is
/// the maximum alignment of interest, so recursion stops once it is reached.
///
/// \param Ty        the type being inspected.
/// \param MaxAlign  [in,out] running maximum alignment found so far.
static void getMaxByValAlign(Type *Ty, Align &MaxAlign) {
  // Already at the cap; nothing larger can be found.
  if (MaxAlign == 16)
    return;
  if (VectorType *VTy = dyn_cast<VectorType>(Val: Ty)) {
    // A 128-bit (SSE) vector forces 16-byte alignment.
    if (VTy->getPrimitiveSizeInBits().getFixedValue() == 128)
      MaxAlign = Align(16);
  } else if (ArrayType *ATy = dyn_cast<ArrayType>(Val: Ty)) {
    Align EltAlign;
    getMaxByValAlign(Ty: ATy->getElementType(), MaxAlign&: EltAlign);
    if (EltAlign > MaxAlign)
      MaxAlign = EltAlign;
  } else if (StructType *STy = dyn_cast<StructType>(Val: Ty)) {
    for (auto *EltTy : STy->elements()) {
      Align EltAlign;
      getMaxByValAlign(Ty: EltTy, MaxAlign&: EltAlign);
      if (EltAlign > MaxAlign)
        MaxAlign = EltAlign;
      // Stop scanning struct members once the cap is reached.
      if (MaxAlign == 16)
        break;
    }
  }
}
279
/// Return the desired alignment for ByVal aggregate
/// function arguments in the caller parameter area. For X86, aggregates
/// that contain SSE vectors are placed at 16-byte boundaries while the rest
/// are at 4-byte boundaries.
Align X86TargetLowering::getByValTypeAlignment(Type *Ty,
                                               const DataLayout &DL) const {
  // On x86-64 use the ABI alignment, but never less than 8 bytes.
  if (Subtarget.is64Bit())
    return std::max(a: DL.getABITypeAlign(Ty), b: Align::Constant<8>());

  // 32-bit default is 4 bytes; bump to 16 if the type contains an SSE vector.
  Align Alignment(4);
  if (Subtarget.hasSSE1())
    getMaxByValAlign(Ty, MaxAlign&: Alignment);
  return Alignment;
}
294
/// It returns EVT::Other if the type should be determined using generic
/// target-independent logic.
/// For vector ops we check that the overall size isn't larger than our
/// preferred vector width.
/// Picks the widest type the subtarget handles well, from 512-bit vectors
/// down to i32, honoring NoImplicitFloat and unaligned-access penalties.
EVT X86TargetLowering::getOptimalMemOpType(
    LLVMContext &Context, const MemOp &Op,
    const AttributeList &FuncAttributes) const {
  // Vector/FP types are only eligible when implicit float use is allowed.
  if (!FuncAttributes.hasFnAttr(Kind: Attribute::NoImplicitFloat)) {
    if (Op.size() >= 16 &&
        (!Subtarget.isUnalignedMem16Slow() || Op.isAligned(AlignCheck: Align(16)))) {
      // FIXME: Check if unaligned 64-byte accesses are slow.
      if (Op.size() >= 64 && Subtarget.hasAVX512() &&
          (Subtarget.getPreferVectorWidth() >= 512)) {
        return Subtarget.hasBWI() ? MVT::v64i8 : MVT::v16i32;
      }
      // FIXME: Check if unaligned 32-byte accesses are slow.
      if (Op.size() >= 32 && Subtarget.hasAVX() &&
          Subtarget.useLight256BitInstructions()) {
        // Although this isn't a well-supported type for AVX1, we'll let
        // legalization and shuffle lowering produce the optimal codegen. If we
        // choose an optimal type with a vector element larger than a byte,
        // getMemsetStores() may create an intermediate splat (using an integer
        // multiply) before we splat as a vector.
        return MVT::v32i8;
      }
      if (Subtarget.hasSSE2() && (Subtarget.getPreferVectorWidth() >= 128))
        return MVT::v16i8;
      // TODO: Can SSE1 handle a byte vector?
      // If we have SSE1 registers we should be able to use them.
      if (Subtarget.hasSSE1() && (Subtarget.is64Bit() || Subtarget.hasX87()) &&
          (Subtarget.getPreferVectorWidth() >= 128))
        return MVT::v4f32;
    } else if (((Op.isMemcpy() && !Op.isMemcpyStrSrc()) || Op.isZeroMemset()) &&
               Op.size() >= 8 && !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
      // Do not use f64 to lower memcpy if source is string constant. It's
      // better to use i32 to avoid the loads.
      // Also, do not use f64 to lower memset unless this is a memset of zeros.
      // The gymnastics of splatting a byte value into an XMM register and then
      // only using 8-byte stores (because this is a CPU with slow unaligned
      // 16-byte accesses) makes that a loser.
      return MVT::f64;
    }
  }
  // This is a compromise. If we reach here, unaligned accesses may be slow on
  // this target. However, creating smaller, aligned accesses could be even
  // slower and would certainly be a lot more code.
  if (Subtarget.is64Bit() && Op.size() >= 8)
    return MVT::i64;
  return MVT::i32;
}
345
346bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
347 if (VT == MVT::f32)
348 return Subtarget.hasSSE1();
349 if (VT == MVT::f64)
350 return Subtarget.hasSSE2();
351 return true;
352}
353
354static bool isBitAligned(Align Alignment, uint64_t SizeInBits) {
355 return (8 * Alignment.value()) % SizeInBits == 0;
356}
357
/// Heuristic: an access is "fast" when it is naturally bit-aligned, or when
/// the subtarget handles unaligned accesses of that width without penalty.
bool X86TargetLowering::isMemoryAccessFast(EVT VT, Align Alignment) const {
  if (isBitAligned(Alignment, SizeInBits: VT.getSizeInBits()))
    return true;
  switch (VT.getSizeInBits()) {
  default:
    // 8-byte and under are always assumed to be fast.
    return true;
  case 128:
    return !Subtarget.isUnalignedMem16Slow();
  case 256:
    return !Subtarget.isUnalignedMem32Slow();
  // TODO: What about AVX-512 (512-bit) accesses?
  }
}
372
/// X86 permits misaligned accesses of any size except non-temporal vector
/// ops, which carry alignment requirements. \p Fast (if non-null) receives
/// whether the access is expected to be fast on this subtarget.
bool X86TargetLowering::allowsMisalignedMemoryAccesses(
    EVT VT, unsigned, Align Alignment, MachineMemOperand::Flags Flags,
    unsigned *Fast) const {
  if (Fast)
    *Fast = isMemoryAccessFast(VT, Alignment);
  // NonTemporal vector memory ops must be aligned.
  if (!!(Flags & MachineMemOperand::MONonTemporal) && VT.isVector()) {
    // NT loads can only be vector aligned, so if its less aligned than the
    // minimum vector size (which we can split the vector down to), we might as
    // well use a regular unaligned vector load.
    // We don't have any NT loads pre-SSE41.
    if (!!(Flags & MachineMemOperand::MOLoad))
      return (Alignment < 16 || !Subtarget.hasSSE41());
    // NT stores are never allowed misaligned.
    return false;
  }
  // Misaligned accesses of any size are always allowed.
  return true;
}
391
/// Like allowsMisalignedMemoryAccesses, but additionally verifies that
/// non-temporal vector ops of the given size actually exist on this
/// subtarget (per SSE/AVX feature level).
bool X86TargetLowering::allowsMemoryAccess(LLVMContext &Context,
                                           const DataLayout &DL, EVT VT,
                                           unsigned AddrSpace, Align Alignment,
                                           MachineMemOperand::Flags Flags,
                                           unsigned *Fast) const {
  if (Fast)
    *Fast = isMemoryAccessFast(VT, Alignment);
  if (!!(Flags & MachineMemOperand::MONonTemporal) && VT.isVector()) {
    // If the misaligned path accepts it (it will be lowered as a regular
    // access), we are done.
    if (allowsMisalignedMemoryAccesses(VT, AddrSpace, Alignment, Flags,
                                       /*Fast=*/nullptr))
      return true;
    // NonTemporal vector memory ops are special, and must be aligned.
    if (!isBitAligned(Alignment, SizeInBits: VT.getSizeInBits()))
      return false;
    switch (VT.getSizeInBits()) {
    case 128:
      // 128-bit NT loads need SSE4.1; NT stores need SSE2.
      if (!!(Flags & MachineMemOperand::MOLoad) && Subtarget.hasSSE41())
        return true;
      if (!!(Flags & MachineMemOperand::MOStore) && Subtarget.hasSSE2())
        return true;
      return false;
    case 256:
      // 256-bit NT loads need AVX2; NT stores need AVX.
      if (!!(Flags & MachineMemOperand::MOLoad) && Subtarget.hasAVX2())
        return true;
      if (!!(Flags & MachineMemOperand::MOStore) && Subtarget.hasAVX())
        return true;
      return false;
    case 512:
      if (Subtarget.hasAVX512())
        return true;
      return false;
    default:
      return false; // Don't have NonTemporal vector memory ops of this size.
    }
  }
  return true;
}
429
/// Return the entry encoding for a jump table in the
/// current function. The returned value is a member of the
/// MachineJumpTableInfo::JTEntryKind enum.
unsigned X86TargetLowering::getJumpTableEncoding() const {
  // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
  // symbol.
  if (isPositionIndependent() && Subtarget.isPICStyleGOT())
    return MachineJumpTableInfo::EK_Custom32;
  // PIC large code model (except COFF) uses 64-bit label differences.
  if (isPositionIndependent() &&
      getTargetMachine().getCodeModel() == CodeModel::Large &&
      !Subtarget.isTargetCOFF())
    return MachineJumpTableInfo::EK_LabelDifference64;

  // Otherwise, use the normal jump table encoding heuristics.
  return TargetLowering::getJumpTableEncoding();
}
446
// Soft-float lowering is a subtarget property; simply forward it.
bool X86TargetLowering::useSoftFloat() const {
  return Subtarget.useSoftFloat();
}
450
/// On 32-bit x86 with C/stdcall conventions, flag the leading integer
/// arguments of a libcall as "inreg" according to the module's
/// register-parameter count (regparm).
void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC,
                                              ArgListTy &Args) const {

  // Only relabel X86-32 for C / Stdcall CCs.
  if (Subtarget.is64Bit())
    return;
  if (CC != CallingConv::C && CC != CallingConv::X86_StdCall)
    return;
  // The module records how many integer parameter registers are available.
  unsigned ParamRegs = 0;
  if (auto *M = MF->getFunction().getParent())
    ParamRegs = M->getNumberRegisterParameters();

  // Mark the first N int arguments as having reg
  for (auto &Arg : Args) {
    Type *T = Arg.Ty;
    if (T->isIntOrPtrTy())
      if (MF->getDataLayout().getTypeAllocSize(Ty: T) <= 8) {
        // An argument wider than 4 bytes consumes two 32-bit registers.
        unsigned numRegs = 1;
        if (MF->getDataLayout().getTypeAllocSize(Ty: T) > 4)
          numRegs = 2;
        // Stop once the remaining register budget is exhausted.
        if (ParamRegs < numRegs)
          return;
        ParamRegs -= numRegs;
        Arg.IsInReg = true;
      }
  }
}
478
/// Emit an EK_Custom32 jump table entry: the basic block's symbol with a
/// @GOTOFF relocation, as used by 32-bit GOT-style PIC (see
/// getJumpTableEncoding above).
const MCExpr *
X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
                                             const MachineBasicBlock *MBB,
                                             unsigned uid,MCContext &Ctx) const{
  assert(isPositionIndependent() && Subtarget.isPICStyleGOT());
  // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
  // entries.
  return MCSymbolRefExpr::create(Symbol: MBB->getSymbol(), specifier: X86::S_GOTOFF, Ctx);
}
488
/// Returns relocation base for the given PIC jumptable.
/// On 32-bit targets this is the PIC global base register; 64-bit targets
/// use the jump table address itself.
SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
                                                    SelectionDAG &DAG) const {
  if (!Subtarget.is64Bit())
    // This doesn't have SDLoc associated with it, but is not really the
    // same as a Register.
    return DAG.getNode(Opcode: X86ISD::GlobalBaseReg, DL: SDLoc(),
                       VT: getPointerTy(DL: DAG.getDataLayout()));
  return Table;
}
499
/// This returns the relocation base for the given PIC jumptable,
/// the same as getPICJumpTableRelocBase, but as an MCExpr.
const MCExpr *X86TargetLowering::
getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
                             MCContext &Ctx) const {
  // X86-64 uses RIP relative addressing based on the jump table label.
  if (Subtarget.isPICStyleRIPRel() ||
      (Subtarget.is64Bit() &&
       getTargetMachine().getCodeModel() == CodeModel::Large))
    return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);

  // Otherwise, the reference is relative to the PIC base.
  return MCSymbolRefExpr::create(Symbol: MF->getPICBaseSymbol(), Ctx);
}
514
/// Pick the representative register class and its spill cost for \p VT,
/// used by register-pressure heuristics: GPRs for scalar integers, MMX for
/// x86mmx, and VR128X for all FP/vector types; everything else defers to
/// the generic implementation.
std::pair<const TargetRegisterClass *, uint8_t>
X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
                                           MVT VT) const {
  const TargetRegisterClass *RRC = nullptr;
  uint8_t Cost = 1;
  switch (VT.SimpleTy) {
  default:
    return TargetLowering::findRepresentativeClass(TRI, VT);
  case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
    RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
    break;
  case MVT::x86mmx:
    RRC = &X86::VR64RegClass;
    break;
  case MVT::f32: case MVT::f64:
  case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
  case MVT::v4f32: case MVT::v2f64:
  case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64:
  case MVT::v8f32: case MVT::v4f64:
  case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64:
  case MVT::v16f32: case MVT::v8f64:
    RRC = &X86::VR128XRegClass;
    break;
  }
  return std::make_pair(x&: RRC, y&: Cost);
}
541
542unsigned X86TargetLowering::getAddressSpace() const {
543 if (Subtarget.is64Bit())
544 return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? X86AS::GS
545 : X86AS::FS;
546 return X86AS::GS;
547}
548
/// True for targets whose C library reserves a TLS slot for the stack
/// protector guard (glibc, musl, Fuchsia, Android/bionic).
static bool hasStackGuardSlotTLS(const Triple &TargetTriple) {
  return TargetTriple.isOSGlibc() || TargetTriple.isMusl() ||
         TargetTriple.isOSFuchsia() || TargetTriple.isAndroid();
}
553
/// Build a constant pointer in address space \p AddressSpace representing
/// the segment-relative address \p Offset (callers pass the FS/GS address
/// space from getAddressSpace()).
static Constant* SegmentOffset(IRBuilderBase &IRB,
                               int Offset, unsigned AddressSpace) {
  return ConstantExpr::getIntToPtr(
      C: ConstantInt::getSigned(Ty: Type::getInt32Ty(C&: IRB.getContext()), V: Offset),
      Ty: IRB.getPtrTy(AddrSpace: AddressSpace));
}
560
/// Return the IR value to load the stack guard from, preferring the libc's
/// dedicated TLS slot (or a user-specified register/offset/symbol) over the
/// generic global used by the base implementation.
Value *
X86TargetLowering::getIRStackGuard(IRBuilderBase &IRB,
                                   const LibcallLoweringInfo &Libcalls) const {
  // glibc, bionic, and Fuchsia have a special slot for the stack guard in
  // tcbhead_t; use it instead of the usual global variable (see
  // sysdeps/{i386,x86_64}/nptl/tls.h)
  if (hasStackGuardSlotTLS(TargetTriple: Subtarget.getTargetTriple())) {
    unsigned AddressSpace = getAddressSpace();

    // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
    if (Subtarget.isTargetFuchsia())
      return SegmentOffset(IRB, Offset: 0x10, AddressSpace);

    Module *M = IRB.GetInsertBlock()->getParent()->getParent();
    // Specially, some users may customize the base reg and offset.
    int Offset = M->getStackProtectorGuardOffset();
    // If we don't set -stack-protector-guard-offset value:
    // %fs:0x28, unless we're using a Kernel code model, in which case
    // it's %gs:0x28. gs:0x14 on i386.
    if (Offset == INT_MAX)
      Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;

    // Honor an explicitly requested guard segment register.
    StringRef GuardReg = M->getStackProtectorGuardReg();
    if (GuardReg == "fs")
      AddressSpace = X86AS::FS;
    else if (GuardReg == "gs")
      AddressSpace = X86AS::GS;

    // Use symbol guard if user specify.
    StringRef GuardSymb = M->getStackProtectorGuardSymbol();
    if (!GuardSymb.empty()) {
      GlobalVariable *GV = M->getGlobalVariable(Name: GuardSymb);
      if (!GV) {
        // Lazily declare the guard symbol as a pointer-sized integer global
        // in the chosen segment address space.
        Type *Ty = Subtarget.is64Bit() ? Type::getInt64Ty(C&: M->getContext())
                                       : Type::getInt32Ty(C&: M->getContext());
        GV = new GlobalVariable(*M, Ty, false, GlobalValue::ExternalLinkage,
                                nullptr, GuardSymb, nullptr,
                                GlobalValue::NotThreadLocal, AddressSpace);
        if (!Subtarget.isTargetDarwin())
          GV->setDSOLocal(M->getDirectAccessExternalData());
      }
      return GV;
    }

    return SegmentOffset(IRB, Offset, AddressSpace);
  }
  return TargetLowering::getIRStackGuard(IRB, Libcalls);
}
609
/// Declare the module-level symbols required by stack protection: the MSVC
/// CRT cookie variable and check function when those libcalls are available,
/// nothing when the guard lives in a TLS slot, and the generic declarations
/// otherwise.
void X86TargetLowering::insertSSPDeclarations(
    Module &M, const LibcallLoweringInfo &Libcalls) const {
  // MSVC CRT provides functionalities for stack protection.
  RTLIB::LibcallImpl SecurityCheckCookieLibcall =
      Libcalls.getLibcallImpl(Call: RTLIB::SECURITY_CHECK_COOKIE);

  RTLIB::LibcallImpl SecurityCookieVar =
      Libcalls.getLibcallImpl(Call: RTLIB::STACK_CHECK_GUARD);
  if (SecurityCheckCookieLibcall != RTLIB::Unsupported &&
      SecurityCookieVar != RTLIB::Unsupported) {
    // MSVC CRT provides functionalities for stack protection.
    // MSVC CRT has a global variable holding security cookie.
    M.getOrInsertGlobal(Name: getLibcallImplName(Call: SecurityCookieVar),
                        Ty: PointerType::getUnqual(C&: M.getContext()));

    // MSVC CRT has a function to validate security cookie.
    FunctionCallee SecurityCheckCookie =
        M.getOrInsertFunction(Name: getLibcallImplName(Call: SecurityCheckCookieLibcall),
                              RetTy: Type::getVoidTy(C&: M.getContext()),
                              Args: PointerType::getUnqual(C&: M.getContext()));

    // The check function takes its argument in a register (fastcall + inreg).
    if (Function *F = dyn_cast<Function>(Val: SecurityCheckCookie.getCallee())) {
      F->setCallingConv(CallingConv::X86_FastCall);
      F->addParamAttr(ArgNo: 0, Kind: Attribute::AttrKind::InReg);
    }
    return;
  }

  StringRef GuardMode = M.getStackProtectorGuard();

  // glibc, bionic, and Fuchsia have a special slot for the stack guard.
  // Nothing to declare in that case — the guard is read from TLS directly.
  if ((GuardMode == "tls" || GuardMode.empty()) &&
      hasStackGuardSlotTLS(TargetTriple: Subtarget.getTargetTriple()))
    return;
  TargetLowering::insertSSPDeclarations(M, Libcalls);
}
646
/// Return the location of the SafeStack unsafe-stack pointer: a fixed TLS
/// slot on Android and Fuchsia, otherwise the generic location.
Value *X86TargetLowering::getSafeStackPointerLocation(
    IRBuilderBase &IRB, const LibcallLoweringInfo &Libcalls) const {
  // Android provides a fixed TLS slot for the SafeStack pointer. See the
  // definition of TLS_SLOT_SAFESTACK in
  // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
  if (Subtarget.isTargetAndroid()) {
    // %fs:0x48, unless we're using a Kernel code model, in which case it's %gs:
    // %gs:0x24 on i386
    int Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
    return SegmentOffset(IRB, Offset, AddressSpace: getAddressSpace());
  }

  // Fuchsia is similar.
  if (Subtarget.isTargetFuchsia()) {
    // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
    return SegmentOffset(IRB, Offset: 0x18, AddressSpace: getAddressSpace());
  }

  return TargetLowering::getSafeStackPointerLocation(IRB, Libcalls);
}
667
668//===----------------------------------------------------------------------===//
669// Return Value Calling Convention Implementation
670//===----------------------------------------------------------------------===//
671
/// Check whether every return value can be assigned a location by the
/// RetCC_X86 calling-convention function; if not, the return will be
/// lowered via sret-style demotion by the caller of this hook.
bool X86TargetLowering::CanLowerReturn(
    CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
    const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context,
    const Type *RetTy) const {
  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
  return CCInfo.CheckReturn(Outs, Fn: RetCC_X86);
}
680
// R11 is the scratch register regardless of calling convention; the list is
// zero-terminated.
const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
  static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
  return ScratchRegs;
}
685
// Both the x87 control word (FPCW) and SSE MXCSR carry rounding-mode state.
ArrayRef<MCPhysReg> X86TargetLowering::getRoundingControlRegisters() const {
  static const MCPhysReg RCRegs[] = {X86::FPCW, X86::MXCSR};
  return RCRegs;
}
690
/// Lowers masks values (v*i1) to the local register values
/// \param ValArg the mask value to lower (v1i1/v8i1/v16i1/v32i1/v64i1).
/// \param ValLoc the integer register type the value must end up in.
/// \returns DAG node after lowering to register type
static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
                               const SDLoc &DL, SelectionDAG &DAG) {
  EVT ValVT = ValArg.getValueType();

  // v1i1: extract the single i1 element.
  if (ValVT == MVT::v1i1)
    return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: ValLoc, N1: ValArg,
                       N2: DAG.getIntPtrConstant(Val: 0, DL));

  if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) ||
      (ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) {
    // Two stage lowering might be required
    // bitcast: v8i1 -> i8 / v16i1 -> i16
    // anyextend: i8 -> i32 / i16 -> i32
    EVT TempValLoc = ValVT == MVT::v8i1 ? MVT::i8 : MVT::i16;
    SDValue ValToCopy = DAG.getBitcast(VT: TempValLoc, V: ValArg);
    if (ValLoc == MVT::i32)
      ValToCopy = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: ValLoc, Operand: ValToCopy);
    return ValToCopy;
  }

  if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) ||
      (ValVT == MVT::v64i1 && ValLoc == MVT::i64)) {
    // One stage lowering is required
    // bitcast: v32i1 -> i32 / v64i1 -> i64
    return DAG.getBitcast(VT: ValLoc, V: ValArg);
  }

  // Fallback: any-extend the value into the destination type.
  return DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: ValLoc, Operand: ValArg);
}
722
/// Breaks v64i1 value into two registers and adds the new node to the DAG
/// Used on 32-bit AVX512BW targets, where a v64i1 mask is passed as two
/// 32-bit GPR halves in the locations described by \p VA and \p NextVA.
static void Passv64i1ArgInRegs(
    const SDLoc &DL, SelectionDAG &DAG, SDValue &Arg,
    SmallVectorImpl<std::pair<Register, SDValue>> &RegsToPass, CCValAssign &VA,
    CCValAssign &NextVA, const X86Subtarget &Subtarget) {
  assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
  assert(Subtarget.is32Bit() && "Expecting 32 bit target");
  assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value");
  assert(VA.isRegLoc() && NextVA.isRegLoc() &&
         "The value should reside in two registers");

  // Before splitting the value we cast it to i64
  Arg = DAG.getBitcast(VT: MVT::i64, V: Arg);

  // Splitting the value into two i32 types
  SDValue Lo, Hi;
  std::tie(args&: Lo, args&: Hi) = DAG.SplitScalar(N: Arg, DL, LoVT: MVT::i32, HiVT: MVT::i32);

  // Attach the two i32 types into corresponding registers
  RegsToPass.push_back(Elt: std::make_pair(x: VA.getLocReg(), y&: Lo));
  RegsToPass.push_back(Elt: std::make_pair(x: NextVA.getLocReg(), y&: Hi));
}
745
746SDValue
747X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
748 bool isVarArg,
749 const SmallVectorImpl<ISD::OutputArg> &Outs,
750 const SmallVectorImpl<SDValue> &OutVals,
751 const SDLoc &dl, SelectionDAG &DAG) const {
752 MachineFunction &MF = DAG.getMachineFunction();
753 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
754
755 // In some cases we need to disable registers from the default CSR list.
756 // For example, when they are used as return registers (preserve_* and X86's
757 // regcall) or for argument passing (X86's regcall).
758 bool ShouldDisableCalleeSavedRegister =
759 shouldDisableRetRegFromCSR(CC: CallConv) ||
760 MF.getFunction().hasFnAttribute(Kind: "no_caller_saved_registers");
761
762 if (CallConv == CallingConv::X86_INTR && !Outs.empty())
763 report_fatal_error(reason: "X86 interrupts may not return any value");
764
765 SmallVector<CCValAssign, 16> RVLocs;
766 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
767 CCInfo.AnalyzeReturn(Outs, Fn: RetCC_X86);
768
769 SmallVector<std::pair<Register, SDValue>, 4> RetVals;
770 for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E;
771 ++I, ++OutsIndex) {
772 CCValAssign &VA = RVLocs[I];
773 assert(VA.isRegLoc() && "Can only return in registers!");
774
775 // Add the register to the CalleeSaveDisableRegs list.
776 if (ShouldDisableCalleeSavedRegister)
777 MF.getRegInfo().disableCalleeSavedRegister(Reg: VA.getLocReg());
778
779 SDValue ValToCopy = OutVals[OutsIndex];
780 EVT ValVT = ValToCopy.getValueType();
781
782 // Promote values to the appropriate types.
783 if (VA.getLocInfo() == CCValAssign::SExt)
784 ValToCopy = DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL: dl, VT: VA.getLocVT(), Operand: ValToCopy);
785 else if (VA.getLocInfo() == CCValAssign::ZExt)
786 ValToCopy = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: dl, VT: VA.getLocVT(), Operand: ValToCopy);
787 else if (VA.getLocInfo() == CCValAssign::AExt) {
788 if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
789 ValToCopy = lowerMasksToReg(ValArg: ValToCopy, ValLoc: VA.getLocVT(), DL: dl, DAG);
790 else
791 ValToCopy = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: dl, VT: VA.getLocVT(), Operand: ValToCopy);
792 }
793 else if (VA.getLocInfo() == CCValAssign::BCvt)
794 ValToCopy = DAG.getBitcast(VT: VA.getLocVT(), V: ValToCopy);
795
796 assert(VA.getLocInfo() != CCValAssign::FPExt &&
797 "Unexpected FP-extend for return value.");
798
799 // Report an error if we have attempted to return a value via an XMM
800 // register and SSE was disabled.
801 if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(Reg: VA.getLocReg())) {
802 errorUnsupported(DAG, dl, Msg: "SSE register return with SSE disabled");
803 VA.convertToReg(Reg: X86::FP0); // Set reg to FP0, avoid hitting asserts.
804 } else if (!Subtarget.hasSSE2() &&
805 X86::FR64XRegClass.contains(Reg: VA.getLocReg()) &&
806 ValVT == MVT::f64) {
807 // When returning a double via an XMM register, report an error if SSE2 is
808 // not enabled.
809 errorUnsupported(DAG, dl, Msg: "SSE2 register return with SSE2 disabled");
810 VA.convertToReg(Reg: X86::FP0); // Set reg to FP0, avoid hitting asserts.
811 }
812
813 // Returns in ST0/ST1 are handled specially: these are pushed as operands to
814 // the RET instruction and handled by the FP Stackifier.
815 if (VA.getLocReg() == X86::FP0 ||
816 VA.getLocReg() == X86::FP1) {
817 // If this is a copy from an xmm register to ST(0), use an FPExtend to
818 // change the value to the FP stack register class.
819 if (isScalarFPTypeInSSEReg(VT: VA.getValVT()))
820 ValToCopy = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: dl, VT: MVT::f80, Operand: ValToCopy);
821 RetVals.push_back(Elt: std::make_pair(x: VA.getLocReg(), y&: ValToCopy));
822 // Don't emit a copytoreg.
823 continue;
824 }
825
826 // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
827 // which is returned in RAX / RDX.
828 if (Subtarget.is64Bit()) {
829 if (ValVT == MVT::x86mmx) {
830 if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
831 ValToCopy = DAG.getBitcast(VT: MVT::i64, V: ValToCopy);
832 ValToCopy = DAG.getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL: dl, VT: MVT::v2i64,
833 Operand: ValToCopy);
834 // If we don't have SSE2 available, convert to v4f32 so the generated
835 // register is legal.
836 if (!Subtarget.hasSSE2())
837 ValToCopy = DAG.getBitcast(VT: MVT::v4f32, V: ValToCopy);
838 }
839 }
840 }
841
842 if (VA.needsCustom()) {
843 assert(VA.getValVT() == MVT::v64i1 &&
844 "Currently the only custom case is when we split v64i1 to 2 regs");
845
846 Passv64i1ArgInRegs(DL: dl, DAG, Arg&: ValToCopy, RegsToPass&: RetVals, VA, NextVA&: RVLocs[++I],
847 Subtarget);
848
849 // Add the second register to the CalleeSaveDisableRegs list.
850 if (ShouldDisableCalleeSavedRegister)
851 MF.getRegInfo().disableCalleeSavedRegister(Reg: RVLocs[I].getLocReg());
852 } else {
853 RetVals.push_back(Elt: std::make_pair(x: VA.getLocReg(), y&: ValToCopy));
854 }
855 }
856
857 SDValue Glue;
858 SmallVector<SDValue, 6> RetOps;
859 RetOps.push_back(Elt: Chain); // Operand #0 = Chain (updated below)
860 // Operand #1 = Bytes To Pop
861 RetOps.push_back(Elt: DAG.getTargetConstant(Val: FuncInfo->getBytesToPopOnReturn(), DL: dl,
862 VT: MVT::i32));
863
864 // Copy the result values into the output registers.
865 for (auto &RetVal : RetVals) {
866 if (RetVal.first == X86::FP0 || RetVal.first == X86::FP1) {
867 RetOps.push_back(Elt: RetVal.second);
868 continue; // Don't emit a copytoreg.
869 }
870
871 Chain = DAG.getCopyToReg(Chain, dl, Reg: RetVal.first, N: RetVal.second, Glue);
872 Glue = Chain.getValue(R: 1);
873 RetOps.push_back(
874 Elt: DAG.getRegister(Reg: RetVal.first, VT: RetVal.second.getValueType()));
875 }
876
877 // Swift calling convention does not require we copy the sret argument
878 // into %rax/%eax for the return, and SRetReturnReg is not set for Swift.
879
880 // All x86 ABIs require that for returning structs by value we copy
881 // the sret argument into %rax/%eax (depending on ABI) for the return.
882 // We saved the argument into a virtual register in the entry block,
883 // so now we copy the value out and into %rax/%eax.
884 //
885 // Checking Function.hasStructRetAttr() here is insufficient because the IR
886 // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
887 // false, then an sret argument may be implicitly inserted in the SelDAG. In
888 // either case FuncInfo->setSRetReturnReg() will have been called.
889 if (Register SRetReg = FuncInfo->getSRetReturnReg()) {
890 // When we have both sret and another return value, we should use the
891 // original Chain stored in RetOps[0], instead of the current Chain updated
892 // in the above loop. If we only have sret, RetOps[0] equals to Chain.
893
894 // For the case of sret and another return value, we have
895 // Chain_0 at the function entry
896 // Chain_1 = getCopyToReg(Chain_0) in the above loop
897 // If we use Chain_1 in getCopyFromReg, we will have
898 // Val = getCopyFromReg(Chain_1)
899 // Chain_2 = getCopyToReg(Chain_1, Val) from below
900
901 // getCopyToReg(Chain_0) will be glued together with
902 // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
903 // in Unit B, and we will have cyclic dependency between Unit A and Unit B:
904 // Data dependency from Unit B to Unit A due to usage of Val in
905 // getCopyToReg(Chain_1, Val)
906 // Chain dependency from Unit A to Unit B
907
908 // So here, we use RetOps[0] (i.e Chain_0) for getCopyFromReg.
909 SDValue Val = DAG.getCopyFromReg(Chain: RetOps[0], dl, Reg: SRetReg,
910 VT: getPointerTy(DL: MF.getDataLayout()));
911
912 Register RetValReg
913 = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
914 X86::RAX : X86::EAX;
915 Chain = DAG.getCopyToReg(Chain, dl, Reg: RetValReg, N: Val, Glue);
916 Glue = Chain.getValue(R: 1);
917
918 // RAX/EAX now acts like a return value.
919 RetOps.push_back(
920 Elt: DAG.getRegister(Reg: RetValReg, VT: getPointerTy(DL: DAG.getDataLayout())));
921
922 // Add the returned register to the CalleeSaveDisableRegs list. Don't do
923 // this however for preserve_most/preserve_all to minimize the number of
924 // callee-saved registers for these CCs.
925 if (ShouldDisableCalleeSavedRegister &&
926 CallConv != CallingConv::PreserveAll &&
927 CallConv != CallingConv::PreserveMost)
928 MF.getRegInfo().disableCalleeSavedRegister(Reg: RetValReg);
929 }
930
931 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
932 const MCPhysReg *I =
933 TRI->getCalleeSavedRegsViaCopy(MF: &DAG.getMachineFunction());
934 if (I) {
935 for (; *I; ++I) {
936 if (X86::GR64RegClass.contains(Reg: *I))
937 RetOps.push_back(Elt: DAG.getRegister(Reg: *I, VT: MVT::i64));
938 else
939 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
940 }
941 }
942
943 RetOps[0] = Chain; // Update chain.
944
945 // Add the glue if we have it.
946 if (Glue.getNode())
947 RetOps.push_back(Elt: Glue);
948
949 unsigned RetOpcode = X86ISD::RET_GLUE;
950 if (CallConv == CallingConv::X86_INTR)
951 RetOpcode = X86ISD::IRET;
952 return DAG.getNode(Opcode: RetOpcode, DL: dl, VT: MVT::Other, Ops: RetOps);
953}
954
955bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
956 if (N->getNumValues() != 1 || !N->hasNUsesOfValue(NUses: 1, Value: 0))
957 return false;
958
959 SDValue TCChain = Chain;
960 SDNode *Copy = *N->user_begin();
961 if (Copy->getOpcode() == ISD::CopyToReg) {
962 // If the copy has a glue operand, we conservatively assume it isn't safe to
963 // perform a tail call.
964 if (Copy->getOperand(Num: Copy->getNumOperands()-1).getValueType() == MVT::Glue)
965 return false;
966 TCChain = Copy->getOperand(Num: 0);
967 } else if (Copy->getOpcode() != ISD::FP_EXTEND)
968 return false;
969
970 bool HasRet = false;
971 for (const SDNode *U : Copy->users()) {
972 if (U->getOpcode() != X86ISD::RET_GLUE)
973 return false;
974 // If we are returning more than one value, we can definitely
975 // not make a tail call see PR19530
976 if (U->getNumOperands() > 4)
977 return false;
978 if (U->getNumOperands() == 4 &&
979 U->getOperand(Num: U->getNumOperands() - 1).getValueType() != MVT::Glue)
980 return false;
981 HasRet = true;
982 }
983
984 if (!HasRet)
985 return false;
986
987 Chain = TCChain;
988 return true;
989}
990
991EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
992 ISD::NodeType ExtendKind) const {
993 MVT ReturnMVT = MVT::i32;
994
995 bool Darwin = Subtarget.getTargetTriple().isOSDarwin();
996 if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) {
997 // The ABI does not require i1, i8 or i16 to be extended.
998 //
999 // On Darwin, there is code in the wild relying on Clang's old behaviour of
1000 // always extending i8/i16 return values, so keep doing that for now.
1001 // (PR26665).
1002 ReturnMVT = MVT::i8;
1003 }
1004
1005 EVT MinVT = getRegisterType(Context, VT: ReturnMVT);
1006 return VT.bitsLT(VT: MinVT) ? MinVT : VT;
1007}
1008
1009/// Reads two 32 bit registers and creates a 64 bit mask value.
1010/// \param VA The current 32 bit value that need to be assigned.
1011/// \param NextVA The next 32 bit value that need to be assigned.
1012/// \param Root The parent DAG node.
1013/// \param [in,out] InGlue Represents SDvalue in the parent DAG node for
1014/// glue purposes. In the case the DAG is already using
1015/// physical register instead of virtual, we should glue
1016/// our new SDValue to InGlue SDvalue.
1017/// \return a new SDvalue of size 64bit.
1018static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
1019 SDValue &Root, SelectionDAG &DAG,
1020 const SDLoc &DL, const X86Subtarget &Subtarget,
1021 SDValue *InGlue = nullptr) {
1022 assert((Subtarget.hasBWI()) && "Expected AVX512BW target!");
1023 assert(Subtarget.is32Bit() && "Expecting 32 bit target");
1024 assert(VA.getValVT() == MVT::v64i1 &&
1025 "Expecting first location of 64 bit width type");
1026 assert(NextVA.getValVT() == VA.getValVT() &&
1027 "The locations should have the same type");
1028 assert(VA.isRegLoc() && NextVA.isRegLoc() &&
1029 "The values should reside in two registers");
1030
1031 SDValue Lo, Hi;
1032 SDValue ArgValueLo, ArgValueHi;
1033
1034 MachineFunction &MF = DAG.getMachineFunction();
1035 const TargetRegisterClass *RC = &X86::GR32RegClass;
1036
1037 // Read a 32 bit value from the registers.
1038 if (nullptr == InGlue) {
1039 // When no physical register is present,
1040 // create an intermediate virtual register.
1041 Register Reg = MF.addLiveIn(PReg: VA.getLocReg(), RC);
1042 ArgValueLo = DAG.getCopyFromReg(Chain: Root, dl: DL, Reg, VT: MVT::i32);
1043 Reg = MF.addLiveIn(PReg: NextVA.getLocReg(), RC);
1044 ArgValueHi = DAG.getCopyFromReg(Chain: Root, dl: DL, Reg, VT: MVT::i32);
1045 } else {
1046 // When a physical register is available read the value from it and glue
1047 // the reads together.
1048 ArgValueLo =
1049 DAG.getCopyFromReg(Chain: Root, dl: DL, Reg: VA.getLocReg(), VT: MVT::i32, Glue: *InGlue);
1050 *InGlue = ArgValueLo.getValue(R: 2);
1051 ArgValueHi =
1052 DAG.getCopyFromReg(Chain: Root, dl: DL, Reg: NextVA.getLocReg(), VT: MVT::i32, Glue: *InGlue);
1053 *InGlue = ArgValueHi.getValue(R: 2);
1054 }
1055
1056 // Convert the i32 type into v32i1 type.
1057 Lo = DAG.getBitcast(VT: MVT::v32i1, V: ArgValueLo);
1058
1059 // Convert the i32 type into v32i1 type.
1060 Hi = DAG.getBitcast(VT: MVT::v32i1, V: ArgValueHi);
1061
1062 // Concatenate the two values together.
1063 return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT: MVT::v64i1, N1: Lo, N2: Hi);
1064}
1065
1066/// The function will lower a register of various sizes (8/16/32/64)
1067/// to a mask value of the expected size (v8i1/v16i1/v32i1/v64i1)
1068/// \returns a DAG node contains the operand after lowering to mask type.
1069static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
1070 const EVT &ValLoc, const SDLoc &DL,
1071 SelectionDAG &DAG) {
1072 SDValue ValReturned = ValArg;
1073
1074 if (ValVT == MVT::v1i1)
1075 return DAG.getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL, VT: MVT::v1i1, Operand: ValReturned);
1076
1077 if (ValVT == MVT::v64i1) {
1078 // In 32 bit machine, this case is handled by getv64i1Argument
1079 assert(ValLoc == MVT::i64 && "Expecting only i64 locations");
1080 // In 64 bit machine, There is no need to truncate the value only bitcast
1081 } else {
1082 MVT MaskLenVT;
1083 switch (ValVT.getSimpleVT().SimpleTy) {
1084 case MVT::v8i1:
1085 MaskLenVT = MVT::i8;
1086 break;
1087 case MVT::v16i1:
1088 MaskLenVT = MVT::i16;
1089 break;
1090 case MVT::v32i1:
1091 MaskLenVT = MVT::i32;
1092 break;
1093 default:
1094 llvm_unreachable("Expecting a vector of i1 types");
1095 }
1096
1097 ValReturned = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MaskLenVT, Operand: ValReturned);
1098 }
1099 return DAG.getBitcast(VT: ValVT, V: ValReturned);
1100}
1101
1102static SDValue getPopFromX87Reg(SelectionDAG &DAG, SDValue Chain,
1103 const SDLoc &dl, Register Reg, EVT VT,
1104 SDValue Glue) {
1105 SDVTList VTs = DAG.getVTList(VT1: VT, VT2: MVT::Other, VT3: MVT::Glue);
1106 SDValue Ops[] = {Chain, DAG.getRegister(Reg, VT), Glue};
1107 return DAG.getNode(Opcode: X86ISD::POP_FROM_X87_REG, DL: dl, VTList: VTs,
1108 Ops: ArrayRef(Ops, Glue.getNode() ? 3 : 2));
1109}
1110
/// Lower the result values of a call into the
/// appropriate copies out of appropriate physical registers.
///
/// \param Chain   Incoming chain, threaded through each CopyFromReg.
/// \param InGlue  Glue from the call node, threaded through each copy so the
///                reads stay adjacent to the call.
/// \param RegMask When non-null, every register (and subregister) used for a
///                return value is cleared from this preserved-register mask.
/// \returns the updated chain; the lowered values are appended to \p InVals.
SDValue X86TargetLowering::LowerCallResult(
    SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
    uint32_t *RegMask) const {

  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
  // Assign locations to each value returned by this call.
  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
                 *DAG.getContext());
  CCInfo.AnalyzeCallResult(Ins, Fn: RetCC_X86);

  // Copy all of the result registers out of their specified physreg.
  // I indexes RVLocs (a custom v64i1 split consumes two locations per value,
  // see the ++I inside the loop); InsIndex advances once per returned value.
  for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E;
       ++I, ++InsIndex) {
    CCValAssign &VA = RVLocs[I];
    EVT CopyVT = VA.getLocVT();

    // In some calling conventions we need to remove the used registers
    // from the register mask.
    if (RegMask) {
      for (MCPhysReg SubReg : TRI->subregs_inclusive(Reg: VA.getLocReg()))
        RegMask[SubReg / 32] &= ~(1u << (SubReg % 32));
    }

    // Report an error if there was an attempt to return FP values via XMM
    // registers.
    // The location is then retargeted to an x87 register purely so the rest
    // of lowering does not trip over an impossible assignment.
    if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(Reg: VA.getLocReg())) {
      errorUnsupported(DAG, dl, Msg: "SSE register return with SSE disabled");
      if (VA.getLocReg() == X86::XMM1)
        VA.convertToReg(Reg: X86::FP1); // Set reg to FP1, avoid hitting asserts.
      else
        VA.convertToReg(Reg: X86::FP0); // Set reg to FP0, avoid hitting asserts.
    } else if (!Subtarget.hasSSE2() &&
               X86::FR64XRegClass.contains(Reg: VA.getLocReg()) &&
               CopyVT == MVT::f64) {
      errorUnsupported(DAG, dl, Msg: "SSE2 register return with SSE2 disabled");
      if (VA.getLocReg() == X86::XMM1)
        VA.convertToReg(Reg: X86::FP1); // Set reg to FP1, avoid hitting asserts.
      else
        VA.convertToReg(Reg: X86::FP0); // Set reg to FP0, avoid hitting asserts.
    }

    // If we prefer to use the value in xmm registers, copy it out as f80 and
    // use a truncate to move it from fp stack reg to xmm reg.
    bool RoundAfterCopy = false;
    bool X87Result = VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1;
    if (X87Result && isScalarFPTypeInSSEReg(VT: VA.getValVT())) {
      if (!Subtarget.hasX87())
        report_fatal_error(reason: "X87 register return with X87 disabled");
      CopyVT = MVT::f80;
      RoundAfterCopy = (CopyVT != VA.getLocVT());
    }

    SDValue Val;
    if (VA.needsCustom()) {
      // v64i1 was split across two 32-bit registers; reassemble it. Note
      // this consumes the next location as well (++I).
      assert(VA.getValVT() == MVT::v64i1 &&
             "Currently the only custom case is when we split v64i1 to 2 regs")
          ;
      Val =
          getv64i1Argument(VA, NextVA&: RVLocs[++I], Root&: Chain, DAG, DL: dl, Subtarget, InGlue: &InGlue);
    } else {
      // x87 results must be popped off the FP stack; everything else is a
      // plain CopyFromReg. Both produce (value, chain, glue).
      Chain =
          X87Result
              ? getPopFromX87Reg(DAG, Chain, dl, Reg: VA.getLocReg(), VT: CopyVT, Glue: InGlue)
                    .getValue(R: 1)
              : DAG.getCopyFromReg(Chain, dl, Reg: VA.getLocReg(), VT: CopyVT, Glue: InGlue)
                    .getValue(R: 1);
      Val = Chain.getValue(R: 0);
      InGlue = Chain.getValue(R: 2);
    }

    if (RoundAfterCopy)
      Val = DAG.getNode(Opcode: ISD::FP_ROUND, DL: dl, VT: VA.getValVT(), N1: Val,
                        // This truncation won't change the value.
                        N2: DAG.getIntPtrConstant(Val: 1, DL: dl, /*isTarget=*/true));

    if (VA.isExtInLoc()) {
      if (VA.getValVT().isVector() &&
          VA.getValVT().getScalarType() == MVT::i1 &&
          ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
           (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
        // promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
        Val = lowerRegToMasks(ValArg: Val, ValVT: VA.getValVT(), ValLoc: VA.getLocVT(), DL: dl, DAG);
      } else
        Val = DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: VA.getValVT(), Operand: Val);
    }

    if (VA.getLocInfo() == CCValAssign::BCvt)
      Val = DAG.getBitcast(VT: VA.getValVT(), V: Val);

    InVals.push_back(Elt: Val);
  }

  return Chain;
}
1210
1211/// Determines whether Args, either a set of outgoing arguments to a call, or a
1212/// set of incoming args of a call, contains an sret pointer that the callee
1213/// pops. This happens on most x86-32, System V platforms, unless register
1214/// parameters are in use (-mregparm=1+, regcallcc, etc).
1215template <typename T>
1216static bool hasCalleePopSRet(const SmallVectorImpl<T> &Args,
1217 const SmallVectorImpl<CCValAssign> &ArgLocs,
1218 const X86Subtarget &Subtarget) {
1219 // Not C++20 (yet), so no concepts available.
1220 static_assert(std::is_same_v<T, ISD::OutputArg> ||
1221 std::is_same_v<T, ISD::InputArg>,
1222 "requires ISD::OutputArg or ISD::InputArg");
1223
1224 // Popping the sret pointer only happens on x86-32 System V ABI platforms
1225 // (Linux, Cygwin, BSDs, Mac, etc). That excludes Windows-minus-Cygwin and
1226 // MCU.
1227 const Triple &TT = Subtarget.getTargetTriple();
1228 if (!TT.isX86_32() || TT.isOSMSVCRT() || TT.isOSIAMCU())
1229 return false;
1230
1231 // Check if the first argument is marked sret and if it is passed in memory.
1232 bool IsSRetInMem = false;
1233 if (!Args.empty())
1234 IsSRetInMem = Args.front().Flags.isSRet() && ArgLocs.front().isMemLoc();
1235 return IsSRetInMem;
1236}
1237
1238/// Make a copy of an aggregate at address specified by "Src" to address
1239/// "Dst" with size and alignment information specified by the specific
1240/// parameter attribute. The copy will be passed as a byval function parameter.
1241static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
1242 SDValue Chain, ISD::ArgFlagsTy Flags,
1243 SelectionDAG &DAG, const SDLoc &dl) {
1244 SDValue SizeNode = DAG.getIntPtrConstant(Val: Flags.getByValSize(), DL: dl);
1245
1246 return DAG.getMemcpy(
1247 Chain, dl, Dst, Src, Size: SizeNode, Alignment: Flags.getNonZeroByValAlign(),
1248 /*isVolatile*/ isVol: false, /*AlwaysInline=*/true,
1249 /*CI=*/nullptr, OverrideTailCall: std::nullopt, DstPtrInfo: MachinePointerInfo(), SrcPtrInfo: MachinePointerInfo());
1250}
1251
1252/// Return true if the calling convention is one that we can guarantee TCO for.
1253static bool canGuaranteeTCO(CallingConv::ID CC) {
1254 return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
1255 CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE ||
1256 CC == CallingConv::Tail || CC == CallingConv::SwiftTail);
1257}
1258
1259/// Return true if we might ever do TCO for calls with this calling convention.
1260static bool mayTailCallThisCC(CallingConv::ID CC) {
1261 switch (CC) {
1262 // C calling conventions:
1263 case CallingConv::C:
1264 case CallingConv::Win64:
1265 case CallingConv::X86_64_SysV:
1266 case CallingConv::PreserveNone:
1267 // Callee pop conventions:
1268 case CallingConv::X86_ThisCall:
1269 case CallingConv::X86_StdCall:
1270 case CallingConv::X86_VectorCall:
1271 case CallingConv::X86_FastCall:
1272 // Swift:
1273 case CallingConv::Swift:
1274 return true;
1275 default:
1276 return canGuaranteeTCO(CC);
1277 }
1278}
1279
1280/// Return true if the function is being made into a tailcall target by
1281/// changing its ABI.
1282static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
1283 return (GuaranteedTailCallOpt && canGuaranteeTCO(CC)) ||
1284 CC == CallingConv::Tail || CC == CallingConv::SwiftTail;
1285}
1286
1287bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
1288 if (!CI->isTailCall())
1289 return false;
1290
1291 CallingConv::ID CalleeCC = CI->getCallingConv();
1292 if (!mayTailCallThisCC(CC: CalleeCC))
1293 return false;
1294
1295 return true;
1296}
1297
/// Lower a single incoming argument that was assigned to a stack slot:
/// create the fixed frame object for it and load the value (byval arguments
/// just return the slot's address). Where legal, elides the copy by loading
/// straight from the caller-created stack object.
SDValue
X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
                                    const SmallVectorImpl<ISD::InputArg> &Ins,
                                    const SDLoc &dl, SelectionDAG &DAG,
                                    const CCValAssign &VA,
                                    MachineFrameInfo &MFI, unsigned i) const {
  // Create the nodes corresponding to a load from this parameter slot.
  ISD::ArgFlagsTy Flags = Ins[i].Flags;
  // Under guaranteed TCO the incoming argument area can be overwritten while
  // lowering a tail call, so every slot must stay mutable.
  bool AlwaysUseMutable = shouldGuaranteeTCO(
      CC: CallConv, GuaranteedTailCallOpt: DAG.getTarget().Options.GuaranteedTailCallOpt);
  bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
  EVT ValVT;
  MVT PtrVT = getPointerTy(DL: DAG.getDataLayout());

  // If value is passed by pointer we have address passed instead of the value
  // itself. No need to extend if the mask value and location share the same
  // absolute size.
  bool ExtendedInMem =
      VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 &&
      VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits();

  if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem)
    ValVT = VA.getLocVT();
  else
    ValVT = VA.getValVT();

  // FIXME: For now, all byval parameter objects are marked mutable. This can be
  // changed with more analysis.
  // In case of tail call optimization mark all arguments mutable. Since they
  // could be overwritten by lowering of arguments in case of a tail call.
  if (Flags.isByVal()) {
    unsigned Bytes = Flags.getByValSize();
    if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.

    // FIXME: For now, all byval parameter objects are marked as aliasing. This
    // can be improved with deeper analysis.
    int FI = MFI.CreateFixedObject(Size: Bytes, SPOffset: VA.getLocMemOffset(), IsImmutable: isImmutable,
                                   /*isAliased=*/true);
    // For byval, the caller's copy is the argument, so just hand back its
    // address.
    return DAG.getFrameIndex(FI, VT: PtrVT);
  }

  EVT ArgVT = Ins[i].ArgVT;

  // If this is a vector that has been split into multiple parts, don't elide
  // the copy. The layout on the stack may not match the packed in-memory
  // layout.
  bool ScalarizedVector = ArgVT.isVector() && !VA.getLocVT().isVector();

  // This is an argument in memory. We might be able to perform copy elision.
  // If the argument is passed directly in memory without any extension, then we
  // can perform copy elision. Large vector types, for example, may be passed
  // indirectly by pointer.
  if (Flags.isCopyElisionCandidate() &&
      VA.getLocInfo() != CCValAssign::Indirect && !ExtendedInMem &&
      !ScalarizedVector) {
    SDValue PartAddr;
    if (Ins[i].PartOffset == 0) {
      // If this is a one-part value or the first part of a multi-part value,
      // create a stack object for the entire argument value type and return a
      // load from our portion of it. This assumes that if the first part of an
      // argument is in memory, the rest will also be in memory.
      int FI = MFI.CreateFixedObject(Size: ArgVT.getStoreSize(), SPOffset: VA.getLocMemOffset(),
                                     /*IsImmutable=*/false);
      PartAddr = DAG.getFrameIndex(FI, VT: PtrVT);
      return DAG.getLoad(
          VT: ValVT, dl, Chain, Ptr: PartAddr,
          PtrInfo: MachinePointerInfo::getFixedStack(MF&: DAG.getMachineFunction(), FI));
    }

    // This is not the first piece of an argument in memory. See if there is
    // already a fixed stack object including this offset. If so, assume it
    // was created by the PartOffset == 0 branch above and create a load from
    // the appropriate offset into it.
    int64_t PartBegin = VA.getLocMemOffset();
    int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8;
    int FI = MFI.getObjectIndexBegin();
    // Linear scan of the fixed objects for one that fully covers this part.
    for (; MFI.isFixedObjectIndex(ObjectIdx: FI); ++FI) {
      int64_t ObjBegin = MFI.getObjectOffset(ObjectIdx: FI);
      int64_t ObjEnd = ObjBegin + MFI.getObjectSize(ObjectIdx: FI);
      if (ObjBegin <= PartBegin && PartEnd <= ObjEnd)
        break;
    }
    if (MFI.isFixedObjectIndex(ObjectIdx: FI)) {
      SDValue Addr =
          DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrVT, N1: DAG.getFrameIndex(FI, VT: PtrVT),
                      N2: DAG.getIntPtrConstant(Val: Ins[i].PartOffset, DL: dl));
      return DAG.getLoad(VT: ValVT, dl, Chain, Ptr: Addr,
                         PtrInfo: MachinePointerInfo::getFixedStack(
                             MF&: DAG.getMachineFunction(), FI, Offset: Ins[i].PartOffset));
    }
    // No covering object found; fall through to the plain load below.
  }

  int FI = MFI.CreateFixedObject(Size: ValVT.getSizeInBits() / 8,
                                 SPOffset: VA.getLocMemOffset(), IsImmutable: isImmutable);

  // Set SExt or ZExt flag.
  if (VA.getLocInfo() == CCValAssign::ZExt) {
    MFI.setObjectZExt(ObjectIdx: FI, IsZExt: true);
  } else if (VA.getLocInfo() == CCValAssign::SExt) {
    MFI.setObjectSExt(ObjectIdx: FI, IsSExt: true);
  }

  MaybeAlign Alignment;
  if (Subtarget.isTargetWindowsMSVC() && !Subtarget.is64Bit() &&
      ValVT != MVT::f80)
    Alignment = MaybeAlign(4);
  SDValue FIN = DAG.getFrameIndex(FI, VT: PtrVT);
  SDValue Val = DAG.getLoad(
      VT: ValVT, dl, Chain, Ptr: FIN,
      PtrInfo: MachinePointerInfo::getFixedStack(MF&: DAG.getMachineFunction(), FI),
      Alignment);
  // For a mask value extended in memory, rebuild the value type: vectors via
  // SCALAR_TO_VECTOR, scalars via TRUNCATE.
  return ExtendedInMem
             ? (VA.getValVT().isVector()
                    ? DAG.getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL: dl, VT: VA.getValVT(), Operand: Val)
                    : DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: VA.getValVT(), Operand: Val))
             : Val;
}
1415
1416// FIXME: Get this from tablegen.
1417static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
1418 const X86Subtarget &Subtarget) {
1419 assert(Subtarget.is64Bit());
1420
1421 if (Subtarget.isCallingConvWin64(CC: CallConv)) {
1422 static const MCPhysReg GPR64ArgRegsWin64[] = {
1423 X86::RCX, X86::RDX, X86::R8, X86::R9
1424 };
1425 return GPR64ArgRegsWin64;
1426 }
1427
1428 static const MCPhysReg GPR64ArgRegs64Bit[] = {
1429 X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
1430 };
1431 return GPR64ArgRegs64Bit;
1432}
1433
1434// FIXME: Get this from tablegen.
1435static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
1436 CallingConv::ID CallConv,
1437 const X86Subtarget &Subtarget) {
1438 assert(Subtarget.is64Bit());
1439 if (Subtarget.isCallingConvWin64(CC: CallConv)) {
1440 // The XMM registers which might contain var arg parameters are shadowed
1441 // in their paired GPR. So we only need to save the GPR to their home
1442 // slots.
1443 // TODO: __vectorcall will change this.
1444 return {};
1445 }
1446
1447 bool isSoftFloat = Subtarget.useSoftFloat();
1448 if (isSoftFloat || !Subtarget.hasSSE1())
1449 // Kernel mode asks for SSE to be disabled, so there are no XMM argument
1450 // registers.
1451 return {};
1452
1453 static const MCPhysReg XMMArgRegs64Bit[] = {
1454 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
1455 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
1456 };
1457 return XMMArgRegs64Bit;
1458}
1459
1460#ifndef NDEBUG
1461static bool isSortedByValueNo(ArrayRef<CCValAssign> ArgLocs) {
1462 return llvm::is_sorted(
1463 ArgLocs, [](const CCValAssign &A, const CCValAssign &B) -> bool {
1464 return A.getValNo() < B.getValNo();
1465 });
1466}
1467#endif
1468
namespace {
/// This is a helper class for lowering variable arguments parameters.
/// It bundles the per-function state the vararg lowering steps need, so the
/// individual helpers do not carry long parameter lists.
class VarArgsLoweringHelper {
public:
  VarArgsLoweringHelper(X86MachineFunctionInfo *FuncInfo, const SDLoc &Loc,
                        SelectionDAG &DAG, const X86Subtarget &Subtarget,
                        CallingConv::ID CallConv, CCState &CCInfo)
      : FuncInfo(FuncInfo), DL(Loc), DAG(DAG), Subtarget(Subtarget),
        TheMachineFunction(DAG.getMachineFunction()),
        TheFunction(TheMachineFunction.getFunction()),
        FrameInfo(TheMachineFunction.getFrameInfo()),
        FrameLowering(*Subtarget.getFrameLowering()),
        TargLowering(DAG.getTargetLoweringInfo()), CallConv(CallConv),
        CCInfo(CCInfo) {}

  // Lower variable arguments parameters.
  void lowerVarArgsParameters(SDValue &Chain, unsigned StackSize);

private:
  // Create the vararg frame index (and, on 64-bit targets, the register save
  // area) and spill the unallocated argument registers into it.
  void createVarArgAreaAndStoreRegisters(SDValue &Chain, unsigned StackSize);

  // Forward must-tail parameters; see the out-of-line definition.
  void forwardMustTailParameters(SDValue &Chain);

  bool is64Bit() const { return Subtarget.is64Bit(); }
  bool isWin64() const { return Subtarget.isCallingConvWin64(CC: CallConv); }

  X86MachineFunctionInfo *FuncInfo; // Per-function X86 lowering state.
  const SDLoc &DL;                  // Debug location for created nodes.
  SelectionDAG &DAG;
  const X86Subtarget &Subtarget;
  MachineFunction &TheMachineFunction;
  const Function &TheFunction;
  MachineFrameInfo &FrameInfo;
  const TargetFrameLowering &FrameLowering;
  const TargetLowering &TargLowering;
  CallingConv::ID CallConv;
  CCState &CCInfo;
};
} // namespace
1508
/// Create the vararg frame objects and, on 64-bit targets, spill the
/// still-unallocated GPR and XMM argument registers into the register save
/// area so va_arg can find them. Updates \p Chain with the emitted stores.
void VarArgsLoweringHelper::createVarArgAreaAndStoreRegisters(
    SDValue &Chain, unsigned StackSize) {
  // If the function takes variable number of arguments, make a frame index for
  // the start of the first vararg value... for expansion of llvm.va_start. We
  // can skip this if there are no va_start calls.
  // (On 32-bit fastcall/thiscall the frame index is set up elsewhere.)
  if (is64Bit() || (CallConv != CallingConv::X86_FastCall &&
                    CallConv != CallingConv::X86_ThisCall)) {
    FuncInfo->setVarArgsFrameIndex(
        FrameInfo.CreateFixedObject(Size: 1, SPOffset: StackSize, IsImmutable: true));
  }

  // 64-bit calling conventions support varargs and register parameters, so we
  // have to do extra work to spill them in the prologue.
  if (is64Bit()) {
    // Find the first unallocated argument registers.
    ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
    ArrayRef<MCPhysReg> ArgXMMs =
        get64BitArgumentXMMs(MF&: TheMachineFunction, CallConv, Subtarget);
    unsigned NumIntRegs = CCInfo.getFirstUnallocated(Regs: ArgGPRs);
    unsigned NumXMMRegs = CCInfo.getFirstUnallocated(Regs: ArgXMMs);

    assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
           "SSE register cannot be used when SSE is disabled!");

    if (isWin64()) {
      // Get to the caller-allocated home save location. Add 8 to account
      // for the return address.
      int HomeOffset = FrameLowering.getOffsetOfLocalArea() + 8;
      FuncInfo->setRegSaveFrameIndex(
          FrameInfo.CreateFixedObject(Size: 1, SPOffset: NumIntRegs * 8 + HomeOffset, IsImmutable: false));
      // Fixup to set vararg frame on shadow area (4 x i64).
      if (NumIntRegs < 4)
        FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
    } else {
      // For X86-64, if there are vararg parameters that are passed via
      // registers, then we must store them to their spots on the stack so
      // they may be loaded by dereferencing the result of va_next.
      // The save area layout is: all GPR slots (8 bytes each) first, then all
      // XMM slots (16 bytes each); the GP/FP offsets index into it.
      FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
      FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
      FuncInfo->setRegSaveFrameIndex(FrameInfo.CreateStackObject(
          Size: ArgGPRs.size() * 8 + ArgXMMs.size() * 16, Alignment: Align(16), isSpillSlot: false));
    }

    SmallVector<SDValue, 6>
        LiveGPRs; // list of SDValue for GPR registers keeping live input value
    SmallVector<SDValue, 8> LiveXMMRegs; // list of SDValue for XMM registers
                                         // keeping live input value
    SDValue ALVal; // if applicable keeps SDValue for %al register

    // Gather all the live in physical registers.
    for (MCPhysReg Reg : ArgGPRs.slice(N: NumIntRegs)) {
      Register GPR = TheMachineFunction.addLiveIn(PReg: Reg, RC: &X86::GR64RegClass);
      LiveGPRs.push_back(Elt: DAG.getCopyFromReg(Chain, dl: DL, Reg: GPR, VT: MVT::i64));
    }
    const auto &AvailableXmms = ArgXMMs.slice(N: NumXMMRegs);
    if (!AvailableXmms.empty()) {
      // %al carries the number of XMM registers actually used by the caller.
      Register AL = TheMachineFunction.addLiveIn(PReg: X86::AL, RC: &X86::GR8RegClass);
      ALVal = DAG.getCopyFromReg(Chain, dl: DL, Reg: AL, VT: MVT::i8);
      for (MCPhysReg Reg : AvailableXmms) {
        // FastRegisterAllocator spills virtual registers at basic
        // block boundary. That leads to usages of xmm registers
        // outside of check for %al. Pass physical registers to
        // VASTART_SAVE_XMM_REGS to avoid unneccessary spilling.
        TheMachineFunction.getRegInfo().addLiveIn(Reg);
        LiveXMMRegs.push_back(Elt: DAG.getRegister(Reg, VT: MVT::v4f32));
      }
    }

    // Store the integer parameter registers.
    SmallVector<SDValue, 8> MemOps;
    SDValue RSFIN =
        DAG.getFrameIndex(FI: FuncInfo->getRegSaveFrameIndex(),
                          VT: TargLowering.getPointerTy(DL: DAG.getDataLayout()));
    unsigned Offset = FuncInfo->getVarArgsGPOffset();
    // Each GPR goes to its 8-byte slot at increasing offsets in the save area.
    for (SDValue Val : LiveGPRs) {
      SDValue FIN = DAG.getNode(Opcode: ISD::ADD, DL,
                                VT: TargLowering.getPointerTy(DL: DAG.getDataLayout()),
                                N1: RSFIN, N2: DAG.getIntPtrConstant(Val: Offset, DL));
      SDValue Store =
          DAG.getStore(Chain: Val.getValue(R: 1), dl: DL, Val, Ptr: FIN,
                       PtrInfo: MachinePointerInfo::getFixedStack(
                           MF&: DAG.getMachineFunction(),
                           FI: FuncInfo->getRegSaveFrameIndex(), Offset));
      MemOps.push_back(Elt: Store);
      Offset += 8;
    }

    // Now store the XMM (fp + vector) parameter registers.
    // These are stored conditionally on %al by a single pseudo node so only
    // the registers the caller actually used get written.
    if (!LiveXMMRegs.empty()) {
      SmallVector<SDValue, 12> SaveXMMOps;
      SaveXMMOps.push_back(Elt: Chain);
      SaveXMMOps.push_back(Elt: ALVal);
      SaveXMMOps.push_back(Elt: RSFIN);
      SaveXMMOps.push_back(
          Elt: DAG.getTargetConstant(Val: FuncInfo->getVarArgsFPOffset(), DL, VT: MVT::i32));
      llvm::append_range(C&: SaveXMMOps, R&: LiveXMMRegs);
      MachineMemOperand *StoreMMO =
          DAG.getMachineFunction().getMachineMemOperand(
              PtrInfo: MachinePointerInfo::getFixedStack(
                  MF&: DAG.getMachineFunction(), FI: FuncInfo->getRegSaveFrameIndex(),
                  Offset),
              F: MachineMemOperand::MOStore, Size: 128, BaseAlignment: Align(16));
      MemOps.push_back(Elt: DAG.getMemIntrinsicNode(Opcode: X86ISD::VASTART_SAVE_XMM_REGS,
                                                dl: DL, VTList: DAG.getVTList(VT: MVT::Other),
                                                Ops: SaveXMMOps, MemVT: MVT::i8, MMO: StoreMMO));
    }

    if (!MemOps.empty())
      Chain = DAG.getNode(Opcode: ISD::TokenFactor, DL, VT: MVT::Other, Ops: MemOps);
  }
}
1620
1621void VarArgsLoweringHelper::forwardMustTailParameters(SDValue &Chain) {
1622 // Find the largest legal vector type.
1623 MVT VecVT = MVT::Other;
1624 // FIXME: Only some x86_32 calling conventions support AVX512.
1625 if (Subtarget.useAVX512Regs() &&
1626 (is64Bit() || (CallConv == CallingConv::X86_VectorCall ||
1627 CallConv == CallingConv::Intel_OCL_BI)))
1628 VecVT = MVT::v16f32;
1629 else if (Subtarget.hasAVX())
1630 VecVT = MVT::v8f32;
1631 else if (Subtarget.hasSSE2())
1632 VecVT = MVT::v4f32;
1633
1634 // We forward some GPRs and some vector types.
1635 SmallVector<MVT, 2> RegParmTypes;
1636 MVT IntVT = is64Bit() ? MVT::i64 : MVT::i32;
1637 RegParmTypes.push_back(Elt: IntVT);
1638 if (VecVT != MVT::Other)
1639 RegParmTypes.push_back(Elt: VecVT);
1640
1641 // Compute the set of forwarded registers. The rest are scratch.
1642 SmallVectorImpl<ForwardedRegister> &Forwards =
1643 FuncInfo->getForwardedMustTailRegParms();
1644 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, Fn: CC_X86);
1645
1646 // Forward AL for SysV x86_64 targets, since it is used for varargs.
1647 if (is64Bit() && !isWin64() && !CCInfo.isAllocated(Reg: X86::AL)) {
1648 Register ALVReg = TheMachineFunction.addLiveIn(PReg: X86::AL, RC: &X86::GR8RegClass);
1649 Forwards.push_back(Elt: ForwardedRegister(ALVReg, X86::AL, MVT::i8));
1650 }
1651
1652 // Copy all forwards from physical to virtual registers.
1653 for (ForwardedRegister &FR : Forwards) {
1654 // FIXME: Can we use a less constrained schedule?
1655 SDValue RegVal = DAG.getCopyFromReg(Chain, dl: DL, Reg: FR.VReg, VT: FR.VT);
1656 FR.VReg = TheMachineFunction.getRegInfo().createVirtualRegister(
1657 RegClass: TargLowering.getRegClassFor(VT: FR.VT));
1658 Chain = DAG.getCopyToReg(Chain, dl: DL, Reg: FR.VReg, N: RegVal);
1659 }
1660}
1661
1662void VarArgsLoweringHelper::lowerVarArgsParameters(SDValue &Chain,
1663 unsigned StackSize) {
1664 // Set FrameIndex to the 0xAAAAAAA value to mark unset state.
1665 // If necessary, it would be set into the correct value later.
1666 FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
1667 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
1668
1669 if (FrameInfo.hasVAStart())
1670 createVarArgAreaAndStoreRegisters(Chain, StackSize);
1671
1672 if (FrameInfo.hasMustTailInVarArgFunc())
1673 forwardMustTailParameters(Chain);
1674}
1675
/// Lower the incoming (formal) arguments of the current function into the
/// SelectionDAG: assign each argument a register or stack location, build the
/// corresponding CopyFromReg/load nodes into \p InVals, and record ABI
/// bookkeeping (callee-pop bytes, varargs save area, sret return register,
/// CSR adjustments). Returns the updated chain.
SDValue X86TargetLowering::LowerFormalArguments(
    SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
  MachineFunction &MF = DAG.getMachineFunction();
  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();

  // On Cygwin/MinGW, an externally visible "main" is always given a frame
  // pointer.
  const Function &F = MF.getFunction();
  if (F.hasExternalLinkage() && Subtarget.isTargetCygMing() &&
      F.getName() == "main")
    FuncInfo->setForceFramePointer(true);

  MachineFrameInfo &MFI = MF.getFrameInfo();
  bool Is64Bit = Subtarget.is64Bit();
  bool IsWin64 = Subtarget.isCallingConvWin64(CC: CallConv);

  // On x86_64 with x87 disabled, x86_fp80 cannot be handled: the type would
  // need to be returned/passed in x87 registers (FP0/FP1) which are
  // unavailable. Emit a clear diagnostic instead of crashing later with
  // "Cannot select: build_pair".
  if (Is64Bit && !Subtarget.hasX87()) {
    if (F.getReturnType()->isX86_FP80Ty() ||
        any_of(Range: F.args(), P: [](const Argument &Arg) {
          return Arg.getType()->isX86_FP80Ty();
        }))
      reportFatalUsageError(
          reason: "cannot use x86_fp80 type with x87 disabled on x86_64 target");
  }

  assert(
      !(IsVarArg && canGuaranteeTCO(CallConv)) &&
      "Var args not supported with calling conv' regcall, fastcc, ghc or hipe");

  // Assign locations to all of the incoming arguments.
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());

  // Allocate shadow area for Win64.
  if (IsWin64)
    CCInfo.AllocateStack(Size: 32, Alignment: Align(8));

  CCInfo.AnalyzeArguments(Ins, Fn: CC_X86);

  // In vectorcall calling convention a second pass is required for the HVA
  // types.
  if (CallingConv::X86_VectorCall == CallConv) {
    CCInfo.AnalyzeArgumentsSecondPass(Args: Ins, Fn: CC_X86);
  }

  // The next loop assumes that the locations are in the same order of the
  // input arguments.
  assert(isSortedByValueNo(ArgLocs) &&
         "Argument Location list must be sorted before lowering");

  SDValue ArgValue;
  for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E;
       ++I, ++InsIndex) {
    assert(InsIndex < Ins.size() && "Invalid Ins index");
    CCValAssign &VA = ArgLocs[I];

    if (VA.isRegLoc()) {
      EVT RegVT = VA.getLocVT();
      if (VA.needsCustom()) {
        assert(
            VA.getValVT() == MVT::v64i1 &&
            "Currently the only custom case is when we split v64i1 to 2 regs");

        // v64i1 values, in regcall calling convention, that are
        // compiled to 32 bit arch, are split up into two registers.
        // NOTE: consumes the next location too (++I).
        ArgValue =
            getv64i1Argument(VA, NextVA&: ArgLocs[++I], Root&: Chain, DAG, DL: dl, Subtarget);
      } else {
        // Pick the register class matching the (possibly promoted) location
        // type so the physical register can be added as a function live-in.
        const TargetRegisterClass *RC;
        if (RegVT == MVT::i8)
          RC = &X86::GR8RegClass;
        else if (RegVT == MVT::i16)
          RC = &X86::GR16RegClass;
        else if (RegVT == MVT::i32)
          RC = &X86::GR32RegClass;
        else if (Is64Bit && RegVT == MVT::i64)
          RC = &X86::GR64RegClass;
        else if (RegVT == MVT::f16)
          RC = Subtarget.hasAVX512() ? &X86::FR16XRegClass : &X86::FR16RegClass;
        else if (RegVT == MVT::f32)
          RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
        else if (RegVT == MVT::f64)
          RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass;
        else if (RegVT == MVT::f80)
          RC = &X86::RFP80RegClass;
        else if (RegVT == MVT::f128)
          RC = &X86::VR128RegClass;
        else if (RegVT.is512BitVector())
          RC = &X86::VR512RegClass;
        else if (RegVT.is256BitVector())
          RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass;
        else if (RegVT.is128BitVector())
          RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass;
        else if (RegVT == MVT::x86mmx)
          RC = &X86::VR64RegClass;
        else if (RegVT == MVT::v1i1)
          RC = &X86::VK1RegClass;
        else if (RegVT == MVT::v8i1)
          RC = &X86::VK8RegClass;
        else if (RegVT == MVT::v16i1)
          RC = &X86::VK16RegClass;
        else if (RegVT == MVT::v32i1)
          RC = &X86::VK32RegClass;
        else if (RegVT == MVT::v64i1)
          RC = &X86::VK64RegClass;
        else
          llvm_unreachable("Unknown argument type!");

        Register Reg = MF.addLiveIn(PReg: VA.getLocReg(), RC);
        ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, VT: RegVT);
      }

      // If this is an 8 or 16-bit value, it is really passed promoted to 32
      // bits. Insert an assert[sz]ext to capture this, then truncate to the
      // right size.
      if (VA.getLocInfo() == CCValAssign::SExt)
        ArgValue = DAG.getNode(Opcode: ISD::AssertSext, DL: dl, VT: RegVT, N1: ArgValue,
                               N2: DAG.getValueType(VA.getValVT()));
      else if (VA.getLocInfo() == CCValAssign::ZExt)
        ArgValue = DAG.getNode(Opcode: ISD::AssertZext, DL: dl, VT: RegVT, N1: ArgValue,
                               N2: DAG.getValueType(VA.getValVT()));
      else if (VA.getLocInfo() == CCValAssign::BCvt)
        ArgValue = DAG.getBitcast(VT: VA.getValVT(), V: ArgValue);

      if (VA.isExtInLoc()) {
        // Handle MMX values passed in XMM regs.
        if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1)
          ArgValue = DAG.getNode(Opcode: X86ISD::MOVDQ2Q, DL: dl, VT: VA.getValVT(), Operand: ArgValue);
        else if (VA.getValVT().isVector() &&
                 VA.getValVT().getScalarType() == MVT::i1 &&
                 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
                  (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
          // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
          ArgValue = lowerRegToMasks(ValArg: ArgValue, ValVT: VA.getValVT(), ValLoc: RegVT, DL: dl, DAG);
        } else
          ArgValue = DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: VA.getValVT(), Operand: ArgValue);
      }
    } else {
      assert(VA.isMemLoc());
      ArgValue =
          LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i: InsIndex);
    }

    // If value is passed via pointer - do a load.
    if (VA.getLocInfo() == CCValAssign::Indirect &&
        !(Ins[I].Flags.isByVal() && VA.isRegLoc())) {
      ArgValue =
          DAG.getLoad(VT: VA.getValVT(), dl, Chain, Ptr: ArgValue, PtrInfo: MachinePointerInfo());
    }

    InVals.push_back(Elt: ArgValue);
  }

  // Second pass over the formal arguments for attribute-driven bookkeeping:
  // the swiftasync context slot and the sret return register.
  for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
    if (Ins[I].Flags.isSwiftAsync()) {
      auto X86FI = MF.getInfo<X86MachineFunctionInfo>();
      if (X86::isExtendedSwiftAsyncFrameSupported(Subtarget, MF))
        X86FI->setHasSwiftAsyncContext(true);
      else {
        // No extended frame support: spill the context into a dedicated
        // pointer-sized stack slot instead.
        int PtrSize = Subtarget.is64Bit() ? 8 : 4;
        int FI =
            MF.getFrameInfo().CreateStackObject(Size: PtrSize, Alignment: Align(PtrSize), isSpillSlot: false);
        X86FI->setSwiftAsyncContextFrameIdx(FI);
        SDValue St = DAG.getStore(
            Chain: DAG.getEntryNode(), dl, Val: InVals[I],
            Ptr: DAG.getFrameIndex(FI, VT: PtrSize == 8 ? MVT::i64 : MVT::i32),
            PtrInfo: MachinePointerInfo::getFixedStack(MF, FI));
        Chain = DAG.getNode(Opcode: ISD::TokenFactor, DL: dl, VT: MVT::Other, N1: St, N2: Chain);
      }
    }

    // Swift calling convention does not require we copy the sret argument
    // into %rax/%eax for the return. We don't set SRetReturnReg for Swift.
    if (CallConv == CallingConv::Swift || CallConv == CallingConv::SwiftTail)
      continue;

    // All x86 ABIs require that for returning structs by value we copy the
    // sret argument into %rax/%eax (depending on ABI) for the return. Save
    // the argument into a virtual register so that we can access it from the
    // return points.
    if (Ins[I].Flags.isSRet()) {
      assert(!FuncInfo->getSRetReturnReg() &&
             "SRet return has already been set");
      MVT PtrTy = getPointerTy(DL: DAG.getDataLayout());
      Register Reg =
          MF.getRegInfo().createVirtualRegister(RegClass: getRegClassFor(VT: PtrTy));
      FuncInfo->setSRetReturnReg(Reg);
      SDValue Copy = DAG.getCopyToReg(Chain: DAG.getEntryNode(), dl, Reg, N: InVals[I]);
      Chain = DAG.getNode(Opcode: ISD::TokenFactor, DL: dl, VT: MVT::Other, N1: Copy, N2: Chain);
      break;
    }
  }

  // Stack space consumed by the incoming arguments.
  unsigned StackSize = CCInfo.getStackSize();
  // Align stack specially for tail calls.
  if (shouldGuaranteeTCO(CC: CallConv,
                         GuaranteedTailCallOpt: MF.getTarget().Options.GuaranteedTailCallOpt))
    StackSize = GetAlignedArgumentStackSize(StackSize, DAG);

  if (IsVarArg)
    VarArgsLoweringHelper(FuncInfo, dl, DAG, Subtarget, CallConv, CCInfo)
        .lowerVarArgsParameters(Chain, StackSize);

  // Some CCs need callee pop.
  if (X86::isCalleePop(CallingConv: CallConv, is64Bit: Is64Bit, IsVarArg,
                       GuaranteeTCO: MF.getTarget().Options.GuaranteedTailCallOpt)) {
    FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
  } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
    // X86 interrupts must pop the error code (and the alignment padding) if
    // present.
    FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4);
  } else {
    FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
    // If this is an sret function, the return should pop the hidden pointer.
    if (hasCalleePopSRet(Args: Ins, ArgLocs, Subtarget))
      FuncInfo->setBytesToPopOnReturn(4);
  }

  if (!Is64Bit) {
    // RegSaveFrameIndex is X86-64 only.
    FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
  }

  FuncInfo->setArgumentStackSize(StackSize);

  if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {
    EHPersonality Personality = classifyEHPersonality(Pers: F.getPersonalityFn());
    if (Personality == EHPersonality::CoreCLR) {
      assert(Is64Bit);
      // TODO: Add a mechanism to frame lowering that will allow us to indicate
      // that we'd prefer this slot be allocated towards the bottom of the frame
      // (i.e. near the stack pointer after allocating the frame). Every
      // funclet needs a copy of this slot in its (mostly empty) frame, and the
      // offset from the bottom of this and each funclet's frame must be the
      // same, so the size of funclets' (mostly empty) frames is dictated by
      // how far this slot is from the bottom (since they allocate just enough
      // space to accommodate holding this slot at the correct offset).
      int PSPSymFI = MFI.CreateStackObject(Size: 8, Alignment: Align(8), /*isSpillSlot=*/false);
      EHInfo->PSPSymFrameIdx = PSPSymFI;
    }
  }

  // For CCs/attributes that dynamically shrink the callee-saved set, mark
  // every argument live-in register as not callee-saved.
  if (shouldDisableArgRegFromCSR(CC: CallConv) ||
      F.hasFnAttribute(Kind: "no_caller_saved_registers")) {
    MachineRegisterInfo &MRI = MF.getRegInfo();
    for (std::pair<MCRegister, Register> Pair : MRI.liveins())
      MRI.disableCalleeSavedRegister(Reg: Pair.first);
  }

  // preserve_none cannot honor Swift's special argument registers; diagnose.
  if (CallingConv::PreserveNone == CallConv)
    for (const ISD::InputArg &In : Ins) {
      if (In.Flags.isSwiftSelf() || In.Flags.isSwiftAsync() ||
          In.Flags.isSwiftError()) {
        errorUnsupported(DAG, dl,
                         Msg: "Swift attributes can't be used with preserve_none");
        break;
      }
    }

  return Chain;
}
1941
1942SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
1943 SDValue Arg, const SDLoc &dl,
1944 SelectionDAG &DAG,
1945 const CCValAssign &VA,
1946 ISD::ArgFlagsTy Flags,
1947 bool isByVal) const {
1948 unsigned LocMemOffset = VA.getLocMemOffset();
1949 SDValue PtrOff = DAG.getIntPtrConstant(Val: LocMemOffset, DL: dl);
1950 PtrOff = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: getPointerTy(DL: DAG.getDataLayout()),
1951 N1: StackPtr, N2: PtrOff);
1952 if (isByVal)
1953 return CreateCopyOfByValArgument(Src: Arg, Dst: PtrOff, Chain, Flags, DAG, dl);
1954
1955 MaybeAlign Alignment;
1956 if (Subtarget.isTargetWindowsMSVC() && !Subtarget.is64Bit() &&
1957 Arg.getSimpleValueType() != MVT::f80)
1958 Alignment = MaybeAlign(4);
1959 return DAG.getStore(
1960 Chain, dl, Val: Arg, Ptr: PtrOff,
1961 PtrInfo: MachinePointerInfo::getStack(MF&: DAG.getMachineFunction(), Offset: LocMemOffset),
1962 Alignment);
1963}
1964
1965/// Emit a load of return address if tail call
1966/// optimization is performed and it is required.
1967SDValue X86TargetLowering::EmitTailCallLoadRetAddr(
1968 SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall,
1969 bool Is64Bit, int FPDiff, const SDLoc &dl) const {
1970 // Adjust the Return address stack slot.
1971 EVT VT = getPointerTy(DL: DAG.getDataLayout());
1972 OutRetAddr = getReturnAddressFrameIndex(DAG);
1973
1974 // Load the "old" Return address.
1975 OutRetAddr = DAG.getLoad(VT, dl, Chain, Ptr: OutRetAddr, PtrInfo: MachinePointerInfo());
1976 return SDValue(OutRetAddr.getNode(), 1);
1977}
1978
1979/// Emit a store of the return address if tail call
1980/// optimization is performed and it is required (FPDiff!=0).
1981static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
1982 SDValue Chain, SDValue RetAddrFrIdx,
1983 EVT PtrVT, unsigned SlotSize,
1984 int FPDiff, const SDLoc &dl) {
1985 // Store the return address to the appropriate stack slot.
1986 if (!FPDiff) return Chain;
1987 // Calculate the new stack slot for the return address.
1988 int NewReturnAddrFI =
1989 MF.getFrameInfo().CreateFixedObject(Size: SlotSize, SPOffset: (int64_t)FPDiff - SlotSize,
1990 IsImmutable: false);
1991 SDValue NewRetAddrFrIdx = DAG.getFrameIndex(FI: NewReturnAddrFI, VT: PtrVT);
1992 Chain = DAG.getStore(Chain, dl, Val: RetAddrFrIdx, Ptr: NewRetAddrFrIdx,
1993 PtrInfo: MachinePointerInfo::getFixedStack(
1994 MF&: DAG.getMachineFunction(), FI: NewReturnAddrFI));
1995 return Chain;
1996}
1997
1998/// Returns a vector_shuffle mask for an movs{s|d}, movd
1999/// operation of specified width.
2000SDValue X86TargetLowering::getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
2001 SDValue V1, SDValue V2) const {
2002 unsigned NumElems = VT.getVectorNumElements();
2003 SmallVector<int, 8> Mask;
2004 Mask.push_back(Elt: NumElems);
2005 for (unsigned i = 1; i != NumElems; ++i)
2006 Mask.push_back(Elt: i);
2007 return DAG.getVectorShuffle(VT, dl, N1: V1, N2: V2, Mask);
2008}
2009
2010// Returns the type of copying which is required to set up a byval argument to
2011// a tail-called function. This isn't needed for non-tail calls, because they
2012// always need the equivalent of CopyOnce, but tail-calls sometimes need two to
2013// avoid clobbering another argument (CopyViaTemp), and sometimes can be
2014// optimised to zero copies when forwarding an argument from the caller's
2015// caller (NoCopy).
2016X86TargetLowering::ByValCopyKind X86TargetLowering::ByValNeedsCopyForTailCall(
2017 SelectionDAG &DAG, SDValue Src, SDValue Dst, ISD::ArgFlagsTy Flags) const {
2018 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
2019
2020 // Globals are always safe to copy from.
2021 if (isa<GlobalAddressSDNode>(Val: Src) || isa<ExternalSymbolSDNode>(Val: Src))
2022 return CopyOnce;
2023
2024 // Can only analyse frame index nodes, conservatively assume we need a
2025 // temporary.
2026 auto *SrcFrameIdxNode = dyn_cast<FrameIndexSDNode>(Val&: Src);
2027 auto *DstFrameIdxNode = dyn_cast<FrameIndexSDNode>(Val&: Dst);
2028 if (!SrcFrameIdxNode || !DstFrameIdxNode)
2029 return CopyViaTemp;
2030
2031 int SrcFI = SrcFrameIdxNode->getIndex();
2032 int DstFI = DstFrameIdxNode->getIndex();
2033 assert(MFI.isFixedObjectIndex(DstFI) &&
2034 "byval passed in non-fixed stack slot");
2035
2036 int64_t SrcOffset = MFI.getObjectOffset(ObjectIdx: SrcFI);
2037 int64_t DstOffset = MFI.getObjectOffset(ObjectIdx: DstFI);
2038
2039 // If the source is in the local frame, then the copy to the argument
2040 // memory is always valid.
2041 bool FixedSrc = MFI.isFixedObjectIndex(ObjectIdx: SrcFI);
2042 if (!FixedSrc || (FixedSrc && SrcOffset < 0))
2043 return CopyOnce;
2044
2045 // If the value is already in the correct location, then no copying is
2046 // needed. If not, then we need to copy via a temporary.
2047 if (SrcOffset == DstOffset)
2048 return NoCopy;
2049 else
2050 return CopyViaTemp;
2051}
2052
2053SDValue
2054X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
2055 SmallVectorImpl<SDValue> &InVals) const {
2056 SelectionDAG &DAG = CLI.DAG;
2057 SDLoc &dl = CLI.DL;
2058 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
2059 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
2060 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
2061 SDValue Chain = CLI.Chain;
2062 SDValue Callee = CLI.Callee;
2063 CallingConv::ID CallConv = CLI.CallConv;
2064 bool &isTailCall = CLI.IsTailCall;
2065 bool isVarArg = CLI.IsVarArg;
2066 const auto *CB = CLI.CB;
2067
2068 MachineFunction &MF = DAG.getMachineFunction();
2069 bool Is64Bit = Subtarget.is64Bit();
2070 bool IsWin64 = Subtarget.isCallingConvWin64(CC: CallConv);
2071 bool ShouldGuaranteeTCO = shouldGuaranteeTCO(
2072 CC: CallConv, GuaranteedTailCallOpt: MF.getTarget().Options.GuaranteedTailCallOpt);
2073 X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
2074 bool HasNCSR = (CB && isa<CallInst>(Val: CB) &&
2075 CB->hasFnAttr(Kind: "no_caller_saved_registers"));
2076 bool IsIndirectCall = (CB && isa<CallInst>(Val: CB) && CB->isIndirectCall());
2077 bool IsCFICall = IsIndirectCall && CLI.CFIType;
2078 const Module *M = MF.getFunction().getParent();
2079
2080 // If the indirect call target has the nocf_check attribute, the call needs
2081 // the NOTRACK prefix. For simplicity just disable tail calls as there are
2082 // so many variants.
2083 // FIXME: This will cause backend errors if the user forces the issue.
2084 bool IsNoTrackIndirectCall = IsIndirectCall && CB->doesNoCfCheck() &&
2085 M->getModuleFlag(Key: "cf-protection-branch");
2086 if (IsNoTrackIndirectCall)
2087 isTailCall = false;
2088
2089 MachineFunction::CallSiteInfo CSInfo;
2090 if (CallConv == CallingConv::X86_INTR)
2091 report_fatal_error(reason: "X86 interrupts may not be called directly");
2092
2093 // Set type id for call site info.
2094 setTypeIdForCallsiteInfo(CB, MF, CSInfo);
2095
2096 if (IsIndirectCall && !IsWin64 &&
2097 M->getModuleFlag(Key: "import-call-optimization"))
2098 errorUnsupported(DAG, dl,
2099 Msg: "Indirect calls must have a normal calling convention if "
2100 "Import Call Optimization is enabled");
2101
2102 // Analyze operands of the call, assigning locations to each operand.
2103 SmallVector<CCValAssign, 16> ArgLocs;
2104 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
2105
2106 // Allocate shadow area for Win64.
2107 if (IsWin64)
2108 CCInfo.AllocateStack(Size: 32, Alignment: Align(8));
2109
2110 CCInfo.AnalyzeArguments(Outs, Fn: CC_X86);
2111
2112 // In vectorcall calling convention a second pass is required for the HVA
2113 // types.
2114 if (CallingConv::X86_VectorCall == CallConv) {
2115 CCInfo.AnalyzeArgumentsSecondPass(Args: Outs, Fn: CC_X86);
2116 }
2117
2118 bool IsMustTail = CLI.CB && CLI.CB->isMustTailCall();
2119 bool IsSibcall = false;
2120 if (isTailCall && ShouldGuaranteeTCO) {
2121 // If we need to guarantee TCO for a non-musttail call, we just need to make
2122 // sure the conventions match. If a tail call uses one of the supported TCO
2123 // conventions and the caller and callee match, we can tail call any
2124 // function prototype.
2125 CallingConv::ID CallerCC = MF.getFunction().getCallingConv();
2126 isTailCall = (CallConv == CallerCC);
2127 IsSibcall = IsMustTail;
2128 } else if (isTailCall) {
2129 // Check if this tail call is a "sibling" call, which is loosely defined to
2130 // be a tail call that doesn't require heroics like moving the return
2131 // address or swapping byval arguments. We treat some musttail calls as
2132 // sibling calls to avoid unnecessary argument copies.
2133 IsSibcall = isEligibleForSiblingCallOpt(CLI, CCInfo, ArgLocs);
2134 isTailCall = IsSibcall || IsMustTail;
2135 }
2136
2137 if (isTailCall)
2138 ++NumTailCalls;
2139
2140 if (IsMustTail && !isTailCall)
2141 report_fatal_error(reason: "failed to perform tail call elimination on a call "
2142 "site marked musttail");
2143
2144 assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
2145 "Var args not supported with calling convention fastcc, ghc or hipe");
2146
2147 // Get a count of how many bytes are to be pushed on the stack.
2148 unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
2149 if (IsSibcall)
2150 // This is a sibcall. The memory operands are available in caller's
2151 // own caller's stack.
2152 NumBytes = 0;
2153 else if (ShouldGuaranteeTCO && canGuaranteeTCO(CC: CallConv))
2154 NumBytes = GetAlignedArgumentStackSize(StackSize: NumBytes, DAG);
2155
2156 // A sibcall is ABI-compatible and does not need to adjust the stack pointer.
2157 int FPDiff = 0;
2158 if (isTailCall && ShouldGuaranteeTCO && !IsSibcall) {
2159 // Lower arguments at fp - stackoffset + fpdiff.
2160 unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
2161
2162 FPDiff = NumBytesCallerPushed - NumBytes;
2163
2164 // Set the delta of movement of the returnaddr stackslot.
2165 // But only set if delta is greater than previous delta.
2166 if (FPDiff < X86Info->getTCReturnAddrDelta())
2167 X86Info->setTCReturnAddrDelta(FPDiff);
2168 }
2169
2170 unsigned NumBytesToPush = NumBytes;
2171 unsigned NumBytesToPop = NumBytes;
2172
2173 SDValue StackPtr;
2174 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
2175
2176 // If we are doing a tail-call, any byval arguments will be written to stack
2177 // space which was used for incoming arguments. If any the values being used
2178 // are incoming byval arguments to this function, then they might be
2179 // overwritten by the stores of the outgoing arguments. To avoid this, we
2180 // need to make a temporary copy of them in local stack space, then copy back
2181 // to the argument area.
2182 // FIXME: There's potential to improve the code by using virtual registers for
2183 // temporary storage, and letting the register allocator spill if needed.
2184 SmallVector<SDValue, 8> ByValTemporaries;
2185 SDValue ByValTempChain;
2186 if (isTailCall) {
2187 // Use null SDValue to mean "no temporary recorded for this arg index".
2188 ByValTemporaries.assign(NumElts: OutVals.size(), Elt: SDValue());
2189
2190 SmallVector<SDValue, 8> ByValCopyChains;
2191 for (const CCValAssign &VA : ArgLocs) {
2192 unsigned ArgIdx = VA.getValNo();
2193 SDValue Src = OutVals[ArgIdx];
2194 ISD::ArgFlagsTy Flags = Outs[ArgIdx].Flags;
2195
2196 if (!Flags.isByVal())
2197 continue;
2198
2199 auto PtrVT = getPointerTy(DL: DAG.getDataLayout());
2200
2201 if (!StackPtr.getNode())
2202 StackPtr =
2203 DAG.getCopyFromReg(Chain, dl, Reg: RegInfo->getStackRegister(), VT: PtrVT);
2204
2205 // Destination: where this byval should live in the callee’s frame
2206 // after the tail call.
2207 int64_t Offset = VA.getLocMemOffset() + FPDiff;
2208 uint64_t Size = VA.getLocVT().getFixedSizeInBits() / 8;
2209 int FI = MF.getFrameInfo().CreateFixedObject(Size, SPOffset: Offset,
2210 /*IsImmutable=*/true);
2211 SDValue Dst = DAG.getFrameIndex(FI, VT: PtrVT);
2212
2213 ByValCopyKind Copy = ByValNeedsCopyForTailCall(DAG, Src, Dst, Flags);
2214
2215 if (Copy == NoCopy) {
2216 // If the argument is already at the correct offset on the stack
2217 // (because we are forwarding a byval argument from our caller), we
2218 // don't need any copying.
2219 continue;
2220 } else if (Copy == CopyOnce) {
2221 // If the argument is in our local stack frame, no other argument
2222 // preparation can clobber it, so we can copy it to the final location
2223 // later.
2224 ByValTemporaries[ArgIdx] = Src;
2225 } else {
2226 assert(Copy == CopyViaTemp && "unexpected enum value");
2227 // If we might be copying this argument from the outgoing argument
2228 // stack area, we need to copy via a temporary in the local stack
2229 // frame.
2230 MachineFrameInfo &MFI = MF.getFrameInfo();
2231 int TempFrameIdx = MFI.CreateStackObject(Size: Flags.getByValSize(),
2232 Alignment: Flags.getNonZeroByValAlign(),
2233 /*isSS=*/isSpillSlot: false);
2234 SDValue Temp =
2235 DAG.getFrameIndex(FI: TempFrameIdx, VT: getPointerTy(DL: DAG.getDataLayout()));
2236
2237 SDValue CopyChain =
2238 CreateCopyOfByValArgument(Src, Dst: Temp, Chain, Flags, DAG, dl);
2239 ByValCopyChains.push_back(Elt: CopyChain);
2240 }
2241 }
2242 if (!ByValCopyChains.empty())
2243 ByValTempChain =
2244 DAG.getNode(Opcode: ISD::TokenFactor, DL: dl, VT: MVT::Other, Ops: ByValCopyChains);
2245 }
2246
2247 // If we have an inalloca argument, all stack space has already been allocated
2248 // for us and be right at the top of the stack. We don't support multiple
2249 // arguments passed in memory when using inalloca.
2250 if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
2251 NumBytesToPush = 0;
2252 if (!ArgLocs.back().isMemLoc())
2253 report_fatal_error(reason: "cannot use inalloca attribute on a register "
2254 "parameter");
2255 if (ArgLocs.back().getLocMemOffset() != 0)
2256 report_fatal_error(reason: "any parameter with the inalloca attribute must be "
2257 "the only memory argument");
2258 } else if (CLI.IsPreallocated) {
2259 assert(ArgLocs.back().isMemLoc() &&
2260 "cannot use preallocated attribute on a register "
2261 "parameter");
2262 SmallVector<size_t, 4> PreallocatedOffsets;
2263 for (size_t i = 0; i < CLI.OutVals.size(); ++i) {
2264 if (CLI.CB->paramHasAttr(ArgNo: i, Kind: Attribute::Preallocated)) {
2265 PreallocatedOffsets.push_back(Elt: ArgLocs[i].getLocMemOffset());
2266 }
2267 }
2268 auto *MFI = DAG.getMachineFunction().getInfo<X86MachineFunctionInfo>();
2269 size_t PreallocatedId = MFI->getPreallocatedIdForCallSite(CS: CLI.CB);
2270 MFI->setPreallocatedStackSize(Id: PreallocatedId, StackSize: NumBytes);
2271 MFI->setPreallocatedArgOffsets(Id: PreallocatedId, AO: PreallocatedOffsets);
2272 NumBytesToPush = 0;
2273 }
2274
2275 if (!IsSibcall && !IsMustTail)
2276 Chain = DAG.getCALLSEQ_START(Chain, InSize: NumBytesToPush,
2277 OutSize: NumBytes - NumBytesToPush, DL: dl);
2278
2279 SDValue RetAddrFrIdx;
2280 // Load return address for tail calls.
2281 if (isTailCall && FPDiff)
2282 Chain = EmitTailCallLoadRetAddr(DAG, OutRetAddr&: RetAddrFrIdx, Chain, IsTailCall: isTailCall,
2283 Is64Bit, FPDiff, dl);
2284
2285 SmallVector<std::pair<Register, SDValue>, 8> RegsToPass;
2286 SmallVector<SDValue, 8> MemOpChains;
2287
2288 // The next loop assumes that the locations are in the same order of the
2289 // input arguments.
2290 assert(isSortedByValueNo(ArgLocs) &&
2291 "Argument Location list must be sorted before lowering");
2292
2293 // Walk the register/memloc assignments, inserting copies/loads. In the case
2294 // of tail call optimization arguments are handle later.
2295 for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E;
2296 ++I, ++OutIndex) {
2297 assert(OutIndex < Outs.size() && "Invalid Out index");
2298 // Skip inalloca/preallocated arguments, they have already been written.
2299 ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags;
2300 if (Flags.isInAlloca() || Flags.isPreallocated())
2301 continue;
2302
2303 CCValAssign &VA = ArgLocs[I];
2304 EVT RegVT = VA.getLocVT();
2305 SDValue Arg = OutVals[OutIndex];
2306 bool isByVal = Flags.isByVal();
2307
2308 // Promote the value if needed.
2309 switch (VA.getLocInfo()) {
2310 default: llvm_unreachable("Unknown loc info!");
2311 case CCValAssign::Full: break;
2312 case CCValAssign::SExt:
2313 Arg = DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL: dl, VT: RegVT, Operand: Arg);
2314 break;
2315 case CCValAssign::ZExt:
2316 Arg = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: dl, VT: RegVT, Operand: Arg);
2317 break;
2318 case CCValAssign::AExt:
2319 if (Arg.getValueType().isVector() &&
2320 Arg.getValueType().getVectorElementType() == MVT::i1)
2321 Arg = lowerMasksToReg(ValArg: Arg, ValLoc: RegVT, DL: dl, DAG);
2322 else if (RegVT.is128BitVector()) {
2323 // Special case: passing MMX values in XMM registers.
2324 Arg = DAG.getBitcast(VT: MVT::i64, V: Arg);
2325 Arg = DAG.getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL: dl, VT: MVT::v2i64, Operand: Arg);
2326 Arg = getMOVL(DAG, dl, VT: MVT::v2i64, V1: DAG.getUNDEF(VT: MVT::v2i64), V2: Arg);
2327 } else
2328 Arg = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: dl, VT: RegVT, Operand: Arg);
2329 break;
2330 case CCValAssign::BCvt:
2331 Arg = DAG.getBitcast(VT: RegVT, V: Arg);
2332 break;
2333 case CCValAssign::Indirect: {
2334 if (isByVal) {
2335 // Memcpy the argument to a temporary stack slot to prevent
2336 // the caller from seeing any modifications the callee may make
2337 // as guaranteed by the `byval` attribute.
2338 int FrameIdx = MF.getFrameInfo().CreateStackObject(
2339 Size: Flags.getByValSize(),
2340 Alignment: std::max(a: Align(16), b: Flags.getNonZeroByValAlign()), isSpillSlot: false);
2341 SDValue StackSlot =
2342 DAG.getFrameIndex(FI: FrameIdx, VT: getPointerTy(DL: DAG.getDataLayout()));
2343 Chain =
2344 CreateCopyOfByValArgument(Src: Arg, Dst: StackSlot, Chain, Flags, DAG, dl);
2345 // From now on treat this as a regular pointer
2346 Arg = StackSlot;
2347 isByVal = false;
2348 } else {
2349 // Store the argument.
2350 SDValue SpillSlot = DAG.CreateStackTemporary(VT: VA.getValVT());
2351 int FI = cast<FrameIndexSDNode>(Val&: SpillSlot)->getIndex();
2352 Chain = DAG.getStore(
2353 Chain, dl, Val: Arg, Ptr: SpillSlot,
2354 PtrInfo: MachinePointerInfo::getFixedStack(MF&: DAG.getMachineFunction(), FI));
2355 Arg = SpillSlot;
2356 }
2357 break;
2358 }
2359 }
2360
2361 if (VA.needsCustom()) {
2362 assert(VA.getValVT() == MVT::v64i1 &&
2363 "Currently the only custom case is when we split v64i1 to 2 regs");
2364 // Split v64i1 value into two registers
2365 Passv64i1ArgInRegs(DL: dl, DAG, Arg, RegsToPass, VA, NextVA&: ArgLocs[++I], Subtarget);
2366 } else if (VA.isRegLoc()) {
2367 RegsToPass.push_back(Elt: std::make_pair(x: VA.getLocReg(), y&: Arg));
2368 const TargetOptions &Options = DAG.getTarget().Options;
2369 if (Options.EmitCallSiteInfo)
2370 CSInfo.ArgRegPairs.emplace_back(Args: VA.getLocReg(), Args&: I);
2371 if (isVarArg && IsWin64) {
2372 // Win64 ABI requires argument XMM reg to be copied to the corresponding
2373 // shadow reg if callee is a varargs function.
2374 Register ShadowReg;
2375 switch (VA.getLocReg()) {
2376 case X86::XMM0: ShadowReg = X86::RCX; break;
2377 case X86::XMM1: ShadowReg = X86::RDX; break;
2378 case X86::XMM2: ShadowReg = X86::R8; break;
2379 case X86::XMM3: ShadowReg = X86::R9; break;
2380 }
2381 if (ShadowReg)
2382 RegsToPass.push_back(Elt: std::make_pair(x&: ShadowReg, y&: Arg));
2383 }
2384 } else if (!IsSibcall && (!isTailCall || (isByVal && !IsMustTail))) {
2385 assert(VA.isMemLoc());
2386 if (!StackPtr.getNode())
2387 StackPtr = DAG.getCopyFromReg(Chain, dl, Reg: RegInfo->getStackRegister(),
2388 VT: getPointerTy(DL: DAG.getDataLayout()));
2389 MemOpChains.push_back(Elt: LowerMemOpCallTo(Chain, StackPtr, Arg,
2390 dl, DAG, VA, Flags, isByVal));
2391 }
2392 }
2393
2394 if (!MemOpChains.empty())
2395 Chain = DAG.getNode(Opcode: ISD::TokenFactor, DL: dl, VT: MVT::Other, Ops: MemOpChains);
2396
2397 if (Subtarget.isPICStyleGOT()) {
2398 // ELF / PIC requires GOT in the EBX register before function calls via PLT
2399 // GOT pointer (except regcall).
2400 if (!isTailCall) {
      // Indirect call with RegCall calling convention may use up all the
      // general registers, so it is not suitable to bind EBX register for
2403 // GOT address, just let register allocator handle it.
2404 if (CallConv != CallingConv::X86_RegCall)
2405 RegsToPass.push_back(Elt: std::make_pair(
2406 x: Register(X86::EBX), y: DAG.getNode(Opcode: X86ISD::GlobalBaseReg, DL: SDLoc(),
2407 VT: getPointerTy(DL: DAG.getDataLayout()))));
2408 } else {
2409 // If we are tail calling and generating PIC/GOT style code load the
2410 // address of the callee into ECX. The value in ecx is used as target of
2411 // the tail jump. This is done to circumvent the ebx/callee-saved problem
2412 // for tail calls on PIC/GOT architectures. Normally we would just put the
2413 // address of GOT into ebx and then call target@PLT. But for tail calls
2414 // ebx would be restored (since ebx is callee saved) before jumping to the
2415 // target@PLT.
2416
2417 // Note: The actual moving to ECX is done further down.
2418 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Val&: Callee);
2419 if (G && !G->getGlobal()->hasLocalLinkage() &&
2420 G->getGlobal()->hasDefaultVisibility())
2421 Callee = LowerGlobalAddress(Op: Callee, DAG);
2422 else if (isa<ExternalSymbolSDNode>(Val: Callee))
2423 Callee = LowerExternalSymbol(Op: Callee, DAG);
2424 }
2425 }
2426
2427 if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail &&
2428 (Subtarget.hasSSE1() || !M->getModuleFlag(Key: "SkipRaxSetup"))) {
2429 // From AMD64 ABI document:
2430 // For calls that may call functions that use varargs or stdargs
2431 // (prototype-less calls or calls to functions containing ellipsis (...) in
2432 // the declaration) %al is used as hidden argument to specify the number
2433 // of SSE registers used. The contents of %al do not need to match exactly
    // the number of registers, but must be an upper bound on the number of SSE
2435 // registers used and is in the range 0 - 8 inclusive.
2436
2437 // Count the number of XMM registers allocated.
2438 static const MCPhysReg XMMArgRegs[] = {
2439 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
2440 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
2441 };
2442 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(Regs: XMMArgRegs);
2443 assert((Subtarget.hasSSE1() || !NumXMMRegs)
2444 && "SSE registers cannot be used when SSE is disabled");
2445 RegsToPass.push_back(Elt: std::make_pair(x: Register(X86::AL),
2446 y: DAG.getConstant(Val: NumXMMRegs, DL: dl,
2447 VT: MVT::i8)));
2448 }
2449
2450 if (isVarArg && IsMustTail) {
2451 const auto &Forwards = X86Info->getForwardedMustTailRegParms();
2452 for (const auto &F : Forwards) {
2453 SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg: F.VReg, VT: F.VT);
2454 RegsToPass.push_back(Elt: std::make_pair(x: F.PReg, y&: Val));
2455 }
2456 }
2457
2458 // For tail calls lower the arguments to the 'real' stack slots. Sibcalls
2459 // don't need this because the eligibility check rejects calls that require
2460 // shuffling arguments passed in memory.
2461 if (isTailCall && !IsSibcall) {
2462 // Force all the incoming stack arguments to be loaded from the stack
2463 // before any new outgoing arguments or the return address are stored to the
2464 // stack, because the outgoing stack slots may alias the incoming argument
2465 // stack slots, and the alias isn't otherwise explicit. This is slightly
2466 // more conservative than necessary, because it means that each store
2467 // effectively depends on every argument instead of just those arguments it
2468 // would clobber.
2469 Chain = DAG.getStackArgumentTokenFactor(Chain);
2470
2471 if (ByValTempChain)
2472 Chain =
2473 DAG.getNode(Opcode: ISD::TokenFactor, DL: dl, VT: MVT::Other, N1: Chain, N2: ByValTempChain);
2474
2475 SmallVector<SDValue, 8> MemOpChains2;
2476 SDValue FIN;
2477 int FI = 0;
2478 for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E;
2479 ++I, ++OutsIndex) {
2480 CCValAssign &VA = ArgLocs[I];
2481
2482 if (VA.isRegLoc()) {
2483 if (VA.needsCustom()) {
2484 assert((CallConv == CallingConv::X86_RegCall) &&
2485 "Expecting custom case only in regcall calling convention");
2486 // This means that we are in special case where one argument was
2487 // passed through two register locations - Skip the next location
2488 ++I;
2489 }
2490
2491 continue;
2492 }
2493
2494 assert(VA.isMemLoc());
2495 SDValue Arg = OutVals[OutsIndex];
2496 ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags;
2497 // Skip inalloca/preallocated arguments. They don't require any work.
2498 if (Flags.isInAlloca() || Flags.isPreallocated())
2499 continue;
2500 // Create frame index.
2501 int32_t Offset = VA.getLocMemOffset()+FPDiff;
2502 uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
2503 FI = MF.getFrameInfo().CreateFixedObject(Size: OpSize, SPOffset: Offset, IsImmutable: true);
2504 FIN = DAG.getFrameIndex(FI, VT: getPointerTy(DL: DAG.getDataLayout()));
2505
2506 if (Flags.isByVal()) {
2507 if (SDValue ByValSrc = ByValTemporaries[OutsIndex]) {
2508 auto PtrVT = getPointerTy(DL: DAG.getDataLayout());
2509 SDValue DstAddr = DAG.getFrameIndex(FI, VT: PtrVT);
2510
2511 MemOpChains2.push_back(Elt: CreateCopyOfByValArgument(
2512 Src: ByValSrc, Dst: DstAddr, Chain, Flags, DAG, dl));
2513 }
2514 } else {
2515 // Store relative to framepointer.
2516 MemOpChains2.push_back(Elt: DAG.getStore(
2517 Chain, dl, Val: Arg, Ptr: FIN,
2518 PtrInfo: MachinePointerInfo::getFixedStack(MF&: DAG.getMachineFunction(), FI)));
2519 }
2520 }
2521
2522 if (!MemOpChains2.empty())
2523 Chain = DAG.getNode(Opcode: ISD::TokenFactor, DL: dl, VT: MVT::Other, Ops: MemOpChains2);
2524
2525 // Store the return address to the appropriate stack slot.
2526 Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
2527 PtrVT: getPointerTy(DL: DAG.getDataLayout()),
2528 SlotSize: RegInfo->getSlotSize(), FPDiff, dl);
2529 }
2530
2531 // Build a sequence of copy-to-reg nodes chained together with token chain
2532 // and glue operands which copy the outgoing args into registers.
2533 SDValue InGlue;
2534 for (const auto &[Reg, N] : RegsToPass) {
2535 Chain = DAG.getCopyToReg(Chain, dl, Reg, N, Glue: InGlue);
2536 InGlue = Chain.getValue(R: 1);
2537 }
2538
2539 bool IsImpCall = false;
2540 bool IsCFGuardCall = false;
2541 if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
2542 assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
2543 // In the 64-bit large code model, we have to make all calls
2544 // through a register, since the call instruction's 32-bit
2545 // pc-relative offset may not be large enough to hold the whole
2546 // address.
2547 } else if (Callee->getOpcode() == ISD::GlobalAddress ||
2548 Callee->getOpcode() == ISD::ExternalSymbol) {
2549 // Lower direct calls to global addresses and external symbols. Setting
2550 // ForCall to true here has the effect of removing WrapperRIP when possible
2551 // to allow direct calls to be selected without first materializing the
2552 // address into a register.
2553 Callee = LowerGlobalOrExternal(Op: Callee, DAG, /*ForCall=*/true, IsImpCall: &IsImpCall);
2554 } else if (Subtarget.isTarget64BitILP32() &&
2555 Callee.getValueType() == MVT::i32) {
2556 // Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI
2557 Callee = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: dl, VT: MVT::i64, Operand: Callee);
2558 } else if (Is64Bit && CB && isCFGuardCall(CB)) {
    // We'll use a specific pseudo instruction for tail calls to control flow
2560 // guard functions to guarantee the instruction used for the call. To do
2561 // this we need to unwrap the load now and use the CFG Func GV as the
2562 // callee.
2563 IsCFGuardCall = true;
2564 auto *LoadNode = cast<LoadSDNode>(Val&: Callee);
2565 GlobalAddressSDNode *GA =
2566 cast<GlobalAddressSDNode>(Val: unwrapAddress(N: LoadNode->getBasePtr()));
2567 assert(isCFGuardFunction(GA->getGlobal()) &&
2568 "CFG Call should be to a guard function");
2569 assert(LoadNode->getOffset()->isUndef() &&
2570 "CFG Function load should not have an offset");
2571 Callee = DAG.getTargetGlobalAddress(
2572 GV: GA->getGlobal(), DL: dl, VT: GA->getValueType(ResNo: 0), offset: 0, TargetFlags: X86II::MO_NO_FLAG);
2573 }
2574
2575 SmallVector<SDValue, 8> Ops;
2576
2577 if (!IsSibcall && isTailCall && !IsMustTail) {
2578 Chain = DAG.getCALLSEQ_END(Chain, Size1: NumBytesToPop, Size2: 0, Glue: InGlue, DL: dl);
2579 InGlue = Chain.getValue(R: 1);
2580 }
2581
2582 Ops.push_back(Elt: Chain);
2583 Ops.push_back(Elt: Callee);
2584
2585 if (isTailCall)
2586 Ops.push_back(Elt: DAG.getSignedTargetConstant(Val: FPDiff, DL: dl, VT: MVT::i32));
2587
2588 // Add argument registers to the end of the list so that they are known live
2589 // into the call.
2590 for (const auto &[Reg, N] : RegsToPass)
2591 Ops.push_back(Elt: DAG.getRegister(Reg, VT: N.getValueType()));
2592
2593 // Add a register mask operand representing the call-preserved registers.
2594 const uint32_t *Mask = [&]() {
2595 auto AdaptedCC = CallConv;
2596 // If HasNCSR is asserted (attribute NoCallerSavedRegisters exists),
2597 // use X86_INTR calling convention because it has the same CSR mask
2598 // (same preserved registers).
2599 if (HasNCSR)
2600 AdaptedCC = (CallingConv::ID)CallingConv::X86_INTR;
    // If NoCalleeSavedRegisters is requested, then use GHC since it happens
2602 // to use the CSR_NoRegs_RegMask.
2603 if (CB && CB->hasFnAttr(Kind: "no_callee_saved_registers"))
2604 AdaptedCC = (CallingConv::ID)CallingConv::GHC;
2605 return RegInfo->getCallPreservedMask(MF, AdaptedCC);
2606 }();
2607 assert(Mask && "Missing call preserved mask for calling convention");
2608
2609 if (MachineOperand::clobbersPhysReg(RegMask: Mask, PhysReg: RegInfo->getFramePtr())) {
2610 X86Info->setFPClobberedByCall(true);
2611 if (CLI.CB && isa<InvokeInst>(Val: CLI.CB))
2612 X86Info->setFPClobberedByInvoke(true);
2613 }
2614 if (MachineOperand::clobbersPhysReg(RegMask: Mask, PhysReg: RegInfo->getBaseRegister())) {
2615 X86Info->setBPClobberedByCall(true);
2616 if (CLI.CB && isa<InvokeInst>(Val: CLI.CB))
2617 X86Info->setBPClobberedByInvoke(true);
2618 }
2619
2620 // If this is an invoke in a 32-bit function using a funclet-based
2621 // personality, assume the function clobbers all registers. If an exception
2622 // is thrown, the runtime will not restore CSRs.
2623 // FIXME: Model this more precisely so that we can register allocate across
2624 // the normal edge and spill and fill across the exceptional edge.
2625 if (!Is64Bit && CLI.CB && isa<InvokeInst>(Val: CLI.CB)) {
2626 const Function &CallerFn = MF.getFunction();
2627 EHPersonality Pers =
2628 CallerFn.hasPersonalityFn()
2629 ? classifyEHPersonality(Pers: CallerFn.getPersonalityFn())
2630 : EHPersonality::Unknown;
2631 if (isFuncletEHPersonality(Pers))
2632 Mask = RegInfo->getNoPreservedMask();
2633 }
2634
2635 // Define a new register mask from the existing mask.
2636 uint32_t *RegMask = nullptr;
2637
2638 // In some calling conventions we need to remove the used physical registers
2639 // from the reg mask. Create a new RegMask for such calling conventions.
2640 // RegMask for calling conventions that disable only return registers (e.g.
2641 // preserve_most) will be modified later in LowerCallResult.
2642 bool ShouldDisableArgRegs = shouldDisableArgRegFromCSR(CC: CallConv) || HasNCSR;
2643 if (ShouldDisableArgRegs || shouldDisableRetRegFromCSR(CC: CallConv)) {
2644 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
2645
2646 // Allocate a new Reg Mask and copy Mask.
2647 RegMask = MF.allocateRegMask();
2648 unsigned RegMaskSize = MachineOperand::getRegMaskSize(NumRegs: TRI->getNumRegs());
2649 memcpy(dest: RegMask, src: Mask, n: sizeof(RegMask[0]) * RegMaskSize);
2650
2651 // Make sure all sub registers of the argument registers are reset
2652 // in the RegMask.
2653 if (ShouldDisableArgRegs) {
2654 for (auto const &RegPair : RegsToPass)
2655 for (MCPhysReg SubReg : TRI->subregs_inclusive(Reg: RegPair.first))
2656 RegMask[SubReg / 32] &= ~(1u << (SubReg % 32));
2657 }
2658
2659 // Create the RegMask Operand according to our updated mask.
2660 Ops.push_back(Elt: DAG.getRegisterMask(RegMask));
2661 } else {
2662 // Create the RegMask Operand according to the static mask.
2663 Ops.push_back(Elt: DAG.getRegisterMask(RegMask: Mask));
2664 }
2665
2666 if (InGlue.getNode())
2667 Ops.push_back(Elt: InGlue);
2668
2669 if (isTailCall) {
2670 // We used to do:
2671 //// If this is the first return lowered for this function, add the regs
2672 //// to the liveout set for the function.
2673 // This isn't right, although it's probably harmless on x86; liveouts
2674 // should be computed from returns not tail calls. Consider a void
2675 // function making a tail call to a function returning int.
2676 MF.getFrameInfo().setHasTailCall();
2677 auto Opcode =
2678 IsCFGuardCall ? X86ISD::TC_RETURN_GLOBALADDR : X86ISD::TC_RETURN;
2679 SDValue Ret = DAG.getNode(Opcode, DL: dl, VT: MVT::Other, Ops);
2680
2681 if (IsCFICall)
2682 Ret.getNode()->setCFIType(CLI.CFIType->getZExtValue());
2683
2684 DAG.addNoMergeSiteInfo(Node: Ret.getNode(), NoMerge: CLI.NoMerge);
2685 DAG.addCallSiteInfo(Node: Ret.getNode(), CallInfo: std::move(CSInfo));
2686 return Ret;
2687 }
2688
2689 // Returns a chain & a glue for retval copy to use.
2690 SDVTList NodeTys = DAG.getVTList(VT1: MVT::Other, VT2: MVT::Glue);
2691 if (IsImpCall) {
2692 Chain = DAG.getNode(Opcode: X86ISD::IMP_CALL, DL: dl, VTList: NodeTys, Ops);
2693 } else if (IsNoTrackIndirectCall) {
2694 Chain = DAG.getNode(Opcode: X86ISD::NT_CALL, DL: dl, VTList: NodeTys, Ops);
2695 } else if (IsCFGuardCall) {
2696 Chain = DAG.getNode(Opcode: X86ISD::CALL_GLOBALADDR, DL: dl, VTList: NodeTys, Ops);
2697 } else if (CLI.CB && objcarc::hasAttachedCallOpBundle(CB: CLI.CB)) {
2698 // Calls with a "clang.arc.attachedcall" bundle are special. They should be
2699 // expanded to the call, directly followed by a special marker sequence and
2700 // a call to a ObjC library function. Use the CALL_RVMARKER to do that.
2701 assert(!isTailCall &&
2702 "tail calls cannot be marked with clang.arc.attachedcall");
2703 assert(Is64Bit && "clang.arc.attachedcall is only supported in 64bit mode");
2704
2705 // Add a target global address for the retainRV/claimRV runtime function
2706 // just before the call target.
2707 Function *ARCFn = *objcarc::getAttachedARCFunction(CB: CLI.CB);
2708 auto PtrVT = getPointerTy(DL: DAG.getDataLayout());
2709 auto GA = DAG.getTargetGlobalAddress(GV: ARCFn, DL: dl, VT: PtrVT);
2710 Ops.insert(I: Ops.begin() + 1, Elt: GA);
2711 Chain = DAG.getNode(Opcode: X86ISD::CALL_RVMARKER, DL: dl, VTList: NodeTys, Ops);
2712 } else {
2713 Chain = DAG.getNode(Opcode: X86ISD::CALL, DL: dl, VTList: NodeTys, Ops);
2714 }
2715
2716 if (IsCFICall)
2717 Chain.getNode()->setCFIType(CLI.CFIType->getZExtValue());
2718
2719 InGlue = Chain.getValue(R: 1);
2720 DAG.addNoMergeSiteInfo(Node: Chain.getNode(), NoMerge: CLI.NoMerge);
2721 DAG.addCallSiteInfo(Node: Chain.getNode(), CallInfo: std::move(CSInfo));
2722
2723 // Save heapallocsite metadata.
2724 if (CLI.CB)
2725 if (MDNode *HeapAlloc = CLI.CB->getMetadata(Kind: "heapallocsite"))
2726 DAG.addHeapAllocSite(Node: Chain.getNode(), MD: HeapAlloc);
2727
2728 // Create the CALLSEQ_END node.
2729 unsigned NumBytesForCalleeToPop = 0; // Callee pops nothing.
2730 if (X86::isCalleePop(CallingConv: CallConv, is64Bit: Is64Bit, IsVarArg: isVarArg,
2731 GuaranteeTCO: DAG.getTarget().Options.GuaranteedTailCallOpt)) {
2732 NumBytesForCalleeToPop = NumBytes; // Callee pops everything
2733 } else if (hasCalleePopSRet(Args: Outs, ArgLocs, Subtarget)) {
2734 // If this call passes a struct-return pointer, the callee
2735 // pops that struct pointer.
2736 NumBytesForCalleeToPop = 4;
2737 }
2738
2739 // Returns a glue for retval copy to use.
2740 if (!IsSibcall) {
2741 Chain = DAG.getCALLSEQ_END(Chain, Size1: NumBytesToPop, Size2: NumBytesForCalleeToPop,
2742 Glue: InGlue, DL: dl);
2743 InGlue = Chain.getValue(R: 1);
2744 }
2745
2746 if (CallingConv::PreserveNone == CallConv)
2747 for (const ISD::OutputArg &Out : Outs) {
2748 if (Out.Flags.isSwiftSelf() || Out.Flags.isSwiftAsync() ||
2749 Out.Flags.isSwiftError()) {
2750 errorUnsupported(DAG, dl,
2751 Msg: "Swift attributes can't be used with preserve_none");
2752 break;
2753 }
2754 }
2755
2756 // Handle result values, copying them out of physregs into vregs that we
2757 // return.
2758 return LowerCallResult(Chain, InGlue, CallConv, isVarArg, Ins, dl, DAG,
2759 InVals, RegMask);
2760}
2761
2762//===----------------------------------------------------------------------===//
2763// Fast Calling Convention (tail call) implementation
2764//===----------------------------------------------------------------------===//
2765
// Like stdcall, the callee cleans up the arguments, except that ECX is
// reserved for storing the address of the tail-called function. Only 2
// registers are free for argument passing (inreg). Tail call optimization is
// performed provided:
2770// * tailcallopt is enabled
2771// * caller/callee are fastcc
2772// On X86_64 architecture with GOT-style position independent code only local
2773// (within module) calls are supported at the moment.
// To keep the stack aligned according to the platform ABI the function
2775// GetAlignedArgumentStackSize ensures that argument delta is always multiples
2776// of stack alignment. (Dynamic linkers need this - Darwin's dyld for example)
2777// If a tail called function callee has more arguments than the caller the
2778// caller needs to make sure that there is room to move the RETADDR to. This is
2779// achieved by reserving an area the size of the argument delta right after the
2780// original RETADDR, but before the saved framepointer or the spilled registers
2781// e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
2782// stack layout:
2783// arg1
2784// arg2
2785// RETADDR
2786// [ new RETADDR
2787// move area ]
2788// (possible EBP)
2789// ESI
2790// EDI
2791// local1 ..
2792
2793/// Make the stack size align e.g 16n + 12 aligned for a 16-byte align
2794/// requirement.
2795unsigned
2796X86TargetLowering::GetAlignedArgumentStackSize(const unsigned StackSize,
2797 SelectionDAG &DAG) const {
2798 const Align StackAlignment = Subtarget.getFrameLowering()->getStackAlign();
2799 const uint64_t SlotSize = Subtarget.getRegisterInfo()->getSlotSize();
2800 assert(StackSize % SlotSize == 0 &&
2801 "StackSize must be a multiple of SlotSize");
2802 return alignTo(Size: StackSize + SlotSize, A: StackAlignment) - SlotSize;
2803}
2804
/// Return true if the given stack call argument is already available in the
/// same position (relatively) of the caller's incoming argument stack.
static
bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
                         MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
                         const X86InstrInfo *TII, const CCValAssign &VA) {
  // Size of the argument value in bytes; for byval arguments this is replaced
  // below by the byval size, which is what actually occupies stack space.
  unsigned Bytes = Arg.getValueSizeInBits() / 8;

  for (;;) {
    // Look through nodes that don't alter the bits of the incoming value.
    unsigned Op = Arg.getOpcode();
    if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST ||
        Op == ISD::AssertZext) {
      Arg = Arg.getOperand(i: 0);
      continue;
    }
    if (Op == ISD::TRUNCATE) {
      // A TRUNCATE is only transparent when it undoes an AssertZext of the
      // same narrow type, i.e. the truncated-away bits are known to be zero.
      const SDValue &TruncInput = Arg.getOperand(i: 0);
      if (TruncInput.getOpcode() == ISD::AssertZext &&
          cast<VTSDNode>(Val: TruncInput.getOperand(i: 1))->getVT() ==
              Arg.getValueType()) {
        Arg = TruncInput.getOperand(i: 0);
        continue;
      }
    }
    break;
  }

  // Trace the argument back to a frame index; bail out if it does not
  // originate from a stack object we can identify.
  int FI = INT_MAX;
  if (Arg.getOpcode() == ISD::CopyFromReg) {
    Register VR = cast<RegisterSDNode>(Val: Arg.getOperand(i: 1))->getReg();
    if (!VR.isVirtual())
      return false;
    MachineInstr *Def = MRI->getVRegDef(Reg: VR);
    if (!Def)
      return false;
    if (!Flags.isByVal()) {
      // Non-byval: the vreg must be defined by a reload from a stack slot.
      if (!TII->isLoadFromStackSlot(MI: *Def, FrameIndex&: FI))
        return false;
    } else {
      // Byval: the vreg must be the address of a stack object, materialized
      // by an LEA of a frame index.
      unsigned Opcode = Def->getOpcode();
      if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
           Opcode == X86::LEA64_32r) &&
          Def->getOperand(i: 1).isFI()) {
        FI = Def->getOperand(i: 1).getIndex();
        Bytes = Flags.getByValSize();
      } else
        return false;
    }
  } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Val&: Arg)) {
    if (Flags.isByVal())
      // ByVal argument is passed in as a pointer but it's now being
      // dereferenced. e.g.
      // define @foo(%struct.X* %A) {
      //   tail call @bar(%struct.X* byval %A)
      // }
      return false;
    SDValue Ptr = Ld->getBasePtr();
    FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Val&: Ptr);
    if (!FINode)
      return false;
    FI = FINode->getIndex();
  } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
    // Byval argument passed directly as a frame-index address.
    FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Val&: Arg);
    FI = FINode->getIndex();
    Bytes = Flags.getByValSize();
  } else
    return false;

  assert(FI != INT_MAX);
  // Only fixed objects live in the caller's incoming argument area.
  if (!MFI.isFixedObjectIndex(ObjectIdx: FI))
    return false;

  // The incoming object must sit at exactly the offset the outgoing argument
  // would be stored to.
  if (Offset != MFI.getObjectOffset(ObjectIdx: FI))
    return false;

  // If this is not byval, check that the argument stack object is immutable.
  // inalloca and argument copy elision can create mutable argument stack
  // objects. Byval objects can be mutated, but a byval call intends to pass the
  // mutated memory.
  if (!Flags.isByVal() && !MFI.isImmutableObjectIndex(ObjectIdx: FI))
    return false;

  if (VA.getLocVT().getFixedSizeInBits() >
      Arg.getValueSizeInBits().getFixedValue()) {
    // If the argument location is wider than the argument type, check that any
    // extension flags match.
    if (Flags.isZExt() != MFI.isObjectZExt(ObjectIdx: FI) ||
        Flags.isSExt() != MFI.isObjectSExt(ObjectIdx: FI)) {
      return false;
    }
  }

  // Finally, the argument size must match the stack object's size exactly.
  return Bytes == MFI.getObjectSize(ObjectIdx: FI);
}
2900
2901static bool
2902mayBeSRetTailCallCompatible(const TargetLowering::CallLoweringInfo &CLI,
2903 Register CallerSRetReg) {
2904 const auto &Outs = CLI.Outs;
2905 const auto &OutVals = CLI.OutVals;
2906
2907 // We know the caller has a sret pointer argument (CallerSRetReg). Locate the
2908 // operand index within the callee that may have a sret pointer too.
2909 unsigned Pos = 0;
2910 for (unsigned E = Outs.size(); Pos != E; ++Pos)
2911 if (Outs[Pos].Flags.isSRet())
2912 break;
2913 // Bail out if the callee has not any sret argument.
2914 if (Pos == Outs.size())
2915 return false;
2916
2917 // At this point, either the caller is forwarding its sret argument to the
2918 // callee, or the callee is being passed a different sret pointer. We now look
2919 // for a CopyToReg, where the callee sret argument is written into a new vreg
2920 // (which should later be %rax/%eax, if this is returned).
2921 SDValue SRetArgVal = OutVals[Pos];
2922 for (SDNode *User : SRetArgVal->users()) {
2923 if (User->getOpcode() != ISD::CopyToReg)
2924 continue;
2925 Register Reg = cast<RegisterSDNode>(Val: User->getOperand(Num: 1))->getReg();
2926 if (Reg == CallerSRetReg && User->getOperand(Num: 2) == SRetArgVal)
2927 return true;
2928 }
2929
2930 return false;
2931}
2932
2933/// Check whether the call is eligible for sibling call optimization. Sibling
2934/// calls are loosely defined to be simple, profitable tail calls that only
/// require adjusting register parameters. We do not speculatively optimize
2936/// complex calls that require lots of argument memory operations that may
2937/// alias.
2938///
2939/// Note that LLVM supports multiple ways, such as musttail, to force tail call
2940/// emission. Returning false from this function will not prevent tail call
2941/// emission in all cases.
2942bool X86TargetLowering::isEligibleForSiblingCallOpt(
2943 TargetLowering::CallLoweringInfo &CLI, CCState &CCInfo,
2944 SmallVectorImpl<CCValAssign> &ArgLocs) const {
2945 SelectionDAG &DAG = CLI.DAG;
2946 const SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
2947 const SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
2948 const SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
2949 SDValue Callee = CLI.Callee;
2950 CallingConv::ID CalleeCC = CLI.CallConv;
2951 bool isVarArg = CLI.IsVarArg;
2952
2953 if (!mayTailCallThisCC(CC: CalleeCC))
2954 return false;
2955
2956 // If -tailcallopt is specified, make fastcc functions tail-callable.
2957 MachineFunction &MF = DAG.getMachineFunction();
2958 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2959 const Function &CallerF = MF.getFunction();
2960
2961 // If the function return type is x86_fp80 and the callee return type is not,
2962 // then the FP_EXTEND of the call result is not a nop. It's not safe to
2963 // perform a tailcall optimization here.
2964 if (CallerF.getReturnType()->isX86_FP80Ty() && !CLI.RetTy->isX86_FP80Ty())
2965 return false;
2966
2967 // Win64 functions have extra shadow space for argument homing. Don't do the
2968 // sibcall if the caller and callee have mismatched expectations for this
2969 // space.
2970 CallingConv::ID CallerCC = CallerF.getCallingConv();
2971 bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CC: CalleeCC);
2972 bool IsCallerWin64 = Subtarget.isCallingConvWin64(CC: CallerCC);
2973 if (IsCalleeWin64 != IsCallerWin64)
2974 return false;
2975
2976 // If we are using a GOT, don't generate sibling calls to non-local,
2977 // default-visibility symbols. Tail calling such a symbol requires using a GOT
2978 // relocation, which forces early binding of the symbol. This breaks code that
2979 // require lazy function symbol resolution. Using musttail or
2980 // GuaranteedTailCallOpt will override this.
2981 if (Subtarget.isPICStyleGOT()) {
2982 if (isa<ExternalSymbolSDNode>(Val: Callee))
2983 return false;
2984 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Val&: Callee)) {
2985 if (!G->getGlobal()->hasLocalLinkage() &&
2986 G->getGlobal()->hasDefaultVisibility())
2987 return false;
2988 }
2989 }
2990
2991 // Look for obvious safe cases to perform tail call optimization that do not
2992 // require ABI changes. This is what gcc calls sibcall.
2993
2994 // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
2995 // emit a special epilogue.
2996 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
2997 if (RegInfo->hasStackRealignment(MF))
2998 return false;
2999
3000 // Avoid sibcall optimization if we are an sret return function and the callee
3001 // is incompatible, unless such premises are proven wrong. See comment in
3002 // LowerReturn about why hasStructRetAttr is insufficient.
3003 if (Register SRetReg = FuncInfo->getSRetReturnReg()) {
3004 // For a compatible tail call the callee must return our sret pointer. So it
3005 // needs to be (a) an sret function itself and (b) we pass our sret as its
3006 // sret. Condition #b is harder to determine.
3007 if (!mayBeSRetTailCallCompatible(CLI, CallerSRetReg: SRetReg))
3008 return false;
3009 } else if (hasCalleePopSRet(Args: Outs, ArgLocs, Subtarget))
3010 // The callee pops an sret, so we cannot tail-call, as our caller doesn't
3011 // expect that.
3012 return false;
3013
3014 // Do not sibcall optimize vararg calls unless all arguments are passed via
3015 // registers.
3016 LLVMContext &C = *DAG.getContext();
3017 if (isVarArg && !Outs.empty()) {
3018 // Optimizing for varargs on Win64 is unlikely to be safe without
3019 // additional testing.
3020 if (IsCalleeWin64 || IsCallerWin64)
3021 return false;
3022
3023 for (const auto &VA : ArgLocs)
3024 if (!VA.isRegLoc())
3025 return false;
3026 }
3027
3028 // If the call result is in ST0 / ST1, it needs to be popped off the x87
3029 // stack. Therefore, if it's not used by the call it is not safe to optimize
3030 // this into a sibcall.
3031 bool Unused = false;
3032 for (const auto &In : Ins) {
3033 if (!In.Used) {
3034 Unused = true;
3035 break;
3036 }
3037 }
3038 if (Unused) {
3039 SmallVector<CCValAssign, 16> RVLocs;
3040 CCState RVCCInfo(CalleeCC, false, MF, RVLocs, C);
3041 RVCCInfo.AnalyzeCallResult(Ins, Fn: RetCC_X86);
3042 for (const auto &VA : RVLocs) {
3043 if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
3044 return false;
3045 }
3046 }
3047
3048 // Check that the call results are passed in the same way.
3049 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
3050 CalleeFn: RetCC_X86, CallerFn: RetCC_X86))
3051 return false;
3052 // The callee has to preserve all registers the caller needs to preserve.
3053 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
3054 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
3055 if (CallerCC != CalleeCC) {
3056 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
3057 if (!TRI->regmaskSubsetEqual(mask0: CallerPreserved, mask1: CalleePreserved))
3058 return false;
3059 }
3060
3061 // The stack frame of the caller cannot be replaced by the tail-callee one's
3062 // if the function is required to preserve all the registers. Conservatively
3063 // prevent tail optimization even if hypothetically all the registers are used
3064 // for passing formal parameters or returning values.
3065 if (CallerF.hasFnAttribute(Kind: "no_caller_saved_registers"))
3066 return false;
3067
3068 unsigned StackArgsSize = CCInfo.getStackSize();
3069
3070 // If the callee takes no arguments then go on to check the results of the
3071 // call.
3072 if (!Outs.empty()) {
3073 if (StackArgsSize > 0) {
3074 // Check if the arguments are already laid out in the right way as
3075 // the caller's fixed stack objects.
3076 MachineFrameInfo &MFI = MF.getFrameInfo();
3077 const MachineRegisterInfo *MRI = &MF.getRegInfo();
3078 const X86InstrInfo *TII = Subtarget.getInstrInfo();
3079 for (unsigned I = 0, E = ArgLocs.size(); I != E; ++I) {
3080 const CCValAssign &VA = ArgLocs[I];
3081 SDValue Arg = OutVals[I];
3082 ISD::ArgFlagsTy Flags = Outs[I].Flags;
3083 if (VA.getLocInfo() == CCValAssign::Indirect)
3084 return false;
3085 if (!VA.isRegLoc()) {
3086 if (!MatchingStackOffset(Arg, Offset: VA.getLocMemOffset(), Flags, MFI, MRI,
3087 TII, VA))
3088 return false;
3089 }
3090 }
3091 }
3092
3093 bool PositionIndependent = isPositionIndependent();
3094 // If the tailcall address may be in a register, then make sure it's
3095 // possible to register allocate for it. In 32-bit, the call address can
3096 // only target EAX, EDX, or ECX since the tail call must be scheduled after
3097 // callee-saved registers are restored. These happen to be the same
3098 // registers used to pass 'inreg' arguments so watch out for those.
3099 if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Val: Callee) &&
3100 !isa<ExternalSymbolSDNode>(Val: Callee)) ||
3101 PositionIndependent)) {
3102 unsigned NumInRegs = 0;
3103 // In PIC we need an extra register to formulate the address computation
3104 // for the callee.
3105 unsigned MaxInRegs = PositionIndependent ? 2 : 3;
3106
3107 for (const auto &VA : ArgLocs) {
3108 if (!VA.isRegLoc())
3109 continue;
3110 Register Reg = VA.getLocReg();
3111 switch (Reg) {
3112 default: break;
3113 case X86::EAX: case X86::EDX: case X86::ECX:
3114 if (++NumInRegs == MaxInRegs)
3115 return false;
3116 break;
3117 }
3118 }
3119 }
3120
3121 const MachineRegisterInfo &MRI = MF.getRegInfo();
3122 if (!parametersInCSRMatch(MRI, CallerPreservedMask: CallerPreserved, ArgLocs, OutVals))
3123 return false;
3124 }
3125
3126 bool CalleeWillPop =
3127 X86::isCalleePop(CallingConv: CalleeCC, is64Bit: Subtarget.is64Bit(), IsVarArg: isVarArg,
3128 GuaranteeTCO: MF.getTarget().Options.GuaranteedTailCallOpt);
3129
3130 if (unsigned BytesToPop = FuncInfo->getBytesToPopOnReturn()) {
3131 // If we have bytes to pop, the callee must pop them.
3132 bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize;
3133 if (!CalleePopMatches)
3134 return false;
3135 } else if (CalleeWillPop && StackArgsSize > 0) {
3136 // If we don't have bytes to pop, make sure the callee doesn't pop any.
3137 return false;
3138 }
3139
3140 return true;
3141}
3142
3143/// Determines whether the callee is required to pop its own arguments.
3144/// Callee pop is necessary to support tail calls.
3145bool X86::isCalleePop(CallingConv::ID CallingConv,
3146 bool is64Bit, bool IsVarArg, bool GuaranteeTCO) {
3147 // If GuaranteeTCO is true, we force some calls to be callee pop so that we
3148 // can guarantee TCO.
3149 if (!IsVarArg && shouldGuaranteeTCO(CC: CallingConv, GuaranteedTailCallOpt: GuaranteeTCO))
3150 return true;
3151
3152 switch (CallingConv) {
3153 default:
3154 return false;
3155 case CallingConv::X86_StdCall:
3156 case CallingConv::X86_FastCall:
3157 case CallingConv::X86_ThisCall:
3158 case CallingConv::X86_VectorCall:
3159 return !is64Bit;
3160 }
3161}
3162