//===- llvm/lib/Target/X86/X86ISelCallLowering.cpp - Call lowering --------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This file implements the lowering of LLVM calls to DAG nodes.
//
//===----------------------------------------------------------------------===//

#include "MCTargetDesc/X86MCAsmInfo.h"
#include "X86.h"
#include "X86CallingConv.h"
#include "X86FrameLowering.h"
#include "X86ISelLowering.h"
#include "X86InstrBuilder.h"
#include "X86MachineFunctionInfo.h"
#include "X86TargetMachine.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/ObjCARCUtil.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/WinEHFuncInfo.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Module.h"

#define DEBUG_TYPE "x86-isel"

using namespace llvm;

STATISTIC(NumTailCalls, "Number of tail calls");

/// Call this when the user attempts to do something unsupported, like
/// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike
/// report_fatal_error, so calling code should attempt to recover without
/// crashing.
static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl,
                             const char *Msg) {
  MachineFunction &MF = DAG.getMachineFunction();
  DAG.getContext()->diagnose(
      DiagnosticInfoUnsupported(MF.getFunction(), Msg, dl.getDebugLoc()));
}

/// Returns true if a CC can dynamically exclude a register from the list of
/// callee-saved registers (TargetRegisterInfo::getCalleeSavedRegs()) based on
/// the return registers.
static bool shouldDisableRetRegFromCSR(CallingConv::ID CC) {
  switch (CC) {
  default:
    return false;
  case CallingConv::X86_RegCall:
  case CallingConv::PreserveMost:
  case CallingConv::PreserveAll:
    return true;
  }
}

/// Returns true if a CC can dynamically exclude a register from the list of
/// callee-saved registers (TargetRegisterInfo::getCalleeSavedRegs()) based on
/// the parameters.
static bool shouldDisableArgRegFromCSR(CallingConv::ID CC) {
  return CC == CallingConv::X86_RegCall;
}

static std::pair<MVT, unsigned>
handleMaskRegisterForCallingConv(unsigned NumElts, CallingConv::ID CC,
                                 const X86Subtarget &Subtarget) {
  // v2i1/v4i1/v8i1/v16i1 all pass in xmm registers unless the calling
  // convention is one that uses k registers.
  if (NumElts == 2)
    return {MVT::v2i64, 1};
  if (NumElts == 4)
    return {MVT::v4i32, 1};
  if (NumElts == 8 && CC != CallingConv::X86_RegCall &&
      CC != CallingConv::Intel_OCL_BI)
    return {MVT::v8i16, 1};
  if (NumElts == 16 && CC != CallingConv::X86_RegCall &&
      CC != CallingConv::Intel_OCL_BI)
    return {MVT::v16i8, 1};
  // v32i1 passes in ymm unless we have BWI and the calling convention is
  // regcall.
  if (NumElts == 32 && (!Subtarget.hasBWI() || CC != CallingConv::X86_RegCall))
    return {MVT::v32i8, 1};
  // Split v64i1 vectors if we don't have v64i8 available.
  if (NumElts == 64 && Subtarget.hasBWI() && CC != CallingConv::X86_RegCall) {
    if (Subtarget.useAVX512Regs())
      return {MVT::v64i8, 1};
    return {MVT::v32i8, 2};
  }

  // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
  if (!isPowerOf2_32(NumElts) || (NumElts == 64 && !Subtarget.hasBWI()) ||
      NumElts > 64)
    return {MVT::i8, NumElts};

  return {MVT::INVALID_SIMPLE_VALUE_TYPE, 0};
}
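
// For illustration, a few of the mappings this helper produces under the C
// calling convention with AVX512 enabled (a sketch, not an exhaustive table):
//   v16i1                       -> {v16i8, 1}   one XMM register
//   v64i1, BWI + 512-bit regs   -> {v64i8, 1}   one ZMM register
//   v64i1, BWI + 256-bit regs   -> {v32i8, 2}   two YMM registers
//   v3i1  (non-power-of-2)      -> {i8, 3}      scalarized, matching AVX2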

MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
                                                     CallingConv::ID CC,
                                                     EVT VT) const {
  if (VT.isVector()) {
    if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512()) {
      unsigned NumElts = VT.getVectorNumElements();

      MVT RegisterVT;
      unsigned NumRegisters;
      std::tie(RegisterVT, NumRegisters) =
          handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
      if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
        return RegisterVT;
    }

    if (VT.getVectorElementType() == MVT::f16 && VT.getVectorNumElements() < 8)
      return MVT::v8f16;
  }

  // We will use more GPRs for f64 and f80 in 32-bit mode when x87 is disabled.
  if ((VT == MVT::f64 || VT == MVT::f80) && !Subtarget.is64Bit() &&
      !Subtarget.hasX87())
    return MVT::i32;

  if (isTypeLegal(MVT::f16)) {
    if (VT.isVector() && VT.getVectorElementType() == MVT::bf16)
      return getRegisterTypeForCallingConv(
          Context, CC, VT.changeVectorElementType(MVT::f16));

    if (VT == MVT::bf16)
      return MVT::f16;
  }

  return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
}

unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
                                                          CallingConv::ID CC,
                                                          EVT VT) const {
  if (VT.isVector()) {
    if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512()) {
      unsigned NumElts = VT.getVectorNumElements();

      MVT RegisterVT;
      unsigned NumRegisters;
      std::tie(RegisterVT, NumRegisters) =
          handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
      if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
        return NumRegisters;
    }

    if (VT.getVectorElementType() == MVT::f16 && VT.getVectorNumElements() < 8)
      return 1;
  }

  // We have to split f64 into 2 registers and f80 into 3 registers in 32-bit
  // mode if x87 is disabled.
  if (!Subtarget.is64Bit() && !Subtarget.hasX87()) {
    if (VT == MVT::f64)
      return 2;
    if (VT == MVT::f80)
      return 3;
  }

  if (VT.isVector() && VT.getVectorElementType() == MVT::bf16 &&
      isTypeLegal(MVT::f16))
    return getNumRegistersForCallingConv(Context, CC,
                                         VT.changeVectorElementType(MVT::f16));

  return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
}
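
// For example (a sketch): on a 32-bit target without x87, these two hooks
// combine so that an f64 value is passed as two i32 registers and an f80 as
// three, i.e. getRegisterTypeForCallingConv returns MVT::i32 and
// getNumRegistersForCallingConv returns 2 or 3 respectively.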

unsigned X86TargetLowering::getVectorTypeBreakdownForCallingConv(
    LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
    unsigned &NumIntermediates, MVT &RegisterVT) const {
  // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
  if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
      Subtarget.hasAVX512() &&
      (!isPowerOf2_32(VT.getVectorNumElements()) ||
       (VT.getVectorNumElements() == 64 && !Subtarget.hasBWI()) ||
       VT.getVectorNumElements() > 64)) {
    RegisterVT = MVT::i8;
    IntermediateVT = MVT::i1;
    NumIntermediates = VT.getVectorNumElements();
    return NumIntermediates;
  }

  // Split v64i1 vectors if we don't have v64i8 available.
  if (VT == MVT::v64i1 && Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
      CC != CallingConv::X86_RegCall) {
    RegisterVT = MVT::v32i8;
    IntermediateVT = MVT::v32i1;
    NumIntermediates = 2;
    return 2;
  }

  // Split vNbf16 vectors according to vNf16.
  if (VT.isVector() && VT.getVectorElementType() == MVT::bf16 &&
      isTypeLegal(MVT::f16))
    VT = VT.changeVectorElementType(MVT::f16);

  return TargetLowering::getVectorTypeBreakdownForCallingConv(
      Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
}

EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
                                          LLVMContext &Context,
                                          EVT VT) const {
  if (!VT.isVector())
    return MVT::i8;

  if (Subtarget.hasAVX512()) {
    // Figure out what this type will be legalized to.
    EVT LegalVT = VT;
    while (getTypeAction(Context, LegalVT) != TypeLegal)
      LegalVT = getTypeToTransformTo(Context, LegalVT);

    // If we got a 512-bit vector then we'll definitely have a vXi1 compare.
    if (LegalVT.getSimpleVT().is512BitVector())
      return EVT::getVectorVT(Context, MVT::i1, VT.getVectorElementCount());

    if (LegalVT.getSimpleVT().isVector() && Subtarget.hasVLX()) {
      // If we legalized to less than a 512-bit vector, then we will use a vXi1
      // compare for vXi32/vXi64 for sure. If we have BWI we will also support
      // vXi16/vXi8.
      MVT EltVT = LegalVT.getSimpleVT().getVectorElementType();
      if (Subtarget.hasBWI() || EltVT.getSizeInBits() >= 32)
        return EVT::getVectorVT(Context, MVT::i1, VT.getVectorElementCount());
    }
  }

  return VT.changeVectorElementTypeToInteger();
}

bool X86TargetLowering::functionArgumentNeedsConsecutiveRegisters(
    Type *Ty, CallingConv::ID CallConv, bool isVarArg,
    const DataLayout &DL) const {
  // i128 split into i64 needs to be allocated to two consecutive registers,
  // or spilled to the stack as a whole.
  return Ty->isIntegerTy(128);
}
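
// For example (a sketch of the SysV x86-64 behavior this enables): an i128
// argument split into two i64 halves must land in an adjacent register pair
// such as RDI:RSI; if only one GPR is left, the whole value is spilled to the
// stack rather than being passed half in a register and half in memory.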

/// Helper for getByValTypeAlignment to determine
/// the desired ByVal argument alignment.
static void getMaxByValAlign(Type *Ty, Align &MaxAlign) {
  if (MaxAlign == 16)
    return;
  if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
    if (VTy->getPrimitiveSizeInBits().getFixedValue() == 128)
      MaxAlign = Align(16);
  } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
    Align EltAlign;
    getMaxByValAlign(ATy->getElementType(), EltAlign);
    if (EltAlign > MaxAlign)
      MaxAlign = EltAlign;
  } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
    for (auto *EltTy : STy->elements()) {
      Align EltAlign;
      getMaxByValAlign(EltTy, EltAlign);
      if (EltAlign > MaxAlign)
        MaxAlign = EltAlign;
      if (MaxAlign == 16)
        break;
    }
  }
}

/// Return the desired alignment for ByVal aggregate
/// function arguments in the caller parameter area. For X86, aggregates
/// that contain SSE vectors are placed at 16-byte boundaries while the rest
/// are at 4-byte boundaries.
Align X86TargetLowering::getByValTypeAlignment(Type *Ty,
                                               const DataLayout &DL) const {
  if (Subtarget.is64Bit())
    return std::max(DL.getABITypeAlign(Ty), Align::Constant<8>());

  Align Alignment(4);
  if (Subtarget.hasSSE1())
    getMaxByValAlign(Ty, Alignment);
  return Alignment;
}
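
// For example (a sketch): on a 32-bit SSE target, a byval argument of type
// { i32, <4 x float> } is placed at a 16-byte boundary because the recursive
// walk above finds a 128-bit vector member, while { i32, i32 } keeps the
// default 4-byte alignment.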

/// It returns EVT::Other if the type should be determined using generic
/// target-independent logic.
/// For vector ops we check that the overall size isn't larger than our
/// preferred vector width.
EVT X86TargetLowering::getOptimalMemOpType(
    const MemOp &Op, const AttributeList &FuncAttributes) const {
  if (!FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat)) {
    if (Op.size() >= 16 &&
        (!Subtarget.isUnalignedMem16Slow() || Op.isAligned(Align(16)))) {
      // FIXME: Check if unaligned 64-byte accesses are slow.
      if (Op.size() >= 64 && Subtarget.hasAVX512() && Subtarget.hasEVEX512() &&
          (Subtarget.getPreferVectorWidth() >= 512)) {
        return Subtarget.hasBWI() ? MVT::v64i8 : MVT::v16i32;
      }
      // FIXME: Check if unaligned 32-byte accesses are slow.
      if (Op.size() >= 32 && Subtarget.hasAVX() &&
          Subtarget.useLight256BitInstructions()) {
        // Although this isn't a well-supported type for AVX1, we'll let
        // legalization and shuffle lowering produce the optimal codegen. If we
        // choose an optimal type with a vector element larger than a byte,
        // getMemsetStores() may create an intermediate splat (using an integer
        // multiply) before we splat as a vector.
        return MVT::v32i8;
      }
      if (Subtarget.hasSSE2() && (Subtarget.getPreferVectorWidth() >= 128))
        return MVT::v16i8;
      // TODO: Can SSE1 handle a byte vector?
      // If we have SSE1 registers we should be able to use them.
      if (Subtarget.hasSSE1() && (Subtarget.is64Bit() || Subtarget.hasX87()) &&
          (Subtarget.getPreferVectorWidth() >= 128))
        return MVT::v4f32;
    } else if (((Op.isMemcpy() && !Op.isMemcpyStrSrc()) || Op.isZeroMemset()) &&
               Op.size() >= 8 && !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
      // Do not use f64 to lower memcpy if source is string constant. It's
      // better to use i32 to avoid the loads.
      // Also, do not use f64 to lower memset unless this is a memset of zeros.
      // The gymnastics of splatting a byte value into an XMM register and then
      // only using 8-byte stores (because this is a CPU with slow unaligned
      // 16-byte accesses) makes that a loser.
      return MVT::f64;
    }
  }
  // This is a compromise. If we reach here, unaligned accesses may be slow on
  // this target. However, creating smaller, aligned accesses could be even
  // slower and would certainly be a lot more code.
  if (Subtarget.is64Bit() && Op.size() >= 8)
    return MVT::i64;
  return MVT::i32;
}
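
// A few illustrative outcomes (a sketch, assuming aligned or fast-unaligned
// memory): a 64-byte memset on an AVX512BW target preferring 512-bit vectors
// uses v64i8 (a single ZMM store); the same operation with only SSE2 uses
// v16i8 chunks; and an 8-byte zero-memset on a 32-bit SSE2 target with slow
// unaligned 16-byte accesses falls back to f64 stores.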

bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
  if (VT == MVT::f32)
    return Subtarget.hasSSE1();
  if (VT == MVT::f64)
    return Subtarget.hasSSE2();
  return true;
}

static bool isBitAligned(Align Alignment, uint64_t SizeInBits) {
  return (8 * Alignment.value()) % SizeInBits == 0;
}
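
// For example: isBitAligned(Align(16), 128) is true (128 % 128 == 0), while
// isBitAligned(Align(4), 128) is false (32 % 128 != 0), so a 4-byte-aligned
// 128-bit access is treated as misaligned by the checks below.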

bool X86TargetLowering::isMemoryAccessFast(EVT VT, Align Alignment) const {
  if (isBitAligned(Alignment, VT.getSizeInBits()))
    return true;
  switch (VT.getSizeInBits()) {
  default:
    // 8-byte and under are always assumed to be fast.
    return true;
  case 128:
    return !Subtarget.isUnalignedMem16Slow();
  case 256:
    return !Subtarget.isUnalignedMem32Slow();
    // TODO: What about AVX-512 (512-bit) accesses?
  }
}

bool X86TargetLowering::allowsMisalignedMemoryAccesses(
    EVT VT, unsigned, Align Alignment, MachineMemOperand::Flags Flags,
    unsigned *Fast) const {
  if (Fast)
    *Fast = isMemoryAccessFast(VT, Alignment);
  // NonTemporal vector memory ops must be aligned.
  if (!!(Flags & MachineMemOperand::MONonTemporal) && VT.isVector()) {
    // NT loads can only be vector aligned, so if it's less aligned than the
    // minimum vector size (which we can split the vector down to), we might as
    // well use a regular unaligned vector load.
    // We don't have any NT loads pre-SSE41.
    if (!!(Flags & MachineMemOperand::MOLoad))
      return (Alignment < 16 || !Subtarget.hasSSE41());
    return false;
  }
  // Misaligned accesses of any size are always allowed.
  return true;
}

bool X86TargetLowering::allowsMemoryAccess(LLVMContext &Context,
                                           const DataLayout &DL, EVT VT,
                                           unsigned AddrSpace, Align Alignment,
                                           MachineMemOperand::Flags Flags,
                                           unsigned *Fast) const {
  if (Fast)
    *Fast = isMemoryAccessFast(VT, Alignment);
  if (!!(Flags & MachineMemOperand::MONonTemporal) && VT.isVector()) {
    if (allowsMisalignedMemoryAccesses(VT, AddrSpace, Alignment, Flags,
                                       /*Fast=*/nullptr))
      return true;
    // NonTemporal vector memory ops are special, and must be aligned.
    if (!isBitAligned(Alignment, VT.getSizeInBits()))
      return false;
    switch (VT.getSizeInBits()) {
    case 128:
      if (!!(Flags & MachineMemOperand::MOLoad) && Subtarget.hasSSE41())
        return true;
      if (!!(Flags & MachineMemOperand::MOStore) && Subtarget.hasSSE2())
        return true;
      return false;
    case 256:
      if (!!(Flags & MachineMemOperand::MOLoad) && Subtarget.hasAVX2())
        return true;
      if (!!(Flags & MachineMemOperand::MOStore) && Subtarget.hasAVX())
        return true;
      return false;
    case 512:
      if (Subtarget.hasAVX512() && Subtarget.hasEVEX512())
        return true;
      return false;
    default:
      return false; // Don't have NonTemporal vector memory ops of this size.
    }
  }
  return true;
}

/// Return the entry encoding for a jump table in the
/// current function. The returned value is a member of the
/// MachineJumpTableInfo::JTEntryKind enum.
unsigned X86TargetLowering::getJumpTableEncoding() const {
  // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
  // symbol.
  if (isPositionIndependent() && Subtarget.isPICStyleGOT())
    return MachineJumpTableInfo::EK_Custom32;
  if (isPositionIndependent() &&
      getTargetMachine().getCodeModel() == CodeModel::Large &&
      !Subtarget.isTargetCOFF())
    return MachineJumpTableInfo::EK_LabelDifference64;

  // Otherwise, use the normal jump table encoding heuristics.
  return TargetLowering::getJumpTableEncoding();
}
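
// For example (a sketch): with -fPIC on 32-bit ELF, where the GOT PIC style
// applies, each jump table entry is emitted as a 32-bit @GOTOFF reference
// (e.g. ".long .LBB0_2@GOTOFF") via the EK_Custom32 path and
// LowerCustomJumpTableEntry below.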

bool X86TargetLowering::useSoftFloat() const {
  return Subtarget.useSoftFloat();
}

void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC,
                                              ArgListTy &Args) const {

  // Only relabel X86-32 for C / Stdcall CCs.
  if (Subtarget.is64Bit())
    return;
  if (CC != CallingConv::C && CC != CallingConv::X86_StdCall)
    return;
  unsigned ParamRegs = 0;
  if (auto *M = MF->getFunction().getParent())
    ParamRegs = M->getNumberRegisterParameters();

  // Mark the first N integer arguments as being passed in registers.
  for (auto &Arg : Args) {
    Type *T = Arg.Ty;
    if (T->isIntOrPtrTy())
      if (MF->getDataLayout().getTypeAllocSize(T) <= 8) {
        unsigned numRegs = 1;
        if (MF->getDataLayout().getTypeAllocSize(T) > 4)
          numRegs = 2;
        if (ParamRegs < numRegs)
          return;
        ParamRegs -= numRegs;
        Arg.IsInReg = true;
      }
  }
}
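
// For example (a sketch): if a module is built with -mregparm=3 on i386, the
// leading integer libcall arguments are marked inreg until the register
// budget runs out; an i64 argument consumes two of the three GPRs, and
// marking stops at the first argument that no longer fits.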

const MCExpr *
X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
                                             const MachineBasicBlock *MBB,
                                             unsigned uid,
                                             MCContext &Ctx) const {
  assert(isPositionIndependent() && Subtarget.isPICStyleGOT());
  // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
  // entries.
  return MCSymbolRefExpr::create(MBB->getSymbol(), X86::S_GOTOFF, Ctx);
}

/// Returns relocation base for the given PIC jumptable.
SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
                                                    SelectionDAG &DAG) const {
  if (!Subtarget.is64Bit())
    // This doesn't have SDLoc associated with it, but is not really the
    // same as a Register.
    return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
                       getPointerTy(DAG.getDataLayout()));
  return Table;
}

/// This returns the relocation base for the given PIC jumptable,
/// the same as getPICJumpTableRelocBase, but as an MCExpr.
const MCExpr *X86TargetLowering::
getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
                             MCContext &Ctx) const {
  // X86-64 uses RIP relative addressing based on the jump table label.
  if (Subtarget.isPICStyleRIPRel() ||
      (Subtarget.is64Bit() &&
       getTargetMachine().getCodeModel() == CodeModel::Large))
    return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);

  // Otherwise, the reference is relative to the PIC base.
  return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
}

std::pair<const TargetRegisterClass *, uint8_t>
X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
                                           MVT VT) const {
  const TargetRegisterClass *RRC = nullptr;
  uint8_t Cost = 1;
  switch (VT.SimpleTy) {
  default:
    return TargetLowering::findRepresentativeClass(TRI, VT);
  case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
    RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
    break;
  case MVT::x86mmx:
    RRC = &X86::VR64RegClass;
    break;
  case MVT::f32: case MVT::f64:
  case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
  case MVT::v4f32: case MVT::v2f64:
  case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64:
  case MVT::v8f32: case MVT::v4f64:
  case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64:
  case MVT::v16f32: case MVT::v8f64:
    RRC = &X86::VR128XRegClass;
    break;
  }
  return std::make_pair(RRC, Cost);
}

unsigned X86TargetLowering::getAddressSpace() const {
  if (Subtarget.is64Bit())
    return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? X86AS::GS
                                                                    : X86AS::FS;
  return X86AS::GS;
}

static bool hasStackGuardSlotTLS(const Triple &TargetTriple) {
  return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() ||
         (TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17));
}

static Constant *SegmentOffset(IRBuilderBase &IRB,
                               int Offset, unsigned AddressSpace) {
  return ConstantExpr::getIntToPtr(
      ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
      IRB.getPtrTy(AddressSpace));
}
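
// For example (a sketch; in this backend X86AS::GS is address space 256 and
// X86AS::FS is 257): SegmentOffset(IRB, 0x28, X86AS::FS) builds the constant
//   inttoptr (i32 40 to ptr addrspace(257))
// which later lowers to the %fs:0x28 memory operand used for the stack guard.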

Value *X86TargetLowering::getIRStackGuard(IRBuilderBase &IRB) const {
  // glibc, bionic, and Fuchsia have a special slot for the stack guard in
  // tcbhead_t; use it instead of the usual global variable (see
  // sysdeps/{i386,x86_64}/nptl/tls.h)
  if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) {
    unsigned AddressSpace = getAddressSpace();

    // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
    if (Subtarget.isTargetFuchsia())
      return SegmentOffset(IRB, 0x10, AddressSpace);

    Module *M = IRB.GetInsertBlock()->getParent()->getParent();
    // Note that users may customize the guard register and offset.
    int Offset = M->getStackProtectorGuardOffset();
    // If the -stack-protector-guard-offset value is not set, default to
    // %fs:0x28, unless we're using a Kernel code model, in which case it's
    // %gs:0x28; on i386 it's %gs:0x14.
    if (Offset == INT_MAX)
      Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;

    StringRef GuardReg = M->getStackProtectorGuardReg();
    if (GuardReg == "fs")
      AddressSpace = X86AS::FS;
    else if (GuardReg == "gs")
      AddressSpace = X86AS::GS;

    // Use the guard symbol if the user specified one.
    StringRef GuardSymb = M->getStackProtectorGuardSymbol();
    if (!GuardSymb.empty()) {
      GlobalVariable *GV = M->getGlobalVariable(GuardSymb);
      if (!GV) {
        Type *Ty = Subtarget.is64Bit() ? Type::getInt64Ty(M->getContext())
                                       : Type::getInt32Ty(M->getContext());
        GV = new GlobalVariable(*M, Ty, false, GlobalValue::ExternalLinkage,
                                nullptr, GuardSymb, nullptr,
                                GlobalValue::NotThreadLocal, AddressSpace);
        if (!Subtarget.isTargetDarwin())
          GV->setDSOLocal(M->getDirectAccessExternalData());
      }
      return GV;
    }

    return SegmentOffset(IRB, Offset, AddressSpace);
  }
  return TargetLowering::getIRStackGuard(IRB);
}
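
// For example (a sketch of the override path; flag names as provided by
// clang): compiling with
//   -mstack-protector-guard-reg=gs -mstack-protector-guard-offset=0x20
// populates the module fields queried above, so the guard is read from
// %gs:0x20 instead of the glibc defaults of %fs:0x28 (64-bit) or %gs:0x14
// (i386).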

void X86TargetLowering::insertSSPDeclarations(Module &M) const {
  // The MSVC CRT provides functionality for stack protection.
  if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
      Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
    // The MSVC CRT has a global variable holding the security cookie.
    M.getOrInsertGlobal("__security_cookie",
                        PointerType::getUnqual(M.getContext()));

    // The MSVC CRT has a function to validate the security cookie.
    FunctionCallee SecurityCheckCookie = M.getOrInsertFunction(
        "__security_check_cookie", Type::getVoidTy(M.getContext()),
        PointerType::getUnqual(M.getContext()));
    if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) {
      F->setCallingConv(CallingConv::X86_FastCall);
      F->addParamAttr(0, Attribute::AttrKind::InReg);
    }
    return;
  }

  StringRef GuardMode = M.getStackProtectorGuard();

  // glibc, bionic, and Fuchsia have a special slot for the stack guard.
  if ((GuardMode == "tls" || GuardMode.empty()) &&
      hasStackGuardSlotTLS(Subtarget.getTargetTriple()))
    return;
  TargetLowering::insertSSPDeclarations(M);
}

Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
  // The MSVC CRT has a global variable holding the security cookie.
  if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
      Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
    return M.getGlobalVariable("__security_cookie");
  }
  return TargetLowering::getSDagStackGuard(M);
}

Function *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
  // The MSVC CRT has a function to validate the security cookie.
  if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
      Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
    return M.getFunction("__security_check_cookie");
  }
  return TargetLowering::getSSPStackGuardCheck(M);
}

Value *
X86TargetLowering::getSafeStackPointerLocation(IRBuilderBase &IRB) const {
  // Android provides a fixed TLS slot for the SafeStack pointer. See the
  // definition of TLS_SLOT_SAFESTACK in
  // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
  if (Subtarget.isTargetAndroid()) {
    // %fs:0x48, unless we're using a Kernel code model, in which case it's
    // %gs:0x48; %gs:0x24 on i386.
    int Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
    return SegmentOffset(IRB, Offset, getAddressSpace());
  }

  // Fuchsia is similar.
  if (Subtarget.isTargetFuchsia()) {
    // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
    return SegmentOffset(IRB, 0x18, getAddressSpace());
  }

  return TargetLowering::getSafeStackPointerLocation(IRB);
}

//===----------------------------------------------------------------------===//
//               Return Value Calling Convention Implementation
//===----------------------------------------------------------------------===//

bool X86TargetLowering::CanLowerReturn(
    CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
    const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context,
    const Type *RetTy) const {
  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
  return CCInfo.CheckReturn(Outs, RetCC_X86);
}

const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
  static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
  return ScratchRegs;
}

ArrayRef<MCPhysReg> X86TargetLowering::getRoundingControlRegisters() const {
  static const MCPhysReg RCRegs[] = {X86::FPCW, X86::MXCSR};
  return RCRegs;
}

/// Lowers mask values (v*i1) to the local register values.
/// \returns DAG node after lowering to register type
static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
                               const SDLoc &DL, SelectionDAG &DAG) {
  EVT ValVT = ValArg.getValueType();

  if (ValVT == MVT::v1i1)
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ValLoc, ValArg,
                       DAG.getIntPtrConstant(0, DL));

  if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) ||
      (ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) {
    // Two-stage lowering might be required:
    //   bitcast:   v8i1 -> i8  / v16i1 -> i16
    //   anyextend: i8   -> i32 / i16   -> i32
    EVT TempValLoc = ValVT == MVT::v8i1 ? MVT::i8 : MVT::i16;
    SDValue ValToCopy = DAG.getBitcast(TempValLoc, ValArg);
    if (ValLoc == MVT::i32)
      ValToCopy = DAG.getNode(ISD::ANY_EXTEND, DL, ValLoc, ValToCopy);
    return ValToCopy;
  }

  if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) ||
      (ValVT == MVT::v64i1 && ValLoc == MVT::i64)) {
    // One-stage lowering is required:
    //   bitcast: v32i1 -> i32 / v64i1 -> i64
    return DAG.getBitcast(ValLoc, ValArg);
  }

  return DAG.getNode(ISD::ANY_EXTEND, DL, ValLoc, ValArg);
}
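
// For example (a sketch): returning a v16i1 mask in a 32-bit location takes
// the two-stage path above, producing (i32 (any_extend (i16 (bitcast X)))),
// whereas a v32i1 mask in an i32 location is a single bitcast.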

/// Breaks a v64i1 value into two registers and adds the new nodes to the DAG.
static void Passv64i1ArgInRegs(
    const SDLoc &DL, SelectionDAG &DAG, SDValue &Arg,
    SmallVectorImpl<std::pair<Register, SDValue>> &RegsToPass, CCValAssign &VA,
    CCValAssign &NextVA, const X86Subtarget &Subtarget) {
  assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
  assert(Subtarget.is32Bit() && "Expecting 32 bit target");
  assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value");
  assert(VA.isRegLoc() && NextVA.isRegLoc() &&
         "The value should reside in two registers");

  // Before splitting the value we cast it to i64.
  Arg = DAG.getBitcast(MVT::i64, Arg);

  // Split the value into two i32 halves.
  SDValue Lo, Hi;
  std::tie(Lo, Hi) = DAG.SplitScalar(Arg, DL, MVT::i32, MVT::i32);

  // Attach the two i32 halves to their corresponding registers.
  RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));
  RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi));
}
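
// For example (a sketch): on a 32-bit AVX512BW target, a v64i1 value is
// bitcast to i64, split into lo/hi i32 halves, and attached to the register
// pair the calling convention assigned (e.g. EAX:EDX for a return value).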

SDValue
X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
                               bool isVarArg,
                               const SmallVectorImpl<ISD::OutputArg> &Outs,
                               const SmallVectorImpl<SDValue> &OutVals,
                               const SDLoc &dl, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();

  // In some cases we need to disable registers from the default CSR list.
  // For example, when they are used as return registers (preserve_* and X86's
  // regcall) or for argument passing (X86's regcall).
  bool ShouldDisableCalleeSavedRegister =
      shouldDisableRetRegFromCSR(CallConv) ||
      MF.getFunction().hasFnAttribute("no_caller_saved_registers");

  if (CallConv == CallingConv::X86_INTR && !Outs.empty())
    report_fatal_error("X86 interrupts may not return any value");

  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
  CCInfo.AnalyzeReturn(Outs, RetCC_X86);

  SmallVector<std::pair<Register, SDValue>, 4> RetVals;
  for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E;
       ++I, ++OutsIndex) {
    CCValAssign &VA = RVLocs[I];
    assert(VA.isRegLoc() && "Can only return in registers!");

    // Add the register to the CalleeSaveDisableRegs list.
    if (ShouldDisableCalleeSavedRegister)
      MF.getRegInfo().disableCalleeSavedRegister(VA.getLocReg());

    SDValue ValToCopy = OutVals[OutsIndex];
    EVT ValVT = ValToCopy.getValueType();

    // Promote values to the appropriate types.
    if (VA.getLocInfo() == CCValAssign::SExt)
      ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
    else if (VA.getLocInfo() == CCValAssign::ZExt)
      ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
    else if (VA.getLocInfo() == CCValAssign::AExt) {
      if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
        ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG);
      else
        ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
    }
    else if (VA.getLocInfo() == CCValAssign::BCvt)
      ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);

    assert(VA.getLocInfo() != CCValAssign::FPExt &&
           "Unexpected FP-extend for return value.");

    // Report an error if we have attempted to return a value via an XMM
    // register and SSE was disabled.
    if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) {
      errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
      VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
    } else if (!Subtarget.hasSSE2() &&
               X86::FR64XRegClass.contains(VA.getLocReg()) &&
               ValVT == MVT::f64) {
      // When returning a double via an XMM register, report an error if SSE2
      // is not enabled.
      errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
      VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
    }

    // Returns in ST0/ST1 are handled specially: these are pushed as operands
    // to the RET instruction and handled by the FP Stackifier.
    if (VA.getLocReg() == X86::FP0 ||
        VA.getLocReg() == X86::FP1) {
      // If this is a copy from an xmm register to ST(0), use an FPExtend to
      // change the value to the FP stack register class.
      if (isScalarFPTypeInSSEReg(VA.getValVT()))
        ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
      RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
      // Don't emit a copytoreg.
      continue;
    }

    // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
    // which is returned in RAX / RDX.
    if (Subtarget.is64Bit()) {
      if (ValVT == MVT::x86mmx) {
        if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
          ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
          ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
                                  ValToCopy);
          // If we don't have SSE2 available, convert to v4f32 so the generated
          // register is legal.
          if (!Subtarget.hasSSE2())
            ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
        }
      }
    }

    if (VA.needsCustom()) {
      assert(VA.getValVT() == MVT::v64i1 &&
             "Currently the only custom case is when we split v64i1 to 2 regs");

      Passv64i1ArgInRegs(dl, DAG, ValToCopy, RetVals, VA, RVLocs[++I],
                         Subtarget);

      // Add the second register to the CalleeSaveDisableRegs list.
      if (ShouldDisableCalleeSavedRegister)
        MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg());
    } else {
      RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
    }
  }

  SDValue Glue;
  SmallVector<SDValue, 6> RetOps;
  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
  // Operand #1 = Bytes To Pop
  RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
                                         MVT::i32));

  // Copy the result values into the output registers.
  for (auto &RetVal : RetVals) {
    if (RetVal.first == X86::FP0 || RetVal.first == X86::FP1) {
      RetOps.push_back(RetVal.second);
      continue; // Don't emit a copytoreg.
    }

    Chain = DAG.getCopyToReg(Chain, dl, RetVal.first, RetVal.second, Glue);
    Glue = Chain.getValue(1);
    RetOps.push_back(
        DAG.getRegister(RetVal.first, RetVal.second.getValueType()));
  }

  // The Swift calling convention does not require us to copy the sret argument
  // into %rax/%eax for the return, and SRetReturnReg is not set for Swift.

  // All x86 ABIs require that for returning structs by value we copy
  // the sret argument into %rax/%eax (depending on ABI) for the return.
  // We saved the argument into a virtual register in the entry block,
  // so now we copy the value out and into %rax/%eax.
  //
  // Checking Function.hasStructRetAttr() here is insufficient because the IR
  // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
  // false, then an sret argument may be implicitly inserted in the SelDAG. In
  // either case FuncInfo->setSRetReturnReg() will have been called.
  if (Register SRetReg = FuncInfo->getSRetReturnReg()) {
    // When we have both sret and another return value, we should use the
    // original Chain stored in RetOps[0], instead of the current Chain updated
    // in the above loop. If we only have sret, RetOps[0] equals Chain.

    // For the case of sret and another return value, we have
    //   Chain_0 at the function entry
    //   Chain_1 = getCopyToReg(Chain_0) in the above loop
    // If we use Chain_1 in getCopyFromReg, we will have
    //   Val = getCopyFromReg(Chain_1)
    //   Chain_2 = getCopyToReg(Chain_1, Val) from below

    // getCopyToReg(Chain_0) will be glued together with
    // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
    // in Unit B, and we will have cyclic dependency between Unit A and Unit B:
    //   Data dependency from Unit B to Unit A due to usage of Val in
    //     getCopyToReg(Chain_1, Val)
    //   Chain dependency from Unit A to Unit B

    // So here, we use RetOps[0] (i.e. Chain_0) for getCopyFromReg.
    SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
                                     getPointerTy(MF.getDataLayout()));

    Register RetValReg
        = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
          X86::RAX : X86::EAX;
    Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Glue);
    Glue = Chain.getValue(1);

    // RAX/EAX now acts like a return value.
    RetOps.push_back(
        DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));

    // Add the returned register to the CalleeSaveDisableRegs list. Don't do
    // this however for preserve_most/preserve_all to minimize the number of
    // callee-saved registers for these CCs.
    if (ShouldDisableCalleeSavedRegister &&
        CallConv != CallingConv::PreserveAll &&
        CallConv != CallingConv::PreserveMost)
      MF.getRegInfo().disableCalleeSavedRegister(RetValReg);
  }

  const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
  const MCPhysReg *I =
      TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
  if (I) {
    for (; *I; ++I) {
      if (X86::GR64RegClass.contains(*I))
        RetOps.push_back(DAG.getRegister(*I, MVT::i64));
      else
        llvm_unreachable("Unexpected register class in CSRsViaCopy!");
    }
  }

  RetOps[0] = Chain; // Update chain.

  // Add the glue if we have it.
  if (Glue.getNode())
    RetOps.push_back(Glue);

  X86ISD::NodeType opcode = X86ISD::RET_GLUE;
  if (CallConv == CallingConv::X86_INTR)
    opcode = X86ISD::IRET;
  return DAG.getNode(opcode, dl, MVT::Other, RetOps);
}

bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
  if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
    return false;

  SDValue TCChain = Chain;
  SDNode *Copy = *N->user_begin();
  if (Copy->getOpcode() == ISD::CopyToReg) {
    // If the copy has a glue operand, we conservatively assume it isn't safe
    // to perform a tail call.
    if (Copy->getOperand(Copy->getNumOperands() - 1).getValueType() ==
        MVT::Glue)
      return false;
    TCChain = Copy->getOperand(0);
  } else if (Copy->getOpcode() != ISD::FP_EXTEND)
    return false;

  bool HasRet = false;
  for (const SDNode *U : Copy->users()) {
    if (U->getOpcode() != X86ISD::RET_GLUE)
      return false;
    // If we are returning more than one value, we can definitely
    // not make a tail call; see PR19530.
    if (U->getNumOperands() > 4)
      return false;
    if (U->getNumOperands() == 4 &&
        U->getOperand(U->getNumOperands() - 1).getValueType() != MVT::Glue)
      return false;
    HasRet = true;
  }

  if (!HasRet)
    return false;

  Chain = TCChain;
  return true;
}

EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
                                           ISD::NodeType ExtendKind) const {
  MVT ReturnMVT = MVT::i32;

  bool Darwin = Subtarget.getTargetTriple().isOSDarwin();
  if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) {
    // The ABI does not require i1, i8 or i16 to be extended.
    //
    // On Darwin, there is code in the wild relying on Clang's old behaviour of
    // always extending i8/i16 return values, so keep doing that for now.
    // (PR26665).
    ReturnMVT = MVT::i8;
  }

  EVT MinVT = getRegisterType(Context, ReturnMVT);
  return VT.bitsLT(MinVT) ? MinVT : VT;
}
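
// For example: an i8 return value stays i8 on Linux (MinVT is i8 there), but
// on Darwin it is widened to i32 to match Clang's historical behaviour, since
// VT.bitsLT(i32) holds for both i8 and i16.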
| 1004 | |
| 1005 | /// Reads two 32 bit registers and creates a 64 bit mask value. |
| 1006 | /// \param VA The current 32 bit value that need to be assigned. |
| 1007 | /// \param NextVA The next 32 bit value that need to be assigned. |
| 1008 | /// \param Root The parent DAG node. |
| 1009 | /// \param [in,out] InGlue Represents SDvalue in the parent DAG node for |
| 1010 | /// glue purposes. In the case the DAG is already using |
| 1011 | /// physical register instead of virtual, we should glue |
| 1012 | /// our new SDValue to InGlue SDvalue. |
| 1013 | /// \return a new SDvalue of size 64bit. |
| 1014 | static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA, |
| 1015 | SDValue &Root, SelectionDAG &DAG, |
| 1016 | const SDLoc &DL, const X86Subtarget &Subtarget, |
| 1017 | SDValue *InGlue = nullptr) { |
| 1018 | assert((Subtarget.hasBWI()) && "Expected AVX512BW target!" ); |
| 1019 | assert(Subtarget.is32Bit() && "Expecting 32 bit target" ); |
| 1020 | assert(VA.getValVT() == MVT::v64i1 && |
| 1021 | "Expecting first location of 64 bit width type" ); |
| 1022 | assert(NextVA.getValVT() == VA.getValVT() && |
| 1023 | "The locations should have the same type" ); |
| 1024 | assert(VA.isRegLoc() && NextVA.isRegLoc() && |
| 1025 | "The values should reside in two registers" ); |
| 1026 | |
| 1027 | SDValue Lo, Hi; |
| 1028 | SDValue ArgValueLo, ArgValueHi; |
| 1029 | |
| 1030 | MachineFunction &MF = DAG.getMachineFunction(); |
| 1031 | const TargetRegisterClass *RC = &X86::GR32RegClass; |
| 1032 | |
| 1033 | // Read a 32 bit value from the registers. |
| 1034 | if (nullptr == InGlue) { |
| 1035 | // When no physical register is present, |
| 1036 | // create an intermediate virtual register. |
| 1037 | Register Reg = MF.addLiveIn(PReg: VA.getLocReg(), RC); |
| 1038 | ArgValueLo = DAG.getCopyFromReg(Chain: Root, dl: DL, Reg, VT: MVT::i32); |
| 1039 | Reg = MF.addLiveIn(PReg: NextVA.getLocReg(), RC); |
| 1040 | ArgValueHi = DAG.getCopyFromReg(Chain: Root, dl: DL, Reg, VT: MVT::i32); |
| 1041 | } else { |
| 1042 | // When a physical register is available read the value from it and glue |
| 1043 | // the reads together. |
| 1044 | ArgValueLo = |
| 1045 | DAG.getCopyFromReg(Chain: Root, dl: DL, Reg: VA.getLocReg(), VT: MVT::i32, Glue: *InGlue); |
| 1046 | *InGlue = ArgValueLo.getValue(R: 2); |
| 1047 | ArgValueHi = |
| 1048 | DAG.getCopyFromReg(Chain: Root, dl: DL, Reg: NextVA.getLocReg(), VT: MVT::i32, Glue: *InGlue); |
| 1049 | *InGlue = ArgValueHi.getValue(R: 2); |
| 1050 | } |
| 1051 | |
| 1052 | // Convert the i32 type into v32i1 type. |
| 1053 | Lo = DAG.getBitcast(VT: MVT::v32i1, V: ArgValueLo); |
| 1054 | |
| 1055 | // Convert the i32 type into v32i1 type. |
| 1056 | Hi = DAG.getBitcast(VT: MVT::v32i1, V: ArgValueHi); |
| 1057 | |
| 1058 | // Concatenate the two values together. |
| 1059 | return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT: MVT::v64i1, N1: Lo, N2: Hi); |
| 1060 | } |
| 1061 | |
| 1062 | /// The function will lower a register of various sizes (8/16/32/64) |
| 1063 | /// to a mask value of the expected size (v8i1/v16i1/v32i1/v64i1) |
| 1064 | /// \returns a DAG node contains the operand after lowering to mask type. |
| 1065 | static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT, |
| 1066 | const EVT &ValLoc, const SDLoc &DL, |
| 1067 | SelectionDAG &DAG) { |
| 1068 | SDValue ValReturned = ValArg; |
| 1069 | |
| 1070 | if (ValVT == MVT::v1i1) |
| 1071 | return DAG.getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL, VT: MVT::v1i1, Operand: ValReturned); |
| 1072 | |
| 1073 | if (ValVT == MVT::v64i1) { |
| 1074 | // In 32 bit machine, this case is handled by getv64i1Argument |
| 1075 | assert(ValLoc == MVT::i64 && "Expecting only i64 locations" ); |
| 1076 | // In 64 bit machine, There is no need to truncate the value only bitcast |
| 1077 | } else { |
| 1078 | MVT MaskLenVT; |
| 1079 | switch (ValVT.getSimpleVT().SimpleTy) { |
| 1080 | case MVT::v8i1: |
| 1081 | MaskLenVT = MVT::i8; |
| 1082 | break; |
| 1083 | case MVT::v16i1: |
| 1084 | MaskLenVT = MVT::i16; |
| 1085 | break; |
| 1086 | case MVT::v32i1: |
| 1087 | MaskLenVT = MVT::i32; |
| 1088 | break; |
| 1089 | default: |
| 1090 | llvm_unreachable("Expecting a vector of i1 types" ); |
| 1091 | } |
| 1092 | |
| 1093 | ValReturned = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MaskLenVT, Operand: ValReturned); |
| 1094 | } |
| 1095 | return DAG.getBitcast(VT: ValVT, V: ValReturned); |
| 1096 | } |
| 1097 | |
| 1098 | static SDValue getPopFromX87Reg(SelectionDAG &DAG, SDValue Chain, |
| 1099 | const SDLoc &dl, Register Reg, EVT VT, |
| 1100 | SDValue Glue) { |
| 1101 | SDVTList VTs = DAG.getVTList(VT1: VT, VT2: MVT::Other, VT3: MVT::Glue); |
| 1102 | SDValue Ops[] = {Chain, DAG.getRegister(Reg, VT), Glue}; |
| 1103 | return DAG.getNode(Opcode: X86ISD::POP_FROM_X87_REG, DL: dl, VTList: VTs, |
| 1104 | Ops: ArrayRef(Ops, Glue.getNode() ? 3 : 2)); |
| 1105 | } |
| 1106 | |
| 1107 | /// Lower the result values of a call into the |
| 1108 | /// appropriate copies out of appropriate physical registers. |
| 1109 | /// |
| 1110 | SDValue X86TargetLowering::LowerCallResult( |
| 1111 | SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, |
| 1112 | const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, |
| 1113 | SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, |
| 1114 | uint32_t *RegMask) const { |
| 1115 | |
| 1116 | const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo(); |
| 1117 | // Assign locations to each value returned by this call. |
| 1118 | SmallVector<CCValAssign, 16> RVLocs; |
| 1119 | CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, |
| 1120 | *DAG.getContext()); |
| 1121 | CCInfo.AnalyzeCallResult(Ins, Fn: RetCC_X86); |
| 1122 | |
| 1123 | // Copy all of the result registers out of their specified physreg. |
| 1124 | for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E; |
| 1125 | ++I, ++InsIndex) { |
| 1126 | CCValAssign &VA = RVLocs[I]; |
| 1127 | EVT CopyVT = VA.getLocVT(); |
| 1128 | |
| 1129 | // In some calling conventions we need to remove the used registers |
| 1130 | // from the register mask. |
| 1131 | if (RegMask) { |
| 1132 | for (MCPhysReg SubReg : TRI->subregs_inclusive(Reg: VA.getLocReg())) |
| 1133 | RegMask[SubReg / 32] &= ~(1u << (SubReg % 32)); |
| 1134 | } |
| 1135 | |
| 1136 | // Report an error if there was an attempt to return FP values via XMM |
| 1137 | // registers. |
| 1138 | if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(Reg: VA.getLocReg())) { |
| 1139 | errorUnsupported(DAG, dl, Msg: "SSE register return with SSE disabled" ); |
| 1140 | if (VA.getLocReg() == X86::XMM1) |
| 1141 | VA.convertToReg(Reg: X86::FP1); // Set reg to FP1, avoid hitting asserts. |
| 1142 | else |
| 1143 | VA.convertToReg(Reg: X86::FP0); // Set reg to FP0, avoid hitting asserts. |
| 1144 | } else if (!Subtarget.hasSSE2() && |
| 1145 | X86::FR64XRegClass.contains(Reg: VA.getLocReg()) && |
| 1146 | CopyVT == MVT::f64) { |
| 1147 | errorUnsupported(DAG, dl, Msg: "SSE2 register return with SSE2 disabled" ); |
| 1148 | if (VA.getLocReg() == X86::XMM1) |
| 1149 | VA.convertToReg(Reg: X86::FP1); // Set reg to FP1, avoid hitting asserts. |
| 1150 | else |
| 1151 | VA.convertToReg(Reg: X86::FP0); // Set reg to FP0, avoid hitting asserts. |
| 1152 | } |
| 1153 | |
| 1154 | // If we prefer to use the value in xmm registers, copy it out as f80 and |
| 1155 | // use a truncate to move it from fp stack reg to xmm reg. |
| 1156 | bool RoundAfterCopy = false; |
| 1157 | bool X87Result = VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1; |
| 1158 | if (X87Result && isScalarFPTypeInSSEReg(VT: VA.getValVT())) { |
| 1159 | if (!Subtarget.hasX87()) |
| 1160 | report_fatal_error(reason: "X87 register return with X87 disabled" ); |
| 1161 | CopyVT = MVT::f80; |
| 1162 | RoundAfterCopy = (CopyVT != VA.getLocVT()); |
| 1163 | } |
| 1164 | |
| 1165 | SDValue Val; |
| 1166 | if (VA.needsCustom()) { |
| 1167 | assert(VA.getValVT() == MVT::v64i1 && |
| 1168 | "Currently the only custom case is when we split v64i1 to 2 regs" ); |
| 1169 | Val = |
| 1170 | getv64i1Argument(VA, NextVA&: RVLocs[++I], Root&: Chain, DAG, DL: dl, Subtarget, InGlue: &InGlue); |
| 1171 | } else { |
| 1172 | Chain = |
| 1173 | X87Result |
| 1174 | ? getPopFromX87Reg(DAG, Chain, dl, Reg: VA.getLocReg(), VT: CopyVT, Glue: InGlue) |
| 1175 | .getValue(R: 1) |
| 1176 | : DAG.getCopyFromReg(Chain, dl, Reg: VA.getLocReg(), VT: CopyVT, Glue: InGlue) |
| 1177 | .getValue(R: 1); |
| 1178 | Val = Chain.getValue(R: 0); |
| 1179 | InGlue = Chain.getValue(R: 2); |
| 1180 | } |
| 1181 | |
| 1182 | if (RoundAfterCopy) |
| 1183 | Val = DAG.getNode(Opcode: ISD::FP_ROUND, DL: dl, VT: VA.getValVT(), N1: Val, |
| 1184 | // This truncation won't change the value. |
| 1185 | N2: DAG.getIntPtrConstant(Val: 1, DL: dl, /*isTarget=*/true)); |
| 1186 | |
| 1187 | if (VA.isExtInLoc()) { |
| 1188 | if (VA.getValVT().isVector() && |
| 1189 | VA.getValVT().getScalarType() == MVT::i1 && |
| 1190 | ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) || |
| 1191 | (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) { |
| 1192 | // promoting a mask type (v*i1) into a register of type i64/i32/i16/i8 |
| 1193 | Val = lowerRegToMasks(ValArg: Val, ValVT: VA.getValVT(), ValLoc: VA.getLocVT(), DL: dl, DAG); |
| 1194 | } else |
| 1195 | Val = DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: VA.getValVT(), Operand: Val); |
| 1196 | } |
| 1197 | |
| 1198 | if (VA.getLocInfo() == CCValAssign::BCvt) |
| 1199 | Val = DAG.getBitcast(VT: VA.getValVT(), V: Val); |
| 1200 | |
| 1201 | InVals.push_back(Elt: Val); |
| 1202 | } |
| 1203 | |
| 1204 | return Chain; |
| 1205 | } |
| 1206 | |
| 1207 | //===----------------------------------------------------------------------===// |
| 1208 | // C & StdCall & Fast Calling Convention implementation |
| 1209 | //===----------------------------------------------------------------------===// |
| 1210 | // StdCall calling convention seems to be standard for many Windows' API |
| 1211 | // routines and around. It differs from C calling convention just a little: |
| 1212 | // callee should clean up the stack, not caller. Symbols should be also |
| 1213 | // decorated in some fancy way :) It doesn't support any vector arguments. |
| 1214 | // For info on fast calling convention see Fast Calling Convention (tail call) |
| 1215 | // implementation LowerX86_32FastCCCallTo. |
| 1216 | |
| 1217 | /// Determines whether Args, either a set of outgoing arguments to a call, or a |
| 1218 | /// set of incoming args of a call, contains an sret pointer that the callee |
| 1219 | /// pops |
| 1220 | template <typename T> |
| 1221 | static bool hasCalleePopSRet(const SmallVectorImpl<T> &Args, |
| 1222 | const X86Subtarget &Subtarget) { |
| 1223 | // Not C++20 (yet), so no concepts available. |
| 1224 | static_assert(std::is_same_v<T, ISD::OutputArg> || |
| 1225 | std::is_same_v<T, ISD::InputArg>, |
| 1226 | "requires ISD::OutputArg or ISD::InputArg" ); |
| 1227 | |
| 1228 | // Only 32-bit pops the sret. It's a 64-bit world these days, so early-out |
| 1229 | // for most compilations. |
| 1230 | if (!Subtarget.is32Bit()) |
| 1231 | return false; |
| 1232 | |
| 1233 | if (Args.empty()) |
| 1234 | return false; |
| 1235 | |
| 1236 | // Most calls do not have an sret argument; check the first arg next.
| 1237 | const ISD::ArgFlagsTy &Flags = Args[0].Flags; |
| 1238 | if (!Flags.isSRet() || Flags.isInReg()) |
| 1239 | return false; |
| 1240 | |
| 1241 | // The MSVC ABI does not pop the sret.
| 1242 | if (Subtarget.getTargetTriple().isOSMSVCRT()) |
| 1243 | return false; |
| 1244 | |
| 1245 | // MCUs don't pop the sret either.
| 1246 | if (Subtarget.isTargetMCU()) |
| 1247 | return false; |
| 1248 | |
| 1249 | // Otherwise, the callee pops the sret argument.
| 1250 | return true; |
| 1251 | } |
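|      | // For illustration, a hypothetical callee this predicate matches:
|      | //   struct S { int a, b, c; };
|      | //   struct S f(void);   // hidden sret pointer, on the stack, not inreg
|      | // On 32-bit ELF targets, f returns with `ret $4`, popping the sret
|      | // pointer itself; the caller must not pop it again.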
| 1252 | |
| 1253 | /// Make a copy of an aggregate at the address specified by "Src" to the
| 1254 | /// address "Dst", with size and alignment information specified by the byval
| 1255 | /// parameter attribute. The copy will be passed as a byval function parameter.
| 1256 | static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst, |
| 1257 | SDValue Chain, ISD::ArgFlagsTy Flags, |
| 1258 | SelectionDAG &DAG, const SDLoc &dl) { |
| 1259 | SDValue SizeNode = DAG.getIntPtrConstant(Val: Flags.getByValSize(), DL: dl); |
| 1260 | |
| 1261 | return DAG.getMemcpy( |
| 1262 | Chain, dl, Dst, Src, Size: SizeNode, Alignment: Flags.getNonZeroByValAlign(), |
| 1263 | /*isVolatile*/ isVol: false, /*AlwaysInline=*/true, |
| 1264 | /*CI=*/nullptr, OverrideTailCall: std::nullopt, DstPtrInfo: MachinePointerInfo(), SrcPtrInfo: MachinePointerInfo()); |
| 1265 | } |
| 1266 | |
| 1267 | /// Return true if the calling convention is one that we can guarantee TCO for. |
| 1268 | static bool canGuaranteeTCO(CallingConv::ID CC) { |
| 1269 | return (CC == CallingConv::Fast || CC == CallingConv::GHC || |
| 1270 | CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE || |
| 1271 | CC == CallingConv::Tail || CC == CallingConv::SwiftTail); |
| 1272 | } |
| 1273 | |
| 1274 | /// Return true if we might ever do TCO for calls with this calling convention. |
| 1275 | static bool mayTailCallThisCC(CallingConv::ID CC) { |
| 1276 | switch (CC) { |
| 1277 | // C calling conventions: |
| 1278 | case CallingConv::C: |
| 1279 | case CallingConv::Win64: |
| 1280 | case CallingConv::X86_64_SysV: |
| 1281 | case CallingConv::PreserveNone: |
| 1282 | // Callee pop conventions: |
| 1283 | case CallingConv::X86_ThisCall: |
| 1284 | case CallingConv::X86_StdCall: |
| 1285 | case CallingConv::X86_VectorCall: |
| 1286 | case CallingConv::X86_FastCall: |
| 1287 | // Swift: |
| 1288 | case CallingConv::Swift: |
| 1289 | return true; |
| 1290 | default: |
| 1291 | return canGuaranteeTCO(CC); |
| 1292 | } |
| 1293 | } |
| 1294 | |
| 1295 | /// Return true if the function is being made into a tailcall target by |
| 1296 | /// changing its ABI. |
| 1297 | static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) { |
| 1298 | return (GuaranteedTailCallOpt && canGuaranteeTCO(CC)) || |
| 1299 | CC == CallingConv::Tail || CC == CallingConv::SwiftTail; |
| 1300 | } |
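|      | // A minimal IR sketch of when TCO is guaranteed: with the tailcc
|      | // convention (or fastcc plus -tailcallopt), a call such as
|      | //   define tailcc void @caller() {
|      | //     tail call tailcc void @callee()
|      | //     ret void
|      | //   }
|      | // must be lowered as a jump, at the cost of an ABI change that both
|      | // caller and callee have to agree on.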
| 1301 | |
| 1302 | bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const { |
| 1303 | if (!CI->isTailCall()) |
| 1304 | return false; |
| 1305 | |
| 1306 | CallingConv::ID CalleeCC = CI->getCallingConv(); |
| 1307 | if (!mayTailCallThisCC(CC: CalleeCC)) |
| 1308 | return false; |
| 1309 | |
| 1310 | return true; |
| 1311 | } |
| 1312 | |
| 1313 | SDValue |
| 1314 | X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv, |
| 1315 | const SmallVectorImpl<ISD::InputArg> &Ins, |
| 1316 | const SDLoc &dl, SelectionDAG &DAG, |
| 1317 | const CCValAssign &VA, |
| 1318 | MachineFrameInfo &MFI, unsigned i) const { |
| 1319 | // Create the nodes corresponding to a load from this parameter slot. |
| 1320 | ISD::ArgFlagsTy Flags = Ins[i].Flags; |
| 1321 | bool AlwaysUseMutable = shouldGuaranteeTCO( |
| 1322 | CC: CallConv, GuaranteedTailCallOpt: DAG.getTarget().Options.GuaranteedTailCallOpt); |
| 1323 | bool isImmutable = !AlwaysUseMutable && !Flags.isByVal(); |
| 1324 | EVT ValVT; |
| 1325 | MVT PtrVT = getPointerTy(DL: DAG.getDataLayout()); |
| 1326 | |
| 1327 | // If the value is passed by pointer, the address is passed instead of the
| 1328 | // value itself. No need to extend if the mask value and the location share
| 1329 | // the same absolute size.
| 1330 | bool ExtendedInMem = |
| 1331 | VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 && |
| 1332 | VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits(); |
| 1333 | |
| 1334 | if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem) |
| 1335 | ValVT = VA.getLocVT(); |
| 1336 | else |
| 1337 | ValVT = VA.getValVT(); |
| 1338 | |
| 1339 | // FIXME: For now, all byval parameter objects are marked mutable. This can be
| 1340 | // changed with more analysis.
| 1341 | // In case of tail call optimization, mark all arguments mutable, since they
| 1342 | // could be overwritten by the lowering of arguments in case of a tail call.
| 1343 | if (Flags.isByVal()) { |
| 1344 | unsigned Bytes = Flags.getByValSize(); |
| 1345 | if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects. |
| 1346 | |
| 1347 | // FIXME: For now, all byval parameter objects are marked as aliasing. This |
| 1348 | // can be improved with deeper analysis. |
| 1349 | int FI = MFI.CreateFixedObject(Size: Bytes, SPOffset: VA.getLocMemOffset(), IsImmutable: isImmutable, |
| 1350 | /*isAliased=*/true); |
| 1351 | return DAG.getFrameIndex(FI, VT: PtrVT); |
| 1352 | } |
| 1353 | |
| 1354 | EVT ArgVT = Ins[i].ArgVT; |
| 1355 | |
| 1356 | // If this is a vector that has been split into multiple parts, don't elide |
| 1357 | // the copy. The layout on the stack may not match the packed in-memory |
| 1358 | // layout. |
| 1359 | bool ScalarizedVector = ArgVT.isVector() && !VA.getLocVT().isVector(); |
| 1360 | |
| 1361 | // This is an argument in memory. We might be able to perform copy elision. |
| 1362 | // If the argument is passed directly in memory without any extension, then we |
| 1363 | // can perform copy elision. Large vector types, for example, may be passed |
| 1364 | // indirectly by pointer. |
| 1365 | if (Flags.isCopyElisionCandidate() && |
| 1366 | VA.getLocInfo() != CCValAssign::Indirect && !ExtendedInMem && |
| 1367 | !ScalarizedVector) { |
| 1368 | SDValue PartAddr; |
| 1369 | if (Ins[i].PartOffset == 0) { |
| 1370 | // If this is a one-part value or the first part of a multi-part value, |
| 1371 | // create a stack object for the entire argument value type and return a |
| 1372 | // load from our portion of it. This assumes that if the first part of an |
| 1373 | // argument is in memory, the rest will also be in memory. |
| 1374 | int FI = MFI.CreateFixedObject(Size: ArgVT.getStoreSize(), SPOffset: VA.getLocMemOffset(), |
| 1375 | /*IsImmutable=*/false); |
| 1376 | PartAddr = DAG.getFrameIndex(FI, VT: PtrVT); |
| 1377 | return DAG.getLoad( |
| 1378 | VT: ValVT, dl, Chain, Ptr: PartAddr, |
| 1379 | PtrInfo: MachinePointerInfo::getFixedStack(MF&: DAG.getMachineFunction(), FI)); |
| 1380 | } |
| 1381 | |
| 1382 | // This is not the first piece of an argument in memory. See if there is |
| 1383 | // already a fixed stack object including this offset. If so, assume it |
| 1384 | // was created by the PartOffset == 0 branch above and create a load from |
| 1385 | // the appropriate offset into it. |
| 1386 | int64_t PartBegin = VA.getLocMemOffset(); |
| 1387 | int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8; |
| 1388 | int FI = MFI.getObjectIndexBegin(); |
| 1389 | for (; MFI.isFixedObjectIndex(ObjectIdx: FI); ++FI) { |
| 1390 | int64_t ObjBegin = MFI.getObjectOffset(ObjectIdx: FI); |
| 1391 | int64_t ObjEnd = ObjBegin + MFI.getObjectSize(ObjectIdx: FI); |
| 1392 | if (ObjBegin <= PartBegin && PartEnd <= ObjEnd) |
| 1393 | break; |
| 1394 | } |
| 1395 | if (MFI.isFixedObjectIndex(ObjectIdx: FI)) { |
| 1396 | SDValue Addr = |
| 1397 | DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrVT, N1: DAG.getFrameIndex(FI, VT: PtrVT), |
| 1398 | N2: DAG.getIntPtrConstant(Val: Ins[i].PartOffset, DL: dl)); |
| 1399 | return DAG.getLoad(VT: ValVT, dl, Chain, Ptr: Addr, |
| 1400 | PtrInfo: MachinePointerInfo::getFixedStack( |
| 1401 | MF&: DAG.getMachineFunction(), FI, Offset: Ins[i].PartOffset)); |
| 1402 | } |
| 1403 | } |
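|      | // Worked example (hypothetical): an i128 argument passed as two i64 parts
|      | // at stack offsets 0 and 8. The PartOffset == 0 branch creates a single
|      | // 16-byte fixed object covering the whole value; for the second part, the
|      | // loop above finds that object and loads at offset PartOffset (8) within
|      | // it.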
| 1404 | |
| 1405 | int FI = MFI.CreateFixedObject(Size: ValVT.getSizeInBits() / 8, |
| 1406 | SPOffset: VA.getLocMemOffset(), IsImmutable: isImmutable); |
| 1407 | |
| 1408 | // Set SExt or ZExt flag. |
| 1409 | if (VA.getLocInfo() == CCValAssign::ZExt) { |
| 1410 | MFI.setObjectZExt(ObjectIdx: FI, IsZExt: true); |
| 1411 | } else if (VA.getLocInfo() == CCValAssign::SExt) { |
| 1412 | MFI.setObjectSExt(ObjectIdx: FI, IsSExt: true); |
| 1413 | } |
| 1414 | |
| 1415 | MaybeAlign Alignment; |
| 1416 | if (Subtarget.isTargetWindowsMSVC() && !Subtarget.is64Bit() && |
| 1417 | ValVT != MVT::f80) |
| 1418 | Alignment = MaybeAlign(4); |
| 1419 | SDValue FIN = DAG.getFrameIndex(FI, VT: PtrVT); |
| 1420 | SDValue Val = DAG.getLoad( |
| 1421 | VT: ValVT, dl, Chain, Ptr: FIN, |
| 1422 | PtrInfo: MachinePointerInfo::getFixedStack(MF&: DAG.getMachineFunction(), FI), |
| 1423 | Alignment); |
| 1424 | return ExtendedInMem |
| 1425 | ? (VA.getValVT().isVector() |
| 1426 | ? DAG.getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL: dl, VT: VA.getValVT(), Operand: Val) |
| 1427 | : DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: VA.getValVT(), Operand: Val)) |
| 1428 | : Val; |
| 1429 | } |
| 1430 | |
| 1431 | // FIXME: Get this from tablegen. |
| 1432 | static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv, |
| 1433 | const X86Subtarget &Subtarget) { |
| 1434 | assert(Subtarget.is64Bit()); |
| 1435 | |
| 1436 | if (Subtarget.isCallingConvWin64(CC: CallConv)) { |
| 1437 | static const MCPhysReg GPR64ArgRegsWin64[] = { |
| 1438 | X86::RCX, X86::RDX, X86::R8, X86::R9 |
| 1439 | }; |
| 1440 | return GPR64ArgRegsWin64; |
| 1441 | } |
| 1442 | |
| 1443 | static const MCPhysReg GPR64ArgRegs64Bit[] = { |
| 1444 | X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9 |
| 1445 | }; |
| 1446 | return GPR64ArgRegs64Bit; |
| 1447 | } |
| 1448 | |
| 1449 | // FIXME: Get this from tablegen. |
| 1450 | static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF, |
| 1451 | CallingConv::ID CallConv, |
| 1452 | const X86Subtarget &Subtarget) { |
| 1453 | assert(Subtarget.is64Bit()); |
| 1454 | if (Subtarget.isCallingConvWin64(CC: CallConv)) { |
| 1455 | // The XMM registers which might contain vararg parameters are shadowed
| 1456 | // by their paired GPRs. So we only need to save the GPRs to their home
| 1457 | // slots.
| 1458 | // TODO: __vectorcall will change this. |
| 1459 | return {}; |
| 1460 | } |
| 1461 | |
| 1462 | bool isSoftFloat = Subtarget.useSoftFloat(); |
| 1463 | if (isSoftFloat || !Subtarget.hasSSE1()) |
| 1464 | // Kernel mode asks for SSE to be disabled, so there are no XMM argument |
| 1465 | // registers. |
| 1466 | return {}; |
| 1467 | |
| 1468 | static const MCPhysReg XMMArgRegs64Bit[] = { |
| 1469 | X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, |
| 1470 | X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 |
| 1471 | }; |
| 1472 | return XMMArgRegs64Bit; |
| 1473 | } |
| 1474 | |
| 1475 | #ifndef NDEBUG |
| 1476 | static bool isSortedByValueNo(ArrayRef<CCValAssign> ArgLocs) { |
| 1477 | return llvm::is_sorted( |
| 1478 | ArgLocs, [](const CCValAssign &A, const CCValAssign &B) -> bool { |
| 1479 | return A.getValNo() < B.getValNo(); |
| 1480 | }); |
| 1481 | } |
| 1482 | #endif |
| 1483 | |
| 1484 | namespace { |
| 1485 | /// This is a helper class for lowering variable argument parameters.
| 1486 | class VarArgsLoweringHelper { |
| 1487 | public: |
| 1488 | VarArgsLoweringHelper(X86MachineFunctionInfo *FuncInfo, const SDLoc &Loc, |
| 1489 | SelectionDAG &DAG, const X86Subtarget &Subtarget, |
| 1490 | CallingConv::ID CallConv, CCState &CCInfo) |
| 1491 | : FuncInfo(FuncInfo), DL(Loc), DAG(DAG), Subtarget(Subtarget), |
| 1492 | TheMachineFunction(DAG.getMachineFunction()), |
| 1493 | TheFunction(TheMachineFunction.getFunction()), |
| 1494 | FrameInfo(TheMachineFunction.getFrameInfo()), |
| 1495 | FrameLowering(*Subtarget.getFrameLowering()), |
| 1496 | TargLowering(DAG.getTargetLoweringInfo()), CallConv(CallConv), |
| 1497 | CCInfo(CCInfo) {} |
| 1498 | |
| 1499 | // Lower variable argument parameters.
| 1500 | void lowerVarArgsParameters(SDValue &Chain, unsigned StackSize); |
| 1501 | |
| 1502 | private: |
| 1503 | void createVarArgAreaAndStoreRegisters(SDValue &Chain, unsigned StackSize); |
| 1504 | |
| 1505 | void forwardMustTailParameters(SDValue &Chain); |
| 1506 | |
| 1507 | bool is64Bit() const { return Subtarget.is64Bit(); } |
| 1508 | bool isWin64() const { return Subtarget.isCallingConvWin64(CC: CallConv); } |
| 1509 | |
| 1510 | X86MachineFunctionInfo *FuncInfo; |
| 1511 | const SDLoc &DL; |
| 1512 | SelectionDAG &DAG; |
| 1513 | const X86Subtarget &Subtarget; |
| 1514 | MachineFunction &TheMachineFunction; |
| 1515 | const Function &TheFunction; |
| 1516 | MachineFrameInfo &FrameInfo; |
| 1517 | const TargetFrameLowering &FrameLowering; |
| 1518 | const TargetLowering &TargLowering; |
| 1519 | CallingConv::ID CallConv; |
| 1520 | CCState &CCInfo; |
| 1521 | }; |
| 1522 | } // namespace |
| 1523 | |
| 1524 | void VarArgsLoweringHelper::createVarArgAreaAndStoreRegisters( |
| 1525 | SDValue &Chain, unsigned StackSize) { |
| 1526 | // If the function takes a variable number of arguments, make a frame index
| 1527 | // for the start of the first vararg value... for expansion of llvm.va_start.
| 1528 | // We can skip this if there are no va_start calls.
| 1529 | if (is64Bit() || (CallConv != CallingConv::X86_FastCall && |
| 1530 | CallConv != CallingConv::X86_ThisCall)) { |
| 1531 | FuncInfo->setVarArgsFrameIndex( |
| 1532 | FrameInfo.CreateFixedObject(Size: 1, SPOffset: StackSize, IsImmutable: true)); |
| 1533 | } |
| 1534 | |
| 1535 | // 64-bit calling conventions support varargs and register parameters, so we |
| 1536 | // have to do extra work to spill them in the prologue. |
| 1537 | if (is64Bit()) { |
| 1538 | // Find the first unallocated argument registers. |
| 1539 | ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget); |
| 1540 | ArrayRef<MCPhysReg> ArgXMMs = |
| 1541 | get64BitArgumentXMMs(MF&: TheMachineFunction, CallConv, Subtarget); |
| 1542 | unsigned NumIntRegs = CCInfo.getFirstUnallocated(Regs: ArgGPRs); |
| 1543 | unsigned NumXMMRegs = CCInfo.getFirstUnallocated(Regs: ArgXMMs); |
| 1544 | |
| 1545 | assert(!(NumXMMRegs && !Subtarget.hasSSE1()) && |
| 1546 | "SSE register cannot be used when SSE is disabled!" ); |
| 1547 | |
| 1548 | if (isWin64()) { |
| 1549 | // Get to the caller-allocated home save location. Add 8 to account |
| 1550 | // for the return address. |
| 1551 | int HomeOffset = FrameLowering.getOffsetOfLocalArea() + 8; |
| 1552 | FuncInfo->setRegSaveFrameIndex( |
| 1553 | FrameInfo.CreateFixedObject(Size: 1, SPOffset: NumIntRegs * 8 + HomeOffset, IsImmutable: false)); |
| 1554 | // Fixup to set vararg frame on shadow area (4 x i64). |
| 1555 | if (NumIntRegs < 4) |
| 1556 | FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex()); |
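|      | // Example (illustrative): for `int f(int a, int b, ...)` on Win64, two
|      | // GPRs (RCX, RDX) are already allocated, so the save index lands on the
|      | // R8 home slot; the stores emitted below then fill the R8/R9 slots, and
|      | // the vararg frame index points at that same location.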
| 1557 | } else { |
| 1558 | // For X86-64, if there are vararg parameters that are passed via |
| 1559 | // registers, then we must store them to their spots on the stack so |
| 1560 | // they may be loaded by dereferencing the result of va_next. |
| 1561 | FuncInfo->setVarArgsGPOffset(NumIntRegs * 8); |
| 1562 | FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16); |
| 1563 | FuncInfo->setRegSaveFrameIndex(FrameInfo.CreateStackObject( |
| 1564 | Size: ArgGPRs.size() * 8 + ArgXMMs.size() * 16, Alignment: Align(16), isSpillSlot: false)); |
| 1565 | } |
| 1566 | |
| 1567 | SmallVector<SDValue, 6>
| 1568 | LiveGPRs; // SDValues for the GPR registers holding live argument values
| 1569 | SmallVector<SDValue, 8> LiveXMMRegs; // SDValues for the XMM registers
| 1570 | // holding live argument values
| 1571 | SDValue ALVal; // if applicable, holds the SDValue for the %al register
| 1572 | |
| 1573 | // Gather all the live in physical registers. |
| 1574 | for (MCPhysReg Reg : ArgGPRs.slice(N: NumIntRegs)) { |
| 1575 | Register GPR = TheMachineFunction.addLiveIn(PReg: Reg, RC: &X86::GR64RegClass); |
| 1576 | LiveGPRs.push_back(Elt: DAG.getCopyFromReg(Chain, dl: DL, Reg: GPR, VT: MVT::i64)); |
| 1577 | } |
| 1578 | const auto &AvailableXmms = ArgXMMs.slice(N: NumXMMRegs); |
| 1579 | if (!AvailableXmms.empty()) { |
| 1580 | Register AL = TheMachineFunction.addLiveIn(PReg: X86::AL, RC: &X86::GR8RegClass); |
| 1581 | ALVal = DAG.getCopyFromReg(Chain, dl: DL, Reg: AL, VT: MVT::i8); |
| 1582 | for (MCPhysReg Reg : AvailableXmms) { |
| 1583 | // FastRegisterAllocator spills virtual registers at basic
| 1584 | // block boundaries. That leads to uses of xmm registers
| 1585 | // outside of the check for %al. Pass physical registers to
| 1586 | // VASTART_SAVE_XMM_REGS to avoid unnecessary spilling.
| 1587 | TheMachineFunction.getRegInfo().addLiveIn(Reg); |
| 1588 | LiveXMMRegs.push_back(Elt: DAG.getRegister(Reg, VT: MVT::v4f32)); |
| 1589 | } |
| 1590 | } |
| 1591 | |
| 1592 | // Store the integer parameter registers. |
| 1593 | SmallVector<SDValue, 8> MemOps; |
| 1594 | SDValue RSFIN = |
| 1595 | DAG.getFrameIndex(FI: FuncInfo->getRegSaveFrameIndex(), |
| 1596 | VT: TargLowering.getPointerTy(DL: DAG.getDataLayout())); |
| 1597 | unsigned Offset = FuncInfo->getVarArgsGPOffset(); |
| 1598 | for (SDValue Val : LiveGPRs) { |
| 1599 | SDValue FIN = DAG.getNode(Opcode: ISD::ADD, DL, |
| 1600 | VT: TargLowering.getPointerTy(DL: DAG.getDataLayout()), |
| 1601 | N1: RSFIN, N2: DAG.getIntPtrConstant(Val: Offset, DL)); |
| 1602 | SDValue Store = |
| 1603 | DAG.getStore(Chain: Val.getValue(R: 1), dl: DL, Val, Ptr: FIN, |
| 1604 | PtrInfo: MachinePointerInfo::getFixedStack( |
| 1605 | MF&: DAG.getMachineFunction(), |
| 1606 | FI: FuncInfo->getRegSaveFrameIndex(), Offset)); |
| 1607 | MemOps.push_back(Elt: Store); |
| 1608 | Offset += 8; |
| 1609 | } |
| 1610 | |
| 1611 | // Now store the XMM (fp + vector) parameter registers. |
| 1612 | if (!LiveXMMRegs.empty()) { |
| 1613 | SmallVector<SDValue, 12> SaveXMMOps; |
| 1614 | SaveXMMOps.push_back(Elt: Chain); |
| 1615 | SaveXMMOps.push_back(Elt: ALVal); |
| 1616 | SaveXMMOps.push_back(Elt: RSFIN); |
| 1617 | SaveXMMOps.push_back( |
| 1618 | Elt: DAG.getTargetConstant(Val: FuncInfo->getVarArgsFPOffset(), DL, VT: MVT::i32)); |
| 1619 | llvm::append_range(C&: SaveXMMOps, R&: LiveXMMRegs); |
| 1620 | MachineMemOperand *StoreMMO = |
| 1621 | DAG.getMachineFunction().getMachineMemOperand( |
| 1622 | PtrInfo: MachinePointerInfo::getFixedStack( |
| 1623 | MF&: DAG.getMachineFunction(), FI: FuncInfo->getRegSaveFrameIndex(), |
| 1624 | Offset), |
| 1625 | F: MachineMemOperand::MOStore, Size: 128, BaseAlignment: Align(16)); |
| 1626 | MemOps.push_back(Elt: DAG.getMemIntrinsicNode(Opcode: X86ISD::VASTART_SAVE_XMM_REGS, |
| 1627 | dl: DL, VTList: DAG.getVTList(VT: MVT::Other), |
| 1628 | Ops: SaveXMMOps, MemVT: MVT::i8, MMO: StoreMMO)); |
| 1629 | } |
| 1630 | |
| 1631 | if (!MemOps.empty()) |
| 1632 | Chain = DAG.getNode(Opcode: ISD::TokenFactor, DL, VT: MVT::Other, Ops: MemOps); |
| 1633 | } |
| 1634 | } |
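|      | // For reference, the SysV x86-64 register save area built above is laid
|      | // out as (offsets from the reg-save frame index):
|      | //   [0, 48):   RDI, RSI, RDX, RCX, R8, R9  (8 bytes each)
|      | //   [48, 176): XMM0..XMM7                  (16 bytes each)
|      | // The va_list gp_offset/fp_offset fields start at NumIntRegs * 8 and
|      | // 48 + NumXMMRegs * 16, matching the setters above.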
| 1635 | |
| 1636 | void VarArgsLoweringHelper::forwardMustTailParameters(SDValue &Chain) { |
| 1637 | // Find the largest legal vector type. |
| 1638 | MVT VecVT = MVT::Other; |
| 1639 | // FIXME: Only some x86_32 calling conventions support AVX512. |
| 1640 | if (Subtarget.useAVX512Regs() && |
| 1641 | (is64Bit() || (CallConv == CallingConv::X86_VectorCall || |
| 1642 | CallConv == CallingConv::Intel_OCL_BI))) |
| 1643 | VecVT = MVT::v16f32; |
| 1644 | else if (Subtarget.hasAVX()) |
| 1645 | VecVT = MVT::v8f32; |
| 1646 | else if (Subtarget.hasSSE2()) |
| 1647 | VecVT = MVT::v4f32; |
| 1648 | |
| 1649 | // We forward some GPRs and some vector types. |
| 1650 | SmallVector<MVT, 2> RegParmTypes; |
| 1651 | MVT IntVT = is64Bit() ? MVT::i64 : MVT::i32; |
| 1652 | RegParmTypes.push_back(Elt: IntVT); |
| 1653 | if (VecVT != MVT::Other) |
| 1654 | RegParmTypes.push_back(Elt: VecVT); |
| 1655 | |
| 1656 | // Compute the set of forwarded registers. The rest are scratch. |
| 1657 | SmallVectorImpl<ForwardedRegister> &Forwards = |
| 1658 | FuncInfo->getForwardedMustTailRegParms(); |
| 1659 | CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, Fn: CC_X86); |
| 1660 | |
| 1661 | // Forward AL for SysV x86_64 targets, since it is used for varargs. |
| 1662 | if (is64Bit() && !isWin64() && !CCInfo.isAllocated(Reg: X86::AL)) { |
| 1663 | Register ALVReg = TheMachineFunction.addLiveIn(PReg: X86::AL, RC: &X86::GR8RegClass); |
| 1664 | Forwards.push_back(Elt: ForwardedRegister(ALVReg, X86::AL, MVT::i8)); |
| 1665 | } |
| 1666 | |
| 1667 | // Copy all forwards from physical to virtual registers. |
| 1668 | for (ForwardedRegister &FR : Forwards) { |
| 1669 | // FIXME: Can we use a less constrained schedule? |
| 1670 | SDValue RegVal = DAG.getCopyFromReg(Chain, dl: DL, Reg: FR.VReg, VT: FR.VT); |
| 1671 | FR.VReg = TheMachineFunction.getRegInfo().createVirtualRegister( |
| 1672 | RegClass: TargLowering.getRegClassFor(VT: FR.VT)); |
| 1673 | Chain = DAG.getCopyToReg(Chain, dl: DL, Reg: FR.VReg, N: RegVal); |
| 1674 | } |
| 1675 | } |
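|      | // Illustrative use case (pseudocode, not from this file): a variadic
|      | // forwarding thunk
|      | //   void thunk(int x, ...) { musttail return target(x, ...); }
|      | // cannot know which registers actually carry varargs, so every potential
|      | // argument register (GPRs, vector registers, and %al on SysV) is copied
|      | // into a virtual register here for the tail call lowering to re-emit.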
| 1676 | |
| 1677 | void VarArgsLoweringHelper::lowerVarArgsParameters(SDValue &Chain, |
| 1678 | unsigned StackSize) { |
| 1679 | // Set FrameIndex to the 0xAAAAAAA value to mark the unset state.
| 1680 | // If necessary, it will be set to the correct value later.
| 1681 | FuncInfo->setVarArgsFrameIndex(0xAAAAAAA); |
| 1682 | FuncInfo->setRegSaveFrameIndex(0xAAAAAAA); |
| 1683 | |
| 1684 | if (FrameInfo.hasVAStart()) |
| 1685 | createVarArgAreaAndStoreRegisters(Chain, StackSize); |
| 1686 | |
| 1687 | if (FrameInfo.hasMustTailInVarArgFunc()) |
| 1688 | forwardMustTailParameters(Chain); |
| 1689 | } |
| 1690 | |
| 1691 | SDValue X86TargetLowering::LowerFormalArguments( |
| 1692 | SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, |
| 1693 | const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, |
| 1694 | SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { |
| 1695 | MachineFunction &MF = DAG.getMachineFunction(); |
| 1696 | X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); |
| 1697 | |
| 1698 | const Function &F = MF.getFunction(); |
| 1699 | if (F.hasExternalLinkage() && Subtarget.isTargetCygMing() && |
| 1700 | F.getName() == "main" ) |
| 1701 | FuncInfo->setForceFramePointer(true); |
| 1702 | |
| 1703 | MachineFrameInfo &MFI = MF.getFrameInfo(); |
| 1704 | bool Is64Bit = Subtarget.is64Bit(); |
| 1705 | bool IsWin64 = Subtarget.isCallingConvWin64(CC: CallConv); |
| 1706 | |
| 1707 | assert( |
| 1708 | !(IsVarArg && canGuaranteeTCO(CallConv)) && |
| 1709 | "Var args not supported with calling conv' regcall, fastcc, ghc or hipe" ); |
| 1710 | |
| 1711 | // Assign locations to all of the incoming arguments. |
| 1712 | SmallVector<CCValAssign, 16> ArgLocs; |
| 1713 | CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext()); |
| 1714 | |
| 1715 | // Allocate shadow area for Win64. |
| 1716 | if (IsWin64) |
| 1717 | CCInfo.AllocateStack(Size: 32, Alignment: Align(8)); |
| 1718 | |
| 1719 | CCInfo.AnalyzeArguments(Ins, Fn: CC_X86); |
| 1720 | |
| 1721 | // In the vectorcall calling convention, a second pass is required for the
| 1722 | // HVA types.
| 1723 | if (CallingConv::X86_VectorCall == CallConv) { |
| 1724 | CCInfo.AnalyzeArgumentsSecondPass(Args: Ins, Fn: CC_X86); |
| 1725 | } |
| 1726 | |
| 1727 | // The next loop assumes that the locations are in the same order as the
| 1728 | // input arguments.
| 1729 | assert(isSortedByValueNo(ArgLocs) && |
| 1730 | "Argument Location list must be sorted before lowering" ); |
| 1731 | |
| 1732 | SDValue ArgValue; |
| 1733 | for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E; |
| 1734 | ++I, ++InsIndex) { |
| 1735 | assert(InsIndex < Ins.size() && "Invalid Ins index" ); |
| 1736 | CCValAssign &VA = ArgLocs[I]; |
| 1737 | |
| 1738 | if (VA.isRegLoc()) { |
| 1739 | EVT RegVT = VA.getLocVT(); |
| 1740 | if (VA.needsCustom()) { |
| 1741 | assert( |
| 1742 | VA.getValVT() == MVT::v64i1 && |
| 1743 | "Currently the only custom case is when we split v64i1 to 2 regs" ); |
| 1744 | |
| 1745 | // In the regcall calling convention, v64i1 values compiled for a 32-bit
| 1746 | // arch are split up into two registers.
| 1747 | ArgValue = |
| 1748 | getv64i1Argument(VA, NextVA&: ArgLocs[++I], Root&: Chain, DAG, DL: dl, Subtarget); |
| 1749 | } else { |
| 1750 | const TargetRegisterClass *RC; |
| 1751 | if (RegVT == MVT::i8) |
| 1752 | RC = &X86::GR8RegClass; |
| 1753 | else if (RegVT == MVT::i16) |
| 1754 | RC = &X86::GR16RegClass; |
| 1755 | else if (RegVT == MVT::i32) |
| 1756 | RC = &X86::GR32RegClass; |
| 1757 | else if (Is64Bit && RegVT == MVT::i64) |
| 1758 | RC = &X86::GR64RegClass; |
| 1759 | else if (RegVT == MVT::f16) |
| 1760 | RC = Subtarget.hasAVX512() ? &X86::FR16XRegClass : &X86::FR16RegClass; |
| 1761 | else if (RegVT == MVT::f32) |
| 1762 | RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass; |
| 1763 | else if (RegVT == MVT::f64) |
| 1764 | RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass; |
| 1765 | else if (RegVT == MVT::f80) |
| 1766 | RC = &X86::RFP80RegClass; |
| 1767 | else if (RegVT == MVT::f128) |
| 1768 | RC = &X86::VR128RegClass; |
| 1769 | else if (RegVT.is512BitVector()) |
| 1770 | RC = &X86::VR512RegClass; |
| 1771 | else if (RegVT.is256BitVector()) |
| 1772 | RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass; |
| 1773 | else if (RegVT.is128BitVector()) |
| 1774 | RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass; |
| 1775 | else if (RegVT == MVT::x86mmx) |
| 1776 | RC = &X86::VR64RegClass; |
| 1777 | else if (RegVT == MVT::v1i1) |
| 1778 | RC = &X86::VK1RegClass; |
| 1779 | else if (RegVT == MVT::v8i1) |
| 1780 | RC = &X86::VK8RegClass; |
| 1781 | else if (RegVT == MVT::v16i1) |
| 1782 | RC = &X86::VK16RegClass; |
| 1783 | else if (RegVT == MVT::v32i1) |
| 1784 | RC = &X86::VK32RegClass; |
| 1785 | else if (RegVT == MVT::v64i1) |
| 1786 | RC = &X86::VK64RegClass; |
| 1787 | else |
| 1788 | llvm_unreachable("Unknown argument type!" ); |
| 1789 | |
| 1790 | Register Reg = MF.addLiveIn(PReg: VA.getLocReg(), RC); |
| 1791 | ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, VT: RegVT); |
| 1792 | } |
| 1793 | |
| 1794 | // If this is an 8 or 16-bit value, it is really passed promoted to 32 |
| 1795 | // bits. Insert an assert[sz]ext to capture this, then truncate to the |
| 1796 | // right size. |
| 1797 | if (VA.getLocInfo() == CCValAssign::SExt) |
| 1798 | ArgValue = DAG.getNode(Opcode: ISD::AssertSext, DL: dl, VT: RegVT, N1: ArgValue, |
| 1799 | N2: DAG.getValueType(VA.getValVT())); |
| 1800 | else if (VA.getLocInfo() == CCValAssign::ZExt) |
| 1801 | ArgValue = DAG.getNode(Opcode: ISD::AssertZext, DL: dl, VT: RegVT, N1: ArgValue, |
| 1802 | N2: DAG.getValueType(VA.getValVT())); |
| 1803 | else if (VA.getLocInfo() == CCValAssign::BCvt) |
| 1804 | ArgValue = DAG.getBitcast(VT: VA.getValVT(), V: ArgValue); |
| 1805 | |
| 1806 | if (VA.isExtInLoc()) { |
| 1807 | // Handle MMX values passed in XMM regs. |
| 1808 | if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1) |
| 1809 | ArgValue = DAG.getNode(Opcode: X86ISD::MOVDQ2Q, DL: dl, VT: VA.getValVT(), Operand: ArgValue); |
| 1810 | else if (VA.getValVT().isVector() && |
| 1811 | VA.getValVT().getScalarType() == MVT::i1 && |
| 1812 | ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) || |
| 1813 | (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) { |
| 1814 | // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8 |
| 1815 | ArgValue = lowerRegToMasks(ValArg: ArgValue, ValVT: VA.getValVT(), ValLoc: RegVT, DL: dl, DAG); |
| 1816 | } else |
| 1817 | ArgValue = DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: VA.getValVT(), Operand: ArgValue); |
| 1818 | } |
| 1819 | } else { |
| 1820 | assert(VA.isMemLoc()); |
| 1821 | ArgValue = |
| 1822 | LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i: InsIndex); |
| 1823 | } |
| 1824 | |
| 1825 | // If the value is passed via a pointer, do a load.
| 1826 | if (VA.getLocInfo() == CCValAssign::Indirect && |
| 1827 | !(Ins[I].Flags.isByVal() && VA.isRegLoc())) { |
| 1828 | ArgValue = |
| 1829 | DAG.getLoad(VT: VA.getValVT(), dl, Chain, Ptr: ArgValue, PtrInfo: MachinePointerInfo()); |
| 1830 | } |
| 1831 | |
| 1832 | InVals.push_back(Elt: ArgValue); |
| 1833 | } |
| 1834 | |
| 1835 | for (unsigned I = 0, E = Ins.size(); I != E; ++I) { |
| 1836 | if (Ins[I].Flags.isSwiftAsync()) { |
| 1837 | auto X86FI = MF.getInfo<X86MachineFunctionInfo>(); |
| 1838 | if (X86::isExtendedSwiftAsyncFrameSupported(Subtarget, MF)) |
| 1839 | X86FI->setHasSwiftAsyncContext(true); |
| 1840 | else { |
| 1841 | int PtrSize = Subtarget.is64Bit() ? 8 : 4; |
| 1842 | int FI = |
| 1843 | MF.getFrameInfo().CreateStackObject(Size: PtrSize, Alignment: Align(PtrSize), isSpillSlot: false); |
| 1844 | X86FI->setSwiftAsyncContextFrameIdx(FI); |
| 1845 | SDValue St = DAG.getStore( |
| 1846 | Chain: DAG.getEntryNode(), dl, Val: InVals[I], |
| 1847 | Ptr: DAG.getFrameIndex(FI, VT: PtrSize == 8 ? MVT::i64 : MVT::i32), |
| 1848 | PtrInfo: MachinePointerInfo::getFixedStack(MF, FI)); |
| 1849 | Chain = DAG.getNode(Opcode: ISD::TokenFactor, DL: dl, VT: MVT::Other, N1: St, N2: Chain); |
| 1850 | } |
| 1851 | } |
| 1852 | |
| 1853 | // Swift calling convention does not require we copy the sret argument |
| 1854 | // into %rax/%eax for the return. We don't set SRetReturnReg for Swift. |
| 1855 | if (CallConv == CallingConv::Swift || CallConv == CallingConv::SwiftTail) |
| 1856 | continue; |
| 1857 | |
| 1858 | // All x86 ABIs require that for returning structs by value we copy the |
| 1859 | // sret argument into %rax/%eax (depending on ABI) for the return. Save |
| 1860 | // the argument into a virtual register so that we can access it from the |
| 1861 | // return points. |
| 1862 | if (Ins[I].Flags.isSRet()) { |
| 1863 | assert(!FuncInfo->getSRetReturnReg() && |
| 1864 | "SRet return has already been set" ); |
| 1865 | MVT PtrTy = getPointerTy(DL: DAG.getDataLayout()); |
| 1866 | Register Reg = |
| 1867 | MF.getRegInfo().createVirtualRegister(RegClass: getRegClassFor(VT: PtrTy)); |
| 1868 | FuncInfo->setSRetReturnReg(Reg); |
| 1869 | SDValue Copy = DAG.getCopyToReg(Chain: DAG.getEntryNode(), dl, Reg, N: InVals[I]); |
| 1870 | Chain = DAG.getNode(Opcode: ISD::TokenFactor, DL: dl, VT: MVT::Other, N1: Copy, N2: Chain); |
| 1871 | break; |
| 1872 | } |
| 1873 | } |
| 1874 | |
| 1875 | unsigned StackSize = CCInfo.getStackSize(); |
| 1876 | // Align stack specially for tail calls. |
| 1877 | if (shouldGuaranteeTCO(CC: CallConv, |
| 1878 | GuaranteedTailCallOpt: MF.getTarget().Options.GuaranteedTailCallOpt)) |
| 1879 | StackSize = GetAlignedArgumentStackSize(StackSize, DAG); |
| 1880 | |
| 1881 | if (IsVarArg) |
| 1882 | VarArgsLoweringHelper(FuncInfo, dl, DAG, Subtarget, CallConv, CCInfo) |
| 1883 | .lowerVarArgsParameters(Chain, StackSize); |
| 1884 | |
| 1885 | // Some CCs need callee pop. |
| 1886 | if (X86::isCalleePop(CallingConv: CallConv, is64Bit: Is64Bit, IsVarArg, |
| 1887 | GuaranteeTCO: MF.getTarget().Options.GuaranteedTailCallOpt)) { |
| 1888 | FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything. |
| 1889 | } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) { |
| 1890 | // X86 interrupts must pop the error code (and the alignment padding) if |
| 1891 | // present. |
| 1892 | FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4); |
| 1893 | } else { |
| 1894 | FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing. |
| 1895 | // If this is an sret function, the return should pop the hidden pointer. |
| 1896 | if (!canGuaranteeTCO(CC: CallConv) && hasCalleePopSRet(Args: Ins, Subtarget)) |
| 1897 | FuncInfo->setBytesToPopOnReturn(4); |
| 1898 | } |
| 1899 | |
| 1900 | if (!Is64Bit) { |
| 1901 | // RegSaveFrameIndex is X86-64 only. |
| 1902 | FuncInfo->setRegSaveFrameIndex(0xAAAAAAA); |
| 1903 | } |
| 1904 | |
| 1905 | FuncInfo->setArgumentStackSize(StackSize); |
| 1906 | |
| 1907 | if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) { |
| 1908 | EHPersonality Personality = classifyEHPersonality(Pers: F.getPersonalityFn()); |
| 1909 | if (Personality == EHPersonality::CoreCLR) { |
| 1910 | assert(Is64Bit); |
| 1911 | // TODO: Add a mechanism to frame lowering that will allow us to indicate |
| 1912 | // that we'd prefer this slot be allocated towards the bottom of the frame |
| 1913 | // (i.e. near the stack pointer after allocating the frame). Every |
| 1914 | // funclet needs a copy of this slot in its (mostly empty) frame, and the |
| 1915 | // offset from the bottom of this and each funclet's frame must be the |
| 1916 | // same, so the size of funclets' (mostly empty) frames is dictated by |
| 1917 | // how far this slot is from the bottom (since they allocate just enough |
| 1918 | // space to accommodate holding this slot at the correct offset). |
| 1919 | int PSPSymFI = MFI.CreateStackObject(Size: 8, Alignment: Align(8), /*isSpillSlot=*/false); |
| 1920 | EHInfo->PSPSymFrameIdx = PSPSymFI; |
| 1921 | } |
| 1922 | } |
| 1923 | |
| 1924 | if (shouldDisableArgRegFromCSR(CC: CallConv) || |
| 1925 | F.hasFnAttribute(Kind: "no_caller_saved_registers" )) { |
| 1926 | MachineRegisterInfo &MRI = MF.getRegInfo(); |
| 1927 | for (std::pair<MCRegister, Register> Pair : MRI.liveins()) |
| 1928 | MRI.disableCalleeSavedRegister(Reg: Pair.first); |
| 1929 | } |
| 1930 | |
| 1931 | if (CallingConv::PreserveNone == CallConv) |
| 1932 | for (const ISD::InputArg &In : Ins) { |
| 1933 | if (In.Flags.isSwiftSelf() || In.Flags.isSwiftAsync() || |
| 1934 | In.Flags.isSwiftError()) { |
| 1935 | errorUnsupported(DAG, dl, |
| 1936 | Msg: "Swift attributes can't be used with preserve_none" ); |
| 1937 | break; |
| 1938 | } |
| 1939 | } |
| 1940 | |
| 1941 | return Chain; |
| 1942 | } |
| 1943 | |
| 1944 | SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, |
| 1945 | SDValue Arg, const SDLoc &dl, |
| 1946 | SelectionDAG &DAG, |
| 1947 | const CCValAssign &VA, |
| 1948 | ISD::ArgFlagsTy Flags, |
| 1949 | bool isByVal) const { |
| 1950 | unsigned LocMemOffset = VA.getLocMemOffset(); |
| 1951 | SDValue PtrOff = DAG.getIntPtrConstant(Val: LocMemOffset, DL: dl); |
| 1952 | PtrOff = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: getPointerTy(DL: DAG.getDataLayout()), |
| 1953 | N1: StackPtr, N2: PtrOff); |
| 1954 | if (isByVal) |
| 1955 | return CreateCopyOfByValArgument(Src: Arg, Dst: PtrOff, Chain, Flags, DAG, dl); |
| 1956 | |
| 1957 | MaybeAlign Alignment; |
| 1958 | if (Subtarget.isTargetWindowsMSVC() && !Subtarget.is64Bit() && |
| 1959 | Arg.getSimpleValueType() != MVT::f80) |
| 1960 | Alignment = MaybeAlign(4); |
| 1961 | return DAG.getStore( |
| 1962 | Chain, dl, Val: Arg, Ptr: PtrOff, |
| 1963 | PtrInfo: MachinePointerInfo::getStack(MF&: DAG.getMachineFunction(), Offset: LocMemOffset), |
| 1964 | Alignment); |
| 1965 | } |
| 1966 | |
| 1967 | /// Emit a load of the return address if tail call
| 1968 | /// optimization is performed and it is required.
| 1969 | SDValue X86TargetLowering::EmitTailCallLoadRetAddr( |
| 1970 | SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall, |
| 1971 | bool Is64Bit, int FPDiff, const SDLoc &dl) const { |
| 1972 | // Adjust the Return address stack slot. |
| 1973 | EVT VT = getPointerTy(DL: DAG.getDataLayout()); |
| 1974 | OutRetAddr = getReturnAddressFrameIndex(DAG); |
| 1975 | |
| 1976 | // Load the "old" Return address. |
| 1977 | OutRetAddr = DAG.getLoad(VT, dl, Chain, Ptr: OutRetAddr, PtrInfo: MachinePointerInfo()); |
| 1978 | return SDValue(OutRetAddr.getNode(), 1); |
| 1979 | } |
| 1980 | |
| 1981 | /// Emit a store of the return address if tail call |
| 1982 | /// optimization is performed and it is required (FPDiff!=0). |
| 1983 | static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF, |
| 1984 | SDValue Chain, SDValue RetAddrFrIdx, |
| 1985 | EVT PtrVT, unsigned SlotSize, |
| 1986 | int FPDiff, const SDLoc &dl) { |
| 1987 | // Store the return address to the appropriate stack slot. |
| 1988 | if (!FPDiff) return Chain; |
| 1989 | // Calculate the new stack slot for the return address. |
| 1990 | int NewReturnAddrFI = |
| 1991 | MF.getFrameInfo().CreateFixedObject(Size: SlotSize, SPOffset: (int64_t)FPDiff - SlotSize, |
| 1992 | IsImmutable: false); |
| 1993 | SDValue NewRetAddrFrIdx = DAG.getFrameIndex(FI: NewReturnAddrFI, VT: PtrVT); |
| 1994 | Chain = DAG.getStore(Chain, dl, Val: RetAddrFrIdx, Ptr: NewRetAddrFrIdx, |
| 1995 | PtrInfo: MachinePointerInfo::getFixedStack( |
| 1996 | MF&: DAG.getMachineFunction(), FI: NewReturnAddrFI)); |
| 1997 | return Chain; |
| 1998 | } |
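|      | // Worked example (hypothetical numbers): if the caller reserved 8 bytes
|      | // of argument space but the tail callee needs 24, FPDiff = 8 - 24 = -16,
|      | // so the return address is re-stored 16 bytes lower (the fixed object at
|      | // FPDiff - SlotSize) before the jump to the callee.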
| 1999 | |
| 2000 | /// Returns a vector_shuffle mask for a movs{s|d} or movd
| 2001 | /// operation of the specified width.
| 2002 | SDValue X86TargetLowering::getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, |
| 2003 | SDValue V1, SDValue V2) const { |
| 2004 | unsigned NumElems = VT.getVectorNumElements(); |
| 2005 | SmallVector<int, 8> Mask; |
| 2006 | Mask.push_back(Elt: NumElems); |
| 2007 | for (unsigned i = 1; i != NumElems; ++i) |
| 2008 | Mask.push_back(Elt: i); |
| 2009 | return DAG.getVectorShuffle(VT, dl, N1: V1, N2: V2, Mask); |
| 2010 | } |
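|      | // For example, for VT = v4f32 this builds the mask <4, 1, 2, 3>: indices
|      | // >= NumElems select from V2, so element 0 comes from V2 and elements 1-3
|      | // come from V1 - exactly the movss behavior of replacing only the low
|      | // element.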
| 2011 | |
| 2012 | SDValue |
| 2013 | X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, |
| 2014 | SmallVectorImpl<SDValue> &InVals) const { |
| 2015 | SelectionDAG &DAG = CLI.DAG; |
| 2016 | SDLoc &dl = CLI.DL; |
| 2017 | SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs; |
| 2018 | SmallVectorImpl<SDValue> &OutVals = CLI.OutVals; |
| 2019 | SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins; |
| 2020 | SDValue Chain = CLI.Chain; |
| 2021 | SDValue Callee = CLI.Callee; |
| 2022 | CallingConv::ID CallConv = CLI.CallConv; |
| 2023 | bool &isTailCall = CLI.IsTailCall; |
| 2024 | bool isVarArg = CLI.IsVarArg; |
| 2025 | const auto *CB = CLI.CB; |
| 2026 | |
| 2027 | MachineFunction &MF = DAG.getMachineFunction(); |
| 2028 | bool Is64Bit = Subtarget.is64Bit(); |
| 2029 | bool IsWin64 = Subtarget.isCallingConvWin64(CC: CallConv); |
| 2030 | bool IsSibcall = false; |
| 2031 | bool IsGuaranteeTCO = MF.getTarget().Options.GuaranteedTailCallOpt || |
| 2032 | CallConv == CallingConv::Tail || CallConv == CallingConv::SwiftTail; |
| 2033 | bool IsCalleePopSRet = !IsGuaranteeTCO && hasCalleePopSRet(Args: Outs, Subtarget); |
| 2034 | X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>(); |
| 2035 | bool HasNCSR = (CB && isa<CallInst>(Val: CB) && |
| 2036 | CB->hasFnAttr(Kind: "no_caller_saved_registers" )); |
| 2037 | bool IsIndirectCall = (CB && isa<CallInst>(Val: CB) && CB->isIndirectCall()); |
| 2038 | bool IsCFICall = IsIndirectCall && CLI.CFIType; |
| 2039 | const Module *M = MF.getFunction().getParent(); |
| 2040 | |
| 2041 | // If the indirect call target has the nocf_check attribute, the call needs |
| 2042 | // the NOTRACK prefix. For simplicity just disable tail calls as there are |
| 2043 | // so many variants. |
| 2044 | bool IsNoTrackIndirectCall = IsIndirectCall && CB->doesNoCfCheck() && |
| 2045 | M->getModuleFlag(Key: "cf-protection-branch" ); |
| 2046 | if (IsNoTrackIndirectCall) |
| 2047 | isTailCall = false; |
| 2048 | |
| 2049 | MachineFunction::CallSiteInfo CSInfo; |
| 2050 | if (CallConv == CallingConv::X86_INTR) |
| 2051 | report_fatal_error(reason: "X86 interrupts may not be called directly" ); |
| 2052 | |
| 2053 | if (IsIndirectCall && !IsWin64 && |
| 2054 | M->getModuleFlag(Key: "import-call-optimization" )) |
| 2055 | errorUnsupported(DAG, dl, |
| 2056 | Msg: "Indirect calls must have a normal calling convention if " |
| 2057 | "Import Call Optimization is enabled" ); |
| 2058 | |
| 2059 | // Analyze operands of the call, assigning locations to each operand. |
| 2060 | SmallVector<CCValAssign, 16> ArgLocs; |
| 2061 | CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext()); |
| 2062 | |
| 2063 | // Allocate shadow area for Win64. |
| 2064 | if (IsWin64) |
| 2065 | CCInfo.AllocateStack(Size: 32, Alignment: Align(8)); |
| 2066 | |
| 2067 | CCInfo.AnalyzeArguments(Outs, Fn: CC_X86); |
| 2068 | |
| 2069 | // In the vectorcall calling convention, a second pass is required for the
| 2070 | // HVA types.
| 2071 | if (CallingConv::X86_VectorCall == CallConv) { |
| 2072 | CCInfo.AnalyzeArgumentsSecondPass(Args: Outs, Fn: CC_X86); |
| 2073 | } |
| 2074 | |
| 2075 | bool IsMustTail = CLI.CB && CLI.CB->isMustTailCall(); |
| 2076 | if (Subtarget.isPICStyleGOT() && !IsGuaranteeTCO && !IsMustTail) { |
| 2077 | // If we are using a GOT, disable tail calls to external symbols with |
| 2078 | // default visibility. Tail calling such a symbol requires using a GOT |
| 2079 | // relocation, which forces early binding of the symbol. This breaks code |
| 2080 | // that requires lazy function symbol resolution. Using musttail or
| 2081 | // GuaranteedTailCallOpt will override this. |
| 2082 | GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Val&: Callee); |
| 2083 | if (!G || (!G->getGlobal()->hasLocalLinkage() && |
| 2084 | G->getGlobal()->hasDefaultVisibility())) |
| 2085 | isTailCall = false; |
| 2086 | } |
| 2087 | |
| 2088 | if (isTailCall && !IsMustTail) { |
| 2089 | // Check if it's really possible to do a tail call. |
| 2090 | isTailCall = IsEligibleForTailCallOptimization(CLI, CCInfo, ArgLocs, |
| 2091 | IsCalleePopSRet); |
| 2092 | |
| 2093 | // Sibcalls are automatically detected tailcalls which do not require |
| 2094 | // ABI changes. |
| 2095 | if (!IsGuaranteeTCO && isTailCall) |
| 2096 | IsSibcall = true; |
| 2097 | |
| 2098 | if (isTailCall) |
| 2099 | ++NumTailCalls; |
| 2100 | } |
| 2101 | |
| 2102 | if (IsMustTail && !isTailCall) |
| 2103 | report_fatal_error(reason: "failed to perform tail call elimination on a call " |
| 2104 | "site marked musttail" ); |
| 2105 | |
| 2106 | assert(!(isVarArg && canGuaranteeTCO(CallConv)) && |
| 2107 | "Var args not supported with calling convention fastcc, ghc or hipe" ); |
| 2108 | |
| 2109 | // Get a count of how many bytes are to be pushed on the stack. |
| 2110 | unsigned NumBytes = CCInfo.getAlignedCallFrameSize(); |
| 2111 | if (IsSibcall) |
| 2112 | // This is a sibcall. The memory operands are already available in our
| 2113 | // own caller's stack, i.e. the incoming argument area.
| 2114 | NumBytes = 0; |
| 2115 | else if (IsGuaranteeTCO && canGuaranteeTCO(CC: CallConv)) |
| 2116 | NumBytes = GetAlignedArgumentStackSize(StackSize: NumBytes, DAG); |
| 2117 | |
| 2118 | int FPDiff = 0; |
| 2119 | if (isTailCall && |
| 2120 | shouldGuaranteeTCO(CC: CallConv, |
| 2121 | GuaranteedTailCallOpt: MF.getTarget().Options.GuaranteedTailCallOpt)) { |
| 2122 | // Lower arguments at fp - stackoffset + fpdiff. |
| 2123 | unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn(); |
| 2124 | |
| 2125 | FPDiff = NumBytesCallerPushed - NumBytes; |
| 2126 | |
| 2127 | // Record the delta of movement of the return address stack slot, but only
| 2128 | // if the new delta is smaller (a larger downward move) than the previous one.
| 2129 | if (FPDiff < X86Info->getTCReturnAddrDelta()) |
| 2130 | X86Info->setTCReturnAddrDelta(FPDiff); |
| 2131 | } |
| 2132 | |
| 2133 | unsigned NumBytesToPush = NumBytes; |
| 2134 | unsigned NumBytesToPop = NumBytes; |
| 2135 | |
| 2136 | // If we have an inalloca argument, all stack space has already been allocated
| 2137 | // for us and is right at the top of the stack. We don't support multiple
| 2138 | // arguments passed in memory when using inalloca.
| 2139 | if (!Outs.empty() && Outs.back().Flags.isInAlloca()) { |
| 2140 | NumBytesToPush = 0; |
| 2141 | if (!ArgLocs.back().isMemLoc()) |
| 2142 | report_fatal_error(reason: "cannot use inalloca attribute on a register " |
| 2143 | "parameter" ); |
| 2144 | if (ArgLocs.back().getLocMemOffset() != 0) |
| 2145 | report_fatal_error(reason: "any parameter with the inalloca attribute must be " |
| 2146 | "the only memory argument" ); |
| 2147 | } else if (CLI.IsPreallocated) { |
| 2148 | assert(ArgLocs.back().isMemLoc() && |
| 2149 | "cannot use preallocated attribute on a register " |
| 2150 | "parameter" ); |
| 2151 | SmallVector<size_t, 4> PreallocatedOffsets; |
| 2152 | for (size_t i = 0; i < CLI.OutVals.size(); ++i) { |
| 2153 | if (CLI.CB->paramHasAttr(ArgNo: i, Kind: Attribute::Preallocated)) { |
| 2154 | PreallocatedOffsets.push_back(Elt: ArgLocs[i].getLocMemOffset()); |
| 2155 | } |
| 2156 | } |
| 2157 | auto *MFI = DAG.getMachineFunction().getInfo<X86MachineFunctionInfo>(); |
| 2158 | size_t PreallocatedId = MFI->getPreallocatedIdForCallSite(CS: CLI.CB); |
| 2159 | MFI->setPreallocatedStackSize(Id: PreallocatedId, StackSize: NumBytes); |
| 2160 | MFI->setPreallocatedArgOffsets(Id: PreallocatedId, AO: PreallocatedOffsets); |
| 2161 | NumBytesToPush = 0; |
| 2162 | } |
| 2163 | |
| 2164 | if (!IsSibcall && !IsMustTail) |
| 2165 | Chain = DAG.getCALLSEQ_START(Chain, InSize: NumBytesToPush, |
| 2166 | OutSize: NumBytes - NumBytesToPush, DL: dl); |
| 2167 | |
| 2168 | SDValue RetAddrFrIdx; |
| 2169 | // Load return address for tail calls. |
| 2170 | if (isTailCall && FPDiff) |
| 2171 | Chain = EmitTailCallLoadRetAddr(DAG, OutRetAddr&: RetAddrFrIdx, Chain, IsTailCall: isTailCall, |
| 2172 | Is64Bit, FPDiff, dl); |
| 2173 | |
| 2174 | SmallVector<std::pair<Register, SDValue>, 8> RegsToPass; |
| 2175 | SmallVector<SDValue, 8> MemOpChains; |
| 2176 | SDValue StackPtr; |
| 2177 | |
| 2178 | // The next loop assumes that the locations are in the same order as the
| 2179 | // input arguments.
| 2180 | assert(isSortedByValueNo(ArgLocs) && |
| 2181 | "Argument Location list must be sorted before lowering" ); |
| 2182 | |
| 2183 | // Walk the register/memloc assignments, inserting copies/loads. In the case
| 2184 | // of tail call optimization, arguments are handled later.
| 2185 | const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); |
| 2186 | for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E; |
| 2187 | ++I, ++OutIndex) { |
| 2188 | assert(OutIndex < Outs.size() && "Invalid Out index" ); |
| 2189 | // Skip inalloca/preallocated arguments, they have already been written. |
| 2190 | ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags; |
| 2191 | if (Flags.isInAlloca() || Flags.isPreallocated()) |
| 2192 | continue; |
| 2193 | |
| 2194 | CCValAssign &VA = ArgLocs[I]; |
| 2195 | EVT RegVT = VA.getLocVT(); |
| 2196 | SDValue Arg = OutVals[OutIndex]; |
| 2197 | bool isByVal = Flags.isByVal(); |
| 2198 | |
| 2199 | // Promote the value if needed. |
| 2200 | switch (VA.getLocInfo()) { |
| 2201 | default: llvm_unreachable("Unknown loc info!" ); |
| 2202 | case CCValAssign::Full: break; |
| 2203 | case CCValAssign::SExt: |
| 2204 | Arg = DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL: dl, VT: RegVT, Operand: Arg); |
| 2205 | break; |
| 2206 | case CCValAssign::ZExt: |
| 2207 | Arg = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: dl, VT: RegVT, Operand: Arg); |
| 2208 | break; |
| 2209 | case CCValAssign::AExt: |
| 2210 | if (Arg.getValueType().isVector() && |
| 2211 | Arg.getValueType().getVectorElementType() == MVT::i1) |
| 2212 | Arg = lowerMasksToReg(ValArg: Arg, ValLoc: RegVT, DL: dl, DAG); |
| 2213 | else if (RegVT.is128BitVector()) { |
| 2214 | // Special case: passing MMX values in XMM registers. |
| 2215 | Arg = DAG.getBitcast(VT: MVT::i64, V: Arg); |
| 2216 | Arg = DAG.getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL: dl, VT: MVT::v2i64, Operand: Arg); |
| 2217 | Arg = getMOVL(DAG, dl, VT: MVT::v2i64, V1: DAG.getUNDEF(VT: MVT::v2i64), V2: Arg); |
| 2218 | } else |
| 2219 | Arg = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: dl, VT: RegVT, Operand: Arg); |
| 2220 | break; |
| 2221 | case CCValAssign::BCvt: |
| 2222 | Arg = DAG.getBitcast(VT: RegVT, V: Arg); |
| 2223 | break; |
| 2224 | case CCValAssign::Indirect: { |
| 2225 | if (isByVal) { |
| 2226 | // Memcpy the argument to a temporary stack slot to prevent |
| 2227 | // the caller from seeing any modifications the callee may make |
| 2228 | // as guaranteed by the `byval` attribute. |
| 2229 | int FrameIdx = MF.getFrameInfo().CreateStackObject( |
| 2230 | Size: Flags.getByValSize(), |
| 2231 | Alignment: std::max(a: Align(16), b: Flags.getNonZeroByValAlign()), isSpillSlot: false); |
| 2232 | SDValue StackSlot = |
| 2233 | DAG.getFrameIndex(FI: FrameIdx, VT: getPointerTy(DL: DAG.getDataLayout())); |
| 2234 | Chain = |
| 2235 | CreateCopyOfByValArgument(Src: Arg, Dst: StackSlot, Chain, Flags, DAG, dl); |
| 2236 | // From now on treat this as a regular pointer |
| 2237 | Arg = StackSlot; |
| 2238 | isByVal = false; |
| 2239 | } else { |
| 2240 | // Store the argument. |
| 2241 | SDValue SpillSlot = DAG.CreateStackTemporary(VT: VA.getValVT()); |
| 2242 | int FI = cast<FrameIndexSDNode>(Val&: SpillSlot)->getIndex(); |
| 2243 | Chain = DAG.getStore( |
| 2244 | Chain, dl, Val: Arg, Ptr: SpillSlot, |
| 2245 | PtrInfo: MachinePointerInfo::getFixedStack(MF&: DAG.getMachineFunction(), FI)); |
| 2246 | Arg = SpillSlot; |
| 2247 | } |
| 2248 | break; |
| 2249 | } |
| 2250 | } |
| 2251 | |
| 2252 | if (VA.needsCustom()) { |
| 2253 | assert(VA.getValVT() == MVT::v64i1 && |
| 2254 | "Currently the only custom case is when we split v64i1 to 2 regs" ); |
| 2255 | // Split v64i1 value into two registers |
| 2256 | Passv64i1ArgInRegs(DL: dl, DAG, Arg, RegsToPass, VA, NextVA&: ArgLocs[++I], Subtarget); |
| 2257 | } else if (VA.isRegLoc()) { |
| 2258 | RegsToPass.push_back(Elt: std::make_pair(x: VA.getLocReg(), y&: Arg)); |
| 2259 | const TargetOptions &Options = DAG.getTarget().Options; |
| 2260 | if (Options.EmitCallSiteInfo) |
| 2261 | CSInfo.ArgRegPairs.emplace_back(Args: VA.getLocReg(), Args&: I); |
| 2262 | if (isVarArg && IsWin64) { |
| 2263 | // The Win64 ABI requires an argument XMM reg to be copied to the
| 2264 | // corresponding shadow reg if the callee is a varargs function.
| 2265 | Register ShadowReg; |
| 2266 | switch (VA.getLocReg()) { |
| 2267 | case X86::XMM0: ShadowReg = X86::RCX; break; |
| 2268 | case X86::XMM1: ShadowReg = X86::RDX; break; |
| 2269 | case X86::XMM2: ShadowReg = X86::R8; break; |
| 2270 | case X86::XMM3: ShadowReg = X86::R9; break; |
| 2271 | } |
| 2272 | if (ShadowReg) |
| 2273 | RegsToPass.push_back(Elt: std::make_pair(x&: ShadowReg, y&: Arg)); |
| 2274 | } |
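|      | // For example (illustrative): in a Win64 vararg call like
|      | // printf("%f", x), x is assigned XMM1, so its bits are also copied into
|      | // the shadow GPR RDX, where the callee's va_arg expects to find them.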
| 2275 | } else if (!IsSibcall && (!isTailCall || isByVal)) { |
| 2276 | assert(VA.isMemLoc()); |
| 2277 | if (!StackPtr.getNode()) |
| 2278 | StackPtr = DAG.getCopyFromReg(Chain, dl, Reg: RegInfo->getStackRegister(), |
| 2279 | VT: getPointerTy(DL: DAG.getDataLayout())); |
| 2280 | MemOpChains.push_back(Elt: LowerMemOpCallTo(Chain, StackPtr, Arg, |
| 2281 | dl, DAG, VA, Flags, isByVal)); |
| 2282 | } |
| 2283 | } |
| 2284 | |
| 2285 | if (!MemOpChains.empty()) |
| 2286 | Chain = DAG.getNode(Opcode: ISD::TokenFactor, DL: dl, VT: MVT::Other, Ops: MemOpChains); |
| 2287 | |
| 2288 | if (Subtarget.isPICStyleGOT()) { |
| 2289 | // ELF PIC requires the GOT pointer in the EBX register before function
| 2290 | // calls made via the PLT (except for regcall).
| 2291 | if (!isTailCall) { |
| 2292 | // An indirect call with the RegCall calling convention may use up all the
| 2293 | // general registers, so it is not suitable to bind the EBX register to the
| 2294 | // GOT address; just let the register allocator handle it.
| 2295 | if (CallConv != CallingConv::X86_RegCall) |
| 2296 | RegsToPass.push_back(Elt: std::make_pair( |
| 2297 | x: Register(X86::EBX), y: DAG.getNode(Opcode: X86ISD::GlobalBaseReg, DL: SDLoc(), |
| 2298 | VT: getPointerTy(DL: DAG.getDataLayout())))); |
| 2299 | } else { |
| 2300 | // If we are tail calling and generating PIC/GOT style code load the |
| 2301 | // address of the callee into ECX. The value in ecx is used as target of |
| 2302 | // the tail jump. This is done to circumvent the ebx/callee-saved problem |
| 2303 | // for tail calls on PIC/GOT architectures. Normally we would just put the |
| 2304 | // address of GOT into ebx and then call target@PLT. But for tail calls |
| 2305 | // ebx would be restored (since ebx is callee saved) before jumping to the |
| 2306 | // target@PLT. |
| 2307 | |
| 2308 | // Note: The actual moving to ECX is done further down. |
| 2309 | GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Val&: Callee); |
| 2310 | if (G && !G->getGlobal()->hasLocalLinkage() && |
| 2311 | G->getGlobal()->hasDefaultVisibility()) |
| 2312 | Callee = LowerGlobalAddress(Op: Callee, DAG); |
| 2313 | else if (isa<ExternalSymbolSDNode>(Val: Callee)) |
| 2314 | Callee = LowerExternalSymbol(Op: Callee, DAG); |
| 2315 | } |
| 2316 | } |
| 2317 | |
| 2318 | if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail && |
| 2319 | (Subtarget.hasSSE1() || !M->getModuleFlag(Key: "SkipRaxSetup" ))) { |
| 2320 | // From AMD64 ABI document: |
| 2321 | // For calls that may call functions that use varargs or stdargs |
| 2322 | // (prototype-less calls or calls to functions containing ellipsis (...) in |
| 2323 | // the declaration) %al is used as a hidden argument to specify the number
| 2324 | // of SSE registers used. The contents of %al do not need to match exactly
| 2325 | // the number of registers, but must be an upper bound on the number of SSE
| 2326 | // registers used and be in the range 0 - 8 inclusive.
| 2327 | |
| 2328 | // Count the number of XMM registers allocated. |
| 2329 | static const MCPhysReg XMMArgRegs[] = { |
| 2330 | X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, |
| 2331 | X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 |
| 2332 | }; |
| 2333 | unsigned NumXMMRegs = CCInfo.getFirstUnallocated(Regs: XMMArgRegs); |
| 2334 | assert((Subtarget.hasSSE1() || !NumXMMRegs) |
| 2335 | && "SSE registers cannot be used when SSE is disabled" ); |
| 2336 | RegsToPass.push_back(Elt: std::make_pair(x: Register(X86::AL), |
| 2337 | y: DAG.getConstant(Val: NumXMMRegs, DL: dl, |
| 2338 | VT: MVT::i8))); |
| 2339 | } |
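|      | // For example (illustrative): for a SysV call like printf("%f\n", 1.0),
|      | // one XMM register is allocated, so `movb $1, %al` is emitted before the
|      | // call; the callee's vararg prologue reads %al to decide whether the XMM
|      | // save-area spill can be skipped.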
| 2340 | |
| 2341 | if (isVarArg && IsMustTail) { |
| 2342 | const auto &Forwards = X86Info->getForwardedMustTailRegParms(); |
| 2343 | for (const auto &F : Forwards) { |
| 2344 | SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg: F.VReg, VT: F.VT); |
| 2345 | RegsToPass.push_back(Elt: std::make_pair(x: F.PReg, y&: Val)); |
| 2346 | } |
| 2347 | } |
| 2348 | |
| 2349 | // For tail calls lower the arguments to the 'real' stack slots. Sibcalls |
| 2350 | // don't need this because the eligibility check rejects calls that require |
| 2351 | // shuffling arguments passed in memory. |
| 2352 | if (!IsSibcall && isTailCall) { |
| 2353 | // Force all the incoming stack arguments to be loaded from the stack |
| 2354 | // before any new outgoing arguments or the return address are stored to the |
| 2355 | // stack, because the outgoing stack slots may alias the incoming argument |
| 2356 | // stack slots, and the alias isn't otherwise explicit. This is slightly |
| 2357 | // more conservative than necessary, because it means that each store |
| 2358 | // effectively depends on every argument instead of just those arguments it |
| 2359 | // would clobber. |
| 2360 | Chain = DAG.getStackArgumentTokenFactor(Chain); |
| 2361 | |
| 2362 | SmallVector<SDValue, 8> MemOpChains2; |
| 2363 | SDValue FIN; |
| 2364 | int FI = 0; |
| 2365 | for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E; |
| 2366 | ++I, ++OutsIndex) { |
| 2367 | CCValAssign &VA = ArgLocs[I]; |
| 2368 | |
| 2369 | if (VA.isRegLoc()) { |
| 2370 | if (VA.needsCustom()) { |
| 2371 | assert((CallConv == CallingConv::X86_RegCall) && |
| 2372 | "Expecting custom case only in regcall calling convention" ); |
| 2373 | // This means that we are in the special case where one argument was
| 2374 | // passed through two register locations - skip the next location.
| 2375 | ++I; |
| 2376 | } |
| 2377 | |
| 2378 | continue; |
| 2379 | } |
| 2380 | |
| 2381 | assert(VA.isMemLoc()); |
| 2382 | SDValue Arg = OutVals[OutsIndex]; |
| 2383 | ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags; |
| 2384 | // Skip inalloca/preallocated arguments. They don't require any work. |
| 2385 | if (Flags.isInAlloca() || Flags.isPreallocated()) |
| 2386 | continue; |
| 2387 | // Create frame index. |
| 2388 | int32_t Offset = VA.getLocMemOffset()+FPDiff; |
| 2389 | uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8; |
| 2390 | FI = MF.getFrameInfo().CreateFixedObject(Size: OpSize, SPOffset: Offset, IsImmutable: true); |
| 2391 | FIN = DAG.getFrameIndex(FI, VT: getPointerTy(DL: DAG.getDataLayout())); |
| 2392 | |
| 2393 | if (Flags.isByVal()) { |
| 2394 | // Copy relative to framepointer. |
| 2395 | SDValue Source = DAG.getIntPtrConstant(Val: VA.getLocMemOffset(), DL: dl); |
| 2396 | if (!StackPtr.getNode()) |
| 2397 | StackPtr = DAG.getCopyFromReg(Chain, dl, Reg: RegInfo->getStackRegister(), |
| 2398 | VT: getPointerTy(DL: DAG.getDataLayout())); |
| 2399 | Source = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: getPointerTy(DL: DAG.getDataLayout()), |
| 2400 | N1: StackPtr, N2: Source); |
| 2401 | |
| 2402 | MemOpChains2.push_back( |
| 2403 | Elt: CreateCopyOfByValArgument(Src: Source, Dst: FIN, Chain, Flags, DAG, dl)); |
| 2404 | } else { |
| 2405 | // Store relative to framepointer. |
| 2406 | MemOpChains2.push_back(Elt: DAG.getStore( |
| 2407 | Chain, dl, Val: Arg, Ptr: FIN, |
| 2408 | PtrInfo: MachinePointerInfo::getFixedStack(MF&: DAG.getMachineFunction(), FI))); |
| 2409 | } |
| 2410 | } |
| 2411 | |
| 2412 | if (!MemOpChains2.empty()) |
| 2413 | Chain = DAG.getNode(Opcode: ISD::TokenFactor, DL: dl, VT: MVT::Other, Ops: MemOpChains2); |
| 2414 | |
| 2415 | // Store the return address to the appropriate stack slot. |
| 2416 | Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, |
| 2417 | PtrVT: getPointerTy(DL: DAG.getDataLayout()), |
| 2418 | SlotSize: RegInfo->getSlotSize(), FPDiff, dl); |
| 2419 | } |
| 2420 | |
| 2421 | // Build a sequence of copy-to-reg nodes chained together with token chain |
| 2422 | // and glue operands which copy the outgoing args into registers. |
| 2423 | SDValue InGlue; |
| 2424 | for (const auto &[Reg, N] : RegsToPass) { |
| 2425 | Chain = DAG.getCopyToReg(Chain, dl, Reg, N, Glue: InGlue); |
| 2426 | InGlue = Chain.getValue(R: 1); |
| 2427 | } |
| 2428 | |
| 2429 | bool IsImpCall = false; |
| 2430 | if (DAG.getTarget().getCodeModel() == CodeModel::Large) { |
| 2431 | assert(Is64Bit && "Large code model is only legal in 64-bit mode." ); |
| 2432 | // In the 64-bit large code model, we have to make all calls |
| 2433 | // through a register, since the call instruction's 32-bit |
| 2434 | // pc-relative offset may not be large enough to hold the whole |
| 2435 | // address. |
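| | //
| | // For example, instead of a pc-relative "callq foo", selection produces
| | // something like (a sketch):
| | //   movabsq $foo, %rax
| | //   callq   *%rax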
| 2436 | } else if (Callee->getOpcode() == ISD::GlobalAddress || |
| 2437 | Callee->getOpcode() == ISD::ExternalSymbol) { |
| 2438 | // Lower direct calls to global addresses and external symbols. Setting |
| 2439 | // ForCall to true here has the effect of removing WrapperRIP when possible |
| 2440 | // to allow direct calls to be selected without first materializing the |
| 2441 | // address into a register. |
| 2442 | Callee = LowerGlobalOrExternal(Op: Callee, DAG, /*ForCall=*/true, IsImpCall: &IsImpCall); |
| 2443 | } else if (Subtarget.isTarget64BitILP32() && |
| 2444 | Callee.getValueType() == MVT::i32) { |
| 2445 | // Zero-extend the 32-bit Callee address to 64 bits, per the x32 ABI.
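| | // (Illustrative: a 32-bit pointer held in, say, %eax becomes the 64-bit
| | // operand %rax for the call, with the upper 32 bits known to be zero.)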
| 2446 | Callee = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: dl, VT: MVT::i64, Operand: Callee); |
| 2447 | } |
| 2448 | |
| 2449 | SmallVector<SDValue, 8> Ops; |
| 2450 | |
| 2451 | if (!IsSibcall && isTailCall && !IsMustTail) { |
| 2452 | Chain = DAG.getCALLSEQ_END(Chain, Size1: NumBytesToPop, Size2: 0, Glue: InGlue, DL: dl); |
| 2453 | InGlue = Chain.getValue(R: 1); |
| 2454 | } |
| 2455 | |
| 2456 | Ops.push_back(Elt: Chain); |
| 2457 | Ops.push_back(Elt: Callee); |
| 2458 | |
| 2459 | if (isTailCall) |
| 2460 | Ops.push_back(Elt: DAG.getSignedTargetConstant(Val: FPDiff, DL: dl, VT: MVT::i32)); |
| 2461 | |
| 2462 | // Add argument registers to the end of the list so that they are known live |
| 2463 | // into the call. |
| 2464 | for (const auto &[Reg, N] : RegsToPass) |
| 2465 | Ops.push_back(Elt: DAG.getRegister(Reg, VT: N.getValueType())); |
| 2466 | |
| 2467 | // Add a register mask operand representing the call-preserved registers. |
| 2468 | const uint32_t *Mask = [&]() { |
| 2469 | auto AdaptedCC = CallConv; |
| 2470 | // If HasNCSR is asserted (attribute NoCallerSavedRegisters exists), |
| 2471 | // use X86_INTR calling convention because it has the same CSR mask |
| 2472 | // (same preserved registers). |
| 2473 | if (HasNCSR) |
| 2474 | AdaptedCC = (CallingConv::ID)CallingConv::X86_INTR; |
| 2475 | // If NoCalleeSavedRegisters is requested, then use GHC since it happens
| 2476 | // to use the CSR_NoRegs_RegMask.
| 2477 | if (CB && CB->hasFnAttr(Kind: "no_callee_saved_registers" )) |
| 2478 | AdaptedCC = (CallingConv::ID)CallingConv::GHC; |
| 2479 | return RegInfo->getCallPreservedMask(MF, AdaptedCC); |
| 2480 | }(); |
| 2481 | assert(Mask && "Missing call preserved mask for calling convention" ); |
| 2482 | |
| 2483 | if (MachineOperand::clobbersPhysReg(RegMask: Mask, PhysReg: RegInfo->getFramePtr())) { |
| 2484 | X86Info->setFPClobberedByCall(true); |
| 2485 | if (CLI.CB && isa<InvokeInst>(Val: CLI.CB)) |
| 2486 | X86Info->setFPClobberedByInvoke(true); |
| 2487 | } |
| 2488 | if (MachineOperand::clobbersPhysReg(RegMask: Mask, PhysReg: RegInfo->getBaseRegister())) { |
| 2489 | X86Info->setBPClobberedByCall(true); |
| 2490 | if (CLI.CB && isa<InvokeInst>(Val: CLI.CB)) |
| 2491 | X86Info->setBPClobberedByInvoke(true); |
| 2492 | } |
| 2493 | |
| 2494 | // If this is an invoke in a 32-bit function using a funclet-based |
| 2495 | // personality, assume the function clobbers all registers. If an exception |
| 2496 | // is thrown, the runtime will not restore CSRs. |
| 2497 | // FIXME: Model this more precisely so that we can register allocate across |
| 2498 | // the normal edge and spill and fill across the exceptional edge. |
| 2499 | if (!Is64Bit && CLI.CB && isa<InvokeInst>(Val: CLI.CB)) { |
| 2500 | const Function &CallerFn = MF.getFunction(); |
| 2501 | EHPersonality Pers = |
| 2502 | CallerFn.hasPersonalityFn() |
| 2503 | ? classifyEHPersonality(Pers: CallerFn.getPersonalityFn()) |
| 2504 | : EHPersonality::Unknown; |
| 2505 | if (isFuncletEHPersonality(Pers)) |
| 2506 | Mask = RegInfo->getNoPreservedMask(); |
| 2507 | } |
| 2508 | |
| 2509 | // Define a new register mask from the existing mask. |
| 2510 | uint32_t *RegMask = nullptr; |
| 2511 | |
| 2512 | // In some calling conventions we need to remove the used physical registers |
| 2513 | // from the reg mask. Create a new RegMask for such calling conventions. |
| 2514 | // RegMask for calling conventions that disable only return registers (e.g. |
| 2515 | // preserve_most) will be modified later in LowerCallResult. |
| 2516 | bool ShouldDisableArgRegs = shouldDisableArgRegFromCSR(CC: CallConv) || HasNCSR; |
| 2517 | if (ShouldDisableArgRegs || shouldDisableRetRegFromCSR(CC: CallConv)) { |
| 2518 | const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo(); |
| 2519 | |
| 2520 | // Allocate a new Reg Mask and copy Mask. |
| 2521 | RegMask = MF.allocateRegMask(); |
| 2522 | unsigned RegMaskSize = MachineOperand::getRegMaskSize(NumRegs: TRI->getNumRegs()); |
| 2523 | memcpy(dest: RegMask, src: Mask, n: sizeof(RegMask[0]) * RegMaskSize); |
| 2524 | |
| 2525 | // Make sure all sub registers of the argument registers are reset |
| 2526 | // in the RegMask. |
| 2527 | if (ShouldDisableArgRegs) { |
| 2528 | for (auto const &RegPair : RegsToPass) |
| 2529 | for (MCPhysReg SubReg : TRI->subregs_inclusive(Reg: RegPair.first)) |
| 2530 | RegMask[SubReg / 32] &= ~(1u << (SubReg % 32)); |
| 2531 | } |
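| | // (Worked example with a made-up register number: SubReg == 53 clears bit
| | // 53 % 32 == 21 in word 53 / 32 == 1 of the mask.)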
| 2532 | |
| 2533 | // Create the RegMask Operand according to our updated mask. |
| 2534 | Ops.push_back(Elt: DAG.getRegisterMask(RegMask)); |
| 2535 | } else { |
| 2536 | // Create the RegMask Operand according to the static mask. |
| 2537 | Ops.push_back(Elt: DAG.getRegisterMask(RegMask: Mask)); |
| 2538 | } |
| 2539 | |
| 2540 | if (InGlue.getNode()) |
| 2541 | Ops.push_back(Elt: InGlue); |
| 2542 | |
| 2543 | if (isTailCall) { |
| 2544 | // We used to do: |
| 2545 | //// If this is the first return lowered for this function, add the regs |
| 2546 | //// to the liveout set for the function. |
| 2547 | // This isn't right, although it's probably harmless on x86; liveouts |
| 2548 | // should be computed from returns not tail calls. Consider a void |
| 2549 | // function making a tail call to a function returning int. |
| 2550 | MF.getFrameInfo().setHasTailCall(); |
| 2551 | SDValue Ret = DAG.getNode(Opcode: X86ISD::TC_RETURN, DL: dl, VT: MVT::Other, Ops); |
| 2552 | |
| 2553 | if (IsCFICall) |
| 2554 | Ret.getNode()->setCFIType(CLI.CFIType->getZExtValue()); |
| 2555 | |
| 2556 | DAG.addNoMergeSiteInfo(Node: Ret.getNode(), NoMerge: CLI.NoMerge); |
| 2557 | DAG.addCallSiteInfo(Node: Ret.getNode(), CallInfo: std::move(CSInfo)); |
| 2558 | return Ret; |
| 2559 | } |
| 2560 | |
| 2561 | // Returns a chain & a glue for retval copy to use. |
| 2562 | SDVTList NodeTys = DAG.getVTList(VT1: MVT::Other, VT2: MVT::Glue); |
| 2563 | if (IsImpCall) { |
| 2564 | Chain = DAG.getNode(Opcode: X86ISD::IMP_CALL, DL: dl, VTList: NodeTys, Ops); |
| 2565 | } else if (IsNoTrackIndirectCall) { |
| 2566 | Chain = DAG.getNode(Opcode: X86ISD::NT_CALL, DL: dl, VTList: NodeTys, Ops); |
| 2567 | } else if (CLI.CB && objcarc::hasAttachedCallOpBundle(CB: CLI.CB)) { |
| 2568 | // Calls with a "clang.arc.attachedcall" bundle are special. They should be |
| 2569 | // expanded to the call, directly followed by a special marker sequence and |
| 2570 | // a call to an ObjC library function. Use the CALL_RVMARKER to do that.
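| | //
| | // For example, IR along these lines (a sketch):
| | //   %r = call ptr @foo() [ "clang.arc.attachedcall"
| | //                          (ptr @objc_retainAutoreleasedReturnValue) ]
| | // lowers to the call, the marker sequence, and then a call to
| | // objc_retainAutoreleasedReturnValue.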
| 2571 | assert(!isTailCall && |
| 2572 | "tail calls cannot be marked with clang.arc.attachedcall" ); |
| 2573 | assert(Is64Bit && "clang.arc.attachedcall is only supported in 64bit mode" ); |
| 2574 | |
| 2575 | // Add a target global address for the retainRV/claimRV runtime function |
| 2576 | // just before the call target. |
| 2577 | Function *ARCFn = *objcarc::getAttachedARCFunction(CB: CLI.CB); |
| 2578 | auto PtrVT = getPointerTy(DL: DAG.getDataLayout()); |
| 2579 | auto GA = DAG.getTargetGlobalAddress(GV: ARCFn, DL: dl, VT: PtrVT); |
| 2580 | Ops.insert(I: Ops.begin() + 1, Elt: GA); |
| 2581 | Chain = DAG.getNode(Opcode: X86ISD::CALL_RVMARKER, DL: dl, VTList: NodeTys, Ops); |
| 2582 | } else { |
| 2583 | Chain = DAG.getNode(Opcode: X86ISD::CALL, DL: dl, VTList: NodeTys, Ops); |
| 2584 | } |
| 2585 | |
| 2586 | if (IsCFICall) |
| 2587 | Chain.getNode()->setCFIType(CLI.CFIType->getZExtValue()); |
| 2588 | |
| 2589 | InGlue = Chain.getValue(R: 1); |
| 2590 | DAG.addNoMergeSiteInfo(Node: Chain.getNode(), NoMerge: CLI.NoMerge); |
| 2591 | DAG.addCallSiteInfo(Node: Chain.getNode(), CallInfo: std::move(CSInfo)); |
| 2592 | |
| 2593 | // Save heapallocsite metadata. |
| 2594 | if (CLI.CB) |
| 2595 | if (MDNode *HeapAlloc = CLI.CB->getMetadata(Kind: "heapallocsite" )) |
| 2596 | DAG.addHeapAllocSite(Node: Chain.getNode(), MD: HeapAlloc); |
| 2597 | |
| 2598 | // Create the CALLSEQ_END node. |
| 2599 | unsigned NumBytesForCalleeToPop = 0; // Callee pops nothing. |
| 2600 | if (X86::isCalleePop(CallingConv: CallConv, is64Bit: Is64Bit, IsVarArg: isVarArg, |
| 2601 | GuaranteeTCO: DAG.getTarget().Options.GuaranteedTailCallOpt)) |
| 2602 | NumBytesForCalleeToPop = NumBytes; // Callee pops everything |
| 2603 | else if (!canGuaranteeTCO(CC: CallConv) && IsCalleePopSRet) |
| 2604 | // If this call passes a struct-return pointer, the callee |
| 2605 | // pops that struct pointer. |
| 2606 | NumBytesForCalleeToPop = 4; |
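| | // For example, a 32-bit stdcall callee taking 8 bytes of arguments returns
| | // with "ret $8", while an sret callee covered by the case above returns
| | // with "ret $4" to pop the struct-return pointer.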
| 2607 | |
| 2608 | // Returns a glue for retval copy to use. |
| 2609 | if (!IsSibcall) { |
| 2610 | Chain = DAG.getCALLSEQ_END(Chain, Size1: NumBytesToPop, Size2: NumBytesForCalleeToPop, |
| 2611 | Glue: InGlue, DL: dl); |
| 2612 | InGlue = Chain.getValue(R: 1); |
| 2613 | } |
| 2614 | |
| 2615 | if (CallingConv::PreserveNone == CallConv) |
| 2616 | for (const ISD::OutputArg &Out : Outs) { |
| 2617 | if (Out.Flags.isSwiftSelf() || Out.Flags.isSwiftAsync() || |
| 2618 | Out.Flags.isSwiftError()) { |
| 2619 | errorUnsupported(DAG, dl, |
| 2620 | Msg: "Swift attributes can't be used with preserve_none" ); |
| 2621 | break; |
| 2622 | } |
| 2623 | } |
| 2624 | |
| 2625 | // Handle result values, copying them out of physregs into vregs that we |
| 2626 | // return. |
| 2627 | return LowerCallResult(Chain, InGlue, CallConv, isVarArg, Ins, dl, DAG, |
| 2628 | InVals, RegMask); |
| 2629 | } |
| 2630 | |
| 2631 | //===----------------------------------------------------------------------===// |
| 2632 | // Fast Calling Convention (tail call) implementation |
| 2633 | //===----------------------------------------------------------------------===// |
| 2634 | |
| 2635 | // Like stdcall, the callee cleans up the arguments; the convention differs in
| 2636 | // that ECX is reserved for storing the tail-called function's address. Only 2
| 2637 | // registers are free for argument passing (inreg). Tail call optimization is
| 2638 | // performed provided:
| 2639 | // * tailcallopt is enabled |
| 2640 | // * caller/callee are fastcc |
| 2641 | // On the X86_64 architecture, with GOT-style position-independent code, only
| 2642 | // local (within-module) calls are supported at the moment.
| 2643 | // To keep the stack aligned according to the platform ABI, the function
| 2644 | // GetAlignedArgumentStackSize ensures that the argument delta is always a
| 2645 | // multiple of the stack alignment. (Dynamic linkers need this - Darwin's dyld
| 2646 | // for example.) If the tail-called callee has more arguments than the caller,
| 2647 | // the caller needs to make sure that there is room to move the RETADDR to.
| 2648 | // This is achieved by reserving an area the size of the argument delta right
| 2649 | // after the original RETADDR, but before the saved frame pointer or the
| 2650 | // spilled registers, e.g. caller(arg1, arg2) calls callee(arg1, arg2, arg3, arg4)
| 2651 | // stack layout: |
| 2652 | // arg1 |
| 2653 | // arg2 |
| 2654 | // RETADDR |
| 2655 | // [ new RETADDR |
| 2656 | // move area ] |
| 2657 | // (possible EBP) |
| 2658 | // ESI |
| 2659 | // EDI |
| 2660 | // local1 .. |
| 2661 | |
| 2662 | /// Align the stack size to be, e.g., 16n + 12 so that it meets a 16-byte
| 2663 | /// alignment requirement once the return address slot is included.
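| | ///
| | /// Worked example (assuming a 4-byte slot and 16-byte stack alignment):
| | /// StackSize 20 gives alignTo(20 + 4, 16) - 4 == 28, which is of the form
| | /// 16n + 12, so the stack is 16-byte aligned again once the return address
| | /// slot is included.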
| 2664 | unsigned |
| 2665 | X86TargetLowering::GetAlignedArgumentStackSize(const unsigned StackSize, |
| 2666 | SelectionDAG &DAG) const { |
| 2667 | const Align StackAlignment = Subtarget.getFrameLowering()->getStackAlign(); |
| 2668 | const uint64_t SlotSize = Subtarget.getRegisterInfo()->getSlotSize(); |
| 2669 | assert(StackSize % SlotSize == 0 && |
| 2670 | "StackSize must be a multiple of SlotSize" ); |
| 2671 | return alignTo(Size: StackSize + SlotSize, A: StackAlignment) - SlotSize; |
| 2672 | } |
| 2673 | |
| 2674 | /// Return true if the given stack call argument is already available in the |
| 2675 | /// same position (relatively) of the caller's incoming argument stack. |
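| | ///
| | /// For example (a sketch): if the caller received an i32 at offset 0 of its
| | /// incoming argument area and forwards that same value unchanged as the
| | /// callee's stack argument at offset 0, no store is needed and a sibcall
| | /// remains possible.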
| 2676 | static |
| 2677 | bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, |
| 2678 | MachineFrameInfo &MFI, const MachineRegisterInfo *MRI, |
| 2679 | const X86InstrInfo *TII, const CCValAssign &VA) { |
| 2680 | unsigned Bytes = Arg.getValueSizeInBits() / 8; |
| 2681 | |
| 2682 | for (;;) { |
| 2683 | // Look through nodes that don't alter the bits of the incoming value. |
| 2684 | unsigned Op = Arg.getOpcode(); |
| 2685 | if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST || |
| 2686 | Op == ISD::AssertZext) { |
| 2687 | Arg = Arg.getOperand(i: 0); |
| 2688 | continue; |
| 2689 | } |
| 2690 | if (Op == ISD::TRUNCATE) { |
| 2691 | const SDValue &TruncInput = Arg.getOperand(i: 0); |
| 2692 | if (TruncInput.getOpcode() == ISD::AssertZext && |
| 2693 | cast<VTSDNode>(Val: TruncInput.getOperand(i: 1))->getVT() == |
| 2694 | Arg.getValueType()) { |
| 2695 | Arg = TruncInput.getOperand(i: 0); |
| 2696 | continue; |
| 2697 | } |
| 2698 | } |
| 2699 | break; |
| 2700 | } |
| 2701 | |
| 2702 | int FI = INT_MAX; |
| 2703 | if (Arg.getOpcode() == ISD::CopyFromReg) { |
| 2704 | Register VR = cast<RegisterSDNode>(Val: Arg.getOperand(i: 1))->getReg(); |
| 2705 | if (!VR.isVirtual()) |
| 2706 | return false; |
| 2707 | MachineInstr *Def = MRI->getVRegDef(Reg: VR); |
| 2708 | if (!Def) |
| 2709 | return false; |
| 2710 | if (!Flags.isByVal()) { |
| 2711 | if (!TII->isLoadFromStackSlot(MI: *Def, FrameIndex&: FI)) |
| 2712 | return false; |
| 2713 | } else { |
| 2714 | unsigned Opcode = Def->getOpcode(); |
| 2715 | if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r || |
| 2716 | Opcode == X86::LEA64_32r) && |
| 2717 | Def->getOperand(i: 1).isFI()) { |
| 2718 | FI = Def->getOperand(i: 1).getIndex(); |
| 2719 | Bytes = Flags.getByValSize(); |
| 2720 | } else |
| 2721 | return false; |
| 2722 | } |
| 2723 | } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Val&: Arg)) { |
| 2724 | if (Flags.isByVal()) |
| 2725 | // ByVal argument is passed in as a pointer but it's now being |
| 2726 | // dereferenced. e.g. |
| 2727 | // define @foo(%struct.X* %A) { |
| 2728 | // tail call @bar(%struct.X* byval %A) |
| 2729 | // } |
| 2730 | return false; |
| 2731 | SDValue Ptr = Ld->getBasePtr(); |
| 2732 | FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Val&: Ptr); |
| 2733 | if (!FINode) |
| 2734 | return false; |
| 2735 | FI = FINode->getIndex(); |
| 2736 | } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) { |
| 2737 | FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Val&: Arg); |
| 2738 | FI = FINode->getIndex(); |
| 2739 | Bytes = Flags.getByValSize(); |
| 2740 | } else |
| 2741 | return false; |
| 2742 | |
| 2743 | assert(FI != INT_MAX); |
| 2744 | if (!MFI.isFixedObjectIndex(ObjectIdx: FI)) |
| 2745 | return false; |
| 2746 | |
| 2747 | if (Offset != MFI.getObjectOffset(ObjectIdx: FI)) |
| 2748 | return false; |
| 2749 | |
| 2750 | // If this is not byval, check that the argument stack object is immutable. |
| 2751 | // inalloca and argument copy elision can create mutable argument stack |
| 2752 | // objects. Byval objects can be mutated, but a byval call intends to pass the |
| 2753 | // mutated memory. |
| 2754 | if (!Flags.isByVal() && !MFI.isImmutableObjectIndex(ObjectIdx: FI)) |
| 2755 | return false; |
| 2756 | |
| 2757 | if (VA.getLocVT().getFixedSizeInBits() > |
| 2758 | Arg.getValueSizeInBits().getFixedValue()) { |
| 2759 | // If the argument location is wider than the argument type, check that any |
| 2760 | // extension flags match. |
| 2761 | if (Flags.isZExt() != MFI.isObjectZExt(ObjectIdx: FI) || |
| 2762 | Flags.isSExt() != MFI.isObjectSExt(ObjectIdx: FI)) { |
| 2763 | return false; |
| 2764 | } |
| 2765 | } |
| 2766 | |
| 2767 | return Bytes == MFI.getObjectSize(ObjectIdx: FI); |
| 2768 | } |
| 2769 | |
| 2770 | /// Check whether the call is eligible for tail call optimization. Targets |
| 2771 | /// that want to do tail call optimization should implement this function. |
| 2772 | /// Note that the x86 backend does not check musttail calls for eligibility! The |
| 2773 | /// rest of x86 tail call lowering must be prepared to forward arguments of any |
| 2774 | /// type. |
| 2775 | bool X86TargetLowering::IsEligibleForTailCallOptimization( |
| 2776 | TargetLowering::CallLoweringInfo &CLI, CCState &CCInfo, |
| 2777 | SmallVectorImpl<CCValAssign> &ArgLocs, bool IsCalleePopSRet) const { |
| 2778 | SelectionDAG &DAG = CLI.DAG; |
| 2779 | const SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs; |
| 2780 | const SmallVectorImpl<SDValue> &OutVals = CLI.OutVals; |
| 2781 | const SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins; |
| 2782 | SDValue Callee = CLI.Callee; |
| 2783 | CallingConv::ID CalleeCC = CLI.CallConv; |
| 2784 | bool isVarArg = CLI.IsVarArg; |
| 2785 | |
| 2786 | if (!mayTailCallThisCC(CC: CalleeCC)) |
| 2787 | return false; |
| 2788 | |
| 2789 | // If -tailcallopt is specified, make fastcc functions tail-callable. |
| 2790 | MachineFunction &MF = DAG.getMachineFunction(); |
| 2791 | const Function &CallerF = MF.getFunction(); |
| 2792 | |
| 2793 | // If the function return type is x86_fp80 and the callee return type is not, |
| 2794 | // then the FP_EXTEND of the call result is not a nop. It's not safe to |
| 2795 | // perform a tailcall optimization here. |
| 2796 | if (CallerF.getReturnType()->isX86_FP80Ty() && !CLI.RetTy->isX86_FP80Ty()) |
| 2797 | return false; |
| 2798 | |
| 2799 | CallingConv::ID CallerCC = CallerF.getCallingConv(); |
| 2800 | bool CCMatch = CallerCC == CalleeCC; |
| 2801 | bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CC: CalleeCC); |
| 2802 | bool IsCallerWin64 = Subtarget.isCallingConvWin64(CC: CallerCC); |
| 2803 | bool IsGuaranteeTCO = DAG.getTarget().Options.GuaranteedTailCallOpt || |
| 2804 | CalleeCC == CallingConv::Tail || CalleeCC == CallingConv::SwiftTail; |
| 2805 | |
| 2806 | // Win64 functions have extra shadow space for argument homing. Don't do the |
| 2807 | // sibcall if the caller and callee have mismatched expectations for this |
| 2808 | // space. |
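| | // (The Win64 shadow area is 32 bytes: four 8-byte home slots for RCX, RDX,
| | // R8 and R9.)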
| 2809 | if (IsCalleeWin64 != IsCallerWin64) |
| 2810 | return false; |
| 2811 | |
| 2812 | if (IsGuaranteeTCO) { |
| 2813 | if (canGuaranteeTCO(CC: CalleeCC) && CCMatch) |
| 2814 | return true; |
| 2815 | return false; |
| 2816 | } |
| 2817 | |
| 2818 | // Look for obvious safe cases to perform tail call optimization that do not |
| 2819 | // require ABI changes. This is what gcc calls sibcall. |
| 2820 | |
| 2821 | // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to |
| 2822 | // emit a special epilogue. |
| 2823 | const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); |
| 2824 | if (RegInfo->hasStackRealignment(MF)) |
| 2825 | return false; |
| 2826 | |
| 2827 | // Also avoid sibcall optimization if the caller returns via sret and the
| 2828 | // callee is incompatible. See the comment in LowerReturn about why
| 2829 | // hasStructRetAttr is insufficient.
| 2830 | if (MF.getInfo<X86MachineFunctionInfo>()->getSRetReturnReg()) { |
| 2831 | // For a compatible tail call the callee must return our sret pointer. So it |
| 2832 | // needs to be (a) an sret function itself and (b) we pass our sret as its |
| 2833 | // sret. Condition #b is harder to determine. |
| 2834 | return false; |
| 2835 | } else if (IsCalleePopSRet) |
| 2836 | // The callee pops an sret, so we cannot tail-call, as our caller doesn't |
| 2837 | // expect that. |
| 2838 | return false; |
| 2839 | |
| 2840 | // Do not sibcall optimize vararg calls unless all arguments are passed via |
| 2841 | // registers. |
| 2842 | LLVMContext &C = *DAG.getContext(); |
| 2843 | if (isVarArg && !Outs.empty()) { |
| 2844 | // Optimizing for varargs on Win64 is unlikely to be safe without |
| 2845 | // additional testing. |
| 2846 | if (IsCalleeWin64 || IsCallerWin64) |
| 2847 | return false; |
| 2848 | |
| 2849 | for (const auto &VA : ArgLocs) |
| 2850 | if (!VA.isRegLoc()) |
| 2851 | return false; |
| 2852 | } |
| 2853 | |
| 2854 | // If the call result is in ST0 / ST1, it needs to be popped off the x87 |
| 2855 | // stack. Therefore, if it's not used by the call it is not safe to optimize |
| 2856 | // this into a sibcall. |
| 2857 | bool Unused = false; |
| 2858 | for (const auto &In : Ins) { |
| 2859 | if (!In.Used) { |
| 2860 | Unused = true; |
| 2861 | break; |
| 2862 | } |
| 2863 | } |
| 2864 | if (Unused) { |
| 2865 | SmallVector<CCValAssign, 16> RVLocs; |
| 2866 | CCState RVCCInfo(CalleeCC, false, MF, RVLocs, C); |
| 2867 | RVCCInfo.AnalyzeCallResult(Ins, Fn: RetCC_X86); |
| 2868 | for (const auto &VA : RVLocs) { |
| 2869 | if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) |
| 2870 | return false; |
| 2871 | } |
| 2872 | } |
| 2873 | |
| 2874 | // Check that the call results are passed in the same way. |
| 2875 | if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins, |
| 2876 | CalleeFn: RetCC_X86, CallerFn: RetCC_X86)) |
| 2877 | return false; |
| 2878 | // The callee has to preserve all registers the caller needs to preserve. |
| 2879 | const X86RegisterInfo *TRI = Subtarget.getRegisterInfo(); |
| 2880 | const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC); |
| 2881 | if (!CCMatch) { |
| 2882 | const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC); |
| 2883 | if (!TRI->regmaskSubsetEqual(mask0: CallerPreserved, mask1: CalleePreserved)) |
| 2884 | return false; |
| 2885 | } |
| 2886 | |
| 2887 | // The caller's stack frame cannot be replaced by the tail callee's if the
| 2888 | // caller is required to preserve all the registers. Conservatively prevent
| 2889 | // tail call optimization even if, hypothetically, all the registers were
| 2890 | // used for passing formal parameters or returning values.
| 2891 | if (CallerF.hasFnAttribute(Kind: "no_caller_saved_registers" )) |
| 2892 | return false; |
| 2893 | |
| 2894 | unsigned StackArgsSize = CCInfo.getStackSize(); |
| 2895 | |
| 2896 | // If the callee takes no arguments then go on to check the results of the |
| 2897 | // call. |
| 2898 | if (!Outs.empty()) { |
| 2899 | if (StackArgsSize > 0) { |
| 2900 | // Check if the arguments are already laid out in the right way as |
| 2901 | // the caller's fixed stack objects. |
| 2902 | MachineFrameInfo &MFI = MF.getFrameInfo(); |
| 2903 | const MachineRegisterInfo *MRI = &MF.getRegInfo(); |
| 2904 | const X86InstrInfo *TII = Subtarget.getInstrInfo(); |
| 2905 | for (unsigned I = 0, E = ArgLocs.size(); I != E; ++I) { |
| 2906 | const CCValAssign &VA = ArgLocs[I]; |
| 2907 | SDValue Arg = OutVals[I]; |
| 2908 | ISD::ArgFlagsTy Flags = Outs[I].Flags; |
| 2909 | if (VA.getLocInfo() == CCValAssign::Indirect) |
| 2910 | return false; |
| 2911 | if (!VA.isRegLoc()) { |
| 2912 | if (!MatchingStackOffset(Arg, Offset: VA.getLocMemOffset(), Flags, MFI, MRI, |
| 2913 | TII, VA)) |
| 2914 | return false; |
| 2915 | } |
| 2916 | } |
| 2917 | } |
| 2918 | |
| 2919 | bool PositionIndependent = isPositionIndependent(); |
| 2920 | // If the tailcall address may be in a register, then make sure it's |
| 2921 | // possible to register allocate for it. In 32-bit, the call address can |
| 2922 | // only target EAX, EDX, or ECX since the tail call must be scheduled after |
| 2923 | // callee-saved registers are restored. These happen to be the same |
| 2924 | // registers used to pass 'inreg' arguments so watch out for those. |
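| | //
| | // For example, a 32-bit indirect tail call is emitted as something like
| | //   jmpl *%ecx
| | // so if inreg arguments already occupy EAX, ECX and EDX there is no
| | // register left for the target address; PIC additionally needs a register
| | // for the address computation, hence MaxInRegs of 2 below.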
| 2925 | if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Val: Callee) && |
| 2926 | !isa<ExternalSymbolSDNode>(Val: Callee)) || |
| 2927 | PositionIndependent)) { |
| 2928 | unsigned NumInRegs = 0; |
| 2929 | // In PIC we need an extra register to formulate the address computation |
| 2930 | // for the callee. |
| 2931 | unsigned MaxInRegs = PositionIndependent ? 2 : 3; |
| 2932 | |
| 2933 | for (const auto &VA : ArgLocs) { |
| 2934 | if (!VA.isRegLoc()) |
| 2935 | continue; |
| 2936 | Register Reg = VA.getLocReg(); |
| 2937 | switch (Reg) { |
| 2938 | default: break; |
| 2939 | case X86::EAX: case X86::EDX: case X86::ECX: |
| 2940 | if (++NumInRegs == MaxInRegs) |
| 2941 | return false; |
| 2942 | break; |
| 2943 | } |
| 2944 | } |
| 2945 | } |
| 2946 | |
| 2947 | const MachineRegisterInfo &MRI = MF.getRegInfo(); |
| 2948 | if (!parametersInCSRMatch(MRI, CallerPreservedMask: CallerPreserved, ArgLocs, OutVals)) |
| 2949 | return false; |
| 2950 | } |
| 2951 | |
| 2952 | bool CalleeWillPop = |
| 2953 | X86::isCalleePop(CallingConv: CalleeCC, is64Bit: Subtarget.is64Bit(), IsVarArg: isVarArg, |
| 2954 | GuaranteeTCO: MF.getTarget().Options.GuaranteedTailCallOpt); |
| 2955 | |
| 2956 | if (unsigned BytesToPop = |
| 2957 | MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) { |
| 2958 | // If we have bytes to pop, the callee must pop them. |
| 2959 | bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize; |
| 2960 | if (!CalleePopMatches) |
| 2961 | return false; |
| 2962 | } else if (CalleeWillPop && StackArgsSize > 0) { |
| 2963 | // If we don't have bytes to pop, make sure the callee doesn't pop any. |
| 2964 | return false; |
| 2965 | } |
| 2966 | |
| 2967 | return true; |
| 2968 | } |
| 2969 | |
| 2970 | /// Determines whether the callee is required to pop its own arguments. |
| 2971 | /// Callee pop is necessary to support tail calls. |
| 2972 | bool X86::isCalleePop(CallingConv::ID CallingConv, |
| 2973 | bool is64Bit, bool IsVarArg, bool GuaranteeTCO) { |
| 2974 | // If GuaranteeTCO is true, we force some calls to be callee pop so that we |
| 2975 | // can guarantee TCO. |
| 2976 | if (!IsVarArg && shouldGuaranteeTCO(CC: CallingConv, GuaranteedTailCallOpt: GuaranteeTCO)) |
| 2977 | return true; |
| 2978 | |
| 2979 | switch (CallingConv) { |
| 2980 | default: |
| 2981 | return false; |
| 2982 | case CallingConv::X86_StdCall: |
| 2983 | case CallingConv::X86_FastCall: |
| 2984 | case CallingConv::X86_ThisCall: |
| 2985 | case CallingConv::X86_VectorCall: |
| 2986 | return !is64Bit; |
| 2987 | } |
| 2988 | } |
| 2989 | |