1//===- llvm/lib/Target/X86/X86ISelCallLowering.cpp - Call lowering --------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This file implements the lowering of LLVM calls to DAG nodes.
11//
12//===----------------------------------------------------------------------===//
13
14#include "MCTargetDesc/X86MCAsmInfo.h"
15#include "X86.h"
16#include "X86CallingConv.h"
17#include "X86FrameLowering.h"
18#include "X86ISelLowering.h"
19#include "X86InstrBuilder.h"
20#include "X86MachineFunctionInfo.h"
21#include "X86TargetMachine.h"
22#include "llvm/ADT/Statistic.h"
23#include "llvm/Analysis/ObjCARCUtil.h"
24#include "llvm/CodeGen/MachineJumpTableInfo.h"
25#include "llvm/CodeGen/MachineModuleInfo.h"
26#include "llvm/CodeGen/WinEHFuncInfo.h"
27#include "llvm/IR/DiagnosticInfo.h"
28#include "llvm/IR/IRBuilder.h"
29#include "llvm/IR/Module.h"
30#include "llvm/Transforms/CFGuard.h"
31
32#define DEBUG_TYPE "x86-isel"
33
34using namespace llvm;
35
36STATISTIC(NumTailCalls, "Number of tail calls");
37
/// Call this when the user attempts to do something unsupported, like
/// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike
/// report_fatal_error, so calling code should attempt to recover without
/// crashing.
///
/// \param DAG  the DAG being lowered (supplies MachineFunction and context).
/// \param dl   location whose DebugLoc is attached to the diagnostic.
/// \param Msg  human-readable description of the unsupported construct.
static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl,
                             const char *Msg) {
  MachineFunction &MF = DAG.getMachineFunction();
  // Emit a non-fatal diagnostic tied to the current function; compilation
  // continues so the caller can substitute a fallback lowering.
  DAG.getContext()->diagnose(
      DI: DiagnosticInfoUnsupported(MF.getFunction(), Msg, dl.getDebugLoc()));
}
48
49/// Returns true if a CC can dynamically exclude a register from the list of
50/// callee-saved-registers (TargetRegistryInfo::getCalleeSavedRegs()) based on
51/// the return registers.
52static bool shouldDisableRetRegFromCSR(CallingConv::ID CC) {
53 switch (CC) {
54 default:
55 return false;
56 case CallingConv::X86_RegCall:
57 case CallingConv::PreserveMost:
58 case CallingConv::PreserveAll:
59 return true;
60 }
61}
62
63/// Returns true if a CC can dynamically exclude a register from the list of
64/// callee-saved-registers (TargetRegistryInfo::getCalleeSavedRegs()) based on
65/// the parameters.
66static bool shouldDisableArgRegFromCSR(CallingConv::ID CC) {
67 return CC == CallingConv::X86_RegCall;
68}
69
/// Decide which register type (and how many registers) to use for a vXi1
/// mask argument/return of \p NumElts elements under calling convention
/// \p CC on an AVX512 subtarget.
/// \returns {MVT::INVALID_SIMPLE_VALUE_TYPE, 0} when no special handling
/// applies and the generic breakdown should be used instead.
static std::pair<MVT, unsigned>
handleMaskRegisterForCallingConv(unsigned NumElts, CallingConv::ID CC,
                                 const X86Subtarget &Subtarget) {
  // v2i1/v4i1/v8i1/v16i1 all pass in xmm registers unless the calling
  // convention is one that uses k registers.
  if (NumElts == 2)
    return {MVT::v2i64, 1};
  if (NumElts == 4)
    return {MVT::v4i32, 1};
  if (NumElts == 8 && CC != CallingConv::X86_RegCall &&
      CC != CallingConv::Intel_OCL_BI)
    return {MVT::v8i16, 1};
  if (NumElts == 16 && CC != CallingConv::X86_RegCall &&
      CC != CallingConv::Intel_OCL_BI)
    return {MVT::v16i8, 1};
  // v32i1 passes in ymm unless we have BWI and the calling convention is
  // regcall.
  if (NumElts == 32 && (!Subtarget.hasBWI() || CC != CallingConv::X86_RegCall))
    return {MVT::v32i8, 1};
  // Split v64i1 vectors if we don't have v64i8 available.
  if (NumElts == 64 && Subtarget.hasBWI() && CC != CallingConv::X86_RegCall) {
    if (Subtarget.useAVX512Regs())
      return {MVT::v64i8, 1};
    return {MVT::v32i8, 2};
  }

  // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
  if (!isPowerOf2_32(Value: NumElts) || (NumElts == 64 && !Subtarget.hasBWI()) ||
      NumElts > 64)
    return {MVT::i8, NumElts};

  // No special case matched; defer to target-independent logic.
  return {MVT::INVALID_SIMPLE_VALUE_TYPE, 0};
}
103
/// Return the register type used to pass/return \p VT for calling convention
/// \p CC. X86 overrides the generic answer for AVX512 vXi1 masks, narrow f16
/// vectors, f64/f80 on 32-bit targets without x87, and bf16 when f16 is legal.
MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
                                                     CallingConv::ID CC,
                                                     EVT VT) const {
  if (VT.isVector()) {
    if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512()) {
      unsigned NumElts = VT.getVectorNumElements();

      MVT RegisterVT;
      unsigned NumRegisters;
      std::tie(args&: RegisterVT, args&: NumRegisters) =
          handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
      if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
        return RegisterVT;
    }

    // Narrow f16 vectors are widened to a full 128-bit v8f16 register.
    if (VT.getVectorElementType() == MVT::f16 && VT.getVectorNumElements() < 8)
      return MVT::v8f16;
  }

  // We will use more GPRs for f64 and f80 on 32 bits when x87 is disabled.
  if ((VT == MVT::f64 || VT == MVT::f80) && !Subtarget.is64Bit() &&
      !Subtarget.hasX87())
    return MVT::i32;

  // When f16 is legal, bf16 travels in the same registers as f16.
  if (isTypeLegal(VT: MVT::f16)) {
    if (VT.isVector() && VT.getVectorElementType() == MVT::bf16)
      return getRegisterTypeForCallingConv(
          Context, CC, VT: VT.changeVectorElementType(Context, EltVT: MVT::f16));

    if (VT == MVT::bf16)
      return MVT::f16;
  }

  return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
}
139
/// Return how many registers \p VT occupies for calling convention \p CC.
/// Must mirror the special cases in getRegisterTypeForCallingConv above so
/// the (type, count) pair stays consistent.
unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
                                                          CallingConv::ID CC,
                                                          EVT VT) const {
  if (VT.isVector()) {
    if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512()) {
      unsigned NumElts = VT.getVectorNumElements();

      MVT RegisterVT;
      unsigned NumRegisters;
      std::tie(args&: RegisterVT, args&: NumRegisters) =
          handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
      if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
        return NumRegisters;
    }

    // Narrow f16 vectors fit in a single (widened) v8f16 register.
    if (VT.getVectorElementType() == MVT::f16 && VT.getVectorNumElements() < 8)
      return 1;
  }

  // We have to split f64 to 2 registers and f80 to 3 registers on 32 bits if
  // x87 is disabled.
  if (!Subtarget.is64Bit() && !Subtarget.hasX87()) {
    if (VT == MVT::f64)
      return 2;
    if (VT == MVT::f80)
      return 3;
  }

  // bf16 vectors use the same register count as the matching f16 vectors.
  if (VT.isVector() && VT.getVectorElementType() == MVT::bf16 &&
      isTypeLegal(VT: MVT::f16))
    return getNumRegistersForCallingConv(
        Context, CC, VT: VT.changeVectorElementType(Context, EltVT: MVT::f16));

  return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
}
175
/// Compute the intermediate-value/register breakdown for a vector argument,
/// handling the X86-specific vXi1 and bf16 cases before deferring to the
/// generic TargetLowering implementation.
unsigned X86TargetLowering::getVectorTypeBreakdownForCallingConv(
    LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
    unsigned &NumIntermediates, MVT &RegisterVT) const {
  // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
  if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
      Subtarget.hasAVX512() &&
      (!isPowerOf2_32(Value: VT.getVectorNumElements()) ||
       (VT.getVectorNumElements() == 64 && !Subtarget.hasBWI()) ||
       VT.getVectorNumElements() > 64)) {
    RegisterVT = MVT::i8;
    IntermediateVT = MVT::i1;
    NumIntermediates = VT.getVectorNumElements();
    return NumIntermediates;
  }

  // Split v64i1 vectors if we don't have v64i8 available.
  if (VT == MVT::v64i1 && Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
      CC != CallingConv::X86_RegCall) {
    RegisterVT = MVT::v32i8;
    IntermediateVT = MVT::v32i1;
    NumIntermediates = 2;
    return 2;
  }

  // Split vNbf16 vectors according to vNf16.
  if (VT.isVector() && VT.getVectorElementType() == MVT::bf16 &&
      isTypeLegal(VT: MVT::f16))
    VT = VT.changeVectorElementType(Context, EltVT: MVT::f16);

  return TargetLowering::getVectorTypeBreakdownForCallingConv(Context, CC, VT, IntermediateVT,
                                                              NumIntermediates, RegisterVT);
}
208
/// Return the value type produced by a setcc of \p VT: i8 for scalars;
/// a vXi1 mask where AVX512(+VLX/BWI) supports it; otherwise an integer
/// vector of matching element width.
EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
                                          LLVMContext& Context,
                                          EVT VT) const {
  if (!VT.isVector())
    return MVT::i8;

  if (Subtarget.hasAVX512()) {
    // Figure out what this type will be legalized to.
    EVT LegalVT = VT;
    while (getTypeAction(Context, VT: LegalVT) != TypeLegal)
      LegalVT = getTypeToTransformTo(Context, VT: LegalVT);

    // If we got a 512-bit vector then we'll definitely have a vXi1 compare.
    if (LegalVT.getSimpleVT().is512BitVector())
      return EVT::getVectorVT(Context, VT: MVT::i1, EC: VT.getVectorElementCount());

    if (LegalVT.getSimpleVT().isVector() && Subtarget.hasVLX()) {
      // If we legalized to less than a 512-bit vector, then we will use a vXi1
      // compare for vXi32/vXi64 for sure. If we have BWI we will also support
      // vXi16/vXi8.
      MVT EltVT = LegalVT.getSimpleVT().getVectorElementType();
      if (Subtarget.hasBWI() || EltVT.getSizeInBits() >= 32)
        return EVT::getVectorVT(Context, VT: MVT::i1, EC: VT.getVectorElementCount());
    }
  }

  // Fall back to an integer vector with the same element count as VT.
  return VT.changeVectorElementTypeToInteger();
}
237
238bool X86TargetLowering::functionArgumentNeedsConsecutiveRegisters(
239 Type *Ty, CallingConv::ID CallConv, bool isVarArg,
240 const DataLayout &DL) const {
241 // On x86-64 i128 is split into two i64s and needs to be allocated to two
242 // consecutive registers, or spilled to the stack as a whole. On x86-32 i128
243 // is split to four i32s and never actually passed in registers, but we use
244 // the consecutive register mark to match it in TableGen.
245 if (Ty->isIntegerTy(Bitwidth: 128))
246 return true;
247
248 // On x86-32, fp128 acts the same as i128.
249 if (Subtarget.is32Bit() && Ty->isFP128Ty())
250 return true;
251
252 return false;
253}
254
/// Helper for getByValTypeAlignment to determine
/// the desired ByVal argument alignment.
/// Recurses through arrays and structs looking for a 128-bit vector; 16 is
/// the maximum alignment of interest, so recursion stops once it is reached.
///
/// \param Ty        the type being inspected.
/// \param MaxAlign  [in,out] running maximum alignment found so far.
static void getMaxByValAlign(Type *Ty, Align &MaxAlign) {
  // Already at the cap; nothing larger can be found.
  if (MaxAlign == 16)
    return;
  if (VectorType *VTy = dyn_cast<VectorType>(Val: Ty)) {
    // A 128-bit (SSE) vector forces 16-byte alignment.
    if (VTy->getPrimitiveSizeInBits().getFixedValue() == 128)
      MaxAlign = Align(16);
  } else if (ArrayType *ATy = dyn_cast<ArrayType>(Val: Ty)) {
    Align EltAlign;
    getMaxByValAlign(Ty: ATy->getElementType(), MaxAlign&: EltAlign);
    if (EltAlign > MaxAlign)
      MaxAlign = EltAlign;
  } else if (StructType *STy = dyn_cast<StructType>(Val: Ty)) {
    for (auto *EltTy : STy->elements()) {
      Align EltAlign;
      getMaxByValAlign(Ty: EltTy, MaxAlign&: EltAlign);
      if (EltAlign > MaxAlign)
        MaxAlign = EltAlign;
      // Stop scanning struct members once the cap is reached.
      if (MaxAlign == 16)
        break;
    }
  }
}
279
/// Return the desired alignment for ByVal aggregate
/// function arguments in the caller parameter area. For X86, aggregates
/// that contain SSE vectors are placed at 16-byte boundaries while the rest
/// are at 4-byte boundaries.
Align X86TargetLowering::getByValTypeAlignment(Type *Ty,
                                               const DataLayout &DL) const {
  // On x86-64 use the ABI alignment, but never less than 8 bytes.
  if (Subtarget.is64Bit())
    return std::max(a: DL.getABITypeAlign(Ty), b: Align::Constant<8>());

  // 32-bit default is 4 bytes; bump to 16 if the type contains an SSE vector.
  Align Alignment(4);
  if (Subtarget.hasSSE1())
    getMaxByValAlign(Ty, MaxAlign&: Alignment);
  return Alignment;
}
294
/// It returns EVT::Other if the type should be determined using generic
/// target-independent logic.
/// For vector ops we check that the overall size isn't larger than our
/// preferred vector width.
/// Picks the widest type the subtarget handles well, from 512-bit vectors
/// down to i32, honoring NoImplicitFloat and unaligned-access penalties.
EVT X86TargetLowering::getOptimalMemOpType(
    LLVMContext &Context, const MemOp &Op,
    const AttributeList &FuncAttributes) const {
  // Vector/FP types are only eligible when implicit float use is allowed.
  if (!FuncAttributes.hasFnAttr(Kind: Attribute::NoImplicitFloat)) {
    if (Op.size() >= 16 &&
        (!Subtarget.isUnalignedMem16Slow() || Op.isAligned(AlignCheck: Align(16)))) {
      // FIXME: Check if unaligned 64-byte accesses are slow.
      if (Op.size() >= 64 && Subtarget.hasAVX512() &&
          (Subtarget.getPreferVectorWidth() >= 512)) {
        return Subtarget.hasBWI() ? MVT::v64i8 : MVT::v16i32;
      }
      // FIXME: Check if unaligned 32-byte accesses are slow.
      if (Op.size() >= 32 && Subtarget.hasAVX() &&
          Subtarget.useLight256BitInstructions()) {
        // Although this isn't a well-supported type for AVX1, we'll let
        // legalization and shuffle lowering produce the optimal codegen. If we
        // choose an optimal type with a vector element larger than a byte,
        // getMemsetStores() may create an intermediate splat (using an integer
        // multiply) before we splat as a vector.
        return MVT::v32i8;
      }
      if (Subtarget.hasSSE2() && (Subtarget.getPreferVectorWidth() >= 128))
        return MVT::v16i8;
      // TODO: Can SSE1 handle a byte vector?
      // If we have SSE1 registers we should be able to use them.
      if (Subtarget.hasSSE1() && (Subtarget.is64Bit() || Subtarget.hasX87()) &&
          (Subtarget.getPreferVectorWidth() >= 128))
        return MVT::v4f32;
    } else if (((Op.isMemcpy() && !Op.isMemcpyStrSrc()) || Op.isZeroMemset()) &&
               Op.size() >= 8 && !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
      // Do not use f64 to lower memcpy if source is string constant. It's
      // better to use i32 to avoid the loads.
      // Also, do not use f64 to lower memset unless this is a memset of zeros.
      // The gymnastics of splatting a byte value into an XMM register and then
      // only using 8-byte stores (because this is a CPU with slow unaligned
      // 16-byte accesses) makes that a loser.
      return MVT::f64;
    }
  }
  // This is a compromise. If we reach here, unaligned accesses may be slow on
  // this target. However, creating smaller, aligned accesses could be even
  // slower and would certainly be a lot more code.
  if (Subtarget.is64Bit() && Op.size() >= 8)
    return MVT::i64;
  return MVT::i32;
}
345
346bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
347 if (VT == MVT::f32)
348 return Subtarget.hasSSE1();
349 if (VT == MVT::f64)
350 return Subtarget.hasSSE2();
351 return true;
352}
353
354static bool isBitAligned(Align Alignment, uint64_t SizeInBits) {
355 return (8 * Alignment.value()) % SizeInBits == 0;
356}
357
/// Heuristic: an access is "fast" when it is naturally bit-aligned, or when
/// the subtarget handles unaligned accesses of that width without penalty.
bool X86TargetLowering::isMemoryAccessFast(EVT VT, Align Alignment) const {
  if (isBitAligned(Alignment, SizeInBits: VT.getSizeInBits()))
    return true;
  switch (VT.getSizeInBits()) {
  default:
    // 8-byte and under are always assumed to be fast.
    return true;
  case 128:
    return !Subtarget.isUnalignedMem16Slow();
  case 256:
    return !Subtarget.isUnalignedMem32Slow();
  // TODO: What about AVX-512 (512-bit) accesses?
  }
}
372
/// X86 permits misaligned accesses of any size except non-temporal vector
/// ops, which carry alignment requirements. \p Fast (if non-null) receives
/// whether the access is expected to be fast on this subtarget.
bool X86TargetLowering::allowsMisalignedMemoryAccesses(
    EVT VT, unsigned, Align Alignment, MachineMemOperand::Flags Flags,
    unsigned *Fast) const {
  if (Fast)
    *Fast = isMemoryAccessFast(VT, Alignment);
  // NonTemporal vector memory ops must be aligned.
  if (!!(Flags & MachineMemOperand::MONonTemporal) && VT.isVector()) {
    // NT loads can only be vector aligned, so if its less aligned than the
    // minimum vector size (which we can split the vector down to), we might as
    // well use a regular unaligned vector load.
    // We don't have any NT loads pre-SSE41.
    if (!!(Flags & MachineMemOperand::MOLoad))
      return (Alignment < 16 || !Subtarget.hasSSE41());
    // NT stores are never allowed misaligned.
    return false;
  }
  // Misaligned accesses of any size are always allowed.
  return true;
}
391
/// Like allowsMisalignedMemoryAccesses, but additionally verifies that
/// non-temporal vector ops of the given size actually exist on this
/// subtarget (per SSE/AVX feature level).
bool X86TargetLowering::allowsMemoryAccess(LLVMContext &Context,
                                           const DataLayout &DL, EVT VT,
                                           unsigned AddrSpace, Align Alignment,
                                           MachineMemOperand::Flags Flags,
                                           unsigned *Fast) const {
  if (Fast)
    *Fast = isMemoryAccessFast(VT, Alignment);
  if (!!(Flags & MachineMemOperand::MONonTemporal) && VT.isVector()) {
    // If the misaligned path accepts it (it will be lowered as a regular
    // access), we are done.
    if (allowsMisalignedMemoryAccesses(VT, AddrSpace, Alignment, Flags,
                                       /*Fast=*/nullptr))
      return true;
    // NonTemporal vector memory ops are special, and must be aligned.
    if (!isBitAligned(Alignment, SizeInBits: VT.getSizeInBits()))
      return false;
    switch (VT.getSizeInBits()) {
    case 128:
      // 128-bit NT loads need SSE4.1; NT stores need SSE2.
      if (!!(Flags & MachineMemOperand::MOLoad) && Subtarget.hasSSE41())
        return true;
      if (!!(Flags & MachineMemOperand::MOStore) && Subtarget.hasSSE2())
        return true;
      return false;
    case 256:
      // 256-bit NT loads need AVX2; NT stores need AVX.
      if (!!(Flags & MachineMemOperand::MOLoad) && Subtarget.hasAVX2())
        return true;
      if (!!(Flags & MachineMemOperand::MOStore) && Subtarget.hasAVX())
        return true;
      return false;
    case 512:
      if (Subtarget.hasAVX512())
        return true;
      return false;
    default:
      return false; // Don't have NonTemporal vector memory ops of this size.
    }
  }
  return true;
}
429
/// Return the entry encoding for a jump table in the
/// current function. The returned value is a member of the
/// MachineJumpTableInfo::JTEntryKind enum.
unsigned X86TargetLowering::getJumpTableEncoding() const {
  // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
  // symbol.
  if (isPositionIndependent() && Subtarget.isPICStyleGOT())
    return MachineJumpTableInfo::EK_Custom32;
  // PIC large code model (except COFF) uses 64-bit label differences.
  if (isPositionIndependent() &&
      getTargetMachine().getCodeModel() == CodeModel::Large &&
      !Subtarget.isTargetCOFF())
    return MachineJumpTableInfo::EK_LabelDifference64;

  // Otherwise, use the normal jump table encoding heuristics.
  return TargetLowering::getJumpTableEncoding();
}
446
// Soft-float lowering is a subtarget property; simply forward it.
bool X86TargetLowering::useSoftFloat() const {
  return Subtarget.useSoftFloat();
}
450
/// On 32-bit x86 with C/stdcall conventions, flag the leading integer
/// arguments of a libcall as "inreg" according to the module's
/// register-parameter count (regparm).
void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC,
                                              ArgListTy &Args) const {

  // Only relabel X86-32 for C / Stdcall CCs.
  if (Subtarget.is64Bit())
    return;
  if (CC != CallingConv::C && CC != CallingConv::X86_StdCall)
    return;
  // The module records how many integer parameter registers are available.
  unsigned ParamRegs = 0;
  if (auto *M = MF->getFunction().getParent())
    ParamRegs = M->getNumberRegisterParameters();

  // Mark the first N int arguments as having reg
  for (auto &Arg : Args) {
    Type *T = Arg.Ty;
    if (T->isIntOrPtrTy())
      if (MF->getDataLayout().getTypeAllocSize(Ty: T) <= 8) {
        // An argument wider than 4 bytes consumes two 32-bit registers.
        unsigned numRegs = 1;
        if (MF->getDataLayout().getTypeAllocSize(Ty: T) > 4)
          numRegs = 2;
        // Stop once the remaining register budget is exhausted.
        if (ParamRegs < numRegs)
          return;
        ParamRegs -= numRegs;
        Arg.IsInReg = true;
      }
  }
}
478
/// Emit an EK_Custom32 jump table entry: the basic block's symbol with a
/// @GOTOFF relocation, as used by 32-bit GOT-style PIC (see
/// getJumpTableEncoding above).
const MCExpr *
X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
                                             const MachineBasicBlock *MBB,
                                             unsigned uid,MCContext &Ctx) const{
  assert(isPositionIndependent() && Subtarget.isPICStyleGOT());
  // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
  // entries.
  return MCSymbolRefExpr::create(Symbol: MBB->getSymbol(), specifier: X86::S_GOTOFF, Ctx);
}
488
/// Returns relocation base for the given PIC jumptable.
/// On 32-bit targets this is the PIC global base register; 64-bit targets
/// use the jump table address itself.
SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
                                                    SelectionDAG &DAG) const {
  if (!Subtarget.is64Bit())
    // This doesn't have SDLoc associated with it, but is not really the
    // same as a Register.
    return DAG.getNode(Opcode: X86ISD::GlobalBaseReg, DL: SDLoc(),
                       VT: getPointerTy(DL: DAG.getDataLayout()));
  return Table;
}
499
/// This returns the relocation base for the given PIC jumptable,
/// the same as getPICJumpTableRelocBase, but as an MCExpr.
const MCExpr *X86TargetLowering::
getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
                             MCContext &Ctx) const {
  // X86-64 uses RIP relative addressing based on the jump table label.
  if (Subtarget.isPICStyleRIPRel() ||
      (Subtarget.is64Bit() &&
       getTargetMachine().getCodeModel() == CodeModel::Large))
    return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);

  // Otherwise, the reference is relative to the PIC base.
  return MCSymbolRefExpr::create(Symbol: MF->getPICBaseSymbol(), Ctx);
}
514
/// Pick the representative register class and its spill cost for \p VT,
/// used by register-pressure heuristics: GPRs for scalar integers, MMX for
/// x86mmx, and VR128X for all FP/vector types; everything else defers to
/// the generic implementation.
std::pair<const TargetRegisterClass *, uint8_t>
X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
                                           MVT VT) const {
  const TargetRegisterClass *RRC = nullptr;
  uint8_t Cost = 1;
  switch (VT.SimpleTy) {
  default:
    return TargetLowering::findRepresentativeClass(TRI, VT);
  case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
    RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
    break;
  case MVT::x86mmx:
    RRC = &X86::VR64RegClass;
    break;
  case MVT::f32: case MVT::f64:
  case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
  case MVT::v4f32: case MVT::v2f64:
  case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64:
  case MVT::v8f32: case MVT::v4f64:
  case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64:
  case MVT::v16f32: case MVT::v8f64:
    RRC = &X86::VR128XRegClass;
    break;
  }
  return std::make_pair(x&: RRC, y&: Cost);
}
541
542unsigned X86TargetLowering::getAddressSpace() const {
543 if (Subtarget.is64Bit())
544 return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? X86AS::GS
545 : X86AS::FS;
546 return X86AS::GS;
547}
548
/// True for targets whose C library reserves a TLS slot for the stack
/// protector guard (glibc, musl, Fuchsia, Android/bionic).
static bool hasStackGuardSlotTLS(const Triple &TargetTriple) {
  return TargetTriple.isOSGlibc() || TargetTriple.isMusl() ||
         TargetTriple.isOSFuchsia() || TargetTriple.isAndroid();
}
553
/// Build a constant pointer in address space \p AddressSpace representing
/// the segment-relative address \p Offset (callers pass the FS/GS address
/// space from getAddressSpace()).
static Constant* SegmentOffset(IRBuilderBase &IRB,
                               int Offset, unsigned AddressSpace) {
  return ConstantExpr::getIntToPtr(
      C: ConstantInt::getSigned(Ty: Type::getInt32Ty(C&: IRB.getContext()), V: Offset),
      Ty: IRB.getPtrTy(AddrSpace: AddressSpace));
}
560
/// Return the IR value to load the stack guard from, preferring the libc's
/// dedicated TLS slot (or a user-specified register/offset/symbol) over the
/// generic global used by the base implementation.
Value *
X86TargetLowering::getIRStackGuard(IRBuilderBase &IRB,
                                   const LibcallLoweringInfo &Libcalls) const {
  // glibc, bionic, and Fuchsia have a special slot for the stack guard in
  // tcbhead_t; use it instead of the usual global variable (see
  // sysdeps/{i386,x86_64}/nptl/tls.h)
  if (hasStackGuardSlotTLS(TargetTriple: Subtarget.getTargetTriple())) {
    unsigned AddressSpace = getAddressSpace();

    // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
    if (Subtarget.isTargetFuchsia())
      return SegmentOffset(IRB, Offset: 0x10, AddressSpace);

    Module *M = IRB.GetInsertBlock()->getParent()->getParent();
    // Specially, some users may customize the base reg and offset.
    int Offset = M->getStackProtectorGuardOffset();
    // If we don't set -stack-protector-guard-offset value:
    // %fs:0x28, unless we're using a Kernel code model, in which case
    // it's %gs:0x28. gs:0x14 on i386.
    if (Offset == INT_MAX)
      Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;

    // Honor an explicitly requested guard segment register.
    StringRef GuardReg = M->getStackProtectorGuardReg();
    if (GuardReg == "fs")
      AddressSpace = X86AS::FS;
    else if (GuardReg == "gs")
      AddressSpace = X86AS::GS;

    // Use symbol guard if user specify.
    StringRef GuardSymb = M->getStackProtectorGuardSymbol();
    if (!GuardSymb.empty()) {
      GlobalVariable *GV = M->getGlobalVariable(Name: GuardSymb);
      if (!GV) {
        // Lazily declare the guard symbol as a pointer-sized integer global
        // in the chosen segment address space.
        Type *Ty = Subtarget.is64Bit() ? Type::getInt64Ty(C&: M->getContext())
                                       : Type::getInt32Ty(C&: M->getContext());
        GV = new GlobalVariable(*M, Ty, false, GlobalValue::ExternalLinkage,
                                nullptr, GuardSymb, nullptr,
                                GlobalValue::NotThreadLocal, AddressSpace);
        if (!Subtarget.isTargetDarwin())
          GV->setDSOLocal(M->getDirectAccessExternalData());
      }
      return GV;
    }

    return SegmentOffset(IRB, Offset, AddressSpace);
  }
  return TargetLowering::getIRStackGuard(IRB, Libcalls);
}
609
/// Declare the module-level symbols required by stack protection: the MSVC
/// CRT cookie variable and check function when those libcalls are available,
/// nothing when the guard lives in a TLS slot, and the generic declarations
/// otherwise.
void X86TargetLowering::insertSSPDeclarations(
    Module &M, const LibcallLoweringInfo &Libcalls) const {
  // MSVC CRT provides functionalities for stack protection.
  RTLIB::LibcallImpl SecurityCheckCookieLibcall =
      Libcalls.getLibcallImpl(Call: RTLIB::SECURITY_CHECK_COOKIE);

  RTLIB::LibcallImpl SecurityCookieVar =
      Libcalls.getLibcallImpl(Call: RTLIB::STACK_CHECK_GUARD);
  if (SecurityCheckCookieLibcall != RTLIB::Unsupported &&
      SecurityCookieVar != RTLIB::Unsupported) {
    // MSVC CRT provides functionalities for stack protection.
    // MSVC CRT has a global variable holding security cookie.
    M.getOrInsertGlobal(Name: getLibcallImplName(Call: SecurityCookieVar),
                        Ty: PointerType::getUnqual(C&: M.getContext()));

    // MSVC CRT has a function to validate security cookie.
    FunctionCallee SecurityCheckCookie =
        M.getOrInsertFunction(Name: getLibcallImplName(Call: SecurityCheckCookieLibcall),
                              RetTy: Type::getVoidTy(C&: M.getContext()),
                              Args: PointerType::getUnqual(C&: M.getContext()));

    // The check function takes its argument in a register (fastcall + inreg).
    if (Function *F = dyn_cast<Function>(Val: SecurityCheckCookie.getCallee())) {
      F->setCallingConv(CallingConv::X86_FastCall);
      F->addParamAttr(ArgNo: 0, Kind: Attribute::AttrKind::InReg);
    }
    return;
  }

  StringRef GuardMode = M.getStackProtectorGuard();

  // glibc, bionic, and Fuchsia have a special slot for the stack guard.
  // Nothing to declare in that case — the guard is read from TLS directly.
  if ((GuardMode == "tls" || GuardMode.empty()) &&
      hasStackGuardSlotTLS(TargetTriple: Subtarget.getTargetTriple()))
    return;
  TargetLowering::insertSSPDeclarations(M, Libcalls);
}
646
/// Return the location of the SafeStack unsafe-stack pointer: a fixed TLS
/// slot on Android and Fuchsia, otherwise the generic location.
Value *X86TargetLowering::getSafeStackPointerLocation(
    IRBuilderBase &IRB, const LibcallLoweringInfo &Libcalls) const {
  // Android provides a fixed TLS slot for the SafeStack pointer. See the
  // definition of TLS_SLOT_SAFESTACK in
  // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
  if (Subtarget.isTargetAndroid()) {
    // %fs:0x48, unless we're using a Kernel code model, in which case it's %gs:
    // %gs:0x24 on i386
    int Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
    return SegmentOffset(IRB, Offset, AddressSpace: getAddressSpace());
  }

  // Fuchsia is similar.
  if (Subtarget.isTargetFuchsia()) {
    // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
    return SegmentOffset(IRB, Offset: 0x18, AddressSpace: getAddressSpace());
  }

  return TargetLowering::getSafeStackPointerLocation(IRB, Libcalls);
}
667
668//===----------------------------------------------------------------------===//
669// Return Value Calling Convention Implementation
670//===----------------------------------------------------------------------===//
671
/// Check whether every return value can be assigned a location by the
/// RetCC_X86 calling-convention function; if not, the return will be
/// lowered via sret-style demotion by the caller of this hook.
bool X86TargetLowering::CanLowerReturn(
    CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
    const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context,
    const Type *RetTy) const {
  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
  return CCInfo.CheckReturn(Outs, Fn: RetCC_X86);
}
680
// R11 is the scratch register regardless of calling convention; the list is
// zero-terminated.
const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
  static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
  return ScratchRegs;
}
685
// Both the x87 control word (FPCW) and SSE MXCSR carry rounding-mode state.
ArrayRef<MCPhysReg> X86TargetLowering::getRoundingControlRegisters() const {
  static const MCPhysReg RCRegs[] = {X86::FPCW, X86::MXCSR};
  return RCRegs;
}
690
/// Lowers masks values (v*i1) to the local register values
/// \param ValArg the mask value to lower (v1i1/v8i1/v16i1/v32i1/v64i1).
/// \param ValLoc the integer register type the value must end up in.
/// \returns DAG node after lowering to register type
static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
                               const SDLoc &DL, SelectionDAG &DAG) {
  EVT ValVT = ValArg.getValueType();

  // v1i1: extract the single i1 element.
  if (ValVT == MVT::v1i1)
    return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: ValLoc, N1: ValArg,
                       N2: DAG.getIntPtrConstant(Val: 0, DL));

  if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) ||
      (ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) {
    // Two stage lowering might be required
    // bitcast: v8i1 -> i8 / v16i1 -> i16
    // anyextend: i8 -> i32 / i16 -> i32
    EVT TempValLoc = ValVT == MVT::v8i1 ? MVT::i8 : MVT::i16;
    SDValue ValToCopy = DAG.getBitcast(VT: TempValLoc, V: ValArg);
    if (ValLoc == MVT::i32)
      ValToCopy = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: ValLoc, Operand: ValToCopy);
    return ValToCopy;
  }

  if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) ||
      (ValVT == MVT::v64i1 && ValLoc == MVT::i64)) {
    // One stage lowering is required
    // bitcast: v32i1 -> i32 / v64i1 -> i64
    return DAG.getBitcast(VT: ValLoc, V: ValArg);
  }

  // Fallback: any-extend the value into the destination type.
  return DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: ValLoc, Operand: ValArg);
}
722
/// Breaks v64i1 value into two registers and adds the new node to the DAG
/// Used on 32-bit AVX512BW targets, where a v64i1 mask is passed as two
/// 32-bit GPR halves in the locations described by \p VA and \p NextVA.
static void Passv64i1ArgInRegs(
    const SDLoc &DL, SelectionDAG &DAG, SDValue &Arg,
    SmallVectorImpl<std::pair<Register, SDValue>> &RegsToPass, CCValAssign &VA,
    CCValAssign &NextVA, const X86Subtarget &Subtarget) {
  assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
  assert(Subtarget.is32Bit() && "Expecting 32 bit target");
  assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value");
  assert(VA.isRegLoc() && NextVA.isRegLoc() &&
         "The value should reside in two registers");

  // Before splitting the value we cast it to i64
  Arg = DAG.getBitcast(VT: MVT::i64, V: Arg);

  // Splitting the value into two i32 types
  SDValue Lo, Hi;
  std::tie(args&: Lo, args&: Hi) = DAG.SplitScalar(N: Arg, DL, LoVT: MVT::i32, HiVT: MVT::i32);

  // Attach the two i32 types into corresponding registers
  RegsToPass.push_back(Elt: std::make_pair(x: VA.getLocReg(), y&: Lo));
  RegsToPass.push_back(Elt: std::make_pair(x: NextVA.getLocReg(), y&: Hi));
}
745
746SDValue
747X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
748 bool isVarArg,
749 const SmallVectorImpl<ISD::OutputArg> &Outs,
750 const SmallVectorImpl<SDValue> &OutVals,
751 const SDLoc &dl, SelectionDAG &DAG) const {
752 MachineFunction &MF = DAG.getMachineFunction();
753 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
754
755 // In some cases we need to disable registers from the default CSR list.
756 // For example, when they are used as return registers (preserve_* and X86's
757 // regcall) or for argument passing (X86's regcall).
758 bool ShouldDisableCalleeSavedRegister =
759 shouldDisableRetRegFromCSR(CC: CallConv) ||
760 MF.getFunction().hasFnAttribute(Kind: "no_caller_saved_registers");
761
762 if (CallConv == CallingConv::X86_INTR && !Outs.empty())
763 report_fatal_error(reason: "X86 interrupts may not return any value");
764
765 SmallVector<CCValAssign, 16> RVLocs;
766 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
767 CCInfo.AnalyzeReturn(Outs, Fn: RetCC_X86);
768
769 SmallVector<std::pair<Register, SDValue>, 4> RetVals;
770 for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E;
771 ++I, ++OutsIndex) {
772 CCValAssign &VA = RVLocs[I];
773 assert(VA.isRegLoc() && "Can only return in registers!");
774
775 // Add the register to the CalleeSaveDisableRegs list.
776 if (ShouldDisableCalleeSavedRegister)
777 MF.getRegInfo().disableCalleeSavedRegister(Reg: VA.getLocReg());
778
779 SDValue ValToCopy = OutVals[OutsIndex];
780 EVT ValVT = ValToCopy.getValueType();
781
782 // Promote values to the appropriate types.
783 if (VA.getLocInfo() == CCValAssign::SExt)
784 ValToCopy = DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL: dl, VT: VA.getLocVT(), Operand: ValToCopy);
785 else if (VA.getLocInfo() == CCValAssign::ZExt)
786 ValToCopy = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: dl, VT: VA.getLocVT(), Operand: ValToCopy);
787 else if (VA.getLocInfo() == CCValAssign::AExt) {
788 if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
789 ValToCopy = lowerMasksToReg(ValArg: ValToCopy, ValLoc: VA.getLocVT(), DL: dl, DAG);
790 else
791 ValToCopy = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: dl, VT: VA.getLocVT(), Operand: ValToCopy);
792 }
793 else if (VA.getLocInfo() == CCValAssign::BCvt)
794 ValToCopy = DAG.getBitcast(VT: VA.getLocVT(), V: ValToCopy);
795
796 assert(VA.getLocInfo() != CCValAssign::FPExt &&
797 "Unexpected FP-extend for return value.");
798
799 // Report an error if we have attempted to return a value via an XMM
800 // register and SSE was disabled.
801 if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(Reg: VA.getLocReg())) {
802 errorUnsupported(DAG, dl, Msg: "SSE register return with SSE disabled");
803 VA.convertToReg(Reg: X86::FP0); // Set reg to FP0, avoid hitting asserts.
804 } else if (!Subtarget.hasSSE2() &&
805 X86::FR64XRegClass.contains(Reg: VA.getLocReg()) &&
806 ValVT == MVT::f64) {
807 // When returning a double via an XMM register, report an error if SSE2 is
808 // not enabled.
809 errorUnsupported(DAG, dl, Msg: "SSE2 register return with SSE2 disabled");
810 VA.convertToReg(Reg: X86::FP0); // Set reg to FP0, avoid hitting asserts.
811 }
812
813 // Returns in ST0/ST1 are handled specially: these are pushed as operands to
814 // the RET instruction and handled by the FP Stackifier.
815 if (VA.getLocReg() == X86::FP0 ||
816 VA.getLocReg() == X86::FP1) {
817 // If this is a copy from an xmm register to ST(0), use an FPExtend to
818 // change the value to the FP stack register class.
819 if (isScalarFPTypeInSSEReg(VT: VA.getValVT()))
820 ValToCopy = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: dl, VT: MVT::f80, Operand: ValToCopy);
821 RetVals.push_back(Elt: std::make_pair(x: VA.getLocReg(), y&: ValToCopy));
822 // Don't emit a copytoreg.
823 continue;
824 }
825
826 // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
827 // which is returned in RAX / RDX.
828 if (Subtarget.is64Bit()) {
829 if (ValVT == MVT::x86mmx) {
830 if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
831 ValToCopy = DAG.getBitcast(VT: MVT::i64, V: ValToCopy);
832 ValToCopy = DAG.getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL: dl, VT: MVT::v2i64,
833 Operand: ValToCopy);
834 // If we don't have SSE2 available, convert to v4f32 so the generated
835 // register is legal.
836 if (!Subtarget.hasSSE2())
837 ValToCopy = DAG.getBitcast(VT: MVT::v4f32, V: ValToCopy);
838 }
839 }
840 }
841
842 if (VA.needsCustom()) {
843 assert(VA.getValVT() == MVT::v64i1 &&
844 "Currently the only custom case is when we split v64i1 to 2 regs");
845
846 Passv64i1ArgInRegs(DL: dl, DAG, Arg&: ValToCopy, RegsToPass&: RetVals, VA, NextVA&: RVLocs[++I],
847 Subtarget);
848
849 // Add the second register to the CalleeSaveDisableRegs list.
850 if (ShouldDisableCalleeSavedRegister)
851 MF.getRegInfo().disableCalleeSavedRegister(Reg: RVLocs[I].getLocReg());
852 } else {
853 RetVals.push_back(Elt: std::make_pair(x: VA.getLocReg(), y&: ValToCopy));
854 }
855 }
856
857 SDValue Glue;
858 SmallVector<SDValue, 6> RetOps;
859 RetOps.push_back(Elt: Chain); // Operand #0 = Chain (updated below)
860 // Operand #1 = Bytes To Pop
861 RetOps.push_back(Elt: DAG.getTargetConstant(Val: FuncInfo->getBytesToPopOnReturn(), DL: dl,
862 VT: MVT::i32));
863
864 // Copy the result values into the output registers.
865 for (auto &RetVal : RetVals) {
866 if (RetVal.first == X86::FP0 || RetVal.first == X86::FP1) {
867 RetOps.push_back(Elt: RetVal.second);
868 continue; // Don't emit a copytoreg.
869 }
870
871 Chain = DAG.getCopyToReg(Chain, dl, Reg: RetVal.first, N: RetVal.second, Glue);
872 Glue = Chain.getValue(R: 1);
873 RetOps.push_back(
874 Elt: DAG.getRegister(Reg: RetVal.first, VT: RetVal.second.getValueType()));
875 }
876
877 // Swift calling convention does not require we copy the sret argument
878 // into %rax/%eax for the return, and SRetReturnReg is not set for Swift.
879
880 // All x86 ABIs require that for returning structs by value we copy
881 // the sret argument into %rax/%eax (depending on ABI) for the return.
882 // We saved the argument into a virtual register in the entry block,
883 // so now we copy the value out and into %rax/%eax.
884 //
885 // Checking Function.hasStructRetAttr() here is insufficient because the IR
886 // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
887 // false, then an sret argument may be implicitly inserted in the SelDAG. In
888 // either case FuncInfo->setSRetReturnReg() will have been called.
889 if (Register SRetReg = FuncInfo->getSRetReturnReg()) {
890 // When we have both sret and another return value, we should use the
891 // original Chain stored in RetOps[0], instead of the current Chain updated
892 // in the above loop. If we only have sret, RetOps[0] equals to Chain.
893
894 // For the case of sret and another return value, we have
895 // Chain_0 at the function entry
896 // Chain_1 = getCopyToReg(Chain_0) in the above loop
897 // If we use Chain_1 in getCopyFromReg, we will have
898 // Val = getCopyFromReg(Chain_1)
899 // Chain_2 = getCopyToReg(Chain_1, Val) from below
900
901 // getCopyToReg(Chain_0) will be glued together with
902 // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
903 // in Unit B, and we will have cyclic dependency between Unit A and Unit B:
904 // Data dependency from Unit B to Unit A due to usage of Val in
905 // getCopyToReg(Chain_1, Val)
906 // Chain dependency from Unit A to Unit B
907
908 // So here, we use RetOps[0] (i.e Chain_0) for getCopyFromReg.
909 SDValue Val = DAG.getCopyFromReg(Chain: RetOps[0], dl, Reg: SRetReg,
910 VT: getPointerTy(DL: MF.getDataLayout()));
911
912 Register RetValReg
913 = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
914 X86::RAX : X86::EAX;
915 Chain = DAG.getCopyToReg(Chain, dl, Reg: RetValReg, N: Val, Glue);
916 Glue = Chain.getValue(R: 1);
917
918 // RAX/EAX now acts like a return value.
919 RetOps.push_back(
920 Elt: DAG.getRegister(Reg: RetValReg, VT: getPointerTy(DL: DAG.getDataLayout())));
921
922 // Add the returned register to the CalleeSaveDisableRegs list. Don't do
923 // this however for preserve_most/preserve_all to minimize the number of
924 // callee-saved registers for these CCs.
925 if (ShouldDisableCalleeSavedRegister &&
926 CallConv != CallingConv::PreserveAll &&
927 CallConv != CallingConv::PreserveMost)
928 MF.getRegInfo().disableCalleeSavedRegister(Reg: RetValReg);
929 }
930
931 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
932 const MCPhysReg *I =
933 TRI->getCalleeSavedRegsViaCopy(MF: &DAG.getMachineFunction());
934 if (I) {
935 for (; *I; ++I) {
936 if (X86::GR64RegClass.contains(Reg: *I))
937 RetOps.push_back(Elt: DAG.getRegister(Reg: *I, VT: MVT::i64));
938 else
939 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
940 }
941 }
942
943 RetOps[0] = Chain; // Update chain.
944
945 // Add the glue if we have it.
946 if (Glue.getNode())
947 RetOps.push_back(Elt: Glue);
948
949 unsigned RetOpcode = X86ISD::RET_GLUE;
950 if (CallConv == CallingConv::X86_INTR)
951 RetOpcode = X86ISD::IRET;
952 return DAG.getNode(Opcode: RetOpcode, DL: dl, VT: MVT::Other, Ops: RetOps);
953}
954
955bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
956 if (N->getNumValues() != 1 || !N->hasNUsesOfValue(NUses: 1, Value: 0))
957 return false;
958
959 SDValue TCChain = Chain;
960 SDNode *Copy = *N->user_begin();
961 if (Copy->getOpcode() == ISD::CopyToReg) {
962 // If the copy has a glue operand, we conservatively assume it isn't safe to
963 // perform a tail call.
964 if (Copy->getOperand(Num: Copy->getNumOperands()-1).getValueType() == MVT::Glue)
965 return false;
966 TCChain = Copy->getOperand(Num: 0);
967 } else if (Copy->getOpcode() != ISD::FP_EXTEND)
968 return false;
969
970 bool HasRet = false;
971 for (const SDNode *U : Copy->users()) {
972 if (U->getOpcode() != X86ISD::RET_GLUE)
973 return false;
974 // If we are returning more than one value, we can definitely
975 // not make a tail call see PR19530
976 if (U->getNumOperands() > 4)
977 return false;
978 if (U->getNumOperands() == 4 &&
979 U->getOperand(Num: U->getNumOperands() - 1).getValueType() != MVT::Glue)
980 return false;
981 HasRet = true;
982 }
983
984 if (!HasRet)
985 return false;
986
987 Chain = TCChain;
988 return true;
989}
990
991EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
992 ISD::NodeType ExtendKind) const {
993 MVT ReturnMVT = MVT::i32;
994
995 bool Darwin = Subtarget.getTargetTriple().isOSDarwin();
996 if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) {
997 // The ABI does not require i1, i8 or i16 to be extended.
998 //
999 // On Darwin, there is code in the wild relying on Clang's old behaviour of
1000 // always extending i8/i16 return values, so keep doing that for now.
1001 // (PR26665).
1002 ReturnMVT = MVT::i8;
1003 }
1004
1005 EVT MinVT = getRegisterType(Context, VT: ReturnMVT);
1006 return VT.bitsLT(VT: MinVT) ? MinVT : VT;
1007}
1008
1009/// Reads two 32 bit registers and creates a 64 bit mask value.
1010/// \param VA The current 32 bit value that need to be assigned.
1011/// \param NextVA The next 32 bit value that need to be assigned.
1012/// \param Root The parent DAG node.
1013/// \param [in,out] InGlue Represents SDvalue in the parent DAG node for
1014/// glue purposes. In the case the DAG is already using
1015/// physical register instead of virtual, we should glue
1016/// our new SDValue to InGlue SDvalue.
1017/// \return a new SDvalue of size 64bit.
1018static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
1019 SDValue &Root, SelectionDAG &DAG,
1020 const SDLoc &DL, const X86Subtarget &Subtarget,
1021 SDValue *InGlue = nullptr) {
1022 assert((Subtarget.hasBWI()) && "Expected AVX512BW target!");
1023 assert(Subtarget.is32Bit() && "Expecting 32 bit target");
1024 assert(VA.getValVT() == MVT::v64i1 &&
1025 "Expecting first location of 64 bit width type");
1026 assert(NextVA.getValVT() == VA.getValVT() &&
1027 "The locations should have the same type");
1028 assert(VA.isRegLoc() && NextVA.isRegLoc() &&
1029 "The values should reside in two registers");
1030
1031 SDValue Lo, Hi;
1032 SDValue ArgValueLo, ArgValueHi;
1033
1034 MachineFunction &MF = DAG.getMachineFunction();
1035 const TargetRegisterClass *RC = &X86::GR32RegClass;
1036
1037 // Read a 32 bit value from the registers.
1038 if (nullptr == InGlue) {
1039 // When no physical register is present,
1040 // create an intermediate virtual register.
1041 Register Reg = MF.addLiveIn(PReg: VA.getLocReg(), RC);
1042 ArgValueLo = DAG.getCopyFromReg(Chain: Root, dl: DL, Reg, VT: MVT::i32);
1043 Reg = MF.addLiveIn(PReg: NextVA.getLocReg(), RC);
1044 ArgValueHi = DAG.getCopyFromReg(Chain: Root, dl: DL, Reg, VT: MVT::i32);
1045 } else {
1046 // When a physical register is available read the value from it and glue
1047 // the reads together.
1048 ArgValueLo =
1049 DAG.getCopyFromReg(Chain: Root, dl: DL, Reg: VA.getLocReg(), VT: MVT::i32, Glue: *InGlue);
1050 *InGlue = ArgValueLo.getValue(R: 2);
1051 ArgValueHi =
1052 DAG.getCopyFromReg(Chain: Root, dl: DL, Reg: NextVA.getLocReg(), VT: MVT::i32, Glue: *InGlue);
1053 *InGlue = ArgValueHi.getValue(R: 2);
1054 }
1055
1056 // Convert the i32 type into v32i1 type.
1057 Lo = DAG.getBitcast(VT: MVT::v32i1, V: ArgValueLo);
1058
1059 // Convert the i32 type into v32i1 type.
1060 Hi = DAG.getBitcast(VT: MVT::v32i1, V: ArgValueHi);
1061
1062 // Concatenate the two values together.
1063 return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT: MVT::v64i1, N1: Lo, N2: Hi);
1064}
1065
1066/// The function will lower a register of various sizes (8/16/32/64)
1067/// to a mask value of the expected size (v8i1/v16i1/v32i1/v64i1)
1068/// \returns a DAG node contains the operand after lowering to mask type.
1069static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
1070 const EVT &ValLoc, const SDLoc &DL,
1071 SelectionDAG &DAG) {
1072 SDValue ValReturned = ValArg;
1073
1074 if (ValVT == MVT::v1i1)
1075 return DAG.getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL, VT: MVT::v1i1, Operand: ValReturned);
1076
1077 if (ValVT == MVT::v64i1) {
1078 // In 32 bit machine, this case is handled by getv64i1Argument
1079 assert(ValLoc == MVT::i64 && "Expecting only i64 locations");
1080 // In 64 bit machine, There is no need to truncate the value only bitcast
1081 } else {
1082 MVT MaskLenVT;
1083 switch (ValVT.getSimpleVT().SimpleTy) {
1084 case MVT::v8i1:
1085 MaskLenVT = MVT::i8;
1086 break;
1087 case MVT::v16i1:
1088 MaskLenVT = MVT::i16;
1089 break;
1090 case MVT::v32i1:
1091 MaskLenVT = MVT::i32;
1092 break;
1093 default:
1094 llvm_unreachable("Expecting a vector of i1 types");
1095 }
1096
1097 ValReturned = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MaskLenVT, Operand: ValReturned);
1098 }
1099 return DAG.getBitcast(VT: ValVT, V: ValReturned);
1100}
1101
1102static SDValue getPopFromX87Reg(SelectionDAG &DAG, SDValue Chain,
1103 const SDLoc &dl, Register Reg, EVT VT,
1104 SDValue Glue) {
1105 SDVTList VTs = DAG.getVTList(VT1: VT, VT2: MVT::Other, VT3: MVT::Glue);
1106 SDValue Ops[] = {Chain, DAG.getRegister(Reg, VT), Glue};
1107 return DAG.getNode(Opcode: X86ISD::POP_FROM_X87_REG, DL: dl, VTList: VTs,
1108 Ops: ArrayRef(Ops, Glue.getNode() ? 3 : 2));
1109}
1110
/// Lower the result values of a call into the
/// appropriate copies out of appropriate physical registers.
///
/// \param Chain   Incoming chain, threaded through each CopyFromReg.
/// \param InGlue  Glue from the call node, threaded through each copy so the
///                reads stay adjacent to the call.
/// \param RegMask When non-null, every register (and subregister) used for a
///                return value is cleared from this preserved-register mask.
/// \returns the updated chain; the lowered values are appended to \p InVals.
SDValue X86TargetLowering::LowerCallResult(
    SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
    uint32_t *RegMask) const {

  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
  // Assign locations to each value returned by this call.
  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
                 *DAG.getContext());
  CCInfo.AnalyzeCallResult(Ins, Fn: RetCC_X86);

  // Copy all of the result registers out of their specified physreg.
  // I indexes RVLocs (a custom v64i1 split consumes two locations per value,
  // see the ++I inside the loop); InsIndex advances once per returned value.
  for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E;
       ++I, ++InsIndex) {
    CCValAssign &VA = RVLocs[I];
    EVT CopyVT = VA.getLocVT();

    // In some calling conventions we need to remove the used registers
    // from the register mask.
    if (RegMask) {
      for (MCPhysReg SubReg : TRI->subregs_inclusive(Reg: VA.getLocReg()))
        RegMask[SubReg / 32] &= ~(1u << (SubReg % 32));
    }

    // Report an error if there was an attempt to return FP values via XMM
    // registers.
    // The location is then retargeted to an x87 register purely so the rest
    // of lowering does not trip over an impossible assignment.
    if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(Reg: VA.getLocReg())) {
      errorUnsupported(DAG, dl, Msg: "SSE register return with SSE disabled");
      if (VA.getLocReg() == X86::XMM1)
        VA.convertToReg(Reg: X86::FP1); // Set reg to FP1, avoid hitting asserts.
      else
        VA.convertToReg(Reg: X86::FP0); // Set reg to FP0, avoid hitting asserts.
    } else if (!Subtarget.hasSSE2() &&
               X86::FR64XRegClass.contains(Reg: VA.getLocReg()) &&
               CopyVT == MVT::f64) {
      errorUnsupported(DAG, dl, Msg: "SSE2 register return with SSE2 disabled");
      if (VA.getLocReg() == X86::XMM1)
        VA.convertToReg(Reg: X86::FP1); // Set reg to FP1, avoid hitting asserts.
      else
        VA.convertToReg(Reg: X86::FP0); // Set reg to FP0, avoid hitting asserts.
    }

    // If we prefer to use the value in xmm registers, copy it out as f80 and
    // use a truncate to move it from fp stack reg to xmm reg.
    bool RoundAfterCopy = false;
    bool X87Result = VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1;
    if (X87Result && isScalarFPTypeInSSEReg(VT: VA.getValVT())) {
      if (!Subtarget.hasX87())
        report_fatal_error(reason: "X87 register return with X87 disabled");
      CopyVT = MVT::f80;
      RoundAfterCopy = (CopyVT != VA.getLocVT());
    }

    SDValue Val;
    if (VA.needsCustom()) {
      // v64i1 was split across two 32-bit registers; reassemble it. Note
      // this consumes the next location as well (++I).
      assert(VA.getValVT() == MVT::v64i1 &&
             "Currently the only custom case is when we split v64i1 to 2 regs")
          ;
      Val =
          getv64i1Argument(VA, NextVA&: RVLocs[++I], Root&: Chain, DAG, DL: dl, Subtarget, InGlue: &InGlue);
    } else {
      // x87 results must be popped off the FP stack; everything else is a
      // plain CopyFromReg. Both produce (value, chain, glue).
      Chain =
          X87Result
              ? getPopFromX87Reg(DAG, Chain, dl, Reg: VA.getLocReg(), VT: CopyVT, Glue: InGlue)
                    .getValue(R: 1)
              : DAG.getCopyFromReg(Chain, dl, Reg: VA.getLocReg(), VT: CopyVT, Glue: InGlue)
                    .getValue(R: 1);
      Val = Chain.getValue(R: 0);
      InGlue = Chain.getValue(R: 2);
    }

    if (RoundAfterCopy)
      Val = DAG.getNode(Opcode: ISD::FP_ROUND, DL: dl, VT: VA.getValVT(), N1: Val,
                        // This truncation won't change the value.
                        N2: DAG.getIntPtrConstant(Val: 1, DL: dl, /*isTarget=*/true));

    if (VA.isExtInLoc()) {
      if (VA.getValVT().isVector() &&
          VA.getValVT().getScalarType() == MVT::i1 &&
          ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
           (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
        // promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
        Val = lowerRegToMasks(ValArg: Val, ValVT: VA.getValVT(), ValLoc: VA.getLocVT(), DL: dl, DAG);
      } else
        Val = DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: VA.getValVT(), Operand: Val);
    }

    if (VA.getLocInfo() == CCValAssign::BCvt)
      Val = DAG.getBitcast(VT: VA.getValVT(), V: Val);

    InVals.push_back(Elt: Val);
  }

  return Chain;
}
1210
1211/// Determines whether Args, either a set of outgoing arguments to a call, or a
1212/// set of incoming args of a call, contains an sret pointer that the callee
1213/// pops. This happens on most x86-32, System V platforms, unless register
1214/// parameters are in use (-mregparm=1+, regcallcc, etc).
1215template <typename T>
1216static bool hasCalleePopSRet(const SmallVectorImpl<T> &Args,
1217 const SmallVectorImpl<CCValAssign> &ArgLocs,
1218 const X86Subtarget &Subtarget) {
1219 // Not C++20 (yet), so no concepts available.
1220 static_assert(std::is_same_v<T, ISD::OutputArg> ||
1221 std::is_same_v<T, ISD::InputArg>,
1222 "requires ISD::OutputArg or ISD::InputArg");
1223
1224 // Popping the sret pointer only happens on x86-32 System V ABI platforms
1225 // (Linux, Cygwin, BSDs, Mac, etc). That excludes Windows-minus-Cygwin and
1226 // MCU.
1227 const Triple &TT = Subtarget.getTargetTriple();
1228 if (!TT.isX86_32() || TT.isOSMSVCRT() || TT.isOSIAMCU())
1229 return false;
1230
1231 // Check if the first argument is marked sret and if it is passed in memory.
1232 bool IsSRetInMem = false;
1233 if (!Args.empty())
1234 IsSRetInMem = Args.front().Flags.isSRet() && ArgLocs.front().isMemLoc();
1235 return IsSRetInMem;
1236}
1237
1238/// Make a copy of an aggregate at address specified by "Src" to address
1239/// "Dst" with size and alignment information specified by the specific
1240/// parameter attribute. The copy will be passed as a byval function parameter.
1241static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
1242 SDValue Chain, ISD::ArgFlagsTy Flags,
1243 SelectionDAG &DAG, const SDLoc &dl) {
1244 SDValue SizeNode = DAG.getIntPtrConstant(Val: Flags.getByValSize(), DL: dl);
1245
1246 return DAG.getMemcpy(
1247 Chain, dl, Dst, Src, Size: SizeNode, Alignment: Flags.getNonZeroByValAlign(),
1248 /*isVolatile*/ isVol: false, /*AlwaysInline=*/true,
1249 /*CI=*/nullptr, OverrideTailCall: std::nullopt, DstPtrInfo: MachinePointerInfo(), SrcPtrInfo: MachinePointerInfo());
1250}
1251
1252/// Return true if the calling convention is one that we can guarantee TCO for.
1253static bool canGuaranteeTCO(CallingConv::ID CC) {
1254 return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
1255 CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE ||
1256 CC == CallingConv::Tail || CC == CallingConv::SwiftTail);
1257}
1258
1259/// Return true if we might ever do TCO for calls with this calling convention.
1260static bool mayTailCallThisCC(CallingConv::ID CC) {
1261 switch (CC) {
1262 // C calling conventions:
1263 case CallingConv::C:
1264 case CallingConv::Win64:
1265 case CallingConv::X86_64_SysV:
1266 case CallingConv::PreserveNone:
1267 // Callee pop conventions:
1268 case CallingConv::X86_ThisCall:
1269 case CallingConv::X86_StdCall:
1270 case CallingConv::X86_VectorCall:
1271 case CallingConv::X86_FastCall:
1272 // Swift:
1273 case CallingConv::Swift:
1274 return true;
1275 default:
1276 return canGuaranteeTCO(CC);
1277 }
1278}
1279
1280/// Return true if the function is being made into a tailcall target by
1281/// changing its ABI.
1282static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
1283 return (GuaranteedTailCallOpt && canGuaranteeTCO(CC)) ||
1284 CC == CallingConv::Tail || CC == CallingConv::SwiftTail;
1285}
1286
1287bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
1288 if (!CI->isTailCall())
1289 return false;
1290
1291 CallingConv::ID CalleeCC = CI->getCallingConv();
1292 if (!mayTailCallThisCC(CC: CalleeCC))
1293 return false;
1294
1295 return true;
1296}
1297
/// Lower a single incoming argument that was assigned to a stack slot:
/// create the fixed frame object for it and load the value (byval arguments
/// just return the slot's address). Where legal, elides the copy by loading
/// straight from the caller-created stack object.
SDValue
X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
                                    const SmallVectorImpl<ISD::InputArg> &Ins,
                                    const SDLoc &dl, SelectionDAG &DAG,
                                    const CCValAssign &VA,
                                    MachineFrameInfo &MFI, unsigned i) const {
  // Create the nodes corresponding to a load from this parameter slot.
  ISD::ArgFlagsTy Flags = Ins[i].Flags;
  // Under guaranteed TCO the incoming argument area can be overwritten while
  // lowering a tail call, so every slot must stay mutable.
  bool AlwaysUseMutable = shouldGuaranteeTCO(
      CC: CallConv, GuaranteedTailCallOpt: DAG.getTarget().Options.GuaranteedTailCallOpt);
  bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
  EVT ValVT;
  MVT PtrVT = getPointerTy(DL: DAG.getDataLayout());

  // If value is passed by pointer we have address passed instead of the value
  // itself. No need to extend if the mask value and location share the same
  // absolute size.
  bool ExtendedInMem =
      VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 &&
      VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits();

  if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem)
    ValVT = VA.getLocVT();
  else
    ValVT = VA.getValVT();

  // FIXME: For now, all byval parameter objects are marked mutable. This can be
  // changed with more analysis.
  // In case of tail call optimization mark all arguments mutable. Since they
  // could be overwritten by lowering of arguments in case of a tail call.
  if (Flags.isByVal()) {
    unsigned Bytes = Flags.getByValSize();
    if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.

    // FIXME: For now, all byval parameter objects are marked as aliasing. This
    // can be improved with deeper analysis.
    int FI = MFI.CreateFixedObject(Size: Bytes, SPOffset: VA.getLocMemOffset(), IsImmutable: isImmutable,
                                   /*isAliased=*/true);
    // For byval, the caller's copy is the argument, so just hand back its
    // address.
    return DAG.getFrameIndex(FI, VT: PtrVT);
  }

  EVT ArgVT = Ins[i].ArgVT;

  // If this is a vector that has been split into multiple parts, don't elide
  // the copy. The layout on the stack may not match the packed in-memory
  // layout.
  bool ScalarizedVector = ArgVT.isVector() && !VA.getLocVT().isVector();

  // This is an argument in memory. We might be able to perform copy elision.
  // If the argument is passed directly in memory without any extension, then we
  // can perform copy elision. Large vector types, for example, may be passed
  // indirectly by pointer.
  if (Flags.isCopyElisionCandidate() &&
      VA.getLocInfo() != CCValAssign::Indirect && !ExtendedInMem &&
      !ScalarizedVector) {
    SDValue PartAddr;
    if (Ins[i].PartOffset == 0) {
      // If this is a one-part value or the first part of a multi-part value,
      // create a stack object for the entire argument value type and return a
      // load from our portion of it. This assumes that if the first part of an
      // argument is in memory, the rest will also be in memory.
      int FI = MFI.CreateFixedObject(Size: ArgVT.getStoreSize(), SPOffset: VA.getLocMemOffset(),
                                     /*IsImmutable=*/false);
      PartAddr = DAG.getFrameIndex(FI, VT: PtrVT);
      return DAG.getLoad(
          VT: ValVT, dl, Chain, Ptr: PartAddr,
          PtrInfo: MachinePointerInfo::getFixedStack(MF&: DAG.getMachineFunction(), FI));
    }

    // This is not the first piece of an argument in memory. See if there is
    // already a fixed stack object including this offset. If so, assume it
    // was created by the PartOffset == 0 branch above and create a load from
    // the appropriate offset into it.
    int64_t PartBegin = VA.getLocMemOffset();
    int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8;
    int FI = MFI.getObjectIndexBegin();
    // Linear scan of the fixed objects for one that fully covers this part.
    for (; MFI.isFixedObjectIndex(ObjectIdx: FI); ++FI) {
      int64_t ObjBegin = MFI.getObjectOffset(ObjectIdx: FI);
      int64_t ObjEnd = ObjBegin + MFI.getObjectSize(ObjectIdx: FI);
      if (ObjBegin <= PartBegin && PartEnd <= ObjEnd)
        break;
    }
    if (MFI.isFixedObjectIndex(ObjectIdx: FI)) {
      SDValue Addr =
          DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrVT, N1: DAG.getFrameIndex(FI, VT: PtrVT),
                      N2: DAG.getIntPtrConstant(Val: Ins[i].PartOffset, DL: dl));
      return DAG.getLoad(VT: ValVT, dl, Chain, Ptr: Addr,
                         PtrInfo: MachinePointerInfo::getFixedStack(
                             MF&: DAG.getMachineFunction(), FI, Offset: Ins[i].PartOffset));
    }
    // No covering object found; fall through to the plain load below.
  }

  int FI = MFI.CreateFixedObject(Size: ValVT.getSizeInBits() / 8,
                                 SPOffset: VA.getLocMemOffset(), IsImmutable: isImmutable);

  // Set SExt or ZExt flag.
  if (VA.getLocInfo() == CCValAssign::ZExt) {
    MFI.setObjectZExt(ObjectIdx: FI, IsZExt: true);
  } else if (VA.getLocInfo() == CCValAssign::SExt) {
    MFI.setObjectSExt(ObjectIdx: FI, IsSExt: true);
  }

  MaybeAlign Alignment;
  if (Subtarget.isTargetWindowsMSVC() && !Subtarget.is64Bit() &&
      ValVT != MVT::f80)
    Alignment = MaybeAlign(4);
  SDValue FIN = DAG.getFrameIndex(FI, VT: PtrVT);
  SDValue Val = DAG.getLoad(
      VT: ValVT, dl, Chain, Ptr: FIN,
      PtrInfo: MachinePointerInfo::getFixedStack(MF&: DAG.getMachineFunction(), FI),
      Alignment);
  // For a mask value extended in memory, rebuild the value type: vectors via
  // SCALAR_TO_VECTOR, scalars via TRUNCATE.
  return ExtendedInMem
             ? (VA.getValVT().isVector()
                    ? DAG.getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL: dl, VT: VA.getValVT(), Operand: Val)
                    : DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: VA.getValVT(), Operand: Val))
             : Val;
}
1415
1416// FIXME: Get this from tablegen.
1417static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
1418 const X86Subtarget &Subtarget) {
1419 assert(Subtarget.is64Bit());
1420
1421 if (Subtarget.isCallingConvWin64(CC: CallConv)) {
1422 static const MCPhysReg GPR64ArgRegsWin64[] = {
1423 X86::RCX, X86::RDX, X86::R8, X86::R9
1424 };
1425 return GPR64ArgRegsWin64;
1426 }
1427
1428 static const MCPhysReg GPR64ArgRegs64Bit[] = {
1429 X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
1430 };
1431 return GPR64ArgRegs64Bit;
1432}
1433
1434// FIXME: Get this from tablegen.
1435static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
1436 CallingConv::ID CallConv,
1437 const X86Subtarget &Subtarget) {
1438 assert(Subtarget.is64Bit());
1439 if (Subtarget.isCallingConvWin64(CC: CallConv)) {
1440 // The XMM registers which might contain var arg parameters are shadowed
1441 // in their paired GPR. So we only need to save the GPR to their home
1442 // slots.
1443 // TODO: __vectorcall will change this.
1444 return {};
1445 }
1446
1447 bool isSoftFloat = Subtarget.useSoftFloat();
1448 if (isSoftFloat || !Subtarget.hasSSE1())
1449 // Kernel mode asks for SSE to be disabled, so there are no XMM argument
1450 // registers.
1451 return {};
1452
1453 static const MCPhysReg XMMArgRegs64Bit[] = {
1454 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
1455 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
1456 };
1457 return XMMArgRegs64Bit;
1458}
1459
1460#ifndef NDEBUG
1461static bool isSortedByValueNo(ArrayRef<CCValAssign> ArgLocs) {
1462 return llvm::is_sorted(
1463 ArgLocs, [](const CCValAssign &A, const CCValAssign &B) -> bool {
1464 return A.getValNo() < B.getValNo();
1465 });
1466}
1467#endif
1468
namespace {
/// This is a helper class for lowering variable arguments parameters.
/// It bundles the per-function state the vararg lowering steps need, so the
/// individual helpers do not carry long parameter lists.
class VarArgsLoweringHelper {
public:
  VarArgsLoweringHelper(X86MachineFunctionInfo *FuncInfo, const SDLoc &Loc,
                        SelectionDAG &DAG, const X86Subtarget &Subtarget,
                        CallingConv::ID CallConv, CCState &CCInfo)
      : FuncInfo(FuncInfo), DL(Loc), DAG(DAG), Subtarget(Subtarget),
        TheMachineFunction(DAG.getMachineFunction()),
        TheFunction(TheMachineFunction.getFunction()),
        FrameInfo(TheMachineFunction.getFrameInfo()),
        FrameLowering(*Subtarget.getFrameLowering()),
        TargLowering(DAG.getTargetLoweringInfo()), CallConv(CallConv),
        CCInfo(CCInfo) {}

  // Lower variable arguments parameters.
  void lowerVarArgsParameters(SDValue &Chain, unsigned StackSize);

private:
  // Create the vararg frame index (and, on 64-bit targets, the register save
  // area) and spill the unallocated argument registers into it.
  void createVarArgAreaAndStoreRegisters(SDValue &Chain, unsigned StackSize);

  // Forward must-tail parameters; see the out-of-line definition.
  void forwardMustTailParameters(SDValue &Chain);

  bool is64Bit() const { return Subtarget.is64Bit(); }
  bool isWin64() const { return Subtarget.isCallingConvWin64(CC: CallConv); }

  X86MachineFunctionInfo *FuncInfo; // Per-function X86 lowering state.
  const SDLoc &DL;                  // Debug location for created nodes.
  SelectionDAG &DAG;
  const X86Subtarget &Subtarget;
  MachineFunction &TheMachineFunction;
  const Function &TheFunction;
  MachineFrameInfo &FrameInfo;
  const TargetFrameLowering &FrameLowering;
  const TargetLowering &TargLowering;
  CallingConv::ID CallConv;
  CCState &CCInfo;
};
} // namespace
1508
/// Create the vararg frame objects and, on 64-bit targets, spill the
/// still-unallocated GPR and XMM argument registers into the register save
/// area so va_arg can find them. Updates \p Chain with the emitted stores.
void VarArgsLoweringHelper::createVarArgAreaAndStoreRegisters(
    SDValue &Chain, unsigned StackSize) {
  // If the function takes variable number of arguments, make a frame index for
  // the start of the first vararg value... for expansion of llvm.va_start. We
  // can skip this if there are no va_start calls.
  // (On 32-bit fastcall/thiscall the frame index is set up elsewhere.)
  if (is64Bit() || (CallConv != CallingConv::X86_FastCall &&
                    CallConv != CallingConv::X86_ThisCall)) {
    FuncInfo->setVarArgsFrameIndex(
        FrameInfo.CreateFixedObject(Size: 1, SPOffset: StackSize, IsImmutable: true));
  }

  // 64-bit calling conventions support varargs and register parameters, so we
  // have to do extra work to spill them in the prologue.
  if (is64Bit()) {
    // Find the first unallocated argument registers.
    ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
    ArrayRef<MCPhysReg> ArgXMMs =
        get64BitArgumentXMMs(MF&: TheMachineFunction, CallConv, Subtarget);
    unsigned NumIntRegs = CCInfo.getFirstUnallocated(Regs: ArgGPRs);
    unsigned NumXMMRegs = CCInfo.getFirstUnallocated(Regs: ArgXMMs);

    assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
           "SSE register cannot be used when SSE is disabled!");

    if (isWin64()) {
      // Get to the caller-allocated home save location. Add 8 to account
      // for the return address.
      int HomeOffset = FrameLowering.getOffsetOfLocalArea() + 8;
      FuncInfo->setRegSaveFrameIndex(
          FrameInfo.CreateFixedObject(Size: 1, SPOffset: NumIntRegs * 8 + HomeOffset, IsImmutable: false));
      // Fixup to set vararg frame on shadow area (4 x i64).
      if (NumIntRegs < 4)
        FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
    } else {
      // For X86-64, if there are vararg parameters that are passed via
      // registers, then we must store them to their spots on the stack so
      // they may be loaded by dereferencing the result of va_next.
      // The save area layout is: all GPR slots (8 bytes each) first, then all
      // XMM slots (16 bytes each); the GP/FP offsets index into it.
      FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
      FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
      FuncInfo->setRegSaveFrameIndex(FrameInfo.CreateStackObject(
          Size: ArgGPRs.size() * 8 + ArgXMMs.size() * 16, Alignment: Align(16), isSpillSlot: false));
    }

    SmallVector<SDValue, 6>
        LiveGPRs; // list of SDValue for GPR registers keeping live input value
    SmallVector<SDValue, 8> LiveXMMRegs; // list of SDValue for XMM registers
                                         // keeping live input value
    SDValue ALVal; // if applicable keeps SDValue for %al register

    // Gather all the live in physical registers.
    for (MCPhysReg Reg : ArgGPRs.slice(N: NumIntRegs)) {
      Register GPR = TheMachineFunction.addLiveIn(PReg: Reg, RC: &X86::GR64RegClass);
      LiveGPRs.push_back(Elt: DAG.getCopyFromReg(Chain, dl: DL, Reg: GPR, VT: MVT::i64));
    }
    const auto &AvailableXmms = ArgXMMs.slice(N: NumXMMRegs);
    if (!AvailableXmms.empty()) {
      // %al carries the number of XMM registers actually used by the caller.
      Register AL = TheMachineFunction.addLiveIn(PReg: X86::AL, RC: &X86::GR8RegClass);
      ALVal = DAG.getCopyFromReg(Chain, dl: DL, Reg: AL, VT: MVT::i8);
      for (MCPhysReg Reg : AvailableXmms) {
        // FastRegisterAllocator spills virtual registers at basic
        // block boundary. That leads to usages of xmm registers
        // outside of check for %al. Pass physical registers to
        // VASTART_SAVE_XMM_REGS to avoid unneccessary spilling.
        TheMachineFunction.getRegInfo().addLiveIn(Reg);
        LiveXMMRegs.push_back(Elt: DAG.getRegister(Reg, VT: MVT::v4f32));
      }
    }

    // Store the integer parameter registers.
    SmallVector<SDValue, 8> MemOps;
    SDValue RSFIN =
        DAG.getFrameIndex(FI: FuncInfo->getRegSaveFrameIndex(),
                          VT: TargLowering.getPointerTy(DL: DAG.getDataLayout()));
    unsigned Offset = FuncInfo->getVarArgsGPOffset();
    // Each GPR goes to its 8-byte slot at increasing offsets in the save area.
    for (SDValue Val : LiveGPRs) {
      SDValue FIN = DAG.getNode(Opcode: ISD::ADD, DL,
                                VT: TargLowering.getPointerTy(DL: DAG.getDataLayout()),
                                N1: RSFIN, N2: DAG.getIntPtrConstant(Val: Offset, DL));
      SDValue Store =
          DAG.getStore(Chain: Val.getValue(R: 1), dl: DL, Val, Ptr: FIN,
                       PtrInfo: MachinePointerInfo::getFixedStack(
                           MF&: DAG.getMachineFunction(),
                           FI: FuncInfo->getRegSaveFrameIndex(), Offset));
      MemOps.push_back(Elt: Store);
      Offset += 8;
    }

    // Now store the XMM (fp + vector) parameter registers.
    // These are stored conditionally on %al by a single pseudo node so only
    // the registers the caller actually used get written.
    if (!LiveXMMRegs.empty()) {
      SmallVector<SDValue, 12> SaveXMMOps;
      SaveXMMOps.push_back(Elt: Chain);
      SaveXMMOps.push_back(Elt: ALVal);
      SaveXMMOps.push_back(Elt: RSFIN);
      SaveXMMOps.push_back(
          Elt: DAG.getTargetConstant(Val: FuncInfo->getVarArgsFPOffset(), DL, VT: MVT::i32));
      llvm::append_range(C&: SaveXMMOps, R&: LiveXMMRegs);
      MachineMemOperand *StoreMMO =
          DAG.getMachineFunction().getMachineMemOperand(
              PtrInfo: MachinePointerInfo::getFixedStack(
                  MF&: DAG.getMachineFunction(), FI: FuncInfo->getRegSaveFrameIndex(),
                  Offset),
              F: MachineMemOperand::MOStore, Size: 128, BaseAlignment: Align(16));
      MemOps.push_back(Elt: DAG.getMemIntrinsicNode(Opcode: X86ISD::VASTART_SAVE_XMM_REGS,
                                                dl: DL, VTList: DAG.getVTList(VT: MVT::Other),
                                                Ops: SaveXMMOps, MemVT: MVT::i8, MMO: StoreMMO));
    }

    if (!MemOps.empty())
      Chain = DAG.getNode(Opcode: ISD::TokenFactor, DL, VT: MVT::Other, Ops: MemOps);
  }
}
1620
1621void VarArgsLoweringHelper::forwardMustTailParameters(SDValue &Chain) {
1622 // Find the largest legal vector type.
1623 MVT VecVT = MVT::Other;
1624 // FIXME: Only some x86_32 calling conventions support AVX512.
1625 if (Subtarget.useAVX512Regs() &&
1626 (is64Bit() || (CallConv == CallingConv::X86_VectorCall ||
1627 CallConv == CallingConv::Intel_OCL_BI)))
1628 VecVT = MVT::v16f32;
1629 else if (Subtarget.hasAVX())
1630 VecVT = MVT::v8f32;
1631 else if (Subtarget.hasSSE2())
1632 VecVT = MVT::v4f32;
1633
1634 // We forward some GPRs and some vector types.
1635 SmallVector<MVT, 2> RegParmTypes;
1636 MVT IntVT = is64Bit() ? MVT::i64 : MVT::i32;
1637 RegParmTypes.push_back(Elt: IntVT);
1638 if (VecVT != MVT::Other)
1639 RegParmTypes.push_back(Elt: VecVT);
1640
1641 // Compute the set of forwarded registers. The rest are scratch.
1642 SmallVectorImpl<ForwardedRegister> &Forwards =
1643 FuncInfo->getForwardedMustTailRegParms();
1644 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, Fn: CC_X86);
1645
1646 // Forward AL for SysV x86_64 targets, since it is used for varargs.
1647 if (is64Bit() && !isWin64() && !CCInfo.isAllocated(Reg: X86::AL)) {
1648 Register ALVReg = TheMachineFunction.addLiveIn(PReg: X86::AL, RC: &X86::GR8RegClass);
1649 Forwards.push_back(Elt: ForwardedRegister(ALVReg, X86::AL, MVT::i8));
1650 }
1651
1652 // Copy all forwards from physical to virtual registers.
1653 for (ForwardedRegister &FR : Forwards) {
1654 // FIXME: Can we use a less constrained schedule?
1655 SDValue RegVal = DAG.getCopyFromReg(Chain, dl: DL, Reg: FR.VReg, VT: FR.VT);
1656 FR.VReg = TheMachineFunction.getRegInfo().createVirtualRegister(
1657 RegClass: TargLowering.getRegClassFor(VT: FR.VT));
1658 Chain = DAG.getCopyToReg(Chain, dl: DL, Reg: FR.VReg, N: RegVal);
1659 }
1660}
1661
1662void VarArgsLoweringHelper::lowerVarArgsParameters(SDValue &Chain,
1663 unsigned StackSize) {
1664 // Set FrameIndex to the 0xAAAAAAA value to mark unset state.
1665 // If necessary, it would be set into the correct value later.
1666 FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
1667 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
1668
1669 if (FrameInfo.hasVAStart())
1670 createVarArgAreaAndStoreRegisters(Chain, StackSize);
1671
1672 if (FrameInfo.hasMustTailInVarArgFunc())
1673 forwardMustTailParameters(Chain);
1674}
1675
/// Lower the incoming (formal) arguments of the current function into the
/// SelectionDAG: assign each argument a register or stack location, build the
/// corresponding CopyFromReg/load nodes into \p InVals, and record ABI
/// bookkeeping (callee-pop bytes, varargs save area, sret return register,
/// CSR adjustments). Returns the updated chain.
SDValue X86TargetLowering::LowerFormalArguments(
    SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
  MachineFunction &MF = DAG.getMachineFunction();
  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();

  // On Cygwin/MinGW, an externally visible "main" is always given a frame
  // pointer.
  const Function &F = MF.getFunction();
  if (F.hasExternalLinkage() && Subtarget.isTargetCygMing() &&
      F.getName() == "main")
    FuncInfo->setForceFramePointer(true);

  MachineFrameInfo &MFI = MF.getFrameInfo();
  bool Is64Bit = Subtarget.is64Bit();
  bool IsWin64 = Subtarget.isCallingConvWin64(CC: CallConv);

  // On x86_64 with x87 disabled, x86_fp80 cannot be handled: the type would
  // need to be returned/passed in x87 registers (FP0/FP1) which are
  // unavailable. Emit a clear diagnostic instead of crashing later with
  // "Cannot select: build_pair".
  if (Is64Bit && !Subtarget.hasX87()) {
    if (F.getReturnType()->isX86_FP80Ty() ||
        any_of(Range: F.args(), P: [](const Argument &Arg) {
          return Arg.getType()->isX86_FP80Ty();
        }))
      reportFatalUsageError(
          reason: "cannot use x86_fp80 type with x87 disabled on x86_64 target");
  }

  assert(
      !(IsVarArg && canGuaranteeTCO(CallConv)) &&
      "Var args not supported with calling conv' regcall, fastcc, ghc or hipe");

  // Assign locations to all of the incoming arguments.
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());

  // Allocate shadow area for Win64.
  if (IsWin64)
    CCInfo.AllocateStack(Size: 32, Alignment: Align(8));

  CCInfo.AnalyzeArguments(Ins, Fn: CC_X86);

  // In vectorcall calling convention a second pass is required for the HVA
  // types.
  if (CallingConv::X86_VectorCall == CallConv) {
    CCInfo.AnalyzeArgumentsSecondPass(Args: Ins, Fn: CC_X86);
  }

  // The next loop assumes that the locations are in the same order of the
  // input arguments.
  assert(isSortedByValueNo(ArgLocs) &&
         "Argument Location list must be sorted before lowering");

  SDValue ArgValue;
  for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E;
       ++I, ++InsIndex) {
    assert(InsIndex < Ins.size() && "Invalid Ins index");
    CCValAssign &VA = ArgLocs[I];

    if (VA.isRegLoc()) {
      EVT RegVT = VA.getLocVT();
      if (VA.needsCustom()) {
        assert(
            VA.getValVT() == MVT::v64i1 &&
            "Currently the only custom case is when we split v64i1 to 2 regs");

        // v64i1 values, in regcall calling convention, that are
        // compiled to 32 bit arch, are split up into two registers.
        // NOTE: consumes the next location too (++I).
        ArgValue =
            getv64i1Argument(VA, NextVA&: ArgLocs[++I], Root&: Chain, DAG, DL: dl, Subtarget);
      } else {
        // Pick the register class matching the (possibly promoted) location
        // type so the physical register can be added as a function live-in.
        const TargetRegisterClass *RC;
        if (RegVT == MVT::i8)
          RC = &X86::GR8RegClass;
        else if (RegVT == MVT::i16)
          RC = &X86::GR16RegClass;
        else if (RegVT == MVT::i32)
          RC = &X86::GR32RegClass;
        else if (Is64Bit && RegVT == MVT::i64)
          RC = &X86::GR64RegClass;
        else if (RegVT == MVT::f16)
          RC = Subtarget.hasAVX512() ? &X86::FR16XRegClass : &X86::FR16RegClass;
        else if (RegVT == MVT::f32)
          RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
        else if (RegVT == MVT::f64)
          RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass;
        else if (RegVT == MVT::f80)
          RC = &X86::RFP80RegClass;
        else if (RegVT == MVT::f128)
          RC = &X86::VR128RegClass;
        else if (RegVT.is512BitVector())
          RC = &X86::VR512RegClass;
        else if (RegVT.is256BitVector())
          RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass;
        else if (RegVT.is128BitVector())
          RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass;
        else if (RegVT == MVT::x86mmx)
          RC = &X86::VR64RegClass;
        else if (RegVT == MVT::v1i1)
          RC = &X86::VK1RegClass;
        else if (RegVT == MVT::v8i1)
          RC = &X86::VK8RegClass;
        else if (RegVT == MVT::v16i1)
          RC = &X86::VK16RegClass;
        else if (RegVT == MVT::v32i1)
          RC = &X86::VK32RegClass;
        else if (RegVT == MVT::v64i1)
          RC = &X86::VK64RegClass;
        else
          llvm_unreachable("Unknown argument type!");

        Register Reg = MF.addLiveIn(PReg: VA.getLocReg(), RC);
        ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, VT: RegVT);
      }

      // If this is an 8 or 16-bit value, it is really passed promoted to 32
      // bits. Insert an assert[sz]ext to capture this, then truncate to the
      // right size.
      if (VA.getLocInfo() == CCValAssign::SExt)
        ArgValue = DAG.getNode(Opcode: ISD::AssertSext, DL: dl, VT: RegVT, N1: ArgValue,
                               N2: DAG.getValueType(VA.getValVT()));
      else if (VA.getLocInfo() == CCValAssign::ZExt)
        ArgValue = DAG.getNode(Opcode: ISD::AssertZext, DL: dl, VT: RegVT, N1: ArgValue,
                               N2: DAG.getValueType(VA.getValVT()));
      else if (VA.getLocInfo() == CCValAssign::BCvt)
        ArgValue = DAG.getBitcast(VT: VA.getValVT(), V: ArgValue);

      if (VA.isExtInLoc()) {
        // Handle MMX values passed in XMM regs.
        if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1)
          ArgValue = DAG.getNode(Opcode: X86ISD::MOVDQ2Q, DL: dl, VT: VA.getValVT(), Operand: ArgValue);
        else if (VA.getValVT().isVector() &&
                 VA.getValVT().getScalarType() == MVT::i1 &&
                 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
                  (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
          // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
          ArgValue = lowerRegToMasks(ValArg: ArgValue, ValVT: VA.getValVT(), ValLoc: RegVT, DL: dl, DAG);
        } else
          ArgValue = DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: VA.getValVT(), Operand: ArgValue);
      }
    } else {
      assert(VA.isMemLoc());
      ArgValue =
          LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i: InsIndex);
    }

    // If value is passed via pointer - do a load.
    if (VA.getLocInfo() == CCValAssign::Indirect &&
        !(Ins[I].Flags.isByVal() && VA.isRegLoc())) {
      ArgValue =
          DAG.getLoad(VT: VA.getValVT(), dl, Chain, Ptr: ArgValue, PtrInfo: MachinePointerInfo());
    }

    InVals.push_back(Elt: ArgValue);
  }

  // Second pass over the formal arguments for attribute-driven bookkeeping:
  // the swiftasync context slot and the sret return register.
  for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
    if (Ins[I].Flags.isSwiftAsync()) {
      auto X86FI = MF.getInfo<X86MachineFunctionInfo>();
      if (X86::isExtendedSwiftAsyncFrameSupported(Subtarget, MF))
        X86FI->setHasSwiftAsyncContext(true);
      else {
        // No extended frame support: spill the context into a dedicated
        // pointer-sized stack slot instead.
        int PtrSize = Subtarget.is64Bit() ? 8 : 4;
        int FI =
            MF.getFrameInfo().CreateStackObject(Size: PtrSize, Alignment: Align(PtrSize), isSpillSlot: false);
        X86FI->setSwiftAsyncContextFrameIdx(FI);
        SDValue St = DAG.getStore(
            Chain: DAG.getEntryNode(), dl, Val: InVals[I],
            Ptr: DAG.getFrameIndex(FI, VT: PtrSize == 8 ? MVT::i64 : MVT::i32),
            PtrInfo: MachinePointerInfo::getFixedStack(MF, FI));
        Chain = DAG.getNode(Opcode: ISD::TokenFactor, DL: dl, VT: MVT::Other, N1: St, N2: Chain);
      }
    }

    // Swift calling convention does not require we copy the sret argument
    // into %rax/%eax for the return. We don't set SRetReturnReg for Swift.
    if (CallConv == CallingConv::Swift || CallConv == CallingConv::SwiftTail)
      continue;

    // All x86 ABIs require that for returning structs by value we copy the
    // sret argument into %rax/%eax (depending on ABI) for the return. Save
    // the argument into a virtual register so that we can access it from the
    // return points.
    if (Ins[I].Flags.isSRet()) {
      assert(!FuncInfo->getSRetReturnReg() &&
             "SRet return has already been set");
      MVT PtrTy = getPointerTy(DL: DAG.getDataLayout());
      Register Reg =
          MF.getRegInfo().createVirtualRegister(RegClass: getRegClassFor(VT: PtrTy));
      FuncInfo->setSRetReturnReg(Reg);
      SDValue Copy = DAG.getCopyToReg(Chain: DAG.getEntryNode(), dl, Reg, N: InVals[I]);
      Chain = DAG.getNode(Opcode: ISD::TokenFactor, DL: dl, VT: MVT::Other, N1: Copy, N2: Chain);
      break;
    }
  }

  // Stack space consumed by the incoming arguments.
  unsigned StackSize = CCInfo.getStackSize();
  // Align stack specially for tail calls.
  if (shouldGuaranteeTCO(CC: CallConv,
                         GuaranteedTailCallOpt: MF.getTarget().Options.GuaranteedTailCallOpt))
    StackSize = GetAlignedArgumentStackSize(StackSize, DAG);

  if (IsVarArg)
    VarArgsLoweringHelper(FuncInfo, dl, DAG, Subtarget, CallConv, CCInfo)
        .lowerVarArgsParameters(Chain, StackSize);

  // Some CCs need callee pop.
  if (X86::isCalleePop(CallingConv: CallConv, is64Bit: Is64Bit, IsVarArg,
                       GuaranteeTCO: MF.getTarget().Options.GuaranteedTailCallOpt)) {
    FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
  } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
    // X86 interrupts must pop the error code (and the alignment padding) if
    // present.
    FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4);
  } else {
    FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
    // If this is an sret function, the return should pop the hidden pointer.
    if (hasCalleePopSRet(Args: Ins, ArgLocs, Subtarget))
      FuncInfo->setBytesToPopOnReturn(4);
  }

  if (!Is64Bit) {
    // RegSaveFrameIndex is X86-64 only.
    FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
  }

  FuncInfo->setArgumentStackSize(StackSize);

  if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {
    EHPersonality Personality = classifyEHPersonality(Pers: F.getPersonalityFn());
    if (Personality == EHPersonality::CoreCLR) {
      assert(Is64Bit);
      // TODO: Add a mechanism to frame lowering that will allow us to indicate
      // that we'd prefer this slot be allocated towards the bottom of the frame
      // (i.e. near the stack pointer after allocating the frame). Every
      // funclet needs a copy of this slot in its (mostly empty) frame, and the
      // offset from the bottom of this and each funclet's frame must be the
      // same, so the size of funclets' (mostly empty) frames is dictated by
      // how far this slot is from the bottom (since they allocate just enough
      // space to accommodate holding this slot at the correct offset).
      int PSPSymFI = MFI.CreateStackObject(Size: 8, Alignment: Align(8), /*isSpillSlot=*/false);
      EHInfo->PSPSymFrameIdx = PSPSymFI;
    }
  }

  // For CCs/attributes that dynamically shrink the callee-saved set, mark
  // every argument live-in register as not callee-saved.
  if (shouldDisableArgRegFromCSR(CC: CallConv) ||
      F.hasFnAttribute(Kind: "no_caller_saved_registers")) {
    MachineRegisterInfo &MRI = MF.getRegInfo();
    for (std::pair<MCRegister, Register> Pair : MRI.liveins())
      MRI.disableCalleeSavedRegister(Reg: Pair.first);
  }

  // preserve_none cannot honor Swift's special argument registers; diagnose.
  if (CallingConv::PreserveNone == CallConv)
    for (const ISD::InputArg &In : Ins) {
      if (In.Flags.isSwiftSelf() || In.Flags.isSwiftAsync() ||
          In.Flags.isSwiftError()) {
        errorUnsupported(DAG, dl,
                         Msg: "Swift attributes can't be used with preserve_none");
        break;
      }
    }

  return Chain;
}
1941
1942SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
1943 SDValue Arg, const SDLoc &dl,
1944 SelectionDAG &DAG,
1945 const CCValAssign &VA,
1946 ISD::ArgFlagsTy Flags,
1947 bool isByVal) const {
1948 unsigned LocMemOffset = VA.getLocMemOffset();
1949 SDValue PtrOff = DAG.getIntPtrConstant(Val: LocMemOffset, DL: dl);
1950 PtrOff = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: getPointerTy(DL: DAG.getDataLayout()),
1951 N1: StackPtr, N2: PtrOff);
1952 if (isByVal)
1953 return CreateCopyOfByValArgument(Src: Arg, Dst: PtrOff, Chain, Flags, DAG, dl);
1954
1955 MaybeAlign Alignment;
1956 if (Subtarget.isTargetWindowsMSVC() && !Subtarget.is64Bit() &&
1957 Arg.getSimpleValueType() != MVT::f80)
1958 Alignment = MaybeAlign(4);
1959 return DAG.getStore(
1960 Chain, dl, Val: Arg, Ptr: PtrOff,
1961 PtrInfo: MachinePointerInfo::getStack(MF&: DAG.getMachineFunction(), Offset: LocMemOffset),
1962 Alignment);
1963}
1964
1965/// Emit a load of return address if tail call
1966/// optimization is performed and it is required.
1967SDValue X86TargetLowering::EmitTailCallLoadRetAddr(
1968 SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall,
1969 bool Is64Bit, int FPDiff, const SDLoc &dl) const {
1970 // Adjust the Return address stack slot.
1971 EVT VT = getPointerTy(DL: DAG.getDataLayout());
1972 OutRetAddr = getReturnAddressFrameIndex(DAG);
1973
1974 // Load the "old" Return address.
1975 OutRetAddr = DAG.getLoad(VT, dl, Chain, Ptr: OutRetAddr, PtrInfo: MachinePointerInfo());
1976 return SDValue(OutRetAddr.getNode(), 1);
1977}
1978
1979/// Emit a store of the return address if tail call
1980/// optimization is performed and it is required (FPDiff!=0).
1981static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
1982 SDValue Chain, SDValue RetAddrFrIdx,
1983 EVT PtrVT, unsigned SlotSize,
1984 int FPDiff, const SDLoc &dl) {
1985 // Store the return address to the appropriate stack slot.
1986 if (!FPDiff) return Chain;
1987 // Calculate the new stack slot for the return address.
1988 int NewReturnAddrFI =
1989 MF.getFrameInfo().CreateFixedObject(Size: SlotSize, SPOffset: (int64_t)FPDiff - SlotSize,
1990 IsImmutable: false);
1991 SDValue NewRetAddrFrIdx = DAG.getFrameIndex(FI: NewReturnAddrFI, VT: PtrVT);
1992 Chain = DAG.getStore(Chain, dl, Val: RetAddrFrIdx, Ptr: NewRetAddrFrIdx,
1993 PtrInfo: MachinePointerInfo::getFixedStack(
1994 MF&: DAG.getMachineFunction(), FI: NewReturnAddrFI));
1995 return Chain;
1996}
1997
1998/// Returns a vector_shuffle mask for an movs{s|d}, movd
1999/// operation of specified width.
2000SDValue X86TargetLowering::getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
2001 SDValue V1, SDValue V2) const {
2002 unsigned NumElems = VT.getVectorNumElements();
2003 SmallVector<int, 8> Mask;
2004 Mask.push_back(Elt: NumElems);
2005 for (unsigned i = 1; i != NumElems; ++i)
2006 Mask.push_back(Elt: i);
2007 return DAG.getVectorShuffle(VT, dl, N1: V1, N2: V2, Mask);
2008}
2009
2010// Returns the type of copying which is required to set up a byval argument to
2011// a tail-called function. This isn't needed for non-tail calls, because they
2012// always need the equivalent of CopyOnce, but tail-calls sometimes need two to
2013// avoid clobbering another argument (CopyViaTemp), and sometimes can be
2014// optimised to zero copies when forwarding an argument from the caller's
2015// caller (NoCopy).
2016X86TargetLowering::ByValCopyKind X86TargetLowering::ByValNeedsCopyForTailCall(
2017 SelectionDAG &DAG, SDValue Src, SDValue Dst, ISD::ArgFlagsTy Flags) const {
2018 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
2019
2020 // Globals are always safe to copy from.
2021 if (isa<GlobalAddressSDNode>(Val: Src) || isa<ExternalSymbolSDNode>(Val: Src))
2022 return CopyOnce;
2023
2024 // Can only analyse frame index nodes, conservatively assume we need a
2025 // temporary.
2026 auto *SrcFrameIdxNode = dyn_cast<FrameIndexSDNode>(Val&: Src);
2027 auto *DstFrameIdxNode = dyn_cast<FrameIndexSDNode>(Val&: Dst);
2028 if (!SrcFrameIdxNode || !DstFrameIdxNode)
2029 return CopyViaTemp;
2030
2031 int SrcFI = SrcFrameIdxNode->getIndex();
2032 int DstFI = DstFrameIdxNode->getIndex();
2033 assert(MFI.isFixedObjectIndex(DstFI) &&
2034 "byval passed in non-fixed stack slot");
2035
2036 int64_t SrcOffset = MFI.getObjectOffset(ObjectIdx: SrcFI);
2037 int64_t DstOffset = MFI.getObjectOffset(ObjectIdx: DstFI);
2038
2039 // If the source is in the local frame, then the copy to the argument
2040 // memory is always valid.
2041 bool FixedSrc = MFI.isFixedObjectIndex(ObjectIdx: SrcFI);
2042 if (!FixedSrc || (FixedSrc && SrcOffset < 0))
2043 return CopyOnce;
2044
2045 // If the value is already in the correct location, then no copying is
2046 // needed. If not, then we need to copy via a temporary.
2047 if (SrcOffset == DstOffset)
2048 return NoCopy;
2049 else
2050 return CopyViaTemp;
2051}
2052
2053SDValue
2054X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
2055 SmallVectorImpl<SDValue> &InVals) const {
2056 SelectionDAG &DAG = CLI.DAG;
2057 SDLoc &dl = CLI.DL;
2058 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
2059 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
2060 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
2061 SDValue Chain = CLI.Chain;
2062 SDValue Callee = CLI.Callee;
2063 CallingConv::ID CallConv = CLI.CallConv;
2064 bool &isTailCall = CLI.IsTailCall;
2065 bool isVarArg = CLI.IsVarArg;
2066 const auto *CB = CLI.CB;
2067
2068 MachineFunction &MF = DAG.getMachineFunction();
2069 bool Is64Bit = Subtarget.is64Bit();
2070 bool IsWin64 = Subtarget.isCallingConvWin64(CC: CallConv);
2071 bool ShouldGuaranteeTCO = shouldGuaranteeTCO(
2072 CC: CallConv, GuaranteedTailCallOpt: MF.getTarget().Options.GuaranteedTailCallOpt);
2073 X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
2074 bool HasNCSR = (CB && isa<CallInst>(Val: CB) &&
2075 CB->hasFnAttr(Kind: "no_caller_saved_registers"));
2076 bool IsIndirectCall = (CB && isa<CallInst>(Val: CB) && CB->isIndirectCall());
2077 bool IsCFICall = IsIndirectCall && CLI.CFIType;
2078 const Module *M = MF.getFunction().getParent();
2079
2080 // If the indirect call target has the nocf_check attribute, the call needs
2081 // the NOTRACK prefix. For simplicity just disable tail calls as there are
2082 // so many variants.
2083 // FIXME: This will cause backend errors if the user forces the issue.
2084 bool IsNoTrackIndirectCall = IsIndirectCall && CB->doesNoCfCheck() &&
2085 M->getModuleFlag(Key: "cf-protection-branch");
2086 if (IsNoTrackIndirectCall)
2087 isTailCall = false;
2088
2089 MachineFunction::CallSiteInfo CSInfo;
2090 if (CallConv == CallingConv::X86_INTR)
2091 report_fatal_error(reason: "X86 interrupts may not be called directly");
2092
2093 // Set type id for call site info.
2094 setTypeIdForCallsiteInfo(CB, MF, CSInfo);
2095
2096 if (IsIndirectCall && !IsWin64 &&
2097 M->getModuleFlag(Key: "import-call-optimization"))
2098 errorUnsupported(DAG, dl,
2099 Msg: "Indirect calls must have a normal calling convention if "
2100 "Import Call Optimization is enabled");
2101
2102 // Analyze operands of the call, assigning locations to each operand.
2103 SmallVector<CCValAssign, 16> ArgLocs;
2104 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
2105
2106 // Allocate shadow area for Win64.
2107 if (IsWin64)
2108 CCInfo.AllocateStack(Size: 32, Alignment: Align(8));
2109
2110 CCInfo.AnalyzeArguments(Outs, Fn: CC_X86);
2111
2112 // In vectorcall calling convention a second pass is required for the HVA
2113 // types.
2114 if (CallingConv::X86_VectorCall == CallConv) {
2115 CCInfo.AnalyzeArgumentsSecondPass(Args: Outs, Fn: CC_X86);
2116 }
2117
2118 bool IsMustTail = CLI.CB && CLI.CB->isMustTailCall();
2119 bool IsSibcall = false;
2120 if (isTailCall && ShouldGuaranteeTCO) {
2121 // If we need to guarantee TCO for a non-musttail call, we just need to make
2122 // sure the conventions match. If a tail call uses one of the supported TCO
2123 // conventions and the caller and callee match, we can tail call any
2124 // function prototype.
2125 CallingConv::ID CallerCC = MF.getFunction().getCallingConv();
2126 isTailCall = (CallConv == CallerCC);
2127 IsSibcall = IsMustTail;
2128 } else if (isTailCall) {
2129 // Check if this tail call is a "sibling" call, which is loosely defined to
2130 // be a tail call that doesn't require heroics like moving the return
2131 // address or swapping byval arguments. We treat some musttail calls as
2132 // sibling calls to avoid unnecessary argument copies.
2133 IsSibcall = isEligibleForSiblingCallOpt(CLI, CCInfo, ArgLocs);
2134 isTailCall = IsSibcall || IsMustTail;
2135 }
2136
2137 if (isTailCall)
2138 ++NumTailCalls;
2139
2140 if (IsMustTail && !isTailCall)
2141 report_fatal_error(reason: "failed to perform tail call elimination on a call "
2142 "site marked musttail");
2143
2144 assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
2145 "Var args not supported with calling convention fastcc, ghc or hipe");
2146
2147 // Get a count of how many bytes are to be pushed on the stack.
2148 unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
2149 if (IsSibcall)
2150 // This is a sibcall. The memory operands are available in caller's
2151 // own caller's stack.
2152 NumBytes = 0;
2153 else if (ShouldGuaranteeTCO && canGuaranteeTCO(CC: CallConv))
2154 NumBytes = GetAlignedArgumentStackSize(StackSize: NumBytes, DAG);
2155
2156 // A sibcall is ABI-compatible and does not need to adjust the stack pointer.
2157 int FPDiff = 0;
2158 if (isTailCall && ShouldGuaranteeTCO && !IsSibcall) {
2159 // Lower arguments at fp - stackoffset + fpdiff.
2160 unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
2161
2162 FPDiff = NumBytesCallerPushed - NumBytes;
2163
2164 // Set the delta of movement of the returnaddr stackslot.
2165 // But only set if delta is greater than previous delta.
2166 if (FPDiff < X86Info->getTCReturnAddrDelta())
2167 X86Info->setTCReturnAddrDelta(FPDiff);
2168 }
2169
2170 unsigned NumBytesToPush = NumBytes;
2171 unsigned NumBytesToPop = NumBytes;
2172
2173 SDValue StackPtr;
2174 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
2175
2176 // If we are doing a tail-call, any byval arguments will be written to stack
2177 // space which was used for incoming arguments. If any the values being used
2178 // are incoming byval arguments to this function, then they might be
2179 // overwritten by the stores of the outgoing arguments. To avoid this, we
2180 // need to make a temporary copy of them in local stack space, then copy back
2181 // to the argument area.
2182 // FIXME: There's potential to improve the code by using virtual registers for
2183 // temporary storage, and letting the register allocator spill if needed.
2184 SmallVector<SDValue, 8> ByValTemporaries;
2185 SDValue ByValTempChain;
2186 if (isTailCall) {
2187 // Use null SDValue to mean "no temporary recorded for this arg index".
2188 ByValTemporaries.assign(NumElts: OutVals.size(), Elt: SDValue());
2189
2190 SmallVector<SDValue, 8> ByValCopyChains;
2191 for (const CCValAssign &VA : ArgLocs) {
2192 unsigned ArgIdx = VA.getValNo();
2193 SDValue Src = OutVals[ArgIdx];
2194 ISD::ArgFlagsTy Flags = Outs[ArgIdx].Flags;
2195
2196 if (!Flags.isByVal())
2197 continue;
2198
2199 auto PtrVT = getPointerTy(DL: DAG.getDataLayout());
2200
2201 if (!StackPtr.getNode())
2202 StackPtr =
2203 DAG.getCopyFromReg(Chain, dl, Reg: RegInfo->getStackRegister(), VT: PtrVT);
2204
2205 // Destination: where this byval should live in the callee’s frame
2206 // after the tail call.
2207 int64_t Offset = VA.getLocMemOffset() + FPDiff;
2208 uint64_t Size = VA.getLocVT().getFixedSizeInBits() / 8;
2209 int FI = MF.getFrameInfo().CreateFixedObject(Size, SPOffset: Offset,
2210 /*IsImmutable=*/true);
2211 SDValue Dst = DAG.getFrameIndex(FI, VT: PtrVT);
2212
2213 ByValCopyKind Copy = ByValNeedsCopyForTailCall(DAG, Src, Dst, Flags);
2214
2215 if (Copy == NoCopy) {
2216 // If the argument is already at the correct offset on the stack
2217 // (because we are forwarding a byval argument from our caller), we
2218 // don't need any copying.
2219 continue;
2220 } else if (Copy == CopyOnce) {
2221 // If the argument is in our local stack frame, no other argument
2222 // preparation can clobber it, so we can copy it to the final location
2223 // later.
2224 ByValTemporaries[ArgIdx] = Src;
2225 } else {
2226 assert(Copy == CopyViaTemp && "unexpected enum value");
2227 // If we might be copying this argument from the outgoing argument
2228 // stack area, we need to copy via a temporary in the local stack
2229 // frame.
2230 MachineFrameInfo &MFI = MF.getFrameInfo();
2231 int TempFrameIdx = MFI.CreateStackObject(Size: Flags.getByValSize(),
2232 Alignment: Flags.getNonZeroByValAlign(),
2233 /*isSS=*/isSpillSlot: false);
2234 SDValue Temp =
2235 DAG.getFrameIndex(FI: TempFrameIdx, VT: getPointerTy(DL: DAG.getDataLayout()));
2236
2237 SDValue CopyChain =
2238 CreateCopyOfByValArgument(Src, Dst: Temp, Chain, Flags, DAG, dl);
2239 ByValCopyChains.push_back(Elt: CopyChain);
2240 }
2241 }
2242 if (!ByValCopyChains.empty())
2243 ByValTempChain =
2244 DAG.getNode(Opcode: ISD::TokenFactor, DL: dl, VT: MVT::Other, Ops: ByValCopyChains);
2245 }
2246
2247 // If we have an inalloca argument, all stack space has already been allocated
2248 // for us and be right at the top of the stack. We don't support multiple
2249 // arguments passed in memory when using inalloca.
2250 if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
2251 NumBytesToPush = 0;
2252 if (!ArgLocs.back().isMemLoc())
2253 report_fatal_error(reason: "cannot use inalloca attribute on a register "
2254 "parameter");
2255 if (ArgLocs.back().getLocMemOffset() != 0)
2256 report_fatal_error(reason: "any parameter with the inalloca attribute must be "
2257 "the only memory argument");
2258 } else if (CLI.IsPreallocated) {
2259 assert(ArgLocs.back().isMemLoc() &&
2260 "cannot use preallocated attribute on a register "
2261 "parameter");
2262 SmallVector<size_t, 4> PreallocatedOffsets;
2263 for (size_t i = 0; i < CLI.OutVals.size(); ++i) {
2264 if (CLI.CB->paramHasAttr(ArgNo: i, Kind: Attribute::Preallocated)) {
2265 PreallocatedOffsets.push_back(Elt: ArgLocs[i].getLocMemOffset());
2266 }
2267 }
2268 auto *MFI = DAG.getMachineFunction().getInfo<X86MachineFunctionInfo>();
2269 size_t PreallocatedId = MFI->getPreallocatedIdForCallSite(CS: CLI.CB);
2270 MFI->setPreallocatedStackSize(Id: PreallocatedId, StackSize: NumBytes);
2271 MFI->setPreallocatedArgOffsets(Id: PreallocatedId, AO: PreallocatedOffsets);
2272 NumBytesToPush = 0;
2273 }
2274
2275 if (!IsSibcall && !IsMustTail)
2276 Chain = DAG.getCALLSEQ_START(Chain, InSize: NumBytesToPush,
2277 OutSize: NumBytes - NumBytesToPush, DL: dl);
2278
2279 SDValue RetAddrFrIdx;
2280 // Load return address for tail calls.
2281 if (isTailCall && FPDiff)
2282 Chain = EmitTailCallLoadRetAddr(DAG, OutRetAddr&: RetAddrFrIdx, Chain, IsTailCall: isTailCall,
2283 Is64Bit, FPDiff, dl);
2284
2285 SmallVector<std::pair<Register, SDValue>, 8> RegsToPass;
2286 SmallVector<SDValue, 8> MemOpChains;
2287
2288 // The next loop assumes that the locations are in the same order of the
2289 // input arguments.
2290 assert(isSortedByValueNo(ArgLocs) &&
2291 "Argument Location list must be sorted before lowering");
2292
2293 // Walk the register/memloc assignments, inserting copies/loads. In the case
2294 // of tail call optimization arguments are handle later.
2295 for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E;
2296 ++I, ++OutIndex) {
2297 assert(OutIndex < Outs.size() && "Invalid Out index");
2298 // Skip inalloca/preallocated arguments, they have already been written.
2299 ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags;
2300 if (Flags.isInAlloca() || Flags.isPreallocated())
2301 continue;
2302
2303 CCValAssign &VA = ArgLocs[I];
2304 EVT RegVT = VA.getLocVT();
2305 SDValue Arg = OutVals[OutIndex];
2306 bool isByVal = Flags.isByVal();
2307
2308 // Promote the value if needed.
2309 switch (VA.getLocInfo()) {
2310 default: llvm_unreachable("Unknown loc info!");
2311 case CCValAssign::Full: break;
2312 case CCValAssign::SExt:
2313 Arg = DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL: dl, VT: RegVT, Operand: Arg);
2314 break;
2315 case CCValAssign::ZExt:
2316 Arg = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: dl, VT: RegVT, Operand: Arg);
2317 break;
2318 case CCValAssign::AExt:
2319 if (Arg.getValueType().isVector() &&
2320 Arg.getValueType().getVectorElementType() == MVT::i1)
2321 Arg = lowerMasksToReg(ValArg: Arg, ValLoc: RegVT, DL: dl, DAG);
2322 else if (RegVT.is128BitVector()) {
2323 // Special case: passing MMX values in XMM registers.
2324 Arg = DAG.getBitcast(VT: MVT::i64, V: Arg);
2325 Arg = DAG.getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL: dl, VT: MVT::v2i64, Operand: Arg);
2326 Arg = getMOVL(DAG, dl, VT: MVT::v2i64, V1: DAG.getUNDEF(VT: MVT::v2i64), V2: Arg);
2327 } else
2328 Arg = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: dl, VT: RegVT, Operand: Arg);
2329 break;
2330 case CCValAssign::BCvt:
2331 Arg = DAG.getBitcast(VT: RegVT, V: Arg);
2332 break;
2333 case CCValAssign::Indirect: {
2334 if (isByVal) {
2335 // Memcpy the argument to a temporary stack slot to prevent
2336 // the caller from seeing any modifications the callee may make
2337 // as guaranteed by the `byval` attribute.
2338 int FrameIdx = MF.getFrameInfo().CreateStackObject(
2339 Size: Flags.getByValSize(),
2340 Alignment: std::max(a: Align(16), b: Flags.getNonZeroByValAlign()), isSpillSlot: false);
2341 SDValue StackSlot =
2342 DAG.getFrameIndex(FI: FrameIdx, VT: getPointerTy(DL: DAG.getDataLayout()));
2343 Chain =
2344 CreateCopyOfByValArgument(Src: Arg, Dst: StackSlot, Chain, Flags, DAG, dl);
2345 // From now on treat this as a regular pointer
2346 Arg = StackSlot;
2347 isByVal = false;
2348 } else {
2349 // Store the argument.
2350 SDValue SpillSlot = DAG.CreateStackTemporary(VT: VA.getValVT());
2351 int FI = cast<FrameIndexSDNode>(Val&: SpillSlot)->getIndex();
2352 Chain = DAG.getStore(
2353 Chain, dl, Val: Arg, Ptr: SpillSlot,
2354 PtrInfo: MachinePointerInfo::getFixedStack(MF&: DAG.getMachineFunction(), FI));
2355 Arg = SpillSlot;
2356 }
2357 break;
2358 }
2359 }
2360
2361 if (VA.needsCustom()) {
2362 assert(VA.getValVT() == MVT::v64i1 &&
2363 "Currently the only custom case is when we split v64i1 to 2 regs");
2364 // Split v64i1 value into two registers
2365 Passv64i1ArgInRegs(DL: dl, DAG, Arg, RegsToPass, VA, NextVA&: ArgLocs[++I], Subtarget);
2366 } else if (VA.isRegLoc()) {
2367 RegsToPass.push_back(Elt: std::make_pair(x: VA.getLocReg(), y&: Arg));
2368 const TargetOptions &Options = DAG.getTarget().Options;
2369 if (Options.EmitCallSiteInfo)
2370 CSInfo.ArgRegPairs.emplace_back(Args: VA.getLocReg(), Args&: I);
2371 if (isVarArg && IsWin64) {
2372 // Win64 ABI requires argument XMM reg to be copied to the corresponding
2373 // shadow reg if callee is a varargs function.
2374 Register ShadowReg;
2375 switch (VA.getLocReg()) {
2376 case X86::XMM0: ShadowReg = X86::RCX; break;
2377 case X86::XMM1: ShadowReg = X86::RDX; break;
2378 case X86::XMM2: ShadowReg = X86::R8; break;
2379 case X86::XMM3: ShadowReg = X86::R9; break;
2380 }
2381 if (ShadowReg)
2382 RegsToPass.push_back(Elt: std::make_pair(x&: ShadowReg, y&: Arg));
2383 }
2384 } else if (!IsSibcall && (!isTailCall || (isByVal && !IsMustTail))) {
2385 assert(VA.isMemLoc());
2386 if (!StackPtr.getNode())
2387 StackPtr = DAG.getCopyFromReg(Chain, dl, Reg: RegInfo->getStackRegister(),
2388 VT: getPointerTy(DL: DAG.getDataLayout()));
2389 MemOpChains.push_back(Elt: LowerMemOpCallTo(Chain, StackPtr, Arg,
2390 dl, DAG, VA, Flags, isByVal));
2391 }
2392 }
2393
2394 if (!MemOpChains.empty())
2395 Chain = DAG.getNode(Opcode: ISD::TokenFactor, DL: dl, VT: MVT::Other, Ops: MemOpChains);
2396
2397 if (Subtarget.isPICStyleGOT()) {
2398 // ELF / PIC requires GOT in the EBX register before function calls via PLT
2399 // GOT pointer (except regcall).
2400 if (!isTailCall) {
      // Indirect call with RegCall calling convention may use up all the
      // general registers, so it is not suitable to bind EBX register for
2403 // GOT address, just let register allocator handle it.
2404 if (CallConv != CallingConv::X86_RegCall)
2405 RegsToPass.push_back(Elt: std::make_pair(
2406 x: Register(X86::EBX), y: DAG.getNode(Opcode: X86ISD::GlobalBaseReg, DL: SDLoc(),
2407 VT: getPointerTy(DL: DAG.getDataLayout()))));
2408 } else {
2409 // If we are tail calling and generating PIC/GOT style code load the
2410 // address of the callee into ECX. The value in ecx is used as target of
2411 // the tail jump. This is done to circumvent the ebx/callee-saved problem
2412 // for tail calls on PIC/GOT architectures. Normally we would just put the
2413 // address of GOT into ebx and then call target@PLT. But for tail calls
2414 // ebx would be restored (since ebx is callee saved) before jumping to the
2415 // target@PLT.
2416
2417 // Note: The actual moving to ECX is done further down.
2418 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Val&: Callee);
2419 if (G && !G->getGlobal()->hasLocalLinkage() &&
2420 G->getGlobal()->hasDefaultVisibility())
2421 Callee = LowerGlobalAddress(Op: Callee, DAG);
2422 else if (isa<ExternalSymbolSDNode>(Val: Callee))
2423 Callee = LowerExternalSymbol(Op: Callee, DAG);
2424 }
2425 }
2426
2427 if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail &&
2428 (Subtarget.hasSSE1() || !M->getModuleFlag(Key: "SkipRaxSetup"))) {
2429 // From AMD64 ABI document:
2430 // For calls that may call functions that use varargs or stdargs
2431 // (prototype-less calls or calls to functions containing ellipsis (...) in
2432 // the declaration) %al is used as hidden argument to specify the number
2433 // of SSE registers used. The contents of %al do not need to match exactly
    // the number of registers, but must be an upper bound on the number of SSE
2435 // registers used and is in the range 0 - 8 inclusive.
2436
2437 // Count the number of XMM registers allocated.
2438 static const MCPhysReg XMMArgRegs[] = {
2439 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
2440 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
2441 };
2442 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(Regs: XMMArgRegs);
2443 assert((Subtarget.hasSSE1() || !NumXMMRegs)
2444 && "SSE registers cannot be used when SSE is disabled");
2445 RegsToPass.push_back(Elt: std::make_pair(x: Register(X86::AL),
2446 y: DAG.getConstant(Val: NumXMMRegs, DL: dl,
2447 VT: MVT::i8)));
2448 }
2449
2450 if (isVarArg && IsMustTail) {
2451 const auto &Forwards = X86Info->getForwardedMustTailRegParms();
2452 for (const auto &F : Forwards) {
2453 SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg: F.VReg, VT: F.VT);
2454 RegsToPass.push_back(Elt: std::make_pair(x: F.PReg, y&: Val));
2455 }
2456 }
2457
2458 // For tail calls lower the arguments to the 'real' stack slots. Sibcalls
2459 // don't need this because the eligibility check rejects calls that require
2460 // shuffling arguments passed in memory.
2461 if (isTailCall && !IsSibcall) {
2462 // Force all the incoming stack arguments to be loaded from the stack
2463 // before any new outgoing arguments or the return address are stored to the
2464 // stack, because the outgoing stack slots may alias the incoming argument
2465 // stack slots, and the alias isn't otherwise explicit. This is slightly
2466 // more conservative than necessary, because it means that each store
2467 // effectively depends on every argument instead of just those arguments it
2468 // would clobber.
2469 Chain = DAG.getStackArgumentTokenFactor(Chain);
2470
2471 if (ByValTempChain)
2472 Chain =
2473 DAG.getNode(Opcode: ISD::TokenFactor, DL: dl, VT: MVT::Other, N1: Chain, N2: ByValTempChain);
2474
2475 SmallVector<SDValue, 8> MemOpChains2;
2476 SDValue FIN;
2477 int FI = 0;
2478 for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E;
2479 ++I, ++OutsIndex) {
2480 CCValAssign &VA = ArgLocs[I];
2481
2482 if (VA.isRegLoc()) {
2483 if (VA.needsCustom()) {
2484 assert((CallConv == CallingConv::X86_RegCall) &&
2485 "Expecting custom case only in regcall calling convention");
2486 // This means that we are in special case where one argument was
2487 // passed through two register locations - Skip the next location
2488 ++I;
2489 }
2490
2491 continue;
2492 }
2493
2494 assert(VA.isMemLoc());
2495 SDValue Arg = OutVals[OutsIndex];
2496 ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags;
2497 // Skip inalloca/preallocated arguments. They don't require any work.
2498 if (Flags.isInAlloca() || Flags.isPreallocated())
2499 continue;
2500 // Create frame index.
2501 int32_t Offset = VA.getLocMemOffset()+FPDiff;
2502 uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
2503 FI = MF.getFrameInfo().CreateFixedObject(Size: OpSize, SPOffset: Offset, IsImmutable: true);
2504 FIN = DAG.getFrameIndex(FI, VT: getPointerTy(DL: DAG.getDataLayout()));
2505
2506 if (Flags.isByVal()) {
2507 if (SDValue ByValSrc = ByValTemporaries[OutsIndex]) {
2508 auto PtrVT = getPointerTy(DL: DAG.getDataLayout());
2509 SDValue DstAddr = DAG.getFrameIndex(FI, VT: PtrVT);
2510
2511 MemOpChains2.push_back(Elt: CreateCopyOfByValArgument(
2512 Src: ByValSrc, Dst: DstAddr, Chain, Flags, DAG, dl));
2513 }
2514 } else {
2515 // Store relative to framepointer.
2516 MemOpChains2.push_back(Elt: DAG.getStore(
2517 Chain, dl, Val: Arg, Ptr: FIN,
2518 PtrInfo: MachinePointerInfo::getFixedStack(MF&: DAG.getMachineFunction(), FI)));
2519 }
2520 }
2521
2522 if (!MemOpChains2.empty())
2523 Chain = DAG.getNode(Opcode: ISD::TokenFactor, DL: dl, VT: MVT::Other, Ops: MemOpChains2);
2524
2525 // Store the return address to the appropriate stack slot.
2526 Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
2527 PtrVT: getPointerTy(DL: DAG.getDataLayout()),
2528 SlotSize: RegInfo->getSlotSize(), FPDiff, dl);
2529 }
2530
2531 // Build a sequence of copy-to-reg nodes chained together with token chain
2532 // and glue operands which copy the outgoing args into registers.
2533 SDValue InGlue;
2534 for (const auto &[Reg, N] : RegsToPass) {
2535 Chain = DAG.getCopyToReg(Chain, dl, Reg, N, Glue: InGlue);
2536 InGlue = Chain.getValue(R: 1);
2537 }
2538
2539 bool IsImpCall = false;
2540 bool IsCFGuardCall = false;
2541 if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
2542 assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
2543 // In the 64-bit large code model, we have to make all calls
2544 // through a register, since the call instruction's 32-bit
2545 // pc-relative offset may not be large enough to hold the whole
2546 // address.
2547 } else if (Callee->getOpcode() == ISD::GlobalAddress ||
2548 Callee->getOpcode() == ISD::ExternalSymbol) {
2549 // Lower direct calls to global addresses and external symbols. Setting
2550 // ForCall to true here has the effect of removing WrapperRIP when possible
2551 // to allow direct calls to be selected without first materializing the
2552 // address into a register.
2553 Callee = LowerGlobalOrExternal(Op: Callee, DAG, /*ForCall=*/true, IsImpCall: &IsImpCall);
2554 } else if (Subtarget.isTarget64BitILP32() &&
2555 Callee.getValueType() == MVT::i32) {
2556 // Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI
2557 Callee = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: dl, VT: MVT::i64, Operand: Callee);
2558 } else if (Is64Bit && CB && isCFGuardCall(CB)) {
    // We'll use a specific pseudo instruction for tail calls to control flow
2560 // guard functions to guarantee the instruction used for the call. To do
2561 // this we need to unwrap the load now and use the CFG Func GV as the
2562 // callee.
2563 IsCFGuardCall = true;
2564 auto *LoadNode = cast<LoadSDNode>(Val&: Callee);
2565 GlobalAddressSDNode *GA =
2566 cast<GlobalAddressSDNode>(Val: unwrapAddress(N: LoadNode->getBasePtr()));
2567 assert(isCFGuardFunction(GA->getGlobal()) &&
2568 "CFG Call should be to a guard function");
2569 assert(LoadNode->getOffset()->isUndef() &&
2570 "CFG Function load should not have an offset");
2571 Callee = DAG.getTargetGlobalAddress(
2572 GV: GA->getGlobal(), DL: dl, VT: GA->getValueType(ResNo: 0), offset: 0, TargetFlags: X86II::MO_NO_FLAG);
2573 }
2574
2575 SmallVector<SDValue, 8> Ops;
2576
2577 if (!IsSibcall && isTailCall && !IsMustTail) {
2578 Chain = DAG.getCALLSEQ_END(Chain, Size1: NumBytesToPop, Size2: 0, Glue: InGlue, DL: dl);
2579 InGlue = Chain.getValue(R: 1);
2580 }
2581
2582 Ops.push_back(Elt: Chain);
2583 Ops.push_back(Elt: Callee);
2584
2585 if (isTailCall)
2586 Ops.push_back(Elt: DAG.getSignedTargetConstant(Val: FPDiff, DL: dl, VT: MVT::i32));
2587
2588 // Add argument registers to the end of the list so that they are known live
2589 // into the call.
2590 for (const auto &[Reg, N] : RegsToPass)
2591 Ops.push_back(Elt: DAG.getRegister(Reg, VT: N.getValueType()));
2592
2593 // Add a register mask operand representing the call-preserved registers.
2594 const uint32_t *Mask = [&]() {
2595 auto AdaptedCC = CallConv;
2596 // If HasNCSR is asserted (attribute NoCallerSavedRegisters exists),
2597 // use X86_INTR calling convention because it has the same CSR mask
2598 // (same preserved registers).
2599 if (HasNCSR)
2600 AdaptedCC = (CallingConv::ID)CallingConv::X86_INTR;
    // If NoCalleeSavedRegisters is requested, then use GHC since it happens
2602 // to use the CSR_NoRegs_RegMask.
2603 if (CB && CB->hasFnAttr(Kind: "no_callee_saved_registers"))
2604 AdaptedCC = (CallingConv::ID)CallingConv::GHC;
2605 return RegInfo->getCallPreservedMask(MF, AdaptedCC);
2606 }();
2607 assert(Mask && "Missing call preserved mask for calling convention");
2608
2609 if (MachineOperand::clobbersPhysReg(RegMask: Mask, PhysReg: RegInfo->getFramePtr())) {
2610 X86Info->setFPClobberedByCall(true);
2611 if (CLI.CB && isa<InvokeInst>(Val: CLI.CB))
2612 X86Info->setFPClobberedByInvoke(true);
2613 }
2614 if (MachineOperand::clobbersPhysReg(RegMask: Mask, PhysReg: RegInfo->getBaseRegister())) {
2615 X86Info->setBPClobberedByCall(true);
2616 if (CLI.CB && isa<InvokeInst>(Val: CLI.CB))
2617 X86Info->setBPClobberedByInvoke(true);
2618 }
2619
2620 // If this is an invoke in a 32-bit function using a funclet-based
2621 // personality, assume the function clobbers all registers. If an exception
2622 // is thrown, the runtime will not restore CSRs.
2623 // FIXME: Model this more precisely so that we can register allocate across
2624 // the normal edge and spill and fill across the exceptional edge.
2625 if (!Is64Bit && CLI.CB && isa<InvokeInst>(Val: CLI.CB)) {
2626 const Function &CallerFn = MF.getFunction();
2627 EHPersonality Pers =
2628 CallerFn.hasPersonalityFn()
2629 ? classifyEHPersonality(Pers: CallerFn.getPersonalityFn())
2630 : EHPersonality::Unknown;
2631 if (isFuncletEHPersonality(Pers))
2632 Mask = RegInfo->getNoPreservedMask();
2633 }
2634
2635 // Define a new register mask from the existing mask.
2636 uint32_t *RegMask = nullptr;
2637
2638 // In some calling conventions we need to remove the used physical registers
2639 // from the reg mask. Create a new RegMask for such calling conventions.
2640 // RegMask for calling conventions that disable only return registers (e.g.
2641 // preserve_most) will be modified later in LowerCallResult.
2642 bool ShouldDisableArgRegs = shouldDisableArgRegFromCSR(CC: CallConv) || HasNCSR;
2643 if (ShouldDisableArgRegs || shouldDisableRetRegFromCSR(CC: CallConv)) {
2644 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
2645
2646 // Allocate a new Reg Mask and copy Mask.
2647 RegMask = MF.allocateRegMask();
2648 unsigned RegMaskSize = MachineOperand::getRegMaskSize(NumRegs: TRI->getNumRegs());
2649 memcpy(dest: RegMask, src: Mask, n: sizeof(RegMask[0]) * RegMaskSize);
2650
2651 // Make sure all sub registers of the argument registers are reset
2652 // in the RegMask.
2653 if (ShouldDisableArgRegs) {
2654 for (auto const &RegPair : RegsToPass)
2655 for (MCPhysReg SubReg : TRI->subregs_inclusive(Reg: RegPair.first))
2656 RegMask[SubReg / 32] &= ~(1u << (SubReg % 32));
2657 }
2658
2659 // Create the RegMask Operand according to our updated mask.
2660 Ops.push_back(Elt: DAG.getRegisterMask(RegMask));
2661 } else {
2662 // Create the RegMask Operand according to the static mask.
2663 Ops.push_back(Elt: DAG.getRegisterMask(RegMask: Mask));
2664 }
2665
2666 if (InGlue.getNode())
2667 Ops.push_back(Elt: InGlue);
2668
2669 if (isTailCall) {
2670 // We used to do:
2671 //// If this is the first return lowered for this function, add the regs
2672 //// to the liveout set for the function.
2673 // This isn't right, although it's probably harmless on x86; liveouts
2674 // should be computed from returns not tail calls. Consider a void
2675 // function making a tail call to a function returning int.
2676 MF.getFrameInfo().setHasTailCall();
2677 auto Opcode =
2678 IsCFGuardCall ? X86ISD::TC_RETURN_GLOBALADDR : X86ISD::TC_RETURN;
2679 SDValue Ret = DAG.getNode(Opcode, DL: dl, VT: MVT::Other, Ops);
2680
2681 if (IsCFICall)
2682 Ret.getNode()->setCFIType(CLI.CFIType->getZExtValue());
2683
2684 DAG.addNoMergeSiteInfo(Node: Ret.getNode(), NoMerge: CLI.NoMerge);
2685 DAG.addCallSiteInfo(Node: Ret.getNode(), CallInfo: std::move(CSInfo));
2686 return Ret;
2687 }
2688
2689 // Returns a chain & a glue for retval copy to use.
2690 SDVTList NodeTys = DAG.getVTList(VT1: MVT::Other, VT2: MVT::Glue);
2691 if (IsImpCall) {
2692 Chain = DAG.getNode(Opcode: X86ISD::IMP_CALL, DL: dl, VTList: NodeTys, Ops);
2693 } else if (IsNoTrackIndirectCall) {
2694 Chain = DAG.getNode(Opcode: X86ISD::NT_CALL, DL: dl, VTList: NodeTys, Ops);
2695 } else if (IsCFGuardCall) {
2696 Chain = DAG.getNode(Opcode: X86ISD::CALL_GLOBALADDR, DL: dl, VTList: NodeTys, Ops);
2697 } else if (CLI.CB && objcarc::hasAttachedCallOpBundle(CB: CLI.CB)) {
2698 // Calls with a "clang.arc.attachedcall" bundle are special. They should be
2699 // expanded to the call, directly followed by a special marker sequence and
2700 // a call to a ObjC library function. Use the CALL_RVMARKER to do that.
2701 assert(!isTailCall &&
2702 "tail calls cannot be marked with clang.arc.attachedcall");
2703 assert(Is64Bit && "clang.arc.attachedcall is only supported in 64bit mode");
2704
2705 // Add a target global address for the retainRV/claimRV runtime function
2706 // just before the call target.
2707 Function *ARCFn = *objcarc::getAttachedARCFunction(CB: CLI.CB);
2708 auto PtrVT = getPointerTy(DL: DAG.getDataLayout());
2709 auto GA = DAG.getTargetGlobalAddress(GV: ARCFn, DL: dl, VT: PtrVT);
2710 Ops.insert(I: Ops.begin() + 1, Elt: GA);
2711 Chain = DAG.getNode(Opcode: X86ISD::CALL_RVMARKER, DL: dl, VTList: NodeTys, Ops);
2712 } else {
2713 Chain = DAG.getNode(Opcode: X86ISD::CALL, DL: dl, VTList: NodeTys, Ops);
2714 }
2715
2716 if (IsCFICall)
2717 Chain.getNode()->setCFIType(CLI.CFIType->getZExtValue());
2718
2719 InGlue = Chain.getValue(R: 1);
2720 DAG.addNoMergeSiteInfo(Node: Chain.getNode(), NoMerge: CLI.NoMerge);
2721 DAG.addCallSiteInfo(Node: Chain.getNode(), CallInfo: std::move(CSInfo));
2722
2723 // Save heapallocsite metadata.
2724 if (CLI.CB)
2725 if (MDNode *HeapAlloc = CLI.CB->getMetadata(Kind: "heapallocsite"))
2726 DAG.addHeapAllocSite(Node: Chain.getNode(), MD: HeapAlloc);
2727
2728 // Create the CALLSEQ_END node.
2729 unsigned NumBytesForCalleeToPop = 0; // Callee pops nothing.
2730 if (X86::isCalleePop(CallingConv: CallConv, is64Bit: Is64Bit, IsVarArg: isVarArg,
2731 GuaranteeTCO: DAG.getTarget().Options.GuaranteedTailCallOpt)) {
2732 NumBytesForCalleeToPop = NumBytes; // Callee pops everything
2733 } else if (hasCalleePopSRet(Args: Outs, ArgLocs, Subtarget)) {
2734 // If this call passes a struct-return pointer, the callee
2735 // pops that struct pointer.
2736 NumBytesForCalleeToPop = 4;
2737 }
2738
2739 // Returns a glue for retval copy to use.
2740 if (!IsSibcall) {
2741 Chain = DAG.getCALLSEQ_END(Chain, Size1: NumBytesToPop, Size2: NumBytesForCalleeToPop,
2742 Glue: InGlue, DL: dl);
2743 InGlue = Chain.getValue(R: 1);
2744 }
2745
2746 if (CallingConv::PreserveNone == CallConv)
2747 for (const ISD::OutputArg &Out : Outs) {
2748 if (Out.Flags.isSwiftSelf() || Out.Flags.isSwiftAsync() ||
2749 Out.Flags.isSwiftError()) {
2750 errorUnsupported(DAG, dl,
2751 Msg: "Swift attributes can't be used with preserve_none");
2752 break;
2753 }
2754 }
2755
2756 // Handle result values, copying them out of physregs into vregs that we
2757 // return.
2758 return LowerCallResult(Chain, InGlue, CallConv, isVarArg, Ins, dl, DAG,
2759 InVals, RegMask);
2760}
2761
2762//===----------------------------------------------------------------------===//
2763// Fast Calling Convention (tail call) implementation
2764//===----------------------------------------------------------------------===//
2765
// Like stdcall, the callee cleans up the arguments, except that ECX is
// reserved for storing the address of the tail-called function. Only 2
// registers are free for argument passing (inreg). Tail call optimization is
// performed provided:
2770// * tailcallopt is enabled
2771// * caller/callee are fastcc
2772// On X86_64 architecture with GOT-style position independent code only local
2773// (within module) calls are supported at the moment.
// To keep the stack aligned according to the platform ABI the function
2775// GetAlignedArgumentStackSize ensures that argument delta is always multiples
2776// of stack alignment. (Dynamic linkers need this - Darwin's dyld for example)
2777// If a tail called function callee has more arguments than the caller the
2778// caller needs to make sure that there is room to move the RETADDR to. This is
2779// achieved by reserving an area the size of the argument delta right after the
2780// original RETADDR, but before the saved framepointer or the spilled registers
2781// e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
2782// stack layout:
2783// arg1
2784// arg2
2785// RETADDR
2786// [ new RETADDR
2787// move area ]
2788// (possible EBP)
2789// ESI
2790// EDI
2791// local1 ..
2792
2793/// Make the stack size align e.g 16n + 12 aligned for a 16-byte align
2794/// requirement.
2795unsigned
2796X86TargetLowering::GetAlignedArgumentStackSize(const unsigned StackSize,
2797 SelectionDAG &DAG) const {
2798 const Align StackAlignment = Subtarget.getFrameLowering()->getStackAlign();
2799 const uint64_t SlotSize = Subtarget.getRegisterInfo()->getSlotSize();
2800 assert(StackSize % SlotSize == 0 &&
2801 "StackSize must be a multiple of SlotSize");
2802 return alignTo(Size: StackSize + SlotSize, A: StackAlignment) - SlotSize;
2803}
2804
/// Return true if the given stack call argument is already available in the
/// same position (relatively) of the caller's incoming argument stack.
static
bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
                         MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
                         const X86InstrInfo *TII, const CCValAssign &VA) {
  // Size of the argument value in bytes; for byval arguments this is replaced
  // below by the byval size, which is what actually occupies stack space.
  unsigned Bytes = Arg.getValueSizeInBits() / 8;

  for (;;) {
    // Look through nodes that don't alter the bits of the incoming value.
    unsigned Op = Arg.getOpcode();
    if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST ||
        Op == ISD::AssertZext) {
      Arg = Arg.getOperand(i: 0);
      continue;
    }
    if (Op == ISD::TRUNCATE) {
      // A TRUNCATE is only transparent when it undoes an AssertZext of the
      // same narrow type, i.e. the truncated-away bits are known to be zero.
      const SDValue &TruncInput = Arg.getOperand(i: 0);
      if (TruncInput.getOpcode() == ISD::AssertZext &&
          cast<VTSDNode>(Val: TruncInput.getOperand(i: 1))->getVT() ==
              Arg.getValueType()) {
        Arg = TruncInput.getOperand(i: 0);
        continue;
      }
    }
    break;
  }

  // Trace the argument back to a frame index; bail out if it does not
  // originate from a stack object we can identify.
  int FI = INT_MAX;
  if (Arg.getOpcode() == ISD::CopyFromReg) {
    Register VR = cast<RegisterSDNode>(Val: Arg.getOperand(i: 1))->getReg();
    if (!VR.isVirtual())
      return false;
    MachineInstr *Def = MRI->getVRegDef(Reg: VR);
    if (!Def)
      return false;
    if (!Flags.isByVal()) {
      // Non-byval: the vreg must be defined by a reload from a stack slot.
      if (!TII->isLoadFromStackSlot(MI: *Def, FrameIndex&: FI))
        return false;
    } else {
      // Byval: the vreg must be the address of a stack object, materialized
      // by an LEA of a frame index.
      unsigned Opcode = Def->getOpcode();
      if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
           Opcode == X86::LEA64_32r) &&
          Def->getOperand(i: 1).isFI()) {
        FI = Def->getOperand(i: 1).getIndex();
        Bytes = Flags.getByValSize();
      } else
        return false;
    }
  } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Val&: Arg)) {
    if (Flags.isByVal())
      // ByVal argument is passed in as a pointer but it's now being
      // dereferenced. e.g.
      // define @foo(%struct.X* %A) {
      //   tail call @bar(%struct.X* byval %A)
      // }
      return false;
    SDValue Ptr = Ld->getBasePtr();
    FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Val&: Ptr);
    if (!FINode)
      return false;
    FI = FINode->getIndex();
  } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
    // Byval argument passed directly as a frame-index address.
    FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Val&: Arg);
    FI = FINode->getIndex();
    Bytes = Flags.getByValSize();
  } else
    return false;

  assert(FI != INT_MAX);
  // Only fixed objects live in the caller's incoming argument area.
  if (!MFI.isFixedObjectIndex(ObjectIdx: FI))
    return false;

  // The incoming object must sit at exactly the offset the outgoing argument
  // would be stored to.
  if (Offset != MFI.getObjectOffset(ObjectIdx: FI))
    return false;

  // If this is not byval, check that the argument stack object is immutable.
  // inalloca and argument copy elision can create mutable argument stack
  // objects. Byval objects can be mutated, but a byval call intends to pass the
  // mutated memory.
  if (!Flags.isByVal() && !MFI.isImmutableObjectIndex(ObjectIdx: FI))
    return false;

  if (VA.getLocVT().getFixedSizeInBits() >
      Arg.getValueSizeInBits().getFixedValue()) {
    // If the argument location is wider than the argument type, check that any
    // extension flags match.
    if (Flags.isZExt() != MFI.isObjectZExt(ObjectIdx: FI) ||
        Flags.isSExt() != MFI.isObjectSExt(ObjectIdx: FI)) {
      return false;
    }
  }

  // Finally, the argument size must match the stack object's size exactly.
  return Bytes == MFI.getObjectSize(ObjectIdx: FI);
}
2900
2901static bool
2902mayBeSRetTailCallCompatible(const TargetLowering::CallLoweringInfo &CLI,
2903 Register CallerSRetReg) {
2904 const auto &Outs = CLI.Outs;
2905 const auto &OutVals = CLI.OutVals;
2906
2907 // We know the caller has a sret pointer argument (CallerSRetReg). Locate the
2908 // operand index within the callee that may have a sret pointer too.
2909 unsigned Pos = 0;
2910 for (unsigned E = Outs.size(); Pos != E; ++Pos)
2911 if (Outs[Pos].Flags.isSRet())
2912 break;
2913 // Bail out if the callee has not any sret argument.
2914 if (Pos == Outs.size())
2915 return false;
2916
2917 // At this point, either the caller is forwarding its sret argument to the
2918 // callee, or the callee is being passed a different sret pointer. We now look
2919 // for a CopyToReg, where the callee sret argument is written into a new vreg
2920 // (which should later be %rax/%eax, if this is returned).
2921 SDValue SRetArgVal = OutVals[Pos];
2922 for (SDNode *User : SRetArgVal->users()) {
2923 if (User->getOpcode() != ISD::CopyToReg)
2924 continue;
2925 Register Reg = cast<RegisterSDNode>(Val: User->getOperand(Num: 1))->getReg();
2926 if (Reg == CallerSRetReg && User->getOperand(Num: 2) == SRetArgVal)
2927 return true;
2928 }
2929
2930 return false;
2931}
2932
2933/// Check whether the call is eligible for sibling call optimization. Sibling
2934/// calls are loosely defined to be simple, profitable tail calls that only
/// require adjusting register parameters. We do not speculatively optimize
2936/// complex calls that require lots of argument memory operations that may
2937/// alias.
2938///
2939/// Note that LLVM supports multiple ways, such as musttail, to force tail call
2940/// emission. Returning false from this function will not prevent tail call
2941/// emission in all cases.
2942bool X86TargetLowering::isEligibleForSiblingCallOpt(
2943 TargetLowering::CallLoweringInfo &CLI, CCState &CCInfo,
2944 SmallVectorImpl<CCValAssign> &ArgLocs) const {
2945 SelectionDAG &DAG = CLI.DAG;
2946 const SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
2947 const SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
2948 const SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
2949 SDValue Callee = CLI.Callee;
2950 CallingConv::ID CalleeCC = CLI.CallConv;
2951 bool isVarArg = CLI.IsVarArg;
2952
2953 if (!mayTailCallThisCC(CC: CalleeCC))
2954 return false;
2955
2956 // If -tailcallopt is specified, make fastcc functions tail-callable.
2957 MachineFunction &MF = DAG.getMachineFunction();
2958 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2959 const Function &CallerF = MF.getFunction();
2960
2961 // If the function return type is x86_fp80 and the callee return type is not,
2962 // then the FP_EXTEND of the call result is not a nop. It's not safe to
2963 // perform a tailcall optimization here.
2964 if (CallerF.getReturnType()->isX86_FP80Ty() && !CLI.RetTy->isX86_FP80Ty())
2965 return false;
2966
2967 // Win64 functions have extra shadow space for argument homing. Don't do the
2968 // sibcall if the caller and callee have mismatched expectations for this
2969 // space.
2970 CallingConv::ID CallerCC = CallerF.getCallingConv();
2971 bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CC: CalleeCC);
2972 bool IsCallerWin64 = Subtarget.isCallingConvWin64(CC: CallerCC);
2973 if (IsCalleeWin64 != IsCallerWin64)
2974 return false;
2975
2976 // If we are using a GOT, don't generate sibling calls to non-local,
2977 // default-visibility symbols. Tail calling such a symbol requires using a GOT
2978 // relocation, which forces early binding of the symbol. This breaks code that
2979 // require lazy function symbol resolution. Using musttail or
2980 // GuaranteedTailCallOpt will override this.
2981 if (Subtarget.isPICStyleGOT()) {
2982 if (isa<ExternalSymbolSDNode>(Val: Callee))
2983 return false;
2984 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Val&: Callee)) {
2985 if (!G->getGlobal()->hasLocalLinkage() &&
2986 G->getGlobal()->hasDefaultVisibility())
2987 return false;
2988 }
2989 }
2990
2991 // Look for obvious safe cases to perform tail call optimization that do not
2992 // require ABI changes. This is what gcc calls sibcall.
2993
2994 // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
2995 // emit a special epilogue.
2996 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
2997 if (RegInfo->hasStackRealignment(MF))
2998 return false;
2999
3000 // Avoid sibcall optimization if we are an sret return function and the callee
3001 // is incompatible, unless such premises are proven wrong. See comment in
3002 // LowerReturn about why hasStructRetAttr is insufficient.
3003 if (Register SRetReg = FuncInfo->getSRetReturnReg()) {
3004 // For a compatible tail call the callee must return our sret pointer. So it
3005 // needs to be (a) an sret function itself and (b) we pass our sret as its
3006 // sret. Condition #b is harder to determine.
3007 if (!mayBeSRetTailCallCompatible(CLI, CallerSRetReg: SRetReg))
3008 return false;
3009 } else if (hasCalleePopSRet(Args: Outs, ArgLocs, Subtarget))
3010 // The callee pops an sret, so we cannot tail-call, as our caller doesn't
3011 // expect that.
3012 return false;
3013
3014 // Do not sibcall optimize vararg calls unless all arguments are passed via
3015 // registers.
3016 LLVMContext &C = *DAG.getContext();
3017 if (isVarArg && !Outs.empty()) {
3018 // Optimizing for varargs on Win64 is unlikely to be safe without
3019 // additional testing.
3020 if (IsCalleeWin64 || IsCallerWin64)
3021 return false;
3022
3023 for (const auto &VA : ArgLocs)
3024 if (!VA.isRegLoc())
3025 return false;
3026 }
3027
3028 // If the call result is in ST0 / ST1, it needs to be popped off the x87
3029 // stack. Therefore, if it's not used by the call it is not safe to optimize
3030 // this into a sibcall.
3031 bool Unused = false;
3032 for (const auto &In : Ins) {
3033 if (!In.Used) {
3034 Unused = true;
3035 break;
3036 }
3037 }
3038 if (Unused) {
3039 SmallVector<CCValAssign, 16> RVLocs;
3040 CCState RVCCInfo(CalleeCC, false, MF, RVLocs, C);
3041 RVCCInfo.AnalyzeCallResult(Ins, Fn: RetCC_X86);
3042 for (const auto &VA : RVLocs) {
3043 if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
3044 return false;
3045 }
3046 }
3047
3048 // Check that the call results are passed in the same way.
3049 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
3050 CalleeFn: RetCC_X86, CallerFn: RetCC_X86))
3051 return false;
3052 // The callee has to preserve all registers the caller needs to preserve.
3053 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
3054 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
3055 if (CallerCC != CalleeCC) {
3056 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
3057 if (!TRI->regmaskSubsetEqual(mask0: CallerPreserved, mask1: CalleePreserved))
3058 return false;
3059 }
3060
3061 // The stack frame of the caller cannot be replaced by the tail-callee one's
3062 // if the function is required to preserve all the registers. Conservatively
3063 // prevent tail optimization even if hypothetically all the registers are used
3064 // for passing formal parameters or returning values.
3065 if (CallerF.hasFnAttribute(Kind: "no_caller_saved_registers"))
3066 return false;
3067
3068 unsigned StackArgsSize = CCInfo.getStackSize();
3069
3070 // If the callee takes no arguments then go on to check the results of the
3071 // call.
3072 if (!Outs.empty()) {
3073 if (StackArgsSize > 0) {
3074 // Check if the arguments are already laid out in the right way as
3075 // the caller's fixed stack objects.
3076 MachineFrameInfo &MFI = MF.getFrameInfo();
3077 const MachineRegisterInfo *MRI = &MF.getRegInfo();
3078 const X86InstrInfo *TII = Subtarget.getInstrInfo();
3079 for (unsigned I = 0, E = ArgLocs.size(); I != E; ++I) {
3080 const CCValAssign &VA = ArgLocs[I];
3081 SDValue Arg = OutVals[I];
3082 ISD::ArgFlagsTy Flags = Outs[I].Flags;
3083 if (VA.getLocInfo() == CCValAssign::Indirect)
3084 return false;
3085 if (!VA.isRegLoc()) {
3086 if (!MatchingStackOffset(Arg, Offset: VA.getLocMemOffset(), Flags, MFI, MRI,
3087 TII, VA))
3088 return false;
3089 }
3090 }
3091 }
3092
3093 bool PositionIndependent = isPositionIndependent();
3094 // If the tailcall address may be in a register, then make sure it's
3095 // possible to register allocate for it. In 32-bit, the call address can
3096 // only target EAX, EDX, or ECX since the tail call must be scheduled after
3097 // callee-saved registers are restored. These happen to be the same
3098 // registers used to pass 'inreg' arguments so watch out for those.
3099 if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Val: Callee) &&
3100 !isa<ExternalSymbolSDNode>(Val: Callee)) ||
3101 PositionIndependent)) {
3102 unsigned NumInRegs = 0;
3103 // In PIC we need an extra register to formulate the address computation
3104 // for the callee.
3105 unsigned MaxInRegs = PositionIndependent ? 2 : 3;
3106
3107 for (const auto &VA : ArgLocs) {
3108 if (!VA.isRegLoc())
3109 continue;
3110 Register Reg = VA.getLocReg();
3111 switch (Reg) {
3112 default: break;
3113 case X86::EAX: case X86::EDX: case X86::ECX:
3114 if (++NumInRegs == MaxInRegs)
3115 return false;
3116 break;
3117 }
3118 }
3119 }
3120
3121 const MachineRegisterInfo &MRI = MF.getRegInfo();
3122 if (!parametersInCSRMatch(MRI, CallerPreservedMask: CallerPreserved, ArgLocs, OutVals))
3123 return false;
3124 }
3125
3126 bool CalleeWillPop =
3127 X86::isCalleePop(CallingConv: CalleeCC, is64Bit: Subtarget.is64Bit(), IsVarArg: isVarArg,
3128 GuaranteeTCO: MF.getTarget().Options.GuaranteedTailCallOpt);
3129
3130 if (unsigned BytesToPop = FuncInfo->getBytesToPopOnReturn()) {
3131 // If we have bytes to pop, the callee must pop them.
3132 bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize;
3133 if (!CalleePopMatches)
3134 return false;
3135 } else if (CalleeWillPop && StackArgsSize > 0) {
3136 // If we don't have bytes to pop, make sure the callee doesn't pop any.
3137 return false;
3138 }
3139
3140 return true;
3141}
3142
3143/// Determines whether the callee is required to pop its own arguments.
3144/// Callee pop is necessary to support tail calls.
3145bool X86::isCalleePop(CallingConv::ID CallingConv,
3146 bool is64Bit, bool IsVarArg, bool GuaranteeTCO) {
3147 // If GuaranteeTCO is true, we force some calls to be callee pop so that we
3148 // can guarantee TCO.
3149 if (!IsVarArg && shouldGuaranteeTCO(CC: CallingConv, GuaranteedTailCallOpt: GuaranteeTCO))
3150 return true;
3151
3152 switch (CallingConv) {
3153 default:
3154 return false;
3155 case CallingConv::X86_StdCall:
3156 case CallingConv::X86_FastCall:
3157 case CallingConv::X86_ThisCall:
3158 case CallingConv::X86_VectorCall:
3159 return !is64Bit;
3160 }
3161}
3162