//===-- X86SelectionDAGInfo.cpp - X86 SelectionDAG Info -------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the X86SelectionDAGInfo class.
//
//===----------------------------------------------------------------------===//

#include "X86SelectionDAGInfo.h"
#include "X86ISelLowering.h"
#include "X86InstrInfo.h"
#include "X86RegisterInfo.h"
#include "X86Subtarget.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/DerivedTypes.h"

using namespace llvm;

#define DEBUG_TYPE "x86-selectiondag-info"

static cl::opt<bool>
    UseFSRMForMemcpy("x86-use-fsrm-for-memcpy", cl::Hidden, cl::init(false),
                     cl::desc("Use fast short rep mov in memcpy lowering"));

bool X86SelectionDAGInfo::isBaseRegConflictPossible(
    SelectionDAG &DAG, ArrayRef<MCPhysReg> ClobberSet) const {
  // We cannot use TRI->hasBasePointer() until *after* we select all basic
  // blocks. Legalization may introduce new stack temporaries with large
  // alignment requirements. Fall back to generic code if there are any
  // dynamic stack adjustments (hopefully rare) and the base pointer would
  // conflict if we had to use it.
  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
  if (!MFI.hasVarSizedObjects() && !MFI.hasOpaqueSPAdjustment())
    return false;

  const X86RegisterInfo *TRI = static_cast<const X86RegisterInfo *>(
      DAG.getSubtarget().getRegisterInfo());
  return llvm::is_contained(ClobberSet, TRI->getBaseRegister());
}

SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
    SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Val,
    SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
    MachinePointerInfo DstPtrInfo) const {
  // If the destination is in a segment-relative address space, use the default
  // lowering.
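  // (On x86, address spaces 256 and above denote segment-relative pointers
  // such as GS and FS; REP STOS addresses its destination through plain [RDI]
  // and cannot apply a segment override to it.)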
  if (DstPtrInfo.getAddrSpace() >= 256)
    return SDValue();

  // If the base register might conflict with our physical registers, bail out.
  const MCPhysReg ClobberSet[] = {X86::RCX, X86::RAX, X86::RDI,
                                  X86::ECX, X86::EAX, X86::EDI};
  if (isBaseRegConflictPossible(DAG, ClobberSet))
    return SDValue();

  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
  const X86Subtarget &Subtarget =
      DAG.getMachineFunction().getSubtarget<X86Subtarget>();

  // If not DWORD aligned or the size exceeds the inline threshold, call the
  // library. The libc version is likely to be faster for these cases: it can
  // use the address value and run-time information about the CPU.
  if (Alignment < Align(4) || !ConstantSize ||
      ConstantSize->getZExtValue() > Subtarget.getMaxInlineSizeThreshold())
    return SDValue();

  uint64_t SizeVal = ConstantSize->getZExtValue();
  SDValue InGlue;
  EVT AVT;
  SDValue Count;
  unsigned BytesLeft = 0;
  if (auto *ValC = dyn_cast<ConstantSDNode>(Val)) {
    unsigned ValReg;
    uint64_t Val = ValC->getZExtValue() & 255;

    // If the value is a constant, then we can potentially use larger sets.
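    // Replicate the byte across the wider store unit (e.g. 0xAB becomes
    // 0xABABABAB for a DWORD store) so every REP STOS iteration writes the
    // same pattern.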
    if (Alignment >= Align(4)) {
      // DWORD aligned
      AVT = MVT::i32;
      ValReg = X86::EAX;
      Val = (Val << 8) | Val;
      Val = (Val << 16) | Val;
      if (Subtarget.is64Bit() && Alignment >= Align(8)) { // QWORD aligned
        AVT = MVT::i64;
        ValReg = X86::RAX;
        Val = (Val << 32) | Val;
      }
    } else if (Alignment == Align(2)) {
      // WORD aligned
      AVT = MVT::i16;
      ValReg = X86::AX;
      Val = (Val << 8) | Val;
    } else {
      // Byte aligned
      AVT = MVT::i8;
      ValReg = X86::AL;
      Count = DAG.getIntPtrConstant(SizeVal, dl);
    }

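    // For units wider than a byte the count is in AVT-sized chunks; any
    // remainder that does not divide evenly is written out separately below.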
    if (AVT.bitsGT(MVT::i8)) {
      unsigned UBytes = AVT.getSizeInBits() / 8;
      Count = DAG.getIntPtrConstant(SizeVal / UBytes, dl);
      BytesLeft = SizeVal % UBytes;
    }

    Chain = DAG.getCopyToReg(Chain, dl, ValReg, DAG.getConstant(Val, dl, AVT),
                             InGlue);
    InGlue = Chain.getValue(1);
  } else {
    AVT = MVT::i8;
    Count = DAG.getIntPtrConstant(SizeVal, dl);
    Chain = DAG.getCopyToReg(Chain, dl, X86::AL, Val, InGlue);
    InGlue = Chain.getValue(1);
  }

  bool Use64BitRegs = Subtarget.isTarget64BitLP64();
  Chain = DAG.getCopyToReg(Chain, dl, Use64BitRegs ? X86::RCX : X86::ECX,
                           Count, InGlue);
  InGlue = Chain.getValue(1);
  Chain = DAG.getCopyToReg(Chain, dl, Use64BitRegs ? X86::RDI : X86::EDI,
                           Dst, InGlue);
  InGlue = Chain.getValue(1);

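  // Emit the pseudo that later becomes REP STOS{B,W,D,Q}: the count is in
  // RCX/ECX, the destination in RDI/EDI and the value in AL/AX/EAX/RAX.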
  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
  SDValue Ops[] = {Chain, DAG.getValueType(AVT), InGlue};
  SDValue RepStos = DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops);

  // RepStos can process the whole length.
  if (BytesLeft == 0)
    return RepStos;

  // Handle the last 1 - 7 bytes.
  SmallVector<SDValue, 4> Results;
  Results.push_back(RepStos);
  unsigned Offset = SizeVal - BytesLeft;
  EVT AddrVT = Dst.getValueType();
  EVT SizeVT = Size.getValueType();

  Results.push_back(
      DAG.getMemset(Chain, dl,
                    DAG.getNode(ISD::ADD, dl, AddrVT, Dst,
                                DAG.getConstant(Offset, dl, AddrVT)),
                    Val, DAG.getConstant(BytesLeft, dl, SizeVT), Alignment,
                    isVolatile, AlwaysInline,
                    /* CI */ nullptr, DstPtrInfo.getWithOffset(Offset)));

  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Results);
}

/// Emit a single REP MOVS{B,W,D,Q} instruction.
static SDValue emitRepmovs(const X86Subtarget &Subtarget, SelectionDAG &DAG,
                           const SDLoc &dl, SDValue Chain, SDValue Dst,
                           SDValue Src, SDValue Size, MVT AVT) {
  const bool Use64BitRegs = Subtarget.isTarget64BitLP64();
  const unsigned CX = Use64BitRegs ? X86::RCX : X86::ECX;
  const unsigned DI = Use64BitRegs ? X86::RDI : X86::EDI;
  const unsigned SI = Use64BitRegs ? X86::RSI : X86::ESI;

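  // REP MOVS implicitly takes its count in {R|E}CX, the destination in {R|E}DI
  // and the source in {R|E}SI, so thread them through glued CopyToReg nodes.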
  SDValue InGlue;
  Chain = DAG.getCopyToReg(Chain, dl, CX, Size, InGlue);
  InGlue = Chain.getValue(1);
  Chain = DAG.getCopyToReg(Chain, dl, DI, Dst, InGlue);
  InGlue = Chain.getValue(1);
  Chain = DAG.getCopyToReg(Chain, dl, SI, Src, InGlue);
  InGlue = Chain.getValue(1);

  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
  SDValue Ops[] = {Chain, DAG.getValueType(AVT), InGlue};
  return DAG.getNode(X86ISD::REP_MOVS, dl, Tys, Ops);
}

/// Emit a single REP MOVSB instruction for a particular constant size.
static SDValue emitRepmovsB(const X86Subtarget &Subtarget, SelectionDAG &DAG,
                            const SDLoc &dl, SDValue Chain, SDValue Dst,
                            SDValue Src, uint64_t Size) {
  return emitRepmovs(Subtarget, DAG, dl, Chain, Dst, Src,
                     DAG.getIntPtrConstant(Size, dl), MVT::i8);
}

/// Returns the best type to use with repmovs depending on alignment.
static MVT getOptimalRepmovsType(const X86Subtarget &Subtarget,
                                 Align Alignment) {
  uint64_t Align = Alignment.value();
  assert((Align != 0) && "Align is normalized");
  assert(isPowerOf2_64(Align) && "Align is a power of 2");
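  // Pick the widest element the alignment supports; anything 8-byte aligned
  // (or more) can use QWORD moves on 64-bit targets.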
  switch (Align) {
  case 1:
    return MVT::i8;
  case 2:
    return MVT::i16;
  case 4:
    return MVT::i32;
  default:
    return Subtarget.is64Bit() ? MVT::i64 : MVT::i32;
  }
}

/// Returns a REP MOVS instruction, possibly with a few load/stores to
/// implement a constant-size memory copy. In some cases where we know REP
/// MOVS is inefficient we return an empty SDValue so the calling code can
/// either generate a load/store sequence or call the runtime memcpy function.
static SDValue emitConstantSizeRepmov(
    SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &dl,
    SDValue Chain, SDValue Dst, SDValue Src, uint64_t Size, EVT SizeVT,
    Align Alignment, bool isVolatile, bool AlwaysInline,
    MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) {
  // TODO: Revisit next line: big copies with ERMSB on march >= haswell are
  // very efficient.
  if (!AlwaysInline && Size > Subtarget.getMaxInlineSizeThreshold())
    return SDValue();

  // If we have enhanced repmovs we use it.
  if (Subtarget.hasERMSB())
    return emitRepmovsB(Subtarget, DAG, dl, Chain, Dst, Src, Size);

  assert(!Subtarget.hasERMSB() && "No efficient RepMovs");
  // We assume runtime memcpy will do a better job for unaligned copies when
  // ERMS is not present.
  if (!AlwaysInline && (Alignment.value() & 3) != 0)
    return SDValue();

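  // Split the copy into whole blocks of the widest type the alignment allows
  // plus a small tail, e.g. 23 bytes with an 8-byte block type becomes two
  // REP MOVSQ iterations followed by a 7-byte tail.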
  const MVT BlockType = getOptimalRepmovsType(Subtarget, Alignment);
  const uint64_t BlockBytes = BlockType.getSizeInBits() / 8;
  const uint64_t BlockCount = Size / BlockBytes;
  const uint64_t BytesLeft = Size % BlockBytes;
  SDValue RepMovs =
      emitRepmovs(Subtarget, DAG, dl, Chain, Dst, Src,
                  DAG.getIntPtrConstant(BlockCount, dl), BlockType);

  // RepMovs can process the whole length.
  if (BytesLeft == 0)
    return RepMovs;

  assert(BytesLeft && "We have leftover at this point");

  // In case we optimize for size we use repmovsb even if it's less efficient
  // so we can save the loads/stores of the leftover.
  if (DAG.getMachineFunction().getFunction().hasMinSize())
    return emitRepmovsB(Subtarget, DAG, dl, Chain, Dst, Src, Size);

  // Handle the last 1 - 7 bytes.
  SmallVector<SDValue, 4> Results;
  Results.push_back(RepMovs);
  unsigned Offset = Size - BytesLeft;
  EVT DstVT = Dst.getValueType();
  EVT SrcVT = Src.getValueType();
  Results.push_back(DAG.getMemcpy(
      Chain, dl,
      DAG.getNode(ISD::ADD, dl, DstVT, Dst, DAG.getConstant(Offset, dl, DstVT)),
      DAG.getNode(ISD::ADD, dl, SrcVT, Src, DAG.getConstant(Offset, dl, SrcVT)),
      DAG.getConstant(BytesLeft, dl, SizeVT), Alignment, isVolatile,
      /*AlwaysInline*/ true, /*CI=*/nullptr, std::nullopt,
      DstPtrInfo.getWithOffset(Offset), SrcPtrInfo.getWithOffset(Offset)));
  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Results);
}

SDValue X86SelectionDAGInfo::EmitTargetCodeForMemcpy(
    SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
    SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
    MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
  // If copying to or from a segment-relative address space, use the default
  // lowering.
  if (DstPtrInfo.getAddrSpace() >= 256 || SrcPtrInfo.getAddrSpace() >= 256)
    return SDValue();

  // If the base registers conflict with our physical registers, use the
  // default lowering.
  const MCPhysReg ClobberSet[] = {X86::RCX, X86::RSI, X86::RDI,
                                  X86::ECX, X86::ESI, X86::EDI};
  if (isBaseRegConflictPossible(DAG, ClobberSet))
    return SDValue();

  const X86Subtarget &Subtarget =
      DAG.getMachineFunction().getSubtarget<X86Subtarget>();

  // If enabled and available, use fast short rep mov.
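  // FSRM (fast short REP MOVSB) means REP MOVSB is efficient even for short
  // copies, so a byte-granular REP MOVS is usable for any size, constant or
  // not.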
  if (UseFSRMForMemcpy && Subtarget.hasFSRM())
    return emitRepmovs(Subtarget, DAG, dl, Chain, Dst, Src, Size, MVT::i8);

  // Handle constant sizes.
  if (ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size))
    return emitConstantSizeRepmov(DAG, Subtarget, dl, Chain, Dst, Src,
                                  ConstantSize->getZExtValue(),
                                  Size.getValueType(), Alignment, isVolatile,
                                  AlwaysInline, DstPtrInfo, SrcPtrInfo);

  return SDValue();
}