1//===-- X86SelectionDAGInfo.cpp - X86 SelectionDAG Info -------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements the X86SelectionDAGInfo class.
10//
11//===----------------------------------------------------------------------===//
12
13#include "X86SelectionDAGInfo.h"
14#include "X86InstrInfo.h"
15#include "X86RegisterInfo.h"
16#include "X86Subtarget.h"
17#include "llvm/CodeGen/MachineFrameInfo.h"
18#include "llvm/CodeGen/SelectionDAG.h"
19#include "llvm/CodeGen/TargetLowering.h"
20
21#define GET_SDNODE_DESC
22#include "X86GenSDNodeInfo.inc"
23
24using namespace llvm;
25
26#define DEBUG_TYPE "x86-selectiondag-info"
27
28static cl::opt<bool>
29 UseFSRMForMemcpy("x86-use-fsrm-for-memcpy", cl::Hidden, cl::init(Val: false),
30 cl::desc("Use fast short rep mov in memcpy lowering"));
31
// Seed the generated base class with the TableGen-emitted table of
// X86-specific SDNode descriptions (from X86GenSDNodeInfo.inc above).
X86SelectionDAGInfo::X86SelectionDAGInfo()
    : SelectionDAGGenTargetInfo(X86GenSDNodeInfo) {}
34
/// Returns a printable name for the given target-specific opcode.
/// The switch covers only opcodes that have no *.td entry (and therefore no
/// generated name); everything else is resolved by the base-class table.
const char *X86SelectionDAGInfo::getTargetNodeName(unsigned Opcode) const {
#define NODE_NAME_CASE(NODE)                                                   \
  case X86ISD::NODE:                                                           \
    return "X86ISD::" #NODE;

  // These nodes don't have corresponding entries in *.td files yet.
  switch (static_cast<X86ISD::NodeType>(Opcode)) {
    NODE_NAME_CASE(POP_FROM_X87_REG)
    NODE_NAME_CASE(GlobalBaseReg)
    NODE_NAME_CASE(LCMPXCHG16_SAVE_RBX_DAG)
    NODE_NAME_CASE(PCMPESTR)
    NODE_NAME_CASE(PCMPISTR)
    NODE_NAME_CASE(MGATHER)
    NODE_NAME_CASE(MSCATTER)
    NODE_NAME_CASE(AESENCWIDE128KL)
    NODE_NAME_CASE(AESDECWIDE128KL)
    NODE_NAME_CASE(AESENCWIDE256KL)
    NODE_NAME_CASE(AESDECWIDE256KL)
  }
#undef NODE_NAME_CASE

  // Fall back to the TableGen-generated name lookup.
  return SelectionDAGGenTargetInfo::getTargetNodeName(Opcode);
}
58
59bool X86SelectionDAGInfo::isTargetMemoryOpcode(unsigned Opcode) const {
60 // These nodes don't have corresponding entries in *.td files yet.
61 if (Opcode >= X86ISD::FIRST_MEMORY_OPCODE &&
62 Opcode <= X86ISD::LAST_MEMORY_OPCODE)
63 return true;
64
65 return SelectionDAGGenTargetInfo::isTargetMemoryOpcode(Opcode);
66}
67
/// Structural verification of target nodes. The opcodes listed below are
/// known to deviate from their TableGen descriptions (see the inline notes
/// for the exact mismatch), so they are exempted from checking; all other
/// opcodes are verified by the generated base-class implementation.
void X86SelectionDAGInfo::verifyTargetNode(const SelectionDAG &DAG,
                                           const SDNode *N) const {
  switch (N->getOpcode()) {
  default:
    break;
  case X86ISD::VP2INTERSECT:
    // invalid number of results; expected 1, got 2
  case X86ISD::FSETCCM_SAE:
    // invalid number of operands; expected 3, got 4
  case X86ISD::CVTTP2SI_SAE:
  case X86ISD::CVTTP2UI_SAE:
  case X86ISD::CVTTP2IBS_SAE:
    // invalid number of operands; expected 1, got 2
  case X86ISD::CMPMM_SAE:
    // invalid number of operands; expected 4, got 5
  case X86ISD::CALL:
  case X86ISD::NT_BRIND:
    // operand #1 must have type i32 (iPTR), but has type i64
  case X86ISD::INSERTQI:
  case X86ISD::EXTRQI:
    // result #0 must have type v2i64, but has type v16i8/v8i16
    return;
  }

  SelectionDAGGenTargetInfo::verifyTargetNode(DAG, N);
}
94
95/// Returns the best type to use with repmovs/repstos depending on alignment.
96static MVT getOptimalRepType(const X86Subtarget &Subtarget, Align Alignment) {
97 uint64_t Align = Alignment.value();
98 assert((Align != 0) && "Align is normalized");
99 assert(isPowerOf2_64(Align) && "Align is a power of 2");
100 switch (Align) {
101 case 1:
102 return MVT::i8;
103 case 2:
104 return MVT::i16;
105 case 4:
106 return MVT::i32;
107 default:
108 return Subtarget.is64Bit() ? MVT::i64 : MVT::i32;
109 }
110}
111
112bool X86SelectionDAGInfo::isBaseRegConflictPossible(
113 SelectionDAG &DAG, ArrayRef<MCPhysReg> ClobberSet) const {
114 // We cannot use TRI->hasBasePointer() until *after* we select all basic
115 // blocks. Legalization may introduce new stack temporaries with large
116 // alignment requirements. Fall back to generic code if there are any
117 // dynamic stack adjustments (hopefully rare) and the base pointer would
118 // conflict if we had to use it.
119 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
120 if (!MFI.hasVarSizedObjects() && !MFI.hasOpaqueSPAdjustment())
121 return false;
122
123 const X86RegisterInfo *TRI = static_cast<const X86RegisterInfo *>(
124 DAG.getSubtarget().getRegisterInfo());
125 return llvm::is_contained(Range&: ClobberSet, Element: TRI->getBaseRegister());
126}
127
128/// Emit a single REP STOSB instruction for a particular constant size.
129static SDValue emitRepstos(const X86Subtarget &Subtarget, SelectionDAG &DAG,
130 const SDLoc &dl, SDValue Chain, SDValue Dst,
131 SDValue Val, SDValue Size, MVT AVT) {
132 const bool Use64BitRegs = Subtarget.isTarget64BitLP64();
133 unsigned AX = X86::AL;
134 switch (AVT.getSizeInBits()) {
135 case 8:
136 AX = X86::AL;
137 break;
138 case 16:
139 AX = X86::AX;
140 break;
141 case 32:
142 AX = X86::EAX;
143 break;
144 default:
145 AX = X86::RAX;
146 break;
147 }
148
149 const unsigned CX = Use64BitRegs ? X86::RCX : X86::ECX;
150 const unsigned DI = Use64BitRegs ? X86::RDI : X86::EDI;
151
152 SDValue InGlue;
153 Chain = DAG.getCopyToReg(Chain, dl, Reg: AX, N: Val, Glue: InGlue);
154 InGlue = Chain.getValue(R: 1);
155 Chain = DAG.getCopyToReg(Chain, dl, Reg: CX, N: Size, Glue: InGlue);
156 InGlue = Chain.getValue(R: 1);
157 Chain = DAG.getCopyToReg(Chain, dl, Reg: DI, N: Dst, Glue: InGlue);
158 InGlue = Chain.getValue(R: 1);
159
160 SDVTList Tys = DAG.getVTList(VT1: MVT::Other, VT2: MVT::Glue);
161 SDValue Ops[] = {Chain, DAG.getValueType(AVT), InGlue};
162 return DAG.getNode(Opcode: X86ISD::REP_STOS, DL: dl, VTList: Tys, Ops);
163}
164
165/// Emit a single REP STOSB instruction for a particular constant size.
166static SDValue emitRepstosB(const X86Subtarget &Subtarget, SelectionDAG &DAG,
167 const SDLoc &dl, SDValue Chain, SDValue Dst,
168 SDValue Val, uint64_t Size) {
169 return emitRepstos(Subtarget, DAG, dl, Chain, Dst, Val,
170 Size: DAG.getIntPtrConstant(Val: Size, DL: dl), AVT: MVT::i8);
171}
172
173/// Returns a REP STOS instruction, possibly with a few load/stores to implement
174/// a constant size memory set. In some cases where we know REP MOVS is
175/// inefficient we return an empty SDValue so the calling code can either
176/// generate a store sequence or call the runtime memset function.
177static SDValue emitConstantSizeRepstos(SelectionDAG &DAG,
178 const X86Subtarget &Subtarget,
179 const SDLoc &dl, SDValue Chain,
180 SDValue Dst, SDValue Val, uint64_t Size,
181 EVT SizeVT, Align Alignment,
182 bool isVolatile, bool AlwaysInline,
183 MachinePointerInfo DstPtrInfo) {
184 /// In case we optimize for size, we use repstosb even if it's less efficient
185 /// so we can save the loads/stores of the leftover.
186 if (DAG.getMachineFunction().getFunction().hasMinSize()) {
187 if (auto *ValC = dyn_cast<ConstantSDNode>(Val)) {
188 // Special case 0 because otherwise we get large literals,
189 // which causes larger encoding.
190 if ((Size & 31) == 0 && (ValC->getZExtValue() & 255) == 0) {
191 MVT BlockType = MVT::i32;
192 const uint64_t BlockBits = BlockType.getSizeInBits();
193 const uint64_t BlockBytes = BlockBits / 8;
194 const uint64_t BlockCount = Size / BlockBytes;
195
196 Val = DAG.getConstant(Val: 0, DL: dl, VT: BlockType);
197 // repstosd is same size as repstosb
198 return emitRepstos(Subtarget, DAG, dl, Chain, Dst, Val,
199 Size: DAG.getIntPtrConstant(Val: BlockCount, DL: dl), AVT: BlockType);
200 }
201 }
202 return emitRepstosB(Subtarget, DAG, dl, Chain, Dst, Val, Size);
203 }
204
205 if (Size > Subtarget.getMaxInlineSizeThreshold())
206 return SDValue();
207
208 // If not DWORD aligned or size is more than the threshold, call the library.
209 // The libc version is likely to be faster for these cases. It can use the
210 // address value and run time information about the CPU.
211 if (Alignment < Align(4))
212 return SDValue();
213
214 MVT BlockType = MVT::i8;
215 uint64_t BlockCount = Size;
216 uint64_t BytesLeft = 0;
217
218 SDValue OriginalVal = Val;
219 if (auto *ValC = dyn_cast<ConstantSDNode>(Val)) {
220 BlockType = getOptimalRepType(Subtarget, Alignment);
221 uint64_t Value = ValC->getZExtValue() & 255;
222 const uint64_t BlockBits = BlockType.getSizeInBits();
223
224 if (BlockBits >= 16)
225 Value = (Value << 8) | Value;
226
227 if (BlockBits >= 32)
228 Value = (Value << 16) | Value;
229
230 if (BlockBits >= 64)
231 Value = (Value << 32) | Value;
232
233 const uint64_t BlockBytes = BlockBits / 8;
234 BlockCount = Size / BlockBytes;
235 BytesLeft = Size % BlockBytes;
236 Val = DAG.getConstant(Val: Value, DL: dl, VT: BlockType);
237 }
238
239 SDValue RepStos =
240 emitRepstos(Subtarget, DAG, dl, Chain, Dst, Val,
241 Size: DAG.getIntPtrConstant(Val: BlockCount, DL: dl), AVT: BlockType);
242 /// RepStos can process the whole length.
243 if (BytesLeft == 0)
244 return RepStos;
245
246 // Handle the last 1 - 7 bytes.
247 SmallVector<SDValue, 4> Results;
248 Results.push_back(Elt: RepStos);
249 unsigned Offset = Size - BytesLeft;
250 EVT AddrVT = Dst.getValueType();
251
252 Results.push_back(
253 Elt: DAG.getMemset(Chain, dl,
254 Dst: DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: AddrVT, N1: Dst,
255 N2: DAG.getConstant(Val: Offset, DL: dl, VT: AddrVT)),
256 Src: OriginalVal, Size: DAG.getConstant(Val: BytesLeft, DL: dl, VT: SizeVT),
257 Alignment, isVol: isVolatile, AlwaysInline,
258 /* CI */ nullptr, DstPtrInfo: DstPtrInfo.getWithOffset(O: Offset)));
259
260 return DAG.getNode(Opcode: ISD::TokenFactor, DL: dl, VT: MVT::Other, Ops: Results);
261}
262
263SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
264 SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Val,
265 SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
266 MachinePointerInfo DstPtrInfo) const {
267 // If to a segment-relative address space, use the default lowering.
268 if (DstPtrInfo.getAddrSpace() >= 256)
269 return SDValue();
270
271 // If the base register might conflict with our physical registers, bail out.
272 const MCPhysReg ClobberSet[] = {X86::RCX, X86::RAX, X86::RDI,
273 X86::ECX, X86::EAX, X86::EDI};
274 if (isBaseRegConflictPossible(DAG, ClobberSet))
275 return SDValue();
276
277 ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Val&: Size);
278 if (!ConstantSize)
279 return SDValue();
280
281 const X86Subtarget &Subtarget =
282 DAG.getMachineFunction().getSubtarget<X86Subtarget>();
283 return emitConstantSizeRepstos(
284 DAG, Subtarget, dl, Chain, Dst, Val, Size: ConstantSize->getZExtValue(),
285 SizeVT: Size.getValueType(), Alignment, isVolatile, AlwaysInline, DstPtrInfo);
286}
287
288/// Emit a single REP MOVS{B,W,D,Q} instruction.
289static SDValue emitRepmovs(const X86Subtarget &Subtarget, SelectionDAG &DAG,
290 const SDLoc &dl, SDValue Chain, SDValue Dst,
291 SDValue Src, SDValue Size, MVT AVT) {
292 const bool Use64BitRegs = Subtarget.isTarget64BitLP64();
293 const unsigned CX = Use64BitRegs ? X86::RCX : X86::ECX;
294 const unsigned DI = Use64BitRegs ? X86::RDI : X86::EDI;
295 const unsigned SI = Use64BitRegs ? X86::RSI : X86::ESI;
296
297 SDValue InGlue;
298 Chain = DAG.getCopyToReg(Chain, dl, Reg: CX, N: Size, Glue: InGlue);
299 InGlue = Chain.getValue(R: 1);
300 Chain = DAG.getCopyToReg(Chain, dl, Reg: DI, N: Dst, Glue: InGlue);
301 InGlue = Chain.getValue(R: 1);
302 Chain = DAG.getCopyToReg(Chain, dl, Reg: SI, N: Src, Glue: InGlue);
303 InGlue = Chain.getValue(R: 1);
304
305 SDVTList Tys = DAG.getVTList(VT1: MVT::Other, VT2: MVT::Glue);
306 SDValue Ops[] = {Chain, DAG.getValueType(AVT), InGlue};
307 return DAG.getNode(Opcode: X86ISD::REP_MOVS, DL: dl, VTList: Tys, Ops);
308}
309
310/// Emit a single REP MOVSB instruction for a particular constant size.
311static SDValue emitRepmovsB(const X86Subtarget &Subtarget, SelectionDAG &DAG,
312 const SDLoc &dl, SDValue Chain, SDValue Dst,
313 SDValue Src, uint64_t Size) {
314 return emitRepmovs(Subtarget, DAG, dl, Chain, Dst, Src,
315 Size: DAG.getIntPtrConstant(Val: Size, DL: dl), AVT: MVT::i8);
316}
317
318/// Returns a REP MOVS instruction, possibly with a few load/stores to implement
319/// a constant size memory copy. In some cases where we know REP MOVS is
320/// inefficient we return an empty SDValue so the calling code can either
321/// generate a load/store sequence or call the runtime memcpy function.
322static SDValue emitConstantSizeRepmov(
323 SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &dl,
324 SDValue Chain, SDValue Dst, SDValue Src, uint64_t Size, EVT SizeVT,
325 Align Alignment, bool isVolatile, bool AlwaysInline,
326 MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) {
327 /// In case we optimize for size, we use repmovsb even if it's less efficient
328 /// so we can save the loads/stores of the leftover.
329 if (DAG.getMachineFunction().getFunction().hasMinSize())
330 return emitRepmovsB(Subtarget, DAG, dl, Chain, Dst, Src, Size);
331
332 /// TODO: Revisit next line: big copy with ERMSB on march >= haswell are very
333 /// efficient.
334 if (!AlwaysInline && Size > Subtarget.getMaxInlineSizeThreshold())
335 return SDValue();
336
337 /// If we have enhanced repmovs we use it.
338 if (Subtarget.hasERMSB())
339 return emitRepmovsB(Subtarget, DAG, dl, Chain, Dst, Src, Size);
340
341 assert(!Subtarget.hasERMSB() && "No efficient RepMovs");
342 /// We assume runtime memcpy will do a better job for unaligned copies when
343 /// ERMS is not present.
344 if (!AlwaysInline && (Alignment < Align(4)))
345 return SDValue();
346
347 const MVT BlockType = getOptimalRepType(Subtarget, Alignment);
348 const uint64_t BlockBytes = BlockType.getSizeInBits() / 8;
349 const uint64_t BlockCount = Size / BlockBytes;
350 const uint64_t BytesLeft = Size % BlockBytes;
351 SDValue RepMovs =
352 emitRepmovs(Subtarget, DAG, dl, Chain, Dst, Src,
353 Size: DAG.getIntPtrConstant(Val: BlockCount, DL: dl), AVT: BlockType);
354
355 /// RepMov can process the whole length.
356 if (BytesLeft == 0)
357 return RepMovs;
358
359 assert(BytesLeft && "We have leftover at this point");
360
361 // Handle the last 1 - 7 bytes.
362 SmallVector<SDValue, 4> Results;
363 Results.push_back(Elt: RepMovs);
364 unsigned Offset = Size - BytesLeft;
365 EVT DstVT = Dst.getValueType();
366 EVT SrcVT = Src.getValueType();
367 Results.push_back(Elt: DAG.getMemcpy(
368 Chain, dl,
369 Dst: DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: DstVT, N1: Dst, N2: DAG.getConstant(Val: Offset, DL: dl, VT: DstVT)),
370 Src: DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: SrcVT, N1: Src, N2: DAG.getConstant(Val: Offset, DL: dl, VT: SrcVT)),
371 Size: DAG.getConstant(Val: BytesLeft, DL: dl, VT: SizeVT), Alignment, isVol: isVolatile,
372 /*AlwaysInline*/ true, /*CI=*/nullptr, OverrideTailCall: std::nullopt,
373 DstPtrInfo: DstPtrInfo.getWithOffset(O: Offset), SrcPtrInfo: SrcPtrInfo.getWithOffset(O: Offset)));
374 return DAG.getNode(Opcode: ISD::TokenFactor, DL: dl, VT: MVT::Other, Ops: Results);
375}
376
377SDValue X86SelectionDAGInfo::EmitTargetCodeForMemcpy(
378 SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
379 SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
380 MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
381 // If to a segment-relative address space, use the default lowering.
382 if (DstPtrInfo.getAddrSpace() >= 256 || SrcPtrInfo.getAddrSpace() >= 256)
383 return SDValue();
384
385 // If the base registers conflict with our physical registers, use the default
386 // lowering.
387 const MCPhysReg ClobberSet[] = {X86::RCX, X86::RSI, X86::RDI,
388 X86::ECX, X86::ESI, X86::EDI};
389 if (isBaseRegConflictPossible(DAG, ClobberSet))
390 return SDValue();
391
392 const X86Subtarget &Subtarget =
393 DAG.getMachineFunction().getSubtarget<X86Subtarget>();
394
395 // If enabled and available, use fast short rep mov.
396 if (UseFSRMForMemcpy && Subtarget.hasFSRM())
397 return emitRepmovs(Subtarget, DAG, dl, Chain, Dst, Src, Size, AVT: MVT::i8);
398
399 /// Handle constant sizes
400 if (ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Val&: Size))
401 return emitConstantSizeRepmov(DAG, Subtarget, dl, Chain, Dst, Src,
402 Size: ConstantSize->getZExtValue(),
403 SizeVT: Size.getValueType(), Alignment, isVolatile,
404 AlwaysInline, DstPtrInfo, SrcPtrInfo);
405
406 return SDValue();
407}
408