1//===-- ARMSelectionDAGInfo.cpp - ARM SelectionDAG Info -------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements the ARMSelectionDAGInfo class.
10//
11//===----------------------------------------------------------------------===//
12
13#include "ARMSelectionDAGInfo.h"
14#include "ARMTargetTransformInfo.h"
15#include "llvm/CodeGen/SelectionDAG.h"
16#include "llvm/Support/CommandLine.h"
17
18#define GET_SDNODE_DESC
19#include "ARMGenSDNodeInfo.inc"
20
21using namespace llvm;
22
23#define DEBUG_TYPE "arm-selectiondag-info"
24
/// Command-line control over converting memory-transfer intrinsics into MVE
/// tail-predicated (WLSTP) loops. Defaults to force-disabled; under "allow"
/// the decision is delegated to shouldGenerateInlineTPLoop()'s heuristics.
static cl::opt<TPLoop::MemTransfer> EnableMemtransferTPLoop(
    "arm-memtransfer-tploop", cl::Hidden,
    cl::desc("Control conversion of memcpy to "
             "Tail predicated loops (WLSTP)"),
    cl::init(Val: TPLoop::ForceDisabled),
    cl::values(clEnumValN(TPLoop::ForceDisabled, "force-disabled",
                          "Don't convert memcpy to TP loop."),
               clEnumValN(TPLoop::ForceEnabled, "force-enabled",
                          "Always convert memcpy to TP loop."),
               clEnumValN(TPLoop::Allow, "allow",
                          "Allow (may be subject to certain conditions) "
                          "conversion of memcpy to TP loop.")));
37
// Initialize the generated base class with the TableGen-emitted SDNode
// descriptor table (ARMGenSDNodeInfo.inc, included above with
// GET_SDNODE_DESC), which drives generic node naming and verification.
ARMSelectionDAGInfo::ARMSelectionDAGInfo()
    : SelectionDAGGenTargetInfo(ARMGenSDNodeInfo) {}
40
41const char *ARMSelectionDAGInfo::getTargetNodeName(unsigned Opcode) const {
42#define MAKE_CASE(V) \
43 case V: \
44 return #V;
45
46 // These nodes don't have corresponding entries in *.td files yet.
47 switch (static_cast<ARMISD::NodeType>(Opcode)) {
48 MAKE_CASE(ARMISD::DYN_ALLOC)
49 MAKE_CASE(ARMISD::MVESEXT)
50 MAKE_CASE(ARMISD::MVEZEXT)
51 MAKE_CASE(ARMISD::MVETRUNC)
52 MAKE_CASE(ARMISD::BUILD_VECTOR)
53 MAKE_CASE(ARMISD::VLD1DUP)
54 MAKE_CASE(ARMISD::VLD2DUP)
55 MAKE_CASE(ARMISD::VLD3DUP)
56 MAKE_CASE(ARMISD::VLD4DUP)
57 MAKE_CASE(ARMISD::VLD1_UPD)
58 MAKE_CASE(ARMISD::VLD2_UPD)
59 MAKE_CASE(ARMISD::VLD3_UPD)
60 MAKE_CASE(ARMISD::VLD4_UPD)
61 MAKE_CASE(ARMISD::VLD1x2_UPD)
62 MAKE_CASE(ARMISD::VLD1x3_UPD)
63 MAKE_CASE(ARMISD::VLD1x4_UPD)
64 MAKE_CASE(ARMISD::VLD2LN_UPD)
65 MAKE_CASE(ARMISD::VLD3LN_UPD)
66 MAKE_CASE(ARMISD::VLD4LN_UPD)
67 MAKE_CASE(ARMISD::VLD1DUP_UPD)
68 MAKE_CASE(ARMISD::VLD2DUP_UPD)
69 MAKE_CASE(ARMISD::VLD3DUP_UPD)
70 MAKE_CASE(ARMISD::VLD4DUP_UPD)
71 MAKE_CASE(ARMISD::VST1_UPD)
72 MAKE_CASE(ARMISD::VST3_UPD)
73 MAKE_CASE(ARMISD::VST1x2_UPD)
74 MAKE_CASE(ARMISD::VST1x3_UPD)
75 MAKE_CASE(ARMISD::VST1x4_UPD)
76 MAKE_CASE(ARMISD::VST2LN_UPD)
77 MAKE_CASE(ARMISD::VST3LN_UPD)
78 MAKE_CASE(ARMISD::VST4LN_UPD)
79 }
80#undef MAKE_CASE
81
82 return SelectionDAGGenTargetInfo::getTargetNodeName(Opcode);
83}
84
85bool ARMSelectionDAGInfo::isTargetMemoryOpcode(unsigned Opcode) const {
86 // These nodes don't have corresponding entries in *.td files yet.
87 if (Opcode >= ARMISD::FIRST_MEMORY_OPCODE &&
88 Opcode <= ARMISD::LAST_MEMORY_OPCODE)
89 return true;
90
91 return SelectionDAGGenTargetInfo::isTargetMemoryOpcode(Opcode);
92}
93
94void ARMSelectionDAGInfo::verifyTargetNode(const SelectionDAG &DAG,
95 const SDNode *N) const {
96 switch (N->getOpcode()) {
97 default:
98 break;
99 case ARMISD::WIN__DBZCHK:
100 // invalid number of results; expected 2, got 1
101 case ARMISD::WIN__CHKSTK:
102 // invalid number of results; expected 1, got 2
103 case ARMISD::COPY_STRUCT_BYVAL:
104 // invalid number of operands; expected 6, got 5
105 case ARMISD::MEMCPY:
106 // invalid number of operands; expected 5, got 4
107 case ARMISD::VMOVRRD:
108 // operand #0 must have type f64, but has type v1i64/v4f16/v8i8
109 case ARMISD::VMOVIMM:
110 // operand #0 must have type i32, but has type i16
111 return;
112 }
113
114 SelectionDAGGenTargetInfo::verifyTargetNode(DAG, N);
115}
116
117// Emit, if possible, a specialized version of the given Libcall. Typically this
118// means selecting the appropriately aligned version, but we also convert memset
119// of 0 into memclr.
120SDValue ARMSelectionDAGInfo::EmitSpecializedLibcall(
121 SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
122 SDValue Size, unsigned Align, RTLIB::Libcall LC) const {
123 const ARMSubtarget &Subtarget =
124 DAG.getMachineFunction().getSubtarget<ARMSubtarget>();
125 const ARMTargetLowering *TLI = Subtarget.getTargetLowering();
126
127 // Only use a specialized AEABI function if the default version of this
128 // Libcall is an AEABI function.
129 //
130 // Translate RTLIB::Libcall to AEABILibcall. We only do this in order to be
131 // able to translate memset to memclr and use the value to index the function
132 // name array.
133 enum {
134 AEABI_MEMCPY = 0,
135 AEABI_MEMMOVE,
136 AEABI_MEMSET,
137 AEABI_MEMCLR
138 } AEABILibcall;
139 switch (LC) {
140 case RTLIB::MEMCPY:
141 if (DAG.getLibcalls().getLibcallImpl(Call: LC) != RTLIB::impl___aeabi_memcpy)
142 return SDValue();
143
144 AEABILibcall = AEABI_MEMCPY;
145 break;
146 case RTLIB::MEMMOVE:
147 if (DAG.getLibcalls().getLibcallImpl(Call: LC) != RTLIB::impl___aeabi_memmove)
148 return SDValue();
149
150 AEABILibcall = AEABI_MEMMOVE;
151 break;
152 case RTLIB::MEMSET:
153 if (DAG.getLibcalls().getLibcallImpl(Call: LC) != RTLIB::impl___aeabi_memset)
154 return SDValue();
155
156 AEABILibcall = AEABI_MEMSET;
157 if (isNullConstant(V: Src))
158 AEABILibcall = AEABI_MEMCLR;
159 break;
160 default:
161 return SDValue();
162 }
163
164 // Choose the most-aligned libcall variant that we can
165 enum {
166 ALIGN1 = 0,
167 ALIGN4,
168 ALIGN8
169 } AlignVariant;
170 if ((Align & 7) == 0)
171 AlignVariant = ALIGN8;
172 else if ((Align & 3) == 0)
173 AlignVariant = ALIGN4;
174 else
175 AlignVariant = ALIGN1;
176
177 TargetLowering::ArgListTy Args;
178 Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(C&: *DAG.getContext());
179 Args.emplace_back(args&: Dst, args&: IntPtrTy);
180 if (AEABILibcall == AEABI_MEMCLR) {
181 Args.emplace_back(args&: Size, args&: IntPtrTy);
182 } else if (AEABILibcall == AEABI_MEMSET) {
183 // Adjust parameters for memset, EABI uses format (ptr, size, value),
184 // GNU library uses (ptr, value, size)
185 // See RTABI section 4.3.4
186 Args.emplace_back(args&: Size, args&: IntPtrTy);
187
188 // Extend or truncate the argument to be an i32 value for the call.
189 if (Src.getValueType().bitsGT(VT: MVT::i32))
190 Src = DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: MVT::i32, Operand: Src);
191 else if (Src.getValueType().bitsLT(VT: MVT::i32))
192 Src = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: dl, VT: MVT::i32, Operand: Src);
193
194 TargetLowering::ArgListEntry Entry(Src,
195 Type::getInt32Ty(C&: *DAG.getContext()));
196 Entry.IsSExt = false;
197 Args.push_back(x: Entry);
198 } else {
199 Args.emplace_back(args&: Src, args&: IntPtrTy);
200 Args.emplace_back(args&: Size, args&: IntPtrTy);
201 }
202
203 static const RTLIB::Libcall FunctionImpls[4][3] = {
204 {RTLIB::MEMCPY, RTLIB::AEABI_MEMCPY4, RTLIB::AEABI_MEMCPY8},
205 {RTLIB::MEMMOVE, RTLIB::AEABI_MEMMOVE4, RTLIB::AEABI_MEMMOVE8},
206 {RTLIB::MEMSET, RTLIB::AEABI_MEMSET4, RTLIB::AEABI_MEMSET8},
207 {RTLIB::AEABI_MEMCLR, RTLIB::AEABI_MEMCLR4, RTLIB::AEABI_MEMCLR8}};
208
209 RTLIB::Libcall NewLC = FunctionImpls[AEABILibcall][AlignVariant];
210 RTLIB::LibcallImpl LCImpl = DAG.getLibcalls().getLibcallImpl(Call: NewLC);
211 if (LCImpl == RTLIB::Unsupported)
212 return SDValue();
213
214 TargetLowering::CallLoweringInfo CLI(DAG);
215 CLI.setDebugLoc(dl)
216 .setChain(Chain)
217 .setLibCallee(
218 CC: DAG.getLibcalls().getLibcallImplCallingConv(Call: LCImpl),
219 ResultType: Type::getVoidTy(C&: *DAG.getContext()),
220 Target: DAG.getExternalSymbol(LCImpl, VT: TLI->getPointerTy(DL: DAG.getDataLayout())),
221 ArgsList: std::move(Args))
222 .setDiscardResult();
223 std::pair<SDValue,SDValue> CallResult = TLI->LowerCallTo(CLI);
224
225 return CallResult.second;
226}
227
228static bool shouldGenerateInlineTPLoop(const ARMSubtarget &Subtarget,
229 const SelectionDAG &DAG,
230 ConstantSDNode *ConstantSize,
231 Align Alignment, bool IsMemcpy) {
232 auto &F = DAG.getMachineFunction().getFunction();
233 if (!EnableMemtransferTPLoop)
234 return false;
235 if (EnableMemtransferTPLoop == TPLoop::ForceEnabled)
236 return true;
237 // Do not generate inline TP loop if optimizations is disabled,
238 // or if optimization for size (-Os or -Oz) is on.
239 if (F.hasOptNone() || F.hasOptSize())
240 return false;
241 // If cli option is unset, for memset always generate inline TP.
242 // For memcpy, check some conditions
243 if (!IsMemcpy)
244 return true;
245 if (!ConstantSize && Alignment >= Align(4))
246 return true;
247 if (ConstantSize &&
248 ConstantSize->getZExtValue() > Subtarget.getMaxInlineSizeThreshold() &&
249 ConstantSize->getZExtValue() <
250 Subtarget.getMaxMemcpyTPInlineSizeThreshold())
251 return true;
252 return false;
253}
254
// Inline-expand a memcpy when profitable. Strategy, in order:
//   1. An MVE tail-predicated loop (WLSTP) when the subtarget and the
//      heuristics in shouldGenerateInlineTPLoop() allow it.
//   2. A chain of ARMISD::MEMCPY pseudos (later lowered to LDM/STM) for
//      constant sizes with >= 4-byte alignment, plus scalar i16/i8
//      loads/stores for the 1-3 trailing bytes.
//   3. Otherwise, a specialized __aeabi_* libcall (or SDValue() to let the
//      generic expansion run when alignment is insufficient).
SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(
    SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
    SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
    MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
  const ARMSubtarget &Subtarget =
      DAG.getMachineFunction().getSubtarget<ARMSubtarget>();
  // Non-null only when the copy length is a compile-time constant.
  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Val&: Size);

  // Case 1: emit a tail-predicated loop pseudo; it consumes the full
  // (possibly dynamic) size as an i32.
  if (Subtarget.hasMVEIntegerOps() &&
      shouldGenerateInlineTPLoop(Subtarget, DAG, ConstantSize, Alignment, IsMemcpy: true))
    return DAG.getNode(Opcode: ARMISD::MEMCPYLOOP, DL: dl, VT: MVT::Other, N1: Chain, N2: Dst, N3: Src,
                       N4: DAG.getZExtOrTrunc(Op: Size, DL: dl, VT: MVT::i32));

  // Do repeated 4-byte loads and stores. To be improved.
  // This requires 4-byte alignment.
  if (Alignment < Align(4))
    return SDValue();
  // This requires the copy size to be a constant, preferably
  // within a subtarget-specific limit.
  if (!ConstantSize)
    return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
                                  Align: Alignment.value(), LC: RTLIB::MEMCPY);
  uint64_t SizeVal = ConstantSize->getZExtValue();
  if (!AlwaysInline && SizeVal > Subtarget.getMaxInlineSizeThreshold())
    return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
                                  Align: Alignment.value(), LC: RTLIB::MEMCPY);

  unsigned BytesLeft = SizeVal & 3;  // 0-3 trailing bytes after whole words.
  unsigned NumMemOps = SizeVal >> 2; // Number of whole 4-byte words to copy.
  unsigned EmittedNumMemOps = 0;
  EVT VT = MVT::i32;
  unsigned VTSize = 4;
  unsigned i = 0;
  // Emit a maximum of 4 loads in Thumb1 since we have fewer registers
  const unsigned MaxLoadsInLDM = Subtarget.isThumb1Only() ? 4 : 6;
  SDValue TFOps[6];
  SDValue Loads[6];
  // Byte offsets for the trailing-bytes loop; the word-sized part advances
  // the Src/Dst pointers through the MEMCPY pseudo's results instead.
  uint64_t SrcOff = 0, DstOff = 0;

  // FIXME: We should invent a VMEMCPY pseudo-instruction that lowers to
  // VLDM/VSTM and make this code emit it when appropriate. This would reduce
  // pressure on the general purpose registers. However this seems harder to map
  // onto the register allocator's view of the world.

  // The number of MEMCPY pseudo-instructions to emit. We use up to
  // MaxLoadsInLDM registers per mcopy, which will get lowered into ldm/stm
  // later on. This is a lower bound on the number of MEMCPY operations we must
  // emit.
  unsigned NumMEMCPYs = (NumMemOps + MaxLoadsInLDM - 1) / MaxLoadsInLDM;

  // Code size optimisation: do not inline memcpy if expansion results in
  // more instructions than the library call.
  if (NumMEMCPYs > 1 && Subtarget.hasMinSize()) {
    return SDValue();
  }

  // Each MEMCPY pseudo produces the advanced Dst and Src pointers plus the
  // chain (and glue), which are threaded into the next iteration.
  SDVTList VTs = DAG.getVTList(VT1: MVT::i32, VT2: MVT::i32, VT3: MVT::Other, VT4: MVT::Glue);

  for (unsigned I = 0; I != NumMEMCPYs; ++I) {
    // Evenly distribute registers among MEMCPY operations to reduce register
    // pressure.
    unsigned NextEmittedNumMemOps = NumMemOps * (I + 1) / NumMEMCPYs;
    unsigned NumRegs = NextEmittedNumMemOps - EmittedNumMemOps;

    Dst = DAG.getNode(Opcode: ARMISD::MEMCPY, DL: dl, VTList: VTs, N1: Chain, N2: Dst, N3: Src,
                      N4: DAG.getConstant(Val: NumRegs, DL: dl, VT: MVT::i32));
    Src = Dst.getValue(R: 1);
    Chain = Dst.getValue(R: 2);

    // Keep the pointer info in sync with the pointers the pseudo advanced.
    DstPtrInfo = DstPtrInfo.getWithOffset(O: NumRegs * VTSize);
    SrcPtrInfo = SrcPtrInfo.getWithOffset(O: NumRegs * VTSize);

    EmittedNumMemOps = NextEmittedNumMemOps;
  }

  if (BytesLeft == 0)
    return Chain;

  // Issue loads / stores for the trailing (1 - 3) bytes.
  auto getRemainingValueType = [](unsigned BytesLeft) {
    return (BytesLeft >= 2) ? MVT::i16 : MVT::i8;
  };
  auto getRemainingSize = [](unsigned BytesLeft) {
    return (BytesLeft >= 2) ? 2 : 1;
  };

  // First pass: issue all trailing loads, collecting their chains so the
  // stores below can depend on every load via a single TokenFactor.
  unsigned BytesLeftSave = BytesLeft;
  i = 0;
  while (BytesLeft) {
    VT = getRemainingValueType(BytesLeft);
    VTSize = getRemainingSize(BytesLeft);
    Loads[i] = DAG.getLoad(VT, dl, Chain,
                           Ptr: DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: MVT::i32, N1: Src,
                                            N2: DAG.getConstant(Val: SrcOff, DL: dl, VT: MVT::i32)),
                           PtrInfo: SrcPtrInfo.getWithOffset(O: SrcOff));
    TFOps[i] = Loads[i].getValue(R: 1);
    ++i;
    SrcOff += VTSize;
    BytesLeft -= VTSize;
  }
  Chain = DAG.getNode(Opcode: ISD::TokenFactor, DL: dl, VT: MVT::Other, Ops: ArrayRef(TFOps, i));

  // Second pass: issue the matching trailing stores.
  i = 0;
  BytesLeft = BytesLeftSave;
  while (BytesLeft) {
    VT = getRemainingValueType(BytesLeft);
    VTSize = getRemainingSize(BytesLeft);
    TFOps[i] = DAG.getStore(Chain, dl, Val: Loads[i],
                            Ptr: DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: MVT::i32, N1: Dst,
                                             N2: DAG.getConstant(Val: DstOff, DL: dl, VT: MVT::i32)),
                            PtrInfo: DstPtrInfo.getWithOffset(O: DstOff));
    ++i;
    DstOff += VTSize;
    BytesLeft -= VTSize;
  }
  return DAG.getNode(Opcode: ISD::TokenFactor, DL: dl, VT: MVT::Other, Ops: ArrayRef(TFOps, i));
}
372
373SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemmove(
374 SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
375 SDValue Size, Align Alignment, bool isVolatile,
376 MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
377 return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
378 Align: Alignment.value(), LC: RTLIB::MEMMOVE);
379}
380
381SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemset(
382 SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
383 SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
384 MachinePointerInfo DstPtrInfo) const {
385
386 const ARMSubtarget &Subtarget =
387 DAG.getMachineFunction().getSubtarget<ARMSubtarget>();
388
389 ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Val&: Size);
390
391 // Generate TP loop for llvm.memset
392 if (Subtarget.hasMVEIntegerOps() &&
393 shouldGenerateInlineTPLoop(Subtarget, DAG, ConstantSize, Alignment,
394 IsMemcpy: false)) {
395 Src = DAG.getSplatBuildVector(VT: MVT::v16i8, DL: dl,
396 Op: DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: MVT::i8, Operand: Src));
397 return DAG.getNode(Opcode: ARMISD::MEMSETLOOP, DL: dl, VT: MVT::Other, N1: Chain, N2: Dst, N3: Src,
398 N4: DAG.getZExtOrTrunc(Op: Size, DL: dl, VT: MVT::i32));
399 }
400
401 if (!AlwaysInline)
402 return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
403 Align: Alignment.value(), LC: RTLIB::MEMSET);
404
405 return SDValue();
406}
407