//===-- ARMSelectionDAGInfo.cpp - ARM SelectionDAG Info -------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the ARMSelectionDAGInfo class.
//
//===----------------------------------------------------------------------===//
12
#include "ARMTargetTransformInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/Support/CommandLine.h"
#include <cstring>
16using namespace llvm;
17
18#define DEBUG_TYPE "arm-selectiondag-info"
19
20static cl::opt<TPLoop::MemTransfer> EnableMemtransferTPLoop(
21 "arm-memtransfer-tploop", cl::Hidden,
22 cl::desc("Control conversion of memcpy to "
23 "Tail predicated loops (WLSTP)"),
24 cl::init(Val: TPLoop::ForceDisabled),
25 cl::values(clEnumValN(TPLoop::ForceDisabled, "force-disabled",
26 "Don't convert memcpy to TP loop."),
27 clEnumValN(TPLoop::ForceEnabled, "force-enabled",
28 "Always convert memcpy to TP loop."),
29 clEnumValN(TPLoop::Allow, "allow",
30 "Allow (may be subject to certain conditions) "
31 "conversion of memcpy to TP loop.")));
32
33bool ARMSelectionDAGInfo::isTargetMemoryOpcode(unsigned Opcode) const {
34 return Opcode >= ARMISD::FIRST_MEMORY_OPCODE &&
35 Opcode <= ARMISD::LAST_MEMORY_OPCODE;
36}
37
38// Emit, if possible, a specialized version of the given Libcall. Typically this
39// means selecting the appropriately aligned version, but we also convert memset
40// of 0 into memclr.
41SDValue ARMSelectionDAGInfo::EmitSpecializedLibcall(
42 SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
43 SDValue Size, unsigned Align, RTLIB::Libcall LC) const {
44 const ARMSubtarget &Subtarget =
45 DAG.getMachineFunction().getSubtarget<ARMSubtarget>();
46 const ARMTargetLowering *TLI = Subtarget.getTargetLowering();
47
48 // Only use a specialized AEABI function if the default version of this
49 // Libcall is an AEABI function.
50 if (std::strncmp(s1: TLI->getLibcallName(Call: LC), s2: "__aeabi", n: 7) != 0)
51 return SDValue();
52
53 // Translate RTLIB::Libcall to AEABILibcall. We only do this in order to be
54 // able to translate memset to memclr and use the value to index the function
55 // name array.
56 enum {
57 AEABI_MEMCPY = 0,
58 AEABI_MEMMOVE,
59 AEABI_MEMSET,
60 AEABI_MEMCLR
61 } AEABILibcall;
62 switch (LC) {
63 case RTLIB::MEMCPY:
64 AEABILibcall = AEABI_MEMCPY;
65 break;
66 case RTLIB::MEMMOVE:
67 AEABILibcall = AEABI_MEMMOVE;
68 break;
69 case RTLIB::MEMSET:
70 AEABILibcall = AEABI_MEMSET;
71 if (isNullConstant(V: Src))
72 AEABILibcall = AEABI_MEMCLR;
73 break;
74 default:
75 return SDValue();
76 }
77
78 // Choose the most-aligned libcall variant that we can
79 enum {
80 ALIGN1 = 0,
81 ALIGN4,
82 ALIGN8
83 } AlignVariant;
84 if ((Align & 7) == 0)
85 AlignVariant = ALIGN8;
86 else if ((Align & 3) == 0)
87 AlignVariant = ALIGN4;
88 else
89 AlignVariant = ALIGN1;
90
91 TargetLowering::ArgListTy Args;
92 TargetLowering::ArgListEntry Entry;
93 Entry.Ty = DAG.getDataLayout().getIntPtrType(C&: *DAG.getContext());
94 Entry.Node = Dst;
95 Args.push_back(x: Entry);
96 if (AEABILibcall == AEABI_MEMCLR) {
97 Entry.Node = Size;
98 Args.push_back(x: Entry);
99 } else if (AEABILibcall == AEABI_MEMSET) {
100 // Adjust parameters for memset, EABI uses format (ptr, size, value),
101 // GNU library uses (ptr, value, size)
102 // See RTABI section 4.3.4
103 Entry.Node = Size;
104 Args.push_back(x: Entry);
105
106 // Extend or truncate the argument to be an i32 value for the call.
107 if (Src.getValueType().bitsGT(VT: MVT::i32))
108 Src = DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: MVT::i32, Operand: Src);
109 else if (Src.getValueType().bitsLT(VT: MVT::i32))
110 Src = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: dl, VT: MVT::i32, Operand: Src);
111
112 Entry.Node = Src;
113 Entry.Ty = Type::getInt32Ty(C&: *DAG.getContext());
114 Entry.IsSExt = false;
115 Args.push_back(x: Entry);
116 } else {
117 Entry.Node = Src;
118 Args.push_back(x: Entry);
119
120 Entry.Node = Size;
121 Args.push_back(x: Entry);
122 }
123
124 static const RTLIB::Libcall FunctionImpls[4][3] = {
125 {RTLIB::MEMCPY, RTLIB::AEABI_MEMCPY4, RTLIB::AEABI_MEMCPY8},
126 {RTLIB::MEMMOVE, RTLIB::AEABI_MEMMOVE4, RTLIB::AEABI_MEMMOVE8},
127 {RTLIB::MEMSET, RTLIB::AEABI_MEMSET4, RTLIB::AEABI_MEMSET8},
128 {RTLIB::AEABI_MEMCLR, RTLIB::AEABI_MEMCLR4, RTLIB::AEABI_MEMCLR8}};
129
130 RTLIB::Libcall NewLC = FunctionImpls[AEABILibcall][AlignVariant];
131
132 TargetLowering::CallLoweringInfo CLI(DAG);
133 CLI.setDebugLoc(dl)
134 .setChain(Chain)
135 .setLibCallee(
136 CC: TLI->getLibcallCallingConv(Call: NewLC), ResultType: Type::getVoidTy(C&: *DAG.getContext()),
137 Target: DAG.getExternalSymbol(Sym: TLI->getLibcallName(Call: NewLC),
138 VT: TLI->getPointerTy(DL: DAG.getDataLayout())),
139 ArgsList: std::move(Args))
140 .setDiscardResult();
141 std::pair<SDValue,SDValue> CallResult = TLI->LowerCallTo(CLI);
142
143 return CallResult.second;
144}
145
146static bool shouldGenerateInlineTPLoop(const ARMSubtarget &Subtarget,
147 const SelectionDAG &DAG,
148 ConstantSDNode *ConstantSize,
149 Align Alignment, bool IsMemcpy) {
150 auto &F = DAG.getMachineFunction().getFunction();
151 if (!EnableMemtransferTPLoop)
152 return false;
153 if (EnableMemtransferTPLoop == TPLoop::ForceEnabled)
154 return true;
155 // Do not generate inline TP loop if optimizations is disabled,
156 // or if optimization for size (-Os or -Oz) is on.
157 if (F.hasOptNone() || F.hasOptSize())
158 return false;
159 // If cli option is unset, for memset always generate inline TP.
160 // For memcpy, check some conditions
161 if (!IsMemcpy)
162 return true;
163 if (!ConstantSize && Alignment >= Align(4))
164 return true;
165 if (ConstantSize &&
166 ConstantSize->getZExtValue() > Subtarget.getMaxInlineSizeThreshold() &&
167 ConstantSize->getZExtValue() <
168 Subtarget.getMaxMemcpyTPInlineSizeThreshold())
169 return true;
170 return false;
171}
172
173SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(
174 SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
175 SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
176 MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
177 const ARMSubtarget &Subtarget =
178 DAG.getMachineFunction().getSubtarget<ARMSubtarget>();
179 ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Val&: Size);
180
181 if (Subtarget.hasMVEIntegerOps() &&
182 shouldGenerateInlineTPLoop(Subtarget, DAG, ConstantSize, Alignment, IsMemcpy: true))
183 return DAG.getNode(Opcode: ARMISD::MEMCPYLOOP, DL: dl, VT: MVT::Other, N1: Chain, N2: Dst, N3: Src,
184 N4: DAG.getZExtOrTrunc(Op: Size, DL: dl, VT: MVT::i32));
185
186 // Do repeated 4-byte loads and stores. To be improved.
187 // This requires 4-byte alignment.
188 if (Alignment < Align(4))
189 return SDValue();
190 // This requires the copy size to be a constant, preferably
191 // within a subtarget-specific limit.
192 if (!ConstantSize)
193 return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
194 Align: Alignment.value(), LC: RTLIB::MEMCPY);
195 uint64_t SizeVal = ConstantSize->getZExtValue();
196 if (!AlwaysInline && SizeVal > Subtarget.getMaxInlineSizeThreshold())
197 return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
198 Align: Alignment.value(), LC: RTLIB::MEMCPY);
199
200 unsigned BytesLeft = SizeVal & 3;
201 unsigned NumMemOps = SizeVal >> 2;
202 unsigned EmittedNumMemOps = 0;
203 EVT VT = MVT::i32;
204 unsigned VTSize = 4;
205 unsigned i = 0;
206 // Emit a maximum of 4 loads in Thumb1 since we have fewer registers
207 const unsigned MaxLoadsInLDM = Subtarget.isThumb1Only() ? 4 : 6;
208 SDValue TFOps[6];
209 SDValue Loads[6];
210 uint64_t SrcOff = 0, DstOff = 0;
211
212 // FIXME: We should invent a VMEMCPY pseudo-instruction that lowers to
213 // VLDM/VSTM and make this code emit it when appropriate. This would reduce
214 // pressure on the general purpose registers. However this seems harder to map
215 // onto the register allocator's view of the world.
216
217 // The number of MEMCPY pseudo-instructions to emit. We use up to
218 // MaxLoadsInLDM registers per mcopy, which will get lowered into ldm/stm
219 // later on. This is a lower bound on the number of MEMCPY operations we must
220 // emit.
221 unsigned NumMEMCPYs = (NumMemOps + MaxLoadsInLDM - 1) / MaxLoadsInLDM;
222
223 // Code size optimisation: do not inline memcpy if expansion results in
224 // more instructions than the libary call.
225 if (NumMEMCPYs > 1 && Subtarget.hasMinSize()) {
226 return SDValue();
227 }
228
229 SDVTList VTs = DAG.getVTList(VT1: MVT::i32, VT2: MVT::i32, VT3: MVT::Other, VT4: MVT::Glue);
230
231 for (unsigned I = 0; I != NumMEMCPYs; ++I) {
232 // Evenly distribute registers among MEMCPY operations to reduce register
233 // pressure.
234 unsigned NextEmittedNumMemOps = NumMemOps * (I + 1) / NumMEMCPYs;
235 unsigned NumRegs = NextEmittedNumMemOps - EmittedNumMemOps;
236
237 Dst = DAG.getNode(Opcode: ARMISD::MEMCPY, DL: dl, VTList: VTs, N1: Chain, N2: Dst, N3: Src,
238 N4: DAG.getConstant(Val: NumRegs, DL: dl, VT: MVT::i32));
239 Src = Dst.getValue(R: 1);
240 Chain = Dst.getValue(R: 2);
241
242 DstPtrInfo = DstPtrInfo.getWithOffset(O: NumRegs * VTSize);
243 SrcPtrInfo = SrcPtrInfo.getWithOffset(O: NumRegs * VTSize);
244
245 EmittedNumMemOps = NextEmittedNumMemOps;
246 }
247
248 if (BytesLeft == 0)
249 return Chain;
250
251 // Issue loads / stores for the trailing (1 - 3) bytes.
252 auto getRemainingValueType = [](unsigned BytesLeft) {
253 return (BytesLeft >= 2) ? MVT::i16 : MVT::i8;
254 };
255 auto getRemainingSize = [](unsigned BytesLeft) {
256 return (BytesLeft >= 2) ? 2 : 1;
257 };
258
259 unsigned BytesLeftSave = BytesLeft;
260 i = 0;
261 while (BytesLeft) {
262 VT = getRemainingValueType(BytesLeft);
263 VTSize = getRemainingSize(BytesLeft);
264 Loads[i] = DAG.getLoad(VT, dl, Chain,
265 Ptr: DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: MVT::i32, N1: Src,
266 N2: DAG.getConstant(Val: SrcOff, DL: dl, VT: MVT::i32)),
267 PtrInfo: SrcPtrInfo.getWithOffset(O: SrcOff));
268 TFOps[i] = Loads[i].getValue(R: 1);
269 ++i;
270 SrcOff += VTSize;
271 BytesLeft -= VTSize;
272 }
273 Chain = DAG.getNode(Opcode: ISD::TokenFactor, DL: dl, VT: MVT::Other, Ops: ArrayRef(TFOps, i));
274
275 i = 0;
276 BytesLeft = BytesLeftSave;
277 while (BytesLeft) {
278 VT = getRemainingValueType(BytesLeft);
279 VTSize = getRemainingSize(BytesLeft);
280 TFOps[i] = DAG.getStore(Chain, dl, Val: Loads[i],
281 Ptr: DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: MVT::i32, N1: Dst,
282 N2: DAG.getConstant(Val: DstOff, DL: dl, VT: MVT::i32)),
283 PtrInfo: DstPtrInfo.getWithOffset(O: DstOff));
284 ++i;
285 DstOff += VTSize;
286 BytesLeft -= VTSize;
287 }
288 return DAG.getNode(Opcode: ISD::TokenFactor, DL: dl, VT: MVT::Other, Ops: ArrayRef(TFOps, i));
289}
290
291SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemmove(
292 SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
293 SDValue Size, Align Alignment, bool isVolatile,
294 MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
295 return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
296 Align: Alignment.value(), LC: RTLIB::MEMMOVE);
297}
298
299SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemset(
300 SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
301 SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
302 MachinePointerInfo DstPtrInfo) const {
303
304 const ARMSubtarget &Subtarget =
305 DAG.getMachineFunction().getSubtarget<ARMSubtarget>();
306
307 ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Val&: Size);
308
309 // Generate TP loop for llvm.memset
310 if (Subtarget.hasMVEIntegerOps() &&
311 shouldGenerateInlineTPLoop(Subtarget, DAG, ConstantSize, Alignment,
312 IsMemcpy: false)) {
313 Src = DAG.getSplatBuildVector(VT: MVT::v16i8, DL: dl,
314 Op: DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: MVT::i8, Operand: Src));
315 return DAG.getNode(Opcode: ARMISD::MEMSETLOOP, DL: dl, VT: MVT::Other, N1: Chain, N2: Dst, N3: Src,
316 N4: DAG.getZExtOrTrunc(Op: Size, DL: dl, VT: MVT::i32));
317 }
318
319 if (!AlwaysInline)
320 return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
321 Align: Alignment.value(), LC: RTLIB::MEMSET);
322
323 return SDValue();
324}
325