//===-- ARMSelectionDAGInfo.cpp - ARM SelectionDAG Info -------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the ARMSelectionDAGInfo class.
//
//===----------------------------------------------------------------------===//

#include "ARMTargetMachine.h"
#include "ARMTargetTransformInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/Support/CommandLine.h"
using namespace llvm;

#define DEBUG_TYPE "arm-selectiondag-info"

cl::opt<TPLoop::MemTransfer> EnableMemtransferTPLoop(
    "arm-memtransfer-tploop", cl::Hidden,
    cl::desc("Control conversion of memcpy to "
             "tail-predicated loops (WLSTP)"),
    cl::init(TPLoop::ForceDisabled),
    cl::values(clEnumValN(TPLoop::ForceDisabled, "force-disabled",
                          "Don't convert memcpy to TP loop."),
               clEnumValN(TPLoop::ForceEnabled, "force-enabled",
                          "Always convert memcpy to TP loop."),
               clEnumValN(TPLoop::Allow, "allow",
                          "Allow (may be subject to certain conditions) "
                          "conversion of memcpy to TP loop.")));

// Emit, if possible, a specialized version of the given Libcall. Typically this
// means selecting the appropriately aligned version, but we also convert memset
// of 0 into memclr.
SDValue ARMSelectionDAGInfo::EmitSpecializedLibcall(
    SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
    SDValue Size, unsigned Align, RTLIB::Libcall LC) const {
  const ARMSubtarget &Subtarget =
      DAG.getMachineFunction().getSubtarget<ARMSubtarget>();
  const ARMTargetLowering *TLI = Subtarget.getTargetLowering();

  // Only use a specialized AEABI function if the default version of this
  // Libcall is an AEABI function.
  if (std::strncmp(TLI->getLibcallName(LC), "__aeabi", 7) != 0)
    return SDValue();

  // Translate RTLIB::Libcall to AEABILibcall. We only do this in order to be
  // able to translate memset to memclr and use the value to index the function
  // name array.
  enum {
    AEABI_MEMCPY = 0,
    AEABI_MEMMOVE,
    AEABI_MEMSET,
    AEABI_MEMCLR
  } AEABILibcall;
  switch (LC) {
  case RTLIB::MEMCPY:
    AEABILibcall = AEABI_MEMCPY;
    break;
  case RTLIB::MEMMOVE:
    AEABILibcall = AEABI_MEMMOVE;
    break;
  case RTLIB::MEMSET:
    AEABILibcall = AEABI_MEMSET;
    if (isNullConstant(Src))
      AEABILibcall = AEABI_MEMCLR;
    break;
  default:
    return SDValue();
  }

  // Choose the most-aligned libcall variant that we can.
  enum {
    ALIGN1 = 0,
    ALIGN4,
    ALIGN8
  } AlignVariant;
  if ((Align & 7) == 0)
    AlignVariant = ALIGN8;
  else if ((Align & 3) == 0)
    AlignVariant = ALIGN4;
  else
    AlignVariant = ALIGN1;

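  // Build the argument list. All arguments are passed as intptr_t except the
  // memset value, which is widened to i32; the destination pointer always
  // comes first.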
  TargetLowering::ArgListTy Args;
  TargetLowering::ArgListEntry Entry;
  Entry.Ty = DAG.getDataLayout().getIntPtrType(*DAG.getContext());
  Entry.Node = Dst;
  Args.push_back(Entry);
  if (AEABILibcall == AEABI_MEMCLR) {
    Entry.Node = Size;
    Args.push_back(Entry);
  } else if (AEABILibcall == AEABI_MEMSET) {
    // Adjust parameters for memset: EABI uses (ptr, size, value), while the
    // GNU library uses (ptr, value, size). See RTABI section 4.3.4.
    Entry.Node = Size;
    Args.push_back(Entry);

    // Extend or truncate the argument to be an i32 value for the call.
    if (Src.getValueType().bitsGT(MVT::i32))
      Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src);
    else if (Src.getValueType().bitsLT(MVT::i32))
      Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Src);

    Entry.Node = Src;
    Entry.Ty = Type::getInt32Ty(*DAG.getContext());
    Entry.IsSExt = false;
    Args.push_back(Entry);
  } else {
    Entry.Node = Src;
    Args.push_back(Entry);

    Entry.Node = Size;
    Args.push_back(Entry);
  }

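  // AEABI entry points, indexed first by operation (AEABILibcall) and then by
  // alignment variant (AlignVariant).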
  char const *FunctionNames[4][3] = {
    { "__aeabi_memcpy", "__aeabi_memcpy4", "__aeabi_memcpy8" },
    { "__aeabi_memmove", "__aeabi_memmove4", "__aeabi_memmove8" },
    { "__aeabi_memset", "__aeabi_memset4", "__aeabi_memset8" },
    { "__aeabi_memclr", "__aeabi_memclr4", "__aeabi_memclr8" }
  };
  TargetLowering::CallLoweringInfo CLI(DAG);
  CLI.setDebugLoc(dl)
      .setChain(Chain)
      .setLibCallee(
          TLI->getLibcallCallingConv(LC), Type::getVoidTy(*DAG.getContext()),
          DAG.getExternalSymbol(FunctionNames[AEABILibcall][AlignVariant],
                                TLI->getPointerTy(DAG.getDataLayout())),
          std::move(Args))
      .setDiscardResult();
  std::pair<SDValue, SDValue> CallResult = TLI->LowerCallTo(CLI);

  return CallResult.second;
}

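// Decide whether a memcpy/memset should be expanded inline as an MVE
// tail-predicated (TP) loop, based on the arm-memtransfer-tploop option, the
// function's optimization attributes, and the known size and alignment of the
// transfer.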
static bool shouldGenerateInlineTPLoop(const ARMSubtarget &Subtarget,
                                       const SelectionDAG &DAG,
                                       ConstantSDNode *ConstantSize,
                                       Align Alignment, bool IsMemcpy) {
  auto &F = DAG.getMachineFunction().getFunction();
  if (!EnableMemtransferTPLoop)
    return false;
  if (EnableMemtransferTPLoop == TPLoop::ForceEnabled)
    return true;
  // Do not generate an inline TP loop if optimizations are disabled, or if
  // optimizing for size (-Os or -Oz).
  if (F.hasOptNone() || F.hasOptSize())
    return false;
  // If the option is set to "allow", always generate an inline TP loop for
  // memset. For memcpy, check some further conditions.
  if (!IsMemcpy)
    return true;
  if (!ConstantSize && Alignment >= Align(4))
    return true;
  if (ConstantSize &&
      ConstantSize->getZExtValue() > Subtarget.getMaxInlineSizeThreshold() &&
      ConstantSize->getZExtValue() <
          Subtarget.getMaxMemcpyTPInlineSizeThreshold())
    return true;
  return false;
}

SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(
    SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
    SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
    MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
  const ARMSubtarget &Subtarget =
      DAG.getMachineFunction().getSubtarget<ARMSubtarget>();
  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);

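  // Prefer an MVE tail-predicated loop when the subtarget has MVE integer ops
  // and shouldGenerateInlineTPLoop deems it profitable.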
  if (Subtarget.hasMVEIntegerOps() &&
      shouldGenerateInlineTPLoop(Subtarget, DAG, ConstantSize, Alignment,
                                 /*IsMemcpy=*/true))
    return DAG.getNode(ARMISD::MEMCPYLOOP, dl, MVT::Other, Chain, Dst, Src,
                       DAG.getZExtOrTrunc(Size, dl, MVT::i32));

  // Do repeated 4-byte loads and stores. To be improved.
  // This requires 4-byte alignment.
  if (Alignment < Align(4))
    return SDValue();
  // This requires the copy size to be a constant, preferably
  // within a subtarget-specific limit.
  if (!ConstantSize)
    return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
                                  Alignment.value(), RTLIB::MEMCPY);
  uint64_t SizeVal = ConstantSize->getZExtValue();
  if (!AlwaysInline && SizeVal > Subtarget.getMaxInlineSizeThreshold())
    return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
                                  Alignment.value(), RTLIB::MEMCPY);

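  // Split the constant size into 4-byte memory operations plus up to three
  // trailing bytes.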
  unsigned BytesLeft = SizeVal & 3;
  unsigned NumMemOps = SizeVal >> 2;
  unsigned EmittedNumMemOps = 0;
  EVT VT = MVT::i32;
  unsigned VTSize = 4;
  unsigned i = 0;
  // Emit a maximum of 4 loads in Thumb1 since we have fewer registers.
  const unsigned MaxLoadsInLDM = Subtarget.isThumb1Only() ? 4 : 6;
  SDValue TFOps[6];
  SDValue Loads[6];
  uint64_t SrcOff = 0, DstOff = 0;

  // FIXME: We should invent a VMEMCPY pseudo-instruction that lowers to
  // VLDM/VSTM and make this code emit it when appropriate. This would reduce
  // pressure on the general purpose registers. However this seems harder to map
  // onto the register allocator's view of the world.

  // The number of MEMCPY pseudo-instructions to emit. We use up to
  // MaxLoadsInLDM registers per mcopy, which will get lowered into ldm/stm
  // later on. This is a lower bound on the number of MEMCPY operations we must
  // emit.
  unsigned NumMEMCPYs = (NumMemOps + MaxLoadsInLDM - 1) / MaxLoadsInLDM;

  // Code size optimisation: do not inline memcpy if expansion results in
  // more instructions than the library call.
  if (NumMEMCPYs > 1 && Subtarget.hasMinSize()) {
    return SDValue();
  }

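  // Each ARMISD::MEMCPY node produces the advanced destination pointer, the
  // advanced source pointer, the chain, and glue.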
  SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other, MVT::Glue);

  for (unsigned I = 0; I != NumMEMCPYs; ++I) {
    // Evenly distribute registers among MEMCPY operations to reduce register
    // pressure.
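    // For example, 8 words with MaxLoadsInLDM == 6 are emitted as two 4-word
    // MEMCPYs rather than as 6 + 2.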
    unsigned NextEmittedNumMemOps = NumMemOps * (I + 1) / NumMEMCPYs;
    unsigned NumRegs = NextEmittedNumMemOps - EmittedNumMemOps;

    Dst = DAG.getNode(ARMISD::MEMCPY, dl, VTs, Chain, Dst, Src,
                      DAG.getConstant(NumRegs, dl, MVT::i32));
    Src = Dst.getValue(1);
    Chain = Dst.getValue(2);

    DstPtrInfo = DstPtrInfo.getWithOffset(NumRegs * VTSize);
    SrcPtrInfo = SrcPtrInfo.getWithOffset(NumRegs * VTSize);

    EmittedNumMemOps = NextEmittedNumMemOps;
  }

  if (BytesLeft == 0)
    return Chain;

  // Issue loads / stores for the trailing (1 - 3) bytes.
  auto getRemainingValueType = [](unsigned BytesLeft) {
    return (BytesLeft >= 2) ? MVT::i16 : MVT::i8;
  };
  auto getRemainingSize = [](unsigned BytesLeft) {
    return (BytesLeft >= 2) ? 2 : 1;
  };
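
  // First emit all the remaining loads, then all the stores, so that each
  // store can consume the corresponding loaded value.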

  unsigned BytesLeftSave = BytesLeft;
  i = 0;
  while (BytesLeft) {
    VT = getRemainingValueType(BytesLeft);
    VTSize = getRemainingSize(BytesLeft);
    Loads[i] = DAG.getLoad(VT, dl, Chain,
                           DAG.getNode(ISD::ADD, dl, MVT::i32, Src,
                                       DAG.getConstant(SrcOff, dl, MVT::i32)),
                           SrcPtrInfo.getWithOffset(SrcOff));
    TFOps[i] = Loads[i].getValue(1);
    ++i;
    SrcOff += VTSize;
    BytesLeft -= VTSize;
  }
  Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, ArrayRef(TFOps, i));

  i = 0;
  BytesLeft = BytesLeftSave;
  while (BytesLeft) {
    VT = getRemainingValueType(BytesLeft);
    VTSize = getRemainingSize(BytesLeft);
    TFOps[i] = DAG.getStore(Chain, dl, Loads[i],
                            DAG.getNode(ISD::ADD, dl, MVT::i32, Dst,
                                        DAG.getConstant(DstOff, dl, MVT::i32)),
                            DstPtrInfo.getWithOffset(DstOff));
    ++i;
    DstOff += VTSize;
    BytesLeft -= VTSize;
  }
  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, ArrayRef(TFOps, i));
}

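// Memmove is always lowered through the specialized AEABI libcall when one is
// available; if EmitSpecializedLibcall fails, the empty SDValue lets the
// generic lowering take over.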
SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemmove(
    SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
    SDValue Size, Align Alignment, bool isVolatile,
    MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
  return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
                                Alignment.value(), RTLIB::MEMMOVE);
}

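// Memset is expanded either as an MVE tail-predicated loop or as a call to the
// specialized AEABI libcall; AlwaysInline expansions are left to the generic
// lowering.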
SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemset(
    SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
    SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
    MachinePointerInfo DstPtrInfo) const {

  const ARMSubtarget &Subtarget =
      DAG.getMachineFunction().getSubtarget<ARMSubtarget>();

  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);

  // Generate TP loop for llvm.memset.
  if (Subtarget.hasMVEIntegerOps() &&
      shouldGenerateInlineTPLoop(Subtarget, DAG, ConstantSize, Alignment,
                                 /*IsMemcpy=*/false)) {
    Src = DAG.getSplatBuildVector(MVT::v16i8, dl,
                                  DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src));
    return DAG.getNode(ARMISD::MEMSETLOOP, dl, MVT::Other, Chain, Dst, Src,
                       DAG.getZExtOrTrunc(Size, dl, MVT::i32));
  }

  if (!AlwaysInline)
    return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
                                  Alignment.value(), RTLIB::MEMSET);

  return SDValue();
}