1 | //===-- ARMSelectionDAGInfo.cpp - ARM SelectionDAG Info -------------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This file implements the ARMSelectionDAGInfo class. |
10 | // |
11 | //===----------------------------------------------------------------------===// |
12 | |
13 | #include "ARMTargetMachine.h" |
14 | #include "ARMTargetTransformInfo.h" |
15 | #include "llvm/CodeGen/SelectionDAG.h" |
16 | #include "llvm/IR/DerivedTypes.h" |
17 | #include "llvm/Support/CommandLine.h" |
18 | using namespace llvm; |
19 | |
20 | #define DEBUG_TYPE "arm-selectiondag-info" |
21 | |
22 | cl::opt<TPLoop::MemTransfer> EnableMemtransferTPLoop( |
23 | "arm-memtransfer-tploop" , cl::Hidden, |
24 | cl::desc("Control conversion of memcpy to " |
25 | "Tail predicated loops (WLSTP)" ), |
26 | cl::init(Val: TPLoop::ForceDisabled), |
27 | cl::values(clEnumValN(TPLoop::ForceDisabled, "force-disabled" , |
28 | "Don't convert memcpy to TP loop." ), |
29 | clEnumValN(TPLoop::ForceEnabled, "force-enabled" , |
30 | "Always convert memcpy to TP loop." ), |
31 | clEnumValN(TPLoop::Allow, "allow" , |
32 | "Allow (may be subject to certain conditions) " |
33 | "conversion of memcpy to TP loop." ))); |
34 | |
35 | // Emit, if possible, a specialized version of the given Libcall. Typically this |
36 | // means selecting the appropriately aligned version, but we also convert memset |
37 | // of 0 into memclr. |
38 | SDValue ARMSelectionDAGInfo::EmitSpecializedLibcall( |
39 | SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src, |
40 | SDValue Size, unsigned Align, RTLIB::Libcall LC) const { |
41 | const ARMSubtarget &Subtarget = |
42 | DAG.getMachineFunction().getSubtarget<ARMSubtarget>(); |
43 | const ARMTargetLowering *TLI = Subtarget.getTargetLowering(); |
44 | |
45 | // Only use a specialized AEABI function if the default version of this |
46 | // Libcall is an AEABI function. |
47 | if (std::strncmp(s1: TLI->getLibcallName(Call: LC), s2: "__aeabi" , n: 7) != 0) |
48 | return SDValue(); |
49 | |
50 | // Translate RTLIB::Libcall to AEABILibcall. We only do this in order to be |
51 | // able to translate memset to memclr and use the value to index the function |
52 | // name array. |
53 | enum { |
54 | AEABI_MEMCPY = 0, |
55 | AEABI_MEMMOVE, |
56 | AEABI_MEMSET, |
57 | AEABI_MEMCLR |
58 | } AEABILibcall; |
59 | switch (LC) { |
60 | case RTLIB::MEMCPY: |
61 | AEABILibcall = AEABI_MEMCPY; |
62 | break; |
63 | case RTLIB::MEMMOVE: |
64 | AEABILibcall = AEABI_MEMMOVE; |
65 | break; |
66 | case RTLIB::MEMSET: |
67 | AEABILibcall = AEABI_MEMSET; |
68 | if (isNullConstant(V: Src)) |
69 | AEABILibcall = AEABI_MEMCLR; |
70 | break; |
71 | default: |
72 | return SDValue(); |
73 | } |
74 | |
75 | // Choose the most-aligned libcall variant that we can |
76 | enum { |
77 | ALIGN1 = 0, |
78 | ALIGN4, |
79 | ALIGN8 |
80 | } AlignVariant; |
81 | if ((Align & 7) == 0) |
82 | AlignVariant = ALIGN8; |
83 | else if ((Align & 3) == 0) |
84 | AlignVariant = ALIGN4; |
85 | else |
86 | AlignVariant = ALIGN1; |
87 | |
88 | TargetLowering::ArgListTy Args; |
89 | TargetLowering::ArgListEntry Entry; |
90 | Entry.Ty = DAG.getDataLayout().getIntPtrType(C&: *DAG.getContext()); |
91 | Entry.Node = Dst; |
92 | Args.push_back(x: Entry); |
93 | if (AEABILibcall == AEABI_MEMCLR) { |
94 | Entry.Node = Size; |
95 | Args.push_back(x: Entry); |
96 | } else if (AEABILibcall == AEABI_MEMSET) { |
97 | // Adjust parameters for memset, EABI uses format (ptr, size, value), |
98 | // GNU library uses (ptr, value, size) |
99 | // See RTABI section 4.3.4 |
100 | Entry.Node = Size; |
101 | Args.push_back(x: Entry); |
102 | |
103 | // Extend or truncate the argument to be an i32 value for the call. |
104 | if (Src.getValueType().bitsGT(VT: MVT::i32)) |
105 | Src = DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: MVT::i32, Operand: Src); |
106 | else if (Src.getValueType().bitsLT(VT: MVT::i32)) |
107 | Src = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: dl, VT: MVT::i32, Operand: Src); |
108 | |
109 | Entry.Node = Src; |
110 | Entry.Ty = Type::getInt32Ty(C&: *DAG.getContext()); |
111 | Entry.IsSExt = false; |
112 | Args.push_back(x: Entry); |
113 | } else { |
114 | Entry.Node = Src; |
115 | Args.push_back(x: Entry); |
116 | |
117 | Entry.Node = Size; |
118 | Args.push_back(x: Entry); |
119 | } |
120 | |
121 | char const *FunctionNames[4][3] = { |
122 | { "__aeabi_memcpy" , "__aeabi_memcpy4" , "__aeabi_memcpy8" }, |
123 | { "__aeabi_memmove" , "__aeabi_memmove4" , "__aeabi_memmove8" }, |
124 | { "__aeabi_memset" , "__aeabi_memset4" , "__aeabi_memset8" }, |
125 | { "__aeabi_memclr" , "__aeabi_memclr4" , "__aeabi_memclr8" } |
126 | }; |
127 | TargetLowering::CallLoweringInfo CLI(DAG); |
128 | CLI.setDebugLoc(dl) |
129 | .setChain(Chain) |
130 | .setLibCallee( |
131 | CC: TLI->getLibcallCallingConv(Call: LC), ResultType: Type::getVoidTy(C&: *DAG.getContext()), |
132 | Target: DAG.getExternalSymbol(Sym: FunctionNames[AEABILibcall][AlignVariant], |
133 | VT: TLI->getPointerTy(DL: DAG.getDataLayout())), |
134 | ArgsList: std::move(Args)) |
135 | .setDiscardResult(); |
136 | std::pair<SDValue,SDValue> CallResult = TLI->LowerCallTo(CLI); |
137 | |
138 | return CallResult.second; |
139 | } |
140 | |
141 | static bool shouldGenerateInlineTPLoop(const ARMSubtarget &Subtarget, |
142 | const SelectionDAG &DAG, |
143 | ConstantSDNode *ConstantSize, |
144 | Align Alignment, bool IsMemcpy) { |
145 | auto &F = DAG.getMachineFunction().getFunction(); |
146 | if (!EnableMemtransferTPLoop) |
147 | return false; |
148 | if (EnableMemtransferTPLoop == TPLoop::ForceEnabled) |
149 | return true; |
150 | // Do not generate inline TP loop if optimizations is disabled, |
151 | // or if optimization for size (-Os or -Oz) is on. |
152 | if (F.hasOptNone() || F.hasOptSize()) |
153 | return false; |
154 | // If cli option is unset, for memset always generate inline TP. |
155 | // For memcpy, check some conditions |
156 | if (!IsMemcpy) |
157 | return true; |
158 | if (!ConstantSize && Alignment >= Align(4)) |
159 | return true; |
160 | if (ConstantSize && |
161 | ConstantSize->getZExtValue() > Subtarget.getMaxInlineSizeThreshold() && |
162 | ConstantSize->getZExtValue() < |
163 | Subtarget.getMaxMemcpyTPInlineSizeThreshold()) |
164 | return true; |
165 | return false; |
166 | } |
167 | |
// Lower a memcpy either to an MVE tail-predicated loop, an inline sequence of
// MEMCPY pseudos (lowered to LDM/STM later), or — via EmitSpecializedLibcall —
// an AEABI library call. Returns an empty SDValue to ask generic lowering to
// take over.
SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(
    SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
    SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
    MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
  const ARMSubtarget &Subtarget =
      DAG.getMachineFunction().getSubtarget<ARMSubtarget>();
  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);

  // Prefer an MVE tail-predicated loop (WLSTP) when the subtarget has MVE
  // and the heuristics deem it profitable.
  if (Subtarget.hasMVEIntegerOps() &&
      shouldGenerateInlineTPLoop(Subtarget, DAG, ConstantSize, Alignment, true))
    return DAG.getNode(ARMISD::MEMCPYLOOP, dl, MVT::Other, Chain, Dst, Src,
                       DAG.getZExtOrTrunc(Size, dl, MVT::i32));

  // Do repeated 4-byte loads and stores. To be improved.
  // This requires 4-byte alignment.
  if (Alignment < Align(4))
    return SDValue();
  // This requires the copy size to be a constant, preferably
  // within a subtarget-specific limit.
  if (!ConstantSize)
    return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
                                  Alignment.value(), RTLIB::MEMCPY);
  uint64_t SizeVal = ConstantSize->getZExtValue();
  if (!AlwaysInline && SizeVal > Subtarget.getMaxInlineSizeThreshold())
    return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
                                  Alignment.value(), RTLIB::MEMCPY);

  unsigned BytesLeft = SizeVal & 3;  // Trailing 0-3 bytes, handled at the end.
  unsigned NumMemOps = SizeVal >> 2; // Number of whole 4-byte words to copy.
  unsigned EmittedNumMemOps = 0;
  EVT VT = MVT::i32;
  unsigned VTSize = 4;
  unsigned i = 0;
  // Emit a maximum of 4 loads in Thumb1 since we have fewer registers
  const unsigned MaxLoadsInLDM = Subtarget.isThumb1Only() ? 4 : 6;
  SDValue TFOps[6];
  SDValue Loads[6];
  uint64_t SrcOff = 0, DstOff = 0;

  // FIXME: We should invent a VMEMCPY pseudo-instruction that lowers to
  // VLDM/VSTM and make this code emit it when appropriate. This would reduce
  // pressure on the general purpose registers. However this seems harder to map
  // onto the register allocator's view of the world.

  // The number of MEMCPY pseudo-instructions to emit. We use up to
  // MaxLoadsInLDM registers per mcopy, which will get lowered into ldm/stm
  // later on. This is a lower bound on the number of MEMCPY operations we must
  // emit.
  unsigned NumMEMCPYs = (NumMemOps + MaxLoadsInLDM - 1) / MaxLoadsInLDM;

  // Code size optimisation: do not inline memcpy if expansion results in
  // more instructions than the library call.
  if (NumMEMCPYs > 1 && Subtarget.hasMinSize()) {
    return SDValue();
  }

  // Each MEMCPY pseudo produces (new Dst, new Src, chain, glue).
  SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other, MVT::Glue);

  for (unsigned I = 0; I != NumMEMCPYs; ++I) {
    // Evenly distribute registers among MEMCPY operations to reduce register
    // pressure.
    unsigned NextEmittedNumMemOps = NumMemOps * (I + 1) / NumMEMCPYs;
    unsigned NumRegs = NextEmittedNumMemOps - EmittedNumMemOps;

    // Each pseudo advances Dst/Src by NumRegs words and threads the chain
    // through, so the operations stay ordered.
    Dst = DAG.getNode(ARMISD::MEMCPY, dl, VTs, Chain, Dst, Src,
                      DAG.getConstant(NumRegs, dl, MVT::i32));
    Src = Dst.getValue(1);
    Chain = Dst.getValue(2);

    DstPtrInfo = DstPtrInfo.getWithOffset(NumRegs * VTSize);
    SrcPtrInfo = SrcPtrInfo.getWithOffset(NumRegs * VTSize);

    EmittedNumMemOps = NextEmittedNumMemOps;
  }

  if (BytesLeft == 0)
    return Chain;

  // Issue loads / stores for the trailing (1 - 3) bytes.
  auto getRemainingValueType = [](unsigned BytesLeft) {
    return (BytesLeft >= 2) ? MVT::i16 : MVT::i8;
  };
  auto getRemainingSize = [](unsigned BytesLeft) {
    return (BytesLeft >= 2) ? 2 : 1;
  };

  // First emit all the loads, joined by a TokenFactor, then all the stores;
  // this keeps the loads independent of the stores in the DAG.
  unsigned BytesLeftSave = BytesLeft;
  i = 0;
  while (BytesLeft) {
    VT = getRemainingValueType(BytesLeft);
    VTSize = getRemainingSize(BytesLeft);
    Loads[i] = DAG.getLoad(VT, dl, Chain,
                           DAG.getNode(ISD::ADD, dl, MVT::i32, Src,
                                       DAG.getConstant(SrcOff, dl, MVT::i32)),
                           SrcPtrInfo.getWithOffset(SrcOff));
    TFOps[i] = Loads[i].getValue(1);
    ++i;
    SrcOff += VTSize;
    BytesLeft -= VTSize;
  }
  Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, ArrayRef(TFOps, i));

  // Replay the same size sequence for the stores.
  i = 0;
  BytesLeft = BytesLeftSave;
  while (BytesLeft) {
    VT = getRemainingValueType(BytesLeft);
    VTSize = getRemainingSize(BytesLeft);
    TFOps[i] = DAG.getStore(Chain, dl, Loads[i],
                            DAG.getNode(ISD::ADD, dl, MVT::i32, Dst,
                                        DAG.getConstant(DstOff, dl, MVT::i32)),
                            DstPtrInfo.getWithOffset(DstOff));
    ++i;
    DstOff += VTSize;
    BytesLeft -= VTSize;
  }
  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, ArrayRef(TFOps, i));
}
285 | |
286 | SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemmove( |
287 | SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src, |
288 | SDValue Size, Align Alignment, bool isVolatile, |
289 | MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const { |
290 | return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size, |
291 | Align: Alignment.value(), LC: RTLIB::MEMMOVE); |
292 | } |
293 | |
294 | SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemset( |
295 | SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src, |
296 | SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline, |
297 | MachinePointerInfo DstPtrInfo) const { |
298 | |
299 | const ARMSubtarget &Subtarget = |
300 | DAG.getMachineFunction().getSubtarget<ARMSubtarget>(); |
301 | |
302 | ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Val&: Size); |
303 | |
304 | // Generate TP loop for llvm.memset |
305 | if (Subtarget.hasMVEIntegerOps() && |
306 | shouldGenerateInlineTPLoop(Subtarget, DAG, ConstantSize, Alignment, |
307 | IsMemcpy: false)) { |
308 | Src = DAG.getSplatBuildVector(VT: MVT::v16i8, DL: dl, |
309 | Op: DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: MVT::i8, Operand: Src)); |
310 | return DAG.getNode(Opcode: ARMISD::MEMSETLOOP, DL: dl, VT: MVT::Other, N1: Chain, N2: Dst, N3: Src, |
311 | N4: DAG.getZExtOrTrunc(Op: Size, DL: dl, VT: MVT::i32)); |
312 | } |
313 | |
314 | if (!AlwaysInline) |
315 | return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size, |
316 | Align: Alignment.value(), LC: RTLIB::MEMSET); |
317 | |
318 | return SDValue(); |
319 | } |
320 | |