1 | //===-- ARMSelectionDAGInfo.cpp - ARM SelectionDAG Info -------------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This file implements the ARMSelectionDAGInfo class. |
10 | // |
11 | //===----------------------------------------------------------------------===// |
12 | |
#include "ARMTargetTransformInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/Support/CommandLine.h"
#include <cstring>
16 | using namespace llvm; |
17 | |
18 | #define DEBUG_TYPE "arm-selectiondag-info" |
19 | |
20 | static cl::opt<TPLoop::MemTransfer> EnableMemtransferTPLoop( |
21 | "arm-memtransfer-tploop" , cl::Hidden, |
22 | cl::desc("Control conversion of memcpy to " |
23 | "Tail predicated loops (WLSTP)" ), |
24 | cl::init(Val: TPLoop::ForceDisabled), |
25 | cl::values(clEnumValN(TPLoop::ForceDisabled, "force-disabled" , |
26 | "Don't convert memcpy to TP loop." ), |
27 | clEnumValN(TPLoop::ForceEnabled, "force-enabled" , |
28 | "Always convert memcpy to TP loop." ), |
29 | clEnumValN(TPLoop::Allow, "allow" , |
30 | "Allow (may be subject to certain conditions) " |
31 | "conversion of memcpy to TP loop." ))); |
32 | |
33 | bool ARMSelectionDAGInfo::isTargetMemoryOpcode(unsigned Opcode) const { |
34 | return Opcode >= ARMISD::FIRST_MEMORY_OPCODE && |
35 | Opcode <= ARMISD::LAST_MEMORY_OPCODE; |
36 | } |
37 | |
38 | // Emit, if possible, a specialized version of the given Libcall. Typically this |
39 | // means selecting the appropriately aligned version, but we also convert memset |
40 | // of 0 into memclr. |
41 | SDValue ARMSelectionDAGInfo::EmitSpecializedLibcall( |
42 | SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src, |
43 | SDValue Size, unsigned Align, RTLIB::Libcall LC) const { |
44 | const ARMSubtarget &Subtarget = |
45 | DAG.getMachineFunction().getSubtarget<ARMSubtarget>(); |
46 | const ARMTargetLowering *TLI = Subtarget.getTargetLowering(); |
47 | |
48 | // Only use a specialized AEABI function if the default version of this |
49 | // Libcall is an AEABI function. |
50 | if (std::strncmp(s1: TLI->getLibcallName(Call: LC), s2: "__aeabi" , n: 7) != 0) |
51 | return SDValue(); |
52 | |
53 | // Translate RTLIB::Libcall to AEABILibcall. We only do this in order to be |
54 | // able to translate memset to memclr and use the value to index the function |
55 | // name array. |
56 | enum { |
57 | AEABI_MEMCPY = 0, |
58 | AEABI_MEMMOVE, |
59 | AEABI_MEMSET, |
60 | AEABI_MEMCLR |
61 | } AEABILibcall; |
62 | switch (LC) { |
63 | case RTLIB::MEMCPY: |
64 | AEABILibcall = AEABI_MEMCPY; |
65 | break; |
66 | case RTLIB::MEMMOVE: |
67 | AEABILibcall = AEABI_MEMMOVE; |
68 | break; |
69 | case RTLIB::MEMSET: |
70 | AEABILibcall = AEABI_MEMSET; |
71 | if (isNullConstant(V: Src)) |
72 | AEABILibcall = AEABI_MEMCLR; |
73 | break; |
74 | default: |
75 | return SDValue(); |
76 | } |
77 | |
78 | // Choose the most-aligned libcall variant that we can |
79 | enum { |
80 | ALIGN1 = 0, |
81 | ALIGN4, |
82 | ALIGN8 |
83 | } AlignVariant; |
84 | if ((Align & 7) == 0) |
85 | AlignVariant = ALIGN8; |
86 | else if ((Align & 3) == 0) |
87 | AlignVariant = ALIGN4; |
88 | else |
89 | AlignVariant = ALIGN1; |
90 | |
91 | TargetLowering::ArgListTy Args; |
92 | TargetLowering::ArgListEntry Entry; |
93 | Entry.Ty = DAG.getDataLayout().getIntPtrType(C&: *DAG.getContext()); |
94 | Entry.Node = Dst; |
95 | Args.push_back(x: Entry); |
96 | if (AEABILibcall == AEABI_MEMCLR) { |
97 | Entry.Node = Size; |
98 | Args.push_back(x: Entry); |
99 | } else if (AEABILibcall == AEABI_MEMSET) { |
100 | // Adjust parameters for memset, EABI uses format (ptr, size, value), |
101 | // GNU library uses (ptr, value, size) |
102 | // See RTABI section 4.3.4 |
103 | Entry.Node = Size; |
104 | Args.push_back(x: Entry); |
105 | |
106 | // Extend or truncate the argument to be an i32 value for the call. |
107 | if (Src.getValueType().bitsGT(VT: MVT::i32)) |
108 | Src = DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: MVT::i32, Operand: Src); |
109 | else if (Src.getValueType().bitsLT(VT: MVT::i32)) |
110 | Src = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: dl, VT: MVT::i32, Operand: Src); |
111 | |
112 | Entry.Node = Src; |
113 | Entry.Ty = Type::getInt32Ty(C&: *DAG.getContext()); |
114 | Entry.IsSExt = false; |
115 | Args.push_back(x: Entry); |
116 | } else { |
117 | Entry.Node = Src; |
118 | Args.push_back(x: Entry); |
119 | |
120 | Entry.Node = Size; |
121 | Args.push_back(x: Entry); |
122 | } |
123 | |
124 | static const RTLIB::Libcall FunctionImpls[4][3] = { |
125 | {RTLIB::MEMCPY, RTLIB::AEABI_MEMCPY4, RTLIB::AEABI_MEMCPY8}, |
126 | {RTLIB::MEMMOVE, RTLIB::AEABI_MEMMOVE4, RTLIB::AEABI_MEMMOVE8}, |
127 | {RTLIB::MEMSET, RTLIB::AEABI_MEMSET4, RTLIB::AEABI_MEMSET8}, |
128 | {RTLIB::AEABI_MEMCLR, RTLIB::AEABI_MEMCLR4, RTLIB::AEABI_MEMCLR8}}; |
129 | |
130 | RTLIB::Libcall NewLC = FunctionImpls[AEABILibcall][AlignVariant]; |
131 | |
132 | TargetLowering::CallLoweringInfo CLI(DAG); |
133 | CLI.setDebugLoc(dl) |
134 | .setChain(Chain) |
135 | .setLibCallee( |
136 | CC: TLI->getLibcallCallingConv(Call: NewLC), ResultType: Type::getVoidTy(C&: *DAG.getContext()), |
137 | Target: DAG.getExternalSymbol(Sym: TLI->getLibcallName(Call: NewLC), |
138 | VT: TLI->getPointerTy(DL: DAG.getDataLayout())), |
139 | ArgsList: std::move(Args)) |
140 | .setDiscardResult(); |
141 | std::pair<SDValue,SDValue> CallResult = TLI->LowerCallTo(CLI); |
142 | |
143 | return CallResult.second; |
144 | } |
145 | |
146 | static bool shouldGenerateInlineTPLoop(const ARMSubtarget &Subtarget, |
147 | const SelectionDAG &DAG, |
148 | ConstantSDNode *ConstantSize, |
149 | Align Alignment, bool IsMemcpy) { |
150 | auto &F = DAG.getMachineFunction().getFunction(); |
151 | if (!EnableMemtransferTPLoop) |
152 | return false; |
153 | if (EnableMemtransferTPLoop == TPLoop::ForceEnabled) |
154 | return true; |
155 | // Do not generate inline TP loop if optimizations is disabled, |
156 | // or if optimization for size (-Os or -Oz) is on. |
157 | if (F.hasOptNone() || F.hasOptSize()) |
158 | return false; |
159 | // If cli option is unset, for memset always generate inline TP. |
160 | // For memcpy, check some conditions |
161 | if (!IsMemcpy) |
162 | return true; |
163 | if (!ConstantSize && Alignment >= Align(4)) |
164 | return true; |
165 | if (ConstantSize && |
166 | ConstantSize->getZExtValue() > Subtarget.getMaxInlineSizeThreshold() && |
167 | ConstantSize->getZExtValue() < |
168 | Subtarget.getMaxMemcpyTPInlineSizeThreshold()) |
169 | return true; |
170 | return false; |
171 | } |
172 | |
173 | SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemcpy( |
174 | SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src, |
175 | SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline, |
176 | MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const { |
177 | const ARMSubtarget &Subtarget = |
178 | DAG.getMachineFunction().getSubtarget<ARMSubtarget>(); |
179 | ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Val&: Size); |
180 | |
181 | if (Subtarget.hasMVEIntegerOps() && |
182 | shouldGenerateInlineTPLoop(Subtarget, DAG, ConstantSize, Alignment, IsMemcpy: true)) |
183 | return DAG.getNode(Opcode: ARMISD::MEMCPYLOOP, DL: dl, VT: MVT::Other, N1: Chain, N2: Dst, N3: Src, |
184 | N4: DAG.getZExtOrTrunc(Op: Size, DL: dl, VT: MVT::i32)); |
185 | |
186 | // Do repeated 4-byte loads and stores. To be improved. |
187 | // This requires 4-byte alignment. |
188 | if (Alignment < Align(4)) |
189 | return SDValue(); |
190 | // This requires the copy size to be a constant, preferably |
191 | // within a subtarget-specific limit. |
192 | if (!ConstantSize) |
193 | return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size, |
194 | Align: Alignment.value(), LC: RTLIB::MEMCPY); |
195 | uint64_t SizeVal = ConstantSize->getZExtValue(); |
196 | if (!AlwaysInline && SizeVal > Subtarget.getMaxInlineSizeThreshold()) |
197 | return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size, |
198 | Align: Alignment.value(), LC: RTLIB::MEMCPY); |
199 | |
200 | unsigned BytesLeft = SizeVal & 3; |
201 | unsigned NumMemOps = SizeVal >> 2; |
202 | unsigned EmittedNumMemOps = 0; |
203 | EVT VT = MVT::i32; |
204 | unsigned VTSize = 4; |
205 | unsigned i = 0; |
206 | // Emit a maximum of 4 loads in Thumb1 since we have fewer registers |
207 | const unsigned MaxLoadsInLDM = Subtarget.isThumb1Only() ? 4 : 6; |
208 | SDValue TFOps[6]; |
209 | SDValue Loads[6]; |
210 | uint64_t SrcOff = 0, DstOff = 0; |
211 | |
212 | // FIXME: We should invent a VMEMCPY pseudo-instruction that lowers to |
213 | // VLDM/VSTM and make this code emit it when appropriate. This would reduce |
214 | // pressure on the general purpose registers. However this seems harder to map |
215 | // onto the register allocator's view of the world. |
216 | |
217 | // The number of MEMCPY pseudo-instructions to emit. We use up to |
218 | // MaxLoadsInLDM registers per mcopy, which will get lowered into ldm/stm |
219 | // later on. This is a lower bound on the number of MEMCPY operations we must |
220 | // emit. |
221 | unsigned NumMEMCPYs = (NumMemOps + MaxLoadsInLDM - 1) / MaxLoadsInLDM; |
222 | |
223 | // Code size optimisation: do not inline memcpy if expansion results in |
224 | // more instructions than the libary call. |
225 | if (NumMEMCPYs > 1 && Subtarget.hasMinSize()) { |
226 | return SDValue(); |
227 | } |
228 | |
229 | SDVTList VTs = DAG.getVTList(VT1: MVT::i32, VT2: MVT::i32, VT3: MVT::Other, VT4: MVT::Glue); |
230 | |
231 | for (unsigned I = 0; I != NumMEMCPYs; ++I) { |
232 | // Evenly distribute registers among MEMCPY operations to reduce register |
233 | // pressure. |
234 | unsigned NextEmittedNumMemOps = NumMemOps * (I + 1) / NumMEMCPYs; |
235 | unsigned NumRegs = NextEmittedNumMemOps - EmittedNumMemOps; |
236 | |
237 | Dst = DAG.getNode(Opcode: ARMISD::MEMCPY, DL: dl, VTList: VTs, N1: Chain, N2: Dst, N3: Src, |
238 | N4: DAG.getConstant(Val: NumRegs, DL: dl, VT: MVT::i32)); |
239 | Src = Dst.getValue(R: 1); |
240 | Chain = Dst.getValue(R: 2); |
241 | |
242 | DstPtrInfo = DstPtrInfo.getWithOffset(O: NumRegs * VTSize); |
243 | SrcPtrInfo = SrcPtrInfo.getWithOffset(O: NumRegs * VTSize); |
244 | |
245 | EmittedNumMemOps = NextEmittedNumMemOps; |
246 | } |
247 | |
248 | if (BytesLeft == 0) |
249 | return Chain; |
250 | |
251 | // Issue loads / stores for the trailing (1 - 3) bytes. |
252 | auto getRemainingValueType = [](unsigned BytesLeft) { |
253 | return (BytesLeft >= 2) ? MVT::i16 : MVT::i8; |
254 | }; |
255 | auto getRemainingSize = [](unsigned BytesLeft) { |
256 | return (BytesLeft >= 2) ? 2 : 1; |
257 | }; |
258 | |
259 | unsigned BytesLeftSave = BytesLeft; |
260 | i = 0; |
261 | while (BytesLeft) { |
262 | VT = getRemainingValueType(BytesLeft); |
263 | VTSize = getRemainingSize(BytesLeft); |
264 | Loads[i] = DAG.getLoad(VT, dl, Chain, |
265 | Ptr: DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: MVT::i32, N1: Src, |
266 | N2: DAG.getConstant(Val: SrcOff, DL: dl, VT: MVT::i32)), |
267 | PtrInfo: SrcPtrInfo.getWithOffset(O: SrcOff)); |
268 | TFOps[i] = Loads[i].getValue(R: 1); |
269 | ++i; |
270 | SrcOff += VTSize; |
271 | BytesLeft -= VTSize; |
272 | } |
273 | Chain = DAG.getNode(Opcode: ISD::TokenFactor, DL: dl, VT: MVT::Other, Ops: ArrayRef(TFOps, i)); |
274 | |
275 | i = 0; |
276 | BytesLeft = BytesLeftSave; |
277 | while (BytesLeft) { |
278 | VT = getRemainingValueType(BytesLeft); |
279 | VTSize = getRemainingSize(BytesLeft); |
280 | TFOps[i] = DAG.getStore(Chain, dl, Val: Loads[i], |
281 | Ptr: DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: MVT::i32, N1: Dst, |
282 | N2: DAG.getConstant(Val: DstOff, DL: dl, VT: MVT::i32)), |
283 | PtrInfo: DstPtrInfo.getWithOffset(O: DstOff)); |
284 | ++i; |
285 | DstOff += VTSize; |
286 | BytesLeft -= VTSize; |
287 | } |
288 | return DAG.getNode(Opcode: ISD::TokenFactor, DL: dl, VT: MVT::Other, Ops: ArrayRef(TFOps, i)); |
289 | } |
290 | |
291 | SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemmove( |
292 | SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src, |
293 | SDValue Size, Align Alignment, bool isVolatile, |
294 | MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const { |
295 | return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size, |
296 | Align: Alignment.value(), LC: RTLIB::MEMMOVE); |
297 | } |
298 | |
299 | SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemset( |
300 | SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src, |
301 | SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline, |
302 | MachinePointerInfo DstPtrInfo) const { |
303 | |
304 | const ARMSubtarget &Subtarget = |
305 | DAG.getMachineFunction().getSubtarget<ARMSubtarget>(); |
306 | |
307 | ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Val&: Size); |
308 | |
309 | // Generate TP loop for llvm.memset |
310 | if (Subtarget.hasMVEIntegerOps() && |
311 | shouldGenerateInlineTPLoop(Subtarget, DAG, ConstantSize, Alignment, |
312 | IsMemcpy: false)) { |
313 | Src = DAG.getSplatBuildVector(VT: MVT::v16i8, DL: dl, |
314 | Op: DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: MVT::i8, Operand: Src)); |
315 | return DAG.getNode(Opcode: ARMISD::MEMSETLOOP, DL: dl, VT: MVT::Other, N1: Chain, N2: Dst, N3: Src, |
316 | N4: DAG.getZExtOrTrunc(Op: Size, DL: dl, VT: MVT::i32)); |
317 | } |
318 | |
319 | if (!AlwaysInline) |
320 | return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size, |
321 | Align: Alignment.value(), LC: RTLIB::MEMSET); |
322 | |
323 | return SDValue(); |
324 | } |
325 | |