1//===-- ARMSelectionDAGInfo.cpp - ARM SelectionDAG Info -------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements the ARMSelectionDAGInfo class.
10//
11//===----------------------------------------------------------------------===//
12
13#include "ARMSelectionDAGInfo.h"
14#include "ARMTargetTransformInfo.h"
15#include "llvm/CodeGen/SelectionDAG.h"
16#include "llvm/Support/CommandLine.h"
17
18#define GET_SDNODE_DESC
19#include "ARMGenSDNodeInfo.inc"
20
21using namespace llvm;
22
23#define DEBUG_TYPE "arm-selectiondag-info"
24
/// Command-line control over converting memory-transfer intrinsics into MVE
/// tail-predicated (WLSTP) loops. Defaults to force-disabled; under "allow"
/// the decision is delegated to shouldGenerateInlineTPLoop()'s heuristics.
static cl::opt<TPLoop::MemTransfer> EnableMemtransferTPLoop(
    "arm-memtransfer-tploop", cl::Hidden,
    cl::desc("Control conversion of memcpy to "
             "Tail predicated loops (WLSTP)"),
    cl::init(Val: TPLoop::ForceDisabled),
    cl::values(clEnumValN(TPLoop::ForceDisabled, "force-disabled",
                          "Don't convert memcpy to TP loop."),
               clEnumValN(TPLoop::ForceEnabled, "force-enabled",
                          "Always convert memcpy to TP loop."),
               clEnumValN(TPLoop::Allow, "allow",
                          "Allow (may be subject to certain conditions) "
                          "conversion of memcpy to TP loop.")));
37
// Initialize the generated base class with the TableGen-emitted SDNode
// descriptor table (ARMGenSDNodeInfo.inc, included above with
// GET_SDNODE_DESC), which drives generic node naming and verification.
ARMSelectionDAGInfo::ARMSelectionDAGInfo()
    : SelectionDAGGenTargetInfo(ARMGenSDNodeInfo) {}
40
41const char *ARMSelectionDAGInfo::getTargetNodeName(unsigned Opcode) const {
42#define MAKE_CASE(V) \
43 case V: \
44 return #V;
45
46 // These nodes don't have corresponding entries in *.td files yet.
47 switch (static_cast<ARMISD::NodeType>(Opcode)) {
48 MAKE_CASE(ARMISD::DYN_ALLOC)
49 MAKE_CASE(ARMISD::MVESEXT)
50 MAKE_CASE(ARMISD::MVEZEXT)
51 MAKE_CASE(ARMISD::MVETRUNC)
52 MAKE_CASE(ARMISD::BUILD_VECTOR)
53 MAKE_CASE(ARMISD::VLD1DUP)
54 MAKE_CASE(ARMISD::VLD2DUP)
55 MAKE_CASE(ARMISD::VLD3DUP)
56 MAKE_CASE(ARMISD::VLD4DUP)
57 MAKE_CASE(ARMISD::VLD1_UPD)
58 MAKE_CASE(ARMISD::VLD2_UPD)
59 MAKE_CASE(ARMISD::VLD3_UPD)
60 MAKE_CASE(ARMISD::VLD4_UPD)
61 MAKE_CASE(ARMISD::VLD1x2_UPD)
62 MAKE_CASE(ARMISD::VLD1x3_UPD)
63 MAKE_CASE(ARMISD::VLD1x4_UPD)
64 MAKE_CASE(ARMISD::VLD2LN_UPD)
65 MAKE_CASE(ARMISD::VLD3LN_UPD)
66 MAKE_CASE(ARMISD::VLD4LN_UPD)
67 MAKE_CASE(ARMISD::VLD1DUP_UPD)
68 MAKE_CASE(ARMISD::VLD2DUP_UPD)
69 MAKE_CASE(ARMISD::VLD3DUP_UPD)
70 MAKE_CASE(ARMISD::VLD4DUP_UPD)
71 MAKE_CASE(ARMISD::VST1_UPD)
72 MAKE_CASE(ARMISD::VST3_UPD)
73 MAKE_CASE(ARMISD::VST1x2_UPD)
74 MAKE_CASE(ARMISD::VST1x3_UPD)
75 MAKE_CASE(ARMISD::VST1x4_UPD)
76 MAKE_CASE(ARMISD::VST2LN_UPD)
77 MAKE_CASE(ARMISD::VST3LN_UPD)
78 MAKE_CASE(ARMISD::VST4LN_UPD)
79 }
80#undef MAKE_CASE
81
82 return SelectionDAGGenTargetInfo::getTargetNodeName(Opcode);
83}
84
85bool ARMSelectionDAGInfo::isTargetMemoryOpcode(unsigned Opcode) const {
86 // These nodes don't have corresponding entries in *.td files yet.
87 if (Opcode >= ARMISD::FIRST_MEMORY_OPCODE &&
88 Opcode <= ARMISD::LAST_MEMORY_OPCODE)
89 return true;
90
91 return SelectionDAGGenTargetInfo::isTargetMemoryOpcode(Opcode);
92}
93
94void ARMSelectionDAGInfo::verifyTargetNode(const SelectionDAG &DAG,
95 const SDNode *N) const {
96 switch (N->getOpcode()) {
97 default:
98 break;
99 case ARMISD::WIN__DBZCHK:
100 // invalid number of results; expected 2, got 1
101 case ARMISD::WIN__CHKSTK:
102 // invalid number of results; expected 1, got 2
103 case ARMISD::COPY_STRUCT_BYVAL:
104 // invalid number of operands; expected 6, got 5
105 case ARMISD::MEMCPY:
106 // invalid number of operands; expected 5, got 4
107 case ARMISD::VMOVRRD:
108 // operand #0 must have type f64, but has type v1i64/v4f16/v8i8
109 case ARMISD::VMOVIMM:
110 // operand #0 must have type i32, but has type i16
111 return;
112 }
113
114 SelectionDAGGenTargetInfo::verifyTargetNode(DAG, N);
115}
116
117// Emit, if possible, a specialized version of the given Libcall. Typically this
118// means selecting the appropriately aligned version, but we also convert memset
119// of 0 into memclr.
120SDValue ARMSelectionDAGInfo::EmitSpecializedLibcall(
121 SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
122 SDValue Size, unsigned Align, RTLIB::Libcall LC) const {
123 const ARMSubtarget &Subtarget =
124 DAG.getMachineFunction().getSubtarget<ARMSubtarget>();
125 const ARMTargetLowering *TLI = Subtarget.getTargetLowering();
126
127 // Only use a specialized AEABI function if the default version of this
128 // Libcall is an AEABI function.
129 //
130 // Translate RTLIB::Libcall to AEABILibcall. We only do this in order to be
131 // able to translate memset to memclr and use the value to index the function
132 // name array.
133 enum {
134 AEABI_MEMCPY = 0,
135 AEABI_MEMMOVE,
136 AEABI_MEMSET,
137 AEABI_MEMCLR
138 } AEABILibcall;
139 switch (LC) {
140 case RTLIB::MEMCPY:
141 if (DAG.getLibcalls().getLibcallImpl(Call: LC) != RTLIB::impl___aeabi_memcpy)
142 return SDValue();
143
144 AEABILibcall = AEABI_MEMCPY;
145 break;
146 case RTLIB::MEMMOVE:
147 if (DAG.getLibcalls().getLibcallImpl(Call: LC) != RTLIB::impl___aeabi_memmove)
148 return SDValue();
149
150 AEABILibcall = AEABI_MEMMOVE;
151 break;
152 case RTLIB::MEMSET:
153 if (DAG.getLibcalls().getLibcallImpl(Call: LC) != RTLIB::impl___aeabi_memset)
154 return SDValue();
155
156 AEABILibcall = AEABI_MEMSET;
157 if (isNullConstant(V: Src))
158 AEABILibcall = AEABI_MEMCLR;
159 break;
160 default:
161 return SDValue();
162 }
163
164 // Choose the most-aligned libcall variant that we can
165 enum {
166 ALIGN1 = 0,
167 ALIGN4,
168 ALIGN8
169 } AlignVariant;
170 if ((Align & 7) == 0)
171 AlignVariant = ALIGN8;
172 else if ((Align & 3) == 0)
173 AlignVariant = ALIGN4;
174 else
175 AlignVariant = ALIGN1;
176
177 TargetLowering::ArgListTy Args;
178 Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(C&: *DAG.getContext());
179 Args.emplace_back(args&: Dst, args&: IntPtrTy);
180 if (AEABILibcall == AEABI_MEMCLR) {
181 Args.emplace_back(args&: Size, args&: IntPtrTy);
182 } else if (AEABILibcall == AEABI_MEMSET) {
183 // Adjust parameters for memset, EABI uses format (ptr, size, value),
184 // GNU library uses (ptr, value, size)
185 // See RTABI section 4.3.4
186 Args.emplace_back(args&: Size, args&: IntPtrTy);
187
188 // Extend or truncate the argument to be an i32 value for the call.
189 if (Src.getValueType().bitsGT(VT: MVT::i32))
190 Src = DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: MVT::i32, Operand: Src);
191 else if (Src.getValueType().bitsLT(VT: MVT::i32))
192 Src = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: dl, VT: MVT::i32, Operand: Src);
193
194 TargetLowering::ArgListEntry Entry(Src,
195 Type::getInt32Ty(C&: *DAG.getContext()));
196 Entry.IsSExt = false;
197 Args.push_back(x: Entry);
198 } else {
199 Args.emplace_back(args&: Src, args&: IntPtrTy);
200 Args.emplace_back(args&: Size, args&: IntPtrTy);
201 }
202
203 static const RTLIB::Libcall FunctionImpls[4][3] = {
204 {RTLIB::MEMCPY, RTLIB::AEABI_MEMCPY4, RTLIB::AEABI_MEMCPY8},
205 {RTLIB::MEMMOVE, RTLIB::AEABI_MEMMOVE4, RTLIB::AEABI_MEMMOVE8},
206 {RTLIB::MEMSET, RTLIB::AEABI_MEMSET4, RTLIB::AEABI_MEMSET8},
207 {RTLIB::AEABI_MEMCLR, RTLIB::AEABI_MEMCLR4, RTLIB::AEABI_MEMCLR8}};
208
209 RTLIB::Libcall NewLC = FunctionImpls[AEABILibcall][AlignVariant];
210 RTLIB::LibcallImpl LCImpl = DAG.getLibcalls().getLibcallImpl(Call: NewLC);
211 if (LCImpl == RTLIB::Unsupported)
212 return SDValue();
213
214 TargetLowering::CallLoweringInfo CLI(DAG);
215 CLI.setDebugLoc(dl)
216 .setChain(Chain)
217 .setLibCallee(
218 CC: DAG.getLibcalls().getLibcallImplCallingConv(Call: LCImpl),
219 ResultType: Type::getVoidTy(C&: *DAG.getContext()),
220 Target: DAG.getExternalSymbol(LCImpl, VT: TLI->getPointerTy(DL: DAG.getDataLayout())),
221 ArgsList: std::move(Args))
222 .setDiscardResult();
223 std::pair<SDValue,SDValue> CallResult = TLI->LowerCallTo(CLI);
224
225 return CallResult.second;
226}
227
228static bool shouldGenerateInlineTPLoop(const ARMSubtarget &Subtarget,
229 const SelectionDAG &DAG,
230 ConstantSDNode *ConstantSize,
231 Align Alignment, bool IsMemcpy) {
232 auto &F = DAG.getMachineFunction().getFunction();
233 if (!EnableMemtransferTPLoop)
234 return false;
235 if (EnableMemtransferTPLoop == TPLoop::ForceEnabled)
236 return true;
237 // Do not generate inline TP loop if optimizations is disabled,
238 // or if optimization for size (-Os or -Oz) is on.
239 if (F.hasOptNone() || F.hasOptSize())
240 return false;
241 // If cli option is unset, for memset always generate inline TP.
242 // For memcpy, check some conditions
243 if (!IsMemcpy)
244 return true;
245 if (!ConstantSize && Alignment >= Align(4))
246 return true;
247 if (ConstantSize &&
248 ConstantSize->getZExtValue() > Subtarget.getMaxInlineSizeThreshold() &&
249 ConstantSize->getZExtValue() <
250 Subtarget.getMaxMemcpyTPInlineSizeThreshold())
251 return true;
252 return false;
253}
254
// Inline-expand a memcpy when profitable. Strategy, in order:
//   1. An MVE tail-predicated loop (WLSTP) when the subtarget and the
//      heuristics in shouldGenerateInlineTPLoop() allow it.
//   2. A chain of ARMISD::MEMCPY pseudos (later lowered to LDM/STM) for
//      constant sizes with >= 4-byte alignment, plus scalar i16/i8
//      loads/stores for the 1-3 trailing bytes.
//   3. Otherwise, a specialized __aeabi_* libcall (or SDValue() to let the
//      generic expansion run when alignment is insufficient).
SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(
    SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
    SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
    MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
  const ARMSubtarget &Subtarget =
      DAG.getMachineFunction().getSubtarget<ARMSubtarget>();
  // Non-null only when the copy length is a compile-time constant.
  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Val&: Size);

  // Case 1: emit a tail-predicated loop pseudo; it consumes the full
  // (possibly dynamic) size as an i32.
  if (Subtarget.hasMVEIntegerOps() &&
      shouldGenerateInlineTPLoop(Subtarget, DAG, ConstantSize, Alignment, IsMemcpy: true))
    return DAG.getNode(Opcode: ARMISD::MEMCPYLOOP, DL: dl, VT: MVT::Other, N1: Chain, N2: Dst, N3: Src,
                       N4: DAG.getZExtOrTrunc(Op: Size, DL: dl, VT: MVT::i32));

  // Do repeated 4-byte loads and stores. To be improved.
  // This requires 4-byte alignment.
  if (Alignment < Align(4))
    return SDValue();
  // This requires the copy size to be a constant, preferably
  // within a subtarget-specific limit.
  if (!ConstantSize)
    return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
                                  Align: Alignment.value(), LC: RTLIB::MEMCPY);
  uint64_t SizeVal = ConstantSize->getZExtValue();
  if (!AlwaysInline && SizeVal > Subtarget.getMaxInlineSizeThreshold())
    return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
                                  Align: Alignment.value(), LC: RTLIB::MEMCPY);

  unsigned BytesLeft = SizeVal & 3;  // 0-3 trailing bytes after whole words.
  unsigned NumMemOps = SizeVal >> 2; // Number of whole 4-byte words to copy.
  unsigned EmittedNumMemOps = 0;
  EVT VT = MVT::i32;
  unsigned VTSize = 4;
  unsigned i = 0;
  // Emit a maximum of 4 loads in Thumb1 since we have fewer registers
  const unsigned MaxLoadsInLDM = Subtarget.isThumb1Only() ? 4 : 6;
  SDValue TFOps[6];
  SDValue Loads[6];
  // Byte offsets for the trailing-bytes loop; the word-sized part advances
  // the Src/Dst pointers through the MEMCPY pseudo's results instead.
  uint64_t SrcOff = 0, DstOff = 0;

  // FIXME: We should invent a VMEMCPY pseudo-instruction that lowers to
  // VLDM/VSTM and make this code emit it when appropriate. This would reduce
  // pressure on the general purpose registers. However this seems harder to map
  // onto the register allocator's view of the world.

  // The number of MEMCPY pseudo-instructions to emit. We use up to
  // MaxLoadsInLDM registers per mcopy, which will get lowered into ldm/stm
  // later on. This is a lower bound on the number of MEMCPY operations we must
  // emit.
  unsigned NumMEMCPYs = (NumMemOps + MaxLoadsInLDM - 1) / MaxLoadsInLDM;

  // Code size optimisation: do not inline memcpy if expansion results in
  // more instructions than the library call.
  if (NumMEMCPYs > 1 && Subtarget.hasMinSize()) {
    return SDValue();
  }

  // Each MEMCPY pseudo produces the advanced Dst and Src pointers plus the
  // chain (and glue), which are threaded into the next iteration.
  SDVTList VTs = DAG.getVTList(VT1: MVT::i32, VT2: MVT::i32, VT3: MVT::Other, VT4: MVT::Glue);

  for (unsigned I = 0; I != NumMEMCPYs; ++I) {
    // Evenly distribute registers among MEMCPY operations to reduce register
    // pressure.
    unsigned NextEmittedNumMemOps = NumMemOps * (I + 1) / NumMEMCPYs;
    unsigned NumRegs = NextEmittedNumMemOps - EmittedNumMemOps;

    Dst = DAG.getNode(Opcode: ARMISD::MEMCPY, DL: dl, VTList: VTs, N1: Chain, N2: Dst, N3: Src,
                      N4: DAG.getConstant(Val: NumRegs, DL: dl, VT: MVT::i32));
    Src = Dst.getValue(R: 1);
    Chain = Dst.getValue(R: 2);

    // Keep the pointer info in sync with the pointers the pseudo advanced.
    DstPtrInfo = DstPtrInfo.getWithOffset(O: NumRegs * VTSize);
    SrcPtrInfo = SrcPtrInfo.getWithOffset(O: NumRegs * VTSize);

    EmittedNumMemOps = NextEmittedNumMemOps;
  }

  if (BytesLeft == 0)
    return Chain;

  // Issue loads / stores for the trailing (1 - 3) bytes.
  auto getRemainingValueType = [](unsigned BytesLeft) {
    return (BytesLeft >= 2) ? MVT::i16 : MVT::i8;
  };
  auto getRemainingSize = [](unsigned BytesLeft) {
    return (BytesLeft >= 2) ? 2 : 1;
  };

  // First pass: issue all trailing loads, collecting their chains so the
  // stores below can depend on every load via a single TokenFactor.
  unsigned BytesLeftSave = BytesLeft;
  i = 0;
  while (BytesLeft) {
    VT = getRemainingValueType(BytesLeft);
    VTSize = getRemainingSize(BytesLeft);
    Loads[i] = DAG.getLoad(VT, dl, Chain,
                           Ptr: DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: MVT::i32, N1: Src,
                                            N2: DAG.getConstant(Val: SrcOff, DL: dl, VT: MVT::i32)),
                           PtrInfo: SrcPtrInfo.getWithOffset(O: SrcOff));
    TFOps[i] = Loads[i].getValue(R: 1);
    ++i;
    SrcOff += VTSize;
    BytesLeft -= VTSize;
  }
  Chain = DAG.getNode(Opcode: ISD::TokenFactor, DL: dl, VT: MVT::Other, Ops: ArrayRef(TFOps, i));

  // Second pass: issue the matching trailing stores.
  i = 0;
  BytesLeft = BytesLeftSave;
  while (BytesLeft) {
    VT = getRemainingValueType(BytesLeft);
    VTSize = getRemainingSize(BytesLeft);
    TFOps[i] = DAG.getStore(Chain, dl, Val: Loads[i],
                            Ptr: DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: MVT::i32, N1: Dst,
                                             N2: DAG.getConstant(Val: DstOff, DL: dl, VT: MVT::i32)),
                            PtrInfo: DstPtrInfo.getWithOffset(O: DstOff));
    ++i;
    DstOff += VTSize;
    BytesLeft -= VTSize;
  }
  return DAG.getNode(Opcode: ISD::TokenFactor, DL: dl, VT: MVT::Other, Ops: ArrayRef(TFOps, i));
}
372
373SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemmove(
374 SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
375 SDValue Size, Align Alignment, bool isVolatile,
376 MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
377 return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
378 Align: Alignment.value(), LC: RTLIB::MEMMOVE);
379}
380
381SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemset(
382 SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
383 SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
384 MachinePointerInfo DstPtrInfo) const {
385
386 const ARMSubtarget &Subtarget =
387 DAG.getMachineFunction().getSubtarget<ARMSubtarget>();
388
389 ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Val&: Size);
390
391 // Generate TP loop for llvm.memset
392 if (Subtarget.hasMVEIntegerOps() &&
393 shouldGenerateInlineTPLoop(Subtarget, DAG, ConstantSize, Alignment,
394 IsMemcpy: false)) {
395 Src = DAG.getSplatBuildVector(VT: MVT::v16i8, DL: dl,
396 Op: DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: MVT::i8, Operand: Src));
397 return DAG.getNode(Opcode: ARMISD::MEMSETLOOP, DL: dl, VT: MVT::Other, N1: Chain, N2: Dst, N3: Src,
398 N4: DAG.getZExtOrTrunc(Op: Size, DL: dl, VT: MVT::i32));
399 }
400
401 if (!AlwaysInline)
402 return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
403 Align: Alignment.value(), LC: RTLIB::MEMSET);
404
405 return SDValue();
406}
407