//=== lib/CodeGen/GlobalISel/AArch64PreLegalizerCombiner.cpp --------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass does combining of machine instructions at the generic MI level,
// before the legalizer.
//
//===----------------------------------------------------------------------===//

#include "AArch64GlobalISelUtils.h"
#include "AArch64TargetMachine.h"
#include "llvm/CodeGen/GlobalISel/CSEInfo.h"
#include "llvm/CodeGen/GlobalISel/Combiner.h"
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
#include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h"
#include "llvm/CodeGen/GlobalISel/GISelValueTracking.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Instructions.h"

#define GET_GICOMBINER_DEPS
#include "AArch64GenPreLegalizeGICombiner.inc"
#undef GET_GICOMBINER_DEPS

#define DEBUG_TYPE "aarch64-prelegalizer-combiner"

using namespace llvm;
using namespace MIPatternMatch;

namespace {

#define GET_GICOMBINER_TYPES
#include "AArch64GenPreLegalizeGICombiner.inc"
#undef GET_GICOMBINER_TYPES

/// Try to match a G_ICMP of a G_TRUNC with zero, in which the truncated bits
/// are sign bits. In this case, we can transform the G_ICMP to directly compare
/// the wide value with a zero.
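///
/// A minimal sketch of the transform in generic MIR (illustrative only; the
/// register names are made up and an s32 wide value is assumed):
///
///   %t:_(s8) = G_TRUNC %wide:_(s32)
///   %zero:_(s8) = G_CONSTANT i8 0
///   %cmp:_(s1) = G_ICMP intpred(eq), %t(s8), %zero
/// ->
///   %zero32:_(s32) = G_CONSTANT i32 0
///   %cmp:_(s1) = G_ICMP intpred(eq), %wide(s32), %zero32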
bool matchICmpRedundantTrunc(MachineInstr &MI, MachineRegisterInfo &MRI,
                             GISelValueTracking *VT, Register &MatchInfo) {
  assert(MI.getOpcode() == TargetOpcode::G_ICMP && VT);

  auto Pred = (CmpInst::Predicate)MI.getOperand(1).getPredicate();
  if (!ICmpInst::isEquality(Pred))
    return false;

  Register LHS = MI.getOperand(2).getReg();
  LLT LHSTy = MRI.getType(LHS);
  if (!LHSTy.isScalar())
    return false;

  Register RHS = MI.getOperand(3).getReg();
  Register WideReg;

  if (!mi_match(LHS, MRI, m_GTrunc(m_Reg(WideReg))) ||
      !mi_match(RHS, MRI, m_SpecificICst(0)))
    return false;

  LLT WideTy = MRI.getType(WideReg);
  if (VT->computeNumSignBits(WideReg) <=
      WideTy.getSizeInBits() - LHSTy.getSizeInBits())
    return false;

  MatchInfo = WideReg;
  return true;
}

void applyICmpRedundantTrunc(MachineInstr &MI, MachineRegisterInfo &MRI,
                             MachineIRBuilder &Builder,
                             GISelChangeObserver &Observer, Register &WideReg) {
  assert(MI.getOpcode() == TargetOpcode::G_ICMP);

  LLT WideTy = MRI.getType(WideReg);
  // We're going to directly use the wide register as the LHS, and then use an
  // equivalent size zero for RHS.
  Builder.setInstrAndDebugLoc(MI);
  auto WideZero = Builder.buildConstant(WideTy, 0);
  Observer.changingInstr(MI);
  MI.getOperand(2).setReg(WideReg);
  MI.getOperand(3).setReg(WideZero.getReg(0));
  Observer.changedInstr(MI);
}

/// \returns true if it is possible to fold a constant into a G_GLOBAL_VALUE.
///
/// e.g.
///
/// %g = G_GLOBAL_VALUE @x -> %g = G_GLOBAL_VALUE @x + cst
bool matchFoldGlobalOffset(MachineInstr &MI, MachineRegisterInfo &MRI,
                           std::pair<uint64_t, uint64_t> &MatchInfo) {
  assert(MI.getOpcode() == TargetOpcode::G_GLOBAL_VALUE);
  MachineFunction &MF = *MI.getMF();
  auto &GlobalOp = MI.getOperand(1);
  auto *GV = GlobalOp.getGlobal();
  if (GV->isThreadLocal())
    return false;

  // Don't allow anything that could represent offsets etc.
  if (MF.getSubtarget<AArch64Subtarget>().ClassifyGlobalReference(
          GV, MF.getTarget()) != AArch64II::MO_NO_FLAG)
    return false;

  // Look for a G_GLOBAL_VALUE only used by G_PTR_ADDs against constants:
  //
  //  %g = G_GLOBAL_VALUE @x
  //  %ptr1 = G_PTR_ADD %g, cst1
  //  %ptr2 = G_PTR_ADD %g, cst2
  //  ...
  //  %ptrN = G_PTR_ADD %g, cstN
  //
  // Identify the *smallest* constant. We want to be able to form this:
  //
  //  %offset_g = G_GLOBAL_VALUE @x + min_cst
  //  %g = G_PTR_ADD %offset_g, -min_cst
  //  %ptr1 = G_PTR_ADD %g, cst1
  //  ...
  Register Dst = MI.getOperand(0).getReg();
  uint64_t MinOffset = -1ull;
  for (auto &UseInstr : MRI.use_nodbg_instructions(Dst)) {
    if (UseInstr.getOpcode() != TargetOpcode::G_PTR_ADD)
      return false;
    auto Cst = getIConstantVRegValWithLookThrough(
        UseInstr.getOperand(2).getReg(), MRI);
    if (!Cst)
      return false;
    MinOffset = std::min(MinOffset, Cst->Value.getZExtValue());
  }

  // Require that the new offset is larger than the existing one to avoid
  // infinite loops.
  uint64_t CurrOffset = GlobalOp.getOffset();
  uint64_t NewOffset = MinOffset + CurrOffset;
  if (NewOffset <= CurrOffset)
    return false;

  // Check whether folding this offset is legal. It must not go out of bounds of
  // the referenced object to avoid violating the code model, and must be
  // smaller than 2^20 because this is the largest offset expressible in all
  // object formats. (The IMAGE_REL_ARM64_PAGEBASE_REL21 relocation in COFF
  // stores an immediate signed 21 bit offset.)
  //
  // This check also prevents us from folding negative offsets, which will end
  // up being treated in the same way as large positive ones. They could also
  // cause code model violations, and aren't really common enough to matter.
  if (NewOffset >= (1 << 20))
    return false;

  Type *T = GV->getValueType();
  if (!T->isSized() ||
      NewOffset > GV->getDataLayout().getTypeAllocSize(T))
    return false;
  MatchInfo = std::make_pair(NewOffset, MinOffset);
  return true;
}

void applyFoldGlobalOffset(MachineInstr &MI, MachineRegisterInfo &MRI,
                           MachineIRBuilder &B, GISelChangeObserver &Observer,
                           std::pair<uint64_t, uint64_t> &MatchInfo) {
  // Change:
  //
  //  %g = G_GLOBAL_VALUE @x
  //  %ptr1 = G_PTR_ADD %g, cst1
  //  %ptr2 = G_PTR_ADD %g, cst2
  //  ...
  //  %ptrN = G_PTR_ADD %g, cstN
  //
  // To:
  //
  //  %offset_g = G_GLOBAL_VALUE @x + min_cst
  //  %g = G_PTR_ADD %offset_g, -min_cst
  //  %ptr1 = G_PTR_ADD %g, cst1
  //  ...
  //  %ptrN = G_PTR_ADD %g, cstN
  //
  // Then, the original G_PTR_ADDs should be folded later on so that they look
  // like this:
  //
  //  %ptrN = G_PTR_ADD %offset_g, cstN - min_cst
  uint64_t Offset, MinOffset;
  std::tie(Offset, MinOffset) = MatchInfo;
  B.setInstrAndDebugLoc(*std::next(MI.getIterator()));
  Observer.changingInstr(MI);
  auto &GlobalOp = MI.getOperand(1);
  auto *GV = GlobalOp.getGlobal();
  GlobalOp.ChangeToGA(GV, Offset, GlobalOp.getTargetFlags());
  Register Dst = MI.getOperand(0).getReg();
  Register NewGVDst = MRI.cloneVirtualRegister(Dst);
  MI.getOperand(0).setReg(NewGVDst);
  Observer.changedInstr(MI);
  B.buildPtrAdd(
      Dst, NewGVDst,
      B.buildConstant(LLT::scalar(64), -static_cast<int64_t>(MinOffset)));
}

// Combines vecreduce_add(mul(ext(x), ext(y))) -> vecreduce_add([us]dot(x, y))
// Or vecreduce_add(ext(mul(ext(x), ext(y)))) -> vecreduce_add([us]dot(x, y))
// Or vecreduce_add(ext(x)) -> vecreduce_add([us]dot(x, 1))
// Similar to performVecReduceAddCombine in SelectionDAG
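//
// A minimal sketch in generic MIR (illustrative only; register names are made
// up and an unsigned multiply of two v16s8 sources is assumed):
//
//   %xe:_(<16 x s32>) = G_ZEXT %x:_(<16 x s8>)
//   %ye:_(<16 x s32>) = G_ZEXT %y:_(<16 x s8>)
//   %m:_(<16 x s32>) = G_MUL %xe, %ye
//   %r:_(s32) = G_VECREDUCE_ADD %m
// becomes roughly:
//   %acc:_(<4 x s32>) = G_BUILD_VECTOR 0, 0, 0, 0
//   %dot:_(<4 x s32>) = G_UDOT %acc, %x, %y
//   %r:_(s32) = G_VECREDUCE_ADD %dot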
bool matchExtAddvToDotAddv(MachineInstr &MI, MachineRegisterInfo &MRI,
                           const AArch64Subtarget &STI,
                           std::tuple<Register, Register, bool> &MatchInfo) {
  assert(MI.getOpcode() == TargetOpcode::G_VECREDUCE_ADD &&
         "Expected a G_VECREDUCE_ADD instruction");
  assert(STI.hasDotProd() && "Target should have Dot Product feature");

  MachineInstr *I1 = getDefIgnoringCopies(MI.getOperand(1).getReg(), MRI);
  Register DstReg = MI.getOperand(0).getReg();
  Register MidReg = I1->getOperand(0).getReg();
  LLT DstTy = MRI.getType(DstReg);
  LLT MidTy = MRI.getType(MidReg);
  if (DstTy.getScalarSizeInBits() != 32 || MidTy.getScalarSizeInBits() != 32)
    return false;

  // Detect mul(ext, ext) with symmetric exts. If I1Opc is G_ZEXT or G_SEXT,
  // then both exts must use that same opcode. On success, I1Opc is set to the
  // matched ext opcode.
  auto tryMatchingMulOfExt = [&MRI](MachineInstr *MI, Register &Out1,
                                    Register &Out2, unsigned &I1Opc) {
    // If the result of the multiply has more than one use, there is no point
    // in creating a dot instruction.
    if (!MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()))
      return false;

    MachineInstr *ExtMI1 =
        getDefIgnoringCopies(MI->getOperand(1).getReg(), MRI);
    MachineInstr *ExtMI2 =
        getDefIgnoringCopies(MI->getOperand(2).getReg(), MRI);
    LLT Ext1DstTy = MRI.getType(ExtMI1->getOperand(0).getReg());
    LLT Ext2DstTy = MRI.getType(ExtMI2->getOperand(0).getReg());

    if (ExtMI1->getOpcode() != ExtMI2->getOpcode() || Ext1DstTy != Ext2DstTy)
      return false;
    if ((I1Opc == TargetOpcode::G_ZEXT || I1Opc == TargetOpcode::G_SEXT) &&
        I1Opc != ExtMI1->getOpcode())
      return false;
    Out1 = ExtMI1->getOperand(1).getReg();
    Out2 = ExtMI2->getOperand(1).getReg();
    I1Opc = ExtMI1->getOpcode();
    return true;
  };

  LLT SrcTy;
  unsigned I1Opc = I1->getOpcode();
  if (I1Opc == TargetOpcode::G_MUL) {
    Register Out1, Out2;
    if (!tryMatchingMulOfExt(I1, Out1, Out2, I1Opc))
      return false;
    SrcTy = MRI.getType(Out1);
    std::get<0>(MatchInfo) = Out1;
    std::get<1>(MatchInfo) = Out2;
  } else if (I1Opc == TargetOpcode::G_ZEXT || I1Opc == TargetOpcode::G_SEXT) {
    Register I1Op = I1->getOperand(1).getReg();
    MachineInstr *M = getDefIgnoringCopies(I1Op, MRI);
    Register Out1, Out2;
    if (M->getOpcode() == TargetOpcode::G_MUL &&
        tryMatchingMulOfExt(M, Out1, Out2, I1Opc)) {
      SrcTy = MRI.getType(Out1);
      std::get<0>(MatchInfo) = Out1;
      std::get<1>(MatchInfo) = Out2;
    } else {
      SrcTy = MRI.getType(I1Op);
      std::get<0>(MatchInfo) = I1Op;
      std::get<1>(MatchInfo) = 0;
    }
  } else {
    return false;
  }

  if (I1Opc == TargetOpcode::G_ZEXT)
    std::get<2>(MatchInfo) = 0;
  else if (I1Opc == TargetOpcode::G_SEXT)
    std::get<2>(MatchInfo) = 1;
  else
    return false;

  if (SrcTy.getScalarSizeInBits() != 8 || SrcTy.getNumElements() % 8 != 0)
    return false;

  return true;
}

void applyExtAddvToDotAddv(MachineInstr &MI, MachineRegisterInfo &MRI,
                           MachineIRBuilder &Builder,
                           GISelChangeObserver &Observer,
                           const AArch64Subtarget &STI,
                           std::tuple<Register, Register, bool> &MatchInfo) {
  assert(MI.getOpcode() == TargetOpcode::G_VECREDUCE_ADD &&
         "Expected a G_VECREDUCE_ADD instruction");
  assert(STI.hasDotProd() && "Target should have Dot Product feature");

  // Initialise the variables
  unsigned DotOpcode =
      std::get<2>(MatchInfo) ? AArch64::G_SDOT : AArch64::G_UDOT;
  Register Ext1SrcReg = std::get<0>(MatchInfo);

  // If there is only one source register, create a vector of 1s as the second
  // source register so that the dot product reduces to a plain sum.
  Register Ext2SrcReg;
  if (std::get<1>(MatchInfo) == 0)
    Ext2SrcReg = Builder.buildConstant(MRI.getType(Ext1SrcReg), 1)
                     ->getOperand(0)
                     .getReg();
  else
    Ext2SrcReg = std::get<1>(MatchInfo);

  // Find out how many DOT instructions are needed
  LLT SrcTy = MRI.getType(Ext1SrcReg);
  LLT MidTy;
  unsigned NumOfDotMI;
  if (SrcTy.getNumElements() % 16 == 0) {
    NumOfDotMI = SrcTy.getNumElements() / 16;
    MidTy = LLT::fixed_vector(4, 32);
  } else if (SrcTy.getNumElements() % 8 == 0) {
    NumOfDotMI = SrcTy.getNumElements() / 8;
    MidTy = LLT::fixed_vector(2, 32);
  } else {
    llvm_unreachable("Source type number of elements is not multiple of 8");
  }

  // Handle the case where one DOT instruction is needed
  if (NumOfDotMI == 1) {
    auto Zeroes = Builder.buildConstant(MidTy, 0)->getOperand(0).getReg();
    auto Dot = Builder.buildInstr(DotOpcode, {MidTy},
                                  {Zeroes, Ext1SrcReg, Ext2SrcReg});
    Builder.buildVecReduceAdd(MI.getOperand(0), Dot->getOperand(0));
  } else {
    // If not, pad the trailing v8i8 part with 0s to form a v16i8
    SmallVector<Register, 4> Ext1UnmergeReg;
    SmallVector<Register, 4> Ext2UnmergeReg;
    if (SrcTy.getNumElements() % 16 != 0) {
      SmallVector<Register> Leftover1;
      SmallVector<Register> Leftover2;

      // Split the elements into v16i8 and v8i8
      LLT MainTy = LLT::fixed_vector(16, 8);
      LLT LeftoverTy1, LeftoverTy2;
      if ((!extractParts(Ext1SrcReg, MRI.getType(Ext1SrcReg), MainTy,
                         LeftoverTy1, Ext1UnmergeReg, Leftover1, Builder,
                         MRI)) ||
          (!extractParts(Ext2SrcReg, MRI.getType(Ext2SrcReg), MainTy,
                         LeftoverTy2, Ext2UnmergeReg, Leftover2, Builder,
                         MRI))) {
        llvm_unreachable("Unable to split this vector properly");
      }

      // Pad the leftover v8i8 vector with a register of 0s of type v8i8
      Register v8Zeroes = Builder.buildConstant(LLT::fixed_vector(8, 8), 0)
                              ->getOperand(0)
                              .getReg();

      Ext1UnmergeReg.push_back(
          Builder
              .buildMergeLikeInstr(LLT::fixed_vector(16, 8),
                                   {Leftover1[0], v8Zeroes})
              .getReg(0));
      Ext2UnmergeReg.push_back(
          Builder
              .buildMergeLikeInstr(LLT::fixed_vector(16, 8),
                                   {Leftover2[0], v8Zeroes})
              .getReg(0));

    } else {
      // Unmerge the source vectors to v16i8
      unsigned SrcNumElts = SrcTy.getNumElements();
      extractParts(Ext1SrcReg, LLT::fixed_vector(16, 8), SrcNumElts / 16,
                   Ext1UnmergeReg, Builder, MRI);
      extractParts(Ext2SrcReg, LLT::fixed_vector(16, 8), SrcNumElts / 16,
                   Ext2UnmergeReg, Builder, MRI);
    }

    // Build the UDOT instructions
    SmallVector<Register, 2> DotReg;
    unsigned NumElements = 0;
    for (unsigned i = 0; i < Ext1UnmergeReg.size(); i++) {
      LLT ZeroesLLT;
      // Check if it is 16 or 8 elements. Set Zeroes to the according size
      if (MRI.getType(Ext1UnmergeReg[i]).getNumElements() == 16) {
        ZeroesLLT = LLT::fixed_vector(4, 32);
        NumElements += 4;
      } else {
        ZeroesLLT = LLT::fixed_vector(2, 32);
        NumElements += 2;
      }
      auto Zeroes = Builder.buildConstant(ZeroesLLT, 0)->getOperand(0).getReg();
      DotReg.push_back(
          Builder
              .buildInstr(DotOpcode, {MRI.getType(Zeroes)},
                          {Zeroes, Ext1UnmergeReg[i], Ext2UnmergeReg[i]})
              .getReg(0));
    }

    // Merge the output
    auto ConcatMI =
        Builder.buildConcatVectors(LLT::fixed_vector(NumElements, 32), DotReg);

    // Put it through a vector reduction
    Builder.buildVecReduceAdd(MI.getOperand(0).getReg(),
                              ConcatMI->getOperand(0).getReg());
  }

  // Erase the dead instructions
  MI.eraseFromParent();
}

// Matches {U/S}ADDV(ext(x)) => {U/S}ADDLV(x)
// Ensure that the type coming from the extend instruction is the right size
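//
// A minimal sketch in generic MIR (illustrative only; register names are made
// up and an unsigned v8s8 source is assumed):
//
//   %e:_(<8 x s32>) = G_ZEXT %x:_(<8 x s8>)
//   %r:_(s32) = G_VECREDUCE_ADD %e
// becomes roughly a G_UADDLV of %x whose lane 0 is then extracted (and
// zero/sign extended if the destination scalar is wider).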
bool matchExtUaddvToUaddlv(MachineInstr &MI, MachineRegisterInfo &MRI,
                           std::pair<Register, bool> &MatchInfo) {
  assert(MI.getOpcode() == TargetOpcode::G_VECREDUCE_ADD &&
         "Expected G_VECREDUCE_ADD Opcode");

  // Check if the last instruction is an extend
  MachineInstr *ExtMI = getDefIgnoringCopies(MI.getOperand(1).getReg(), MRI);
  auto ExtOpc = ExtMI->getOpcode();

  if (ExtOpc == TargetOpcode::G_ZEXT)
    std::get<1>(MatchInfo) = 0;
  else if (ExtOpc == TargetOpcode::G_SEXT)
    std::get<1>(MatchInfo) = 1;
  else
    return false;

  // Check if the source register is a valid type
  Register ExtSrcReg = ExtMI->getOperand(1).getReg();
  LLT ExtSrcTy = MRI.getType(ExtSrcReg);
  LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
  if (ExtSrcTy.getScalarSizeInBits() * 2 > DstTy.getScalarSizeInBits())
    return false;
  if ((DstTy.getScalarSizeInBits() == 16 &&
       ExtSrcTy.getNumElements() % 8 == 0 && ExtSrcTy.getNumElements() < 256) ||
      (DstTy.getScalarSizeInBits() == 32 &&
       ExtSrcTy.getNumElements() % 4 == 0) ||
      (DstTy.getScalarSizeInBits() == 64 &&
       ExtSrcTy.getNumElements() % 4 == 0)) {
    std::get<0>(MatchInfo) = ExtSrcReg;
    return true;
  }
  return false;
}

void applyExtUaddvToUaddlv(MachineInstr &MI, MachineRegisterInfo &MRI,
                           MachineIRBuilder &B, GISelChangeObserver &Observer,
                           std::pair<Register, bool> &MatchInfo) {
  assert(MI.getOpcode() == TargetOpcode::G_VECREDUCE_ADD &&
         "Expected G_VECREDUCE_ADD Opcode");

  unsigned Opc = std::get<1>(MatchInfo) ? AArch64::G_SADDLV : AArch64::G_UADDLV;
  Register SrcReg = std::get<0>(MatchInfo);
  Register DstReg = MI.getOperand(0).getReg();
  LLT SrcTy = MRI.getType(SrcReg);
  LLT DstTy = MRI.getType(DstReg);

  // If SrcTy has more elements than expected, split them into multiple
  // instructions and sum the results
  LLT MainTy;
  SmallVector<Register, 1> WorkingRegisters;
  unsigned SrcScalSize = SrcTy.getScalarSizeInBits();
  unsigned SrcNumElem = SrcTy.getNumElements();
  if ((SrcScalSize == 8 && SrcNumElem > 16) ||
      (SrcScalSize == 16 && SrcNumElem > 8) ||
      (SrcScalSize == 32 && SrcNumElem > 4)) {

    LLT LeftoverTy;
    SmallVector<Register, 4> LeftoverRegs;
    if (SrcScalSize == 8)
      MainTy = LLT::fixed_vector(16, 8);
    else if (SrcScalSize == 16)
      MainTy = LLT::fixed_vector(8, 16);
    else if (SrcScalSize == 32)
      MainTy = LLT::fixed_vector(4, 32);
    else
      llvm_unreachable("Source's Scalar Size not supported");

    // Extract the parts, put each extracted source through U/SADDLV, and
    // collect the values in a small vector
    extractParts(SrcReg, SrcTy, MainTy, LeftoverTy, WorkingRegisters,
                 LeftoverRegs, B, MRI);
    llvm::append_range(WorkingRegisters, LeftoverRegs);
  } else {
    WorkingRegisters.push_back(SrcReg);
    MainTy = SrcTy;
  }

  unsigned MidScalarSize = MainTy.getScalarSizeInBits() * 2;
  LLT MidScalarLLT = LLT::scalar(MidScalarSize);
  Register ZeroReg = B.buildConstant(LLT::scalar(64), 0).getReg(0);
  for (unsigned I = 0; I < WorkingRegisters.size(); I++) {
    // If the number of elements is too small to build an instruction, extend
    // its size before applying addlv
    LLT WorkingRegTy = MRI.getType(WorkingRegisters[I]);
    if ((WorkingRegTy.getScalarSizeInBits() == 8) &&
        (WorkingRegTy.getNumElements() == 4)) {
      WorkingRegisters[I] =
          B.buildInstr(std::get<1>(MatchInfo) ? TargetOpcode::G_SEXT
                                              : TargetOpcode::G_ZEXT,
                       {LLT::fixed_vector(4, 16)}, {WorkingRegisters[I]})
              .getReg(0);
    }

    // Generate the {U/S}ADDLV instruction, whose result is always twice the
    // scalar size of its source
    LLT AddlvTy = MidScalarSize <= 32 ? LLT::fixed_vector(4, 32)
                                      : LLT::fixed_vector(2, 64);
    Register AddlvReg =
        B.buildInstr(Opc, {AddlvTy}, {WorkingRegisters[I]}).getReg(0);

    // The output from {U/S}ADDLV gets placed in the lowest lane of a v4i32 or
    // v2i64 register.
    // i16 and i32 results use a v4i32 register
    // i64 results use a v2i64 register
    // Therefore we have to extract/truncate the value to the right type
    if (MidScalarSize == 32 || MidScalarSize == 64) {
      WorkingRegisters[I] = B.buildInstr(AArch64::G_EXTRACT_VECTOR_ELT,
                                         {MidScalarLLT}, {AddlvReg, ZeroReg})
                                .getReg(0);
    } else {
      Register ExtractReg = B.buildInstr(AArch64::G_EXTRACT_VECTOR_ELT,
                                         {LLT::scalar(32)}, {AddlvReg, ZeroReg})
                                .getReg(0);
      WorkingRegisters[I] =
          B.buildTrunc({MidScalarLLT}, {ExtractReg}).getReg(0);
    }
  }

  Register OutReg;
  if (WorkingRegisters.size() > 1) {
    OutReg = B.buildAdd(MidScalarLLT, WorkingRegisters[0], WorkingRegisters[1])
                 .getReg(0);
    for (unsigned I = 2; I < WorkingRegisters.size(); I++) {
      OutReg = B.buildAdd(MidScalarLLT, OutReg, WorkingRegisters[I]).getReg(0);
    }
  } else {
    OutReg = WorkingRegisters[0];
  }

  if (DstTy.getScalarSizeInBits() > MidScalarSize) {
    // Handle the scalar value if the DstTy's scalar size is more than double
    // the Src's scalar size
    B.buildInstr(std::get<1>(MatchInfo) ? TargetOpcode::G_SEXT
                                        : TargetOpcode::G_ZEXT,
                 {DstReg}, {OutReg});
  } else {
    B.buildCopy(DstReg, OutReg);
  }

  MI.eraseFromParent();
}

// Pushes ADD/SUB/MUL through extend instructions to decrease the number of
// extend instructions at the end by allowing selection of {s|u}addl sooner
// i32 add(i32 ext i8, i32 ext i8) => i32 ext(i16 add(i16 ext i8, i16 ext i8))
bool matchPushAddSubExt(MachineInstr &MI, MachineRegisterInfo &MRI,
                        Register DstReg, Register SrcReg1, Register SrcReg2) {
  assert((MI.getOpcode() == TargetOpcode::G_ADD ||
          MI.getOpcode() == TargetOpcode::G_SUB ||
          MI.getOpcode() == TargetOpcode::G_MUL) &&
         "Expected a G_ADD, G_SUB or G_MUL instruction\n");

  // Deal with vector types only
  LLT DstTy = MRI.getType(DstReg);
  if (!DstTy.isVector())
    return false;

  // Return true only if the G_{S|Z}EXT widens its source by more than 2x,
  // i.e. i8 -> i32, i8 -> i64 or i16 -> i64, and both sources have the same
  // type.
  Register ExtDstReg = MI.getOperand(1).getReg();
  LLT Ext1SrcTy = MRI.getType(SrcReg1);
  LLT Ext2SrcTy = MRI.getType(SrcReg2);
  unsigned ExtDstScal = MRI.getType(ExtDstReg).getScalarSizeInBits();
  unsigned Ext1SrcScal = Ext1SrcTy.getScalarSizeInBits();
  if (((Ext1SrcScal == 8 && ExtDstScal == 32) ||
       ((Ext1SrcScal == 8 || Ext1SrcScal == 16) && ExtDstScal == 64)) &&
      Ext1SrcTy == Ext2SrcTy)
    return true;

  return false;
}

void applyPushAddSubExt(MachineInstr &MI, MachineRegisterInfo &MRI,
                        MachineIRBuilder &B, bool isSExt, Register DstReg,
                        Register SrcReg1, Register SrcReg2) {
  LLT SrcTy = MRI.getType(SrcReg1);
  LLT MidTy = SrcTy.changeElementSize(SrcTy.getScalarSizeInBits() * 2);
  unsigned Opc = isSExt ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT;
  Register Ext1Reg = B.buildInstr(Opc, {MidTy}, {SrcReg1}).getReg(0);
  Register Ext2Reg = B.buildInstr(Opc, {MidTy}, {SrcReg2}).getReg(0);
  Register AddReg =
      B.buildInstr(MI.getOpcode(), {MidTy}, {Ext1Reg, Ext2Reg}).getReg(0);

  // G_SUB has to sign-extend the result.
  // G_ADD needs to sext from a sext and can sext or zext from a zext, and
  // G_MUL needs to use the original extend opcode, so the original opcode is
  // reused for both.
  if (MI.getOpcode() == TargetOpcode::G_ADD ||
      MI.getOpcode() == TargetOpcode::G_MUL)
    B.buildInstr(Opc, {DstReg}, {AddReg});
  else
    B.buildSExt(DstReg, AddReg);

  MI.eraseFromParent();
}

bool tryToSimplifyUADDO(MachineInstr &MI, MachineIRBuilder &B,
                        const CombinerHelper &Helper,
                        GISelChangeObserver &Observer) {
  // Try to simplify a G_UADDO with 8 or 16 bit operands to a wide G_ADD and a
  // TBNZ if the result is only used in the no-overflow case. It is restricted
  // to cases where we know that the high bits of the operands are 0. If
  // there's an overflow, then the 9th or 17th bit must be set, which can be
  // checked using TBNZ.
  //
  // Change (for UADDOs on 8 and 16 bits):
  //
  // %z0 = G_ASSERT_ZEXT _
  // %op0 = G_TRUNC %z0
  // %z1 = G_ASSERT_ZEXT _
  // %op1 = G_TRUNC %z1
  // %val, %cond = G_UADDO %op0, %op1
  // G_BRCOND %cond, %error.bb
  //
  // error.bb:
  //   (no successors and no uses of %val)
  //
  // To:
  //
  // %z0 = G_ASSERT_ZEXT _
  // %z1 = G_ASSERT_ZEXT _
  // %add = G_ADD %z0, %z1
  // %val = G_TRUNC %add
  // %bit = G_AND %add, 1 << scalar-size-in-bits(%op1)
  // %cond = G_ICMP NE, %bit, 0
  // G_BRCOND %cond, %error.bb

  auto &MRI = *B.getMRI();

  MachineOperand *DefOp0 = MRI.getOneDef(MI.getOperand(2).getReg());
  MachineOperand *DefOp1 = MRI.getOneDef(MI.getOperand(3).getReg());
  Register Op0Wide;
  Register Op1Wide;
  if (!mi_match(DefOp0->getParent(), MRI, m_GTrunc(m_Reg(Op0Wide))) ||
      !mi_match(DefOp1->getParent(), MRI, m_GTrunc(m_Reg(Op1Wide))))
    return false;
  LLT WideTy0 = MRI.getType(Op0Wide);
  LLT WideTy1 = MRI.getType(Op1Wide);
  Register ResVal = MI.getOperand(0).getReg();
  LLT OpTy = MRI.getType(ResVal);
  MachineInstr *Op0WideDef = MRI.getVRegDef(Op0Wide);
  MachineInstr *Op1WideDef = MRI.getVRegDef(Op1Wide);

  unsigned OpTySize = OpTy.getScalarSizeInBits();
  // First check that the G_TRUNCs feeding the G_UADDO are no-ops, because the
  // inputs have been zero-extended.
  if (Op0WideDef->getOpcode() != TargetOpcode::G_ASSERT_ZEXT ||
      Op1WideDef->getOpcode() != TargetOpcode::G_ASSERT_ZEXT ||
      OpTySize != Op0WideDef->getOperand(2).getImm() ||
      OpTySize != Op1WideDef->getOperand(2).getImm())
    return false;

  // Only scalar UADDOs with either 8 or 16 bit operands are handled.
  if (!WideTy0.isScalar() || !WideTy1.isScalar() || WideTy0 != WideTy1 ||
      OpTySize >= WideTy0.getScalarSizeInBits() ||
      (OpTySize != 8 && OpTySize != 16))
    return false;

  // The overflow-status result must be used by a branch only.
  Register ResStatus = MI.getOperand(1).getReg();
  if (!MRI.hasOneNonDBGUse(ResStatus))
    return false;
  MachineInstr *CondUser = &*MRI.use_instr_nodbg_begin(ResStatus);
  if (CondUser->getOpcode() != TargetOpcode::G_BRCOND)
    return false;

  // Make sure the computed result is only used in the no-overflow blocks.
  MachineBasicBlock *CurrentMBB = MI.getParent();
  MachineBasicBlock *FailMBB = CondUser->getOperand(1).getMBB();
  if (!FailMBB->succ_empty() || CondUser->getParent() != CurrentMBB)
    return false;
  if (any_of(MRI.use_nodbg_instructions(ResVal),
             [&MI, FailMBB, CurrentMBB](MachineInstr &I) {
               return &MI != &I &&
                      (I.getParent() == FailMBB || I.getParent() == CurrentMBB);
             }))
    return false;

  // Remove the G_UADDO.
  B.setInstrAndDebugLoc(*MI.getNextNode());
  MI.eraseFromParent();

  // Emit wide add.
  Register AddDst = MRI.cloneVirtualRegister(Op0Wide);
  B.buildInstr(TargetOpcode::G_ADD, {AddDst}, {Op0Wide, Op1Wide});

  // Emit check of the 9th or 17th bit and update users (the branch). This will
  // later be folded to TBNZ.
  Register CondBit = MRI.cloneVirtualRegister(Op0Wide);
  B.buildAnd(
      CondBit, AddDst,
      B.buildConstant(LLT::scalar(32), OpTySize == 8 ? 1 << 8 : 1 << 16));
  B.buildICmp(CmpInst::ICMP_NE, ResStatus, CondBit,
              B.buildConstant(LLT::scalar(32), 0));

  // Update ZExt users of the result value. Because all uses are in the
  // no-overflow case, we know that the top bits are 0 and we can ignore ZExts.
  B.buildZExtOrTrunc(ResVal, AddDst);
  for (MachineOperand &U : make_early_inc_range(MRI.use_operands(ResVal))) {
    Register WideReg;
    if (mi_match(U.getParent(), MRI, m_GZExt(m_Reg(WideReg)))) {
      auto OldR = U.getParent()->getOperand(0).getReg();
      Observer.erasingInstr(*U.getParent());
      U.getParent()->eraseFromParent();
      Helper.replaceRegWith(MRI, OldR, AddDst);
    }
  }

  return true;
}

class AArch64PreLegalizerCombinerImpl : public Combiner {
protected:
  const CombinerHelper Helper;
  const AArch64PreLegalizerCombinerImplRuleConfig &RuleConfig;
  const AArch64Subtarget &STI;
  const LibcallLoweringInfo &Libcalls;

public:
  AArch64PreLegalizerCombinerImpl(
      MachineFunction &MF, CombinerInfo &CInfo, const TargetPassConfig *TPC,
      GISelValueTracking &VT, GISelCSEInfo *CSEInfo,
      const AArch64PreLegalizerCombinerImplRuleConfig &RuleConfig,
      const AArch64Subtarget &STI, const LibcallLoweringInfo &Libcalls,
      MachineDominatorTree *MDT, const LegalizerInfo *LI);

  static const char *getName() { return "AArch64PreLegalizerCombiner"; }

  bool tryCombineAll(MachineInstr &I) const override;

  bool tryCombineAllImpl(MachineInstr &I) const;

private:
#define GET_GICOMBINER_CLASS_MEMBERS
#include "AArch64GenPreLegalizeGICombiner.inc"
#undef GET_GICOMBINER_CLASS_MEMBERS
};

#define GET_GICOMBINER_IMPL
#include "AArch64GenPreLegalizeGICombiner.inc"
#undef GET_GICOMBINER_IMPL

AArch64PreLegalizerCombinerImpl::AArch64PreLegalizerCombinerImpl(
    MachineFunction &MF, CombinerInfo &CInfo, const TargetPassConfig *TPC,
    GISelValueTracking &VT, GISelCSEInfo *CSEInfo,
    const AArch64PreLegalizerCombinerImplRuleConfig &RuleConfig,
    const AArch64Subtarget &STI, const LibcallLoweringInfo &Libcalls,
    MachineDominatorTree *MDT, const LegalizerInfo *LI)
    : Combiner(MF, CInfo, TPC, &VT, CSEInfo),
      Helper(Observer, B, /*IsPreLegalize*/ true, &VT, MDT, LI),
      RuleConfig(RuleConfig), STI(STI), Libcalls(Libcalls),
#define GET_GICOMBINER_CONSTRUCTOR_INITS
#include "AArch64GenPreLegalizeGICombiner.inc"
#undef GET_GICOMBINER_CONSTRUCTOR_INITS
{
}

bool AArch64PreLegalizerCombinerImpl::tryCombineAll(MachineInstr &MI) const {
  if (tryCombineAllImpl(MI))
    return true;

  unsigned Opc = MI.getOpcode();
  switch (Opc) {
  case TargetOpcode::G_SHUFFLE_VECTOR:
    return Helper.tryCombineShuffleVector(MI);
  case TargetOpcode::G_UADDO:
    return tryToSimplifyUADDO(MI, B, Helper, Observer);
  case TargetOpcode::G_MEMCPY_INLINE:
    return Helper.tryEmitMemcpyInline(MI);
  case TargetOpcode::G_MEMCPY:
  case TargetOpcode::G_MEMMOVE:
  case TargetOpcode::G_MEMSET: {
    // If we're at -O0 set a maxlen of 32 to inline, otherwise let the other
    // heuristics decide.
    unsigned MaxLen = CInfo.EnableOpt ? 0 : 32;
    // Try to inline memcpy type calls if optimizations are enabled.
    if (Helper.tryCombineMemCpyFamily(MI, MaxLen))
      return true;
    if (Opc == TargetOpcode::G_MEMSET)
      return llvm::AArch64GISelUtils::tryEmitBZero(MI, B, Libcalls,
                                                   CInfo.EnableMinSize);
    return false;
  }
  }

  return false;
}

// Pass boilerplate
// ================

class AArch64PreLegalizerCombiner : public MachineFunctionPass {
public:
  static char ID;

  AArch64PreLegalizerCombiner();

  StringRef getPassName() const override {
    return "AArch64PreLegalizerCombiner";
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override;

private:
  AArch64PreLegalizerCombinerImplRuleConfig RuleConfig;
};
} // end anonymous namespace

void AArch64PreLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<TargetPassConfig>();
  AU.setPreservesCFG();
  getSelectionDAGFallbackAnalysisUsage(AU);
  AU.addRequired<GISelValueTrackingAnalysisLegacy>();
  AU.addPreserved<GISelValueTrackingAnalysisLegacy>();
  AU.addRequired<MachineDominatorTreeWrapperPass>();
  AU.addPreserved<MachineDominatorTreeWrapperPass>();
  AU.addRequired<GISelCSEAnalysisWrapperPass>();
  AU.addPreserved<GISelCSEAnalysisWrapperPass>();
  AU.addRequired<LibcallLoweringInfoWrapper>();
  MachineFunctionPass::getAnalysisUsage(AU);
}

AArch64PreLegalizerCombiner::AArch64PreLegalizerCombiner()
    : MachineFunctionPass(ID) {
  if (!RuleConfig.parseCommandLineOption())
    report_fatal_error("Invalid rule identifier");
}

bool AArch64PreLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
  if (MF.getProperties().hasFailedISel())
    return false;
  auto &TPC = getAnalysis<TargetPassConfig>();

  // Enable CSE.
  GISelCSEAnalysisWrapper &Wrapper =
      getAnalysis<GISelCSEAnalysisWrapperPass>().getCSEWrapper();
  auto *CSEInfo = &Wrapper.get(TPC.getCSEConfig());

  const AArch64Subtarget &ST = MF.getSubtarget<AArch64Subtarget>();
  const auto *LI = ST.getLegalizerInfo();

  const Function &F = MF.getFunction();

  const LibcallLoweringInfo &Libcalls =
      getAnalysis<LibcallLoweringInfoWrapper>().getLibcallLowering(
          *F.getParent(), ST);

  bool EnableOpt =
      MF.getTarget().getOptLevel() != CodeGenOptLevel::None && !skipFunction(F);
  GISelValueTracking *VT =
      &getAnalysis<GISelValueTrackingAnalysisLegacy>().get(MF);
  MachineDominatorTree *MDT =
      &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
  CombinerInfo CInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false,
                     /*LegalizerInfo*/ nullptr, EnableOpt, F.hasOptSize(),
                     F.hasMinSize());
  // Disable fixed-point iteration to reduce compile-time
  CInfo.MaxIterations = 1;
  CInfo.ObserverLvl = CombinerInfo::ObserverLevel::SinglePass;
  // This is the first Combiner, so the input IR might contain dead
  // instructions.
  CInfo.EnableFullDCE = true;
  AArch64PreLegalizerCombinerImpl Impl(MF, CInfo, &TPC, *VT, CSEInfo,
                                       RuleConfig, ST, Libcalls, MDT, LI);
  return Impl.combineMachineInstrs();
}

char AArch64PreLegalizerCombiner::ID = 0;
INITIALIZE_PASS_BEGIN(AArch64PreLegalizerCombiner, DEBUG_TYPE,
                      "Combine AArch64 machine instrs before legalization",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(GISelValueTrackingAnalysisLegacy)
INITIALIZE_PASS_DEPENDENCY(GISelCSEAnalysisWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LibcallLoweringInfoWrapper)
INITIALIZE_PASS_END(AArch64PreLegalizerCombiner, DEBUG_TYPE,
                    "Combine AArch64 machine instrs before legalization", false,
                    false)

namespace llvm {
FunctionPass *createAArch64PreLegalizerCombiner() {
  return new AArch64PreLegalizerCombiner();
}
} // end namespace llvm