//=== lib/CodeGen/GlobalISel/AArch64PreLegalizerCombiner.cpp --------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass does combining of machine instructions at the generic MI level,
// before the legalizer.
//
//===----------------------------------------------------------------------===//

#include "AArch64GlobalISelUtils.h"
#include "AArch64TargetMachine.h"
#include "llvm/CodeGen/GlobalISel/CSEInfo.h"
#include "llvm/CodeGen/GlobalISel/Combiner.h"
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
#include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Support/Debug.h"

#define GET_GICOMBINER_DEPS
#include "AArch64GenPreLegalizeGICombiner.inc"
#undef GET_GICOMBINER_DEPS

#define DEBUG_TYPE "aarch64-prelegalizer-combiner"

using namespace llvm;
using namespace MIPatternMatch;

namespace {

#define GET_GICOMBINER_TYPES
#include "AArch64GenPreLegalizeGICombiner.inc"
#undef GET_GICOMBINER_TYPES

/// Return true if a G_FCONSTANT instruction is known to be better-represented
/// as a G_CONSTANT.
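///
/// A minimal MIR sketch of the intended rewrite (register names are invented;
/// 1065353216 is the bit pattern of 1.0f). The constant is only ever stored,
/// so materializing it as an integer constant in a GPR is preferable:
///
///   %fv:_(s32) = G_FCONSTANT float 1.000000e+00
///   G_STORE %fv(s32), %ptr(p0)
/// becomes
///   %iv:_(s32) = G_CONSTANT i32 1065353216
///   G_STORE %iv(s32), %ptr(p0)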
bool matchFConstantToConstant(MachineInstr &MI, MachineRegisterInfo &MRI) {
  assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT);
  Register DstReg = MI.getOperand(0).getReg();
  const unsigned DstSize = MRI.getType(DstReg).getSizeInBits();
  if (DstSize != 32 && DstSize != 64)
    return false;

  // When we're storing a value, it doesn't matter what register bank it's on.
  // Since not all floating point constants can be materialized using a fmov,
  // it makes more sense to just use a GPR.
  return all_of(MRI.use_nodbg_instructions(DstReg),
                [](const MachineInstr &Use) { return Use.mayStore(); });
}

/// Change a G_FCONSTANT into a G_CONSTANT.
void applyFConstantToConstant(MachineInstr &MI) {
  assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT);
  MachineIRBuilder MIB(MI);
  const APFloat &ImmValAPF = MI.getOperand(1).getFPImm()->getValueAPF();
  MIB.buildConstant(MI.getOperand(0).getReg(), ImmValAPF.bitcastToAPInt());
  MI.eraseFromParent();
}

/// Try to match a G_ICMP of a G_TRUNC with zero, in which the truncated bits
/// are sign bits. In this case, we can transform the G_ICMP to directly compare
/// the wide value with a zero.
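///
/// Illustrative MIR sketch (names invented; here the truncated bits of %wide
/// are known to be sign bits because it comes from a G_SEXT):
///
///   %wide:_(s64) = G_SEXT %x(s32)
///   %lhs:_(s32) = G_TRUNC %wide(s64)
///   %zero:_(s32) = G_CONSTANT i32 0
///   %cmp:_(s1) = G_ICMP intpred(eq), %lhs(s32), %zero
/// becomes
///   %zero64:_(s64) = G_CONSTANT i64 0
///   %cmp:_(s1) = G_ICMP intpred(eq), %wide(s64), %zero64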
bool matchICmpRedundantTrunc(MachineInstr &MI, MachineRegisterInfo &MRI,
                             GISelKnownBits *KB, Register &MatchInfo) {
  assert(MI.getOpcode() == TargetOpcode::G_ICMP && KB);

  auto Pred = (CmpInst::Predicate)MI.getOperand(1).getPredicate();
  if (!ICmpInst::isEquality(Pred))
    return false;

  Register LHS = MI.getOperand(2).getReg();
  LLT LHSTy = MRI.getType(LHS);
  if (!LHSTy.isScalar())
    return false;

  Register RHS = MI.getOperand(3).getReg();
  Register WideReg;

  if (!mi_match(LHS, MRI, m_GTrunc(m_Reg(WideReg))) ||
      !mi_match(RHS, MRI, m_SpecificICst(0)))
    return false;

  LLT WideTy = MRI.getType(WideReg);
  if (KB->computeNumSignBits(WideReg) <=
      WideTy.getSizeInBits() - LHSTy.getSizeInBits())
    return false;

  MatchInfo = WideReg;
  return true;
}

void applyICmpRedundantTrunc(MachineInstr &MI, MachineRegisterInfo &MRI,
                             MachineIRBuilder &Builder,
                             GISelChangeObserver &Observer, Register &WideReg) {
  assert(MI.getOpcode() == TargetOpcode::G_ICMP);

  LLT WideTy = MRI.getType(WideReg);
  // We're going to directly use the wide register as the LHS, and then use an
  // equivalent size zero for RHS.
  Builder.setInstrAndDebugLoc(MI);
  auto WideZero = Builder.buildConstant(WideTy, 0);
  Observer.changingInstr(MI);
  MI.getOperand(2).setReg(WideReg);
  MI.getOperand(3).setReg(WideZero.getReg(0));
  Observer.changedInstr(MI);
}

/// \returns true if it is possible to fold a constant into a G_GLOBAL_VALUE.
///
/// e.g.
///
/// %g = G_GLOBAL_VALUE @x -> %g = G_GLOBAL_VALUE @x + cst
bool matchFoldGlobalOffset(MachineInstr &MI, MachineRegisterInfo &MRI,
                           std::pair<uint64_t, uint64_t> &MatchInfo) {
  assert(MI.getOpcode() == TargetOpcode::G_GLOBAL_VALUE);
  MachineFunction &MF = *MI.getMF();
  auto &GlobalOp = MI.getOperand(1);
  auto *GV = GlobalOp.getGlobal();
  if (GV->isThreadLocal())
    return false;

  // Don't allow anything that could represent offsets etc.
  if (MF.getSubtarget<AArch64Subtarget>().ClassifyGlobalReference(
          GV, MF.getTarget()) != AArch64II::MO_NO_FLAG)
    return false;

  // Look for a G_GLOBAL_VALUE only used by G_PTR_ADDs against constants:
  //
  //  %g = G_GLOBAL_VALUE @x
  //  %ptr1 = G_PTR_ADD %g, cst1
  //  %ptr2 = G_PTR_ADD %g, cst2
  //  ...
  //  %ptrN = G_PTR_ADD %g, cstN
  //
  // Identify the *smallest* constant. We want to be able to form this:
  //
  //  %offset_g = G_GLOBAL_VALUE @x + min_cst
  //  %g = G_PTR_ADD %offset_g, -min_cst
  //  %ptr1 = G_PTR_ADD %g, cst1
  //  ...
  Register Dst = MI.getOperand(0).getReg();
  uint64_t MinOffset = -1ull;
  for (auto &UseInstr : MRI.use_nodbg_instructions(Dst)) {
    if (UseInstr.getOpcode() != TargetOpcode::G_PTR_ADD)
      return false;
    auto Cst = getIConstantVRegValWithLookThrough(
        UseInstr.getOperand(2).getReg(), MRI);
    if (!Cst)
      return false;
    MinOffset = std::min(MinOffset, Cst->Value.getZExtValue());
  }

  // Require that the new offset is larger than the existing one to avoid
  // infinite loops.
  uint64_t CurrOffset = GlobalOp.getOffset();
  uint64_t NewOffset = MinOffset + CurrOffset;
  if (NewOffset <= CurrOffset)
    return false;

  // Check whether folding this offset is legal. It must not go out of bounds of
  // the referenced object to avoid violating the code model, and must be
  // smaller than 2^20 because this is the largest offset expressible in all
  // object formats. (The IMAGE_REL_ARM64_PAGEBASE_REL21 relocation in COFF
  // stores an immediate signed 21 bit offset.)
  //
  // This check also prevents us from folding negative offsets, which will end
  // up being treated in the same way as large positive ones. They could also
  // cause code model violations, and aren't really common enough to matter.
  if (NewOffset >= (1 << 20))
    return false;

  Type *T = GV->getValueType();
  if (!T->isSized() ||
      NewOffset > GV->getDataLayout().getTypeAllocSize(T))
    return false;
  MatchInfo = std::make_pair(NewOffset, MinOffset);
  return true;
}

void applyFoldGlobalOffset(MachineInstr &MI, MachineRegisterInfo &MRI,
                           MachineIRBuilder &B, GISelChangeObserver &Observer,
                           std::pair<uint64_t, uint64_t> &MatchInfo) {
  // Change:
  //
  //  %g = G_GLOBAL_VALUE @x
  //  %ptr1 = G_PTR_ADD %g, cst1
  //  %ptr2 = G_PTR_ADD %g, cst2
  //  ...
  //  %ptrN = G_PTR_ADD %g, cstN
  //
  // To:
  //
  //  %offset_g = G_GLOBAL_VALUE @x + min_cst
  //  %g = G_PTR_ADD %offset_g, -min_cst
  //  %ptr1 = G_PTR_ADD %g, cst1
  //  ...
  //  %ptrN = G_PTR_ADD %g, cstN
  //
  // Then, the original G_PTR_ADDs should be folded later on so that they look
  // like this:
  //
  //  %ptrN = G_PTR_ADD %offset_g, cstN - min_cst
  uint64_t Offset, MinOffset;
  std::tie(Offset, MinOffset) = MatchInfo;
  B.setInstrAndDebugLoc(*std::next(MI.getIterator()));
  Observer.changingInstr(MI);
  auto &GlobalOp = MI.getOperand(1);
  auto *GV = GlobalOp.getGlobal();
  GlobalOp.ChangeToGA(GV, Offset, GlobalOp.getTargetFlags());
  Register Dst = MI.getOperand(0).getReg();
  Register NewGVDst = MRI.cloneVirtualRegister(Dst);
  MI.getOperand(0).setReg(NewGVDst);
  Observer.changedInstr(MI);
  B.buildPtrAdd(
      Dst, NewGVDst,
      B.buildConstant(LLT::scalar(64), -static_cast<int64_t>(MinOffset)));
}

// Combines vecreduce_add(mul(ext(x), ext(y))) -> vecreduce_add(udot(x, y))
// Or vecreduce_add(ext(x)) -> vecreduce_add(udot(x, 1))
// Similar to performVecReduceAddCombine in SelectionDAG
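//
// Illustrative sketch of the mul case for <16 x s8> inputs (register names
// are invented and the zero accumulator is shown schematically):
//
//   %xe:_(<16 x s32>) = G_ZEXT %x(<16 x s8>)
//   %ye:_(<16 x s32>) = G_ZEXT %y(<16 x s8>)
//   %m:_(<16 x s32>) = G_MUL %xe, %ye
//   %sum:_(s32) = G_VECREDUCE_ADD %m(<16 x s32>)
// becomes
//   %zeroes:_(<4 x s32>) = (vector of zeroes)
//   %dot:_(<4 x s32>) = G_UDOT %zeroes, %x, %y
//   %sum:_(s32) = G_VECREDUCE_ADD %dot(<4 x s32>)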
bool matchExtAddvToUdotAddv(MachineInstr &MI, MachineRegisterInfo &MRI,
                            const AArch64Subtarget &STI,
                            std::tuple<Register, Register, bool> &MatchInfo) {
  assert(MI.getOpcode() == TargetOpcode::G_VECREDUCE_ADD &&
         "Expected a G_VECREDUCE_ADD instruction");
  assert(STI.hasDotProd() && "Target should have Dot Product feature");

  MachineInstr *I1 = getDefIgnoringCopies(MI.getOperand(1).getReg(), MRI);
  Register DstReg = MI.getOperand(0).getReg();
  Register MidReg = I1->getOperand(0).getReg();
  LLT DstTy = MRI.getType(DstReg);
  LLT MidTy = MRI.getType(MidReg);
  if (DstTy.getScalarSizeInBits() != 32 || MidTy.getScalarSizeInBits() != 32)
    return false;

  LLT SrcTy;
  auto I1Opc = I1->getOpcode();
  if (I1Opc == TargetOpcode::G_MUL) {
    // If the result of the G_MUL has more than one use, then there is no
    // point in creating a udot instruction.
    if (!MRI.hasOneNonDBGUse(MidReg))
      return false;

    MachineInstr *ExtMI1 =
        getDefIgnoringCopies(I1->getOperand(1).getReg(), MRI);
    MachineInstr *ExtMI2 =
        getDefIgnoringCopies(I1->getOperand(2).getReg(), MRI);
    LLT Ext1DstTy = MRI.getType(ExtMI1->getOperand(0).getReg());
    LLT Ext2DstTy = MRI.getType(ExtMI2->getOperand(0).getReg());

    if (ExtMI1->getOpcode() != ExtMI2->getOpcode() || Ext1DstTy != Ext2DstTy)
      return false;
    I1Opc = ExtMI1->getOpcode();
    SrcTy = MRI.getType(ExtMI1->getOperand(1).getReg());
    std::get<0>(MatchInfo) = ExtMI1->getOperand(1).getReg();
    std::get<1>(MatchInfo) = ExtMI2->getOperand(1).getReg();
  } else {
    SrcTy = MRI.getType(I1->getOperand(1).getReg());
    std::get<0>(MatchInfo) = I1->getOperand(1).getReg();
    std::get<1>(MatchInfo) = 0;
  }

  if (I1Opc == TargetOpcode::G_ZEXT)
    std::get<2>(MatchInfo) = 0;
  else if (I1Opc == TargetOpcode::G_SEXT)
    std::get<2>(MatchInfo) = 1;
  else
    return false;

  if (SrcTy.getScalarSizeInBits() != 8 || SrcTy.getNumElements() % 8 != 0)
    return false;

  return true;
}

void applyExtAddvToUdotAddv(MachineInstr &MI, MachineRegisterInfo &MRI,
                            MachineIRBuilder &Builder,
                            GISelChangeObserver &Observer,
                            const AArch64Subtarget &STI,
                            std::tuple<Register, Register, bool> &MatchInfo) {
  assert(MI.getOpcode() == TargetOpcode::G_VECREDUCE_ADD &&
         "Expected a G_VECREDUCE_ADD instruction");
  assert(STI.hasDotProd() && "Target should have Dot Product feature");

  // Initialise the variables
  unsigned DotOpcode =
      std::get<2>(MatchInfo) ? AArch64::G_SDOT : AArch64::G_UDOT;
  Register Ext1SrcReg = std::get<0>(MatchInfo);

  // If there is only one source register, create a vector of 1s as the second
  // source register, so that the dot product reduces to a plain sum.
  Register Ext2SrcReg;
  if (std::get<1>(MatchInfo) == 0)
    Ext2SrcReg = Builder.buildConstant(MRI.getType(Ext1SrcReg), 1)
                     ->getOperand(0)
                     .getReg();
  else
    Ext2SrcReg = std::get<1>(MatchInfo);

  // Find out how many DOT instructions are needed
  LLT SrcTy = MRI.getType(Ext1SrcReg);
  LLT MidTy;
  unsigned NumOfDotMI;
  if (SrcTy.getNumElements() % 16 == 0) {
    NumOfDotMI = SrcTy.getNumElements() / 16;
    MidTy = LLT::fixed_vector(4, 32);
  } else if (SrcTy.getNumElements() % 8 == 0) {
    NumOfDotMI = SrcTy.getNumElements() / 8;
    MidTy = LLT::fixed_vector(2, 32);
  } else {
    llvm_unreachable("Source type number of elements is not multiple of 8");
  }

  // Handle case where one DOT instruction is needed
  if (NumOfDotMI == 1) {
    auto Zeroes = Builder.buildConstant(MidTy, 0)->getOperand(0).getReg();
    auto Dot = Builder.buildInstr(DotOpcode, {MidTy},
                                  {Zeroes, Ext1SrcReg, Ext2SrcReg});
    Builder.buildVecReduceAdd(MI.getOperand(0), Dot->getOperand(0));
  } else {
    // If not, pad the last v8 element with 0s to form a v16
    SmallVector<Register, 4> Ext1UnmergeReg;
    SmallVector<Register, 4> Ext2UnmergeReg;
    if (SrcTy.getNumElements() % 16 != 0) {
      SmallVector<Register> Leftover1;
      SmallVector<Register> Leftover2;

      // Split the elements into v16i8 and v8i8
      LLT MainTy = LLT::fixed_vector(16, 8);
      LLT LeftoverTy1, LeftoverTy2;
      if ((!extractParts(Ext1SrcReg, MRI.getType(Ext1SrcReg), MainTy,
                         LeftoverTy1, Ext1UnmergeReg, Leftover1, Builder,
                         MRI)) ||
          (!extractParts(Ext2SrcReg, MRI.getType(Ext2SrcReg), MainTy,
                         LeftoverTy2, Ext2UnmergeReg, Leftover2, Builder,
                         MRI))) {
        llvm_unreachable("Unable to split this vector properly");
      }

      // Pad the leftover v8i8 vector with a register of 0s of type v8i8
      Register v8Zeroes = Builder.buildConstant(LLT::fixed_vector(8, 8), 0)
                              ->getOperand(0)
                              .getReg();

      Ext1UnmergeReg.push_back(
          Builder
              .buildMergeLikeInstr(LLT::fixed_vector(16, 8),
                                   {Leftover1[0], v8Zeroes})
              .getReg(0));
      Ext2UnmergeReg.push_back(
          Builder
              .buildMergeLikeInstr(LLT::fixed_vector(16, 8),
                                   {Leftover2[0], v8Zeroes})
              .getReg(0));

    } else {
      // Unmerge the source vectors to v16i8
      unsigned SrcNumElts = SrcTy.getNumElements();
      extractParts(Ext1SrcReg, LLT::fixed_vector(16, 8), SrcNumElts / 16,
                   Ext1UnmergeReg, Builder, MRI);
      extractParts(Ext2SrcReg, LLT::fixed_vector(16, 8), SrcNumElts / 16,
                   Ext2UnmergeReg, Builder, MRI);
    }

    // Build the UDOT instructions
    SmallVector<Register, 2> DotReg;
    unsigned NumElements = 0;
    for (unsigned i = 0; i < Ext1UnmergeReg.size(); i++) {
      LLT ZeroesLLT;
      // Check if it is 16 or 8 elements. Set Zeroes to the corresponding size.
      if (MRI.getType(Ext1UnmergeReg[i]).getNumElements() == 16) {
        ZeroesLLT = LLT::fixed_vector(4, 32);
        NumElements += 4;
      } else {
        ZeroesLLT = LLT::fixed_vector(2, 32);
        NumElements += 2;
      }
      auto Zeroes = Builder.buildConstant(ZeroesLLT, 0)->getOperand(0).getReg();
      DotReg.push_back(
          Builder
              .buildInstr(DotOpcode, {MRI.getType(Zeroes)},
                          {Zeroes, Ext1UnmergeReg[i], Ext2UnmergeReg[i]})
              .getReg(0));
    }

    // Merge the output
    auto ConcatMI =
        Builder.buildConcatVectors(LLT::fixed_vector(NumElements, 32), DotReg);

    // Put it through a vector reduction
    Builder.buildVecReduceAdd(MI.getOperand(0).getReg(),
                              ConcatMI->getOperand(0).getReg());
  }

  // Erase the dead instructions
  MI.eraseFromParent();
}

// Matches {U/S}ADDV(ext(x)) => {U/S}ADDLV(x)
// Ensure that the type coming from the extend instruction is the right size
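//
// Illustrative sketch (invented names; a <8 x s8> zext feeding a 16-bit
// reduction is one of the accepted shapes):
//
//   %ext:_(<8 x s16>) = G_ZEXT %x(<8 x s8>)
//   %sum:_(s16) = G_VECREDUCE_ADD %ext(<8 x s16>)
// becomes (conceptually)
//   %addlv:_(<4 x s32>) = G_UADDLV %x(<8 x s8>)
//   %elt:_(s32) = G_EXTRACT_VECTOR_ELT %addlv, lane 0
//   %sum:_(s16) = G_TRUNC %elt(s32)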
bool matchExtUaddvToUaddlv(MachineInstr &MI, MachineRegisterInfo &MRI,
                           std::pair<Register, bool> &MatchInfo) {
  assert(MI.getOpcode() == TargetOpcode::G_VECREDUCE_ADD &&
         "Expected G_VECREDUCE_ADD Opcode");

  // Check if the last instruction is an extend
  MachineInstr *ExtMI = getDefIgnoringCopies(MI.getOperand(1).getReg(), MRI);
  auto ExtOpc = ExtMI->getOpcode();

  if (ExtOpc == TargetOpcode::G_ZEXT)
    std::get<1>(MatchInfo) = 0;
  else if (ExtOpc == TargetOpcode::G_SEXT)
    std::get<1>(MatchInfo) = 1;
  else
    return false;

  // Check if the source register is a valid type
  Register ExtSrcReg = ExtMI->getOperand(1).getReg();
  LLT ExtSrcTy = MRI.getType(ExtSrcReg);
  LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
  if ((DstTy.getScalarSizeInBits() == 16 &&
       ExtSrcTy.getNumElements() % 8 == 0 && ExtSrcTy.getNumElements() < 256) ||
      (DstTy.getScalarSizeInBits() == 32 &&
       ExtSrcTy.getNumElements() % 4 == 0) ||
      (DstTy.getScalarSizeInBits() == 64 &&
       ExtSrcTy.getNumElements() % 4 == 0)) {
    std::get<0>(MatchInfo) = ExtSrcReg;
    return true;
  }
  return false;
}

void applyExtUaddvToUaddlv(MachineInstr &MI, MachineRegisterInfo &MRI,
                           MachineIRBuilder &B, GISelChangeObserver &Observer,
                           std::pair<Register, bool> &MatchInfo) {
  assert(MI.getOpcode() == TargetOpcode::G_VECREDUCE_ADD &&
         "Expected G_VECREDUCE_ADD Opcode");

  unsigned Opc = std::get<1>(MatchInfo) ? AArch64::G_SADDLV : AArch64::G_UADDLV;
  Register SrcReg = std::get<0>(MatchInfo);
  Register DstReg = MI.getOperand(0).getReg();
  LLT SrcTy = MRI.getType(SrcReg);
  LLT DstTy = MRI.getType(DstReg);

  // If SrcTy has more elements than expected, split them into multiple
  // instructions and sum the results
  LLT MainTy;
  SmallVector<Register, 1> WorkingRegisters;
  unsigned SrcScalSize = SrcTy.getScalarSizeInBits();
  unsigned SrcNumElem = SrcTy.getNumElements();
  if ((SrcScalSize == 8 && SrcNumElem > 16) ||
      (SrcScalSize == 16 && SrcNumElem > 8) ||
      (SrcScalSize == 32 && SrcNumElem > 4)) {

    LLT LeftoverTy;
    SmallVector<Register, 4> LeftoverRegs;
    if (SrcScalSize == 8)
      MainTy = LLT::fixed_vector(16, 8);
    else if (SrcScalSize == 16)
      MainTy = LLT::fixed_vector(8, 16);
    else if (SrcScalSize == 32)
      MainTy = LLT::fixed_vector(4, 32);
    else
      llvm_unreachable("Source's Scalar Size not supported");

    // Extract the parts, put each extracted source through U/SADDLV, and
    // collect the values in a small vector
    extractParts(SrcReg, SrcTy, MainTy, LeftoverTy, WorkingRegisters,
                 LeftoverRegs, B, MRI);
    for (unsigned I = 0; I < LeftoverRegs.size(); I++) {
      WorkingRegisters.push_back(LeftoverRegs[I]);
    }
  } else {
    WorkingRegisters.push_back(SrcReg);
    MainTy = SrcTy;
  }

  unsigned MidScalarSize = MainTy.getScalarSizeInBits() * 2;
  LLT MidScalarLLT = LLT::scalar(MidScalarSize);
  Register zeroReg = B.buildConstant(LLT::scalar(64), 0).getReg(0);
  for (unsigned I = 0; I < WorkingRegisters.size(); I++) {
    // If the number of elements is too small to build an instruction, extend
    // its size before applying addlv
    LLT WorkingRegTy = MRI.getType(WorkingRegisters[I]);
    if ((WorkingRegTy.getScalarSizeInBits() == 8) &&
        (WorkingRegTy.getNumElements() == 4)) {
      WorkingRegisters[I] =
          B.buildInstr(std::get<1>(MatchInfo) ? TargetOpcode::G_SEXT
                                              : TargetOpcode::G_ZEXT,
                       {LLT::fixed_vector(4, 16)}, {WorkingRegisters[I]})
              .getReg(0);
    }

    // Generate the {U/S}ADDLV instruction, whose output is always double the
    // width of the source's scalar type
    LLT addlvTy = MidScalarSize <= 32 ? LLT::fixed_vector(4, 32)
                                      : LLT::fixed_vector(2, 64);
    Register addlvReg =
        B.buildInstr(Opc, {addlvTy}, {WorkingRegisters[I]}).getReg(0);

    // The output from {U/S}ADDLV gets placed in the lowest lane of a v4i32 or
    // v2i64 register:
    //   i16 and i32 results use a v4i32 register
    //   i64 results use a v2i64 register
    // Therefore we have to extract/truncate the value to the right type
    if (MidScalarSize == 32 || MidScalarSize == 64) {
      WorkingRegisters[I] = B.buildInstr(AArch64::G_EXTRACT_VECTOR_ELT,
                                         {MidScalarLLT}, {addlvReg, zeroReg})
                                .getReg(0);
    } else {
      Register extractReg = B.buildInstr(AArch64::G_EXTRACT_VECTOR_ELT,
                                         {LLT::scalar(32)}, {addlvReg, zeroReg})
                                .getReg(0);
      WorkingRegisters[I] =
          B.buildTrunc({MidScalarLLT}, {extractReg}).getReg(0);
    }
  }

  Register outReg;
  if (WorkingRegisters.size() > 1) {
    outReg = B.buildAdd(MidScalarLLT, WorkingRegisters[0], WorkingRegisters[1])
                 .getReg(0);
    for (unsigned I = 2; I < WorkingRegisters.size(); I++) {
      outReg = B.buildAdd(MidScalarLLT, outReg, WorkingRegisters[I]).getReg(0);
    }
  } else {
    outReg = WorkingRegisters[0];
  }

  if (DstTy.getScalarSizeInBits() > MidScalarSize) {
    // Handle the scalar value if the DstTy's scalar size is more than double
    // the source's scalar size
    B.buildInstr(std::get<1>(MatchInfo) ? TargetOpcode::G_SEXT
                                        : TargetOpcode::G_ZEXT,
                 {DstReg}, {outReg});
  } else {
    B.buildCopy(DstReg, outReg);
  }

  MI.eraseFromParent();
}

// Pushes ADD/SUB through extend instructions to decrease the number of extend
// instructions at the end by allowing selection of {s|u}addl sooner.

// i32 add(i32 ext i8, i32 ext i8) => i32 ext(i16 add(i16 ext i8, i16 ext i8))
bool matchPushAddSubExt(MachineInstr &MI, MachineRegisterInfo &MRI,
                        Register DstReg, Register SrcReg1, Register SrcReg2) {
  assert((MI.getOpcode() == TargetOpcode::G_ADD ||
          MI.getOpcode() == TargetOpcode::G_SUB) &&
         "Expected a G_ADD or G_SUB instruction\n");

  // Deal with vector types only
  LLT DstTy = MRI.getType(DstReg);
  if (!DstTy.isVector())
    return false;

  // Return true only if the G_{S|Z}EXT destination is more than twice the
  // size of the source, and both sources have the same type.
  Register ExtDstReg = MI.getOperand(1).getReg();
  LLT Ext1SrcTy = MRI.getType(SrcReg1);
  LLT Ext2SrcTy = MRI.getType(SrcReg2);
  unsigned ExtDstScal = MRI.getType(ExtDstReg).getScalarSizeInBits();
  unsigned Ext1SrcScal = Ext1SrcTy.getScalarSizeInBits();
  if (((Ext1SrcScal == 8 && ExtDstScal == 32) ||
       ((Ext1SrcScal == 8 || Ext1SrcScal == 16) && ExtDstScal == 64)) &&
      Ext1SrcTy == Ext2SrcTy)
    return true;

  return false;
}

void applyPushAddSubExt(MachineInstr &MI, MachineRegisterInfo &MRI,
                        MachineIRBuilder &B, bool isSExt, Register DstReg,
                        Register SrcReg1, Register SrcReg2) {
  LLT SrcTy = MRI.getType(SrcReg1);
  LLT MidTy = SrcTy.changeElementSize(SrcTy.getScalarSizeInBits() * 2);
  unsigned Opc = isSExt ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT;
  Register Ext1Reg = B.buildInstr(Opc, {MidTy}, {SrcReg1}).getReg(0);
  Register Ext2Reg = B.buildInstr(Opc, {MidTy}, {SrcReg2}).getReg(0);
  Register AddReg =
      B.buildInstr(MI.getOpcode(), {MidTy}, {Ext1Reg, Ext2Reg}).getReg(0);

  // G_SUB has to sign-extend the result.
  // G_ADD needs to sext from sext and can sext or zext from zext, so the
  // original opcode is used.
  if (MI.getOpcode() == TargetOpcode::G_ADD)
    B.buildInstr(Opc, {DstReg}, {AddReg});
  else
    B.buildSExt(DstReg, AddReg);

  MI.eraseFromParent();
}

bool tryToSimplifyUADDO(MachineInstr &MI, MachineIRBuilder &B,
                        CombinerHelper &Helper, GISelChangeObserver &Observer) {
  // Try to simplify a G_UADDO with 8 or 16 bit operands to a wide G_ADD and a
  // TBNZ if the result is only used in the no-overflow case. It is restricted
  // to cases where we know that the high-bits of the operands are 0. If there's
  // an overflow, then the 9th or 17th bit must be set, which can be checked
  // using TBNZ.
  //
  // Change (for UADDOs on 8 and 16 bits):
  //
  //   %z0 = G_ASSERT_ZEXT _
  //   %op0 = G_TRUNC %z0
  //   %z1 = G_ASSERT_ZEXT _
  //   %op1 = G_TRUNC %z1
  //   %val, %cond = G_UADDO %op0, %op1
  //   G_BRCOND %cond, %error.bb
  //
  // error.bb:
  //   (no successors and no uses of %val)
  //
  // To:
  //
  //   %z0 = G_ASSERT_ZEXT _
  //   %z1 = G_ASSERT_ZEXT _
  //   %add = G_ADD %z0, %z1
  //   %val = G_TRUNC %add
  //   %bit = G_AND %add, 1 << scalar-size-in-bits(%op1)
  //   %cond = G_ICMP NE, %bit, 0
  //   G_BRCOND %cond, %error.bb

  auto &MRI = *B.getMRI();

  MachineOperand *DefOp0 = MRI.getOneDef(MI.getOperand(2).getReg());
  MachineOperand *DefOp1 = MRI.getOneDef(MI.getOperand(3).getReg());
  Register Op0Wide;
  Register Op1Wide;
  if (!mi_match(DefOp0->getParent(), MRI, m_GTrunc(m_Reg(Op0Wide))) ||
      !mi_match(DefOp1->getParent(), MRI, m_GTrunc(m_Reg(Op1Wide))))
    return false;
  LLT WideTy0 = MRI.getType(Op0Wide);
  LLT WideTy1 = MRI.getType(Op1Wide);
  Register ResVal = MI.getOperand(0).getReg();
  LLT OpTy = MRI.getType(ResVal);
  MachineInstr *Op0WideDef = MRI.getVRegDef(Op0Wide);
  MachineInstr *Op1WideDef = MRI.getVRegDef(Op1Wide);

  unsigned OpTySize = OpTy.getScalarSizeInBits();
  // First check that the G_TRUNC feeding the G_UADDO are no-ops, because the
  // inputs have been zero-extended.
  if (Op0WideDef->getOpcode() != TargetOpcode::G_ASSERT_ZEXT ||
      Op1WideDef->getOpcode() != TargetOpcode::G_ASSERT_ZEXT ||
      OpTySize != Op0WideDef->getOperand(2).getImm() ||
      OpTySize != Op1WideDef->getOperand(2).getImm())
    return false;

  // Only scalar UADDO with either 8 or 16 bit operands are handled.
  if (!WideTy0.isScalar() || !WideTy1.isScalar() || WideTy0 != WideTy1 ||
      OpTySize >= WideTy0.getScalarSizeInBits() ||
      (OpTySize != 8 && OpTySize != 16))
    return false;

  // The overflow-status result must be used by a branch only.
  Register ResStatus = MI.getOperand(1).getReg();
  if (!MRI.hasOneNonDBGUse(ResStatus))
    return false;
  MachineInstr *CondUser = &*MRI.use_instr_nodbg_begin(ResStatus);
  if (CondUser->getOpcode() != TargetOpcode::G_BRCOND)
    return false;

  // Make sure the computed result is only used in the no-overflow blocks.
  MachineBasicBlock *CurrentMBB = MI.getParent();
  MachineBasicBlock *FailMBB = CondUser->getOperand(1).getMBB();
  if (!FailMBB->succ_empty() || CondUser->getParent() != CurrentMBB)
    return false;
  if (any_of(MRI.use_nodbg_instructions(ResVal),
             [&MI, FailMBB, CurrentMBB](MachineInstr &I) {
               return &MI != &I &&
                      (I.getParent() == FailMBB || I.getParent() == CurrentMBB);
             }))
    return false;

  // Remove the G_UADDO.
  B.setInstrAndDebugLoc(*MI.getNextNode());
  MI.eraseFromParent();

  // Emit wide add.
  Register AddDst = MRI.cloneVirtualRegister(Op0Wide);
  B.buildInstr(TargetOpcode::G_ADD, {AddDst}, {Op0Wide, Op1Wide});

  // Emit check of the 9th or 17th bit and update users (the branch). This will
  // later be folded to TBNZ.
  Register CondBit = MRI.cloneVirtualRegister(Op0Wide);
  B.buildAnd(
      CondBit, AddDst,
      B.buildConstant(LLT::scalar(32), OpTySize == 8 ? 1 << 8 : 1 << 16));
  B.buildICmp(CmpInst::ICMP_NE, ResStatus, CondBit,
              B.buildConstant(LLT::scalar(32), 0));

  // Update ZEXT users of the result value. Because all uses are in the
  // no-overflow case, we know that the top bits are 0 and we can ignore ZExts.
  B.buildZExtOrTrunc(ResVal, AddDst);
  for (MachineOperand &U : make_early_inc_range(MRI.use_operands(ResVal))) {
    Register WideReg;
    if (mi_match(U.getParent(), MRI, m_GZExt(m_Reg(WideReg)))) {
      auto OldR = U.getParent()->getOperand(0).getReg();
      Observer.erasingInstr(*U.getParent());
      U.getParent()->eraseFromParent();
      Helper.replaceRegWith(MRI, OldR, AddDst);
    }
  }

  return true;
}

class AArch64PreLegalizerCombinerImpl : public Combiner {
protected:
  // TODO: Make CombinerHelper methods const.
  mutable CombinerHelper Helper;
  const AArch64PreLegalizerCombinerImplRuleConfig &RuleConfig;
  const AArch64Subtarget &STI;

public:
  AArch64PreLegalizerCombinerImpl(
      MachineFunction &MF, CombinerInfo &CInfo, const TargetPassConfig *TPC,
      GISelKnownBits &KB, GISelCSEInfo *CSEInfo,
      const AArch64PreLegalizerCombinerImplRuleConfig &RuleConfig,
      const AArch64Subtarget &STI, MachineDominatorTree *MDT,
      const LegalizerInfo *LI);

  static const char *getName() { return "AArch64PreLegalizerCombiner"; }

  bool tryCombineAll(MachineInstr &I) const override;

  bool tryCombineAllImpl(MachineInstr &I) const;

private:
#define GET_GICOMBINER_CLASS_MEMBERS
#include "AArch64GenPreLegalizeGICombiner.inc"
#undef GET_GICOMBINER_CLASS_MEMBERS
};

#define GET_GICOMBINER_IMPL
#include "AArch64GenPreLegalizeGICombiner.inc"
#undef GET_GICOMBINER_IMPL

AArch64PreLegalizerCombinerImpl::AArch64PreLegalizerCombinerImpl(
    MachineFunction &MF, CombinerInfo &CInfo, const TargetPassConfig *TPC,
    GISelKnownBits &KB, GISelCSEInfo *CSEInfo,
    const AArch64PreLegalizerCombinerImplRuleConfig &RuleConfig,
    const AArch64Subtarget &STI, MachineDominatorTree *MDT,
    const LegalizerInfo *LI)
    : Combiner(MF, CInfo, TPC, &KB, CSEInfo),
      Helper(Observer, B, /*IsPreLegalize*/ true, &KB, MDT, LI),
      RuleConfig(RuleConfig), STI(STI),
#define GET_GICOMBINER_CONSTRUCTOR_INITS
#include "AArch64GenPreLegalizeGICombiner.inc"
#undef GET_GICOMBINER_CONSTRUCTOR_INITS
{
}

bool AArch64PreLegalizerCombinerImpl::tryCombineAll(MachineInstr &MI) const {
  if (tryCombineAllImpl(MI))
    return true;

  unsigned Opc = MI.getOpcode();
  switch (Opc) {
  case TargetOpcode::G_SHUFFLE_VECTOR:
    return Helper.tryCombineShuffleVector(MI);
  case TargetOpcode::G_UADDO:
    return tryToSimplifyUADDO(MI, B, Helper, Observer);
  case TargetOpcode::G_MEMCPY_INLINE:
    return Helper.tryEmitMemcpyInline(MI);
  case TargetOpcode::G_MEMCPY:
  case TargetOpcode::G_MEMMOVE:
  case TargetOpcode::G_MEMSET: {
    // If we're at -O0 set a maxlen of 32 to inline, otherwise let the other
    // heuristics decide.
    unsigned MaxLen = CInfo.EnableOpt ? 0 : 32;
    // Try to inline memcpy type calls if optimizations are enabled.
    if (Helper.tryCombineMemCpyFamily(MI, MaxLen))
      return true;
    if (Opc == TargetOpcode::G_MEMSET)
      return llvm::AArch64GISelUtils::tryEmitBZero(MI, B, CInfo.EnableMinSize);
    return false;
  }
  }

  return false;
}

// Pass boilerplate
// ================

class AArch64PreLegalizerCombiner : public MachineFunctionPass {
public:
  static char ID;

  AArch64PreLegalizerCombiner();

  StringRef getPassName() const override {
    return "AArch64PreLegalizerCombiner";
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override;

private:
  AArch64PreLegalizerCombinerImplRuleConfig RuleConfig;
};
} // end anonymous namespace

void AArch64PreLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<TargetPassConfig>();
  AU.setPreservesCFG();
  getSelectionDAGFallbackAnalysisUsage(AU);
  AU.addRequired<GISelKnownBitsAnalysis>();
  AU.addPreserved<GISelKnownBitsAnalysis>();
  AU.addRequired<MachineDominatorTreeWrapperPass>();
  AU.addPreserved<MachineDominatorTreeWrapperPass>();
  AU.addRequired<GISelCSEAnalysisWrapperPass>();
  AU.addPreserved<GISelCSEAnalysisWrapperPass>();
  MachineFunctionPass::getAnalysisUsage(AU);
}

AArch64PreLegalizerCombiner::AArch64PreLegalizerCombiner()
    : MachineFunctionPass(ID) {
  initializeAArch64PreLegalizerCombinerPass(*PassRegistry::getPassRegistry());

  if (!RuleConfig.parseCommandLineOption())
    report_fatal_error("Invalid rule identifier");
}

bool AArch64PreLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
  if (MF.getProperties().hasProperty(
          MachineFunctionProperties::Property::FailedISel))
    return false;
  auto &TPC = getAnalysis<TargetPassConfig>();

  // Enable CSE.
  GISelCSEAnalysisWrapper &Wrapper =
      getAnalysis<GISelCSEAnalysisWrapperPass>().getCSEWrapper();
  auto *CSEInfo = &Wrapper.get(TPC.getCSEConfig());

  const AArch64Subtarget &ST = MF.getSubtarget<AArch64Subtarget>();
  const auto *LI = ST.getLegalizerInfo();

  const Function &F = MF.getFunction();
  bool EnableOpt =
      MF.getTarget().getOptLevel() != CodeGenOptLevel::None && !skipFunction(F);
  GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
  MachineDominatorTree *MDT =
      &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
  CombinerInfo CInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false,
                     /*LegalizerInfo*/ nullptr, EnableOpt, F.hasOptSize(),
                     F.hasMinSize());
  AArch64PreLegalizerCombinerImpl Impl(MF, CInfo, &TPC, *KB, CSEInfo,
                                       RuleConfig, ST, MDT, LI);
  return Impl.combineMachineInstrs();
}

char AArch64PreLegalizerCombiner::ID = 0;
INITIALIZE_PASS_BEGIN(AArch64PreLegalizerCombiner, DEBUG_TYPE,
                      "Combine AArch64 machine instrs before legalization",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis)
INITIALIZE_PASS_DEPENDENCY(GISelCSEAnalysisWrapperPass)
INITIALIZE_PASS_END(AArch64PreLegalizerCombiner, DEBUG_TYPE,
                    "Combine AArch64 machine instrs before legalization", false,
                    false)

namespace llvm {
FunctionPass *createAArch64PreLegalizerCombiner() {
  return new AArch64PreLegalizerCombiner();
}
} // end namespace llvm