//=== AArch64PostLegalizerCombiner.cpp --------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// Post-legalization combines on generic MachineInstrs.
///
/// The combines here must preserve instruction legality.
///
/// Lowering combines (e.g. pseudo matching) should be handled by
/// AArch64PostLegalizerLowering.
///
/// Combines which don't rely on instruction legality should go in the
/// AArch64PreLegalizerCombiner.
///
//===----------------------------------------------------------------------===//

#include "AArch64TargetMachine.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/GlobalISel/CSEInfo.h"
#include "llvm/CodeGen/GlobalISel/CSEMIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/Combiner.h"
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
#include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h"
#include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h"
#include "llvm/CodeGen/GlobalISel/GISelValueTracking.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/Support/Debug.h"

#define GET_GICOMBINER_DEPS
#include "AArch64GenPostLegalizeGICombiner.inc"
#undef GET_GICOMBINER_DEPS

#define DEBUG_TYPE "aarch64-postlegalizer-combiner"

using namespace llvm;
using namespace MIPatternMatch;

namespace {

#define GET_GICOMBINER_TYPES
#include "AArch64GenPostLegalizeGICombiner.inc"
#undef GET_GICOMBINER_TYPES

/// This combine tries to do what performExtractVectorEltCombine does in SDAG.
/// Rewrite for pairwise fadd pattern
///   (s32 (g_extract_vector_elt
///           (g_fadd (vXs32 Other)
///                   (g_vector_shuffle (vXs32 Other) undef <1,X,...> )) 0))
/// ->
///   (s32 (g_fadd (g_extract_vector_elt (vXs32 Other) 0)
///                (g_extract_vector_elt (vXs32 Other) 1))
bool matchExtractVecEltPairwiseAdd(
    MachineInstr &MI, MachineRegisterInfo &MRI,
    std::tuple<unsigned, LLT, Register> &MatchInfo) {
  Register Src1 = MI.getOperand(1).getReg();
  Register Src2 = MI.getOperand(2).getReg();
  LLT DstTy = MRI.getType(MI.getOperand(0).getReg());

  auto Cst = getIConstantVRegValWithLookThrough(Src2, MRI);
  if (!Cst || Cst->Value != 0)
    return false;
  // SDAG also checks for FullFP16, but this looks to be beneficial anyway.

  // Now check for an fadd operation. TODO: expand this for integer add?
  auto *FAddMI = getOpcodeDef(TargetOpcode::G_FADD, Src1, MRI);
  if (!FAddMI)
    return false;

  // If we add support for integer add, we must restrict these types to just
  // s64.
  unsigned DstSize = DstTy.getSizeInBits();
  if (DstSize != 16 && DstSize != 32 && DstSize != 64)
    return false;

  Register Src1Op1 = FAddMI->getOperand(1).getReg();
  Register Src1Op2 = FAddMI->getOperand(2).getReg();
  MachineInstr *Shuffle =
      getOpcodeDef(TargetOpcode::G_SHUFFLE_VECTOR, Src1Op2, MRI);
  MachineInstr *Other = MRI.getVRegDef(Src1Op1);
  if (!Shuffle) {
    Shuffle = getOpcodeDef(TargetOpcode::G_SHUFFLE_VECTOR, Src1Op1, MRI);
    Other = MRI.getVRegDef(Src1Op2);
  }

  // We're looking for a shuffle that moves the second element to index 0.
  if (Shuffle && Shuffle->getOperand(3).getShuffleMask()[0] == 1 &&
      Other == MRI.getVRegDef(Shuffle->getOperand(1).getReg())) {
    std::get<0>(MatchInfo) = TargetOpcode::G_FADD;
    std::get<1>(MatchInfo) = DstTy;
    std::get<2>(MatchInfo) = Other->getOperand(0).getReg();
    return true;
  }
  return false;
}

void applyExtractVecEltPairwiseAdd(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
    std::tuple<unsigned, LLT, Register> &MatchInfo) {
  unsigned Opc = std::get<0>(MatchInfo);
  assert(Opc == TargetOpcode::G_FADD && "Unexpected opcode!");
  // We want to generate two extracts of elements 0 and 1, and add them.
  LLT Ty = std::get<1>(MatchInfo);
  Register Src = std::get<2>(MatchInfo);
  LLT s64 = LLT::scalar(64);
  B.setInstrAndDebugLoc(MI);
  auto Elt0 = B.buildExtractVectorElement(Ty, Src, B.buildConstant(s64, 0));
  auto Elt1 = B.buildExtractVectorElement(Ty, Src, B.buildConstant(s64, 1));
  B.buildInstr(Opc, {MI.getOperand(0).getReg()}, {Elt0, Elt1});
  MI.eraseFromParent();
}

bool isSignExtended(Register R, MachineRegisterInfo &MRI) {
  // TODO: check if extended build vector as well.
  unsigned Opc = MRI.getVRegDef(R)->getOpcode();
  return Opc == TargetOpcode::G_SEXT || Opc == TargetOpcode::G_SEXT_INREG;
}

bool isZeroExtended(Register R, MachineRegisterInfo &MRI) {
  // TODO: check if extended build vector as well.
  return MRI.getVRegDef(R)->getOpcode() == TargetOpcode::G_ZEXT;
}

bool matchAArch64MulConstCombine(
    MachineInstr &MI, MachineRegisterInfo &MRI,
    std::function<void(MachineIRBuilder &B, Register DstReg)> &ApplyFn) {
  assert(MI.getOpcode() == TargetOpcode::G_MUL);
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();
  Register Dst = MI.getOperand(0).getReg();
  const LLT Ty = MRI.getType(LHS);

  // The below optimizations require a constant RHS.
  auto Const = getIConstantVRegValWithLookThrough(RHS, MRI);
  if (!Const)
    return false;

  APInt ConstValue = Const->Value.sext(Ty.getSizeInBits());
  // The following code is ported from AArch64ISelLowering.
  // Multiplication of a power of two plus/minus one can be done more
  // cheaply as shift+add/sub. For now, this is true unilaterally. If
  // future CPUs have a cheaper MADD instruction, this may need to be
  // gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and
  // 64-bit is 5 cycles, so this is always a win.
  // More aggressively, some multiplications N0 * C can be lowered to
  // shift+add+shift if the constant C = A * B where A = 2^N + 1 and B = 2^M,
  // e.g. 6=3*2=(2+1)*2.
  // TODO: consider lowering more cases, e.g. C = 14, -6, -14 or even 45
  // which equals (1+2)*16-(1+2).
  // TrailingZeroes is used to test if the mul can be lowered to
  // shift+add+shift.
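  // Concrete examples of the cases handled below (illustrative):
  //   x * 3  => (add (shl x, 1), x)            ; C = 2^1 + 1
  //   x * 7  => (sub (shl x, 3), x)            ; C = 2^3 - 1
  //   x * 6  => (shl (add (shl x, 1), x), 1)   ; C = (2^1 + 1) * 2^1
  //   x * -7 => (sub x, (shl x, 3))            ; C = -(2^3 - 1)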
  unsigned TrailingZeroes = ConstValue.countr_zero();
  if (TrailingZeroes) {
    // Conservatively do not lower to shift+add+shift if the mul might be
    // folded into smul or umul.
    if (MRI.hasOneNonDBGUse(LHS) &&
        (isSignExtended(LHS, MRI) || isZeroExtended(LHS, MRI)))
      return false;
    // Conservatively do not lower to shift+add+shift if the mul might be
    // folded into madd or msub.
    if (MRI.hasOneNonDBGUse(Dst)) {
      MachineInstr &UseMI = *MRI.use_instr_begin(Dst);
      unsigned UseOpc = UseMI.getOpcode();
      if (UseOpc == TargetOpcode::G_ADD || UseOpc == TargetOpcode::G_PTR_ADD ||
          UseOpc == TargetOpcode::G_SUB)
        return false;
    }
  }
  // Use ShiftedConstValue instead of ConstValue to support both shift+add/sub
  // and shift+add+shift.
  APInt ShiftedConstValue = ConstValue.ashr(TrailingZeroes);

  unsigned ShiftAmt, AddSubOpc;
  // Is the shifted value the LHS operand of the add/sub?
  bool ShiftValUseIsLHS = true;
  // Do we need to negate the result?
  bool NegateResult = false;

  if (ConstValue.isNonNegative()) {
    // (mul x, 2^N + 1) => (add (shl x, N), x)
    // (mul x, 2^N - 1) => (sub (shl x, N), x)
    // (mul x, (2^N + 1) * 2^M) => (shl (add (shl x, N), x), M)
    APInt SCVMinus1 = ShiftedConstValue - 1;
    APInt CVPlus1 = ConstValue + 1;
    if (SCVMinus1.isPowerOf2()) {
      ShiftAmt = SCVMinus1.logBase2();
      AddSubOpc = TargetOpcode::G_ADD;
    } else if (CVPlus1.isPowerOf2()) {
      ShiftAmt = CVPlus1.logBase2();
      AddSubOpc = TargetOpcode::G_SUB;
    } else
      return false;
  } else {
    // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
    // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
    APInt CVNegPlus1 = -ConstValue + 1;
    APInt CVNegMinus1 = -ConstValue - 1;
    if (CVNegPlus1.isPowerOf2()) {
      ShiftAmt = CVNegPlus1.logBase2();
      AddSubOpc = TargetOpcode::G_SUB;
      ShiftValUseIsLHS = false;
    } else if (CVNegMinus1.isPowerOf2()) {
      ShiftAmt = CVNegMinus1.logBase2();
      AddSubOpc = TargetOpcode::G_ADD;
      NegateResult = true;
    } else
      return false;
  }

  if (NegateResult && TrailingZeroes)
    return false;

  ApplyFn = [=](MachineIRBuilder &B, Register DstReg) {
    auto Shift = B.buildConstant(LLT::scalar(64), ShiftAmt);
    auto ShiftedVal = B.buildShl(Ty, LHS, Shift);

    Register AddSubLHS = ShiftValUseIsLHS ? ShiftedVal.getReg(0) : LHS;
    Register AddSubRHS = ShiftValUseIsLHS ? LHS : ShiftedVal.getReg(0);
    auto Res = B.buildInstr(AddSubOpc, {Ty}, {AddSubLHS, AddSubRHS});
    assert(!(NegateResult && TrailingZeroes) &&
           "NegateResult and TrailingZeroes cannot both be true for now.");
    // Negate the result.
    if (NegateResult) {
      B.buildSub(DstReg, B.buildConstant(Ty, 0), Res);
      return;
    }
    // Shift the result.
    if (TrailingZeroes) {
      B.buildShl(DstReg, Res, B.buildConstant(LLT::scalar(64), TrailingZeroes));
      return;
    }
    B.buildCopy(DstReg, Res.getReg(0));
  };
  return true;
}

void applyAArch64MulConstCombine(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
    std::function<void(MachineIRBuilder &B, Register DstReg)> &ApplyFn) {
  B.setInstrAndDebugLoc(MI);
  ApplyFn(B, MI.getOperand(0).getReg());
  MI.eraseFromParent();
}

/// Try to fold a G_MERGE_VALUES of 2 s32 sources, where the second source
/// is a zero, into a G_ZEXT of the first.
bool matchFoldMergeToZext(MachineInstr &MI, MachineRegisterInfo &MRI) {
  auto &Merge = cast<GMerge>(MI);
  LLT SrcTy = MRI.getType(Merge.getSourceReg(0));
  if (SrcTy != LLT::scalar(32) || Merge.getNumSources() != 2)
    return false;
  return mi_match(Merge.getSourceReg(1), MRI, m_SpecificICst(0));
}

void applyFoldMergeToZext(MachineInstr &MI, MachineRegisterInfo &MRI,
                          MachineIRBuilder &B, GISelChangeObserver &Observer) {
  // Mutate %d(s64) = G_MERGE_VALUES %a(s32), 0(s32)
  //  ->
  //        %d(s64) = G_ZEXT %a(s32)
  Observer.changingInstr(MI);
  MI.setDesc(B.getTII().get(TargetOpcode::G_ZEXT));
  MI.removeOperand(2);
  Observer.changedInstr(MI);
}

/// \returns True if a G_ANYEXT instruction \p MI should be mutated to a G_ZEXT
/// instruction.
bool matchMutateAnyExtToZExt(MachineInstr &MI, MachineRegisterInfo &MRI) {
  // If this is coming from a scalar compare then we can use a G_ZEXT instead
  // of a G_ANYEXT:
  //
  // %cmp:_(s32) = G_[I|F]CMP ... <-- produces 0/1.
  // %ext:_(s64) = G_ANYEXT %cmp(s32)
  //
  // By doing this, we can leverage more KnownBits combines.
  assert(MI.getOpcode() == TargetOpcode::G_ANYEXT);
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();
  return MRI.getType(Dst).isScalar() &&
         mi_match(Src, MRI,
                  m_any_of(m_GICmp(m_Pred(), m_Reg(), m_Reg()),
                           m_GFCmp(m_Pred(), m_Reg(), m_Reg())));
}

void applyMutateAnyExtToZExt(MachineInstr &MI, MachineRegisterInfo &MRI,
                             MachineIRBuilder &B,
                             GISelChangeObserver &Observer) {
  Observer.changingInstr(MI);
  MI.setDesc(B.getTII().get(TargetOpcode::G_ZEXT));
  Observer.changedInstr(MI);
}

/// Match a 128b store of zero and split it into two 64 bit stores, for
/// size/performance reasons.
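/// For example (an illustrative case; register names are made up):
///   %zero:_(<2 x s64>) = G_BUILD_VECTOR %c0(s64), %c0(s64)
///   G_STORE %zero(<2 x s64>), %ptr(p0) :: (store (<2 x s64>))
/// becomes two s64 stores of zero, one at %ptr and one at %ptr + 8.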
bool matchSplitStoreZero128(MachineInstr &MI, MachineRegisterInfo &MRI) {
  GStore &Store = cast<GStore>(MI);
  if (!Store.isSimple())
    return false;
  LLT ValTy = MRI.getType(Store.getValueReg());
  if (ValTy.isScalableVector())
    return false;
  if (!ValTy.isVector() || ValTy.getSizeInBits() != 128)
    return false;
  if (Store.getMemSizeInBits() != ValTy.getSizeInBits())
    return false; // Don't split truncating stores.
  if (!MRI.hasOneNonDBGUse(Store.getValueReg()))
    return false;
  auto MaybeCst = isConstantOrConstantSplatVector(
      *MRI.getVRegDef(Store.getValueReg()), MRI);
  return MaybeCst && MaybeCst->isZero();
}

void applySplitStoreZero128(MachineInstr &MI, MachineRegisterInfo &MRI,
                            MachineIRBuilder &B,
                            GISelChangeObserver &Observer) {
  B.setInstrAndDebugLoc(MI);
  GStore &Store = cast<GStore>(MI);
  assert(MRI.getType(Store.getValueReg()).isVector() &&
         "Expected a vector store value");
  LLT NewTy = LLT::scalar(64);
  Register PtrReg = Store.getPointerReg();
  auto Zero = B.buildConstant(NewTy, 0);
  auto HighPtr = B.buildPtrAdd(MRI.getType(PtrReg), PtrReg,
                               B.buildConstant(LLT::scalar(64), 8));
  auto &MF = *MI.getMF();
  auto *LowMMO = MF.getMachineMemOperand(&Store.getMMO(), 0, NewTy);
  auto *HighMMO = MF.getMachineMemOperand(&Store.getMMO(), 8, NewTy);
  B.buildStore(Zero, PtrReg, *LowMMO);
  B.buildStore(Zero, HighPtr, *HighMMO);
  Store.eraseFromParent();
}

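// Match (or (and X, MaskImm), (and Y, NotMaskImm)), where the two constant
// build vectors are lane-wise bitwise inverses of each other, and rewrite it
// as a bitwise select: G_BSP MaskImm, X, Y. A sketch of the MIR this catches
// (register names are illustrative):
//   %lhs:_(<4 x s32>) = G_AND %x, %mask
//   %rhs:_(<4 x s32>) = G_AND %y, %notmask
//   %or:_(<4 x s32>) = G_OR %lhs, %rhs
// =>
//   %or:_(<4 x s32>) = G_BSP %mask, %x, %y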
bool matchOrToBSP(MachineInstr &MI, MachineRegisterInfo &MRI,
                  std::tuple<Register, Register, Register> &MatchInfo) {
  const LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
  if (!DstTy.isVector())
    return false;

  Register AO1, AO2, BVO1, BVO2;
  if (!mi_match(MI, MRI,
                m_GOr(m_GAnd(m_Reg(AO1), m_Reg(BVO1)),
                      m_GAnd(m_Reg(AO2), m_Reg(BVO2)))))
    return false;

  auto *BV1 = getOpcodeDef<GBuildVector>(BVO1, MRI);
  auto *BV2 = getOpcodeDef<GBuildVector>(BVO2, MRI);
  if (!BV1 || !BV2)
    return false;

  for (int I = 0, E = DstTy.getNumElements(); I < E; I++) {
    auto ValAndVReg1 =
        getIConstantVRegValWithLookThrough(BV1->getSourceReg(I), MRI);
    auto ValAndVReg2 =
        getIConstantVRegValWithLookThrough(BV2->getSourceReg(I), MRI);
    if (!ValAndVReg1 || !ValAndVReg2 ||
        ValAndVReg1->Value != ~ValAndVReg2->Value)
      return false;
  }

  MatchInfo = {AO1, AO2, BVO1};
  return true;
}

void applyOrToBSP(MachineInstr &MI, MachineRegisterInfo &MRI,
                  MachineIRBuilder &B,
                  std::tuple<Register, Register, Register> &MatchInfo) {
  B.setInstrAndDebugLoc(MI);
  B.buildInstr(
      AArch64::G_BSP, {MI.getOperand(0).getReg()},
      {std::get<2>(MatchInfo), std::get<0>(MatchInfo), std::get<1>(MatchInfo)});
  MI.eraseFromParent();
}

// Combines Mul(And(Srl(X, 15), 0x10001), 0xffff) into CMLTz
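// The constants shown are the 32-bit-element case; in general the pattern is
// Srl by HalfSize - 1, And with (1 | 1 << HalfSize), and Mul by a HalfSize-bit
// all-ones mask, where HalfSize is half the element width. The Srl/And place
// the sign bit of each HalfSize-wide half into that half's low bit, and the
// Mul replicates it across the half, so when the value is reinterpreted as a
// vector of HalfSize-wide elements each lane is all-ones exactly when it is
// negative. That is a CMLT (signed compare less than zero) against zero on
// the half-size vector, followed by a bitcast back to the original type.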
bool matchCombineMulCMLT(MachineInstr &MI, MachineRegisterInfo &MRI,
                         Register &SrcReg) {
  LLT DstTy = MRI.getType(MI.getOperand(0).getReg());

  if (DstTy != LLT::fixed_vector(2, 64) && DstTy != LLT::fixed_vector(2, 32) &&
      DstTy != LLT::fixed_vector(4, 32) && DstTy != LLT::fixed_vector(4, 16) &&
      DstTy != LLT::fixed_vector(8, 16))
    return false;

  auto AndMI = getDefIgnoringCopies(MI.getOperand(1).getReg(), MRI);
  if (AndMI->getOpcode() != TargetOpcode::G_AND)
    return false;
  auto LShrMI = getDefIgnoringCopies(AndMI->getOperand(1).getReg(), MRI);
  if (LShrMI->getOpcode() != TargetOpcode::G_LSHR)
    return false;

  // Check the constant splat values
  auto V1 = isConstantOrConstantSplatVector(
      *MRI.getVRegDef(MI.getOperand(2).getReg()), MRI);
  auto V2 = isConstantOrConstantSplatVector(
      *MRI.getVRegDef(AndMI->getOperand(2).getReg()), MRI);
  auto V3 = isConstantOrConstantSplatVector(
      *MRI.getVRegDef(LShrMI->getOperand(2).getReg()), MRI);
  if (!V1.has_value() || !V2.has_value() || !V3.has_value())
    return false;
  unsigned HalfSize = DstTy.getScalarSizeInBits() / 2;
  if (!V1.value().isMask(HalfSize) || V2.value() != (1ULL | 1ULL << HalfSize) ||
      V3 != (HalfSize - 1))
    return false;

  SrcReg = LShrMI->getOperand(1).getReg();

  return true;
}

void applyCombineMulCMLT(MachineInstr &MI, MachineRegisterInfo &MRI,
                         MachineIRBuilder &B, Register &SrcReg) {
  Register DstReg = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(DstReg);
  LLT HalfTy =
      DstTy.changeElementCount(DstTy.getElementCount().multiplyCoefficientBy(2))
          .changeElementSize(DstTy.getScalarSizeInBits() / 2);

  Register ZeroVec = B.buildConstant(HalfTy, 0).getReg(0);
  Register CastReg =
      B.buildInstr(TargetOpcode::G_BITCAST, {HalfTy}, {SrcReg}).getReg(0);
  Register CMLTReg =
      B.buildICmp(CmpInst::Predicate::ICMP_SLT, HalfTy, CastReg, ZeroVec)
          .getReg(0);

  B.buildInstr(TargetOpcode::G_BITCAST, {DstReg}, {CMLTReg}).getReg(0);
  MI.eraseFromParent();
}

// Match mul({z/s}ext, {z/s}ext) => {u/s}mull
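// For example (an illustrative case; register names are made up):
//   %x:_(<4 x s32>) = G_ZEXT %a(<4 x s16>)
//   %y:_(<4 x s32>) = G_ZEXT %b(<4 x s16>)
//   %m:_(<4 x s32>) = G_MUL %x, %y
// =>
//   %m:_(<4 x s32>) = G_UMULL %a(<4 x s16>), %b(<4 x s16>)
// The G_SEXT form becomes G_SMULL instead.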
bool matchExtMulToMULL(MachineInstr &MI, MachineRegisterInfo &MRI,
                       GISelValueTracking *KB,
                       std::tuple<bool, Register, Register> &MatchInfo) {
  // Get the instructions that defined the source operand
  LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
  MachineInstr *I1 = getDefIgnoringCopies(MI.getOperand(1).getReg(), MRI);
  MachineInstr *I2 = getDefIgnoringCopies(MI.getOperand(2).getReg(), MRI);
  unsigned I1Opc = I1->getOpcode();
  unsigned I2Opc = I2->getOpcode();
  unsigned EltSize = DstTy.getScalarSizeInBits();

  if (!DstTy.isVector() || I1->getNumOperands() < 2 || I2->getNumOperands() < 2)
    return false;

  auto IsAtLeastDoubleExtend = [&](Register R) {
    LLT Ty = MRI.getType(R);
    return EltSize >= Ty.getScalarSizeInBits() * 2;
  };

  // If the source operands were EXTENDED before, then {U/S}MULL can be used
  bool IsZExt1 =
      I1Opc == TargetOpcode::G_ZEXT || I1Opc == TargetOpcode::G_ANYEXT;
  bool IsZExt2 =
      I2Opc == TargetOpcode::G_ZEXT || I2Opc == TargetOpcode::G_ANYEXT;
  if (IsZExt1 && IsZExt2 && IsAtLeastDoubleExtend(I1->getOperand(1).getReg()) &&
      IsAtLeastDoubleExtend(I2->getOperand(1).getReg())) {
    get<0>(MatchInfo) = true;
    get<1>(MatchInfo) = I1->getOperand(1).getReg();
    get<2>(MatchInfo) = I2->getOperand(1).getReg();
    return true;
  }

  bool IsSExt1 =
      I1Opc == TargetOpcode::G_SEXT || I1Opc == TargetOpcode::G_ANYEXT;
  bool IsSExt2 =
      I2Opc == TargetOpcode::G_SEXT || I2Opc == TargetOpcode::G_ANYEXT;
  if (IsSExt1 && IsSExt2 && IsAtLeastDoubleExtend(I1->getOperand(1).getReg()) &&
      IsAtLeastDoubleExtend(I2->getOperand(1).getReg())) {
    get<0>(MatchInfo) = false;
    get<1>(MatchInfo) = I1->getOperand(1).getReg();
    get<2>(MatchInfo) = I2->getOperand(1).getReg();
    return true;
  }

  // Select UMULL if we can replace the other operand with an extend.
  APInt Mask = APInt::getHighBitsSet(EltSize, EltSize / 2);
  if (KB && (IsZExt1 || IsZExt2) &&
      IsAtLeastDoubleExtend(IsZExt1 ? I1->getOperand(1).getReg()
                                    : I2->getOperand(1).getReg())) {
    Register ZExtOp =
        IsZExt1 ? MI.getOperand(2).getReg() : MI.getOperand(1).getReg();
    if (KB->maskedValueIsZero(ZExtOp, Mask)) {
      get<0>(MatchInfo) = true;
      get<1>(MatchInfo) = IsZExt1 ? I1->getOperand(1).getReg() : ZExtOp;
      get<2>(MatchInfo) = IsZExt1 ? ZExtOp : I2->getOperand(1).getReg();
      return true;
    }
  } else if (KB && DstTy == LLT::fixed_vector(2, 64) &&
             KB->maskedValueIsZero(MI.getOperand(1).getReg(), Mask) &&
             KB->maskedValueIsZero(MI.getOperand(2).getReg(), Mask)) {
    get<0>(MatchInfo) = true;
    get<1>(MatchInfo) = MI.getOperand(1).getReg();
    get<2>(MatchInfo) = MI.getOperand(2).getReg();
    return true;
  }

  if (KB && (IsSExt1 || IsSExt2) &&
      IsAtLeastDoubleExtend(IsSExt1 ? I1->getOperand(1).getReg()
                                    : I2->getOperand(1).getReg())) {
    Register SExtOp =
        IsSExt1 ? MI.getOperand(2).getReg() : MI.getOperand(1).getReg();
    if (KB->computeNumSignBits(SExtOp) > EltSize / 2) {
      get<0>(MatchInfo) = false;
      get<1>(MatchInfo) = IsSExt1 ? I1->getOperand(1).getReg() : SExtOp;
      get<2>(MatchInfo) = IsSExt1 ? SExtOp : I2->getOperand(1).getReg();
      return true;
    }
  } else if (KB && DstTy == LLT::fixed_vector(2, 64) &&
             KB->computeNumSignBits(MI.getOperand(1).getReg()) > EltSize / 2 &&
             KB->computeNumSignBits(MI.getOperand(2).getReg()) > EltSize / 2) {
    get<0>(MatchInfo) = false;
    get<1>(MatchInfo) = MI.getOperand(1).getReg();
    get<2>(MatchInfo) = MI.getOperand(2).getReg();
    return true;
  }

  return false;
}

void applyExtMulToMULL(MachineInstr &MI, MachineRegisterInfo &MRI,
                       MachineIRBuilder &B, GISelChangeObserver &Observer,
                       std::tuple<bool, Register, Register> &MatchInfo) {
  assert(MI.getOpcode() == TargetOpcode::G_MUL &&
         "Expected a G_MUL instruction");

  // Get the instructions that defined the source operand
  LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
  bool IsZExt = get<0>(MatchInfo);
  Register Src1Reg = get<1>(MatchInfo);
  Register Src2Reg = get<2>(MatchInfo);
  LLT Src1Ty = MRI.getType(Src1Reg);
  LLT Src2Ty = MRI.getType(Src2Reg);
  LLT HalfDstTy = DstTy.changeElementSize(DstTy.getScalarSizeInBits() / 2);
  unsigned ExtOpc = IsZExt ? TargetOpcode::G_ZEXT : TargetOpcode::G_SEXT;

  if (Src1Ty.getScalarSizeInBits() * 2 != DstTy.getScalarSizeInBits())
    Src1Reg = B.buildExtOrTrunc(ExtOpc, {HalfDstTy}, {Src1Reg}).getReg(0);
  if (Src2Ty.getScalarSizeInBits() * 2 != DstTy.getScalarSizeInBits())
    Src2Reg = B.buildExtOrTrunc(ExtOpc, {HalfDstTy}, {Src2Reg}).getReg(0);

  B.buildInstr(IsZExt ? AArch64::G_UMULL : AArch64::G_SMULL,
               {MI.getOperand(0).getReg()}, {Src1Reg, Src2Reg});
  MI.eraseFromParent();
}

class AArch64PostLegalizerCombinerImpl : public Combiner {
protected:
  const CombinerHelper Helper;
  const AArch64PostLegalizerCombinerImplRuleConfig &RuleConfig;
  const AArch64Subtarget &STI;

public:
  AArch64PostLegalizerCombinerImpl(
      MachineFunction &MF, CombinerInfo &CInfo, const TargetPassConfig *TPC,
      GISelValueTracking &VT, GISelCSEInfo *CSEInfo,
      const AArch64PostLegalizerCombinerImplRuleConfig &RuleConfig,
      const AArch64Subtarget &STI, MachineDominatorTree *MDT,
      const LegalizerInfo *LI);

  static const char *getName() { return "AArch64PostLegalizerCombiner"; }

  bool tryCombineAll(MachineInstr &I) const override;

private:
#define GET_GICOMBINER_CLASS_MEMBERS
#include "AArch64GenPostLegalizeGICombiner.inc"
#undef GET_GICOMBINER_CLASS_MEMBERS
};

#define GET_GICOMBINER_IMPL
#include "AArch64GenPostLegalizeGICombiner.inc"
#undef GET_GICOMBINER_IMPL

AArch64PostLegalizerCombinerImpl::AArch64PostLegalizerCombinerImpl(
    MachineFunction &MF, CombinerInfo &CInfo, const TargetPassConfig *TPC,
    GISelValueTracking &VT, GISelCSEInfo *CSEInfo,
    const AArch64PostLegalizerCombinerImplRuleConfig &RuleConfig,
    const AArch64Subtarget &STI, MachineDominatorTree *MDT,
    const LegalizerInfo *LI)
    : Combiner(MF, CInfo, TPC, &VT, CSEInfo),
      Helper(Observer, B, /*IsPreLegalize*/ false, &VT, MDT, LI),
      RuleConfig(RuleConfig), STI(STI),
#define GET_GICOMBINER_CONSTRUCTOR_INITS
#include "AArch64GenPostLegalizeGICombiner.inc"
#undef GET_GICOMBINER_CONSTRUCTOR_INITS
{
}

class AArch64PostLegalizerCombiner : public MachineFunctionPass {
public:
  static char ID;

  AArch64PostLegalizerCombiner(bool IsOptNone = false);

  StringRef getPassName() const override {
    return "AArch64PostLegalizerCombiner";
  }

  bool runOnMachineFunction(MachineFunction &MF) override;
  void getAnalysisUsage(AnalysisUsage &AU) const override;

private:
  bool IsOptNone;
  AArch64PostLegalizerCombinerImplRuleConfig RuleConfig;

  struct StoreInfo {
    GStore *St = nullptr;
    // The G_PTR_ADD that's used by the store. We keep this to cache the
    // MachineInstr def.
    GPtrAdd *Ptr = nullptr;
    // The signed offset to the Ptr instruction.
    int64_t Offset = 0;
    LLT StoredType;
  };
  bool tryOptimizeConsecStores(SmallVectorImpl<StoreInfo> &Stores,
                               CSEMIRBuilder &MIB);

  bool optimizeConsecutiveMemOpAddressing(MachineFunction &MF,
                                          CSEMIRBuilder &MIB);
};
} // end anonymous namespace

void AArch64PostLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<TargetPassConfig>();
  AU.setPreservesCFG();
  getSelectionDAGFallbackAnalysisUsage(AU);
  AU.addRequired<GISelValueTrackingAnalysisLegacy>();
  AU.addPreserved<GISelValueTrackingAnalysisLegacy>();
  if (!IsOptNone) {
    AU.addRequired<MachineDominatorTreeWrapperPass>();
    AU.addPreserved<MachineDominatorTreeWrapperPass>();
    AU.addRequired<GISelCSEAnalysisWrapperPass>();
    AU.addPreserved<GISelCSEAnalysisWrapperPass>();
  }
  MachineFunctionPass::getAnalysisUsage(AU);
}

AArch64PostLegalizerCombiner::AArch64PostLegalizerCombiner(bool IsOptNone)
    : MachineFunctionPass(ID), IsOptNone(IsOptNone) {
  if (!RuleConfig.parseCommandLineOption())
    report_fatal_error("Invalid rule identifier");
}

bool AArch64PostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
  if (MF.getProperties().hasFailedISel())
    return false;
  assert(MF.getProperties().hasLegalized() && "Expected a legalized function?");
  auto *TPC = &getAnalysis<TargetPassConfig>();
  const Function &F = MF.getFunction();
  bool EnableOpt =
      MF.getTarget().getOptLevel() != CodeGenOptLevel::None && !skipFunction(F);

  const AArch64Subtarget &ST = MF.getSubtarget<AArch64Subtarget>();
  const auto *LI = ST.getLegalizerInfo();

  GISelValueTracking *VT =
      &getAnalysis<GISelValueTrackingAnalysisLegacy>().get(MF);
  MachineDominatorTree *MDT =
      IsOptNone ? nullptr
                : &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
  GISelCSEAnalysisWrapper &Wrapper =
      getAnalysis<GISelCSEAnalysisWrapperPass>().getCSEWrapper();
  auto *CSEInfo = &Wrapper.get(TPC->getCSEConfig());

  CombinerInfo CInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false,
                     /*LegalizerInfo*/ nullptr, EnableOpt, F.hasOptSize(),
                     F.hasMinSize());
  // Disable fixed-point iteration to reduce compile-time.
  CInfo.MaxIterations = 1;
  CInfo.ObserverLvl = CombinerInfo::ObserverLevel::SinglePass;
  // Legalizer performs DCE, so a full DCE pass is unnecessary.
  CInfo.EnableFullDCE = false;
  AArch64PostLegalizerCombinerImpl Impl(MF, CInfo, TPC, *VT, CSEInfo,
                                        RuleConfig, ST, MDT, LI);
  bool Changed = Impl.combineMachineInstrs();

  auto MIB = CSEMIRBuilder(MF);
  MIB.setCSEInfo(CSEInfo);
  Changed |= optimizeConsecutiveMemOpAddressing(MF, MIB);
  return Changed;
}

bool AArch64PostLegalizerCombiner::tryOptimizeConsecStores(
    SmallVectorImpl<StoreInfo> &Stores, CSEMIRBuilder &MIB) {
  if (Stores.size() <= 2)
    return false;

  // Profitability checks:
  int64_t BaseOffset = Stores[0].Offset;
  unsigned NumPairsExpected = Stores.size() / 2;
  unsigned TotalInstsExpected = NumPairsExpected + (Stores.size() % 2);
  // Size savings will depend on whether we can fold the offset, as an
  // immediate of an ADD.
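  // E.g. four consecutive stores should become two STPs plus at most one ADD
  // to materialize the new base; we only go ahead when that total is smaller
  // than the number of stores we started with.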
  auto &TLI = *MIB.getMF().getSubtarget().getTargetLowering();
  if (!TLI.isLegalAddImmediate(BaseOffset))
    TotalInstsExpected++;
  int SavingsExpected = Stores.size() - TotalInstsExpected;
  if (SavingsExpected <= 0)
    return false;

  auto &MRI = MIB.getMF().getRegInfo();

  // We have a series of consecutive stores. Factor out the common base
  // pointer and rewrite the offsets.
  Register NewBase = Stores[0].Ptr->getReg(0);
  for (auto &SInfo : Stores) {
    // Compute a new pointer with the new base ptr and adjusted offset.
    MIB.setInstrAndDebugLoc(*SInfo.St);
    auto NewOff = MIB.buildConstant(LLT::scalar(64), SInfo.Offset - BaseOffset);
    auto NewPtr = MIB.buildPtrAdd(MRI.getType(SInfo.St->getPointerReg()),
                                  NewBase, NewOff);
    if (MIB.getObserver())
      MIB.getObserver()->changingInstr(*SInfo.St);
    SInfo.St->getOperand(1).setReg(NewPtr.getReg(0));
    if (MIB.getObserver())
      MIB.getObserver()->changedInstr(*SInfo.St);
  }
  LLVM_DEBUG(dbgs() << "Split a series of " << Stores.size()
                    << " stores into a base pointer and offsets.\n");
  return true;
}

static cl::opt<bool>
    EnableConsecutiveMemOpOpt("aarch64-postlegalizer-consecutive-memops",
                              cl::init(true), cl::Hidden,
                              cl::desc("Enable consecutive memop optimization "
                                       "in AArch64PostLegalizerCombiner"));

bool AArch64PostLegalizerCombiner::optimizeConsecutiveMemOpAddressing(
    MachineFunction &MF, CSEMIRBuilder &MIB) {
  // This combine needs to run after all reassociations/folds on pointer
  // addressing have been done, specifically those that combine two G_PTR_ADDs
  // with constant offsets into a single G_PTR_ADD with a combined offset.
  // The goal of this optimization is to undo that combine in the case where
  // doing so has prevented the formation of pair stores due to illegal
  // addressing modes of STP. The reason that we do it here is that it's much
  // easier to undo the transformation of a series of consecutive mem ops than
  // it is to detect when doing it would be a bad idea looking at a single
  // G_PTR_ADD in the reassociation/ptradd_immed_chain combine.
  //
  // An example:
  //   G_STORE %11:_(<2 x s64>), %base:_(p0) :: (store (<2 x s64>), align 1)
  //   %off1:_(s64) = G_CONSTANT i64 4128
  //   %p1:_(p0) = G_PTR_ADD %base:_, %off1:_(s64)
  //   G_STORE %11:_(<2 x s64>), %p1:_(p0) :: (store (<2 x s64>), align 1)
  //   %off2:_(s64) = G_CONSTANT i64 4144
  //   %p2:_(p0) = G_PTR_ADD %base:_, %off2:_(s64)
  //   G_STORE %11:_(<2 x s64>), %p2:_(p0) :: (store (<2 x s64>), align 1)
  //   %off3:_(s64) = G_CONSTANT i64 4160
  //   %p3:_(p0) = G_PTR_ADD %base:_, %off3:_(s64)
  //   G_STORE %11:_(<2 x s64>), %p3:_(p0) :: (store (<2 x s64>), align 1)
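  //
  // After the rewrite in tryOptimizeConsecStores, %p1 is kept as the common
  // base and the three offset stores address it with small offsets (0, 16 and
  // 32 here) that fit STP's immediate range, so store pairing can succeed.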
  bool Changed = false;
  auto &MRI = MF.getRegInfo();

  if (!EnableConsecutiveMemOpOpt)
    return Changed;

  SmallVector<StoreInfo, 8> Stores;
  // If we see a load, then we keep track of any values defined by it.
  // In the following example, STP formation will fail anyway because
  // the latter store is using a load result that appears after the
  // prior store. In this situation if we factor out the offset then
  // we increase code size for no benefit.
  //   G_STORE %v1:_(s64), %base:_(p0) :: (store (s64))
  //   %v2:_(s64) = G_LOAD %ldptr:_(p0) :: (load (s64))
  //   G_STORE %v2:_(s64), %base:_(p0) :: (store (s64))
  SmallVector<Register> LoadValsSinceLastStore;

  auto storeIsValid = [&](StoreInfo &Last, StoreInfo New) {
    // Check if this store is consecutive to the last one.
    if (Last.Ptr->getBaseReg() != New.Ptr->getBaseReg() ||
        (Last.Offset + static_cast<int64_t>(Last.StoredType.getSizeInBytes()) !=
         New.Offset) ||
        Last.StoredType != New.StoredType)
      return false;

    // Check if this store is using a load result that appears after the
    // last store. If so, bail out.
    if (any_of(LoadValsSinceLastStore, [&](Register LoadVal) {
          return New.St->getValueReg() == LoadVal;
        }))
      return false;

    // Check if the current offset would be too large for STP.
    // If not, then STP formation should be able to handle it, so we don't
    // need to do anything.
    int64_t MaxLegalOffset;
    switch (New.StoredType.getSizeInBits()) {
    case 32:
      MaxLegalOffset = 252;
      break;
    case 64:
      MaxLegalOffset = 504;
      break;
    case 128:
      MaxLegalOffset = 1008;
      break;
    default:
      llvm_unreachable("Unexpected stored type size");
    }
    if (New.Offset < MaxLegalOffset)
      return false;

    // If factoring it out still wouldn't help then don't bother.
    return New.Offset - Stores[0].Offset <= MaxLegalOffset;
  };

  auto resetState = [&]() {
    Stores.clear();
    LoadValsSinceLastStore.clear();
  };

  for (auto &MBB : MF) {
    // We're looking inside a single BB at a time since the memset pattern
    // should only be in a single block.
    resetState();
    for (auto &MI : MBB) {
      // Skip for scalable vectors
      if (auto *LdSt = dyn_cast<GLoadStore>(&MI);
          LdSt && MRI.getType(LdSt->getOperand(0).getReg()).isScalableVector())
        continue;

      if (auto *St = dyn_cast<GStore>(&MI)) {
        Register PtrBaseReg;
        APInt Offset;
        LLT StoredValTy = MRI.getType(St->getValueReg());
        unsigned ValSize = StoredValTy.getSizeInBits();
        if (ValSize < 32 || St->getMMO().getSizeInBits() != ValSize)
          continue;

        Register PtrReg = St->getPointerReg();
        if (mi_match(
                PtrReg, MRI,
                m_OneNonDBGUse(m_GPtrAdd(m_Reg(PtrBaseReg), m_ICst(Offset))))) {
          GPtrAdd *PtrAdd = cast<GPtrAdd>(MRI.getVRegDef(PtrReg));
          StoreInfo New = {St, PtrAdd, Offset.getSExtValue(), StoredValTy};

          if (Stores.empty()) {
            Stores.push_back(New);
            continue;
          }

          // Check if this store is a valid continuation of the sequence.
          auto &Last = Stores.back();
          if (storeIsValid(Last, New)) {
            Stores.push_back(New);
            LoadValsSinceLastStore.clear(); // Reset the load value tracking.
          } else {
            // The store isn't valid to consider for the prior sequence,
            // so try to optimize what we have so far and start a new sequence.
            Changed |= tryOptimizeConsecStores(Stores, MIB);
            resetState();
            Stores.push_back(New);
          }
        }
      } else if (auto *Ld = dyn_cast<GLoad>(&MI)) {
        LoadValsSinceLastStore.push_back(Ld->getDstReg());
      }
    }
    Changed |= tryOptimizeConsecStores(Stores, MIB);
    resetState();
  }

  return Changed;
}

char AArch64PostLegalizerCombiner::ID = 0;
INITIALIZE_PASS_BEGIN(AArch64PostLegalizerCombiner, DEBUG_TYPE,
                      "Combine AArch64 MachineInstrs after legalization", false,
                      false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(GISelValueTrackingAnalysisLegacy)
INITIALIZE_PASS_END(AArch64PostLegalizerCombiner, DEBUG_TYPE,
                    "Combine AArch64 MachineInstrs after legalization", false,
                    false)

namespace llvm {
FunctionPass *createAArch64PostLegalizerCombiner(bool IsOptNone) {
  return new AArch64PostLegalizerCombiner(IsOptNone);
}
} // end namespace llvm