//=== lib/CodeGen/GlobalISel/AArch64PreLegalizerCombiner.cpp --------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass does combining of machine instructions at the generic MI level,
// before the legalizer.
//
//===----------------------------------------------------------------------===//

#include "AArch64GlobalISelUtils.h"
#include "AArch64TargetMachine.h"
#include "llvm/CodeGen/GlobalISel/CSEInfo.h"
#include "llvm/CodeGen/GlobalISel/Combiner.h"
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
#include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Support/Debug.h"

#define GET_GICOMBINER_DEPS
#include "AArch64GenPreLegalizeGICombiner.inc"
#undef GET_GICOMBINER_DEPS

#define DEBUG_TYPE "aarch64-prelegalizer-combiner"

using namespace llvm;
using namespace MIPatternMatch;

namespace {

#define GET_GICOMBINER_TYPES
#include "AArch64GenPreLegalizeGICombiner.inc"
#undef GET_GICOMBINER_TYPES

/// Return true if a G_FCONSTANT instruction is known to be better-represented
/// as a G_CONSTANT.
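///
/// Illustrative sketch only (register names and the constant are invented): a
/// float constant whose only uses are stores, such as
///
///   %cst:_(s32) = G_FCONSTANT float 1.0
///   G_STORE %cst(s32), %ptr(p0) :: (store (s32))
///
/// can instead be materialized as the equivalent integer bit pattern
///
///   %cst:_(s32) = G_CONSTANT i32 1065353216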
bool matchFConstantToConstant(MachineInstr &MI, MachineRegisterInfo &MRI) {
  assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT);
  Register DstReg = MI.getOperand(0).getReg();
  const unsigned DstSize = MRI.getType(DstReg).getSizeInBits();
  if (DstSize != 32 && DstSize != 64)
    return false;

  // When we're storing a value, it doesn't matter what register bank it's on.
  // Since not all floating point constants can be materialized using a fmov,
  // it makes more sense to just use a GPR.
  return all_of(MRI.use_nodbg_instructions(DstReg),
                [](const MachineInstr &Use) { return Use.mayStore(); });
}

/// Change a G_FCONSTANT into a G_CONSTANT.
void applyFConstantToConstant(MachineInstr &MI) {
  assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT);
  MachineIRBuilder MIB(MI);
  const APFloat &ImmValAPF = MI.getOperand(1).getFPImm()->getValueAPF();
  MIB.buildConstant(MI.getOperand(0).getReg(), ImmValAPF.bitcastToAPInt());
  MI.eraseFromParent();
}

/// Try to match a G_ICMP of a G_TRUNC with zero, in which the truncated bits
/// are sign bits. In this case, we can transform the G_ICMP to directly compare
/// the wide value with a zero.
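///
/// Roughly (illustrative MIR; register names are invented):
///
///   %zero:_(s8) = G_CONSTANT i8 0
///   %trunc:_(s8) = G_TRUNC %wide(s32)
///   %cmp:_(s1) = G_ICMP intpred(eq), %trunc(s8), %zero
///
/// becomes, when the 24 truncated-away bits of %wide are all sign bits,
///
///   %zero32:_(s32) = G_CONSTANT i32 0
///   %cmp:_(s1) = G_ICMP intpred(eq), %wide(s32), %zero32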
bool matchICmpRedundantTrunc(MachineInstr &MI, MachineRegisterInfo &MRI,
                             GISelKnownBits *KB, Register &MatchInfo) {
  assert(MI.getOpcode() == TargetOpcode::G_ICMP && KB);

  auto Pred = (CmpInst::Predicate)MI.getOperand(1).getPredicate();
  if (!ICmpInst::isEquality(Pred))
    return false;

  Register LHS = MI.getOperand(2).getReg();
  LLT LHSTy = MRI.getType(LHS);
  if (!LHSTy.isScalar())
    return false;

  Register RHS = MI.getOperand(3).getReg();
  Register WideReg;

  if (!mi_match(LHS, MRI, m_GTrunc(m_Reg(WideReg))) ||
      !mi_match(RHS, MRI, m_SpecificICst(0)))
    return false;

  LLT WideTy = MRI.getType(WideReg);
  if (KB->computeNumSignBits(WideReg) <=
      WideTy.getSizeInBits() - LHSTy.getSizeInBits())
    return false;

  MatchInfo = WideReg;
  return true;
}

void applyICmpRedundantTrunc(MachineInstr &MI, MachineRegisterInfo &MRI,
                             MachineIRBuilder &Builder,
                             GISelChangeObserver &Observer, Register &WideReg) {
  assert(MI.getOpcode() == TargetOpcode::G_ICMP);

  LLT WideTy = MRI.getType(WideReg);
  // We're going to directly use the wide register as the LHS, and then use an
  // equivalent size zero for RHS.
  Builder.setInstrAndDebugLoc(MI);
  auto WideZero = Builder.buildConstant(WideTy, 0);
  Observer.changingInstr(MI);
  MI.getOperand(2).setReg(WideReg);
  MI.getOperand(3).setReg(WideZero.getReg(0));
  Observer.changedInstr(MI);
}

/// \returns true if it is possible to fold a constant into a G_GLOBAL_VALUE.
///
/// e.g.
///
/// %g = G_GLOBAL_VALUE @x -> %g = G_GLOBAL_VALUE @x + cst
bool matchFoldGlobalOffset(MachineInstr &MI, MachineRegisterInfo &MRI,
                           std::pair<uint64_t, uint64_t> &MatchInfo) {
  assert(MI.getOpcode() == TargetOpcode::G_GLOBAL_VALUE);
  MachineFunction &MF = *MI.getMF();
  auto &GlobalOp = MI.getOperand(1);
  auto *GV = GlobalOp.getGlobal();
  if (GV->isThreadLocal())
    return false;

  // Don't allow anything that could represent offsets etc.
  if (MF.getSubtarget<AArch64Subtarget>().ClassifyGlobalReference(
          GV, MF.getTarget()) != AArch64II::MO_NO_FLAG)
    return false;

  // Look for a G_GLOBAL_VALUE only used by G_PTR_ADDs against constants:
  //
  // %g = G_GLOBAL_VALUE @x
  // %ptr1 = G_PTR_ADD %g, cst1
  // %ptr2 = G_PTR_ADD %g, cst2
  // ...
  // %ptrN = G_PTR_ADD %g, cstN
  //
  // Identify the *smallest* constant. We want to be able to form this:
  //
  // %offset_g = G_GLOBAL_VALUE @x + min_cst
  // %g = G_PTR_ADD %offset_g, -min_cst
  // %ptr1 = G_PTR_ADD %g, cst1
  // ...
  Register Dst = MI.getOperand(0).getReg();
  uint64_t MinOffset = -1ull;
  for (auto &UseInstr : MRI.use_nodbg_instructions(Dst)) {
    if (UseInstr.getOpcode() != TargetOpcode::G_PTR_ADD)
      return false;
    auto Cst = getIConstantVRegValWithLookThrough(
        UseInstr.getOperand(2).getReg(), MRI);
    if (!Cst)
      return false;
    MinOffset = std::min(MinOffset, Cst->Value.getZExtValue());
  }

  // Require that the new offset is larger than the existing one to avoid
  // infinite loops.
  uint64_t CurrOffset = GlobalOp.getOffset();
  uint64_t NewOffset = MinOffset + CurrOffset;
  if (NewOffset <= CurrOffset)
    return false;

  // Check whether folding this offset is legal. It must not go out of bounds of
  // the referenced object to avoid violating the code model, and must be
  // smaller than 2^20 because this is the largest offset expressible in all
  // object formats. (The IMAGE_REL_ARM64_PAGEBASE_REL21 relocation in COFF
  // stores an immediate signed 21 bit offset.)
  //
  // This check also prevents us from folding negative offsets, which will end
  // up being treated in the same way as large positive ones. They could also
  // cause code model violations, and aren't really common enough to matter.
  if (NewOffset >= (1 << 20))
    return false;

  Type *T = GV->getValueType();
  if (!T->isSized() ||
      NewOffset > GV->getDataLayout().getTypeAllocSize(T))
    return false;
  MatchInfo = std::make_pair(NewOffset, MinOffset);
  return true;
}

void applyFoldGlobalOffset(MachineInstr &MI, MachineRegisterInfo &MRI,
                           MachineIRBuilder &B, GISelChangeObserver &Observer,
                           std::pair<uint64_t, uint64_t> &MatchInfo) {
  // Change:
  //
  // %g = G_GLOBAL_VALUE @x
  // %ptr1 = G_PTR_ADD %g, cst1
  // %ptr2 = G_PTR_ADD %g, cst2
  // ...
  // %ptrN = G_PTR_ADD %g, cstN
  //
  // To:
  //
  // %offset_g = G_GLOBAL_VALUE @x + min_cst
  // %g = G_PTR_ADD %offset_g, -min_cst
  // %ptr1 = G_PTR_ADD %g, cst1
  // ...
  // %ptrN = G_PTR_ADD %g, cstN
  //
  // Then, the original G_PTR_ADDs should be folded later on so that they look
  // like this:
  //
  // %ptrN = G_PTR_ADD %offset_g, cstN - min_cst
  uint64_t Offset, MinOffset;
  std::tie(Offset, MinOffset) = MatchInfo;
  B.setInstrAndDebugLoc(*std::next(MI.getIterator()));
  Observer.changingInstr(MI);
  auto &GlobalOp = MI.getOperand(1);
  auto *GV = GlobalOp.getGlobal();
  GlobalOp.ChangeToGA(GV, Offset, GlobalOp.getTargetFlags());
  Register Dst = MI.getOperand(0).getReg();
  Register NewGVDst = MRI.cloneVirtualRegister(Dst);
  MI.getOperand(0).setReg(NewGVDst);
  Observer.changedInstr(MI);
  B.buildPtrAdd(
      Dst, NewGVDst,
      B.buildConstant(LLT::scalar(64), -static_cast<int64_t>(MinOffset)));
}

// Combines vecreduce_add(mul(ext(x), ext(y))) -> vecreduce_add(udot(x, y))
// Or vecreduce_add(ext(x)) -> vecreduce_add(udot(x, 1))
// Similar to performVecReduceAddCombine in SelectionDAG
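//
// Sketch of the first form (illustrative only; register names invented, and
// G_SDOT is used instead when the extends are signed):
//
//   %e1:_(<16 x s32>) = G_ZEXT %x(<16 x s8>)
//   %e2:_(<16 x s32>) = G_ZEXT %y(<16 x s8>)
//   %m:_(<16 x s32>) = G_MUL %e1, %e2
//   %sum:_(s32) = G_VECREDUCE_ADD %m(<16 x s32>)
// =>
//   %acc:_(<4 x s32>) = zero-vector accumulator
//   %dot:_(<4 x s32>) = G_UDOT %acc, %x, %y
//   %sum:_(s32) = G_VECREDUCE_ADD %dot(<4 x s32>)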
bool matchExtAddvToUdotAddv(MachineInstr &MI, MachineRegisterInfo &MRI,
                            const AArch64Subtarget &STI,
                            std::tuple<Register, Register, bool> &MatchInfo) {
  assert(MI.getOpcode() == TargetOpcode::G_VECREDUCE_ADD &&
         "Expected a G_VECREDUCE_ADD instruction");
  assert(STI.hasDotProd() && "Target should have Dot Product feature");

  MachineInstr *I1 = getDefIgnoringCopies(MI.getOperand(1).getReg(), MRI);
  Register DstReg = MI.getOperand(0).getReg();
  Register MidReg = I1->getOperand(0).getReg();
  LLT DstTy = MRI.getType(DstReg);
  LLT MidTy = MRI.getType(MidReg);
  if (DstTy.getScalarSizeInBits() != 32 || MidTy.getScalarSizeInBits() != 32)
    return false;

  LLT SrcTy;
  auto I1Opc = I1->getOpcode();
  if (I1Opc == TargetOpcode::G_MUL) {
    // If the result of the G_MUL has more than one use, then there is no point
    // in creating a udot instruction.
    if (!MRI.hasOneNonDBGUse(MidReg))
      return false;

    MachineInstr *ExtMI1 =
        getDefIgnoringCopies(I1->getOperand(1).getReg(), MRI);
    MachineInstr *ExtMI2 =
        getDefIgnoringCopies(I1->getOperand(2).getReg(), MRI);
    LLT Ext1DstTy = MRI.getType(ExtMI1->getOperand(0).getReg());
    LLT Ext2DstTy = MRI.getType(ExtMI2->getOperand(0).getReg());

    if (ExtMI1->getOpcode() != ExtMI2->getOpcode() || Ext1DstTy != Ext2DstTy)
      return false;
    I1Opc = ExtMI1->getOpcode();
    SrcTy = MRI.getType(ExtMI1->getOperand(1).getReg());
    std::get<0>(MatchInfo) = ExtMI1->getOperand(1).getReg();
    std::get<1>(MatchInfo) = ExtMI2->getOperand(1).getReg();
  } else {
    SrcTy = MRI.getType(I1->getOperand(1).getReg());
    std::get<0>(MatchInfo) = I1->getOperand(1).getReg();
    std::get<1>(MatchInfo) = 0;
  }

  if (I1Opc == TargetOpcode::G_ZEXT)
    std::get<2>(MatchInfo) = 0;
  else if (I1Opc == TargetOpcode::G_SEXT)
    std::get<2>(MatchInfo) = 1;
  else
    return false;

  if (SrcTy.getScalarSizeInBits() != 8 || SrcTy.getNumElements() % 8 != 0)
    return false;

  return true;
}

void applyExtAddvToUdotAddv(MachineInstr &MI, MachineRegisterInfo &MRI,
                            MachineIRBuilder &Builder,
                            GISelChangeObserver &Observer,
                            const AArch64Subtarget &STI,
                            std::tuple<Register, Register, bool> &MatchInfo) {
  assert(MI.getOpcode() == TargetOpcode::G_VECREDUCE_ADD &&
         "Expected a G_VECREDUCE_ADD instruction");
  assert(STI.hasDotProd() && "Target should have Dot Product feature");

  // Initialise the variables
  unsigned DotOpcode =
      std::get<2>(MatchInfo) ? AArch64::G_SDOT : AArch64::G_UDOT;
  Register Ext1SrcReg = std::get<0>(MatchInfo);

  // If there is one source register, create a vector of 1s as the second
  // source register so that the dot product sums the elements of the first.
  Register Ext2SrcReg;
  if (std::get<1>(MatchInfo) == 0)
    Ext2SrcReg = Builder.buildConstant(MRI.getType(Ext1SrcReg), 1)
                     ->getOperand(0)
                     .getReg();
  else
    Ext2SrcReg = std::get<1>(MatchInfo);

  // Find out how many DOT instructions are needed
  LLT SrcTy = MRI.getType(Ext1SrcReg);
  LLT MidTy;
  unsigned NumOfDotMI;
  if (SrcTy.getNumElements() % 16 == 0) {
    NumOfDotMI = SrcTy.getNumElements() / 16;
    MidTy = LLT::fixed_vector(4, 32);
  } else if (SrcTy.getNumElements() % 8 == 0) {
    NumOfDotMI = SrcTy.getNumElements() / 8;
    MidTy = LLT::fixed_vector(2, 32);
  } else {
    llvm_unreachable("Source type number of elements is not multiple of 8");
  }

  // Handle case where one DOT instruction is needed
  if (NumOfDotMI == 1) {
    auto Zeroes = Builder.buildConstant(MidTy, 0)->getOperand(0).getReg();
    auto Dot = Builder.buildInstr(DotOpcode, {MidTy},
                                  {Zeroes, Ext1SrcReg, Ext2SrcReg});
    Builder.buildVecReduceAdd(MI.getOperand(0), Dot->getOperand(0));
  } else {
    // If not, pad the last v8 element with 0s to a v16
    SmallVector<Register, 4> Ext1UnmergeReg;
    SmallVector<Register, 4> Ext2UnmergeReg;
    if (SrcTy.getNumElements() % 16 != 0) {
      SmallVector<Register> Leftover1;
      SmallVector<Register> Leftover2;

      // Split the elements into v16i8 and v8i8
      LLT MainTy = LLT::fixed_vector(16, 8);
      LLT LeftoverTy1, LeftoverTy2;
      if ((!extractParts(Ext1SrcReg, MRI.getType(Ext1SrcReg), MainTy,
                         LeftoverTy1, Ext1UnmergeReg, Leftover1, Builder,
                         MRI)) ||
          (!extractParts(Ext2SrcReg, MRI.getType(Ext2SrcReg), MainTy,
                         LeftoverTy2, Ext2UnmergeReg, Leftover2, Builder,
                         MRI))) {
        llvm_unreachable("Unable to split this vector properly");
      }

      // Pad the leftover v8i8 vector with a register of 0s of type v8i8
      Register v8Zeroes = Builder.buildConstant(LLT::fixed_vector(8, 8), 0)
                              ->getOperand(0)
                              .getReg();

      Ext1UnmergeReg.push_back(
          Builder
              .buildMergeLikeInstr(LLT::fixed_vector(16, 8),
                                   {Leftover1[0], v8Zeroes})
              .getReg(0));
      Ext2UnmergeReg.push_back(
          Builder
              .buildMergeLikeInstr(LLT::fixed_vector(16, 8),
                                   {Leftover2[0], v8Zeroes})
              .getReg(0));

    } else {
      // Unmerge the source vectors to v16i8
      unsigned SrcNumElts = SrcTy.getNumElements();
      extractParts(Ext1SrcReg, LLT::fixed_vector(16, 8), SrcNumElts / 16,
                   Ext1UnmergeReg, Builder, MRI);
      extractParts(Ext2SrcReg, LLT::fixed_vector(16, 8), SrcNumElts / 16,
                   Ext2UnmergeReg, Builder, MRI);
    }

    // Build the UDOT instructions
    SmallVector<Register, 2> DotReg;
    unsigned NumElements = 0;
    for (unsigned i = 0; i < Ext1UnmergeReg.size(); i++) {
      LLT ZeroesLLT;
      // Check if it is 16 or 8 elements. Set Zeroes to the corresponding size.
      if (MRI.getType(Ext1UnmergeReg[i]).getNumElements() == 16) {
        ZeroesLLT = LLT::fixed_vector(4, 32);
        NumElements += 4;
      } else {
        ZeroesLLT = LLT::fixed_vector(2, 32);
        NumElements += 2;
      }
      auto Zeroes = Builder.buildConstant(ZeroesLLT, 0)->getOperand(0).getReg();
      DotReg.push_back(
          Builder
              .buildInstr(DotOpcode, {MRI.getType(Zeroes)},
                          {Zeroes, Ext1UnmergeReg[i], Ext2UnmergeReg[i]})
              .getReg(0));
    }

    // Merge the output
    auto ConcatMI =
        Builder.buildConcatVectors(LLT::fixed_vector(NumElements, 32), DotReg);

    // Put it through a vector reduction
    Builder.buildVecReduceAdd(MI.getOperand(0).getReg(),
                              ConcatMI->getOperand(0).getReg());
  }

  // Erase the dead instructions
  MI.eraseFromParent();
}

// Matches {U/S}ADDV(ext(x)) => {U/S}ADDLV(x)
// Ensure that the type coming from the extend instruction is the right size
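//
// e.g. (illustrative only; the exact types depend on the source vector):
//
//   %ext:_(<8 x s16>) = G_ZEXT %x(<8 x s8>)
//   %sum:_(s16) = G_VECREDUCE_ADD %ext(<8 x s16>)
// =>
//   %addlv:_(<4 x s32>) = G_UADDLV %x(<8 x s8>)
//   %sum = lane 0 of %addlv, truncated back to s16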
bool matchExtUaddvToUaddlv(MachineInstr &MI, MachineRegisterInfo &MRI,
                           std::pair<Register, bool> &MatchInfo) {
  assert(MI.getOpcode() == TargetOpcode::G_VECREDUCE_ADD &&
         "Expected G_VECREDUCE_ADD Opcode");

  // Check if the last instruction is an extend
  MachineInstr *ExtMI = getDefIgnoringCopies(MI.getOperand(1).getReg(), MRI);
  auto ExtOpc = ExtMI->getOpcode();

  if (ExtOpc == TargetOpcode::G_ZEXT)
    std::get<1>(MatchInfo) = 0;
  else if (ExtOpc == TargetOpcode::G_SEXT)
    std::get<1>(MatchInfo) = 1;
  else
    return false;

  // Check if the source register is a valid type
  Register ExtSrcReg = ExtMI->getOperand(1).getReg();
  LLT ExtSrcTy = MRI.getType(ExtSrcReg);
  LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
  if ((DstTy.getScalarSizeInBits() == 16 &&
       ExtSrcTy.getNumElements() % 8 == 0 && ExtSrcTy.getNumElements() < 256) ||
      (DstTy.getScalarSizeInBits() == 32 &&
       ExtSrcTy.getNumElements() % 4 == 0) ||
      (DstTy.getScalarSizeInBits() == 64 &&
       ExtSrcTy.getNumElements() % 4 == 0)) {
    std::get<0>(MatchInfo) = ExtSrcReg;
    return true;
  }
  return false;
}

void applyExtUaddvToUaddlv(MachineInstr &MI, MachineRegisterInfo &MRI,
                           MachineIRBuilder &B, GISelChangeObserver &Observer,
                           std::pair<Register, bool> &MatchInfo) {
  assert(MI.getOpcode() == TargetOpcode::G_VECREDUCE_ADD &&
         "Expected G_VECREDUCE_ADD Opcode");

  unsigned Opc = std::get<1>(MatchInfo) ? AArch64::G_SADDLV : AArch64::G_UADDLV;
  Register SrcReg = std::get<0>(MatchInfo);
  Register DstReg = MI.getOperand(0).getReg();
  LLT SrcTy = MRI.getType(SrcReg);
  LLT DstTy = MRI.getType(DstReg);

  // If SrcTy has more elements than expected, split them into multiple
  // instructions and sum the results
  LLT MainTy;
  SmallVector<Register, 1> WorkingRegisters;
  unsigned SrcScalSize = SrcTy.getScalarSizeInBits();
  unsigned SrcNumElem = SrcTy.getNumElements();
  if ((SrcScalSize == 8 && SrcNumElem > 16) ||
      (SrcScalSize == 16 && SrcNumElem > 8) ||
      (SrcScalSize == 32 && SrcNumElem > 4)) {

    LLT LeftoverTy;
    SmallVector<Register, 4> LeftoverRegs;
    if (SrcScalSize == 8)
      MainTy = LLT::fixed_vector(16, 8);
    else if (SrcScalSize == 16)
      MainTy = LLT::fixed_vector(8, 16);
    else if (SrcScalSize == 32)
      MainTy = LLT::fixed_vector(4, 32);
    else
      llvm_unreachable("Source's Scalar Size not supported");

    // Extract the parts, put each extracted source through U/SADDLV, and put
    // the values inside a small vec
    extractParts(SrcReg, SrcTy, MainTy, LeftoverTy, WorkingRegisters,
                 LeftoverRegs, B, MRI);
    for (unsigned I = 0; I < LeftoverRegs.size(); I++) {
      WorkingRegisters.push_back(LeftoverRegs[I]);
    }
  } else {
    WorkingRegisters.push_back(SrcReg);
    MainTy = SrcTy;
  }

  unsigned MidScalarSize = MainTy.getScalarSizeInBits() * 2;
  LLT MidScalarLLT = LLT::scalar(MidScalarSize);
  Register zeroReg = B.buildConstant(LLT::scalar(64), 0).getReg(0);
  for (unsigned I = 0; I < WorkingRegisters.size(); I++) {
    // If the number of elements is too small to build an instruction, extend
    // its size before applying addlv
    LLT WorkingRegTy = MRI.getType(WorkingRegisters[I]);
    if ((WorkingRegTy.getScalarSizeInBits() == 8) &&
        (WorkingRegTy.getNumElements() == 4)) {
      WorkingRegisters[I] =
          B.buildInstr(std::get<1>(MatchInfo) ? TargetOpcode::G_SEXT
                                              : TargetOpcode::G_ZEXT,
                       {LLT::fixed_vector(4, 16)}, {WorkingRegisters[I]})
              .getReg(0);
    }

    // Generate the {U/S}ADDLV instruction, whose output is always double the
    // Src's scalar size
    LLT addlvTy = MidScalarSize <= 32 ? LLT::fixed_vector(4, 32)
                                      : LLT::fixed_vector(2, 64);
    Register addlvReg =
        B.buildInstr(Opc, {addlvTy}, {WorkingRegisters[I]}).getReg(0);

    // The output from {U/S}ADDLV gets placed in the lowest lane of a v4i32 or
    // v2i64 register.
    //   i16 and i32 results use v4i32 registers
    //   i64 results use v2i64 registers
    // Therefore we have to extract/truncate the value to the right type
    if (MidScalarSize == 32 || MidScalarSize == 64) {
      WorkingRegisters[I] = B.buildInstr(AArch64::G_EXTRACT_VECTOR_ELT,
                                         {MidScalarLLT}, {addlvReg, zeroReg})
                                .getReg(0);
    } else {
      Register extractReg = B.buildInstr(AArch64::G_EXTRACT_VECTOR_ELT,
                                         {LLT::scalar(32)}, {addlvReg, zeroReg})
                                .getReg(0);
      WorkingRegisters[I] =
          B.buildTrunc({MidScalarLLT}, {extractReg}).getReg(0);
    }
  }

  Register outReg;
  if (WorkingRegisters.size() > 1) {
    outReg = B.buildAdd(MidScalarLLT, WorkingRegisters[0], WorkingRegisters[1])
                 .getReg(0);
    for (unsigned I = 2; I < WorkingRegisters.size(); I++) {
      outReg = B.buildAdd(MidScalarLLT, outReg, WorkingRegisters[I]).getReg(0);
    }
  } else {
    outReg = WorkingRegisters[0];
  }

  if (DstTy.getScalarSizeInBits() > MidScalarSize) {
    // Handle the scalar value if the DstTy's scalar size is more than double
    // the Src's scalar size
    B.buildInstr(std::get<1>(MatchInfo) ? TargetOpcode::G_SEXT
                                        : TargetOpcode::G_ZEXT,
                 {DstReg}, {outReg});
  } else {
    B.buildCopy(DstReg, outReg);
  }

  MI.eraseFromParent();
}

// Pushes ADD/SUB through extend instructions to decrease the number of extend
// instructions at the end by allowing selection of {s|u}addl sooner

// i32 add(i32 ext i8, i32 ext i8) => i32 ext(i16 add(i16 ext i8, i16 ext i8))
bool matchPushAddSubExt(MachineInstr &MI, MachineRegisterInfo &MRI,
                        Register DstReg, Register SrcReg1, Register SrcReg2) {
  assert((MI.getOpcode() == TargetOpcode::G_ADD ||
          MI.getOpcode() == TargetOpcode::G_SUB) &&
         "Expected a G_ADD or G_SUB instruction\n");

  // Deal with vector types only
  LLT DstTy = MRI.getType(DstReg);
  if (!DstTy.isVector())
    return false;

  // Return true if the G_{S|Z}EXT destination is more than twice the width of
  // its source
  Register ExtDstReg = MI.getOperand(1).getReg();
  LLT Ext1SrcTy = MRI.getType(SrcReg1);
  LLT Ext2SrcTy = MRI.getType(SrcReg2);
  unsigned ExtDstScal = MRI.getType(ExtDstReg).getScalarSizeInBits();
  unsigned Ext1SrcScal = Ext1SrcTy.getScalarSizeInBits();
  if (((Ext1SrcScal == 8 && ExtDstScal == 32) ||
       ((Ext1SrcScal == 8 || Ext1SrcScal == 16) && ExtDstScal == 64)) &&
      Ext1SrcTy == Ext2SrcTy)
    return true;

  return false;
}

void applyPushAddSubExt(MachineInstr &MI, MachineRegisterInfo &MRI,
                        MachineIRBuilder &B, bool isSExt, Register DstReg,
                        Register SrcReg1, Register SrcReg2) {
  LLT SrcTy = MRI.getType(SrcReg1);
  LLT MidTy = SrcTy.changeElementSize(SrcTy.getScalarSizeInBits() * 2);
  unsigned Opc = isSExt ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT;
  Register Ext1Reg = B.buildInstr(Opc, {MidTy}, {SrcReg1}).getReg(0);
  Register Ext2Reg = B.buildInstr(Opc, {MidTy}, {SrcReg2}).getReg(0);
  Register AddReg =
      B.buildInstr(MI.getOpcode(), {MidTy}, {Ext1Reg, Ext2Reg}).getReg(0);

  // G_SUB has to sign-extend the result.
  // G_ADD needs to sext from sext and can sext or zext from zext, so the
  // original opcode is used.
  if (MI.getOpcode() == TargetOpcode::G_ADD)
    B.buildInstr(Opc, {DstReg}, {AddReg});
  else
    B.buildSExt(DstReg, AddReg);

  MI.eraseFromParent();
}

bool tryToSimplifyUADDO(MachineInstr &MI, MachineIRBuilder &B,
                        CombinerHelper &Helper, GISelChangeObserver &Observer) {
  // Try to simplify a G_UADDO with 8 or 16 bit operands to a wide G_ADD and a
  // TBNZ if the result is only used in the no-overflow case. It is restricted
  // to cases where we know that the high bits of the operands are 0. If
  // there's an overflow, then the 9th or 17th bit must be set, which can be
  // checked using TBNZ.
  //
  // Change (for UADDOs on 8 and 16 bits):
  //
  // %z0 = G_ASSERT_ZEXT _
  // %op0 = G_TRUNC %z0
  // %z1 = G_ASSERT_ZEXT _
  // %op1 = G_TRUNC %z1
  // %val, %cond = G_UADDO %op0, %op1
  // G_BRCOND %cond, %error.bb
  //
  // error.bb:
  //   (no successors and no uses of %val)
  //
  // To:
  //
  // %z0 = G_ASSERT_ZEXT _
  // %z1 = G_ASSERT_ZEXT _
  // %add = G_ADD %z0, %z1
  // %val = G_TRUNC %add
  // %bit = G_AND %add, 1 << scalar-size-in-bits(%op1)
  // %cond = G_ICMP NE, %bit, 0
  // G_BRCOND %cond, %error.bb

  auto &MRI = *B.getMRI();

  MachineOperand *DefOp0 = MRI.getOneDef(MI.getOperand(2).getReg());
  MachineOperand *DefOp1 = MRI.getOneDef(MI.getOperand(3).getReg());
  Register Op0Wide;
  Register Op1Wide;
  if (!mi_match(DefOp0->getParent(), MRI, m_GTrunc(m_Reg(Op0Wide))) ||
      !mi_match(DefOp1->getParent(), MRI, m_GTrunc(m_Reg(Op1Wide))))
    return false;
  LLT WideTy0 = MRI.getType(Op0Wide);
  LLT WideTy1 = MRI.getType(Op1Wide);
  Register ResVal = MI.getOperand(0).getReg();
  LLT OpTy = MRI.getType(ResVal);
  MachineInstr *Op0WideDef = MRI.getVRegDef(Op0Wide);
  MachineInstr *Op1WideDef = MRI.getVRegDef(Op1Wide);

  unsigned OpTySize = OpTy.getScalarSizeInBits();
  // First check that the G_TRUNCs feeding the G_UADDO are no-ops, because the
  // inputs have been zero-extended.
  if (Op0WideDef->getOpcode() != TargetOpcode::G_ASSERT_ZEXT ||
      Op1WideDef->getOpcode() != TargetOpcode::G_ASSERT_ZEXT ||
      OpTySize != Op0WideDef->getOperand(2).getImm() ||
      OpTySize != Op1WideDef->getOperand(2).getImm())
    return false;

  // Only scalar UADDOs with either 8 or 16 bit operands are handled.
  if (!WideTy0.isScalar() || !WideTy1.isScalar() || WideTy0 != WideTy1 ||
      OpTySize >= WideTy0.getScalarSizeInBits() ||
      (OpTySize != 8 && OpTySize != 16))
    return false;

  // The overflow-status result must be used by a branch only.
  Register ResStatus = MI.getOperand(1).getReg();
  if (!MRI.hasOneNonDBGUse(ResStatus))
    return false;
  MachineInstr *CondUser = &*MRI.use_instr_nodbg_begin(ResStatus);
  if (CondUser->getOpcode() != TargetOpcode::G_BRCOND)
    return false;

  // Make sure the computed result is only used in the no-overflow blocks.
  MachineBasicBlock *CurrentMBB = MI.getParent();
  MachineBasicBlock *FailMBB = CondUser->getOperand(1).getMBB();
  if (!FailMBB->succ_empty() || CondUser->getParent() != CurrentMBB)
    return false;
  if (any_of(MRI.use_nodbg_instructions(ResVal),
             [&MI, FailMBB, CurrentMBB](MachineInstr &I) {
               return &MI != &I &&
                      (I.getParent() == FailMBB || I.getParent() == CurrentMBB);
             }))
    return false;

  // Remove the G_UADDO.
  B.setInstrAndDebugLoc(*MI.getNextNode());
  MI.eraseFromParent();

  // Emit wide add.
  Register AddDst = MRI.cloneVirtualRegister(Op0Wide);
  B.buildInstr(TargetOpcode::G_ADD, {AddDst}, {Op0Wide, Op1Wide});

  // Emit check of the 9th or 17th bit and update users (the branch). This will
  // later be folded to TBNZ.
  Register CondBit = MRI.cloneVirtualRegister(Op0Wide);
  B.buildAnd(
      CondBit, AddDst,
      B.buildConstant(LLT::scalar(32), OpTySize == 8 ? 1 << 8 : 1 << 16));
  B.buildICmp(CmpInst::ICMP_NE, ResStatus, CondBit,
              B.buildConstant(LLT::scalar(32), 0));

  // Update ZEXT users of the result value. Because all uses are in the
  // no-overflow case, we know that the top bits are 0 and we can ignore ZExts.
  B.buildZExtOrTrunc(ResVal, AddDst);
  for (MachineOperand &U : make_early_inc_range(MRI.use_operands(ResVal))) {
    Register WideReg;
    if (mi_match(U.getParent(), MRI, m_GZExt(m_Reg(WideReg)))) {
      auto OldR = U.getParent()->getOperand(0).getReg();
      Observer.erasingInstr(*U.getParent());
      U.getParent()->eraseFromParent();
      Helper.replaceRegWith(MRI, OldR, AddDst);
    }
  }

  return true;
}

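/// Pre-legalization combiner for AArch64: drives both the TableGen-generated
/// rule set and the handwritten combines defined above.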
class AArch64PreLegalizerCombinerImpl : public Combiner {
protected:
  // TODO: Make CombinerHelper methods const.
  mutable CombinerHelper Helper;
  const AArch64PreLegalizerCombinerImplRuleConfig &RuleConfig;
  const AArch64Subtarget &STI;

public:
  AArch64PreLegalizerCombinerImpl(
      MachineFunction &MF, CombinerInfo &CInfo, const TargetPassConfig *TPC,
      GISelKnownBits &KB, GISelCSEInfo *CSEInfo,
      const AArch64PreLegalizerCombinerImplRuleConfig &RuleConfig,
      const AArch64Subtarget &STI, MachineDominatorTree *MDT,
      const LegalizerInfo *LI);

  static const char *getName() { return "AArch64PreLegalizerCombiner"; }

  bool tryCombineAll(MachineInstr &I) const override;

  bool tryCombineAllImpl(MachineInstr &I) const;

private:
#define GET_GICOMBINER_CLASS_MEMBERS
#include "AArch64GenPreLegalizeGICombiner.inc"
#undef GET_GICOMBINER_CLASS_MEMBERS
};

#define GET_GICOMBINER_IMPL
#include "AArch64GenPreLegalizeGICombiner.inc"
#undef GET_GICOMBINER_IMPL

AArch64PreLegalizerCombinerImpl::AArch64PreLegalizerCombinerImpl(
    MachineFunction &MF, CombinerInfo &CInfo, const TargetPassConfig *TPC,
    GISelKnownBits &KB, GISelCSEInfo *CSEInfo,
    const AArch64PreLegalizerCombinerImplRuleConfig &RuleConfig,
    const AArch64Subtarget &STI, MachineDominatorTree *MDT,
    const LegalizerInfo *LI)
    : Combiner(MF, CInfo, TPC, &KB, CSEInfo),
      Helper(Observer, B, /*IsPreLegalize*/ true, &KB, MDT, LI),
      RuleConfig(RuleConfig), STI(STI),
#define GET_GICOMBINER_CONSTRUCTOR_INITS
#include "AArch64GenPreLegalizeGICombiner.inc"
#undef GET_GICOMBINER_CONSTRUCTOR_INITS
{
}

bool AArch64PreLegalizerCombinerImpl::tryCombineAll(MachineInstr &MI) const {
  if (tryCombineAllImpl(MI))
    return true;

  unsigned Opc = MI.getOpcode();
  switch (Opc) {
  case TargetOpcode::G_SHUFFLE_VECTOR:
    return Helper.tryCombineShuffleVector(MI);
  case TargetOpcode::G_UADDO:
    return tryToSimplifyUADDO(MI, B, Helper, Observer);
  case TargetOpcode::G_MEMCPY_INLINE:
    return Helper.tryEmitMemcpyInline(MI);
  case TargetOpcode::G_MEMCPY:
  case TargetOpcode::G_MEMMOVE:
  case TargetOpcode::G_MEMSET: {
    // If we're at -O0 set a maxlen of 32 to inline, otherwise let the other
    // heuristics decide.
    unsigned MaxLen = CInfo.EnableOpt ? 0 : 32;
    // Try to inline memcpy type calls if optimizations are enabled.
    if (Helper.tryCombineMemCpyFamily(MI, MaxLen))
      return true;
    if (Opc == TargetOpcode::G_MEMSET)
      return llvm::AArch64GISelUtils::tryEmitBZero(MI, B, CInfo.EnableMinSize);
    return false;
  }
  }

  return false;
}

// Pass boilerplate
// ================

class AArch64PreLegalizerCombiner : public MachineFunctionPass {
public:
  static char ID;

  AArch64PreLegalizerCombiner();

  StringRef getPassName() const override {
    return "AArch64PreLegalizerCombiner";
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override;

private:
  AArch64PreLegalizerCombinerImplRuleConfig RuleConfig;
};
} // end anonymous namespace

void AArch64PreLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<TargetPassConfig>();
  AU.setPreservesCFG();
  getSelectionDAGFallbackAnalysisUsage(AU);
  AU.addRequired<GISelKnownBitsAnalysis>();
  AU.addPreserved<GISelKnownBitsAnalysis>();
  AU.addRequired<MachineDominatorTreeWrapperPass>();
  AU.addPreserved<MachineDominatorTreeWrapperPass>();
  AU.addRequired<GISelCSEAnalysisWrapperPass>();
  AU.addPreserved<GISelCSEAnalysisWrapperPass>();
  MachineFunctionPass::getAnalysisUsage(AU);
}

AArch64PreLegalizerCombiner::AArch64PreLegalizerCombiner()
    : MachineFunctionPass(ID) {
  initializeAArch64PreLegalizerCombinerPass(*PassRegistry::getPassRegistry());

  if (!RuleConfig.parseCommandLineOption())
    report_fatal_error("Invalid rule identifier");
}

bool AArch64PreLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
  if (MF.getProperties().hasProperty(
          MachineFunctionProperties::Property::FailedISel))
    return false;
  auto &TPC = getAnalysis<TargetPassConfig>();

  // Enable CSE.
  GISelCSEAnalysisWrapper &Wrapper =
      getAnalysis<GISelCSEAnalysisWrapperPass>().getCSEWrapper();
  auto *CSEInfo = &Wrapper.get(TPC.getCSEConfig());

  const AArch64Subtarget &ST = MF.getSubtarget<AArch64Subtarget>();
  const auto *LI = ST.getLegalizerInfo();

  const Function &F = MF.getFunction();
  bool EnableOpt =
      MF.getTarget().getOptLevel() != CodeGenOptLevel::None && !skipFunction(F);
  GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
  MachineDominatorTree *MDT =
      &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
  CombinerInfo CInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false,
                     /*LegalizerInfo*/ nullptr, EnableOpt, F.hasOptSize(),
                     F.hasMinSize());
  AArch64PreLegalizerCombinerImpl Impl(MF, CInfo, &TPC, *KB, CSEInfo,
                                       RuleConfig, ST, MDT, LI);
  return Impl.combineMachineInstrs();
}

char AArch64PreLegalizerCombiner::ID = 0;
INITIALIZE_PASS_BEGIN(AArch64PreLegalizerCombiner, DEBUG_TYPE,
                      "Combine AArch64 machine instrs before legalization",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis)
INITIALIZE_PASS_DEPENDENCY(GISelCSEAnalysisWrapperPass)
INITIALIZE_PASS_END(AArch64PreLegalizerCombiner, DEBUG_TYPE,
                    "Combine AArch64 machine instrs before legalization", false,
                    false)

namespace llvm {
FunctionPass *createAArch64PreLegalizerCombiner() {
  return new AArch64PreLegalizerCombiner();
}
} // end namespace llvm