//=== lib/CodeGen/GlobalISel/AArch64PreLegalizerCombiner.cpp --------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass does combining of machine instructions at the generic MI level,
// before the legalizer.
//
//===----------------------------------------------------------------------===//

#include "AArch64GlobalISelUtils.h"
#include "AArch64TargetMachine.h"
#include "llvm/CodeGen/GlobalISel/CSEInfo.h"
#include "llvm/CodeGen/GlobalISel/Combiner.h"
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
#include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h"
#include "llvm/CodeGen/GlobalISel/GISelValueTracking.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Instructions.h"

#define GET_GICOMBINER_DEPS
#include "AArch64GenPreLegalizeGICombiner.inc"
#undef GET_GICOMBINER_DEPS

#define DEBUG_TYPE "aarch64-prelegalizer-combiner"

using namespace llvm;
using namespace MIPatternMatch;

namespace {

#define GET_GICOMBINER_TYPES
#include "AArch64GenPreLegalizeGICombiner.inc"
#undef GET_GICOMBINER_TYPES

/// Try to match a G_ICMP of a G_TRUNC with zero, in which the truncated bits
/// are sign bits. In this case, we can transform the G_ICMP to directly compare
/// the wide value with a zero.
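///
/// A minimal sketch of the transform in generic MIR (illustrative only; the
/// register names are made up and an s32 wide value is assumed):
///
///   %t:_(s8) = G_TRUNC %wide:_(s32)
///   %zero:_(s8) = G_CONSTANT i8 0
///   %cmp:_(s1) = G_ICMP intpred(eq), %t(s8), %zero
/// ->
///   %zero32:_(s32) = G_CONSTANT i32 0
///   %cmp:_(s1) = G_ICMP intpred(eq), %wide(s32), %zero32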
bool matchICmpRedundantTrunc(MachineInstr &MI, MachineRegisterInfo &MRI,
                             GISelValueTracking *VT, Register &MatchInfo) {
  assert(MI.getOpcode() == TargetOpcode::G_ICMP && VT);

  auto Pred = (CmpInst::Predicate)MI.getOperand(1).getPredicate();
  if (!ICmpInst::isEquality(Pred))
    return false;

  Register LHS = MI.getOperand(2).getReg();
  LLT LHSTy = MRI.getType(LHS);
  if (!LHSTy.isScalar())
    return false;

  Register RHS = MI.getOperand(3).getReg();
  Register WideReg;

  if (!mi_match(LHS, MRI, m_GTrunc(m_Reg(WideReg))) ||
      !mi_match(RHS, MRI, m_SpecificICst(0)))
    return false;

  LLT WideTy = MRI.getType(WideReg);
  if (VT->computeNumSignBits(WideReg) <=
      WideTy.getSizeInBits() - LHSTy.getSizeInBits())
    return false;

  MatchInfo = WideReg;
  return true;
}

void applyICmpRedundantTrunc(MachineInstr &MI, MachineRegisterInfo &MRI,
                             MachineIRBuilder &Builder,
                             GISelChangeObserver &Observer, Register &WideReg) {
  assert(MI.getOpcode() == TargetOpcode::G_ICMP);

  LLT WideTy = MRI.getType(WideReg);
  // We're going to directly use the wide register as the LHS, and then use an
  // equivalent size zero for RHS.
  Builder.setInstrAndDebugLoc(MI);
  auto WideZero = Builder.buildConstant(WideTy, 0);
  Observer.changingInstr(MI);
  MI.getOperand(2).setReg(WideReg);
  MI.getOperand(3).setReg(WideZero.getReg(0));
  Observer.changedInstr(MI);
}

/// \returns true if it is possible to fold a constant into a G_GLOBAL_VALUE.
///
/// e.g.
///
/// %g = G_GLOBAL_VALUE @x -> %g = G_GLOBAL_VALUE @x + cst
bool matchFoldGlobalOffset(MachineInstr &MI, MachineRegisterInfo &MRI,
                           std::pair<uint64_t, uint64_t> &MatchInfo) {
  assert(MI.getOpcode() == TargetOpcode::G_GLOBAL_VALUE);
  MachineFunction &MF = *MI.getMF();
  auto &GlobalOp = MI.getOperand(1);
  auto *GV = GlobalOp.getGlobal();
  if (GV->isThreadLocal())
    return false;

  // Don't allow anything that could represent offsets etc.
  if (MF.getSubtarget<AArch64Subtarget>().ClassifyGlobalReference(
          GV, MF.getTarget()) != AArch64II::MO_NO_FLAG)
    return false;

  // Look for a G_GLOBAL_VALUE only used by G_PTR_ADDs against constants:
  //
  //  %g = G_GLOBAL_VALUE @x
  //  %ptr1 = G_PTR_ADD %g, cst1
  //  %ptr2 = G_PTR_ADD %g, cst2
  //  ...
  //  %ptrN = G_PTR_ADD %g, cstN
  //
  // Identify the *smallest* constant. We want to be able to form this:
  //
  //  %offset_g = G_GLOBAL_VALUE @x + min_cst
  //  %g = G_PTR_ADD %offset_g, -min_cst
  //  %ptr1 = G_PTR_ADD %g, cst1
  //  ...
  Register Dst = MI.getOperand(0).getReg();
  uint64_t MinOffset = -1ull;
  for (auto &UseInstr : MRI.use_nodbg_instructions(Dst)) {
    if (UseInstr.getOpcode() != TargetOpcode::G_PTR_ADD)
      return false;
    auto Cst = getIConstantVRegValWithLookThrough(
        UseInstr.getOperand(2).getReg(), MRI);
    if (!Cst)
      return false;
    MinOffset = std::min(MinOffset, Cst->Value.getZExtValue());
  }

  // Require that the new offset is larger than the existing one to avoid
  // infinite loops.
  uint64_t CurrOffset = GlobalOp.getOffset();
  uint64_t NewOffset = MinOffset + CurrOffset;
  if (NewOffset <= CurrOffset)
    return false;

  // Check whether folding this offset is legal. It must not go out of bounds of
  // the referenced object to avoid violating the code model, and must be
  // smaller than 2^20 because this is the largest offset expressible in all
  // object formats. (The IMAGE_REL_ARM64_PAGEBASE_REL21 relocation in COFF
  // stores an immediate signed 21 bit offset.)
  //
  // This check also prevents us from folding negative offsets, which will end
  // up being treated in the same way as large positive ones. They could also
  // cause code model violations, and aren't really common enough to matter.
  if (NewOffset >= (1 << 20))
    return false;

  Type *T = GV->getValueType();
  if (!T->isSized() ||
      NewOffset > GV->getDataLayout().getTypeAllocSize(T))
    return false;
  MatchInfo = std::make_pair(NewOffset, MinOffset);
  return true;
}

void applyFoldGlobalOffset(MachineInstr &MI, MachineRegisterInfo &MRI,
                           MachineIRBuilder &B, GISelChangeObserver &Observer,
                           std::pair<uint64_t, uint64_t> &MatchInfo) {
  // Change:
  //
  //  %g = G_GLOBAL_VALUE @x
  //  %ptr1 = G_PTR_ADD %g, cst1
  //  %ptr2 = G_PTR_ADD %g, cst2
  //  ...
  //  %ptrN = G_PTR_ADD %g, cstN
  //
  // To:
  //
  //  %offset_g = G_GLOBAL_VALUE @x + min_cst
  //  %g = G_PTR_ADD %offset_g, -min_cst
  //  %ptr1 = G_PTR_ADD %g, cst1
  //  ...
  //  %ptrN = G_PTR_ADD %g, cstN
  //
  // Then, the original G_PTR_ADDs should be folded later on so that they look
  // like this:
  //
  //  %ptrN = G_PTR_ADD %offset_g, cstN - min_cst
  uint64_t Offset, MinOffset;
  std::tie(Offset, MinOffset) = MatchInfo;
  B.setInstrAndDebugLoc(*std::next(MI.getIterator()));
  Observer.changingInstr(MI);
  auto &GlobalOp = MI.getOperand(1);
  auto *GV = GlobalOp.getGlobal();
  GlobalOp.ChangeToGA(GV, Offset, GlobalOp.getTargetFlags());
  Register Dst = MI.getOperand(0).getReg();
  Register NewGVDst = MRI.cloneVirtualRegister(Dst);
  MI.getOperand(0).setReg(NewGVDst);
  Observer.changedInstr(MI);
  B.buildPtrAdd(
      Dst, NewGVDst,
      B.buildConstant(LLT::scalar(64), -static_cast<int64_t>(MinOffset)));
}

// Combines vecreduce_add(mul(ext(x), ext(y))) -> vecreduce_add([us]dot(x, y))
// Or vecreduce_add(ext(mul(ext(x), ext(y)))) -> vecreduce_add([us]dot(x, y))
// Or vecreduce_add(ext(x)) -> vecreduce_add([us]dot(x, 1))
// Similar to performVecReduceAddCombine in SelectionDAG
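//
// A minimal sketch in generic MIR (illustrative only; register names are made
// up and an unsigned multiply of two v16s8 sources is assumed):
//
//   %xe:_(<16 x s32>) = G_ZEXT %x:_(<16 x s8>)
//   %ye:_(<16 x s32>) = G_ZEXT %y:_(<16 x s8>)
//   %m:_(<16 x s32>) = G_MUL %xe, %ye
//   %r:_(s32) = G_VECREDUCE_ADD %m
// becomes roughly:
//   %acc:_(<4 x s32>) = G_BUILD_VECTOR 0, 0, 0, 0
//   %dot:_(<4 x s32>) = G_UDOT %acc, %x, %y
//   %r:_(s32) = G_VECREDUCE_ADD %dot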
bool matchExtAddvToDotAddv(MachineInstr &MI, MachineRegisterInfo &MRI,
                           const AArch64Subtarget &STI,
                           std::tuple<Register, Register, bool> &MatchInfo) {
  assert(MI.getOpcode() == TargetOpcode::G_VECREDUCE_ADD &&
         "Expected a G_VECREDUCE_ADD instruction");
  assert(STI.hasDotProd() && "Target should have Dot Product feature");

  MachineInstr *I1 = getDefIgnoringCopies(MI.getOperand(1).getReg(), MRI);
  Register DstReg = MI.getOperand(0).getReg();
  Register MidReg = I1->getOperand(0).getReg();
  LLT DstTy = MRI.getType(DstReg);
  LLT MidTy = MRI.getType(MidReg);
  if (DstTy.getScalarSizeInBits() != 32 || MidTy.getScalarSizeInBits() != 32)
    return false;

  // Detect mul(ext, ext) with symmetric exts. If I1Opc is G_ZEXT or G_SEXT,
  // then both exts must use that same opcode. On success, I1Opc is set to the
  // matched ext opcode.
  auto tryMatchingMulOfExt = [&MRI](MachineInstr *MI, Register &Out1,
                                    Register &Out2, unsigned &I1Opc) {
    // If the result of the multiply has more than one use, there is no point
    // in creating a dot instruction.
    if (!MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()))
      return false;

    MachineInstr *ExtMI1 =
        getDefIgnoringCopies(MI->getOperand(1).getReg(), MRI);
    MachineInstr *ExtMI2 =
        getDefIgnoringCopies(MI->getOperand(2).getReg(), MRI);
    LLT Ext1DstTy = MRI.getType(ExtMI1->getOperand(0).getReg());
    LLT Ext2DstTy = MRI.getType(ExtMI2->getOperand(0).getReg());

    if (ExtMI1->getOpcode() != ExtMI2->getOpcode() || Ext1DstTy != Ext2DstTy)
      return false;
    if ((I1Opc == TargetOpcode::G_ZEXT || I1Opc == TargetOpcode::G_SEXT) &&
        I1Opc != ExtMI1->getOpcode())
      return false;
    Out1 = ExtMI1->getOperand(1).getReg();
    Out2 = ExtMI2->getOperand(1).getReg();
    I1Opc = ExtMI1->getOpcode();
    return true;
  };

  LLT SrcTy;
  unsigned I1Opc = I1->getOpcode();
  if (I1Opc == TargetOpcode::G_MUL) {
    Register Out1, Out2;
    if (!tryMatchingMulOfExt(I1, Out1, Out2, I1Opc))
      return false;
    SrcTy = MRI.getType(Out1);
    std::get<0>(MatchInfo) = Out1;
    std::get<1>(MatchInfo) = Out2;
  } else if (I1Opc == TargetOpcode::G_ZEXT || I1Opc == TargetOpcode::G_SEXT) {
    Register I1Op = I1->getOperand(1).getReg();
    MachineInstr *M = getDefIgnoringCopies(I1Op, MRI);
    Register Out1, Out2;
    if (M->getOpcode() == TargetOpcode::G_MUL &&
        tryMatchingMulOfExt(M, Out1, Out2, I1Opc)) {
      SrcTy = MRI.getType(Out1);
      std::get<0>(MatchInfo) = Out1;
      std::get<1>(MatchInfo) = Out2;
    } else {
      SrcTy = MRI.getType(I1Op);
      std::get<0>(MatchInfo) = I1Op;
      std::get<1>(MatchInfo) = 0;
    }
  } else {
    return false;
  }

  if (I1Opc == TargetOpcode::G_ZEXT)
    std::get<2>(MatchInfo) = 0;
  else if (I1Opc == TargetOpcode::G_SEXT)
    std::get<2>(MatchInfo) = 1;
  else
    return false;

  if (SrcTy.getScalarSizeInBits() != 8 || SrcTy.getNumElements() % 8 != 0)
    return false;

  return true;
}

void applyExtAddvToDotAddv(MachineInstr &MI, MachineRegisterInfo &MRI,
                           MachineIRBuilder &Builder,
                           GISelChangeObserver &Observer,
                           const AArch64Subtarget &STI,
                           std::tuple<Register, Register, bool> &MatchInfo) {
  assert(MI.getOpcode() == TargetOpcode::G_VECREDUCE_ADD &&
         "Expected a G_VECREDUCE_ADD instruction");
  assert(STI.hasDotProd() && "Target should have Dot Product feature");

  // Initialise the variables
  unsigned DotOpcode =
      std::get<2>(MatchInfo) ? AArch64::G_SDOT : AArch64::G_UDOT;
  Register Ext1SrcReg = std::get<0>(MatchInfo);

  // If there is only one source register, create a vector of 1s as the second
  // source register so that the dot product reduces to a plain sum.
  Register Ext2SrcReg;
  if (std::get<1>(MatchInfo) == 0)
    Ext2SrcReg = Builder.buildConstant(MRI.getType(Ext1SrcReg), 1)
                     ->getOperand(0)
                     .getReg();
  else
    Ext2SrcReg = std::get<1>(MatchInfo);

  // Find out how many DOT instructions are needed
  LLT SrcTy = MRI.getType(Ext1SrcReg);
  LLT MidTy;
  unsigned NumOfDotMI;
  if (SrcTy.getNumElements() % 16 == 0) {
    NumOfDotMI = SrcTy.getNumElements() / 16;
    MidTy = LLT::fixed_vector(4, 32);
  } else if (SrcTy.getNumElements() % 8 == 0) {
    NumOfDotMI = SrcTy.getNumElements() / 8;
    MidTy = LLT::fixed_vector(2, 32);
  } else {
    llvm_unreachable("Source type number of elements is not multiple of 8");
  }

  // Handle the case where one DOT instruction is needed
  if (NumOfDotMI == 1) {
    auto Zeroes = Builder.buildConstant(MidTy, 0)->getOperand(0).getReg();
    auto Dot = Builder.buildInstr(DotOpcode, {MidTy},
                                  {Zeroes, Ext1SrcReg, Ext2SrcReg});
    Builder.buildVecReduceAdd(MI.getOperand(0), Dot->getOperand(0));
  } else {
    // If not, pad the trailing v8i8 part with 0s to form a v16i8
    SmallVector<Register, 4> Ext1UnmergeReg;
    SmallVector<Register, 4> Ext2UnmergeReg;
    if (SrcTy.getNumElements() % 16 != 0) {
      SmallVector<Register> Leftover1;
      SmallVector<Register> Leftover2;

      // Split the elements into v16i8 and v8i8
      LLT MainTy = LLT::fixed_vector(16, 8);
      LLT LeftoverTy1, LeftoverTy2;
      if ((!extractParts(Ext1SrcReg, MRI.getType(Ext1SrcReg), MainTy,
                         LeftoverTy1, Ext1UnmergeReg, Leftover1, Builder,
                         MRI)) ||
          (!extractParts(Ext2SrcReg, MRI.getType(Ext2SrcReg), MainTy,
                         LeftoverTy2, Ext2UnmergeReg, Leftover2, Builder,
                         MRI))) {
        llvm_unreachable("Unable to split this vector properly");
      }

      // Pad the leftover v8i8 vector with a register of 0s of type v8i8
      Register v8Zeroes = Builder.buildConstant(LLT::fixed_vector(8, 8), 0)
                              ->getOperand(0)
                              .getReg();

      Ext1UnmergeReg.push_back(
          Builder
              .buildMergeLikeInstr(LLT::fixed_vector(16, 8),
                                   {Leftover1[0], v8Zeroes})
              .getReg(0));
      Ext2UnmergeReg.push_back(
          Builder
              .buildMergeLikeInstr(LLT::fixed_vector(16, 8),
                                   {Leftover2[0], v8Zeroes})
              .getReg(0));

    } else {
      // Unmerge the source vectors to v16i8
      unsigned SrcNumElts = SrcTy.getNumElements();
      extractParts(Ext1SrcReg, LLT::fixed_vector(16, 8), SrcNumElts / 16,
                   Ext1UnmergeReg, Builder, MRI);
      extractParts(Ext2SrcReg, LLT::fixed_vector(16, 8), SrcNumElts / 16,
                   Ext2UnmergeReg, Builder, MRI);
    }

    // Build the UDOT instructions
    SmallVector<Register, 2> DotReg;
    unsigned NumElements = 0;
    for (unsigned i = 0; i < Ext1UnmergeReg.size(); i++) {
      LLT ZeroesLLT;
      // Check if it is 16 or 8 elements. Set Zeroes to the according size
      if (MRI.getType(Ext1UnmergeReg[i]).getNumElements() == 16) {
        ZeroesLLT = LLT::fixed_vector(4, 32);
        NumElements += 4;
      } else {
        ZeroesLLT = LLT::fixed_vector(2, 32);
        NumElements += 2;
      }
      auto Zeroes = Builder.buildConstant(ZeroesLLT, 0)->getOperand(0).getReg();
      DotReg.push_back(
          Builder
              .buildInstr(DotOpcode, {MRI.getType(Zeroes)},
                          {Zeroes, Ext1UnmergeReg[i], Ext2UnmergeReg[i]})
              .getReg(0));
    }

    // Merge the output
    auto ConcatMI =
        Builder.buildConcatVectors(LLT::fixed_vector(NumElements, 32), DotReg);

    // Put it through a vector reduction
    Builder.buildVecReduceAdd(MI.getOperand(0).getReg(),
                              ConcatMI->getOperand(0).getReg());
  }

  // Erase the dead instructions
  MI.eraseFromParent();
}

// Matches {U/S}ADDV(ext(x)) => {U/S}ADDLV(x)
// Ensure that the type coming from the extend instruction is the right size
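//
// A minimal sketch in generic MIR (illustrative only; register names are made
// up and an unsigned v8s8 source is assumed):
//
//   %e:_(<8 x s32>) = G_ZEXT %x:_(<8 x s8>)
//   %r:_(s32) = G_VECREDUCE_ADD %e
// becomes roughly a G_UADDLV of %x whose lane 0 is then extracted (and
// zero/sign extended if the destination scalar is wider).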
bool matchExtUaddvToUaddlv(MachineInstr &MI, MachineRegisterInfo &MRI,
                           std::pair<Register, bool> &MatchInfo) {
  assert(MI.getOpcode() == TargetOpcode::G_VECREDUCE_ADD &&
         "Expected G_VECREDUCE_ADD Opcode");

  // Check if the last instruction is an extend
  MachineInstr *ExtMI = getDefIgnoringCopies(MI.getOperand(1).getReg(), MRI);
  auto ExtOpc = ExtMI->getOpcode();

  if (ExtOpc == TargetOpcode::G_ZEXT)
    std::get<1>(MatchInfo) = 0;
  else if (ExtOpc == TargetOpcode::G_SEXT)
    std::get<1>(MatchInfo) = 1;
  else
    return false;

  // Check if the source register is a valid type
  Register ExtSrcReg = ExtMI->getOperand(1).getReg();
  LLT ExtSrcTy = MRI.getType(ExtSrcReg);
  LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
  if (ExtSrcTy.getScalarSizeInBits() * 2 > DstTy.getScalarSizeInBits())
    return false;
  if ((DstTy.getScalarSizeInBits() == 16 &&
       ExtSrcTy.getNumElements() % 8 == 0 && ExtSrcTy.getNumElements() < 256) ||
      (DstTy.getScalarSizeInBits() == 32 &&
       ExtSrcTy.getNumElements() % 4 == 0) ||
      (DstTy.getScalarSizeInBits() == 64 &&
       ExtSrcTy.getNumElements() % 4 == 0)) {
    std::get<0>(MatchInfo) = ExtSrcReg;
    return true;
  }
  return false;
}

void applyExtUaddvToUaddlv(MachineInstr &MI, MachineRegisterInfo &MRI,
                           MachineIRBuilder &B, GISelChangeObserver &Observer,
                           std::pair<Register, bool> &MatchInfo) {
  assert(MI.getOpcode() == TargetOpcode::G_VECREDUCE_ADD &&
         "Expected G_VECREDUCE_ADD Opcode");

  unsigned Opc = std::get<1>(MatchInfo) ? AArch64::G_SADDLV : AArch64::G_UADDLV;
  Register SrcReg = std::get<0>(MatchInfo);
  Register DstReg = MI.getOperand(0).getReg();
  LLT SrcTy = MRI.getType(SrcReg);
  LLT DstTy = MRI.getType(DstReg);

  // If SrcTy has more elements than expected, split them into multiple
  // instructions and sum the results
  LLT MainTy;
  SmallVector<Register, 1> WorkingRegisters;
  unsigned SrcScalSize = SrcTy.getScalarSizeInBits();
  unsigned SrcNumElem = SrcTy.getNumElements();
  if ((SrcScalSize == 8 && SrcNumElem > 16) ||
      (SrcScalSize == 16 && SrcNumElem > 8) ||
      (SrcScalSize == 32 && SrcNumElem > 4)) {

    LLT LeftoverTy;
    SmallVector<Register, 4> LeftoverRegs;
    if (SrcScalSize == 8)
      MainTy = LLT::fixed_vector(16, 8);
    else if (SrcScalSize == 16)
      MainTy = LLT::fixed_vector(8, 16);
    else if (SrcScalSize == 32)
      MainTy = LLT::fixed_vector(4, 32);
    else
      llvm_unreachable("Source's Scalar Size not supported");

    // Extract the parts, put each extracted source through U/SADDLV, and
    // collect the values in a small vector
    extractParts(SrcReg, SrcTy, MainTy, LeftoverTy, WorkingRegisters,
                 LeftoverRegs, B, MRI);
    llvm::append_range(WorkingRegisters, LeftoverRegs);
  } else {
    WorkingRegisters.push_back(SrcReg);
    MainTy = SrcTy;
  }

  unsigned MidScalarSize = MainTy.getScalarSizeInBits() * 2;
  LLT MidScalarLLT = LLT::scalar(MidScalarSize);
  Register ZeroReg = B.buildConstant(LLT::scalar(64), 0).getReg(0);
  for (unsigned I = 0; I < WorkingRegisters.size(); I++) {
    // If the number of elements is too small to build an instruction, extend
    // its size before applying addlv
    LLT WorkingRegTy = MRI.getType(WorkingRegisters[I]);
    if ((WorkingRegTy.getScalarSizeInBits() == 8) &&
        (WorkingRegTy.getNumElements() == 4)) {
      WorkingRegisters[I] =
          B.buildInstr(std::get<1>(MatchInfo) ? TargetOpcode::G_SEXT
                                              : TargetOpcode::G_ZEXT,
                       {LLT::fixed_vector(4, 16)}, {WorkingRegisters[I]})
              .getReg(0);
    }

    // Generate the {U/S}ADDLV instruction, whose result is always twice the
    // scalar size of its source
    LLT AddlvTy = MidScalarSize <= 32 ? LLT::fixed_vector(4, 32)
                                      : LLT::fixed_vector(2, 64);
    Register AddlvReg =
        B.buildInstr(Opc, {AddlvTy}, {WorkingRegisters[I]}).getReg(0);

    // The output from {U/S}ADDLV gets placed in the lowest lane of a v4i32 or
    // v2i64 register.
    // i16 and i32 results use a v4i32 register
    // i64 results use a v2i64 register
    // Therefore we have to extract/truncate the value to the right type
    if (MidScalarSize == 32 || MidScalarSize == 64) {
      WorkingRegisters[I] = B.buildInstr(AArch64::G_EXTRACT_VECTOR_ELT,
                                         {MidScalarLLT}, {AddlvReg, ZeroReg})
                                .getReg(0);
    } else {
      Register ExtractReg = B.buildInstr(AArch64::G_EXTRACT_VECTOR_ELT,
                                         {LLT::scalar(32)}, {AddlvReg, ZeroReg})
                                .getReg(0);
      WorkingRegisters[I] =
          B.buildTrunc({MidScalarLLT}, {ExtractReg}).getReg(0);
    }
  }

  Register OutReg;
  if (WorkingRegisters.size() > 1) {
    OutReg = B.buildAdd(MidScalarLLT, WorkingRegisters[0], WorkingRegisters[1])
                 .getReg(0);
    for (unsigned I = 2; I < WorkingRegisters.size(); I++) {
      OutReg = B.buildAdd(MidScalarLLT, OutReg, WorkingRegisters[I]).getReg(0);
    }
  } else {
    OutReg = WorkingRegisters[0];
  }

  if (DstTy.getScalarSizeInBits() > MidScalarSize) {
    // Handle the scalar value if the DstTy's scalar size is more than double
    // the Src's scalar size
    B.buildInstr(std::get<1>(MatchInfo) ? TargetOpcode::G_SEXT
                                        : TargetOpcode::G_ZEXT,
                 {DstReg}, {OutReg});
  } else {
    B.buildCopy(DstReg, OutReg);
  }

  MI.eraseFromParent();
}

// Pushes ADD/SUB/MUL through extend instructions to decrease the number of
// extend instructions at the end by allowing selection of {s|u}addl sooner
// i32 add(i32 ext i8, i32 ext i8) => i32 ext(i16 add(i16 ext i8, i16 ext i8))
bool matchPushAddSubExt(MachineInstr &MI, MachineRegisterInfo &MRI,
                        Register DstReg, Register SrcReg1, Register SrcReg2) {
  assert((MI.getOpcode() == TargetOpcode::G_ADD ||
          MI.getOpcode() == TargetOpcode::G_SUB ||
          MI.getOpcode() == TargetOpcode::G_MUL) &&
         "Expected a G_ADD, G_SUB or G_MUL instruction\n");

  // Deal with vector types only
  LLT DstTy = MRI.getType(DstReg);
  if (!DstTy.isVector())
    return false;

  // Return true only if the G_{S|Z}EXT widens its source by more than 2x,
  // i.e. i8 -> i32, i8 -> i64 or i16 -> i64, and both sources have the same
  // type.
  Register ExtDstReg = MI.getOperand(1).getReg();
  LLT Ext1SrcTy = MRI.getType(SrcReg1);
  LLT Ext2SrcTy = MRI.getType(SrcReg2);
  unsigned ExtDstScal = MRI.getType(ExtDstReg).getScalarSizeInBits();
  unsigned Ext1SrcScal = Ext1SrcTy.getScalarSizeInBits();
  if (((Ext1SrcScal == 8 && ExtDstScal == 32) ||
       ((Ext1SrcScal == 8 || Ext1SrcScal == 16) && ExtDstScal == 64)) &&
      Ext1SrcTy == Ext2SrcTy)
    return true;

  return false;
}

void applyPushAddSubExt(MachineInstr &MI, MachineRegisterInfo &MRI,
                        MachineIRBuilder &B, bool isSExt, Register DstReg,
                        Register SrcReg1, Register SrcReg2) {
  LLT SrcTy = MRI.getType(SrcReg1);
  LLT MidTy = SrcTy.changeElementSize(SrcTy.getScalarSizeInBits() * 2);
  unsigned Opc = isSExt ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT;
  Register Ext1Reg = B.buildInstr(Opc, {MidTy}, {SrcReg1}).getReg(0);
  Register Ext2Reg = B.buildInstr(Opc, {MidTy}, {SrcReg2}).getReg(0);
  Register AddReg =
      B.buildInstr(MI.getOpcode(), {MidTy}, {Ext1Reg, Ext2Reg}).getReg(0);

  // G_SUB has to sign-extend the result.
  // G_ADD needs to sext from a sext and can sext or zext from a zext, and
  // G_MUL needs to use the original extend opcode, so the original opcode is
  // reused for both.
  if (MI.getOpcode() == TargetOpcode::G_ADD ||
      MI.getOpcode() == TargetOpcode::G_MUL)
    B.buildInstr(Opc, {DstReg}, {AddReg});
  else
    B.buildSExt(DstReg, AddReg);

  MI.eraseFromParent();
}

bool tryToSimplifyUADDO(MachineInstr &MI, MachineIRBuilder &B,
                        const CombinerHelper &Helper,
                        GISelChangeObserver &Observer) {
  // Try to simplify a G_UADDO with 8 or 16 bit operands to a wide G_ADD and a
  // TBNZ if the result is only used in the no-overflow case. It is restricted
  // to cases where we know that the high bits of the operands are 0. If
  // there's an overflow, then the 9th or 17th bit must be set, which can be
  // checked using TBNZ.
  //
  // Change (for UADDOs on 8 and 16 bits):
  //
  // %z0 = G_ASSERT_ZEXT _
  // %op0 = G_TRUNC %z0
  // %z1 = G_ASSERT_ZEXT _
  // %op1 = G_TRUNC %z1
  // %val, %cond = G_UADDO %op0, %op1
  // G_BRCOND %cond, %error.bb
  //
  // error.bb:
  //   (no successors and no uses of %val)
  //
  // To:
  //
  // %z0 = G_ASSERT_ZEXT _
  // %z1 = G_ASSERT_ZEXT _
  // %add = G_ADD %z0, %z1
  // %val = G_TRUNC %add
  // %bit = G_AND %add, 1 << scalar-size-in-bits(%op1)
  // %cond = G_ICMP NE, %bit, 0
  // G_BRCOND %cond, %error.bb

  auto &MRI = *B.getMRI();

  MachineOperand *DefOp0 = MRI.getOneDef(MI.getOperand(2).getReg());
  MachineOperand *DefOp1 = MRI.getOneDef(MI.getOperand(3).getReg());
  Register Op0Wide;
  Register Op1Wide;
  if (!mi_match(DefOp0->getParent(), MRI, m_GTrunc(m_Reg(Op0Wide))) ||
      !mi_match(DefOp1->getParent(), MRI, m_GTrunc(m_Reg(Op1Wide))))
    return false;
  LLT WideTy0 = MRI.getType(Op0Wide);
  LLT WideTy1 = MRI.getType(Op1Wide);
  Register ResVal = MI.getOperand(0).getReg();
  LLT OpTy = MRI.getType(ResVal);
  MachineInstr *Op0WideDef = MRI.getVRegDef(Op0Wide);
  MachineInstr *Op1WideDef = MRI.getVRegDef(Op1Wide);

  unsigned OpTySize = OpTy.getScalarSizeInBits();
  // First check that the G_TRUNCs feeding the G_UADDO are no-ops, because the
  // inputs have been zero-extended.
  if (Op0WideDef->getOpcode() != TargetOpcode::G_ASSERT_ZEXT ||
      Op1WideDef->getOpcode() != TargetOpcode::G_ASSERT_ZEXT ||
      OpTySize != Op0WideDef->getOperand(2).getImm() ||
      OpTySize != Op1WideDef->getOperand(2).getImm())
    return false;

  // Only scalar UADDOs with either 8 or 16 bit operands are handled.
  if (!WideTy0.isScalar() || !WideTy1.isScalar() || WideTy0 != WideTy1 ||
      OpTySize >= WideTy0.getScalarSizeInBits() ||
      (OpTySize != 8 && OpTySize != 16))
    return false;

  // The overflow-status result must be used by a branch only.
  Register ResStatus = MI.getOperand(1).getReg();
  if (!MRI.hasOneNonDBGUse(ResStatus))
    return false;
  MachineInstr *CondUser = &*MRI.use_instr_nodbg_begin(ResStatus);
  if (CondUser->getOpcode() != TargetOpcode::G_BRCOND)
    return false;

  // Make sure the computed result is only used in the no-overflow blocks.
  MachineBasicBlock *CurrentMBB = MI.getParent();
  MachineBasicBlock *FailMBB = CondUser->getOperand(1).getMBB();
  if (!FailMBB->succ_empty() || CondUser->getParent() != CurrentMBB)
    return false;
  if (any_of(MRI.use_nodbg_instructions(ResVal),
             [&MI, FailMBB, CurrentMBB](MachineInstr &I) {
               return &MI != &I &&
                      (I.getParent() == FailMBB || I.getParent() == CurrentMBB);
             }))
    return false;

  // Remove the G_UADDO.
  B.setInstrAndDebugLoc(*MI.getNextNode());
  MI.eraseFromParent();

  // Emit wide add.
  Register AddDst = MRI.cloneVirtualRegister(Op0Wide);
  B.buildInstr(TargetOpcode::G_ADD, {AddDst}, {Op0Wide, Op1Wide});

  // Emit check of the 9th or 17th bit and update users (the branch). This will
  // later be folded to TBNZ.
  Register CondBit = MRI.cloneVirtualRegister(Op0Wide);
  B.buildAnd(
      CondBit, AddDst,
      B.buildConstant(LLT::scalar(32), OpTySize == 8 ? 1 << 8 : 1 << 16));
  B.buildICmp(CmpInst::ICMP_NE, ResStatus, CondBit,
              B.buildConstant(LLT::scalar(32), 0));

  // Update ZExt users of the result value. Because all uses are in the
  // no-overflow case, we know that the top bits are 0 and we can ignore ZExts.
  B.buildZExtOrTrunc(ResVal, AddDst);
  for (MachineOperand &U : make_early_inc_range(MRI.use_operands(ResVal))) {
    Register WideReg;
    if (mi_match(U.getParent(), MRI, m_GZExt(m_Reg(WideReg)))) {
      auto OldR = U.getParent()->getOperand(0).getReg();
      Observer.erasingInstr(*U.getParent());
      U.getParent()->eraseFromParent();
      Helper.replaceRegWith(MRI, OldR, AddDst);
    }
  }

  return true;
}

class AArch64PreLegalizerCombinerImpl : public Combiner {
protected:
  const CombinerHelper Helper;
  const AArch64PreLegalizerCombinerImplRuleConfig &RuleConfig;
  const AArch64Subtarget &STI;
  const LibcallLoweringInfo &Libcalls;

public:
  AArch64PreLegalizerCombinerImpl(
      MachineFunction &MF, CombinerInfo &CInfo, const TargetPassConfig *TPC,
      GISelValueTracking &VT, GISelCSEInfo *CSEInfo,
      const AArch64PreLegalizerCombinerImplRuleConfig &RuleConfig,
      const AArch64Subtarget &STI, const LibcallLoweringInfo &Libcalls,
      MachineDominatorTree *MDT, const LegalizerInfo *LI);

  static const char *getName() { return "AArch64PreLegalizerCombiner"; }

  bool tryCombineAll(MachineInstr &I) const override;

  bool tryCombineAllImpl(MachineInstr &I) const;

private:
#define GET_GICOMBINER_CLASS_MEMBERS
#include "AArch64GenPreLegalizeGICombiner.inc"
#undef GET_GICOMBINER_CLASS_MEMBERS
};

#define GET_GICOMBINER_IMPL
#include "AArch64GenPreLegalizeGICombiner.inc"
#undef GET_GICOMBINER_IMPL

AArch64PreLegalizerCombinerImpl::AArch64PreLegalizerCombinerImpl(
    MachineFunction &MF, CombinerInfo &CInfo, const TargetPassConfig *TPC,
    GISelValueTracking &VT, GISelCSEInfo *CSEInfo,
    const AArch64PreLegalizerCombinerImplRuleConfig &RuleConfig,
    const AArch64Subtarget &STI, const LibcallLoweringInfo &Libcalls,
    MachineDominatorTree *MDT, const LegalizerInfo *LI)
    : Combiner(MF, CInfo, TPC, &VT, CSEInfo),
      Helper(Observer, B, /*IsPreLegalize*/ true, &VT, MDT, LI),
      RuleConfig(RuleConfig), STI(STI), Libcalls(Libcalls),
#define GET_GICOMBINER_CONSTRUCTOR_INITS
#include "AArch64GenPreLegalizeGICombiner.inc"
#undef GET_GICOMBINER_CONSTRUCTOR_INITS
{
}

bool AArch64PreLegalizerCombinerImpl::tryCombineAll(MachineInstr &MI) const {
  if (tryCombineAllImpl(MI))
    return true;

  unsigned Opc = MI.getOpcode();
  switch (Opc) {
  case TargetOpcode::G_SHUFFLE_VECTOR:
    return Helper.tryCombineShuffleVector(MI);
  case TargetOpcode::G_UADDO:
    return tryToSimplifyUADDO(MI, B, Helper, Observer);
  case TargetOpcode::G_MEMCPY_INLINE:
    return Helper.tryEmitMemcpyInline(MI);
  case TargetOpcode::G_MEMCPY:
  case TargetOpcode::G_MEMMOVE:
  case TargetOpcode::G_MEMSET: {
    // If we're at -O0 set a maxlen of 32 to inline, otherwise let the other
    // heuristics decide.
    unsigned MaxLen = CInfo.EnableOpt ? 0 : 32;
    // Try to inline memcpy type calls if optimizations are enabled.
    if (Helper.tryCombineMemCpyFamily(MI, MaxLen))
      return true;
    if (Opc == TargetOpcode::G_MEMSET)
      return llvm::AArch64GISelUtils::tryEmitBZero(MI, B, Libcalls,
                                                   CInfo.EnableMinSize);
    return false;
  }
  }

  return false;
}

// Pass boilerplate
// ================

class AArch64PreLegalizerCombiner : public MachineFunctionPass {
public:
  static char ID;

  AArch64PreLegalizerCombiner();

  StringRef getPassName() const override {
    return "AArch64PreLegalizerCombiner";
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override;

private:
  AArch64PreLegalizerCombinerImplRuleConfig RuleConfig;
};
} // end anonymous namespace

void AArch64PreLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<TargetPassConfig>();
  AU.setPreservesCFG();
  getSelectionDAGFallbackAnalysisUsage(AU);
  AU.addRequired<GISelValueTrackingAnalysisLegacy>();
  AU.addPreserved<GISelValueTrackingAnalysisLegacy>();
  AU.addRequired<MachineDominatorTreeWrapperPass>();
  AU.addPreserved<MachineDominatorTreeWrapperPass>();
  AU.addRequired<GISelCSEAnalysisWrapperPass>();
  AU.addPreserved<GISelCSEAnalysisWrapperPass>();
  AU.addRequired<LibcallLoweringInfoWrapper>();
  MachineFunctionPass::getAnalysisUsage(AU);
}

AArch64PreLegalizerCombiner::AArch64PreLegalizerCombiner()
    : MachineFunctionPass(ID) {
  if (!RuleConfig.parseCommandLineOption())
    report_fatal_error("Invalid rule identifier");
}

bool AArch64PreLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
  if (MF.getProperties().hasFailedISel())
    return false;
  auto &TPC = getAnalysis<TargetPassConfig>();

  // Enable CSE.
  GISelCSEAnalysisWrapper &Wrapper =
      getAnalysis<GISelCSEAnalysisWrapperPass>().getCSEWrapper();
  auto *CSEInfo = &Wrapper.get(TPC.getCSEConfig());

  const AArch64Subtarget &ST = MF.getSubtarget<AArch64Subtarget>();
  const auto *LI = ST.getLegalizerInfo();

  const Function &F = MF.getFunction();

  const LibcallLoweringInfo &Libcalls =
      getAnalysis<LibcallLoweringInfoWrapper>().getLibcallLowering(
          *F.getParent(), ST);

  bool EnableOpt =
      MF.getTarget().getOptLevel() != CodeGenOptLevel::None && !skipFunction(F);
  GISelValueTracking *VT =
      &getAnalysis<GISelValueTrackingAnalysisLegacy>().get(MF);
  MachineDominatorTree *MDT =
      &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
  CombinerInfo CInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false,
                     /*LegalizerInfo*/ nullptr, EnableOpt, F.hasOptSize(),
                     F.hasMinSize());
  // Disable fixed-point iteration to reduce compile-time
  CInfo.MaxIterations = 1;
  CInfo.ObserverLvl = CombinerInfo::ObserverLevel::SinglePass;
  // This is the first Combiner, so the input IR might contain dead
  // instructions.
  CInfo.EnableFullDCE = true;
  AArch64PreLegalizerCombinerImpl Impl(MF, CInfo, &TPC, *VT, CSEInfo,
                                       RuleConfig, ST, Libcalls, MDT, LI);
  return Impl.combineMachineInstrs();
}

char AArch64PreLegalizerCombiner::ID = 0;
INITIALIZE_PASS_BEGIN(AArch64PreLegalizerCombiner, DEBUG_TYPE,
                      "Combine AArch64 machine instrs before legalization",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(GISelValueTrackingAnalysisLegacy)
INITIALIZE_PASS_DEPENDENCY(GISelCSEAnalysisWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LibcallLoweringInfoWrapper)
INITIALIZE_PASS_END(AArch64PreLegalizerCombiner, DEBUG_TYPE,
                    "Combine AArch64 machine instrs before legalization", false,
                    false)

namespace llvm {
FunctionPass *createAArch64PreLegalizerCombiner() {
  return new AArch64PreLegalizerCombiner();
}
} // end namespace llvm