//=== AArch64PostLegalizerLowering.cpp --------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// Post-legalization lowering for instructions.
///
/// This is used to offload pattern matching from the selector.
///
/// For example, this combiner will notice that a G_SHUFFLE_VECTOR is actually
/// a G_ZIP, G_UZP, etc.
///
/// General optimization combines should be handled by either the
/// AArch64PostLegalizerCombiner or the AArch64PreLegalizerCombiner.
///
//===----------------------------------------------------------------------===//

#include "AArch64ExpandImm.h"
#include "AArch64GlobalISelUtils.h"
#include "AArch64PerfectShuffle.h"
#include "AArch64Subtarget.h"
#include "GISel/AArch64LegalizerInfo.h"
#include "MCTargetDesc/AArch64MCTargetDesc.h"
#include "Utils/AArch64BaseInfo.h"
#include "llvm/CodeGen/GlobalISel/Combiner.h"
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
#include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h"
#include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/Support/ErrorHandling.h"
#include <optional>

#define GET_GICOMBINER_DEPS
#include "AArch64GenPostLegalizeGILowering.inc"
#undef GET_GICOMBINER_DEPS

#define DEBUG_TYPE "aarch64-postlegalizer-lowering"

using namespace llvm;
using namespace MIPatternMatch;
using namespace AArch64GISelUtils;

namespace {

#define GET_GICOMBINER_TYPES
#include "AArch64GenPostLegalizeGILowering.inc"
#undef GET_GICOMBINER_TYPES

/// Represents a pseudo instruction which replaces a G_SHUFFLE_VECTOR.
///
/// Used for matching target-supported shuffles before codegen.
struct ShuffleVectorPseudo {
  unsigned Opc;                 ///< Opcode for the instruction. (E.g. G_ZIP1)
  Register Dst;                 ///< Destination register.
  SmallVector<SrcOp, 2> SrcOps; ///< Source registers.
  ShuffleVectorPseudo(unsigned Opc, Register Dst,
                      std::initializer_list<SrcOp> SrcOps)
      : Opc(Opc), Dst(Dst), SrcOps(SrcOps) {}
  ShuffleVectorPseudo() = default;
};

/// Check if a G_EXT instruction can handle a shuffle mask \p M when the vector
/// sources of the shuffle are different.
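///
/// For example, with NumElts = 4 the mask <1, 2, 3, 4> selects four
/// consecutive elements starting at index 1, so this returns {false, 1}:
/// no source reversal and element offset 1 (matchEXT later scales the offset
/// by the element size in bytes).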
std::optional<std::pair<bool, uint64_t>> getExtMask(ArrayRef<int> M,
                                                    unsigned NumElts) {
  // Look for the first non-undef element.
  auto FirstRealElt = find_if(M, [](int Elt) { return Elt >= 0; });
  if (FirstRealElt == M.end())
    return std::nullopt;

  // Use APInt to handle overflow when calculating expected element.
  unsigned MaskBits = APInt(32, NumElts * 2).logBase2();
  APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1, false, true);

  // The following shuffle indices must be the successive elements after the
  // first real element.
  if (any_of(
          make_range(std::next(FirstRealElt), M.end()),
          [&ExpectedElt](int Elt) { return Elt != ExpectedElt++ && Elt >= 0; }))
    return std::nullopt;

  // The index of an EXT is the first element if it is not UNDEF.
  // Watch out for the beginning UNDEFs. The EXT index should be the expected
  // value of the first element. E.g.
  // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>.
  // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>.
  // ExpectedElt is the last mask index plus 1.
  uint64_t Imm = ExpectedElt.getZExtValue();
  bool ReverseExt = false;

  // There are two different cases that require reversing the input vectors.
  // For example, for the vector <4 x i32> we have the following cases:
  //   Case 1: shufflevector(<4 x i32>, <4 x i32>, <-1, -1, -1, 0>)
  //   Case 2: shufflevector(<4 x i32>, <4 x i32>, <-1, -1, 7, 0>)
  // For both cases we ultimately use the mask <5, 6, 7, 0>, which requires
  // reversing the two input vectors.
  if (Imm < NumElts)
    ReverseExt = true;
  else
    Imm -= NumElts;
  return std::make_pair(ReverseExt, Imm);
}

/// Helper function for matchINS.
///
/// \returns a value when \p M is an ins mask for \p NumInputElements.
///
/// First element of the returned pair is true when the produced
/// G_INSERT_VECTOR_ELT destination should be the LHS of the G_SHUFFLE_VECTOR.
///
/// Second element is the destination lane for the G_INSERT_VECTOR_ELT.
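///
/// For example, with 4 input elements, the mask <0, 1, 6, 3> leaves the LHS
/// intact except for lane 2, which takes element 6 (RHS lane 2), so this
/// returns {true, 2}.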
std::optional<std::pair<bool, int>> isINSMask(ArrayRef<int> M,
                                              int NumInputElements) {
  if (M.size() != static_cast<size_t>(NumInputElements))
    return std::nullopt;
  int NumLHSMatch = 0, NumRHSMatch = 0;
  int LastLHSMismatch = -1, LastRHSMismatch = -1;
  for (int Idx = 0; Idx < NumInputElements; ++Idx) {
    if (M[Idx] == -1) {
      ++NumLHSMatch;
      ++NumRHSMatch;
      continue;
    }
    M[Idx] == Idx ? ++NumLHSMatch : LastLHSMismatch = Idx;
    M[Idx] == Idx + NumInputElements ? ++NumRHSMatch : LastRHSMismatch = Idx;
  }
  const int NumNeededToMatch = NumInputElements - 1;
  if (NumLHSMatch == NumNeededToMatch)
    return std::make_pair(true, LastLHSMismatch);
  if (NumRHSMatch == NumNeededToMatch)
    return std::make_pair(false, LastRHSMismatch);
  return std::nullopt;
}

/// \return true if a G_SHUFFLE_VECTOR instruction \p MI can be replaced with a
/// G_REV instruction. Returns the appropriate G_REV opcode in \p Opc.
bool matchREV(MachineInstr &MI, MachineRegisterInfo &MRI,
              ShuffleVectorPseudo &MatchInfo) {
  assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR);
  ArrayRef<int> ShuffleMask = MI.getOperand(3).getShuffleMask();
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(Dst);
  unsigned EltSize = Ty.getScalarSizeInBits();

  // Element size for a rev cannot be 64.
  if (EltSize == 64)
    return false;

  unsigned NumElts = Ty.getNumElements();

  // Try to produce a G_REV instruction
  for (unsigned LaneSize : {64U, 32U, 16U}) {
    if (isREVMask(ShuffleMask, EltSize, NumElts, LaneSize)) {
      unsigned Opcode;
      if (LaneSize == 64U)
        Opcode = AArch64::G_REV64;
      else if (LaneSize == 32U)
        Opcode = AArch64::G_REV32;
      else
        Opcode = AArch64::G_REV16;

      MatchInfo = ShuffleVectorPseudo(Opcode, Dst, {Src});
      return true;
    }
  }

  return false;
}

/// \return true if a G_SHUFFLE_VECTOR instruction \p MI can be replaced with
/// a G_TRN1 or G_TRN2 instruction.
bool matchTRN(MachineInstr &MI, MachineRegisterInfo &MRI,
              ShuffleVectorPseudo &MatchInfo) {
  assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR);
  unsigned WhichResult;
  ArrayRef<int> ShuffleMask = MI.getOperand(3).getShuffleMask();
  Register Dst = MI.getOperand(0).getReg();
  unsigned NumElts = MRI.getType(Dst).getNumElements();
  if (!isTRNMask(ShuffleMask, NumElts, WhichResult))
    return false;
  unsigned Opc = (WhichResult == 0) ? AArch64::G_TRN1 : AArch64::G_TRN2;
  Register V1 = MI.getOperand(1).getReg();
  Register V2 = MI.getOperand(2).getReg();
  MatchInfo = ShuffleVectorPseudo(Opc, Dst, {V1, V2});
  return true;
}

/// \return true if a G_SHUFFLE_VECTOR instruction \p MI can be replaced with
/// a G_UZP1 or G_UZP2 instruction.
///
/// \param [in] MI - The shuffle vector instruction.
/// \param [out] MatchInfo - Either G_UZP1 or G_UZP2 on success.
bool matchUZP(MachineInstr &MI, MachineRegisterInfo &MRI,
              ShuffleVectorPseudo &MatchInfo) {
  assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR);
  unsigned WhichResult;
  ArrayRef<int> ShuffleMask = MI.getOperand(3).getShuffleMask();
  Register Dst = MI.getOperand(0).getReg();
  unsigned NumElts = MRI.getType(Dst).getNumElements();
  if (!isUZPMask(ShuffleMask, NumElts, WhichResult))
    return false;
  unsigned Opc = (WhichResult == 0) ? AArch64::G_UZP1 : AArch64::G_UZP2;
  Register V1 = MI.getOperand(1).getReg();
  Register V2 = MI.getOperand(2).getReg();
  MatchInfo = ShuffleVectorPseudo(Opc, Dst, {V1, V2});
  return true;
}

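/// \return true if a G_SHUFFLE_VECTOR instruction \p MI can be replaced with
/// a G_ZIP1 or G_ZIP2 instruction.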
bool matchZip(MachineInstr &MI, MachineRegisterInfo &MRI,
              ShuffleVectorPseudo &MatchInfo) {
  assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR);
  unsigned WhichResult;
  ArrayRef<int> ShuffleMask = MI.getOperand(3).getShuffleMask();
  Register Dst = MI.getOperand(0).getReg();
  unsigned NumElts = MRI.getType(Dst).getNumElements();
  if (!isZIPMask(ShuffleMask, NumElts, WhichResult))
    return false;
  unsigned Opc = (WhichResult == 0) ? AArch64::G_ZIP1 : AArch64::G_ZIP2;
  Register V1 = MI.getOperand(1).getReg();
  Register V2 = MI.getOperand(2).getReg();
  MatchInfo = ShuffleVectorPseudo(Opc, Dst, {V1, V2});
  return true;
}

/// Helper function for matchDup.
bool matchDupFromInsertVectorElt(int Lane, MachineInstr &MI,
                                 MachineRegisterInfo &MRI,
                                 ShuffleVectorPseudo &MatchInfo) {
  if (Lane != 0)
    return false;

  // Try to match a vector splat operation into a dup instruction.
  // We're looking for this pattern:
  //
  //   %scalar:gpr(s64) = COPY $x0
  //   %undef:fpr(<2 x s64>) = G_IMPLICIT_DEF
  //   %cst0:gpr(s32) = G_CONSTANT i32 0
  //   %zerovec:fpr(<2 x s32>) = G_BUILD_VECTOR %cst0(s32), %cst0(s32)
  //   %ins:fpr(<2 x s64>) = G_INSERT_VECTOR_ELT %undef, %scalar(s64), %cst0(s32)
  //   %splat:fpr(<2 x s64>) = G_SHUFFLE_VECTOR %ins(<2 x s64>), %undef,
  //                                            %zerovec(<2 x s32>)
  //
  // ...into:
  //   %splat = G_DUP %scalar

  // Begin matching the insert.
  auto *InsMI = getOpcodeDef(TargetOpcode::G_INSERT_VECTOR_ELT,
                             MI.getOperand(1).getReg(), MRI);
  if (!InsMI)
    return false;
  // Match the undef vector operand.
  if (!getOpcodeDef(TargetOpcode::G_IMPLICIT_DEF, InsMI->getOperand(1).getReg(),
                    MRI))
    return false;

  // Match the index constant 0.
  if (!mi_match(InsMI->getOperand(3).getReg(), MRI, m_ZeroInt()))
    return false;

  MatchInfo = ShuffleVectorPseudo(AArch64::G_DUP, MI.getOperand(0).getReg(),
                                  {InsMI->getOperand(2).getReg()});
  return true;
}

/// Helper function for matchDup.
bool matchDupFromBuildVector(int Lane, MachineInstr &MI,
                             MachineRegisterInfo &MRI,
                             ShuffleVectorPseudo &MatchInfo) {
  assert(Lane >= 0 && "Expected positive lane?");
  int NumElements = MRI.getType(MI.getOperand(1).getReg()).getNumElements();
  // Test if the LHS is a BUILD_VECTOR. If it is, then we can just reference the
  // lane's definition directly.
  auto *BuildVecMI =
      getOpcodeDef(TargetOpcode::G_BUILD_VECTOR,
                   MI.getOperand(Lane < NumElements ? 1 : 2).getReg(), MRI);
  // If Lane >= NumElements, the lane points into the RHS, so rebase it to
  // index into the RHS vector.
  if (NumElements <= Lane)
    Lane -= NumElements;

  if (!BuildVecMI)
    return false;
  Register Reg = BuildVecMI->getOperand(Lane + 1).getReg();
  MatchInfo =
      ShuffleVectorPseudo(AArch64::G_DUP, MI.getOperand(0).getReg(), {Reg});
  return true;
}

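/// \return true if a G_SHUFFLE_VECTOR instruction \p MI is a splat of a single
/// lane and can be replaced with a G_DUP of the corresponding scalar.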
bool matchDup(MachineInstr &MI, MachineRegisterInfo &MRI,
              ShuffleVectorPseudo &MatchInfo) {
  assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR);
  auto MaybeLane = getSplatIndex(MI);
  if (!MaybeLane)
    return false;
  int Lane = *MaybeLane;
  // If this is an undef splat, generate it via a plain vdup if possible.
  if (Lane < 0)
    Lane = 0;
  if (matchDupFromInsertVectorElt(Lane, MI, MRI, MatchInfo))
    return true;
  if (matchDupFromBuildVector(Lane, MI, MRI, MatchInfo))
    return true;
  return false;
}

// Check if an EXT instruction can handle the shuffle mask when the vector
// sources of the shuffle are the same.
bool isSingletonExtMask(ArrayRef<int> M, LLT Ty) {
  unsigned NumElts = Ty.getNumElements();

  // Assume that the first shuffle index is not UNDEF. Fail if it is.
  if (M[0] < 0)
    return false;

  // If this is a VEXT shuffle, the immediate value is the index of the first
  // element. The other shuffle indices must be the successive elements after
  // the first one.
  unsigned ExpectedElt = M[0];
  for (unsigned I = 1; I < NumElts; ++I) {
    // Increment the expected index. If it wraps around, just follow it
    // back to index zero and keep going.
    ++ExpectedElt;
    if (ExpectedElt == NumElts)
      ExpectedElt = 0;

    if (M[I] < 0)
      continue; // Ignore UNDEF indices.
    if (ExpectedElt != static_cast<unsigned>(M[I]))
      return false;
  }

  return true;
}

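/// \return true if a G_SHUFFLE_VECTOR instruction \p MI can be replaced with a
/// G_EXT instruction, i.e. a byte-wise extraction of a contiguous run of
/// elements from the concatenation of the two sources.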
bool matchEXT(MachineInstr &MI, MachineRegisterInfo &MRI,
              ShuffleVectorPseudo &MatchInfo) {
  assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR);
  Register Dst = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(Dst);
  Register V1 = MI.getOperand(1).getReg();
  Register V2 = MI.getOperand(2).getReg();
  auto Mask = MI.getOperand(3).getShuffleMask();
  uint64_t Imm;
  auto ExtInfo = getExtMask(Mask, DstTy.getNumElements());
  uint64_t ExtFactor = MRI.getType(V1).getScalarSizeInBits() / 8;

  if (!ExtInfo) {
    if (!getOpcodeDef<GImplicitDef>(V2, MRI) ||
        !isSingletonExtMask(Mask, DstTy))
      return false;

    Imm = Mask[0] * ExtFactor;
    MatchInfo = ShuffleVectorPseudo(AArch64::G_EXT, Dst, {V1, V1, Imm});
    return true;
  }
  bool ReverseExt;
  std::tie(ReverseExt, Imm) = *ExtInfo;
  if (ReverseExt)
    std::swap(V1, V2);
  Imm *= ExtFactor;
  MatchInfo = ShuffleVectorPseudo(AArch64::G_EXT, Dst, {V1, V2, Imm});
  return true;
}

/// Replace a G_SHUFFLE_VECTOR instruction with a pseudo.
/// \p Opc is the opcode to use. \p MI is the G_SHUFFLE_VECTOR.
void applyShuffleVectorPseudo(MachineInstr &MI,
                              ShuffleVectorPseudo &MatchInfo) {
  MachineIRBuilder MIRBuilder(MI);
  MIRBuilder.buildInstr(MatchInfo.Opc, {MatchInfo.Dst}, MatchInfo.SrcOps);
  MI.eraseFromParent();
}

/// Replace a G_SHUFFLE_VECTOR instruction with G_EXT.
/// Special-cased because the constant operand must be emitted as a G_CONSTANT
/// for the imported tablegen patterns to work.
void applyEXT(MachineInstr &MI, ShuffleVectorPseudo &MatchInfo) {
  MachineIRBuilder MIRBuilder(MI);
  if (MatchInfo.SrcOps[2].getImm() == 0)
    MIRBuilder.buildCopy(MatchInfo.Dst, MatchInfo.SrcOps[0]);
  else {
    // Tablegen patterns expect an i32 G_CONSTANT as the final op.
    auto Cst =
        MIRBuilder.buildConstant(LLT::scalar(32), MatchInfo.SrcOps[2].getImm());
    MIRBuilder.buildInstr(MatchInfo.Opc, {MatchInfo.Dst},
                          {MatchInfo.SrcOps[0], MatchInfo.SrcOps[1], Cst});
  }
  MI.eraseFromParent();
}

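/// Lower a full reversal of a 128-bit vector: reverse the elements within each
/// 64-bit half with G_REV64, then swap the two halves with a G_EXT of 8 bytes.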
void applyFullRev(MachineInstr &MI, MachineRegisterInfo &MRI) {
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();
  LLT DstTy = MRI.getType(Dst);
  assert(DstTy.getSizeInBits() == 128 &&
         "Expected 128bit vector in applyFullRev");
  MachineIRBuilder MIRBuilder(MI);
  auto Cst = MIRBuilder.buildConstant(LLT::scalar(32), 8);
  auto Rev = MIRBuilder.buildInstr(AArch64::G_REV64, {DstTy}, {Src});
  MIRBuilder.buildInstr(AArch64::G_EXT, {Dst}, {Rev, Rev, Cst});
  MI.eraseFromParent();
}

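/// \return true if the G_INSERT_VECTOR_ELT \p MI has a non-constant lane
/// index. Such inserts are lowered by applyNonConstInsert via a stack
/// temporary: spill the vector, store the element at the computed address,
/// then reload the whole vector.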
bool matchNonConstInsert(MachineInstr &MI, MachineRegisterInfo &MRI) {
  assert(MI.getOpcode() == TargetOpcode::G_INSERT_VECTOR_ELT);

  auto ValAndVReg =
      getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
  return !ValAndVReg;
}

void applyNonConstInsert(MachineInstr &MI, MachineRegisterInfo &MRI,
                         MachineIRBuilder &Builder) {
  auto &Insert = cast<GInsertVectorElement>(MI);
  Builder.setInstrAndDebugLoc(Insert);

  Register Offset = Insert.getIndexReg();
  LLT VecTy = MRI.getType(Insert.getReg(0));
  LLT EltTy = MRI.getType(Insert.getElementReg());
  LLT IdxTy = MRI.getType(Insert.getIndexReg());

  if (VecTy.isScalableVector())
    return;

  // Create a stack slot and store the vector into it
  MachineFunction &MF = Builder.getMF();
  Align Alignment(
      std::min<uint64_t>(VecTy.getSizeInBytes().getKnownMinValue(), 16));
  int FrameIdx = MF.getFrameInfo().CreateStackObject(VecTy.getSizeInBytes(),
                                                     Alignment, false);
  LLT FramePtrTy = LLT::pointer(0, 64);
  MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIdx);
  auto StackTemp = Builder.buildFrameIndex(FramePtrTy, FrameIdx);

  Builder.buildStore(Insert.getOperand(1), StackTemp, PtrInfo, Align(8));

  // Get the pointer to the element, and be sure not to hit undefined behavior
  // if the index is out of bounds.
  assert(isPowerOf2_64(VecTy.getNumElements()) &&
         "Expected a power-2 vector size");
  auto Mask = Builder.buildConstant(IdxTy, VecTy.getNumElements() - 1);
  Register And = Builder.buildAnd(IdxTy, Offset, Mask).getReg(0);
  auto EltSize = Builder.buildConstant(IdxTy, EltTy.getSizeInBytes());
  Register Mul = Builder.buildMul(IdxTy, And, EltSize).getReg(0);
  Register EltPtr =
      Builder.buildPtrAdd(MRI.getType(StackTemp.getReg(0)), StackTemp, Mul)
          .getReg(0);

  // Write the inserted element
  Builder.buildStore(Insert.getElementReg(), EltPtr, PtrInfo, Align(1));
  // Reload the whole vector.
  Builder.buildLoad(Insert.getReg(0), StackTemp, PtrInfo, Align(8));
  Insert.eraseFromParent();
}

/// Match a G_SHUFFLE_VECTOR with a mask which corresponds to a
/// G_INSERT_VECTOR_ELT and G_EXTRACT_VECTOR_ELT pair.
///
/// e.g.
///   %shuf = G_SHUFFLE_VECTOR %left, %right, shufflemask(0, 0)
///
/// Can be represented as
///
///   %extract = G_EXTRACT_VECTOR_ELT %left, 0
///   %ins = G_INSERT_VECTOR_ELT %left, %extract, 1
///
bool matchINS(MachineInstr &MI, MachineRegisterInfo &MRI,
              std::tuple<Register, int, Register, int> &MatchInfo) {
  assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR);
  ArrayRef<int> ShuffleMask = MI.getOperand(3).getShuffleMask();
  Register Dst = MI.getOperand(0).getReg();
  int NumElts = MRI.getType(Dst).getNumElements();
  auto DstIsLeftAndDstLane = isINSMask(ShuffleMask, NumElts);
  if (!DstIsLeftAndDstLane)
    return false;
  bool DstIsLeft;
  int DstLane;
  std::tie(DstIsLeft, DstLane) = *DstIsLeftAndDstLane;
  Register Left = MI.getOperand(1).getReg();
  Register Right = MI.getOperand(2).getReg();
  Register DstVec = DstIsLeft ? Left : Right;
  Register SrcVec = Left;

  int SrcLane = ShuffleMask[DstLane];
  if (SrcLane >= NumElts) {
    SrcVec = Right;
    SrcLane -= NumElts;
  }

  MatchInfo = std::make_tuple(DstVec, DstLane, SrcVec, SrcLane);
  return true;
}

void applyINS(MachineInstr &MI, MachineRegisterInfo &MRI,
              MachineIRBuilder &Builder,
              std::tuple<Register, int, Register, int> &MatchInfo) {
  Builder.setInstrAndDebugLoc(MI);
  Register Dst = MI.getOperand(0).getReg();
  auto ScalarTy = MRI.getType(Dst).getElementType();
  Register DstVec, SrcVec;
  int DstLane, SrcLane;
  std::tie(DstVec, DstLane, SrcVec, SrcLane) = MatchInfo;
  auto SrcCst = Builder.buildConstant(LLT::scalar(64), SrcLane);
  auto Extract = Builder.buildExtractVectorElement(ScalarTy, SrcVec, SrcCst);
  auto DstCst = Builder.buildConstant(LLT::scalar(64), DstLane);
  Builder.buildInsertVectorElement(Dst, DstVec, Extract, DstCst);
  MI.eraseFromParent();
}

/// isVShiftRImm - Check if this is a valid vector splat for the immediate
/// operand of a vector shift right operation. The value must be in the range:
/// 1 <= Value <= ElementBits for a right shift.
bool isVShiftRImm(Register Reg, MachineRegisterInfo &MRI, LLT Ty,
                  int64_t &Cnt) {
  assert(Ty.isVector() && "vector shift count is not a vector type");
  MachineInstr *MI = MRI.getVRegDef(Reg);
  auto Cst = getAArch64VectorSplatScalar(*MI, MRI);
  if (!Cst)
    return false;
  Cnt = *Cst;
  int64_t ElementBits = Ty.getScalarSizeInBits();
  return Cnt >= 1 && Cnt <= ElementBits;
}

/// Match a vector G_ASHR or G_LSHR with a valid immediate shift.
bool matchVAshrLshrImm(MachineInstr &MI, MachineRegisterInfo &MRI,
                       int64_t &Imm) {
  assert(MI.getOpcode() == TargetOpcode::G_ASHR ||
         MI.getOpcode() == TargetOpcode::G_LSHR);
  LLT Ty = MRI.getType(MI.getOperand(1).getReg());
  if (!Ty.isVector())
    return false;
  return isVShiftRImm(MI.getOperand(2).getReg(), MRI, Ty, Imm);
}

void applyVAshrLshrImm(MachineInstr &MI, MachineRegisterInfo &MRI,
                       int64_t &Imm) {
  unsigned Opc = MI.getOpcode();
  assert(Opc == TargetOpcode::G_ASHR || Opc == TargetOpcode::G_LSHR);
  unsigned NewOpc =
      Opc == TargetOpcode::G_ASHR ? AArch64::G_VASHR : AArch64::G_VLSHR;
  MachineIRBuilder MIB(MI);
  auto ImmDef = MIB.buildConstant(LLT::scalar(32), Imm);
  MIB.buildInstr(NewOpc, {MI.getOperand(0)}, {MI.getOperand(1), ImmDef});
  MI.eraseFromParent();
}

/// Determine if it is possible to modify the \p RHS and predicate \p P of a
/// G_ICMP instruction such that the right-hand side is an arithmetic immediate.
///
/// \returns A pair containing the updated immediate and predicate which may
/// be used to optimize the instruction.
///
/// \note This assumes that the comparison has been legalized.
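///
/// For example, (x ult 4097) cannot use 4097 directly (it is not a valid
/// 12-bit, optionally shifted, arithmetic immediate), but it can be rewritten
/// as (x ule 4096), and 4096 is encodable.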
std::optional<std::pair<uint64_t, CmpInst::Predicate>>
tryAdjustICmpImmAndPred(Register RHS, CmpInst::Predicate P,
                        const MachineRegisterInfo &MRI) {
  const auto &Ty = MRI.getType(RHS);
  if (Ty.isVector())
    return std::nullopt;
  unsigned Size = Ty.getSizeInBits();
  assert((Size == 32 || Size == 64) && "Expected 32 or 64 bit compare only?");

  // If the RHS is not a constant, or the RHS is already a valid arithmetic
  // immediate, then there is nothing to change.
  auto ValAndVReg = getIConstantVRegValWithLookThrough(RHS, MRI);
  if (!ValAndVReg)
    return std::nullopt;
  uint64_t OriginalC = ValAndVReg->Value.getZExtValue();
  uint64_t C = OriginalC;
  if (isLegalArithImmed(C))
    return std::nullopt;

  // We have a non-arithmetic immediate. Check if adjusting the immediate and
  // adjusting the predicate will result in a legal arithmetic immediate.
  switch (P) {
  default:
    return std::nullopt;
  case CmpInst::ICMP_SLT:
  case CmpInst::ICMP_SGE:
    // Check for
    //
    // x slt c => x sle c - 1
    // x sge c => x sgt c - 1
    //
    // When c is not the smallest possible negative number.
    if ((Size == 64 && static_cast<int64_t>(C) == INT64_MIN) ||
        (Size == 32 && static_cast<int32_t>(C) == INT32_MIN))
      return std::nullopt;
    P = (P == CmpInst::ICMP_SLT) ? CmpInst::ICMP_SLE : CmpInst::ICMP_SGT;
    C -= 1;
    break;
  case CmpInst::ICMP_ULT:
  case CmpInst::ICMP_UGE:
    // Check for
    //
    // x ult c => x ule c - 1
    // x uge c => x ugt c - 1
    //
    // When c is not zero.
    if (C == 0)
      return std::nullopt;
    P = (P == CmpInst::ICMP_ULT) ? CmpInst::ICMP_ULE : CmpInst::ICMP_UGT;
    C -= 1;
    break;
  case CmpInst::ICMP_SLE:
  case CmpInst::ICMP_SGT:
    // Check for
    //
    // x sle c => x slt c + 1
    // x sgt c => x sge c + 1
    //
    // When c is not the largest possible signed integer.
    if ((Size == 32 && static_cast<int32_t>(C) == INT32_MAX) ||
        (Size == 64 && static_cast<int64_t>(C) == INT64_MAX))
      return std::nullopt;
    P = (P == CmpInst::ICMP_SLE) ? CmpInst::ICMP_SLT : CmpInst::ICMP_SGE;
    C += 1;
    break;
  case CmpInst::ICMP_ULE:
  case CmpInst::ICMP_UGT:
    // Check for
    //
    // x ule c => x ult c + 1
    // x ugt c => x uge c + 1
    //
    // When c is not the largest possible unsigned integer.
    if ((Size == 32 && static_cast<uint32_t>(C) == UINT32_MAX) ||
        (Size == 64 && C == UINT64_MAX))
      return std::nullopt;
    P = (P == CmpInst::ICMP_ULE) ? CmpInst::ICMP_ULT : CmpInst::ICMP_UGE;
    C += 1;
    break;
  }

  // Check if the new constant is valid, and return the updated constant and
  // predicate if it is.
  if (Size == 32)
    C = static_cast<uint32_t>(C);
  if (isLegalArithImmed(C))
    return {{C, P}};

  auto IsMaterializableInSingleInstruction = [=](uint64_t Imm) {
    SmallVector<AArch64_IMM::ImmInsnModel> Insn;
    AArch64_IMM::expandMOVImm(Imm, 32, Insn);
    return Insn.size() == 1;
  };

  if (!IsMaterializableInSingleInstruction(OriginalC) &&
      IsMaterializableInSingleInstruction(C))
    return {{C, P}};

  return std::nullopt;
}

/// Determine whether or not it is possible to update the RHS and predicate of
/// a G_ICMP instruction such that the RHS will be selected as an arithmetic
/// immediate.
///
/// \p MI - The G_ICMP instruction
/// \p MatchInfo - The new RHS immediate and predicate on success
///
/// See tryAdjustICmpImmAndPred for valid transformations.
bool matchAdjustICmpImmAndPred(
    MachineInstr &MI, const MachineRegisterInfo &MRI,
    std::pair<uint64_t, CmpInst::Predicate> &MatchInfo) {
  assert(MI.getOpcode() == TargetOpcode::G_ICMP);
  Register RHS = MI.getOperand(3).getReg();
  auto Pred = static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
  if (auto MaybeNewImmAndPred = tryAdjustICmpImmAndPred(RHS, Pred, MRI)) {
    MatchInfo = *MaybeNewImmAndPred;
    return true;
  }
  return false;
}

void applyAdjustICmpImmAndPred(
    MachineInstr &MI, std::pair<uint64_t, CmpInst::Predicate> &MatchInfo,
    MachineIRBuilder &MIB, GISelChangeObserver &Observer) {
  MIB.setInstrAndDebugLoc(MI);
  MachineOperand &RHS = MI.getOperand(3);
  MachineRegisterInfo &MRI = *MIB.getMRI();
  auto Cst = MIB.buildConstant(MRI.cloneVirtualRegister(RHS.getReg()),
                               MatchInfo.first);
  Observer.changingInstr(MI);
  RHS.setReg(Cst->getOperand(0).getReg());
  MI.getOperand(1).setPredicate(MatchInfo.second);
  Observer.changedInstr(MI);
}

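/// \return true if a G_SHUFFLE_VECTOR \p MI that splats a single lane of its
/// first source can be replaced with a G_DUPLANE{8,16,32,64} pseudo, keyed by
/// the element size.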
bool matchDupLane(MachineInstr &MI, MachineRegisterInfo &MRI,
                  std::pair<unsigned, int> &MatchInfo) {
  assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR);
  Register Src1Reg = MI.getOperand(1).getReg();
  const LLT SrcTy = MRI.getType(Src1Reg);
  const LLT DstTy = MRI.getType(MI.getOperand(0).getReg());

  auto LaneIdx = getSplatIndex(MI);
  if (!LaneIdx)
    return false;

  // The lane idx should be within the first source vector.
  if (*LaneIdx >= SrcTy.getNumElements())
    return false;

  if (DstTy != SrcTy)
    return false;

  LLT ScalarTy = SrcTy.getElementType();
  unsigned ScalarSize = ScalarTy.getSizeInBits();

  unsigned Opc = 0;
  switch (SrcTy.getNumElements()) {
  case 2:
    if (ScalarSize == 64)
      Opc = AArch64::G_DUPLANE64;
    else if (ScalarSize == 32)
      Opc = AArch64::G_DUPLANE32;
    break;
  case 4:
    if (ScalarSize == 32)
      Opc = AArch64::G_DUPLANE32;
    else if (ScalarSize == 16)
      Opc = AArch64::G_DUPLANE16;
    break;
  case 8:
    if (ScalarSize == 8)
      Opc = AArch64::G_DUPLANE8;
    else if (ScalarSize == 16)
      Opc = AArch64::G_DUPLANE16;
    break;
  case 16:
    if (ScalarSize == 8)
      Opc = AArch64::G_DUPLANE8;
    break;
  default:
    break;
  }
  if (!Opc)
    return false;

  MatchInfo.first = Opc;
  MatchInfo.second = *LaneIdx;
  return true;
}

void applyDupLane(MachineInstr &MI, MachineRegisterInfo &MRI,
                  MachineIRBuilder &B, std::pair<unsigned, int> &MatchInfo) {
  assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR);
  Register Src1Reg = MI.getOperand(1).getReg();
  const LLT SrcTy = MRI.getType(Src1Reg);

  B.setInstrAndDebugLoc(MI);
  auto Lane = B.buildConstant(LLT::scalar(64), MatchInfo.second);

  Register DupSrc = MI.getOperand(1).getReg();
  // For types like <2 x s32>, we can use G_DUPLANE32, with a <4 x s32> source.
  // To do this, we can use a G_CONCAT_VECTORS to do the widening.
  if (SrcTy.getSizeInBits() == 64) {
    auto Undef = B.buildUndef(SrcTy);
    DupSrc = B.buildConcatVectors(SrcTy.multiplyElements(2),
                                  {Src1Reg, Undef.getReg(0)})
                 .getReg(0);
  }
  B.buildInstr(MatchInfo.first, {MI.getOperand(0).getReg()}, {DupSrc, Lane});
  MI.eraseFromParent();
}

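/// \return true if a G_UNMERGE_VALUES \p MI takes a 64- or 128-bit fixed-width
/// vector apart into all of its individual elements, so that it can instead be
/// rewritten as one G_EXTRACT_VECTOR_ELT per lane.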
bool matchScalarizeVectorUnmerge(MachineInstr &MI, MachineRegisterInfo &MRI) {
  auto &Unmerge = cast<GUnmerge>(MI);
  Register Src1Reg = Unmerge.getReg(Unmerge.getNumOperands() - 1);
  const LLT SrcTy = MRI.getType(Src1Reg);
  if (SrcTy.getSizeInBits() != 128 && SrcTy.getSizeInBits() != 64)
    return false;
  return SrcTy.isVector() && !SrcTy.isScalable() &&
         Unmerge.getNumOperands() == (unsigned)SrcTy.getNumElements() + 1;
}

void applyScalarizeVectorUnmerge(MachineInstr &MI, MachineRegisterInfo &MRI,
                                 MachineIRBuilder &B) {
  auto &Unmerge = cast<GUnmerge>(MI);
  Register Src1Reg = Unmerge.getReg(Unmerge.getNumOperands() - 1);
  const LLT SrcTy = MRI.getType(Src1Reg);
  assert((SrcTy.isVector() && !SrcTy.isScalable()) &&
         "Expected a fixed length vector");

  for (int I = 0; I < SrcTy.getNumElements(); ++I)
    B.buildExtractVectorElementConstant(Unmerge.getReg(I), Src1Reg, I);
  MI.eraseFromParent();
}

bool matchBuildVectorToDup(MachineInstr &MI, MachineRegisterInfo &MRI) {
  assert(MI.getOpcode() == TargetOpcode::G_BUILD_VECTOR);

  // Later, during selection, we'll try to match imported patterns using
  // immAllOnesV and immAllZerosV. These require G_BUILD_VECTOR. Don't lower
  // G_BUILD_VECTORs which could match those patterns.
  if (isBuildVectorAllZeros(MI, MRI) || isBuildVectorAllOnes(MI, MRI))
    return false;

  return getAArch64VectorSplat(MI, MRI).has_value();
}

void applyBuildVectorToDup(MachineInstr &MI, MachineRegisterInfo &MRI,
                           MachineIRBuilder &B) {
  B.setInstrAndDebugLoc(MI);
  B.buildInstr(AArch64::G_DUP, {MI.getOperand(0).getReg()},
               {MI.getOperand(1).getReg()});
  MI.eraseFromParent();
}

/// \returns how many instructions would be saved by folding a G_ICMP's shift
/// and/or extension operations.
unsigned getCmpOperandFoldingProfit(Register CmpOp, MachineRegisterInfo &MRI) {
  // No instructions to save if there's more than one use or no uses.
  if (!MRI.hasOneNonDBGUse(CmpOp))
    return 0;

  // FIXME: This is duplicated with the selector. (See: selectShiftedRegister)
  auto IsSupportedExtend = [&](const MachineInstr &MI) {
    if (MI.getOpcode() == TargetOpcode::G_SEXT_INREG)
      return true;
    if (MI.getOpcode() != TargetOpcode::G_AND)
      return false;
    auto ValAndVReg =
        getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
    if (!ValAndVReg)
      return false;
    uint64_t Mask = ValAndVReg->Value.getZExtValue();
    return (Mask == 0xFF || Mask == 0xFFFF || Mask == 0xFFFFFFFF);
  };

  MachineInstr *Def = getDefIgnoringCopies(CmpOp, MRI);
  if (IsSupportedExtend(*Def))
    return 1;

  unsigned Opc = Def->getOpcode();
  if (Opc != TargetOpcode::G_SHL && Opc != TargetOpcode::G_ASHR &&
      Opc != TargetOpcode::G_LSHR)
    return 0;

  auto MaybeShiftAmt =
      getIConstantVRegValWithLookThrough(Def->getOperand(2).getReg(), MRI);
  if (!MaybeShiftAmt)
    return 0;
  uint64_t ShiftAmt = MaybeShiftAmt->Value.getZExtValue();
  MachineInstr *ShiftLHS =
      getDefIgnoringCopies(Def->getOperand(1).getReg(), MRI);

  // Check if we can fold an extend and a shift.
  // FIXME: This is duplicated with the selector. (See:
  // selectArithExtendedRegister)
  if (IsSupportedExtend(*ShiftLHS))
    return (ShiftAmt <= 4) ? 2 : 1;

  LLT Ty = MRI.getType(Def->getOperand(0).getReg());
  if (Ty.isVector())
    return 0;
  unsigned ShiftSize = Ty.getSizeInBits();
  if ((ShiftSize == 32 && ShiftAmt <= 31) ||
      (ShiftSize == 64 && ShiftAmt <= 63))
    return 1;
  return 0;
}

/// \returns true if it would be profitable to swap the LHS and RHS of a G_ICMP
/// instruction \p MI.
bool trySwapICmpOperands(MachineInstr &MI, MachineRegisterInfo &MRI) {
  assert(MI.getOpcode() == TargetOpcode::G_ICMP);
  // Swap the operands if it would introduce a profitable folding opportunity.
  // (e.g. a shift + extend).
  //
  // For example:
  //   lsl w13, w11, #1
  //   cmp w13, w12
  // can be turned into:
  //   cmp w12, w11, lsl #1

  // Don't swap if there's a constant on the RHS, because we know we can fold
  // that.
  Register RHS = MI.getOperand(3).getReg();
  auto RHSCst = getIConstantVRegValWithLookThrough(RHS, MRI);
  if (RHSCst && isLegalArithImmed(RHSCst->Value.getSExtValue()))
    return false;

  Register LHS = MI.getOperand(2).getReg();
  auto Pred = static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
  auto GetRegForProfit = [&](Register Reg) {
    MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
    return isCMN(Def, Pred, MRI) ? Def->getOperand(2).getReg() : Reg;
  };

  // Don't have a constant on the RHS. If we swap the LHS and RHS of the
  // compare, would we be able to fold more instructions?
  Register TheLHS = GetRegForProfit(LHS);
  Register TheRHS = GetRegForProfit(RHS);

  // If the LHS is more likely to give us a folding opportunity, then swap the
  // LHS and RHS.
  return (getCmpOperandFoldingProfit(TheLHS, MRI) >
          getCmpOperandFoldingProfit(TheRHS, MRI));
}

void applySwapICmpOperands(MachineInstr &MI, GISelChangeObserver &Observer) {
  auto Pred = static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
  Register LHS = MI.getOperand(2).getReg();
  Register RHS = MI.getOperand(3).getReg();
  Observer.changedInstr(MI);
  MI.getOperand(1).setPredicate(CmpInst::getSwappedPredicate(Pred));
  MI.getOperand(2).setReg(RHS);
  MI.getOperand(3).setReg(LHS);
  Observer.changedInstr(MI);
}

/// \returns a function which builds a vector floating point compare instruction
/// for a condition code \p CC.
/// \param [in] NoNans - True if the target has NoNansFPMath.
std::function<Register(MachineIRBuilder &)>
getVectorFCMP(AArch64CC::CondCode CC, Register LHS, Register RHS, bool NoNans,
              MachineRegisterInfo &MRI) {
  LLT DstTy = MRI.getType(LHS);
  assert(DstTy.isVector() && "Expected vector types only?");
  assert(DstTy == MRI.getType(RHS) && "Src and Dst types must match!");
  switch (CC) {
  default:
    llvm_unreachable("Unexpected condition code!");
  case AArch64CC::NE:
    return [LHS, RHS, DstTy](MachineIRBuilder &MIB) {
      auto FCmp = MIB.buildInstr(AArch64::G_FCMEQ, {DstTy}, {LHS, RHS});
      return MIB.buildNot(DstTy, FCmp).getReg(0);
    };
  case AArch64CC::EQ:
    return [LHS, RHS, DstTy](MachineIRBuilder &MIB) {
      return MIB.buildInstr(AArch64::G_FCMEQ, {DstTy}, {LHS, RHS}).getReg(0);
    };
  case AArch64CC::GE:
    return [LHS, RHS, DstTy](MachineIRBuilder &MIB) {
      return MIB.buildInstr(AArch64::G_FCMGE, {DstTy}, {LHS, RHS}).getReg(0);
    };
  case AArch64CC::GT:
    return [LHS, RHS, DstTy](MachineIRBuilder &MIB) {
      return MIB.buildInstr(AArch64::G_FCMGT, {DstTy}, {LHS, RHS}).getReg(0);
    };
  case AArch64CC::LS:
    return [LHS, RHS, DstTy](MachineIRBuilder &MIB) {
      return MIB.buildInstr(AArch64::G_FCMGE, {DstTy}, {RHS, LHS}).getReg(0);
    };
  case AArch64CC::MI:
    return [LHS, RHS, DstTy](MachineIRBuilder &MIB) {
      return MIB.buildInstr(AArch64::G_FCMGT, {DstTy}, {RHS, LHS}).getReg(0);
    };
  }
}

/// Try to lower a vector G_FCMP \p MI into an AArch64-specific pseudo.
bool matchLowerVectorFCMP(MachineInstr &MI, MachineRegisterInfo &MRI,
                          MachineIRBuilder &MIB) {
  assert(MI.getOpcode() == TargetOpcode::G_FCMP);
  const auto &ST = MI.getMF()->getSubtarget<AArch64Subtarget>();

  Register Dst = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(Dst);
  if (!DstTy.isVector() || !ST.hasNEON())
    return false;
  Register LHS = MI.getOperand(2).getReg();
  unsigned EltSize = MRI.getType(LHS).getScalarSizeInBits();
  if (EltSize == 16 && !ST.hasFullFP16())
    return false;
  if (EltSize != 16 && EltSize != 32 && EltSize != 64)
    return false;

  return true;
}

/// Try to lower a vector G_FCMP \p MI into an AArch64-specific pseudo.
void applyLowerVectorFCMP(MachineInstr &MI, MachineRegisterInfo &MRI,
                          MachineIRBuilder &MIB) {
  assert(MI.getOpcode() == TargetOpcode::G_FCMP);
  const auto &ST = MI.getMF()->getSubtarget<AArch64Subtarget>();

  const auto &CmpMI = cast<GFCmp>(MI);

  Register Dst = CmpMI.getReg(0);
  CmpInst::Predicate Pred = CmpMI.getCond();
  Register LHS = CmpMI.getLHSReg();
  Register RHS = CmpMI.getRHSReg();

  LLT DstTy = MRI.getType(Dst);

  bool Invert = false;
  AArch64CC::CondCode CC, CC2 = AArch64CC::AL;
  if ((Pred == CmpInst::Predicate::FCMP_ORD ||
       Pred == CmpInst::Predicate::FCMP_UNO) &&
      isBuildVectorAllZeros(*MRI.getVRegDef(RHS), MRI)) {
    // The special case "fcmp ord %a, 0" is the canonical check that LHS isn't
    // NaN, so equivalent to a == a and doesn't need the two comparisons an
    // "ord" normally would.
    // Similarly, "fcmp uno %a, 0" is the canonical check that LHS is NaN and is
    // thus equivalent to a != a.
    RHS = LHS;
    CC = Pred == CmpInst::Predicate::FCMP_ORD ? AArch64CC::EQ : AArch64CC::NE;
  } else
    changeVectorFCMPPredToAArch64CC(Pred, CC, CC2, Invert);

  // Instead of having an apply function, just build here to simplify things.
  MIB.setInstrAndDebugLoc(MI);

  const bool NoNans =
      ST.getTargetLowering()->getTargetMachine().Options.NoNaNsFPMath;

  auto Cmp = getVectorFCMP(CC, LHS, RHS, NoNans, MRI);
  Register CmpRes;
  if (CC2 == AArch64CC::AL)
    CmpRes = Cmp(MIB);
  else {
    auto Cmp2 = getVectorFCMP(CC2, LHS, RHS, NoNans, MRI);
    auto Cmp2Dst = Cmp2(MIB);
    auto Cmp1Dst = Cmp(MIB);
    CmpRes = MIB.buildOr(DstTy, Cmp1Dst, Cmp2Dst).getReg(0);
  }
  if (Invert)
    CmpRes = MIB.buildNot(DstTy, CmpRes).getReg(0);
  MRI.replaceRegWith(Dst, CmpRes);
  MI.eraseFromParent();
}

// Matches G_BUILD_VECTOR where at least one source operand is not a constant
bool matchLowerBuildToInsertVecElt(MachineInstr &MI, MachineRegisterInfo &MRI) {
  auto *GBuildVec = cast<GBuildVector>(&MI);

  // Check if the values are all constants
  for (unsigned I = 0; I < GBuildVec->getNumSources(); ++I) {
    auto ConstVal =
        getAnyConstantVRegValWithLookThrough(GBuildVec->getSourceReg(I), MRI);

    if (!ConstVal.has_value())
      return true;
  }

  return false;
}

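/// Lower a G_BUILD_VECTOR with non-constant sources into a chain of
/// G_INSERT_VECTOR_ELTs on an undef vector, skipping any undef sources.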
void applyLowerBuildToInsertVecElt(MachineInstr &MI, MachineRegisterInfo &MRI,
                                   MachineIRBuilder &B) {
  auto *GBuildVec = cast<GBuildVector>(&MI);
  LLT DstTy = MRI.getType(GBuildVec->getReg(0));
  Register DstReg = B.buildUndef(DstTy).getReg(0);

  for (unsigned I = 0; I < GBuildVec->getNumSources(); ++I) {
    Register SrcReg = GBuildVec->getSourceReg(I);
    if (mi_match(SrcReg, MRI, m_GImplicitDef()))
      continue;
    auto IdxReg = B.buildConstant(LLT::scalar(64), I);
    DstReg =
        B.buildInsertVectorElement(DstTy, DstReg, SrcReg, IdxReg).getReg(0);
  }
  B.buildCopy(GBuildVec->getReg(0), DstReg);
  GBuildVec->eraseFromParent();
}

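/// Match a G_STORE of a G_TRUNC so that it can be rewritten as a truncating
/// store of the wider source value.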
bool matchFormTruncstore(MachineInstr &MI, MachineRegisterInfo &MRI,
                         Register &SrcReg) {
  assert(MI.getOpcode() == TargetOpcode::G_STORE);
  Register DstReg = MI.getOperand(0).getReg();
  if (MRI.getType(DstReg).isVector())
    return false;
  // Match a store of a truncate.
  if (!mi_match(DstReg, MRI, m_GTrunc(m_Reg(SrcReg))))
    return false;
  // Only form truncstores for value types of max 64b.
  return MRI.getType(SrcReg).getSizeInBits() <= 64;
}

void applyFormTruncstore(MachineInstr &MI, MachineRegisterInfo &MRI,
                         MachineIRBuilder &B, GISelChangeObserver &Observer,
                         Register &SrcReg) {
  assert(MI.getOpcode() == TargetOpcode::G_STORE);
  Observer.changingInstr(MI);
  MI.getOperand(0).setReg(SrcReg);
  Observer.changedInstr(MI);
}

// Lower vector G_SEXT_INREG back to shifts for selection. We allowed them to
// form in the first place for combine opportunities, so any remaining ones at
// this stage need to be lowered back.
bool matchVectorSextInReg(MachineInstr &MI, MachineRegisterInfo &MRI) {
  assert(MI.getOpcode() == TargetOpcode::G_SEXT_INREG);
  Register DstReg = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(DstReg);
  return DstTy.isVector();
}

void applyVectorSextInReg(MachineInstr &MI, MachineRegisterInfo &MRI,
                          MachineIRBuilder &B, GISelChangeObserver &Observer) {
  assert(MI.getOpcode() == TargetOpcode::G_SEXT_INREG);
  B.setInstrAndDebugLoc(MI);
  LegalizerHelper Helper(*MI.getMF(), Observer, B);
  Helper.lower(MI, 0, /* Unused hint type */ LLT());
}

/// Combine <N x t>, unused = unmerge(G_EXT <2*N x t> v, undef, N)
///           => unused, <N x t> = unmerge v
bool matchUnmergeExtToUnmerge(MachineInstr &MI, MachineRegisterInfo &MRI,
                              Register &MatchInfo) {
  auto &Unmerge = cast<GUnmerge>(MI);
  if (Unmerge.getNumDefs() != 2)
    return false;
  if (!MRI.use_nodbg_empty(Unmerge.getReg(1)))
    return false;

  LLT DstTy = MRI.getType(Unmerge.getReg(0));
  if (!DstTy.isVector())
    return false;

  MachineInstr *Ext = getOpcodeDef(AArch64::G_EXT, Unmerge.getSourceReg(), MRI);
  if (!Ext)
    return false;

  Register ExtSrc1 = Ext->getOperand(1).getReg();
  Register ExtSrc2 = Ext->getOperand(2).getReg();
  auto LowestVal =
      getIConstantVRegValWithLookThrough(Ext->getOperand(3).getReg(), MRI);
  if (!LowestVal || LowestVal->Value.getZExtValue() != DstTy.getSizeInBytes())
    return false;

  if (!getOpcodeDef<GImplicitDef>(ExtSrc2, MRI))
    return false;

  MatchInfo = ExtSrc1;
  return true;
}

void applyUnmergeExtToUnmerge(MachineInstr &MI, MachineRegisterInfo &MRI,
                              MachineIRBuilder &B,
                              GISelChangeObserver &Observer, Register &SrcReg) {
  Observer.changingInstr(MI);
  // Swap dst registers.
  Register Dst1 = MI.getOperand(0).getReg();
  MI.getOperand(0).setReg(MI.getOperand(1).getReg());
  MI.getOperand(1).setReg(Dst1);
  MI.getOperand(2).setReg(SrcReg);
  Observer.changedInstr(MI);
}

// Match mul({z/s}ext, {z/s}ext) => {u/s}mull, OR match v2s64 mul instructions,
// which will then be scalarised later on. These two matches are done in one
// function to ensure that the order of matching will always be the same:
// try lowering MUL to MULL before trying to scalarize if needed.
bool matchMulv2s64(MachineInstr &MI, MachineRegisterInfo &MRI) {
  // Only match multiplies with a v2s64 destination.
  LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
  return DstTy == LLT::fixed_vector(2, 64);
}

void applyMulv2s64(MachineInstr &MI, MachineRegisterInfo &MRI,
                   MachineIRBuilder &B, GISelChangeObserver &Observer) {
  assert(MI.getOpcode() == TargetOpcode::G_MUL &&
         "Expected a G_MUL instruction");

  // Split the v2s64 multiply into two scalar multiplies.
  LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
  assert(DstTy == LLT::fixed_vector(2, 64) && "Expected v2s64 Mul");
  LegalizerHelper Helper(*MI.getMF(), Observer, B);
  Helper.fewerElementsVector(
      MI, 0,
      DstTy.changeElementCount(DstTy.getElementCount().divideCoefficientBy(2)));
}

class AArch64PostLegalizerLoweringImpl : public Combiner {
protected:
  const CombinerHelper Helper;
  const AArch64PostLegalizerLoweringImplRuleConfig &RuleConfig;
  const AArch64Subtarget &STI;

public:
  AArch64PostLegalizerLoweringImpl(
      MachineFunction &MF, CombinerInfo &CInfo, const TargetPassConfig *TPC,
      GISelCSEInfo *CSEInfo,
      const AArch64PostLegalizerLoweringImplRuleConfig &RuleConfig,
      const AArch64Subtarget &STI);

  static const char *getName() { return "AArch64PostLegalizerLowering"; }

  bool tryCombineAll(MachineInstr &I) const override;

private:
#define GET_GICOMBINER_CLASS_MEMBERS
#include "AArch64GenPostLegalizeGILowering.inc"
#undef GET_GICOMBINER_CLASS_MEMBERS
};

#define GET_GICOMBINER_IMPL
#include "AArch64GenPostLegalizeGILowering.inc"
#undef GET_GICOMBINER_IMPL

AArch64PostLegalizerLoweringImpl::AArch64PostLegalizerLoweringImpl(
    MachineFunction &MF, CombinerInfo &CInfo, const TargetPassConfig *TPC,
    GISelCSEInfo *CSEInfo,
    const AArch64PostLegalizerLoweringImplRuleConfig &RuleConfig,
    const AArch64Subtarget &STI)
    : Combiner(MF, CInfo, TPC, /*VT*/ nullptr, CSEInfo),
      Helper(Observer, B, /*IsPreLegalize*/ true), RuleConfig(RuleConfig),
      STI(STI),
#define GET_GICOMBINER_CONSTRUCTOR_INITS
#include "AArch64GenPostLegalizeGILowering.inc"
#undef GET_GICOMBINER_CONSTRUCTOR_INITS
{
}

class AArch64PostLegalizerLowering : public MachineFunctionPass {
public:
  static char ID;

  AArch64PostLegalizerLowering();

  StringRef getPassName() const override {
    return "AArch64PostLegalizerLowering";
  }

  bool runOnMachineFunction(MachineFunction &MF) override;
  void getAnalysisUsage(AnalysisUsage &AU) const override;

private:
  AArch64PostLegalizerLoweringImplRuleConfig RuleConfig;
};
} // end anonymous namespace

void AArch64PostLegalizerLowering::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<TargetPassConfig>();
  AU.setPreservesCFG();
  getSelectionDAGFallbackAnalysisUsage(AU);
  MachineFunctionPass::getAnalysisUsage(AU);
}

AArch64PostLegalizerLowering::AArch64PostLegalizerLowering()
    : MachineFunctionPass(ID) {
  if (!RuleConfig.parseCommandLineOption())
    report_fatal_error("Invalid rule identifier");
}

bool AArch64PostLegalizerLowering::runOnMachineFunction(MachineFunction &MF) {
  if (MF.getProperties().hasFailedISel())
    return false;
  assert(MF.getProperties().hasLegalized() && "Expected a legalized function?");
  auto *TPC = &getAnalysis<TargetPassConfig>();
  const Function &F = MF.getFunction();

  const AArch64Subtarget &ST = MF.getSubtarget<AArch64Subtarget>();
  CombinerInfo CInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false,
                     /*LegalizerInfo*/ nullptr, /*OptEnabled=*/true,
                     F.hasOptSize(), F.hasMinSize());
  // Disable fixed-point iteration to reduce compile-time
  CInfo.MaxIterations = 1;
  CInfo.ObserverLvl = CombinerInfo::ObserverLevel::SinglePass;
  // PostLegalizerCombiner performs DCE, so a full DCE pass is unnecessary.
  CInfo.EnableFullDCE = false;
  AArch64PostLegalizerLoweringImpl Impl(MF, CInfo, TPC, /*CSEInfo*/ nullptr,
                                        RuleConfig, ST);
  return Impl.combineMachineInstrs();
}

char AArch64PostLegalizerLowering::ID = 0;
INITIALIZE_PASS_BEGIN(AArch64PostLegalizerLowering, DEBUG_TYPE,
                      "Lower AArch64 MachineInstrs after legalization", false,
                      false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_END(AArch64PostLegalizerLowering, DEBUG_TYPE,
                    "Lower AArch64 MachineInstrs after legalization", false,
                    false)

namespace llvm {
FunctionPass *createAArch64PostLegalizerLowering() {
  return new AArch64PostLegalizerLowering();
}
} // end namespace llvm