//===- AArch64LegalizerInfo.cpp ----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AArch64.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AArch64LegalizerInfo.h"
#include "AArch64Subtarget.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/MathExtras.h"
#include <initializer_list>

#define DEBUG_TYPE "aarch64-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
    : ST(&ST) {
  using namespace TargetOpcode;
  const LLT p0 = LLT::pointer(0, 64);
  const LLT s8 = LLT::scalar(8);
  const LLT s16 = LLT::scalar(16);
  const LLT s32 = LLT::scalar(32);
  const LLT s64 = LLT::scalar(64);
  const LLT s128 = LLT::scalar(128);
  const LLT v16s8 = LLT::fixed_vector(16, 8);
  const LLT v8s8 = LLT::fixed_vector(8, 8);
  const LLT v4s8 = LLT::fixed_vector(4, 8);
  const LLT v2s8 = LLT::fixed_vector(2, 8);
  const LLT v8s16 = LLT::fixed_vector(8, 16);
  const LLT v4s16 = LLT::fixed_vector(4, 16);
  const LLT v2s16 = LLT::fixed_vector(2, 16);
  const LLT v2s32 = LLT::fixed_vector(2, 32);
  const LLT v4s32 = LLT::fixed_vector(4, 32);
  const LLT v2s64 = LLT::fixed_vector(2, 64);
  const LLT v2p0 = LLT::fixed_vector(2, p0);

  const LLT nxv16s8 = LLT::scalable_vector(16, s8);
  const LLT nxv8s16 = LLT::scalable_vector(8, s16);
  const LLT nxv4s32 = LLT::scalable_vector(4, s32);
  const LLT nxv2s64 = LLT::scalable_vector(2, s64);

  std::initializer_list<LLT> PackedVectorAllTypeList = {/* Begin 128bit types */
                                                        v16s8, v8s16, v4s32,
                                                        v2s64, v2p0,
                                                        /* End 128bit types */
                                                        /* Begin 64bit types */
                                                        v8s8, v4s16, v2s32};
  std::initializer_list<LLT> ScalarAndPtrTypesList = {s8, s16, s32, s64, p0};
  SmallVector<LLT, 8> PackedVectorAllTypesVec(PackedVectorAllTypeList);
  SmallVector<LLT, 8> ScalarAndPtrTypesVec(ScalarAndPtrTypesList);

  const TargetMachine &TM = ST.getTargetLowering()->getTargetMachine();

  // FIXME: support subtargets which have neon/fp-armv8 disabled.
  if (!ST.hasNEON() || !ST.hasFPARMv8()) {
    getLegacyLegalizerInfo().computeTables();
    return;
  }

  // Some instructions only support s16 if the subtarget has full 16-bit FP
  // support.
  const bool HasFP16 = ST.hasFullFP16();
  const LLT &MinFPScalar = HasFP16 ? s16 : s32;

  const bool HasCSSC = ST.hasCSSC();
  const bool HasRCPC3 = ST.hasRCPC3();
  const bool HasSVE = ST.hasSVE();

  getActionDefinitionsBuilder(
      {G_IMPLICIT_DEF, G_FREEZE, G_CONSTANT_FOLD_BARRIER})
      .legalFor({p0, s8, s16, s32, s64})
      .legalFor({v2s8, v4s8, v8s8, v16s8, v2s16, v4s16, v8s16, v2s32, v4s32,
                 v2s64, v2p0})
      .widenScalarToNextPow2(0)
      .clampScalar(0, s8, s64)
      .moreElementsToNextPow2(0)
      .widenVectorEltsToVectorMinSize(0, 64)
      .clampNumElements(0, v8s8, v16s8)
      .clampNumElements(0, v4s16, v8s16)
      .clampNumElements(0, v2s32, v4s32)
      .clampMaxNumElements(0, s64, 2)
      .clampMaxNumElements(0, p0, 2)
      .scalarizeIf(scalarOrEltWiderThan(0, 64), 0);

  getActionDefinitionsBuilder(G_PHI)
      .legalFor({p0, s16, s32, s64})
      .legalFor(PackedVectorAllTypeList)
      .widenScalarToNextPow2(0)
      .moreElementsToNextPow2(0)
      .scalarizeIf(scalarOrEltWiderThan(0, 64), 0)
      .clampScalar(0, s16, s64)
      .clampNumElements(0, v8s8, v16s8)
      .clampNumElements(0, v4s16, v8s16)
      .clampNumElements(0, v2s32, v4s32)
      .clampMaxNumElements(0, s64, 2)
      .clampMaxNumElements(0, p0, 2);

  getActionDefinitionsBuilder(G_INSERT)
      .legalIf(all(typeInSet(0, {s32, s64, p0}), typeInSet(1, {s8, s16, s32}),
                   smallerThan(1, 0)))
      .widenScalarToNextPow2(0)
      .clampScalar(0, s32, s64)
      .widenScalarToNextPow2(1)
      .minScalar(1, s8)
      .maxScalarIf(typeInSet(0, {s32}), 1, s16)
      .maxScalarIf(typeInSet(0, {s64, p0}), 1, s32);

  getActionDefinitionsBuilder(G_EXTRACT)
      .legalIf(all(typeInSet(0, {s16, s32, s64, p0}),
                   typeInSet(1, {s32, s64, s128, p0}), smallerThan(0, 1)))
      .widenScalarToNextPow2(1)
      .clampScalar(1, s32, s128)
      .widenScalarToNextPow2(0)
      .minScalar(0, s16)
      .maxScalarIf(typeInSet(1, {s32}), 0, s16)
      .maxScalarIf(typeInSet(1, {s64, p0}), 0, s32)
      .maxScalarIf(typeInSet(1, {s128}), 0, s64);

  getActionDefinitionsBuilder({G_ADD, G_SUB, G_AND, G_OR, G_XOR})
      .legalFor({s32, s64, v8s8, v16s8, v4s16, v8s16, v2s32, v4s32, v2s64})
      .legalFor(HasSVE, {nxv16s8, nxv8s16, nxv4s32, nxv2s64})
      .widenScalarToNextPow2(0)
      .clampScalar(0, s32, s64)
      .clampMaxNumElements(0, s8, 16)
      .clampMaxNumElements(0, s16, 8)
      .clampNumElements(0, v2s32, v4s32)
      .clampNumElements(0, v2s64, v2s64)
      .minScalarOrEltIf(
          [=](const LegalityQuery &Query) {
            return Query.Types[0].getNumElements() <= 2;
          },
          0, s32)
      .minScalarOrEltIf(
          [=](const LegalityQuery &Query) {
            return Query.Types[0].getNumElements() <= 4;
          },
          0, s16)
      .minScalarOrEltIf(
          [=](const LegalityQuery &Query) {
            return Query.Types[0].getNumElements() <= 16;
          },
          0, s8)
      .scalarizeIf(scalarOrEltWiderThan(0, 64), 0)
      .moreElementsToNextPow2(0);

  getActionDefinitionsBuilder(G_MUL)
      .legalFor({s32, s64, v8s8, v16s8, v4s16, v8s16, v2s32, v4s32, v2s64})
      .widenScalarToNextPow2(0)
      .clampScalar(0, s32, s64)
      .clampMaxNumElements(0, s8, 16)
      .clampMaxNumElements(0, s16, 8)
      .clampNumElements(0, v2s32, v4s32)
      .clampNumElements(0, v2s64, v2s64)
      .minScalarOrEltIf(
          [=](const LegalityQuery &Query) {
            return Query.Types[0].getNumElements() <= 2;
          },
          0, s32)
      .minScalarOrEltIf(
          [=](const LegalityQuery &Query) {
            return Query.Types[0].getNumElements() <= 4;
          },
          0, s16)
      .minScalarOrEltIf(
          [=](const LegalityQuery &Query) {
            return Query.Types[0].getNumElements() <= 16;
          },
          0, s8)
      .scalarizeIf(scalarOrEltWiderThan(0, 64), 0)
      .moreElementsToNextPow2(0);

  getActionDefinitionsBuilder({G_SHL, G_ASHR, G_LSHR})
      .customIf([=](const LegalityQuery &Query) {
        const auto &SrcTy = Query.Types[0];
        const auto &AmtTy = Query.Types[1];
        return !SrcTy.isVector() && SrcTy.getSizeInBits() == 32 &&
               AmtTy.getSizeInBits() == 32;
      })
      .legalFor({
          {s32, s32},
          {s32, s64},
          {s64, s64},
          {v8s8, v8s8},
          {v16s8, v16s8},
          {v4s16, v4s16},
          {v8s16, v8s16},
          {v2s32, v2s32},
          {v4s32, v4s32},
          {v2s64, v2s64},
      })
      .widenScalarToNextPow2(0)
      .clampScalar(1, s32, s64)
      .clampScalar(0, s32, s64)
      .clampNumElements(0, v8s8, v16s8)
      .clampNumElements(0, v4s16, v8s16)
      .clampNumElements(0, v2s32, v4s32)
      .clampNumElements(0, v2s64, v2s64)
      .moreElementsToNextPow2(0)
      .minScalarSameAs(1, 0)
      .scalarizeIf(scalarOrEltWiderThan(0, 64), 0)
      .minScalarEltSameAsIf(isVector(0), 1, 0)
      .maxScalarEltSameAsIf(isVector(0), 1, 0);

  getActionDefinitionsBuilder(G_PTR_ADD)
      .legalFor({{p0, s64}, {v2p0, v2s64}})
      .clampScalarOrElt(1, s64, s64)
      .clampNumElements(0, v2p0, v2p0);

  getActionDefinitionsBuilder(G_PTRMASK).legalFor({{p0, s64}});

  getActionDefinitionsBuilder({G_SDIV, G_UDIV})
      .legalFor({s32, s64})
      .libcallFor({s128})
      .clampScalar(0, s32, s64)
      .widenScalarToNextPow2(0)
      .scalarize(0);

  getActionDefinitionsBuilder({G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
      .lowerFor({s8, s16, s32, s64, v2s32, v4s32, v2s64})
      .libcallFor({s128})
      .widenScalarOrEltToNextPow2(0)
      .minScalarOrElt(0, s32)
      .clampNumElements(0, v2s32, v4s32)
      .clampNumElements(0, v2s64, v2s64)
      .scalarize(0);

  getActionDefinitionsBuilder({G_SMULO, G_UMULO})
      .widenScalarToNextPow2(0, /*Min = */ 32)
      .clampScalar(0, s32, s64)
      .lower();

  getActionDefinitionsBuilder({G_SMULH, G_UMULH})
      .legalFor({s64, v16s8, v8s16, v4s32})
      .lower();

  getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
      .legalFor({v8s8, v16s8, v4s16, v8s16, v2s32, v4s32})
      .legalFor(HasCSSC, {s32, s64})
      .minScalar(HasCSSC, 0, s32)
      .clampNumElements(0, v8s8, v16s8)
      .clampNumElements(0, v4s16, v8s16)
      .clampNumElements(0, v2s32, v4s32)
      .lower();

  // FIXME: Legal vector types are only legal with NEON.
  getActionDefinitionsBuilder(G_ABS)
      .legalFor(HasCSSC, {s32, s64})
      .legalFor(PackedVectorAllTypeList)
      .customIf([=](const LegalityQuery &Q) {
        // TODO: Fix suboptimal codegen for 128+ bit types.
        LLT SrcTy = Q.Types[0];
        return SrcTy.isScalar() && SrcTy.getSizeInBits() < 128;
      })
      .widenScalarIf(
          [=](const LegalityQuery &Query) { return Query.Types[0] == v4s8; },
          [=](const LegalityQuery &Query) { return std::make_pair(0, v4s16); })
      .widenScalarIf(
          [=](const LegalityQuery &Query) { return Query.Types[0] == v2s16; },
          [=](const LegalityQuery &Query) { return std::make_pair(0, v2s32); })
      .clampNumElements(0, v8s8, v16s8)
      .clampNumElements(0, v4s16, v8s16)
      .clampNumElements(0, v2s32, v4s32)
      .clampNumElements(0, v2s64, v2s64)
      .moreElementsToNextPow2(0)
      .lower();

  getActionDefinitionsBuilder(
      {G_ABDS, G_ABDU, G_UAVGFLOOR, G_UAVGCEIL, G_SAVGFLOOR, G_SAVGCEIL})
      .legalFor({v8s8, v16s8, v4s16, v8s16, v2s32, v4s32})
      .lower();

  getActionDefinitionsBuilder(
      {G_SADDE, G_SSUBE, G_UADDE, G_USUBE, G_SADDO, G_SSUBO, G_UADDO, G_USUBO})
      .legalFor({{s32, s32}, {s64, s32}})
      .clampScalar(0, s32, s64)
      .clampScalar(1, s32, s64)
      .widenScalarToNextPow2(0);

  getActionDefinitionsBuilder({G_FSHL, G_FSHR})
      .customFor({{s32, s32}, {s32, s64}, {s64, s64}})
      .lower();

  getActionDefinitionsBuilder(G_ROTR)
      .legalFor({{s32, s64}, {s64, s64}})
      .customIf([=](const LegalityQuery &Q) {
        return Q.Types[0].isScalar() && Q.Types[1].getScalarSizeInBits() < 64;
      })
      .lower();
  getActionDefinitionsBuilder(G_ROTL).lower();

  getActionDefinitionsBuilder({G_SBFX, G_UBFX})
      .customFor({{s32, s32}, {s64, s64}});

  auto always = [=](const LegalityQuery &Q) { return true; };
  getActionDefinitionsBuilder(G_CTPOP)
      .legalFor(HasCSSC, {{s32, s32}, {s64, s64}})
      .legalFor({{v8s8, v8s8}, {v16s8, v16s8}})
      .customFor(!HasCSSC, {{s32, s32}, {s64, s64}})
      .customFor({{s128, s128},
                  {v4s16, v4s16},
                  {v8s16, v8s16},
                  {v2s32, v2s32},
                  {v4s32, v4s32},
                  {v2s64, v2s64}})
      .clampScalar(0, s32, s128)
      .widenScalarToNextPow2(0)
      .minScalarEltSameAsIf(always, 1, 0)
      .maxScalarEltSameAsIf(always, 1, 0)
      .clampNumElements(0, v8s8, v16s8)
      .clampNumElements(0, v4s16, v8s16)
      .clampNumElements(0, v2s32, v4s32)
      .clampNumElements(0, v2s64, v2s64)
      .moreElementsToNextPow2(0)
      .scalarizeIf(scalarOrEltWiderThan(0, 64), 0);

  getActionDefinitionsBuilder({G_CTLZ, G_CTLS})
      .legalFor({{s32, s32},
                 {s64, s64},
                 {v8s8, v8s8},
                 {v16s8, v16s8},
                 {v4s16, v4s16},
                 {v8s16, v8s16},
                 {v2s32, v2s32},
                 {v4s32, v4s32}})
      .widenScalarToNextPow2(1, /*Min=*/32)
      .clampScalar(1, s32, s64)
      .widenScalarOrEltToNextPow2OrMinSize(1, /*Min=*/8)
      .clampNumElements(0, v8s8, v16s8)
      .clampNumElements(0, v4s16, v8s16)
      .clampNumElements(0, v2s32, v4s32)
      .moreElementsToNextPow2(0)
      .scalarizeIf(scalarOrEltWiderThan(0, 32), 0)
      .scalarSameSizeAs(0, 1);

  getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF).lower();

  getActionDefinitionsBuilder(G_CTTZ)
      .lowerIf(isVector(0))
      .widenScalarToNextPow2(1, /*Min=*/32)
      .clampScalar(1, s32, s64)
      .scalarSameSizeAs(0, 1)
      .legalFor(HasCSSC, {s32, s64})
      .customFor(!HasCSSC, {s32, s64});

  getActionDefinitionsBuilder(G_CTTZ_ZERO_UNDEF).lower();

  getActionDefinitionsBuilder(G_BITREVERSE)
      .legalFor({s32, s64, v8s8, v16s8})
      .widenScalarToNextPow2(0, /*Min = */ 32)
      .widenScalarOrEltToNextPow2OrMinSize(0, 8)
      .clampScalar(0, s32, s64)
      .clampNumElements(0, v8s8, v16s8)
      .clampNumElements(0, v4s16, v8s16)
      .clampNumElements(0, v2s32, v4s32)
      .clampNumElements(0, v2s64, v2s64)
      .scalarizeIf(scalarOrEltWiderThan(0, 64), 0)
      .moreElementsToNextPow2(0)
      .lower();

  getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({s32, s64, v4s16, v8s16, v2s32, v4s32, v2s64})
      .widenScalarOrEltToNextPow2(0, 16)
      .clampScalar(0, s32, s64)
      .clampNumElements(0, v4s16, v8s16)
      .clampNumElements(0, v2s32, v4s32)
      .clampNumElements(0, v2s64, v2s64)
      .moreElementsToNextPow2(0);

  getActionDefinitionsBuilder({G_UADDSAT, G_SADDSAT, G_USUBSAT, G_SSUBSAT})
      .legalFor({v8s8, v16s8, v4s16, v8s16, v2s32, v4s32, v2s64})
      .legalFor(HasSVE, {nxv16s8, nxv8s16, nxv4s32, nxv2s64})
      .clampNumElements(0, v8s8, v16s8)
      .clampNumElements(0, v4s16, v8s16)
      .clampNumElements(0, v2s32, v4s32)
      .clampMaxNumElements(0, s64, 2)
      .scalarizeIf(scalarOrEltWiderThan(0, 64), 0)
      .moreElementsToNextPow2(0)
      .lower();

  getActionDefinitionsBuilder(
      {G_FADD, G_FSUB, G_FMUL, G_FDIV, G_FMA, G_FSQRT, G_FMAXNUM, G_FMINNUM,
       G_FMAXIMUM, G_FMINIMUM, G_FCEIL, G_FFLOOR, G_FRINT, G_FNEARBYINT,
       G_INTRINSIC_TRUNC, G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
      .legalFor({s32, s64, v2s32, v4s32, v2s64})
      .legalFor(HasFP16, {s16, v4s16, v8s16})
      .libcallFor({s128})
      .scalarizeIf(scalarOrEltWiderThan(0, 64), 0)
      .minScalarOrElt(0, MinFPScalar)
      .clampNumElements(0, v4s16, v8s16)
      .clampNumElements(0, v2s32, v4s32)
      .clampNumElements(0, v2s64, v2s64)
      .moreElementsToNextPow2(0);

  getActionDefinitionsBuilder({G_FABS, G_FNEG})
      .legalFor({s32, s64, v2s32, v4s32, v2s64})
      .legalFor(HasFP16, {s16, v4s16, v8s16})
      .scalarizeIf(scalarOrEltWiderThan(0, 64), 0)
      .lowerIf(scalarOrEltWiderThan(0, 64))
      .clampNumElements(0, v4s16, v8s16)
      .clampNumElements(0, v2s32, v4s32)
      .clampNumElements(0, v2s64, v2s64)
      .moreElementsToNextPow2(0)
      .lowerFor({s16, v4s16, v8s16});

  getActionDefinitionsBuilder(G_FREM)
      .libcallFor({s32, s64, s128})
      .minScalar(0, s32)
      .scalarize(0);

  getActionDefinitionsBuilder({G_FCOS, G_FSIN, G_FPOW, G_FLOG, G_FLOG2,
                               G_FLOG10, G_FTAN, G_FEXP, G_FEXP2, G_FEXP10,
                               G_FACOS, G_FASIN, G_FATAN, G_FATAN2, G_FCOSH,
                               G_FSINH, G_FTANH, G_FMODF})
      // We need a call for these, so we always need to scalarize.
      .scalarize(0)
      // Regardless of FP16 support, widen 16-bit elements to 32-bits.
      .minScalar(0, s32)
      .libcallFor({s32, s64, s128});
  getActionDefinitionsBuilder({G_FPOWI, G_FLDEXP})
      .scalarize(0)
      .minScalar(0, s32)
      .libcallFor({{s32, s32}, {s64, s32}, {s128, s32}});

  getActionDefinitionsBuilder({G_LROUND, G_INTRINSIC_LRINT})
      .legalFor({{s32, s32}, {s32, s64}, {s64, s32}, {s64, s64}})
      .legalFor(HasFP16, {{s32, s16}, {s64, s16}})
      .minScalar(1, s32)
      .libcallFor({{s64, s128}})
      .lower();
  getActionDefinitionsBuilder({G_LLROUND, G_INTRINSIC_LLRINT})
      .legalFor({{s64, s32}, {s64, s64}})
      .legalFor(HasFP16, {{s64, s16}})
      .minScalar(0, s64)
      .minScalar(1, s32)
      .libcallFor({{s64, s128}})
      .lower();

  // TODO: Custom legalization for mismatched types.
  getActionDefinitionsBuilder(G_FCOPYSIGN)
      .moreElementsIf(
          [](const LegalityQuery &Query) { return Query.Types[0].isScalar(); },
          [=](const LegalityQuery &Query) {
            const LLT Ty = Query.Types[0];
            return std::pair(0, LLT::fixed_vector(Ty == s16 ? 4 : 2, Ty));
          })
      .lower();

  getActionDefinitionsBuilder(G_FMAD).lower();

  for (unsigned Op : {G_SEXTLOAD, G_ZEXTLOAD}) {
    auto &Actions = getActionDefinitionsBuilder(Op);

    if (Op == G_SEXTLOAD)
      Actions.lowerIf(
          atomicOrderingAtLeastOrStrongerThan(0, AtomicOrdering::Unordered));

    // Atomics have zero extending behavior.
    Actions
        .legalForTypesWithMemDesc({{s32, p0, s8, 8},
                                   {s32, p0, s16, 8},
                                   {s32, p0, s32, 8},
                                   {s64, p0, s8, 2},
                                   {s64, p0, s16, 2},
                                   {s64, p0, s32, 4},
                                   {s64, p0, s64, 8},
                                   {p0, p0, s64, 8},
                                   {v2s32, p0, s64, 8}})
        .widenScalarToNextPow2(0)
        .clampScalar(0, s32, s64)
        // TODO: We could support sum-of-pow2's but the lowering code doesn't know
        // how to do that yet.
        .unsupportedIfMemSizeNotPow2()
        // Lower anything left over into G_*EXT and G_LOAD
        .lower();
  }

  auto IsPtrVecPred = [=](const LegalityQuery &Query) {
    const LLT &ValTy = Query.Types[0];
    return ValTy.isPointerVector() && ValTy.getAddressSpace() == 0;
  };
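
  // Note: s128 loads/stores with an atomic ordering, and loads/stores of
  // vectors of pointers, are routed to custom legalization below.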
  getActionDefinitionsBuilder(G_LOAD)
      .customIf([=](const LegalityQuery &Query) {
        return HasRCPC3 && Query.Types[0] == s128 &&
               Query.MMODescrs[0].Ordering == AtomicOrdering::Acquire;
      })
      .customIf([=](const LegalityQuery &Query) {
        return Query.Types[0] == s128 &&
               Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic;
      })
      .legalForTypesWithMemDesc({{s8, p0, s8, 8},
                                 {s16, p0, s16, 8},
                                 {s32, p0, s32, 8},
                                 {s64, p0, s64, 8},
                                 {p0, p0, s64, 8},
                                 {s128, p0, s128, 8},
                                 {v8s8, p0, s64, 8},
                                 {v16s8, p0, s128, 8},
                                 {v4s16, p0, s64, 8},
                                 {v8s16, p0, s128, 8},
                                 {v2s32, p0, s64, 8},
                                 {v4s32, p0, s128, 8},
                                 {v2s64, p0, s128, 8}})
      // These extends are also legal
      .legalForTypesWithMemDesc(
          {{s32, p0, s8, 8}, {s32, p0, s16, 8}, {s64, p0, s32, 8}})
      .legalForTypesWithMemDesc({
          // SVE vscale x 128 bit base sizes
          {nxv16s8, p0, nxv16s8, 8},
          {nxv8s16, p0, nxv8s16, 8},
          {nxv4s32, p0, nxv4s32, 8},
          {nxv2s64, p0, nxv2s64, 8},
      })
      .widenScalarToNextPow2(0, /* MinSize = */ 8)
      .clampMaxNumElements(0, s8, 16)
      .clampMaxNumElements(0, s16, 8)
      .clampMaxNumElements(0, s32, 4)
      .clampMaxNumElements(0, s64, 2)
      .clampMaxNumElements(0, p0, 2)
      .lowerIfMemSizeNotByteSizePow2()
      .clampScalar(0, s8, s64)
      .narrowScalarIf(
          [=](const LegalityQuery &Query) {
            // Clamp extending load results to 32-bits.
            return Query.Types[0].isScalar() &&
                   Query.Types[0] != Query.MMODescrs[0].MemoryTy &&
                   Query.Types[0].getSizeInBits() > 32;
          },
          changeTo(0, s32))
      // TODO: Use BITCAST for v2i8, v2i16 after G_TRUNC gets sorted out
      .bitcastIf(typeInSet(0, {v4s8}),
                 [=](const LegalityQuery &Query) {
                   const LLT VecTy = Query.Types[0];
                   return std::pair(0, LLT::scalar(VecTy.getSizeInBits()));
                 })
      .customIf(IsPtrVecPred)
      .scalarizeIf(typeInSet(0, {v2s16, v2s8}), 0)
      .scalarizeIf(scalarOrEltWiderThan(0, 64), 0);

  getActionDefinitionsBuilder(G_STORE)
      .customIf([=](const LegalityQuery &Query) {
        return HasRCPC3 && Query.Types[0] == s128 &&
               Query.MMODescrs[0].Ordering == AtomicOrdering::Release;
      })
      .customIf([=](const LegalityQuery &Query) {
        return Query.Types[0] == s128 &&
               Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic;
      })
      .widenScalarIf(
          all(scalarNarrowerThan(0, 32),
              atomicOrderingAtLeastOrStrongerThan(0, AtomicOrdering::Release)),
          changeTo(0, s32))
      .legalForTypesWithMemDesc(
          {{s8, p0, s8, 8},    {s16, p0, s8, 8}, // truncstorei8 from s16
           {s32, p0, s8, 8},                     // truncstorei8 from s32
           {s64, p0, s8, 8},                     // truncstorei8 from s64
           {s16, p0, s16, 8},  {s32, p0, s16, 8}, // truncstorei16 from s32
           {s64, p0, s16, 8},                     // truncstorei16 from s64
           {s32, p0, s8, 8},   {s32, p0, s16, 8},   {s32, p0, s32, 8},
           {s64, p0, s64, 8},  {s64, p0, s32, 8}, // truncstorei32 from s64
           {p0, p0, s64, 8},   {s128, p0, s128, 8}, {v16s8, p0, s128, 8},
           {v8s8, p0, s64, 8}, {v4s16, p0, s64, 8}, {v8s16, p0, s128, 8},
           {v2s32, p0, s64, 8}, {v4s32, p0, s128, 8}, {v2s64, p0, s128, 8}})
      .legalForTypesWithMemDesc({
          // SVE vscale x 128 bit base sizes
          // TODO: Add nxv2p0. Consider bitcastIf.
          // See #92130
          // https://github.com/llvm/llvm-project/pull/92130#discussion_r1616888461
          {nxv16s8, p0, nxv16s8, 8},
          {nxv8s16, p0, nxv8s16, 8},
          {nxv4s32, p0, nxv4s32, 8},
          {nxv2s64, p0, nxv2s64, 8},
      })
      .clampScalar(0, s8, s64)
      .minScalarOrElt(0, s8)
      .lowerIf([=](const LegalityQuery &Query) {
        return Query.Types[0].isScalar() &&
               Query.Types[0] != Query.MMODescrs[0].MemoryTy;
      })
      // Maximum: sN * k = 128
      .clampMaxNumElements(0, s8, 16)
      .clampMaxNumElements(0, s16, 8)
      .clampMaxNumElements(0, s32, 4)
      .clampMaxNumElements(0, s64, 2)
      .clampMaxNumElements(0, p0, 2)
      .lowerIfMemSizeNotPow2()
      // TODO: Use BITCAST for v2i8, v2i16 after G_TRUNC gets sorted out
      .bitcastIf(all(typeInSet(0, {v4s8}),
                     LegalityPredicate([=](const LegalityQuery &Query) {
                       return Query.Types[0].getSizeInBits() ==
                              Query.MMODescrs[0].MemoryTy.getSizeInBits();
                     })),
                 [=](const LegalityQuery &Query) {
                   const LLT VecTy = Query.Types[0];
                   return std::pair(0, LLT::scalar(VecTy.getSizeInBits()));
                 })
      .customIf(IsPtrVecPred)
      .scalarizeIf(typeInSet(0, {v2s16, v2s8}), 0)
      .scalarizeIf(scalarOrEltWiderThan(0, 64), 0)
      .lower();

  getActionDefinitionsBuilder(G_INDEXED_STORE)
      // Idx 0 == Ptr, Idx 1 == Val
      // TODO: we can implement legalizations but as of now these are
      // generated in a very specific way.
      .legalForTypesWithMemDesc({
          {p0, s8, s8, 8},
          {p0, s16, s16, 8},
          {p0, s32, s8, 8},
          {p0, s32, s16, 8},
          {p0, s32, s32, 8},
          {p0, s64, s64, 8},
          {p0, p0, p0, 8},
          {p0, v8s8, v8s8, 8},
          {p0, v16s8, v16s8, 8},
          {p0, v4s16, v4s16, 8},
          {p0, v8s16, v8s16, 8},
          {p0, v2s32, v2s32, 8},
          {p0, v4s32, v4s32, 8},
          {p0, v2s64, v2s64, 8},
          {p0, v2p0, v2p0, 8},
          {p0, s128, s128, 8},
      })
      .unsupported();

  auto IndexedLoadBasicPred = [=](const LegalityQuery &Query) {
    LLT LdTy = Query.Types[0];
    LLT PtrTy = Query.Types[1];
    if (!llvm::is_contained(PackedVectorAllTypesVec, LdTy) &&
        !llvm::is_contained(ScalarAndPtrTypesVec, LdTy) && LdTy != s128)
      return false;
    if (PtrTy != p0)
      return false;
    return true;
  };
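  // As with G_INDEXED_STORE above, indexed loads are only marked legal for the
  // exact forms currently produced; everything else is unsupported rather than
  // lowered.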
  getActionDefinitionsBuilder(G_INDEXED_LOAD)
      .unsupportedIf(
          atomicOrderingAtLeastOrStrongerThan(0, AtomicOrdering::Unordered))
      .legalIf(IndexedLoadBasicPred)
      .unsupported();
  getActionDefinitionsBuilder({G_INDEXED_SEXTLOAD, G_INDEXED_ZEXTLOAD})
      .unsupportedIf(
          atomicOrderingAtLeastOrStrongerThan(0, AtomicOrdering::Unordered))
      .legalIf(all(typeInSet(0, {s16, s32, s64}),
                   LegalityPredicate([=](const LegalityQuery &Q) {
                     LLT LdTy = Q.Types[0];
                     LLT PtrTy = Q.Types[1];
                     LLT MemTy = Q.MMODescrs[0].MemoryTy;
                     if (PtrTy != p0)
                       return false;
                     if (LdTy == s16)
                       return MemTy == s8;
                     if (LdTy == s32)
                       return MemTy == s8 || MemTy == s16;
                     if (LdTy == s64)
                       return MemTy == s8 || MemTy == s16 || MemTy == s32;
                     return false;
                   })))
      .unsupported();

  // Constants
  getActionDefinitionsBuilder(G_CONSTANT)
      .legalFor({p0, s8, s16, s32, s64})
      .widenScalarToNextPow2(0)
      .clampScalar(0, s8, s64);
  getActionDefinitionsBuilder(G_FCONSTANT)
      // Always legalize s16 to prevent G_FCONSTANT being widened to G_CONSTANT
      .legalFor({s16, s32, s64, s128})
      .clampScalar(0, MinFPScalar, s128);

  // FIXME: fix moreElementsToNextPow2
  getActionDefinitionsBuilder(G_ICMP)
      .legalFor({{s32, s32}, {s32, s64}, {s32, p0}})
      .widenScalarOrEltToNextPow2(1)
      .clampScalar(1, s32, s64)
      .clampScalar(0, s32, s32)
      .scalarizeIf(scalarOrEltWiderThan(1, 64), 1)
      .minScalarEltSameAsIf(
          [=](const LegalityQuery &Query) {
            const LLT &Ty = Query.Types[0];
            const LLT &SrcTy = Query.Types[1];
            return Ty.isVector() && !SrcTy.isPointerVector() &&
                   Ty.getElementType() != SrcTy.getElementType();
          },
          0, 1)
      .minScalarOrEltIf(
          [=](const LegalityQuery &Query) { return Query.Types[1] == v2s16; },
          1, s32)
      .minScalarOrEltIf(
          [=](const LegalityQuery &Query) {
            return Query.Types[1].isPointerVector();
          },
          0, s64)
      .moreElementsToNextPow2(1)
      .clampNumElements(1, v8s8, v16s8)
      .clampNumElements(1, v4s16, v8s16)
      .clampNumElements(1, v2s32, v4s32)
      .clampNumElements(1, v2s64, v2s64)
      .clampNumElements(1, v2p0, v2p0)
      .customIf(isVector(0));

  getActionDefinitionsBuilder(G_FCMP)
      .legalFor({{s32, s32},
                 {s32, s64},
                 {v4s32, v4s32},
                 {v2s32, v2s32},
                 {v2s64, v2s64}})
      .legalFor(HasFP16, {{s32, s16}, {v4s16, v4s16}, {v8s16, v8s16}})
      .widenScalarOrEltToNextPow2(1)
      .clampScalar(0, s32, s32)
      .minScalarOrElt(1, MinFPScalar)
      .scalarizeIf(scalarOrEltWiderThan(1, 64), 1)
      .minScalarEltSameAsIf(
          [=](const LegalityQuery &Query) {
            const LLT &Ty = Query.Types[0];
            const LLT &SrcTy = Query.Types[1];
            return Ty.isVector() && !SrcTy.isPointerVector() &&
                   Ty.getElementType() != SrcTy.getElementType();
          },
          0, 1)
      .clampNumElements(1, v4s16, v8s16)
      .clampNumElements(1, v2s32, v4s32)
      .clampMaxNumElements(1, s64, 2)
      .moreElementsToNextPow2(1)
      .libcallFor({{s32, s128}});

  // Extensions
  auto ExtLegalFunc = [=](const LegalityQuery &Query) {
    unsigned DstSize = Query.Types[0].getSizeInBits();

    // Handle legal vectors using legalFor
    if (Query.Types[0].isVector())
      return false;

    if (DstSize < 8 || DstSize >= 128 || !isPowerOf2_32(DstSize))
      return false; // Extending to a scalar s128 needs narrowing.

    const LLT &SrcTy = Query.Types[1];

    // Make sure we fit in a register otherwise. Don't bother checking that
    // the source type is below 128 bits. We shouldn't be allowing anything
    // through which is wider than the destination in the first place.
    unsigned SrcSize = SrcTy.getSizeInBits();
    if (SrcSize < 8 || !isPowerOf2_32(SrcSize))
      return false;

    return true;
  };
  getActionDefinitionsBuilder({G_ZEXT, G_SEXT, G_ANYEXT})
      .legalIf(ExtLegalFunc)
      .legalFor({{v8s16, v8s8}, {v4s32, v4s16}, {v2s64, v2s32}})
      .clampScalar(0, s64, s64) // Just for s128, others are handled above.
      .moreElementsToNextPow2(0)
      .clampMaxNumElements(1, s8, 8)
      .clampMaxNumElements(1, s16, 4)
      .clampMaxNumElements(1, s32, 2)
      // Tries to convert a large EXTEND into two smaller EXTENDs
      .lowerIf([=](const LegalityQuery &Query) {
        return (Query.Types[0].getScalarSizeInBits() >
                Query.Types[1].getScalarSizeInBits() * 2) &&
               Query.Types[0].isVector() &&
               (Query.Types[1].getScalarSizeInBits() == 8 ||
                Query.Types[1].getScalarSizeInBits() == 16);
      })
      .clampMinNumElements(1, s8, 8)
      .clampMinNumElements(1, s16, 4)
      .scalarizeIf(scalarOrEltWiderThan(0, 64), 0);

  getActionDefinitionsBuilder(G_TRUNC)
      .legalFor({{v8s8, v8s16}, {v4s16, v4s32}, {v2s32, v2s64}})
      .moreElementsToNextPow2(0)
      .clampMaxNumElements(0, s8, 8)
      .clampMaxNumElements(0, s16, 4)
      .clampMaxNumElements(0, s32, 2)
      .minScalarOrEltIf(
          [=](const LegalityQuery &Query) { return Query.Types[0].isVector(); },
          0, s8)
      .lowerIf([=](const LegalityQuery &Query) {
        LLT DstTy = Query.Types[0];
        LLT SrcTy = Query.Types[1];
        return DstTy.isVector() && SrcTy.getSizeInBits() > 128 &&
               DstTy.getScalarSizeInBits() * 2 <= SrcTy.getScalarSizeInBits();
      })
      .clampMinNumElements(0, s8, 8)
      .clampMinNumElements(0, s16, 4)
      .alwaysLegal();

  getActionDefinitionsBuilder({G_TRUNC_SSAT_S, G_TRUNC_SSAT_U, G_TRUNC_USAT_U})
      .legalFor({{v8s8, v8s16}, {v4s16, v4s32}, {v2s32, v2s64}})
      .clampNumElements(0, v2s32, v2s32);

  getActionDefinitionsBuilder(G_SEXT_INREG)
      .legalFor({s32, s64})
      .legalFor(PackedVectorAllTypeList)
      .maxScalar(0, s64)
      .clampNumElements(0, v8s8, v16s8)
      .clampNumElements(0, v4s16, v8s16)
      .clampNumElements(0, v2s32, v4s32)
      .clampMaxNumElements(0, s64, 2)
      .lower();

  // FP conversions
  getActionDefinitionsBuilder(G_FPTRUNC)
      .legalFor(
          {{s16, s32}, {s16, s64}, {s32, s64}, {v4s16, v4s32}, {v2s32, v2s64}})
      .libcallFor({{s16, s128}, {s32, s128}, {s64, s128}})
      .moreElementsToNextPow2(1)
      .customIf([](const LegalityQuery &Q) {
        LLT DstTy = Q.Types[0];
        LLT SrcTy = Q.Types[1];
        return SrcTy.isFixedVector() && DstTy.isFixedVector() &&
               SrcTy.getScalarSizeInBits() == 64 &&
               DstTy.getScalarSizeInBits() == 16;
      })
      // Clamp based on input
      .clampNumElements(1, v4s32, v4s32)
      .clampNumElements(1, v2s64, v2s64)
      .scalarize(0);

  getActionDefinitionsBuilder(G_FPEXT)
      .legalFor(
          {{s32, s16}, {s64, s16}, {s64, s32}, {v4s32, v4s16}, {v2s64, v2s32}})
      .libcallFor({{s128, s64}, {s128, s32}, {s128, s16}})
      .moreElementsToNextPow2(0)
      .widenScalarIf(
          [](const LegalityQuery &Q) {
            LLT DstTy = Q.Types[0];
            LLT SrcTy = Q.Types[1];
            return SrcTy.isVector() && DstTy.isVector() &&
                   SrcTy.getScalarSizeInBits() == 16 &&
                   DstTy.getScalarSizeInBits() == 64;
          },
          changeElementTo(1, s32))
      .clampNumElements(0, v4s32, v4s32)
      .clampNumElements(0, v2s64, v2s64)
      .scalarize(0);

  // Conversions
  getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
      .legalFor({{s32, s32},
                 {s64, s32},
                 {s32, s64},
                 {s64, s64},
                 {v2s32, v2s32},
                 {v4s32, v4s32},
                 {v2s64, v2s64}})
      .legalFor(HasFP16,
                {{s32, s16}, {s64, s16}, {v4s16, v4s16}, {v8s16, v8s16}})
      .scalarizeIf(scalarOrEltWiderThan(0, 64), 0)
      .scalarizeIf(scalarOrEltWiderThan(1, 64), 1)
      // The range of a fp16 value fits into an i17, so we can lower the width
      // to i64.
      .narrowScalarIf(
          [=](const LegalityQuery &Query) {
            return Query.Types[1] == s16 && Query.Types[0].getSizeInBits() > 64;
          },
          changeTo(0, s64))
      .moreElementsToNextPow2(0)
      .widenScalarOrEltToNextPow2OrMinSize(0)
      .minScalar(0, s32)
      .widenScalarOrEltToNextPow2OrMinSize(1, /*MinSize=*/HasFP16 ? 16 : 32)
      .widenScalarIf(
          [=](const LegalityQuery &Query) {
            return Query.Types[0].getScalarSizeInBits() <= 64 &&
                   Query.Types[0].getScalarSizeInBits() >
                       Query.Types[1].getScalarSizeInBits();
          },
          LegalizeMutations::changeElementSizeTo(1, 0))
      .widenScalarIf(
          [=](const LegalityQuery &Query) {
            return Query.Types[1].getScalarSizeInBits() <= 64 &&
                   Query.Types[0].getScalarSizeInBits() <
                       Query.Types[1].getScalarSizeInBits();
          },
          LegalizeMutations::changeElementSizeTo(0, 1))
      .clampNumElements(0, v4s16, v8s16)
      .clampNumElements(0, v2s32, v4s32)
      .clampMaxNumElements(0, s64, 2)
      .libcallFor(
          {{s32, s128}, {s64, s128}, {s128, s128}, {s128, s32}, {s128, s64}});

  getActionDefinitionsBuilder({G_FPTOSI_SAT, G_FPTOUI_SAT})
      .legalFor({{s32, s32},
                 {s64, s32},
                 {s32, s64},
                 {s64, s64},
                 {v2s32, v2s32},
                 {v4s32, v4s32},
                 {v2s64, v2s64}})
      .legalFor(
          HasFP16,
          {{s16, s16}, {s32, s16}, {s64, s16}, {v4s16, v4s16}, {v8s16, v8s16}})
      // Handle types larger than i64 by scalarizing/lowering.
      .scalarizeIf(scalarOrEltWiderThan(0, 64), 0)
      .scalarizeIf(scalarOrEltWiderThan(1, 64), 1)
      // The range of a fp16 value fits into an i17, so we can lower the width
      // to i64.
      .narrowScalarIf(
          [=](const LegalityQuery &Query) {
            return Query.Types[1] == s16 && Query.Types[0].getSizeInBits() > 64;
          },
          changeTo(0, s64))
      .lowerIf(::any(scalarWiderThan(0, 64), scalarWiderThan(1, 64)), 0)
      .moreElementsToNextPow2(0)
      .widenScalarToNextPow2(0, /*MinSize=*/32)
      .minScalar(0, s32)
      .widenScalarOrEltToNextPow2OrMinSize(1, /*MinSize=*/HasFP16 ? 16 : 32)
      .widenScalarIf(
          [=](const LegalityQuery &Query) {
            unsigned ITySize = Query.Types[0].getScalarSizeInBits();
            return (ITySize == 16 || ITySize == 32 || ITySize == 64) &&
                   ITySize > Query.Types[1].getScalarSizeInBits();
          },
          LegalizeMutations::changeElementSizeTo(1, 0))
      .widenScalarIf(
          [=](const LegalityQuery &Query) {
            unsigned FTySize = Query.Types[1].getScalarSizeInBits();
            return (FTySize == 16 || FTySize == 32 || FTySize == 64) &&
                   Query.Types[0].getScalarSizeInBits() < FTySize;
          },
          LegalizeMutations::changeElementSizeTo(0, 1))
      .widenScalarOrEltToNextPow2(0)
      .clampNumElements(0, v4s16, v8s16)
      .clampNumElements(0, v2s32, v4s32)
      .clampMaxNumElements(0, s64, 2);

  getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
      .legalFor({{s32, s32},
                 {s64, s32},
                 {s32, s64},
                 {s64, s64},
                 {v2s32, v2s32},
                 {v4s32, v4s32},
                 {v2s64, v2s64}})
      .legalFor(HasFP16,
                {{s16, s32}, {s16, s64}, {v4s16, v4s16}, {v8s16, v8s16}})
      .scalarizeIf(scalarOrEltWiderThan(1, 64), 1)
      .scalarizeIf(scalarOrEltWiderThan(0, 64), 0)
      .moreElementsToNextPow2(1)
      .widenScalarOrEltToNextPow2OrMinSize(1)
      .minScalar(1, s32)
      .lowerIf([](const LegalityQuery &Query) {
        return Query.Types[1].isVector() &&
               Query.Types[1].getScalarSizeInBits() == 64 &&
               Query.Types[0].getScalarSizeInBits() == 16;
      })
      .widenScalarOrEltToNextPow2OrMinSize(0, /*MinSize=*/HasFP16 ? 16 : 32)
      .scalarizeIf(
          // v2i64->v2f32 needs to scalarize to avoid double-rounding issues.
          [](const LegalityQuery &Query) {
            return Query.Types[0].getScalarSizeInBits() == 32 &&
                   Query.Types[1].getScalarSizeInBits() == 64;
          },
          0)
      .widenScalarIf(
          [](const LegalityQuery &Query) {
            return Query.Types[1].getScalarSizeInBits() <= 64 &&
                   Query.Types[0].getScalarSizeInBits() <
                       Query.Types[1].getScalarSizeInBits();
          },
          LegalizeMutations::changeElementSizeTo(0, 1))
      .widenScalarIf(
          [](const LegalityQuery &Query) {
            return Query.Types[0].getScalarSizeInBits() <= 64 &&
                   Query.Types[0].getScalarSizeInBits() >
                       Query.Types[1].getScalarSizeInBits();
          },
          LegalizeMutations::changeElementSizeTo(1, 0))
      .clampNumElements(0, v4s16, v8s16)
      .clampNumElements(0, v2s32, v4s32)
      .clampMaxNumElements(0, s64, 2)
      .libcallFor({{s16, s128},
                   {s32, s128},
                   {s64, s128},
                   {s128, s128},
                   {s128, s32},
                   {s128, s64}});

  // Control-flow
  getActionDefinitionsBuilder(G_BR).alwaysLegal();
  getActionDefinitionsBuilder(G_BRCOND)
      .legalFor({s32})
      .clampScalar(0, s32, s32);
  getActionDefinitionsBuilder(G_BRINDIRECT).legalFor({p0});

  getActionDefinitionsBuilder(G_SELECT)
      .legalFor({{s32, s32}, {s64, s32}, {p0, s32}})
      .widenScalarToNextPow2(0)
      .clampScalar(0, s32, s64)
      .clampScalar(1, s32, s32)
      .scalarizeIf(scalarOrEltWiderThan(0, 64), 0)
      .minScalarEltSameAsIf(all(isVector(0), isVector(1)), 1, 0)
      .lowerIf(isVector(0));

  // Pointer-handling
  getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({p0});

  if (TM.getCodeModel() == CodeModel::Small)
    getActionDefinitionsBuilder(G_GLOBAL_VALUE).custom();
  else
    getActionDefinitionsBuilder(G_GLOBAL_VALUE).legalFor({p0});

  getActionDefinitionsBuilder(G_PTRAUTH_GLOBAL_VALUE)
      .legalIf(all(typeIs(0, p0), typeIs(1, p0)));

  getActionDefinitionsBuilder(G_PTRTOINT)
      .legalFor({{s64, p0}, {v2s64, v2p0}})
      .widenScalarToNextPow2(0, 64)
      .clampScalar(0, s64, s64)
      .clampMaxNumElements(0, s64, 2);

  getActionDefinitionsBuilder(G_INTTOPTR)
      .unsupportedIf([&](const LegalityQuery &Query) {
        return Query.Types[0].getSizeInBits() != Query.Types[1].getSizeInBits();
      })
      .legalFor({{p0, s64}, {v2p0, v2s64}})
      .clampMaxNumElements(1, s64, 2);

  // Casts for 32 and 64-bit width type are just copies.
  // Same for 128-bit width type, except they are on the FPR bank.
  getActionDefinitionsBuilder(G_BITCAST)
      // Keeping 32-bit instructions legal to prevent regression in some tests
      .legalForCartesianProduct({s32, v2s16, v4s8})
      .legalForCartesianProduct({s64, v8s8, v4s16, v2s32})
      .legalForCartesianProduct({s128, v16s8, v8s16, v4s32, v2s64, v2p0})
      .customIf([=](const LegalityQuery &Query) {
        // Handle casts from i1 vectors to scalars.
        LLT DstTy = Query.Types[0];
        LLT SrcTy = Query.Types[1];
        return DstTy.isScalar() && SrcTy.isVector() &&
               SrcTy.getScalarSizeInBits() == 1;
      })
      .lowerIf([=](const LegalityQuery &Query) {
        return Query.Types[0].isVector() != Query.Types[1].isVector();
      })
      .moreElementsToNextPow2(0)
      .clampNumElements(0, v8s8, v16s8)
      .clampNumElements(0, v4s16, v8s16)
      .clampNumElements(0, v2s32, v4s32)
      .lower();

  getActionDefinitionsBuilder(G_VASTART).legalFor({p0});

  // va_list must be a pointer, but most sized types are pretty easy to handle
  // as the destination.
  getActionDefinitionsBuilder(G_VAARG)
      .customForCartesianProduct({s8, s16, s32, s64, p0}, {p0})
      .clampScalar(0, s8, s64)
      .widenScalarToNextPow2(0, /*Min*/ 8);

  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG_WITH_SUCCESS)
      .lowerIf(
          all(typeInSet(0, {s8, s16, s32, s64, s128}), typeIs(2, p0)));
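
  // When compiling for outline atomics without LSE, compare-and-swap and
  // atomic RMW operations become libcalls to the __aarch64_* helper routines.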
  bool UseOutlineAtomics = ST.outlineAtomics() && !ST.hasLSE();

  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
      .legalFor(!UseOutlineAtomics, {{s32, p0}, {s64, p0}})
      .customFor(!UseOutlineAtomics, {{s128, p0}})
      .libcallFor(UseOutlineAtomics,
                  {{s8, p0}, {s16, p0}, {s32, p0}, {s64, p0}, {s128, p0}})
      .clampScalar(0, s32, s64);

  getActionDefinitionsBuilder({G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD,
                               G_ATOMICRMW_SUB, G_ATOMICRMW_AND, G_ATOMICRMW_OR,
                               G_ATOMICRMW_XOR})
      .legalFor(!UseOutlineAtomics, {{s32, p0}, {s64, p0}})
      .libcallFor(UseOutlineAtomics,
                  {{s8, p0}, {s16, p0}, {s32, p0}, {s64, p0}})
      .clampScalar(0, s32, s64);

  // Do not outline these atomics operations, as per comment in
  // AArch64ISelLowering.cpp's shouldExpandAtomicRMWInIR().
  getActionDefinitionsBuilder(
      {G_ATOMICRMW_MIN, G_ATOMICRMW_MAX, G_ATOMICRMW_UMIN, G_ATOMICRMW_UMAX})
      .legalIf(all(typeInSet(0, {s32, s64}), typeIs(1, p0)))
      .clampScalar(0, s32, s64);

  getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({p0});

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
    getActionDefinitionsBuilder(Op)
        .widenScalarToNextPow2(LitTyIdx, 8)
        .widenScalarToNextPow2(BigTyIdx, 32)
        .clampScalar(LitTyIdx, s8, s64)
        .clampScalar(BigTyIdx, s32, s128)
        .legalIf([=](const LegalityQuery &Q) {
          switch (Q.Types[BigTyIdx].getSizeInBits()) {
          case 32:
          case 64:
          case 128:
            break;
          default:
            return false;
          }
          switch (Q.Types[LitTyIdx].getSizeInBits()) {
          case 8:
          case 16:
          case 32:
          case 64:
            return true;
          default:
            return false;
          }
        });
  }
1135
1136 // TODO : nxv4s16, nxv2s16, nxv2s32
1137 getActionDefinitionsBuilder(Opcode: G_EXTRACT_VECTOR_ELT)
1138 .legalFor(Pred: HasSVE, Types: {{s16, nxv16s8, s64},
1139 {s16, nxv8s16, s64},
1140 {s32, nxv4s32, s64},
1141 {s64, nxv2s64, s64}})
1142 .unsupportedIf(Predicate: [=](const LegalityQuery &Query) {
1143 const LLT &EltTy = Query.Types[1].getElementType();
1144 if (Query.Types[1].isScalableVector())
1145 return false;
1146 return Query.Types[0] != EltTy;
1147 })
1148 .minScalar(TypeIdx: 2, Ty: s64)
1149 .customIf(Predicate: [=](const LegalityQuery &Query) {
1150 const LLT &VecTy = Query.Types[1];
1151 return VecTy == v8s8 || VecTy == v16s8 || VecTy == v2s16 ||
1152 VecTy == v4s16 || VecTy == v8s16 || VecTy == v2s32 ||
1153 VecTy == v4s32 || VecTy == v2s64 || VecTy == v2p0;
1154 })
1155 .minScalarOrEltIf(
1156 Predicate: [=](const LegalityQuery &Query) {
1157 // We want to promote to <M x s1> to <M x s64> if that wouldn't
1158 // cause the total vec size to be > 128b.
1159 return Query.Types[1].isFixedVector() &&
1160 Query.Types[1].getNumElements() <= 2;
1161 },
1162 TypeIdx: 0, Ty: s64)
1163 .minScalarOrEltIf(
1164 Predicate: [=](const LegalityQuery &Query) {
1165 return Query.Types[1].isFixedVector() &&
1166 Query.Types[1].getNumElements() <= 4;
1167 },
1168 TypeIdx: 0, Ty: s32)
1169 .minScalarOrEltIf(
1170 Predicate: [=](const LegalityQuery &Query) {
1171 return Query.Types[1].isFixedVector() &&
1172 Query.Types[1].getNumElements() <= 8;
1173 },
1174 TypeIdx: 0, Ty: s16)
1175 .minScalarOrEltIf(
1176 Predicate: [=](const LegalityQuery &Query) {
1177 return Query.Types[1].isFixedVector() &&
1178 Query.Types[1].getNumElements() <= 16;
1179 },
1180 TypeIdx: 0, Ty: s8)
1181 .minScalarOrElt(TypeIdx: 0, Ty: s8) // Worst case, we need at least s8.
1182 .moreElementsToNextPow2(TypeIdx: 1)
1183 .clampMaxNumElements(TypeIdx: 1, EltTy: s64, MaxElements: 2)
1184 .clampMaxNumElements(TypeIdx: 1, EltTy: s32, MaxElements: 4)
1185 .clampMaxNumElements(TypeIdx: 1, EltTy: s16, MaxElements: 8)
1186 .clampMaxNumElements(TypeIdx: 1, EltTy: s8, MaxElements: 16)
1187 .clampMaxNumElements(TypeIdx: 1, EltTy: p0, MaxElements: 2)
1188 .scalarizeIf(Predicate: scalarOrEltWiderThan(TypeIdx: 1, Size: 64), TypeIdx: 1);
1189
1190 getActionDefinitionsBuilder(Opcode: G_INSERT_VECTOR_ELT)
1191 .legalIf(
1192 Predicate: typeInSet(TypeIdx: 0, TypesInit: {v8s8, v16s8, v4s16, v8s16, v2s32, v4s32, v2s64, v2p0}))
1193 .legalFor(Pred: HasSVE, Types: {{nxv16s8, s32, s64},
1194 {nxv8s16, s32, s64},
1195 {nxv4s32, s32, s64},
1196 {nxv2s64, s64, s64}})
1197 .moreElementsToNextPow2(TypeIdx: 0)
1198 .widenVectorEltsToVectorMinSize(TypeIdx: 0, VectorSize: 64)
1199 .clampNumElements(TypeIdx: 0, MinTy: v8s8, MaxTy: v16s8)
1200 .clampNumElements(TypeIdx: 0, MinTy: v4s16, MaxTy: v8s16)
1201 .clampNumElements(TypeIdx: 0, MinTy: v2s32, MaxTy: v4s32)
1202 .clampMaxNumElements(TypeIdx: 0, EltTy: s64, MaxElements: 2)
1203 .clampMaxNumElements(TypeIdx: 0, EltTy: p0, MaxElements: 2)
1204 .scalarizeIf(Predicate: scalarOrEltWiderThan(TypeIdx: 0, Size: 64), TypeIdx: 0);
1205
1206 getActionDefinitionsBuilder(Opcode: G_BUILD_VECTOR)
1207 .legalFor(Types: {{v8s8, s8},
1208 {v16s8, s8},
1209 {v4s16, s16},
1210 {v8s16, s16},
1211 {v2s32, s32},
1212 {v4s32, s32},
1213 {v2s64, s64},
1214 {v2p0, p0}})
1215 .clampNumElements(TypeIdx: 0, MinTy: v4s32, MaxTy: v4s32)
1216 .clampNumElements(TypeIdx: 0, MinTy: v2s64, MaxTy: v2s64)
1217 .minScalarOrElt(TypeIdx: 0, Ty: s8)
1218 .widenVectorEltsToVectorMinSize(TypeIdx: 0, VectorSize: 64)
1219 .widenScalarOrEltToNextPow2(TypeIdx: 0)
1220 .minScalarSameAs(TypeIdx: 1, LargeTypeIdx: 0);
1221
1222 getActionDefinitionsBuilder(Opcode: G_BUILD_VECTOR_TRUNC).lower();
1223
1224 getActionDefinitionsBuilder(Opcode: G_SHUFFLE_VECTOR)
1225 .legalIf(Predicate: [=](const LegalityQuery &Query) {
1226 const LLT &DstTy = Query.Types[0];
1227 const LLT &SrcTy = Query.Types[1];
1228 // For now just support the TBL2 variant which needs the source vectors
1229 // to be the same size as the dest.
1230 if (DstTy != SrcTy)
1231 return false;
1232 return llvm::is_contained(
1233 Set: {v8s8, v16s8, v4s16, v8s16, v2s32, v4s32, v2s64}, Element: DstTy);
1234 })
1235 .moreElementsIf(
1236 Predicate: [](const LegalityQuery &Query) {
1237 return Query.Types[0].getNumElements() >
1238 Query.Types[1].getNumElements();
1239 },
1240 Mutation: changeTo(TypeIdx: 1, FromTypeIdx: 0))
1241 .moreElementsToNextPow2(TypeIdx: 0)
1242 .moreElementsIf(
1243 Predicate: [](const LegalityQuery &Query) {
1244 return Query.Types[0].getNumElements() <
1245 Query.Types[1].getNumElements();
1246 },
1247 Mutation: changeTo(TypeIdx: 0, FromTypeIdx: 1))
1248 .widenScalarOrEltToNextPow2OrMinSize(TypeIdx: 0, MinSize: 8)
1249 .clampNumElements(TypeIdx: 0, MinTy: v8s8, MaxTy: v16s8)
1250 .clampNumElements(TypeIdx: 0, MinTy: v4s16, MaxTy: v8s16)
1251 .clampNumElements(TypeIdx: 0, MinTy: v4s32, MaxTy: v4s32)
1252 .clampNumElements(TypeIdx: 0, MinTy: v2s64, MaxTy: v2s64)
1253 .scalarizeIf(Predicate: scalarOrEltWiderThan(TypeIdx: 0, Size: 64), TypeIdx: 0)
1254 .bitcastIf(Predicate: isPointerVector(TypeIdx: 0), Mutation: [=](const LegalityQuery &Query) {
1255        // Bitcast pointer vectors to i64 elements.
1256 const LLT DstTy = Query.Types[0];
1257 return std::pair(0, LLT::vector(EC: DstTy.getElementCount(), ScalarSizeInBits: 64));
1258 });
1259
1260 getActionDefinitionsBuilder(Opcode: G_CONCAT_VECTORS)
1261 .legalFor(Types: {{v16s8, v8s8}, {v8s16, v4s16}, {v4s32, v2s32}})
1262 .bitcastIf(
1263 Predicate: [=](const LegalityQuery &Query) {
1264 return Query.Types[0].isFixedVector() &&
1265 Query.Types[1].isFixedVector() &&
1266 Query.Types[0].getScalarSizeInBits() >= 8 &&
1267 isPowerOf2_64(Value: Query.Types[0].getScalarSizeInBits()) &&
1268 Query.Types[0].getSizeInBits() <= 128 &&
1269 Query.Types[1].getSizeInBits() <= 64;
1270 },
1271 Mutation: [=](const LegalityQuery &Query) {
1272 const LLT DstTy = Query.Types[0];
1273 const LLT SrcTy = Query.Types[1];
1274 return std::pair(
1275 0, DstTy.changeElementSize(NewEltSize: SrcTy.getSizeInBits())
1276 .changeElementCount(
1277 EC: DstTy.getElementCount().divideCoefficientBy(
1278 RHS: SrcTy.getNumElements())));
1279 });
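  // A minimal example of the mutation above: for a G_CONCAT_VECTORS with
  // DstTy = v4s16 and SrcTy = v2s16 the result type is rewritten to v2s32, so
  // each 32-bit source can be bitcast into a single lane of the result.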
1280
1281 getActionDefinitionsBuilder(Opcode: G_EXTRACT_SUBVECTOR)
1282 .legalFor(Types: {{v8s8, v16s8}, {v4s16, v8s16}, {v2s32, v4s32}})
1283 .widenScalarOrEltToNextPow2(TypeIdx: 0)
1284 .immIdx(ImmIdx: 0); // Inform verifier imm idx 0 is handled.
1285
1286 // TODO: {nxv16s8, s8}, {nxv8s16, s16}
1287 getActionDefinitionsBuilder(Opcode: G_SPLAT_VECTOR)
1288 .legalFor(Pred: HasSVE, Types: {{nxv4s32, s32}, {nxv2s64, s64}});
1289
1290 getActionDefinitionsBuilder(Opcode: G_JUMP_TABLE).legalFor(Types: {p0});
1291
1292 getActionDefinitionsBuilder(Opcode: G_BRJT).legalFor(Types: {{p0, s64}});
1293
1294 getActionDefinitionsBuilder(Opcodes: {G_TRAP, G_DEBUGTRAP, G_UBSANTRAP}).alwaysLegal();
1295
1296 getActionDefinitionsBuilder(Opcode: G_DYN_STACKALLOC).custom();
1297
1298 getActionDefinitionsBuilder(Opcodes: {G_STACKSAVE, G_STACKRESTORE}).lower();
1299
1300 if (ST.hasMOPS()) {
1301 // G_BZERO is not supported. Currently it is only emitted by
1302 // PreLegalizerCombiner for G_MEMSET with zero constant.
1303 getActionDefinitionsBuilder(Opcode: G_BZERO).unsupported();
1304
1305 getActionDefinitionsBuilder(Opcode: G_MEMSET)
1306 .legalForCartesianProduct(Types0: {p0}, Types1: {s64}, Types2: {s64})
1307 .customForCartesianProduct(Types0: {p0}, Types1: {s8}, Types2: {s64})
1308 .immIdx(ImmIdx: 0); // Inform verifier imm idx 0 is handled.
1309
1310 getActionDefinitionsBuilder(Opcodes: {G_MEMCPY, G_MEMMOVE})
1311 .legalForCartesianProduct(Types0: {p0}, Types1: {p0}, Types2: {s64})
1312 .immIdx(ImmIdx: 0); // Inform verifier imm idx 0 is handled.
1313
1314 // G_MEMCPY_INLINE does not have a tailcall immediate
1315 getActionDefinitionsBuilder(Opcode: G_MEMCPY_INLINE)
1316 .legalForCartesianProduct(Types0: {p0}, Types1: {p0}, Types2: {s64});
1317
1318 } else {
1319 getActionDefinitionsBuilder(Opcodes: {G_BZERO, G_MEMCPY, G_MEMMOVE, G_MEMSET})
1320 .libcall();
1321 }
1322
1323 // For fadd reductions we have pairwise operations available. We treat the
1324 // usual legal types as legal and handle the lowering to pairwise instructions
1325 // later.
1326 getActionDefinitionsBuilder(Opcode: G_VECREDUCE_FADD)
1327 .legalFor(Types: {{s32, v2s32}, {s32, v4s32}, {s64, v2s64}})
1328 .legalFor(Pred: HasFP16, Types: {{s16, v4s16}, {s16, v8s16}})
1329 .minScalarOrElt(TypeIdx: 0, Ty: MinFPScalar)
1330 .clampMaxNumElements(TypeIdx: 1, EltTy: s64, MaxElements: 2)
1331 .clampMaxNumElements(TypeIdx: 1, EltTy: s32, MaxElements: 4)
1332 .clampMaxNumElements(TypeIdx: 1, EltTy: s16, MaxElements: 8)
1333 .moreElementsToNextPow2(TypeIdx: 1)
1334 .scalarize(TypeIdx: 1)
1335 .lower();
1336
1337  // For fmul reductions we need to split up into individual operations. We
1338  // clamp to 128-bit vectors and then to 64-bit vectors to produce a cascade
1339  // of smaller types, followed by scalarizing what remains.
1340 getActionDefinitionsBuilder(Opcode: G_VECREDUCE_FMUL)
1341 .minScalarOrElt(TypeIdx: 0, Ty: MinFPScalar)
1342 .clampMaxNumElements(TypeIdx: 1, EltTy: s64, MaxElements: 2)
1343 .clampMaxNumElements(TypeIdx: 1, EltTy: s32, MaxElements: 4)
1344 .clampMaxNumElements(TypeIdx: 1, EltTy: s16, MaxElements: 8)
1345 .clampMaxNumElements(TypeIdx: 1, EltTy: s32, MaxElements: 2)
1346 .clampMaxNumElements(TypeIdx: 1, EltTy: s16, MaxElements: 4)
1347 .scalarize(TypeIdx: 1)
1348 .lower();
1349
1350 getActionDefinitionsBuilder(Opcodes: {G_VECREDUCE_SEQ_FADD, G_VECREDUCE_SEQ_FMUL})
1351 .scalarize(TypeIdx: 2)
1352 .lower();
1353
1354 getActionDefinitionsBuilder(Opcode: G_VECREDUCE_ADD)
1355 .legalFor(Types: {{s8, v8s8},
1356 {s8, v16s8},
1357 {s16, v4s16},
1358 {s16, v8s16},
1359 {s32, v2s32},
1360 {s32, v4s32},
1361 {s64, v2s64}})
1362 .moreElementsToNextPow2(TypeIdx: 1)
1363 .clampMaxNumElements(TypeIdx: 1, EltTy: s64, MaxElements: 2)
1364 .clampMaxNumElements(TypeIdx: 1, EltTy: s32, MaxElements: 4)
1365 .clampMaxNumElements(TypeIdx: 1, EltTy: s16, MaxElements: 8)
1366 .clampMaxNumElements(TypeIdx: 1, EltTy: s8, MaxElements: 16)
1367 .widenVectorEltsToVectorMinSize(TypeIdx: 1, VectorSize: 64)
1368 .scalarize(TypeIdx: 1);
1369
1370 getActionDefinitionsBuilder(Opcodes: {G_VECREDUCE_FMIN, G_VECREDUCE_FMAX,
1371 G_VECREDUCE_FMINIMUM, G_VECREDUCE_FMAXIMUM})
1372 .legalFor(Types: {{s32, v2s32}, {s32, v4s32}, {s64, v2s64}})
1373 .legalFor(Pred: HasFP16, Types: {{s16, v4s16}, {s16, v8s16}})
1374 .minScalarOrElt(TypeIdx: 0, Ty: MinFPScalar)
1375 .clampMaxNumElements(TypeIdx: 1, EltTy: s64, MaxElements: 2)
1376 .clampMaxNumElements(TypeIdx: 1, EltTy: s32, MaxElements: 4)
1377 .clampMaxNumElements(TypeIdx: 1, EltTy: s16, MaxElements: 8)
1378 .scalarize(TypeIdx: 1)
1379 .lower();
1380
1381 getActionDefinitionsBuilder(Opcode: G_VECREDUCE_MUL)
1382 .clampMaxNumElements(TypeIdx: 1, EltTy: s32, MaxElements: 2)
1383 .clampMaxNumElements(TypeIdx: 1, EltTy: s16, MaxElements: 4)
1384 .clampMaxNumElements(TypeIdx: 1, EltTy: s8, MaxElements: 8)
1385 .scalarize(TypeIdx: 1)
1386 .lower();
1387
1388 getActionDefinitionsBuilder(
1389 Opcodes: {G_VECREDUCE_SMIN, G_VECREDUCE_SMAX, G_VECREDUCE_UMIN, G_VECREDUCE_UMAX})
1390 .legalFor(Types: {{s8, v8s8},
1391 {s8, v16s8},
1392 {s16, v4s16},
1393 {s16, v8s16},
1394 {s32, v2s32},
1395 {s32, v4s32}})
1396 .moreElementsIf(
1397 Predicate: [=](const LegalityQuery &Query) {
1398 return Query.Types[1].isVector() &&
1399 Query.Types[1].getElementType() != s8 &&
1400 Query.Types[1].getNumElements() & 1;
1401 },
1402 Mutation: LegalizeMutations::moreElementsToNextPow2(TypeIdx: 1))
1403 .clampMaxNumElements(TypeIdx: 1, EltTy: s64, MaxElements: 2)
1404 .clampMaxNumElements(TypeIdx: 1, EltTy: s32, MaxElements: 4)
1405 .clampMaxNumElements(TypeIdx: 1, EltTy: s16, MaxElements: 8)
1406 .clampMaxNumElements(TypeIdx: 1, EltTy: s8, MaxElements: 16)
1407 .scalarize(TypeIdx: 1)
1408 .lower();
1409
1410 getActionDefinitionsBuilder(
1411 Opcodes: {G_VECREDUCE_OR, G_VECREDUCE_AND, G_VECREDUCE_XOR})
1412 // Try to break down into smaller vectors as long as they're at least 64
1413 // bits. This lets us use vector operations for some parts of the
1414 // reduction.
1415 .fewerElementsIf(
1416 Predicate: [=](const LegalityQuery &Q) {
1417 LLT SrcTy = Q.Types[1];
1418 if (SrcTy.isScalar())
1419 return false;
1420 if (!isPowerOf2_32(Value: SrcTy.getNumElements()))
1421 return false;
1422 // We can usually perform 64b vector operations.
1423 return SrcTy.getSizeInBits() > 64;
1424 },
1425 Mutation: [=](const LegalityQuery &Q) {
1426 LLT SrcTy = Q.Types[1];
1427 return std::make_pair(x: 1, y: SrcTy.divide(Factor: 2));
1428 })
1429 .scalarize(TypeIdx: 1)
1430 .lower();
1431
1432 // TODO: Update this to correct handling when adding AArch64/SVE support.
1433 getActionDefinitionsBuilder(Opcode: G_VECTOR_COMPRESS).lower();
1434
1435 // Access to floating-point environment.
1436 getActionDefinitionsBuilder(Opcodes: {G_GET_FPENV, G_SET_FPENV, G_RESET_FPENV,
1437 G_GET_FPMODE, G_SET_FPMODE, G_RESET_FPMODE})
1438 .libcall();
1439
1440 getActionDefinitionsBuilder(Opcode: G_IS_FPCLASS).lower();
1441
1442 getActionDefinitionsBuilder(Opcode: G_PREFETCH).custom();
1443
1444 getActionDefinitionsBuilder(Opcodes: {G_SCMP, G_UCMP}).lower();
1445
1446 getLegacyLegalizerInfo().computeTables();
1447 verify(MII: *ST.getInstrInfo());
1448}
1449
1450bool AArch64LegalizerInfo::legalizeCustom(
1451 LegalizerHelper &Helper, MachineInstr &MI,
1452 LostDebugLocObserver &LocObserver) const {
1453 MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
1454 MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
1455 GISelChangeObserver &Observer = Helper.Observer;
1456 switch (MI.getOpcode()) {
1457 default:
1458 // No idea what to do.
1459 return false;
1460 case TargetOpcode::G_VAARG:
1461 return legalizeVaArg(MI, MRI, MIRBuilder);
1462 case TargetOpcode::G_LOAD:
1463 case TargetOpcode::G_STORE:
1464 return legalizeLoadStore(MI, MRI, MIRBuilder, Observer);
1465 case TargetOpcode::G_SHL:
1466 case TargetOpcode::G_ASHR:
1467 case TargetOpcode::G_LSHR:
1468 return legalizeShlAshrLshr(MI, MRI, MIRBuilder, Observer);
1469 case TargetOpcode::G_GLOBAL_VALUE:
1470 return legalizeSmallCMGlobalValue(MI, MRI, MIRBuilder, Observer);
1471 case TargetOpcode::G_SBFX:
1472 case TargetOpcode::G_UBFX:
1473 return legalizeBitfieldExtract(MI, MRI, Helper);
1474 case TargetOpcode::G_FSHL:
1475 case TargetOpcode::G_FSHR:
1476 return legalizeFunnelShift(MI, MRI, MIRBuilder, Observer, Helper);
1477 case TargetOpcode::G_ROTR:
1478 return legalizeRotate(MI, MRI, Helper);
1479 case TargetOpcode::G_CTPOP:
1480 return legalizeCTPOP(MI, MRI, Helper);
1481 case TargetOpcode::G_ATOMIC_CMPXCHG:
1482 return legalizeAtomicCmpxchg128(MI, MRI, Helper);
1483 case TargetOpcode::G_CTTZ:
1484 return legalizeCTTZ(MI, Helper);
1485 case TargetOpcode::G_BZERO:
1486 case TargetOpcode::G_MEMCPY:
1487 case TargetOpcode::G_MEMMOVE:
1488 case TargetOpcode::G_MEMSET:
1489 return legalizeMemOps(MI, Helper);
1490 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1491 return legalizeExtractVectorElt(MI, MRI, Helper);
1492 case TargetOpcode::G_DYN_STACKALLOC:
1493 return legalizeDynStackAlloc(MI, Helper);
1494 case TargetOpcode::G_PREFETCH:
1495 return legalizePrefetch(MI, Helper);
1496 case TargetOpcode::G_ABS:
1497 return Helper.lowerAbsToCNeg(MI);
1498 case TargetOpcode::G_ICMP:
1499 return legalizeICMP(MI, MRI, MIRBuilder);
1500 case TargetOpcode::G_BITCAST:
1501 return legalizeBitcast(MI, Helper);
1502 case TargetOpcode::G_FPTRUNC:
1503    // Lowering a G_FPTRUNC from f64 to f16 correctly needs to go through f32
1504    // as an intermediate step.
1505 return legalizeFptrunc(MI, MIRBuilder, MRI);
1506 }
1507
1508 llvm_unreachable("expected switch to return");
1509}
1510
1511bool AArch64LegalizerInfo::legalizeBitcast(MachineInstr &MI,
1512 LegalizerHelper &Helper) const {
1513 assert(MI.getOpcode() == TargetOpcode::G_BITCAST && "Unexpected opcode");
1514 auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
1515  // We handle casts from i1 vectors to scalars by storing the vector to the
1516  // stack and reloading it as a scalar.
1517 if (!DstTy.isScalar() || !SrcTy.isVector() ||
1518 SrcTy.getElementType() != LLT::scalar(SizeInBits: 1))
1519 return false;
1520
1521 Helper.createStackStoreLoad(Res: DstReg, Val: SrcReg);
1522 MI.eraseFromParent();
1523 return true;
1524}
1525
1526bool AArch64LegalizerInfo::legalizeFunnelShift(MachineInstr &MI,
1527 MachineRegisterInfo &MRI,
1528 MachineIRBuilder &MIRBuilder,
1529 GISelChangeObserver &Observer,
1530 LegalizerHelper &Helper) const {
1531 assert(MI.getOpcode() == TargetOpcode::G_FSHL ||
1532 MI.getOpcode() == TargetOpcode::G_FSHR);
1533
1534  // Keep this as a G_FSHR if the shift amount is a G_CONSTANT; otherwise use
1535  // the generic lowering.
1536 Register ShiftNo = MI.getOperand(i: 3).getReg();
1537 LLT ShiftTy = MRI.getType(Reg: ShiftNo);
1538 auto VRegAndVal = getIConstantVRegValWithLookThrough(VReg: ShiftNo, MRI);
1539
1540 // Adjust shift amount according to Opcode (FSHL/FSHR)
1541 // Convert FSHL to FSHR
1542 LLT OperationTy = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
1543 APInt BitWidth(ShiftTy.getSizeInBits(), OperationTy.getSizeInBits(), false);
1544
1545  // Lower non-constant shifts; zero shifts (mod bit width) are left to the optimizer.
1546 if (!VRegAndVal || VRegAndVal->Value.urem(RHS: BitWidth) == 0)
1547 return (Helper.lowerFunnelShiftAsShifts(MI) ==
1548 LegalizerHelper::LegalizeResult::Legalized);
1549
1550 APInt Amount = VRegAndVal->Value.urem(RHS: BitWidth);
1551
1552 Amount = MI.getOpcode() == TargetOpcode::G_FSHL ? BitWidth - Amount : Amount;
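  // For example, on a 32-bit type a G_FSHL by 3 computes the same value as a
  // G_FSHR by 32 - 3 = 29, so only the FSHR form needs to be selected.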
1553
1554  // A G_FSHR whose shift amount is a 64-bit G_CONSTANT in the range
1555  // [0, BitWidth) is already legal.
1556 if (ShiftTy.getSizeInBits() == 64 && MI.getOpcode() == TargetOpcode::G_FSHR &&
1557 VRegAndVal->Value.ult(RHS: BitWidth))
1558 return true;
1559
1560  // Materialize the adjusted shift amount as a 64-bit constant.
1561 auto Cast64 = MIRBuilder.buildConstant(Res: LLT::scalar(SizeInBits: 64), Val: Amount.zext(width: 64));
1562
1563 if (MI.getOpcode() == TargetOpcode::G_FSHR) {
1564 Observer.changingInstr(MI);
1565 MI.getOperand(i: 3).setReg(Cast64.getReg(Idx: 0));
1566 Observer.changedInstr(MI);
1567 }
1568 // If Opcode is FSHL, remove the FSHL instruction and create a FSHR
1569 // instruction
1570 else if (MI.getOpcode() == TargetOpcode::G_FSHL) {
1571 MIRBuilder.buildInstr(Opc: TargetOpcode::G_FSHR, DstOps: {MI.getOperand(i: 0).getReg()},
1572 SrcOps: {MI.getOperand(i: 1).getReg(), MI.getOperand(i: 2).getReg(),
1573 Cast64.getReg(Idx: 0)});
1574 MI.eraseFromParent();
1575 }
1576 return true;
1577}
1578
1579bool AArch64LegalizerInfo::legalizeICMP(MachineInstr &MI,
1580 MachineRegisterInfo &MRI,
1581 MachineIRBuilder &MIRBuilder) const {
1582 Register DstReg = MI.getOperand(i: 0).getReg();
1583 Register SrcReg1 = MI.getOperand(i: 2).getReg();
1584 Register SrcReg2 = MI.getOperand(i: 3).getReg();
1585 LLT DstTy = MRI.getType(Reg: DstReg);
1586 LLT SrcTy = MRI.getType(Reg: SrcReg1);
1587
1588 // Check the vector types are legal
1589 if (DstTy.getScalarSizeInBits() != SrcTy.getScalarSizeInBits() ||
1590 DstTy.getNumElements() != SrcTy.getNumElements() ||
1591 (DstTy.getSizeInBits() != 64 && DstTy.getSizeInBits() != 128))
1592 return false;
1593
1594  // Lower G_ICMP NE to G_ICMP EQ to allow better pattern matching in later
1595  // passes.
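  // e.g. a <4 x s32> G_ICMP intpred(ne) is rewritten as the equivalent
  // intpred(eq) compare followed by a bitwise NOT (a G_XOR with all ones).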
1596 CmpInst::Predicate Pred = (CmpInst::Predicate)MI.getOperand(i: 1).getPredicate();
1597 if (Pred != CmpInst::ICMP_NE)
1598 return true;
1599 Register CmpReg =
1600 MIRBuilder
1601 .buildICmp(Pred: CmpInst::ICMP_EQ, Res: MRI.getType(Reg: DstReg), Op0: SrcReg1, Op1: SrcReg2)
1602 .getReg(Idx: 0);
1603 MIRBuilder.buildNot(Dst: DstReg, Src0: CmpReg);
1604
1605 MI.eraseFromParent();
1606 return true;
1607}
1608
1609bool AArch64LegalizerInfo::legalizeRotate(MachineInstr &MI,
1610 MachineRegisterInfo &MRI,
1611 LegalizerHelper &Helper) const {
1612 // To allow for imported patterns to match, we ensure that the rotate amount
1613 // is 64b with an extension.
1614 Register AmtReg = MI.getOperand(i: 2).getReg();
1615 LLT AmtTy = MRI.getType(Reg: AmtReg);
1616 (void)AmtTy;
1617 assert(AmtTy.isScalar() && "Expected a scalar rotate");
1618 assert(AmtTy.getSizeInBits() < 64 && "Expected this rotate to be legal");
1619 auto NewAmt = Helper.MIRBuilder.buildZExt(Res: LLT::scalar(SizeInBits: 64), Op: AmtReg);
1620 Helper.Observer.changingInstr(MI);
1621 MI.getOperand(i: 2).setReg(NewAmt.getReg(Idx: 0));
1622 Helper.Observer.changedInstr(MI);
1623 return true;
1624}
1625
1626bool AArch64LegalizerInfo::legalizeSmallCMGlobalValue(
1627 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder,
1628 GISelChangeObserver &Observer) const {
1629 assert(MI.getOpcode() == TargetOpcode::G_GLOBAL_VALUE);
1630 // We do this custom legalization to convert G_GLOBAL_VALUE into target ADRP +
1631 // G_ADD_LOW instructions.
1632 // By splitting this here, we can optimize accesses in the small code model by
1633  // folding the G_ADD_LOW into the load/store offset.
1634 auto &GlobalOp = MI.getOperand(i: 1);
1635 // Don't modify an intrinsic call.
1636 if (GlobalOp.isSymbol())
1637 return true;
1638  const auto *GV = GlobalOp.getGlobal();
1639 if (GV->isThreadLocal())
1640 return true; // Don't want to modify TLS vars.
1641
1642 auto &TM = ST->getTargetLowering()->getTargetMachine();
1643 unsigned OpFlags = ST->ClassifyGlobalReference(GV, TM);
1644
1645 if (OpFlags & AArch64II::MO_GOT)
1646 return true;
1647
1648 auto Offset = GlobalOp.getOffset();
1649 Register DstReg = MI.getOperand(i: 0).getReg();
1650 auto ADRP = MIRBuilder.buildInstr(Opc: AArch64::ADRP, DstOps: {LLT::pointer(AddressSpace: 0, SizeInBits: 64)}, SrcOps: {})
1651 .addGlobalAddress(GV, Offset, TargetFlags: OpFlags | AArch64II::MO_PAGE);
1652 // Set the regclass on the dest reg too.
1653 MRI.setRegClass(Reg: ADRP.getReg(Idx: 0), RC: &AArch64::GPR64RegClass);
1654
1655 // MO_TAGGED on the page indicates a tagged address. Set the tag now. We do so
1656 // by creating a MOVK that sets bits 48-63 of the register to (global address
1657 // + 0x100000000 - PC) >> 48. The additional 0x100000000 offset here is to
1658 // prevent an incorrect tag being generated during relocation when the
1659 // global appears before the code section. Without the offset, a global at
1660 // `0x0f00'0000'0000'1000` (i.e. at `0x1000` with tag `0xf`) that's referenced
1661 // by code at `0x2000` would result in `0x0f00'0000'0000'1000 - 0x2000 =
1662 // 0x0eff'ffff'ffff'f000`, meaning the tag would be incorrectly set to `0xe`
1663 // instead of `0xf`.
1664 // This assumes that we're in the small code model so we can assume a binary
1665 // size of <= 4GB, which makes the untagged PC relative offset positive. The
1666 // binary must also be loaded into address range [0, 2^48). Both of these
1667 // properties need to be ensured at runtime when using tagged addresses.
1668 if (OpFlags & AArch64II::MO_TAGGED) {
1669 assert(!Offset &&
1670 "Should not have folded in an offset for a tagged global!");
1671 ADRP = MIRBuilder.buildInstr(Opc: AArch64::MOVKXi, DstOps: {LLT::pointer(AddressSpace: 0, SizeInBits: 64)}, SrcOps: {ADRP})
1672 .addGlobalAddress(GV, Offset: 0x100000000,
1673 TargetFlags: AArch64II::MO_PREL | AArch64II::MO_G3)
1674 .addImm(Val: 48);
1675 MRI.setRegClass(Reg: ADRP.getReg(Idx: 0), RC: &AArch64::GPR64RegClass);
1676 }
1677
1678 MIRBuilder.buildInstr(Opc: AArch64::G_ADD_LOW, DstOps: {DstReg}, SrcOps: {ADRP})
1679 .addGlobalAddress(GV, Offset,
1680 TargetFlags: OpFlags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
1681 MI.eraseFromParent();
1682 return true;
1683}
1684
1685bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
1686 MachineInstr &MI) const {
1687 MachineIRBuilder &MIB = Helper.MIRBuilder;
1688 MachineRegisterInfo &MRI = *MIB.getMRI();
1689
1690 auto LowerUnaryOp = [&MI, &MIB](unsigned Opcode) {
1691 MIB.buildInstr(Opc: Opcode, DstOps: {MI.getOperand(i: 0)}, SrcOps: {MI.getOperand(i: 2)});
1692 MI.eraseFromParent();
1693 return true;
1694 };
1695 auto LowerBinOp = [&MI, &MIB](unsigned Opcode) {
1696 MIB.buildInstr(Opc: Opcode, DstOps: {MI.getOperand(i: 0)},
1697 SrcOps: {MI.getOperand(i: 2), MI.getOperand(i: 3)});
1698 MI.eraseFromParent();
1699 return true;
1700 };
1701 auto LowerTriOp = [&MI, &MIB](unsigned Opcode) {
1702 MIB.buildInstr(Opc: Opcode, DstOps: {MI.getOperand(i: 0)},
1703 SrcOps: {MI.getOperand(i: 2), MI.getOperand(i: 3), MI.getOperand(i: 4)});
1704 MI.eraseFromParent();
1705 return true;
1706 };
1707
1708 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(Val&: MI).getIntrinsicID();
1709 switch (IntrinsicID) {
1710 case Intrinsic::vacopy: {
1711 unsigned PtrSize = ST->isTargetILP32() ? 4 : 8;
1712 unsigned VaListSize =
1713 (ST->isTargetDarwin() || ST->isTargetWindows())
1714 ? PtrSize
1715 : ST->isTargetILP32() ? 20 : 32;
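    // On Darwin and Windows va_list is a single pointer, while the generic
    // AAPCS64 va_list is a 32-byte struct (20 bytes for ILP32), so vacopy is
    // just a load and store of the whole object.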
1716
1717 MachineFunction &MF = *MI.getMF();
1718 auto Val = MF.getRegInfo().createGenericVirtualRegister(
1719 Ty: LLT::scalar(SizeInBits: VaListSize * 8));
1720 MIB.buildLoad(Res: Val, Addr: MI.getOperand(i: 2),
1721 MMO&: *MF.getMachineMemOperand(PtrInfo: MachinePointerInfo(),
1722 F: MachineMemOperand::MOLoad,
1723 Size: VaListSize, BaseAlignment: Align(PtrSize)));
1724 MIB.buildStore(Val, Addr: MI.getOperand(i: 1),
1725 MMO&: *MF.getMachineMemOperand(PtrInfo: MachinePointerInfo(),
1726 F: MachineMemOperand::MOStore,
1727 Size: VaListSize, BaseAlignment: Align(PtrSize)));
1728 MI.eraseFromParent();
1729 return true;
1730 }
1731 case Intrinsic::get_dynamic_area_offset: {
1732 MIB.buildConstant(Res: MI.getOperand(i: 0).getReg(), Val: 0);
1733 MI.eraseFromParent();
1734 return true;
1735 }
1736 case Intrinsic::aarch64_mops_memset_tag: {
1737 assert(MI.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS);
1738 // Anyext the value being set to 64 bit (only the bottom 8 bits are read by
1739 // the instruction).
1740 auto &Value = MI.getOperand(i: 3);
1741 Register ExtValueReg = MIB.buildAnyExt(Res: LLT::scalar(SizeInBits: 64), Op: Value).getReg(Idx: 0);
1742 Value.setReg(ExtValueReg);
1743 return true;
1744 }
1745 case Intrinsic::aarch64_prefetch: {
1746 auto &AddrVal = MI.getOperand(i: 1);
1747
1748 int64_t IsWrite = MI.getOperand(i: 2).getImm();
1749 int64_t Target = MI.getOperand(i: 3).getImm();
1750 int64_t IsStream = MI.getOperand(i: 4).getImm();
1751 int64_t IsData = MI.getOperand(i: 5).getImm();
1752
1753 unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
1754 (!IsData << 3) | // IsDataCache bit
1755 (Target << 1) | // Cache level bits
1756 (unsigned)IsStream; // Stream bit
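    // For example IsWrite=1, IsData=1, Target=0 and IsStream=0 encode as
    // 0b10000, which is PSTL1KEEP in the PRFM immediate encoding.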
1757
1758 MIB.buildInstr(Opcode: AArch64::G_AARCH64_PREFETCH).addImm(Val: PrfOp).add(MO: AddrVal);
1759 MI.eraseFromParent();
1760 return true;
1761 }
1762 case Intrinsic::aarch64_range_prefetch: {
1763 auto &AddrVal = MI.getOperand(i: 1);
1764
1765 int64_t IsWrite = MI.getOperand(i: 2).getImm();
1766 int64_t IsStream = MI.getOperand(i: 3).getImm();
1767 unsigned PrfOp = (IsStream << 2) | IsWrite;
1768
1769 MIB.buildInstr(Opcode: AArch64::G_AARCH64_RANGE_PREFETCH)
1770 .addImm(Val: PrfOp)
1771 .add(MO: AddrVal)
1772 .addUse(RegNo: MI.getOperand(i: 4).getReg()); // Metadata
1773 MI.eraseFromParent();
1774 return true;
1775 }
1776 case Intrinsic::aarch64_prefetch_ir: {
1777 auto &AddrVal = MI.getOperand(i: 1);
1778 MIB.buildInstr(Opcode: AArch64::G_AARCH64_PREFETCH).addImm(Val: 24).add(MO: AddrVal);
1779 MI.eraseFromParent();
1780 return true;
1781 }
1782 case Intrinsic::aarch64_neon_uaddv:
1783 case Intrinsic::aarch64_neon_saddv:
1784 case Intrinsic::aarch64_neon_umaxv:
1785 case Intrinsic::aarch64_neon_smaxv:
1786 case Intrinsic::aarch64_neon_uminv:
1787 case Intrinsic::aarch64_neon_sminv: {
1788 bool IsSigned = IntrinsicID == Intrinsic::aarch64_neon_saddv ||
1789 IntrinsicID == Intrinsic::aarch64_neon_smaxv ||
1790 IntrinsicID == Intrinsic::aarch64_neon_sminv;
1791
1792 auto OldDst = MI.getOperand(i: 0).getReg();
1793 auto OldDstTy = MRI.getType(Reg: OldDst);
1794 LLT NewDstTy = MRI.getType(Reg: MI.getOperand(i: 2).getReg()).getElementType();
1795 if (OldDstTy == NewDstTy)
1796 return true;
1797
1798 auto NewDst = MRI.createGenericVirtualRegister(Ty: NewDstTy);
1799
1800 Helper.Observer.changingInstr(MI);
1801 MI.getOperand(i: 0).setReg(NewDst);
1802 Helper.Observer.changedInstr(MI);
1803
1804 MIB.setInsertPt(MBB&: MIB.getMBB(), II: ++MIB.getInsertPt());
1805 MIB.buildExtOrTrunc(ExtOpc: IsSigned ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT,
1806 Res: OldDst, Op: NewDst);
1807
1808 return true;
1809 }
1810 case Intrinsic::aarch64_neon_uaddlp:
1811 case Intrinsic::aarch64_neon_saddlp: {
1812 unsigned Opc = IntrinsicID == Intrinsic::aarch64_neon_uaddlp
1813 ? AArch64::G_UADDLP
1814 : AArch64::G_SADDLP;
1815 MIB.buildInstr(Opc, DstOps: {MI.getOperand(i: 0)}, SrcOps: {MI.getOperand(i: 2)});
1816 MI.eraseFromParent();
1817
1818 return true;
1819 }
1820 case Intrinsic::aarch64_neon_uaddlv:
1821 case Intrinsic::aarch64_neon_saddlv: {
1822 unsigned Opc = IntrinsicID == Intrinsic::aarch64_neon_uaddlv
1823 ? AArch64::G_UADDLV
1824 : AArch64::G_SADDLV;
1825 Register DstReg = MI.getOperand(i: 0).getReg();
1826 Register SrcReg = MI.getOperand(i: 2).getReg();
1827 LLT DstTy = MRI.getType(Reg: DstReg);
1828
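    // G_UADDLV / G_SADDLV leave the accumulated result in a SIMD register, so
    // model it as a v4s32 (or v2s64 for wider results), extract lane 0 and
    // then truncate or copy it into the original destination type.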
1829 LLT MidTy, ExtTy;
1830 if (DstTy.isScalar() && DstTy.getScalarSizeInBits() <= 32) {
1831 MidTy = LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 32);
1832 ExtTy = LLT::scalar(SizeInBits: 32);
1833 } else {
1834 MidTy = LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 64);
1835 ExtTy = LLT::scalar(SizeInBits: 64);
1836 }
1837
1838 Register MidReg =
1839 MIB.buildInstr(Opc, DstOps: {MidTy}, SrcOps: {SrcReg})->getOperand(i: 0).getReg();
1840 Register ZeroReg =
1841 MIB.buildConstant(Res: LLT::scalar(SizeInBits: 64), Val: 0)->getOperand(i: 0).getReg();
1842 Register ExtReg = MIB.buildInstr(Opc: AArch64::G_EXTRACT_VECTOR_ELT, DstOps: {ExtTy},
1843 SrcOps: {MidReg, ZeroReg})
1844 .getReg(Idx: 0);
1845
1846 if (DstTy.getScalarSizeInBits() < 32)
1847 MIB.buildTrunc(Res: DstReg, Op: ExtReg);
1848 else
1849 MIB.buildCopy(Res: DstReg, Op: ExtReg);
1850
1851 MI.eraseFromParent();
1852
1853 return true;
1854 }
1855 case Intrinsic::aarch64_neon_smax:
1856 return LowerBinOp(TargetOpcode::G_SMAX);
1857 case Intrinsic::aarch64_neon_smin:
1858 return LowerBinOp(TargetOpcode::G_SMIN);
1859 case Intrinsic::aarch64_neon_umax:
1860 return LowerBinOp(TargetOpcode::G_UMAX);
1861 case Intrinsic::aarch64_neon_umin:
1862 return LowerBinOp(TargetOpcode::G_UMIN);
1863 case Intrinsic::aarch64_neon_fmax:
1864 return LowerBinOp(TargetOpcode::G_FMAXIMUM);
1865 case Intrinsic::aarch64_neon_fmin:
1866 return LowerBinOp(TargetOpcode::G_FMINIMUM);
1867 case Intrinsic::aarch64_neon_fmaxnm:
1868 return LowerBinOp(TargetOpcode::G_FMAXNUM);
1869 case Intrinsic::aarch64_neon_fminnm:
1870 return LowerBinOp(TargetOpcode::G_FMINNUM);
1871 case Intrinsic::aarch64_neon_pmull:
1872 case Intrinsic::aarch64_neon_pmull64:
1873 return LowerBinOp(AArch64::G_PMULL);
1874 case Intrinsic::aarch64_neon_smull:
1875 return LowerBinOp(AArch64::G_SMULL);
1876 case Intrinsic::aarch64_neon_umull:
1877 return LowerBinOp(AArch64::G_UMULL);
1878 case Intrinsic::aarch64_neon_sabd:
1879 return LowerBinOp(TargetOpcode::G_ABDS);
1880 case Intrinsic::aarch64_neon_uabd:
1881 return LowerBinOp(TargetOpcode::G_ABDU);
1882 case Intrinsic::aarch64_neon_uhadd:
1883 return LowerBinOp(TargetOpcode::G_UAVGFLOOR);
1884 case Intrinsic::aarch64_neon_urhadd:
1885 return LowerBinOp(TargetOpcode::G_UAVGCEIL);
1886 case Intrinsic::aarch64_neon_shadd:
1887 return LowerBinOp(TargetOpcode::G_SAVGFLOOR);
1888 case Intrinsic::aarch64_neon_srhadd:
1889 return LowerBinOp(TargetOpcode::G_SAVGCEIL);
1890 case Intrinsic::aarch64_neon_sqshrn: {
1891 if (!MRI.getType(Reg: MI.getOperand(i: 0).getReg()).isVector())
1892 return true;
1893 // Create right shift instruction. Store the output register in Shr.
1894 auto Shr = MIB.buildInstr(Opc: AArch64::G_VASHR,
1895 DstOps: {MRI.getType(Reg: MI.getOperand(i: 2).getReg())},
1896 SrcOps: {MI.getOperand(i: 2), MI.getOperand(i: 3).getImm()});
1897 // Build the narrow intrinsic, taking in Shr.
1898 MIB.buildInstr(Opc: TargetOpcode::G_TRUNC_SSAT_S, DstOps: {MI.getOperand(i: 0)}, SrcOps: {Shr});
1899 MI.eraseFromParent();
1900 return true;
1901 }
1902 case Intrinsic::aarch64_neon_sqshrun: {
1903 if (!MRI.getType(Reg: MI.getOperand(i: 0).getReg()).isVector())
1904 return true;
1905 // Create right shift instruction. Store the output register in Shr.
1906 auto Shr = MIB.buildInstr(Opc: AArch64::G_VASHR,
1907 DstOps: {MRI.getType(Reg: MI.getOperand(i: 2).getReg())},
1908 SrcOps: {MI.getOperand(i: 2), MI.getOperand(i: 3).getImm()});
1909 // Build the narrow intrinsic, taking in Shr.
1910 MIB.buildInstr(Opc: TargetOpcode::G_TRUNC_SSAT_U, DstOps: {MI.getOperand(i: 0)}, SrcOps: {Shr});
1911 MI.eraseFromParent();
1912 return true;
1913 }
1914 case Intrinsic::aarch64_neon_sqrshrn: {
1915 if (!MRI.getType(Reg: MI.getOperand(i: 0).getReg()).isVector())
1916 return true;
1917 // Create right shift instruction. Store the output register in Shr.
1918 auto Shr = MIB.buildInstr(Opc: AArch64::G_SRSHR_I,
1919 DstOps: {MRI.getType(Reg: MI.getOperand(i: 2).getReg())},
1920 SrcOps: {MI.getOperand(i: 2), MI.getOperand(i: 3).getImm()});
1921 // Build the narrow intrinsic, taking in Shr.
1922 MIB.buildInstr(Opc: TargetOpcode::G_TRUNC_SSAT_S, DstOps: {MI.getOperand(i: 0)}, SrcOps: {Shr});
1923 MI.eraseFromParent();
1924 return true;
1925 }
1926 case Intrinsic::aarch64_neon_sqrshrun: {
1927 if (!MRI.getType(Reg: MI.getOperand(i: 0).getReg()).isVector())
1928 return true;
1929 // Create right shift instruction. Store the output register in Shr.
1930 auto Shr = MIB.buildInstr(Opc: AArch64::G_SRSHR_I,
1931 DstOps: {MRI.getType(Reg: MI.getOperand(i: 2).getReg())},
1932 SrcOps: {MI.getOperand(i: 2), MI.getOperand(i: 3).getImm()});
1933 // Build the narrow intrinsic, taking in Shr.
1934 MIB.buildInstr(Opc: TargetOpcode::G_TRUNC_SSAT_U, DstOps: {MI.getOperand(i: 0)}, SrcOps: {Shr});
1935 MI.eraseFromParent();
1936 return true;
1937 }
1938 case Intrinsic::aarch64_neon_uqrshrn: {
1939 if (!MRI.getType(Reg: MI.getOperand(i: 0).getReg()).isVector())
1940 return true;
1941 // Create right shift instruction. Store the output register in Shr.
1942 auto Shr = MIB.buildInstr(Opc: AArch64::G_URSHR_I,
1943 DstOps: {MRI.getType(Reg: MI.getOperand(i: 2).getReg())},
1944 SrcOps: {MI.getOperand(i: 2), MI.getOperand(i: 3).getImm()});
1945 // Build the narrow intrinsic, taking in Shr.
1946 MIB.buildInstr(Opc: TargetOpcode::G_TRUNC_USAT_U, DstOps: {MI.getOperand(i: 0)}, SrcOps: {Shr});
1947 MI.eraseFromParent();
1948 return true;
1949 }
1950 case Intrinsic::aarch64_neon_uqshrn: {
1951 if (!MRI.getType(Reg: MI.getOperand(i: 0).getReg()).isVector())
1952 return true;
1953 // Create right shift instruction. Store the output register in Shr.
1954 auto Shr = MIB.buildInstr(Opc: AArch64::G_VLSHR,
1955 DstOps: {MRI.getType(Reg: MI.getOperand(i: 2).getReg())},
1956 SrcOps: {MI.getOperand(i: 2), MI.getOperand(i: 3).getImm()});
1957 // Build the narrow intrinsic, taking in Shr.
1958 MIB.buildInstr(Opc: TargetOpcode::G_TRUNC_USAT_U, DstOps: {MI.getOperand(i: 0)}, SrcOps: {Shr});
1959 MI.eraseFromParent();
1960 return true;
1961 }
1962 case Intrinsic::aarch64_neon_sqshlu: {
1963 // Check if last operand is constant vector dup
1964 auto ShiftAmount = isConstantOrConstantSplatVector(
1965 MI&: *MRI.getVRegDef(Reg: MI.getOperand(i: 3).getReg()), MRI);
1966 if (ShiftAmount) {
1967 // If so, create a new intrinsic with the correct shift amount
1968 MIB.buildInstr(Opc: AArch64::G_SQSHLU_I, DstOps: {MI.getOperand(i: 0)},
1969 SrcOps: {MI.getOperand(i: 2)})
1970 .addImm(Val: ShiftAmount->getSExtValue());
1971 MI.eraseFromParent();
1972 return true;
1973 }
1974 return false;
1975 }
1976 case Intrinsic::aarch64_neon_vsli: {
1977 MIB.buildInstr(
1978 Opc: AArch64::G_SLI, DstOps: {MI.getOperand(i: 0)},
1979 SrcOps: {MI.getOperand(i: 2), MI.getOperand(i: 3), MI.getOperand(i: 4).getImm()});
1980 MI.eraseFromParent();
1981 break;
1982 }
1983 case Intrinsic::aarch64_neon_vsri: {
1984 MIB.buildInstr(
1985 Opc: AArch64::G_SRI, DstOps: {MI.getOperand(i: 0)},
1986 SrcOps: {MI.getOperand(i: 2), MI.getOperand(i: 3), MI.getOperand(i: 4).getImm()});
1987 MI.eraseFromParent();
1988 break;
1989 }
1990 case Intrinsic::aarch64_neon_abs: {
1991 // Lower the intrinsic to G_ABS.
1992 MIB.buildInstr(Opc: TargetOpcode::G_ABS, DstOps: {MI.getOperand(i: 0)}, SrcOps: {MI.getOperand(i: 2)});
1993 MI.eraseFromParent();
1994 return true;
1995 }
1996 case Intrinsic::aarch64_neon_sqadd: {
1997 if (MRI.getType(Reg: MI.getOperand(i: 0).getReg()).isVector())
1998 return LowerBinOp(TargetOpcode::G_SADDSAT);
1999 break;
2000 }
2001 case Intrinsic::aarch64_neon_sqsub: {
2002 if (MRI.getType(Reg: MI.getOperand(i: 0).getReg()).isVector())
2003 return LowerBinOp(TargetOpcode::G_SSUBSAT);
2004 break;
2005 }
2006 case Intrinsic::aarch64_neon_uqadd: {
2007 if (MRI.getType(Reg: MI.getOperand(i: 0).getReg()).isVector())
2008 return LowerBinOp(TargetOpcode::G_UADDSAT);
2009 break;
2010 }
2011 case Intrinsic::aarch64_neon_uqsub: {
2012 if (MRI.getType(Reg: MI.getOperand(i: 0).getReg()).isVector())
2013 return LowerBinOp(TargetOpcode::G_USUBSAT);
2014 break;
2015 }
2016 case Intrinsic::aarch64_neon_udot:
2017 return LowerTriOp(AArch64::G_UDOT);
2018 case Intrinsic::aarch64_neon_sdot:
2019 return LowerTriOp(AArch64::G_SDOT);
2020 case Intrinsic::aarch64_neon_usdot:
2021 return LowerTriOp(AArch64::G_USDOT);
2022 case Intrinsic::aarch64_neon_sqxtn:
2023 return LowerUnaryOp(TargetOpcode::G_TRUNC_SSAT_S);
2024 case Intrinsic::aarch64_neon_sqxtun:
2025 return LowerUnaryOp(TargetOpcode::G_TRUNC_SSAT_U);
2026 case Intrinsic::aarch64_neon_uqxtn:
2027 return LowerUnaryOp(TargetOpcode::G_TRUNC_USAT_U);
2028 case Intrinsic::aarch64_neon_fcvtzu:
2029 return LowerUnaryOp(TargetOpcode::G_FPTOUI_SAT);
2030 case Intrinsic::aarch64_neon_fcvtzs:
2031 return LowerUnaryOp(TargetOpcode::G_FPTOSI_SAT);
2032
2033 case Intrinsic::vector_reverse:
2034 // TODO: Add support for vector_reverse
2035 return false;
2036 }
2037
2038 return true;
2039}
2040
2041bool AArch64LegalizerInfo::legalizeShlAshrLshr(
2042 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder,
2043 GISelChangeObserver &Observer) const {
2044 assert(MI.getOpcode() == TargetOpcode::G_ASHR ||
2045 MI.getOpcode() == TargetOpcode::G_LSHR ||
2046 MI.getOpcode() == TargetOpcode::G_SHL);
2047 // If the shift amount is a G_CONSTANT, promote it to a 64 bit type so the
2048 // imported patterns can select it later. Either way, it will be legal.
2049 Register AmtReg = MI.getOperand(i: 2).getReg();
2050 auto VRegAndVal = getIConstantVRegValWithLookThrough(VReg: AmtReg, MRI);
2051 if (!VRegAndVal)
2052 return true;
2053 // Check the shift amount is in range for an immediate form.
2054 int64_t Amount = VRegAndVal->Value.getSExtValue();
2055 if (Amount > 31)
2056 return true; // This will have to remain a register variant.
2057 auto ExtCst = MIRBuilder.buildConstant(Res: LLT::scalar(SizeInBits: 64), Val: Amount);
2058 Observer.changingInstr(MI);
2059 MI.getOperand(i: 2).setReg(ExtCst.getReg(Idx: 0));
2060 Observer.changedInstr(MI);
2061 return true;
2062}
2063
2064static void matchLDPSTPAddrMode(Register Root, Register &Base, int &Offset,
2065 MachineRegisterInfo &MRI) {
2066 Base = Root;
2067 Offset = 0;
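  // LDP/STP immediates are a signed 7-bit value scaled by 8, so a G_PTR_ADD
  // with a constant offset that is a multiple of 8 in [-512, 504] (for
  // example %root = G_PTR_ADD %base, 16) can be folded into the addressing
  // mode; the caller later emits it as Offset / 8.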
2068
2069 Register NewBase;
2070 int64_t NewOffset;
2071 if (mi_match(R: Root, MRI, P: m_GPtrAdd(L: m_Reg(R&: NewBase), R: m_ICst(Cst&: NewOffset))) &&
2072 isShiftedInt<7, 3>(x: NewOffset)) {
2073 Base = NewBase;
2074 Offset = NewOffset;
2075 }
2076}
2077
2078// FIXME: This should be removed and replaced with the generic bitcast legalize
2079// action.
2080bool AArch64LegalizerInfo::legalizeLoadStore(
2081 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder,
2082 GISelChangeObserver &Observer) const {
2083 assert(MI.getOpcode() == TargetOpcode::G_STORE ||
2084 MI.getOpcode() == TargetOpcode::G_LOAD);
2085 // Here we just try to handle vector loads/stores where our value type might
2086 // have pointer elements, which the SelectionDAG importer can't handle. To
2087 // allow the existing patterns for s64 to fire for p0, we just try to bitcast
2088 // the value to use s64 types.
2089
2090  // Custom legalization requires that the instruction, if not deleted, be
2091  // fully legalized. To allow further legalization of the instruction we
2092  // create a new one and erase the existing one.
2093
2094 Register ValReg = MI.getOperand(i: 0).getReg();
2095 const LLT ValTy = MRI.getType(Reg: ValReg);
2096
2097 if (ValTy == LLT::scalar(SizeInBits: 128)) {
2098
2099 AtomicOrdering Ordering = (*MI.memoperands_begin())->getSuccessOrdering();
2100 bool IsLoad = MI.getOpcode() == TargetOpcode::G_LOAD;
2101 bool IsLoadAcquire = IsLoad && Ordering == AtomicOrdering::Acquire;
2102 bool IsStoreRelease = !IsLoad && Ordering == AtomicOrdering::Release;
2103 bool IsRcpC3 =
2104 ST->hasLSE2() && ST->hasRCPC3() && (IsLoadAcquire || IsStoreRelease);
2105
2106 LLT s64 = LLT::scalar(SizeInBits: 64);
2107
2108 unsigned Opcode;
2109 if (IsRcpC3) {
2110 Opcode = IsLoad ? AArch64::LDIAPPX : AArch64::STILPX;
2111 } else {
2112 // For LSE2, loads/stores should have been converted to monotonic and had
2113 // a fence inserted after them.
2114 assert(Ordering == AtomicOrdering::Monotonic ||
2115 Ordering == AtomicOrdering::Unordered);
2116 assert(ST->hasLSE2() && "ldp/stp not single copy atomic without +lse2");
2117
2118 Opcode = IsLoad ? AArch64::LDPXi : AArch64::STPXi;
2119 }
2120
2121 MachineInstrBuilder NewI;
2122 if (IsLoad) {
2123 NewI = MIRBuilder.buildInstr(Opc: Opcode, DstOps: {s64, s64}, SrcOps: {});
2124 MIRBuilder.buildMergeLikeInstr(
2125 Res: ValReg, Ops: {NewI->getOperand(i: 0), NewI->getOperand(i: 1)});
2126 } else {
2127 auto Split = MIRBuilder.buildUnmerge(Res: s64, Op: MI.getOperand(i: 0));
2128 NewI = MIRBuilder.buildInstr(
2129 Opc: Opcode, DstOps: {}, SrcOps: {Split->getOperand(i: 0), Split->getOperand(i: 1)});
2130 }
2131
2132 if (IsRcpC3) {
2133 NewI.addUse(RegNo: MI.getOperand(i: 1).getReg());
2134 } else {
2135 Register Base;
2136 int Offset;
2137 matchLDPSTPAddrMode(Root: MI.getOperand(i: 1).getReg(), Base, Offset, MRI);
2138 NewI.addUse(RegNo: Base);
2139 NewI.addImm(Val: Offset / 8);
2140 }
2141
2142 NewI.cloneMemRefs(OtherMI: MI);
2143 constrainSelectedInstRegOperands(I&: *NewI, TII: *ST->getInstrInfo(),
2144 TRI: *MRI.getTargetRegisterInfo(),
2145 RBI: *ST->getRegBankInfo());
2146 MI.eraseFromParent();
2147 return true;
2148 }
2149
2150 if (!ValTy.isPointerVector() ||
2151 ValTy.getElementType().getAddressSpace() != 0) {
2152 LLVM_DEBUG(dbgs() << "Tried to do custom legalization on wrong load/store");
2153 return false;
2154 }
2155
2156 unsigned PtrSize = ValTy.getElementType().getSizeInBits();
2157 const LLT NewTy = LLT::vector(EC: ValTy.getElementCount(), ScalarSizeInBits: PtrSize);
2158 auto &MMO = **MI.memoperands_begin();
2159 MMO.setType(NewTy);
2160
2161 if (MI.getOpcode() == TargetOpcode::G_STORE) {
2162 auto Bitcast = MIRBuilder.buildBitcast(Dst: NewTy, Src: ValReg);
2163 MIRBuilder.buildStore(Val: Bitcast.getReg(Idx: 0), Addr: MI.getOperand(i: 1), MMO);
2164 } else {
2165 auto NewLoad = MIRBuilder.buildLoad(Res: NewTy, Addr: MI.getOperand(i: 1), MMO);
2166 MIRBuilder.buildBitcast(Dst: ValReg, Src: NewLoad);
2167 }
2168 MI.eraseFromParent();
2169 return true;
2170}
2171
2172bool AArch64LegalizerInfo::legalizeVaArg(MachineInstr &MI,
2173 MachineRegisterInfo &MRI,
2174 MachineIRBuilder &MIRBuilder) const {
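  // The emitted sequence loads the current pointer out of the va_list,
  // realigns it if the argument needs more than pointer alignment, loads the
  // value, advances the pointer by the pointer-aligned value size, and stores
  // the updated pointer back into the va_list.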
2175 MachineFunction &MF = MIRBuilder.getMF();
2176 Align Alignment(MI.getOperand(i: 2).getImm());
2177 Register Dst = MI.getOperand(i: 0).getReg();
2178 Register ListPtr = MI.getOperand(i: 1).getReg();
2179
2180 LLT PtrTy = MRI.getType(Reg: ListPtr);
2181 LLT IntPtrTy = LLT::scalar(SizeInBits: PtrTy.getSizeInBits());
2182
2183 const unsigned PtrSize = PtrTy.getSizeInBits() / 8;
2184 const Align PtrAlign = Align(PtrSize);
2185 auto List = MIRBuilder.buildLoad(
2186 Res: PtrTy, Addr: ListPtr,
2187 MMO&: *MF.getMachineMemOperand(PtrInfo: MachinePointerInfo(), f: MachineMemOperand::MOLoad,
2188 MemTy: PtrTy, base_alignment: PtrAlign));
2189
2190 MachineInstrBuilder DstPtr;
2191 if (Alignment > PtrAlign) {
2192 // Realign the list to the actual required alignment.
2193 auto AlignMinus1 =
2194 MIRBuilder.buildConstant(Res: IntPtrTy, Val: Alignment.value() - 1);
2195 auto ListTmp = MIRBuilder.buildPtrAdd(Res: PtrTy, Op0: List, Op1: AlignMinus1.getReg(Idx: 0));
2196 DstPtr = MIRBuilder.buildMaskLowPtrBits(Res: PtrTy, Op0: ListTmp, NumBits: Log2(A: Alignment));
2197 } else
2198 DstPtr = List;
2199
2200 LLT ValTy = MRI.getType(Reg: Dst);
2201 uint64_t ValSize = ValTy.getSizeInBits() / 8;
2202 MIRBuilder.buildLoad(
2203 Res: Dst, Addr: DstPtr,
2204 MMO&: *MF.getMachineMemOperand(PtrInfo: MachinePointerInfo(), f: MachineMemOperand::MOLoad,
2205 MemTy: ValTy, base_alignment: std::max(a: Alignment, b: PtrAlign)));
2206
2207 auto Size = MIRBuilder.buildConstant(Res: IntPtrTy, Val: alignTo(Size: ValSize, A: PtrAlign));
2208
2209 auto NewList = MIRBuilder.buildPtrAdd(Res: PtrTy, Op0: DstPtr, Op1: Size.getReg(Idx: 0));
2210
2211 MIRBuilder.buildStore(Val: NewList, Addr: ListPtr,
2212 MMO&: *MF.getMachineMemOperand(PtrInfo: MachinePointerInfo(),
2213 f: MachineMemOperand::MOStore,
2214 MemTy: PtrTy, base_alignment: PtrAlign));
2215
2216 MI.eraseFromParent();
2217 return true;
2218}
2219
2220bool AArch64LegalizerInfo::legalizeBitfieldExtract(
2221 MachineInstr &MI, MachineRegisterInfo &MRI, LegalizerHelper &Helper) const {
2222 // Only legal if we can select immediate forms.
2223 // TODO: Lower this otherwise.
2224 return getIConstantVRegValWithLookThrough(VReg: MI.getOperand(i: 2).getReg(), MRI) &&
2225 getIConstantVRegValWithLookThrough(VReg: MI.getOperand(i: 3).getReg(), MRI);
2226}
2227
2228bool AArch64LegalizerInfo::legalizeCTPOP(MachineInstr &MI,
2229 MachineRegisterInfo &MRI,
2230 LegalizerHelper &Helper) const {
2231 // When there is no integer popcount instruction (FEAT_CSSC isn't available),
2232 // it can be more efficiently lowered to the following sequence that uses
2233 // AdvSIMD registers/instructions as long as the copies to/from the AdvSIMD
2234 // registers are cheap.
2235 // FMOV D0, X0 // copy 64-bit int to vector, high bits zero'd
2236 // CNT V0.8B, V0.8B // 8xbyte pop-counts
2237 // ADDV B0, V0.8B // sum 8xbyte pop-counts
2238 // UMOV X0, V0.B[0] // copy byte result back to integer reg
2239 //
2240 // For 128 bit vector popcounts, we lower to the following sequence:
2241 // cnt.16b v0, v0 // v8s16, v4s32, v2s64
2242 // uaddlp.8h v0, v0 // v8s16, v4s32, v2s64
2243 // uaddlp.4s v0, v0 // v4s32, v2s64
2244 // uaddlp.2d v0, v0 // v2s64
2245 //
2246 // For 64 bit vector popcounts, we lower to the following sequence:
2247 // cnt.8b v0, v0 // v4s16, v2s32
2248 // uaddlp.4h v0, v0 // v4s16, v2s32
2249 // uaddlp.2s v0, v0 // v2s32
2250
2251 MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
2252 Register Dst = MI.getOperand(i: 0).getReg();
2253 Register Val = MI.getOperand(i: 1).getReg();
2254 LLT Ty = MRI.getType(Reg: Val);
2255 unsigned Size = Ty.getSizeInBits();
2256
2257 assert(Ty == MRI.getType(Dst) &&
2258 "Expected src and dst to have the same type!");
2259
2260 if (ST->hasCSSC() && Ty.isScalar() && Size == 128) {
2261 LLT s64 = LLT::scalar(SizeInBits: 64);
2262
2263 auto Split = MIRBuilder.buildUnmerge(Res: s64, Op: Val);
2264 auto CTPOP1 = MIRBuilder.buildCTPOP(Dst: s64, Src0: Split->getOperand(i: 0));
2265 auto CTPOP2 = MIRBuilder.buildCTPOP(Dst: s64, Src0: Split->getOperand(i: 1));
2266 auto Add = MIRBuilder.buildAdd(Dst: s64, Src0: CTPOP1, Src1: CTPOP2);
2267
2268 MIRBuilder.buildZExt(Res: Dst, Op: Add);
2269 MI.eraseFromParent();
2270 return true;
2271 }
2272
2273 if (!ST->hasNEON() ||
2274 MI.getMF()->getFunction().hasFnAttribute(Kind: Attribute::NoImplicitFloat)) {
2275 // Use generic lowering when custom lowering is not possible.
2276 return Ty.isScalar() && (Size == 32 || Size == 64) &&
2277 Helper.lowerBitCount(MI) ==
2278 LegalizerHelper::LegalizeResult::Legalized;
2279 }
2280
2281 // Pre-conditioning: widen Val up to the nearest vector type.
2282 // s32,s64,v4s16,v2s32 -> v8i8
2283 // v8s16,v4s32,v2s64 -> v16i8
2284 LLT VTy = Size == 128 ? LLT::fixed_vector(NumElements: 16, ScalarSizeInBits: 8) : LLT::fixed_vector(NumElements: 8, ScalarSizeInBits: 8);
2285 if (Ty.isScalar()) {
2286    assert((Size == 32 || Size == 64 || Size == 128) &&
           "Expected only 32, 64, or 128 bit scalars!");
2287 if (Size == 32) {
2288 Val = MIRBuilder.buildZExt(Res: LLT::scalar(SizeInBits: 64), Op: Val).getReg(Idx: 0);
2289 }
2290 }
2291 Val = MIRBuilder.buildBitcast(Dst: VTy, Src: Val).getReg(Idx: 0);
2292
2293 // Count bits in each byte-sized lane.
2294 auto CTPOP = MIRBuilder.buildCTPOP(Dst: VTy, Src0: Val);
2295
2296 // Sum across lanes.
2297
2298 if (ST->hasDotProd() && Ty.isVector() && Ty.getNumElements() >= 2 &&
2299 Ty.getScalarSizeInBits() != 16) {
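    // With FEAT_DotProd, accumulate the per-byte counts with a UDOT against a
    // splat of 1: each s32 lane of the result then holds the popcount of its
    // four bytes, and a v2s64 result needs one extra UADDLP to pair up the
    // s32 lanes.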
2300 LLT Dt = Ty == LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 64) ? LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 32) : Ty;
2301 auto Zeros = MIRBuilder.buildConstant(Res: Dt, Val: 0);
2302 auto Ones = MIRBuilder.buildConstant(Res: VTy, Val: 1);
2303 MachineInstrBuilder Sum;
2304
2305 if (Ty == LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 64)) {
2306 auto UDOT =
2307 MIRBuilder.buildInstr(Opc: AArch64::G_UDOT, DstOps: {Dt}, SrcOps: {Zeros, Ones, CTPOP});
2308 Sum = MIRBuilder.buildInstr(Opc: AArch64::G_UADDLP, DstOps: {Ty}, SrcOps: {UDOT});
2309 } else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 32)) {
2310 Sum = MIRBuilder.buildInstr(Opc: AArch64::G_UDOT, DstOps: {Dt}, SrcOps: {Zeros, Ones, CTPOP});
2311 } else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 32)) {
2312 Sum = MIRBuilder.buildInstr(Opc: AArch64::G_UDOT, DstOps: {Dt}, SrcOps: {Zeros, Ones, CTPOP});
2313 } else {
2314 llvm_unreachable("unexpected vector shape");
2315 }
2316
2317 Sum->getOperand(i: 0).setReg(Dst);
2318 MI.eraseFromParent();
2319 return true;
2320 }
2321
2322 Register HSum = CTPOP.getReg(Idx: 0);
2323 unsigned Opc;
2324 SmallVector<LLT> HAddTys;
2325 if (Ty.isScalar()) {
2326 Opc = Intrinsic::aarch64_neon_uaddlv;
2327 HAddTys.push_back(Elt: LLT::scalar(SizeInBits: 32));
2328 } else if (Ty == LLT::fixed_vector(NumElements: 8, ScalarSizeInBits: 16)) {
2329 Opc = Intrinsic::aarch64_neon_uaddlp;
2330 HAddTys.push_back(Elt: LLT::fixed_vector(NumElements: 8, ScalarSizeInBits: 16));
2331 } else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 32)) {
2332 Opc = Intrinsic::aarch64_neon_uaddlp;
2333 HAddTys.push_back(Elt: LLT::fixed_vector(NumElements: 8, ScalarSizeInBits: 16));
2334 HAddTys.push_back(Elt: LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 32));
2335 } else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 64)) {
2336 Opc = Intrinsic::aarch64_neon_uaddlp;
2337 HAddTys.push_back(Elt: LLT::fixed_vector(NumElements: 8, ScalarSizeInBits: 16));
2338 HAddTys.push_back(Elt: LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 32));
2339 HAddTys.push_back(Elt: LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 64));
2340 } else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 16)) {
2341 Opc = Intrinsic::aarch64_neon_uaddlp;
2342 HAddTys.push_back(Elt: LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 16));
2343 } else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 32)) {
2344 Opc = Intrinsic::aarch64_neon_uaddlp;
2345 HAddTys.push_back(Elt: LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 16));
2346 HAddTys.push_back(Elt: LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 32));
2347 } else
2348 llvm_unreachable("unexpected vector shape");
2349 MachineInstrBuilder UADD;
2350 for (LLT HTy : HAddTys) {
2351 UADD = MIRBuilder.buildIntrinsic(ID: Opc, Res: {HTy}).addUse(RegNo: HSum);
2352 HSum = UADD.getReg(Idx: 0);
2353 }
2354
2355 // Post-conditioning.
2356 if (Ty.isScalar() && (Size == 64 || Size == 128))
2357 MIRBuilder.buildZExt(Res: Dst, Op: UADD);
2358 else
2359 UADD->getOperand(i: 0).setReg(Dst);
2360 MI.eraseFromParent();
2361 return true;
2362}
2363
2364bool AArch64LegalizerInfo::legalizeAtomicCmpxchg128(
2365 MachineInstr &MI, MachineRegisterInfo &MRI, LegalizerHelper &Helper) const {
2366 MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
2367 LLT s64 = LLT::scalar(SizeInBits: 64);
2368 auto Addr = MI.getOperand(i: 1).getReg();
2369 auto DesiredI = MIRBuilder.buildUnmerge(Res: {s64, s64}, Op: MI.getOperand(i: 2));
2370 auto NewI = MIRBuilder.buildUnmerge(Res: {s64, s64}, Op: MI.getOperand(i: 3));
2371 auto DstLo = MRI.createGenericVirtualRegister(Ty: s64);
2372 auto DstHi = MRI.createGenericVirtualRegister(Ty: s64);
2373
2374 MachineInstrBuilder CAS;
2375 if (ST->hasLSE()) {
2376    // We have 128-bit CASP instructions taking XSeqPair registers, which are
2377    // s128. We need the merge/unmerge to bracket the expansion and pair up
2378    // with the rest of the MIR, so we must reassemble the extracted registers
2379    // into a 128-bit known-regclass value with code like this:
2380 //
2381 // %in1 = REG_SEQUENCE Lo, Hi ; One for each input
2382 // %out = CASP %in1, ...
2383 // %OldLo = G_EXTRACT %out, 0
2384 // %OldHi = G_EXTRACT %out, 64
2385 auto Ordering = (*MI.memoperands_begin())->getMergedOrdering();
2386 unsigned Opcode;
2387 switch (Ordering) {
2388 case AtomicOrdering::Acquire:
2389 Opcode = AArch64::CASPAX;
2390 break;
2391 case AtomicOrdering::Release:
2392 Opcode = AArch64::CASPLX;
2393 break;
2394 case AtomicOrdering::AcquireRelease:
2395 case AtomicOrdering::SequentiallyConsistent:
2396 Opcode = AArch64::CASPALX;
2397 break;
2398 default:
2399 Opcode = AArch64::CASPX;
2400 break;
2401 }
2402
2403 LLT s128 = LLT::scalar(SizeInBits: 128);
2404 auto CASDst = MRI.createGenericVirtualRegister(Ty: s128);
2405 auto CASDesired = MRI.createGenericVirtualRegister(Ty: s128);
2406 auto CASNew = MRI.createGenericVirtualRegister(Ty: s128);
2407 MIRBuilder.buildInstr(Opc: TargetOpcode::REG_SEQUENCE, DstOps: {CASDesired}, SrcOps: {})
2408 .addUse(RegNo: DesiredI->getOperand(i: 0).getReg())
2409 .addImm(Val: AArch64::sube64)
2410 .addUse(RegNo: DesiredI->getOperand(i: 1).getReg())
2411 .addImm(Val: AArch64::subo64);
2412 MIRBuilder.buildInstr(Opc: TargetOpcode::REG_SEQUENCE, DstOps: {CASNew}, SrcOps: {})
2413 .addUse(RegNo: NewI->getOperand(i: 0).getReg())
2414 .addImm(Val: AArch64::sube64)
2415 .addUse(RegNo: NewI->getOperand(i: 1).getReg())
2416 .addImm(Val: AArch64::subo64);
2417
2418 CAS = MIRBuilder.buildInstr(Opc: Opcode, DstOps: {CASDst}, SrcOps: {CASDesired, CASNew, Addr});
2419
2420 MIRBuilder.buildExtract(Res: {DstLo}, Src: {CASDst}, Index: 0);
2421 MIRBuilder.buildExtract(Res: {DstHi}, Src: {CASDst}, Index: 64);
2422 } else {
2423    // The -O0 CMP_SWAP_128 is friendlier to generate code for because
2424    // LDXP/STXP can take arbitrary registers, so it just has the normal GPR64
2425    // operands that the rest of AArch64 expects.
2426 auto Ordering = (*MI.memoperands_begin())->getMergedOrdering();
2427 unsigned Opcode;
2428 switch (Ordering) {
2429 case AtomicOrdering::Acquire:
2430 Opcode = AArch64::CMP_SWAP_128_ACQUIRE;
2431 break;
2432 case AtomicOrdering::Release:
2433 Opcode = AArch64::CMP_SWAP_128_RELEASE;
2434 break;
2435 case AtomicOrdering::AcquireRelease:
2436 case AtomicOrdering::SequentiallyConsistent:
2437 Opcode = AArch64::CMP_SWAP_128;
2438 break;
2439 default:
2440 Opcode = AArch64::CMP_SWAP_128_MONOTONIC;
2441 break;
2442 }
2443
2444 auto Scratch = MRI.createVirtualRegister(RegClass: &AArch64::GPR64RegClass);
2445 CAS = MIRBuilder.buildInstr(Opc: Opcode, DstOps: {DstLo, DstHi, Scratch},
2446 SrcOps: {Addr, DesiredI->getOperand(i: 0),
2447 DesiredI->getOperand(i: 1), NewI->getOperand(i: 0),
2448 NewI->getOperand(i: 1)});
2449 }
2450
2451 CAS.cloneMemRefs(OtherMI: MI);
2452 constrainSelectedInstRegOperands(I&: *CAS, TII: *ST->getInstrInfo(),
2453 TRI: *MRI.getTargetRegisterInfo(),
2454 RBI: *ST->getRegBankInfo());
2455
2456 MIRBuilder.buildMergeLikeInstr(Res: MI.getOperand(i: 0), Ops: {DstLo, DstHi});
2457 MI.eraseFromParent();
2458 return true;
2459}
2460
2461bool AArch64LegalizerInfo::legalizeCTTZ(MachineInstr &MI,
2462 LegalizerHelper &Helper) const {
2463 MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
2464 MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
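  // AArch64 has no count-trailing-zeros instruction, but it does have RBIT and
  // CLZ, so compute cttz(x) as ctlz(bitreverse(x)).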
2465 LLT Ty = MRI.getType(Reg: MI.getOperand(i: 1).getReg());
2466 auto BitReverse = MIRBuilder.buildBitReverse(Dst: Ty, Src: MI.getOperand(i: 1));
2467 MIRBuilder.buildCTLZ(Dst: MI.getOperand(i: 0).getReg(), Src0: BitReverse);
2468 MI.eraseFromParent();
2469 return true;
2470}
2471
2472bool AArch64LegalizerInfo::legalizeMemOps(MachineInstr &MI,
2473 LegalizerHelper &Helper) const {
2474 MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
2475
2476  // The tagged variant (aarch64_mops_memset_tag) is legalized in legalizeIntrinsic.
2477 if (MI.getOpcode() == TargetOpcode::G_MEMSET) {
2478 // Anyext the value being set to 64 bit (only the bottom 8 bits are read by
2479 // the instruction).
2480 auto &Value = MI.getOperand(i: 1);
2481 Register ExtValueReg =
2482 MIRBuilder.buildAnyExt(Res: LLT::scalar(SizeInBits: 64), Op: Value).getReg(Idx: 0);
2483 Value.setReg(ExtValueReg);
2484 return true;
2485 }
2486
2487 return false;
2488}
2489
2490bool AArch64LegalizerInfo::legalizeExtractVectorElt(
2491 MachineInstr &MI, MachineRegisterInfo &MRI, LegalizerHelper &Helper) const {
2492 const GExtractVectorElement *Element = cast<GExtractVectorElement>(Val: &MI);
2493 auto VRegAndVal =
2494 getIConstantVRegValWithLookThrough(VReg: Element->getIndexReg(), MRI);
2495 if (VRegAndVal)
2496 return true;
2497 LLT VecTy = MRI.getType(Reg: Element->getVectorReg());
2498 if (VecTy.isScalableVector())
2499 return true;
2500 return Helper.lowerExtractInsertVectorElt(MI) !=
2501 LegalizerHelper::LegalizeResult::UnableToLegalize;
2502}
2503
2504bool AArch64LegalizerInfo::legalizeDynStackAlloc(
2505 MachineInstr &MI, LegalizerHelper &Helper) const {
2506 MachineFunction &MF = *MI.getParent()->getParent();
2507 MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
2508 MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
2509
2510 // If stack probing is not enabled for this function, use the default
2511 // lowering.
2512 if (!MF.getFunction().hasFnAttribute(Kind: "probe-stack") ||
2513 MF.getFunction().getFnAttribute(Kind: "probe-stack").getValueAsString() !=
2514 "inline-asm") {
2515 Helper.lowerDynStackAlloc(MI);
2516 return true;
2517 }
2518
2519 Register Dst = MI.getOperand(i: 0).getReg();
2520 Register AllocSize = MI.getOperand(i: 1).getReg();
2521 Align Alignment = assumeAligned(Value: MI.getOperand(i: 2).getImm());
2522
2523 assert(MRI.getType(Dst) == LLT::pointer(0, 64) &&
2524 "Unexpected type for dynamic alloca");
2525 assert(MRI.getType(AllocSize) == LLT::scalar(64) &&
2526 "Unexpected type for dynamic alloca");
2527
2528 LLT PtrTy = MRI.getType(Reg: Dst);
2529 Register SPReg =
2530 Helper.getTargetLowering().getStackPointerRegisterToSaveRestore();
2531 Register SPTmp =
2532 Helper.getDynStackAllocTargetPtr(SPReg, AllocSize, Alignment, PtrTy);
2533 auto NewMI =
2534 MIRBuilder.buildInstr(Opc: AArch64::PROBED_STACKALLOC_DYN, DstOps: {}, SrcOps: {SPTmp});
2535 MRI.setRegClass(Reg: NewMI.getReg(Idx: 0), RC: &AArch64::GPR64commonRegClass);
2536 MIRBuilder.setInsertPt(MBB&: *NewMI->getParent(), II: NewMI);
2537 MIRBuilder.buildCopy(Res: Dst, Op: SPTmp);
2538
2539 MI.eraseFromParent();
2540 return true;
2541}
2542
2543bool AArch64LegalizerInfo::legalizePrefetch(MachineInstr &MI,
2544 LegalizerHelper &Helper) const {
2545 MachineIRBuilder &MIB = Helper.MIRBuilder;
2546 auto &AddrVal = MI.getOperand(i: 0);
2547
2548 int64_t IsWrite = MI.getOperand(i: 1).getImm();
2549 int64_t Locality = MI.getOperand(i: 2).getImm();
2550 int64_t IsData = MI.getOperand(i: 3).getImm();
2551
2552 bool IsStream = Locality == 0;
2553 if (Locality != 0) {
2554 assert(Locality <= 3 && "Prefetch locality out-of-range");
2555    // The IR locality degree is the inverse of the target cache level: higher
2556    // locality means a closer (faster) cache. Flip the value, since the PRFM
2557    // target-level encoding starts at 0 for L1.
2558 Locality = 3 - Locality;
2559 }
2560
2561 unsigned PrfOp = (IsWrite << 4) | (!IsData << 3) | (Locality << 1) | IsStream;
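  // For example IsWrite=0, Locality=3 and IsData=1 (a "read, keep in L1, data"
  // prefetch) encodes as 0b00000, i.e. PLDL1KEEP.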
2562
2563 MIB.buildInstr(Opcode: AArch64::G_AARCH64_PREFETCH).addImm(Val: PrfOp).add(MO: AddrVal);
2564 MI.eraseFromParent();
2565 return true;
2566}
2567
2568bool AArch64LegalizerInfo::legalizeFptrunc(MachineInstr &MI,
2569 MachineIRBuilder &MIRBuilder,
2570 MachineRegisterInfo &MRI) const {
2571 auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs();
2572 assert(SrcTy.isFixedVector() && isPowerOf2_32(SrcTy.getNumElements()) &&
2573 "Expected a power of 2 elements");
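  // Truncating f64 to f16 via f32 with normal rounding can round twice, so the
  // f64 -> f32 step uses G_FPTRUNC_ODD (round to odd, as in FCVTXN) to avoid
  // double-rounding errors. For example a v4s64 source is unmerged into two
  // v2s64 halves, each converted to v2s32 with round-to-odd, re-merged into a
  // v4s32 and then truncated to v4s16.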
2574
2575 LLT s16 = LLT::scalar(SizeInBits: 16);
2576 LLT s32 = LLT::scalar(SizeInBits: 32);
2577 LLT s64 = LLT::scalar(SizeInBits: 64);
2578 LLT v2s16 = LLT::fixed_vector(NumElements: 2, ScalarTy: s16);
2579 LLT v4s16 = LLT::fixed_vector(NumElements: 4, ScalarTy: s16);
2580 LLT v2s32 = LLT::fixed_vector(NumElements: 2, ScalarTy: s32);
2581 LLT v4s32 = LLT::fixed_vector(NumElements: 4, ScalarTy: s32);
2582 LLT v2s64 = LLT::fixed_vector(NumElements: 2, ScalarTy: s64);
2583
2584 SmallVector<Register> RegsToUnmergeTo;
2585 SmallVector<Register> TruncOddDstRegs;
2586 SmallVector<Register> RegsToMerge;
2587
2588 unsigned ElemCount = SrcTy.getNumElements();
2589
2590  // Find the biggest chunk size we can work with.
2591 int StepSize = ElemCount % 4 ? 2 : 4;
2592
2593  // If the element count is a power of 2 greater than 2, first unmerge the
2594  // source into enough v2s64 pieces.
2595 if (ElemCount <= 2)
2596 RegsToUnmergeTo.push_back(Elt: Src);
2597 else {
2598 for (unsigned i = 0; i < ElemCount / 2; ++i)
2599 RegsToUnmergeTo.push_back(Elt: MRI.createGenericVirtualRegister(Ty: v2s64));
2600
2601 MIRBuilder.buildUnmerge(Res: RegsToUnmergeTo, Op: Src);
2602 }
2603
2604 // Create all of the round-to-odd instructions and store them
2605 for (auto SrcReg : RegsToUnmergeTo) {
2606 Register Mid =
2607 MIRBuilder.buildInstr(Opc: AArch64::G_FPTRUNC_ODD, DstOps: {v2s32}, SrcOps: {SrcReg})
2608 .getReg(Idx: 0);
2609 TruncOddDstRegs.push_back(Elt: Mid);
2610 }
2611
2612  // Truncate v4s32 to v4s16 where possible to reduce the instruction count;
2613  // otherwise truncate v2s32 to v2s16.
2614 unsigned Index = 0;
2615 for (unsigned LoopIter = 0; LoopIter < ElemCount / StepSize; ++LoopIter) {
2616 if (StepSize == 4) {
2617 Register ConcatDst =
2618 MIRBuilder
2619 .buildMergeLikeInstr(
2620 Res: {v4s32}, Ops: {TruncOddDstRegs[Index++], TruncOddDstRegs[Index++]})
2621 .getReg(Idx: 0);
2622
2623 RegsToMerge.push_back(
2624 Elt: MIRBuilder.buildFPTrunc(Res: v4s16, Op: ConcatDst).getReg(Idx: 0));
2625 } else {
2626 RegsToMerge.push_back(
2627 Elt: MIRBuilder.buildFPTrunc(Res: v2s16, Op: TruncOddDstRegs[Index++]).getReg(Idx: 0));
2628 }
2629 }
2630
2631 // If there is only one register, replace the destination
2632 if (RegsToMerge.size() == 1) {
2633 MRI.replaceRegWith(FromReg: Dst, ToReg: RegsToMerge.pop_back_val());
2634 MI.eraseFromParent();
2635 return true;
2636 }
2637
2638 // Merge the rest of the instructions & replace the register
2639 Register Fin = MIRBuilder.buildMergeLikeInstr(Res: DstTy, Ops: RegsToMerge).getReg(Idx: 0);
2640 MRI.replaceRegWith(FromReg: Dst, ToReg: Fin);
2641 MI.eraseFromParent();
2642 return true;
2643}
2644