//===---------- ARM.cpp - Emit LLVM Code for builtins ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This contains code to emit Builtin calls as LLVM code.
//
//===----------------------------------------------------------------------===//

#include "ABIInfo.h"
#include "CGBuiltin.h"
#include "CGDebugInfo.h"
#include "TargetInfo.h"
#include "clang/Basic/TargetBuiltins.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/IR/IntrinsicsARM.h"
#include "llvm/IR/IntrinsicsBPF.h"
#include "llvm/TargetParser/AArch64TargetParser.h"

#include <numeric>

using namespace clang;
using namespace CodeGen;
using namespace llvm;

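// Map an AArch64 MSVC-compatibility builtin to the target-independent
// MSVCIntrin code so it can be lowered through the common MSVC builtin path.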
static std::optional<CodeGenFunction::MSVCIntrin>
translateAarch64ToMsvcIntrin(unsigned BuiltinID) {
  using MSVCIntrin = CodeGenFunction::MSVCIntrin;
  switch (BuiltinID) {
  default:
    return std::nullopt;
  case clang::AArch64::BI_BitScanForward:
  case clang::AArch64::BI_BitScanForward64:
    return MSVCIntrin::_BitScanForward;
  case clang::AArch64::BI_BitScanReverse:
  case clang::AArch64::BI_BitScanReverse64:
    return MSVCIntrin::_BitScanReverse;
  case clang::AArch64::BI_InterlockedAnd64:
    return MSVCIntrin::_InterlockedAnd;
  case clang::AArch64::BI_InterlockedExchange64:
    return MSVCIntrin::_InterlockedExchange;
  case clang::AArch64::BI_InterlockedExchangeAdd64:
    return MSVCIntrin::_InterlockedExchangeAdd;
  case clang::AArch64::BI_InterlockedExchangeSub64:
    return MSVCIntrin::_InterlockedExchangeSub;
  case clang::AArch64::BI_InterlockedOr64:
    return MSVCIntrin::_InterlockedOr;
  case clang::AArch64::BI_InterlockedXor64:
    return MSVCIntrin::_InterlockedXor;
  case clang::AArch64::BI_InterlockedDecrement64:
    return MSVCIntrin::_InterlockedDecrement;
  case clang::AArch64::BI_InterlockedIncrement64:
    return MSVCIntrin::_InterlockedIncrement;
  case clang::AArch64::BI_InterlockedExchangeAdd8_acq:
  case clang::AArch64::BI_InterlockedExchangeAdd16_acq:
  case clang::AArch64::BI_InterlockedExchangeAdd_acq:
  case clang::AArch64::BI_InterlockedExchangeAdd64_acq:
    return MSVCIntrin::_InterlockedExchangeAdd_acq;
  case clang::AArch64::BI_InterlockedExchangeAdd8_rel:
  case clang::AArch64::BI_InterlockedExchangeAdd16_rel:
  case clang::AArch64::BI_InterlockedExchangeAdd_rel:
  case clang::AArch64::BI_InterlockedExchangeAdd64_rel:
    return MSVCIntrin::_InterlockedExchangeAdd_rel;
  case clang::AArch64::BI_InterlockedExchangeAdd8_nf:
  case clang::AArch64::BI_InterlockedExchangeAdd16_nf:
  case clang::AArch64::BI_InterlockedExchangeAdd_nf:
  case clang::AArch64::BI_InterlockedExchangeAdd64_nf:
    return MSVCIntrin::_InterlockedExchangeAdd_nf;
  case clang::AArch64::BI_InterlockedExchange8_acq:
  case clang::AArch64::BI_InterlockedExchange16_acq:
  case clang::AArch64::BI_InterlockedExchange_acq:
  case clang::AArch64::BI_InterlockedExchange64_acq:
  case clang::AArch64::BI_InterlockedExchangePointer_acq:
    return MSVCIntrin::_InterlockedExchange_acq;
  case clang::AArch64::BI_InterlockedExchange8_rel:
  case clang::AArch64::BI_InterlockedExchange16_rel:
  case clang::AArch64::BI_InterlockedExchange_rel:
  case clang::AArch64::BI_InterlockedExchange64_rel:
  case clang::AArch64::BI_InterlockedExchangePointer_rel:
    return MSVCIntrin::_InterlockedExchange_rel;
  case clang::AArch64::BI_InterlockedExchange8_nf:
  case clang::AArch64::BI_InterlockedExchange16_nf:
  case clang::AArch64::BI_InterlockedExchange_nf:
  case clang::AArch64::BI_InterlockedExchange64_nf:
  case clang::AArch64::BI_InterlockedExchangePointer_nf:
    return MSVCIntrin::_InterlockedExchange_nf;
  case clang::AArch64::BI_InterlockedCompareExchange8_acq:
  case clang::AArch64::BI_InterlockedCompareExchange16_acq:
  case clang::AArch64::BI_InterlockedCompareExchange_acq:
  case clang::AArch64::BI_InterlockedCompareExchange64_acq:
  case clang::AArch64::BI_InterlockedCompareExchangePointer_acq:
    return MSVCIntrin::_InterlockedCompareExchange_acq;
  case clang::AArch64::BI_InterlockedCompareExchange8_rel:
  case clang::AArch64::BI_InterlockedCompareExchange16_rel:
  case clang::AArch64::BI_InterlockedCompareExchange_rel:
  case clang::AArch64::BI_InterlockedCompareExchange64_rel:
  case clang::AArch64::BI_InterlockedCompareExchangePointer_rel:
    return MSVCIntrin::_InterlockedCompareExchange_rel;
  case clang::AArch64::BI_InterlockedCompareExchange8_nf:
  case clang::AArch64::BI_InterlockedCompareExchange16_nf:
  case clang::AArch64::BI_InterlockedCompareExchange_nf:
  case clang::AArch64::BI_InterlockedCompareExchange64_nf:
    return MSVCIntrin::_InterlockedCompareExchange_nf;
  case clang::AArch64::BI_InterlockedCompareExchange128:
    return MSVCIntrin::_InterlockedCompareExchange128;
  case clang::AArch64::BI_InterlockedCompareExchange128_acq:
    return MSVCIntrin::_InterlockedCompareExchange128_acq;
  case clang::AArch64::BI_InterlockedCompareExchange128_nf:
    return MSVCIntrin::_InterlockedCompareExchange128_nf;
  case clang::AArch64::BI_InterlockedCompareExchange128_rel:
    return MSVCIntrin::_InterlockedCompareExchange128_rel;
  case clang::AArch64::BI_InterlockedOr8_acq:
  case clang::AArch64::BI_InterlockedOr16_acq:
  case clang::AArch64::BI_InterlockedOr_acq:
  case clang::AArch64::BI_InterlockedOr64_acq:
    return MSVCIntrin::_InterlockedOr_acq;
  case clang::AArch64::BI_InterlockedOr8_rel:
  case clang::AArch64::BI_InterlockedOr16_rel:
  case clang::AArch64::BI_InterlockedOr_rel:
  case clang::AArch64::BI_InterlockedOr64_rel:
    return MSVCIntrin::_InterlockedOr_rel;
  case clang::AArch64::BI_InterlockedOr8_nf:
  case clang::AArch64::BI_InterlockedOr16_nf:
  case clang::AArch64::BI_InterlockedOr_nf:
  case clang::AArch64::BI_InterlockedOr64_nf:
    return MSVCIntrin::_InterlockedOr_nf;
  case clang::AArch64::BI_InterlockedXor8_acq:
  case clang::AArch64::BI_InterlockedXor16_acq:
  case clang::AArch64::BI_InterlockedXor_acq:
  case clang::AArch64::BI_InterlockedXor64_acq:
    return MSVCIntrin::_InterlockedXor_acq;
  case clang::AArch64::BI_InterlockedXor8_rel:
  case clang::AArch64::BI_InterlockedXor16_rel:
  case clang::AArch64::BI_InterlockedXor_rel:
  case clang::AArch64::BI_InterlockedXor64_rel:
    return MSVCIntrin::_InterlockedXor_rel;
  case clang::AArch64::BI_InterlockedXor8_nf:
  case clang::AArch64::BI_InterlockedXor16_nf:
  case clang::AArch64::BI_InterlockedXor_nf:
  case clang::AArch64::BI_InterlockedXor64_nf:
    return MSVCIntrin::_InterlockedXor_nf;
  case clang::AArch64::BI_InterlockedAnd8_acq:
  case clang::AArch64::BI_InterlockedAnd16_acq:
  case clang::AArch64::BI_InterlockedAnd_acq:
  case clang::AArch64::BI_InterlockedAnd64_acq:
    return MSVCIntrin::_InterlockedAnd_acq;
  case clang::AArch64::BI_InterlockedAnd8_rel:
  case clang::AArch64::BI_InterlockedAnd16_rel:
  case clang::AArch64::BI_InterlockedAnd_rel:
  case clang::AArch64::BI_InterlockedAnd64_rel:
    return MSVCIntrin::_InterlockedAnd_rel;
  case clang::AArch64::BI_InterlockedAnd8_nf:
  case clang::AArch64::BI_InterlockedAnd16_nf:
  case clang::AArch64::BI_InterlockedAnd_nf:
  case clang::AArch64::BI_InterlockedAnd64_nf:
    return MSVCIntrin::_InterlockedAnd_nf;
  case clang::AArch64::BI_InterlockedIncrement16_acq:
  case clang::AArch64::BI_InterlockedIncrement_acq:
  case clang::AArch64::BI_InterlockedIncrement64_acq:
    return MSVCIntrin::_InterlockedIncrement_acq;
  case clang::AArch64::BI_InterlockedIncrement16_rel:
  case clang::AArch64::BI_InterlockedIncrement_rel:
  case clang::AArch64::BI_InterlockedIncrement64_rel:
    return MSVCIntrin::_InterlockedIncrement_rel;
  case clang::AArch64::BI_InterlockedIncrement16_nf:
  case clang::AArch64::BI_InterlockedIncrement_nf:
  case clang::AArch64::BI_InterlockedIncrement64_nf:
    return MSVCIntrin::_InterlockedIncrement_nf;
  case clang::AArch64::BI_InterlockedDecrement16_acq:
  case clang::AArch64::BI_InterlockedDecrement_acq:
  case clang::AArch64::BI_InterlockedDecrement64_acq:
    return MSVCIntrin::_InterlockedDecrement_acq;
  case clang::AArch64::BI_InterlockedDecrement16_rel:
  case clang::AArch64::BI_InterlockedDecrement_rel:
  case clang::AArch64::BI_InterlockedDecrement64_rel:
    return MSVCIntrin::_InterlockedDecrement_rel;
  case clang::AArch64::BI_InterlockedDecrement16_nf:
  case clang::AArch64::BI_InterlockedDecrement_nf:
  case clang::AArch64::BI_InterlockedDecrement64_nf:
    return MSVCIntrin::_InterlockedDecrement_nf;
  }
  llvm_unreachable("must return from switch");
}

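// As above, but for the 32-bit ARM MSVC-compatibility builtins.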
static std::optional<CodeGenFunction::MSVCIntrin>
translateArmToMsvcIntrin(unsigned BuiltinID) {
  using MSVCIntrin = CodeGenFunction::MSVCIntrin;
  switch (BuiltinID) {
  default:
    return std::nullopt;
  case clang::ARM::BI_BitScanForward:
  case clang::ARM::BI_BitScanForward64:
    return MSVCIntrin::_BitScanForward;
  case clang::ARM::BI_BitScanReverse:
  case clang::ARM::BI_BitScanReverse64:
    return MSVCIntrin::_BitScanReverse;
  case clang::ARM::BI_InterlockedAnd64:
    return MSVCIntrin::_InterlockedAnd;
  case clang::ARM::BI_InterlockedExchange64:
    return MSVCIntrin::_InterlockedExchange;
  case clang::ARM::BI_InterlockedExchangeAdd64:
    return MSVCIntrin::_InterlockedExchangeAdd;
  case clang::ARM::BI_InterlockedExchangeSub64:
    return MSVCIntrin::_InterlockedExchangeSub;
  case clang::ARM::BI_InterlockedOr64:
    return MSVCIntrin::_InterlockedOr;
  case clang::ARM::BI_InterlockedXor64:
    return MSVCIntrin::_InterlockedXor;
  case clang::ARM::BI_InterlockedDecrement64:
    return MSVCIntrin::_InterlockedDecrement;
  case clang::ARM::BI_InterlockedIncrement64:
    return MSVCIntrin::_InterlockedIncrement;
  case clang::ARM::BI_InterlockedExchangeAdd8_acq:
  case clang::ARM::BI_InterlockedExchangeAdd16_acq:
  case clang::ARM::BI_InterlockedExchangeAdd_acq:
  case clang::ARM::BI_InterlockedExchangeAdd64_acq:
    return MSVCIntrin::_InterlockedExchangeAdd_acq;
  case clang::ARM::BI_InterlockedExchangeAdd8_rel:
  case clang::ARM::BI_InterlockedExchangeAdd16_rel:
  case clang::ARM::BI_InterlockedExchangeAdd_rel:
  case clang::ARM::BI_InterlockedExchangeAdd64_rel:
    return MSVCIntrin::_InterlockedExchangeAdd_rel;
  case clang::ARM::BI_InterlockedExchangeAdd8_nf:
  case clang::ARM::BI_InterlockedExchangeAdd16_nf:
  case clang::ARM::BI_InterlockedExchangeAdd_nf:
  case clang::ARM::BI_InterlockedExchangeAdd64_nf:
    return MSVCIntrin::_InterlockedExchangeAdd_nf;
  case clang::ARM::BI_InterlockedExchange8_acq:
  case clang::ARM::BI_InterlockedExchange16_acq:
  case clang::ARM::BI_InterlockedExchange_acq:
  case clang::ARM::BI_InterlockedExchange64_acq:
  case clang::ARM::BI_InterlockedExchangePointer_acq:
    return MSVCIntrin::_InterlockedExchange_acq;
  case clang::ARM::BI_InterlockedExchange8_rel:
  case clang::ARM::BI_InterlockedExchange16_rel:
  case clang::ARM::BI_InterlockedExchange_rel:
  case clang::ARM::BI_InterlockedExchange64_rel:
  case clang::ARM::BI_InterlockedExchangePointer_rel:
    return MSVCIntrin::_InterlockedExchange_rel;
  case clang::ARM::BI_InterlockedExchange8_nf:
  case clang::ARM::BI_InterlockedExchange16_nf:
  case clang::ARM::BI_InterlockedExchange_nf:
  case clang::ARM::BI_InterlockedExchange64_nf:
  case clang::ARM::BI_InterlockedExchangePointer_nf:
    return MSVCIntrin::_InterlockedExchange_nf;
  case clang::ARM::BI_InterlockedCompareExchange8_acq:
  case clang::ARM::BI_InterlockedCompareExchange16_acq:
  case clang::ARM::BI_InterlockedCompareExchange_acq:
  case clang::ARM::BI_InterlockedCompareExchange64_acq:
  case clang::ARM::BI_InterlockedCompareExchangePointer_acq:
    return MSVCIntrin::_InterlockedCompareExchange_acq;
  case clang::ARM::BI_InterlockedCompareExchange8_rel:
  case clang::ARM::BI_InterlockedCompareExchange16_rel:
  case clang::ARM::BI_InterlockedCompareExchange_rel:
  case clang::ARM::BI_InterlockedCompareExchange64_rel:
  case clang::ARM::BI_InterlockedCompareExchangePointer_rel:
    return MSVCIntrin::_InterlockedCompareExchange_rel;
  case clang::ARM::BI_InterlockedCompareExchange8_nf:
  case clang::ARM::BI_InterlockedCompareExchange16_nf:
  case clang::ARM::BI_InterlockedCompareExchange_nf:
  case clang::ARM::BI_InterlockedCompareExchange64_nf:
    return MSVCIntrin::_InterlockedCompareExchange_nf;
  case clang::ARM::BI_InterlockedOr8_acq:
  case clang::ARM::BI_InterlockedOr16_acq:
  case clang::ARM::BI_InterlockedOr_acq:
  case clang::ARM::BI_InterlockedOr64_acq:
    return MSVCIntrin::_InterlockedOr_acq;
  case clang::ARM::BI_InterlockedOr8_rel:
  case clang::ARM::BI_InterlockedOr16_rel:
  case clang::ARM::BI_InterlockedOr_rel:
  case clang::ARM::BI_InterlockedOr64_rel:
    return MSVCIntrin::_InterlockedOr_rel;
  case clang::ARM::BI_InterlockedOr8_nf:
  case clang::ARM::BI_InterlockedOr16_nf:
  case clang::ARM::BI_InterlockedOr_nf:
  case clang::ARM::BI_InterlockedOr64_nf:
    return MSVCIntrin::_InterlockedOr_nf;
  case clang::ARM::BI_InterlockedXor8_acq:
  case clang::ARM::BI_InterlockedXor16_acq:
  case clang::ARM::BI_InterlockedXor_acq:
  case clang::ARM::BI_InterlockedXor64_acq:
    return MSVCIntrin::_InterlockedXor_acq;
  case clang::ARM::BI_InterlockedXor8_rel:
  case clang::ARM::BI_InterlockedXor16_rel:
  case clang::ARM::BI_InterlockedXor_rel:
  case clang::ARM::BI_InterlockedXor64_rel:
    return MSVCIntrin::_InterlockedXor_rel;
  case clang::ARM::BI_InterlockedXor8_nf:
  case clang::ARM::BI_InterlockedXor16_nf:
  case clang::ARM::BI_InterlockedXor_nf:
  case clang::ARM::BI_InterlockedXor64_nf:
    return MSVCIntrin::_InterlockedXor_nf;
  case clang::ARM::BI_InterlockedAnd8_acq:
  case clang::ARM::BI_InterlockedAnd16_acq:
  case clang::ARM::BI_InterlockedAnd_acq:
  case clang::ARM::BI_InterlockedAnd64_acq:
    return MSVCIntrin::_InterlockedAnd_acq;
  case clang::ARM::BI_InterlockedAnd8_rel:
  case clang::ARM::BI_InterlockedAnd16_rel:
  case clang::ARM::BI_InterlockedAnd_rel:
  case clang::ARM::BI_InterlockedAnd64_rel:
    return MSVCIntrin::_InterlockedAnd_rel;
  case clang::ARM::BI_InterlockedAnd8_nf:
  case clang::ARM::BI_InterlockedAnd16_nf:
  case clang::ARM::BI_InterlockedAnd_nf:
  case clang::ARM::BI_InterlockedAnd64_nf:
    return MSVCIntrin::_InterlockedAnd_nf;
  case clang::ARM::BI_InterlockedIncrement16_acq:
  case clang::ARM::BI_InterlockedIncrement_acq:
  case clang::ARM::BI_InterlockedIncrement64_acq:
    return MSVCIntrin::_InterlockedIncrement_acq;
  case clang::ARM::BI_InterlockedIncrement16_rel:
  case clang::ARM::BI_InterlockedIncrement_rel:
  case clang::ARM::BI_InterlockedIncrement64_rel:
    return MSVCIntrin::_InterlockedIncrement_rel;
  case clang::ARM::BI_InterlockedIncrement16_nf:
  case clang::ARM::BI_InterlockedIncrement_nf:
  case clang::ARM::BI_InterlockedIncrement64_nf:
    return MSVCIntrin::_InterlockedIncrement_nf;
  case clang::ARM::BI_InterlockedDecrement16_acq:
  case clang::ARM::BI_InterlockedDecrement_acq:
  case clang::ARM::BI_InterlockedDecrement64_acq:
    return MSVCIntrin::_InterlockedDecrement_acq;
  case clang::ARM::BI_InterlockedDecrement16_rel:
  case clang::ARM::BI_InterlockedDecrement_rel:
  case clang::ARM::BI_InterlockedDecrement64_rel:
    return MSVCIntrin::_InterlockedDecrement_rel;
  case clang::ARM::BI_InterlockedDecrement16_nf:
  case clang::ARM::BI_InterlockedDecrement_nf:
  case clang::ARM::BI_InterlockedDecrement64_nf:
    return MSVCIntrin::_InterlockedDecrement_nf;
  }
  llvm_unreachable("must return from switch");
}

// Emit an intrinsic where all operands are of the same type as the result.
// Depending on mode, this may be a constrained floating-point intrinsic.
static Value *emitCallMaybeConstrainedFPBuiltin(CodeGenFunction &CGF,
                                                unsigned IntrinsicID,
                                                unsigned ConstrainedIntrinsicID,
                                                llvm::Type *Ty,
                                                ArrayRef<Value *> Args) {
  Function *F;
  if (CGF.Builder.getIsFPConstrained())
    F = CGF.CGM.getIntrinsic(ConstrainedIntrinsicID, Ty);
  else
    F = CGF.CGM.getIntrinsic(IntrinsicID, Ty);

  if (CGF.Builder.getIsFPConstrained())
    return CGF.Builder.CreateConstrainedFPCall(F, Args);
  else
    return CGF.Builder.CreateCall(F, Args);
}

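// Map a NEON type descriptor (element type plus quad flag) to the LLVM fixed
// vector type used to represent it, falling back to i16 elements for
// half/bfloat when the target lacks native support.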
static llvm::FixedVectorType *GetNeonType(CodeGenFunction *CGF,
                                          NeonTypeFlags TypeFlags,
                                          bool HasFastHalfType = true,
                                          bool V1Ty = false,
                                          bool AllowBFloatArgsAndRet = true) {
  int IsQuad = TypeFlags.isQuad();
  switch (TypeFlags.getEltType()) {
  case NeonTypeFlags::Int8:
  case NeonTypeFlags::Poly8:
  case NeonTypeFlags::MFloat8:
    return llvm::FixedVectorType::get(CGF->Int8Ty, V1Ty ? 1 : (8 << IsQuad));
  case NeonTypeFlags::Int16:
  case NeonTypeFlags::Poly16:
    return llvm::FixedVectorType::get(CGF->Int16Ty, V1Ty ? 1 : (4 << IsQuad));
  case NeonTypeFlags::BFloat16:
    if (AllowBFloatArgsAndRet)
      return llvm::FixedVectorType::get(CGF->BFloatTy, V1Ty ? 1 : (4 << IsQuad));
    else
      return llvm::FixedVectorType::get(CGF->Int16Ty, V1Ty ? 1 : (4 << IsQuad));
  case NeonTypeFlags::Float16:
    if (HasFastHalfType)
      return llvm::FixedVectorType::get(CGF->HalfTy, V1Ty ? 1 : (4 << IsQuad));
    else
      return llvm::FixedVectorType::get(CGF->Int16Ty, V1Ty ? 1 : (4 << IsQuad));
  case NeonTypeFlags::Int32:
    return llvm::FixedVectorType::get(CGF->Int32Ty, V1Ty ? 1 : (2 << IsQuad));
  case NeonTypeFlags::Int64:
  case NeonTypeFlags::Poly64:
    return llvm::FixedVectorType::get(CGF->Int64Ty, V1Ty ? 1 : (1 << IsQuad));
  case NeonTypeFlags::Poly128:
    // FIXME: i128 and f128 are not fully supported in Clang and LLVM; much of
    // the i128/f128 API is still missing. Use v16i8 to represent poly128 and
    // rely on pattern matching.
    return llvm::FixedVectorType::get(CGF->Int8Ty, 16);
  case NeonTypeFlags::Float32:
    return llvm::FixedVectorType::get(CGF->FloatTy, V1Ty ? 1 : (2 << IsQuad));
  case NeonTypeFlags::Float64:
    return llvm::FixedVectorType::get(CGF->DoubleTy, V1Ty ? 1 : (1 << IsQuad));
  }
  llvm_unreachable("Unknown vector element type!");
}

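// Return the floating-point vector type with the same lane count and element
// width as the given integer NEON type (i16 -> f16, i32 -> f32, i64 -> f64).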
static llvm::VectorType *GetFloatNeonType(CodeGenFunction *CGF,
                                          NeonTypeFlags IntTypeFlags) {
  int IsQuad = IntTypeFlags.isQuad();
  switch (IntTypeFlags.getEltType()) {
  case NeonTypeFlags::Int16:
    return llvm::FixedVectorType::get(CGF->HalfTy, (4 << IsQuad));
  case NeonTypeFlags::Int32:
    return llvm::FixedVectorType::get(CGF->FloatTy, (2 << IsQuad));
  case NeonTypeFlags::Int64:
    return llvm::FixedVectorType::get(CGF->DoubleTy, (1 << IsQuad));
  default:
    llvm_unreachable("Type can't be converted to floating-point!");
  }
}

Value *CodeGenFunction::EmitNeonSplat(Value *V, Constant *C,
                                      const ElementCount &Count) {
  Value *SV = llvm::ConstantVector::getSplat(Count, C);
  return Builder.CreateShuffleVector(V, V, SV, "lane");
}

Value *CodeGenFunction::EmitNeonSplat(Value *V, Constant *C) {
  ElementCount EC = cast<llvm::VectorType>(V->getType())->getElementCount();
  return EmitNeonSplat(V, C, EC);
}

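// Emit a call to a NEON intrinsic, bitcasting each operand to the parameter
// type the intrinsic expects. If 'shift' is non-zero, that operand is instead
// rebuilt as a constant shift-amount vector (negated for right shifts).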
Value *CodeGenFunction::EmitNeonCall(Function *F, SmallVectorImpl<Value*> &Ops,
                                     const char *name,
                                     unsigned shift, bool rightshift) {
  unsigned j = 0;
  for (Function::const_arg_iterator ai = F->arg_begin(), ae = F->arg_end();
       ai != ae; ++ai, ++j) {
    if (F->isConstrainedFPIntrinsic())
      if (ai->getType()->isMetadataTy())
        continue;
    if (shift > 0 && shift == j)
      Ops[j] = EmitNeonShiftVector(Ops[j], ai->getType(), rightshift);
    else
      Ops[j] = Builder.CreateBitCast(Ops[j], ai->getType(), name);
  }

  if (F->isConstrainedFPIntrinsic())
    return Builder.CreateConstrainedFPCall(F, Ops, name);
  else
    return Builder.CreateCall(F, Ops, name);
}

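// Emit an FP8 NEON intrinsic call. The last argument of the builtin is the
// FPMR value; write it to the FPMR register before issuing the call.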
Value *CodeGenFunction::EmitFP8NeonCall(unsigned IID,
                                        ArrayRef<llvm::Type *> Tys,
                                        SmallVectorImpl<Value *> &Ops,
                                        const CallExpr *E, const char *name) {
  llvm::Value *FPM =
      EmitScalarOrConstFoldImmArg(/* ICEArguments */ 0, E->getNumArgs() - 1, E);
  Builder.CreateCall(CGM.getIntrinsic(Intrinsic::aarch64_set_fpmr), FPM);
  return EmitNeonCall(CGM.getIntrinsic(IID, Tys), Ops, name);
}

llvm::Value *CodeGenFunction::EmitFP8NeonFDOTCall(
    unsigned IID, bool ExtendLaneArg, llvm::Type *RetTy,
    SmallVectorImpl<llvm::Value *> &Ops, const CallExpr *E, const char *name) {

  const unsigned ElemCount = Ops[0]->getType()->getPrimitiveSizeInBits() /
                             RetTy->getPrimitiveSizeInBits();
  llvm::Type *Tys[] = {llvm::FixedVectorType::get(RetTy, ElemCount),
                       Ops[1]->getType()};
  if (ExtendLaneArg) {
    auto *VT = llvm::FixedVectorType::get(Int8Ty, 16);
    Ops[2] = Builder.CreateInsertVector(VT, PoisonValue::get(VT), Ops[2],
                                        uint64_t(0));
  }
  return EmitFP8NeonCall(IID, Tys, Ops, E, name);
}

llvm::Value *CodeGenFunction::EmitFP8NeonFMLACall(
    unsigned IID, bool ExtendLaneArg, llvm::Type *RetTy,
    SmallVectorImpl<llvm::Value *> &Ops, const CallExpr *E, const char *name) {

  if (ExtendLaneArg) {
    auto *VT = llvm::FixedVectorType::get(Int8Ty, 16);
    Ops[2] = Builder.CreateInsertVector(VT, PoisonValue::get(VT), Ops[2],
                                        uint64_t(0));
  }
  const unsigned ElemCount = Ops[0]->getType()->getPrimitiveSizeInBits() /
                             RetTy->getPrimitiveSizeInBits();
  return EmitFP8NeonCall(IID, {llvm::FixedVectorType::get(RetTy, ElemCount)},
                         Ops, E, name);
}

Value *CodeGenFunction::EmitNeonShiftVector(Value *V, llvm::Type *Ty,
                                            bool neg) {
  int SV = cast<ConstantInt>(V)->getSExtValue();
  return ConstantInt::getSigned(Ty, neg ? -SV : SV);
}

Value *CodeGenFunction::EmitFP8NeonCvtCall(unsigned IID, llvm::Type *Ty0,
                                           llvm::Type *Ty1, bool Extract,
                                           SmallVectorImpl<llvm::Value *> &Ops,
                                           const CallExpr *E,
                                           const char *name) {
  llvm::Type *Tys[] = {Ty0, Ty1};
  if (Extract) {
    // Op[0] is mfloat8x16_t, but the intrinsic converts only the lower part of
    // the vector.
    Tys[1] = llvm::FixedVectorType::get(Int8Ty, 8);
    Ops[0] = Builder.CreateExtractVector(Tys[1], Ops[0], uint64_t(0));
  }
  return EmitFP8NeonCall(IID, Tys, Ops, E, name);
}

// Right-shift a vector by a constant.
Value *CodeGenFunction::EmitNeonRShiftImm(Value *Vec, Value *Shift,
                                          llvm::Type *Ty, bool usgn,
                                          const char *name) {
  llvm::VectorType *VTy = cast<llvm::VectorType>(Ty);

  int ShiftAmt = cast<ConstantInt>(Shift)->getSExtValue();
  int EltSize = VTy->getScalarSizeInBits();

  Vec = Builder.CreateBitCast(Vec, Ty);

  // lshr/ashr are undefined when the shift amount is equal to the vector
  // element size.
  if (ShiftAmt == EltSize) {
    if (usgn) {
      // Right-shifting an unsigned value by its size yields 0.
      return llvm::ConstantAggregateZero::get(VTy);
    } else {
      // Right-shifting a signed value by its size is equivalent
      // to a shift of size-1.
      --ShiftAmt;
      Shift = ConstantInt::get(VTy->getElementType(), ShiftAmt);
    }
  }

  Shift = EmitNeonShiftVector(Shift, Ty, false);
  if (usgn)
    return Builder.CreateLShr(Vec, Shift, name);
  else
    return Builder.CreateAShr(Vec, Shift, name);
}

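// Bits used in ARMVectorIntrinsicInfo::TypeModifier to describe how the
// overloaded LLVM intrinsic's signature is derived from the builtin call.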
enum {
  AddRetType = (1 << 0),
  Add1ArgType = (1 << 1),
  Add2ArgTypes = (1 << 2),

  VectorizeRetType = (1 << 3),
  VectorizeArgTypes = (1 << 4),

  InventFloatType = (1 << 5),
  UnsignedAlts = (1 << 6),

  Use64BitVectors = (1 << 7),
  Use128BitVectors = (1 << 8),

  Vectorize1ArgType = Add1ArgType | VectorizeArgTypes,
  VectorRet = AddRetType | VectorizeRetType,
  VectorRetGetArgs01 =
      AddRetType | Add2ArgTypes | VectorizeRetType | VectorizeArgTypes,
  FpCmpzModifiers =
      AddRetType | VectorizeRetType | Add1ArgType | InventFloatType
};

namespace {
struct ARMVectorIntrinsicInfo {
  const char *NameHint;
  unsigned BuiltinID;
  unsigned LLVMIntrinsic;
  unsigned AltLLVMIntrinsic;
  uint64_t TypeModifier;

  bool operator<(unsigned RHSBuiltinID) const {
    return BuiltinID < RHSBuiltinID;
  }
  bool operator<(const ARMVectorIntrinsicInfo &TE) const {
    return BuiltinID < TE.BuiltinID;
  }
};
} // end anonymous namespace

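// The NEONMAPn macros build table entries mapping a NEON builtin to zero, one,
// or two LLVM intrinsics plus a TypeModifier mask. The tables below are kept
// sorted by builtin ID so they can be binary-searched (see operator< above).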
#define NEONMAP0(NameBase) \
  { #NameBase, NEON::BI__builtin_neon_ ## NameBase, 0, 0, 0 }

#define NEONMAP1(NameBase, LLVMIntrinsic, TypeModifier) \
  { #NameBase, NEON:: BI__builtin_neon_ ## NameBase, \
    Intrinsic::LLVMIntrinsic, 0, TypeModifier }

#define NEONMAP2(NameBase, LLVMIntrinsic, AltLLVMIntrinsic, TypeModifier) \
  { #NameBase, NEON:: BI__builtin_neon_ ## NameBase, \
    Intrinsic::LLVMIntrinsic, Intrinsic::AltLLVMIntrinsic, \
    TypeModifier }

// clang-format off
static const ARMVectorIntrinsicInfo ARMSIMDIntrinsicMap [] = {
  NEONMAP1(__a32_vcvt_bf16_f32, arm_neon_vcvtfp2bf, 0),
  NEONMAP0(splat_lane_v),
  NEONMAP0(splat_laneq_v),
  NEONMAP0(splatq_lane_v),
  NEONMAP0(splatq_laneq_v),
  NEONMAP2(vabd_v, arm_neon_vabdu, arm_neon_vabds, Add1ArgType | UnsignedAlts),
  NEONMAP2(vabdq_v, arm_neon_vabdu, arm_neon_vabds, Add1ArgType | UnsignedAlts),
  NEONMAP1(vabs_v, arm_neon_vabs, 0),
  NEONMAP1(vabsq_v, arm_neon_vabs, 0),
  NEONMAP0(vadd_v),
  NEONMAP0(vaddhn_v),
  NEONMAP0(vaddq_v),
  NEONMAP1(vaesdq_u8, arm_neon_aesd, 0),
  NEONMAP1(vaeseq_u8, arm_neon_aese, 0),
  NEONMAP1(vaesimcq_u8, arm_neon_aesimc, 0),
  NEONMAP1(vaesmcq_u8, arm_neon_aesmc, 0),
  NEONMAP1(vbfdot_f32, arm_neon_bfdot, 0),
  NEONMAP1(vbfdotq_f32, arm_neon_bfdot, 0),
  NEONMAP1(vbfmlalbq_f32, arm_neon_bfmlalb, 0),
  NEONMAP1(vbfmlaltq_f32, arm_neon_bfmlalt, 0),
  NEONMAP1(vbfmmlaq_f32, arm_neon_bfmmla, 0),
  NEONMAP1(vbsl_v, arm_neon_vbsl, AddRetType),
  NEONMAP1(vbslq_v, arm_neon_vbsl, AddRetType),
  NEONMAP1(vcadd_rot270_f16, arm_neon_vcadd_rot270, Add1ArgType),
  NEONMAP1(vcadd_rot270_f32, arm_neon_vcadd_rot270, Add1ArgType),
  NEONMAP1(vcadd_rot90_f16, arm_neon_vcadd_rot90, Add1ArgType),
  NEONMAP1(vcadd_rot90_f32, arm_neon_vcadd_rot90, Add1ArgType),
  NEONMAP1(vcaddq_rot270_f16, arm_neon_vcadd_rot270, Add1ArgType),
  NEONMAP1(vcaddq_rot270_f32, arm_neon_vcadd_rot270, Add1ArgType),
  NEONMAP1(vcaddq_rot270_f64, arm_neon_vcadd_rot270, Add1ArgType),
  NEONMAP1(vcaddq_rot90_f16, arm_neon_vcadd_rot90, Add1ArgType),
  NEONMAP1(vcaddq_rot90_f32, arm_neon_vcadd_rot90, Add1ArgType),
  NEONMAP1(vcaddq_rot90_f64, arm_neon_vcadd_rot90, Add1ArgType),
  NEONMAP1(vcage_v, arm_neon_vacge, 0),
  NEONMAP1(vcageq_v, arm_neon_vacge, 0),
  NEONMAP1(vcagt_v, arm_neon_vacgt, 0),
  NEONMAP1(vcagtq_v, arm_neon_vacgt, 0),
  NEONMAP1(vcale_v, arm_neon_vacge, 0),
  NEONMAP1(vcaleq_v, arm_neon_vacge, 0),
  NEONMAP1(vcalt_v, arm_neon_vacgt, 0),
  NEONMAP1(vcaltq_v, arm_neon_vacgt, 0),
  NEONMAP0(vceqz_v),
  NEONMAP0(vceqzq_v),
  NEONMAP0(vcgez_v),
  NEONMAP0(vcgezq_v),
  NEONMAP0(vcgtz_v),
  NEONMAP0(vcgtzq_v),
  NEONMAP0(vclez_v),
  NEONMAP0(vclezq_v),
  NEONMAP1(vcls_v, arm_neon_vcls, Add1ArgType),
  NEONMAP1(vclsq_v, arm_neon_vcls, Add1ArgType),
  NEONMAP0(vcltz_v),
  NEONMAP0(vcltzq_v),
  NEONMAP1(vclz_v, ctlz, Add1ArgType),
  NEONMAP1(vclzq_v, ctlz, Add1ArgType),
  NEONMAP1(vcnt_v, ctpop, Add1ArgType),
  NEONMAP1(vcntq_v, ctpop, Add1ArgType),
  NEONMAP1(vcvt_f16_f32, arm_neon_vcvtfp2hf, 0),
  NEONMAP0(vcvt_f16_s16),
  NEONMAP0(vcvt_f16_u16),
  NEONMAP1(vcvt_f32_f16, arm_neon_vcvthf2fp, 0),
  NEONMAP0(vcvt_f32_v),
  NEONMAP1(vcvt_n_f16_s16, arm_neon_vcvtfxs2fp, 0),
  NEONMAP1(vcvt_n_f16_u16, arm_neon_vcvtfxu2fp, 0),
  NEONMAP2(vcvt_n_f32_v, arm_neon_vcvtfxu2fp, arm_neon_vcvtfxs2fp, 0),
  NEONMAP1(vcvt_n_s16_f16, arm_neon_vcvtfp2fxs, 0),
  NEONMAP1(vcvt_n_s32_v, arm_neon_vcvtfp2fxs, 0),
  NEONMAP1(vcvt_n_s64_v, arm_neon_vcvtfp2fxs, 0),
  NEONMAP1(vcvt_n_u16_f16, arm_neon_vcvtfp2fxu, 0),
  NEONMAP1(vcvt_n_u32_v, arm_neon_vcvtfp2fxu, 0),
  NEONMAP1(vcvt_n_u64_v, arm_neon_vcvtfp2fxu, 0),
  NEONMAP0(vcvt_s16_f16),
  NEONMAP0(vcvt_s32_v),
  NEONMAP0(vcvt_s64_v),
  NEONMAP0(vcvt_u16_f16),
  NEONMAP0(vcvt_u32_v),
  NEONMAP0(vcvt_u64_v),
  NEONMAP1(vcvta_s16_f16, arm_neon_vcvtas, 0),
  NEONMAP1(vcvta_s32_v, arm_neon_vcvtas, 0),
  NEONMAP1(vcvta_s64_v, arm_neon_vcvtas, 0),
  NEONMAP1(vcvta_u16_f16, arm_neon_vcvtau, 0),
  NEONMAP1(vcvta_u32_v, arm_neon_vcvtau, 0),
  NEONMAP1(vcvta_u64_v, arm_neon_vcvtau, 0),
  NEONMAP1(vcvtaq_s16_f16, arm_neon_vcvtas, 0),
  NEONMAP1(vcvtaq_s32_v, arm_neon_vcvtas, 0),
  NEONMAP1(vcvtaq_s64_v, arm_neon_vcvtas, 0),
  NEONMAP1(vcvtaq_u16_f16, arm_neon_vcvtau, 0),
  NEONMAP1(vcvtaq_u32_v, arm_neon_vcvtau, 0),
  NEONMAP1(vcvtaq_u64_v, arm_neon_vcvtau, 0),
  NEONMAP1(vcvth_bf16_f32, arm_neon_vcvtbfp2bf, 0),
  NEONMAP1(vcvtm_s16_f16, arm_neon_vcvtms, 0),
  NEONMAP1(vcvtm_s32_v, arm_neon_vcvtms, 0),
  NEONMAP1(vcvtm_s64_v, arm_neon_vcvtms, 0),
  NEONMAP1(vcvtm_u16_f16, arm_neon_vcvtmu, 0),
  NEONMAP1(vcvtm_u32_v, arm_neon_vcvtmu, 0),
  NEONMAP1(vcvtm_u64_v, arm_neon_vcvtmu, 0),
  NEONMAP1(vcvtmq_s16_f16, arm_neon_vcvtms, 0),
  NEONMAP1(vcvtmq_s32_v, arm_neon_vcvtms, 0),
  NEONMAP1(vcvtmq_s64_v, arm_neon_vcvtms, 0),
  NEONMAP1(vcvtmq_u16_f16, arm_neon_vcvtmu, 0),
  NEONMAP1(vcvtmq_u32_v, arm_neon_vcvtmu, 0),
  NEONMAP1(vcvtmq_u64_v, arm_neon_vcvtmu, 0),
  NEONMAP1(vcvtn_s16_f16, arm_neon_vcvtns, 0),
  NEONMAP1(vcvtn_s32_v, arm_neon_vcvtns, 0),
  NEONMAP1(vcvtn_s64_v, arm_neon_vcvtns, 0),
  NEONMAP1(vcvtn_u16_f16, arm_neon_vcvtnu, 0),
  NEONMAP1(vcvtn_u32_v, arm_neon_vcvtnu, 0),
  NEONMAP1(vcvtn_u64_v, arm_neon_vcvtnu, 0),
  NEONMAP1(vcvtnq_s16_f16, arm_neon_vcvtns, 0),
  NEONMAP1(vcvtnq_s32_v, arm_neon_vcvtns, 0),
  NEONMAP1(vcvtnq_s64_v, arm_neon_vcvtns, 0),
  NEONMAP1(vcvtnq_u16_f16, arm_neon_vcvtnu, 0),
  NEONMAP1(vcvtnq_u32_v, arm_neon_vcvtnu, 0),
  NEONMAP1(vcvtnq_u64_v, arm_neon_vcvtnu, 0),
  NEONMAP1(vcvtp_s16_f16, arm_neon_vcvtps, 0),
  NEONMAP1(vcvtp_s32_v, arm_neon_vcvtps, 0),
  NEONMAP1(vcvtp_s64_v, arm_neon_vcvtps, 0),
  NEONMAP1(vcvtp_u16_f16, arm_neon_vcvtpu, 0),
  NEONMAP1(vcvtp_u32_v, arm_neon_vcvtpu, 0),
  NEONMAP1(vcvtp_u64_v, arm_neon_vcvtpu, 0),
  NEONMAP1(vcvtpq_s16_f16, arm_neon_vcvtps, 0),
  NEONMAP1(vcvtpq_s32_v, arm_neon_vcvtps, 0),
  NEONMAP1(vcvtpq_s64_v, arm_neon_vcvtps, 0),
  NEONMAP1(vcvtpq_u16_f16, arm_neon_vcvtpu, 0),
  NEONMAP1(vcvtpq_u32_v, arm_neon_vcvtpu, 0),
  NEONMAP1(vcvtpq_u64_v, arm_neon_vcvtpu, 0),
  NEONMAP0(vcvtq_f16_s16),
  NEONMAP0(vcvtq_f16_u16),
  NEONMAP0(vcvtq_f32_v),
  NEONMAP1(vcvtq_n_f16_s16, arm_neon_vcvtfxs2fp, 0),
  NEONMAP1(vcvtq_n_f16_u16, arm_neon_vcvtfxu2fp, 0),
  NEONMAP2(vcvtq_n_f32_v, arm_neon_vcvtfxu2fp, arm_neon_vcvtfxs2fp, 0),
  NEONMAP1(vcvtq_n_s16_f16, arm_neon_vcvtfp2fxs, 0),
  NEONMAP1(vcvtq_n_s32_v, arm_neon_vcvtfp2fxs, 0),
  NEONMAP1(vcvtq_n_s64_v, arm_neon_vcvtfp2fxs, 0),
  NEONMAP1(vcvtq_n_u16_f16, arm_neon_vcvtfp2fxu, 0),
  NEONMAP1(vcvtq_n_u32_v, arm_neon_vcvtfp2fxu, 0),
  NEONMAP1(vcvtq_n_u64_v, arm_neon_vcvtfp2fxu, 0),
  NEONMAP0(vcvtq_s16_f16),
  NEONMAP0(vcvtq_s32_v),
  NEONMAP0(vcvtq_s64_v),
  NEONMAP0(vcvtq_u16_f16),
  NEONMAP0(vcvtq_u32_v),
  NEONMAP0(vcvtq_u64_v),
  NEONMAP1(vdot_s32, arm_neon_sdot, 0),
  NEONMAP1(vdot_u32, arm_neon_udot, 0),
  NEONMAP1(vdotq_s32, arm_neon_sdot, 0),
  NEONMAP1(vdotq_u32, arm_neon_udot, 0),
  NEONMAP0(vext_v),
  NEONMAP0(vextq_v),
  NEONMAP0(vfma_v),
  NEONMAP0(vfmaq_v),
  NEONMAP2(vhadd_v, arm_neon_vhaddu, arm_neon_vhadds, Add1ArgType | UnsignedAlts),
  NEONMAP2(vhaddq_v, arm_neon_vhaddu, arm_neon_vhadds, Add1ArgType | UnsignedAlts),
  NEONMAP2(vhsub_v, arm_neon_vhsubu, arm_neon_vhsubs, Add1ArgType | UnsignedAlts),
  NEONMAP2(vhsubq_v, arm_neon_vhsubu, arm_neon_vhsubs, Add1ArgType | UnsignedAlts),
  NEONMAP0(vld1_dup_v),
  NEONMAP1(vld1_v, arm_neon_vld1, 0),
  NEONMAP1(vld1_x2_v, arm_neon_vld1x2, 0),
  NEONMAP1(vld1_x3_v, arm_neon_vld1x3, 0),
  NEONMAP1(vld1_x4_v, arm_neon_vld1x4, 0),
  NEONMAP0(vld1q_dup_v),
  NEONMAP1(vld1q_v, arm_neon_vld1, 0),
  NEONMAP1(vld1q_x2_v, arm_neon_vld1x2, 0),
  NEONMAP1(vld1q_x3_v, arm_neon_vld1x3, 0),
  NEONMAP1(vld1q_x4_v, arm_neon_vld1x4, 0),
  NEONMAP1(vld2_dup_v, arm_neon_vld2dup, 0),
  NEONMAP1(vld2_lane_v, arm_neon_vld2lane, 0),
  NEONMAP1(vld2_v, arm_neon_vld2, 0),
  NEONMAP1(vld2q_dup_v, arm_neon_vld2dup, 0),
  NEONMAP1(vld2q_lane_v, arm_neon_vld2lane, 0),
  NEONMAP1(vld2q_v, arm_neon_vld2, 0),
  NEONMAP1(vld3_dup_v, arm_neon_vld3dup, 0),
  NEONMAP1(vld3_lane_v, arm_neon_vld3lane, 0),
  NEONMAP1(vld3_v, arm_neon_vld3, 0),
  NEONMAP1(vld3q_dup_v, arm_neon_vld3dup, 0),
  NEONMAP1(vld3q_lane_v, arm_neon_vld3lane, 0),
  NEONMAP1(vld3q_v, arm_neon_vld3, 0),
  NEONMAP1(vld4_dup_v, arm_neon_vld4dup, 0),
  NEONMAP1(vld4_lane_v, arm_neon_vld4lane, 0),
  NEONMAP1(vld4_v, arm_neon_vld4, 0),
  NEONMAP1(vld4q_dup_v, arm_neon_vld4dup, 0),
  NEONMAP1(vld4q_lane_v, arm_neon_vld4lane, 0),
  NEONMAP1(vld4q_v, arm_neon_vld4, 0),
  NEONMAP2(vmax_v, arm_neon_vmaxu, arm_neon_vmaxs, Add1ArgType | UnsignedAlts),
  NEONMAP1(vmaxnm_v, arm_neon_vmaxnm, Add1ArgType),
  NEONMAP1(vmaxnmq_v, arm_neon_vmaxnm, Add1ArgType),
  NEONMAP2(vmaxq_v, arm_neon_vmaxu, arm_neon_vmaxs, Add1ArgType | UnsignedAlts),
  NEONMAP2(vmin_v, arm_neon_vminu, arm_neon_vmins, Add1ArgType | UnsignedAlts),
  NEONMAP1(vminnm_v, arm_neon_vminnm, Add1ArgType),
  NEONMAP1(vminnmq_v, arm_neon_vminnm, Add1ArgType),
  NEONMAP2(vminq_v, arm_neon_vminu, arm_neon_vmins, Add1ArgType | UnsignedAlts),
  NEONMAP1(vmmlaq_s32, arm_neon_smmla, 0),
  NEONMAP1(vmmlaq_u32, arm_neon_ummla, 0),
  NEONMAP0(vmovl_v),
  NEONMAP0(vmovn_v),
  NEONMAP1(vmul_v, arm_neon_vmulp, Add1ArgType),
  NEONMAP0(vmull_v),
  NEONMAP1(vmulq_v, arm_neon_vmulp, Add1ArgType),
  NEONMAP2(vpadal_v, arm_neon_vpadalu, arm_neon_vpadals, UnsignedAlts),
  NEONMAP2(vpadalq_v, arm_neon_vpadalu, arm_neon_vpadals, UnsignedAlts),
  NEONMAP1(vpadd_v, arm_neon_vpadd, Add1ArgType),
  NEONMAP2(vpaddl_v, arm_neon_vpaddlu, arm_neon_vpaddls, UnsignedAlts),
  NEONMAP2(vpaddlq_v, arm_neon_vpaddlu, arm_neon_vpaddls, UnsignedAlts),
  NEONMAP1(vpaddq_v, arm_neon_vpadd, Add1ArgType),
  NEONMAP2(vpmax_v, arm_neon_vpmaxu, arm_neon_vpmaxs, Add1ArgType | UnsignedAlts),
  NEONMAP2(vpmin_v, arm_neon_vpminu, arm_neon_vpmins, Add1ArgType | UnsignedAlts),
  NEONMAP1(vqabs_v, arm_neon_vqabs, Add1ArgType),
  NEONMAP1(vqabsq_v, arm_neon_vqabs, Add1ArgType),
  NEONMAP2(vqadd_v, uadd_sat, sadd_sat, Add1ArgType | UnsignedAlts),
  NEONMAP2(vqaddq_v, uadd_sat, sadd_sat, Add1ArgType | UnsignedAlts),
  NEONMAP2(vqdmlal_v, arm_neon_vqdmull, sadd_sat, 0),
  NEONMAP2(vqdmlsl_v, arm_neon_vqdmull, ssub_sat, 0),
  NEONMAP1(vqdmulh_v, arm_neon_vqdmulh, Add1ArgType),
  NEONMAP1(vqdmulhq_v, arm_neon_vqdmulh, Add1ArgType),
  NEONMAP1(vqdmull_v, arm_neon_vqdmull, Add1ArgType),
  NEONMAP2(vqmovn_v, arm_neon_vqmovnu, arm_neon_vqmovns, Add1ArgType | UnsignedAlts),
  NEONMAP1(vqmovun_v, arm_neon_vqmovnsu, Add1ArgType),
  NEONMAP1(vqneg_v, arm_neon_vqneg, Add1ArgType),
  NEONMAP1(vqnegq_v, arm_neon_vqneg, Add1ArgType),
  NEONMAP1(vqrdmlah_s16, arm_neon_vqrdmlah, Add1ArgType),
  NEONMAP1(vqrdmlah_s32, arm_neon_vqrdmlah, Add1ArgType),
  NEONMAP1(vqrdmlahq_s16, arm_neon_vqrdmlah, Add1ArgType),
  NEONMAP1(vqrdmlahq_s32, arm_neon_vqrdmlah, Add1ArgType),
  NEONMAP1(vqrdmlsh_s16, arm_neon_vqrdmlsh, Add1ArgType),
  NEONMAP1(vqrdmlsh_s32, arm_neon_vqrdmlsh, Add1ArgType),
  NEONMAP1(vqrdmlshq_s16, arm_neon_vqrdmlsh, Add1ArgType),
  NEONMAP1(vqrdmlshq_s32, arm_neon_vqrdmlsh, Add1ArgType),
  NEONMAP1(vqrdmulh_v, arm_neon_vqrdmulh, Add1ArgType),
  NEONMAP1(vqrdmulhq_v, arm_neon_vqrdmulh, Add1ArgType),
  NEONMAP2(vqrshl_v, arm_neon_vqrshiftu, arm_neon_vqrshifts, Add1ArgType | UnsignedAlts),
  NEONMAP2(vqrshlq_v, arm_neon_vqrshiftu, arm_neon_vqrshifts, Add1ArgType | UnsignedAlts),
  NEONMAP2(vqshl_n_v, arm_neon_vqshiftu, arm_neon_vqshifts, UnsignedAlts),
  NEONMAP2(vqshl_v, arm_neon_vqshiftu, arm_neon_vqshifts, Add1ArgType | UnsignedAlts),
  NEONMAP2(vqshlq_n_v, arm_neon_vqshiftu, arm_neon_vqshifts, UnsignedAlts),
  NEONMAP2(vqshlq_v, arm_neon_vqshiftu, arm_neon_vqshifts, Add1ArgType | UnsignedAlts),
  NEONMAP1(vqshlu_n_v, arm_neon_vqshiftsu, 0),
  NEONMAP1(vqshluq_n_v, arm_neon_vqshiftsu, 0),
  NEONMAP2(vqsub_v, usub_sat, ssub_sat, Add1ArgType | UnsignedAlts),
  NEONMAP2(vqsubq_v, usub_sat, ssub_sat, Add1ArgType | UnsignedAlts),
  NEONMAP1(vraddhn_v, arm_neon_vraddhn, Add1ArgType),
  NEONMAP2(vrecpe_v, arm_neon_vrecpe, arm_neon_vrecpe, 0),
  NEONMAP2(vrecpeq_v, arm_neon_vrecpe, arm_neon_vrecpe, 0),
  NEONMAP1(vrecps_v, arm_neon_vrecps, Add1ArgType),
  NEONMAP1(vrecpsq_v, arm_neon_vrecps, Add1ArgType),
  NEONMAP2(vrhadd_v, arm_neon_vrhaddu, arm_neon_vrhadds, Add1ArgType | UnsignedAlts),
  NEONMAP2(vrhaddq_v, arm_neon_vrhaddu, arm_neon_vrhadds, Add1ArgType | UnsignedAlts),
  NEONMAP1(vrnd_v, trunc, Add1ArgType),
  NEONMAP1(vrnda_v, round, Add1ArgType),
  NEONMAP1(vrndaq_v, round, Add1ArgType),
  NEONMAP0(vrndi_v),
  NEONMAP0(vrndiq_v),
  NEONMAP1(vrndm_v, floor, Add1ArgType),
  NEONMAP1(vrndmq_v, floor, Add1ArgType),
  NEONMAP1(vrndn_v, roundeven, Add1ArgType),
  NEONMAP1(vrndnq_v, roundeven, Add1ArgType),
  NEONMAP1(vrndp_v, ceil, Add1ArgType),
  NEONMAP1(vrndpq_v, ceil, Add1ArgType),
  NEONMAP1(vrndq_v, trunc, Add1ArgType),
  NEONMAP1(vrndx_v, rint, Add1ArgType),
  NEONMAP1(vrndxq_v, rint, Add1ArgType),
  NEONMAP2(vrshl_v, arm_neon_vrshiftu, arm_neon_vrshifts, Add1ArgType | UnsignedAlts),
  NEONMAP2(vrshlq_v, arm_neon_vrshiftu, arm_neon_vrshifts, Add1ArgType | UnsignedAlts),
  NEONMAP2(vrshr_n_v, arm_neon_vrshiftu, arm_neon_vrshifts, UnsignedAlts),
  NEONMAP2(vrshrq_n_v, arm_neon_vrshiftu, arm_neon_vrshifts, UnsignedAlts),
  NEONMAP2(vrsqrte_v, arm_neon_vrsqrte, arm_neon_vrsqrte, 0),
  NEONMAP2(vrsqrteq_v, arm_neon_vrsqrte, arm_neon_vrsqrte, 0),
  NEONMAP1(vrsqrts_v, arm_neon_vrsqrts, Add1ArgType),
  NEONMAP1(vrsqrtsq_v, arm_neon_vrsqrts, Add1ArgType),
  NEONMAP1(vrsubhn_v, arm_neon_vrsubhn, Add1ArgType),
  NEONMAP1(vsha1su0q_u32, arm_neon_sha1su0, 0),
  NEONMAP1(vsha1su1q_u32, arm_neon_sha1su1, 0),
  NEONMAP1(vsha256h2q_u32, arm_neon_sha256h2, 0),
  NEONMAP1(vsha256hq_u32, arm_neon_sha256h, 0),
  NEONMAP1(vsha256su0q_u32, arm_neon_sha256su0, 0),
  NEONMAP1(vsha256su1q_u32, arm_neon_sha256su1, 0),
  NEONMAP0(vshl_n_v),
  NEONMAP2(vshl_v, arm_neon_vshiftu, arm_neon_vshifts, Add1ArgType | UnsignedAlts),
  NEONMAP0(vshll_n_v),
  NEONMAP0(vshlq_n_v),
  NEONMAP2(vshlq_v, arm_neon_vshiftu, arm_neon_vshifts, Add1ArgType | UnsignedAlts),
  NEONMAP0(vshr_n_v),
  NEONMAP0(vshrn_n_v),
  NEONMAP0(vshrq_n_v),
  NEONMAP1(vst1_v, arm_neon_vst1, 0),
  NEONMAP1(vst1_x2_v, arm_neon_vst1x2, 0),
  NEONMAP1(vst1_x3_v, arm_neon_vst1x3, 0),
  NEONMAP1(vst1_x4_v, arm_neon_vst1x4, 0),
  NEONMAP1(vst1q_v, arm_neon_vst1, 0),
  NEONMAP1(vst1q_x2_v, arm_neon_vst1x2, 0),
  NEONMAP1(vst1q_x3_v, arm_neon_vst1x3, 0),
  NEONMAP1(vst1q_x4_v, arm_neon_vst1x4, 0),
  NEONMAP1(vst2_lane_v, arm_neon_vst2lane, 0),
  NEONMAP1(vst2_v, arm_neon_vst2, 0),
  NEONMAP1(vst2q_lane_v, arm_neon_vst2lane, 0),
  NEONMAP1(vst2q_v, arm_neon_vst2, 0),
  NEONMAP1(vst3_lane_v, arm_neon_vst3lane, 0),
  NEONMAP1(vst3_v, arm_neon_vst3, 0),
  NEONMAP1(vst3q_lane_v, arm_neon_vst3lane, 0),
  NEONMAP1(vst3q_v, arm_neon_vst3, 0),
  NEONMAP1(vst4_lane_v, arm_neon_vst4lane, 0),
  NEONMAP1(vst4_v, arm_neon_vst4, 0),
  NEONMAP1(vst4q_lane_v, arm_neon_vst4lane, 0),
  NEONMAP1(vst4q_v, arm_neon_vst4, 0),
  NEONMAP0(vsubhn_v),
  NEONMAP0(vtrn_v),
  NEONMAP0(vtrnq_v),
  NEONMAP0(vtst_v),
  NEONMAP0(vtstq_v),
  NEONMAP1(vusdot_s32, arm_neon_usdot, 0),
  NEONMAP1(vusdotq_s32, arm_neon_usdot, 0),
  NEONMAP1(vusmmlaq_s32, arm_neon_usmmla, 0),
  NEONMAP0(vuzp_v),
  NEONMAP0(vuzpq_v),
  NEONMAP0(vzip_v),
  NEONMAP0(vzipq_v)
};

static const ARMVectorIntrinsicInfo AArch64SIMDIntrinsicMap[] = {
  NEONMAP0(splat_lane_v),
  NEONMAP0(splat_laneq_v),
  NEONMAP0(splatq_lane_v),
  NEONMAP0(splatq_laneq_v),
  NEONMAP1(vabs_v, aarch64_neon_abs, 0),
  NEONMAP1(vabsq_v, aarch64_neon_abs, 0),
  NEONMAP0(vadd_v),
  NEONMAP0(vaddhn_v),
  NEONMAP0(vaddq_p128),
  NEONMAP0(vaddq_v),
  NEONMAP1(vaesdq_u8, aarch64_crypto_aesd, 0),
  NEONMAP1(vaeseq_u8, aarch64_crypto_aese, 0),
  NEONMAP1(vaesimcq_u8, aarch64_crypto_aesimc, 0),
  NEONMAP1(vaesmcq_u8, aarch64_crypto_aesmc, 0),
  NEONMAP2(vbcaxq_s16, aarch64_crypto_bcaxu, aarch64_crypto_bcaxs, Add1ArgType | UnsignedAlts),
  NEONMAP2(vbcaxq_s32, aarch64_crypto_bcaxu, aarch64_crypto_bcaxs, Add1ArgType | UnsignedAlts),
  NEONMAP2(vbcaxq_s64, aarch64_crypto_bcaxu, aarch64_crypto_bcaxs, Add1ArgType | UnsignedAlts),
  NEONMAP2(vbcaxq_s8, aarch64_crypto_bcaxu, aarch64_crypto_bcaxs, Add1ArgType | UnsignedAlts),
  NEONMAP2(vbcaxq_u16, aarch64_crypto_bcaxu, aarch64_crypto_bcaxs, Add1ArgType | UnsignedAlts),
  NEONMAP2(vbcaxq_u32, aarch64_crypto_bcaxu, aarch64_crypto_bcaxs, Add1ArgType | UnsignedAlts),
  NEONMAP2(vbcaxq_u64, aarch64_crypto_bcaxu, aarch64_crypto_bcaxs, Add1ArgType | UnsignedAlts),
  NEONMAP2(vbcaxq_u8, aarch64_crypto_bcaxu, aarch64_crypto_bcaxs, Add1ArgType | UnsignedAlts),
  NEONMAP1(vbfdot_f32, aarch64_neon_bfdot, 0),
  NEONMAP1(vbfdotq_f32, aarch64_neon_bfdot, 0),
  NEONMAP1(vbfmlalbq_f32, aarch64_neon_bfmlalb, 0),
  NEONMAP1(vbfmlaltq_f32, aarch64_neon_bfmlalt, 0),
  NEONMAP1(vbfmmlaq_f32, aarch64_neon_bfmmla, 0),
  NEONMAP1(vcadd_rot270_f16, aarch64_neon_vcadd_rot270, Add1ArgType),
  NEONMAP1(vcadd_rot270_f32, aarch64_neon_vcadd_rot270, Add1ArgType),
  NEONMAP1(vcadd_rot90_f16, aarch64_neon_vcadd_rot90, Add1ArgType),
  NEONMAP1(vcadd_rot90_f32, aarch64_neon_vcadd_rot90, Add1ArgType),
  NEONMAP1(vcaddq_rot270_f16, aarch64_neon_vcadd_rot270, Add1ArgType),
  NEONMAP1(vcaddq_rot270_f32, aarch64_neon_vcadd_rot270, Add1ArgType),
  NEONMAP1(vcaddq_rot270_f64, aarch64_neon_vcadd_rot270, Add1ArgType),
  NEONMAP1(vcaddq_rot90_f16, aarch64_neon_vcadd_rot90, Add1ArgType),
  NEONMAP1(vcaddq_rot90_f32, aarch64_neon_vcadd_rot90, Add1ArgType),
  NEONMAP1(vcaddq_rot90_f64, aarch64_neon_vcadd_rot90, Add1ArgType),
  NEONMAP1(vcage_v, aarch64_neon_facge, 0),
  NEONMAP1(vcageq_v, aarch64_neon_facge, 0),
  NEONMAP1(vcagt_v, aarch64_neon_facgt, 0),
  NEONMAP1(vcagtq_v, aarch64_neon_facgt, 0),
  NEONMAP1(vcale_v, aarch64_neon_facge, 0),
  NEONMAP1(vcaleq_v, aarch64_neon_facge, 0),
  NEONMAP1(vcalt_v, aarch64_neon_facgt, 0),
  NEONMAP1(vcaltq_v, aarch64_neon_facgt, 0),
  NEONMAP0(vceqz_v),
  NEONMAP0(vceqzq_v),
  NEONMAP0(vcgez_v),
  NEONMAP0(vcgezq_v),
  NEONMAP0(vcgtz_v),
  NEONMAP0(vcgtzq_v),
  NEONMAP0(vclez_v),
  NEONMAP0(vclezq_v),
  NEONMAP1(vcls_v, aarch64_neon_cls, Add1ArgType),
  NEONMAP1(vclsq_v, aarch64_neon_cls, Add1ArgType),
  NEONMAP0(vcltz_v),
  NEONMAP0(vcltzq_v),
  NEONMAP1(vclz_v, ctlz, Add1ArgType),
  NEONMAP1(vclzq_v, ctlz, Add1ArgType),
  NEONMAP1(vcmla_f16, aarch64_neon_vcmla_rot0, Add1ArgType),
  NEONMAP1(vcmla_f32, aarch64_neon_vcmla_rot0, Add1ArgType),
  NEONMAP1(vcmla_rot180_f16, aarch64_neon_vcmla_rot180, Add1ArgType),
  NEONMAP1(vcmla_rot180_f32, aarch64_neon_vcmla_rot180, Add1ArgType),
  NEONMAP1(vcmla_rot270_f16, aarch64_neon_vcmla_rot270, Add1ArgType),
  NEONMAP1(vcmla_rot270_f32, aarch64_neon_vcmla_rot270, Add1ArgType),
  NEONMAP1(vcmla_rot90_f16, aarch64_neon_vcmla_rot90, Add1ArgType),
  NEONMAP1(vcmla_rot90_f32, aarch64_neon_vcmla_rot90, Add1ArgType),
  NEONMAP1(vcmlaq_f16, aarch64_neon_vcmla_rot0, Add1ArgType),
  NEONMAP1(vcmlaq_f32, aarch64_neon_vcmla_rot0, Add1ArgType),
  NEONMAP1(vcmlaq_f64, aarch64_neon_vcmla_rot0, Add1ArgType),
  NEONMAP1(vcmlaq_rot180_f16, aarch64_neon_vcmla_rot180, Add1ArgType),
  NEONMAP1(vcmlaq_rot180_f32, aarch64_neon_vcmla_rot180, Add1ArgType),
  NEONMAP1(vcmlaq_rot180_f64, aarch64_neon_vcmla_rot180, Add1ArgType),
  NEONMAP1(vcmlaq_rot270_f16, aarch64_neon_vcmla_rot270, Add1ArgType),
  NEONMAP1(vcmlaq_rot270_f32, aarch64_neon_vcmla_rot270, Add1ArgType),
  NEONMAP1(vcmlaq_rot270_f64, aarch64_neon_vcmla_rot270, Add1ArgType),
  NEONMAP1(vcmlaq_rot90_f16, aarch64_neon_vcmla_rot90, Add1ArgType),
  NEONMAP1(vcmlaq_rot90_f32, aarch64_neon_vcmla_rot90, Add1ArgType),
  NEONMAP1(vcmlaq_rot90_f64, aarch64_neon_vcmla_rot90, Add1ArgType),
  NEONMAP1(vcnt_v, ctpop, Add1ArgType),
  NEONMAP1(vcntq_v, ctpop, Add1ArgType),
  NEONMAP1(vcvt_f16_f32, aarch64_neon_vcvtfp2hf, 0),
  NEONMAP0(vcvt_f16_s16),
  NEONMAP0(vcvt_f16_u16),
  NEONMAP1(vcvt_f32_f16, aarch64_neon_vcvthf2fp, 0),
  NEONMAP0(vcvt_f32_v),
  NEONMAP1(vcvt_n_f16_s16, aarch64_neon_vcvtfxs2fp, 0),
  NEONMAP1(vcvt_n_f16_u16, aarch64_neon_vcvtfxu2fp, 0),
  NEONMAP2(vcvt_n_f32_v, aarch64_neon_vcvtfxu2fp, aarch64_neon_vcvtfxs2fp, 0),
  NEONMAP2(vcvt_n_f64_v, aarch64_neon_vcvtfxu2fp, aarch64_neon_vcvtfxs2fp, 0),
  NEONMAP1(vcvt_n_s16_f16, aarch64_neon_vcvtfp2fxs, 0),
  NEONMAP1(vcvt_n_s32_v, aarch64_neon_vcvtfp2fxs, 0),
  NEONMAP1(vcvt_n_s64_v, aarch64_neon_vcvtfp2fxs, 0),
  NEONMAP1(vcvt_n_u16_f16, aarch64_neon_vcvtfp2fxu, 0),
  NEONMAP1(vcvt_n_u32_v, aarch64_neon_vcvtfp2fxu, 0),
  NEONMAP1(vcvt_n_u64_v, aarch64_neon_vcvtfp2fxu, 0),
  NEONMAP0(vcvtq_f16_s16),
  NEONMAP0(vcvtq_f16_u16),
  NEONMAP0(vcvtq_f32_v),
  NEONMAP0(vcvtq_high_bf16_f32),
  NEONMAP0(vcvtq_low_bf16_f32),
  NEONMAP1(vcvtq_n_f16_s16, aarch64_neon_vcvtfxs2fp, 0),
  NEONMAP1(vcvtq_n_f16_u16, aarch64_neon_vcvtfxu2fp, 0),
  NEONMAP2(vcvtq_n_f32_v, aarch64_neon_vcvtfxu2fp, aarch64_neon_vcvtfxs2fp, 0),
  NEONMAP2(vcvtq_n_f64_v, aarch64_neon_vcvtfxu2fp, aarch64_neon_vcvtfxs2fp, 0),
  NEONMAP1(vcvtq_n_s16_f16, aarch64_neon_vcvtfp2fxs, 0),
  NEONMAP1(vcvtq_n_s32_v, aarch64_neon_vcvtfp2fxs, 0),
  NEONMAP1(vcvtq_n_s64_v, aarch64_neon_vcvtfp2fxs, 0),
  NEONMAP1(vcvtq_n_u16_f16, aarch64_neon_vcvtfp2fxu, 0),
  NEONMAP1(vcvtq_n_u32_v, aarch64_neon_vcvtfp2fxu, 0),
  NEONMAP1(vcvtq_n_u64_v, aarch64_neon_vcvtfp2fxu, 0),
  NEONMAP1(vcvtx_f32_v, aarch64_neon_fcvtxn, AddRetType | Add1ArgType),
  NEONMAP1(vdot_s32, aarch64_neon_sdot, 0),
  NEONMAP1(vdot_u32, aarch64_neon_udot, 0),
  NEONMAP1(vdotq_s32, aarch64_neon_sdot, 0),
  NEONMAP1(vdotq_u32, aarch64_neon_udot, 0),
  NEONMAP2(veor3q_s16, aarch64_crypto_eor3u, aarch64_crypto_eor3s, Add1ArgType | UnsignedAlts),
  NEONMAP2(veor3q_s32, aarch64_crypto_eor3u, aarch64_crypto_eor3s, Add1ArgType | UnsignedAlts),
  NEONMAP2(veor3q_s64, aarch64_crypto_eor3u, aarch64_crypto_eor3s, Add1ArgType | UnsignedAlts),
  NEONMAP2(veor3q_s8, aarch64_crypto_eor3u, aarch64_crypto_eor3s, Add1ArgType | UnsignedAlts),
  NEONMAP2(veor3q_u16, aarch64_crypto_eor3u, aarch64_crypto_eor3s, Add1ArgType | UnsignedAlts),
  NEONMAP2(veor3q_u32, aarch64_crypto_eor3u, aarch64_crypto_eor3s, Add1ArgType | UnsignedAlts),
  NEONMAP2(veor3q_u64, aarch64_crypto_eor3u, aarch64_crypto_eor3s, Add1ArgType | UnsignedAlts),
  NEONMAP2(veor3q_u8, aarch64_crypto_eor3u, aarch64_crypto_eor3s, Add1ArgType | UnsignedAlts),
  NEONMAP0(vext_v),
  NEONMAP0(vextq_v),
  NEONMAP0(vfma_v),
  NEONMAP0(vfmaq_v),
  NEONMAP1(vfmlal_high_f16, aarch64_neon_fmlal2, 0),
  NEONMAP1(vfmlal_low_f16, aarch64_neon_fmlal, 0),
  NEONMAP1(vfmlalq_high_f16, aarch64_neon_fmlal2, 0),
  NEONMAP1(vfmlalq_low_f16, aarch64_neon_fmlal, 0),
  NEONMAP1(vfmlsl_high_f16, aarch64_neon_fmlsl2, 0),
  NEONMAP1(vfmlsl_low_f16, aarch64_neon_fmlsl, 0),
  NEONMAP1(vfmlslq_high_f16, aarch64_neon_fmlsl2, 0),
  NEONMAP1(vfmlslq_low_f16, aarch64_neon_fmlsl, 0),
  NEONMAP2(vhadd_v, aarch64_neon_uhadd, aarch64_neon_shadd, Add1ArgType | UnsignedAlts),
  NEONMAP2(vhaddq_v, aarch64_neon_uhadd, aarch64_neon_shadd, Add1ArgType | UnsignedAlts),
  NEONMAP2(vhsub_v, aarch64_neon_uhsub, aarch64_neon_shsub, Add1ArgType | UnsignedAlts),
  NEONMAP2(vhsubq_v, aarch64_neon_uhsub, aarch64_neon_shsub, Add1ArgType | UnsignedAlts),
  NEONMAP1(vld1_x2_v, aarch64_neon_ld1x2, 0),
  NEONMAP1(vld1_x3_v, aarch64_neon_ld1x3, 0),
  NEONMAP1(vld1_x4_v, aarch64_neon_ld1x4, 0),
  NEONMAP1(vld1q_x2_v, aarch64_neon_ld1x2, 0),
  NEONMAP1(vld1q_x3_v, aarch64_neon_ld1x3, 0),
  NEONMAP1(vld1q_x4_v, aarch64_neon_ld1x4, 0),
  NEONMAP1(vmmlaq_s32, aarch64_neon_smmla, 0),
  NEONMAP1(vmmlaq_u32, aarch64_neon_ummla, 0),
  NEONMAP0(vmovl_v),
  NEONMAP0(vmovn_v),
  NEONMAP1(vmul_v, aarch64_neon_pmul, Add1ArgType),
  NEONMAP1(vmulq_v, aarch64_neon_pmul, Add1ArgType),
  NEONMAP1(vpadd_v, aarch64_neon_addp, Add1ArgType),
  NEONMAP2(vpaddl_v, aarch64_neon_uaddlp, aarch64_neon_saddlp, UnsignedAlts),
  NEONMAP2(vpaddlq_v, aarch64_neon_uaddlp, aarch64_neon_saddlp, UnsignedAlts),
  NEONMAP1(vpaddq_v, aarch64_neon_addp, Add1ArgType),
  NEONMAP1(vqabs_v, aarch64_neon_sqabs, Add1ArgType),
  NEONMAP1(vqabsq_v, aarch64_neon_sqabs, Add1ArgType),
  NEONMAP2(vqadd_v, aarch64_neon_uqadd, aarch64_neon_sqadd, Add1ArgType | UnsignedAlts),
  NEONMAP2(vqaddq_v, aarch64_neon_uqadd, aarch64_neon_sqadd, Add1ArgType | UnsignedAlts),
  NEONMAP2(vqdmlal_v, aarch64_neon_sqdmull, aarch64_neon_sqadd, 0),
  NEONMAP2(vqdmlsl_v, aarch64_neon_sqdmull, aarch64_neon_sqsub, 0),
  NEONMAP1(vqdmulh_lane_v, aarch64_neon_sqdmulh_lane, 0),
  NEONMAP1(vqdmulh_laneq_v, aarch64_neon_sqdmulh_laneq, 0),
  NEONMAP1(vqdmulh_v, aarch64_neon_sqdmulh, Add1ArgType),
  NEONMAP1(vqdmulhq_lane_v, aarch64_neon_sqdmulh_lane, 0),
  NEONMAP1(vqdmulhq_laneq_v, aarch64_neon_sqdmulh_laneq, 0),
  NEONMAP1(vqdmulhq_v, aarch64_neon_sqdmulh, Add1ArgType),
  NEONMAP1(vqdmull_v, aarch64_neon_sqdmull, Add1ArgType),
  NEONMAP2(vqmovn_v, aarch64_neon_uqxtn, aarch64_neon_sqxtn, Add1ArgType | UnsignedAlts),
  NEONMAP1(vqmovun_v, aarch64_neon_sqxtun, Add1ArgType),
  NEONMAP1(vqneg_v, aarch64_neon_sqneg, Add1ArgType),
  NEONMAP1(vqnegq_v, aarch64_neon_sqneg, Add1ArgType),
  NEONMAP1(vqrdmlah_s16, aarch64_neon_sqrdmlah, Add1ArgType),
  NEONMAP1(vqrdmlah_s32, aarch64_neon_sqrdmlah, Add1ArgType),
  NEONMAP1(vqrdmlahq_s16, aarch64_neon_sqrdmlah, Add1ArgType),
  NEONMAP1(vqrdmlahq_s32, aarch64_neon_sqrdmlah, Add1ArgType),
  NEONMAP1(vqrdmlsh_s16, aarch64_neon_sqrdmlsh, Add1ArgType),
  NEONMAP1(vqrdmlsh_s32, aarch64_neon_sqrdmlsh, Add1ArgType),
  NEONMAP1(vqrdmlshq_s16, aarch64_neon_sqrdmlsh, Add1ArgType),
  NEONMAP1(vqrdmlshq_s32, aarch64_neon_sqrdmlsh, Add1ArgType),
  NEONMAP1(vqrdmulh_lane_v, aarch64_neon_sqrdmulh_lane, 0),
  NEONMAP1(vqrdmulh_laneq_v, aarch64_neon_sqrdmulh_laneq, 0),
  NEONMAP1(vqrdmulh_v, aarch64_neon_sqrdmulh, Add1ArgType),
  NEONMAP1(vqrdmulhq_lane_v, aarch64_neon_sqrdmulh_lane, 0),
  NEONMAP1(vqrdmulhq_laneq_v, aarch64_neon_sqrdmulh_laneq, 0),
  NEONMAP1(vqrdmulhq_v, aarch64_neon_sqrdmulh, Add1ArgType),
  NEONMAP2(vqrshl_v, aarch64_neon_uqrshl, aarch64_neon_sqrshl, Add1ArgType | UnsignedAlts),
  NEONMAP2(vqrshlq_v, aarch64_neon_uqrshl, aarch64_neon_sqrshl, Add1ArgType | UnsignedAlts),
  NEONMAP2(vqshl_n_v, aarch64_neon_uqshl, aarch64_neon_sqshl, UnsignedAlts),
  NEONMAP2(vqshl_v, aarch64_neon_uqshl, aarch64_neon_sqshl, Add1ArgType | UnsignedAlts),
  NEONMAP2(vqshlq_n_v, aarch64_neon_uqshl, aarch64_neon_sqshl, UnsignedAlts),
  NEONMAP2(vqshlq_v, aarch64_neon_uqshl, aarch64_neon_sqshl, Add1ArgType | UnsignedAlts),
  NEONMAP1(vqshlu_n_v, aarch64_neon_sqshlu, 0),
  NEONMAP1(vqshluq_n_v, aarch64_neon_sqshlu, 0),
  NEONMAP2(vqsub_v, aarch64_neon_uqsub, aarch64_neon_sqsub, Add1ArgType | UnsignedAlts),
  NEONMAP2(vqsubq_v, aarch64_neon_uqsub, aarch64_neon_sqsub, Add1ArgType | UnsignedAlts),
  NEONMAP1(vraddhn_v, aarch64_neon_raddhn, Add1ArgType),
  NEONMAP1(vrax1q_u64, aarch64_crypto_rax1, 0),
  NEONMAP2(vrecpe_v, aarch64_neon_frecpe, aarch64_neon_urecpe, 0),
  NEONMAP2(vrecpeq_v, aarch64_neon_frecpe, aarch64_neon_urecpe, 0),
  NEONMAP1(vrecps_v, aarch64_neon_frecps, Add1ArgType),
  NEONMAP1(vrecpsq_v, aarch64_neon_frecps, Add1ArgType),
  NEONMAP2(vrhadd_v, aarch64_neon_urhadd, aarch64_neon_srhadd, Add1ArgType | UnsignedAlts),
  NEONMAP2(vrhaddq_v, aarch64_neon_urhadd, aarch64_neon_srhadd, Add1ArgType | UnsignedAlts),
  NEONMAP1(vrnd32x_f32, aarch64_neon_frint32x, Add1ArgType),
  NEONMAP1(vrnd32x_f64, aarch64_neon_frint32x, Add1ArgType),
  NEONMAP1(vrnd32xq_f32, aarch64_neon_frint32x, Add1ArgType),
  NEONMAP1(vrnd32xq_f64, aarch64_neon_frint32x, Add1ArgType),
  NEONMAP1(vrnd32z_f32, aarch64_neon_frint32z, Add1ArgType),
  NEONMAP1(vrnd32z_f64, aarch64_neon_frint32z, Add1ArgType),
  NEONMAP1(vrnd32zq_f32, aarch64_neon_frint32z, Add1ArgType),
  NEONMAP1(vrnd32zq_f64, aarch64_neon_frint32z, Add1ArgType),
  NEONMAP1(vrnd64x_f32, aarch64_neon_frint64x, Add1ArgType),
  NEONMAP1(vrnd64x_f64, aarch64_neon_frint64x, Add1ArgType),
  NEONMAP1(vrnd64xq_f32, aarch64_neon_frint64x, Add1ArgType),
  NEONMAP1(vrnd64xq_f64, aarch64_neon_frint64x, Add1ArgType),
  NEONMAP1(vrnd64z_f32, aarch64_neon_frint64z, Add1ArgType),
  NEONMAP1(vrnd64z_f64, aarch64_neon_frint64z, Add1ArgType),
  NEONMAP1(vrnd64zq_f32, aarch64_neon_frint64z, Add1ArgType),
  NEONMAP1(vrnd64zq_f64, aarch64_neon_frint64z, Add1ArgType),
  NEONMAP0(vrndi_v),
  NEONMAP0(vrndiq_v),
  NEONMAP2(vrshl_v, aarch64_neon_urshl, aarch64_neon_srshl, Add1ArgType | UnsignedAlts),
  NEONMAP2(vrshlq_v, aarch64_neon_urshl, aarch64_neon_srshl, Add1ArgType | UnsignedAlts),
  NEONMAP2(vrshr_n_v, aarch64_neon_urshl, aarch64_neon_srshl, UnsignedAlts),
  NEONMAP2(vrshrq_n_v, aarch64_neon_urshl, aarch64_neon_srshl, UnsignedAlts),
  NEONMAP2(vrsqrte_v, aarch64_neon_frsqrte, aarch64_neon_ursqrte, 0),
  NEONMAP2(vrsqrteq_v, aarch64_neon_frsqrte, aarch64_neon_ursqrte, 0),
  NEONMAP1(vrsqrts_v, aarch64_neon_frsqrts, Add1ArgType),
  NEONMAP1(vrsqrtsq_v, aarch64_neon_frsqrts, Add1ArgType),
  NEONMAP1(vrsubhn_v, aarch64_neon_rsubhn, Add1ArgType),
  NEONMAP1(vsha1su0q_u32, aarch64_crypto_sha1su0, 0),
  NEONMAP1(vsha1su1q_u32, aarch64_crypto_sha1su1, 0),
  NEONMAP1(vsha256h2q_u32, aarch64_crypto_sha256h2, 0),
  NEONMAP1(vsha256hq_u32, aarch64_crypto_sha256h, 0),
  NEONMAP1(vsha256su0q_u32, aarch64_crypto_sha256su0, 0),
  NEONMAP1(vsha256su1q_u32, aarch64_crypto_sha256su1, 0),
  NEONMAP1(vsha512h2q_u64, aarch64_crypto_sha512h2, 0),
  NEONMAP1(vsha512hq_u64, aarch64_crypto_sha512h, 0),
  NEONMAP1(vsha512su0q_u64, aarch64_crypto_sha512su0, 0),
  NEONMAP1(vsha512su1q_u64, aarch64_crypto_sha512su1, 0),
  NEONMAP0(vshl_n_v),
  NEONMAP2(vshl_v, aarch64_neon_ushl, aarch64_neon_sshl, Add1ArgType | UnsignedAlts),
  NEONMAP0(vshll_n_v),
  NEONMAP0(vshlq_n_v),
  NEONMAP2(vshlq_v, aarch64_neon_ushl, aarch64_neon_sshl, Add1ArgType | UnsignedAlts),
  NEONMAP0(vshr_n_v),
  NEONMAP0(vshrn_n_v),
  NEONMAP0(vshrq_n_v),
  NEONMAP1(vsm3partw1q_u32, aarch64_crypto_sm3partw1, 0),
  NEONMAP1(vsm3partw2q_u32, aarch64_crypto_sm3partw2, 0),
  NEONMAP1(vsm3ss1q_u32, aarch64_crypto_sm3ss1, 0),
  NEONMAP1(vsm3tt1aq_u32, aarch64_crypto_sm3tt1a, 0),
  NEONMAP1(vsm3tt1bq_u32, aarch64_crypto_sm3tt1b, 0),
  NEONMAP1(vsm3tt2aq_u32, aarch64_crypto_sm3tt2a, 0),
  NEONMAP1(vsm3tt2bq_u32, aarch64_crypto_sm3tt2b, 0),
  NEONMAP1(vsm4ekeyq_u32, aarch64_crypto_sm4ekey, 0),
  NEONMAP1(vsm4eq_u32, aarch64_crypto_sm4e, 0),
  NEONMAP1(vst1_x2_v, aarch64_neon_st1x2, 0),
  NEONMAP1(vst1_x3_v, aarch64_neon_st1x3, 0),
  NEONMAP1(vst1_x4_v, aarch64_neon_st1x4, 0),
  NEONMAP1(vst1q_x2_v, aarch64_neon_st1x2, 0),
  NEONMAP1(vst1q_x3_v, aarch64_neon_st1x3, 0),
  NEONMAP1(vst1q_x4_v, aarch64_neon_st1x4, 0),
  NEONMAP0(vsubhn_v),
  NEONMAP0(vtst_v),
  NEONMAP0(vtstq_v),
  NEONMAP1(vusdot_s32, aarch64_neon_usdot, 0),
  NEONMAP1(vusdotq_s32, aarch64_neon_usdot, 0),
  NEONMAP1(vusmmlaq_s32, aarch64_neon_usmmla, 0),
  NEONMAP1(vxarq_u64, aarch64_crypto_xar, 0),
};
1187
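// Scalar (SISD) variants of the AArch64 NEON intrinsics. Most of these map a
// scalar builtin onto the same LLVM intrinsic as the vector form; the type
// modifier flags describe how the scalar types are widened or vectorized.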
1188static const ARMVectorIntrinsicInfo AArch64SISDIntrinsicMap[] = {
1189 NEONMAP1(vabdd_f64, aarch64_sisd_fabd, Add1ArgType),
1190 NEONMAP1(vabds_f32, aarch64_sisd_fabd, Add1ArgType),
1191 NEONMAP1(vabsd_s64, aarch64_neon_abs, Add1ArgType),
1192 NEONMAP1(vaddlv_s32, aarch64_neon_saddlv, AddRetType | Add1ArgType),
1193 NEONMAP1(vaddlv_u32, aarch64_neon_uaddlv, AddRetType | Add1ArgType),
1194 NEONMAP1(vaddlvq_s32, aarch64_neon_saddlv, AddRetType | Add1ArgType),
1195 NEONMAP1(vaddlvq_u32, aarch64_neon_uaddlv, AddRetType | Add1ArgType),
1196 NEONMAP1(vaddv_f32, aarch64_neon_faddv, AddRetType | Add1ArgType),
1197 NEONMAP1(vaddv_s16, vector_reduce_add, Add1ArgType),
1198 NEONMAP1(vaddv_s32, vector_reduce_add, Add1ArgType),
1199 NEONMAP1(vaddv_s8, vector_reduce_add, Add1ArgType),
1200 NEONMAP1(vaddv_u16, vector_reduce_add, Add1ArgType),
1201 NEONMAP1(vaddv_u32, vector_reduce_add, Add1ArgType),
1202 NEONMAP1(vaddv_u8, vector_reduce_add, Add1ArgType),
1203 NEONMAP1(vaddvq_f32, aarch64_neon_faddv, AddRetType | Add1ArgType),
1204 NEONMAP1(vaddvq_f64, aarch64_neon_faddv, AddRetType | Add1ArgType),
1205 NEONMAP1(vaddvq_s16, vector_reduce_add, Add1ArgType),
1206 NEONMAP1(vaddvq_s32, vector_reduce_add, Add1ArgType),
1207 NEONMAP1(vaddvq_s64, vector_reduce_add, Add1ArgType),
1208 NEONMAP1(vaddvq_s8, vector_reduce_add, Add1ArgType),
1209 NEONMAP1(vaddvq_u16, vector_reduce_add, Add1ArgType),
1210 NEONMAP1(vaddvq_u32, vector_reduce_add, Add1ArgType),
1211 NEONMAP1(vaddvq_u64, vector_reduce_add, Add1ArgType),
1212 NEONMAP1(vaddvq_u8, vector_reduce_add, Add1ArgType),
1213 NEONMAP1(vcaged_f64, aarch64_neon_facge, AddRetType | Add1ArgType),
1214 NEONMAP1(vcages_f32, aarch64_neon_facge, AddRetType | Add1ArgType),
1215 NEONMAP1(vcagtd_f64, aarch64_neon_facgt, AddRetType | Add1ArgType),
1216 NEONMAP1(vcagts_f32, aarch64_neon_facgt, AddRetType | Add1ArgType),
1217 NEONMAP1(vcaled_f64, aarch64_neon_facge, AddRetType | Add1ArgType),
1218 NEONMAP1(vcales_f32, aarch64_neon_facge, AddRetType | Add1ArgType),
1219 NEONMAP1(vcaltd_f64, aarch64_neon_facgt, AddRetType | Add1ArgType),
1220 NEONMAP1(vcalts_f32, aarch64_neon_facgt, AddRetType | Add1ArgType),
1221 NEONMAP1(vcvtad_s32_f64, aarch64_neon_fcvtas, AddRetType | Add1ArgType),
1222 NEONMAP1(vcvtad_s64_f64, aarch64_neon_fcvtas, AddRetType | Add1ArgType),
1223 NEONMAP1(vcvtad_u32_f64, aarch64_neon_fcvtau, AddRetType | Add1ArgType),
1224 NEONMAP1(vcvtad_u64_f64, aarch64_neon_fcvtau, AddRetType | Add1ArgType),
1225 NEONMAP1(vcvtas_s32_f32, aarch64_neon_fcvtas, AddRetType | Add1ArgType),
1226 NEONMAP1(vcvtas_s64_f32, aarch64_neon_fcvtas, AddRetType | Add1ArgType),
1227 NEONMAP1(vcvtas_u32_f32, aarch64_neon_fcvtau, AddRetType | Add1ArgType),
1228 NEONMAP1(vcvtas_u64_f32, aarch64_neon_fcvtau, AddRetType | Add1ArgType),
1229 NEONMAP1(vcvtd_n_f64_s64, aarch64_neon_vcvtfxs2fp, AddRetType | Add1ArgType),
1230 NEONMAP1(vcvtd_n_f64_u64, aarch64_neon_vcvtfxu2fp, AddRetType | Add1ArgType),
1231 NEONMAP1(vcvtd_n_s64_f64, aarch64_neon_vcvtfp2fxs, AddRetType | Add1ArgType),
1232 NEONMAP1(vcvtd_n_u64_f64, aarch64_neon_vcvtfp2fxu, AddRetType | Add1ArgType),
1233 NEONMAP1(vcvtd_s32_f64, aarch64_neon_fcvtzs, AddRetType | Add1ArgType),
1234 NEONMAP1(vcvtd_s64_f64, aarch64_neon_fcvtzs, AddRetType | Add1ArgType),
1235 NEONMAP1(vcvtd_u32_f64, aarch64_neon_fcvtzu, AddRetType | Add1ArgType),
1236 NEONMAP1(vcvtd_u64_f64, aarch64_neon_fcvtzu, AddRetType | Add1ArgType),
1237 NEONMAP0(vcvth_bf16_f32),
1238 NEONMAP1(vcvtmd_s32_f64, aarch64_neon_fcvtms, AddRetType | Add1ArgType),
1239 NEONMAP1(vcvtmd_s64_f64, aarch64_neon_fcvtms, AddRetType | Add1ArgType),
1240 NEONMAP1(vcvtmd_u32_f64, aarch64_neon_fcvtmu, AddRetType | Add1ArgType),
1241 NEONMAP1(vcvtmd_u64_f64, aarch64_neon_fcvtmu, AddRetType | Add1ArgType),
1242 NEONMAP1(vcvtms_s32_f32, aarch64_neon_fcvtms, AddRetType | Add1ArgType),
1243 NEONMAP1(vcvtms_s64_f32, aarch64_neon_fcvtms, AddRetType | Add1ArgType),
1244 NEONMAP1(vcvtms_u32_f32, aarch64_neon_fcvtmu, AddRetType | Add1ArgType),
1245 NEONMAP1(vcvtms_u64_f32, aarch64_neon_fcvtmu, AddRetType | Add1ArgType),
1246 NEONMAP1(vcvtnd_s32_f64, aarch64_neon_fcvtns, AddRetType | Add1ArgType),
1247 NEONMAP1(vcvtnd_s64_f64, aarch64_neon_fcvtns, AddRetType | Add1ArgType),
1248 NEONMAP1(vcvtnd_u32_f64, aarch64_neon_fcvtnu, AddRetType | Add1ArgType),
1249 NEONMAP1(vcvtnd_u64_f64, aarch64_neon_fcvtnu, AddRetType | Add1ArgType),
1250 NEONMAP1(vcvtns_s32_f32, aarch64_neon_fcvtns, AddRetType | Add1ArgType),
1251 NEONMAP1(vcvtns_s64_f32, aarch64_neon_fcvtns, AddRetType | Add1ArgType),
1252 NEONMAP1(vcvtns_u32_f32, aarch64_neon_fcvtnu, AddRetType | Add1ArgType),
1253 NEONMAP1(vcvtns_u64_f32, aarch64_neon_fcvtnu, AddRetType | Add1ArgType),
1254 NEONMAP1(vcvtpd_s32_f64, aarch64_neon_fcvtps, AddRetType | Add1ArgType),
1255 NEONMAP1(vcvtpd_s64_f64, aarch64_neon_fcvtps, AddRetType | Add1ArgType),
1256 NEONMAP1(vcvtpd_u32_f64, aarch64_neon_fcvtpu, AddRetType | Add1ArgType),
1257 NEONMAP1(vcvtpd_u64_f64, aarch64_neon_fcvtpu, AddRetType | Add1ArgType),
1258 NEONMAP1(vcvtps_s32_f32, aarch64_neon_fcvtps, AddRetType | Add1ArgType),
1259 NEONMAP1(vcvtps_s64_f32, aarch64_neon_fcvtps, AddRetType | Add1ArgType),
1260 NEONMAP1(vcvtps_u32_f32, aarch64_neon_fcvtpu, AddRetType | Add1ArgType),
1261 NEONMAP1(vcvtps_u64_f32, aarch64_neon_fcvtpu, AddRetType | Add1ArgType),
1262 NEONMAP1(vcvts_n_f32_s32, aarch64_neon_vcvtfxs2fp, AddRetType | Add1ArgType),
1263 NEONMAP1(vcvts_n_f32_u32, aarch64_neon_vcvtfxu2fp, AddRetType | Add1ArgType),
1264 NEONMAP1(vcvts_n_s32_f32, aarch64_neon_vcvtfp2fxs, AddRetType | Add1ArgType),
1265 NEONMAP1(vcvts_n_u32_f32, aarch64_neon_vcvtfp2fxu, AddRetType | Add1ArgType),
1266 NEONMAP1(vcvts_s32_f32, aarch64_neon_fcvtzs, AddRetType | Add1ArgType),
1267 NEONMAP1(vcvts_s64_f32, aarch64_neon_fcvtzs, AddRetType | Add1ArgType),
1268 NEONMAP1(vcvts_u32_f32, aarch64_neon_fcvtzu, AddRetType | Add1ArgType),
1269 NEONMAP1(vcvts_u64_f32, aarch64_neon_fcvtzu, AddRetType | Add1ArgType),
1270 NEONMAP1(vcvtxd_f32_f64, aarch64_sisd_fcvtxn, 0),
1271 NEONMAP1(vmaxnmv_f32, aarch64_neon_fmaxnmv, AddRetType | Add1ArgType),
1272 NEONMAP1(vmaxnmvq_f32, aarch64_neon_fmaxnmv, AddRetType | Add1ArgType),
1273 NEONMAP1(vmaxnmvq_f64, aarch64_neon_fmaxnmv, AddRetType | Add1ArgType),
1274 NEONMAP1(vmaxv_f32, aarch64_neon_fmaxv, AddRetType | Add1ArgType),
1275 NEONMAP1(vmaxv_s16, vector_reduce_smax, Add1ArgType),
1276 NEONMAP1(vmaxv_s32, vector_reduce_smax, Add1ArgType),
1277 NEONMAP1(vmaxv_s8, vector_reduce_smax, Add1ArgType),
1278 NEONMAP1(vmaxv_u16, vector_reduce_umax, Add1ArgType),
1279 NEONMAP1(vmaxv_u32, vector_reduce_umax, Add1ArgType),
1280 NEONMAP1(vmaxv_u8, vector_reduce_umax, Add1ArgType),
1281 NEONMAP1(vmaxvq_f32, aarch64_neon_fmaxv, AddRetType | Add1ArgType),
1282 NEONMAP1(vmaxvq_f64, aarch64_neon_fmaxv, AddRetType | Add1ArgType),
1283 NEONMAP1(vmaxvq_s16, vector_reduce_smax, Add1ArgType),
1284 NEONMAP1(vmaxvq_s32, vector_reduce_smax, Add1ArgType),
1285 NEONMAP1(vmaxvq_s8, vector_reduce_smax, Add1ArgType),
1286 NEONMAP1(vmaxvq_u16, vector_reduce_umax, Add1ArgType),
1287 NEONMAP1(vmaxvq_u32, vector_reduce_umax, Add1ArgType),
1288 NEONMAP1(vmaxvq_u8, vector_reduce_umax, Add1ArgType),
1289 NEONMAP1(vminnmv_f32, aarch64_neon_fminnmv, AddRetType | Add1ArgType),
1290 NEONMAP1(vminnmvq_f32, aarch64_neon_fminnmv, AddRetType | Add1ArgType),
1291 NEONMAP1(vminnmvq_f64, aarch64_neon_fminnmv, AddRetType | Add1ArgType),
1292 NEONMAP1(vminv_f32, aarch64_neon_fminv, AddRetType | Add1ArgType),
1293 NEONMAP1(vminv_s16, vector_reduce_smin, Add1ArgType),
1294 NEONMAP1(vminv_s32, vector_reduce_smin, Add1ArgType),
1295 NEONMAP1(vminv_s8, vector_reduce_smin, Add1ArgType),
1296 NEONMAP1(vminv_u16, vector_reduce_umin, Add1ArgType),
1297 NEONMAP1(vminv_u32, vector_reduce_umin, Add1ArgType),
1298 NEONMAP1(vminv_u8, vector_reduce_umin, Add1ArgType),
1299 NEONMAP1(vminvq_f32, aarch64_neon_fminv, AddRetType | Add1ArgType),
1300 NEONMAP1(vminvq_f64, aarch64_neon_fminv, AddRetType | Add1ArgType),
1301 NEONMAP1(vminvq_s16, vector_reduce_smin, Add1ArgType),
1302 NEONMAP1(vminvq_s32, vector_reduce_smin, Add1ArgType),
1303 NEONMAP1(vminvq_s8, vector_reduce_smin, Add1ArgType),
1304 NEONMAP1(vminvq_u16, vector_reduce_umin, Add1ArgType),
1305 NEONMAP1(vminvq_u32, vector_reduce_umin, Add1ArgType),
1306 NEONMAP1(vminvq_u8, vector_reduce_umin, Add1ArgType),
1307 NEONMAP1(vmull_p64, aarch64_neon_pmull64, 0),
1308 NEONMAP1(vmulxd_f64, aarch64_neon_fmulx, Add1ArgType),
1309 NEONMAP1(vmulxs_f32, aarch64_neon_fmulx, Add1ArgType),
1310 NEONMAP1(vpaddd_s64, vector_reduce_add, Add1ArgType),
1311 NEONMAP1(vpaddd_u64, vector_reduce_add, Add1ArgType),
1312 NEONMAP1(vpmaxnmqd_f64, aarch64_neon_fmaxnmv, AddRetType | Add1ArgType),
1313 NEONMAP1(vpmaxnms_f32, aarch64_neon_fmaxnmv, AddRetType | Add1ArgType),
1314 NEONMAP1(vpmaxqd_f64, aarch64_neon_fmaxv, AddRetType | Add1ArgType),
1315 NEONMAP1(vpmaxs_f32, aarch64_neon_fmaxv, AddRetType | Add1ArgType),
1316 NEONMAP1(vpminnmqd_f64, aarch64_neon_fminnmv, AddRetType | Add1ArgType),
1317 NEONMAP1(vpminnms_f32, aarch64_neon_fminnmv, AddRetType | Add1ArgType),
1318 NEONMAP1(vpminqd_f64, aarch64_neon_fminv, AddRetType | Add1ArgType),
1319 NEONMAP1(vpmins_f32, aarch64_neon_fminv, AddRetType | Add1ArgType),
1320 NEONMAP1(vqabsb_s8, aarch64_neon_sqabs, Vectorize1ArgType | Use64BitVectors),
1321 NEONMAP1(vqabsd_s64, aarch64_neon_sqabs, Add1ArgType),
1322 NEONMAP1(vqabsh_s16, aarch64_neon_sqabs, Vectorize1ArgType | Use64BitVectors),
1323 NEONMAP1(vqabss_s32, aarch64_neon_sqabs, Add1ArgType),
1324 NEONMAP1(vqaddb_s8, aarch64_neon_sqadd, Vectorize1ArgType | Use64BitVectors),
1325 NEONMAP1(vqaddb_u8, aarch64_neon_uqadd, Vectorize1ArgType | Use64BitVectors),
1326 NEONMAP1(vqaddd_s64, aarch64_neon_sqadd, Add1ArgType),
1327 NEONMAP1(vqaddd_u64, aarch64_neon_uqadd, Add1ArgType),
1328 NEONMAP1(vqaddh_s16, aarch64_neon_sqadd, Vectorize1ArgType | Use64BitVectors),
1329 NEONMAP1(vqaddh_u16, aarch64_neon_uqadd, Vectorize1ArgType | Use64BitVectors),
1330 NEONMAP1(vqadds_s32, aarch64_neon_sqadd, Add1ArgType),
1331 NEONMAP1(vqadds_u32, aarch64_neon_uqadd, Add1ArgType),
1332 NEONMAP1(vqdmulhh_s16, aarch64_neon_sqdmulh, Vectorize1ArgType | Use64BitVectors),
1333 NEONMAP1(vqdmulhs_s32, aarch64_neon_sqdmulh, Add1ArgType),
1334 NEONMAP1(vqdmullh_s16, aarch64_neon_sqdmull, VectorRet | Use128BitVectors),
1335 NEONMAP1(vqdmulls_s32, aarch64_neon_sqdmulls_scalar, 0),
1336 NEONMAP1(vqmovnd_s64, aarch64_neon_scalar_sqxtn, AddRetType | Add1ArgType),
1337 NEONMAP1(vqmovnd_u64, aarch64_neon_scalar_uqxtn, AddRetType | Add1ArgType),
1338 NEONMAP1(vqmovnh_s16, aarch64_neon_sqxtn, VectorRet | Use64BitVectors),
1339 NEONMAP1(vqmovnh_u16, aarch64_neon_uqxtn, VectorRet | Use64BitVectors),
1340 NEONMAP1(vqmovns_s32, aarch64_neon_sqxtn, VectorRet | Use64BitVectors),
1341 NEONMAP1(vqmovns_u32, aarch64_neon_uqxtn, VectorRet | Use64BitVectors),
1342 NEONMAP1(vqmovund_s64, aarch64_neon_scalar_sqxtun, AddRetType | Add1ArgType),
1343 NEONMAP1(vqmovunh_s16, aarch64_neon_sqxtun, VectorRet | Use64BitVectors),
1344 NEONMAP1(vqmovuns_s32, aarch64_neon_sqxtun, VectorRet | Use64BitVectors),
1345 NEONMAP1(vqnegb_s8, aarch64_neon_sqneg, Vectorize1ArgType | Use64BitVectors),
1346 NEONMAP1(vqnegd_s64, aarch64_neon_sqneg, Add1ArgType),
1347 NEONMAP1(vqnegh_s16, aarch64_neon_sqneg, Vectorize1ArgType | Use64BitVectors),
1348 NEONMAP1(vqnegs_s32, aarch64_neon_sqneg, Add1ArgType),
1349 NEONMAP1(vqrdmlahh_s16, aarch64_neon_sqrdmlah, Vectorize1ArgType | Use64BitVectors),
1350 NEONMAP1(vqrdmlahs_s32, aarch64_neon_sqrdmlah, Add1ArgType),
1351 NEONMAP1(vqrdmlshh_s16, aarch64_neon_sqrdmlsh, Vectorize1ArgType | Use64BitVectors),
1352 NEONMAP1(vqrdmlshs_s32, aarch64_neon_sqrdmlsh, Add1ArgType),
1353 NEONMAP1(vqrdmulhh_s16, aarch64_neon_sqrdmulh, Vectorize1ArgType | Use64BitVectors),
1354 NEONMAP1(vqrdmulhs_s32, aarch64_neon_sqrdmulh, Add1ArgType),
1355 NEONMAP1(vqrshlb_s8, aarch64_neon_sqrshl, Vectorize1ArgType | Use64BitVectors),
1356 NEONMAP1(vqrshlb_u8, aarch64_neon_uqrshl, Vectorize1ArgType | Use64BitVectors),
1357 NEONMAP1(vqrshld_s64, aarch64_neon_sqrshl, Add1ArgType),
1358 NEONMAP1(vqrshld_u64, aarch64_neon_uqrshl, Add1ArgType),
1359 NEONMAP1(vqrshlh_s16, aarch64_neon_sqrshl, Vectorize1ArgType | Use64BitVectors),
1360 NEONMAP1(vqrshlh_u16, aarch64_neon_uqrshl, Vectorize1ArgType | Use64BitVectors),
1361 NEONMAP1(vqrshls_s32, aarch64_neon_sqrshl, Add1ArgType),
1362 NEONMAP1(vqrshls_u32, aarch64_neon_uqrshl, Add1ArgType),
1363 NEONMAP1(vqrshrnd_n_s64, aarch64_neon_sqrshrn, AddRetType),
1364 NEONMAP1(vqrshrnd_n_u64, aarch64_neon_uqrshrn, AddRetType),
1365 NEONMAP1(vqrshrnh_n_s16, aarch64_neon_sqrshrn, VectorRet | Use64BitVectors),
1366 NEONMAP1(vqrshrnh_n_u16, aarch64_neon_uqrshrn, VectorRet | Use64BitVectors),
1367 NEONMAP1(vqrshrns_n_s32, aarch64_neon_sqrshrn, VectorRet | Use64BitVectors),
1368 NEONMAP1(vqrshrns_n_u32, aarch64_neon_uqrshrn, VectorRet | Use64BitVectors),
1369 NEONMAP1(vqrshrund_n_s64, aarch64_neon_sqrshrun, AddRetType),
1370 NEONMAP1(vqrshrunh_n_s16, aarch64_neon_sqrshrun, VectorRet | Use64BitVectors),
1371 NEONMAP1(vqrshruns_n_s32, aarch64_neon_sqrshrun, VectorRet | Use64BitVectors),
1372 NEONMAP1(vqshlb_n_s8, aarch64_neon_sqshl, Vectorize1ArgType | Use64BitVectors),
1373 NEONMAP1(vqshlb_n_u8, aarch64_neon_uqshl, Vectorize1ArgType | Use64BitVectors),
1374 NEONMAP1(vqshlb_s8, aarch64_neon_sqshl, Vectorize1ArgType | Use64BitVectors),
1375 NEONMAP1(vqshlb_u8, aarch64_neon_uqshl, Vectorize1ArgType | Use64BitVectors),
1376 NEONMAP1(vqshld_s64, aarch64_neon_sqshl, Add1ArgType),
1377 NEONMAP1(vqshld_u64, aarch64_neon_uqshl, Add1ArgType),
1378 NEONMAP1(vqshlh_n_s16, aarch64_neon_sqshl, Vectorize1ArgType | Use64BitVectors),
1379 NEONMAP1(vqshlh_n_u16, aarch64_neon_uqshl, Vectorize1ArgType | Use64BitVectors),
1380 NEONMAP1(vqshlh_s16, aarch64_neon_sqshl, Vectorize1ArgType | Use64BitVectors),
1381 NEONMAP1(vqshlh_u16, aarch64_neon_uqshl, Vectorize1ArgType | Use64BitVectors),
1382 NEONMAP1(vqshls_n_s32, aarch64_neon_sqshl, Add1ArgType),
1383 NEONMAP1(vqshls_n_u32, aarch64_neon_uqshl, Add1ArgType),
1384 NEONMAP1(vqshls_s32, aarch64_neon_sqshl, Add1ArgType),
1385 NEONMAP1(vqshls_u32, aarch64_neon_uqshl, Add1ArgType),
1386 NEONMAP1(vqshlub_n_s8, aarch64_neon_sqshlu, Vectorize1ArgType | Use64BitVectors),
1387 NEONMAP1(vqshluh_n_s16, aarch64_neon_sqshlu, Vectorize1ArgType | Use64BitVectors),
1388 NEONMAP1(vqshlus_n_s32, aarch64_neon_sqshlu, Add1ArgType),
1389 NEONMAP1(vqshrnd_n_s64, aarch64_neon_sqshrn, AddRetType),
1390 NEONMAP1(vqshrnd_n_u64, aarch64_neon_uqshrn, AddRetType),
1391 NEONMAP1(vqshrnh_n_s16, aarch64_neon_sqshrn, VectorRet | Use64BitVectors),
1392 NEONMAP1(vqshrnh_n_u16, aarch64_neon_uqshrn, VectorRet | Use64BitVectors),
1393 NEONMAP1(vqshrns_n_s32, aarch64_neon_sqshrn, VectorRet | Use64BitVectors),
1394 NEONMAP1(vqshrns_n_u32, aarch64_neon_uqshrn, VectorRet | Use64BitVectors),
1395 NEONMAP1(vqshrund_n_s64, aarch64_neon_sqshrun, AddRetType),
1396 NEONMAP1(vqshrunh_n_s16, aarch64_neon_sqshrun, VectorRet | Use64BitVectors),
1397 NEONMAP1(vqshruns_n_s32, aarch64_neon_sqshrun, VectorRet | Use64BitVectors),
1398 NEONMAP1(vqsubb_s8, aarch64_neon_sqsub, Vectorize1ArgType | Use64BitVectors),
1399 NEONMAP1(vqsubb_u8, aarch64_neon_uqsub, Vectorize1ArgType | Use64BitVectors),
1400 NEONMAP1(vqsubd_s64, aarch64_neon_sqsub, Add1ArgType),
1401 NEONMAP1(vqsubd_u64, aarch64_neon_uqsub, Add1ArgType),
1402 NEONMAP1(vqsubh_s16, aarch64_neon_sqsub, Vectorize1ArgType | Use64BitVectors),
1403 NEONMAP1(vqsubh_u16, aarch64_neon_uqsub, Vectorize1ArgType | Use64BitVectors),
1404 NEONMAP1(vqsubs_s32, aarch64_neon_sqsub, Add1ArgType),
1405 NEONMAP1(vqsubs_u32, aarch64_neon_uqsub, Add1ArgType),
1406 NEONMAP1(vrecped_f64, aarch64_neon_frecpe, Add1ArgType),
1407 NEONMAP1(vrecpes_f32, aarch64_neon_frecpe, Add1ArgType),
1408 NEONMAP1(vrecpxd_f64, aarch64_neon_frecpx, Add1ArgType),
1409 NEONMAP1(vrecpxs_f32, aarch64_neon_frecpx, Add1ArgType),
1410 NEONMAP1(vrshld_s64, aarch64_neon_srshl, Add1ArgType),
1411 NEONMAP1(vrshld_u64, aarch64_neon_urshl, Add1ArgType),
1412 NEONMAP1(vrsqrted_f64, aarch64_neon_frsqrte, Add1ArgType),
1413 NEONMAP1(vrsqrtes_f32, aarch64_neon_frsqrte, Add1ArgType),
1414 NEONMAP1(vrsqrtsd_f64, aarch64_neon_frsqrts, Add1ArgType),
1415 NEONMAP1(vrsqrtss_f32, aarch64_neon_frsqrts, Add1ArgType),
1416 NEONMAP1(vsha1cq_u32, aarch64_crypto_sha1c, 0),
1417 NEONMAP1(vsha1h_u32, aarch64_crypto_sha1h, 0),
1418 NEONMAP1(vsha1mq_u32, aarch64_crypto_sha1m, 0),
1419 NEONMAP1(vsha1pq_u32, aarch64_crypto_sha1p, 0),
1420 NEONMAP1(vshld_s64, aarch64_neon_sshl, Add1ArgType),
1421 NEONMAP1(vshld_u64, aarch64_neon_ushl, Add1ArgType),
1422 NEONMAP1(vslid_n_s64, aarch64_neon_vsli, Vectorize1ArgType),
1423 NEONMAP1(vslid_n_u64, aarch64_neon_vsli, Vectorize1ArgType),
1424 NEONMAP1(vsqaddb_u8, aarch64_neon_usqadd, Vectorize1ArgType | Use64BitVectors),
1425 NEONMAP1(vsqaddd_u64, aarch64_neon_usqadd, Add1ArgType),
1426 NEONMAP1(vsqaddh_u16, aarch64_neon_usqadd, Vectorize1ArgType | Use64BitVectors),
1427 NEONMAP1(vsqadds_u32, aarch64_neon_usqadd, Add1ArgType),
1428 NEONMAP1(vsrid_n_s64, aarch64_neon_vsri, Vectorize1ArgType),
1429 NEONMAP1(vsrid_n_u64, aarch64_neon_vsri, Vectorize1ArgType),
1430 NEONMAP1(vuqaddb_s8, aarch64_neon_suqadd, Vectorize1ArgType | Use64BitVectors),
1431 NEONMAP1(vuqaddd_s64, aarch64_neon_suqadd, Add1ArgType),
1432 NEONMAP1(vuqaddh_s16, aarch64_neon_suqadd, Vectorize1ArgType | Use64BitVectors),
1433 NEONMAP1(vuqadds_s32, aarch64_neon_suqadd, Add1ArgType),
1434  // FP16 scalar intrinsics go here.
1435 NEONMAP1(vabdh_f16, aarch64_sisd_fabd, Add1ArgType),
1436 NEONMAP1(vcvtah_s32_f16, aarch64_neon_fcvtas, AddRetType | Add1ArgType),
1437 NEONMAP1(vcvtah_s64_f16, aarch64_neon_fcvtas, AddRetType | Add1ArgType),
1438 NEONMAP1(vcvtah_u32_f16, aarch64_neon_fcvtau, AddRetType | Add1ArgType),
1439 NEONMAP1(vcvtah_u64_f16, aarch64_neon_fcvtau, AddRetType | Add1ArgType),
1440 NEONMAP1(vcvth_n_f16_s32, aarch64_neon_vcvtfxs2fp, AddRetType | Add1ArgType),
1441 NEONMAP1(vcvth_n_f16_s64, aarch64_neon_vcvtfxs2fp, AddRetType | Add1ArgType),
1442 NEONMAP1(vcvth_n_f16_u32, aarch64_neon_vcvtfxu2fp, AddRetType | Add1ArgType),
1443 NEONMAP1(vcvth_n_f16_u64, aarch64_neon_vcvtfxu2fp, AddRetType | Add1ArgType),
1444 NEONMAP1(vcvth_n_s32_f16, aarch64_neon_vcvtfp2fxs, AddRetType | Add1ArgType),
1445 NEONMAP1(vcvth_n_s64_f16, aarch64_neon_vcvtfp2fxs, AddRetType | Add1ArgType),
1446 NEONMAP1(vcvth_n_u32_f16, aarch64_neon_vcvtfp2fxu, AddRetType | Add1ArgType),
1447 NEONMAP1(vcvth_n_u64_f16, aarch64_neon_vcvtfp2fxu, AddRetType | Add1ArgType),
1448 NEONMAP1(vcvth_s32_f16, aarch64_neon_fcvtzs, AddRetType | Add1ArgType),
1449 NEONMAP1(vcvth_s64_f16, aarch64_neon_fcvtzs, AddRetType | Add1ArgType),
1450 NEONMAP1(vcvth_u32_f16, aarch64_neon_fcvtzu, AddRetType | Add1ArgType),
1451 NEONMAP1(vcvth_u64_f16, aarch64_neon_fcvtzu, AddRetType | Add1ArgType),
1452 NEONMAP1(vcvtmh_s32_f16, aarch64_neon_fcvtms, AddRetType | Add1ArgType),
1453 NEONMAP1(vcvtmh_s64_f16, aarch64_neon_fcvtms, AddRetType | Add1ArgType),
1454 NEONMAP1(vcvtmh_u32_f16, aarch64_neon_fcvtmu, AddRetType | Add1ArgType),
1455 NEONMAP1(vcvtmh_u64_f16, aarch64_neon_fcvtmu, AddRetType | Add1ArgType),
1456 NEONMAP1(vcvtnh_s32_f16, aarch64_neon_fcvtns, AddRetType | Add1ArgType),
1457 NEONMAP1(vcvtnh_s64_f16, aarch64_neon_fcvtns, AddRetType | Add1ArgType),
1458 NEONMAP1(vcvtnh_u32_f16, aarch64_neon_fcvtnu, AddRetType | Add1ArgType),
1459 NEONMAP1(vcvtnh_u64_f16, aarch64_neon_fcvtnu, AddRetType | Add1ArgType),
1460 NEONMAP1(vcvtph_s32_f16, aarch64_neon_fcvtps, AddRetType | Add1ArgType),
1461 NEONMAP1(vcvtph_s64_f16, aarch64_neon_fcvtps, AddRetType | Add1ArgType),
1462 NEONMAP1(vcvtph_u32_f16, aarch64_neon_fcvtpu, AddRetType | Add1ArgType),
1463 NEONMAP1(vcvtph_u64_f16, aarch64_neon_fcvtpu, AddRetType | Add1ArgType),
1464 NEONMAP1(vmulxh_f16, aarch64_neon_fmulx, Add1ArgType),
1465 NEONMAP1(vrecpeh_f16, aarch64_neon_frecpe, Add1ArgType),
1466 NEONMAP1(vrecpxh_f16, aarch64_neon_frecpx, Add1ArgType),
1467 NEONMAP1(vrsqrteh_f16, aarch64_neon_frsqrte, Add1ArgType),
1468 NEONMAP1(vrsqrtsh_f16, aarch64_neon_frsqrts, Add1ArgType),
1469};
1470// clang-format on
1471
1472// Some intrinsics are equivalent for codegen.
1473static const std::pair<unsigned, unsigned> NEONEquivalentIntrinsicMap[] = {
1474 { NEON::BI__builtin_neon_splat_lane_bf16, NEON::BI__builtin_neon_splat_lane_v, },
1475 { NEON::BI__builtin_neon_splat_laneq_bf16, NEON::BI__builtin_neon_splat_laneq_v, },
1476 { NEON::BI__builtin_neon_splatq_lane_bf16, NEON::BI__builtin_neon_splatq_lane_v, },
1477 { NEON::BI__builtin_neon_splatq_laneq_bf16, NEON::BI__builtin_neon_splatq_laneq_v, },
1478 { NEON::BI__builtin_neon_vabd_f16, NEON::BI__builtin_neon_vabd_v, },
1479 { NEON::BI__builtin_neon_vabdq_f16, NEON::BI__builtin_neon_vabdq_v, },
1480 { NEON::BI__builtin_neon_vabs_f16, NEON::BI__builtin_neon_vabs_v, },
1481 { NEON::BI__builtin_neon_vabsq_f16, NEON::BI__builtin_neon_vabsq_v, },
1482 { NEON::BI__builtin_neon_vcage_f16, NEON::BI__builtin_neon_vcage_v, },
1483 { NEON::BI__builtin_neon_vcageq_f16, NEON::BI__builtin_neon_vcageq_v, },
1484 { NEON::BI__builtin_neon_vcagt_f16, NEON::BI__builtin_neon_vcagt_v, },
1485 { NEON::BI__builtin_neon_vcagtq_f16, NEON::BI__builtin_neon_vcagtq_v, },
1486 { NEON::BI__builtin_neon_vcale_f16, NEON::BI__builtin_neon_vcale_v, },
1487 { NEON::BI__builtin_neon_vcaleq_f16, NEON::BI__builtin_neon_vcaleq_v, },
1488 { NEON::BI__builtin_neon_vcalt_f16, NEON::BI__builtin_neon_vcalt_v, },
1489 { NEON::BI__builtin_neon_vcaltq_f16, NEON::BI__builtin_neon_vcaltq_v, },
1490 { NEON::BI__builtin_neon_vceqz_f16, NEON::BI__builtin_neon_vceqz_v, },
1491 { NEON::BI__builtin_neon_vceqzq_f16, NEON::BI__builtin_neon_vceqzq_v, },
1492 { NEON::BI__builtin_neon_vcgez_f16, NEON::BI__builtin_neon_vcgez_v, },
1493 { NEON::BI__builtin_neon_vcgezq_f16, NEON::BI__builtin_neon_vcgezq_v, },
1494 { NEON::BI__builtin_neon_vcgtz_f16, NEON::BI__builtin_neon_vcgtz_v, },
1495 { NEON::BI__builtin_neon_vcgtzq_f16, NEON::BI__builtin_neon_vcgtzq_v, },
1496 { NEON::BI__builtin_neon_vclez_f16, NEON::BI__builtin_neon_vclez_v, },
1497 { NEON::BI__builtin_neon_vclezq_f16, NEON::BI__builtin_neon_vclezq_v, },
1498 { NEON::BI__builtin_neon_vcltz_f16, NEON::BI__builtin_neon_vcltz_v, },
1499 { NEON::BI__builtin_neon_vcltzq_f16, NEON::BI__builtin_neon_vcltzq_v, },
1500 { NEON::BI__builtin_neon_vfma_f16, NEON::BI__builtin_neon_vfma_v, },
1501 { NEON::BI__builtin_neon_vfma_lane_f16, NEON::BI__builtin_neon_vfma_lane_v, },
1502 { NEON::BI__builtin_neon_vfma_laneq_f16, NEON::BI__builtin_neon_vfma_laneq_v, },
1503 { NEON::BI__builtin_neon_vfmaq_f16, NEON::BI__builtin_neon_vfmaq_v, },
1504 { NEON::BI__builtin_neon_vfmaq_lane_f16, NEON::BI__builtin_neon_vfmaq_lane_v, },
1505 { NEON::BI__builtin_neon_vfmaq_laneq_f16, NEON::BI__builtin_neon_vfmaq_laneq_v, },
1506 { NEON::BI__builtin_neon_vld1_bf16_x2, NEON::BI__builtin_neon_vld1_x2_v },
1507 { NEON::BI__builtin_neon_vld1_bf16_x3, NEON::BI__builtin_neon_vld1_x3_v },
1508 { NEON::BI__builtin_neon_vld1_bf16_x4, NEON::BI__builtin_neon_vld1_x4_v },
1509 { NEON::BI__builtin_neon_vld1_bf16, NEON::BI__builtin_neon_vld1_v },
1510 { NEON::BI__builtin_neon_vld1_dup_bf16, NEON::BI__builtin_neon_vld1_dup_v },
1511 { NEON::BI__builtin_neon_vld1_lane_bf16, NEON::BI__builtin_neon_vld1_lane_v },
1512 { NEON::BI__builtin_neon_vld1q_bf16_x2, NEON::BI__builtin_neon_vld1q_x2_v },
1513 { NEON::BI__builtin_neon_vld1q_bf16_x3, NEON::BI__builtin_neon_vld1q_x3_v },
1514 { NEON::BI__builtin_neon_vld1q_bf16_x4, NEON::BI__builtin_neon_vld1q_x4_v },
1515 { NEON::BI__builtin_neon_vld1q_bf16, NEON::BI__builtin_neon_vld1q_v },
1516 { NEON::BI__builtin_neon_vld1q_dup_bf16, NEON::BI__builtin_neon_vld1q_dup_v },
1517 { NEON::BI__builtin_neon_vld1q_lane_bf16, NEON::BI__builtin_neon_vld1q_lane_v },
1518 { NEON::BI__builtin_neon_vld2_bf16, NEON::BI__builtin_neon_vld2_v },
1519 { NEON::BI__builtin_neon_vld2_dup_bf16, NEON::BI__builtin_neon_vld2_dup_v },
1520 { NEON::BI__builtin_neon_vld2_lane_bf16, NEON::BI__builtin_neon_vld2_lane_v },
1521 { NEON::BI__builtin_neon_vld2q_bf16, NEON::BI__builtin_neon_vld2q_v },
1522 { NEON::BI__builtin_neon_vld2q_dup_bf16, NEON::BI__builtin_neon_vld2q_dup_v },
1523 { NEON::BI__builtin_neon_vld2q_lane_bf16, NEON::BI__builtin_neon_vld2q_lane_v },
1524 { NEON::BI__builtin_neon_vld3_bf16, NEON::BI__builtin_neon_vld3_v },
1525 { NEON::BI__builtin_neon_vld3_dup_bf16, NEON::BI__builtin_neon_vld3_dup_v },
1526 { NEON::BI__builtin_neon_vld3_lane_bf16, NEON::BI__builtin_neon_vld3_lane_v },
1527 { NEON::BI__builtin_neon_vld3q_bf16, NEON::BI__builtin_neon_vld3q_v },
1528 { NEON::BI__builtin_neon_vld3q_dup_bf16, NEON::BI__builtin_neon_vld3q_dup_v },
1529 { NEON::BI__builtin_neon_vld3q_lane_bf16, NEON::BI__builtin_neon_vld3q_lane_v },
1530 { NEON::BI__builtin_neon_vld4_bf16, NEON::BI__builtin_neon_vld4_v },
1531 { NEON::BI__builtin_neon_vld4_dup_bf16, NEON::BI__builtin_neon_vld4_dup_v },
1532 { NEON::BI__builtin_neon_vld4_lane_bf16, NEON::BI__builtin_neon_vld4_lane_v },
1533 { NEON::BI__builtin_neon_vld4q_bf16, NEON::BI__builtin_neon_vld4q_v },
1534 { NEON::BI__builtin_neon_vld4q_dup_bf16, NEON::BI__builtin_neon_vld4q_dup_v },
1535 { NEON::BI__builtin_neon_vld4q_lane_bf16, NEON::BI__builtin_neon_vld4q_lane_v },
1536 { NEON::BI__builtin_neon_vmax_f16, NEON::BI__builtin_neon_vmax_v, },
1537 { NEON::BI__builtin_neon_vmaxnm_f16, NEON::BI__builtin_neon_vmaxnm_v, },
1538 { NEON::BI__builtin_neon_vmaxnmq_f16, NEON::BI__builtin_neon_vmaxnmq_v, },
1539 { NEON::BI__builtin_neon_vmaxq_f16, NEON::BI__builtin_neon_vmaxq_v, },
1540 { NEON::BI__builtin_neon_vmin_f16, NEON::BI__builtin_neon_vmin_v, },
1541 { NEON::BI__builtin_neon_vminnm_f16, NEON::BI__builtin_neon_vminnm_v, },
1542 { NEON::BI__builtin_neon_vminnmq_f16, NEON::BI__builtin_neon_vminnmq_v, },
1543 { NEON::BI__builtin_neon_vminq_f16, NEON::BI__builtin_neon_vminq_v, },
1544 { NEON::BI__builtin_neon_vmulx_f16, NEON::BI__builtin_neon_vmulx_v, },
1545 { NEON::BI__builtin_neon_vmulxq_f16, NEON::BI__builtin_neon_vmulxq_v, },
1546 { NEON::BI__builtin_neon_vpadd_f16, NEON::BI__builtin_neon_vpadd_v, },
1547 { NEON::BI__builtin_neon_vpaddq_f16, NEON::BI__builtin_neon_vpaddq_v, },
1548 { NEON::BI__builtin_neon_vpmax_f16, NEON::BI__builtin_neon_vpmax_v, },
1549 { NEON::BI__builtin_neon_vpmaxnm_f16, NEON::BI__builtin_neon_vpmaxnm_v, },
1550 { NEON::BI__builtin_neon_vpmaxnmq_f16, NEON::BI__builtin_neon_vpmaxnmq_v, },
1551 { NEON::BI__builtin_neon_vpmaxq_f16, NEON::BI__builtin_neon_vpmaxq_v, },
1552 { NEON::BI__builtin_neon_vpmin_f16, NEON::BI__builtin_neon_vpmin_v, },
1553 { NEON::BI__builtin_neon_vpminnm_f16, NEON::BI__builtin_neon_vpminnm_v, },
1554 { NEON::BI__builtin_neon_vpminnmq_f16, NEON::BI__builtin_neon_vpminnmq_v, },
1555 { NEON::BI__builtin_neon_vpminq_f16, NEON::BI__builtin_neon_vpminq_v, },
1556 { NEON::BI__builtin_neon_vrecpe_f16, NEON::BI__builtin_neon_vrecpe_v, },
1557 { NEON::BI__builtin_neon_vrecpeq_f16, NEON::BI__builtin_neon_vrecpeq_v, },
1558 { NEON::BI__builtin_neon_vrecps_f16, NEON::BI__builtin_neon_vrecps_v, },
1559 { NEON::BI__builtin_neon_vrecpsq_f16, NEON::BI__builtin_neon_vrecpsq_v, },
1560 { NEON::BI__builtin_neon_vrnd_f16, NEON::BI__builtin_neon_vrnd_v, },
1561 { NEON::BI__builtin_neon_vrnda_f16, NEON::BI__builtin_neon_vrnda_v, },
1562 { NEON::BI__builtin_neon_vrndaq_f16, NEON::BI__builtin_neon_vrndaq_v, },
1563 { NEON::BI__builtin_neon_vrndi_f16, NEON::BI__builtin_neon_vrndi_v, },
1564 { NEON::BI__builtin_neon_vrndiq_f16, NEON::BI__builtin_neon_vrndiq_v, },
1565 { NEON::BI__builtin_neon_vrndm_f16, NEON::BI__builtin_neon_vrndm_v, },
1566 { NEON::BI__builtin_neon_vrndmq_f16, NEON::BI__builtin_neon_vrndmq_v, },
1567 { NEON::BI__builtin_neon_vrndn_f16, NEON::BI__builtin_neon_vrndn_v, },
1568 { NEON::BI__builtin_neon_vrndnq_f16, NEON::BI__builtin_neon_vrndnq_v, },
1569 { NEON::BI__builtin_neon_vrndp_f16, NEON::BI__builtin_neon_vrndp_v, },
1570 { NEON::BI__builtin_neon_vrndpq_f16, NEON::BI__builtin_neon_vrndpq_v, },
1571 { NEON::BI__builtin_neon_vrndq_f16, NEON::BI__builtin_neon_vrndq_v, },
1572 { NEON::BI__builtin_neon_vrndx_f16, NEON::BI__builtin_neon_vrndx_v, },
1573 { NEON::BI__builtin_neon_vrndxq_f16, NEON::BI__builtin_neon_vrndxq_v, },
1574 { NEON::BI__builtin_neon_vrsqrte_f16, NEON::BI__builtin_neon_vrsqrte_v, },
1575 { NEON::BI__builtin_neon_vrsqrteq_f16, NEON::BI__builtin_neon_vrsqrteq_v, },
1576 { NEON::BI__builtin_neon_vrsqrts_f16, NEON::BI__builtin_neon_vrsqrts_v, },
1577 { NEON::BI__builtin_neon_vrsqrtsq_f16, NEON::BI__builtin_neon_vrsqrtsq_v, },
1578 { NEON::BI__builtin_neon_vsqrt_f16, NEON::BI__builtin_neon_vsqrt_v, },
1579 { NEON::BI__builtin_neon_vsqrtq_f16, NEON::BI__builtin_neon_vsqrtq_v, },
1580 { NEON::BI__builtin_neon_vst1_bf16_x2, NEON::BI__builtin_neon_vst1_x2_v },
1581 { NEON::BI__builtin_neon_vst1_bf16_x3, NEON::BI__builtin_neon_vst1_x3_v },
1582 { NEON::BI__builtin_neon_vst1_bf16_x4, NEON::BI__builtin_neon_vst1_x4_v },
1583 { NEON::BI__builtin_neon_vst1_bf16, NEON::BI__builtin_neon_vst1_v },
1584 { NEON::BI__builtin_neon_vst1_lane_bf16, NEON::BI__builtin_neon_vst1_lane_v },
1585 { NEON::BI__builtin_neon_vst1q_bf16_x2, NEON::BI__builtin_neon_vst1q_x2_v },
1586 { NEON::BI__builtin_neon_vst1q_bf16_x3, NEON::BI__builtin_neon_vst1q_x3_v },
1587 { NEON::BI__builtin_neon_vst1q_bf16_x4, NEON::BI__builtin_neon_vst1q_x4_v },
1588 { NEON::BI__builtin_neon_vst1q_bf16, NEON::BI__builtin_neon_vst1q_v },
1589 { NEON::BI__builtin_neon_vst1q_lane_bf16, NEON::BI__builtin_neon_vst1q_lane_v },
1590 { NEON::BI__builtin_neon_vst2_bf16, NEON::BI__builtin_neon_vst2_v },
1591 { NEON::BI__builtin_neon_vst2_lane_bf16, NEON::BI__builtin_neon_vst2_lane_v },
1592 { NEON::BI__builtin_neon_vst2q_bf16, NEON::BI__builtin_neon_vst2q_v },
1593 { NEON::BI__builtin_neon_vst2q_lane_bf16, NEON::BI__builtin_neon_vst2q_lane_v },
1594 { NEON::BI__builtin_neon_vst3_bf16, NEON::BI__builtin_neon_vst3_v },
1595 { NEON::BI__builtin_neon_vst3_lane_bf16, NEON::BI__builtin_neon_vst3_lane_v },
1596 { NEON::BI__builtin_neon_vst3q_bf16, NEON::BI__builtin_neon_vst3q_v },
1597 { NEON::BI__builtin_neon_vst3q_lane_bf16, NEON::BI__builtin_neon_vst3q_lane_v },
1598 { NEON::BI__builtin_neon_vst4_bf16, NEON::BI__builtin_neon_vst4_v },
1599 { NEON::BI__builtin_neon_vst4_lane_bf16, NEON::BI__builtin_neon_vst4_lane_v },
1600 { NEON::BI__builtin_neon_vst4q_bf16, NEON::BI__builtin_neon_vst4q_v },
1601 { NEON::BI__builtin_neon_vst4q_lane_bf16, NEON::BI__builtin_neon_vst4q_lane_v },
1602  // The mangling rules cause us to have one ID for each type for vldap1(q)_lane
1603  // and vstl1(q)_lane, but codegen is equivalent for all of them. Choose an
1604  // arbitrary one to be handled as the canonical variation.
1605 { NEON::BI__builtin_neon_vldap1_lane_u64, NEON::BI__builtin_neon_vldap1_lane_s64 },
1606 { NEON::BI__builtin_neon_vldap1_lane_f64, NEON::BI__builtin_neon_vldap1_lane_s64 },
1607 { NEON::BI__builtin_neon_vldap1_lane_p64, NEON::BI__builtin_neon_vldap1_lane_s64 },
1608 { NEON::BI__builtin_neon_vldap1q_lane_u64, NEON::BI__builtin_neon_vldap1q_lane_s64 },
1609 { NEON::BI__builtin_neon_vldap1q_lane_f64, NEON::BI__builtin_neon_vldap1q_lane_s64 },
1610 { NEON::BI__builtin_neon_vldap1q_lane_p64, NEON::BI__builtin_neon_vldap1q_lane_s64 },
1611 { NEON::BI__builtin_neon_vstl1_lane_u64, NEON::BI__builtin_neon_vstl1_lane_s64 },
1612 { NEON::BI__builtin_neon_vstl1_lane_f64, NEON::BI__builtin_neon_vstl1_lane_s64 },
1613 { NEON::BI__builtin_neon_vstl1_lane_p64, NEON::BI__builtin_neon_vstl1_lane_s64 },
1614 { NEON::BI__builtin_neon_vstl1q_lane_u64, NEON::BI__builtin_neon_vstl1q_lane_s64 },
1615 { NEON::BI__builtin_neon_vstl1q_lane_f64, NEON::BI__builtin_neon_vstl1q_lane_s64 },
1616 { NEON::BI__builtin_neon_vstl1q_lane_p64, NEON::BI__builtin_neon_vstl1q_lane_s64 },
1617};
1618
1619#undef NEONMAP0
1620#undef NEONMAP1
1621#undef NEONMAP2
1622
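// The SVE and SME maps below reuse the same table format; their entries are
// pulled in from the generated .inc files included inside each array.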
1623#define SVEMAP1(NameBase, LLVMIntrinsic, TypeModifier) \
1624 { \
1625 #NameBase, SVE::BI__builtin_sve_##NameBase, Intrinsic::LLVMIntrinsic, 0, \
1626 TypeModifier \
1627 }
1628
1629#define SVEMAP2(NameBase, TypeModifier) \
1630 { #NameBase, SVE::BI__builtin_sve_##NameBase, 0, 0, TypeModifier }
1631static const ARMVectorIntrinsicInfo AArch64SVEIntrinsicMap[] = {
1632#define GET_SVE_LLVM_INTRINSIC_MAP
1633#include "clang/Basic/arm_sve_builtin_cg.inc"
1634#include "clang/Basic/BuiltinsAArch64NeonSVEBridge_cg.def"
1635#undef GET_SVE_LLVM_INTRINSIC_MAP
1636};
1637
1638#undef SVEMAP1
1639#undef SVEMAP2
1640
1641#define SMEMAP1(NameBase, LLVMIntrinsic, TypeModifier) \
1642 { \
1643 #NameBase, SME::BI__builtin_sme_##NameBase, Intrinsic::LLVMIntrinsic, 0, \
1644 TypeModifier \
1645 }
1646
1647#define SMEMAP2(NameBase, TypeModifier) \
1648 { #NameBase, SME::BI__builtin_sme_##NameBase, 0, 0, TypeModifier }
1649static const ARMVectorIntrinsicInfo AArch64SMEIntrinsicMap[] = {
1650#define GET_SME_LLVM_INTRINSIC_MAP
1651#include "clang/Basic/arm_sme_builtin_cg.inc"
1652#undef GET_SME_LLVM_INTRINSIC_MAP
1653};
1654
1655#undef SMEMAP1
1656#undef SMEMAP2

static bool NEONSIMDIntrinsicsProvenSorted = false;

static bool AArch64SIMDIntrinsicsProvenSorted = false;
static bool AArch64SISDIntrinsicsProvenSorted = false;
static bool AArch64SVEIntrinsicsProvenSorted = false;
static bool AArch64SMEIntrinsicsProvenSorted = false;

static const ARMVectorIntrinsicInfo *
findARMVectorIntrinsicInMap(ArrayRef<ARMVectorIntrinsicInfo> IntrinsicMap,
                            unsigned BuiltinID, bool &MapProvenSorted) {

#ifndef NDEBUG
  if (!MapProvenSorted) {
    assert(llvm::is_sorted(IntrinsicMap));
    MapProvenSorted = true;
  }
#endif

  const ARMVectorIntrinsicInfo *Builtin =
      llvm::lower_bound(IntrinsicMap, BuiltinID);

  if (Builtin != IntrinsicMap.end() && Builtin->BuiltinID == BuiltinID)
    return Builtin;

  return nullptr;
}

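// Build the explicit overload type list for a NEON intrinsic from the table's
// type-modifier flags (return type and/or argument types, optionally promoted
// to 64- or 128-bit vectors) and return the matching intrinsic declaration.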
Function *CodeGenFunction::LookupNeonLLVMIntrinsic(unsigned IntrinsicID,
                                                   unsigned Modifier,
                                                   llvm::Type *ArgType,
                                                   const CallExpr *E) {
  int VectorSize = 0;
  if (Modifier & Use64BitVectors)
    VectorSize = 64;
  else if (Modifier & Use128BitVectors)
    VectorSize = 128;

  // Return type.
  SmallVector<llvm::Type *, 3> Tys;
  if (Modifier & AddRetType) {
    llvm::Type *Ty = ConvertType(E->getCallReturnType(getContext()));
    if (Modifier & VectorizeRetType)
      Ty = llvm::FixedVectorType::get(
          Ty, VectorSize ? VectorSize / Ty->getPrimitiveSizeInBits() : 1);

    Tys.push_back(Ty);
  }

  // Arguments.
  if (Modifier & VectorizeArgTypes) {
    int Elts = VectorSize ? VectorSize / ArgType->getPrimitiveSizeInBits() : 1;
    ArgType = llvm::FixedVectorType::get(ArgType, Elts);
  }

  if (Modifier & (Add1ArgType | Add2ArgTypes))
    Tys.push_back(ArgType);

  if (Modifier & Add2ArgTypes)
    Tys.push_back(ArgType);

  if (Modifier & InventFloatType)
    Tys.push_back(FloatTy);

  return CGM.getIntrinsic(IntrinsicID, Tys);
}

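// Scalar (SISD) builtins mostly reuse the vector intrinsics: scalar operands
// that are narrower than the intrinsic expects are inserted into lane 0 of a
// vector, and a scalar result is extracted back out of lane 0 afterwards.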
static Value *EmitCommonNeonSISDBuiltinExpr(
    CodeGenFunction &CGF, const ARMVectorIntrinsicInfo &SISDInfo,
    SmallVectorImpl<Value *> &Ops, const CallExpr *E) {
  unsigned BuiltinID = SISDInfo.BuiltinID;
  unsigned int Int = SISDInfo.LLVMIntrinsic;
  unsigned Modifier = SISDInfo.TypeModifier;
  const char *s = SISDInfo.NameHint;

  switch (BuiltinID) {
  case NEON::BI__builtin_neon_vcled_s64:
  case NEON::BI__builtin_neon_vcled_u64:
  case NEON::BI__builtin_neon_vcles_f32:
  case NEON::BI__builtin_neon_vcled_f64:
  case NEON::BI__builtin_neon_vcltd_s64:
  case NEON::BI__builtin_neon_vcltd_u64:
  case NEON::BI__builtin_neon_vclts_f32:
  case NEON::BI__builtin_neon_vcltd_f64:
  case NEON::BI__builtin_neon_vcales_f32:
  case NEON::BI__builtin_neon_vcaled_f64:
  case NEON::BI__builtin_neon_vcalts_f32:
  case NEON::BI__builtin_neon_vcaltd_f64:
    // Only one direction of comparisons actually exists: cmle is really a
    // cmge with swapped operands. The table gives us the right intrinsic, but
    // we still need to do the swap.
    std::swap(Ops[0], Ops[1]);
    break;
  }

  assert(Int && "Generic code assumes a valid intrinsic");

  // Determine the type(s) of this overloaded AArch64 intrinsic.
  const Expr *Arg = E->getArg(0);
  llvm::Type *ArgTy = CGF.ConvertType(Arg->getType());
  Function *F = CGF.LookupNeonLLVMIntrinsic(Int, Modifier, ArgTy, E);

  int j = 0;
  ConstantInt *C0 = ConstantInt::get(CGF.SizeTy, 0);
  for (Function::const_arg_iterator ai = F->arg_begin(), ae = F->arg_end();
       ai != ae; ++ai, ++j) {
    llvm::Type *ArgTy = ai->getType();
    if (Ops[j]->getType()->getPrimitiveSizeInBits() ==
        ArgTy->getPrimitiveSizeInBits())
      continue;

    assert(ArgTy->isVectorTy() && !Ops[j]->getType()->isVectorTy());
    // The constant argument to an _n_ intrinsic always has Int32Ty, so
    // truncate it before inserting.
    Ops[j] = CGF.Builder.CreateTruncOrBitCast(
        Ops[j], cast<llvm::VectorType>(ArgTy)->getElementType());
    Ops[j] =
        CGF.Builder.CreateInsertElement(PoisonValue::get(ArgTy), Ops[j], C0);
  }

  Value *Result = CGF.EmitNeonCall(F, Ops, s);
  llvm::Type *ResultType = CGF.ConvertType(E->getType());
  if (ResultType->getPrimitiveSizeInBits().getFixedValue() <
      Result->getType()->getPrimitiveSizeInBits().getFixedValue())
    return CGF.Builder.CreateExtractElement(Result, C0);

  return CGF.Builder.CreateBitCast(Result, ResultType, s);
}

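// Shared NEON lowering used by both the 32-bit ARM and AArch64 builtin
// emitters for intrinsics whose codegen is the same on both targets; the
// intrinsic tables supply the LLVM intrinsic IDs and type-modifier flags.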
Value *CodeGenFunction::EmitCommonNeonBuiltinExpr(
    unsigned BuiltinID, unsigned LLVMIntrinsic, unsigned AltLLVMIntrinsic,
    const char *NameHint, unsigned Modifier, const CallExpr *E,
    SmallVectorImpl<llvm::Value *> &Ops, Address PtrOp0, Address PtrOp1,
    llvm::Triple::ArchType Arch) {
  // Get the last argument, which specifies the vector type.
  const Expr *Arg = E->getArg(E->getNumArgs() - 1);
  std::optional<llvm::APSInt> NeonTypeConst =
      Arg->getIntegerConstantExpr(getContext());
  if (!NeonTypeConst)
    return nullptr;

  // Determine the type of this overloaded NEON intrinsic.
  NeonTypeFlags Type(NeonTypeConst->getZExtValue());
  const bool Usgn = Type.isUnsigned();
  const bool Quad = Type.isQuad();
  const bool Floating = Type.isFloatingPoint();
  const bool HasFastHalfType = getTarget().hasFastHalfType();
  const bool AllowBFloatArgsAndRet =
      getTargetHooks().getABIInfo().allowBFloatArgsAndRet();

  llvm::FixedVectorType *VTy =
      GetNeonType(this, Type, HasFastHalfType, false, AllowBFloatArgsAndRet);
  llvm::Type *Ty = VTy;
  if (!Ty)
    return nullptr;

  auto getAlignmentValue32 = [&](Address addr) -> Value* {
    return Builder.getInt32(addr.getAlignment().getQuantity());
  };

  unsigned Int = LLVMIntrinsic;
  if ((Modifier & UnsignedAlts) && !Usgn)
    Int = AltLLVMIntrinsic;

  switch (BuiltinID) {
  default: break;
  case NEON::BI__builtin_neon_splat_lane_v:
  case NEON::BI__builtin_neon_splat_laneq_v:
  case NEON::BI__builtin_neon_splatq_lane_v:
  case NEON::BI__builtin_neon_splatq_laneq_v: {
    auto NumElements = VTy->getElementCount();
    if (BuiltinID == NEON::BI__builtin_neon_splatq_lane_v)
      NumElements = NumElements * 2;
    if (BuiltinID == NEON::BI__builtin_neon_splat_laneq_v)
      NumElements = NumElements.divideCoefficientBy(2);

    Ops[0] = Builder.CreateBitCast(Ops[0], VTy);
    return EmitNeonSplat(Ops[0], cast<ConstantInt>(Ops[1]), NumElements);
  }
  case NEON::BI__builtin_neon_vpadd_v:
  case NEON::BI__builtin_neon_vpaddq_v:
    // We don't allow fp/int overloading of intrinsics.
    if (VTy->getElementType()->isFloatingPointTy() &&
        Int == Intrinsic::aarch64_neon_addp)
      Int = Intrinsic::aarch64_neon_faddp;
    break;
  case NEON::BI__builtin_neon_vabs_v:
  case NEON::BI__builtin_neon_vabsq_v:
    if (VTy->getElementType()->isFloatingPointTy())
      return EmitNeonCall(CGM.getIntrinsic(Intrinsic::fabs, Ty), Ops, "vabs");
    return EmitNeonCall(CGM.getIntrinsic(LLVMIntrinsic, Ty), Ops, "vabs");
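  // Polynomial vadd: polynomial addition over GF(2) is a bitwise XOR, so the
  // operands are reinterpreted as byte vectors and XORed below.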
  case NEON::BI__builtin_neon_vadd_v:
  case NEON::BI__builtin_neon_vaddq_v: {
    llvm::Type *VTy = llvm::FixedVectorType::get(Int8Ty, Quad ? 16 : 8);
    Ops[0] = Builder.CreateBitCast(Ops[0], VTy);
    Ops[1] = Builder.CreateBitCast(Ops[1], VTy);
    Ops[0] = Builder.CreateXor(Ops[0], Ops[1]);
    return Builder.CreateBitCast(Ops[0], Ty);
  }
  case NEON::BI__builtin_neon_vaddhn_v: {
    llvm::FixedVectorType *SrcTy =
        llvm::FixedVectorType::getExtendedElementVectorType(VTy);

    // %sum = add <4 x i32> %lhs, %rhs
    Ops[0] = Builder.CreateBitCast(Ops[0], SrcTy);
    Ops[1] = Builder.CreateBitCast(Ops[1], SrcTy);
    Ops[0] = Builder.CreateAdd(Ops[0], Ops[1], "vaddhn");

    // %high = lshr <4 x i32> %sum, <i32 16, i32 16, i32 16, i32 16>
    Constant *ShiftAmt =
        ConstantInt::get(SrcTy, SrcTy->getScalarSizeInBits() / 2);
    Ops[0] = Builder.CreateLShr(Ops[0], ShiftAmt, "vaddhn");

    // %res = trunc <4 x i32> %high to <4 x i16>
    return Builder.CreateTrunc(Ops[0], VTy, "vaddhn");
  }
  case NEON::BI__builtin_neon_vcale_v:
  case NEON::BI__builtin_neon_vcaleq_v:
  case NEON::BI__builtin_neon_vcalt_v:
  case NEON::BI__builtin_neon_vcaltq_v:
    std::swap(Ops[0], Ops[1]);
    [[fallthrough]];
  case NEON::BI__builtin_neon_vcage_v:
  case NEON::BI__builtin_neon_vcageq_v:
  case NEON::BI__builtin_neon_vcagt_v:
  case NEON::BI__builtin_neon_vcagtq_v: {
    llvm::Type *Ty;
    switch (VTy->getScalarSizeInBits()) {
    default: llvm_unreachable("unexpected type");
    case 32:
      Ty = FloatTy;
      break;
    case 64:
      Ty = DoubleTy;
      break;
    case 16:
      Ty = HalfTy;
      break;
    }
    auto *VecFlt = llvm::FixedVectorType::get(Ty, VTy->getNumElements());
    llvm::Type *Tys[] = { VTy, VecFlt };
    Function *F = CGM.getIntrinsic(LLVMIntrinsic, Tys);
    return EmitNeonCall(F, Ops, NameHint);
  }
  case NEON::BI__builtin_neon_vceqz_v:
  case NEON::BI__builtin_neon_vceqzq_v:
    return EmitAArch64CompareBuiltinExpr(
        Ops[0], Ty, Floating ? ICmpInst::FCMP_OEQ : ICmpInst::ICMP_EQ, "vceqz");
  case NEON::BI__builtin_neon_vcgez_v:
  case NEON::BI__builtin_neon_vcgezq_v:
    return EmitAArch64CompareBuiltinExpr(
        Ops[0], Ty, Floating ? ICmpInst::FCMP_OGE : ICmpInst::ICMP_SGE,
        "vcgez");
  case NEON::BI__builtin_neon_vclez_v:
  case NEON::BI__builtin_neon_vclezq_v:
    return EmitAArch64CompareBuiltinExpr(
        Ops[0], Ty, Floating ? ICmpInst::FCMP_OLE : ICmpInst::ICMP_SLE,
        "vclez");
  case NEON::BI__builtin_neon_vcgtz_v:
  case NEON::BI__builtin_neon_vcgtzq_v:
    return EmitAArch64CompareBuiltinExpr(
        Ops[0], Ty, Floating ? ICmpInst::FCMP_OGT : ICmpInst::ICMP_SGT,
        "vcgtz");
  case NEON::BI__builtin_neon_vcltz_v:
  case NEON::BI__builtin_neon_vcltzq_v:
    return EmitAArch64CompareBuiltinExpr(
        Ops[0], Ty, Floating ? ICmpInst::FCMP_OLT : ICmpInst::ICMP_SLT,
        "vcltz");
  case NEON::BI__builtin_neon_vclz_v:
  case NEON::BI__builtin_neon_vclzq_v:
    // We generate a target-independent intrinsic, which needs a second
    // argument for whether or not clz of zero is undefined; on ARM it isn't.
    Ops.push_back(Builder.getInt1(getTarget().isCLZForZeroUndef()));
    break;
  case NEON::BI__builtin_neon_vcvt_f32_v:
  case NEON::BI__builtin_neon_vcvtq_f32_v:
    Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
    Ty = GetNeonType(this, NeonTypeFlags(NeonTypeFlags::Float32, false, Quad),
                     HasFastHalfType);
    return Usgn ? Builder.CreateUIToFP(Ops[0], Ty, "vcvt")
                : Builder.CreateSIToFP(Ops[0], Ty, "vcvt");
  case NEON::BI__builtin_neon_vcvt_f16_s16:
  case NEON::BI__builtin_neon_vcvt_f16_u16:
  case NEON::BI__builtin_neon_vcvtq_f16_s16:
  case NEON::BI__builtin_neon_vcvtq_f16_u16:
    Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
    Ty = GetNeonType(this, NeonTypeFlags(NeonTypeFlags::Float16, false, Quad),
                     HasFastHalfType);
    return Usgn ? Builder.CreateUIToFP(Ops[0], Ty, "vcvt")
                : Builder.CreateSIToFP(Ops[0], Ty, "vcvt");
  case NEON::BI__builtin_neon_vcvt_n_f16_s16:
  case NEON::BI__builtin_neon_vcvt_n_f16_u16:
  case NEON::BI__builtin_neon_vcvtq_n_f16_s16:
  case NEON::BI__builtin_neon_vcvtq_n_f16_u16: {
    llvm::Type *Tys[2] = { GetFloatNeonType(this, Type), Ty };
    Function *F = CGM.getIntrinsic(Int, Tys);
    return EmitNeonCall(F, Ops, "vcvt_n");
  }
  case NEON::BI__builtin_neon_vcvt_n_f32_v:
  case NEON::BI__builtin_neon_vcvt_n_f64_v:
  case NEON::BI__builtin_neon_vcvtq_n_f32_v:
  case NEON::BI__builtin_neon_vcvtq_n_f64_v: {
    llvm::Type *Tys[2] = { GetFloatNeonType(this, Type), Ty };
    Int = Usgn ? LLVMIntrinsic : AltLLVMIntrinsic;
    Function *F = CGM.getIntrinsic(Int, Tys);
    return EmitNeonCall(F, Ops, "vcvt_n");
  }
  case NEON::BI__builtin_neon_vcvt_n_s16_f16:
  case NEON::BI__builtin_neon_vcvt_n_s32_v:
  case NEON::BI__builtin_neon_vcvt_n_u16_f16:
  case NEON::BI__builtin_neon_vcvt_n_u32_v:
  case NEON::BI__builtin_neon_vcvt_n_s64_v:
  case NEON::BI__builtin_neon_vcvt_n_u64_v:
  case NEON::BI__builtin_neon_vcvtq_n_s16_f16:
  case NEON::BI__builtin_neon_vcvtq_n_s32_v:
  case NEON::BI__builtin_neon_vcvtq_n_u16_f16:
  case NEON::BI__builtin_neon_vcvtq_n_u32_v:
  case NEON::BI__builtin_neon_vcvtq_n_s64_v:
  case NEON::BI__builtin_neon_vcvtq_n_u64_v: {
    llvm::Type *Tys[2] = { Ty, GetFloatNeonType(this, Type) };
    Function *F = CGM.getIntrinsic(LLVMIntrinsic, Tys);
    return EmitNeonCall(F, Ops, "vcvt_n");
  }
  case NEON::BI__builtin_neon_vcvt_s32_v:
  case NEON::BI__builtin_neon_vcvt_u32_v:
  case NEON::BI__builtin_neon_vcvt_s64_v:
  case NEON::BI__builtin_neon_vcvt_u64_v:
  case NEON::BI__builtin_neon_vcvt_s16_f16:
  case NEON::BI__builtin_neon_vcvt_u16_f16:
  case NEON::BI__builtin_neon_vcvtq_s32_v:
  case NEON::BI__builtin_neon_vcvtq_u32_v:
  case NEON::BI__builtin_neon_vcvtq_s64_v:
  case NEON::BI__builtin_neon_vcvtq_u64_v:
  case NEON::BI__builtin_neon_vcvtq_s16_f16:
  case NEON::BI__builtin_neon_vcvtq_u16_f16: {
    Ops[0] = Builder.CreateBitCast(Ops[0], GetFloatNeonType(this, Type));
    return Usgn ? Builder.CreateFPToUI(Ops[0], Ty, "vcvt")
                : Builder.CreateFPToSI(Ops[0], Ty, "vcvt");
  }
  case NEON::BI__builtin_neon_vcvta_s16_f16:
  case NEON::BI__builtin_neon_vcvta_s32_v:
  case NEON::BI__builtin_neon_vcvta_s64_v:
  case NEON::BI__builtin_neon_vcvta_u16_f16:
  case NEON::BI__builtin_neon_vcvta_u32_v:
  case NEON::BI__builtin_neon_vcvta_u64_v:
  case NEON::BI__builtin_neon_vcvtaq_s16_f16:
  case NEON::BI__builtin_neon_vcvtaq_s32_v:
  case NEON::BI__builtin_neon_vcvtaq_s64_v:
  case NEON::BI__builtin_neon_vcvtaq_u16_f16:
  case NEON::BI__builtin_neon_vcvtaq_u32_v:
  case NEON::BI__builtin_neon_vcvtaq_u64_v:
  case NEON::BI__builtin_neon_vcvtn_s16_f16:
  case NEON::BI__builtin_neon_vcvtn_s32_v:
  case NEON::BI__builtin_neon_vcvtn_s64_v:
  case NEON::BI__builtin_neon_vcvtn_u16_f16:
  case NEON::BI__builtin_neon_vcvtn_u32_v:
  case NEON::BI__builtin_neon_vcvtn_u64_v:
  case NEON::BI__builtin_neon_vcvtnq_s16_f16:
  case NEON::BI__builtin_neon_vcvtnq_s32_v:
  case NEON::BI__builtin_neon_vcvtnq_s64_v:
  case NEON::BI__builtin_neon_vcvtnq_u16_f16:
  case NEON::BI__builtin_neon_vcvtnq_u32_v:
  case NEON::BI__builtin_neon_vcvtnq_u64_v:
  case NEON::BI__builtin_neon_vcvtp_s16_f16:
  case NEON::BI__builtin_neon_vcvtp_s32_v:
  case NEON::BI__builtin_neon_vcvtp_s64_v:
  case NEON::BI__builtin_neon_vcvtp_u16_f16:
  case NEON::BI__builtin_neon_vcvtp_u32_v:
  case NEON::BI__builtin_neon_vcvtp_u64_v:
  case NEON::BI__builtin_neon_vcvtpq_s16_f16:
  case NEON::BI__builtin_neon_vcvtpq_s32_v:
  case NEON::BI__builtin_neon_vcvtpq_s64_v:
  case NEON::BI__builtin_neon_vcvtpq_u16_f16:
  case NEON::BI__builtin_neon_vcvtpq_u32_v:
  case NEON::BI__builtin_neon_vcvtpq_u64_v:
  case NEON::BI__builtin_neon_vcvtm_s16_f16:
  case NEON::BI__builtin_neon_vcvtm_s32_v:
  case NEON::BI__builtin_neon_vcvtm_s64_v:
  case NEON::BI__builtin_neon_vcvtm_u16_f16:
  case NEON::BI__builtin_neon_vcvtm_u32_v:
  case NEON::BI__builtin_neon_vcvtm_u64_v:
  case NEON::BI__builtin_neon_vcvtmq_s16_f16:
  case NEON::BI__builtin_neon_vcvtmq_s32_v:
  case NEON::BI__builtin_neon_vcvtmq_s64_v:
  case NEON::BI__builtin_neon_vcvtmq_u16_f16:
  case NEON::BI__builtin_neon_vcvtmq_u32_v:
  case NEON::BI__builtin_neon_vcvtmq_u64_v: {
    llvm::Type *Tys[2] = { Ty, GetFloatNeonType(this, Type) };
    return EmitNeonCall(CGM.getIntrinsic(LLVMIntrinsic, Tys), Ops, NameHint);
  }
  case NEON::BI__builtin_neon_vcvtx_f32_v: {
    llvm::Type *Tys[2] = { VTy->getTruncatedElementVectorType(VTy), Ty};
    return EmitNeonCall(CGM.getIntrinsic(LLVMIntrinsic, Tys), Ops, NameHint);

  }
  case NEON::BI__builtin_neon_vext_v:
  case NEON::BI__builtin_neon_vextq_v: {
    int CV = cast<ConstantInt>(Ops[2])->getSExtValue();
    SmallVector<int, 16> Indices;
    for (unsigned i = 0, e = VTy->getNumElements(); i != e; ++i)
      Indices.push_back(i+CV);

    Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
    Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
    return Builder.CreateShuffleVector(Ops[0], Ops[1], Indices, "vext");
  }
  case NEON::BI__builtin_neon_vfma_v:
  case NEON::BI__builtin_neon_vfmaq_v: {
    Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
    Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
    Ops[2] = Builder.CreateBitCast(Ops[2], Ty);

    // NEON intrinsic puts accumulator first, unlike the LLVM fma.
    return emitCallMaybeConstrainedFPBuiltin(
        *this, Intrinsic::fma, Intrinsic::experimental_constrained_fma, Ty,
        {Ops[1], Ops[2], Ops[0]});
  }
  case NEON::BI__builtin_neon_vld1_v:
  case NEON::BI__builtin_neon_vld1q_v: {
    llvm::Type *Tys[] = {Ty, Int8PtrTy};
    Ops.push_back(getAlignmentValue32(PtrOp0));
    return EmitNeonCall(CGM.getIntrinsic(LLVMIntrinsic, Tys), Ops, "vld1");
  }
  case NEON::BI__builtin_neon_vld1_x2_v:
  case NEON::BI__builtin_neon_vld1q_x2_v:
  case NEON::BI__builtin_neon_vld1_x3_v:
  case NEON::BI__builtin_neon_vld1q_x3_v:
  case NEON::BI__builtin_neon_vld1_x4_v:
  case NEON::BI__builtin_neon_vld1q_x4_v: {
    llvm::Type *Tys[2] = {VTy, DefaultPtrTy};
    Function *F = CGM.getIntrinsic(LLVMIntrinsic, Tys);
    Ops[1] = Builder.CreateCall(F, Ops[1], "vld1xN");
    return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
  }
  case NEON::BI__builtin_neon_vld2_v:
  case NEON::BI__builtin_neon_vld2q_v:
  case NEON::BI__builtin_neon_vld3_v:
  case NEON::BI__builtin_neon_vld3q_v:
  case NEON::BI__builtin_neon_vld4_v:
  case NEON::BI__builtin_neon_vld4q_v:
  case NEON::BI__builtin_neon_vld2_dup_v:
  case NEON::BI__builtin_neon_vld2q_dup_v:
  case NEON::BI__builtin_neon_vld3_dup_v:
  case NEON::BI__builtin_neon_vld3q_dup_v:
  case NEON::BI__builtin_neon_vld4_dup_v:
  case NEON::BI__builtin_neon_vld4q_dup_v: {
    llvm::Type *Tys[] = {Ty, Int8PtrTy};
    Function *F = CGM.getIntrinsic(LLVMIntrinsic, Tys);
    Value *Align = getAlignmentValue32(PtrOp1);
    Ops[1] = Builder.CreateCall(F, {Ops[1], Align}, NameHint);
    return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
  }
  case NEON::BI__builtin_neon_vld1_dup_v:
  case NEON::BI__builtin_neon_vld1q_dup_v: {
    Value *V = PoisonValue::get(Ty);
    PtrOp0 = PtrOp0.withElementType(VTy->getElementType());
    LoadInst *Ld = Builder.CreateLoad(PtrOp0);
    llvm::Constant *CI = ConstantInt::get(SizeTy, 0);
    Ops[0] = Builder.CreateInsertElement(V, Ld, CI);
    return EmitNeonSplat(Ops[0], CI);
  }
  case NEON::BI__builtin_neon_vld2_lane_v:
  case NEON::BI__builtin_neon_vld2q_lane_v:
  case NEON::BI__builtin_neon_vld3_lane_v:
  case NEON::BI__builtin_neon_vld3q_lane_v:
  case NEON::BI__builtin_neon_vld4_lane_v:
  case NEON::BI__builtin_neon_vld4q_lane_v: {
    llvm::Type *Tys[] = {Ty, Int8PtrTy};
    Function *F = CGM.getIntrinsic(LLVMIntrinsic, Tys);
    for (unsigned I = 2; I < Ops.size() - 1; ++I)
      Ops[I] = Builder.CreateBitCast(Ops[I], Ty);
    Ops.push_back(getAlignmentValue32(PtrOp1));
    Ops[1] = Builder.CreateCall(F, ArrayRef(Ops).slice(1), NameHint);
    return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
  }
  case NEON::BI__builtin_neon_vmovl_v: {
    llvm::FixedVectorType *DTy =
        llvm::FixedVectorType::getTruncatedElementVectorType(VTy);
    Ops[0] = Builder.CreateBitCast(Ops[0], DTy);
    if (Usgn)
      return Builder.CreateZExt(Ops[0], Ty, "vmovl");
    return Builder.CreateSExt(Ops[0], Ty, "vmovl");
  }
  case NEON::BI__builtin_neon_vmovn_v: {
    llvm::FixedVectorType *QTy =
        llvm::FixedVectorType::getExtendedElementVectorType(VTy);
    Ops[0] = Builder.CreateBitCast(Ops[0], QTy);
    return Builder.CreateTrunc(Ops[0], Ty, "vmovn");
  }
2146 case NEON::BI__builtin_neon_vmull_v:
2147 // FIXME: the integer vmull operations could be emitted in terms of pure
2148 // LLVM IR (2 exts followed by a mul). Unfortunately LLVM has a habit of
2149 // hoisting the exts outside loops. Until global ISel comes along that can
2150 // see through such movement this leads to bad CodeGen. So we need an
2151 // intrinsic for now.
2152 Int = Usgn ? Intrinsic::arm_neon_vmullu : Intrinsic::arm_neon_vmulls;
2153 Int = Type.isPoly() ? (unsigned)Intrinsic::arm_neon_vmullp : Int;
2154 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: Ty), Ops, name: "vmull");
2155 case NEON::BI__builtin_neon_vpadal_v:
2156 case NEON::BI__builtin_neon_vpadalq_v: {
2157 // The source operand type has twice as many elements of half the size.
2158 unsigned EltBits = VTy->getElementType()->getPrimitiveSizeInBits();
2159 llvm::Type *EltTy =
2160 llvm::IntegerType::get(C&: getLLVMContext(), NumBits: EltBits / 2);
2161 auto *NarrowTy =
2162 llvm::FixedVectorType::get(ElementType: EltTy, NumElts: VTy->getNumElements() * 2);
2163 llvm::Type *Tys[2] = { Ty, NarrowTy };
2164 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys), Ops, name: NameHint);
2165 }
2166 case NEON::BI__builtin_neon_vpaddl_v:
2167 case NEON::BI__builtin_neon_vpaddlq_v: {
2168 // The source operand type has twice as many elements of half the size.
2169 unsigned EltBits = VTy->getElementType()->getPrimitiveSizeInBits();
2170 llvm::Type *EltTy = llvm::IntegerType::get(C&: getLLVMContext(), NumBits: EltBits / 2);
2171 auto *NarrowTy =
2172 llvm::FixedVectorType::get(ElementType: EltTy, NumElts: VTy->getNumElements() * 2);
2173 llvm::Type *Tys[2] = { Ty, NarrowTy };
2174 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys), Ops, name: "vpaddl");
2175 }
2176 case NEON::BI__builtin_neon_vqdmlal_v:
2177 case NEON::BI__builtin_neon_vqdmlsl_v: {
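    // First emit the saturating doubling multiply of the last two operands
    // (LLVMIntrinsic), then combine the product with the accumulator in
    // Ops[0] via AltLLVMIntrinsic.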
2178 SmallVector<Value *, 2> MulOps(Ops.begin() + 1, Ops.end());
2179 Ops[1] =
2180 EmitNeonCall(F: CGM.getIntrinsic(IID: LLVMIntrinsic, Tys: Ty), Ops&: MulOps, name: "vqdmlal");
2181 Ops.resize(N: 2);
2182 return EmitNeonCall(F: CGM.getIntrinsic(IID: AltLLVMIntrinsic, Tys: Ty), Ops, name: NameHint);
2183 }
2184 case NEON::BI__builtin_neon_vqdmulhq_lane_v:
2185 case NEON::BI__builtin_neon_vqdmulh_lane_v:
2186 case NEON::BI__builtin_neon_vqrdmulhq_lane_v:
2187 case NEON::BI__builtin_neon_vqrdmulh_lane_v: {
2188 auto *RTy = cast<llvm::FixedVectorType>(Val: Ty);
2189 if (BuiltinID == NEON::BI__builtin_neon_vqdmulhq_lane_v ||
2190 BuiltinID == NEON::BI__builtin_neon_vqrdmulhq_lane_v)
2191 RTy = llvm::FixedVectorType::get(ElementType: RTy->getElementType(),
2192 NumElts: RTy->getNumElements() * 2);
2193 llvm::Type *Tys[2] = {
2194 RTy, GetNeonType(CGF: this, TypeFlags: NeonTypeFlags(Type.getEltType(), false,
2195 /*isQuad*/ false))};
2196 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys), Ops, name: NameHint);
2197 }
2198 case NEON::BI__builtin_neon_vqdmulhq_laneq_v:
2199 case NEON::BI__builtin_neon_vqdmulh_laneq_v:
2200 case NEON::BI__builtin_neon_vqrdmulhq_laneq_v:
2201 case NEON::BI__builtin_neon_vqrdmulh_laneq_v: {
2202 llvm::Type *Tys[2] = {
2203 Ty, GetNeonType(CGF: this, TypeFlags: NeonTypeFlags(Type.getEltType(), false,
2204 /*isQuad*/ true))};
2205 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys), Ops, name: NameHint);
2206 }
2207 case NEON::BI__builtin_neon_vqshl_n_v:
2208 case NEON::BI__builtin_neon_vqshlq_n_v:
2209 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: Ty), Ops, name: "vqshl_n",
2210 shift: 1, rightshift: false);
2211 case NEON::BI__builtin_neon_vqshlu_n_v:
2212 case NEON::BI__builtin_neon_vqshluq_n_v:
2213 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: Ty), Ops, name: "vqshlu_n",
2214 shift: 1, rightshift: false);
2215 case NEON::BI__builtin_neon_vrecpe_v:
2216 case NEON::BI__builtin_neon_vrecpeq_v:
2217 case NEON::BI__builtin_neon_vrsqrte_v:
2218 case NEON::BI__builtin_neon_vrsqrteq_v:
2219 Int = Ty->isFPOrFPVectorTy() ? LLVMIntrinsic : AltLLVMIntrinsic;
2220 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: Ty), Ops, name: NameHint);
2221 case NEON::BI__builtin_neon_vrndi_v:
2222 case NEON::BI__builtin_neon_vrndiq_v:
2223 Int = Builder.getIsFPConstrained()
2224 ? Intrinsic::experimental_constrained_nearbyint
2225 : Intrinsic::nearbyint;
2226 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: Ty), Ops, name: NameHint);
2227 case NEON::BI__builtin_neon_vrshr_n_v:
2228 case NEON::BI__builtin_neon_vrshrq_n_v:
2229 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: Ty), Ops, name: "vrshr_n",
2230 shift: 1, rightshift: true);
2231 case NEON::BI__builtin_neon_vsha512hq_u64:
2232 case NEON::BI__builtin_neon_vsha512h2q_u64:
2233 case NEON::BI__builtin_neon_vsha512su0q_u64:
2234 case NEON::BI__builtin_neon_vsha512su1q_u64: {
2235 Function *F = CGM.getIntrinsic(IID: Int);
2236 return EmitNeonCall(F, Ops, name: "");
2237 }
2238 case NEON::BI__builtin_neon_vshl_n_v:
2239 case NEON::BI__builtin_neon_vshlq_n_v:
2240 Ops[1] = EmitNeonShiftVector(V: Ops[1], Ty, neg: false);
2241 return Builder.CreateShl(LHS: Builder.CreateBitCast(V: Ops[0],DestTy: Ty), RHS: Ops[1],
2242 Name: "vshl_n");
2243 case NEON::BI__builtin_neon_vshll_n_v: {
2244 llvm::FixedVectorType *SrcTy =
2245 llvm::FixedVectorType::getTruncatedElementVectorType(VTy);
2246 Ops[0] = Builder.CreateBitCast(V: Ops[0], DestTy: SrcTy);
2247 if (Usgn)
2248 Ops[0] = Builder.CreateZExt(V: Ops[0], DestTy: VTy);
2249 else
2250 Ops[0] = Builder.CreateSExt(V: Ops[0], DestTy: VTy);
2251 Ops[1] = EmitNeonShiftVector(V: Ops[1], Ty: VTy, neg: false);
2252 return Builder.CreateShl(LHS: Ops[0], RHS: Ops[1], Name: "vshll_n");
2253 }
2254 case NEON::BI__builtin_neon_vshrn_n_v: {
2255 llvm::FixedVectorType *SrcTy =
2256 llvm::FixedVectorType::getExtendedElementVectorType(VTy);
2257 Ops[0] = Builder.CreateBitCast(V: Ops[0], DestTy: SrcTy);
2258 Ops[1] = EmitNeonShiftVector(V: Ops[1], Ty: SrcTy, neg: false);
2259 if (Usgn)
2260 Ops[0] = Builder.CreateLShr(LHS: Ops[0], RHS: Ops[1]);
2261 else
2262 Ops[0] = Builder.CreateAShr(LHS: Ops[0], RHS: Ops[1]);
2263 return Builder.CreateTrunc(V: Ops[0], DestTy: Ty, Name: "vshrn_n");
2264 }
2265 case NEON::BI__builtin_neon_vshr_n_v:
2266 case NEON::BI__builtin_neon_vshrq_n_v:
2267 return EmitNeonRShiftImm(Vec: Ops[0], Shift: Ops[1], Ty, usgn: Usgn, name: "vshr_n");
2268 case NEON::BI__builtin_neon_vst1_v:
2269 case NEON::BI__builtin_neon_vst1q_v:
2270 case NEON::BI__builtin_neon_vst2_v:
2271 case NEON::BI__builtin_neon_vst2q_v:
2272 case NEON::BI__builtin_neon_vst3_v:
2273 case NEON::BI__builtin_neon_vst3q_v:
2274 case NEON::BI__builtin_neon_vst4_v:
2275 case NEON::BI__builtin_neon_vst4q_v:
2276 case NEON::BI__builtin_neon_vst2_lane_v:
2277 case NEON::BI__builtin_neon_vst2q_lane_v:
2278 case NEON::BI__builtin_neon_vst3_lane_v:
2279 case NEON::BI__builtin_neon_vst3q_lane_v:
2280 case NEON::BI__builtin_neon_vst4_lane_v:
2281 case NEON::BI__builtin_neon_vst4q_lane_v: {
2282 llvm::Type *Tys[] = {Int8PtrTy, Ty};
2283 Ops.push_back(Elt: getAlignmentValue32(PtrOp0));
2284 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys), Ops, name: "");
2285 }
2286 case NEON::BI__builtin_neon_vsm3partw1q_u32:
2287 case NEON::BI__builtin_neon_vsm3partw2q_u32:
2288 case NEON::BI__builtin_neon_vsm3ss1q_u32:
2289 case NEON::BI__builtin_neon_vsm4ekeyq_u32:
2290 case NEON::BI__builtin_neon_vsm4eq_u32: {
2291 Function *F = CGM.getIntrinsic(IID: Int);
2292 return EmitNeonCall(F, Ops, name: "");
2293 }
2294 case NEON::BI__builtin_neon_vsm3tt1aq_u32:
2295 case NEON::BI__builtin_neon_vsm3tt1bq_u32:
2296 case NEON::BI__builtin_neon_vsm3tt2aq_u32:
2297 case NEON::BI__builtin_neon_vsm3tt2bq_u32: {
2298 Function *F = CGM.getIntrinsic(IID: Int);
2299 Ops[3] = Builder.CreateZExt(V: Ops[3], DestTy: Int64Ty);
2300 return EmitNeonCall(F, Ops, name: "");
2301 }
2302 case NEON::BI__builtin_neon_vst1_x2_v:
2303 case NEON::BI__builtin_neon_vst1q_x2_v:
2304 case NEON::BI__builtin_neon_vst1_x3_v:
2305 case NEON::BI__builtin_neon_vst1q_x3_v:
2306 case NEON::BI__builtin_neon_vst1_x4_v:
2307 case NEON::BI__builtin_neon_vst1q_x4_v: {
2308 // TODO: Currently in AArch32 mode the pointer operand comes first, whereas
2309 // in AArch64 it comes last. We may want to standardize on one or the other.
2310 if (Arch == llvm::Triple::aarch64 || Arch == llvm::Triple::aarch64_be ||
2311 Arch == llvm::Triple::aarch64_32) {
2312 llvm::Type *Tys[2] = {VTy, DefaultPtrTy};
2313 std::rotate(first: Ops.begin(), middle: Ops.begin() + 1, last: Ops.end());
2314 return EmitNeonCall(F: CGM.getIntrinsic(IID: LLVMIntrinsic, Tys), Ops, name: "");
2315 }
2316 llvm::Type *Tys[2] = {DefaultPtrTy, VTy};
2317 return EmitNeonCall(F: CGM.getIntrinsic(IID: LLVMIntrinsic, Tys), Ops, name: "");
2318 }
2319 case NEON::BI__builtin_neon_vsubhn_v: {
2320 llvm::FixedVectorType *SrcTy =
2321 llvm::FixedVectorType::getExtendedElementVectorType(VTy);
2322
2323 // %sum = add <4 x i32> %lhs, %rhs
2324 Ops[0] = Builder.CreateBitCast(V: Ops[0], DestTy: SrcTy);
2325 Ops[1] = Builder.CreateBitCast(V: Ops[1], DestTy: SrcTy);
2326 Ops[0] = Builder.CreateSub(LHS: Ops[0], RHS: Ops[1], Name: "vsubhn");
2327
2328 // %high = lshr <4 x i32> %sum, <i32 16, i32 16, i32 16, i32 16>
2329 Constant *ShiftAmt =
2330 ConstantInt::get(Ty: SrcTy, V: SrcTy->getScalarSizeInBits() / 2);
2331 Ops[0] = Builder.CreateLShr(LHS: Ops[0], RHS: ShiftAmt, Name: "vsubhn");
2332
2333 // %res = trunc <4 x i32> %high to <4 x i16>
2334 return Builder.CreateTrunc(V: Ops[0], DestTy: VTy, Name: "vsubhn");
2335 }
2336 case NEON::BI__builtin_neon_vtrn_v:
2337 case NEON::BI__builtin_neon_vtrnq_v: {
2338 Ops[1] = Builder.CreateBitCast(V: Ops[1], DestTy: Ty);
2339 Ops[2] = Builder.CreateBitCast(V: Ops[2], DestTy: Ty);
2340 Value *SV = nullptr;
2341
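    // vtrn returns two vectors: iteration vi == 0 builds the shuffle that
    // transposes the even lanes and vi == 1 the one for the odd lanes; each
    // result is stored into slot vi of the sret buffer pointed to by Ops[0].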
2342 for (unsigned vi = 0; vi != 2; ++vi) {
2343 SmallVector<int, 16> Indices;
2344 for (unsigned i = 0, e = VTy->getNumElements(); i != e; i += 2) {
2345 Indices.push_back(Elt: i+vi);
2346 Indices.push_back(Elt: i+e+vi);
2347 }
2348 Value *Addr = Builder.CreateConstInBoundsGEP1_32(Ty, Ptr: Ops[0], Idx0: vi);
2349 SV = Builder.CreateShuffleVector(V1: Ops[1], V2: Ops[2], Mask: Indices, Name: "vtrn");
2350 SV = Builder.CreateDefaultAlignedStore(Val: SV, Addr);
2351 }
2352 return SV;
2353 }
2354 case NEON::BI__builtin_neon_vtst_v:
2355 case NEON::BI__builtin_neon_vtstq_v: {
2356 Ops[0] = Builder.CreateBitCast(V: Ops[0], DestTy: Ty);
2357 Ops[1] = Builder.CreateBitCast(V: Ops[1], DestTy: Ty);
2358 Ops[0] = Builder.CreateAnd(LHS: Ops[0], RHS: Ops[1]);
2359 Ops[0] = Builder.CreateICmp(P: ICmpInst::ICMP_NE, LHS: Ops[0],
2360 RHS: ConstantAggregateZero::get(Ty));
2361 return Builder.CreateSExt(V: Ops[0], DestTy: Ty, Name: "vtst");
2362 }
2363 case NEON::BI__builtin_neon_vuzp_v:
2364 case NEON::BI__builtin_neon_vuzpq_v: {
2365 Ops[1] = Builder.CreateBitCast(V: Ops[1], DestTy: Ty);
2366 Ops[2] = Builder.CreateBitCast(V: Ops[2], DestTy: Ty);
2367 Value *SV = nullptr;
2368
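    // vuzp returns two vectors: iteration vi == 0 gathers the even-numbered
    // lanes and vi == 1 the odd-numbered lanes of the concatenated inputs;
    // each result is stored into slot vi of the sret buffer in Ops[0].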
2369 for (unsigned vi = 0; vi != 2; ++vi) {
2370 SmallVector<int, 16> Indices;
2371 for (unsigned i = 0, e = VTy->getNumElements(); i != e; ++i)
2372 Indices.push_back(Elt: 2*i+vi);
2373
2374 Value *Addr = Builder.CreateConstInBoundsGEP1_32(Ty, Ptr: Ops[0], Idx0: vi);
2375 SV = Builder.CreateShuffleVector(V1: Ops[1], V2: Ops[2], Mask: Indices, Name: "vuzp");
2376 SV = Builder.CreateDefaultAlignedStore(Val: SV, Addr);
2377 }
2378 return SV;
2379 }
2380 case NEON::BI__builtin_neon_vxarq_u64: {
2381 Function *F = CGM.getIntrinsic(IID: Int);
2382 Ops[2] = Builder.CreateZExt(V: Ops[2], DestTy: Int64Ty);
2383 return EmitNeonCall(F, Ops, name: "");
2384 }
2385 case NEON::BI__builtin_neon_vzip_v:
2386 case NEON::BI__builtin_neon_vzipq_v: {
2387 Ops[1] = Builder.CreateBitCast(V: Ops[1], DestTy: Ty);
2388 Ops[2] = Builder.CreateBitCast(V: Ops[2], DestTy: Ty);
2389 Value *SV = nullptr;
2390
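    // vzip returns two vectors: iteration vi == 0 interleaves the low halves
    // of the two inputs and vi == 1 the high halves; each result is stored
    // into slot vi of the sret buffer in Ops[0].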
2391 for (unsigned vi = 0; vi != 2; ++vi) {
2392 SmallVector<int, 16> Indices;
2393 for (unsigned i = 0, e = VTy->getNumElements(); i != e; i += 2) {
2394 Indices.push_back(Elt: (i + vi*e) >> 1);
2395 Indices.push_back(Elt: ((i + vi*e) >> 1)+e);
2396 }
2397 Value *Addr = Builder.CreateConstInBoundsGEP1_32(Ty, Ptr: Ops[0], Idx0: vi);
2398 SV = Builder.CreateShuffleVector(V1: Ops[1], V2: Ops[2], Mask: Indices, Name: "vzip");
2399 SV = Builder.CreateDefaultAlignedStore(Val: SV, Addr);
2400 }
2401 return SV;
2402 }
2403 case NEON::BI__builtin_neon_vdot_s32:
2404 case NEON::BI__builtin_neon_vdot_u32:
2405 case NEON::BI__builtin_neon_vdotq_s32:
2406 case NEON::BI__builtin_neon_vdotq_u32: {
2407 auto *InputTy =
2408 llvm::FixedVectorType::get(ElementType: Int8Ty, NumElts: Ty->getPrimitiveSizeInBits() / 8);
2409 llvm::Type *Tys[2] = { Ty, InputTy };
2410 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys), Ops, name: "vdot");
2411 }
2412 case NEON::BI__builtin_neon_vfmlal_low_f16:
2413 case NEON::BI__builtin_neon_vfmlalq_low_f16: {
2414 auto *InputTy =
2415 llvm::FixedVectorType::get(ElementType: HalfTy, NumElts: Ty->getPrimitiveSizeInBits() / 16);
2416 llvm::Type *Tys[2] = { Ty, InputTy };
2417 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys), Ops, name: "vfmlal_low");
2418 }
2419 case NEON::BI__builtin_neon_vfmlsl_low_f16:
2420 case NEON::BI__builtin_neon_vfmlslq_low_f16: {
2421 auto *InputTy =
2422 llvm::FixedVectorType::get(ElementType: HalfTy, NumElts: Ty->getPrimitiveSizeInBits() / 16);
2423 llvm::Type *Tys[2] = { Ty, InputTy };
2424 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys), Ops, name: "vfmlsl_low");
2425 }
2426 case NEON::BI__builtin_neon_vfmlal_high_f16:
2427 case NEON::BI__builtin_neon_vfmlalq_high_f16: {
2428 auto *InputTy =
2429 llvm::FixedVectorType::get(ElementType: HalfTy, NumElts: Ty->getPrimitiveSizeInBits() / 16);
2430 llvm::Type *Tys[2] = { Ty, InputTy };
2431 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys), Ops, name: "vfmlal_high");
2432 }
2433 case NEON::BI__builtin_neon_vfmlsl_high_f16:
2434 case NEON::BI__builtin_neon_vfmlslq_high_f16: {
2435 auto *InputTy =
2436 llvm::FixedVectorType::get(ElementType: HalfTy, NumElts: Ty->getPrimitiveSizeInBits() / 16);
2437 llvm::Type *Tys[2] = { Ty, InputTy };
2438 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys), Ops, name: "vfmlsl_high");
2439 }
2440 case NEON::BI__builtin_neon_vmmlaq_s32:
2441 case NEON::BI__builtin_neon_vmmlaq_u32: {
2442 auto *InputTy =
2443 llvm::FixedVectorType::get(ElementType: Int8Ty, NumElts: Ty->getPrimitiveSizeInBits() / 8);
2444 llvm::Type *Tys[2] = { Ty, InputTy };
2445 return EmitNeonCall(F: CGM.getIntrinsic(IID: LLVMIntrinsic, Tys), Ops, name: "vmmla");
2446 }
2447 case NEON::BI__builtin_neon_vusmmlaq_s32: {
2448 auto *InputTy =
2449 llvm::FixedVectorType::get(ElementType: Int8Ty, NumElts: Ty->getPrimitiveSizeInBits() / 8);
2450 llvm::Type *Tys[2] = { Ty, InputTy };
2451 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys), Ops, name: "vusmmla");
2452 }
2453 case NEON::BI__builtin_neon_vusdot_s32:
2454 case NEON::BI__builtin_neon_vusdotq_s32: {
2455 auto *InputTy =
2456 llvm::FixedVectorType::get(ElementType: Int8Ty, NumElts: Ty->getPrimitiveSizeInBits() / 8);
2457 llvm::Type *Tys[2] = { Ty, InputTy };
2458 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys), Ops, name: "vusdot");
2459 }
2460 case NEON::BI__builtin_neon_vbfdot_f32:
2461 case NEON::BI__builtin_neon_vbfdotq_f32: {
2462 llvm::Type *InputTy =
2463 llvm::FixedVectorType::get(ElementType: BFloatTy, NumElts: Ty->getPrimitiveSizeInBits() / 16);
2464 llvm::Type *Tys[2] = { Ty, InputTy };
2465 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys), Ops, name: "vbfdot");
2466 }
2467 case NEON::BI__builtin_neon___a32_vcvt_bf16_f32: {
2468 llvm::Type *Tys[1] = { Ty };
2469 Function *F = CGM.getIntrinsic(IID: Int, Tys);
2470 return EmitNeonCall(F, Ops, name: "vcvtfp2bf");
2471 }
2472
2473 }
2474
2475 assert(Int && "Expected valid intrinsic number");
2476
2477 // Determine the type(s) of this overloaded AArch64 intrinsic.
2478 Function *F = LookupNeonLLVMIntrinsic(IntrinsicID: Int, Modifier, ArgType: Ty, E);
2479
2480 Value *Result = EmitNeonCall(F, Ops, name: NameHint);
2481 llvm::Type *ResultType = ConvertType(T: E->getType());
2482 // Cast the intrinsic's one-element vector result back to the scalar type
2483 // expected by the builtin.
2484 return Builder.CreateBitCast(V: Result, DestTy: ResultType, Name: NameHint);
2485}
2486
2487Value *
2488CodeGenFunction::EmitAArch64CompareBuiltinExpr(Value *Op, llvm::Type *Ty,
2489 const CmpInst::Predicate Pred,
2490 const Twine &Name) {
2491
2492 if (isa<FixedVectorType>(Val: Ty)) {
2493 // Vector types are cast to i8 vectors. Recover original type.
2494 Op = Builder.CreateBitCast(V: Op, DestTy: Ty);
2495 }
2496
2497 if (CmpInst::isFPPredicate(P: Pred)) {
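    // The equality test uses a quiet compare; the ordering predicates use a
    // signaling compare.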
2498 if (Pred == CmpInst::FCMP_OEQ)
2499 Op = Builder.CreateFCmp(P: Pred, LHS: Op, RHS: Constant::getNullValue(Ty: Op->getType()));
2500 else
2501 Op = Builder.CreateFCmpS(P: Pred, LHS: Op, RHS: Constant::getNullValue(Ty: Op->getType()));
2502 } else {
2503 Op = Builder.CreateICmp(P: Pred, LHS: Op, RHS: Constant::getNullValue(Ty: Op->getType()));
2504 }
2505
2506 llvm::Type *ResTy = Ty;
2507 if (auto *VTy = dyn_cast<FixedVectorType>(Val: Ty))
2508 ResTy = FixedVectorType::get(
2509 ElementType: IntegerType::get(C&: getLLVMContext(), NumBits: VTy->getScalarSizeInBits()),
2510 NumElts: VTy->getNumElements());
2511
2512 return Builder.CreateSExt(V: Op, DestTy: ResTy, Name);
2513}
2514
2515static Value *packTBLDVectorList(CodeGenFunction &CGF, ArrayRef<Value *> Ops,
2516 Value *ExtOp, Value *IndexOp,
2517 llvm::Type *ResTy, unsigned IntID,
2518 const char *Name) {
2519 SmallVector<Value *, 2> TblOps;
2520 if (ExtOp)
2521 TblOps.push_back(Elt: ExtOp);
2522
2523 // Build a shuffle mask of sequential numbers like (0, 1, 2, ..., 15).
2524 SmallVector<int, 16> Indices;
2525 auto *TblTy = cast<llvm::FixedVectorType>(Val: Ops[0]->getType());
2526 for (unsigned i = 0, e = TblTy->getNumElements(); i != e; ++i) {
2527 Indices.push_back(Elt: 2*i);
2528 Indices.push_back(Elt: 2*i+1);
2529 }
2530
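  // Concatenate the 64-bit table registers two at a time into 128-bit table
  // vectors; a trailing unpaired register is zero-padded below.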
2531 int PairPos = 0, End = Ops.size() - 1;
2532 while (PairPos < End) {
2533 TblOps.push_back(Elt: CGF.Builder.CreateShuffleVector(V1: Ops[PairPos],
2534 V2: Ops[PairPos+1], Mask: Indices,
2535 Name));
2536 PairPos += 2;
2537 }
2538
2539 // If there's an odd number of 64-bit lookup-table vectors, fill the high
2540 // 64 bits of the last 128-bit lookup table with zeroes.
2541 if (PairPos == End) {
2542 Value *ZeroTbl = ConstantAggregateZero::get(Ty: TblTy);
2543 TblOps.push_back(Elt: CGF.Builder.CreateShuffleVector(V1: Ops[PairPos],
2544 V2: ZeroTbl, Mask: Indices, Name));
2545 }
2546
2547 Function *TblF;
2548 TblOps.push_back(Elt: IndexOp);
2549 TblF = CGF.CGM.getIntrinsic(IID: IntID, Tys: ResTy);
2550
2551 return CGF.EmitNeonCall(F: TblF, Ops&: TblOps, name: Name);
2552}
2553
2554Value *CodeGenFunction::GetValueForARMHint(unsigned BuiltinID) {
2555 unsigned Value;
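  // Immediate operand for the llvm.arm.hint intrinsic:
  // 0 = NOP, 1 = YIELD, 2 = WFE, 3 = WFI, 4 = SEV, 5 = SEVL.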
2556 switch (BuiltinID) {
2557 default:
2558 return nullptr;
2559 case clang::ARM::BI__builtin_arm_nop:
2560 Value = 0;
2561 break;
2562 case clang::ARM::BI__builtin_arm_yield:
2563 case clang::ARM::BI__yield:
2564 Value = 1;
2565 break;
2566 case clang::ARM::BI__builtin_arm_wfe:
2567 case clang::ARM::BI__wfe:
2568 Value = 2;
2569 break;
2570 case clang::ARM::BI__builtin_arm_wfi:
2571 case clang::ARM::BI__wfi:
2572 Value = 3;
2573 break;
2574 case clang::ARM::BI__builtin_arm_sev:
2575 case clang::ARM::BI__sev:
2576 Value = 4;
2577 break;
2578 case clang::ARM::BI__builtin_arm_sevl:
2579 case clang::ARM::BI__sevl:
2580 Value = 5;
2581 break;
2582 }
2583
2584 return Builder.CreateCall(Callee: CGM.getIntrinsic(IID: Intrinsic::arm_hint),
2585 Args: llvm::ConstantInt::get(Ty: Int32Ty, V: Value));
2586}
2587
2588enum SpecialRegisterAccessKind {
2589 NormalRead,
2590 VolatileRead,
2591 Write,
2592};
2593
2594 // Generates the IR for the read/write special register builtin.
2595 // ValueType is the type of the value that is to be written or read;
2596 // RegisterType is the type of the register being written to or read from.
2597static Value *EmitSpecialRegisterBuiltin(CodeGenFunction &CGF,
2598 const CallExpr *E,
2599 llvm::Type *RegisterType,
2600 llvm::Type *ValueType,
2601 SpecialRegisterAccessKind AccessKind,
2602 StringRef SysReg = "") {
2603 // The read/write register intrinsics only support 32-, 64- and 128-bit operations.
2604 assert((RegisterType->isIntegerTy(32) || RegisterType->isIntegerTy(64) ||
2605 RegisterType->isIntegerTy(128)) &&
2606 "Unsupported size for register.");
2607
2608 CodeGen::CGBuilderTy &Builder = CGF.Builder;
2609 CodeGen::CodeGenModule &CGM = CGF.CGM;
2610 LLVMContext &Context = CGM.getLLVMContext();
2611
2612 if (SysReg.empty()) {
2613 const Expr *SysRegStrExpr = E->getArg(Arg: 0)->IgnoreParenCasts();
2614 SysReg = cast<clang::StringLiteral>(Val: SysRegStrExpr)->getString();
2615 }
2616
2617 llvm::Metadata *Ops[] = { llvm::MDString::get(Context, Str: SysReg) };
2618 llvm::MDNode *RegName = llvm::MDNode::get(Context, MDs: Ops);
2619 llvm::Value *Metadata = llvm::MetadataAsValue::get(Context, MD: RegName);
2620
2621 llvm::Type *Types[] = { RegisterType };
2622
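  // A 32-bit value may be transferred via a 64-bit register: reads are
  // truncated and writes are zero-extended below.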
2623 bool MixedTypes = RegisterType->isIntegerTy(Bitwidth: 64) && ValueType->isIntegerTy(Bitwidth: 32);
2624 assert(!(RegisterType->isIntegerTy(32) && ValueType->isIntegerTy(64))
2625 && "Can't fit 64-bit value in 32-bit register");
2626
2627 if (AccessKind != Write) {
2628 assert(AccessKind == NormalRead || AccessKind == VolatileRead);
2629 llvm::Function *F = CGM.getIntrinsic(
2630 IID: AccessKind == VolatileRead ? Intrinsic::read_volatile_register
2631 : Intrinsic::read_register,
2632 Tys: Types);
2633 llvm::Value *Call = Builder.CreateCall(Callee: F, Args: Metadata);
2634
2635 if (MixedTypes)
2636 // Read into 64 bit register and then truncate result to 32 bit.
2637 return Builder.CreateTrunc(V: Call, DestTy: ValueType);
2638
2639 if (ValueType->isPointerTy())
2640 // Have i32/i64 result (Call) but want to return a VoidPtrTy (i8*).
2641 return Builder.CreateIntToPtr(V: Call, DestTy: ValueType);
2642
2643 return Call;
2644 }
2645
2646 llvm::Function *F = CGM.getIntrinsic(IID: Intrinsic::write_register, Tys: Types);
2647 llvm::Value *ArgValue = CGF.EmitScalarExpr(E: E->getArg(Arg: 1));
2648 if (MixedTypes) {
2649 // Extend 32 bit write value to 64 bit to pass to write.
2650 ArgValue = Builder.CreateZExt(V: ArgValue, DestTy: RegisterType);
2651 return Builder.CreateCall(Callee: F, Args: { Metadata, ArgValue });
2652 }
2653
2654 if (ValueType->isPointerTy()) {
2655 // Have a VoidPtrTy ArgValue but need to pass an i32/i64 to the write intrinsic.
2656 ArgValue = Builder.CreatePtrToInt(V: ArgValue, DestTy: RegisterType);
2657 return Builder.CreateCall(Callee: F, Args: { Metadata, ArgValue });
2658 }
2659
2660 return Builder.CreateCall(Callee: F, Args: { Metadata, ArgValue });
2661}
2662
2663static Value *EmitRangePrefetchBuiltin(CodeGenFunction &CGF, unsigned BuiltinID,
2664 const CallExpr *E) {
2665 CodeGen::CGBuilderTy &Builder = CGF.Builder;
2666 CodeGen::CodeGenModule &CGM = CGF.CGM;
2667 SmallVector<llvm::Value *, 4> Ops;
2668
2669 auto getIntArg = [&](unsigned ArgNo) {
2670 Expr::EvalResult Result;
2671 if (!E->getArg(Arg: ArgNo)->EvaluateAsInt(Result, Ctx: CGM.getContext()))
2672 llvm_unreachable("Expected constant argument to range prefetch.");
2673 return Result.Val.getInt().getExtValue();
2674 };
2675
2676 Ops.push_back(Elt: CGF.EmitScalarExpr(E: E->getArg(Arg: 0))); /*Addr*/
2677 Ops.push_back(Elt: CGF.EmitScalarExpr(E: E->getArg(Arg: 1))); /*Access Kind*/
2678 Ops.push_back(Elt: CGF.EmitScalarExpr(E: E->getArg(Arg: 2))); /*Policy*/
2679
2680 if (BuiltinID == clang::AArch64::BI__builtin_arm_range_prefetch_x) {
2681 auto Length = getIntArg(3);
2682 auto Count = getIntArg(4) - 1;
2683 auto Stride = getIntArg(5);
2684 auto Distance = getIntArg(6);
2685
2686 // Map ReuseDistance given in bytes to four bits representing decreasing
2687 // powers of two in the range 512MiB (0b0001) to 32KiB (0b1111). Values
2688 // are rounded up to the nearest power of 2, starting at 32KiB. Any value
2689 // over the maximum is represented by 0 (distance not known).
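    // For example, a requested distance of 1MiB (2^20 bytes) is encoded as
    // 30 - 20 = 10 (0b1010).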
2690 if (Distance > 0) {
2691 Distance = llvm::Log2_32_Ceil(Value: Distance);
2692 if (Distance < 15)
2693 Distance = 15;
2694 else if (Distance > 29)
2695 Distance = 0;
2696 else
2697 Distance = 30 - Distance;
2698 }
2699
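    // Pack the operands into a single 64-bit metadata word:
    // bits [21:0] hold Length, bits [37:22] hold Count-1, bits [59:38] hold
    // Stride, and bits [63:60] hold the encoded ReuseDistance.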
2700 uint64_t Mask22 = (1ULL << 22) - 1;
2701 uint64_t Mask16 = (1ULL << 16) - 1;
2702 uint64_t Metadata = (Distance << 60) | ((Stride & Mask22) << 38) |
2703 ((Count & Mask16) << 22) | (Length & Mask22);
2704
2705 Ops.push_back(Elt: llvm::ConstantInt::get(Ty: Builder.getInt64Ty(), V: Metadata));
2706 } else
2707 Ops.push_back(Elt: CGF.EmitScalarExpr(E: E->getArg(Arg: 3)));
2708
2709 return Builder.CreateCall(Callee: CGM.getIntrinsic(IID: Intrinsic::aarch64_range_prefetch),
2710 Args: Ops);
2711}
2712
2713/// Return true if BuiltinID is an overloaded Neon intrinsic with an extra
2714/// argument that specifies the vector type.
2715static bool HasExtraNeonArgument(unsigned BuiltinID) {
2716 switch (BuiltinID) {
2717 default: break;
2718 case NEON::BI__builtin_neon_vget_lane_i8:
2719 case NEON::BI__builtin_neon_vget_lane_i16:
2720 case NEON::BI__builtin_neon_vget_lane_bf16:
2721 case NEON::BI__builtin_neon_vget_lane_i32:
2722 case NEON::BI__builtin_neon_vget_lane_i64:
2723 case NEON::BI__builtin_neon_vget_lane_mf8:
2724 case NEON::BI__builtin_neon_vget_lane_f32:
2725 case NEON::BI__builtin_neon_vgetq_lane_i8:
2726 case NEON::BI__builtin_neon_vgetq_lane_i16:
2727 case NEON::BI__builtin_neon_vgetq_lane_bf16:
2728 case NEON::BI__builtin_neon_vgetq_lane_i32:
2729 case NEON::BI__builtin_neon_vgetq_lane_i64:
2730 case NEON::BI__builtin_neon_vgetq_lane_mf8:
2731 case NEON::BI__builtin_neon_vgetq_lane_f32:
2732 case NEON::BI__builtin_neon_vduph_lane_bf16:
2733 case NEON::BI__builtin_neon_vduph_laneq_bf16:
2734 case NEON::BI__builtin_neon_vset_lane_i8:
2735 case NEON::BI__builtin_neon_vset_lane_mf8:
2736 case NEON::BI__builtin_neon_vset_lane_i16:
2737 case NEON::BI__builtin_neon_vset_lane_bf16:
2738 case NEON::BI__builtin_neon_vset_lane_i32:
2739 case NEON::BI__builtin_neon_vset_lane_i64:
2740 case NEON::BI__builtin_neon_vset_lane_f32:
2741 case NEON::BI__builtin_neon_vsetq_lane_i8:
2742 case NEON::BI__builtin_neon_vsetq_lane_mf8:
2743 case NEON::BI__builtin_neon_vsetq_lane_i16:
2744 case NEON::BI__builtin_neon_vsetq_lane_bf16:
2745 case NEON::BI__builtin_neon_vsetq_lane_i32:
2746 case NEON::BI__builtin_neon_vsetq_lane_i64:
2747 case NEON::BI__builtin_neon_vsetq_lane_f32:
2748 case NEON::BI__builtin_neon_vsha1h_u32:
2749 case NEON::BI__builtin_neon_vsha1cq_u32:
2750 case NEON::BI__builtin_neon_vsha1pq_u32:
2751 case NEON::BI__builtin_neon_vsha1mq_u32:
2752 case NEON::BI__builtin_neon_vcvth_bf16_f32:
2753 case clang::ARM::BI_MoveToCoprocessor:
2754 case clang::ARM::BI_MoveToCoprocessor2:
2755 return false;
2756 }
2757 return true;
2758}
2759
2760Value *CodeGenFunction::EmitARMBuiltinExpr(unsigned BuiltinID,
2761 const CallExpr *E,
2762 ReturnValueSlot ReturnValue,
2763 llvm::Triple::ArchType Arch) {
2764 if (auto Hint = GetValueForARMHint(BuiltinID))
2765 return Hint;
2766
2767 if (BuiltinID == clang::ARM::BI__emit) {
2768 bool IsThumb = getTarget().getTriple().getArch() == llvm::Triple::thumb;
2769 llvm::FunctionType *FTy =
2770 llvm::FunctionType::get(Result: VoidTy, /*Variadic=*/isVarArg: false);
2771
2772 Expr::EvalResult Result;
2773 if (!E->getArg(Arg: 0)->EvaluateAsInt(Result, Ctx: CGM.getContext()))
2774 llvm_unreachable("Sema will ensure that the parameter is constant");
2775
2776 llvm::APSInt Value = Result.Val.getInt();
2777 uint64_t ZExtValue = Value.zextOrTrunc(width: IsThumb ? 16 : 32).getZExtValue();
2778
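    // .inst.n emits the 16-bit (narrow) Thumb encoding, while .inst emits the
    // 32-bit ARM encoding.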
2779 llvm::InlineAsm *Emit =
2780 IsThumb ? InlineAsm::get(Ty: FTy, AsmString: ".inst.n 0x" + utohexstr(X: ZExtValue), Constraints: "",
2781 /*hasSideEffects=*/true)
2782 : InlineAsm::get(Ty: FTy, AsmString: ".inst 0x" + utohexstr(X: ZExtValue), Constraints: "",
2783 /*hasSideEffects=*/true);
2784
2785 return Builder.CreateCall(Callee: Emit);
2786 }
2787
2788 if (BuiltinID == clang::ARM::BI__builtin_arm_dbg) {
2789 Value *Option = EmitScalarExpr(E: E->getArg(Arg: 0));
2790 return Builder.CreateCall(Callee: CGM.getIntrinsic(IID: Intrinsic::arm_dbg), Args: Option);
2791 }
2792
2793 if (BuiltinID == clang::ARM::BI__builtin_arm_prefetch) {
2794 Value *Address = EmitScalarExpr(E: E->getArg(Arg: 0));
2795 Value *RW = EmitScalarExpr(E: E->getArg(Arg: 1));
2796 Value *IsData = EmitScalarExpr(E: E->getArg(Arg: 2));
2797
2798 // Locality is not supported on the ARM target.
2799 Value *Locality = llvm::ConstantInt::get(Ty: Int32Ty, V: 3);
2800
2801 Function *F = CGM.getIntrinsic(IID: Intrinsic::prefetch, Tys: Address->getType());
2802 return Builder.CreateCall(Callee: F, Args: {Address, RW, Locality, IsData});
2803 }
2804
2805 if (BuiltinID == clang::ARM::BI__builtin_arm_rbit) {
2806 llvm::Value *Arg = EmitScalarExpr(E: E->getArg(Arg: 0));
2807 return Builder.CreateCall(
2808 Callee: CGM.getIntrinsic(IID: Intrinsic::bitreverse, Tys: Arg->getType()), Args: Arg, Name: "rbit");
2809 }
2810
2811 if (BuiltinID == clang::ARM::BI__builtin_arm_clz ||
2812 BuiltinID == clang::ARM::BI__builtin_arm_clz64) {
2813 llvm::Value *Arg = EmitScalarExpr(E: E->getArg(Arg: 0));
2814 Function *F = CGM.getIntrinsic(IID: Intrinsic::ctlz, Tys: Arg->getType());
2815 Value *Res = Builder.CreateCall(Callee: F, Args: {Arg, Builder.getInt1(V: false)});
2816 if (BuiltinID == clang::ARM::BI__builtin_arm_clz64)
2817 Res = Builder.CreateTrunc(V: Res, DestTy: Builder.getInt32Ty());
2818 return Res;
2819 }
2820
2822 if (BuiltinID == clang::ARM::BI__builtin_arm_cls) {
2823 llvm::Value *Arg = EmitScalarExpr(E: E->getArg(Arg: 0));
2824 return Builder.CreateCall(Callee: CGM.getIntrinsic(IID: Intrinsic::arm_cls), Args: Arg, Name: "cls");
2825 }
2826 if (BuiltinID == clang::ARM::BI__builtin_arm_cls64) {
2827 llvm::Value *Arg = EmitScalarExpr(E: E->getArg(Arg: 0));
2828 return Builder.CreateCall(Callee: CGM.getIntrinsic(IID: Intrinsic::arm_cls64), Args: Arg,
2829 Name: "cls");
2830 }
2831
2832 if (BuiltinID == clang::ARM::BI__clear_cache) {
2833 assert(E->getNumArgs() == 2 && "__clear_cache takes 2 arguments");
2834 const FunctionDecl *FD = E->getDirectCallee();
2835 Value *Ops[2];
2836 for (unsigned i = 0; i < 2; i++)
2837 Ops[i] = EmitScalarExpr(E: E->getArg(Arg: i));
2838 llvm::Type *Ty = CGM.getTypes().ConvertType(T: FD->getType());
2839 llvm::FunctionType *FTy = cast<llvm::FunctionType>(Val: Ty);
2840 StringRef Name = FD->getName();
2841 return EmitNounwindRuntimeCall(callee: CGM.CreateRuntimeFunction(Ty: FTy, Name), args: Ops);
2842 }
2843
2844 if (BuiltinID == clang::ARM::BI__builtin_arm_mcrr ||
2845 BuiltinID == clang::ARM::BI__builtin_arm_mcrr2) {
2846 Function *F;
2847
2848 switch (BuiltinID) {
2849 default: llvm_unreachable("unexpected builtin");
2850 case clang::ARM::BI__builtin_arm_mcrr:
2851 F = CGM.getIntrinsic(IID: Intrinsic::arm_mcrr);
2852 break;
2853 case clang::ARM::BI__builtin_arm_mcrr2:
2854 F = CGM.getIntrinsic(IID: Intrinsic::arm_mcrr2);
2855 break;
2856 }
2857
2858 // The MCRR{2} instruction has 5 operands, but the corresponding
2859 // builtin takes only 4, because Rt and Rt2 are packed into a single
2860 // unsigned 64-bit integer. The LLVM intrinsic, in contrast, expects
2861 // Rt and Rt2 as two separate 32-bit values, so the 64-bit argument
2862 // is split into its low and high halves below before the call is
2863 // made.
2864
2865 Value *Coproc = EmitScalarExpr(E: E->getArg(Arg: 0));
2866 Value *Opc1 = EmitScalarExpr(E: E->getArg(Arg: 1));
2867 Value *RtAndRt2 = EmitScalarExpr(E: E->getArg(Arg: 2));
2868 Value *CRm = EmitScalarExpr(E: E->getArg(Arg: 3));
2869
2870 Value *C1 = llvm::ConstantInt::get(Ty: Int64Ty, V: 32);
2871 Value *Rt = Builder.CreateTruncOrBitCast(V: RtAndRt2, DestTy: Int32Ty);
2872 Value *Rt2 = Builder.CreateLShr(LHS: RtAndRt2, RHS: C1);
2873 Rt2 = Builder.CreateTruncOrBitCast(V: Rt2, DestTy: Int32Ty);
2874
2875 return Builder.CreateCall(Callee: F, Args: {Coproc, Opc1, Rt, Rt2, CRm});
2876 }
2877
2878 if (BuiltinID == clang::ARM::BI__builtin_arm_mrrc ||
2879 BuiltinID == clang::ARM::BI__builtin_arm_mrrc2) {
2880 Function *F;
2881
2882 switch (BuiltinID) {
2883 default: llvm_unreachable("unexpected builtin");
2884 case clang::ARM::BI__builtin_arm_mrrc:
2885 F = CGM.getIntrinsic(IID: Intrinsic::arm_mrrc);
2886 break;
2887 case clang::ARM::BI__builtin_arm_mrrc2:
2888 F = CGM.getIntrinsic(IID: Intrinsic::arm_mrrc2);
2889 break;
2890 }
2891
2892 Value *Coproc = EmitScalarExpr(E: E->getArg(Arg: 0));
2893 Value *Opc1 = EmitScalarExpr(E: E->getArg(Arg: 1));
2894 Value *CRm = EmitScalarExpr(E: E->getArg(Arg: 2));
2895 Value *RtAndRt2 = Builder.CreateCall(Callee: F, Args: {Coproc, Opc1, CRm});
2896
2897 // The intrinsic returns the unsigned 64-bit value as two 32-bit halves,
2898 // which are reassembled into a single i64 below.
2899
2900 Value *Rt = Builder.CreateExtractValue(Agg: RtAndRt2, Idxs: 1);
2901 Value *Rt1 = Builder.CreateExtractValue(Agg: RtAndRt2, Idxs: 0);
2902 Rt = Builder.CreateZExt(V: Rt, DestTy: Int64Ty);
2903 Rt1 = Builder.CreateZExt(V: Rt1, DestTy: Int64Ty);
2904
2905 Value *ShiftCast = llvm::ConstantInt::get(Ty: Int64Ty, V: 32);
2906 RtAndRt2 = Builder.CreateShl(LHS: Rt, RHS: ShiftCast, Name: "shl", HasNUW: true);
2907 RtAndRt2 = Builder.CreateOr(LHS: RtAndRt2, RHS: Rt1);
2908
2909 return Builder.CreateBitCast(V: RtAndRt2, DestTy: ConvertType(T: E->getType()));
2910 }
2911
2912 if (BuiltinID == clang::ARM::BI__builtin_arm_ldrexd ||
2913 ((BuiltinID == clang::ARM::BI__builtin_arm_ldrex ||
2914 BuiltinID == clang::ARM::BI__builtin_arm_ldaex) &&
2915 getContext().getTypeSize(T: E->getType()) == 64) ||
2916 BuiltinID == clang::ARM::BI__ldrexd) {
2917 Function *F;
2918
2919 switch (BuiltinID) {
2920 default: llvm_unreachable("unexpected builtin");
2921 case clang::ARM::BI__builtin_arm_ldaex:
2922 F = CGM.getIntrinsic(IID: Intrinsic::arm_ldaexd);
2923 break;
2924 case clang::ARM::BI__builtin_arm_ldrexd:
2925 case clang::ARM::BI__builtin_arm_ldrex:
2926 case clang::ARM::BI__ldrexd:
2927 F = CGM.getIntrinsic(IID: Intrinsic::arm_ldrexd);
2928 break;
2929 }
2930
2931 Value *LdPtr = EmitScalarExpr(E: E->getArg(Arg: 0));
2932 Value *Val = Builder.CreateCall(Callee: F, Args: LdPtr, Name: "ldrexd");
2933
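    // The intrinsic returns the loaded doubleword as two 32-bit halves;
    // element 1 provides the high word and element 0 the low word of the
    // reassembled i64.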
2934 Value *Val0 = Builder.CreateExtractValue(Agg: Val, Idxs: 1);
2935 Value *Val1 = Builder.CreateExtractValue(Agg: Val, Idxs: 0);
2936 Val0 = Builder.CreateZExt(V: Val0, DestTy: Int64Ty);
2937 Val1 = Builder.CreateZExt(V: Val1, DestTy: Int64Ty);
2938
2939 Value *ShiftCst = llvm::ConstantInt::get(Ty: Int64Ty, V: 32);
2940 Val = Builder.CreateShl(LHS: Val0, RHS: ShiftCst, Name: "shl", HasNUW: true /* nuw */);
2941 Val = Builder.CreateOr(LHS: Val, RHS: Val1);
2942 return Builder.CreateBitCast(V: Val, DestTy: ConvertType(T: E->getType()));
2943 }
2944
2945 if (BuiltinID == clang::ARM::BI__builtin_arm_ldrex ||
2946 BuiltinID == clang::ARM::BI__builtin_arm_ldaex) {
2947 Value *LoadAddr = EmitScalarExpr(E: E->getArg(Arg: 0));
2948
2949 QualType Ty = E->getType();
2950 llvm::Type *RealResTy = ConvertType(T: Ty);
2951 llvm::Type *IntTy =
2952 llvm::IntegerType::get(C&: getLLVMContext(), NumBits: getContext().getTypeSize(T: Ty));
2953
2954 Function *F = CGM.getIntrinsic(
2955 IID: BuiltinID == clang::ARM::BI__builtin_arm_ldaex ? Intrinsic::arm_ldaex
2956 : Intrinsic::arm_ldrex,
2957 Tys: DefaultPtrTy);
2958 CallInst *Val = Builder.CreateCall(Callee: F, Args: LoadAddr, Name: "ldrex");
2959 Val->addParamAttr(
2960 ArgNo: 0, Attr: Attribute::get(Context&: getLLVMContext(), Kind: Attribute::ElementType, Ty: IntTy));
2961
2962 if (RealResTy->isPointerTy())
2963 return Builder.CreateIntToPtr(V: Val, DestTy: RealResTy);
2964 else {
2965 llvm::Type *IntResTy = llvm::IntegerType::get(
2966 C&: getLLVMContext(), NumBits: CGM.getDataLayout().getTypeSizeInBits(Ty: RealResTy));
2967 return Builder.CreateBitCast(V: Builder.CreateTruncOrBitCast(V: Val, DestTy: IntResTy),
2968 DestTy: RealResTy);
2969 }
2970 }
2971
2972 if (BuiltinID == clang::ARM::BI__builtin_arm_strexd ||
2973 ((BuiltinID == clang::ARM::BI__builtin_arm_stlex ||
2974 BuiltinID == clang::ARM::BI__builtin_arm_strex) &&
2975 getContext().getTypeSize(T: E->getArg(Arg: 0)->getType()) == 64)) {
2976 Function *F = CGM.getIntrinsic(
2977 IID: BuiltinID == clang::ARM::BI__builtin_arm_stlex ? Intrinsic::arm_stlexd
2978 : Intrinsic::arm_strexd);
2979 llvm::Type *STy = llvm::StructType::get(elt1: Int32Ty, elts: Int32Ty);
2980
2981 Address Tmp = CreateMemTemp(T: E->getArg(Arg: 0)->getType());
2982 Value *Val = EmitScalarExpr(E: E->getArg(Arg: 0));
2983 Builder.CreateStore(Val, Addr: Tmp);
2984
2985 Address LdPtr = Tmp.withElementType(ElemTy: STy);
2986 Val = Builder.CreateLoad(Addr: LdPtr);
2987
2988 Value *Arg0 = Builder.CreateExtractValue(Agg: Val, Idxs: 0);
2989 Value *Arg1 = Builder.CreateExtractValue(Agg: Val, Idxs: 1);
2990 Value *StPtr = EmitScalarExpr(E: E->getArg(Arg: 1));
2991 return Builder.CreateCall(Callee: F, Args: {Arg0, Arg1, StPtr}, Name: "strexd");
2992 }
2993
2994 if (BuiltinID == clang::ARM::BI__builtin_arm_strex ||
2995 BuiltinID == clang::ARM::BI__builtin_arm_stlex) {
2996 Value *StoreVal = EmitScalarExpr(E: E->getArg(Arg: 0));
2997 Value *StoreAddr = EmitScalarExpr(E: E->getArg(Arg: 1));
2998
2999 QualType Ty = E->getArg(Arg: 0)->getType();
3000 llvm::Type *StoreTy =
3001 llvm::IntegerType::get(C&: getLLVMContext(), NumBits: getContext().getTypeSize(T: Ty));
3002
3003 if (StoreVal->getType()->isPointerTy())
3004 StoreVal = Builder.CreatePtrToInt(V: StoreVal, DestTy: Int32Ty);
3005 else {
3006 llvm::Type *IntTy = llvm::IntegerType::get(
3007 C&: getLLVMContext(),
3008 NumBits: CGM.getDataLayout().getTypeSizeInBits(Ty: StoreVal->getType()));
3009 StoreVal = Builder.CreateBitCast(V: StoreVal, DestTy: IntTy);
3010 StoreVal = Builder.CreateZExtOrBitCast(V: StoreVal, DestTy: Int32Ty);
3011 }
3012
3013 Function *F = CGM.getIntrinsic(
3014 IID: BuiltinID == clang::ARM::BI__builtin_arm_stlex ? Intrinsic::arm_stlex
3015 : Intrinsic::arm_strex,
3016 Tys: StoreAddr->getType());
3017
3018 CallInst *CI = Builder.CreateCall(Callee: F, Args: {StoreVal, StoreAddr}, Name: "strex");
3019 CI->addParamAttr(
3020 ArgNo: 1, Attr: Attribute::get(Context&: getLLVMContext(), Kind: Attribute::ElementType, Ty: StoreTy));
3021 return CI;
3022 }
3023
3024 if (BuiltinID == clang::ARM::BI__builtin_arm_clrex) {
3025 Function *F = CGM.getIntrinsic(IID: Intrinsic::arm_clrex);
3026 return Builder.CreateCall(Callee: F);
3027 }
3028
3029 // CRC32
3030 Intrinsic::ID CRCIntrinsicID = Intrinsic::not_intrinsic;
3031 switch (BuiltinID) {
3032 case clang::ARM::BI__builtin_arm_crc32b:
3033 CRCIntrinsicID = Intrinsic::arm_crc32b; break;
3034 case clang::ARM::BI__builtin_arm_crc32cb:
3035 CRCIntrinsicID = Intrinsic::arm_crc32cb; break;
3036 case clang::ARM::BI__builtin_arm_crc32h:
3037 CRCIntrinsicID = Intrinsic::arm_crc32h; break;
3038 case clang::ARM::BI__builtin_arm_crc32ch:
3039 CRCIntrinsicID = Intrinsic::arm_crc32ch; break;
3040 case clang::ARM::BI__builtin_arm_crc32w:
3041 case clang::ARM::BI__builtin_arm_crc32d:
3042 CRCIntrinsicID = Intrinsic::arm_crc32w; break;
3043 case clang::ARM::BI__builtin_arm_crc32cw:
3044 case clang::ARM::BI__builtin_arm_crc32cd:
3045 CRCIntrinsicID = Intrinsic::arm_crc32cw; break;
3046 }
3047
3048 if (CRCIntrinsicID != Intrinsic::not_intrinsic) {
3049 Value *Arg0 = EmitScalarExpr(E: E->getArg(Arg: 0));
3050 Value *Arg1 = EmitScalarExpr(E: E->getArg(Arg: 1));
3051
3052 // crc32{c,}d intrinsics are implemented as two calls to crc32{c,}w
3053 // intrinsics, hence we need different codegen for these cases.
3054 if (BuiltinID == clang::ARM::BI__builtin_arm_crc32d ||
3055 BuiltinID == clang::ARM::BI__builtin_arm_crc32cd) {
3056 Value *C1 = llvm::ConstantInt::get(Ty: Int64Ty, V: 32);
3057 Value *Arg1a = Builder.CreateTruncOrBitCast(V: Arg1, DestTy: Int32Ty);
3058 Value *Arg1b = Builder.CreateLShr(LHS: Arg1, RHS: C1);
3059 Arg1b = Builder.CreateTruncOrBitCast(V: Arg1b, DestTy: Int32Ty);
3060
3061 Function *F = CGM.getIntrinsic(IID: CRCIntrinsicID);
3062 Value *Res = Builder.CreateCall(Callee: F, Args: {Arg0, Arg1a});
3063 return Builder.CreateCall(Callee: F, Args: {Res, Arg1b});
3064 } else {
3065 Arg1 = Builder.CreateZExtOrBitCast(V: Arg1, DestTy: Int32Ty);
3066
3067 Function *F = CGM.getIntrinsic(IID: CRCIntrinsicID);
3068 return Builder.CreateCall(Callee: F, Args: {Arg0, Arg1});
3069 }
3070 }
3071
3072 if (BuiltinID == clang::ARM::BI__builtin_arm_rsr ||
3073 BuiltinID == clang::ARM::BI__builtin_arm_rsr64 ||
3074 BuiltinID == clang::ARM::BI__builtin_arm_rsrp ||
3075 BuiltinID == clang::ARM::BI__builtin_arm_wsr ||
3076 BuiltinID == clang::ARM::BI__builtin_arm_wsr64 ||
3077 BuiltinID == clang::ARM::BI__builtin_arm_wsrp) {
3078
3079 SpecialRegisterAccessKind AccessKind = Write;
3080 if (BuiltinID == clang::ARM::BI__builtin_arm_rsr ||
3081 BuiltinID == clang::ARM::BI__builtin_arm_rsr64 ||
3082 BuiltinID == clang::ARM::BI__builtin_arm_rsrp)
3083 AccessKind = VolatileRead;
3084
3085 bool IsPointerBuiltin = BuiltinID == clang::ARM::BI__builtin_arm_rsrp ||
3086 BuiltinID == clang::ARM::BI__builtin_arm_wsrp;
3087
3088 bool Is64Bit = BuiltinID == clang::ARM::BI__builtin_arm_rsr64 ||
3089 BuiltinID == clang::ARM::BI__builtin_arm_wsr64;
3090
3091 llvm::Type *ValueType;
3092 llvm::Type *RegisterType;
3093 if (IsPointerBuiltin) {
3094 ValueType = VoidPtrTy;
3095 RegisterType = Int32Ty;
3096 } else if (Is64Bit) {
3097 ValueType = RegisterType = Int64Ty;
3098 } else {
3099 ValueType = RegisterType = Int32Ty;
3100 }
3101
3102 return EmitSpecialRegisterBuiltin(CGF&: *this, E, RegisterType, ValueType,
3103 AccessKind);
3104 }
3105
3106 if (BuiltinID == ARM::BI__builtin_sponentry) {
3107 llvm::Function *F = CGM.getIntrinsic(IID: Intrinsic::sponentry, Tys: AllocaInt8PtrTy);
3108 return Builder.CreateCall(Callee: F);
3109 }
3110
3111 // Handle MSVC intrinsics before argument evaluation to prevent double
3112 // evaluation.
3113 if (std::optional<MSVCIntrin> MsvcIntId = translateArmToMsvcIntrin(BuiltinID))
3114 return EmitMSVCBuiltinExpr(BuiltinID: *MsvcIntId, E);
3115
3116 // Deal with MVE builtins
3117 if (Value *Result = EmitARMMVEBuiltinExpr(BuiltinID, E, ReturnValue, Arch))
3118 return Result;
3119 // Handle CDE builtins
3120 if (Value *Result = EmitARMCDEBuiltinExpr(BuiltinID, E, ReturnValue, Arch))
3121 return Result;
3122
3123 // Some intrinsics are equivalent; if so, use the base intrinsic ID.
3124 auto It = llvm::find_if(Range: NEONEquivalentIntrinsicMap, P: [BuiltinID](auto &P) {
3125 return P.first == BuiltinID;
3126 });
3127 if (It != end(arr: NEONEquivalentIntrinsicMap))
3128 BuiltinID = It->second;
3129
3130 // Find out if any arguments are required to be integer constant
3131 // expressions.
3132 unsigned ICEArguments = 0;
3133 ASTContext::GetBuiltinTypeError Error;
3134 getContext().GetBuiltinType(ID: BuiltinID, Error, IntegerConstantArgs: &ICEArguments);
3135 assert(Error == ASTContext::GE_None && "Should not codegen an error");
3136
3137 auto getAlignmentValue32 = [&](Address addr) -> Value* {
3138 return Builder.getInt32(C: addr.getAlignment().getQuantity());
3139 };
3140
3141 Address PtrOp0 = Address::invalid();
3142 Address PtrOp1 = Address::invalid();
3143 SmallVector<Value*, 4> Ops;
3144 bool HasExtraArg = HasExtraNeonArgument(BuiltinID);
3145 unsigned NumArgs = E->getNumArgs() - (HasExtraArg ? 1 : 0);
3146 for (unsigned i = 0, e = NumArgs; i != e; i++) {
3147 if (i == 0) {
3148 switch (BuiltinID) {
3149 case NEON::BI__builtin_neon_vld1_v:
3150 case NEON::BI__builtin_neon_vld1q_v:
3151 case NEON::BI__builtin_neon_vld1q_lane_v:
3152 case NEON::BI__builtin_neon_vld1_lane_v:
3153 case NEON::BI__builtin_neon_vld1_dup_v:
3154 case NEON::BI__builtin_neon_vld1q_dup_v:
3155 case NEON::BI__builtin_neon_vst1_v:
3156 case NEON::BI__builtin_neon_vst1q_v:
3157 case NEON::BI__builtin_neon_vst1q_lane_v:
3158 case NEON::BI__builtin_neon_vst1_lane_v:
3159 case NEON::BI__builtin_neon_vst2_v:
3160 case NEON::BI__builtin_neon_vst2q_v:
3161 case NEON::BI__builtin_neon_vst2_lane_v:
3162 case NEON::BI__builtin_neon_vst2q_lane_v:
3163 case NEON::BI__builtin_neon_vst3_v:
3164 case NEON::BI__builtin_neon_vst3q_v:
3165 case NEON::BI__builtin_neon_vst3_lane_v:
3166 case NEON::BI__builtin_neon_vst3q_lane_v:
3167 case NEON::BI__builtin_neon_vst4_v:
3168 case NEON::BI__builtin_neon_vst4q_v:
3169 case NEON::BI__builtin_neon_vst4_lane_v:
3170 case NEON::BI__builtin_neon_vst4q_lane_v:
3171 // Get the alignment for the argument in addition to the value;
3172 // we'll use it later.
3173 PtrOp0 = EmitPointerWithAlignment(Addr: E->getArg(Arg: 0));
3174 Ops.push_back(Elt: PtrOp0.emitRawPointer(CGF&: *this));
3175 continue;
3176 }
3177 }
3178 if (i == 1) {
3179 switch (BuiltinID) {
3180 case NEON::BI__builtin_neon_vld2_v:
3181 case NEON::BI__builtin_neon_vld2q_v:
3182 case NEON::BI__builtin_neon_vld3_v:
3183 case NEON::BI__builtin_neon_vld3q_v:
3184 case NEON::BI__builtin_neon_vld4_v:
3185 case NEON::BI__builtin_neon_vld4q_v:
3186 case NEON::BI__builtin_neon_vld2_lane_v:
3187 case NEON::BI__builtin_neon_vld2q_lane_v:
3188 case NEON::BI__builtin_neon_vld3_lane_v:
3189 case NEON::BI__builtin_neon_vld3q_lane_v:
3190 case NEON::BI__builtin_neon_vld4_lane_v:
3191 case NEON::BI__builtin_neon_vld4q_lane_v:
3192 case NEON::BI__builtin_neon_vld2_dup_v:
3193 case NEON::BI__builtin_neon_vld2q_dup_v:
3194 case NEON::BI__builtin_neon_vld3_dup_v:
3195 case NEON::BI__builtin_neon_vld3q_dup_v:
3196 case NEON::BI__builtin_neon_vld4_dup_v:
3197 case NEON::BI__builtin_neon_vld4q_dup_v:
3198 // Get the alignment for the argument in addition to the value;
3199 // we'll use it later.
3200 PtrOp1 = EmitPointerWithAlignment(Addr: E->getArg(Arg: 1));
3201 Ops.push_back(Elt: PtrOp1.emitRawPointer(CGF&: *this));
3202 continue;
3203 }
3204 }
3205
3206 Ops.push_back(Elt: EmitScalarOrConstFoldImmArg(ICEArguments, Idx: i, E));
3207 }
3208
3209 switch (BuiltinID) {
3210 default: break;
3211
3212 case NEON::BI__builtin_neon_vget_lane_i8:
3213 case NEON::BI__builtin_neon_vget_lane_i16:
3214 case NEON::BI__builtin_neon_vget_lane_i32:
3215 case NEON::BI__builtin_neon_vget_lane_i64:
3216 case NEON::BI__builtin_neon_vget_lane_bf16:
3217 case NEON::BI__builtin_neon_vget_lane_f32:
3218 case NEON::BI__builtin_neon_vgetq_lane_i8:
3219 case NEON::BI__builtin_neon_vgetq_lane_i16:
3220 case NEON::BI__builtin_neon_vgetq_lane_i32:
3221 case NEON::BI__builtin_neon_vgetq_lane_i64:
3222 case NEON::BI__builtin_neon_vgetq_lane_bf16:
3223 case NEON::BI__builtin_neon_vgetq_lane_f32:
3224 case NEON::BI__builtin_neon_vduph_lane_bf16:
3225 case NEON::BI__builtin_neon_vduph_laneq_bf16:
3226 return Builder.CreateExtractElement(Vec: Ops[0], Idx: Ops[1], Name: "vget_lane");
3227
3228 case NEON::BI__builtin_neon_vrndns_f32: {
3229 Value *Arg = EmitScalarExpr(E: E->getArg(Arg: 0));
3230 llvm::Type *Tys[] = {Arg->getType()};
3231 Function *F = CGM.getIntrinsic(IID: Intrinsic::roundeven, Tys);
3232 return Builder.CreateCall(Callee: F, Args: {Arg}, Name: "vrndn");
  }
3233
3234 case NEON::BI__builtin_neon_vset_lane_i8:
3235 case NEON::BI__builtin_neon_vset_lane_i16:
3236 case NEON::BI__builtin_neon_vset_lane_i32:
3237 case NEON::BI__builtin_neon_vset_lane_i64:
3238 case NEON::BI__builtin_neon_vset_lane_bf16:
3239 case NEON::BI__builtin_neon_vset_lane_f32:
3240 case NEON::BI__builtin_neon_vsetq_lane_i8:
3241 case NEON::BI__builtin_neon_vsetq_lane_i16:
3242 case NEON::BI__builtin_neon_vsetq_lane_i32:
3243 case NEON::BI__builtin_neon_vsetq_lane_i64:
3244 case NEON::BI__builtin_neon_vsetq_lane_bf16:
3245 case NEON::BI__builtin_neon_vsetq_lane_f32:
3246 return Builder.CreateInsertElement(Vec: Ops[1], NewElt: Ops[0], Idx: Ops[2], Name: "vset_lane");
3247
3248 case NEON::BI__builtin_neon_vsha1h_u32:
3249 return EmitNeonCall(F: CGM.getIntrinsic(IID: Intrinsic::arm_neon_sha1h), Ops,
3250 name: "vsha1h");
3251 case NEON::BI__builtin_neon_vsha1cq_u32:
3252 return EmitNeonCall(F: CGM.getIntrinsic(IID: Intrinsic::arm_neon_sha1c), Ops,
3253 name: "vsha1h");
3254 case NEON::BI__builtin_neon_vsha1pq_u32:
3255 return EmitNeonCall(F: CGM.getIntrinsic(IID: Intrinsic::arm_neon_sha1p), Ops,
3256 name: "vsha1h");
3257 case NEON::BI__builtin_neon_vsha1mq_u32:
3258 return EmitNeonCall(F: CGM.getIntrinsic(IID: Intrinsic::arm_neon_sha1m), Ops,
3259 name: "vsha1h");
3260
3261 case NEON::BI__builtin_neon_vcvth_bf16_f32: {
3262 return EmitNeonCall(F: CGM.getIntrinsic(IID: Intrinsic::arm_neon_vcvtbfp2bf), Ops,
3263 name: "vcvtbfp2bf");
3264 }
3265
3266 // The ARM _MoveToCoprocessor builtins put the input register value as
3267 // the first argument, but the LLVM intrinsic expects it as the third one.
3268 case clang::ARM::BI_MoveToCoprocessor:
3269 case clang::ARM::BI_MoveToCoprocessor2: {
3270 Function *F = CGM.getIntrinsic(IID: BuiltinID == clang::ARM::BI_MoveToCoprocessor
3271 ? Intrinsic::arm_mcr
3272 : Intrinsic::arm_mcr2);
3273 return Builder.CreateCall(Callee: F, Args: {Ops[1], Ops[2], Ops[0],
3274 Ops[3], Ops[4], Ops[5]});
3275 }
3276 }
3277
3278 // Get the last argument, which specifies the vector type.
3279 assert(HasExtraArg);
3280 const Expr *Arg = E->getArg(Arg: E->getNumArgs()-1);
3281 std::optional<llvm::APSInt> Result =
3282 Arg->getIntegerConstantExpr(Ctx: getContext());
3283 if (!Result)
3284 return nullptr;
3285
3286 if (BuiltinID == clang::ARM::BI__builtin_arm_vcvtr_f ||
3287 BuiltinID == clang::ARM::BI__builtin_arm_vcvtr_d) {
3288 // Determine the overloaded type of this builtin.
3289 llvm::Type *Ty;
3290 if (BuiltinID == clang::ARM::BI__builtin_arm_vcvtr_f)
3291 Ty = FloatTy;
3292 else
3293 Ty = DoubleTy;
3294
3295 // Determine whether this is an unsigned conversion or not.
3296 bool usgn = Result->getZExtValue() == 1;
3297 unsigned Int = usgn ? Intrinsic::arm_vcvtru : Intrinsic::arm_vcvtr;
3298
3299 // Call the appropriate intrinsic.
3300 Function *F = CGM.getIntrinsic(IID: Int, Tys: Ty);
3301 return Builder.CreateCall(Callee: F, Args: Ops, Name: "vcvtr");
3302 }
3303
3304 // Determine the type of this overloaded NEON intrinsic.
3305 NeonTypeFlags Type = Result->getZExtValue();
3306 bool usgn = Type.isUnsigned();
3307 bool rightShift = false;
3308
3309 llvm::FixedVectorType *VTy =
3310 GetNeonType(CGF: this, TypeFlags: Type, HasFastHalfType: getTarget().hasFastHalfType(), V1Ty: false,
3311 AllowBFloatArgsAndRet: getTarget().hasBFloat16Type());
3312 llvm::Type *Ty = VTy;
3313 if (!Ty)
3314 return nullptr;
3315
3316 // Many NEON builtins have identical semantics and uses in ARM and
3317 // AArch64. Emit these in a single function.
3318 auto IntrinsicMap = ArrayRef(ARMSIMDIntrinsicMap);
3319 const ARMVectorIntrinsicInfo *Builtin = findARMVectorIntrinsicInMap(
3320 IntrinsicMap, BuiltinID, MapProvenSorted&: NEONSIMDIntrinsicsProvenSorted);
3321 if (Builtin)
3322 return EmitCommonNeonBuiltinExpr(
3323 BuiltinID: Builtin->BuiltinID, LLVMIntrinsic: Builtin->LLVMIntrinsic, AltLLVMIntrinsic: Builtin->AltLLVMIntrinsic,
3324 NameHint: Builtin->NameHint, Modifier: Builtin->TypeModifier, E, Ops, PtrOp0, PtrOp1, Arch);
3325
3326 unsigned Int;
3327 switch (BuiltinID) {
3328 default: return nullptr;
3329 case NEON::BI__builtin_neon_vld1q_lane_v:
3330 // Handle 64-bit integer elements as a special case. Use shuffles of
3331 // one-element vectors to avoid poor code for i64 in the backend.
3332 if (VTy->getElementType()->isIntegerTy(Bitwidth: 64)) {
3333 // Extract the other lane.
3334 Ops[1] = Builder.CreateBitCast(V: Ops[1], DestTy: Ty);
3335 int Lane = cast<ConstantInt>(Val: Ops[2])->getZExtValue();
3336 Value *SV = llvm::ConstantVector::get(V: ConstantInt::get(Ty: Int32Ty, V: 1-Lane));
3337 Ops[1] = Builder.CreateShuffleVector(V1: Ops[1], V2: Ops[1], Mask: SV);
3338 // Load the value as a one-element vector.
3339 Ty = llvm::FixedVectorType::get(ElementType: VTy->getElementType(), NumElts: 1);
3340 llvm::Type *Tys[] = {Ty, Int8PtrTy};
3341 Function *F = CGM.getIntrinsic(IID: Intrinsic::arm_neon_vld1, Tys);
3342 Value *Align = getAlignmentValue32(PtrOp0);
3343 Value *Ld = Builder.CreateCall(Callee: F, Args: {Ops[0], Align});
3344 // Combine them.
3345 int Indices[] = {1 - Lane, Lane};
3346 return Builder.CreateShuffleVector(V1: Ops[1], V2: Ld, Mask: Indices, Name: "vld1q_lane");
3347 }
3348 [[fallthrough]];
3349 case NEON::BI__builtin_neon_vld1_lane_v: {
3350 Ops[1] = Builder.CreateBitCast(V: Ops[1], DestTy: Ty);
3351 PtrOp0 = PtrOp0.withElementType(ElemTy: VTy->getElementType());
3352 Value *Ld = Builder.CreateLoad(Addr: PtrOp0);
3353 return Builder.CreateInsertElement(Vec: Ops[1], NewElt: Ld, Idx: Ops[2], Name: "vld1_lane");
3354 }
3355 case NEON::BI__builtin_neon_vqrshrn_n_v:
3356 Int =
3357 usgn ? Intrinsic::arm_neon_vqrshiftnu : Intrinsic::arm_neon_vqrshiftns;
3358 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: Ty), Ops, name: "vqrshrn_n",
3359 shift: 1, rightshift: true);
3360 case NEON::BI__builtin_neon_vqrshrun_n_v:
3361 return EmitNeonCall(F: CGM.getIntrinsic(IID: Intrinsic::arm_neon_vqrshiftnsu, Tys: Ty),
3362 Ops, name: "vqrshrun_n", shift: 1, rightshift: true);
3363 case NEON::BI__builtin_neon_vqshrn_n_v:
3364 Int = usgn ? Intrinsic::arm_neon_vqshiftnu : Intrinsic::arm_neon_vqshiftns;
3365 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: Ty), Ops, name: "vqshrn_n",
3366 shift: 1, rightshift: true);
3367 case NEON::BI__builtin_neon_vqshrun_n_v:
3368 return EmitNeonCall(F: CGM.getIntrinsic(IID: Intrinsic::arm_neon_vqshiftnsu, Tys: Ty),
3369 Ops, name: "vqshrun_n", shift: 1, rightshift: true);
3370 case NEON::BI__builtin_neon_vrecpe_v:
3371 case NEON::BI__builtin_neon_vrecpeq_v:
3372 return EmitNeonCall(F: CGM.getIntrinsic(IID: Intrinsic::arm_neon_vrecpe, Tys: Ty),
3373 Ops, name: "vrecpe");
3374 case NEON::BI__builtin_neon_vrshrn_n_v:
3375 return EmitNeonCall(F: CGM.getIntrinsic(IID: Intrinsic::arm_neon_vrshiftn, Tys: Ty),
3376 Ops, name: "vrshrn_n", shift: 1, rightshift: true);
3377 case NEON::BI__builtin_neon_vrsra_n_v:
3378 case NEON::BI__builtin_neon_vrsraq_n_v:
3379 Ops[0] = Builder.CreateBitCast(V: Ops[0], DestTy: Ty);
3380 Ops[1] = Builder.CreateBitCast(V: Ops[1], DestTy: Ty);
3381 Ops[2] = EmitNeonShiftVector(V: Ops[2], Ty, neg: true);
3382 Int = usgn ? Intrinsic::arm_neon_vrshiftu : Intrinsic::arm_neon_vrshifts;
3383 Ops[1] = Builder.CreateCall(Callee: CGM.getIntrinsic(IID: Int, Tys: Ty), Args: {Ops[1], Ops[2]});
3384 return Builder.CreateAdd(LHS: Ops[0], RHS: Ops[1], Name: "vrsra_n");
3385 case NEON::BI__builtin_neon_vsri_n_v:
3386 case NEON::BI__builtin_neon_vsriq_n_v:
3387 rightShift = true;
3388 [[fallthrough]];
3389 case NEON::BI__builtin_neon_vsli_n_v:
3390 case NEON::BI__builtin_neon_vsliq_n_v:
3391 Ops[2] = EmitNeonShiftVector(V: Ops[2], Ty, neg: rightShift);
3392 return EmitNeonCall(F: CGM.getIntrinsic(IID: Intrinsic::arm_neon_vshiftins, Tys: Ty),
3393 Ops, name: "vsli_n");
3394 case NEON::BI__builtin_neon_vsra_n_v:
3395 case NEON::BI__builtin_neon_vsraq_n_v:
3396 Ops[0] = Builder.CreateBitCast(V: Ops[0], DestTy: Ty);
3397 Ops[1] = EmitNeonRShiftImm(Vec: Ops[1], Shift: Ops[2], Ty, usgn, name: "vsra_n");
3398 return Builder.CreateAdd(LHS: Ops[0], RHS: Ops[1]);
3399 case NEON::BI__builtin_neon_vst1q_lane_v:
3400 // Handle 64-bit integer elements as a special case. Use a shuffle to get
3401 // a one-element vector and avoid poor code for i64 in the backend.
3402 if (VTy->getElementType()->isIntegerTy(Bitwidth: 64)) {
3403 Ops[1] = Builder.CreateBitCast(V: Ops[1], DestTy: Ty);
3404 Value *SV = llvm::ConstantVector::get(V: cast<llvm::Constant>(Val: Ops[2]));
3405 Ops[1] = Builder.CreateShuffleVector(V1: Ops[1], V2: Ops[1], Mask: SV);
3406 Ops[2] = getAlignmentValue32(PtrOp0);
3407 llvm::Type *Tys[] = {Int8PtrTy, Ops[1]->getType()};
3408 return Builder.CreateCall(Callee: CGM.getIntrinsic(IID: Intrinsic::arm_neon_vst1,
3409 Tys), Args: Ops);
3410 }
3411 [[fallthrough]];
3412 case NEON::BI__builtin_neon_vst1_lane_v: {
3413 Ops[1] = Builder.CreateBitCast(V: Ops[1], DestTy: Ty);
3414 Ops[1] = Builder.CreateExtractElement(Vec: Ops[1], Idx: Ops[2]);
3415 return Builder.CreateStore(Val: Ops[1],
3416 Addr: PtrOp0.withElementType(ElemTy: Ops[1]->getType()));
3417 }
3418 case NEON::BI__builtin_neon_vtbl1_v:
3419 return EmitNeonCall(F: CGM.getIntrinsic(IID: Intrinsic::arm_neon_vtbl1),
3420 Ops, name: "vtbl1");
3421 case NEON::BI__builtin_neon_vtbl2_v:
3422 return EmitNeonCall(F: CGM.getIntrinsic(IID: Intrinsic::arm_neon_vtbl2),
3423 Ops, name: "vtbl2");
3424 case NEON::BI__builtin_neon_vtbl3_v:
3425 return EmitNeonCall(F: CGM.getIntrinsic(IID: Intrinsic::arm_neon_vtbl3),
3426 Ops, name: "vtbl3");
3427 case NEON::BI__builtin_neon_vtbl4_v:
3428 return EmitNeonCall(F: CGM.getIntrinsic(IID: Intrinsic::arm_neon_vtbl4),
3429 Ops, name: "vtbl4");
3430 case NEON::BI__builtin_neon_vtbx1_v:
3431 return EmitNeonCall(F: CGM.getIntrinsic(IID: Intrinsic::arm_neon_vtbx1),
3432 Ops, name: "vtbx1");
3433 case NEON::BI__builtin_neon_vtbx2_v:
3434 return EmitNeonCall(F: CGM.getIntrinsic(IID: Intrinsic::arm_neon_vtbx2),
3435 Ops, name: "vtbx2");
3436 case NEON::BI__builtin_neon_vtbx3_v:
3437 return EmitNeonCall(F: CGM.getIntrinsic(IID: Intrinsic::arm_neon_vtbx3),
3438 Ops, name: "vtbx3");
3439 case NEON::BI__builtin_neon_vtbx4_v:
3440 return EmitNeonCall(F: CGM.getIntrinsic(IID: Intrinsic::arm_neon_vtbx4),
3441 Ops, name: "vtbx4");
3442 }
3443}
3444
3445template<typename Integer>
3446static Integer GetIntegerConstantValue(const Expr *E, ASTContext &Context) {
3447 return E->getIntegerConstantExpr(Ctx: Context)->getExtValue();
3448}
3449
3450static llvm::Value *SignOrZeroExtend(CGBuilderTy &Builder, llvm::Value *V,
3451 llvm::Type *T, bool Unsigned) {
3452 // Helper function called by Tablegen-constructed ARM MVE builtin codegen,
3453 // which finds it convenient to specify signed/unsigned as a boolean flag.
3454 return Unsigned ? Builder.CreateZExt(V, DestTy: T) : Builder.CreateSExt(V, DestTy: T);
3455}
3456
3457static llvm::Value *MVEImmediateShr(CGBuilderTy &Builder, llvm::Value *V,
3458 uint32_t Shift, bool Unsigned) {
3459 // MVE helper function for integer shift right. This must handle signed vs
3460 // unsigned, and also deal specially with the case where the shift count is
3461 // equal to the lane size. In LLVM IR, an LShr with that parameter would be
3462 // undefined behavior, but in MVE it's legal, so we must convert it to code
3463 // that is not undefined in IR.
3464 unsigned LaneBits = cast<llvm::VectorType>(Val: V->getType())
3465 ->getElementType()
3466 ->getPrimitiveSizeInBits();
3467 if (Shift == LaneBits) {
3468 // An unsigned shift of the full lane size always generates zero, so we can
3469 // simply emit a zero vector. A signed shift of the full lane size does the
3470 // same thing as shifting by one bit fewer.
3471 if (Unsigned)
3472 return llvm::Constant::getNullValue(Ty: V->getType());
3473 else
3474 --Shift;
3475 }
3476 return Unsigned ? Builder.CreateLShr(LHS: V, RHS: Shift) : Builder.CreateAShr(LHS: V, RHS: Shift);
3477}
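// For example, for 8-bit lanes a shift count of 8 is handled here: the
// unsigned form folds to a zero vector, while the signed form is emitted as
// an ashr by 7, matching MVE semantics without creating poison in the IR.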
3478
3479static llvm::Value *ARMMVEVectorSplat(CGBuilderTy &Builder, llvm::Value *V) {
3480 // MVE-specific helper function for a vector splat, which infers the element
3481 // count of the output vector by knowing that MVE vectors are all 128 bits
3482 // wide.
3483 unsigned Elements = 128 / V->getType()->getPrimitiveSizeInBits();
3484 return Builder.CreateVectorSplat(NumElts: Elements, V);
3485}
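// For example, splatting an i16 produces an 8-element vector (128 / 16) and
// splatting an i32 produces a 4-element vector.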
3486
3487static llvm::Value *ARMMVEVectorReinterpret(CGBuilderTy &Builder,
3488 CodeGenFunction *CGF,
3489 llvm::Value *V,
3490 llvm::Type *DestType) {
3491 // Convert one MVE vector type into another by reinterpreting its in-register
3492 // format.
3493 //
3494 // On little-endian targets this is identical to a bitcast (which
3495 // reinterprets the memory format). On big-endian targets the two are not
3496 // necessarily the same, because the register and memory formats map to each
3497 // other differently depending on the lane size.
3498 //
3499 // We generate a bitcast whenever we can (if we're little-endian, or if the
3500 // lane sizes are the same anyway). Otherwise we fall back to an IR intrinsic
3501 // that performs the different kind of reinterpretation.
3502 if (CGF->getTarget().isBigEndian() &&
3503 V->getType()->getScalarSizeInBits() != DestType->getScalarSizeInBits()) {
3504 return Builder.CreateCall(
3505 Callee: CGF->CGM.getIntrinsic(IID: Intrinsic::arm_mve_vreinterpretq,
3506 Tys: {DestType, V->getType()}),
3507 Args: V);
3508 } else {
3509 return Builder.CreateBitCast(V, DestTy: DestType);
3510 }
3511}
3512
3513static llvm::Value *VectorUnzip(CGBuilderTy &Builder, llvm::Value *V, bool Odd) {
3514 // Make a shufflevector that extracts every other element of a vector (evens
3515 // or odds, as desired).
3516 SmallVector<int, 16> Indices;
3517 unsigned InputElements =
3518 cast<llvm::FixedVectorType>(Val: V->getType())->getNumElements();
3519 for (unsigned i = 0; i < InputElements; i += 2)
3520 Indices.push_back(Elt: i + Odd);
3521 return Builder.CreateShuffleVector(V, Mask: Indices);
3522}
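// For an 8-element input, Odd=false selects indices {0,2,4,6} and Odd=true
// selects {1,3,5,7}.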
3523
3524static llvm::Value *VectorZip(CGBuilderTy &Builder, llvm::Value *V0,
3525 llvm::Value *V1) {
3526 // Make a shufflevector that interleaves two vectors element by element.
3527 assert(V0->getType() == V1->getType() && "Can't zip different vector types");
3528 SmallVector<int, 16> Indices;
3529 unsigned InputElements =
3530 cast<llvm::FixedVectorType>(Val: V0->getType())->getNumElements();
3531 for (unsigned i = 0; i < InputElements; i++) {
3532 Indices.push_back(Elt: i);
3533 Indices.push_back(Elt: i + InputElements);
3534 }
3535 return Builder.CreateShuffleVector(V1: V0, V2: V1, Mask: Indices);
3536}
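// For two 4-element inputs this produces the shuffle mask {0,4,1,5,2,6,3,7},
// i.e. elements of V0 and V1 alternate in the result.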
3537
3538template<unsigned HighBit, unsigned OtherBits>
3539static llvm::Value *ARMMVEConstantSplat(CGBuilderTy &Builder, llvm::Type *VT) {
3540 // MVE-specific helper function to make a vector splat of a constant such as
3541 // UINT_MAX or INT_MIN, in which all bits below the highest one are equal.
3542 llvm::Type *T = cast<llvm::VectorType>(Val: VT)->getElementType();
3543 unsigned LaneBits = T->getPrimitiveSizeInBits();
3544 uint32_t Value = HighBit << (LaneBits - 1);
3545 if (OtherBits)
3546 Value |= (1UL << (LaneBits - 1)) - 1;
3547 llvm::Value *Lane = llvm::ConstantInt::get(Ty: T, V: Value);
3548 return ARMMVEVectorSplat(Builder, V: Lane);
3549}
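// With 16-bit lanes: <1,0> yields 0x8000 (INT16_MIN), <0,1> yields 0x7fff
// (INT16_MAX), and <1,1> yields 0xffff (UINT16_MAX), each splatted across the
// 128-bit vector.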
3550
3551static llvm::Value *ARMMVEVectorElementReverse(CGBuilderTy &Builder,
3552 llvm::Value *V,
3553 unsigned ReverseWidth) {
3554 // MVE-specific helper function which reverses the elements of a
3555 // vector within every (ReverseWidth)-bit collection of lanes.
3556 SmallVector<int, 16> Indices;
3557 unsigned LaneSize = V->getType()->getScalarSizeInBits();
3558 unsigned Elements = 128 / LaneSize;
3559 unsigned Mask = ReverseWidth / LaneSize - 1;
3560 for (unsigned i = 0; i < Elements; i++)
3561 Indices.push_back(Elt: i ^ Mask);
3562 return Builder.CreateShuffleVector(V, Mask: Indices);
3563}
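// For example, with 8-bit lanes and ReverseWidth == 32 the mask is 3, so the
// shuffle indices are {3,2,1,0, 7,6,5,4, ...}, reversing the bytes within
// each 32-bit group.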
3564
3565static llvm::Value *ARMMVECreateSIToFP(CGBuilderTy &Builder,
3566 CodeGenFunction *CGF, llvm::Value *V,
3567 llvm::Type *Ty) {
3568 return Builder.CreateCall(
3569 Callee: CGF->CGM.getIntrinsic(IID: Intrinsic::arm_mve_vcvt_fp_int, Tys: {Ty, V->getType()}),
3570 Args: {V, llvm::ConstantInt::get(Ty: Builder.getInt32Ty(), V: 0)});
3571}
3572
3573static llvm::Value *ARMMVECreateUIToFP(CGBuilderTy &Builder,
3574 CodeGenFunction *CGF, llvm::Value *V,
3575 llvm::Type *Ty) {
3576 return Builder.CreateCall(
3577 Callee: CGF->CGM.getIntrinsic(IID: Intrinsic::arm_mve_vcvt_fp_int, Tys: {Ty, V->getType()}),
3578 Args: {V, llvm::ConstantInt::get(Ty: Builder.getInt32Ty(), V: 1)});
3579}
3580
3581static llvm::Value *ARMMVECreateFPToSI(CGBuilderTy &Builder,
3582 CodeGenFunction *CGF, llvm::Value *V,
3583 llvm::Type *Ty) {
3584 return Builder.CreateCall(
3585 Callee: CGF->CGM.getIntrinsic(IID: Intrinsic::arm_mve_vcvt_int_fp, Tys: {Ty, V->getType()}),
3586 Args: {V, llvm::ConstantInt::get(Ty: Builder.getInt32Ty(), V: 0)});
3587}
3588
3589static llvm::Value *ARMMVECreateFPToUI(CGBuilderTy &Builder,
3590 CodeGenFunction *CGF, llvm::Value *V,
3591 llvm::Type *Ty) {
3592 return Builder.CreateCall(
3593 Callee: CGF->CGM.getIntrinsic(IID: Intrinsic::arm_mve_vcvt_int_fp, Tys: {Ty, V->getType()}),
3594 Args: {V, llvm::ConstantInt::get(Ty: Builder.getInt32Ty(), V: 1)});
3595}
3596
3597Value *CodeGenFunction::EmitARMMVEBuiltinExpr(unsigned BuiltinID,
3598 const CallExpr *E,
3599 ReturnValueSlot ReturnValue,
3600 llvm::Triple::ArchType Arch) {
3601 enum class CustomCodeGen { VLD24, VST24 } CustomCodeGenType;
3602 Intrinsic::ID IRIntr;
3603 unsigned NumVectors;
3604
3605 // Code autogenerated by Tablegen will handle all the simple builtins.
3606 switch (BuiltinID) {
3607 #include "clang/Basic/arm_mve_builtin_cg.inc"
3608
3609 // If we didn't match an MVE builtin id at all, go back to the
3610 // main EmitARMBuiltinExpr.
3611 default:
3612 return nullptr;
3613 }
3614
3615 // Anything that breaks from that switch is an MVE builtin that
3616 // needs handwritten code to generate.
3617
3618 switch (CustomCodeGenType) {
3619
3620 case CustomCodeGen::VLD24: {
3621 llvm::SmallVector<Value *, 4> Ops;
3622 llvm::SmallVector<llvm::Type *, 4> Tys;
3623
3624 auto MvecCType = E->getType();
3625 auto MvecLType = ConvertType(T: MvecCType);
3626 assert(MvecLType->isStructTy() &&
3627 "Return type for vld[24]q should be a struct");
3628 assert(MvecLType->getStructNumElements() == 1 &&
3629 "Return-type struct for vld[24]q should have one element");
3630 auto MvecLTypeInner = MvecLType->getStructElementType(N: 0);
3631 assert(MvecLTypeInner->isArrayTy() &&
3632 "Return-type struct for vld[24]q should contain an array");
3633 assert(MvecLTypeInner->getArrayNumElements() == NumVectors &&
3634 "Array member of return-type struct vld[24]q has wrong length");
3635 auto VecLType = MvecLTypeInner->getArrayElementType();
3636
3637 Tys.push_back(Elt: VecLType);
3638
3639 auto Addr = E->getArg(Arg: 0);
3640 Ops.push_back(Elt: EmitScalarExpr(E: Addr));
3641 Tys.push_back(Elt: ConvertType(T: Addr->getType()));
3642
3643 Function *F = CGM.getIntrinsic(IID: IRIntr, Tys: ArrayRef(Tys));
3644 Value *LoadResult = Builder.CreateCall(Callee: F, Args: Ops);
3645 Value *MvecOut = PoisonValue::get(T: MvecLType);
3646 for (unsigned i = 0; i < NumVectors; ++i) {
3647 Value *Vec = Builder.CreateExtractValue(Agg: LoadResult, Idxs: i);
3648 MvecOut = Builder.CreateInsertValue(Agg: MvecOut, Val: Vec, Idxs: {0, i});
3649 }
3650
3651 if (ReturnValue.isNull())
3652 return MvecOut;
3653 else
3654 return Builder.CreateStore(Val: MvecOut, Addr: ReturnValue.getAddress());
3655 }
3656
3657 case CustomCodeGen::VST24: {
3658 llvm::SmallVector<Value *, 4> Ops;
3659 llvm::SmallVector<llvm::Type *, 4> Tys;
3660
3661 auto Addr = E->getArg(Arg: 0);
3662 Ops.push_back(Elt: EmitScalarExpr(E: Addr));
3663 Tys.push_back(Elt: ConvertType(T: Addr->getType()));
3664
3665 auto MvecCType = E->getArg(Arg: 1)->getType();
3666 auto MvecLType = ConvertType(T: MvecCType);
3667 assert(MvecLType->isStructTy() && "Data type for vst2q should be a struct");
3668 assert(MvecLType->getStructNumElements() == 1 &&
3669 "Data-type struct for vst2q should have one element");
3670 auto MvecLTypeInner = MvecLType->getStructElementType(N: 0);
3671 assert(MvecLTypeInner->isArrayTy() &&
3672 "Data-type struct for vst2q should contain an array");
3673 assert(MvecLTypeInner->getArrayNumElements() == NumVectors &&
3674 "Array member of return-type struct vld[24]q has wrong length");
3675 auto VecLType = MvecLTypeInner->getArrayElementType();
3676
3677 Tys.push_back(Elt: VecLType);
3678
3679 AggValueSlot MvecSlot = CreateAggTemp(T: MvecCType);
3680 EmitAggExpr(E: E->getArg(Arg: 1), AS: MvecSlot);
3681 auto Mvec = Builder.CreateLoad(Addr: MvecSlot.getAddress());
3682 for (unsigned i = 0; i < NumVectors; i++)
3683 Ops.push_back(Elt: Builder.CreateExtractValue(Agg: Mvec, Idxs: {0, i}));
3684
3685 Function *F = CGM.getIntrinsic(IID: IRIntr, Tys: ArrayRef(Tys));
3686 Value *ToReturn = nullptr;
3687 for (unsigned i = 0; i < NumVectors; i++) {
3688 Ops.push_back(Elt: llvm::ConstantInt::get(Ty: Int32Ty, V: i));
3689 ToReturn = Builder.CreateCall(Callee: F, Args: Ops);
3690 Ops.pop_back();
3691 }
3692 return ToReturn;
3693 }
3694 }
3695 llvm_unreachable("unknown custom codegen type.");
3696}
3697
3698Value *CodeGenFunction::EmitARMCDEBuiltinExpr(unsigned BuiltinID,
3699 const CallExpr *E,
3700 ReturnValueSlot ReturnValue,
3701 llvm::Triple::ArchType Arch) {
3702 switch (BuiltinID) {
3703 default:
3704 return nullptr;
3705#include "clang/Basic/arm_cde_builtin_cg.inc"
3706 }
3707}
3708
3709static Value *EmitAArch64TblBuiltinExpr(CodeGenFunction &CGF, unsigned BuiltinID,
3710 const CallExpr *E,
3711 SmallVectorImpl<Value *> &Ops,
3712 llvm::Triple::ArchType Arch) {
3713 unsigned int Int = 0;
3714 const char *s = nullptr;
3715
3716 switch (BuiltinID) {
3717 default:
3718 return nullptr;
3719 case NEON::BI__builtin_neon_vtbl1_v:
3720 case NEON::BI__builtin_neon_vqtbl1_v:
3721 case NEON::BI__builtin_neon_vqtbl1q_v:
3722 case NEON::BI__builtin_neon_vtbl2_v:
3723 case NEON::BI__builtin_neon_vqtbl2_v:
3724 case NEON::BI__builtin_neon_vqtbl2q_v:
3725 case NEON::BI__builtin_neon_vtbl3_v:
3726 case NEON::BI__builtin_neon_vqtbl3_v:
3727 case NEON::BI__builtin_neon_vqtbl3q_v:
3728 case NEON::BI__builtin_neon_vtbl4_v:
3729 case NEON::BI__builtin_neon_vqtbl4_v:
3730 case NEON::BI__builtin_neon_vqtbl4q_v:
3731 break;
3732 case NEON::BI__builtin_neon_vtbx1_v:
3733 case NEON::BI__builtin_neon_vqtbx1_v:
3734 case NEON::BI__builtin_neon_vqtbx1q_v:
3735 case NEON::BI__builtin_neon_vtbx2_v:
3736 case NEON::BI__builtin_neon_vqtbx2_v:
3737 case NEON::BI__builtin_neon_vqtbx2q_v:
3738 case NEON::BI__builtin_neon_vtbx3_v:
3739 case NEON::BI__builtin_neon_vqtbx3_v:
3740 case NEON::BI__builtin_neon_vqtbx3q_v:
3741 case NEON::BI__builtin_neon_vtbx4_v:
3742 case NEON::BI__builtin_neon_vqtbx4_v:
3743 case NEON::BI__builtin_neon_vqtbx4q_v:
3744 break;
3745 }
3746
3747 assert(E->getNumArgs() >= 3);
3748
3749 // Get the last argument, which specifies the vector type.
3750 const Expr *Arg = E->getArg(Arg: E->getNumArgs() - 1);
3751 std::optional<llvm::APSInt> Result =
3752 Arg->getIntegerConstantExpr(Ctx: CGF.getContext());
3753 if (!Result)
3754 return nullptr;
3755
3756 // Determine the type of this overloaded NEON intrinsic.
3757 NeonTypeFlags Type = Result->getZExtValue();
3758 llvm::FixedVectorType *Ty = GetNeonType(CGF: &CGF, TypeFlags: Type);
3759 if (!Ty)
3760 return nullptr;
3761
3762 CodeGen::CGBuilderTy &Builder = CGF.Builder;
3763
3764 // AArch64 scalar builtins are not overloaded; they do not have an extra
3765 // argument specifying the vector type, so each case must be handled individually.
3766 switch (BuiltinID) {
3767 case NEON::BI__builtin_neon_vtbl1_v: {
3768 return packTBLDVectorList(CGF, Ops: ArrayRef(Ops).slice(N: 0, M: 1), ExtOp: nullptr, IndexOp: Ops[1],
3769 ResTy: Ty, IntID: Intrinsic::aarch64_neon_tbl1, Name: "vtbl1");
3770 }
3771 case NEON::BI__builtin_neon_vtbl2_v: {
3772 return packTBLDVectorList(CGF, Ops: ArrayRef(Ops).slice(N: 0, M: 2), ExtOp: nullptr, IndexOp: Ops[2],
3773 ResTy: Ty, IntID: Intrinsic::aarch64_neon_tbl1, Name: "vtbl1");
3774 }
3775 case NEON::BI__builtin_neon_vtbl3_v: {
3776 return packTBLDVectorList(CGF, Ops: ArrayRef(Ops).slice(N: 0, M: 3), ExtOp: nullptr, IndexOp: Ops[3],
3777 ResTy: Ty, IntID: Intrinsic::aarch64_neon_tbl2, Name: "vtbl2");
3778 }
3779 case NEON::BI__builtin_neon_vtbl4_v: {
3780 return packTBLDVectorList(CGF, Ops: ArrayRef(Ops).slice(N: 0, M: 4), ExtOp: nullptr, IndexOp: Ops[4],
3781 ResTy: Ty, IntID: Intrinsic::aarch64_neon_tbl2, Name: "vtbl2");
3782 }
3783 case NEON::BI__builtin_neon_vtbx1_v: {
3784 Value *TblRes =
3785 packTBLDVectorList(CGF, Ops: ArrayRef(Ops).slice(N: 1, M: 1), ExtOp: nullptr, IndexOp: Ops[2], ResTy: Ty,
3786 IntID: Intrinsic::aarch64_neon_tbl1, Name: "vtbl1");
3787
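    // vtbx1 semantics: indices in [0,8) take the looked-up byte, while
    // out-of-range indices preserve the corresponding byte of the destination
    // operand. Emulate that with a compare-against-8 mask used to blend the
    // tbl1 result with Ops[0].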
3788 llvm::Constant *EightV = ConstantInt::get(Ty, V: 8);
3789 Value *CmpRes = Builder.CreateICmp(P: ICmpInst::ICMP_UGE, LHS: Ops[2], RHS: EightV);
3790 CmpRes = Builder.CreateSExt(V: CmpRes, DestTy: Ty);
3791
3792 Value *EltsFromInput = Builder.CreateAnd(LHS: CmpRes, RHS: Ops[0]);
3793 Value *EltsFromTbl = Builder.CreateAnd(LHS: Builder.CreateNot(V: CmpRes), RHS: TblRes);
3794 return Builder.CreateOr(LHS: EltsFromInput, RHS: EltsFromTbl, Name: "vtbx");
3795 }
3796 case NEON::BI__builtin_neon_vtbx2_v: {
3797 return packTBLDVectorList(CGF, Ops: ArrayRef(Ops).slice(N: 1, M: 2), ExtOp: Ops[0], IndexOp: Ops[3],
3798 ResTy: Ty, IntID: Intrinsic::aarch64_neon_tbx1, Name: "vtbx1");
3799 }
3800 case NEON::BI__builtin_neon_vtbx3_v: {
3801 Value *TblRes =
3802 packTBLDVectorList(CGF, Ops: ArrayRef(Ops).slice(N: 1, M: 3), ExtOp: nullptr, IndexOp: Ops[4], ResTy: Ty,
3803 IntID: Intrinsic::aarch64_neon_tbl2, Name: "vtbl2");
3804
3805 llvm::Constant *TwentyFourV = ConstantInt::get(Ty, V: 24);
3806 Value *CmpRes = Builder.CreateICmp(P: ICmpInst::ICMP_UGE, LHS: Ops[4],
3807 RHS: TwentyFourV);
3808 CmpRes = Builder.CreateSExt(V: CmpRes, DestTy: Ty);
3809
3810 Value *EltsFromInput = Builder.CreateAnd(LHS: CmpRes, RHS: Ops[0]);
3811 Value *EltsFromTbl = Builder.CreateAnd(LHS: Builder.CreateNot(V: CmpRes), RHS: TblRes);
3812 return Builder.CreateOr(LHS: EltsFromInput, RHS: EltsFromTbl, Name: "vtbx");
3813 }
3814 case NEON::BI__builtin_neon_vtbx4_v: {
3815 return packTBLDVectorList(CGF, Ops: ArrayRef(Ops).slice(N: 1, M: 4), ExtOp: Ops[0], IndexOp: Ops[5],
3816 ResTy: Ty, IntID: Intrinsic::aarch64_neon_tbx2, Name: "vtbx2");
3817 }
3818 case NEON::BI__builtin_neon_vqtbl1_v:
3819 case NEON::BI__builtin_neon_vqtbl1q_v:
3820 Int = Intrinsic::aarch64_neon_tbl1; s = "vtbl1"; break;
3821 case NEON::BI__builtin_neon_vqtbl2_v:
3822 case NEON::BI__builtin_neon_vqtbl2q_v: {
3823 Int = Intrinsic::aarch64_neon_tbl2; s = "vtbl2"; break;
3824 case NEON::BI__builtin_neon_vqtbl3_v:
3825 case NEON::BI__builtin_neon_vqtbl3q_v:
3826 Int = Intrinsic::aarch64_neon_tbl3; s = "vtbl3"; break;
3827 case NEON::BI__builtin_neon_vqtbl4_v:
3828 case NEON::BI__builtin_neon_vqtbl4q_v:
3829 Int = Intrinsic::aarch64_neon_tbl4; s = "vtbl4"; break;
3830 case NEON::BI__builtin_neon_vqtbx1_v:
3831 case NEON::BI__builtin_neon_vqtbx1q_v:
3832 Int = Intrinsic::aarch64_neon_tbx1; s = "vtbx1"; break;
3833 case NEON::BI__builtin_neon_vqtbx2_v:
3834 case NEON::BI__builtin_neon_vqtbx2q_v:
3835 Int = Intrinsic::aarch64_neon_tbx2; s = "vtbx2"; break;
3836 case NEON::BI__builtin_neon_vqtbx3_v:
3837 case NEON::BI__builtin_neon_vqtbx3q_v:
3838 Int = Intrinsic::aarch64_neon_tbx3; s = "vtbx3"; break;
3839 case NEON::BI__builtin_neon_vqtbx4_v:
3840 case NEON::BI__builtin_neon_vqtbx4q_v:
3841 Int = Intrinsic::aarch64_neon_tbx4; s = "vtbx4"; break;
3842 }
3843 }
3844
3845 if (!Int)
3846 return nullptr;
3847
3848 Function *F = CGF.CGM.getIntrinsic(IID: Int, Tys: Ty);
3849 return CGF.EmitNeonCall(F, Ops, name: s);
3850}
3851
3852Value *CodeGenFunction::vectorWrapScalar16(Value *Op) {
3853 auto *VTy = llvm::FixedVectorType::get(ElementType: Int16Ty, NumElts: 4);
3854 Op = Builder.CreateBitCast(V: Op, DestTy: Int16Ty);
3855 Value *V = PoisonValue::get(T: VTy);
3856 llvm::Constant *CI = ConstantInt::get(Ty: SizeTy, V: 0);
3857 Op = Builder.CreateInsertElement(Vec: V, NewElt: Op, Idx: CI);
3858 return Op;
3859}
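// The scalar is bitcast to i16 and inserted into lane 0 of a <4 x i16>; the
// remaining lanes are poison.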
3860
3861/// SVEBuiltinMemEltTy - Returns the memory element type for this memory
3862/// access builtin. Only required if it can't be inferred from the base pointer
3863/// operand.
3864llvm::Type *CodeGenFunction::SVEBuiltinMemEltTy(const SVETypeFlags &TypeFlags) {
3865 switch (TypeFlags.getMemEltType()) {
3866 case SVETypeFlags::MemEltTyDefault:
3867 return getEltType(TypeFlags);
3868 case SVETypeFlags::MemEltTyInt8:
3869 return Builder.getInt8Ty();
3870 case SVETypeFlags::MemEltTyInt16:
3871 return Builder.getInt16Ty();
3872 case SVETypeFlags::MemEltTyInt32:
3873 return Builder.getInt32Ty();
3874 case SVETypeFlags::MemEltTyInt64:
3875 return Builder.getInt64Ty();
3876 }
3877 llvm_unreachable("Unknown MemEltType");
3878}
3879
3880llvm::Type *CodeGenFunction::getEltType(const SVETypeFlags &TypeFlags) {
3881 switch (TypeFlags.getEltType()) {
3882 default:
3883 llvm_unreachable("Invalid SVETypeFlag!");
3884
3885 case SVETypeFlags::EltTyMFloat8:
3886 case SVETypeFlags::EltTyInt8:
3887 return Builder.getInt8Ty();
3888 case SVETypeFlags::EltTyInt16:
3889 return Builder.getInt16Ty();
3890 case SVETypeFlags::EltTyInt32:
3891 return Builder.getInt32Ty();
3892 case SVETypeFlags::EltTyInt64:
3893 return Builder.getInt64Ty();
3894 case SVETypeFlags::EltTyInt128:
3895 return Builder.getInt128Ty();
3896
3897 case SVETypeFlags::EltTyFloat16:
3898 return Builder.getHalfTy();
3899 case SVETypeFlags::EltTyFloat32:
3900 return Builder.getFloatTy();
3901 case SVETypeFlags::EltTyFloat64:
3902 return Builder.getDoubleTy();
3903
3904 case SVETypeFlags::EltTyBFloat16:
3905 return Builder.getBFloatTy();
3906
3907 case SVETypeFlags::EltTyBool8:
3908 case SVETypeFlags::EltTyBool16:
3909 case SVETypeFlags::EltTyBool32:
3910 case SVETypeFlags::EltTyBool64:
3911 return Builder.getInt1Ty();
3912 }
3913}
3914
3915// Return the llvm predicate vector type corresponding to the specified element
3916// TypeFlags.
3917llvm::ScalableVectorType *
3918CodeGenFunction::getSVEPredType(const SVETypeFlags &TypeFlags) {
3919 switch (TypeFlags.getEltType()) {
3920 default: llvm_unreachable("Unhandled SVETypeFlag!");
3921
3922 case SVETypeFlags::EltTyInt8:
3923 return llvm::ScalableVectorType::get(ElementType: Builder.getInt1Ty(), MinNumElts: 16);
3924 case SVETypeFlags::EltTyInt16:
3925 return llvm::ScalableVectorType::get(ElementType: Builder.getInt1Ty(), MinNumElts: 8);
3926 case SVETypeFlags::EltTyInt32:
3927 return llvm::ScalableVectorType::get(ElementType: Builder.getInt1Ty(), MinNumElts: 4);
3928 case SVETypeFlags::EltTyInt64:
3929 return llvm::ScalableVectorType::get(ElementType: Builder.getInt1Ty(), MinNumElts: 2);
3930
3931 case SVETypeFlags::EltTyBFloat16:
3932 return llvm::ScalableVectorType::get(ElementType: Builder.getInt1Ty(), MinNumElts: 8);
3933 case SVETypeFlags::EltTyFloat16:
3934 return llvm::ScalableVectorType::get(ElementType: Builder.getInt1Ty(), MinNumElts: 8);
3935 case SVETypeFlags::EltTyFloat32:
3936 return llvm::ScalableVectorType::get(ElementType: Builder.getInt1Ty(), MinNumElts: 4);
3937 case SVETypeFlags::EltTyFloat64:
3938 return llvm::ScalableVectorType::get(ElementType: Builder.getInt1Ty(), MinNumElts: 2);
3939
3940 case SVETypeFlags::EltTyBool8:
3941 return llvm::ScalableVectorType::get(ElementType: Builder.getInt1Ty(), MinNumElts: 16);
3942 case SVETypeFlags::EltTyBool16:
3943 return llvm::ScalableVectorType::get(ElementType: Builder.getInt1Ty(), MinNumElts: 8);
3944 case SVETypeFlags::EltTyBool32:
3945 return llvm::ScalableVectorType::get(ElementType: Builder.getInt1Ty(), MinNumElts: 4);
3946 case SVETypeFlags::EltTyBool64:
3947 return llvm::ScalableVectorType::get(ElementType: Builder.getInt1Ty(), MinNumElts: 2);
3948 }
3949}
3950
3951// Return the llvm vector type corresponding to the specified element TypeFlags.
3952llvm::ScalableVectorType *
3953CodeGenFunction::getSVEType(const SVETypeFlags &TypeFlags) {
3954 switch (TypeFlags.getEltType()) {
3955 default:
3956 llvm_unreachable("Invalid SVETypeFlag!");
3957
3958 case SVETypeFlags::EltTyInt8:
3959 return llvm::ScalableVectorType::get(ElementType: Builder.getInt8Ty(), MinNumElts: 16);
3960 case SVETypeFlags::EltTyInt16:
3961 return llvm::ScalableVectorType::get(ElementType: Builder.getInt16Ty(), MinNumElts: 8);
3962 case SVETypeFlags::EltTyInt32:
3963 return llvm::ScalableVectorType::get(ElementType: Builder.getInt32Ty(), MinNumElts: 4);
3964 case SVETypeFlags::EltTyInt64:
3965 return llvm::ScalableVectorType::get(ElementType: Builder.getInt64Ty(), MinNumElts: 2);
3966
3967 case SVETypeFlags::EltTyMFloat8:
3968 return llvm::ScalableVectorType::get(ElementType: Builder.getInt8Ty(), MinNumElts: 16);
3969 case SVETypeFlags::EltTyFloat16:
3970 return llvm::ScalableVectorType::get(ElementType: Builder.getHalfTy(), MinNumElts: 8);
3971 case SVETypeFlags::EltTyBFloat16:
3972 return llvm::ScalableVectorType::get(ElementType: Builder.getBFloatTy(), MinNumElts: 8);
3973 case SVETypeFlags::EltTyFloat32:
3974 return llvm::ScalableVectorType::get(ElementType: Builder.getFloatTy(), MinNumElts: 4);
3975 case SVETypeFlags::EltTyFloat64:
3976 return llvm::ScalableVectorType::get(ElementType: Builder.getDoubleTy(), MinNumElts: 2);
3977
3978 case SVETypeFlags::EltTyBool8:
3979 return llvm::ScalableVectorType::get(ElementType: Builder.getInt1Ty(), MinNumElts: 16);
3980 case SVETypeFlags::EltTyBool16:
3981 return llvm::ScalableVectorType::get(ElementType: Builder.getInt1Ty(), MinNumElts: 8);
3982 case SVETypeFlags::EltTyBool32:
3983 return llvm::ScalableVectorType::get(ElementType: Builder.getInt1Ty(), MinNumElts: 4);
3984 case SVETypeFlags::EltTyBool64:
3985 return llvm::ScalableVectorType::get(ElementType: Builder.getInt1Ty(), MinNumElts: 2);
3986 }
3987}
3988
3989llvm::Value *
3990CodeGenFunction::EmitSVEAllTruePred(const SVETypeFlags &TypeFlags) {
3991 Function *Ptrue =
3992 CGM.getIntrinsic(IID: Intrinsic::aarch64_sve_ptrue, Tys: getSVEPredType(TypeFlags));
3993 return Builder.CreateCall(Callee: Ptrue, Args: {Builder.getInt32(/*SV_ALL*/ C: 31)});
3994}
3995
3996constexpr unsigned SVEBitsPerBlock = 128;
3997
3998static llvm::ScalableVectorType *getSVEVectorForElementType(llvm::Type *EltTy) {
3999 unsigned NumElts = SVEBitsPerBlock / EltTy->getScalarSizeInBits();
4000 return llvm::ScalableVectorType::get(ElementType: EltTy, MinNumElts: NumElts);
4001}
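// For example, an i32 element type maps to <vscale x 4 x i32> and an i64
// element type maps to <vscale x 2 x i64>, since SVEBitsPerBlock is 128.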
4002
4003// Reinterpret the input predicate so that it can be used to correctly isolate
4004// the elements of the specified datatype.
4005Value *CodeGenFunction::EmitSVEPredicateCast(Value *Pred,
4006 llvm::ScalableVectorType *VTy) {
4007
4008 if (isa<TargetExtType>(Val: Pred->getType()) &&
4009 cast<TargetExtType>(Val: Pred->getType())->getName() == "aarch64.svcount")
4010 return Pred;
4011
4012 auto *RTy = llvm::VectorType::get(ElementType: IntegerType::get(C&: getLLVMContext(), NumBits: 1), Other: VTy);
4013 if (Pred->getType() == RTy)
4014 return Pred;
4015
4016 unsigned IntID;
4017 llvm::Type *IntrinsicTy;
4018 switch (VTy->getMinNumElements()) {
4019 default:
4020 llvm_unreachable("unsupported element count!");
4021 case 1:
4022 case 2:
4023 case 4:
4024 case 8:
4025 IntID = Intrinsic::aarch64_sve_convert_from_svbool;
4026 IntrinsicTy = RTy;
4027 break;
4028 case 16:
4029 IntID = Intrinsic::aarch64_sve_convert_to_svbool;
4030 IntrinsicTy = Pred->getType();
4031 break;
4032 }
4033
4034 Function *F = CGM.getIntrinsic(IID: IntID, Tys: IntrinsicTy);
4035 Value *C = Builder.CreateCall(Callee: F, Args: Pred);
4036 assert(C->getType() == RTy && "Unexpected return type!");
4037 return C;
4038}
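// For example, an svbool_t predicate arriving as <vscale x 16 x i1> is
// narrowed to <vscale x 2 x i1> for 64-bit data via
// llvm.aarch64.sve.convert.from.svbool, while widening a narrower predicate
// back to 16 lanes uses llvm.aarch64.sve.convert.to.svbool.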
4039
4040Value *CodeGenFunction::EmitSVEPredicateTupleCast(Value *PredTuple,
4041 llvm::StructType *Ty) {
4042 if (PredTuple->getType() == Ty)
4043 return PredTuple;
4044
4045 Value *Ret = llvm::PoisonValue::get(T: Ty);
4046 for (unsigned I = 0; I < Ty->getNumElements(); ++I) {
4047 Value *Pred = Builder.CreateExtractValue(Agg: PredTuple, Idxs: I);
4048 Pred = EmitSVEPredicateCast(
4049 Pred, VTy: cast<llvm::ScalableVectorType>(Val: Ty->getTypeAtIndex(N: I)));
4050 Ret = Builder.CreateInsertValue(Agg: Ret, Val: Pred, Idxs: I);
4051 }
4052
4053 return Ret;
4054}
4055
4056Value *CodeGenFunction::EmitSVEGatherLoad(const SVETypeFlags &TypeFlags,
4057 SmallVectorImpl<Value *> &Ops,
4058 unsigned IntID) {
4059 auto *ResultTy = getSVEType(TypeFlags);
4060 auto *OverloadedTy =
4061 llvm::ScalableVectorType::get(ElementType: SVEBuiltinMemEltTy(TypeFlags), SVTy: ResultTy);
4062
4063 Function *F = nullptr;
4064 if (Ops[1]->getType()->isVectorTy())
4065 // This is the "vector base, scalar offset" case. In order to uniquely
4066 // map this built-in to an LLVM IR intrinsic, we need both the return type
4067 // and the type of the vector base.
4068 F = CGM.getIntrinsic(IID: IntID, Tys: {OverloadedTy, Ops[1]->getType()});
4069 else
4070 // This is the "scalar base, vector offset" case. The type of the offset
4071 // is encoded in the name of the intrinsic. We only need to specify the
4072 // return type in order to uniquely map this built-in to an LLVM IR
4073 // intrinsic.
4074 F = CGM.getIntrinsic(IID: IntID, Tys: OverloadedTy);
4075
4076 // At the ACLE level there's only one predicate type, svbool_t, which is
4077 // mapped to <n x 16 x i1>. However, this might be incompatible with the
4078 // actual type being loaded. For example, when loading doubles (i64) the
4079 // predicate should be <n x 2 x i1> instead. At the IR level the type of
4080 // the predicate and the data being loaded must match. Cast to the type
4081 // expected by the intrinsic. The intrinsic itself should be defined in
4082 // a way that enforces relations between parameter types.
4083 Ops[0] = EmitSVEPredicateCast(
4084 Pred: Ops[0], VTy: cast<llvm::ScalableVectorType>(Val: F->getArg(i: 0)->getType()));
4085
4086 // Pass 0 when the offset is missing. This can only be applied when using
4087 // the "vector base" addressing mode for which ACLE allows no offset. The
4088 // corresponding LLVM IR always requires an offset.
4089 if (Ops.size() == 2) {
4090 assert(Ops[1]->getType()->isVectorTy() && "Scalar base requires an offset");
4091 Ops.push_back(Elt: ConstantInt::get(Ty: Int64Ty, V: 0));
4092 }
4093
4094 // For "vector base, scalar index" scale the index so that it becomes a
4095 // scalar offset.
4096 if (!TypeFlags.isByteIndexed() && Ops[1]->getType()->isVectorTy()) {
4097 unsigned BytesPerElt =
4098 OverloadedTy->getElementType()->getScalarSizeInBits() / 8;
4099 Ops[2] = Builder.CreateShl(LHS: Ops[2], RHS: Log2_32(Value: BytesPerElt));
4100 }
4101
4102 Value *Call = Builder.CreateCall(Callee: F, Args: Ops);
4103
4104 // The following sext/zext is only needed when ResultTy != OverloadedTy. In
4105 // other cases it's folded into a nop.
4106 return TypeFlags.isZExtReturn() ? Builder.CreateZExt(V: Call, DestTy: ResultTy)
4107 : Builder.CreateSExt(V: Call, DestTy: ResultTy);
4108}
4109
4110Value *CodeGenFunction::EmitSVEScatterStore(const SVETypeFlags &TypeFlags,
4111 SmallVectorImpl<Value *> &Ops,
4112 unsigned IntID) {
4113 auto *SrcDataTy = getSVEType(TypeFlags);
4114 auto *OverloadedTy =
4115 llvm::ScalableVectorType::get(ElementType: SVEBuiltinMemEltTy(TypeFlags), SVTy: SrcDataTy);
4116
4117 // In ACLE the source data is passed in the last argument, whereas in LLVM IR
4118 // it's the first argument. Move it accordingly.
4119 Ops.insert(I: Ops.begin(), Elt: Ops.pop_back_val());
4120
4121 Function *F = nullptr;
4122 if (Ops[2]->getType()->isVectorTy())
4123 // This is the "vector base, scalar offset" case. In order to uniquely
4124 // map this built-in to an LLVM IR intrinsic, we need both the return type
4125 // and the type of the vector base.
4126 F = CGM.getIntrinsic(IID: IntID, Tys: {OverloadedTy, Ops[2]->getType()});
4127 else
4128 // This is the "scalar base, vector offset" case. The type of the offset
4129 // is encoded in the name of the intrinsic. We only need to specify the
4130 // return type in order to uniquely map this built-in to an LLVM IR
4131 // intrinsic.
4132 F = CGM.getIntrinsic(IID: IntID, Tys: OverloadedTy);
4133
4134 // Pass 0 when the offset is missing. This can only be applied when using
4135 // the "vector base" addressing mode for which ACLE allows no offset. The
4136 // corresponding LLVM IR always requires an offset.
4137 if (Ops.size() == 3) {
4138 assert(Ops[1]->getType()->isVectorTy() && "Scalar base requires an offset");
4139 Ops.push_back(Elt: ConstantInt::get(Ty: Int64Ty, V: 0));
4140 }
4141
4142 // Truncation is needed when SrcDataTy != OverloadedTy. In other cases it's
4143 // folded into a nop.
4144 Ops[0] = Builder.CreateTrunc(V: Ops[0], DestTy: OverloadedTy);
4145
4146 // At the ACLE level there's only one predicate type, svbool_t, which is
4147 // mapped to <n x 16 x i1>. However, this might be incompatible with the
4148 // actual type being stored. For example, when storing doubles (i64) the
4149 // predicate should be <n x 2 x i1> instead. At the IR level the type of
4150 // the predicate and the data being stored must match. Cast to the type
4151 // expected by the intrinsic. The intrinsic itself should be defined in
4152 // a way that enforces relations between parameter types.
4153 Ops[1] = EmitSVEPredicateCast(
4154 Pred: Ops[1], VTy: cast<llvm::ScalableVectorType>(Val: F->getArg(i: 1)->getType()));
4155
4156 // For "vector base, scalar index" scale the index so that it becomes a
4157 // scalar offset.
4158 if (!TypeFlags.isByteIndexed() && Ops[2]->getType()->isVectorTy()) {
4159 unsigned BytesPerElt =
4160 OverloadedTy->getElementType()->getScalarSizeInBits() / 8;
4161 Ops[3] = Builder.CreateShl(LHS: Ops[3], RHS: Log2_32(Value: BytesPerElt));
4162 }
4163
4164 return Builder.CreateCall(Callee: F, Args: Ops);
4165}
4166
4167Value *CodeGenFunction::EmitSVEGatherPrefetch(const SVETypeFlags &TypeFlags,
4168 SmallVectorImpl<Value *> &Ops,
4169 unsigned IntID) {
4170 // The gather prefetches are overloaded on the vector input - this can either
4171 // be the vector of base addresses or vector of offsets.
4172 auto *OverloadedTy = dyn_cast<llvm::ScalableVectorType>(Val: Ops[1]->getType());
4173 if (!OverloadedTy)
4174 OverloadedTy = cast<llvm::ScalableVectorType>(Val: Ops[2]->getType());
4175
4176 // Cast the predicate from svbool_t to the right number of elements.
4177 Ops[0] = EmitSVEPredicateCast(Pred: Ops[0], VTy: OverloadedTy);
4178
4179 // vector + imm addressing modes
4180 if (Ops[1]->getType()->isVectorTy()) {
4181 if (Ops.size() == 3) {
4182 // Pass 0 for 'vector+imm' when the index is omitted.
4183 Ops.push_back(Elt: ConstantInt::get(Ty: Int64Ty, V: 0));
4184
4185 // The sv_prfop is the last operand in the builtin and IR intrinsic.
4186 std::swap(a&: Ops[2], b&: Ops[3]);
4187 } else {
4188 // Index needs to be passed as scaled offset.
4189 llvm::Type *MemEltTy = SVEBuiltinMemEltTy(TypeFlags);
4190 unsigned BytesPerElt = MemEltTy->getPrimitiveSizeInBits() / 8;
4191 if (BytesPerElt > 1)
4192 Ops[2] = Builder.CreateShl(LHS: Ops[2], RHS: Log2_32(Value: BytesPerElt));
4193 }
4194 }
4195
4196 Function *F = CGM.getIntrinsic(IID: IntID, Tys: OverloadedTy);
4197 return Builder.CreateCall(Callee: F, Args: Ops);
4198}
4199
4200Value *CodeGenFunction::EmitSVEStructLoad(const SVETypeFlags &TypeFlags,
4201 SmallVectorImpl<Value*> &Ops,
4202 unsigned IntID) {
4203 llvm::ScalableVectorType *VTy = getSVEType(TypeFlags);
4204 Value *Predicate = EmitSVEPredicateCast(Pred: Ops[0], VTy);
4205 Value *BasePtr = Ops[1];
4206
4207 // Does the load have an offset?
4208 if (Ops.size() > 2)
4209 BasePtr = Builder.CreateGEP(Ty: VTy, Ptr: BasePtr, IdxList: Ops[2]);
4210
4211 Function *F = CGM.getIntrinsic(IID: IntID, Tys: {VTy});
4212 return Builder.CreateCall(Callee: F, Args: {Predicate, BasePtr});
4213}
4214
4215Value *CodeGenFunction::EmitSVEStructStore(const SVETypeFlags &TypeFlags,
4216 SmallVectorImpl<Value*> &Ops,
4217 unsigned IntID) {
4218 llvm::ScalableVectorType *VTy = getSVEType(TypeFlags);
4219
4220 unsigned N;
4221 switch (IntID) {
4222 case Intrinsic::aarch64_sve_st2:
4223 case Intrinsic::aarch64_sve_st1_pn_x2:
4224 case Intrinsic::aarch64_sve_stnt1_pn_x2:
4225 case Intrinsic::aarch64_sve_st2q:
4226 N = 2;
4227 break;
4228 case Intrinsic::aarch64_sve_st3:
4229 case Intrinsic::aarch64_sve_st3q:
4230 N = 3;
4231 break;
4232 case Intrinsic::aarch64_sve_st4:
4233 case Intrinsic::aarch64_sve_st1_pn_x4:
4234 case Intrinsic::aarch64_sve_stnt1_pn_x4:
4235 case Intrinsic::aarch64_sve_st4q:
4236 N = 4;
4237 break;
4238 default:
4239 llvm_unreachable("unknown intrinsic!");
4240 }
4241
4242 Value *Predicate = EmitSVEPredicateCast(Pred: Ops[0], VTy);
4243 Value *BasePtr = Ops[1];
4244
4245 // Does the store have an offset?
4246 if (Ops.size() > (2 + N))
4247 BasePtr = Builder.CreateGEP(Ty: VTy, Ptr: BasePtr, IdxList: Ops[2]);
4248
4249 // The llvm.aarch64.sve.st2/3/4 intrinsics take legal part vectors, so we
4250 // need to break up the tuple vector.
4251 SmallVector<llvm::Value*, 5> Operands;
4252 for (unsigned I = Ops.size() - N; I < Ops.size(); ++I)
4253 Operands.push_back(Elt: Ops[I]);
4254 Operands.append(IL: {Predicate, BasePtr});
4255 Function *F = CGM.getIntrinsic(IID: IntID, Tys: { VTy });
4256
4257 return Builder.CreateCall(Callee: F, Args: Operands);
4258}
4259
4260// SVE2's svpmullb and svpmullt builtins are similar to the svpmullb_pair and
4261// svpmullt_pair intrinsics, with the exception that their results are bitcast
4262// to a wider type.
4263Value *CodeGenFunction::EmitSVEPMull(const SVETypeFlags &TypeFlags,
4264 SmallVectorImpl<Value *> &Ops,
4265 unsigned BuiltinID) {
4266 // Splat scalar operand to vector (intrinsics with _n infix)
4267 if (TypeFlags.hasSplatOperand()) {
4268 unsigned OpNo = TypeFlags.getSplatOperand();
4269 Ops[OpNo] = EmitSVEDupX(Scalar: Ops[OpNo]);
4270 }
4271
4272 // The pair-wise function has a narrower overloaded type.
4273 Function *F = CGM.getIntrinsic(IID: BuiltinID, Tys: Ops[0]->getType());
4274 Value *Call = Builder.CreateCall(Callee: F, Args: {Ops[0], Ops[1]});
4275
4276 // Now bitcast to the wider result type.
4277 llvm::ScalableVectorType *Ty = getSVEType(TypeFlags);
4278 return EmitSVEReinterpret(Val: Call, Ty);
4279}
4280
4281Value *CodeGenFunction::EmitSVEMovl(const SVETypeFlags &TypeFlags,
4282 ArrayRef<Value *> Ops, unsigned BuiltinID) {
4283 llvm::Type *OverloadedTy = getSVEType(TypeFlags);
4284 Function *F = CGM.getIntrinsic(IID: BuiltinID, Tys: OverloadedTy);
4285 return Builder.CreateCall(Callee: F, Args: {Ops[0], Builder.getInt32(C: 0)});
4286}
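// svmovlb/svmovlt are emitted as the corresponding shift-left-long intrinsic
// with a shift amount of zero, e.g. svmovlb_s32 lowers to
// llvm.aarch64.sve.sshllb with an immediate of 0.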
4287
4288Value *CodeGenFunction::EmitSVEPrefetchLoad(const SVETypeFlags &TypeFlags,
4289 SmallVectorImpl<Value *> &Ops,
4290 unsigned BuiltinID) {
4291 auto *MemEltTy = SVEBuiltinMemEltTy(TypeFlags);
4292 auto *VectorTy = getSVEVectorForElementType(EltTy: MemEltTy);
4293 auto *MemoryTy = llvm::ScalableVectorType::get(ElementType: MemEltTy, SVTy: VectorTy);
4294
4295 Value *Predicate = EmitSVEPredicateCast(Pred: Ops[0], VTy: MemoryTy);
4296 Value *BasePtr = Ops[1];
4297
4298 // Apply the index operand if it was not omitted.
4299 if (Ops.size() > 3)
4300 BasePtr = Builder.CreateGEP(Ty: MemoryTy, Ptr: BasePtr, IdxList: Ops[2]);
4301
4302 Value *PrfOp = Ops.back();
4303
4304 Function *F = CGM.getIntrinsic(IID: BuiltinID, Tys: Predicate->getType());
4305 return Builder.CreateCall(Callee: F, Args: {Predicate, BasePtr, PrfOp});
4306}
4307
4308Value *CodeGenFunction::EmitSVEMaskedLoad(const CallExpr *E,
4309 llvm::Type *ReturnTy,
4310 SmallVectorImpl<Value *> &Ops,
4311 unsigned IntrinsicID,
4312 bool IsZExtReturn) {
4313 QualType LangPTy = E->getArg(Arg: 1)->getType();
4314 llvm::Type *MemEltTy = CGM.getTypes().ConvertType(
4315 T: LangPTy->castAs<PointerType>()->getPointeeType());
4316
4317 // Mfloat8 types are stored as a vector, so extra work is needed
4318 // to extract the scalar element type.
4319 if (MemEltTy->isVectorTy()) {
4320 assert(MemEltTy == FixedVectorType::get(Int8Ty, 1) &&
4321 "Only <1 x i8> expected");
4322 MemEltTy = cast<llvm::VectorType>(Val: MemEltTy)->getElementType();
4323 }
4324
4325 // The vector type that is returned may be different from the
4326 // eventual type loaded from memory.
4327 auto VectorTy = cast<llvm::ScalableVectorType>(Val: ReturnTy);
4328 llvm::ScalableVectorType *MemoryTy = nullptr;
4329 llvm::ScalableVectorType *PredTy = nullptr;
4330 bool IsQuadLoad = false;
4331 switch (IntrinsicID) {
4332 case Intrinsic::aarch64_sve_ld1uwq:
4333 case Intrinsic::aarch64_sve_ld1udq:
4334 MemoryTy = llvm::ScalableVectorType::get(ElementType: MemEltTy, MinNumElts: 1);
4335 PredTy = llvm::ScalableVectorType::get(
4336 ElementType: llvm::Type::getInt1Ty(C&: getLLVMContext()), MinNumElts: 1);
4337 IsQuadLoad = true;
4338 break;
4339 default:
4340 MemoryTy = llvm::ScalableVectorType::get(ElementType: MemEltTy, SVTy: VectorTy);
4341 PredTy = MemoryTy;
4342 break;
4343 }
4344
4345 Value *Predicate = EmitSVEPredicateCast(Pred: Ops[0], VTy: PredTy);
4346 Value *BasePtr = Ops[1];
4347
4348 // Does the load have an offset?
4349 if (Ops.size() > 2)
4350 BasePtr = Builder.CreateGEP(Ty: MemoryTy, Ptr: BasePtr, IdxList: Ops[2]);
4351
4352 Function *F = CGM.getIntrinsic(IID: IntrinsicID, Tys: IsQuadLoad ? VectorTy : MemoryTy);
4353 auto *Load =
4354 cast<llvm::Instruction>(Val: Builder.CreateCall(Callee: F, Args: {Predicate, BasePtr}));
4355 auto TBAAInfo = CGM.getTBAAAccessInfo(AccessType: LangPTy->getPointeeType());
4356 CGM.DecorateInstructionWithTBAA(Inst: Load, TBAAInfo);
4357
4358 if (IsQuadLoad)
4359 return Load;
4360
4361 return IsZExtReturn ? Builder.CreateZExt(V: Load, DestTy: VectorTy)
4362 : Builder.CreateSExt(V: Load, DestTy: VectorTy);
4363}
4364
4365Value *CodeGenFunction::EmitSVEMaskedStore(const CallExpr *E,
4366 SmallVectorImpl<Value *> &Ops,
4367 unsigned IntrinsicID) {
4368 QualType LangPTy = E->getArg(Arg: 1)->getType();
4369 llvm::Type *MemEltTy = CGM.getTypes().ConvertType(
4370 T: LangPTy->castAs<PointerType>()->getPointeeType());
4371
4372 // Mfloat8 types are stored as a vector, so extra work is needed
4373 // to extract the scalar element type.
4374 if (MemEltTy->isVectorTy()) {
4375 assert(MemEltTy == FixedVectorType::get(Int8Ty, 1) &&
4376 "Only <1 x i8> expected");
4377 MemEltTy = cast<llvm::VectorType>(Val: MemEltTy)->getElementType();
4378 }
4379
4380 // The vector type that is stored may be different from the
4381 // eventual type stored to memory.
4382 auto VectorTy = cast<llvm::ScalableVectorType>(Val: Ops.back()->getType());
4383 auto MemoryTy = llvm::ScalableVectorType::get(ElementType: MemEltTy, SVTy: VectorTy);
4384
4385 auto PredTy = MemoryTy;
4386 auto AddrMemoryTy = MemoryTy;
4387 bool IsQuadStore = false;
4388
4389 switch (IntrinsicID) {
4390 case Intrinsic::aarch64_sve_st1wq:
4391 case Intrinsic::aarch64_sve_st1dq:
4392 AddrMemoryTy = llvm::ScalableVectorType::get(ElementType: MemEltTy, MinNumElts: 1);
4393 PredTy =
4394 llvm::ScalableVectorType::get(ElementType: IntegerType::get(C&: getLLVMContext(), NumBits: 1), MinNumElts: 1);
4395 IsQuadStore = true;
4396 break;
4397 default:
4398 break;
4399 }
4400 Value *Predicate = EmitSVEPredicateCast(Pred: Ops[0], VTy: PredTy);
4401 Value *BasePtr = Ops[1];
4402
4403 // Does the store have an offset?
4404 if (Ops.size() == 4)
4405 BasePtr = Builder.CreateGEP(Ty: AddrMemoryTy, Ptr: BasePtr, IdxList: Ops[2]);
4406
4407 // Last value is always the data
4408 Value *Val =
4409 IsQuadStore ? Ops.back() : Builder.CreateTrunc(V: Ops.back(), DestTy: MemoryTy);
4410
4411 Function *F =
4412 CGM.getIntrinsic(IID: IntrinsicID, Tys: IsQuadStore ? VectorTy : MemoryTy);
4413 auto *Store =
4414 cast<llvm::Instruction>(Val: Builder.CreateCall(Callee: F, Args: {Val, Predicate, BasePtr}));
4415 auto TBAAInfo = CGM.getTBAAAccessInfo(AccessType: LangPTy->getPointeeType());
4416 CGM.DecorateInstructionWithTBAA(Inst: Store, TBAAInfo);
4417 return Store;
4418}
4419
4420Value *CodeGenFunction::EmitSMELd1St1(const SVETypeFlags &TypeFlags,
4421 SmallVectorImpl<Value *> &Ops,
4422 unsigned IntID) {
4423 Ops[2] = EmitSVEPredicateCast(
4424 Pred: Ops[2], VTy: getSVEVectorForElementType(EltTy: SVEBuiltinMemEltTy(TypeFlags)));
4425
4426 SmallVector<Value *> NewOps;
4427 NewOps.push_back(Elt: Ops[2]);
4428
4429 llvm::Value *BasePtr = Ops[3];
4430 llvm::Value *RealSlice = Ops[1];
4431 // If the intrinsic contains the vnum parameter, multiply it by the vector
4432 // size in bytes.
4433 if (Ops.size() == 5) {
4434 Function *StreamingVectorLength =
4435 CGM.getIntrinsic(IID: Intrinsic::aarch64_sme_cntsd);
4436 llvm::Value *StreamingVectorLengthCall =
4437 Builder.CreateMul(LHS: Builder.CreateCall(Callee: StreamingVectorLength),
4438 RHS: llvm::ConstantInt::get(Ty: Int64Ty, V: 8), Name: "svl",
4439 /* HasNUW */ true, /* HasNSW */ true);
4440 llvm::Value *Mulvl =
4441 Builder.CreateMul(LHS: StreamingVectorLengthCall, RHS: Ops[4], Name: "mulvl");
4442 // The type of the ptr parameter is void *, so use Int8Ty here.
4443 BasePtr = Builder.CreateGEP(Ty: Int8Ty, Ptr: Ops[3], IdxList: Mulvl);
4444 RealSlice = Builder.CreateZExt(V: RealSlice, DestTy: Int64Ty);
4445 RealSlice = Builder.CreateAdd(LHS: RealSlice, RHS: Ops[4]);
4446 RealSlice = Builder.CreateTrunc(V: RealSlice, DestTy: Int32Ty);
4447 }
4448 NewOps.push_back(Elt: BasePtr);
4449 NewOps.push_back(Elt: Ops[0]);
4450 NewOps.push_back(Elt: RealSlice);
4451 Function *F = CGM.getIntrinsic(IID: IntID);
4452 return Builder.CreateCall(Callee: F, Args: NewOps);
4453}
4454
4455Value *CodeGenFunction::EmitSMEReadWrite(const SVETypeFlags &TypeFlags,
4456 SmallVectorImpl<Value *> &Ops,
4457 unsigned IntID) {
4458 auto *VecTy = getSVEType(TypeFlags);
4459 Function *F = CGM.getIntrinsic(IID: IntID, Tys: VecTy);
4460 if (TypeFlags.isReadZA())
4461 Ops[1] = EmitSVEPredicateCast(Pred: Ops[1], VTy: VecTy);
4462 else if (TypeFlags.isWriteZA())
4463 Ops[2] = EmitSVEPredicateCast(Pred: Ops[2], VTy: VecTy);
4464 return Builder.CreateCall(Callee: F, Args: Ops);
4465}
4466
4467Value *CodeGenFunction::EmitSMEZero(const SVETypeFlags &TypeFlags,
4468 SmallVectorImpl<Value *> &Ops,
4469 unsigned IntID) {
4470 // The svzero_za() intrinsic zeros the entire ZA tile and has no parameters.
4471 if (Ops.size() == 0)
4472 Ops.push_back(Elt: llvm::ConstantInt::get(Ty: Int32Ty, V: 255));
4473 Function *F = CGM.getIntrinsic(IID: IntID, Tys: {});
4474 return Builder.CreateCall(Callee: F, Args: Ops);
4475}
4476
4477Value *CodeGenFunction::EmitSMELdrStr(const SVETypeFlags &TypeFlags,
4478 SmallVectorImpl<Value *> &Ops,
4479 unsigned IntID) {
4480 if (Ops.size() == 2)
4481 Ops.push_back(Elt: Builder.getInt32(C: 0));
4482 else
4483 Ops[2] = Builder.CreateIntCast(V: Ops[2], DestTy: Int32Ty, isSigned: true);
4484 Function *F = CGM.getIntrinsic(IID: IntID, Tys: {});
4485 return Builder.CreateCall(Callee: F, Args: Ops);
4486}
4487
4488// Splat a scalar across an SVE scalable vector. The splat is emitted with
4489// IRBuilder::CreateVectorSplat using the element count of the requested type.
4490Value *CodeGenFunction::EmitSVEDupX(Value *Scalar, llvm::Type *Ty) {
4491 return Builder.CreateVectorSplat(
4492 EC: cast<llvm::VectorType>(Val: Ty)->getElementCount(), V: Scalar);
4493}
4494
4495Value *CodeGenFunction::EmitSVEDupX(Value *Scalar) {
4496 if (auto *Ty = Scalar->getType(); Ty->isVectorTy()) {
4497#ifndef NDEBUG
4498 auto *VecTy = cast<llvm::VectorType>(Ty);
4499 ElementCount EC = VecTy->getElementCount();
4500 assert(EC.isScalar() && VecTy->getElementType() == Int8Ty &&
4501 "Only <1 x i8> expected");
4502#endif
4503 Scalar = Builder.CreateExtractElement(Vec: Scalar, Idx: uint64_t(0));
4504 }
4505 return EmitSVEDupX(Scalar, Ty: getSVEVectorForElementType(EltTy: Scalar->getType()));
4506}
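// For example, splatting an i32 scalar produces a <vscale x 4 x i32> value,
// matching getSVEVectorForElementType above.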
4507
4508Value *CodeGenFunction::EmitSVEReinterpret(Value *Val, llvm::Type *Ty) {
4509 // FIXME: For big endian this needs an additional REV, or needs a separate
4510 // intrinsic that is code-generated as a no-op, because the LLVM bitcast
4511 // instruction is defined as 'bitwise' equivalent from memory point of
4512 // view (when storing/reloading), whereas the svreinterpret builtin
4513 // implements bitwise equivalent cast from register point of view.
4514 // LLVM CodeGen for a bitcast must add an explicit REV for big-endian.
4515
4516 if (auto *StructTy = dyn_cast<StructType>(Val: Ty)) {
4517 Value *Tuple = llvm::PoisonValue::get(T: Ty);
4518
4519 for (unsigned I = 0; I < StructTy->getNumElements(); ++I) {
4520 Value *In = Builder.CreateExtractValue(Agg: Val, Idxs: I);
4521 Value *Out = Builder.CreateBitCast(V: In, DestTy: StructTy->getTypeAtIndex(N: I));
4522 Tuple = Builder.CreateInsertValue(Agg: Tuple, Val: Out, Idxs: I);
4523 }
4524
4525 return Tuple;
4526 }
4527
4528 return Builder.CreateBitCast(V: Val, DestTy: Ty);
4529}
4530
4531static void InsertExplicitZeroOperand(CGBuilderTy &Builder, llvm::Type *Ty,
4532 SmallVectorImpl<Value *> &Ops) {
4533 auto *SplatZero = Constant::getNullValue(Ty);
4534 Ops.insert(I: Ops.begin(), Elt: SplatZero);
4535}
4536
4537static void InsertExplicitUndefOperand(CGBuilderTy &Builder, llvm::Type *Ty,
4538 SmallVectorImpl<Value *> &Ops) {
4539 auto *SplatUndef = UndefValue::get(T: Ty);
4540 Ops.insert(I: Ops.begin(), Elt: SplatUndef);
4541}
4542
4543SmallVector<llvm::Type *, 2>
4544CodeGenFunction::getSVEOverloadTypes(const SVETypeFlags &TypeFlags,
4545 llvm::Type *ResultType,
4546 ArrayRef<Value *> Ops) {
4547 if (TypeFlags.isOverloadNone())
4548 return {};
4549
4550 llvm::Type *DefaultType = getSVEType(TypeFlags);
4551
4552 if (TypeFlags.isOverloadWhileOrMultiVecCvt())
4553 return {DefaultType, Ops[1]->getType()};
4554
4555 if (TypeFlags.isOverloadWhileRW())
4556 return {getSVEPredType(TypeFlags), Ops[0]->getType()};
4557
4558 if (TypeFlags.isOverloadFirstandLast())
4559 return {Ops[0]->getType(), Ops.back()->getType()};
4560
4561 if (TypeFlags.isReductionQV() && !ResultType->isScalableTy() &&
4562 ResultType->isVectorTy())
4563 return {ResultType, Ops[1]->getType()};
4564
4565 assert(TypeFlags.isOverloadDefault() && "Unexpected value for overloads");
4566 return {DefaultType};
4567}
4568
4569Value *CodeGenFunction::EmitSVETupleSetOrGet(const SVETypeFlags &TypeFlags,
4570 ArrayRef<Value *> Ops) {
4571 assert((TypeFlags.isTupleSet() || TypeFlags.isTupleGet()) &&
4572 "Expects TypleFlags.isTupleSet() or TypeFlags.isTupleGet()");
4573 unsigned Idx = cast<ConstantInt>(Val: Ops[1])->getZExtValue();
4574
4575 if (TypeFlags.isTupleSet())
4576 return Builder.CreateInsertValue(Agg: Ops[0], Val: Ops[2], Idxs: Idx);
4577 return Builder.CreateExtractValue(Agg: Ops[0], Idxs: Idx);
4578}
4579
4580Value *CodeGenFunction::EmitSVETupleCreate(const SVETypeFlags &TypeFlags,
4581 llvm::Type *Ty,
4582 ArrayRef<Value *> Ops) {
4583 assert(TypeFlags.isTupleCreate() && "Expects TypeFlags.isTupleCreate()");
4584
4585 Value *Tuple = llvm::PoisonValue::get(T: Ty);
4586 for (unsigned Idx = 0; Idx < Ops.size(); Idx++)
4587 Tuple = Builder.CreateInsertValue(Agg: Tuple, Val: Ops[Idx], Idxs: Idx);
4588
4589 return Tuple;
4590}
4591
4592void CodeGenFunction::GetAArch64SVEProcessedOperands(
4593 unsigned BuiltinID, const CallExpr *E, SmallVectorImpl<Value *> &Ops,
4594 SVETypeFlags TypeFlags) {
4595 // Find out if any arguments are required to be integer constant expressions.
4596 unsigned ICEArguments = 0;
4597 ASTContext::GetBuiltinTypeError Error;
4598 getContext().GetBuiltinType(ID: BuiltinID, Error, IntegerConstantArgs: &ICEArguments);
4599 assert(Error == ASTContext::GE_None && "Should not codegen an error");
4600
4601 // Tuple set/get only requires one insert/extract vector, which is
4602 // created by EmitSVETupleSetOrGet.
4603 bool IsTupleGetOrSet = TypeFlags.isTupleSet() || TypeFlags.isTupleGet();
4604
4605 for (unsigned i = 0, e = E->getNumArgs(); i != e; i++) {
4606 bool IsICE = ICEArguments & (1 << i);
4607 Value *Arg = EmitScalarExpr(E: E->getArg(Arg: i));
4608
4609 if (IsICE) {
4610 // If this is required to be a constant, constant fold it so that we know
4611 // that the generated intrinsic gets a ConstantInt.
4612 std::optional<llvm::APSInt> Result =
4613 E->getArg(Arg: i)->getIntegerConstantExpr(Ctx: getContext());
4614 assert(Result && "Expected argument to be a constant");
4615
4616 // Immediates for SVE llvm intrinsics are always 32bit. We can safely
4617 // truncate because the immediate has been range checked and no valid
4618 // immediate requires more than a handful of bits.
4619 *Result = Result->extOrTrunc(width: 32);
4620 Ops.push_back(Elt: llvm::ConstantInt::get(Context&: getLLVMContext(), V: *Result));
4621 continue;
4622 }
4623
4624 if (isa<StructType>(Val: Arg->getType()) && !IsTupleGetOrSet) {
4625 for (unsigned I = 0; I < Arg->getType()->getStructNumElements(); ++I)
4626 Ops.push_back(Elt: Builder.CreateExtractValue(Agg: Arg, Idxs: I));
4627
4628 continue;
4629 }
4630
4631 Ops.push_back(Elt: Arg);
4632 }
4633}
4634
4635Value *CodeGenFunction::EmitAArch64SVEBuiltinExpr(unsigned BuiltinID,
4636 const CallExpr *E) {
4637 llvm::Type *Ty = ConvertType(T: E->getType());
4638 if (BuiltinID >= SVE::BI__builtin_sve_reinterpret_s8_s8 &&
4639 BuiltinID <= SVE::BI__builtin_sve_reinterpret_f64_f64_x4) {
4640 Value *Val = EmitScalarExpr(E: E->getArg(Arg: 0));
4641 return EmitSVEReinterpret(Val, Ty);
4642 }
4643
4644 auto *Builtin = findARMVectorIntrinsicInMap(IntrinsicMap: AArch64SVEIntrinsicMap, BuiltinID,
4645 MapProvenSorted&: AArch64SVEIntrinsicsProvenSorted);
4646
4647 llvm::SmallVector<Value *, 4> Ops;
4648 SVETypeFlags TypeFlags(Builtin->TypeModifier);
4649 GetAArch64SVEProcessedOperands(BuiltinID, E, Ops, TypeFlags);
4650
4651 if (TypeFlags.isLoad())
4652 return EmitSVEMaskedLoad(E, ReturnTy: Ty, Ops, IntrinsicID: Builtin->LLVMIntrinsic,
4653 IsZExtReturn: TypeFlags.isZExtReturn());
4654 else if (TypeFlags.isStore())
4655 return EmitSVEMaskedStore(E, Ops, IntrinsicID: Builtin->LLVMIntrinsic);
4656 else if (TypeFlags.isGatherLoad())
4657 return EmitSVEGatherLoad(TypeFlags, Ops, IntID: Builtin->LLVMIntrinsic);
4658 else if (TypeFlags.isScatterStore())
4659 return EmitSVEScatterStore(TypeFlags, Ops, IntID: Builtin->LLVMIntrinsic);
4660 else if (TypeFlags.isPrefetch())
4661 return EmitSVEPrefetchLoad(TypeFlags, Ops, BuiltinID: Builtin->LLVMIntrinsic);
4662 else if (TypeFlags.isGatherPrefetch())
4663 return EmitSVEGatherPrefetch(TypeFlags, Ops, IntID: Builtin->LLVMIntrinsic);
4664 else if (TypeFlags.isStructLoad())
4665 return EmitSVEStructLoad(TypeFlags, Ops, IntID: Builtin->LLVMIntrinsic);
4666 else if (TypeFlags.isStructStore())
4667 return EmitSVEStructStore(TypeFlags, Ops, IntID: Builtin->LLVMIntrinsic);
4668 else if (TypeFlags.isTupleSet() || TypeFlags.isTupleGet())
4669 return EmitSVETupleSetOrGet(TypeFlags, Ops);
4670 else if (TypeFlags.isTupleCreate())
4671 return EmitSVETupleCreate(TypeFlags, Ty, Ops);
4672 else if (TypeFlags.isUndef())
4673 return UndefValue::get(T: Ty);
4674 else if (Builtin->LLVMIntrinsic != 0) {
4675 // Emit set FPMR for intrinsics that require it
4676 if (TypeFlags.setsFPMR())
4677 Builder.CreateCall(Callee: CGM.getIntrinsic(IID: Intrinsic::aarch64_set_fpmr),
4678 Args: Ops.pop_back_val());
4679 if (TypeFlags.getMergeType() == SVETypeFlags::MergeZeroExp)
4680 InsertExplicitZeroOperand(Builder, Ty, Ops);
4681
4682 if (TypeFlags.getMergeType() == SVETypeFlags::MergeAnyExp)
4683 InsertExplicitUndefOperand(Builder, Ty, Ops);
4684
4685 // Some ACLE builtins leave out the argument to specify the predicate
4686 // pattern, which is expected to be expanded to an SV_ALL pattern.
4687 if (TypeFlags.isAppendSVALL())
4688 Ops.push_back(Elt: Builder.getInt32(/*SV_ALL*/ C: 31));
4689 if (TypeFlags.isInsertOp1SVALL())
4690 Ops.insert(I: &Ops[1], Elt: Builder.getInt32(/*SV_ALL*/ C: 31));
4691
4692 // Predicates must match the main datatype.
4693 for (Value *&Op : Ops)
4694 if (auto PredTy = dyn_cast<llvm::VectorType>(Val: Op->getType()))
4695 if (PredTy->getElementType()->isIntegerTy(Bitwidth: 1))
4696 Op = EmitSVEPredicateCast(Pred: Op, VTy: getSVEType(TypeFlags));
4697
4698 // Splat scalar operand to vector (intrinsics with _n infix)
4699 if (TypeFlags.hasSplatOperand()) {
4700 unsigned OpNo = TypeFlags.getSplatOperand();
4701 Ops[OpNo] = EmitSVEDupX(Scalar: Ops[OpNo]);
4702 }
4703
4704 if (TypeFlags.isReverseCompare())
4705 std::swap(a&: Ops[1], b&: Ops[2]);
4706 else if (TypeFlags.isReverseUSDOT())
4707 std::swap(a&: Ops[1], b&: Ops[2]);
4708 else if (TypeFlags.isReverseMergeAnyBinOp() &&
4709 TypeFlags.getMergeType() == SVETypeFlags::MergeAny)
4710 std::swap(a&: Ops[1], b&: Ops[2]);
4711 else if (TypeFlags.isReverseMergeAnyAccOp() &&
4712 TypeFlags.getMergeType() == SVETypeFlags::MergeAny)
4713 std::swap(a&: Ops[1], b&: Ops[3]);
4714
4715 // Predicated intrinsics with _z suffix need a select w/ zeroinitializer.
4716 if (TypeFlags.getMergeType() == SVETypeFlags::MergeZero) {
4717 llvm::Type *OpndTy = Ops[1]->getType();
4718 auto *SplatZero = Constant::getNullValue(Ty: OpndTy);
4719 Ops[1] = Builder.CreateSelect(C: Ops[0], True: Ops[1], False: SplatZero);
4720 }
4721
4722 Function *F = CGM.getIntrinsic(IID: Builtin->LLVMIntrinsic,
4723 Tys: getSVEOverloadTypes(TypeFlags, ResultType: Ty, Ops));
4724 Value *Call = Builder.CreateCall(Callee: F, Args: Ops);
4725
4726 if (Call->getType() == Ty)
4727 return Call;
4728
4729 // Predicate results must be converted to svbool_t.
4730 if (auto PredTy = dyn_cast<llvm::ScalableVectorType>(Val: Ty))
4731 return EmitSVEPredicateCast(Pred: Call, VTy: PredTy);
4732 if (auto PredTupleTy = dyn_cast<llvm::StructType>(Val: Ty))
4733 return EmitSVEPredicateTupleCast(PredTuple: Call, Ty: PredTupleTy);
4734
4735 llvm_unreachable("unsupported element count!");
4736 }
4737
4738 switch (BuiltinID) {
4739 default:
4740 return nullptr;
4741
4742 case SVE::BI__builtin_sve_svreinterpret_b: {
4743 auto SVCountTy =
4744 llvm::TargetExtType::get(Context&: getLLVMContext(), Name: "aarch64.svcount");
4745 Function *CastFromSVCountF =
4746 CGM.getIntrinsic(IID: Intrinsic::aarch64_sve_convert_to_svbool, Tys: SVCountTy);
4747 return Builder.CreateCall(Callee: CastFromSVCountF, Args: Ops[0]);
4748 }
4749 case SVE::BI__builtin_sve_svreinterpret_c: {
4750 auto SVCountTy =
4751 llvm::TargetExtType::get(Context&: getLLVMContext(), Name: "aarch64.svcount");
4752 Function *CastToSVCountF =
4753 CGM.getIntrinsic(IID: Intrinsic::aarch64_sve_convert_from_svbool, Tys: SVCountTy);
4754 return Builder.CreateCall(Callee: CastToSVCountF, Args: Ops[0]);
4755 }
4756
4757 case SVE::BI__builtin_sve_svpsel_lane_b8:
4758 case SVE::BI__builtin_sve_svpsel_lane_b16:
4759 case SVE::BI__builtin_sve_svpsel_lane_b32:
4760 case SVE::BI__builtin_sve_svpsel_lane_b64:
4761 case SVE::BI__builtin_sve_svpsel_lane_c8:
4762 case SVE::BI__builtin_sve_svpsel_lane_c16:
4763 case SVE::BI__builtin_sve_svpsel_lane_c32:
4764 case SVE::BI__builtin_sve_svpsel_lane_c64: {
4765 bool IsSVCount = isa<TargetExtType>(Val: Ops[0]->getType());
4766 assert(((!IsSVCount || cast<TargetExtType>(Ops[0]->getType())->getName() ==
4767 "aarch64.svcount")) &&
4768 "Unexpected TargetExtType");
4769 auto SVCountTy =
4770 llvm::TargetExtType::get(Context&: getLLVMContext(), Name: "aarch64.svcount");
4771 Function *CastFromSVCountF =
4772 CGM.getIntrinsic(IID: Intrinsic::aarch64_sve_convert_to_svbool, Tys: SVCountTy);
4773 Function *CastToSVCountF =
4774 CGM.getIntrinsic(IID: Intrinsic::aarch64_sve_convert_from_svbool, Tys: SVCountTy);
4775
4776 auto OverloadedTy = getSVEType(TypeFlags: SVETypeFlags(Builtin->TypeModifier));
4777 Function *F = CGM.getIntrinsic(IID: Intrinsic::aarch64_sve_psel, Tys: OverloadedTy);
4778 llvm::Value *Ops0 =
4779 IsSVCount ? Builder.CreateCall(Callee: CastFromSVCountF, Args: Ops[0]) : Ops[0];
4780 llvm::Value *Ops1 = EmitSVEPredicateCast(Pred: Ops[1], VTy: OverloadedTy);
4781 llvm::Value *PSel = Builder.CreateCall(Callee: F, Args: {Ops0, Ops1, Ops[2]});
4782 return IsSVCount ? Builder.CreateCall(Callee: CastToSVCountF, Args: PSel) : PSel;
4783 }
4784 case SVE::BI__builtin_sve_svmov_b_z: {
4785 // svmov_b_z(pg, op) <=> svand_b_z(pg, op, op)
4786 SVETypeFlags TypeFlags(Builtin->TypeModifier);
4787 llvm::Type* OverloadedTy = getSVEType(TypeFlags);
4788 Function *F = CGM.getIntrinsic(IID: Intrinsic::aarch64_sve_and_z, Tys: OverloadedTy);
4789 return Builder.CreateCall(Callee: F, Args: {Ops[0], Ops[1], Ops[1]});
4790 }
4791
4792 case SVE::BI__builtin_sve_svnot_b_z: {
4793 // svnot_b_z(pg, op) <=> sveor_b_z(pg, op, pg)
4794 SVETypeFlags TypeFlags(Builtin->TypeModifier);
4795 llvm::Type* OverloadedTy = getSVEType(TypeFlags);
4796 Function *F = CGM.getIntrinsic(IID: Intrinsic::aarch64_sve_eor_z, Tys: OverloadedTy);
4797 return Builder.CreateCall(Callee: F, Args: {Ops[0], Ops[1], Ops[0]});
4798 }
4799
4800 case SVE::BI__builtin_sve_svmovlb_u16:
4801 case SVE::BI__builtin_sve_svmovlb_u32:
4802 case SVE::BI__builtin_sve_svmovlb_u64:
4803 return EmitSVEMovl(TypeFlags, Ops, BuiltinID: Intrinsic::aarch64_sve_ushllb);
4804
4805 case SVE::BI__builtin_sve_svmovlb_s16:
4806 case SVE::BI__builtin_sve_svmovlb_s32:
4807 case SVE::BI__builtin_sve_svmovlb_s64:
4808 return EmitSVEMovl(TypeFlags, Ops, BuiltinID: Intrinsic::aarch64_sve_sshllb);
4809
4810 case SVE::BI__builtin_sve_svmovlt_u16:
4811 case SVE::BI__builtin_sve_svmovlt_u32:
4812 case SVE::BI__builtin_sve_svmovlt_u64:
4813 return EmitSVEMovl(TypeFlags, Ops, BuiltinID: Intrinsic::aarch64_sve_ushllt);
4814
4815 case SVE::BI__builtin_sve_svmovlt_s16:
4816 case SVE::BI__builtin_sve_svmovlt_s32:
4817 case SVE::BI__builtin_sve_svmovlt_s64:
4818 return EmitSVEMovl(TypeFlags, Ops, BuiltinID: Intrinsic::aarch64_sve_sshllt);
4819
4820 case SVE::BI__builtin_sve_svpmullt_u16:
4821 case SVE::BI__builtin_sve_svpmullt_u64:
4822 case SVE::BI__builtin_sve_svpmullt_n_u16:
4823 case SVE::BI__builtin_sve_svpmullt_n_u64:
4824 return EmitSVEPMull(TypeFlags, Ops, BuiltinID: Intrinsic::aarch64_sve_pmullt_pair);
4825
4826 case SVE::BI__builtin_sve_svpmullb_u16:
4827 case SVE::BI__builtin_sve_svpmullb_u64:
4828 case SVE::BI__builtin_sve_svpmullb_n_u16:
4829 case SVE::BI__builtin_sve_svpmullb_n_u64:
4830 return EmitSVEPMull(TypeFlags, Ops, BuiltinID: Intrinsic::aarch64_sve_pmullb_pair);
4831
4832 case SVE::BI__builtin_sve_svdup_n_b8:
4833 case SVE::BI__builtin_sve_svdup_n_b16:
4834 case SVE::BI__builtin_sve_svdup_n_b32:
4835 case SVE::BI__builtin_sve_svdup_n_b64: {
4836 Value *CmpNE =
4837 Builder.CreateICmpNE(LHS: Ops[0], RHS: Constant::getNullValue(Ty: Ops[0]->getType()));
4838 llvm::ScalableVectorType *OverloadedTy = getSVEType(TypeFlags);
4839 Value *Dup = EmitSVEDupX(Scalar: CmpNE, Ty: OverloadedTy);
4840 return EmitSVEPredicateCast(Pred: Dup, VTy: cast<llvm::ScalableVectorType>(Val: Ty));
4841 }
4842
4843 case SVE::BI__builtin_sve_svdupq_n_b8:
4844 case SVE::BI__builtin_sve_svdupq_n_b16:
4845 case SVE::BI__builtin_sve_svdupq_n_b32:
4846 case SVE::BI__builtin_sve_svdupq_n_b64:
4847 case SVE::BI__builtin_sve_svdupq_n_u8:
4848 case SVE::BI__builtin_sve_svdupq_n_s8:
4849 case SVE::BI__builtin_sve_svdupq_n_u64:
4850 case SVE::BI__builtin_sve_svdupq_n_f64:
4851 case SVE::BI__builtin_sve_svdupq_n_s64:
4852 case SVE::BI__builtin_sve_svdupq_n_u16:
4853 case SVE::BI__builtin_sve_svdupq_n_f16:
4854 case SVE::BI__builtin_sve_svdupq_n_bf16:
4855 case SVE::BI__builtin_sve_svdupq_n_s16:
4856 case SVE::BI__builtin_sve_svdupq_n_u32:
4857 case SVE::BI__builtin_sve_svdupq_n_f32:
4858 case SVE::BI__builtin_sve_svdupq_n_s32: {
4859 // These builtins pack the scalar operands into a single 128-bit vector and
4860 // broadcast it to every 128-bit segment of a scalable vector via dupq_lane.
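// Illustrative sketch (assumed shapes, not emitted verbatim): for
// svdupq_n_s32(a, b, c, d) the lowering is roughly
//   %v    = <4 x i32> built from the four scalars
//   %ins  = llvm.vector.insert(<vscale x 4 x i32> poison, %v, i64 0)
//   %dupq = llvm.aarch64.sve.dupq.lane(%ins, i64 0)
// while the _b forms additionally compare the broadcast value against zero
// under an all-true predicate to produce an svbool_t.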
4861 unsigned NumOpnds = Ops.size();
4862
4863 bool IsBoolTy =
4864 cast<llvm::VectorType>(Val: Ty)->getElementType()->isIntegerTy(Bitwidth: 1);
4865
4866 // For svdupq_n_b* the element type is an integer of width 128/numelts,
4867 // so that the compare can use the width that is natural for the expected
4868 // number of predicate lanes.
4869 llvm::Type *EltTy = Ops[0]->getType();
4870 if (IsBoolTy)
4871 EltTy = IntegerType::get(C&: getLLVMContext(), NumBits: SVEBitsPerBlock / NumOpnds);
4872
4873 SmallVector<llvm::Value *, 16> VecOps;
4874 for (unsigned I = 0; I < NumOpnds; ++I)
4875 VecOps.push_back(Elt: Builder.CreateZExt(V: Ops[I], DestTy: EltTy));
4876 Value *Vec = BuildVector(Ops: VecOps);
4877
4878 llvm::Type *OverloadedTy = getSVEVectorForElementType(EltTy);
4879 Value *InsertSubVec = Builder.CreateInsertVector(
4880 DstType: OverloadedTy, SrcVec: PoisonValue::get(T: OverloadedTy), SubVec: Vec, Idx: uint64_t(0));
4881
4882 Function *F =
4883 CGM.getIntrinsic(IID: Intrinsic::aarch64_sve_dupq_lane, Tys: OverloadedTy);
4884 Value *DupQLane =
4885 Builder.CreateCall(Callee: F, Args: {InsertSubVec, Builder.getInt64(C: 0)});
4886
4887 if (!IsBoolTy)
4888 return DupQLane;
4889
4890 SVETypeFlags TypeFlags(Builtin->TypeModifier);
4891 Value *Pred = EmitSVEAllTruePred(TypeFlags);
4892
4893 // For svdupq_n_b* we need to add an additional 'cmpne' with '0'.
4894 F = CGM.getIntrinsic(IID: NumOpnds == 2 ? Intrinsic::aarch64_sve_cmpne
4895 : Intrinsic::aarch64_sve_cmpne_wide,
4896 Tys: OverloadedTy);
4897 Value *Call = Builder.CreateCall(
4898 Callee: F, Args: {Pred, DupQLane, EmitSVEDupX(Scalar: Builder.getInt64(C: 0))});
4899 return EmitSVEPredicateCast(Pred: Call, VTy: cast<llvm::ScalableVectorType>(Val: Ty));
4900 }
4901
4902 case SVE::BI__builtin_sve_svpfalse_b:
4903 return ConstantInt::getFalse(Ty);
4904
4905 case SVE::BI__builtin_sve_svpfalse_c: {
4906 auto SVBoolTy = ScalableVectorType::get(ElementType: Builder.getInt1Ty(), MinNumElts: 16);
4907 Function *CastToSVCountF =
4908 CGM.getIntrinsic(IID: Intrinsic::aarch64_sve_convert_from_svbool, Tys: Ty);
4909 return Builder.CreateCall(Callee: CastToSVCountF, Args: ConstantInt::getFalse(Ty: SVBoolTy));
4910 }
4911
4912 case SVE::BI__builtin_sve_svlen_bf16:
4913 case SVE::BI__builtin_sve_svlen_f16:
4914 case SVE::BI__builtin_sve_svlen_f32:
4915 case SVE::BI__builtin_sve_svlen_f64:
4916 case SVE::BI__builtin_sve_svlen_s8:
4917 case SVE::BI__builtin_sve_svlen_s16:
4918 case SVE::BI__builtin_sve_svlen_s32:
4919 case SVE::BI__builtin_sve_svlen_s64:
4920 case SVE::BI__builtin_sve_svlen_u8:
4921 case SVE::BI__builtin_sve_svlen_u16:
4922 case SVE::BI__builtin_sve_svlen_u32:
4923 case SVE::BI__builtin_sve_svlen_u64: {
4924 SVETypeFlags TF(Builtin->TypeModifier);
4925 return Builder.CreateElementCount(Ty, EC: getSVEType(TypeFlags: TF)->getElementCount());
4926 }
4927
4928 case SVE::BI__builtin_sve_svtbl2_u8:
4929 case SVE::BI__builtin_sve_svtbl2_s8:
4930 case SVE::BI__builtin_sve_svtbl2_u16:
4931 case SVE::BI__builtin_sve_svtbl2_s16:
4932 case SVE::BI__builtin_sve_svtbl2_u32:
4933 case SVE::BI__builtin_sve_svtbl2_s32:
4934 case SVE::BI__builtin_sve_svtbl2_u64:
4935 case SVE::BI__builtin_sve_svtbl2_s64:
4936 case SVE::BI__builtin_sve_svtbl2_f16:
4937 case SVE::BI__builtin_sve_svtbl2_bf16:
4938 case SVE::BI__builtin_sve_svtbl2_f32:
4939 case SVE::BI__builtin_sve_svtbl2_f64: {
4940 SVETypeFlags TF(Builtin->TypeModifier);
4941 Function *F = CGM.getIntrinsic(IID: Intrinsic::aarch64_sve_tbl2, Tys: getSVEType(TypeFlags: TF));
4942 return Builder.CreateCall(Callee: F, Args: Ops);
4943 }
4944
4945 case SVE::BI__builtin_sve_svset_neonq_s8:
4946 case SVE::BI__builtin_sve_svset_neonq_s16:
4947 case SVE::BI__builtin_sve_svset_neonq_s32:
4948 case SVE::BI__builtin_sve_svset_neonq_s64:
4949 case SVE::BI__builtin_sve_svset_neonq_u8:
4950 case SVE::BI__builtin_sve_svset_neonq_u16:
4951 case SVE::BI__builtin_sve_svset_neonq_u32:
4952 case SVE::BI__builtin_sve_svset_neonq_u64:
4953 case SVE::BI__builtin_sve_svset_neonq_f16:
4954 case SVE::BI__builtin_sve_svset_neonq_f32:
4955 case SVE::BI__builtin_sve_svset_neonq_f64:
4956 case SVE::BI__builtin_sve_svset_neonq_bf16: {
4957 return Builder.CreateInsertVector(DstType: Ty, SrcVec: Ops[0], SubVec: Ops[1], Idx: uint64_t(0));
4958 }
4959
4960 case SVE::BI__builtin_sve_svget_neonq_s8:
4961 case SVE::BI__builtin_sve_svget_neonq_s16:
4962 case SVE::BI__builtin_sve_svget_neonq_s32:
4963 case SVE::BI__builtin_sve_svget_neonq_s64:
4964 case SVE::BI__builtin_sve_svget_neonq_u8:
4965 case SVE::BI__builtin_sve_svget_neonq_u16:
4966 case SVE::BI__builtin_sve_svget_neonq_u32:
4967 case SVE::BI__builtin_sve_svget_neonq_u64:
4968 case SVE::BI__builtin_sve_svget_neonq_f16:
4969 case SVE::BI__builtin_sve_svget_neonq_f32:
4970 case SVE::BI__builtin_sve_svget_neonq_f64:
4971 case SVE::BI__builtin_sve_svget_neonq_bf16: {
4972 return Builder.CreateExtractVector(DstType: Ty, SrcVec: Ops[0], Idx: uint64_t(0));
4973 }
4974
4975 case SVE::BI__builtin_sve_svdup_neonq_s8:
4976 case SVE::BI__builtin_sve_svdup_neonq_s16:
4977 case SVE::BI__builtin_sve_svdup_neonq_s32:
4978 case SVE::BI__builtin_sve_svdup_neonq_s64:
4979 case SVE::BI__builtin_sve_svdup_neonq_u8:
4980 case SVE::BI__builtin_sve_svdup_neonq_u16:
4981 case SVE::BI__builtin_sve_svdup_neonq_u32:
4982 case SVE::BI__builtin_sve_svdup_neonq_u64:
4983 case SVE::BI__builtin_sve_svdup_neonq_f16:
4984 case SVE::BI__builtin_sve_svdup_neonq_f32:
4985 case SVE::BI__builtin_sve_svdup_neonq_f64:
4986 case SVE::BI__builtin_sve_svdup_neonq_bf16: {
4987 Value *Insert = Builder.CreateInsertVector(DstType: Ty, SrcVec: PoisonValue::get(T: Ty), SubVec: Ops[0],
4988 Idx: uint64_t(0));
4989 return Builder.CreateIntrinsic(ID: Intrinsic::aarch64_sve_dupq_lane, Types: {Ty},
4990 Args: {Insert, Builder.getInt64(C: 0)});
4991 }
4992 }
4993
4994 // Should not happen.
4995 return nullptr;
4996}
4997
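// The 'su' (signed-by-unsigned) SME builtins below are lowered via the
// corresponding 'us' intrinsics, which take their two multi-vector operand
// groups in the opposite order; since the underlying operations are
// commutative in those inputs, swapping the groups suffices. The leading
// slice operand (Ops[0]) stays in place: with MultiVec == 2,
// (slice, zn0, zn1, zm0, zm1) becomes (slice, zm0, zm1, zn0, zn1).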
4998static void swapCommutativeSMEOperands(unsigned BuiltinID,
4999 SmallVectorImpl<Value *> &Ops) {
5000 unsigned MultiVec;
5001 switch (BuiltinID) {
5002 default:
5003 return;
5004 case SME::BI__builtin_sme_svsumla_za32_s8_vg4x1:
5005 MultiVec = 1;
5006 break;
5007 case SME::BI__builtin_sme_svsumla_za32_s8_vg4x2:
5008 case SME::BI__builtin_sme_svsudot_za32_s8_vg1x2:
5009 MultiVec = 2;
5010 break;
5011 case SME::BI__builtin_sme_svsudot_za32_s8_vg1x4:
5012 case SME::BI__builtin_sme_svsumla_za32_s8_vg4x4:
5013 MultiVec = 4;
5014 break;
5015 }
5016
5017 if (MultiVec > 0)
5018 for (unsigned I = 0; I < MultiVec; ++I)
5019 std::swap(a&: Ops[I + 1], b&: Ops[I + 1 + MultiVec]);
5020}
5021
5022Value *CodeGenFunction::EmitAArch64SMEBuiltinExpr(unsigned BuiltinID,
5023 const CallExpr *E) {
5024 auto *Builtin = findARMVectorIntrinsicInMap(IntrinsicMap: AArch64SMEIntrinsicMap, BuiltinID,
5025 MapProvenSorted&: AArch64SMEIntrinsicsProvenSorted);
5026
5027 llvm::SmallVector<Value *, 4> Ops;
5028 SVETypeFlags TypeFlags(Builtin->TypeModifier);
5029 GetAArch64SVEProcessedOperands(BuiltinID, E, Ops, TypeFlags);
5030
5031 if (TypeFlags.isLoad() || TypeFlags.isStore())
5032 return EmitSMELd1St1(TypeFlags, Ops, IntID: Builtin->LLVMIntrinsic);
5033 else if (TypeFlags.isReadZA() || TypeFlags.isWriteZA())
5034 return EmitSMEReadWrite(TypeFlags, Ops, IntID: Builtin->LLVMIntrinsic);
5035 else if (BuiltinID == SME::BI__builtin_sme_svzero_mask_za ||
5036 BuiltinID == SME::BI__builtin_sme_svzero_za)
5037 return EmitSMEZero(TypeFlags, Ops, IntID: Builtin->LLVMIntrinsic);
5038 else if (BuiltinID == SME::BI__builtin_sme_svldr_vnum_za ||
5039 BuiltinID == SME::BI__builtin_sme_svstr_vnum_za ||
5040 BuiltinID == SME::BI__builtin_sme_svldr_za ||
5041 BuiltinID == SME::BI__builtin_sme_svstr_za)
5042 return EmitSMELdrStr(TypeFlags, Ops, IntID: Builtin->LLVMIntrinsic);
5043
5044 // Set the FPMR register for intrinsics that require it.
5045 if (TypeFlags.setsFPMR())
5046 Builder.CreateCall(Callee: CGM.getIntrinsic(IID: Intrinsic::aarch64_set_fpmr),
5047 Args: Ops.pop_back_val());
5048 // Handle builtins which require their multi-vector operands to be swapped
5049 swapCommutativeSMEOperands(BuiltinID, Ops);
5050
5051 auto isCntsBuiltin = [&]() {
5052 switch (BuiltinID) {
5053 default:
5054 return 0;
5055 case SME::BI__builtin_sme_svcntsb:
5056 return 8;
5057 case SME::BI__builtin_sme_svcntsh:
5058 return 4;
5059 case SME::BI__builtin_sme_svcntsw:
5060 return 2;
5061 }
5062 };
5063
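// svcntsb/svcntsh/svcntsw return the streaming vector length in bytes,
// halfwords and words respectively, derived from cntsd (the length in
// doublewords) by multiplying by 8, 4 or 2. E.g. with a 512-bit SVL, cntsd()
// is 8, so svcntsb() yields 64.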
5064 if (auto Mul = isCntsBuiltin()) {
5065 llvm::Value *Cntd =
5066 Builder.CreateCall(Callee: CGM.getIntrinsic(IID: Intrinsic::aarch64_sme_cntsd));
5067 return Builder.CreateMul(LHS: Cntd, RHS: llvm::ConstantInt::get(Ty: Int64Ty, V: Mul),
5068 Name: "mulsvl", /* HasNUW */ true, /* HasNSW */ true);
5069 }
5070
5071 // Should not happen!
5072 if (Builtin->LLVMIntrinsic == 0)
5073 return nullptr;
5074
5075 // Predicates must match the main datatype.
5076 for (Value *&Op : Ops)
5077 if (auto PredTy = dyn_cast<llvm::VectorType>(Val: Op->getType()))
5078 if (PredTy->getElementType()->isIntegerTy(Bitwidth: 1))
5079 Op = EmitSVEPredicateCast(Pred: Op, VTy: getSVEType(TypeFlags));
5080
5081 Function *F =
5082 TypeFlags.isOverloadNone()
5083 ? CGM.getIntrinsic(IID: Builtin->LLVMIntrinsic)
5084 : CGM.getIntrinsic(IID: Builtin->LLVMIntrinsic, Tys: {getSVEType(TypeFlags)});
5085
5086 return Builder.CreateCall(Callee: F, Args: Ops);
5087}
5088
5089/// Helper for the read/write/add/inc X18 builtins: read the X18 register and
5090/// return it as an i8 pointer.
5091Value *readX18AsPtr(CodeGenFunction &CGF) {
5092 LLVMContext &Context = CGF.CGM.getLLVMContext();
5093 llvm::Metadata *Ops[] = {llvm::MDString::get(Context, Str: "x18")};
5094 llvm::MDNode *RegName = llvm::MDNode::get(Context, MDs: Ops);
5095 llvm::Value *Metadata = llvm::MetadataAsValue::get(Context, MD: RegName);
5096 llvm::Function *F =
5097 CGF.CGM.getIntrinsic(IID: Intrinsic::read_register, Tys: {CGF.Int64Ty});
5098 llvm::Value *X18 = CGF.Builder.CreateCall(Callee: F, Args: Metadata);
5099 return CGF.Builder.CreateIntToPtr(V: X18, DestTy: CGF.Int8PtrTy);
5100}
5101
5102Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
5103 const CallExpr *E,
5104 llvm::Triple::ArchType Arch) {
5105 if (BuiltinID >= clang::AArch64::FirstSVEBuiltin &&
5106 BuiltinID <= clang::AArch64::LastSVEBuiltin)
5107 return EmitAArch64SVEBuiltinExpr(BuiltinID, E);
5108
5109 if (BuiltinID >= clang::AArch64::FirstSMEBuiltin &&
5110 BuiltinID <= clang::AArch64::LastSMEBuiltin)
5111 return EmitAArch64SMEBuiltinExpr(BuiltinID, E);
5112
5113 if (BuiltinID == Builtin::BI__builtin_cpu_supports)
5114 return EmitAArch64CpuSupports(E);
5115
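// The hint builtins below map directly to the immediate operand of the
// AArch64 HINT instruction (HINT #0 is NOP, #1 YIELD, #2 WFE, #3 WFI, #4 SEV,
// #5 SEVL).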
5116 unsigned HintID = static_cast<unsigned>(-1);
5117 switch (BuiltinID) {
5118 default: break;
5119 case clang::AArch64::BI__builtin_arm_nop:
5120 HintID = 0;
5121 break;
5122 case clang::AArch64::BI__builtin_arm_yield:
5123 case clang::AArch64::BI__yield:
5124 HintID = 1;
5125 break;
5126 case clang::AArch64::BI__builtin_arm_wfe:
5127 case clang::AArch64::BI__wfe:
5128 HintID = 2;
5129 break;
5130 case clang::AArch64::BI__builtin_arm_wfi:
5131 case clang::AArch64::BI__wfi:
5132 HintID = 3;
5133 break;
5134 case clang::AArch64::BI__builtin_arm_sev:
5135 case clang::AArch64::BI__sev:
5136 HintID = 4;
5137 break;
5138 case clang::AArch64::BI__builtin_arm_sevl:
5139 case clang::AArch64::BI__sevl:
5140 HintID = 5;
5141 break;
5142 }
5143
5144 if (HintID != static_cast<unsigned>(-1)) {
5145 Function *F = CGM.getIntrinsic(IID: Intrinsic::aarch64_hint);
5146 return Builder.CreateCall(Callee: F, Args: llvm::ConstantInt::get(Ty: Int32Ty, V: HintID));
5147 }
5148
5149 if (BuiltinID == clang::AArch64::BI__builtin_arm_trap) {
5150 Function *F = CGM.getIntrinsic(IID: Intrinsic::aarch64_break);
5151 llvm::Value *Arg = EmitScalarExpr(E: E->getArg(Arg: 0));
5152 return Builder.CreateCall(Callee: F, Args: Builder.CreateZExt(V: Arg, DestTy: CGM.Int32Ty));
5153 }
5154
5155 if (BuiltinID == clang::AArch64::BI__builtin_arm_get_sme_state) {
5156 // Create call to __arm_sme_state and store the results to the two pointers.
5157 CallInst *CI = EmitRuntimeCall(callee: CGM.CreateRuntimeFunction(
5158 Ty: llvm::FunctionType::get(Result: StructType::get(elt1: CGM.Int64Ty, elts: CGM.Int64Ty), Params: {},
5159 isVarArg: false),
5160 Name: "__arm_sme_state"));
5161 auto Attrs = AttributeList().addFnAttribute(C&: getLLVMContext(),
5162 Kind: "aarch64_pstate_sm_compatible");
5163 CI->setAttributes(Attrs);
5164 CI->setCallingConv(
5165 llvm::CallingConv::
5166 AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2);
5167 Builder.CreateStore(Val: Builder.CreateExtractValue(Agg: CI, Idxs: 0),
5168 Addr: EmitPointerWithAlignment(Addr: E->getArg(Arg: 0)));
5169 return Builder.CreateStore(Val: Builder.CreateExtractValue(Agg: CI, Idxs: 1),
5170 Addr: EmitPointerWithAlignment(Addr: E->getArg(Arg: 1)));
5171 }
5172
5173 if (BuiltinID == clang::AArch64::BI__builtin_arm_rbit) {
5174 assert((getContext().getTypeSize(E->getType()) == 32) &&
5175 "rbit of unusual size!");
5176 llvm::Value *Arg = EmitScalarExpr(E: E->getArg(Arg: 0));
5177 return Builder.CreateCall(
5178 Callee: CGM.getIntrinsic(IID: Intrinsic::bitreverse, Tys: Arg->getType()), Args: Arg, Name: "rbit");
5179 }
5180 if (BuiltinID == clang::AArch64::BI__builtin_arm_rbit64) {
5181 assert((getContext().getTypeSize(E->getType()) == 64) &&
5182 "rbit of unusual size!");
5183 llvm::Value *Arg = EmitScalarExpr(E: E->getArg(Arg: 0));
5184 return Builder.CreateCall(
5185 Callee: CGM.getIntrinsic(IID: Intrinsic::bitreverse, Tys: Arg->getType()), Args: Arg, Name: "rbit");
5186 }
5187
5188 if (BuiltinID == clang::AArch64::BI__builtin_arm_clz ||
5189 BuiltinID == clang::AArch64::BI__builtin_arm_clz64) {
5190 llvm::Value *Arg = EmitScalarExpr(E: E->getArg(Arg: 0));
5191 Function *F = CGM.getIntrinsic(IID: Intrinsic::ctlz, Tys: Arg->getType());
5192 Value *Res = Builder.CreateCall(Callee: F, Args: {Arg, Builder.getInt1(V: false)});
5193 if (BuiltinID == clang::AArch64::BI__builtin_arm_clz64)
5194 Res = Builder.CreateTrunc(V: Res, DestTy: Builder.getInt32Ty());
5195 return Res;
5196 }
5197
5198 if (BuiltinID == clang::AArch64::BI__builtin_arm_cls) {
5199 llvm::Value *Arg = EmitScalarExpr(E: E->getArg(Arg: 0));
5200 return Builder.CreateCall(Callee: CGM.getIntrinsic(IID: Intrinsic::aarch64_cls), Args: Arg,
5201 Name: "cls");
5202 }
5203 if (BuiltinID == clang::AArch64::BI__builtin_arm_cls64) {
5204 llvm::Value *Arg = EmitScalarExpr(E: E->getArg(Arg: 0));
5205 return Builder.CreateCall(Callee: CGM.getIntrinsic(IID: Intrinsic::aarch64_cls64), Args: Arg,
5206 Name: "cls");
5207 }
5208
5209 if (BuiltinID == clang::AArch64::BI__builtin_arm_rint32zf ||
5210 BuiltinID == clang::AArch64::BI__builtin_arm_rint32z) {
5211 llvm::Value *Arg = EmitScalarExpr(E: E->getArg(Arg: 0));
5212 llvm::Type *Ty = Arg->getType();
5213 return Builder.CreateCall(Callee: CGM.getIntrinsic(IID: Intrinsic::aarch64_frint32z, Tys: Ty),
5214 Args: Arg, Name: "frint32z");
5215 }
5216
5217 if (BuiltinID == clang::AArch64::BI__builtin_arm_rint64zf ||
5218 BuiltinID == clang::AArch64::BI__builtin_arm_rint64z) {
5219 llvm::Value *Arg = EmitScalarExpr(E: E->getArg(Arg: 0));
5220 llvm::Type *Ty = Arg->getType();
5221 return Builder.CreateCall(Callee: CGM.getIntrinsic(IID: Intrinsic::aarch64_frint64z, Tys: Ty),
5222 Args: Arg, Name: "frint64z");
5223 }
5224
5225 if (BuiltinID == clang::AArch64::BI__builtin_arm_rint32xf ||
5226 BuiltinID == clang::AArch64::BI__builtin_arm_rint32x) {
5227 llvm::Value *Arg = EmitScalarExpr(E: E->getArg(Arg: 0));
5228 llvm::Type *Ty = Arg->getType();
5229 return Builder.CreateCall(Callee: CGM.getIntrinsic(IID: Intrinsic::aarch64_frint32x, Tys: Ty),
5230 Args: Arg, Name: "frint32x");
5231 }
5232
5233 if (BuiltinID == clang::AArch64::BI__builtin_arm_rint64xf ||
5234 BuiltinID == clang::AArch64::BI__builtin_arm_rint64x) {
5235 llvm::Value *Arg = EmitScalarExpr(E: E->getArg(Arg: 0));
5236 llvm::Type *Ty = Arg->getType();
5237 return Builder.CreateCall(Callee: CGM.getIntrinsic(IID: Intrinsic::aarch64_frint64x, Tys: Ty),
5238 Args: Arg, Name: "frint64x");
5239 }
5240
5241 if (BuiltinID == clang::AArch64::BI__builtin_arm_jcvt) {
5242 assert((getContext().getTypeSize(E->getType()) == 32) &&
5243 "__jcvt of unusual size!");
5244 llvm::Value *Arg = EmitScalarExpr(E: E->getArg(Arg: 0));
5245 return Builder.CreateCall(
5246 Callee: CGM.getIntrinsic(IID: Intrinsic::aarch64_fjcvtzs), Args: Arg);
5247 }
5248
5249 if (BuiltinID == clang::AArch64::BI__builtin_arm_ld64b ||
5250 BuiltinID == clang::AArch64::BI__builtin_arm_st64b ||
5251 BuiltinID == clang::AArch64::BI__builtin_arm_st64bv ||
5252 BuiltinID == clang::AArch64::BI__builtin_arm_st64bv0) {
5253 llvm::Value *MemAddr = EmitScalarExpr(E: E->getArg(Arg: 0));
5254 llvm::Value *ValPtr = EmitScalarExpr(E: E->getArg(Arg: 1));
5255
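// These builtins correspond to the FEAT_LS64 64-byte load/store instructions
// (LD64B / ST64B / ST64BV / ST64BV0); on the C side the 64 bytes of data are
// passed as eight i64 values through ValPtr.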
5256 if (BuiltinID == clang::AArch64::BI__builtin_arm_ld64b) {
5257 // Load from the address via an LLVM intrinsic, receiving a
5258 // tuple of 8 i64 words, and store each one to ValPtr.
5259 Function *F = CGM.getIntrinsic(IID: Intrinsic::aarch64_ld64b);
5260 llvm::Value *Val = Builder.CreateCall(Callee: F, Args: MemAddr);
5261 llvm::Value *ToRet;
5262 for (size_t i = 0; i < 8; i++) {
5263 llvm::Value *ValOffsetPtr =
5264 Builder.CreateGEP(Ty: Int64Ty, Ptr: ValPtr, IdxList: Builder.getInt32(C: i));
5265 Address Addr =
5266 Address(ValOffsetPtr, Int64Ty, CharUnits::fromQuantity(Quantity: 8));
5267 ToRet = Builder.CreateStore(Val: Builder.CreateExtractValue(Agg: Val, Idxs: i), Addr);
5268 }
5269 return ToRet;
5270 } else {
5271 // Load 8 i64 words from ValPtr, and store them to the address
5272 // via an LLVM intrinsic.
5273 SmallVector<llvm::Value *, 9> Args;
5274 Args.push_back(Elt: MemAddr);
5275 for (size_t i = 0; i < 8; i++) {
5276 llvm::Value *ValOffsetPtr =
5277 Builder.CreateGEP(Ty: Int64Ty, Ptr: ValPtr, IdxList: Builder.getInt32(C: i));
5278 Address Addr =
5279 Address(ValOffsetPtr, Int64Ty, CharUnits::fromQuantity(Quantity: 8));
5280 Args.push_back(Elt: Builder.CreateLoad(Addr));
5281 }
5282
5283 auto Intr = (BuiltinID == clang::AArch64::BI__builtin_arm_st64b
5284 ? Intrinsic::aarch64_st64b
5285 : BuiltinID == clang::AArch64::BI__builtin_arm_st64bv
5286 ? Intrinsic::aarch64_st64bv
5287 : Intrinsic::aarch64_st64bv0);
5288 Function *F = CGM.getIntrinsic(IID: Intr);
5289 return Builder.CreateCall(Callee: F, Args);
5290 }
5291 }
5292
5293 if (BuiltinID == clang::AArch64::BI__builtin_arm_rndr ||
5294 BuiltinID == clang::AArch64::BI__builtin_arm_rndrrs) {
5295
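// Both intrinsics return an {i64, i1} pair: the generated random value and a
// status flag. Store the value through the pointer argument and return the
// flag zero-extended to i32.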
5296 auto Intr = (BuiltinID == clang::AArch64::BI__builtin_arm_rndr
5297 ? Intrinsic::aarch64_rndr
5298 : Intrinsic::aarch64_rndrrs);
5299 Function *F = CGM.getIntrinsic(IID: Intr);
5300 llvm::Value *Val = Builder.CreateCall(Callee: F);
5301 Value *RandomValue = Builder.CreateExtractValue(Agg: Val, Idxs: 0);
5302 Value *Status = Builder.CreateExtractValue(Agg: Val, Idxs: 1);
5303
5304 Address MemAddress = EmitPointerWithAlignment(Addr: E->getArg(Arg: 0));
5305 Builder.CreateStore(Val: RandomValue, Addr: MemAddress);
5306 Status = Builder.CreateZExt(V: Status, DestTy: Int32Ty);
5307 return Status;
5308 }
5309
5310 if (BuiltinID == clang::AArch64::BI__clear_cache) {
5311 assert(E->getNumArgs() == 2 && "__clear_cache takes 2 arguments");
5312 const FunctionDecl *FD = E->getDirectCallee();
5313 Value *Ops[2];
5314 for (unsigned i = 0; i < 2; i++)
5315 Ops[i] = EmitScalarExpr(E: E->getArg(Arg: i));
5316 llvm::Type *Ty = CGM.getTypes().ConvertType(T: FD->getType());
5317 llvm::FunctionType *FTy = cast<llvm::FunctionType>(Val: Ty);
5318 StringRef Name = FD->getName();
5319 return EmitNounwindRuntimeCall(callee: CGM.CreateRuntimeFunction(Ty: FTy, Name), args: Ops);
5320 }
5321
5322 if ((BuiltinID == clang::AArch64::BI__builtin_arm_ldrex ||
5323 BuiltinID == clang::AArch64::BI__builtin_arm_ldaex) &&
5324 getContext().getTypeSize(T: E->getType()) == 128) {
5325 Function *F =
5326 CGM.getIntrinsic(IID: BuiltinID == clang::AArch64::BI__builtin_arm_ldaex
5327 ? Intrinsic::aarch64_ldaxp
5328 : Intrinsic::aarch64_ldxp);
5329
5330 Value *LdPtr = EmitScalarExpr(E: E->getArg(Arg: 0));
5331 Value *Val = Builder.CreateCall(Callee: F, Args: LdPtr, Name: "ldxp");
5332
5333 Value *Val0 = Builder.CreateExtractValue(Agg: Val, Idxs: 1);
5334 Value *Val1 = Builder.CreateExtractValue(Agg: Val, Idxs: 0);
5335 llvm::Type *Int128Ty = llvm::IntegerType::get(C&: getLLVMContext(), NumBits: 128);
5336 Val0 = Builder.CreateZExt(V: Val0, DestTy: Int128Ty);
5337 Val1 = Builder.CreateZExt(V: Val1, DestTy: Int128Ty);
5338
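// Reassemble the two 64-bit halves returned by ldxp/ldaxp into a single
// i128: the second result forms the high 64 bits, the first the low 64 bits,
// and the whole is then bitcast to the declared return type.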
5339 Value *ShiftCst = llvm::ConstantInt::get(Ty: Int128Ty, V: 64);
5340 Val = Builder.CreateShl(LHS: Val0, RHS: ShiftCst, Name: "shl", HasNUW: true /* nuw */);
5341 Val = Builder.CreateOr(LHS: Val, RHS: Val1);
5342 return Builder.CreateBitCast(V: Val, DestTy: ConvertType(T: E->getType()));
5343 } else if (BuiltinID == clang::AArch64::BI__builtin_arm_ldrex ||
5344 BuiltinID == clang::AArch64::BI__builtin_arm_ldaex) {
5345 Value *LoadAddr = EmitScalarExpr(E: E->getArg(Arg: 0));
5346
5347 QualType Ty = E->getType();
5348 llvm::Type *RealResTy = ConvertType(T: Ty);
5349 llvm::Type *IntTy =
5350 llvm::IntegerType::get(C&: getLLVMContext(), NumBits: getContext().getTypeSize(T: Ty));
5351
5352 Function *F =
5353 CGM.getIntrinsic(IID: BuiltinID == clang::AArch64::BI__builtin_arm_ldaex
5354 ? Intrinsic::aarch64_ldaxr
5355 : Intrinsic::aarch64_ldxr,
5356 Tys: DefaultPtrTy);
5357 CallInst *Val = Builder.CreateCall(Callee: F, Args: LoadAddr, Name: "ldxr");
5358 Val->addParamAttr(
5359 ArgNo: 0, Attr: Attribute::get(Context&: getLLVMContext(), Kind: Attribute::ElementType, Ty: IntTy));
5360
5361 if (RealResTy->isPointerTy())
5362 return Builder.CreateIntToPtr(V: Val, DestTy: RealResTy);
5363
5364 llvm::Type *IntResTy = llvm::IntegerType::get(
5365 C&: getLLVMContext(), NumBits: CGM.getDataLayout().getTypeSizeInBits(Ty: RealResTy));
5366 return Builder.CreateBitCast(V: Builder.CreateTruncOrBitCast(V: Val, DestTy: IntResTy),
5367 DestTy: RealResTy);
5368 }
5369
5370 if ((BuiltinID == clang::AArch64::BI__builtin_arm_strex ||
5371 BuiltinID == clang::AArch64::BI__builtin_arm_stlex) &&
5372 getContext().getTypeSize(T: E->getArg(Arg: 0)->getType()) == 128) {
5373 Function *F =
5374 CGM.getIntrinsic(IID: BuiltinID == clang::AArch64::BI__builtin_arm_stlex
5375 ? Intrinsic::aarch64_stlxp
5376 : Intrinsic::aarch64_stxp);
5377 llvm::Type *STy = llvm::StructType::get(elt1: Int64Ty, elts: Int64Ty);
5378
5379 Address Tmp = CreateMemTemp(T: E->getArg(Arg: 0)->getType());
5380 EmitAnyExprToMem(E: E->getArg(Arg: 0), Location: Tmp, Quals: Qualifiers(), /*init*/ IsInitializer: true);
5381
5382 Tmp = Tmp.withElementType(ElemTy: STy);
5383 llvm::Value *Val = Builder.CreateLoad(Addr: Tmp);
5384
5385 Value *Arg0 = Builder.CreateExtractValue(Agg: Val, Idxs: 0);
5386 Value *Arg1 = Builder.CreateExtractValue(Agg: Val, Idxs: 1);
5387 Value *StPtr = EmitScalarExpr(E: E->getArg(Arg: 1));
5388 return Builder.CreateCall(Callee: F, Args: {Arg0, Arg1, StPtr}, Name: "stxp");
5389 }
5390
5391 if (BuiltinID == clang::AArch64::BI__builtin_arm_strex ||
5392 BuiltinID == clang::AArch64::BI__builtin_arm_stlex) {
5393 Value *StoreVal = EmitScalarExpr(E: E->getArg(Arg: 0));
5394 Value *StoreAddr = EmitScalarExpr(E: E->getArg(Arg: 1));
5395
5396 QualType Ty = E->getArg(Arg: 0)->getType();
5397 llvm::Type *StoreTy =
5398 llvm::IntegerType::get(C&: getLLVMContext(), NumBits: getContext().getTypeSize(T: Ty));
5399
5400 if (StoreVal->getType()->isPointerTy())
5401 StoreVal = Builder.CreatePtrToInt(V: StoreVal, DestTy: Int64Ty);
5402 else {
5403 llvm::Type *IntTy = llvm::IntegerType::get(
5404 C&: getLLVMContext(),
5405 NumBits: CGM.getDataLayout().getTypeSizeInBits(Ty: StoreVal->getType()));
5406 StoreVal = Builder.CreateBitCast(V: StoreVal, DestTy: IntTy);
5407 StoreVal = Builder.CreateZExtOrBitCast(V: StoreVal, DestTy: Int64Ty);
5408 }
5409
5410 Function *F =
5411 CGM.getIntrinsic(IID: BuiltinID == clang::AArch64::BI__builtin_arm_stlex
5412 ? Intrinsic::aarch64_stlxr
5413 : Intrinsic::aarch64_stxr,
5414 Tys: StoreAddr->getType());
5415 CallInst *CI = Builder.CreateCall(Callee: F, Args: {StoreVal, StoreAddr}, Name: "stxr");
5416 CI->addParamAttr(
5417 ArgNo: 1, Attr: Attribute::get(Context&: getLLVMContext(), Kind: Attribute::ElementType, Ty: StoreTy));
5418 return CI;
5419 }
5420
5421 if (BuiltinID == clang::AArch64::BI__getReg) {
5422 Expr::EvalResult Result;
5423 if (!E->getArg(Arg: 0)->EvaluateAsInt(Result, Ctx: CGM.getContext()))
5424 llvm_unreachable("Sema will ensure that the parameter is constant");
5425
5426 llvm::APSInt Value = Result.Val.getInt();
5427 LLVMContext &Context = CGM.getLLVMContext();
5428 std::string Reg = Value == 31 ? "sp" : "x" + toString(I: Value, Radix: 10);
5429
5430 llvm::Metadata *Ops[] = {llvm::MDString::get(Context, Str: Reg)};
5431 llvm::MDNode *RegName = llvm::MDNode::get(Context, MDs: Ops);
5432 llvm::Value *Metadata = llvm::MetadataAsValue::get(Context, MD: RegName);
5433
5434 llvm::Function *F =
5435 CGM.getIntrinsic(IID: Intrinsic::read_register, Tys: {Int64Ty});
5436 return Builder.CreateCall(Callee: F, Args: Metadata);
5437 }
5438
5439 if (BuiltinID == clang::AArch64::BI__break) {
5440 Expr::EvalResult Result;
5441 if (!E->getArg(Arg: 0)->EvaluateAsInt(Result, Ctx: CGM.getContext()))
5442 llvm_unreachable("Sema will ensure that the parameter is constant");
5443
5444 llvm::Function *F = CGM.getIntrinsic(IID: Intrinsic::aarch64_break);
5445 return Builder.CreateCall(Callee: F, Args: {EmitScalarExpr(E: E->getArg(Arg: 0))});
5446 }
5447
5448 if (BuiltinID == clang::AArch64::BI__builtin_arm_clrex) {
5449 Function *F = CGM.getIntrinsic(IID: Intrinsic::aarch64_clrex);
5450 return Builder.CreateCall(Callee: F);
5451 }
5452
5453 if (BuiltinID == clang::AArch64::BI_ReadWriteBarrier)
5454 return Builder.CreateFence(Ordering: llvm::AtomicOrdering::SequentiallyConsistent,
5455 SSID: llvm::SyncScope::SingleThread);
5456
5457 // CRC32
5458 Intrinsic::ID CRCIntrinsicID = Intrinsic::not_intrinsic;
5459 switch (BuiltinID) {
5460 case clang::AArch64::BI__builtin_arm_crc32b:
5461 CRCIntrinsicID = Intrinsic::aarch64_crc32b; break;
5462 case clang::AArch64::BI__builtin_arm_crc32cb:
5463 CRCIntrinsicID = Intrinsic::aarch64_crc32cb; break;
5464 case clang::AArch64::BI__builtin_arm_crc32h:
5465 CRCIntrinsicID = Intrinsic::aarch64_crc32h; break;
5466 case clang::AArch64::BI__builtin_arm_crc32ch:
5467 CRCIntrinsicID = Intrinsic::aarch64_crc32ch; break;
5468 case clang::AArch64::BI__builtin_arm_crc32w:
5469 CRCIntrinsicID = Intrinsic::aarch64_crc32w; break;
5470 case clang::AArch64::BI__builtin_arm_crc32cw:
5471 CRCIntrinsicID = Intrinsic::aarch64_crc32cw; break;
5472 case clang::AArch64::BI__builtin_arm_crc32d:
5473 CRCIntrinsicID = Intrinsic::aarch64_crc32x; break;
5474 case clang::AArch64::BI__builtin_arm_crc32cd:
5475 CRCIntrinsicID = Intrinsic::aarch64_crc32cx; break;
5476 }
5477
5478 if (CRCIntrinsicID != Intrinsic::not_intrinsic) {
5479 Value *Arg0 = EmitScalarExpr(E: E->getArg(Arg: 0));
5480 Value *Arg1 = EmitScalarExpr(E: E->getArg(Arg: 1));
5481 Function *F = CGM.getIntrinsic(IID: CRCIntrinsicID);
5482
5483 llvm::Type *DataTy = F->getFunctionType()->getParamType(i: 1);
5484 Arg1 = Builder.CreateZExtOrBitCast(V: Arg1, DestTy: DataTy);
5485
5486 return Builder.CreateCall(Callee: F, Args: {Arg0, Arg1});
5487 }
5488
5489 // Memory Operations (MOPS)
5490 if (BuiltinID == AArch64::BI__builtin_arm_mops_memset_tag) {
5491 Value *Dst = EmitScalarExpr(E: E->getArg(Arg: 0));
5492 Value *Val = EmitScalarExpr(E: E->getArg(Arg: 1));
5493 Value *Size = EmitScalarExpr(E: E->getArg(Arg: 2));
5494 Val = Builder.CreateTrunc(V: Val, DestTy: Int8Ty);
5495 Size = Builder.CreateIntCast(V: Size, DestTy: Int64Ty, isSigned: false);
5496 return Builder.CreateCall(
5497 Callee: CGM.getIntrinsic(IID: Intrinsic::aarch64_mops_memset_tag), Args: {Dst, Val, Size});
5498 }
5499
5500 if (BuiltinID == AArch64::BI__builtin_arm_range_prefetch ||
5501 BuiltinID == AArch64::BI__builtin_arm_range_prefetch_x)
5502 return EmitRangePrefetchBuiltin(CGF&: *this, BuiltinID, E);
5503
5504 // Memory Tagging Extensions (MTE) Intrinsics
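// Roughly: irg inserts a random allocation tag into a pointer (excluding
// tags in the given mask), addg adjusts a pointer's tag by an offset, gmi
// adds a pointer's tag to an exclusion mask, ldg reads back the allocation
// tag stored for an address, stg stores a pointer's tag as the allocation tag
// for its address, and subp subtracts two tagged pointers ignoring the tags.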
5505 Intrinsic::ID MTEIntrinsicID = Intrinsic::not_intrinsic;
5506 switch (BuiltinID) {
5507 case clang::AArch64::BI__builtin_arm_irg:
5508 MTEIntrinsicID = Intrinsic::aarch64_irg; break;
5509 case clang::AArch64::BI__builtin_arm_addg:
5510 MTEIntrinsicID = Intrinsic::aarch64_addg; break;
5511 case clang::AArch64::BI__builtin_arm_gmi:
5512 MTEIntrinsicID = Intrinsic::aarch64_gmi; break;
5513 case clang::AArch64::BI__builtin_arm_ldg:
5514 MTEIntrinsicID = Intrinsic::aarch64_ldg; break;
5515 case clang::AArch64::BI__builtin_arm_stg:
5516 MTEIntrinsicID = Intrinsic::aarch64_stg; break;
5517 case clang::AArch64::BI__builtin_arm_subp:
5518 MTEIntrinsicID = Intrinsic::aarch64_subp; break;
5519 }
5520
5521 if (MTEIntrinsicID != Intrinsic::not_intrinsic) {
5522 if (MTEIntrinsicID == Intrinsic::aarch64_irg) {
5523 Value *Pointer = EmitScalarExpr(E: E->getArg(Arg: 0));
5524 Value *Mask = EmitScalarExpr(E: E->getArg(Arg: 1));
5525
5526 Mask = Builder.CreateZExt(V: Mask, DestTy: Int64Ty);
5527 return Builder.CreateCall(Callee: CGM.getIntrinsic(IID: MTEIntrinsicID),
5528 Args: {Pointer, Mask});
5529 }
5530 if (MTEIntrinsicID == Intrinsic::aarch64_addg) {
5531 Value *Pointer = EmitScalarExpr(E: E->getArg(Arg: 0));
5532 Value *TagOffset = EmitScalarExpr(E: E->getArg(Arg: 1));
5533
5534 TagOffset = Builder.CreateZExt(V: TagOffset, DestTy: Int64Ty);
5535 return Builder.CreateCall(Callee: CGM.getIntrinsic(IID: MTEIntrinsicID),
5536 Args: {Pointer, TagOffset});
5537 }
5538 if (MTEIntrinsicID == Intrinsic::aarch64_gmi) {
5539 Value *Pointer = EmitScalarExpr(E: E->getArg(Arg: 0));
5540 Value *ExcludedMask = EmitScalarExpr(E: E->getArg(Arg: 1));
5541
5542 ExcludedMask = Builder.CreateZExt(V: ExcludedMask, DestTy: Int64Ty);
5543 return Builder.CreateCall(
5544 Callee: CGM.getIntrinsic(IID: MTEIntrinsicID), Args: {Pointer, ExcludedMask});
5545 }
5546 // Although it is possible to supply a different return address (the first
5547 // argument) to this intrinsic, for now we set the return address to the
5548 // same value as the input address.
5549 if (MTEIntrinsicID == Intrinsic::aarch64_ldg) {
5550 Value *TagAddress = EmitScalarExpr(E: E->getArg(Arg: 0));
5551 return Builder.CreateCall(Callee: CGM.getIntrinsic(IID: MTEIntrinsicID),
5552 Args: {TagAddress, TagAddress});
5553 }
5554 // Although it is possible to supply a different tag to set (as the first
5555 // argument) to this intrinsic, for now we supply the tag that is already in
5556 // the input address argument (the common use case).
5557 if (MTEIntrinsicID == Intrinsic::aarch64_stg) {
5558 Value *TagAddress = EmitScalarExpr(E: E->getArg(Arg: 0));
5559 return Builder.CreateCall(Callee: CGM.getIntrinsic(IID: MTEIntrinsicID),
5560 Args: {TagAddress, TagAddress});
5561 }
5562 if (MTEIntrinsicID == Intrinsic::aarch64_subp) {
5563 Value *PointerA = EmitScalarExpr(E: E->getArg(Arg: 0));
5564 Value *PointerB = EmitScalarExpr(E: E->getArg(Arg: 1));
5565 return Builder.CreateCall(
5566 Callee: CGM.getIntrinsic(IID: MTEIntrinsicID), Args: {PointerA, PointerB});
5567 }
5568 }
5569
5570 if (BuiltinID == clang::AArch64::BI__builtin_arm_rsr ||
5571 BuiltinID == clang::AArch64::BI__builtin_arm_rsr64 ||
5572 BuiltinID == clang::AArch64::BI__builtin_arm_rsr128 ||
5573 BuiltinID == clang::AArch64::BI__builtin_arm_rsrp ||
5574 BuiltinID == clang::AArch64::BI__builtin_arm_wsr ||
5575 BuiltinID == clang::AArch64::BI__builtin_arm_wsr64 ||
5576 BuiltinID == clang::AArch64::BI__builtin_arm_wsr128 ||
5577 BuiltinID == clang::AArch64::BI__builtin_arm_wsrp) {
5578
5579 SpecialRegisterAccessKind AccessKind = Write;
5580 if (BuiltinID == clang::AArch64::BI__builtin_arm_rsr ||
5581 BuiltinID == clang::AArch64::BI__builtin_arm_rsr64 ||
5582 BuiltinID == clang::AArch64::BI__builtin_arm_rsr128 ||
5583 BuiltinID == clang::AArch64::BI__builtin_arm_rsrp)
5584 AccessKind = VolatileRead;
5585
5586 bool IsPointerBuiltin = BuiltinID == clang::AArch64::BI__builtin_arm_rsrp ||
5587 BuiltinID == clang::AArch64::BI__builtin_arm_wsrp;
5588
5589 bool Is32Bit = BuiltinID == clang::AArch64::BI__builtin_arm_rsr ||
5590 BuiltinID == clang::AArch64::BI__builtin_arm_wsr;
5591
5592 bool Is128Bit = BuiltinID == clang::AArch64::BI__builtin_arm_rsr128 ||
5593 BuiltinID == clang::AArch64::BI__builtin_arm_wsr128;
5594
5595 llvm::Type *ValueType;
5596 llvm::Type *RegisterType = Int64Ty;
5597 if (Is32Bit) {
5598 ValueType = Int32Ty;
5599 } else if (Is128Bit) {
5600 llvm::Type *Int128Ty =
5601 llvm::IntegerType::getInt128Ty(C&: CGM.getLLVMContext());
5602 ValueType = Int128Ty;
5603 RegisterType = Int128Ty;
5604 } else if (IsPointerBuiltin) {
5605 ValueType = VoidPtrTy;
5606 } else {
5607 ValueType = Int64Ty;
5608 }
5609
5610 return EmitSpecialRegisterBuiltin(CGF&: *this, E, RegisterType, ValueType,
5611 AccessKind);
5612 }
5613
5614 if (BuiltinID == clang::AArch64::BI_ReadStatusReg ||
5615 BuiltinID == clang::AArch64::BI_WriteStatusReg ||
5616 BuiltinID == clang::AArch64::BI__sys) {
5617 LLVMContext &Context = CGM.getLLVMContext();
5618
5619 unsigned SysReg =
5620 E->getArg(Arg: 0)->EvaluateKnownConstInt(Ctx: getContext()).getZExtValue();
5621
5622 std::string SysRegStr;
5623 unsigned SysRegOp0 = (BuiltinID == clang::AArch64::BI_ReadStatusReg ||
5624 BuiltinID == clang::AArch64::BI_WriteStatusReg)
5625 ? ((1 << 1) | ((SysReg >> 14) & 1))
5626 : 1;
5627 llvm::raw_string_ostream(SysRegStr)
5628 << SysRegOp0 << ":" << ((SysReg >> 11) & 7) << ":"
5629 << ((SysReg >> 7) & 15) << ":" << ((SysReg >> 3) & 15) << ":"
5630 << (SysReg & 7);
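// The resulting string names the register by its "op0:op1:CRn:CRm:op2"
// encoding, as understood by the read/write_register intrinsics. For example
// (illustrative), TPIDR_EL0 (op0=3, op1=3, CRn=13, CRm=0, op2=2) becomes
// "3:3:13:0:2".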
5631
5632 llvm::Metadata *Ops[] = { llvm::MDString::get(Context, Str: SysRegStr) };
5633 llvm::MDNode *RegName = llvm::MDNode::get(Context, MDs: Ops);
5634 llvm::Value *Metadata = llvm::MetadataAsValue::get(Context, MD: RegName);
5635
5636 llvm::Type *RegisterType = Int64Ty;
5637 llvm::Type *Types[] = { RegisterType };
5638
5639 if (BuiltinID == clang::AArch64::BI_ReadStatusReg) {
5640 llvm::Function *F = CGM.getIntrinsic(IID: Intrinsic::read_register, Tys: Types);
5641
5642 return Builder.CreateCall(Callee: F, Args: Metadata);
5643 }
5644
5645 llvm::Function *F = CGM.getIntrinsic(IID: Intrinsic::write_register, Tys: Types);
5646 llvm::Value *ArgValue = EmitScalarExpr(E: E->getArg(Arg: 1));
5647 llvm::Value *Result = Builder.CreateCall(Callee: F, Args: {Metadata, ArgValue});
5648 if (BuiltinID == clang::AArch64::BI__sys) {
5649 // Return 0 for convenience, even though MSVC returns some other undefined
5650 // value.
5651 Result = ConstantInt::get(Ty: Builder.getInt32Ty(), V: 0);
5652 }
5653 return Result;
5654 }
5655
5656 if (BuiltinID == clang::AArch64::BI_AddressOfReturnAddress) {
5657 llvm::Function *F =
5658 CGM.getIntrinsic(IID: Intrinsic::addressofreturnaddress, Tys: AllocaInt8PtrTy);
5659 return Builder.CreateCall(Callee: F);
5660 }
5661
5662 if (BuiltinID == clang::AArch64::BI__builtin_sponentry) {
5663 llvm::Function *F = CGM.getIntrinsic(IID: Intrinsic::sponentry, Tys: AllocaInt8PtrTy);
5664 return Builder.CreateCall(Callee: F);
5665 }
5666
5667 if (BuiltinID == clang::AArch64::BI__mulh ||
5668 BuiltinID == clang::AArch64::BI__umulh) {
5669 llvm::Type *ResType = ConvertType(T: E->getType());
5670 llvm::Type *Int128Ty = llvm::IntegerType::get(C&: getLLVMContext(), NumBits: 128);
5671
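// __mulh/__umulh return the high 64 bits of the full 128-bit product: widen
// both operands to i128, multiply, and take bits [127:64]. For example,
// __umulh(1ULL << 63, 4) is 2.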
5672 bool IsSigned = BuiltinID == clang::AArch64::BI__mulh;
5673 Value *LHS =
5674 Builder.CreateIntCast(V: EmitScalarExpr(E: E->getArg(Arg: 0)), DestTy: Int128Ty, isSigned: IsSigned);
5675 Value *RHS =
5676 Builder.CreateIntCast(V: EmitScalarExpr(E: E->getArg(Arg: 1)), DestTy: Int128Ty, isSigned: IsSigned);
5677
5678 Value *MulResult, *HigherBits;
5679 if (IsSigned) {
5680 MulResult = Builder.CreateNSWMul(LHS, RHS);
5681 HigherBits = Builder.CreateAShr(LHS: MulResult, RHS: 64);
5682 } else {
5683 MulResult = Builder.CreateNUWMul(LHS, RHS);
5684 HigherBits = Builder.CreateLShr(LHS: MulResult, RHS: 64);
5685 }
5686 HigherBits = Builder.CreateIntCast(V: HigherBits, DestTy: ResType, isSigned: IsSigned);
5687
5688 return HigherBits;
5689 }
5690
5691 if (BuiltinID == AArch64::BI__writex18byte ||
5692 BuiltinID == AArch64::BI__writex18word ||
5693 BuiltinID == AArch64::BI__writex18dword ||
5694 BuiltinID == AArch64::BI__writex18qword) {
5695 // Process the args first
5696 Value *OffsetArg = EmitScalarExpr(E: E->getArg(Arg: 0));
5697 Value *DataArg = EmitScalarExpr(E: E->getArg(Arg: 1));
5698
5699 // Read x18 as i8*
5700 llvm::Value *X18 = readX18AsPtr(CGF&: *this);
5701
5702 // Store val at x18 + offset
5703 Value *Offset = Builder.CreateZExt(V: OffsetArg, DestTy: Int64Ty);
5704 Value *Ptr = Builder.CreateGEP(Ty: Int8Ty, Ptr: X18, IdxList: Offset);
5705 StoreInst *Store =
5706 Builder.CreateAlignedStore(Val: DataArg, Addr: Ptr, Align: CharUnits::One());
5707 return Store;
5708 }
5709
5710 if (BuiltinID == AArch64::BI__readx18byte ||
5711 BuiltinID == AArch64::BI__readx18word ||
5712 BuiltinID == AArch64::BI__readx18dword ||
5713 BuiltinID == AArch64::BI__readx18qword) {
5714 // Process the args first
5715 Value *OffsetArg = EmitScalarExpr(E: E->getArg(Arg: 0));
5716
5717 // Read x18 as i8*
5718 llvm::Value *X18 = readX18AsPtr(CGF&: *this);
5719
5720 // Load x18 + offset
5721 Value *Offset = Builder.CreateZExt(V: OffsetArg, DestTy: Int64Ty);
5722 Value *Ptr = Builder.CreateGEP(Ty: Int8Ty, Ptr: X18, IdxList: Offset);
5723 llvm::Type *IntTy = ConvertType(T: E->getType());
5724 LoadInst *Load = Builder.CreateAlignedLoad(Ty: IntTy, Addr: Ptr, Align: CharUnits::One());
5725 return Load;
5726 }
5727
5728 if (BuiltinID == AArch64::BI__addx18byte ||
5729 BuiltinID == AArch64::BI__addx18word ||
5730 BuiltinID == AArch64::BI__addx18dword ||
5731 BuiltinID == AArch64::BI__addx18qword ||
5732 BuiltinID == AArch64::BI__incx18byte ||
5733 BuiltinID == AArch64::BI__incx18word ||
5734 BuiltinID == AArch64::BI__incx18dword ||
5735 BuiltinID == AArch64::BI__incx18qword) {
5736 llvm::Type *IntTy;
5737 bool isIncrement;
5738 switch (BuiltinID) {
5739 case AArch64::BI__incx18byte:
5740 IntTy = Int8Ty;
5741 isIncrement = true;
5742 break;
5743 case AArch64::BI__incx18word:
5744 IntTy = Int16Ty;
5745 isIncrement = true;
5746 break;
5747 case AArch64::BI__incx18dword:
5748 IntTy = Int32Ty;
5749 isIncrement = true;
5750 break;
5751 case AArch64::BI__incx18qword:
5752 IntTy = Int64Ty;
5753 isIncrement = true;
5754 break;
5755 default:
5756 IntTy = ConvertType(T: E->getArg(Arg: 1)->getType());
5757 isIncrement = false;
5758 break;
5759 }
5760 // Process the args first
5761 Value *OffsetArg = EmitScalarExpr(E: E->getArg(Arg: 0));
5762 Value *ValToAdd =
5763 isIncrement ? ConstantInt::get(Ty: IntTy, V: 1) : EmitScalarExpr(E: E->getArg(Arg: 1));
5764
5765 // Read x18 as i8*
5766 llvm::Value *X18 = readX18AsPtr(CGF&: *this);
5767
5768 // Load x18 + offset
5769 Value *Offset = Builder.CreateZExt(V: OffsetArg, DestTy: Int64Ty);
5770 Value *Ptr = Builder.CreateGEP(Ty: Int8Ty, Ptr: X18, IdxList: Offset);
5771 LoadInst *Load = Builder.CreateAlignedLoad(Ty: IntTy, Addr: Ptr, Align: CharUnits::One());
5772
5773 // Add values
5774 Value *AddResult = Builder.CreateAdd(LHS: Load, RHS: ValToAdd);
5775
5776 // Store val at x18 + offset
5777 StoreInst *Store =
5778 Builder.CreateAlignedStore(Val: AddResult, Addr: Ptr, Align: CharUnits::One());
5779 return Store;
5780 }
5781
5782 if (BuiltinID == AArch64::BI_CopyDoubleFromInt64 ||
5783 BuiltinID == AArch64::BI_CopyFloatFromInt32 ||
5784 BuiltinID == AArch64::BI_CopyInt32FromFloat ||
5785 BuiltinID == AArch64::BI_CopyInt64FromDouble) {
5786 Value *Arg = EmitScalarExpr(E: E->getArg(Arg: 0));
5787 llvm::Type *RetTy = ConvertType(T: E->getType());
5788 return Builder.CreateBitCast(V: Arg, DestTy: RetTy);
5789 }
5790
5791 if (BuiltinID == AArch64::BI_CountLeadingOnes ||
5792 BuiltinID == AArch64::BI_CountLeadingOnes64 ||
5793 BuiltinID == AArch64::BI_CountLeadingZeros ||
5794 BuiltinID == AArch64::BI_CountLeadingZeros64) {
5795 Value *Arg = EmitScalarExpr(E: E->getArg(Arg: 0));
5796 llvm::Type *ArgType = Arg->getType();
5797
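// Leading ones are counted as the leading zeros of the bitwise complement;
// e.g. _CountLeadingOnes(0xF0000000u) is 4 because ~0xF0000000u ==
// 0x0FFFFFFF has four leading zero bits.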
5798 if (BuiltinID == AArch64::BI_CountLeadingOnes ||
5799 BuiltinID == AArch64::BI_CountLeadingOnes64)
5800 Arg = Builder.CreateXor(LHS: Arg, RHS: Constant::getAllOnesValue(Ty: ArgType));
5801
5802 Function *F = CGM.getIntrinsic(IID: Intrinsic::ctlz, Tys: ArgType);
5803 Value *Result = Builder.CreateCall(Callee: F, Args: {Arg, Builder.getInt1(V: false)});
5804
5805 if (BuiltinID == AArch64::BI_CountLeadingOnes64 ||
5806 BuiltinID == AArch64::BI_CountLeadingZeros64)
5807 Result = Builder.CreateTrunc(V: Result, DestTy: Builder.getInt32Ty());
5808 return Result;
5809 }
5810
5811 if (BuiltinID == AArch64::BI_CountLeadingSigns ||
5812 BuiltinID == AArch64::BI_CountLeadingSigns64) {
5813 Value *Arg = EmitScalarExpr(E: E->getArg(Arg: 0));
5814
5815 Function *F = (BuiltinID == AArch64::BI_CountLeadingSigns)
5816 ? CGM.getIntrinsic(IID: Intrinsic::aarch64_cls)
5817 : CGM.getIntrinsic(IID: Intrinsic::aarch64_cls64);
5818
5819 Value *Result = Builder.CreateCall(Callee: F, Args: Arg, Name: "cls");
5820 if (BuiltinID == AArch64::BI_CountLeadingSigns64)
5821 Result = Builder.CreateTrunc(V: Result, DestTy: Builder.getInt32Ty());
5822 return Result;
5823 }
5824
5825 if (BuiltinID == AArch64::BI_CountOneBits ||
5826 BuiltinID == AArch64::BI_CountOneBits64) {
5827 Value *ArgValue = EmitScalarExpr(E: E->getArg(Arg: 0));
5828 llvm::Type *ArgType = ArgValue->getType();
5829 Function *F = CGM.getIntrinsic(IID: Intrinsic::ctpop, Tys: ArgType);
5830
5831 Value *Result = Builder.CreateCall(Callee: F, Args: ArgValue);
5832 if (BuiltinID == AArch64::BI_CountOneBits64)
5833 Result = Builder.CreateTrunc(V: Result, DestTy: Builder.getInt32Ty());
5834 return Result;
5835 }
5836
5837 if (BuiltinID == AArch64::BI__prefetch) {
5838 Value *Address = EmitScalarExpr(E: E->getArg(Arg: 0));
5839 Value *RW = llvm::ConstantInt::get(Ty: Int32Ty, V: 0);
5840 Value *Locality = ConstantInt::get(Ty: Int32Ty, V: 3);
5841 Value *Data = llvm::ConstantInt::get(Ty: Int32Ty, V: 1);
5842 Function *F = CGM.getIntrinsic(IID: Intrinsic::prefetch, Tys: Address->getType());
5843 return Builder.CreateCall(Callee: F, Args: {Address, RW, Locality, Data});
5844 }
5845
5846 if (BuiltinID == AArch64::BI__hlt) {
5847 Function *F = CGM.getIntrinsic(IID: Intrinsic::aarch64_hlt);
5848 Builder.CreateCall(Callee: F, Args: {EmitScalarExpr(E: E->getArg(Arg: 0))});
5849
5850 // Return 0 for convenience, even though MSVC returns some other undefined
5851 // value.
5852 return ConstantInt::get(Ty: Builder.getInt32Ty(), V: 0);
5853 }
5854
5855 if (BuiltinID == NEON::BI__builtin_neon_vcvth_bf16_f32)
5856 return Builder.CreateFPTrunc(
5857 V: Builder.CreateBitCast(V: EmitScalarExpr(E: E->getArg(Arg: 0)),
5858 DestTy: Builder.getFloatTy()),
5859 DestTy: Builder.getBFloatTy());
5860
5861 // Handle MSVC intrinsics before argument evaluation to prevent double
5862 // evaluation.
5863 if (std::optional<MSVCIntrin> MsvcIntId =
5864 translateAarch64ToMsvcIntrin(BuiltinID))
5865 return EmitMSVCBuiltinExpr(BuiltinID: *MsvcIntId, E);
5866
5867 // Some intrinsics are equivalent to others; if so, use the base intrinsic ID.
5868 auto It = llvm::find_if(Range: NEONEquivalentIntrinsicMap, P: [BuiltinID](auto &P) {
5869 return P.first == BuiltinID;
5870 });
5871 if (It != end(arr: NEONEquivalentIntrinsicMap))
5872 BuiltinID = It->second;
5873
5874 // Find out if any arguments are required to be integer constant
5875 // expressions.
5876 unsigned ICEArguments = 0;
5877 ASTContext::GetBuiltinTypeError Error;
5878 getContext().GetBuiltinType(ID: BuiltinID, Error, IntegerConstantArgs: &ICEArguments);
5879 assert(Error == ASTContext::GE_None && "Should not codegen an error");
5880
5881 llvm::SmallVector<Value*, 4> Ops;
5882 Address PtrOp0 = Address::invalid();
5883 for (unsigned i = 0, e = E->getNumArgs() - 1; i != e; i++) {
5884 if (i == 0) {
5885 switch (BuiltinID) {
5886 case NEON::BI__builtin_neon_vld1_v:
5887 case NEON::BI__builtin_neon_vld1q_v:
5888 case NEON::BI__builtin_neon_vld1_dup_v:
5889 case NEON::BI__builtin_neon_vld1q_dup_v:
5890 case NEON::BI__builtin_neon_vld1_lane_v:
5891 case NEON::BI__builtin_neon_vld1q_lane_v:
5892 case NEON::BI__builtin_neon_vst1_v:
5893 case NEON::BI__builtin_neon_vst1q_v:
5894 case NEON::BI__builtin_neon_vst1_lane_v:
5895 case NEON::BI__builtin_neon_vst1q_lane_v:
5896 case NEON::BI__builtin_neon_vldap1_lane_s64:
5897 case NEON::BI__builtin_neon_vldap1q_lane_s64:
5898 case NEON::BI__builtin_neon_vstl1_lane_s64:
5899 case NEON::BI__builtin_neon_vstl1q_lane_s64:
5900 // Get the alignment for the argument in addition to the value;
5901 // we'll use it later.
5902 PtrOp0 = EmitPointerWithAlignment(Addr: E->getArg(Arg: 0));
5903 Ops.push_back(Elt: PtrOp0.emitRawPointer(CGF&: *this));
5904 continue;
5905 }
5906 }
5907 Ops.push_back(Elt: EmitScalarOrConstFoldImmArg(ICEArguments, Idx: i, E));
5908 }
5909
5910 auto SISDMap = ArrayRef(AArch64SISDIntrinsicMap);
5911 const ARMVectorIntrinsicInfo *Builtin = findARMVectorIntrinsicInMap(
5912 IntrinsicMap: SISDMap, BuiltinID, MapProvenSorted&: AArch64SISDIntrinsicsProvenSorted);
5913
5914 if (Builtin) {
5915 Ops.push_back(Elt: EmitScalarExpr(E: E->getArg(Arg: E->getNumArgs() - 1)));
5916 Value *Result = EmitCommonNeonSISDBuiltinExpr(CGF&: *this, SISDInfo: *Builtin, Ops, E);
5917 assert(Result && "SISD intrinsic should have been handled");
5918 return Result;
5919 }
5920
5921 const Expr *Arg = E->getArg(Arg: E->getNumArgs()-1);
5922 NeonTypeFlags Type(0);
5923 if (std::optional<llvm::APSInt> Result =
5924 Arg->getIntegerConstantExpr(Ctx: getContext()))
5925 // Determine the type of this overloaded NEON intrinsic.
5926 Type = NeonTypeFlags(Result->getZExtValue());
5927
5928 bool usgn = Type.isUnsigned();
5929 bool quad = Type.isQuad();
5930
5931 // Handle non-overloaded intrinsics first.
5932 switch (BuiltinID) {
5933 default: break;
5934 case NEON::BI__builtin_neon_vabsh_f16:
5935 Ops.push_back(Elt: EmitScalarExpr(E: E->getArg(Arg: 0)));
5936 return EmitNeonCall(F: CGM.getIntrinsic(IID: Intrinsic::fabs, Tys: HalfTy), Ops, name: "vabs");
5937 case NEON::BI__builtin_neon_vaddq_p128: {
5938 llvm::Type *Ty = GetNeonType(CGF: this, TypeFlags: NeonTypeFlags::Poly128);
5939 Ops.push_back(Elt: EmitScalarExpr(E: E->getArg(Arg: 1)));
5940 Ops[0] = Builder.CreateBitCast(V: Ops[0], DestTy: Ty);
5941 Ops[1] = Builder.CreateBitCast(V: Ops[1], DestTy: Ty);
5942 Ops[0] = Builder.CreateXor(LHS: Ops[0], RHS: Ops[1]);
5943 llvm::Type *Int128Ty = llvm::Type::getIntNTy(C&: getLLVMContext(), N: 128);
5944 return Builder.CreateBitCast(V: Ops[0], DestTy: Int128Ty);
5945 }
5946 case NEON::BI__builtin_neon_vldrq_p128: {
5947 llvm::Type *Int128Ty = llvm::Type::getIntNTy(C&: getLLVMContext(), N: 128);
5948 Value *Ptr = EmitScalarExpr(E: E->getArg(Arg: 0));
5949 return Builder.CreateAlignedLoad(Ty: Int128Ty, Addr: Ptr,
5950 Align: CharUnits::fromQuantity(Quantity: 16));
5951 }
5952 case NEON::BI__builtin_neon_vstrq_p128: {
5953 Value *Ptr = Ops[0];
5954 return Builder.CreateDefaultAlignedStore(Val: EmitScalarExpr(E: E->getArg(Arg: 1)), Addr: Ptr);
5955 }
5956 case NEON::BI__builtin_neon_vcvts_f32_u32:
5957 case NEON::BI__builtin_neon_vcvtd_f64_u64:
5958 usgn = true;
5959 [[fallthrough]];
5960 case NEON::BI__builtin_neon_vcvts_f32_s32:
5961 case NEON::BI__builtin_neon_vcvtd_f64_s64: {
5962 Ops.push_back(Elt: EmitScalarExpr(E: E->getArg(Arg: 0)));
5963 bool Is64 = Ops[0]->getType()->getPrimitiveSizeInBits() == 64;
5964 llvm::Type *InTy = Is64 ? Int64Ty : Int32Ty;
5965 llvm::Type *FTy = Is64 ? DoubleTy : FloatTy;
5966 Ops[0] = Builder.CreateBitCast(V: Ops[0], DestTy: InTy);
5967 if (usgn)
5968 return Builder.CreateUIToFP(V: Ops[0], DestTy: FTy);
5969 return Builder.CreateSIToFP(V: Ops[0], DestTy: FTy);
5970 }
5971 case NEON::BI__builtin_neon_vcvth_f16_u16:
5972 case NEON::BI__builtin_neon_vcvth_f16_u32:
5973 case NEON::BI__builtin_neon_vcvth_f16_u64:
5974 usgn = true;
5975 [[fallthrough]];
5976 case NEON::BI__builtin_neon_vcvth_f16_s16:
5977 case NEON::BI__builtin_neon_vcvth_f16_s32:
5978 case NEON::BI__builtin_neon_vcvth_f16_s64: {
5979 Ops.push_back(Elt: EmitScalarExpr(E: E->getArg(Arg: 0)));
5980 llvm::Type *FTy = HalfTy;
5981 llvm::Type *InTy;
5982 if (Ops[0]->getType()->getPrimitiveSizeInBits() == 64)
5983 InTy = Int64Ty;
5984 else if (Ops[0]->getType()->getPrimitiveSizeInBits() == 32)
5985 InTy = Int32Ty;
5986 else
5987 InTy = Int16Ty;
5988 Ops[0] = Builder.CreateBitCast(V: Ops[0], DestTy: InTy);
5989 if (usgn)
5990 return Builder.CreateUIToFP(V: Ops[0], DestTy: FTy);
5991 return Builder.CreateSIToFP(V: Ops[0], DestTy: FTy);
5992 }
5993 case NEON::BI__builtin_neon_vcvtah_u16_f16:
5994 case NEON::BI__builtin_neon_vcvtmh_u16_f16:
5995 case NEON::BI__builtin_neon_vcvtnh_u16_f16:
5996 case NEON::BI__builtin_neon_vcvtph_u16_f16:
5997 case NEON::BI__builtin_neon_vcvth_u16_f16:
5998 case NEON::BI__builtin_neon_vcvtah_s16_f16:
5999 case NEON::BI__builtin_neon_vcvtmh_s16_f16:
6000 case NEON::BI__builtin_neon_vcvtnh_s16_f16:
6001 case NEON::BI__builtin_neon_vcvtph_s16_f16:
6002 case NEON::BI__builtin_neon_vcvth_s16_f16: {
6003 unsigned Int;
6004 llvm::Type *InTy = Int16Ty;
6005 llvm::Type* FTy = HalfTy;
6006 llvm::Type *Tys[2] = {InTy, FTy};
6007 Ops.push_back(Elt: EmitScalarExpr(E: E->getArg(Arg: 0)));
6008 switch (BuiltinID) {
6009 default: llvm_unreachable("missing builtin ID in switch!");
6010 case NEON::BI__builtin_neon_vcvtah_u16_f16:
6011 Int = Intrinsic::aarch64_neon_fcvtau; break;
6012 case NEON::BI__builtin_neon_vcvtmh_u16_f16:
6013 Int = Intrinsic::aarch64_neon_fcvtmu; break;
6014 case NEON::BI__builtin_neon_vcvtnh_u16_f16:
6015 Int = Intrinsic::aarch64_neon_fcvtnu; break;
6016 case NEON::BI__builtin_neon_vcvtph_u16_f16:
6017 Int = Intrinsic::aarch64_neon_fcvtpu; break;
6018 case NEON::BI__builtin_neon_vcvth_u16_f16:
6019 Int = Intrinsic::aarch64_neon_fcvtzu; break;
6020 case NEON::BI__builtin_neon_vcvtah_s16_f16:
6021 Int = Intrinsic::aarch64_neon_fcvtas; break;
6022 case NEON::BI__builtin_neon_vcvtmh_s16_f16:
6023 Int = Intrinsic::aarch64_neon_fcvtms; break;
6024 case NEON::BI__builtin_neon_vcvtnh_s16_f16:
6025 Int = Intrinsic::aarch64_neon_fcvtns; break;
6026 case NEON::BI__builtin_neon_vcvtph_s16_f16:
6027 Int = Intrinsic::aarch64_neon_fcvtps; break;
6028 case NEON::BI__builtin_neon_vcvth_s16_f16:
6029 Int = Intrinsic::aarch64_neon_fcvtzs; break;
6030 }
6031 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys), Ops, name: "fcvt");
6032 }
6033 case NEON::BI__builtin_neon_vcaleh_f16:
6034 case NEON::BI__builtin_neon_vcalth_f16:
6035 case NEON::BI__builtin_neon_vcageh_f16:
6036 case NEON::BI__builtin_neon_vcagth_f16: {
6037 unsigned Int;
6038 llvm::Type* InTy = Int32Ty;
6039 llvm::Type* FTy = HalfTy;
6040 llvm::Type *Tys[2] = {InTy, FTy};
6041 Ops.push_back(Elt: EmitScalarExpr(E: E->getArg(Arg: 1)));
6042 switch (BuiltinID) {
6043 default: llvm_unreachable("missing builtin ID in switch!");
6044 case NEON::BI__builtin_neon_vcageh_f16:
6045 Int = Intrinsic::aarch64_neon_facge; break;
6046 case NEON::BI__builtin_neon_vcagth_f16:
6047 Int = Intrinsic::aarch64_neon_facgt; break;
6048 case NEON::BI__builtin_neon_vcaleh_f16:
6049 Int = Intrinsic::aarch64_neon_facge; std::swap(a&: Ops[0], b&: Ops[1]); break;
6050 case NEON::BI__builtin_neon_vcalth_f16:
6051 Int = Intrinsic::aarch64_neon_facgt; std::swap(a&: Ops[0], b&: Ops[1]); break;
6052 }
6053 Ops[0] = EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys), Ops, name: "facg");
6054 return Builder.CreateTrunc(V: Ops[0], DestTy: Int16Ty);
6055 }
6056 case NEON::BI__builtin_neon_vcvth_n_s16_f16:
6057 case NEON::BI__builtin_neon_vcvth_n_u16_f16: {
6058 unsigned Int;
6059 llvm::Type* InTy = Int32Ty;
6060 llvm::Type* FTy = HalfTy;
6061 llvm::Type *Tys[2] = {InTy, FTy};
6062 Ops.push_back(Elt: EmitScalarExpr(E: E->getArg(Arg: 1)));
6063 switch (BuiltinID) {
6064 default: llvm_unreachable("missing builtin ID in switch!");
6065 case NEON::BI__builtin_neon_vcvth_n_s16_f16:
6066 Int = Intrinsic::aarch64_neon_vcvtfp2fxs; break;
6067 case NEON::BI__builtin_neon_vcvth_n_u16_f16:
6068 Int = Intrinsic::aarch64_neon_vcvtfp2fxu; break;
6069 }
6070 Ops[0] = EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys), Ops, name: "fcvth_n");
6071 return Builder.CreateTrunc(V: Ops[0], DestTy: Int16Ty);
6072 }
6073 case NEON::BI__builtin_neon_vcvth_n_f16_s16:
6074 case NEON::BI__builtin_neon_vcvth_n_f16_u16: {
6075 unsigned Int;
6076 llvm::Type* FTy = HalfTy;
6077 llvm::Type* InTy = Int32Ty;
6078 llvm::Type *Tys[2] = {FTy, InTy};
6079 Ops.push_back(Elt: EmitScalarExpr(E: E->getArg(Arg: 1)));
6080 switch (BuiltinID) {
6081 default: llvm_unreachable("missing builtin ID in switch!");
6082 case NEON::BI__builtin_neon_vcvth_n_f16_s16:
6083 Int = Intrinsic::aarch64_neon_vcvtfxs2fp;
6084 Ops[0] = Builder.CreateSExt(V: Ops[0], DestTy: InTy, Name: "sext");
6085 break;
6086 case NEON::BI__builtin_neon_vcvth_n_f16_u16:
6087 Int = Intrinsic::aarch64_neon_vcvtfxu2fp;
6088 Ops[0] = Builder.CreateZExt(V: Ops[0], DestTy: InTy);
6089 break;
6090 }
6091 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys), Ops, name: "fcvth_n");
6092 }
6093 case NEON::BI__builtin_neon_vpaddd_s64: {
6094 auto *Ty = llvm::FixedVectorType::get(ElementType: Int64Ty, NumElts: 2);
6095 Value *Vec = EmitScalarExpr(E: E->getArg(Arg: 0));
6096 // The vector is v2i64, so make sure it's bitcast to that.
6097 Vec = Builder.CreateBitCast(V: Vec, DestTy: Ty, Name: "v2i64");
6098 llvm::Value *Idx0 = llvm::ConstantInt::get(Ty: SizeTy, V: 0);
6099 llvm::Value *Idx1 = llvm::ConstantInt::get(Ty: SizeTy, V: 1);
6100 Value *Op0 = Builder.CreateExtractElement(Vec, Idx: Idx0, Name: "lane0");
6101 Value *Op1 = Builder.CreateExtractElement(Vec, Idx: Idx1, Name: "lane1");
6102 // Pairwise addition of a v2i64 into a scalar i64.
6103 return Builder.CreateAdd(LHS: Op0, RHS: Op1, Name: "vpaddd");
6104 }
6105 case NEON::BI__builtin_neon_vpaddd_f64: {
6106 auto *Ty = llvm::FixedVectorType::get(ElementType: DoubleTy, NumElts: 2);
6107 Value *Vec = EmitScalarExpr(E: E->getArg(Arg: 0));
6108 // The vector is v2f64, so make sure it's bitcast to that.
6109 Vec = Builder.CreateBitCast(V: Vec, DestTy: Ty, Name: "v2f64");
6110 llvm::Value *Idx0 = llvm::ConstantInt::get(Ty: SizeTy, V: 0);
6111 llvm::Value *Idx1 = llvm::ConstantInt::get(Ty: SizeTy, V: 1);
6112 Value *Op0 = Builder.CreateExtractElement(Vec, Idx: Idx0, Name: "lane0");
6113 Value *Op1 = Builder.CreateExtractElement(Vec, Idx: Idx1, Name: "lane1");
6114 // Pairwise addition of a v2f64 into a scalar f64.
6115 return Builder.CreateFAdd(L: Op0, R: Op1, Name: "vpaddd");
6116 }
6117 case NEON::BI__builtin_neon_vpadds_f32: {
6118 auto *Ty = llvm::FixedVectorType::get(ElementType: FloatTy, NumElts: 2);
6119 Value *Vec = EmitScalarExpr(E: E->getArg(Arg: 0));
6120 // The vector is v2f32, so make sure it's bitcast to that.
6121 Vec = Builder.CreateBitCast(V: Vec, DestTy: Ty, Name: "v2f32");
6122 llvm::Value *Idx0 = llvm::ConstantInt::get(Ty: SizeTy, V: 0);
6123 llvm::Value *Idx1 = llvm::ConstantInt::get(Ty: SizeTy, V: 1);
6124 Value *Op0 = Builder.CreateExtractElement(Vec, Idx: Idx0, Name: "lane0");
6125 Value *Op1 = Builder.CreateExtractElement(Vec, Idx: Idx1, Name: "lane1");
6126 // Pairwise addition of a v2f32 into a scalar f32.
6127 return Builder.CreateFAdd(L: Op0, R: Op1, Name: "vpaddd");
6128 }
6129 case NEON::BI__builtin_neon_vceqzd_s64:
6130 Ops.push_back(Elt: EmitScalarExpr(E: E->getArg(Arg: 0)));
6131 return EmitAArch64CompareBuiltinExpr(
6132 Op: Ops[0], Ty: ConvertType(T: E->getCallReturnType(Ctx: getContext())),
6133 Pred: ICmpInst::ICMP_EQ, Name: "vceqz");
6134 case NEON::BI__builtin_neon_vceqzd_f64:
6135 case NEON::BI__builtin_neon_vceqzs_f32:
6136 case NEON::BI__builtin_neon_vceqzh_f16:
6137 Ops.push_back(Elt: EmitScalarExpr(E: E->getArg(Arg: 0)));
6138 return EmitAArch64CompareBuiltinExpr(
6139 Op: Ops[0], Ty: ConvertType(T: E->getCallReturnType(Ctx: getContext())),
6140 Pred: ICmpInst::FCMP_OEQ, Name: "vceqz");
6141 case NEON::BI__builtin_neon_vcgezd_s64:
6142 Ops.push_back(Elt: EmitScalarExpr(E: E->getArg(Arg: 0)));
6143 return EmitAArch64CompareBuiltinExpr(
6144 Op: Ops[0], Ty: ConvertType(T: E->getCallReturnType(Ctx: getContext())),
6145 Pred: ICmpInst::ICMP_SGE, Name: "vcgez");
6146 case NEON::BI__builtin_neon_vcgezd_f64:
6147 case NEON::BI__builtin_neon_vcgezs_f32:
6148 case NEON::BI__builtin_neon_vcgezh_f16:
6149 Ops.push_back(Elt: EmitScalarExpr(E: E->getArg(Arg: 0)));
6150 return EmitAArch64CompareBuiltinExpr(
6151 Op: Ops[0], Ty: ConvertType(T: E->getCallReturnType(Ctx: getContext())),
6152 Pred: ICmpInst::FCMP_OGE, Name: "vcgez");
6153 case NEON::BI__builtin_neon_vclezd_s64:
6154 Ops.push_back(Elt: EmitScalarExpr(E: E->getArg(Arg: 0)));
6155 return EmitAArch64CompareBuiltinExpr(
6156 Op: Ops[0], Ty: ConvertType(T: E->getCallReturnType(Ctx: getContext())),
6157 Pred: ICmpInst::ICMP_SLE, Name: "vclez");
6158 case NEON::BI__builtin_neon_vclezd_f64:
6159 case NEON::BI__builtin_neon_vclezs_f32:
6160 case NEON::BI__builtin_neon_vclezh_f16:
6161 Ops.push_back(Elt: EmitScalarExpr(E: E->getArg(Arg: 0)));
6162 return EmitAArch64CompareBuiltinExpr(
6163 Op: Ops[0], Ty: ConvertType(T: E->getCallReturnType(Ctx: getContext())),
6164 Pred: ICmpInst::FCMP_OLE, Name: "vclez");
6165 case NEON::BI__builtin_neon_vcgtzd_s64:
6166 Ops.push_back(Elt: EmitScalarExpr(E: E->getArg(Arg: 0)));
6167 return EmitAArch64CompareBuiltinExpr(
6168 Op: Ops[0], Ty: ConvertType(T: E->getCallReturnType(Ctx: getContext())),
6169 Pred: ICmpInst::ICMP_SGT, Name: "vcgtz");
6170 case NEON::BI__builtin_neon_vcgtzd_f64:
6171 case NEON::BI__builtin_neon_vcgtzs_f32:
6172 case NEON::BI__builtin_neon_vcgtzh_f16:
6173 Ops.push_back(Elt: EmitScalarExpr(E: E->getArg(Arg: 0)));
6174 return EmitAArch64CompareBuiltinExpr(
6175 Op: Ops[0], Ty: ConvertType(T: E->getCallReturnType(Ctx: getContext())),
6176 Pred: ICmpInst::FCMP_OGT, Name: "vcgtz");
6177 case NEON::BI__builtin_neon_vcltzd_s64:
6178 Ops.push_back(Elt: EmitScalarExpr(E: E->getArg(Arg: 0)));
6179 return EmitAArch64CompareBuiltinExpr(
6180 Op: Ops[0], Ty: ConvertType(T: E->getCallReturnType(Ctx: getContext())),
6181 Pred: ICmpInst::ICMP_SLT, Name: "vcltz");
6182
6183 case NEON::BI__builtin_neon_vcltzd_f64:
6184 case NEON::BI__builtin_neon_vcltzs_f32:
6185 case NEON::BI__builtin_neon_vcltzh_f16:
6186 Ops.push_back(Elt: EmitScalarExpr(E: E->getArg(Arg: 0)));
6187 return EmitAArch64CompareBuiltinExpr(
6188 Op: Ops[0], Ty: ConvertType(T: E->getCallReturnType(Ctx: getContext())),
6189 Pred: ICmpInst::FCMP_OLT, Name: "vcltz");
6190
6191 case NEON::BI__builtin_neon_vceqzd_u64: {
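    // Compare against zero and sign-extend the i1 result into an
    // all-ones/all-zeros i64 mask.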
6192 Ops.push_back(Elt: EmitScalarExpr(E: E->getArg(Arg: 0)));
6193 Ops[0] = Builder.CreateBitCast(V: Ops[0], DestTy: Int64Ty);
6194 Ops[0] =
6195 Builder.CreateICmpEQ(LHS: Ops[0], RHS: llvm::Constant::getNullValue(Ty: Int64Ty));
6196 return Builder.CreateSExt(V: Ops[0], DestTy: Int64Ty, Name: "vceqzd");
6197 }
6198 case NEON::BI__builtin_neon_vceqd_f64:
6199 case NEON::BI__builtin_neon_vcled_f64:
6200 case NEON::BI__builtin_neon_vcltd_f64:
6201 case NEON::BI__builtin_neon_vcged_f64:
6202 case NEON::BI__builtin_neon_vcgtd_f64: {
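    // Scalar f64 compares: equality uses a quiet compare, while the ordering
    // predicates use signaling compares; the i1 result is sign-extended to an
    // i64 mask.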
6203 llvm::CmpInst::Predicate P;
6204 switch (BuiltinID) {
6205 default: llvm_unreachable("missing builtin ID in switch!");
6206 case NEON::BI__builtin_neon_vceqd_f64: P = llvm::FCmpInst::FCMP_OEQ; break;
6207 case NEON::BI__builtin_neon_vcled_f64: P = llvm::FCmpInst::FCMP_OLE; break;
6208 case NEON::BI__builtin_neon_vcltd_f64: P = llvm::FCmpInst::FCMP_OLT; break;
6209 case NEON::BI__builtin_neon_vcged_f64: P = llvm::FCmpInst::FCMP_OGE; break;
6210 case NEON::BI__builtin_neon_vcgtd_f64: P = llvm::FCmpInst::FCMP_OGT; break;
6211 }
6212 Ops.push_back(Elt: EmitScalarExpr(E: E->getArg(Arg: 1)));
6213 Ops[0] = Builder.CreateBitCast(V: Ops[0], DestTy: DoubleTy);
6214 Ops[1] = Builder.CreateBitCast(V: Ops[1], DestTy: DoubleTy);
6215 if (P == llvm::FCmpInst::FCMP_OEQ)
6216 Ops[0] = Builder.CreateFCmp(P, LHS: Ops[0], RHS: Ops[1]);
6217 else
6218 Ops[0] = Builder.CreateFCmpS(P, LHS: Ops[0], RHS: Ops[1]);
6219 return Builder.CreateSExt(V: Ops[0], DestTy: Int64Ty, Name: "vcmpd");
6220 }
6221 case NEON::BI__builtin_neon_vceqs_f32:
6222 case NEON::BI__builtin_neon_vcles_f32:
6223 case NEON::BI__builtin_neon_vclts_f32:
6224 case NEON::BI__builtin_neon_vcges_f32:
6225 case NEON::BI__builtin_neon_vcgts_f32: {
6226 llvm::CmpInst::Predicate P;
6227 switch (BuiltinID) {
6228 default: llvm_unreachable("missing builtin ID in switch!");
6229 case NEON::BI__builtin_neon_vceqs_f32: P = llvm::FCmpInst::FCMP_OEQ; break;
6230 case NEON::BI__builtin_neon_vcles_f32: P = llvm::FCmpInst::FCMP_OLE; break;
6231 case NEON::BI__builtin_neon_vclts_f32: P = llvm::FCmpInst::FCMP_OLT; break;
6232 case NEON::BI__builtin_neon_vcges_f32: P = llvm::FCmpInst::FCMP_OGE; break;
6233 case NEON::BI__builtin_neon_vcgts_f32: P = llvm::FCmpInst::FCMP_OGT; break;
6234 }
6235 Ops.push_back(Elt: EmitScalarExpr(E: E->getArg(Arg: 1)));
6236 Ops[0] = Builder.CreateBitCast(V: Ops[0], DestTy: FloatTy);
6237 Ops[1] = Builder.CreateBitCast(V: Ops[1], DestTy: FloatTy);
6238 if (P == llvm::FCmpInst::FCMP_OEQ)
6239 Ops[0] = Builder.CreateFCmp(P, LHS: Ops[0], RHS: Ops[1]);
6240 else
6241 Ops[0] = Builder.CreateFCmpS(P, LHS: Ops[0], RHS: Ops[1]);
6242 return Builder.CreateSExt(V: Ops[0], DestTy: Int32Ty, Name: "vcmpd");
6243 }
6244 case NEON::BI__builtin_neon_vceqh_f16:
6245 case NEON::BI__builtin_neon_vcleh_f16:
6246 case NEON::BI__builtin_neon_vclth_f16:
6247 case NEON::BI__builtin_neon_vcgeh_f16:
6248 case NEON::BI__builtin_neon_vcgth_f16: {
6249 llvm::CmpInst::Predicate P;
6250 switch (BuiltinID) {
6251 default: llvm_unreachable("missing builtin ID in switch!");
6252 case NEON::BI__builtin_neon_vceqh_f16: P = llvm::FCmpInst::FCMP_OEQ; break;
6253 case NEON::BI__builtin_neon_vcleh_f16: P = llvm::FCmpInst::FCMP_OLE; break;
6254 case NEON::BI__builtin_neon_vclth_f16: P = llvm::FCmpInst::FCMP_OLT; break;
6255 case NEON::BI__builtin_neon_vcgeh_f16: P = llvm::FCmpInst::FCMP_OGE; break;
6256 case NEON::BI__builtin_neon_vcgth_f16: P = llvm::FCmpInst::FCMP_OGT; break;
6257 }
6258 Ops.push_back(Elt: EmitScalarExpr(E: E->getArg(Arg: 1)));
6259 Ops[0] = Builder.CreateBitCast(V: Ops[0], DestTy: HalfTy);
6260 Ops[1] = Builder.CreateBitCast(V: Ops[1], DestTy: HalfTy);
6261 if (P == llvm::FCmpInst::FCMP_OEQ)
6262 Ops[0] = Builder.CreateFCmp(P, LHS: Ops[0], RHS: Ops[1]);
6263 else
6264 Ops[0] = Builder.CreateFCmpS(P, LHS: Ops[0], RHS: Ops[1]);
6265 return Builder.CreateSExt(V: Ops[0], DestTy: Int16Ty, Name: "vcmpd");
6266 }
6267 case NEON::BI__builtin_neon_vceqd_s64:
6268 case NEON::BI__builtin_neon_vceqd_u64:
6269 case NEON::BI__builtin_neon_vcgtd_s64:
6270 case NEON::BI__builtin_neon_vcgtd_u64:
6271 case NEON::BI__builtin_neon_vcltd_s64:
6272 case NEON::BI__builtin_neon_vcltd_u64:
6273 case NEON::BI__builtin_neon_vcged_u64:
6274 case NEON::BI__builtin_neon_vcged_s64:
6275 case NEON::BI__builtin_neon_vcled_u64:
6276 case NEON::BI__builtin_neon_vcled_s64: {
6277 llvm::CmpInst::Predicate P;
6278 switch (BuiltinID) {
6279 default: llvm_unreachable("missing builtin ID in switch!");
6280 case NEON::BI__builtin_neon_vceqd_s64:
6281    case NEON::BI__builtin_neon_vceqd_u64: P = llvm::ICmpInst::ICMP_EQ; break;
6282    case NEON::BI__builtin_neon_vcgtd_s64: P = llvm::ICmpInst::ICMP_SGT; break;
6283    case NEON::BI__builtin_neon_vcgtd_u64: P = llvm::ICmpInst::ICMP_UGT; break;
6284    case NEON::BI__builtin_neon_vcltd_s64: P = llvm::ICmpInst::ICMP_SLT; break;
6285    case NEON::BI__builtin_neon_vcltd_u64: P = llvm::ICmpInst::ICMP_ULT; break;
6286    case NEON::BI__builtin_neon_vcged_u64: P = llvm::ICmpInst::ICMP_UGE; break;
6287    case NEON::BI__builtin_neon_vcged_s64: P = llvm::ICmpInst::ICMP_SGE; break;
6288    case NEON::BI__builtin_neon_vcled_u64: P = llvm::ICmpInst::ICMP_ULE; break;
6289    case NEON::BI__builtin_neon_vcled_s64: P = llvm::ICmpInst::ICMP_SLE; break;
6290 }
6291 Ops.push_back(Elt: EmitScalarExpr(E: E->getArg(Arg: 1)));
6292 Ops[0] = Builder.CreateBitCast(V: Ops[0], DestTy: Int64Ty);
6293 Ops[1] = Builder.CreateBitCast(V: Ops[1], DestTy: Int64Ty);
6294 Ops[0] = Builder.CreateICmp(P, LHS: Ops[0], RHS: Ops[1]);
6295 return Builder.CreateSExt(V: Ops[0], DestTy: Int64Ty, Name: "vceqd");
6296 }
6297 case NEON::BI__builtin_neon_vtstd_s64:
6298 case NEON::BI__builtin_neon_vtstd_u64: {
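    // vtst is a bit test: AND the operands, compare against zero, and
    // sign-extend the i1 result to an i64 mask.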
6299 Ops.push_back(Elt: EmitScalarExpr(E: E->getArg(Arg: 1)));
6300 Ops[0] = Builder.CreateBitCast(V: Ops[0], DestTy: Int64Ty);
6301 Ops[1] = Builder.CreateBitCast(V: Ops[1], DestTy: Int64Ty);
6302 Ops[0] = Builder.CreateAnd(LHS: Ops[0], RHS: Ops[1]);
6303 Ops[0] = Builder.CreateICmp(P: ICmpInst::ICMP_NE, LHS: Ops[0],
6304 RHS: llvm::Constant::getNullValue(Ty: Int64Ty));
6305 return Builder.CreateSExt(V: Ops[0], DestTy: Int64Ty, Name: "vtstd");
6306 }
6307 case NEON::BI__builtin_neon_vset_lane_i8:
6308 case NEON::BI__builtin_neon_vset_lane_i16:
6309 case NEON::BI__builtin_neon_vset_lane_i32:
6310 case NEON::BI__builtin_neon_vset_lane_i64:
6311 case NEON::BI__builtin_neon_vset_lane_bf16:
6312 case NEON::BI__builtin_neon_vset_lane_f32:
6313 case NEON::BI__builtin_neon_vsetq_lane_i8:
6314 case NEON::BI__builtin_neon_vsetq_lane_i16:
6315 case NEON::BI__builtin_neon_vsetq_lane_i32:
6316 case NEON::BI__builtin_neon_vsetq_lane_i64:
6317 case NEON::BI__builtin_neon_vsetq_lane_bf16:
6318 case NEON::BI__builtin_neon_vsetq_lane_f32:
6319 Ops.push_back(Elt: EmitScalarExpr(E: E->getArg(Arg: 2)));
6320 return Builder.CreateInsertElement(Vec: Ops[1], NewElt: Ops[0], Idx: Ops[2], Name: "vset_lane");
6321 case NEON::BI__builtin_neon_vset_lane_f64:
6322 // The vector type needs a cast for the v1f64 variant.
6323 Ops[1] =
6324 Builder.CreateBitCast(V: Ops[1], DestTy: llvm::FixedVectorType::get(ElementType: DoubleTy, NumElts: 1));
6325 Ops.push_back(Elt: EmitScalarExpr(E: E->getArg(Arg: 2)));
6326 return Builder.CreateInsertElement(Vec: Ops[1], NewElt: Ops[0], Idx: Ops[2], Name: "vset_lane");
6327 case NEON::BI__builtin_neon_vset_lane_mf8:
6328 case NEON::BI__builtin_neon_vsetq_lane_mf8:
6329 Ops.push_back(Elt: EmitScalarExpr(E: E->getArg(Arg: 2)));
6330    // The input vector type needs a cast to the scalar type.
6331 Ops[0] =
6332 Builder.CreateBitCast(V: Ops[0], DestTy: llvm::Type::getInt8Ty(C&: getLLVMContext()));
6333 return Builder.CreateInsertElement(Vec: Ops[1], NewElt: Ops[0], Idx: Ops[2], Name: "vset_lane");
6334 case NEON::BI__builtin_neon_vsetq_lane_f64:
6335 // The vector type needs a cast for the v2f64 variant.
6336 Ops[1] =
6337 Builder.CreateBitCast(V: Ops[1], DestTy: llvm::FixedVectorType::get(ElementType: DoubleTy, NumElts: 2));
6338 Ops.push_back(Elt: EmitScalarExpr(E: E->getArg(Arg: 2)));
6339 return Builder.CreateInsertElement(Vec: Ops[1], NewElt: Ops[0], Idx: Ops[2], Name: "vset_lane");
6340
6341 case NEON::BI__builtin_neon_vget_lane_i8:
6342 case NEON::BI__builtin_neon_vdupb_lane_i8:
6343 Ops[0] =
6344 Builder.CreateBitCast(V: Ops[0], DestTy: llvm::FixedVectorType::get(ElementType: Int8Ty, NumElts: 8));
6345 return Builder.CreateExtractElement(Vec: Ops[0], Idx: EmitScalarExpr(E: E->getArg(Arg: 1)),
6346 Name: "vget_lane");
6347 case NEON::BI__builtin_neon_vgetq_lane_i8:
6348 case NEON::BI__builtin_neon_vdupb_laneq_i8:
6349 Ops[0] =
6350 Builder.CreateBitCast(V: Ops[0], DestTy: llvm::FixedVectorType::get(ElementType: Int8Ty, NumElts: 16));
6351 return Builder.CreateExtractElement(Vec: Ops[0], Idx: EmitScalarExpr(E: E->getArg(Arg: 1)),
6352 Name: "vgetq_lane");
6353 case NEON::BI__builtin_neon_vget_lane_mf8:
6354 case NEON::BI__builtin_neon_vdupb_lane_mf8:
6355 case NEON::BI__builtin_neon_vgetq_lane_mf8:
6356 case NEON::BI__builtin_neon_vdupb_laneq_mf8:
6357 return Builder.CreateExtractElement(Vec: Ops[0], Idx: EmitScalarExpr(E: E->getArg(Arg: 1)),
6358 Name: "vget_lane");
6359 case NEON::BI__builtin_neon_vget_lane_i16:
6360 case NEON::BI__builtin_neon_vduph_lane_i16:
6361 Ops[0] =
6362 Builder.CreateBitCast(V: Ops[0], DestTy: llvm::FixedVectorType::get(ElementType: Int16Ty, NumElts: 4));
6363 return Builder.CreateExtractElement(Vec: Ops[0], Idx: EmitScalarExpr(E: E->getArg(Arg: 1)),
6364 Name: "vget_lane");
6365 case NEON::BI__builtin_neon_vgetq_lane_i16:
6366 case NEON::BI__builtin_neon_vduph_laneq_i16:
6367 Ops[0] =
6368 Builder.CreateBitCast(V: Ops[0], DestTy: llvm::FixedVectorType::get(ElementType: Int16Ty, NumElts: 8));
6369 return Builder.CreateExtractElement(Vec: Ops[0], Idx: EmitScalarExpr(E: E->getArg(Arg: 1)),
6370 Name: "vgetq_lane");
6371 case NEON::BI__builtin_neon_vget_lane_i32:
6372 case NEON::BI__builtin_neon_vdups_lane_i32:
6373 Ops[0] =
6374 Builder.CreateBitCast(V: Ops[0], DestTy: llvm::FixedVectorType::get(ElementType: Int32Ty, NumElts: 2));
6375 return Builder.CreateExtractElement(Vec: Ops[0], Idx: EmitScalarExpr(E: E->getArg(Arg: 1)),
6376 Name: "vget_lane");
6377 case NEON::BI__builtin_neon_vdups_lane_f32:
6378 Ops[0] =
6379 Builder.CreateBitCast(V: Ops[0], DestTy: llvm::FixedVectorType::get(ElementType: FloatTy, NumElts: 2));
6380 return Builder.CreateExtractElement(Vec: Ops[0], Idx: EmitScalarExpr(E: E->getArg(Arg: 1)),
6381 Name: "vdups_lane");
6382 case NEON::BI__builtin_neon_vgetq_lane_i32:
6383 case NEON::BI__builtin_neon_vdups_laneq_i32:
6384 Ops[0] =
6385 Builder.CreateBitCast(V: Ops[0], DestTy: llvm::FixedVectorType::get(ElementType: Int32Ty, NumElts: 4));
6386 return Builder.CreateExtractElement(Vec: Ops[0], Idx: EmitScalarExpr(E: E->getArg(Arg: 1)),
6387 Name: "vgetq_lane");
6388 case NEON::BI__builtin_neon_vget_lane_i64:
6389 case NEON::BI__builtin_neon_vdupd_lane_i64:
6390 Ops[0] =
6391 Builder.CreateBitCast(V: Ops[0], DestTy: llvm::FixedVectorType::get(ElementType: Int64Ty, NumElts: 1));
6392 return Builder.CreateExtractElement(Vec: Ops[0], Idx: EmitScalarExpr(E: E->getArg(Arg: 1)),
6393 Name: "vget_lane");
6394 case NEON::BI__builtin_neon_vdupd_lane_f64:
6395 Ops[0] =
6396 Builder.CreateBitCast(V: Ops[0], DestTy: llvm::FixedVectorType::get(ElementType: DoubleTy, NumElts: 1));
6397 return Builder.CreateExtractElement(Vec: Ops[0], Idx: EmitScalarExpr(E: E->getArg(Arg: 1)),
6398 Name: "vdupd_lane");
6399 case NEON::BI__builtin_neon_vgetq_lane_i64:
6400 case NEON::BI__builtin_neon_vdupd_laneq_i64:
6401 Ops[0] =
6402 Builder.CreateBitCast(V: Ops[0], DestTy: llvm::FixedVectorType::get(ElementType: Int64Ty, NumElts: 2));
6403 return Builder.CreateExtractElement(Vec: Ops[0], Idx: EmitScalarExpr(E: E->getArg(Arg: 1)),
6404 Name: "vgetq_lane");
6405 case NEON::BI__builtin_neon_vget_lane_f32:
6406 Ops[0] =
6407 Builder.CreateBitCast(V: Ops[0], DestTy: llvm::FixedVectorType::get(ElementType: FloatTy, NumElts: 2));
6408 return Builder.CreateExtractElement(Vec: Ops[0], Idx: EmitScalarExpr(E: E->getArg(Arg: 1)),
6409 Name: "vget_lane");
6410 case NEON::BI__builtin_neon_vget_lane_f64:
6411 Ops[0] =
6412 Builder.CreateBitCast(V: Ops[0], DestTy: llvm::FixedVectorType::get(ElementType: DoubleTy, NumElts: 1));
6413 return Builder.CreateExtractElement(Vec: Ops[0], Idx: EmitScalarExpr(E: E->getArg(Arg: 1)),
6414 Name: "vget_lane");
6415 case NEON::BI__builtin_neon_vgetq_lane_f32:
6416 case NEON::BI__builtin_neon_vdups_laneq_f32:
6417 Ops[0] =
6418 Builder.CreateBitCast(V: Ops[0], DestTy: llvm::FixedVectorType::get(ElementType: FloatTy, NumElts: 4));
6419 return Builder.CreateExtractElement(Vec: Ops[0], Idx: EmitScalarExpr(E: E->getArg(Arg: 1)),
6420 Name: "vgetq_lane");
6421 case NEON::BI__builtin_neon_vgetq_lane_f64:
6422 case NEON::BI__builtin_neon_vdupd_laneq_f64:
6423 Ops[0] =
6424 Builder.CreateBitCast(V: Ops[0], DestTy: llvm::FixedVectorType::get(ElementType: DoubleTy, NumElts: 2));
6425 return Builder.CreateExtractElement(Vec: Ops[0], Idx: EmitScalarExpr(E: E->getArg(Arg: 1)),
6426 Name: "vgetq_lane");
6427 case NEON::BI__builtin_neon_vaddh_f16:
6428 Ops.push_back(Elt: EmitScalarExpr(E: E->getArg(Arg: 1)));
6429 return Builder.CreateFAdd(L: Ops[0], R: Ops[1], Name: "vaddh");
6430 case NEON::BI__builtin_neon_vsubh_f16:
6431 Ops.push_back(Elt: EmitScalarExpr(E: E->getArg(Arg: 1)));
6432 return Builder.CreateFSub(L: Ops[0], R: Ops[1], Name: "vsubh");
6433 case NEON::BI__builtin_neon_vmulh_f16:
6434 Ops.push_back(Elt: EmitScalarExpr(E: E->getArg(Arg: 1)));
6435 return Builder.CreateFMul(L: Ops[0], R: Ops[1], Name: "vmulh");
6436 case NEON::BI__builtin_neon_vdivh_f16:
6437 Ops.push_back(Elt: EmitScalarExpr(E: E->getArg(Arg: 1)));
6438 return Builder.CreateFDiv(L: Ops[0], R: Ops[1], Name: "vdivh");
6439 case NEON::BI__builtin_neon_vfmah_f16:
6440 // NEON intrinsic puts accumulator first, unlike the LLVM fma.
6441 return emitCallMaybeConstrainedFPBuiltin(
6442 CGF&: *this, IntrinsicID: Intrinsic::fma, ConstrainedIntrinsicID: Intrinsic::experimental_constrained_fma, Ty: HalfTy,
6443 Args: {EmitScalarExpr(E: E->getArg(Arg: 1)), EmitScalarExpr(E: E->getArg(Arg: 2)), Ops[0]});
6444 case NEON::BI__builtin_neon_vfmsh_f16: {
6445 Value* Neg = Builder.CreateFNeg(V: EmitScalarExpr(E: E->getArg(Arg: 1)), Name: "vsubh");
6446
6447 // NEON intrinsic puts accumulator first, unlike the LLVM fma.
6448 return emitCallMaybeConstrainedFPBuiltin(
6449 CGF&: *this, IntrinsicID: Intrinsic::fma, ConstrainedIntrinsicID: Intrinsic::experimental_constrained_fma, Ty: HalfTy,
6450 Args: {Neg, EmitScalarExpr(E: E->getArg(Arg: 2)), Ops[0]});
6451 }
6452 case NEON::BI__builtin_neon_vaddd_s64:
6453 case NEON::BI__builtin_neon_vaddd_u64:
6454 return Builder.CreateAdd(LHS: Ops[0], RHS: EmitScalarExpr(E: E->getArg(Arg: 1)), Name: "vaddd");
6455 case NEON::BI__builtin_neon_vsubd_s64:
6456 case NEON::BI__builtin_neon_vsubd_u64:
6457 return Builder.CreateSub(LHS: Ops[0], RHS: EmitScalarExpr(E: E->getArg(Arg: 1)), Name: "vsubd");
6458 case NEON::BI__builtin_neon_vqdmlalh_s16:
6459 case NEON::BI__builtin_neon_vqdmlslh_s16: {
6460 SmallVector<Value *, 2> ProductOps;
6461 ProductOps.push_back(Elt: vectorWrapScalar16(Op: Ops[1]));
6462 ProductOps.push_back(Elt: vectorWrapScalar16(Op: EmitScalarExpr(E: E->getArg(Arg: 2))));
6463 auto *VTy = llvm::FixedVectorType::get(ElementType: Int32Ty, NumElts: 4);
6464 Ops[1] = EmitNeonCall(F: CGM.getIntrinsic(IID: Intrinsic::aarch64_neon_sqdmull, Tys: VTy),
6465 Ops&: ProductOps, name: "vqdmlXl");
6466 Constant *CI = ConstantInt::get(Ty: SizeTy, V: 0);
6467 Ops[1] = Builder.CreateExtractElement(Vec: Ops[1], Idx: CI, Name: "lane0");
6468
6469 unsigned AccumInt = BuiltinID == NEON::BI__builtin_neon_vqdmlalh_s16
6470 ? Intrinsic::aarch64_neon_sqadd
6471 : Intrinsic::aarch64_neon_sqsub;
6472 return EmitNeonCall(F: CGM.getIntrinsic(IID: AccumInt, Tys: Int32Ty), Ops, name: "vqdmlXl");
6473 }
6474 case NEON::BI__builtin_neon_vqshlud_n_s64: {
6475 Ops.push_back(Elt: EmitScalarExpr(E: E->getArg(Arg: 1)));
6476 Ops[1] = Builder.CreateZExt(V: Ops[1], DestTy: Int64Ty);
6477 return EmitNeonCall(F: CGM.getIntrinsic(IID: Intrinsic::aarch64_neon_sqshlu, Tys: Int64Ty),
6478 Ops, name: "vqshlu_n");
6479 }
6480 case NEON::BI__builtin_neon_vqshld_n_u64:
6481 case NEON::BI__builtin_neon_vqshld_n_s64: {
6482 unsigned Int = BuiltinID == NEON::BI__builtin_neon_vqshld_n_u64
6483 ? Intrinsic::aarch64_neon_uqshl
6484 : Intrinsic::aarch64_neon_sqshl;
6485 Ops.push_back(Elt: EmitScalarExpr(E: E->getArg(Arg: 1)));
6486 Ops[1] = Builder.CreateZExt(V: Ops[1], DestTy: Int64Ty);
6487 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: Int64Ty), Ops, name: "vqshl_n");
6488 }
6489 case NEON::BI__builtin_neon_vrshrd_n_u64:
6490 case NEON::BI__builtin_neon_vrshrd_n_s64: {
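    // A rounding right shift is emitted as a rounding left shift
    // (urshl/srshl) by the negated immediate.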
6491 unsigned Int = BuiltinID == NEON::BI__builtin_neon_vrshrd_n_u64
6492 ? Intrinsic::aarch64_neon_urshl
6493 : Intrinsic::aarch64_neon_srshl;
6494 Ops.push_back(Elt: EmitScalarExpr(E: E->getArg(Arg: 1)));
6495 int SV = cast<ConstantInt>(Val: Ops[1])->getSExtValue();
6496 Ops[1] = ConstantInt::get(Ty: Int64Ty, V: -SV);
6497 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: Int64Ty), Ops, name: "vrshr_n");
6498 }
6499 case NEON::BI__builtin_neon_vrsrad_n_u64:
6500 case NEON::BI__builtin_neon_vrsrad_n_s64: {
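    // Rounding shift-right-accumulate: perform the rounding right shift as a
    // rounding left shift by the negated amount, then add the accumulator.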
6501 unsigned Int = BuiltinID == NEON::BI__builtin_neon_vrsrad_n_u64
6502 ? Intrinsic::aarch64_neon_urshl
6503 : Intrinsic::aarch64_neon_srshl;
6504 Ops[1] = Builder.CreateBitCast(V: Ops[1], DestTy: Int64Ty);
6505 Ops.push_back(Elt: Builder.CreateNeg(V: EmitScalarExpr(E: E->getArg(Arg: 2))));
6506 Ops[1] = Builder.CreateCall(Callee: CGM.getIntrinsic(IID: Int, Tys: Int64Ty),
6507 Args: {Ops[1], Builder.CreateSExt(V: Ops[2], DestTy: Int64Ty)});
6508 return Builder.CreateAdd(LHS: Ops[0], RHS: Builder.CreateBitCast(V: Ops[1], DestTy: Int64Ty));
6509 }
6510 case NEON::BI__builtin_neon_vshld_n_s64:
6511 case NEON::BI__builtin_neon_vshld_n_u64: {
6512 llvm::ConstantInt *Amt = cast<ConstantInt>(Val: EmitScalarExpr(E: E->getArg(Arg: 1)));
6513 return Builder.CreateShl(
6514 LHS: Ops[0], RHS: ConstantInt::get(Ty: Int64Ty, V: Amt->getZExtValue()), Name: "shld_n");
6515 }
6516 case NEON::BI__builtin_neon_vshrd_n_s64: {
6517 llvm::ConstantInt *Amt = cast<ConstantInt>(Val: EmitScalarExpr(E: E->getArg(Arg: 1)));
6518 return Builder.CreateAShr(
6519 LHS: Ops[0], RHS: ConstantInt::get(Ty: Int64Ty, V: std::min(a: static_cast<uint64_t>(63),
6520 b: Amt->getZExtValue())),
6521 Name: "shrd_n");
6522 }
6523 case NEON::BI__builtin_neon_vshrd_n_u64: {
6524 llvm::ConstantInt *Amt = cast<ConstantInt>(Val: EmitScalarExpr(E: E->getArg(Arg: 1)));
6525 uint64_t ShiftAmt = Amt->getZExtValue();
6526 // Right-shifting an unsigned value by its size yields 0.
6527 if (ShiftAmt == 64)
6528 return ConstantInt::get(Ty: Int64Ty, V: 0);
6529 return Builder.CreateLShr(LHS: Ops[0], RHS: ConstantInt::get(Ty: Int64Ty, V: ShiftAmt),
6530 Name: "shrd_n");
6531 }
6532 case NEON::BI__builtin_neon_vsrad_n_s64: {
6533 llvm::ConstantInt *Amt = cast<ConstantInt>(Val: EmitScalarExpr(E: E->getArg(Arg: 2)));
6534 Ops[1] = Builder.CreateAShr(
6535 LHS: Ops[1], RHS: ConstantInt::get(Ty: Int64Ty, V: std::min(a: static_cast<uint64_t>(63),
6536 b: Amt->getZExtValue())),
6537 Name: "shrd_n");
6538 return Builder.CreateAdd(LHS: Ops[0], RHS: Ops[1]);
6539 }
6540 case NEON::BI__builtin_neon_vsrad_n_u64: {
6541 llvm::ConstantInt *Amt = cast<ConstantInt>(Val: EmitScalarExpr(E: E->getArg(Arg: 2)));
6542 uint64_t ShiftAmt = Amt->getZExtValue();
6543 // Right-shifting an unsigned value by its size yields 0.
6544 // As Op + 0 = Op, return Ops[0] directly.
6545 if (ShiftAmt == 64)
6546 return Ops[0];
6547 Ops[1] = Builder.CreateLShr(LHS: Ops[1], RHS: ConstantInt::get(Ty: Int64Ty, V: ShiftAmt),
6548 Name: "shrd_n");
6549 return Builder.CreateAdd(LHS: Ops[0], RHS: Ops[1]);
6550 }
6551 case NEON::BI__builtin_neon_vqdmlalh_lane_s16:
6552 case NEON::BI__builtin_neon_vqdmlalh_laneq_s16:
6553 case NEON::BI__builtin_neon_vqdmlslh_lane_s16:
6554 case NEON::BI__builtin_neon_vqdmlslh_laneq_s16: {
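    // Extract the requested lane, widen the 16-bit operands, form a v4i32
    // sqdmull, take lane 0 of the product, and saturating-add/subtract it
    // into the accumulator.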
6555 Ops[2] = Builder.CreateExtractElement(Vec: Ops[2], Idx: EmitScalarExpr(E: E->getArg(Arg: 3)),
6556 Name: "lane");
6557 SmallVector<Value *, 2> ProductOps;
6558 ProductOps.push_back(Elt: vectorWrapScalar16(Op: Ops[1]));
6559 ProductOps.push_back(Elt: vectorWrapScalar16(Op: Ops[2]));
6560 auto *VTy = llvm::FixedVectorType::get(ElementType: Int32Ty, NumElts: 4);
6561 Ops[1] = EmitNeonCall(F: CGM.getIntrinsic(IID: Intrinsic::aarch64_neon_sqdmull, Tys: VTy),
6562 Ops&: ProductOps, name: "vqdmlXl");
6563 Constant *CI = ConstantInt::get(Ty: SizeTy, V: 0);
6564 Ops[1] = Builder.CreateExtractElement(Vec: Ops[1], Idx: CI, Name: "lane0");
6565 Ops.pop_back();
6566
6567 unsigned AccInt = (BuiltinID == NEON::BI__builtin_neon_vqdmlalh_lane_s16 ||
6568 BuiltinID == NEON::BI__builtin_neon_vqdmlalh_laneq_s16)
6569 ? Intrinsic::aarch64_neon_sqadd
6570 : Intrinsic::aarch64_neon_sqsub;
6571 return EmitNeonCall(F: CGM.getIntrinsic(IID: AccInt, Tys: Int32Ty), Ops, name: "vqdmlXl");
6572 }
6573 case NEON::BI__builtin_neon_vqdmlals_s32:
6574 case NEON::BI__builtin_neon_vqdmlsls_s32: {
6575 SmallVector<Value *, 2> ProductOps;
6576 ProductOps.push_back(Elt: Ops[1]);
6577 ProductOps.push_back(Elt: EmitScalarExpr(E: E->getArg(Arg: 2)));
6578 Ops[1] =
6579 EmitNeonCall(F: CGM.getIntrinsic(IID: Intrinsic::aarch64_neon_sqdmulls_scalar),
6580 Ops&: ProductOps, name: "vqdmlXl");
6581
6582 unsigned AccumInt = BuiltinID == NEON::BI__builtin_neon_vqdmlals_s32
6583 ? Intrinsic::aarch64_neon_sqadd
6584 : Intrinsic::aarch64_neon_sqsub;
6585 return EmitNeonCall(F: CGM.getIntrinsic(IID: AccumInt, Tys: Int64Ty), Ops, name: "vqdmlXl");
6586 }
6587 case NEON::BI__builtin_neon_vqdmlals_lane_s32:
6588 case NEON::BI__builtin_neon_vqdmlals_laneq_s32:
6589 case NEON::BI__builtin_neon_vqdmlsls_lane_s32:
6590 case NEON::BI__builtin_neon_vqdmlsls_laneq_s32: {
6591 Ops[2] = Builder.CreateExtractElement(Vec: Ops[2], Idx: EmitScalarExpr(E: E->getArg(Arg: 3)),
6592 Name: "lane");
6593 SmallVector<Value *, 2> ProductOps;
6594 ProductOps.push_back(Elt: Ops[1]);
6595 ProductOps.push_back(Elt: Ops[2]);
6596 Ops[1] =
6597 EmitNeonCall(F: CGM.getIntrinsic(IID: Intrinsic::aarch64_neon_sqdmulls_scalar),
6598 Ops&: ProductOps, name: "vqdmlXl");
6599 Ops.pop_back();
6600
6601 unsigned AccInt = (BuiltinID == NEON::BI__builtin_neon_vqdmlals_lane_s32 ||
6602 BuiltinID == NEON::BI__builtin_neon_vqdmlals_laneq_s32)
6603 ? Intrinsic::aarch64_neon_sqadd
6604 : Intrinsic::aarch64_neon_sqsub;
6605 return EmitNeonCall(F: CGM.getIntrinsic(IID: AccInt, Tys: Int64Ty), Ops, name: "vqdmlXl");
6606 }
6607 case NEON::BI__builtin_neon_vget_lane_bf16:
6608 case NEON::BI__builtin_neon_vduph_lane_bf16:
6609 case NEON::BI__builtin_neon_vduph_lane_f16: {
6610 return Builder.CreateExtractElement(Vec: Ops[0], Idx: EmitScalarExpr(E: E->getArg(Arg: 1)),
6611 Name: "vget_lane");
6612 }
6613 case NEON::BI__builtin_neon_vgetq_lane_bf16:
6614 case NEON::BI__builtin_neon_vduph_laneq_bf16:
6615 case NEON::BI__builtin_neon_vduph_laneq_f16: {
6616 return Builder.CreateExtractElement(Vec: Ops[0], Idx: EmitScalarExpr(E: E->getArg(Arg: 1)),
6617 Name: "vgetq_lane");
6618 }
6619 case NEON::BI__builtin_neon_vcvt_bf16_f32: {
6620 llvm::Type *V4F32 = FixedVectorType::get(ElementType: Builder.getFloatTy(), NumElts: 4);
6621 llvm::Type *V4BF16 = FixedVectorType::get(ElementType: Builder.getBFloatTy(), NumElts: 4);
6622 return Builder.CreateFPTrunc(V: Builder.CreateBitCast(V: Ops[0], DestTy: V4F32), DestTy: V4BF16);
6623 }
6624 case NEON::BI__builtin_neon_vcvtq_low_bf16_f32: {
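    // Truncate the four f32 lanes to bf16 and place them in the low half of
    // the result; the high half is filled with zeros.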
6625 SmallVector<int, 16> ConcatMask(8);
6626 std::iota(first: ConcatMask.begin(), last: ConcatMask.end(), value: 0);
6627 llvm::Type *V4F32 = FixedVectorType::get(ElementType: Builder.getFloatTy(), NumElts: 4);
6628 llvm::Type *V4BF16 = FixedVectorType::get(ElementType: Builder.getBFloatTy(), NumElts: 4);
6629 llvm::Value *Trunc =
6630 Builder.CreateFPTrunc(V: Builder.CreateBitCast(V: Ops[0], DestTy: V4F32), DestTy: V4BF16);
6631 return Builder.CreateShuffleVector(
6632 V1: Trunc, V2: ConstantAggregateZero::get(Ty: V4BF16), Mask: ConcatMask);
6633 }
6634 case NEON::BI__builtin_neon_vcvtq_high_bf16_f32: {
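    // Keep the low half of the existing bf16 vector and write the truncated
    // f32 lanes into the high half of the result.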
6635 SmallVector<int, 16> ConcatMask(8);
6636 std::iota(first: ConcatMask.begin(), last: ConcatMask.end(), value: 0);
6637 SmallVector<int, 16> LoMask(4);
6638 std::iota(first: LoMask.begin(), last: LoMask.end(), value: 0);
6639 llvm::Type *V4F32 = FixedVectorType::get(ElementType: Builder.getFloatTy(), NumElts: 4);
6640 llvm::Type *V4BF16 = FixedVectorType::get(ElementType: Builder.getBFloatTy(), NumElts: 4);
6641 llvm::Type *V8BF16 = FixedVectorType::get(ElementType: Builder.getBFloatTy(), NumElts: 8);
6642 llvm::Value *Inactive = Builder.CreateShuffleVector(
6643 V: Builder.CreateBitCast(V: Ops[0], DestTy: V8BF16), Mask: LoMask);
6644 llvm::Value *Trunc =
6645 Builder.CreateFPTrunc(V: Builder.CreateBitCast(V: Ops[1], DestTy: V4F32), DestTy: V4BF16);
6646 return Builder.CreateShuffleVector(V1: Inactive, V2: Trunc, Mask: ConcatMask);
6647 }
6648
6649 case clang::AArch64::BI_InterlockedAdd:
6650 case clang::AArch64::BI_InterlockedAdd_acq:
6651 case clang::AArch64::BI_InterlockedAdd_rel:
6652 case clang::AArch64::BI_InterlockedAdd_nf:
6653 case clang::AArch64::BI_InterlockedAdd64:
6654 case clang::AArch64::BI_InterlockedAdd64_acq:
6655 case clang::AArch64::BI_InterlockedAdd64_rel:
6656 case clang::AArch64::BI_InterlockedAdd64_nf: {
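    // Map _InterlockedAdd* to an atomicrmw add whose ordering is chosen by
    // the _acq/_rel/_nf suffix (sequentially consistent by default). The
    // atomicrmw returns the old value, but the MSVC builtin returns the new
    // value, so add the operand back in.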
6657 Address DestAddr = CheckAtomicAlignment(CGF&: *this, E);
6658 Value *Val = EmitScalarExpr(E: E->getArg(Arg: 1));
6659 llvm::AtomicOrdering Ordering;
6660 switch (BuiltinID) {
6661 case clang::AArch64::BI_InterlockedAdd:
6662 case clang::AArch64::BI_InterlockedAdd64:
6663 Ordering = llvm::AtomicOrdering::SequentiallyConsistent;
6664 break;
6665 case clang::AArch64::BI_InterlockedAdd_acq:
6666 case clang::AArch64::BI_InterlockedAdd64_acq:
6667 Ordering = llvm::AtomicOrdering::Acquire;
6668 break;
6669 case clang::AArch64::BI_InterlockedAdd_rel:
6670 case clang::AArch64::BI_InterlockedAdd64_rel:
6671 Ordering = llvm::AtomicOrdering::Release;
6672 break;
6673 case clang::AArch64::BI_InterlockedAdd_nf:
6674 case clang::AArch64::BI_InterlockedAdd64_nf:
6675 Ordering = llvm::AtomicOrdering::Monotonic;
6676 break;
6677 default:
6678 llvm_unreachable("missing builtin ID in switch!");
6679 }
6680 AtomicRMWInst *RMWI =
6681 Builder.CreateAtomicRMW(Op: AtomicRMWInst::Add, Addr: DestAddr, Val, Ordering);
6682 return Builder.CreateAdd(LHS: RMWI, RHS: Val);
6683 }
6684 }
6685
6686 llvm::FixedVectorType *VTy = GetNeonType(CGF: this, TypeFlags: Type);
6687 llvm::Type *Ty = VTy;
6688 if (!Ty)
6689 return nullptr;
6690
6691 // Not all intrinsics handled by the common case work for AArch64 yet, so only
6692 // defer to common code if it's been added to our special map.
6693 Builtin = findARMVectorIntrinsicInMap(IntrinsicMap: AArch64SIMDIntrinsicMap, BuiltinID,
6694 MapProvenSorted&: AArch64SIMDIntrinsicsProvenSorted);
6695
6696 if (Builtin)
6697 return EmitCommonNeonBuiltinExpr(
6698 BuiltinID: Builtin->BuiltinID, LLVMIntrinsic: Builtin->LLVMIntrinsic, AltLLVMIntrinsic: Builtin->AltLLVMIntrinsic,
6699 NameHint: Builtin->NameHint, Modifier: Builtin->TypeModifier, E, Ops,
6700 /*never use addresses*/ PtrOp0: Address::invalid(), PtrOp1: Address::invalid(), Arch);
6701
6702 if (Value *V = EmitAArch64TblBuiltinExpr(CGF&: *this, BuiltinID, E, Ops, Arch))
6703 return V;
6704
6705 unsigned Int;
6706 bool ExtractLow = false;
6707 bool ExtendLaneArg = false;
6708 switch (BuiltinID) {
6709 default: return nullptr;
6710 case NEON::BI__builtin_neon_vbsl_v:
6711 case NEON::BI__builtin_neon_vbslq_v: {
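    // Bitwise select: (Ops[0] & Ops[1]) | (~Ops[0] & Ops[2]), computed on the
    // integer view of the vectors and bitcast back to the result type.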
6712 llvm::Type *BitTy = llvm::VectorType::getInteger(VTy);
6713 Ops[0] = Builder.CreateBitCast(V: Ops[0], DestTy: BitTy, Name: "vbsl");
6714 Ops[1] = Builder.CreateBitCast(V: Ops[1], DestTy: BitTy, Name: "vbsl");
6715 Ops[2] = Builder.CreateBitCast(V: Ops[2], DestTy: BitTy, Name: "vbsl");
6716
6717 Ops[1] = Builder.CreateAnd(LHS: Ops[0], RHS: Ops[1], Name: "vbsl");
6718 Ops[2] = Builder.CreateAnd(LHS: Builder.CreateNot(V: Ops[0]), RHS: Ops[2], Name: "vbsl");
6719 Ops[0] = Builder.CreateOr(LHS: Ops[1], RHS: Ops[2], Name: "vbsl");
6720 return Builder.CreateBitCast(V: Ops[0], DestTy: Ty);
6721 }
6722 case NEON::BI__builtin_neon_vfma_lane_v:
6723 case NEON::BI__builtin_neon_vfmaq_lane_v: { // Only used for FP types
6724 // The ARM builtins (and instructions) have the addend as the first
6725 // operand, but the 'fma' intrinsics have it last. Swap it around here.
6726 Value *Addend = Ops[0];
6727 Value *Multiplicand = Ops[1];
6728 Value *LaneSource = Ops[2];
6729 Ops[0] = Multiplicand;
6730 Ops[1] = LaneSource;
6731 Ops[2] = Addend;
6732
6733 // Now adjust things to handle the lane access.
6734 auto *SourceTy = BuiltinID == NEON::BI__builtin_neon_vfmaq_lane_v
6735 ? llvm::FixedVectorType::get(ElementType: VTy->getElementType(),
6736 NumElts: VTy->getNumElements() / 2)
6737 : VTy;
6738 llvm::Constant *cst = cast<Constant>(Val: Ops[3]);
6739 Value *SV = llvm::ConstantVector::getSplat(EC: VTy->getElementCount(), Elt: cst);
6740 Ops[1] = Builder.CreateBitCast(V: Ops[1], DestTy: SourceTy);
6741 Ops[1] = Builder.CreateShuffleVector(V1: Ops[1], V2: Ops[1], Mask: SV, Name: "lane");
6742
6743 Ops.pop_back();
6744 Int = Builder.getIsFPConstrained() ? Intrinsic::experimental_constrained_fma
6745 : Intrinsic::fma;
6746 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: Ty), Ops, name: "fmla");
6747 }
6748 case NEON::BI__builtin_neon_vfma_laneq_v: {
6749 auto *VTy = cast<llvm::FixedVectorType>(Val: Ty);
6750 // v1f64 fma should be mapped to Neon scalar f64 fma
6751 if (VTy && VTy->getElementType() == DoubleTy) {
6752 Ops[0] = Builder.CreateBitCast(V: Ops[0], DestTy: DoubleTy);
6753 Ops[1] = Builder.CreateBitCast(V: Ops[1], DestTy: DoubleTy);
6754 llvm::FixedVectorType *VTy =
6755 GetNeonType(CGF: this, TypeFlags: NeonTypeFlags(NeonTypeFlags::Float64, false, true));
6756 Ops[2] = Builder.CreateBitCast(V: Ops[2], DestTy: VTy);
6757 Ops[2] = Builder.CreateExtractElement(Vec: Ops[2], Idx: Ops[3], Name: "extract");
6758 Value *Result;
6759 Result = emitCallMaybeConstrainedFPBuiltin(
6760 CGF&: *this, IntrinsicID: Intrinsic::fma, ConstrainedIntrinsicID: Intrinsic::experimental_constrained_fma,
6761 Ty: DoubleTy, Args: {Ops[1], Ops[2], Ops[0]});
6762 return Builder.CreateBitCast(V: Result, DestTy: Ty);
6763 }
6764 Ops[0] = Builder.CreateBitCast(V: Ops[0], DestTy: Ty);
6765 Ops[1] = Builder.CreateBitCast(V: Ops[1], DestTy: Ty);
6766
6767 auto *STy = llvm::FixedVectorType::get(ElementType: VTy->getElementType(),
6768 NumElts: VTy->getNumElements() * 2);
6769 Ops[2] = Builder.CreateBitCast(V: Ops[2], DestTy: STy);
6770 Value *SV = llvm::ConstantVector::getSplat(EC: VTy->getElementCount(),
6771 Elt: cast<ConstantInt>(Val: Ops[3]));
6772 Ops[2] = Builder.CreateShuffleVector(V1: Ops[2], V2: Ops[2], Mask: SV, Name: "lane");
6773
6774 return emitCallMaybeConstrainedFPBuiltin(
6775 CGF&: *this, IntrinsicID: Intrinsic::fma, ConstrainedIntrinsicID: Intrinsic::experimental_constrained_fma, Ty,
6776 Args: {Ops[2], Ops[1], Ops[0]});
6777 }
6778 case NEON::BI__builtin_neon_vfmaq_laneq_v: {
6779 Ops[0] = Builder.CreateBitCast(V: Ops[0], DestTy: Ty);
6780 Ops[1] = Builder.CreateBitCast(V: Ops[1], DestTy: Ty);
6781
6782 Ops[2] = Builder.CreateBitCast(V: Ops[2], DestTy: Ty);
6783 Ops[2] = EmitNeonSplat(V: Ops[2], C: cast<ConstantInt>(Val: Ops[3]));
6784 return emitCallMaybeConstrainedFPBuiltin(
6785 CGF&: *this, IntrinsicID: Intrinsic::fma, ConstrainedIntrinsicID: Intrinsic::experimental_constrained_fma, Ty,
6786 Args: {Ops[2], Ops[1], Ops[0]});
6787 }
6788 case NEON::BI__builtin_neon_vfmah_lane_f16:
6789 case NEON::BI__builtin_neon_vfmas_lane_f32:
6790 case NEON::BI__builtin_neon_vfmah_laneq_f16:
6791 case NEON::BI__builtin_neon_vfmas_laneq_f32:
6792 case NEON::BI__builtin_neon_vfmad_lane_f64:
6793 case NEON::BI__builtin_neon_vfmad_laneq_f64: {
6794 Ops.push_back(Elt: EmitScalarExpr(E: E->getArg(Arg: 3)));
6795 llvm::Type *Ty = ConvertType(T: E->getCallReturnType(Ctx: getContext()));
6796 Ops[2] = Builder.CreateExtractElement(Vec: Ops[2], Idx: Ops[3], Name: "extract");
6797 return emitCallMaybeConstrainedFPBuiltin(
6798 CGF&: *this, IntrinsicID: Intrinsic::fma, ConstrainedIntrinsicID: Intrinsic::experimental_constrained_fma, Ty,
6799 Args: {Ops[1], Ops[2], Ops[0]});
6800 }
6801 case NEON::BI__builtin_neon_vmull_v:
6802 // FIXME: improve sharing scheme to cope with 3 alternative LLVM intrinsics.
6803 Int = usgn ? Intrinsic::aarch64_neon_umull : Intrinsic::aarch64_neon_smull;
6804 if (Type.isPoly()) Int = Intrinsic::aarch64_neon_pmull;
6805 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: Ty), Ops, name: "vmull");
6806 case NEON::BI__builtin_neon_vmax_v:
6807 case NEON::BI__builtin_neon_vmaxq_v:
6808 // FIXME: improve sharing scheme to cope with 3 alternative LLVM intrinsics.
6809 Int = usgn ? Intrinsic::aarch64_neon_umax : Intrinsic::aarch64_neon_smax;
6810 if (Ty->isFPOrFPVectorTy()) Int = Intrinsic::aarch64_neon_fmax;
6811 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: Ty), Ops, name: "vmax");
6812 case NEON::BI__builtin_neon_vmaxh_f16: {
6813 Ops.push_back(Elt: EmitScalarExpr(E: E->getArg(Arg: 1)));
6814 Int = Intrinsic::aarch64_neon_fmax;
6815 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: HalfTy), Ops, name: "vmax");
6816 }
6817 case NEON::BI__builtin_neon_vmin_v:
6818 case NEON::BI__builtin_neon_vminq_v:
6819 // FIXME: improve sharing scheme to cope with 3 alternative LLVM intrinsics.
6820 Int = usgn ? Intrinsic::aarch64_neon_umin : Intrinsic::aarch64_neon_smin;
6821 if (Ty->isFPOrFPVectorTy()) Int = Intrinsic::aarch64_neon_fmin;
6822 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: Ty), Ops, name: "vmin");
6823 case NEON::BI__builtin_neon_vminh_f16: {
6824 Ops.push_back(Elt: EmitScalarExpr(E: E->getArg(Arg: 1)));
6825 Int = Intrinsic::aarch64_neon_fmin;
6826 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: HalfTy), Ops, name: "vmin");
6827 }
6828 case NEON::BI__builtin_neon_vabd_v:
6829 case NEON::BI__builtin_neon_vabdq_v:
6830 // FIXME: improve sharing scheme to cope with 3 alternative LLVM intrinsics.
6831 Int = usgn ? Intrinsic::aarch64_neon_uabd : Intrinsic::aarch64_neon_sabd;
6832 if (Ty->isFPOrFPVectorTy()) Int = Intrinsic::aarch64_neon_fabd;
6833 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: Ty), Ops, name: "vabd");
6834 case NEON::BI__builtin_neon_vpadal_v:
6835 case NEON::BI__builtin_neon_vpadalq_v: {
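    // Pairwise add-and-accumulate: pairwise-widen-add the second operand with
    // saddlp/uaddlp, then add the accumulator to the widened result.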
6836 unsigned ArgElts = VTy->getNumElements();
6837 llvm::IntegerType *EltTy = cast<IntegerType>(Val: VTy->getElementType());
6838 unsigned BitWidth = EltTy->getBitWidth();
6839 auto *ArgTy = llvm::FixedVectorType::get(
6840 ElementType: llvm::IntegerType::get(C&: getLLVMContext(), NumBits: BitWidth / 2), NumElts: 2 * ArgElts);
6841 llvm::Type* Tys[2] = { VTy, ArgTy };
6842 Int = usgn ? Intrinsic::aarch64_neon_uaddlp : Intrinsic::aarch64_neon_saddlp;
6843 SmallVector<llvm::Value*, 1> TmpOps;
6844 TmpOps.push_back(Elt: Ops[1]);
6845 Function *F = CGM.getIntrinsic(IID: Int, Tys);
6846 llvm::Value *tmp = EmitNeonCall(F, Ops&: TmpOps, name: "vpadal");
6847 llvm::Value *addend = Builder.CreateBitCast(V: Ops[0], DestTy: tmp->getType());
6848 return Builder.CreateAdd(LHS: tmp, RHS: addend);
6849 }
6850 case NEON::BI__builtin_neon_vpmin_v:
6851 case NEON::BI__builtin_neon_vpminq_v:
6852 // FIXME: improve sharing scheme to cope with 3 alternative LLVM intrinsics.
6853 Int = usgn ? Intrinsic::aarch64_neon_uminp : Intrinsic::aarch64_neon_sminp;
6854 if (Ty->isFPOrFPVectorTy()) Int = Intrinsic::aarch64_neon_fminp;
6855 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: Ty), Ops, name: "vpmin");
6856 case NEON::BI__builtin_neon_vpmax_v:
6857 case NEON::BI__builtin_neon_vpmaxq_v:
6858 // FIXME: improve sharing scheme to cope with 3 alternative LLVM intrinsics.
6859 Int = usgn ? Intrinsic::aarch64_neon_umaxp : Intrinsic::aarch64_neon_smaxp;
6860 if (Ty->isFPOrFPVectorTy()) Int = Intrinsic::aarch64_neon_fmaxp;
6861 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: Ty), Ops, name: "vpmax");
6862 case NEON::BI__builtin_neon_vminnm_v:
6863 case NEON::BI__builtin_neon_vminnmq_v:
6864 Int = Intrinsic::aarch64_neon_fminnm;
6865 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: Ty), Ops, name: "vminnm");
6866 case NEON::BI__builtin_neon_vminnmh_f16:
6867 Ops.push_back(Elt: EmitScalarExpr(E: E->getArg(Arg: 1)));
6868 Int = Intrinsic::aarch64_neon_fminnm;
6869 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: HalfTy), Ops, name: "vminnm");
6870 case NEON::BI__builtin_neon_vmaxnm_v:
6871 case NEON::BI__builtin_neon_vmaxnmq_v:
6872 Int = Intrinsic::aarch64_neon_fmaxnm;
6873 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: Ty), Ops, name: "vmaxnm");
6874 case NEON::BI__builtin_neon_vmaxnmh_f16:
6875 Ops.push_back(Elt: EmitScalarExpr(E: E->getArg(Arg: 1)));
6876 Int = Intrinsic::aarch64_neon_fmaxnm;
6877 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: HalfTy), Ops, name: "vmaxnm");
6878 case NEON::BI__builtin_neon_vrecpss_f32: {
6879 Ops.push_back(Elt: EmitScalarExpr(E: E->getArg(Arg: 1)));
6880 return EmitNeonCall(F: CGM.getIntrinsic(IID: Intrinsic::aarch64_neon_frecps, Tys: FloatTy),
6881 Ops, name: "vrecps");
6882 }
6883 case NEON::BI__builtin_neon_vrecpsd_f64:
6884 Ops.push_back(Elt: EmitScalarExpr(E: E->getArg(Arg: 1)));
6885 return EmitNeonCall(F: CGM.getIntrinsic(IID: Intrinsic::aarch64_neon_frecps, Tys: DoubleTy),
6886 Ops, name: "vrecps");
6887 case NEON::BI__builtin_neon_vrecpsh_f16:
6888 Ops.push_back(Elt: EmitScalarExpr(E: E->getArg(Arg: 1)));
6889 return EmitNeonCall(F: CGM.getIntrinsic(IID: Intrinsic::aarch64_neon_frecps, Tys: HalfTy),
6890 Ops, name: "vrecps");
6891 case NEON::BI__builtin_neon_vqshrun_n_v:
6892 Int = Intrinsic::aarch64_neon_sqshrun;
6893 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: Ty), Ops, name: "vqshrun_n");
6894 case NEON::BI__builtin_neon_vqrshrun_n_v:
6895 Int = Intrinsic::aarch64_neon_sqrshrun;
6896 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: Ty), Ops, name: "vqrshrun_n");
6897 case NEON::BI__builtin_neon_vqshrn_n_v:
6898 Int = usgn ? Intrinsic::aarch64_neon_uqshrn : Intrinsic::aarch64_neon_sqshrn;
6899 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: Ty), Ops, name: "vqshrn_n");
6900 case NEON::BI__builtin_neon_vrshrn_n_v:
6901 Int = Intrinsic::aarch64_neon_rshrn;
6902 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: Ty), Ops, name: "vrshrn_n");
6903 case NEON::BI__builtin_neon_vqrshrn_n_v:
6904 Int = usgn ? Intrinsic::aarch64_neon_uqrshrn : Intrinsic::aarch64_neon_sqrshrn;
6905 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: Ty), Ops, name: "vqrshrn_n");
6906 case NEON::BI__builtin_neon_vrndah_f16: {
6907 Ops.push_back(Elt: EmitScalarExpr(E: E->getArg(Arg: 0)));
6908 Int = Builder.getIsFPConstrained()
6909 ? Intrinsic::experimental_constrained_round
6910 : Intrinsic::round;
6911 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: HalfTy), Ops, name: "vrnda");
6912 }
6913 case NEON::BI__builtin_neon_vrnda_v:
6914 case NEON::BI__builtin_neon_vrndaq_v: {
6915 Int = Builder.getIsFPConstrained()
6916 ? Intrinsic::experimental_constrained_round
6917 : Intrinsic::round;
6918 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: Ty), Ops, name: "vrnda");
6919 }
6920 case NEON::BI__builtin_neon_vrndih_f16: {
6921 Ops.push_back(Elt: EmitScalarExpr(E: E->getArg(Arg: 0)));
6922 Int = Builder.getIsFPConstrained()
6923 ? Intrinsic::experimental_constrained_nearbyint
6924 : Intrinsic::nearbyint;
6925 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: HalfTy), Ops, name: "vrndi");
6926 }
6927 case NEON::BI__builtin_neon_vrndmh_f16: {
6928 Ops.push_back(Elt: EmitScalarExpr(E: E->getArg(Arg: 0)));
6929 Int = Builder.getIsFPConstrained()
6930 ? Intrinsic::experimental_constrained_floor
6931 : Intrinsic::floor;
6932 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: HalfTy), Ops, name: "vrndm");
6933 }
6934 case NEON::BI__builtin_neon_vrndm_v:
6935 case NEON::BI__builtin_neon_vrndmq_v: {
6936 Int = Builder.getIsFPConstrained()
6937 ? Intrinsic::experimental_constrained_floor
6938 : Intrinsic::floor;
6939 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: Ty), Ops, name: "vrndm");
6940 }
6941 case NEON::BI__builtin_neon_vrndnh_f16: {
6942 Ops.push_back(Elt: EmitScalarExpr(E: E->getArg(Arg: 0)));
6943 Int = Builder.getIsFPConstrained()
6944 ? Intrinsic::experimental_constrained_roundeven
6945 : Intrinsic::roundeven;
6946 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: HalfTy), Ops, name: "vrndn");
6947 }
6948 case NEON::BI__builtin_neon_vrndn_v:
6949 case NEON::BI__builtin_neon_vrndnq_v: {
6950 Int = Builder.getIsFPConstrained()
6951 ? Intrinsic::experimental_constrained_roundeven
6952 : Intrinsic::roundeven;
6953 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: Ty), Ops, name: "vrndn");
6954 }
6955 case NEON::BI__builtin_neon_vrndns_f32: {
6956 Ops.push_back(Elt: EmitScalarExpr(E: E->getArg(Arg: 0)));
6957 Int = Builder.getIsFPConstrained()
6958 ? Intrinsic::experimental_constrained_roundeven
6959 : Intrinsic::roundeven;
6960 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: FloatTy), Ops, name: "vrndn");
6961 }
6962 case NEON::BI__builtin_neon_vrndph_f16: {
6963 Ops.push_back(Elt: EmitScalarExpr(E: E->getArg(Arg: 0)));
6964 Int = Builder.getIsFPConstrained()
6965 ? Intrinsic::experimental_constrained_ceil
6966 : Intrinsic::ceil;
6967 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: HalfTy), Ops, name: "vrndp");
6968 }
6969 case NEON::BI__builtin_neon_vrndp_v:
6970 case NEON::BI__builtin_neon_vrndpq_v: {
6971 Int = Builder.getIsFPConstrained()
6972 ? Intrinsic::experimental_constrained_ceil
6973 : Intrinsic::ceil;
6974 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: Ty), Ops, name: "vrndp");
6975 }
6976 case NEON::BI__builtin_neon_vrndxh_f16: {
6977 Ops.push_back(Elt: EmitScalarExpr(E: E->getArg(Arg: 0)));
6978 Int = Builder.getIsFPConstrained()
6979 ? Intrinsic::experimental_constrained_rint
6980 : Intrinsic::rint;
6981 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: HalfTy), Ops, name: "vrndx");
6982 }
6983 case NEON::BI__builtin_neon_vrndx_v:
6984 case NEON::BI__builtin_neon_vrndxq_v: {
6985 Int = Builder.getIsFPConstrained()
6986 ? Intrinsic::experimental_constrained_rint
6987 : Intrinsic::rint;
6988 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: Ty), Ops, name: "vrndx");
6989 }
6990 case NEON::BI__builtin_neon_vrndh_f16: {
6991 Ops.push_back(Elt: EmitScalarExpr(E: E->getArg(Arg: 0)));
6992 Int = Builder.getIsFPConstrained()
6993 ? Intrinsic::experimental_constrained_trunc
6994 : Intrinsic::trunc;
6995 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: HalfTy), Ops, name: "vrndz");
6996 }
6997 case NEON::BI__builtin_neon_vrnd32x_f32:
6998 case NEON::BI__builtin_neon_vrnd32xq_f32:
6999 case NEON::BI__builtin_neon_vrnd32x_f64:
7000 case NEON::BI__builtin_neon_vrnd32xq_f64: {
7001 Ops.push_back(Elt: EmitScalarExpr(E: E->getArg(Arg: 0)));
7002 Int = Intrinsic::aarch64_neon_frint32x;
7003 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: Ty), Ops, name: "vrnd32x");
7004 }
7005 case NEON::BI__builtin_neon_vrnd32z_f32:
7006 case NEON::BI__builtin_neon_vrnd32zq_f32:
7007 case NEON::BI__builtin_neon_vrnd32z_f64:
7008 case NEON::BI__builtin_neon_vrnd32zq_f64: {
7009 Ops.push_back(Elt: EmitScalarExpr(E: E->getArg(Arg: 0)));
7010 Int = Intrinsic::aarch64_neon_frint32z;
7011 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: Ty), Ops, name: "vrnd32z");
7012 }
7013 case NEON::BI__builtin_neon_vrnd64x_f32:
7014 case NEON::BI__builtin_neon_vrnd64xq_f32:
7015 case NEON::BI__builtin_neon_vrnd64x_f64:
7016 case NEON::BI__builtin_neon_vrnd64xq_f64: {
7017 Ops.push_back(Elt: EmitScalarExpr(E: E->getArg(Arg: 0)));
7018 Int = Intrinsic::aarch64_neon_frint64x;
7019 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: Ty), Ops, name: "vrnd64x");
7020 }
7021 case NEON::BI__builtin_neon_vrnd64z_f32:
7022 case NEON::BI__builtin_neon_vrnd64zq_f32:
7023 case NEON::BI__builtin_neon_vrnd64z_f64:
7024 case NEON::BI__builtin_neon_vrnd64zq_f64: {
7025 Ops.push_back(Elt: EmitScalarExpr(E: E->getArg(Arg: 0)));
7026 Int = Intrinsic::aarch64_neon_frint64z;
7027 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: Ty), Ops, name: "vrnd64z");
7028 }
7029 case NEON::BI__builtin_neon_vrnd_v:
7030 case NEON::BI__builtin_neon_vrndq_v: {
7031 Int = Builder.getIsFPConstrained()
7032 ? Intrinsic::experimental_constrained_trunc
7033 : Intrinsic::trunc;
7034 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: Ty), Ops, name: "vrndz");
7035 }
7036 case NEON::BI__builtin_neon_vcvt_f64_v:
7037 case NEON::BI__builtin_neon_vcvtq_f64_v:
7038 Ops[0] = Builder.CreateBitCast(V: Ops[0], DestTy: Ty);
7039 Ty = GetNeonType(CGF: this, TypeFlags: NeonTypeFlags(NeonTypeFlags::Float64, false, quad));
7040 return usgn ? Builder.CreateUIToFP(V: Ops[0], DestTy: Ty, Name: "vcvt")
7041 : Builder.CreateSIToFP(V: Ops[0], DestTy: Ty, Name: "vcvt");
7042 case NEON::BI__builtin_neon_vcvt_f64_f32: {
7043 assert(Type.getEltType() == NeonTypeFlags::Float64 && quad &&
7044 "unexpected vcvt_f64_f32 builtin");
7045 NeonTypeFlags SrcFlag = NeonTypeFlags(NeonTypeFlags::Float32, false, false);
7046 Ops[0] = Builder.CreateBitCast(V: Ops[0], DestTy: GetNeonType(CGF: this, TypeFlags: SrcFlag));
7047
7048 return Builder.CreateFPExt(V: Ops[0], DestTy: Ty, Name: "vcvt");
7049 }
7050 case NEON::BI__builtin_neon_vcvt_f32_f64: {
7051 assert(Type.getEltType() == NeonTypeFlags::Float32 &&
7052 "unexpected vcvt_f32_f64 builtin");
7053 NeonTypeFlags SrcFlag = NeonTypeFlags(NeonTypeFlags::Float64, false, true);
7054 Ops[0] = Builder.CreateBitCast(V: Ops[0], DestTy: GetNeonType(CGF: this, TypeFlags: SrcFlag));
7055
7056 return Builder.CreateFPTrunc(V: Ops[0], DestTy: Ty, Name: "vcvt");
7057 }
7058 case NEON::BI__builtin_neon_vcvt_s32_v:
7059 case NEON::BI__builtin_neon_vcvt_u32_v:
7060 case NEON::BI__builtin_neon_vcvt_s64_v:
7061 case NEON::BI__builtin_neon_vcvt_u64_v:
7062 case NEON::BI__builtin_neon_vcvt_s16_f16:
7063 case NEON::BI__builtin_neon_vcvt_u16_f16:
7064 case NEON::BI__builtin_neon_vcvtq_s32_v:
7065 case NEON::BI__builtin_neon_vcvtq_u32_v:
7066 case NEON::BI__builtin_neon_vcvtq_s64_v:
7067 case NEON::BI__builtin_neon_vcvtq_u64_v:
7068 case NEON::BI__builtin_neon_vcvtq_s16_f16:
7069 case NEON::BI__builtin_neon_vcvtq_u16_f16: {
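    // Float-to-integer conversions truncate toward zero and map to the
    // fcvtzu/fcvtzs intrinsics on the matching float vector type.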
7070 Int =
7071 usgn ? Intrinsic::aarch64_neon_fcvtzu : Intrinsic::aarch64_neon_fcvtzs;
7072 llvm::Type *Tys[2] = {Ty, GetFloatNeonType(CGF: this, IntTypeFlags: Type)};
7073 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys), Ops, name: "vcvtz");
7074 }
7075 case NEON::BI__builtin_neon_vcvta_s16_f16:
7076 case NEON::BI__builtin_neon_vcvta_u16_f16:
7077 case NEON::BI__builtin_neon_vcvta_s32_v:
7078 case NEON::BI__builtin_neon_vcvtaq_s16_f16:
7079 case NEON::BI__builtin_neon_vcvtaq_s32_v:
7080 case NEON::BI__builtin_neon_vcvta_u32_v:
7081 case NEON::BI__builtin_neon_vcvtaq_u16_f16:
7082 case NEON::BI__builtin_neon_vcvtaq_u32_v:
7083 case NEON::BI__builtin_neon_vcvta_s64_v:
7084 case NEON::BI__builtin_neon_vcvtaq_s64_v:
7085 case NEON::BI__builtin_neon_vcvta_u64_v:
7086 case NEON::BI__builtin_neon_vcvtaq_u64_v: {
7087 Int = usgn ? Intrinsic::aarch64_neon_fcvtau : Intrinsic::aarch64_neon_fcvtas;
7088 llvm::Type *Tys[2] = { Ty, GetFloatNeonType(CGF: this, IntTypeFlags: Type) };
7089 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys), Ops, name: "vcvta");
7090 }
7091 case NEON::BI__builtin_neon_vcvtm_s16_f16:
7092 case NEON::BI__builtin_neon_vcvtm_s32_v:
7093 case NEON::BI__builtin_neon_vcvtmq_s16_f16:
7094 case NEON::BI__builtin_neon_vcvtmq_s32_v:
7095 case NEON::BI__builtin_neon_vcvtm_u16_f16:
7096 case NEON::BI__builtin_neon_vcvtm_u32_v:
7097 case NEON::BI__builtin_neon_vcvtmq_u16_f16:
7098 case NEON::BI__builtin_neon_vcvtmq_u32_v:
7099 case NEON::BI__builtin_neon_vcvtm_s64_v:
7100 case NEON::BI__builtin_neon_vcvtmq_s64_v:
7101 case NEON::BI__builtin_neon_vcvtm_u64_v:
7102 case NEON::BI__builtin_neon_vcvtmq_u64_v: {
7103 Int = usgn ? Intrinsic::aarch64_neon_fcvtmu : Intrinsic::aarch64_neon_fcvtms;
7104 llvm::Type *Tys[2] = { Ty, GetFloatNeonType(CGF: this, IntTypeFlags: Type) };
7105 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys), Ops, name: "vcvtm");
7106 }
7107 case NEON::BI__builtin_neon_vcvtn_s16_f16:
7108 case NEON::BI__builtin_neon_vcvtn_s32_v:
7109 case NEON::BI__builtin_neon_vcvtnq_s16_f16:
7110 case NEON::BI__builtin_neon_vcvtnq_s32_v:
7111 case NEON::BI__builtin_neon_vcvtn_u16_f16:
7112 case NEON::BI__builtin_neon_vcvtn_u32_v:
7113 case NEON::BI__builtin_neon_vcvtnq_u16_f16:
7114 case NEON::BI__builtin_neon_vcvtnq_u32_v:
7115 case NEON::BI__builtin_neon_vcvtn_s64_v:
7116 case NEON::BI__builtin_neon_vcvtnq_s64_v:
7117 case NEON::BI__builtin_neon_vcvtn_u64_v:
7118 case NEON::BI__builtin_neon_vcvtnq_u64_v: {
7119 Int = usgn ? Intrinsic::aarch64_neon_fcvtnu : Intrinsic::aarch64_neon_fcvtns;
7120 llvm::Type *Tys[2] = { Ty, GetFloatNeonType(CGF: this, IntTypeFlags: Type) };
7121 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys), Ops, name: "vcvtn");
7122 }
7123 case NEON::BI__builtin_neon_vcvtp_s16_f16:
7124 case NEON::BI__builtin_neon_vcvtp_s32_v:
7125 case NEON::BI__builtin_neon_vcvtpq_s16_f16:
7126 case NEON::BI__builtin_neon_vcvtpq_s32_v:
7127 case NEON::BI__builtin_neon_vcvtp_u16_f16:
7128 case NEON::BI__builtin_neon_vcvtp_u32_v:
7129 case NEON::BI__builtin_neon_vcvtpq_u16_f16:
7130 case NEON::BI__builtin_neon_vcvtpq_u32_v:
7131 case NEON::BI__builtin_neon_vcvtp_s64_v:
7132 case NEON::BI__builtin_neon_vcvtpq_s64_v:
7133 case NEON::BI__builtin_neon_vcvtp_u64_v:
7134 case NEON::BI__builtin_neon_vcvtpq_u64_v: {
7135 Int = usgn ? Intrinsic::aarch64_neon_fcvtpu : Intrinsic::aarch64_neon_fcvtps;
7136 llvm::Type *Tys[2] = { Ty, GetFloatNeonType(CGF: this, IntTypeFlags: Type) };
7137 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys), Ops, name: "vcvtp");
7138 }
7139 case NEON::BI__builtin_neon_vmulx_v:
7140 case NEON::BI__builtin_neon_vmulxq_v: {
7141 Int = Intrinsic::aarch64_neon_fmulx;
7142 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: Ty), Ops, name: "vmulx");
7143 }
7144 case NEON::BI__builtin_neon_vmulxh_lane_f16:
7145 case NEON::BI__builtin_neon_vmulxh_laneq_f16: {
7146 // vmulx_lane should be mapped to Neon scalar mulx after
7147 // extracting the scalar element
7148 Ops.push_back(Elt: EmitScalarExpr(E: E->getArg(Arg: 2)));
7149 Ops[1] = Builder.CreateExtractElement(Vec: Ops[1], Idx: Ops[2], Name: "extract");
7150 Ops.pop_back();
7151 Int = Intrinsic::aarch64_neon_fmulx;
7152 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: HalfTy), Ops, name: "vmulx");
7153 }
7154 case NEON::BI__builtin_neon_vmul_lane_v:
7155 case NEON::BI__builtin_neon_vmul_laneq_v: {
7156 // v1f64 vmul_lane should be mapped to Neon scalar mul lane
7157 bool Quad = false;
7158 if (BuiltinID == NEON::BI__builtin_neon_vmul_laneq_v)
7159 Quad = true;
7160 Ops[0] = Builder.CreateBitCast(V: Ops[0], DestTy: DoubleTy);
7161 llvm::FixedVectorType *VTy =
7162 GetNeonType(CGF: this, TypeFlags: NeonTypeFlags(NeonTypeFlags::Float64, false, Quad));
7163 Ops[1] = Builder.CreateBitCast(V: Ops[1], DestTy: VTy);
7164 Ops[1] = Builder.CreateExtractElement(Vec: Ops[1], Idx: Ops[2], Name: "extract");
7165 Value *Result = Builder.CreateFMul(L: Ops[0], R: Ops[1]);
7166 return Builder.CreateBitCast(V: Result, DestTy: Ty);
7167 }
7168 case NEON::BI__builtin_neon_vnegd_s64:
7169 return Builder.CreateNeg(V: EmitScalarExpr(E: E->getArg(Arg: 0)), Name: "vnegd");
7170 case NEON::BI__builtin_neon_vnegh_f16:
7171 return Builder.CreateFNeg(V: EmitScalarExpr(E: E->getArg(Arg: 0)), Name: "vnegh");
7172 case NEON::BI__builtin_neon_vpmaxnm_v:
7173 case NEON::BI__builtin_neon_vpmaxnmq_v: {
7174 Int = Intrinsic::aarch64_neon_fmaxnmp;
7175 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: Ty), Ops, name: "vpmaxnm");
7176 }
7177 case NEON::BI__builtin_neon_vpminnm_v:
7178 case NEON::BI__builtin_neon_vpminnmq_v: {
7179 Int = Intrinsic::aarch64_neon_fminnmp;
7180 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: Ty), Ops, name: "vpminnm");
7181 }
7182 case NEON::BI__builtin_neon_vsqrth_f16: {
7183 Ops.push_back(Elt: EmitScalarExpr(E: E->getArg(Arg: 0)));
7184 Int = Builder.getIsFPConstrained()
7185 ? Intrinsic::experimental_constrained_sqrt
7186 : Intrinsic::sqrt;
7187 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: HalfTy), Ops, name: "vsqrt");
7188 }
7189 case NEON::BI__builtin_neon_vsqrt_v:
7190 case NEON::BI__builtin_neon_vsqrtq_v: {
7191 Int = Builder.getIsFPConstrained()
7192 ? Intrinsic::experimental_constrained_sqrt
7193 : Intrinsic::sqrt;
7194 Ops[0] = Builder.CreateBitCast(V: Ops[0], DestTy: Ty);
7195 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: Ty), Ops, name: "vsqrt");
7196 }
7197 case NEON::BI__builtin_neon_vrbit_v:
7198 case NEON::BI__builtin_neon_vrbitq_v: {
7199 Int = Intrinsic::bitreverse;
7200 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: Ty), Ops, name: "vrbit");
7201 }
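  // Half-precision across-vector reductions (vmaxv/vminv and the *nm
  // variants): the aarch64.neon.f{max,min}[nm]v intrinsics are overloaded on
  // both the scalar result type and the source vector type, and the result
  // is truncated back to half.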
7202 case NEON::BI__builtin_neon_vmaxv_f16: {
7203 Int = Intrinsic::aarch64_neon_fmaxv;
7204 Ty = HalfTy;
7205 VTy = llvm::FixedVectorType::get(ElementType: HalfTy, NumElts: 4);
7206 llvm::Type *Tys[2] = { Ty, VTy };
7207 Ops.push_back(Elt: EmitScalarExpr(E: E->getArg(Arg: 0)));
7208 Ops[0] = EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys), Ops, name: "vmaxv");
7209 return Builder.CreateTrunc(V: Ops[0], DestTy: HalfTy);
7210 }
7211 case NEON::BI__builtin_neon_vmaxvq_f16: {
7212 Int = Intrinsic::aarch64_neon_fmaxv;
7213 Ty = HalfTy;
7214 VTy = llvm::FixedVectorType::get(ElementType: HalfTy, NumElts: 8);
7215 llvm::Type *Tys[2] = { Ty, VTy };
7216 Ops.push_back(Elt: EmitScalarExpr(E: E->getArg(Arg: 0)));
7217 Ops[0] = EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys), Ops, name: "vmaxv");
7218 return Builder.CreateTrunc(V: Ops[0], DestTy: HalfTy);
7219 }
7220 case NEON::BI__builtin_neon_vminv_f16: {
7221 Int = Intrinsic::aarch64_neon_fminv;
7222 Ty = HalfTy;
7223 VTy = llvm::FixedVectorType::get(ElementType: HalfTy, NumElts: 4);
7224 llvm::Type *Tys[2] = { Ty, VTy };
7225 Ops.push_back(Elt: EmitScalarExpr(E: E->getArg(Arg: 0)));
7226 Ops[0] = EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys), Ops, name: "vminv");
7227 return Builder.CreateTrunc(V: Ops[0], DestTy: HalfTy);
7228 }
7229 case NEON::BI__builtin_neon_vminvq_f16: {
7230 Int = Intrinsic::aarch64_neon_fminv;
7231 Ty = HalfTy;
7232 VTy = llvm::FixedVectorType::get(ElementType: HalfTy, NumElts: 8);
7233 llvm::Type *Tys[2] = { Ty, VTy };
7234 Ops.push_back(Elt: EmitScalarExpr(E: E->getArg(Arg: 0)));
7235 Ops[0] = EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys), Ops, name: "vminv");
7236 return Builder.CreateTrunc(V: Ops[0], DestTy: HalfTy);
7237 }
7238 case NEON::BI__builtin_neon_vmaxnmv_f16: {
7239 Int = Intrinsic::aarch64_neon_fmaxnmv;
7240 Ty = HalfTy;
7241 VTy = llvm::FixedVectorType::get(ElementType: HalfTy, NumElts: 4);
7242 llvm::Type *Tys[2] = { Ty, VTy };
7243 Ops.push_back(Elt: EmitScalarExpr(E: E->getArg(Arg: 0)));
7244 Ops[0] = EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys), Ops, name: "vmaxnmv");
7245 return Builder.CreateTrunc(V: Ops[0], DestTy: HalfTy);
7246 }
7247 case NEON::BI__builtin_neon_vmaxnmvq_f16: {
7248 Int = Intrinsic::aarch64_neon_fmaxnmv;
7249 Ty = HalfTy;
7250 VTy = llvm::FixedVectorType::get(ElementType: HalfTy, NumElts: 8);
7251 llvm::Type *Tys[2] = { Ty, VTy };
7252 Ops.push_back(Elt: EmitScalarExpr(E: E->getArg(Arg: 0)));
7253 Ops[0] = EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys), Ops, name: "vmaxnmv");
7254 return Builder.CreateTrunc(V: Ops[0], DestTy: HalfTy);
7255 }
7256 case NEON::BI__builtin_neon_vminnmv_f16: {
7257 Int = Intrinsic::aarch64_neon_fminnmv;
7258 Ty = HalfTy;
7259 VTy = llvm::FixedVectorType::get(ElementType: HalfTy, NumElts: 4);
7260 llvm::Type *Tys[2] = { Ty, VTy };
7261 Ops.push_back(Elt: EmitScalarExpr(E: E->getArg(Arg: 0)));
7262 Ops[0] = EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys), Ops, name: "vminnmv");
7263 return Builder.CreateTrunc(V: Ops[0], DestTy: HalfTy);
7264 }
7265 case NEON::BI__builtin_neon_vminnmvq_f16: {
7266 Int = Intrinsic::aarch64_neon_fminnmv;
7267 Ty = HalfTy;
7268 VTy = llvm::FixedVectorType::get(ElementType: HalfTy, NumElts: 8);
7269 llvm::Type *Tys[2] = { Ty, VTy };
7270 Ops.push_back(Elt: EmitScalarExpr(E: E->getArg(Arg: 0)));
7271 Ops[0] = EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys), Ops, name: "vminnmv");
7272 return Builder.CreateTrunc(V: Ops[0], DestTy: HalfTy);
7273 }
7274 case NEON::BI__builtin_neon_vmul_n_f64: {
7275 Ops[0] = Builder.CreateBitCast(V: Ops[0], DestTy: DoubleTy);
7276 Value *RHS = Builder.CreateBitCast(V: EmitScalarExpr(E: E->getArg(Arg: 1)), DestTy: DoubleTy);
7277 return Builder.CreateFMul(L: Ops[0], R: RHS);
7278 }
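  // Widening add-across-vector reductions: uaddlv/saddlv return an i32; for
  // the 8-bit-element variants the result is truncated to i16 to match the
  // builtin's return type.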
7279 case NEON::BI__builtin_neon_vaddlv_u8: {
7280 Int = Intrinsic::aarch64_neon_uaddlv;
7281 Ty = Int32Ty;
7282 VTy = llvm::FixedVectorType::get(ElementType: Int8Ty, NumElts: 8);
7283 llvm::Type *Tys[2] = { Ty, VTy };
7284 Ops.push_back(Elt: EmitScalarExpr(E: E->getArg(Arg: 0)));
7285 Ops[0] = EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys), Ops, name: "vaddlv");
7286 return Builder.CreateTrunc(V: Ops[0], DestTy: Int16Ty);
7287 }
7288 case NEON::BI__builtin_neon_vaddlv_u16: {
7289 Int = Intrinsic::aarch64_neon_uaddlv;
7290 Ty = Int32Ty;
7291 VTy = llvm::FixedVectorType::get(ElementType: Int16Ty, NumElts: 4);
7292 llvm::Type *Tys[2] = { Ty, VTy };
7293 Ops.push_back(Elt: EmitScalarExpr(E: E->getArg(Arg: 0)));
7294 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys), Ops, name: "vaddlv");
7295 }
7296 case NEON::BI__builtin_neon_vaddlvq_u8: {
7297 Int = Intrinsic::aarch64_neon_uaddlv;
7298 Ty = Int32Ty;
7299 VTy = llvm::FixedVectorType::get(ElementType: Int8Ty, NumElts: 16);
7300 llvm::Type *Tys[2] = { Ty, VTy };
7301 Ops.push_back(Elt: EmitScalarExpr(E: E->getArg(Arg: 0)));
7302 Ops[0] = EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys), Ops, name: "vaddlv");
7303 return Builder.CreateTrunc(V: Ops[0], DestTy: Int16Ty);
7304 }
7305 case NEON::BI__builtin_neon_vaddlvq_u16: {
7306 Int = Intrinsic::aarch64_neon_uaddlv;
7307 Ty = Int32Ty;
7308 VTy = llvm::FixedVectorType::get(ElementType: Int16Ty, NumElts: 8);
7309 llvm::Type *Tys[2] = { Ty, VTy };
7310 Ops.push_back(Elt: EmitScalarExpr(E: E->getArg(Arg: 0)));
7311 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys), Ops, name: "vaddlv");
7312 }
7313 case NEON::BI__builtin_neon_vaddlv_s8: {
7314 Int = Intrinsic::aarch64_neon_saddlv;
7315 Ty = Int32Ty;
7316 VTy = llvm::FixedVectorType::get(ElementType: Int8Ty, NumElts: 8);
7317 llvm::Type *Tys[2] = { Ty, VTy };
7318 Ops.push_back(Elt: EmitScalarExpr(E: E->getArg(Arg: 0)));
7319 Ops[0] = EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys), Ops, name: "vaddlv");
7320 return Builder.CreateTrunc(V: Ops[0], DestTy: Int16Ty);
7321 }
7322 case NEON::BI__builtin_neon_vaddlv_s16: {
7323 Int = Intrinsic::aarch64_neon_saddlv;
7324 Ty = Int32Ty;
7325 VTy = llvm::FixedVectorType::get(ElementType: Int16Ty, NumElts: 4);
7326 llvm::Type *Tys[2] = { Ty, VTy };
7327 Ops.push_back(Elt: EmitScalarExpr(E: E->getArg(Arg: 0)));
7328 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys), Ops, name: "vaddlv");
7329 }
7330 case NEON::BI__builtin_neon_vaddlvq_s8: {
7331 Int = Intrinsic::aarch64_neon_saddlv;
7332 Ty = Int32Ty;
7333 VTy = llvm::FixedVectorType::get(ElementType: Int8Ty, NumElts: 16);
7334 llvm::Type *Tys[2] = { Ty, VTy };
7335 Ops.push_back(Elt: EmitScalarExpr(E: E->getArg(Arg: 0)));
7336 Ops[0] = EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys), Ops, name: "vaddlv");
7337 return Builder.CreateTrunc(V: Ops[0], DestTy: Int16Ty);
7338 }
7339 case NEON::BI__builtin_neon_vaddlvq_s16: {
7340 Int = Intrinsic::aarch64_neon_saddlv;
7341 Ty = Int32Ty;
7342 VTy = llvm::FixedVectorType::get(ElementType: Int16Ty, NumElts: 8);
7343 llvm::Type *Tys[2] = { Ty, VTy };
7344 Ops.push_back(Elt: EmitScalarExpr(E: E->getArg(Arg: 0)));
7345 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys), Ops, name: "vaddlv");
7346 }
7347 case NEON::BI__builtin_neon_vsri_n_v:
7348 case NEON::BI__builtin_neon_vsriq_n_v: {
7349 Int = Intrinsic::aarch64_neon_vsri;
7350 llvm::Function *Intrin = CGM.getIntrinsic(IID: Int, Tys: Ty);
7351 return EmitNeonCall(F: Intrin, Ops, name: "vsri_n");
7352 }
7353 case NEON::BI__builtin_neon_vsli_n_v:
7354 case NEON::BI__builtin_neon_vsliq_n_v: {
7355 Int = Intrinsic::aarch64_neon_vsli;
7356 llvm::Function *Intrin = CGM.getIntrinsic(IID: Int, Tys: Ty);
7357 return EmitNeonCall(F: Intrin, Ops, name: "vsli_n");
7358 }
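  // vsra_n: shift the second operand right by the immediate and accumulate
  // into the first operand. vrsra_n does the same using a rounding shift
  // (urshl/srshl).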
7359 case NEON::BI__builtin_neon_vsra_n_v:
7360 case NEON::BI__builtin_neon_vsraq_n_v:
7361 Ops[0] = Builder.CreateBitCast(V: Ops[0], DestTy: Ty);
7362 Ops[1] = EmitNeonRShiftImm(Vec: Ops[1], Shift: Ops[2], Ty, usgn, name: "vsra_n");
7363 return Builder.CreateAdd(LHS: Ops[0], RHS: Ops[1]);
7364 case NEON::BI__builtin_neon_vrsra_n_v:
7365 case NEON::BI__builtin_neon_vrsraq_n_v: {
7366 Int = usgn ? Intrinsic::aarch64_neon_urshl : Intrinsic::aarch64_neon_srshl;
7367 SmallVector<llvm::Value*,2> TmpOps;
7368 TmpOps.push_back(Elt: Ops[1]);
7369 TmpOps.push_back(Elt: Ops[2]);
7370 Function* F = CGM.getIntrinsic(IID: Int, Tys: Ty);
7371 llvm::Value *tmp = EmitNeonCall(F, Ops&: TmpOps, name: "vrshr_n", shift: 1, rightshift: true);
7372 Ops[0] = Builder.CreateBitCast(V: Ops[0], DestTy: VTy);
7373 return Builder.CreateAdd(LHS: Ops[0], RHS: tmp);
7374 }
7375 case NEON::BI__builtin_neon_vld1_v:
7376 case NEON::BI__builtin_neon_vld1q_v: {
7377 return Builder.CreateAlignedLoad(Ty: VTy, Addr: Ops[0], Align: PtrOp0.getAlignment());
7378 }
7379 case NEON::BI__builtin_neon_vst1_v:
7380 case NEON::BI__builtin_neon_vst1q_v:
7381 Ops[1] = Builder.CreateBitCast(V: Ops[1], DestTy: VTy);
7382 return Builder.CreateAlignedStore(Val: Ops[1], Addr: Ops[0], Align: PtrOp0.getAlignment());
7383 case NEON::BI__builtin_neon_vld1_lane_v:
7384 case NEON::BI__builtin_neon_vld1q_lane_v: {
7385 Ops[1] = Builder.CreateBitCast(V: Ops[1], DestTy: Ty);
7386 Ops[0] = Builder.CreateAlignedLoad(Ty: VTy->getElementType(), Addr: Ops[0],
7387 Align: PtrOp0.getAlignment());
7388 return Builder.CreateInsertElement(Vec: Ops[1], NewElt: Ops[0], Idx: Ops[2], Name: "vld1_lane");
7389 }
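  // vldap1(q)_lane: load-acquire of a single element, modeled as an atomic
  // acquire load followed by an insertelement into the existing vector.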
7390 case NEON::BI__builtin_neon_vldap1_lane_s64:
7391 case NEON::BI__builtin_neon_vldap1q_lane_s64: {
7392 Ops[1] = Builder.CreateBitCast(V: Ops[1], DestTy: Ty);
7393 llvm::LoadInst *LI = Builder.CreateAlignedLoad(
7394 Ty: VTy->getElementType(), Addr: Ops[0], Align: PtrOp0.getAlignment());
7395 LI->setAtomic(Ordering: llvm::AtomicOrdering::Acquire);
7396 Ops[0] = LI;
7397 return Builder.CreateInsertElement(Vec: Ops[1], NewElt: Ops[0], Idx: Ops[2], Name: "vldap1_lane");
7398 }
7399 case NEON::BI__builtin_neon_vld1_dup_v:
7400 case NEON::BI__builtin_neon_vld1q_dup_v: {
7401 Value *V = PoisonValue::get(T: Ty);
7402 Ops[0] = Builder.CreateAlignedLoad(Ty: VTy->getElementType(), Addr: Ops[0],
7403 Align: PtrOp0.getAlignment());
7404 llvm::Constant *CI = ConstantInt::get(Ty: Int32Ty, V: 0);
7405 Ops[0] = Builder.CreateInsertElement(Vec: V, NewElt: Ops[0], Idx: CI);
7406 return EmitNeonSplat(V: Ops[0], C: CI);
7407 }
7408 case NEON::BI__builtin_neon_vst1_lane_v:
7409 case NEON::BI__builtin_neon_vst1q_lane_v:
7410 Ops[1] = Builder.CreateBitCast(V: Ops[1], DestTy: Ty);
7411 Ops[1] = Builder.CreateExtractElement(Vec: Ops[1], Idx: Ops[2]);
7412 return Builder.CreateAlignedStore(Val: Ops[1], Addr: Ops[0], Align: PtrOp0.getAlignment());
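  // vstl1(q)_lane: extract the selected element and emit it as an atomic
  // release (store-release) store.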
7413 case NEON::BI__builtin_neon_vstl1_lane_s64:
7414 case NEON::BI__builtin_neon_vstl1q_lane_s64: {
7415 Ops[1] = Builder.CreateBitCast(V: Ops[1], DestTy: Ty);
7416 Ops[1] = Builder.CreateExtractElement(Vec: Ops[1], Idx: Ops[2]);
7417 llvm::StoreInst *SI =
7418 Builder.CreateAlignedStore(Val: Ops[1], Addr: Ops[0], Align: PtrOp0.getAlignment());
7419 SI->setAtomic(Ordering: llvm::AtomicOrdering::Release);
7420 return SI;
7421 }
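  // Structured loads: the aarch64.neon.ld2/ld3/ld4 intrinsics (and the
  // ld2r/ld3r/ld4r forms for the _dup variants) return a struct of vectors,
  // which is stored through the result pointer in Ops[0].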
7422 case NEON::BI__builtin_neon_vld2_v:
7423 case NEON::BI__builtin_neon_vld2q_v: {
7424 llvm::Type *Tys[2] = {VTy, DefaultPtrTy};
7425 Function *F = CGM.getIntrinsic(IID: Intrinsic::aarch64_neon_ld2, Tys);
7426 Ops[1] = Builder.CreateCall(Callee: F, Args: Ops[1], Name: "vld2");
7427 return Builder.CreateDefaultAlignedStore(Val: Ops[1], Addr: Ops[0]);
7428 }
7429 case NEON::BI__builtin_neon_vld3_v:
7430 case NEON::BI__builtin_neon_vld3q_v: {
7431 llvm::Type *Tys[2] = {VTy, DefaultPtrTy};
7432 Function *F = CGM.getIntrinsic(IID: Intrinsic::aarch64_neon_ld3, Tys);
7433 Ops[1] = Builder.CreateCall(Callee: F, Args: Ops[1], Name: "vld3");
7434 return Builder.CreateDefaultAlignedStore(Val: Ops[1], Addr: Ops[0]);
7435 }
7436 case NEON::BI__builtin_neon_vld4_v:
7437 case NEON::BI__builtin_neon_vld4q_v: {
7438 llvm::Type *Tys[2] = {VTy, DefaultPtrTy};
7439 Function *F = CGM.getIntrinsic(IID: Intrinsic::aarch64_neon_ld4, Tys);
7440 Ops[1] = Builder.CreateCall(Callee: F, Args: Ops[1], Name: "vld4");
7441 return Builder.CreateDefaultAlignedStore(Val: Ops[1], Addr: Ops[0]);
7442 }
7443 case NEON::BI__builtin_neon_vld2_dup_v:
7444 case NEON::BI__builtin_neon_vld2q_dup_v: {
7445 llvm::Type *Tys[2] = {VTy, DefaultPtrTy};
7446 Function *F = CGM.getIntrinsic(IID: Intrinsic::aarch64_neon_ld2r, Tys);
7447 Ops[1] = Builder.CreateCall(Callee: F, Args: Ops[1], Name: "vld2");
7448 return Builder.CreateDefaultAlignedStore(Val: Ops[1], Addr: Ops[0]);
7449 }
7450 case NEON::BI__builtin_neon_vld3_dup_v:
7451 case NEON::BI__builtin_neon_vld3q_dup_v: {
7452 llvm::Type *Tys[2] = {VTy, DefaultPtrTy};
7453 Function *F = CGM.getIntrinsic(IID: Intrinsic::aarch64_neon_ld3r, Tys);
7454 Ops[1] = Builder.CreateCall(Callee: F, Args: Ops[1], Name: "vld3");
7455 return Builder.CreateDefaultAlignedStore(Val: Ops[1], Addr: Ops[0]);
7456 }
7457 case NEON::BI__builtin_neon_vld4_dup_v:
7458 case NEON::BI__builtin_neon_vld4q_dup_v: {
7459 llvm::Type *Tys[2] = {VTy, DefaultPtrTy};
7460 Function *F = CGM.getIntrinsic(IID: Intrinsic::aarch64_neon_ld4r, Tys);
7461 Ops[1] = Builder.CreateCall(Callee: F, Args: Ops[1], Name: "vld4");
7462 return Builder.CreateDefaultAlignedStore(Val: Ops[1], Addr: Ops[0]);
7463 }
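  // Structured lane loads: rotate the source pointer to the end so the
  // intrinsic call sees (vectors..., lane, ptr), zero-extend the lane index
  // to i64, and store the returned struct through the result pointer Ops[0].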
7464 case NEON::BI__builtin_neon_vld2_lane_v:
7465 case NEON::BI__builtin_neon_vld2q_lane_v: {
7466 llvm::Type *Tys[2] = { VTy, Ops[1]->getType() };
7467 Function *F = CGM.getIntrinsic(IID: Intrinsic::aarch64_neon_ld2lane, Tys);
7468 std::rotate(first: Ops.begin() + 1, middle: Ops.begin() + 2, last: Ops.end());
7469 Ops[1] = Builder.CreateBitCast(V: Ops[1], DestTy: Ty);
7470 Ops[2] = Builder.CreateBitCast(V: Ops[2], DestTy: Ty);
7471 Ops[3] = Builder.CreateZExt(V: Ops[3], DestTy: Int64Ty);
7472 Ops[1] = Builder.CreateCall(Callee: F, Args: ArrayRef(Ops).slice(N: 1), Name: "vld2_lane");
7473 return Builder.CreateDefaultAlignedStore(Val: Ops[1], Addr: Ops[0]);
7474 }
7475 case NEON::BI__builtin_neon_vld3_lane_v:
7476 case NEON::BI__builtin_neon_vld3q_lane_v: {
7477 llvm::Type *Tys[2] = { VTy, Ops[1]->getType() };
7478 Function *F = CGM.getIntrinsic(IID: Intrinsic::aarch64_neon_ld3lane, Tys);
7479 std::rotate(first: Ops.begin() + 1, middle: Ops.begin() + 2, last: Ops.end());
7480 Ops[1] = Builder.CreateBitCast(V: Ops[1], DestTy: Ty);
7481 Ops[2] = Builder.CreateBitCast(V: Ops[2], DestTy: Ty);
7482 Ops[3] = Builder.CreateBitCast(V: Ops[3], DestTy: Ty);
7483 Ops[4] = Builder.CreateZExt(V: Ops[4], DestTy: Int64Ty);
7484 Ops[1] = Builder.CreateCall(Callee: F, Args: ArrayRef(Ops).slice(N: 1), Name: "vld3_lane");
7485 return Builder.CreateDefaultAlignedStore(Val: Ops[1], Addr: Ops[0]);
7486 }
7487 case NEON::BI__builtin_neon_vld4_lane_v:
7488 case NEON::BI__builtin_neon_vld4q_lane_v: {
7489 llvm::Type *Tys[2] = { VTy, Ops[1]->getType() };
7490 Function *F = CGM.getIntrinsic(IID: Intrinsic::aarch64_neon_ld4lane, Tys);
7491 std::rotate(first: Ops.begin() + 1, middle: Ops.begin() + 2, last: Ops.end());
7492 Ops[1] = Builder.CreateBitCast(V: Ops[1], DestTy: Ty);
7493 Ops[2] = Builder.CreateBitCast(V: Ops[2], DestTy: Ty);
7494 Ops[3] = Builder.CreateBitCast(V: Ops[3], DestTy: Ty);
7495 Ops[4] = Builder.CreateBitCast(V: Ops[4], DestTy: Ty);
7496 Ops[5] = Builder.CreateZExt(V: Ops[5], DestTy: Int64Ty);
7497 Ops[1] = Builder.CreateCall(Callee: F, Args: ArrayRef(Ops).slice(N: 1), Name: "vld4_lane");
7498 return Builder.CreateDefaultAlignedStore(Val: Ops[1], Addr: Ops[0]);
7499 }
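  // Structured stores: rotate the destination pointer to the end, since the
  // aarch64.neon.st2/st3/st4[lane] intrinsics take the pointer as their last
  // operand.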
7500 case NEON::BI__builtin_neon_vst2_v:
7501 case NEON::BI__builtin_neon_vst2q_v: {
7502 std::rotate(first: Ops.begin(), middle: Ops.begin() + 1, last: Ops.end());
7503 llvm::Type *Tys[2] = { VTy, Ops[2]->getType() };
7504 return EmitNeonCall(F: CGM.getIntrinsic(IID: Intrinsic::aarch64_neon_st2, Tys),
7505 Ops, name: "");
7506 }
7507 case NEON::BI__builtin_neon_vst2_lane_v:
7508 case NEON::BI__builtin_neon_vst2q_lane_v: {
7509 std::rotate(first: Ops.begin(), middle: Ops.begin() + 1, last: Ops.end());
7510 Ops[2] = Builder.CreateZExt(V: Ops[2], DestTy: Int64Ty);
7511 llvm::Type *Tys[2] = { VTy, Ops[3]->getType() };
7512 return EmitNeonCall(F: CGM.getIntrinsic(IID: Intrinsic::aarch64_neon_st2lane, Tys),
7513 Ops, name: "");
7514 }
7515 case NEON::BI__builtin_neon_vst3_v:
7516 case NEON::BI__builtin_neon_vst3q_v: {
7517 std::rotate(first: Ops.begin(), middle: Ops.begin() + 1, last: Ops.end());
7518 llvm::Type *Tys[2] = { VTy, Ops[3]->getType() };
7519 return EmitNeonCall(F: CGM.getIntrinsic(IID: Intrinsic::aarch64_neon_st3, Tys),
7520 Ops, name: "");
7521 }
7522 case NEON::BI__builtin_neon_vst3_lane_v:
7523 case NEON::BI__builtin_neon_vst3q_lane_v: {
7524 std::rotate(first: Ops.begin(), middle: Ops.begin() + 1, last: Ops.end());
7525 Ops[3] = Builder.CreateZExt(V: Ops[3], DestTy: Int64Ty);
7526 llvm::Type *Tys[2] = { VTy, Ops[4]->getType() };
7527 return EmitNeonCall(F: CGM.getIntrinsic(IID: Intrinsic::aarch64_neon_st3lane, Tys),
7528 Ops, name: "");
7529 }
7530 case NEON::BI__builtin_neon_vst4_v:
7531 case NEON::BI__builtin_neon_vst4q_v: {
7532 std::rotate(first: Ops.begin(), middle: Ops.begin() + 1, last: Ops.end());
7533 llvm::Type *Tys[2] = { VTy, Ops[4]->getType() };
7534 return EmitNeonCall(F: CGM.getIntrinsic(IID: Intrinsic::aarch64_neon_st4, Tys),
7535 Ops, name: "");
7536 }
7537 case NEON::BI__builtin_neon_vst4_lane_v:
7538 case NEON::BI__builtin_neon_vst4q_lane_v: {
7539 std::rotate(first: Ops.begin(), middle: Ops.begin() + 1, last: Ops.end());
7540 Ops[4] = Builder.CreateZExt(V: Ops[4], DestTy: Int64Ty);
7541 llvm::Type *Tys[2] = { VTy, Ops[5]->getType() };
7542 return EmitNeonCall(F: CGM.getIntrinsic(IID: Intrinsic::aarch64_neon_st4lane, Tys),
7543 Ops, name: "");
7544 }
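  // vtrn/vuzp/vzip return a pair of vectors; each half is built with a
  // shufflevector over the two inputs and stored to the corresponding slot
  // of the result.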
7545 case NEON::BI__builtin_neon_vtrn_v:
7546 case NEON::BI__builtin_neon_vtrnq_v: {
7547 Ops[1] = Builder.CreateBitCast(V: Ops[1], DestTy: Ty);
7548 Ops[2] = Builder.CreateBitCast(V: Ops[2], DestTy: Ty);
7549 Value *SV = nullptr;
7550
7551 for (unsigned vi = 0; vi != 2; ++vi) {
7552 SmallVector<int, 16> Indices;
7553 for (unsigned i = 0, e = VTy->getNumElements(); i != e; i += 2) {
7554 Indices.push_back(Elt: i+vi);
7555 Indices.push_back(Elt: i+e+vi);
7556 }
7557 Value *Addr = Builder.CreateConstInBoundsGEP1_32(Ty, Ptr: Ops[0], Idx0: vi);
7558 SV = Builder.CreateShuffleVector(V1: Ops[1], V2: Ops[2], Mask: Indices, Name: "vtrn");
7559 SV = Builder.CreateDefaultAlignedStore(Val: SV, Addr);
7560 }
7561 return SV;
7562 }
7563 case NEON::BI__builtin_neon_vuzp_v:
7564 case NEON::BI__builtin_neon_vuzpq_v: {
7565 Ops[1] = Builder.CreateBitCast(V: Ops[1], DestTy: Ty);
7566 Ops[2] = Builder.CreateBitCast(V: Ops[2], DestTy: Ty);
7567 Value *SV = nullptr;
7568
7569 for (unsigned vi = 0; vi != 2; ++vi) {
7570 SmallVector<int, 16> Indices;
7571 for (unsigned i = 0, e = VTy->getNumElements(); i != e; ++i)
7572 Indices.push_back(Elt: 2*i+vi);
7573
7574 Value *Addr = Builder.CreateConstInBoundsGEP1_32(Ty, Ptr: Ops[0], Idx0: vi);
7575 SV = Builder.CreateShuffleVector(V1: Ops[1], V2: Ops[2], Mask: Indices, Name: "vuzp");
7576 SV = Builder.CreateDefaultAlignedStore(Val: SV, Addr);
7577 }
7578 return SV;
7579 }
7580 case NEON::BI__builtin_neon_vzip_v:
7581 case NEON::BI__builtin_neon_vzipq_v: {
7582 Ops[1] = Builder.CreateBitCast(V: Ops[1], DestTy: Ty);
7583 Ops[2] = Builder.CreateBitCast(V: Ops[2], DestTy: Ty);
7584 Value *SV = nullptr;
7585
7586 for (unsigned vi = 0; vi != 2; ++vi) {
7587 SmallVector<int, 16> Indices;
7588 for (unsigned i = 0, e = VTy->getNumElements(); i != e; i += 2) {
7589 Indices.push_back(Elt: (i + vi*e) >> 1);
7590 Indices.push_back(Elt: ((i + vi*e) >> 1)+e);
7591 }
7592 Value *Addr = Builder.CreateConstInBoundsGEP1_32(Ty, Ptr: Ops[0], Idx0: vi);
7593 SV = Builder.CreateShuffleVector(V1: Ops[1], V2: Ops[2], Mask: Indices, Name: "vzip");
7594 SV = Builder.CreateDefaultAlignedStore(Val: SV, Addr);
7595 }
7596 return SV;
7597 }
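  // Quad table lookups map directly onto the aarch64.neon.tbl1-4 intrinsics
  // (tbx1-4 for the variants that fall back to the destination on
  // out-of-range indices).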
7598 case NEON::BI__builtin_neon_vqtbl1q_v: {
7599 return EmitNeonCall(F: CGM.getIntrinsic(IID: Intrinsic::aarch64_neon_tbl1, Tys: Ty),
7600 Ops, name: "vtbl1");
7601 }
7602 case NEON::BI__builtin_neon_vqtbl2q_v: {
7603 return EmitNeonCall(F: CGM.getIntrinsic(IID: Intrinsic::aarch64_neon_tbl2, Tys: Ty),
7604 Ops, name: "vtbl2");
7605 }
7606 case NEON::BI__builtin_neon_vqtbl3q_v: {
7607 return EmitNeonCall(F: CGM.getIntrinsic(IID: Intrinsic::aarch64_neon_tbl3, Tys: Ty),
7608 Ops, name: "vtbl3");
7609 }
7610 case NEON::BI__builtin_neon_vqtbl4q_v: {
7611 return EmitNeonCall(F: CGM.getIntrinsic(IID: Intrinsic::aarch64_neon_tbl4, Tys: Ty),
7612 Ops, name: "vtbl4");
7613 }
7614 case NEON::BI__builtin_neon_vqtbx1q_v: {
7615 return EmitNeonCall(F: CGM.getIntrinsic(IID: Intrinsic::aarch64_neon_tbx1, Tys: Ty),
7616 Ops, name: "vtbx1");
7617 }
7618 case NEON::BI__builtin_neon_vqtbx2q_v: {
7619 return EmitNeonCall(F: CGM.getIntrinsic(IID: Intrinsic::aarch64_neon_tbx2, Tys: Ty),
7620 Ops, name: "vtbx2");
7621 }
7622 case NEON::BI__builtin_neon_vqtbx3q_v: {
7623 return EmitNeonCall(F: CGM.getIntrinsic(IID: Intrinsic::aarch64_neon_tbx3, Tys: Ty),
7624 Ops, name: "vtbx3");
7625 }
7626 case NEON::BI__builtin_neon_vqtbx4q_v: {
7627 return EmitNeonCall(F: CGM.getIntrinsic(IID: Intrinsic::aarch64_neon_tbx4, Tys: Ty),
7628 Ops, name: "vtbx4");
7629 }
7630 case NEON::BI__builtin_neon_vsqadd_v:
7631 case NEON::BI__builtin_neon_vsqaddq_v: {
7632 Int = Intrinsic::aarch64_neon_usqadd;
7633 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: Ty), Ops, name: "vsqadd");
7634 }
7635 case NEON::BI__builtin_neon_vuqadd_v:
7636 case NEON::BI__builtin_neon_vuqaddq_v: {
7637 Int = Intrinsic::aarch64_neon_suqadd;
7638 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: Ty), Ops, name: "vuqadd");
7639 }
7640
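  // Lookup-table permute intrinsics (LUTI2/LUTI4, FEAT_LUT): the vluti2
  // forms are overloaded on both the result vector type and a second NEON
  // vector type of the same element type; the vluti4q forms only on the
  // result type.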
7641 case NEON::BI__builtin_neon_vluti2_laneq_mf8:
7642 case NEON::BI__builtin_neon_vluti2_laneq_bf16:
7643 case NEON::BI__builtin_neon_vluti2_laneq_f16:
7644 case NEON::BI__builtin_neon_vluti2_laneq_p16:
7645 case NEON::BI__builtin_neon_vluti2_laneq_p8:
7646 case NEON::BI__builtin_neon_vluti2_laneq_s16:
7647 case NEON::BI__builtin_neon_vluti2_laneq_s8:
7648 case NEON::BI__builtin_neon_vluti2_laneq_u16:
7649 case NEON::BI__builtin_neon_vluti2_laneq_u8: {
7650 Int = Intrinsic::aarch64_neon_vluti2_laneq;
7651 llvm::Type *Tys[2];
7652 Tys[0] = Ty;
7653 Tys[1] = GetNeonType(CGF: this, TypeFlags: NeonTypeFlags(Type.getEltType(), false,
7654 /*isQuad*/ false));
7655 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys), Ops, name: "vluti2_laneq");
7656 }
7657 case NEON::BI__builtin_neon_vluti2q_laneq_mf8:
7658 case NEON::BI__builtin_neon_vluti2q_laneq_bf16:
7659 case NEON::BI__builtin_neon_vluti2q_laneq_f16:
7660 case NEON::BI__builtin_neon_vluti2q_laneq_p16:
7661 case NEON::BI__builtin_neon_vluti2q_laneq_p8:
7662 case NEON::BI__builtin_neon_vluti2q_laneq_s16:
7663 case NEON::BI__builtin_neon_vluti2q_laneq_s8:
7664 case NEON::BI__builtin_neon_vluti2q_laneq_u16:
7665 case NEON::BI__builtin_neon_vluti2q_laneq_u8: {
7666 Int = Intrinsic::aarch64_neon_vluti2_laneq;
7667 llvm::Type *Tys[2];
7668 Tys[0] = Ty;
7669 Tys[1] = GetNeonType(CGF: this, TypeFlags: NeonTypeFlags(Type.getEltType(), false,
7670 /*isQuad*/ true));
7671 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys), Ops, name: "vluti2_laneq");
7672 }
7673 case NEON::BI__builtin_neon_vluti2_lane_mf8:
7674 case NEON::BI__builtin_neon_vluti2_lane_bf16:
7675 case NEON::BI__builtin_neon_vluti2_lane_f16:
7676 case NEON::BI__builtin_neon_vluti2_lane_p16:
7677 case NEON::BI__builtin_neon_vluti2_lane_p8:
7678 case NEON::BI__builtin_neon_vluti2_lane_s16:
7679 case NEON::BI__builtin_neon_vluti2_lane_s8:
7680 case NEON::BI__builtin_neon_vluti2_lane_u16:
7681 case NEON::BI__builtin_neon_vluti2_lane_u8: {
7682 Int = Intrinsic::aarch64_neon_vluti2_lane;
7683 llvm::Type *Tys[2];
7684 Tys[0] = Ty;
7685 Tys[1] = GetNeonType(CGF: this, TypeFlags: NeonTypeFlags(Type.getEltType(), false,
7686 /*isQuad*/ false));
7687 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys), Ops, name: "vluti2_lane");
7688 }
7689 case NEON::BI__builtin_neon_vluti2q_lane_mf8:
7690 case NEON::BI__builtin_neon_vluti2q_lane_bf16:
7691 case NEON::BI__builtin_neon_vluti2q_lane_f16:
7692 case NEON::BI__builtin_neon_vluti2q_lane_p16:
7693 case NEON::BI__builtin_neon_vluti2q_lane_p8:
7694 case NEON::BI__builtin_neon_vluti2q_lane_s16:
7695 case NEON::BI__builtin_neon_vluti2q_lane_s8:
7696 case NEON::BI__builtin_neon_vluti2q_lane_u16:
7697 case NEON::BI__builtin_neon_vluti2q_lane_u8: {
7698 Int = Intrinsic::aarch64_neon_vluti2_lane;
7699 llvm::Type *Tys[2];
7700 Tys[0] = Ty;
7701 Tys[1] = GetNeonType(CGF: this, TypeFlags: NeonTypeFlags(Type.getEltType(), false,
7702 /*isQuad*/ true));
7703 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys), Ops, name: "vluti2_lane");
7704 }
7705 case NEON::BI__builtin_neon_vluti4q_lane_mf8:
7706 case NEON::BI__builtin_neon_vluti4q_lane_p8:
7707 case NEON::BI__builtin_neon_vluti4q_lane_s8:
7708 case NEON::BI__builtin_neon_vluti4q_lane_u8: {
7709 Int = Intrinsic::aarch64_neon_vluti4q_lane;
7710 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: Ty), Ops, name: "vluti4q_lane");
7711 }
7712 case NEON::BI__builtin_neon_vluti4q_laneq_mf8:
7713 case NEON::BI__builtin_neon_vluti4q_laneq_p8:
7714 case NEON::BI__builtin_neon_vluti4q_laneq_s8:
7715 case NEON::BI__builtin_neon_vluti4q_laneq_u8: {
7716 Int = Intrinsic::aarch64_neon_vluti4q_laneq;
7717 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: Ty), Ops, name: "vluti4q_laneq");
7718 }
7719 case NEON::BI__builtin_neon_vluti4q_lane_bf16_x2:
7720 case NEON::BI__builtin_neon_vluti4q_lane_f16_x2:
7721 case NEON::BI__builtin_neon_vluti4q_lane_p16_x2:
7722 case NEON::BI__builtin_neon_vluti4q_lane_s16_x2:
7723 case NEON::BI__builtin_neon_vluti4q_lane_u16_x2: {
7724 Int = Intrinsic::aarch64_neon_vluti4q_lane_x2;
7725 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: Ty), Ops, name: "vluti4q_lane_x2");
7726 }
7727 case NEON::BI__builtin_neon_vluti4q_laneq_bf16_x2:
7728 case NEON::BI__builtin_neon_vluti4q_laneq_f16_x2:
7729 case NEON::BI__builtin_neon_vluti4q_laneq_p16_x2:
7730 case NEON::BI__builtin_neon_vluti4q_laneq_s16_x2:
7731 case NEON::BI__builtin_neon_vluti4q_laneq_u16_x2: {
7732 Int = Intrinsic::aarch64_neon_vluti4q_laneq_x2;
7733 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: Ty), Ops, name: "vluti4q_laneq_x2");
7734 }
7735 case NEON::BI__builtin_neon_vmmlaq_f16_mf8_fpm:
7736 return EmitFP8NeonCall(IID: Intrinsic::aarch64_neon_fmmla,
7737 Tys: {llvm::FixedVectorType::get(ElementType: HalfTy, NumElts: 8),
7738 llvm::FixedVectorType::get(ElementType: Int8Ty, NumElts: 16)},
7739 Ops, E, name: "fmmla");
7740 case NEON::BI__builtin_neon_vmmlaq_f32_mf8_fpm:
7741 return EmitFP8NeonCall(IID: Intrinsic::aarch64_neon_fmmla,
7742 Tys: {llvm::FixedVectorType::get(ElementType: FloatTy, NumElts: 4),
7743 llvm::FixedVectorType::get(ElementType: Int8Ty, NumElts: 16)},
7744 Ops, E, name: "fmmla");
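  // FP8 conversions: cvtl1/cvtl2 widen mf8 inputs to bf16/f16 (for the
  // _low_ variants only the low half of the source is converted), while
  // fcvtn/fcvtn2 narrow f16/f32 inputs to mf8.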
7745 case NEON::BI__builtin_neon_vcvt1_low_bf16_mf8_fpm:
7746 ExtractLow = true;
7747 [[fallthrough]];
7748 case NEON::BI__builtin_neon_vcvt1_bf16_mf8_fpm:
7749 case NEON::BI__builtin_neon_vcvt1_high_bf16_mf8_fpm:
7750 return EmitFP8NeonCvtCall(IID: Intrinsic::aarch64_neon_fp8_cvtl1,
7751 Ty0: llvm::FixedVectorType::get(ElementType: BFloatTy, NumElts: 8),
7752 Ty1: Ops[0]->getType(), Extract: ExtractLow, Ops, E, name: "vbfcvt1");
7753 case NEON::BI__builtin_neon_vcvt2_low_bf16_mf8_fpm:
7754 ExtractLow = true;
7755 [[fallthrough]];
7756 case NEON::BI__builtin_neon_vcvt2_bf16_mf8_fpm:
7757 case NEON::BI__builtin_neon_vcvt2_high_bf16_mf8_fpm:
7758 return EmitFP8NeonCvtCall(IID: Intrinsic::aarch64_neon_fp8_cvtl2,
7759 Ty0: llvm::FixedVectorType::get(ElementType: BFloatTy, NumElts: 8),
7760 Ty1: Ops[0]->getType(), Extract: ExtractLow, Ops, E, name: "vbfcvt2");
7761 case NEON::BI__builtin_neon_vcvt1_low_f16_mf8_fpm:
7762 ExtractLow = true;
7763 [[fallthrough]];
7764 case NEON::BI__builtin_neon_vcvt1_f16_mf8_fpm:
7765 case NEON::BI__builtin_neon_vcvt1_high_f16_mf8_fpm:
7766 return EmitFP8NeonCvtCall(IID: Intrinsic::aarch64_neon_fp8_cvtl1,
7767 Ty0: llvm::FixedVectorType::get(ElementType: HalfTy, NumElts: 8),
7768 Ty1: Ops[0]->getType(), Extract: ExtractLow, Ops, E, name: "vbfcvt1");
7769 case NEON::BI__builtin_neon_vcvt2_low_f16_mf8_fpm:
7770 ExtractLow = true;
7771 [[fallthrough]];
7772 case NEON::BI__builtin_neon_vcvt2_f16_mf8_fpm:
7773 case NEON::BI__builtin_neon_vcvt2_high_f16_mf8_fpm:
7774 return EmitFP8NeonCvtCall(IID: Intrinsic::aarch64_neon_fp8_cvtl2,
7775 Ty0: llvm::FixedVectorType::get(ElementType: HalfTy, NumElts: 8),
7776 Ty1: Ops[0]->getType(), Extract: ExtractLow, Ops, E, name: "vbfcvt2");
7777 case NEON::BI__builtin_neon_vcvt_mf8_f32_fpm:
7778 return EmitFP8NeonCvtCall(IID: Intrinsic::aarch64_neon_fp8_fcvtn,
7779 Ty0: llvm::FixedVectorType::get(ElementType: Int8Ty, NumElts: 8),
7780 Ty1: Ops[0]->getType(), Extract: false, Ops, E, name: "vfcvtn");
7781 case NEON::BI__builtin_neon_vcvt_mf8_f16_fpm:
7782 return EmitFP8NeonCvtCall(IID: Intrinsic::aarch64_neon_fp8_fcvtn,
7783 Ty0: llvm::FixedVectorType::get(ElementType: Int8Ty, NumElts: 8),
7784 Ty1: llvm::FixedVectorType::get(ElementType: HalfTy, NumElts: 4), Extract: false, Ops,
7785 E, name: "vfcvtn");
7786 case NEON::BI__builtin_neon_vcvtq_mf8_f16_fpm:
7787 return EmitFP8NeonCvtCall(IID: Intrinsic::aarch64_neon_fp8_fcvtn,
7788 Ty0: llvm::FixedVectorType::get(ElementType: Int8Ty, NumElts: 16),
7789 Ty1: llvm::FixedVectorType::get(ElementType: HalfTy, NumElts: 8), Extract: false, Ops,
7790 E, name: "vfcvtn");
7791 case NEON::BI__builtin_neon_vcvt_high_mf8_f32_fpm: {
7792 llvm::Type *Ty = llvm::FixedVectorType::get(ElementType: Int8Ty, NumElts: 16);
7793 Ops[0] = Builder.CreateInsertVector(DstType: Ty, SrcVec: PoisonValue::get(T: Ty), SubVec: Ops[0],
7794 Idx: uint64_t(0));
7795 return EmitFP8NeonCvtCall(IID: Intrinsic::aarch64_neon_fp8_fcvtn2, Ty0: Ty,
7796 Ty1: Ops[1]->getType(), Extract: false, Ops, E, name: "vfcvtn2");
7797 }
7798
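  // FP8 (mf8) dot products and widening multiply-accumulates: the trailing
  // fpm operand carries the FP8 mode and is threaded through the
  // EmitFP8Neon* helpers.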
7799 case NEON::BI__builtin_neon_vdot_f16_mf8_fpm:
7800 case NEON::BI__builtin_neon_vdotq_f16_mf8_fpm:
7801 return EmitFP8NeonFDOTCall(IID: Intrinsic::aarch64_neon_fp8_fdot2, ExtendLaneArg: false, RetTy: HalfTy,
7802 Ops, E, name: "fdot2");
7803 case NEON::BI__builtin_neon_vdot_lane_f16_mf8_fpm:
7804 case NEON::BI__builtin_neon_vdotq_lane_f16_mf8_fpm:
7805 ExtendLaneArg = true;
7806 [[fallthrough]];
7807 case NEON::BI__builtin_neon_vdot_laneq_f16_mf8_fpm:
7808 case NEON::BI__builtin_neon_vdotq_laneq_f16_mf8_fpm:
7809 return EmitFP8NeonFDOTCall(IID: Intrinsic::aarch64_neon_fp8_fdot2_lane,
7810 ExtendLaneArg, RetTy: HalfTy, Ops, E, name: "fdot2_lane");
7811 case NEON::BI__builtin_neon_vdot_f32_mf8_fpm:
7812 case NEON::BI__builtin_neon_vdotq_f32_mf8_fpm:
7813 return EmitFP8NeonFDOTCall(IID: Intrinsic::aarch64_neon_fp8_fdot4, ExtendLaneArg: false,
7814 RetTy: FloatTy, Ops, E, name: "fdot4");
7815 case NEON::BI__builtin_neon_vdot_lane_f32_mf8_fpm:
7816 case NEON::BI__builtin_neon_vdotq_lane_f32_mf8_fpm:
7817 ExtendLaneArg = true;
7818 [[fallthrough]];
7819 case NEON::BI__builtin_neon_vdot_laneq_f32_mf8_fpm:
7820 case NEON::BI__builtin_neon_vdotq_laneq_f32_mf8_fpm:
7821 return EmitFP8NeonFDOTCall(IID: Intrinsic::aarch64_neon_fp8_fdot4_lane,
7822 ExtendLaneArg, RetTy: FloatTy, Ops, E, name: "fdot4_lane");
7823
7824 case NEON::BI__builtin_neon_vmlalbq_f16_mf8_fpm:
7825 return EmitFP8NeonCall(IID: Intrinsic::aarch64_neon_fp8_fmlalb,
7826 Tys: {llvm::FixedVectorType::get(ElementType: HalfTy, NumElts: 8)}, Ops, E,
7827 name: "vmlal");
7828 case NEON::BI__builtin_neon_vmlaltq_f16_mf8_fpm:
7829 return EmitFP8NeonCall(IID: Intrinsic::aarch64_neon_fp8_fmlalt,
7830 Tys: {llvm::FixedVectorType::get(ElementType: HalfTy, NumElts: 8)}, Ops, E,
7831 name: "vmlal");
7832 case NEON::BI__builtin_neon_vmlallbbq_f32_mf8_fpm:
7833 return EmitFP8NeonCall(IID: Intrinsic::aarch64_neon_fp8_fmlallbb,
7834 Tys: {llvm::FixedVectorType::get(ElementType: FloatTy, NumElts: 4)}, Ops, E,
7835 name: "vmlall");
7836 case NEON::BI__builtin_neon_vmlallbtq_f32_mf8_fpm:
7837 return EmitFP8NeonCall(IID: Intrinsic::aarch64_neon_fp8_fmlallbt,
7838 Tys: {llvm::FixedVectorType::get(ElementType: FloatTy, NumElts: 4)}, Ops, E,
7839 name: "vmlall");
7840 case NEON::BI__builtin_neon_vmlalltbq_f32_mf8_fpm:
7841 return EmitFP8NeonCall(IID: Intrinsic::aarch64_neon_fp8_fmlalltb,
7842 Tys: {llvm::FixedVectorType::get(ElementType: FloatTy, NumElts: 4)}, Ops, E,
7843 name: "vmlall");
7844 case NEON::BI__builtin_neon_vmlallttq_f32_mf8_fpm:
7845 return EmitFP8NeonCall(IID: Intrinsic::aarch64_neon_fp8_fmlalltt,
7846 Tys: {llvm::FixedVectorType::get(ElementType: FloatTy, NumElts: 4)}, Ops, E,
7847 name: "vmlall");
7848 case NEON::BI__builtin_neon_vmlalbq_lane_f16_mf8_fpm:
7849 ExtendLaneArg = true;
7850 [[fallthrough]];
7851 case NEON::BI__builtin_neon_vmlalbq_laneq_f16_mf8_fpm:
7852 return EmitFP8NeonFMLACall(IID: Intrinsic::aarch64_neon_fp8_fmlalb_lane,
7853 ExtendLaneArg, RetTy: HalfTy, Ops, E, name: "vmlal_lane");
7854 case NEON::BI__builtin_neon_vmlaltq_lane_f16_mf8_fpm:
7855 ExtendLaneArg = true;
7856 [[fallthrough]];
7857 case NEON::BI__builtin_neon_vmlaltq_laneq_f16_mf8_fpm:
7858 return EmitFP8NeonFMLACall(IID: Intrinsic::aarch64_neon_fp8_fmlalt_lane,
7859 ExtendLaneArg, RetTy: HalfTy, Ops, E, name: "vmlal_lane");
7860 case NEON::BI__builtin_neon_vmlallbbq_lane_f32_mf8_fpm:
7861 ExtendLaneArg = true;
7862 [[fallthrough]];
7863 case NEON::BI__builtin_neon_vmlallbbq_laneq_f32_mf8_fpm:
7864 return EmitFP8NeonFMLACall(IID: Intrinsic::aarch64_neon_fp8_fmlallbb_lane,
7865 ExtendLaneArg, RetTy: FloatTy, Ops, E, name: "vmlall_lane");
7866 case NEON::BI__builtin_neon_vmlallbtq_lane_f32_mf8_fpm:
7867 ExtendLaneArg = true;
7868 [[fallthrough]];
7869 case NEON::BI__builtin_neon_vmlallbtq_laneq_f32_mf8_fpm:
7870 return EmitFP8NeonFMLACall(IID: Intrinsic::aarch64_neon_fp8_fmlallbt_lane,
7871 ExtendLaneArg, RetTy: FloatTy, Ops, E, name: "vmlall_lane");
7872 case NEON::BI__builtin_neon_vmlalltbq_lane_f32_mf8_fpm:
7873 ExtendLaneArg = true;
7874 [[fallthrough]];
7875 case NEON::BI__builtin_neon_vmlalltbq_laneq_f32_mf8_fpm:
7876 return EmitFP8NeonFMLACall(IID: Intrinsic::aarch64_neon_fp8_fmlalltb_lane,
7877 ExtendLaneArg, RetTy: FloatTy, Ops, E, name: "vmlall_lane");
7878 case NEON::BI__builtin_neon_vmlallttq_lane_f32_mf8_fpm:
7879 ExtendLaneArg = true;
7880 [[fallthrough]];
7881 case NEON::BI__builtin_neon_vmlallttq_laneq_f32_mf8_fpm:
7882 return EmitFP8NeonFMLACall(IID: Intrinsic::aarch64_neon_fp8_fmlalltt_lane,
7883 ExtendLaneArg, RetTy: FloatTy, Ops, E, name: "vmlall_lane");
7884 case NEON::BI__builtin_neon_vamin_f16:
7885 case NEON::BI__builtin_neon_vaminq_f16:
7886 case NEON::BI__builtin_neon_vamin_f32:
7887 case NEON::BI__builtin_neon_vaminq_f32:
7888 case NEON::BI__builtin_neon_vaminq_f64: {
7889 Int = Intrinsic::aarch64_neon_famin;
7890 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: Ty), Ops, name: "famin");
7891 }
7892 case NEON::BI__builtin_neon_vamax_f16:
7893 case NEON::BI__builtin_neon_vamaxq_f16:
7894 case NEON::BI__builtin_neon_vamax_f32:
7895 case NEON::BI__builtin_neon_vamaxq_f32:
7896 case NEON::BI__builtin_neon_vamaxq_f64: {
7897 Int = Intrinsic::aarch64_neon_famax;
7898 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: Ty), Ops, name: "famax");
7899 }
7900 case NEON::BI__builtin_neon_vscale_f16:
7901 case NEON::BI__builtin_neon_vscaleq_f16:
7902 case NEON::BI__builtin_neon_vscale_f32:
7903 case NEON::BI__builtin_neon_vscaleq_f32:
7904 case NEON::BI__builtin_neon_vscaleq_f64: {
7905 Int = Intrinsic::aarch64_neon_fp8_fscale;
7906 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: Ty), Ops, name: "fscale");
7907 }
7908 }
7909}
7910
7911Value *CodeGenFunction::EmitBPFBuiltinExpr(unsigned BuiltinID,
7912 const CallExpr *E) {
7913 assert((BuiltinID == BPF::BI__builtin_preserve_field_info ||
7914 BuiltinID == BPF::BI__builtin_btf_type_id ||
7915 BuiltinID == BPF::BI__builtin_preserve_type_info ||
7916 BuiltinID == BPF::BI__builtin_preserve_enum_value) &&
7917 "unexpected BPF builtin");
7918
  // A sequence number, injected into calls to these IR builtins, to prevent
  // CSE when otherwise-identical calls differ only in their debuginfo
  // metadata.
7922 static uint32_t BuiltinSeqNum;
7923
7924 switch (BuiltinID) {
7925 default:
7926 llvm_unreachable("Unexpected BPF builtin");
7927 case BPF::BI__builtin_preserve_field_info: {
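    // Typical (illustrative) use from BPF CO-RE code:
    //   __builtin_preserve_field_info(obj->field, /*info kind*/ 0);
    // where the first argument is a field access and the second is a
    // constant selecting which kind of field information to preserve
    // (byte offset, size, existence, signedness, ...).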
7928 const Expr *Arg = E->getArg(Arg: 0);
7929 bool IsBitField = Arg->IgnoreParens()->getObjectKind() == OK_BitField;
7930
7931 if (!getDebugInfo()) {
7932 CGM.Error(loc: E->getExprLoc(),
7933 error: "using __builtin_preserve_field_info() without -g");
7934 return IsBitField ? EmitLValue(E: Arg).getRawBitFieldPointer(CGF&: *this)
7935 : EmitLValue(E: Arg).emitRawPointer(CGF&: *this);
7936 }
7937
7938 // Enable underlying preserve_*_access_index() generation.
7939 bool OldIsInPreservedAIRegion = IsInPreservedAIRegion;
7940 IsInPreservedAIRegion = true;
7941 Value *FieldAddr = IsBitField ? EmitLValue(E: Arg).getRawBitFieldPointer(CGF&: *this)
7942 : EmitLValue(E: Arg).emitRawPointer(CGF&: *this);
7943 IsInPreservedAIRegion = OldIsInPreservedAIRegion;
7944
7945 ConstantInt *C = cast<ConstantInt>(Val: EmitScalarExpr(E: E->getArg(Arg: 1)));
7946 Value *InfoKind = ConstantInt::get(Ty: Int64Ty, V: C->getSExtValue());
7947
    // Build the IR for the preserve_field_info intrinsic.
7949 llvm::Function *FnGetFieldInfo = Intrinsic::getOrInsertDeclaration(
7950 M: &CGM.getModule(), id: Intrinsic::bpf_preserve_field_info,
7951 Tys: {FieldAddr->getType()});
7952 return Builder.CreateCall(Callee: FnGetFieldInfo, Args: {FieldAddr, InfoKind});
7953 }
7954 case BPF::BI__builtin_btf_type_id:
7955 case BPF::BI__builtin_preserve_type_info: {
7956 if (!getDebugInfo()) {
7957 CGM.Error(loc: E->getExprLoc(), error: "using builtin function without -g");
7958 return nullptr;
7959 }
7960
7961 const Expr *Arg0 = E->getArg(Arg: 0);
7962 llvm::DIType *DbgInfo = getDebugInfo()->getOrCreateStandaloneType(
7963 Ty: Arg0->getType(), Loc: Arg0->getExprLoc());
7964
7965 ConstantInt *Flag = cast<ConstantInt>(Val: EmitScalarExpr(E: E->getArg(Arg: 1)));
7966 Value *FlagValue = ConstantInt::get(Ty: Int64Ty, V: Flag->getSExtValue());
7967 Value *SeqNumVal = ConstantInt::get(Ty: Int32Ty, V: BuiltinSeqNum++);
7968
7969 llvm::Function *FnDecl;
7970 if (BuiltinID == BPF::BI__builtin_btf_type_id)
7971 FnDecl = Intrinsic::getOrInsertDeclaration(
7972 M: &CGM.getModule(), id: Intrinsic::bpf_btf_type_id, Tys: {});
7973 else
7974 FnDecl = Intrinsic::getOrInsertDeclaration(
7975 M: &CGM.getModule(), id: Intrinsic::bpf_preserve_type_info, Tys: {});
7976 CallInst *Fn = Builder.CreateCall(Callee: FnDecl, Args: {SeqNumVal, FlagValue});
7977 Fn->setMetadata(KindID: LLVMContext::MD_preserve_access_index, Node: DbgInfo);
7978 return Fn;
7979 }
7980 case BPF::BI__builtin_preserve_enum_value: {
7981 if (!getDebugInfo()) {
7982 CGM.Error(loc: E->getExprLoc(), error: "using builtin function without -g");
7983 return nullptr;
7984 }
7985
7986 const Expr *Arg0 = E->getArg(Arg: 0);
7987 llvm::DIType *DbgInfo = getDebugInfo()->getOrCreateStandaloneType(
7988 Ty: Arg0->getType(), Loc: Arg0->getExprLoc());
7989
7990 // Find enumerator
7991 const auto *UO = cast<UnaryOperator>(Val: Arg0->IgnoreParens());
7992 const auto *CE = cast<CStyleCastExpr>(Val: UO->getSubExpr());
7993 const auto *DR = cast<DeclRefExpr>(Val: CE->getSubExpr());
7994 const auto *Enumerator = cast<EnumConstantDecl>(Val: DR->getDecl());
7995
7996 auto InitVal = Enumerator->getInitVal();
7997 std::string InitValStr;
7998 if (InitVal.isNegative() || InitVal > uint64_t(INT64_MAX))
7999 InitValStr = std::to_string(val: InitVal.getSExtValue());
8000 else
8001 InitValStr = std::to_string(val: InitVal.getZExtValue());
8002 std::string EnumStr = Enumerator->getNameAsString() + ":" + InitValStr;
8003 Value *EnumStrVal = Builder.CreateGlobalString(Str: EnumStr);
8004
8005 ConstantInt *Flag = cast<ConstantInt>(Val: EmitScalarExpr(E: E->getArg(Arg: 1)));
8006 Value *FlagValue = ConstantInt::get(Ty: Int64Ty, V: Flag->getSExtValue());
8007 Value *SeqNumVal = ConstantInt::get(Ty: Int32Ty, V: BuiltinSeqNum++);
8008
8009 llvm::Function *IntrinsicFn = Intrinsic::getOrInsertDeclaration(
8010 M: &CGM.getModule(), id: Intrinsic::bpf_preserve_enum_value, Tys: {});
8011 CallInst *Fn =
8012 Builder.CreateCall(Callee: IntrinsicFn, Args: {SeqNumVal, EnumStrVal, FlagValue});
8013 Fn->setMetadata(KindID: LLVMContext::MD_preserve_access_index, Node: DbgInfo);
8014 return Fn;
8015 }
8016 }
8017}
8018
8019llvm::Value *CodeGenFunction::
8020BuildVector(ArrayRef<llvm::Value*> Ops) {
8021 assert((Ops.size() & (Ops.size() - 1)) == 0 &&
8022 "Not a power-of-two sized vector!");
8023 bool AllConstants = true;
8024 for (unsigned i = 0, e = Ops.size(); i != e && AllConstants; ++i)
8025 AllConstants &= isa<Constant>(Val: Ops[i]);
8026
8027 // If this is a constant vector, create a ConstantVector.
8028 if (AllConstants) {
8029 SmallVector<llvm::Constant*, 16> CstOps;
8030 for (llvm::Value *Op : Ops)
8031 CstOps.push_back(Elt: cast<Constant>(Val: Op));
8032 return llvm::ConstantVector::get(V: CstOps);
8033 }
8034
8035 // Otherwise, insertelement the values to build the vector.
8036 Value *Result = llvm::PoisonValue::get(
8037 T: llvm::FixedVectorType::get(ElementType: Ops[0]->getType(), NumElts: Ops.size()));
8038
8039 for (unsigned i = 0, e = Ops.size(); i != e; ++i)
8040 Result = Builder.CreateInsertElement(Vec: Result, NewElt: Ops[i], Idx: Builder.getInt64(C: i));
8041
8042 return Result;
8043}
8044
8045Value *CodeGenFunction::EmitAArch64CpuInit() {
8046 llvm::FunctionType *FTy = llvm::FunctionType::get(Result: VoidTy, isVarArg: false);
8047 llvm::FunctionCallee Func =
8048 CGM.CreateRuntimeFunction(Ty: FTy, Name: "__init_cpu_features_resolver");
8049 cast<llvm::GlobalValue>(Val: Func.getCallee())->setDSOLocal(true);
8050 cast<llvm::GlobalValue>(Val: Func.getCallee())
8051 ->setDLLStorageClass(llvm::GlobalValue::DefaultStorageClass);
8052 return Builder.CreateCall(Callee: Func);
8053}
8054
8055Value *CodeGenFunction::EmitAArch64CpuSupports(const CallExpr *E) {
8056 const Expr *ArgExpr = E->getArg(Arg: 0)->IgnoreParenCasts();
8057 StringRef ArgStr = cast<StringLiteral>(Val: ArgExpr)->getString();
8058 llvm::SmallVector<StringRef, 8> Features;
8059 ArgStr.split(A&: Features, Separator: "+");
8060 for (auto &Feature : Features) {
8061 Feature = Feature.trim();
8062 if (!llvm::AArch64::parseFMVExtension(Extension: Feature))
8063 return Builder.getFalse();
8064 if (Feature != "default")
8065 Features.push_back(Elt: Feature);
8066 }
8067 return EmitAArch64CpuSupports(FeatureStrs: Features);
8068}
8069
8070llvm::Value *
8071CodeGenFunction::EmitAArch64CpuSupports(ArrayRef<StringRef> FeaturesStrs) {
8072 llvm::APInt FeaturesMask = llvm::AArch64::getCpuSupportsMask(Features: FeaturesStrs);
8073 Value *Result = Builder.getTrue();
8074 if (FeaturesMask != 0) {
    // Get the feature bits from the structure defined by the runtime library:
8076 // struct {
8077 // unsigned long long features;
8078 // } __aarch64_cpu_features;
8079 llvm::Type *STy = llvm::StructType::get(elt1: Int64Ty);
8080 llvm::Constant *AArch64CPUFeatures =
8081 CGM.CreateRuntimeVariable(Ty: STy, Name: "__aarch64_cpu_features");
8082 cast<llvm::GlobalValue>(Val: AArch64CPUFeatures)->setDSOLocal(true);
8083 llvm::Value *CpuFeatures = Builder.CreateGEP(
8084 Ty: STy, Ptr: AArch64CPUFeatures,
8085 IdxList: {ConstantInt::get(Ty: Int32Ty, V: 0), ConstantInt::get(Ty: Int32Ty, V: 0)});
8086 Value *Features = Builder.CreateAlignedLoad(Ty: Int64Ty, Addr: CpuFeatures,
8087 Align: CharUnits::fromQuantity(Quantity: 8));
8088 Value *Mask = Builder.getInt(AI: FeaturesMask.trunc(width: 64));
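    // All requested features are present iff (features & mask) == mask.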
8089 Value *Bitset = Builder.CreateAnd(LHS: Features, RHS: Mask);
8090 Value *Cmp = Builder.CreateICmpEQ(LHS: Bitset, RHS: Mask);
8091 Result = Builder.CreateAnd(LHS: Result, RHS: Cmp);
8092 }
8093 return Result;
8094}
8095