//===---------- ARM.cpp - Emit LLVM Code for builtins ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This contains code to emit Builtin calls as LLVM code.
//
//===----------------------------------------------------------------------===//

#include "ABIInfo.h"
#include "CGBuiltin.h"
#include "CGDebugInfo.h"
#include "TargetInfo.h"
#include "clang/Basic/AArch64CodeGenUtils.h"
#include "clang/Basic/TargetBuiltins.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/IR/IntrinsicsARM.h"
#include "llvm/IR/IntrinsicsBPF.h"
#include "llvm/TargetParser/AArch64TargetParser.h"

#include <numeric>

using namespace clang;
using namespace CodeGen;
using namespace llvm;
using namespace clang::aarch64;

static std::optional<CodeGenFunction::MSVCIntrin>
translateAarch64ToMsvcIntrin(unsigned BuiltinID) {
  using MSVCIntrin = CodeGenFunction::MSVCIntrin;
  switch (BuiltinID) {
  default:
    return std::nullopt;
  case clang::AArch64::BI_BitScanForward:
  case clang::AArch64::BI_BitScanForward64:
    return MSVCIntrin::_BitScanForward;
  case clang::AArch64::BI_BitScanReverse:
  case clang::AArch64::BI_BitScanReverse64:
    return MSVCIntrin::_BitScanReverse;
  case clang::AArch64::BI_InterlockedAnd64:
    return MSVCIntrin::_InterlockedAnd;
  case clang::AArch64::BI_InterlockedExchange64:
    return MSVCIntrin::_InterlockedExchange;
  case clang::AArch64::BI_InterlockedExchangeAdd64:
    return MSVCIntrin::_InterlockedExchangeAdd;
  case clang::AArch64::BI_InterlockedExchangeSub64:
    return MSVCIntrin::_InterlockedExchangeSub;
  case clang::AArch64::BI_InterlockedOr64:
    return MSVCIntrin::_InterlockedOr;
  case clang::AArch64::BI_InterlockedXor64:
    return MSVCIntrin::_InterlockedXor;
  case clang::AArch64::BI_InterlockedDecrement64:
    return MSVCIntrin::_InterlockedDecrement;
  case clang::AArch64::BI_InterlockedIncrement64:
    return MSVCIntrin::_InterlockedIncrement;
  case clang::AArch64::BI_InterlockedExchangeAdd8_acq:
  case clang::AArch64::BI_InterlockedExchangeAdd16_acq:
  case clang::AArch64::BI_InterlockedExchangeAdd_acq:
  case clang::AArch64::BI_InterlockedExchangeAdd64_acq:
    return MSVCIntrin::_InterlockedExchangeAdd_acq;
  case clang::AArch64::BI_InterlockedExchangeAdd8_rel:
  case clang::AArch64::BI_InterlockedExchangeAdd16_rel:
  case clang::AArch64::BI_InterlockedExchangeAdd_rel:
  case clang::AArch64::BI_InterlockedExchangeAdd64_rel:
    return MSVCIntrin::_InterlockedExchangeAdd_rel;
  case clang::AArch64::BI_InterlockedExchangeAdd8_nf:
  case clang::AArch64::BI_InterlockedExchangeAdd16_nf:
  case clang::AArch64::BI_InterlockedExchangeAdd_nf:
  case clang::AArch64::BI_InterlockedExchangeAdd64_nf:
    return MSVCIntrin::_InterlockedExchangeAdd_nf;
  case clang::AArch64::BI_InterlockedExchange8_acq:
  case clang::AArch64::BI_InterlockedExchange16_acq:
  case clang::AArch64::BI_InterlockedExchange_acq:
  case clang::AArch64::BI_InterlockedExchange64_acq:
  case clang::AArch64::BI_InterlockedExchangePointer_acq:
    return MSVCIntrin::_InterlockedExchange_acq;
  case clang::AArch64::BI_InterlockedExchange8_rel:
  case clang::AArch64::BI_InterlockedExchange16_rel:
  case clang::AArch64::BI_InterlockedExchange_rel:
  case clang::AArch64::BI_InterlockedExchange64_rel:
  case clang::AArch64::BI_InterlockedExchangePointer_rel:
    return MSVCIntrin::_InterlockedExchange_rel;
  case clang::AArch64::BI_InterlockedExchange8_nf:
  case clang::AArch64::BI_InterlockedExchange16_nf:
  case clang::AArch64::BI_InterlockedExchange_nf:
  case clang::AArch64::BI_InterlockedExchange64_nf:
  case clang::AArch64::BI_InterlockedExchangePointer_nf:
    return MSVCIntrin::_InterlockedExchange_nf;
  case clang::AArch64::BI_InterlockedCompareExchange8_acq:
  case clang::AArch64::BI_InterlockedCompareExchange16_acq:
  case clang::AArch64::BI_InterlockedCompareExchange_acq:
  case clang::AArch64::BI_InterlockedCompareExchange64_acq:
  case clang::AArch64::BI_InterlockedCompareExchangePointer_acq:
    return MSVCIntrin::_InterlockedCompareExchange_acq;
  case clang::AArch64::BI_InterlockedCompareExchange8_rel:
  case clang::AArch64::BI_InterlockedCompareExchange16_rel:
  case clang::AArch64::BI_InterlockedCompareExchange_rel:
  case clang::AArch64::BI_InterlockedCompareExchange64_rel:
  case clang::AArch64::BI_InterlockedCompareExchangePointer_rel:
    return MSVCIntrin::_InterlockedCompareExchange_rel;
  case clang::AArch64::BI_InterlockedCompareExchange8_nf:
  case clang::AArch64::BI_InterlockedCompareExchange16_nf:
  case clang::AArch64::BI_InterlockedCompareExchange_nf:
  case clang::AArch64::BI_InterlockedCompareExchange64_nf:
    return MSVCIntrin::_InterlockedCompareExchange_nf;
  case clang::AArch64::BI_InterlockedCompareExchange128:
    return MSVCIntrin::_InterlockedCompareExchange128;
  case clang::AArch64::BI_InterlockedCompareExchange128_acq:
    return MSVCIntrin::_InterlockedCompareExchange128_acq;
  case clang::AArch64::BI_InterlockedCompareExchange128_nf:
    return MSVCIntrin::_InterlockedCompareExchange128_nf;
  case clang::AArch64::BI_InterlockedCompareExchange128_rel:
    return MSVCIntrin::_InterlockedCompareExchange128_rel;
  case clang::AArch64::BI_InterlockedOr8_acq:
  case clang::AArch64::BI_InterlockedOr16_acq:
  case clang::AArch64::BI_InterlockedOr_acq:
  case clang::AArch64::BI_InterlockedOr64_acq:
    return MSVCIntrin::_InterlockedOr_acq;
  case clang::AArch64::BI_InterlockedOr8_rel:
  case clang::AArch64::BI_InterlockedOr16_rel:
  case clang::AArch64::BI_InterlockedOr_rel:
  case clang::AArch64::BI_InterlockedOr64_rel:
    return MSVCIntrin::_InterlockedOr_rel;
  case clang::AArch64::BI_InterlockedOr8_nf:
  case clang::AArch64::BI_InterlockedOr16_nf:
  case clang::AArch64::BI_InterlockedOr_nf:
  case clang::AArch64::BI_InterlockedOr64_nf:
    return MSVCIntrin::_InterlockedOr_nf;
  case clang::AArch64::BI_InterlockedXor8_acq:
  case clang::AArch64::BI_InterlockedXor16_acq:
  case clang::AArch64::BI_InterlockedXor_acq:
  case clang::AArch64::BI_InterlockedXor64_acq:
    return MSVCIntrin::_InterlockedXor_acq;
  case clang::AArch64::BI_InterlockedXor8_rel:
  case clang::AArch64::BI_InterlockedXor16_rel:
  case clang::AArch64::BI_InterlockedXor_rel:
  case clang::AArch64::BI_InterlockedXor64_rel:
    return MSVCIntrin::_InterlockedXor_rel;
  case clang::AArch64::BI_InterlockedXor8_nf:
  case clang::AArch64::BI_InterlockedXor16_nf:
  case clang::AArch64::BI_InterlockedXor_nf:
  case clang::AArch64::BI_InterlockedXor64_nf:
    return MSVCIntrin::_InterlockedXor_nf;
  case clang::AArch64::BI_InterlockedAnd8_acq:
  case clang::AArch64::BI_InterlockedAnd16_acq:
  case clang::AArch64::BI_InterlockedAnd_acq:
  case clang::AArch64::BI_InterlockedAnd64_acq:
    return MSVCIntrin::_InterlockedAnd_acq;
  case clang::AArch64::BI_InterlockedAnd8_rel:
  case clang::AArch64::BI_InterlockedAnd16_rel:
  case clang::AArch64::BI_InterlockedAnd_rel:
  case clang::AArch64::BI_InterlockedAnd64_rel:
    return MSVCIntrin::_InterlockedAnd_rel;
  case clang::AArch64::BI_InterlockedAnd8_nf:
  case clang::AArch64::BI_InterlockedAnd16_nf:
  case clang::AArch64::BI_InterlockedAnd_nf:
  case clang::AArch64::BI_InterlockedAnd64_nf:
    return MSVCIntrin::_InterlockedAnd_nf;
  case clang::AArch64::BI_InterlockedIncrement16_acq:
  case clang::AArch64::BI_InterlockedIncrement_acq:
  case clang::AArch64::BI_InterlockedIncrement64_acq:
    return MSVCIntrin::_InterlockedIncrement_acq;
  case clang::AArch64::BI_InterlockedIncrement16_rel:
  case clang::AArch64::BI_InterlockedIncrement_rel:
  case clang::AArch64::BI_InterlockedIncrement64_rel:
    return MSVCIntrin::_InterlockedIncrement_rel;
  case clang::AArch64::BI_InterlockedIncrement16_nf:
  case clang::AArch64::BI_InterlockedIncrement_nf:
  case clang::AArch64::BI_InterlockedIncrement64_nf:
    return MSVCIntrin::_InterlockedIncrement_nf;
  case clang::AArch64::BI_InterlockedDecrement16_acq:
  case clang::AArch64::BI_InterlockedDecrement_acq:
  case clang::AArch64::BI_InterlockedDecrement64_acq:
    return MSVCIntrin::_InterlockedDecrement_acq;
  case clang::AArch64::BI_InterlockedDecrement16_rel:
  case clang::AArch64::BI_InterlockedDecrement_rel:
  case clang::AArch64::BI_InterlockedDecrement64_rel:
    return MSVCIntrin::_InterlockedDecrement_rel;
  case clang::AArch64::BI_InterlockedDecrement16_nf:
  case clang::AArch64::BI_InterlockedDecrement_nf:
  case clang::AArch64::BI_InterlockedDecrement64_nf:
    return MSVCIntrin::_InterlockedDecrement_nf;
  }
  llvm_unreachable("must return from switch");
}

static std::optional<CodeGenFunction::MSVCIntrin>
translateArmToMsvcIntrin(unsigned BuiltinID) {
  using MSVCIntrin = CodeGenFunction::MSVCIntrin;
  switch (BuiltinID) {
  default:
    return std::nullopt;
  case clang::ARM::BI_BitScanForward:
  case clang::ARM::BI_BitScanForward64:
    return MSVCIntrin::_BitScanForward;
  case clang::ARM::BI_BitScanReverse:
  case clang::ARM::BI_BitScanReverse64:
    return MSVCIntrin::_BitScanReverse;
  case clang::ARM::BI_InterlockedAnd64:
    return MSVCIntrin::_InterlockedAnd;
  case clang::ARM::BI_InterlockedExchange64:
    return MSVCIntrin::_InterlockedExchange;
  case clang::ARM::BI_InterlockedExchangeAdd64:
    return MSVCIntrin::_InterlockedExchangeAdd;
  case clang::ARM::BI_InterlockedExchangeSub64:
    return MSVCIntrin::_InterlockedExchangeSub;
  case clang::ARM::BI_InterlockedOr64:
    return MSVCIntrin::_InterlockedOr;
  case clang::ARM::BI_InterlockedXor64:
    return MSVCIntrin::_InterlockedXor;
  case clang::ARM::BI_InterlockedDecrement64:
    return MSVCIntrin::_InterlockedDecrement;
  case clang::ARM::BI_InterlockedIncrement64:
    return MSVCIntrin::_InterlockedIncrement;
  case clang::ARM::BI_InterlockedExchangeAdd8_acq:
  case clang::ARM::BI_InterlockedExchangeAdd16_acq:
  case clang::ARM::BI_InterlockedExchangeAdd_acq:
  case clang::ARM::BI_InterlockedExchangeAdd64_acq:
    return MSVCIntrin::_InterlockedExchangeAdd_acq;
  case clang::ARM::BI_InterlockedExchangeAdd8_rel:
  case clang::ARM::BI_InterlockedExchangeAdd16_rel:
  case clang::ARM::BI_InterlockedExchangeAdd_rel:
  case clang::ARM::BI_InterlockedExchangeAdd64_rel:
    return MSVCIntrin::_InterlockedExchangeAdd_rel;
  case clang::ARM::BI_InterlockedExchangeAdd8_nf:
  case clang::ARM::BI_InterlockedExchangeAdd16_nf:
  case clang::ARM::BI_InterlockedExchangeAdd_nf:
  case clang::ARM::BI_InterlockedExchangeAdd64_nf:
    return MSVCIntrin::_InterlockedExchangeAdd_nf;
  case clang::ARM::BI_InterlockedExchange8_acq:
  case clang::ARM::BI_InterlockedExchange16_acq:
  case clang::ARM::BI_InterlockedExchange_acq:
  case clang::ARM::BI_InterlockedExchange64_acq:
  case clang::ARM::BI_InterlockedExchangePointer_acq:
    return MSVCIntrin::_InterlockedExchange_acq;
  case clang::ARM::BI_InterlockedExchange8_rel:
  case clang::ARM::BI_InterlockedExchange16_rel:
  case clang::ARM::BI_InterlockedExchange_rel:
  case clang::ARM::BI_InterlockedExchange64_rel:
  case clang::ARM::BI_InterlockedExchangePointer_rel:
    return MSVCIntrin::_InterlockedExchange_rel;
  case clang::ARM::BI_InterlockedExchange8_nf:
  case clang::ARM::BI_InterlockedExchange16_nf:
  case clang::ARM::BI_InterlockedExchange_nf:
  case clang::ARM::BI_InterlockedExchange64_nf:
  case clang::ARM::BI_InterlockedExchangePointer_nf:
    return MSVCIntrin::_InterlockedExchange_nf;
  case clang::ARM::BI_InterlockedCompareExchange8_acq:
  case clang::ARM::BI_InterlockedCompareExchange16_acq:
  case clang::ARM::BI_InterlockedCompareExchange_acq:
  case clang::ARM::BI_InterlockedCompareExchange64_acq:
  case clang::ARM::BI_InterlockedCompareExchangePointer_acq:
    return MSVCIntrin::_InterlockedCompareExchange_acq;
  case clang::ARM::BI_InterlockedCompareExchange8_rel:
  case clang::ARM::BI_InterlockedCompareExchange16_rel:
  case clang::ARM::BI_InterlockedCompareExchange_rel:
  case clang::ARM::BI_InterlockedCompareExchange64_rel:
  case clang::ARM::BI_InterlockedCompareExchangePointer_rel:
    return MSVCIntrin::_InterlockedCompareExchange_rel;
  case clang::ARM::BI_InterlockedCompareExchange8_nf:
  case clang::ARM::BI_InterlockedCompareExchange16_nf:
  case clang::ARM::BI_InterlockedCompareExchange_nf:
  case clang::ARM::BI_InterlockedCompareExchange64_nf:
    return MSVCIntrin::_InterlockedCompareExchange_nf;
  case clang::ARM::BI_InterlockedOr8_acq:
  case clang::ARM::BI_InterlockedOr16_acq:
  case clang::ARM::BI_InterlockedOr_acq:
  case clang::ARM::BI_InterlockedOr64_acq:
    return MSVCIntrin::_InterlockedOr_acq;
  case clang::ARM::BI_InterlockedOr8_rel:
  case clang::ARM::BI_InterlockedOr16_rel:
  case clang::ARM::BI_InterlockedOr_rel:
  case clang::ARM::BI_InterlockedOr64_rel:
    return MSVCIntrin::_InterlockedOr_rel;
  case clang::ARM::BI_InterlockedOr8_nf:
  case clang::ARM::BI_InterlockedOr16_nf:
  case clang::ARM::BI_InterlockedOr_nf:
  case clang::ARM::BI_InterlockedOr64_nf:
    return MSVCIntrin::_InterlockedOr_nf;
  case clang::ARM::BI_InterlockedXor8_acq:
  case clang::ARM::BI_InterlockedXor16_acq:
  case clang::ARM::BI_InterlockedXor_acq:
  case clang::ARM::BI_InterlockedXor64_acq:
    return MSVCIntrin::_InterlockedXor_acq;
  case clang::ARM::BI_InterlockedXor8_rel:
  case clang::ARM::BI_InterlockedXor16_rel:
  case clang::ARM::BI_InterlockedXor_rel:
  case clang::ARM::BI_InterlockedXor64_rel:
    return MSVCIntrin::_InterlockedXor_rel;
  case clang::ARM::BI_InterlockedXor8_nf:
  case clang::ARM::BI_InterlockedXor16_nf:
  case clang::ARM::BI_InterlockedXor_nf:
  case clang::ARM::BI_InterlockedXor64_nf:
    return MSVCIntrin::_InterlockedXor_nf;
  case clang::ARM::BI_InterlockedAnd8_acq:
  case clang::ARM::BI_InterlockedAnd16_acq:
  case clang::ARM::BI_InterlockedAnd_acq:
  case clang::ARM::BI_InterlockedAnd64_acq:
    return MSVCIntrin::_InterlockedAnd_acq;
  case clang::ARM::BI_InterlockedAnd8_rel:
  case clang::ARM::BI_InterlockedAnd16_rel:
  case clang::ARM::BI_InterlockedAnd_rel:
  case clang::ARM::BI_InterlockedAnd64_rel:
    return MSVCIntrin::_InterlockedAnd_rel;
  case clang::ARM::BI_InterlockedAnd8_nf:
  case clang::ARM::BI_InterlockedAnd16_nf:
  case clang::ARM::BI_InterlockedAnd_nf:
  case clang::ARM::BI_InterlockedAnd64_nf:
    return MSVCIntrin::_InterlockedAnd_nf;
  case clang::ARM::BI_InterlockedIncrement16_acq:
  case clang::ARM::BI_InterlockedIncrement_acq:
  case clang::ARM::BI_InterlockedIncrement64_acq:
    return MSVCIntrin::_InterlockedIncrement_acq;
  case clang::ARM::BI_InterlockedIncrement16_rel:
  case clang::ARM::BI_InterlockedIncrement_rel:
  case clang::ARM::BI_InterlockedIncrement64_rel:
    return MSVCIntrin::_InterlockedIncrement_rel;
  case clang::ARM::BI_InterlockedIncrement16_nf:
  case clang::ARM::BI_InterlockedIncrement_nf:
  case clang::ARM::BI_InterlockedIncrement64_nf:
    return MSVCIntrin::_InterlockedIncrement_nf;
  case clang::ARM::BI_InterlockedDecrement16_acq:
  case clang::ARM::BI_InterlockedDecrement_acq:
  case clang::ARM::BI_InterlockedDecrement64_acq:
    return MSVCIntrin::_InterlockedDecrement_acq;
  case clang::ARM::BI_InterlockedDecrement16_rel:
  case clang::ARM::BI_InterlockedDecrement_rel:
  case clang::ARM::BI_InterlockedDecrement64_rel:
    return MSVCIntrin::_InterlockedDecrement_rel;
  case clang::ARM::BI_InterlockedDecrement16_nf:
  case clang::ARM::BI_InterlockedDecrement_nf:
  case clang::ARM::BI_InterlockedDecrement64_nf:
    return MSVCIntrin::_InterlockedDecrement_nf;
  }
  llvm_unreachable("must return from switch");
}

// Emit an intrinsic where all operands are of the same type as the result.
// Depending on mode, this may be a constrained floating-point intrinsic.
static Value *emitCallMaybeConstrainedFPBuiltin(CodeGenFunction &CGF,
                                                unsigned IntrinsicID,
                                                unsigned ConstrainedIntrinsicID,
                                                llvm::Type *Ty,
                                                ArrayRef<Value *> Args) {
  Function *F;
  if (CGF.Builder.getIsFPConstrained())
    F = CGF.CGM.getIntrinsic(ConstrainedIntrinsicID, Ty);
  else
    F = CGF.CGM.getIntrinsic(IntrinsicID, Ty);

  if (CGF.Builder.getIsFPConstrained())
    return CGF.Builder.CreateConstrainedFPCall(F, Args);

  return CGF.Builder.CreateCall(F, Args);
}

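// Map a NeonTypeFlags descriptor to the corresponding fixed-width LLVM vector
// type. A few illustrative mappings (not exhaustive):
//   Int8,  !quad -> <8 x i8>      Int8,    quad -> <16 x i8>
//   Int32, !quad -> <2 x i32>     Float64, quad -> <2 x double>
// Float16 and BFloat16 degrade to the equally sized integer vector when the
// target lacks fast half/bfloat support, and V1Ty requests a single-element
// vector.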
static llvm::FixedVectorType *GetNeonType(CodeGenFunction *CGF,
                                          NeonTypeFlags TypeFlags,
                                          bool HasFastHalfType = true,
                                          bool V1Ty = false,
                                          bool AllowBFloatArgsAndRet = true) {
  int IsQuad = TypeFlags.isQuad();
  switch (TypeFlags.getEltType()) {
  case NeonTypeFlags::Int8:
  case NeonTypeFlags::Poly8:
  case NeonTypeFlags::MFloat8:
    return llvm::FixedVectorType::get(CGF->Int8Ty, V1Ty ? 1 : (8 << IsQuad));
  case NeonTypeFlags::Int16:
  case NeonTypeFlags::Poly16:
    return llvm::FixedVectorType::get(CGF->Int16Ty, V1Ty ? 1 : (4 << IsQuad));
  case NeonTypeFlags::BFloat16:
    if (AllowBFloatArgsAndRet)
      return llvm::FixedVectorType::get(CGF->BFloatTy, V1Ty ? 1 : (4 << IsQuad));
    return llvm::FixedVectorType::get(CGF->Int16Ty, V1Ty ? 1 : (4 << IsQuad));
  case NeonTypeFlags::Float16:
    if (HasFastHalfType)
      return llvm::FixedVectorType::get(CGF->HalfTy, V1Ty ? 1 : (4 << IsQuad));
    return llvm::FixedVectorType::get(CGF->Int16Ty, V1Ty ? 1 : (4 << IsQuad));
  case NeonTypeFlags::Int32:
    return llvm::FixedVectorType::get(CGF->Int32Ty, V1Ty ? 1 : (2 << IsQuad));
  case NeonTypeFlags::Int64:
  case NeonTypeFlags::Poly64:
    return llvm::FixedVectorType::get(CGF->Int64Ty, V1Ty ? 1 : (1 << IsQuad));
  case NeonTypeFlags::Poly128:
    // FIXME: i128 and f128 are not fully supported in Clang and LLVM; much of
    // the i128/f128 API is missing, so we represent poly128 as v16i8 and rely
    // on pattern matching.
    return llvm::FixedVectorType::get(CGF->Int8Ty, 16);
  case NeonTypeFlags::Float32:
    return llvm::FixedVectorType::get(CGF->FloatTy, V1Ty ? 1 : (2 << IsQuad));
  case NeonTypeFlags::Float64:
    return llvm::FixedVectorType::get(CGF->DoubleTy, V1Ty ? 1 : (1 << IsQuad));
  }
  llvm_unreachable("Unknown vector element type!");
}

static llvm::VectorType *GetFloatNeonType(CodeGenFunction *CGF,
                                          NeonTypeFlags IntTypeFlags) {
  int IsQuad = IntTypeFlags.isQuad();
  switch (IntTypeFlags.getEltType()) {
  case NeonTypeFlags::Int16:
    return llvm::FixedVectorType::get(CGF->HalfTy, (4 << IsQuad));
  case NeonTypeFlags::Int32:
    return llvm::FixedVectorType::get(CGF->FloatTy, (2 << IsQuad));
  case NeonTypeFlags::Int64:
    return llvm::FixedVectorType::get(CGF->DoubleTy, (1 << IsQuad));
  default:
    llvm_unreachable("Type can't be converted to floating-point!");
  }
}

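// Broadcast lane C of V across Count lanes: build the constant splat mask for
// lane C and shuffle V with it. For example, splatting lane 1 of a <4 x i32>
// uses the mask <1, 1, 1, 1>. The second overload reuses V's own element
// count.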
Value *CodeGenFunction::EmitNeonSplat(Value *V, Constant *C,
                                      const ElementCount &Count) {
  Value *SV = llvm::ConstantVector::getSplat(Count, C);
  return Builder.CreateShuffleVector(V, V, SV, "lane");
}

Value *CodeGenFunction::EmitNeonSplat(Value *V, Constant *C) {
  ElementCount EC = cast<llvm::VectorType>(V->getType())->getElementCount();
  return EmitNeonSplat(V, C, EC);
}

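// Shared call emitter for NEON intrinsics. Each operand is bitcast to the
// parameter type expected by the intrinsic declaration, except the operand at
// position `shift` (when nonzero), which is materialized as a constant
// shift-vector, negated first for right shifts. Constrained-FP intrinsics
// skip their trailing metadata parameters and are emitted as constrained
// calls.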
Value *CodeGenFunction::EmitNeonCall(Function *F, SmallVectorImpl<Value*> &Ops,
                                     const char *name,
                                     unsigned shift, bool rightshift) {
  unsigned j = 0;
  for (Function::const_arg_iterator ai = F->arg_begin(), ae = F->arg_end();
       ai != ae; ++ai, ++j) {
    if (F->isConstrainedFPIntrinsic())
      if (ai->getType()->isMetadataTy())
        continue;
    if (shift > 0 && shift == j)
      Ops[j] = EmitNeonShiftVector(Ops[j], ai->getType(), rightshift);
    else
      Ops[j] = Builder.CreateBitCast(Ops[j], ai->getType(), name);
  }

  if (F->isConstrainedFPIntrinsic())
    return Builder.CreateConstrainedFPCall(F, Ops, name);
  return Builder.CreateCall(F, Ops, name);
}

Value *CodeGenFunction::EmitFP8NeonCall(unsigned IID,
                                        ArrayRef<llvm::Type *> Tys,
                                        SmallVectorImpl<Value *> &Ops,
                                        const CallExpr *E, const char *name) {
  Builder.CreateCall(CGM.getIntrinsic(Intrinsic::aarch64_set_fpmr),
                     Ops.pop_back_val());
  return EmitNeonCall(CGM.getIntrinsic(IID, Tys), Ops, name);
}

llvm::Value *CodeGenFunction::EmitFP8NeonFDOTCall(
    unsigned IID, bool ExtendLaneArg, llvm::Type *RetTy,
    SmallVectorImpl<llvm::Value *> &Ops, const CallExpr *E, const char *name) {

  const unsigned ElemCount = Ops[0]->getType()->getPrimitiveSizeInBits() /
                             RetTy->getPrimitiveSizeInBits();
  llvm::Type *Tys[] = {llvm::FixedVectorType::get(RetTy, ElemCount),
                       Ops[1]->getType()};
  if (ExtendLaneArg) {
    auto *VT = llvm::FixedVectorType::get(Int8Ty, 16);
    Ops[2] = Builder.CreateInsertVector(VT, PoisonValue::get(VT), Ops[2],
                                        uint64_t(0));
  }
  return EmitFP8NeonCall(IID, Tys, Ops, E, name);
}

llvm::Value *CodeGenFunction::EmitFP8NeonFMLACall(
    unsigned IID, bool ExtendLaneArg, llvm::Type *RetTy,
    SmallVectorImpl<llvm::Value *> &Ops, const CallExpr *E, const char *name) {

  if (ExtendLaneArg) {
    auto *VT = llvm::FixedVectorType::get(Int8Ty, 16);
    Ops[2] = Builder.CreateInsertVector(VT, PoisonValue::get(VT), Ops[2],
                                        uint64_t(0));
  }
  const unsigned ElemCount = Ops[0]->getType()->getPrimitiveSizeInBits() /
                             RetTy->getPrimitiveSizeInBits();
  return EmitFP8NeonCall(IID, {llvm::FixedVectorType::get(RetTy, ElemCount)},
                         Ops, E, name);
}

Value *CodeGenFunction::EmitNeonShiftVector(Value *V, llvm::Type *Ty,
                                            bool neg) {
  int SV = cast<ConstantInt>(V)->getSExtValue();
  return ConstantInt::getSigned(Ty, neg ? -SV : SV);
}

Value *CodeGenFunction::EmitFP8NeonCvtCall(unsigned IID, llvm::Type *Ty0,
                                           llvm::Type *Ty1, bool Extract,
                                           SmallVectorImpl<llvm::Value *> &Ops,
                                           const CallExpr *E,
                                           const char *name) {
  llvm::Type *Tys[] = {Ty0, Ty1};
  if (Extract) {
    // Op[0] is mfloat8x16_t, but the intrinsic converts only the lower part of
    // the vector.
    Tys[1] = llvm::FixedVectorType::get(Int8Ty, 8);
    Ops[0] = Builder.CreateExtractVector(Tys[1], Ops[0], uint64_t(0));
  }
  return EmitFP8NeonCall(IID, Tys, Ops, E, name);
}

// Right-shift a vector by a constant.
Value *CodeGenFunction::EmitNeonRShiftImm(Value *Vec, Value *Shift,
                                          llvm::Type *Ty, bool usgn,
                                          const char *name) {
  llvm::VectorType *VTy = cast<llvm::VectorType>(Ty);

  int ShiftAmt = cast<ConstantInt>(Shift)->getSExtValue();
  int EltSize = VTy->getScalarSizeInBits();

  Vec = Builder.CreateBitCast(Vec, Ty);

  // lshr/ashr are undefined when the shift amount is equal to the vector
  // element size.
  if (ShiftAmt == EltSize) {
    if (usgn) {
      // Right-shifting an unsigned value by its size yields 0.
      return llvm::ConstantAggregateZero::get(VTy);
    } else {
      // Right-shifting a signed value by its size is equivalent
      // to a shift of size-1.
      --ShiftAmt;
      Shift = ConstantInt::get(VTy->getElementType(), ShiftAmt);
    }
  }

  Shift = EmitNeonShiftVector(Shift, Ty, false);
  if (usgn)
    return Builder.CreateLShr(Vec, Shift, name);
  return Builder.CreateAShr(Vec, Shift, name);
}

// clang-format off
#define NEONMAP0(NameBase) \
  { #NameBase, NEON::BI__builtin_neon_ ## NameBase, 0, 0, 0 }

#define NEONMAP1(NameBase, LLVMIntrinsic, TypeModifier) \
  { #NameBase, NEON::BI__builtin_neon_ ## NameBase, \
    Intrinsic::LLVMIntrinsic, 0, TypeModifier }

#define NEONMAP2(NameBase, LLVMIntrinsic, AltLLVMIntrinsic, TypeModifier) \
  { #NameBase, NEON::BI__builtin_neon_ ## NameBase, \
    Intrinsic::LLVMIntrinsic, Intrinsic::AltLLVMIntrinsic, TypeModifier }

static const ARMVectorIntrinsicInfo ARMSIMDIntrinsicMap [] = {
  NEONMAP1(__a32_vcvt_bf16_f32, arm_neon_vcvtfp2bf, 0),
  NEONMAP0(splat_lane_v),
  NEONMAP0(splat_laneq_v),
  NEONMAP0(splatq_lane_v),
  NEONMAP0(splatq_laneq_v),
  NEONMAP2(vabd_v, arm_neon_vabdu, arm_neon_vabds, Add1ArgType | UnsignedAlts),
  NEONMAP2(vabdq_v, arm_neon_vabdu, arm_neon_vabds, Add1ArgType | UnsignedAlts),
  NEONMAP1(vabs_v, arm_neon_vabs, 0),
  NEONMAP1(vabsq_v, arm_neon_vabs, 0),
  NEONMAP0(vadd_v),
  NEONMAP0(vaddhn_v),
  NEONMAP0(vaddq_v),
  NEONMAP1(vaesdq_u8, arm_neon_aesd, 0),
  NEONMAP1(vaeseq_u8, arm_neon_aese, 0),
  NEONMAP1(vaesimcq_u8, arm_neon_aesimc, 0),
  NEONMAP1(vaesmcq_u8, arm_neon_aesmc, 0),
  NEONMAP1(vbfdot_f32, arm_neon_bfdot, 0),
  NEONMAP1(vbfdotq_f32, arm_neon_bfdot, 0),
  NEONMAP1(vbfmlalbq_f32, arm_neon_bfmlalb, 0),
  NEONMAP1(vbfmlaltq_f32, arm_neon_bfmlalt, 0),
  NEONMAP1(vbfmmlaq_f32, arm_neon_bfmmla, 0),
  NEONMAP1(vbsl_v, arm_neon_vbsl, AddRetType),
  NEONMAP1(vbslq_v, arm_neon_vbsl, AddRetType),
  NEONMAP1(vcadd_rot270_f16, arm_neon_vcadd_rot270, Add1ArgType),
  NEONMAP1(vcadd_rot270_f32, arm_neon_vcadd_rot270, Add1ArgType),
  NEONMAP1(vcadd_rot90_f16, arm_neon_vcadd_rot90, Add1ArgType),
  NEONMAP1(vcadd_rot90_f32, arm_neon_vcadd_rot90, Add1ArgType),
  NEONMAP1(vcaddq_rot270_f16, arm_neon_vcadd_rot270, Add1ArgType),
  NEONMAP1(vcaddq_rot270_f32, arm_neon_vcadd_rot270, Add1ArgType),
  NEONMAP1(vcaddq_rot270_f64, arm_neon_vcadd_rot270, Add1ArgType),
  NEONMAP1(vcaddq_rot90_f16, arm_neon_vcadd_rot90, Add1ArgType),
  NEONMAP1(vcaddq_rot90_f32, arm_neon_vcadd_rot90, Add1ArgType),
  NEONMAP1(vcaddq_rot90_f64, arm_neon_vcadd_rot90, Add1ArgType),
  NEONMAP1(vcage_v, arm_neon_vacge, 0),
  NEONMAP1(vcageq_v, arm_neon_vacge, 0),
  NEONMAP1(vcagt_v, arm_neon_vacgt, 0),
  NEONMAP1(vcagtq_v, arm_neon_vacgt, 0),
  NEONMAP1(vcale_v, arm_neon_vacge, 0),
  NEONMAP1(vcaleq_v, arm_neon_vacge, 0),
  NEONMAP1(vcalt_v, arm_neon_vacgt, 0),
  NEONMAP1(vcaltq_v, arm_neon_vacgt, 0),
  NEONMAP0(vceqz_v),
  NEONMAP0(vceqzq_v),
  NEONMAP0(vcgez_v),
  NEONMAP0(vcgezq_v),
  NEONMAP0(vcgtz_v),
  NEONMAP0(vcgtzq_v),
  NEONMAP0(vclez_v),
  NEONMAP0(vclezq_v),
  NEONMAP1(vcls_v, arm_neon_vcls, Add1ArgType),
  NEONMAP1(vclsq_v, arm_neon_vcls, Add1ArgType),
  NEONMAP0(vcltz_v),
  NEONMAP0(vcltzq_v),
  NEONMAP1(vclz_v, ctlz, Add1ArgType),
  NEONMAP1(vclzq_v, ctlz, Add1ArgType),
  NEONMAP1(vcnt_v, ctpop, Add1ArgType),
  NEONMAP1(vcntq_v, ctpop, Add1ArgType),
  NEONMAP1(vcvt_f16_f32, arm_neon_vcvtfp2hf, 0),
  NEONMAP0(vcvt_f16_s16),
  NEONMAP0(vcvt_f16_u16),
  NEONMAP1(vcvt_f32_f16, arm_neon_vcvthf2fp, 0),
  NEONMAP0(vcvt_f32_v),
  NEONMAP1(vcvt_n_f16_s16, arm_neon_vcvtfxs2fp, 0),
  NEONMAP1(vcvt_n_f16_u16, arm_neon_vcvtfxu2fp, 0),
  NEONMAP2(vcvt_n_f32_v, arm_neon_vcvtfxu2fp, arm_neon_vcvtfxs2fp, 0),
  NEONMAP1(vcvt_n_s16_f16, arm_neon_vcvtfp2fxs, 0),
  NEONMAP1(vcvt_n_s32_v, arm_neon_vcvtfp2fxs, 0),
  NEONMAP1(vcvt_n_s64_v, arm_neon_vcvtfp2fxs, 0),
  NEONMAP1(vcvt_n_u16_f16, arm_neon_vcvtfp2fxu, 0),
  NEONMAP1(vcvt_n_u32_v, arm_neon_vcvtfp2fxu, 0),
  NEONMAP1(vcvt_n_u64_v, arm_neon_vcvtfp2fxu, 0),
  NEONMAP0(vcvt_s16_f16),
  NEONMAP0(vcvt_s32_v),
  NEONMAP0(vcvt_s64_v),
  NEONMAP0(vcvt_u16_f16),
  NEONMAP0(vcvt_u32_v),
  NEONMAP0(vcvt_u64_v),
  NEONMAP1(vcvta_s16_f16, arm_neon_vcvtas, 0),
  NEONMAP1(vcvta_s32_v, arm_neon_vcvtas, 0),
  NEONMAP1(vcvta_s64_v, arm_neon_vcvtas, 0),
  NEONMAP1(vcvta_u16_f16, arm_neon_vcvtau, 0),
  NEONMAP1(vcvta_u32_v, arm_neon_vcvtau, 0),
  NEONMAP1(vcvta_u64_v, arm_neon_vcvtau, 0),
  NEONMAP1(vcvtaq_s16_f16, arm_neon_vcvtas, 0),
  NEONMAP1(vcvtaq_s32_v, arm_neon_vcvtas, 0),
  NEONMAP1(vcvtaq_s64_v, arm_neon_vcvtas, 0),
  NEONMAP1(vcvtaq_u16_f16, arm_neon_vcvtau, 0),
  NEONMAP1(vcvtaq_u32_v, arm_neon_vcvtau, 0),
  NEONMAP1(vcvtaq_u64_v, arm_neon_vcvtau, 0),
  NEONMAP1(vcvth_bf16_f32, arm_neon_vcvtbfp2bf, 0),
  NEONMAP1(vcvtm_s16_f16, arm_neon_vcvtms, 0),
  NEONMAP1(vcvtm_s32_v, arm_neon_vcvtms, 0),
  NEONMAP1(vcvtm_s64_v, arm_neon_vcvtms, 0),
  NEONMAP1(vcvtm_u16_f16, arm_neon_vcvtmu, 0),
  NEONMAP1(vcvtm_u32_v, arm_neon_vcvtmu, 0),
  NEONMAP1(vcvtm_u64_v, arm_neon_vcvtmu, 0),
  NEONMAP1(vcvtmq_s16_f16, arm_neon_vcvtms, 0),
  NEONMAP1(vcvtmq_s32_v, arm_neon_vcvtms, 0),
  NEONMAP1(vcvtmq_s64_v, arm_neon_vcvtms, 0),
  NEONMAP1(vcvtmq_u16_f16, arm_neon_vcvtmu, 0),
  NEONMAP1(vcvtmq_u32_v, arm_neon_vcvtmu, 0),
  NEONMAP1(vcvtmq_u64_v, arm_neon_vcvtmu, 0),
  NEONMAP1(vcvtn_s16_f16, arm_neon_vcvtns, 0),
  NEONMAP1(vcvtn_s32_v, arm_neon_vcvtns, 0),
  NEONMAP1(vcvtn_s64_v, arm_neon_vcvtns, 0),
  NEONMAP1(vcvtn_u16_f16, arm_neon_vcvtnu, 0),
  NEONMAP1(vcvtn_u32_v, arm_neon_vcvtnu, 0),
  NEONMAP1(vcvtn_u64_v, arm_neon_vcvtnu, 0),
  NEONMAP1(vcvtnq_s16_f16, arm_neon_vcvtns, 0),
  NEONMAP1(vcvtnq_s32_v, arm_neon_vcvtns, 0),
  NEONMAP1(vcvtnq_s64_v, arm_neon_vcvtns, 0),
  NEONMAP1(vcvtnq_u16_f16, arm_neon_vcvtnu, 0),
  NEONMAP1(vcvtnq_u32_v, arm_neon_vcvtnu, 0),
  NEONMAP1(vcvtnq_u64_v, arm_neon_vcvtnu, 0),
  NEONMAP1(vcvtp_s16_f16, arm_neon_vcvtps, 0),
  NEONMAP1(vcvtp_s32_v, arm_neon_vcvtps, 0),
  NEONMAP1(vcvtp_s64_v, arm_neon_vcvtps, 0),
  NEONMAP1(vcvtp_u16_f16, arm_neon_vcvtpu, 0),
  NEONMAP1(vcvtp_u32_v, arm_neon_vcvtpu, 0),
  NEONMAP1(vcvtp_u64_v, arm_neon_vcvtpu, 0),
  NEONMAP1(vcvtpq_s16_f16, arm_neon_vcvtps, 0),
  NEONMAP1(vcvtpq_s32_v, arm_neon_vcvtps, 0),
  NEONMAP1(vcvtpq_s64_v, arm_neon_vcvtps, 0),
  NEONMAP1(vcvtpq_u16_f16, arm_neon_vcvtpu, 0),
  NEONMAP1(vcvtpq_u32_v, arm_neon_vcvtpu, 0),
  NEONMAP1(vcvtpq_u64_v, arm_neon_vcvtpu, 0),
  NEONMAP0(vcvtq_f16_s16),
  NEONMAP0(vcvtq_f16_u16),
  NEONMAP0(vcvtq_f32_v),
  NEONMAP1(vcvtq_n_f16_s16, arm_neon_vcvtfxs2fp, 0),
  NEONMAP1(vcvtq_n_f16_u16, arm_neon_vcvtfxu2fp, 0),
  NEONMAP2(vcvtq_n_f32_v, arm_neon_vcvtfxu2fp, arm_neon_vcvtfxs2fp, 0),
  NEONMAP1(vcvtq_n_s16_f16, arm_neon_vcvtfp2fxs, 0),
  NEONMAP1(vcvtq_n_s32_v, arm_neon_vcvtfp2fxs, 0),
  NEONMAP1(vcvtq_n_s64_v, arm_neon_vcvtfp2fxs, 0),
  NEONMAP1(vcvtq_n_u16_f16, arm_neon_vcvtfp2fxu, 0),
  NEONMAP1(vcvtq_n_u32_v, arm_neon_vcvtfp2fxu, 0),
  NEONMAP1(vcvtq_n_u64_v, arm_neon_vcvtfp2fxu, 0),
  NEONMAP0(vcvtq_s16_f16),
  NEONMAP0(vcvtq_s32_v),
  NEONMAP0(vcvtq_s64_v),
  NEONMAP0(vcvtq_u16_f16),
  NEONMAP0(vcvtq_u32_v),
  NEONMAP0(vcvtq_u64_v),
  NEONMAP1(vdot_s32, arm_neon_sdot, 0),
  NEONMAP1(vdot_u32, arm_neon_udot, 0),
  NEONMAP1(vdotq_s32, arm_neon_sdot, 0),
  NEONMAP1(vdotq_u32, arm_neon_udot, 0),
  NEONMAP0(vext_v),
  NEONMAP0(vextq_v),
  NEONMAP0(vfma_v),
  NEONMAP0(vfmaq_v),
  NEONMAP2(vhadd_v, arm_neon_vhaddu, arm_neon_vhadds, Add1ArgType | UnsignedAlts),
  NEONMAP2(vhaddq_v, arm_neon_vhaddu, arm_neon_vhadds, Add1ArgType | UnsignedAlts),
  NEONMAP2(vhsub_v, arm_neon_vhsubu, arm_neon_vhsubs, Add1ArgType | UnsignedAlts),
  NEONMAP2(vhsubq_v, arm_neon_vhsubu, arm_neon_vhsubs, Add1ArgType | UnsignedAlts),
  NEONMAP0(vld1_dup_v),
  NEONMAP1(vld1_v, arm_neon_vld1, 0),
  NEONMAP1(vld1_x2_v, arm_neon_vld1x2, 0),
  NEONMAP1(vld1_x3_v, arm_neon_vld1x3, 0),
  NEONMAP1(vld1_x4_v, arm_neon_vld1x4, 0),
  NEONMAP0(vld1q_dup_v),
  NEONMAP1(vld1q_v, arm_neon_vld1, 0),
  NEONMAP1(vld1q_x2_v, arm_neon_vld1x2, 0),
  NEONMAP1(vld1q_x3_v, arm_neon_vld1x3, 0),
  NEONMAP1(vld1q_x4_v, arm_neon_vld1x4, 0),
  NEONMAP1(vld2_dup_v, arm_neon_vld2dup, 0),
  NEONMAP1(vld2_lane_v, arm_neon_vld2lane, 0),
  NEONMAP1(vld2_v, arm_neon_vld2, 0),
  NEONMAP1(vld2q_dup_v, arm_neon_vld2dup, 0),
  NEONMAP1(vld2q_lane_v, arm_neon_vld2lane, 0),
  NEONMAP1(vld2q_v, arm_neon_vld2, 0),
  NEONMAP1(vld3_dup_v, arm_neon_vld3dup, 0),
  NEONMAP1(vld3_lane_v, arm_neon_vld3lane, 0),
  NEONMAP1(vld3_v, arm_neon_vld3, 0),
  NEONMAP1(vld3q_dup_v, arm_neon_vld3dup, 0),
  NEONMAP1(vld3q_lane_v, arm_neon_vld3lane, 0),
  NEONMAP1(vld3q_v, arm_neon_vld3, 0),
  NEONMAP1(vld4_dup_v, arm_neon_vld4dup, 0),
  NEONMAP1(vld4_lane_v, arm_neon_vld4lane, 0),
  NEONMAP1(vld4_v, arm_neon_vld4, 0),
  NEONMAP1(vld4q_dup_v, arm_neon_vld4dup, 0),
  NEONMAP1(vld4q_lane_v, arm_neon_vld4lane, 0),
  NEONMAP1(vld4q_v, arm_neon_vld4, 0),
  NEONMAP2(vmax_v, arm_neon_vmaxu, arm_neon_vmaxs, Add1ArgType | UnsignedAlts),
  NEONMAP1(vmaxnm_v, arm_neon_vmaxnm, Add1ArgType),
  NEONMAP1(vmaxnmq_v, arm_neon_vmaxnm, Add1ArgType),
  NEONMAP2(vmaxq_v, arm_neon_vmaxu, arm_neon_vmaxs, Add1ArgType | UnsignedAlts),
  NEONMAP2(vmin_v, arm_neon_vminu, arm_neon_vmins, Add1ArgType | UnsignedAlts),
  NEONMAP1(vminnm_v, arm_neon_vminnm, Add1ArgType),
  NEONMAP1(vminnmq_v, arm_neon_vminnm, Add1ArgType),
  NEONMAP2(vminq_v, arm_neon_vminu, arm_neon_vmins, Add1ArgType | UnsignedAlts),
  NEONMAP1(vmmlaq_s32, arm_neon_smmla, 0),
  NEONMAP1(vmmlaq_u32, arm_neon_ummla, 0),
  NEONMAP0(vmovl_v),
  NEONMAP0(vmovn_v),
  NEONMAP1(vmul_v, arm_neon_vmulp, Add1ArgType),
  NEONMAP0(vmull_v),
  NEONMAP1(vmulq_v, arm_neon_vmulp, Add1ArgType),
  NEONMAP2(vpadal_v, arm_neon_vpadalu, arm_neon_vpadals, UnsignedAlts),
  NEONMAP2(vpadalq_v, arm_neon_vpadalu, arm_neon_vpadals, UnsignedAlts),
  NEONMAP1(vpadd_v, arm_neon_vpadd, Add1ArgType),
  NEONMAP2(vpaddl_v, arm_neon_vpaddlu, arm_neon_vpaddls, UnsignedAlts),
  NEONMAP2(vpaddlq_v, arm_neon_vpaddlu, arm_neon_vpaddls, UnsignedAlts),
  NEONMAP1(vpaddq_v, arm_neon_vpadd, Add1ArgType),
  NEONMAP2(vpmax_v, arm_neon_vpmaxu, arm_neon_vpmaxs, Add1ArgType | UnsignedAlts),
  NEONMAP2(vpmin_v, arm_neon_vpminu, arm_neon_vpmins, Add1ArgType | UnsignedAlts),
  NEONMAP1(vqabs_v, arm_neon_vqabs, Add1ArgType),
  NEONMAP1(vqabsq_v, arm_neon_vqabs, Add1ArgType),
  NEONMAP2(vqadd_v, uadd_sat, sadd_sat, Add1ArgType | UnsignedAlts),
  NEONMAP2(vqaddq_v, uadd_sat, sadd_sat, Add1ArgType | UnsignedAlts),
  NEONMAP2(vqdmlal_v, arm_neon_vqdmull, sadd_sat, 0),
  NEONMAP2(vqdmlsl_v, arm_neon_vqdmull, ssub_sat, 0),
  NEONMAP1(vqdmulh_v, arm_neon_vqdmulh, Add1ArgType),
  NEONMAP1(vqdmulhq_v, arm_neon_vqdmulh, Add1ArgType),
  NEONMAP1(vqdmull_v, arm_neon_vqdmull, Add1ArgType),
  NEONMAP2(vqmovn_v, arm_neon_vqmovnu, arm_neon_vqmovns, Add1ArgType | UnsignedAlts),
  NEONMAP1(vqmovun_v, arm_neon_vqmovnsu, Add1ArgType),
  NEONMAP1(vqneg_v, arm_neon_vqneg, Add1ArgType),
  NEONMAP1(vqnegq_v, arm_neon_vqneg, Add1ArgType),
  NEONMAP1(vqrdmlah_s16, arm_neon_vqrdmlah, Add1ArgType),
  NEONMAP1(vqrdmlah_s32, arm_neon_vqrdmlah, Add1ArgType),
  NEONMAP1(vqrdmlahq_s16, arm_neon_vqrdmlah, Add1ArgType),
  NEONMAP1(vqrdmlahq_s32, arm_neon_vqrdmlah, Add1ArgType),
  NEONMAP1(vqrdmlsh_s16, arm_neon_vqrdmlsh, Add1ArgType),
  NEONMAP1(vqrdmlsh_s32, arm_neon_vqrdmlsh, Add1ArgType),
  NEONMAP1(vqrdmlshq_s16, arm_neon_vqrdmlsh, Add1ArgType),
  NEONMAP1(vqrdmlshq_s32, arm_neon_vqrdmlsh, Add1ArgType),
  NEONMAP1(vqrdmulh_v, arm_neon_vqrdmulh, Add1ArgType),
  NEONMAP1(vqrdmulhq_v, arm_neon_vqrdmulh, Add1ArgType),
  NEONMAP2(vqrshl_v, arm_neon_vqrshiftu, arm_neon_vqrshifts, Add1ArgType | UnsignedAlts),
  NEONMAP2(vqrshlq_v, arm_neon_vqrshiftu, arm_neon_vqrshifts, Add1ArgType | UnsignedAlts),
  NEONMAP2(vqshl_n_v, arm_neon_vqshiftu, arm_neon_vqshifts, UnsignedAlts),
  NEONMAP2(vqshl_v, arm_neon_vqshiftu, arm_neon_vqshifts, Add1ArgType | UnsignedAlts),
  NEONMAP2(vqshlq_n_v, arm_neon_vqshiftu, arm_neon_vqshifts, UnsignedAlts),
  NEONMAP2(vqshlq_v, arm_neon_vqshiftu, arm_neon_vqshifts, Add1ArgType | UnsignedAlts),
  NEONMAP1(vqshlu_n_v, arm_neon_vqshiftsu, 0),
  NEONMAP1(vqshluq_n_v, arm_neon_vqshiftsu, 0),
  NEONMAP2(vqsub_v, usub_sat, ssub_sat, Add1ArgType | UnsignedAlts),
  NEONMAP2(vqsubq_v, usub_sat, ssub_sat, Add1ArgType | UnsignedAlts),
  NEONMAP1(vraddhn_v, arm_neon_vraddhn, Add1ArgType),
  NEONMAP2(vrecpe_v, arm_neon_vrecpe, arm_neon_vrecpe, 0),
  NEONMAP2(vrecpeq_v, arm_neon_vrecpe, arm_neon_vrecpe, 0),
  NEONMAP1(vrecps_v, arm_neon_vrecps, Add1ArgType),
  NEONMAP1(vrecpsq_v, arm_neon_vrecps, Add1ArgType),
  NEONMAP2(vrhadd_v, arm_neon_vrhaddu, arm_neon_vrhadds, Add1ArgType | UnsignedAlts),
  NEONMAP2(vrhaddq_v, arm_neon_vrhaddu, arm_neon_vrhadds, Add1ArgType | UnsignedAlts),
  NEONMAP1(vrnd_v, trunc, Add1ArgType),
  NEONMAP1(vrnda_v, round, Add1ArgType),
  NEONMAP1(vrndaq_v, round, Add1ArgType),
  NEONMAP0(vrndi_v),
  NEONMAP0(vrndiq_v),
  NEONMAP1(vrndm_v, floor, Add1ArgType),
  NEONMAP1(vrndmq_v, floor, Add1ArgType),
  NEONMAP1(vrndn_v, roundeven, Add1ArgType),
  NEONMAP1(vrndnq_v, roundeven, Add1ArgType),
  NEONMAP1(vrndp_v, ceil, Add1ArgType),
  NEONMAP1(vrndpq_v, ceil, Add1ArgType),
  NEONMAP1(vrndq_v, trunc, Add1ArgType),
  NEONMAP1(vrndx_v, rint, Add1ArgType),
  NEONMAP1(vrndxq_v, rint, Add1ArgType),
  NEONMAP2(vrshl_v, arm_neon_vrshiftu, arm_neon_vrshifts, Add1ArgType | UnsignedAlts),
  NEONMAP2(vrshlq_v, arm_neon_vrshiftu, arm_neon_vrshifts, Add1ArgType | UnsignedAlts),
  NEONMAP2(vrshr_n_v, arm_neon_vrshiftu, arm_neon_vrshifts, UnsignedAlts),
  NEONMAP2(vrshrq_n_v, arm_neon_vrshiftu, arm_neon_vrshifts, UnsignedAlts),
  NEONMAP2(vrsqrte_v, arm_neon_vrsqrte, arm_neon_vrsqrte, 0),
  NEONMAP2(vrsqrteq_v, arm_neon_vrsqrte, arm_neon_vrsqrte, 0),
  NEONMAP1(vrsqrts_v, arm_neon_vrsqrts, Add1ArgType),
  NEONMAP1(vrsqrtsq_v, arm_neon_vrsqrts, Add1ArgType),
  NEONMAP1(vrsubhn_v, arm_neon_vrsubhn, Add1ArgType),
  NEONMAP1(vsha1su0q_u32, arm_neon_sha1su0, 0),
  NEONMAP1(vsha1su1q_u32, arm_neon_sha1su1, 0),
  NEONMAP1(vsha256h2q_u32, arm_neon_sha256h2, 0),
  NEONMAP1(vsha256hq_u32, arm_neon_sha256h, 0),
  NEONMAP1(vsha256su0q_u32, arm_neon_sha256su0, 0),
  NEONMAP1(vsha256su1q_u32, arm_neon_sha256su1, 0),
  NEONMAP0(vshl_n_v),
  NEONMAP2(vshl_v, arm_neon_vshiftu, arm_neon_vshifts, Add1ArgType | UnsignedAlts),
  NEONMAP0(vshll_n_v),
  NEONMAP0(vshlq_n_v),
  NEONMAP2(vshlq_v, arm_neon_vshiftu, arm_neon_vshifts, Add1ArgType | UnsignedAlts),
  NEONMAP0(vshr_n_v),
  NEONMAP0(vshrn_n_v),
  NEONMAP0(vshrq_n_v),
  NEONMAP1(vst1_v, arm_neon_vst1, 0),
  NEONMAP1(vst1_x2_v, arm_neon_vst1x2, 0),
  NEONMAP1(vst1_x3_v, arm_neon_vst1x3, 0),
  NEONMAP1(vst1_x4_v, arm_neon_vst1x4, 0),
  NEONMAP1(vst1q_v, arm_neon_vst1, 0),
  NEONMAP1(vst1q_x2_v, arm_neon_vst1x2, 0),
  NEONMAP1(vst1q_x3_v, arm_neon_vst1x3, 0),
  NEONMAP1(vst1q_x4_v, arm_neon_vst1x4, 0),
  NEONMAP1(vst2_lane_v, arm_neon_vst2lane, 0),
  NEONMAP1(vst2_v, arm_neon_vst2, 0),
  NEONMAP1(vst2q_lane_v, arm_neon_vst2lane, 0),
  NEONMAP1(vst2q_v, arm_neon_vst2, 0),
  NEONMAP1(vst3_lane_v, arm_neon_vst3lane, 0),
  NEONMAP1(vst3_v, arm_neon_vst3, 0),
  NEONMAP1(vst3q_lane_v, arm_neon_vst3lane, 0),
  NEONMAP1(vst3q_v, arm_neon_vst3, 0),
  NEONMAP1(vst4_lane_v, arm_neon_vst4lane, 0),
  NEONMAP1(vst4_v, arm_neon_vst4, 0),
  NEONMAP1(vst4q_lane_v, arm_neon_vst4lane, 0),
  NEONMAP1(vst4q_v, arm_neon_vst4, 0),
  NEONMAP0(vsubhn_v),
  NEONMAP0(vtrn_v),
  NEONMAP0(vtrnq_v),
  NEONMAP0(vtst_v),
  NEONMAP0(vtstq_v),
  NEONMAP1(vusdot_s32, arm_neon_usdot, 0),
  NEONMAP1(vusdotq_s32, arm_neon_usdot, 0),
  NEONMAP1(vusmmlaq_s32, arm_neon_usmmla, 0),
  NEONMAP0(vuzp_v),
  NEONMAP0(vuzpq_v),
  NEONMAP0(vzip_v),
  NEONMAP0(vzipq_v)
};

// clang-format on

// Some intrinsics are equivalent for codegen.
static const std::pair<unsigned, unsigned> NEONEquivalentIntrinsicMap[] = {
  { NEON::BI__builtin_neon_splat_lane_bf16, NEON::BI__builtin_neon_splat_lane_v, },
  { NEON::BI__builtin_neon_splat_laneq_bf16, NEON::BI__builtin_neon_splat_laneq_v, },
  { NEON::BI__builtin_neon_splatq_lane_bf16, NEON::BI__builtin_neon_splatq_lane_v, },
  { NEON::BI__builtin_neon_splatq_laneq_bf16, NEON::BI__builtin_neon_splatq_laneq_v, },
  { NEON::BI__builtin_neon_vabd_f16, NEON::BI__builtin_neon_vabd_v, },
  { NEON::BI__builtin_neon_vabdq_f16, NEON::BI__builtin_neon_vabdq_v, },
  { NEON::BI__builtin_neon_vabs_f16, NEON::BI__builtin_neon_vabs_v, },
  { NEON::BI__builtin_neon_vabsq_f16, NEON::BI__builtin_neon_vabsq_v, },
  { NEON::BI__builtin_neon_vcage_f16, NEON::BI__builtin_neon_vcage_v, },
  { NEON::BI__builtin_neon_vcageq_f16, NEON::BI__builtin_neon_vcageq_v, },
  { NEON::BI__builtin_neon_vcagt_f16, NEON::BI__builtin_neon_vcagt_v, },
  { NEON::BI__builtin_neon_vcagtq_f16, NEON::BI__builtin_neon_vcagtq_v, },
  { NEON::BI__builtin_neon_vcale_f16, NEON::BI__builtin_neon_vcale_v, },
  { NEON::BI__builtin_neon_vcaleq_f16, NEON::BI__builtin_neon_vcaleq_v, },
  { NEON::BI__builtin_neon_vcalt_f16, NEON::BI__builtin_neon_vcalt_v, },
  { NEON::BI__builtin_neon_vcaltq_f16, NEON::BI__builtin_neon_vcaltq_v, },
  { NEON::BI__builtin_neon_vceqz_f16, NEON::BI__builtin_neon_vceqz_v, },
  { NEON::BI__builtin_neon_vceqzq_f16, NEON::BI__builtin_neon_vceqzq_v, },
  { NEON::BI__builtin_neon_vcgez_f16, NEON::BI__builtin_neon_vcgez_v, },
  { NEON::BI__builtin_neon_vcgezq_f16, NEON::BI__builtin_neon_vcgezq_v, },
  { NEON::BI__builtin_neon_vcgtz_f16, NEON::BI__builtin_neon_vcgtz_v, },
  { NEON::BI__builtin_neon_vcgtzq_f16, NEON::BI__builtin_neon_vcgtzq_v, },
  { NEON::BI__builtin_neon_vclez_f16, NEON::BI__builtin_neon_vclez_v, },
  { NEON::BI__builtin_neon_vclezq_f16, NEON::BI__builtin_neon_vclezq_v, },
  { NEON::BI__builtin_neon_vcltz_f16, NEON::BI__builtin_neon_vcltz_v, },
  { NEON::BI__builtin_neon_vcltzq_f16, NEON::BI__builtin_neon_vcltzq_v, },
  { NEON::BI__builtin_neon_vfma_f16, NEON::BI__builtin_neon_vfma_v, },
  { NEON::BI__builtin_neon_vfma_lane_f16, NEON::BI__builtin_neon_vfma_lane_v, },
  { NEON::BI__builtin_neon_vfma_laneq_f16, NEON::BI__builtin_neon_vfma_laneq_v, },
  { NEON::BI__builtin_neon_vfmaq_f16, NEON::BI__builtin_neon_vfmaq_v, },
  { NEON::BI__builtin_neon_vfmaq_lane_f16, NEON::BI__builtin_neon_vfmaq_lane_v, },
  { NEON::BI__builtin_neon_vfmaq_laneq_f16, NEON::BI__builtin_neon_vfmaq_laneq_v, },
  { NEON::BI__builtin_neon_vld1_bf16_x2, NEON::BI__builtin_neon_vld1_x2_v },
  { NEON::BI__builtin_neon_vld1_bf16_x3, NEON::BI__builtin_neon_vld1_x3_v },
  { NEON::BI__builtin_neon_vld1_bf16_x4, NEON::BI__builtin_neon_vld1_x4_v },
  { NEON::BI__builtin_neon_vld1_bf16, NEON::BI__builtin_neon_vld1_v },
  { NEON::BI__builtin_neon_vld1_dup_bf16, NEON::BI__builtin_neon_vld1_dup_v },
  { NEON::BI__builtin_neon_vld1_lane_bf16, NEON::BI__builtin_neon_vld1_lane_v },
  { NEON::BI__builtin_neon_vld1q_bf16_x2, NEON::BI__builtin_neon_vld1q_x2_v },
  { NEON::BI__builtin_neon_vld1q_bf16_x3, NEON::BI__builtin_neon_vld1q_x3_v },
  { NEON::BI__builtin_neon_vld1q_bf16_x4, NEON::BI__builtin_neon_vld1q_x4_v },
  { NEON::BI__builtin_neon_vld1q_bf16, NEON::BI__builtin_neon_vld1q_v },
  { NEON::BI__builtin_neon_vld1q_dup_bf16, NEON::BI__builtin_neon_vld1q_dup_v },
  { NEON::BI__builtin_neon_vld1q_lane_bf16, NEON::BI__builtin_neon_vld1q_lane_v },
  { NEON::BI__builtin_neon_vld2_bf16, NEON::BI__builtin_neon_vld2_v },
  { NEON::BI__builtin_neon_vld2_dup_bf16, NEON::BI__builtin_neon_vld2_dup_v },
  { NEON::BI__builtin_neon_vld2_lane_bf16, NEON::BI__builtin_neon_vld2_lane_v },
  { NEON::BI__builtin_neon_vld2q_bf16, NEON::BI__builtin_neon_vld2q_v },
  { NEON::BI__builtin_neon_vld2q_dup_bf16, NEON::BI__builtin_neon_vld2q_dup_v },
  { NEON::BI__builtin_neon_vld2q_lane_bf16, NEON::BI__builtin_neon_vld2q_lane_v },
  { NEON::BI__builtin_neon_vld3_bf16, NEON::BI__builtin_neon_vld3_v },
  { NEON::BI__builtin_neon_vld3_dup_bf16, NEON::BI__builtin_neon_vld3_dup_v },
  { NEON::BI__builtin_neon_vld3_lane_bf16, NEON::BI__builtin_neon_vld3_lane_v },
  { NEON::BI__builtin_neon_vld3q_bf16, NEON::BI__builtin_neon_vld3q_v },
  { NEON::BI__builtin_neon_vld3q_dup_bf16, NEON::BI__builtin_neon_vld3q_dup_v },
  { NEON::BI__builtin_neon_vld3q_lane_bf16, NEON::BI__builtin_neon_vld3q_lane_v },
  { NEON::BI__builtin_neon_vld4_bf16, NEON::BI__builtin_neon_vld4_v },
  { NEON::BI__builtin_neon_vld4_dup_bf16, NEON::BI__builtin_neon_vld4_dup_v },
  { NEON::BI__builtin_neon_vld4_lane_bf16, NEON::BI__builtin_neon_vld4_lane_v },
  { NEON::BI__builtin_neon_vld4q_bf16, NEON::BI__builtin_neon_vld4q_v },
  { NEON::BI__builtin_neon_vld4q_dup_bf16, NEON::BI__builtin_neon_vld4q_dup_v },
  { NEON::BI__builtin_neon_vld4q_lane_bf16, NEON::BI__builtin_neon_vld4q_lane_v },
  { NEON::BI__builtin_neon_vmax_f16, NEON::BI__builtin_neon_vmax_v, },
  { NEON::BI__builtin_neon_vmaxnm_f16, NEON::BI__builtin_neon_vmaxnm_v, },
  { NEON::BI__builtin_neon_vmaxnmq_f16, NEON::BI__builtin_neon_vmaxnmq_v, },
  { NEON::BI__builtin_neon_vmaxq_f16, NEON::BI__builtin_neon_vmaxq_v, },
  { NEON::BI__builtin_neon_vmin_f16, NEON::BI__builtin_neon_vmin_v, },
  { NEON::BI__builtin_neon_vminnm_f16, NEON::BI__builtin_neon_vminnm_v, },
  { NEON::BI__builtin_neon_vminnmq_f16, NEON::BI__builtin_neon_vminnmq_v, },
  { NEON::BI__builtin_neon_vminq_f16, NEON::BI__builtin_neon_vminq_v, },
  { NEON::BI__builtin_neon_vmulx_f16, NEON::BI__builtin_neon_vmulx_v, },
  { NEON::BI__builtin_neon_vmulxq_f16, NEON::BI__builtin_neon_vmulxq_v, },
  { NEON::BI__builtin_neon_vpadd_f16, NEON::BI__builtin_neon_vpadd_v, },
  { NEON::BI__builtin_neon_vpaddq_f16, NEON::BI__builtin_neon_vpaddq_v, },
  { NEON::BI__builtin_neon_vpmax_f16, NEON::BI__builtin_neon_vpmax_v, },
  { NEON::BI__builtin_neon_vpmaxnm_f16, NEON::BI__builtin_neon_vpmaxnm_v, },
  { NEON::BI__builtin_neon_vpmaxnmq_f16, NEON::BI__builtin_neon_vpmaxnmq_v, },
  { NEON::BI__builtin_neon_vpmaxq_f16, NEON::BI__builtin_neon_vpmaxq_v, },
  { NEON::BI__builtin_neon_vpmin_f16, NEON::BI__builtin_neon_vpmin_v, },
  { NEON::BI__builtin_neon_vpminnm_f16, NEON::BI__builtin_neon_vpminnm_v, },
  { NEON::BI__builtin_neon_vpminnmq_f16, NEON::BI__builtin_neon_vpminnmq_v, },
  { NEON::BI__builtin_neon_vpminq_f16, NEON::BI__builtin_neon_vpminq_v, },
  { NEON::BI__builtin_neon_vrecpe_f16, NEON::BI__builtin_neon_vrecpe_v, },
  { NEON::BI__builtin_neon_vrecpeq_f16, NEON::BI__builtin_neon_vrecpeq_v, },
  { NEON::BI__builtin_neon_vrecps_f16, NEON::BI__builtin_neon_vrecps_v, },
  { NEON::BI__builtin_neon_vrecpsq_f16, NEON::BI__builtin_neon_vrecpsq_v, },
  { NEON::BI__builtin_neon_vrnd_f16, NEON::BI__builtin_neon_vrnd_v, },
  { NEON::BI__builtin_neon_vrnda_f16, NEON::BI__builtin_neon_vrnda_v, },
  { NEON::BI__builtin_neon_vrndaq_f16, NEON::BI__builtin_neon_vrndaq_v, },
  { NEON::BI__builtin_neon_vrndi_f16, NEON::BI__builtin_neon_vrndi_v, },
  { NEON::BI__builtin_neon_vrndiq_f16, NEON::BI__builtin_neon_vrndiq_v, },
  { NEON::BI__builtin_neon_vrndm_f16, NEON::BI__builtin_neon_vrndm_v, },
  { NEON::BI__builtin_neon_vrndmq_f16, NEON::BI__builtin_neon_vrndmq_v, },
  { NEON::BI__builtin_neon_vrndn_f16, NEON::BI__builtin_neon_vrndn_v, },
  { NEON::BI__builtin_neon_vrndnq_f16, NEON::BI__builtin_neon_vrndnq_v, },
  { NEON::BI__builtin_neon_vrndp_f16, NEON::BI__builtin_neon_vrndp_v, },
  { NEON::BI__builtin_neon_vrndpq_f16, NEON::BI__builtin_neon_vrndpq_v, },
  { NEON::BI__builtin_neon_vrndq_f16, NEON::BI__builtin_neon_vrndq_v, },
  { NEON::BI__builtin_neon_vrndx_f16, NEON::BI__builtin_neon_vrndx_v, },
  { NEON::BI__builtin_neon_vrndxq_f16, NEON::BI__builtin_neon_vrndxq_v, },
  { NEON::BI__builtin_neon_vrsqrte_f16, NEON::BI__builtin_neon_vrsqrte_v, },
  { NEON::BI__builtin_neon_vrsqrteq_f16, NEON::BI__builtin_neon_vrsqrteq_v, },
  { NEON::BI__builtin_neon_vrsqrts_f16, NEON::BI__builtin_neon_vrsqrts_v, },
  { NEON::BI__builtin_neon_vrsqrtsq_f16, NEON::BI__builtin_neon_vrsqrtsq_v, },
  { NEON::BI__builtin_neon_vsqrt_f16, NEON::BI__builtin_neon_vsqrt_v, },
  { NEON::BI__builtin_neon_vsqrtq_f16, NEON::BI__builtin_neon_vsqrtq_v, },
  { NEON::BI__builtin_neon_vst1_bf16_x2, NEON::BI__builtin_neon_vst1_x2_v },
  { NEON::BI__builtin_neon_vst1_bf16_x3, NEON::BI__builtin_neon_vst1_x3_v },
  { NEON::BI__builtin_neon_vst1_bf16_x4, NEON::BI__builtin_neon_vst1_x4_v },
  { NEON::BI__builtin_neon_vst1_bf16, NEON::BI__builtin_neon_vst1_v },
  { NEON::BI__builtin_neon_vst1_lane_bf16, NEON::BI__builtin_neon_vst1_lane_v },
  { NEON::BI__builtin_neon_vst1q_bf16_x2, NEON::BI__builtin_neon_vst1q_x2_v },
  { NEON::BI__builtin_neon_vst1q_bf16_x3, NEON::BI__builtin_neon_vst1q_x3_v },
  { NEON::BI__builtin_neon_vst1q_bf16_x4, NEON::BI__builtin_neon_vst1q_x4_v },
  { NEON::BI__builtin_neon_vst1q_bf16, NEON::BI__builtin_neon_vst1q_v },
  { NEON::BI__builtin_neon_vst1q_lane_bf16, NEON::BI__builtin_neon_vst1q_lane_v },
  { NEON::BI__builtin_neon_vst2_bf16, NEON::BI__builtin_neon_vst2_v },
  { NEON::BI__builtin_neon_vst2_lane_bf16, NEON::BI__builtin_neon_vst2_lane_v },
  { NEON::BI__builtin_neon_vst2q_bf16, NEON::BI__builtin_neon_vst2q_v },
  { NEON::BI__builtin_neon_vst2q_lane_bf16, NEON::BI__builtin_neon_vst2q_lane_v },
  { NEON::BI__builtin_neon_vst3_bf16, NEON::BI__builtin_neon_vst3_v },
  { NEON::BI__builtin_neon_vst3_lane_bf16, NEON::BI__builtin_neon_vst3_lane_v },
  { NEON::BI__builtin_neon_vst3q_bf16, NEON::BI__builtin_neon_vst3q_v },
  { NEON::BI__builtin_neon_vst3q_lane_bf16, NEON::BI__builtin_neon_vst3q_lane_v },
  { NEON::BI__builtin_neon_vst4_bf16, NEON::BI__builtin_neon_vst4_v },
  { NEON::BI__builtin_neon_vst4_lane_bf16, NEON::BI__builtin_neon_vst4_lane_v },
  { NEON::BI__builtin_neon_vst4q_bf16, NEON::BI__builtin_neon_vst4q_v },
  { NEON::BI__builtin_neon_vst4q_lane_bf16, NEON::BI__builtin_neon_vst4q_lane_v },
  // The mangling rules cause us to have one ID for each type for vldap1(q)_lane
  // and vstl1(q)_lane, but codegen is equivalent for all of them. Choose an
  // arbitrary one to be handled as the canonical variant.
  { NEON::BI__builtin_neon_vldap1_lane_u64, NEON::BI__builtin_neon_vldap1_lane_s64 },
  { NEON::BI__builtin_neon_vldap1_lane_f64, NEON::BI__builtin_neon_vldap1_lane_s64 },
  { NEON::BI__builtin_neon_vldap1_lane_p64, NEON::BI__builtin_neon_vldap1_lane_s64 },
  { NEON::BI__builtin_neon_vldap1q_lane_u64, NEON::BI__builtin_neon_vldap1q_lane_s64 },
  { NEON::BI__builtin_neon_vldap1q_lane_f64, NEON::BI__builtin_neon_vldap1q_lane_s64 },
  { NEON::BI__builtin_neon_vldap1q_lane_p64, NEON::BI__builtin_neon_vldap1q_lane_s64 },
  { NEON::BI__builtin_neon_vstl1_lane_u64, NEON::BI__builtin_neon_vstl1_lane_s64 },
  { NEON::BI__builtin_neon_vstl1_lane_f64, NEON::BI__builtin_neon_vstl1_lane_s64 },
  { NEON::BI__builtin_neon_vstl1_lane_p64, NEON::BI__builtin_neon_vstl1_lane_s64 },
  { NEON::BI__builtin_neon_vstl1q_lane_u64, NEON::BI__builtin_neon_vstl1q_lane_s64 },
  { NEON::BI__builtin_neon_vstl1q_lane_f64, NEON::BI__builtin_neon_vstl1q_lane_s64 },
  { NEON::BI__builtin_neon_vstl1q_lane_p64, NEON::BI__builtin_neon_vstl1q_lane_s64 },
};

#undef NEONMAP0
#undef NEONMAP1
#undef NEONMAP2

#define SVEMAP1(NameBase, LLVMIntrinsic, TypeModifier) \
  { \
    #NameBase, SVE::BI__builtin_sve_##NameBase, Intrinsic::LLVMIntrinsic, 0, \
        TypeModifier \
  }

#define SVEMAP2(NameBase, TypeModifier) \
  { #NameBase, SVE::BI__builtin_sve_##NameBase, 0, 0, TypeModifier }
static const ARMVectorIntrinsicInfo AArch64SVEIntrinsicMap[] = {
#define GET_SVE_LLVM_INTRINSIC_MAP
#include "clang/Basic/arm_sve_builtin_cg.inc"
#include "clang/Basic/BuiltinsAArch64NeonSVEBridge_cg.def"
#undef GET_SVE_LLVM_INTRINSIC_MAP
};

#undef SVEMAP1
#undef SVEMAP2

#define SMEMAP1(NameBase, LLVMIntrinsic, TypeModifier) \
  { \
    #NameBase, SME::BI__builtin_sme_##NameBase, Intrinsic::LLVMIntrinsic, 0, \
        TypeModifier \
  }

#define SMEMAP2(NameBase, TypeModifier) \
  { #NameBase, SME::BI__builtin_sme_##NameBase, 0, 0, TypeModifier }
static const ARMVectorIntrinsicInfo AArch64SMEIntrinsicMap[] = {
#define GET_SME_LLVM_INTRINSIC_MAP
#include "clang/Basic/arm_sme_builtin_cg.inc"
#undef GET_SME_LLVM_INTRINSIC_MAP
};

#undef SMEMAP1
#undef SMEMAP2

static bool NEONSIMDIntrinsicsProvenSorted = false;

static bool AArch64SIMDIntrinsicsProvenSorted = false;
static bool AArch64SISDIntrinsicsProvenSorted = false;
static bool AArch64SVEIntrinsicsProvenSorted = false;
static bool AArch64SMEIntrinsicsProvenSorted = false;

// Look up builtin `BuiltinID` in `IntrinsicMap`; return the corresponding
// info struct, or null if it is not present.
static const ARMVectorIntrinsicInfo *
findARMVectorIntrinsicInMap(ArrayRef<ARMVectorIntrinsicInfo> IntrinsicMap,
                            unsigned BuiltinID, bool &MapProvenSorted) {

#ifndef NDEBUG
  if (!MapProvenSorted) {
    assert(llvm::is_sorted(IntrinsicMap));
    MapProvenSorted = true;
  }
#endif

  const ARMVectorIntrinsicInfo *Builtin =
      llvm::lower_bound(IntrinsicMap, BuiltinID);

  if (Builtin != IntrinsicMap.end() && Builtin->BuiltinID == BuiltinID)
    return Builtin;

  return nullptr;
}
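// Callers pass one of the tables above together with its ProvenSorted flag,
// e.g. findARMVectorIntrinsicInMap(ARMSIMDIntrinsicMap, BuiltinID,
// NEONSIMDIntrinsicsProvenSorted).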

Function *CodeGenFunction::LookupNeonLLVMIntrinsic(unsigned IntrinsicID,
                                                   unsigned Modifier,
                                                   llvm::Type *ArgType,
                                                   const CallExpr *E) {
  int VectorSize = 0;
  if (Modifier & Use64BitVectors)
    VectorSize = 64;
  else if (Modifier & Use128BitVectors)
    VectorSize = 128;

  // Return type.
  SmallVector<llvm::Type *, 3> Tys;
  if (Modifier & AddRetType) {
    llvm::Type *Ty = ConvertType(E->getCallReturnType(getContext()));
    if (Modifier & VectorizeRetType)
      Ty = llvm::FixedVectorType::get(
          Ty, VectorSize ? VectorSize / Ty->getPrimitiveSizeInBits() : 1);

    Tys.push_back(Ty);
  }

  // Arguments.
  if (Modifier & VectorizeArgTypes) {
    int Elts = VectorSize ? VectorSize / ArgType->getPrimitiveSizeInBits() : 1;
    ArgType = llvm::FixedVectorType::get(ArgType, Elts);
  }

  if (Modifier & (Add1ArgType | Add2ArgTypes))
    Tys.push_back(ArgType);

  if (Modifier & Add2ArgTypes)
    Tys.push_back(ArgType);

  if (Modifier & InventFloatType)
    Tys.push_back(FloatTy);

  return CGM.getIntrinsic(IntrinsicID, Tys);
}
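// For instance, Modifier = AddRetType | Add1ArgType on a call returning
// float32x4_t whose first argument is also float32x4_t yields
// Tys = {<4 x float>, <4 x float>}, selecting that overload of the intrinsic.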

//===----------------------------------------------------------------------===//
// Emit-helpers
//===----------------------------------------------------------------------===//
static Value *EmitCommonNeonSISDBuiltinExpr(
    CodeGenFunction &CGF, const ARMVectorIntrinsicInfo &SISDInfo,
    SmallVectorImpl<Value *> &Ops, const CallExpr *E) {
  assert(SISDInfo.LLVMIntrinsic && "Generic code assumes a valid intrinsic");

  switch (SISDInfo.BuiltinID) {
  case NEON::BI__builtin_neon_vcled_s64:
  case NEON::BI__builtin_neon_vcled_u64:
  case NEON::BI__builtin_neon_vcles_f32:
  case NEON::BI__builtin_neon_vcled_f64:
  case NEON::BI__builtin_neon_vcltd_s64:
  case NEON::BI__builtin_neon_vcltd_u64:
  case NEON::BI__builtin_neon_vclts_f32:
  case NEON::BI__builtin_neon_vcltd_f64:
  case NEON::BI__builtin_neon_vcales_f32:
  case NEON::BI__builtin_neon_vcaled_f64:
  case NEON::BI__builtin_neon_vcalts_f32:
  case NEON::BI__builtin_neon_vcaltd_f64:
    // Only one direction of these comparisons actually exists: cmle is a cmge
    // with swapped operands. The table gives us the right intrinsic, but we
    // still need to do the swap.
    std::swap(Ops[0], Ops[1]);
    break;
  }

  // Determine the type(s) of this overloaded AArch64 intrinsic.
  llvm::Type *ArgTy = CGF.ConvertType(E->getArg(0)->getType());
  Function *F = CGF.LookupNeonLLVMIntrinsic(SISDInfo.LLVMIntrinsic,
                                            SISDInfo.TypeModifier, ArgTy, E);

  int j = 0;
  ConstantInt *C0 = ConstantInt::get(CGF.SizeTy, 0);
  for (Function::const_arg_iterator ai = F->arg_begin(), ae = F->arg_end();
       ai != ae; ++ai, ++j) {
    llvm::Type *ArgTy = ai->getType();
    if (Ops[j]->getType()->getPrimitiveSizeInBits() ==
        ArgTy->getPrimitiveSizeInBits())
      continue;
    assert(
        ArgTy->isVectorTy() && !Ops[j]->getType()->isVectorTy() &&
        "Expecting vector LLVM intrinsic type and scalar Clang builtin type!");

    // The constant argument to an _n_ intrinsic always has Int32Ty, so
    // truncate it before inserting.
    Ops[j] = CGF.Builder.CreateTruncOrBitCast(
        Ops[j], cast<llvm::VectorType>(ArgTy)->getElementType());
    Ops[j] =
        CGF.Builder.CreateInsertElement(PoisonValue::get(ArgTy), Ops[j], C0);
  }

  Value *Result = CGF.EmitNeonCall(F, Ops, SISDInfo.NameHint);
  llvm::Type *ResultType = CGF.ConvertType(E->getType());
  if (ResultType->getPrimitiveSizeInBits().getFixedValue() <
      Result->getType()->getPrimitiveSizeInBits().getFixedValue())
    return CGF.Builder.CreateExtractElement(Result, C0);

  return CGF.Builder.CreateBitCast(Result, ResultType, SISDInfo.NameHint);
}

1177Value *CodeGenFunction::EmitCommonNeonBuiltinExpr(
1178 unsigned BuiltinID, unsigned LLVMIntrinsic, unsigned AltLLVMIntrinsic,
1179 const char *NameHint, unsigned Modifier, const CallExpr *E,
1180 SmallVectorImpl<llvm::Value *> &Ops, Address PtrOp0, Address PtrOp1,
1181 llvm::Triple::ArchType Arch) {
1182
1183 // Extract the trailing immediate argument that encodes the type discriminator
1184 // for this overloaded intrinsic.
1185 // TODO: Move to the parent code that takes care of argument processing.
1186 const Expr *Arg = E->getArg(Arg: E->getNumArgs() - 1);
1187 std::optional<llvm::APSInt> NeonTypeConst =
1188 Arg->getIntegerConstantExpr(Ctx: getContext());
1189 if (!NeonTypeConst)
1190 return nullptr;
1191
1192 // Determine the type of this overloaded NEON intrinsic.
1193 NeonTypeFlags Type(NeonTypeConst->getZExtValue());
1194 const bool Usgn = Type.isUnsigned();
1195 const bool Quad = Type.isQuad();
1196 const bool Floating = Type.isFloatingPoint();
1197 const bool HasFastHalfType = getTarget().hasFastHalfType();
1198 const bool AllowBFloatArgsAndRet =
1199 getTargetHooks().getABIInfo().allowBFloatArgsAndRet();
1200
1201 llvm::FixedVectorType *VTy =
1202 GetNeonType(CGF: this, TypeFlags: Type, HasFastHalfType, V1Ty: false, AllowBFloatArgsAndRet);
1203 llvm::Type *Ty = VTy;
1204 if (!Ty)
1205 return nullptr;
1206
1207 auto getAlignmentValue32 = [&](Address addr) -> Value* {
1208 return Builder.getInt32(C: addr.getAlignment().getQuantity());
1209 };
1210
1211 unsigned Int = LLVMIntrinsic;
1212 if ((Modifier & UnsignedAlts) && !Usgn)
1213 Int = AltLLVMIntrinsic;
1214
1215 switch (BuiltinID) {
1216 default: break;
1217 case NEON::BI__builtin_neon_splat_lane_v:
1218 case NEON::BI__builtin_neon_splat_laneq_v:
1219 case NEON::BI__builtin_neon_splatq_lane_v:
1220 case NEON::BI__builtin_neon_splatq_laneq_v: {
1221 auto NumElements = VTy->getElementCount();
1222 if (BuiltinID == NEON::BI__builtin_neon_splatq_lane_v)
1223 NumElements = NumElements * 2;
1224 if (BuiltinID == NEON::BI__builtin_neon_splat_laneq_v)
1225 NumElements = NumElements.divideCoefficientBy(RHS: 2);
1226
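    // For example, splatq_lane_v on a <2 x i32> input with lane 1 becomes a
    // shufflevector to <4 x i32> with mask <1, 1, 1, 1>.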
1227 Ops[0] = Builder.CreateBitCast(V: Ops[0], DestTy: VTy);
1228 return EmitNeonSplat(V: Ops[0], C: cast<ConstantInt>(Val: Ops[1]), Count: NumElements);
1229 }
1230 case NEON::BI__builtin_neon_vpadd_v:
1231 case NEON::BI__builtin_neon_vpaddq_v:
1232 // We don't allow fp/int overloading of intrinsics.
1233 if (VTy->getElementType()->isFloatingPointTy() &&
1234 Int == Intrinsic::aarch64_neon_addp)
1235 Int = Intrinsic::aarch64_neon_faddp;
1236 break;
1237 case NEON::BI__builtin_neon_vabs_v:
1238 case NEON::BI__builtin_neon_vabsq_v:
1239 if (VTy->getElementType()->isFloatingPointTy())
1240 return EmitNeonCall(F: CGM.getIntrinsic(IID: Intrinsic::fabs, Tys: Ty), Ops, name: "vabs");
1241 return EmitNeonCall(F: CGM.getIntrinsic(IID: LLVMIntrinsic, Tys: Ty), Ops, name: "vabs");
1242 case NEON::BI__builtin_neon_vadd_v:
1243 case NEON::BI__builtin_neon_vaddq_v: {
1244 llvm::Type *VTy = llvm::FixedVectorType::get(ElementType: Int8Ty, NumElts: Quad ? 16 : 8);
1245 Ops[0] = Builder.CreateBitCast(V: Ops[0], DestTy: VTy);
1246 Ops[1] = Builder.CreateBitCast(V: Ops[1], DestTy: VTy);
1247 Ops[0] = Builder.CreateXor(LHS: Ops[0], RHS: Ops[1]);
1248 return Builder.CreateBitCast(V: Ops[0], DestTy: Ty);
1249 }
1250 case NEON::BI__builtin_neon_vaddhn_v: {
1251 llvm::FixedVectorType *SrcTy =
1252 llvm::FixedVectorType::getExtendedElementVectorType(VTy);
1253
1254 // %sum = add <4 x i32> %lhs, %rhs
1255 Ops[0] = Builder.CreateBitCast(V: Ops[0], DestTy: SrcTy);
1256 Ops[1] = Builder.CreateBitCast(V: Ops[1], DestTy: SrcTy);
1257 Ops[0] = Builder.CreateAdd(LHS: Ops[0], RHS: Ops[1], Name: "vaddhn");
1258
1259 // %high = lshr <4 x i32> %sum, <i32 16, i32 16, i32 16, i32 16>
1260 Constant *ShiftAmt =
1261 ConstantInt::get(Ty: SrcTy, V: SrcTy->getScalarSizeInBits() / 2);
1262 Ops[0] = Builder.CreateLShr(LHS: Ops[0], RHS: ShiftAmt, Name: "vaddhn");
1263
1264 // %res = trunc <4 x i32> %high to <4 x i16>
1265 return Builder.CreateTrunc(V: Ops[0], DestTy: VTy, Name: "vaddhn");
1266 }
1267 case NEON::BI__builtin_neon_vcale_v:
1268 case NEON::BI__builtin_neon_vcaleq_v:
1269 case NEON::BI__builtin_neon_vcalt_v:
1270 case NEON::BI__builtin_neon_vcaltq_v:
1271 std::swap(a&: Ops[0], b&: Ops[1]);
1272 [[fallthrough]];
1273 case NEON::BI__builtin_neon_vcage_v:
1274 case NEON::BI__builtin_neon_vcageq_v:
1275 case NEON::BI__builtin_neon_vcagt_v:
1276 case NEON::BI__builtin_neon_vcagtq_v: {
1277 llvm::Type *Ty;
1278 switch (VTy->getScalarSizeInBits()) {
1279 default: llvm_unreachable("unexpected type");
1280 case 32:
1281 Ty = FloatTy;
1282 break;
1283 case 64:
1284 Ty = DoubleTy;
1285 break;
1286 case 16:
1287 Ty = HalfTy;
1288 break;
1289 }
1290 auto *VecFlt = llvm::FixedVectorType::get(ElementType: Ty, NumElts: VTy->getNumElements());
1291 llvm::Type *Tys[] = { VTy, VecFlt };
1292 Function *F = CGM.getIntrinsic(IID: LLVMIntrinsic, Tys);
1293 return EmitNeonCall(F, Ops, name: NameHint);
1294 }
1295 case NEON::BI__builtin_neon_vceqz_v:
1296 case NEON::BI__builtin_neon_vceqzq_v:
1297 return EmitAArch64CompareBuiltinExpr(
1298 Op: Ops[0], Ty, Pred: Floating ? ICmpInst::FCMP_OEQ : ICmpInst::ICMP_EQ, Name: "vceqz");
1299 case NEON::BI__builtin_neon_vcgez_v:
1300 case NEON::BI__builtin_neon_vcgezq_v:
1301 return EmitAArch64CompareBuiltinExpr(
1302 Op: Ops[0], Ty, Pred: Floating ? ICmpInst::FCMP_OGE : ICmpInst::ICMP_SGE,
1303 Name: "vcgez");
1304 case NEON::BI__builtin_neon_vclez_v:
1305 case NEON::BI__builtin_neon_vclezq_v:
1306 return EmitAArch64CompareBuiltinExpr(
1307 Op: Ops[0], Ty, Pred: Floating ? ICmpInst::FCMP_OLE : ICmpInst::ICMP_SLE,
1308 Name: "vclez");
1309 case NEON::BI__builtin_neon_vcgtz_v:
1310 case NEON::BI__builtin_neon_vcgtzq_v:
1311 return EmitAArch64CompareBuiltinExpr(
1312 Op: Ops[0], Ty, Pred: Floating ? ICmpInst::FCMP_OGT : ICmpInst::ICMP_SGT,
1313 Name: "vcgtz");
1314 case NEON::BI__builtin_neon_vcltz_v:
1315 case NEON::BI__builtin_neon_vcltzq_v:
1316 return EmitAArch64CompareBuiltinExpr(
1317 Op: Ops[0], Ty, Pred: Floating ? ICmpInst::FCMP_OLT : ICmpInst::ICMP_SLT,
1318 Name: "vcltz");
1319 case NEON::BI__builtin_neon_vclz_v:
1320 case NEON::BI__builtin_neon_vclzq_v:
    // We generate a target-independent intrinsic, which needs a second
    // argument for whether or not clz of zero is undefined; on ARM it isn't.
1323 Ops.push_back(Elt: Builder.getInt1(V: getTarget().isCLZForZeroUndef()));
1324 break;
1325 case NEON::BI__builtin_neon_vcvt_f32_v:
1326 case NEON::BI__builtin_neon_vcvtq_f32_v:
1327 Ops[0] = Builder.CreateBitCast(V: Ops[0], DestTy: Ty);
1328 Ty = GetNeonType(CGF: this, TypeFlags: NeonTypeFlags(NeonTypeFlags::Float32, false, Quad),
1329 HasFastHalfType);
1330 return Usgn ? Builder.CreateUIToFP(V: Ops[0], DestTy: Ty, Name: "vcvt")
1331 : Builder.CreateSIToFP(V: Ops[0], DestTy: Ty, Name: "vcvt");
1332 case NEON::BI__builtin_neon_vcvt_f16_s16:
1333 case NEON::BI__builtin_neon_vcvt_f16_u16:
1334 case NEON::BI__builtin_neon_vcvtq_f16_s16:
1335 case NEON::BI__builtin_neon_vcvtq_f16_u16:
1336 Ops[0] = Builder.CreateBitCast(V: Ops[0], DestTy: Ty);
1337 Ty = GetNeonType(CGF: this, TypeFlags: NeonTypeFlags(NeonTypeFlags::Float16, false, Quad),
1338 HasFastHalfType);
1339 return Usgn ? Builder.CreateUIToFP(V: Ops[0], DestTy: Ty, Name: "vcvt")
1340 : Builder.CreateSIToFP(V: Ops[0], DestTy: Ty, Name: "vcvt");
1341 case NEON::BI__builtin_neon_vcvt_n_f16_s16:
1342 case NEON::BI__builtin_neon_vcvt_n_f16_u16:
1343 case NEON::BI__builtin_neon_vcvtq_n_f16_s16:
1344 case NEON::BI__builtin_neon_vcvtq_n_f16_u16: {
1345 llvm::Type *Tys[2] = { GetFloatNeonType(CGF: this, IntTypeFlags: Type), Ty };
1346 Function *F = CGM.getIntrinsic(IID: Int, Tys);
1347 return EmitNeonCall(F, Ops, name: "vcvt_n");
1348 }
1349 case NEON::BI__builtin_neon_vcvt_n_f32_v:
1350 case NEON::BI__builtin_neon_vcvt_n_f64_v:
1351 case NEON::BI__builtin_neon_vcvtq_n_f32_v:
1352 case NEON::BI__builtin_neon_vcvtq_n_f64_v: {
1353 llvm::Type *Tys[2] = { GetFloatNeonType(CGF: this, IntTypeFlags: Type), Ty };
1354 Int = Usgn ? LLVMIntrinsic : AltLLVMIntrinsic;
1355 Function *F = CGM.getIntrinsic(IID: Int, Tys);
1356 return EmitNeonCall(F, Ops, name: "vcvt_n");
1357 }
1358 case NEON::BI__builtin_neon_vcvt_n_s16_f16:
1359 case NEON::BI__builtin_neon_vcvt_n_s32_v:
1360 case NEON::BI__builtin_neon_vcvt_n_u16_f16:
1361 case NEON::BI__builtin_neon_vcvt_n_u32_v:
1362 case NEON::BI__builtin_neon_vcvt_n_s64_v:
1363 case NEON::BI__builtin_neon_vcvt_n_u64_v:
1364 case NEON::BI__builtin_neon_vcvtq_n_s16_f16:
1365 case NEON::BI__builtin_neon_vcvtq_n_s32_v:
1366 case NEON::BI__builtin_neon_vcvtq_n_u16_f16:
1367 case NEON::BI__builtin_neon_vcvtq_n_u32_v:
1368 case NEON::BI__builtin_neon_vcvtq_n_s64_v:
1369 case NEON::BI__builtin_neon_vcvtq_n_u64_v: {
1370 llvm::Type *Tys[2] = { Ty, GetFloatNeonType(CGF: this, IntTypeFlags: Type) };
1371 Function *F = CGM.getIntrinsic(IID: LLVMIntrinsic, Tys);
1372 return EmitNeonCall(F, Ops, name: "vcvt_n");
1373 }
1374 case NEON::BI__builtin_neon_vcvt_s32_v:
1375 case NEON::BI__builtin_neon_vcvt_u32_v:
1376 case NEON::BI__builtin_neon_vcvt_s64_v:
1377 case NEON::BI__builtin_neon_vcvt_u64_v:
1378 case NEON::BI__builtin_neon_vcvt_s16_f16:
1379 case NEON::BI__builtin_neon_vcvt_u16_f16:
1380 case NEON::BI__builtin_neon_vcvtq_s32_v:
1381 case NEON::BI__builtin_neon_vcvtq_u32_v:
1382 case NEON::BI__builtin_neon_vcvtq_s64_v:
1383 case NEON::BI__builtin_neon_vcvtq_u64_v:
1384 case NEON::BI__builtin_neon_vcvtq_s16_f16:
1385 case NEON::BI__builtin_neon_vcvtq_u16_f16: {
1386 Ops[0] = Builder.CreateBitCast(V: Ops[0], DestTy: GetFloatNeonType(CGF: this, IntTypeFlags: Type));
1387 return Usgn ? Builder.CreateFPToUI(V: Ops[0], DestTy: Ty, Name: "vcvt")
1388 : Builder.CreateFPToSI(V: Ops[0], DestTy: Ty, Name: "vcvt");
1389 }
1390 case NEON::BI__builtin_neon_vcvta_s16_f16:
1391 case NEON::BI__builtin_neon_vcvta_s32_v:
1392 case NEON::BI__builtin_neon_vcvta_s64_v:
1393 case NEON::BI__builtin_neon_vcvta_u16_f16:
1394 case NEON::BI__builtin_neon_vcvta_u32_v:
1395 case NEON::BI__builtin_neon_vcvta_u64_v:
1396 case NEON::BI__builtin_neon_vcvtaq_s16_f16:
1397 case NEON::BI__builtin_neon_vcvtaq_s32_v:
1398 case NEON::BI__builtin_neon_vcvtaq_s64_v:
1399 case NEON::BI__builtin_neon_vcvtaq_u16_f16:
1400 case NEON::BI__builtin_neon_vcvtaq_u32_v:
1401 case NEON::BI__builtin_neon_vcvtaq_u64_v:
1402 case NEON::BI__builtin_neon_vcvtn_s16_f16:
1403 case NEON::BI__builtin_neon_vcvtn_s32_v:
1404 case NEON::BI__builtin_neon_vcvtn_s64_v:
1405 case NEON::BI__builtin_neon_vcvtn_u16_f16:
1406 case NEON::BI__builtin_neon_vcvtn_u32_v:
1407 case NEON::BI__builtin_neon_vcvtn_u64_v:
1408 case NEON::BI__builtin_neon_vcvtnq_s16_f16:
1409 case NEON::BI__builtin_neon_vcvtnq_s32_v:
1410 case NEON::BI__builtin_neon_vcvtnq_s64_v:
1411 case NEON::BI__builtin_neon_vcvtnq_u16_f16:
1412 case NEON::BI__builtin_neon_vcvtnq_u32_v:
1413 case NEON::BI__builtin_neon_vcvtnq_u64_v:
1414 case NEON::BI__builtin_neon_vcvtp_s16_f16:
1415 case NEON::BI__builtin_neon_vcvtp_s32_v:
1416 case NEON::BI__builtin_neon_vcvtp_s64_v:
1417 case NEON::BI__builtin_neon_vcvtp_u16_f16:
1418 case NEON::BI__builtin_neon_vcvtp_u32_v:
1419 case NEON::BI__builtin_neon_vcvtp_u64_v:
1420 case NEON::BI__builtin_neon_vcvtpq_s16_f16:
1421 case NEON::BI__builtin_neon_vcvtpq_s32_v:
1422 case NEON::BI__builtin_neon_vcvtpq_s64_v:
1423 case NEON::BI__builtin_neon_vcvtpq_u16_f16:
1424 case NEON::BI__builtin_neon_vcvtpq_u32_v:
1425 case NEON::BI__builtin_neon_vcvtpq_u64_v:
1426 case NEON::BI__builtin_neon_vcvtm_s16_f16:
1427 case NEON::BI__builtin_neon_vcvtm_s32_v:
1428 case NEON::BI__builtin_neon_vcvtm_s64_v:
1429 case NEON::BI__builtin_neon_vcvtm_u16_f16:
1430 case NEON::BI__builtin_neon_vcvtm_u32_v:
1431 case NEON::BI__builtin_neon_vcvtm_u64_v:
1432 case NEON::BI__builtin_neon_vcvtmq_s16_f16:
1433 case NEON::BI__builtin_neon_vcvtmq_s32_v:
1434 case NEON::BI__builtin_neon_vcvtmq_s64_v:
1435 case NEON::BI__builtin_neon_vcvtmq_u16_f16:
1436 case NEON::BI__builtin_neon_vcvtmq_u32_v:
1437 case NEON::BI__builtin_neon_vcvtmq_u64_v: {
1438 llvm::Type *Tys[2] = { Ty, GetFloatNeonType(CGF: this, IntTypeFlags: Type) };
1439 return EmitNeonCall(F: CGM.getIntrinsic(IID: LLVMIntrinsic, Tys), Ops, name: NameHint);
1440 }
1441 case NEON::BI__builtin_neon_vcvtx_f32_v: {
1442 llvm::Type *Tys[2] = { VTy->getTruncatedElementVectorType(VTy), Ty};
1443 return EmitNeonCall(F: CGM.getIntrinsic(IID: LLVMIntrinsic, Tys), Ops, name: NameHint);
  }
1446 case NEON::BI__builtin_neon_vext_v:
1447 case NEON::BI__builtin_neon_vextq_v: {
1448 int CV = cast<ConstantInt>(Val: Ops[2])->getSExtValue();
1449 SmallVector<int, 16> Indices;
1450 for (unsigned i = 0, e = VTy->getNumElements(); i != e; ++i)
1451 Indices.push_back(Elt: i+CV);
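    // For example, vext(a, b, 1) on 4-element vectors yields the mask
    // <1, 2, 3, 4>: the top three lanes of a followed by the first lane of b.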
1452
1453 Ops[0] = Builder.CreateBitCast(V: Ops[0], DestTy: Ty);
1454 Ops[1] = Builder.CreateBitCast(V: Ops[1], DestTy: Ty);
1455 return Builder.CreateShuffleVector(V1: Ops[0], V2: Ops[1], Mask: Indices, Name: "vext");
1456 }
1457 case NEON::BI__builtin_neon_vfma_v:
1458 case NEON::BI__builtin_neon_vfmaq_v: {
1459 Ops[0] = Builder.CreateBitCast(V: Ops[0], DestTy: Ty);
1460 Ops[1] = Builder.CreateBitCast(V: Ops[1], DestTy: Ty);
1461 Ops[2] = Builder.CreateBitCast(V: Ops[2], DestTy: Ty);
1462
    // The NEON intrinsic puts the accumulator first, unlike LLVM's fma.
1464 return emitCallMaybeConstrainedFPBuiltin(
1465 CGF&: *this, IntrinsicID: Intrinsic::fma, ConstrainedIntrinsicID: Intrinsic::experimental_constrained_fma, Ty,
1466 Args: {Ops[1], Ops[2], Ops[0]});
1467 }
1468 case NEON::BI__builtin_neon_vld1_v:
1469 case NEON::BI__builtin_neon_vld1q_v: {
1470 llvm::Type *Tys[] = {Ty, Int8PtrTy};
1471 Ops.push_back(Elt: getAlignmentValue32(PtrOp0));
1472 return EmitNeonCall(F: CGM.getIntrinsic(IID: LLVMIntrinsic, Tys), Ops, name: "vld1");
1473 }
1474 case NEON::BI__builtin_neon_vld1_x2_v:
1475 case NEON::BI__builtin_neon_vld1q_x2_v:
1476 case NEON::BI__builtin_neon_vld1_x3_v:
1477 case NEON::BI__builtin_neon_vld1q_x3_v:
1478 case NEON::BI__builtin_neon_vld1_x4_v:
1479 case NEON::BI__builtin_neon_vld1q_x4_v: {
1480 llvm::Type *Tys[2] = {VTy, DefaultPtrTy};
1481 Function *F = CGM.getIntrinsic(IID: LLVMIntrinsic, Tys);
1482 Ops[1] = Builder.CreateCall(Callee: F, Args: Ops[1], Name: "vld1xN");
1483 return Builder.CreateDefaultAlignedStore(Val: Ops[1], Addr: Ops[0]);
1484 }
1485 case NEON::BI__builtin_neon_vld2_v:
1486 case NEON::BI__builtin_neon_vld2q_v:
1487 case NEON::BI__builtin_neon_vld3_v:
1488 case NEON::BI__builtin_neon_vld3q_v:
1489 case NEON::BI__builtin_neon_vld4_v:
1490 case NEON::BI__builtin_neon_vld4q_v:
1491 case NEON::BI__builtin_neon_vld2_dup_v:
1492 case NEON::BI__builtin_neon_vld2q_dup_v:
1493 case NEON::BI__builtin_neon_vld3_dup_v:
1494 case NEON::BI__builtin_neon_vld3q_dup_v:
1495 case NEON::BI__builtin_neon_vld4_dup_v:
1496 case NEON::BI__builtin_neon_vld4q_dup_v: {
1497 llvm::Type *Tys[] = {Ty, Int8PtrTy};
1498 Function *F = CGM.getIntrinsic(IID: LLVMIntrinsic, Tys);
1499 Value *Align = getAlignmentValue32(PtrOp1);
1500 Ops[1] = Builder.CreateCall(Callee: F, Args: {Ops[1], Align}, Name: NameHint);
1501 return Builder.CreateDefaultAlignedStore(Val: Ops[1], Addr: Ops[0]);
1502 }
1503 case NEON::BI__builtin_neon_vld1_dup_v:
1504 case NEON::BI__builtin_neon_vld1q_dup_v: {
1505 Value *V = PoisonValue::get(T: Ty);
1506 PtrOp0 = PtrOp0.withElementType(ElemTy: VTy->getElementType());
1507 LoadInst *Ld = Builder.CreateLoad(Addr: PtrOp0);
1508 llvm::Constant *CI = ConstantInt::get(Ty: SizeTy, V: 0);
1509 Ops[0] = Builder.CreateInsertElement(Vec: V, NewElt: Ld, Idx: CI);
1510 return EmitNeonSplat(V: Ops[0], C: CI);
1511 }
1512 case NEON::BI__builtin_neon_vld2_lane_v:
1513 case NEON::BI__builtin_neon_vld2q_lane_v:
1514 case NEON::BI__builtin_neon_vld3_lane_v:
1515 case NEON::BI__builtin_neon_vld3q_lane_v:
1516 case NEON::BI__builtin_neon_vld4_lane_v:
1517 case NEON::BI__builtin_neon_vld4q_lane_v: {
1518 llvm::Type *Tys[] = {Ty, Int8PtrTy};
1519 Function *F = CGM.getIntrinsic(IID: LLVMIntrinsic, Tys);
1520 for (unsigned I = 2; I < Ops.size() - 1; ++I)
1521 Ops[I] = Builder.CreateBitCast(V: Ops[I], DestTy: Ty);
1522 Ops.push_back(Elt: getAlignmentValue32(PtrOp1));
1523 Ops[1] = Builder.CreateCall(Callee: F, Args: ArrayRef(Ops).slice(N: 1), Name: NameHint);
1524 return Builder.CreateDefaultAlignedStore(Val: Ops[1], Addr: Ops[0]);
1525 }
1526 case NEON::BI__builtin_neon_vmovl_v: {
1527 llvm::FixedVectorType *DTy =
1528 llvm::FixedVectorType::getTruncatedElementVectorType(VTy);
1529 Ops[0] = Builder.CreateBitCast(V: Ops[0], DestTy: DTy);
1530 if (Usgn)
1531 return Builder.CreateZExt(V: Ops[0], DestTy: Ty, Name: "vmovl");
1532 return Builder.CreateSExt(V: Ops[0], DestTy: Ty, Name: "vmovl");
1533 }
1534 case NEON::BI__builtin_neon_vmovn_v: {
1535 llvm::FixedVectorType *QTy =
1536 llvm::FixedVectorType::getExtendedElementVectorType(VTy);
1537 Ops[0] = Builder.CreateBitCast(V: Ops[0], DestTy: QTy);
1538 return Builder.CreateTrunc(V: Ops[0], DestTy: Ty, Name: "vmovn");
1539 }
1540 case NEON::BI__builtin_neon_vmull_v:
    // FIXME: the integer vmull operations could be emitted in terms of pure
    // LLVM IR (2 exts followed by a mul). Unfortunately LLVM has a habit of
    // hoisting the exts outside loops. Until GlobalISel comes along and can
    // see through such movement, this leads to bad CodeGen, so we need an
    // intrinsic for now.
1546 Int = Usgn ? Intrinsic::arm_neon_vmullu : Intrinsic::arm_neon_vmulls;
1547 Int = Type.isPoly() ? (unsigned)Intrinsic::arm_neon_vmullp : Int;
1548 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: Ty), Ops, name: "vmull");
1549 case NEON::BI__builtin_neon_vpadal_v:
1550 case NEON::BI__builtin_neon_vpadalq_v: {
1551 // The source operand type has twice as many elements of half the size.
1552 unsigned EltBits = VTy->getElementType()->getPrimitiveSizeInBits();
1553 llvm::Type *EltTy =
1554 llvm::IntegerType::get(C&: getLLVMContext(), NumBits: EltBits / 2);
1555 auto *NarrowTy =
1556 llvm::FixedVectorType::get(ElementType: EltTy, NumElts: VTy->getNumElements() * 2);
1557 llvm::Type *Tys[2] = { Ty, NarrowTy };
1558 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys), Ops, name: NameHint);
1559 }
1560 case NEON::BI__builtin_neon_vpaddl_v:
1561 case NEON::BI__builtin_neon_vpaddlq_v: {
1562 // The source operand type has twice as many elements of half the size.
1563 unsigned EltBits = VTy->getElementType()->getPrimitiveSizeInBits();
1564 llvm::Type *EltTy = llvm::IntegerType::get(C&: getLLVMContext(), NumBits: EltBits / 2);
1565 auto *NarrowTy =
1566 llvm::FixedVectorType::get(ElementType: EltTy, NumElts: VTy->getNumElements() * 2);
1567 llvm::Type *Tys[2] = { Ty, NarrowTy };
1568 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys), Ops, name: "vpaddl");
1569 }
1570 case NEON::BI__builtin_neon_vqdmlal_v:
1571 case NEON::BI__builtin_neon_vqdmlsl_v: {
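    // This is split into two intrinsic calls: the widening multiply
    // (LLVMIntrinsic, e.g. vqdmull) is emitted first, and its result is then
    // combined with the accumulator via AltLLVMIntrinsic (a saturating add
    // for vqdmlal, a saturating subtract for vqdmlsl).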
1572 SmallVector<Value *, 2> MulOps(Ops.begin() + 1, Ops.end());
1573 Ops[1] =
1574 EmitNeonCall(F: CGM.getIntrinsic(IID: LLVMIntrinsic, Tys: Ty), Ops&: MulOps, name: "vqdmlal");
1575 Ops.resize(N: 2);
1576 return EmitNeonCall(F: CGM.getIntrinsic(IID: AltLLVMIntrinsic, Tys: Ty), Ops, name: NameHint);
1577 }
1578 case NEON::BI__builtin_neon_vqdmulhq_lane_v:
1579 case NEON::BI__builtin_neon_vqdmulh_lane_v:
1580 case NEON::BI__builtin_neon_vqrdmulhq_lane_v:
1581 case NEON::BI__builtin_neon_vqrdmulh_lane_v: {
1582 auto *RTy = cast<llvm::FixedVectorType>(Val: Ty);
1583 if (BuiltinID == NEON::BI__builtin_neon_vqdmulhq_lane_v ||
1584 BuiltinID == NEON::BI__builtin_neon_vqrdmulhq_lane_v)
1585 RTy = llvm::FixedVectorType::get(ElementType: RTy->getElementType(),
1586 NumElts: RTy->getNumElements() * 2);
1587 llvm::Type *Tys[2] = {
1588 RTy, GetNeonType(CGF: this, TypeFlags: NeonTypeFlags(Type.getEltType(), false,
1589 /*isQuad*/ false))};
1590 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys), Ops, name: NameHint);
1591 }
1592 case NEON::BI__builtin_neon_vqdmulhq_laneq_v:
1593 case NEON::BI__builtin_neon_vqdmulh_laneq_v:
1594 case NEON::BI__builtin_neon_vqrdmulhq_laneq_v:
1595 case NEON::BI__builtin_neon_vqrdmulh_laneq_v: {
1596 llvm::Type *Tys[2] = {
1597 Ty, GetNeonType(CGF: this, TypeFlags: NeonTypeFlags(Type.getEltType(), false,
1598 /*isQuad*/ true))};
1599 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys), Ops, name: NameHint);
1600 }
1601 case NEON::BI__builtin_neon_vqshl_n_v:
1602 case NEON::BI__builtin_neon_vqshlq_n_v:
1603 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: Ty), Ops, name: "vqshl_n",
1604 shift: 1, rightshift: false);
1605 case NEON::BI__builtin_neon_vqshlu_n_v:
1606 case NEON::BI__builtin_neon_vqshluq_n_v:
1607 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: Ty), Ops, name: "vqshlu_n",
1608 shift: 1, rightshift: false);
1609 case NEON::BI__builtin_neon_vrecpe_v:
1610 case NEON::BI__builtin_neon_vrecpeq_v:
1611 case NEON::BI__builtin_neon_vrsqrte_v:
1612 case NEON::BI__builtin_neon_vrsqrteq_v:
1613 Int = Ty->isFPOrFPVectorTy() ? LLVMIntrinsic : AltLLVMIntrinsic;
1614 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: Ty), Ops, name: NameHint);
1615 case NEON::BI__builtin_neon_vrndi_v:
1616 case NEON::BI__builtin_neon_vrndiq_v:
1617 Int = Builder.getIsFPConstrained()
1618 ? Intrinsic::experimental_constrained_nearbyint
1619 : Intrinsic::nearbyint;
1620 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: Ty), Ops, name: NameHint);
1621 case NEON::BI__builtin_neon_vrshr_n_v:
1622 case NEON::BI__builtin_neon_vrshrq_n_v:
1623 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: Ty), Ops, name: "vrshr_n",
1624 shift: 1, rightshift: true);
1625 case NEON::BI__builtin_neon_vsha512hq_u64:
1626 case NEON::BI__builtin_neon_vsha512h2q_u64:
1627 case NEON::BI__builtin_neon_vsha512su0q_u64:
1628 case NEON::BI__builtin_neon_vsha512su1q_u64: {
1629 Function *F = CGM.getIntrinsic(IID: Int);
1630 return EmitNeonCall(F, Ops, name: "");
1631 }
1632 case NEON::BI__builtin_neon_vshl_n_v:
1633 case NEON::BI__builtin_neon_vshlq_n_v:
1634 Ops[1] = EmitNeonShiftVector(V: Ops[1], Ty, neg: false);
1635 return Builder.CreateShl(LHS: Builder.CreateBitCast(V: Ops[0],DestTy: Ty), RHS: Ops[1],
1636 Name: "vshl_n");
1637 case NEON::BI__builtin_neon_vshll_n_v: {
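    // A widening shift needs no target intrinsic: e.g. vshll_n_s8(v, 3)
    // becomes a sext of <8 x i8> to <8 x i16> followed by a shl by the
    // splatted constant 3.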
1638 llvm::FixedVectorType *SrcTy =
1639 llvm::FixedVectorType::getTruncatedElementVectorType(VTy);
1640 Ops[0] = Builder.CreateBitCast(V: Ops[0], DestTy: SrcTy);
1641 if (Usgn)
1642 Ops[0] = Builder.CreateZExt(V: Ops[0], DestTy: VTy);
1643 else
1644 Ops[0] = Builder.CreateSExt(V: Ops[0], DestTy: VTy);
1645 Ops[1] = EmitNeonShiftVector(V: Ops[1], Ty: VTy, neg: false);
1646 return Builder.CreateShl(LHS: Ops[0], RHS: Ops[1], Name: "vshll_n");
1647 }
1648 case NEON::BI__builtin_neon_vshrn_n_v: {
1649 llvm::FixedVectorType *SrcTy =
1650 llvm::FixedVectorType::getExtendedElementVectorType(VTy);
1651 Ops[0] = Builder.CreateBitCast(V: Ops[0], DestTy: SrcTy);
1652 Ops[1] = EmitNeonShiftVector(V: Ops[1], Ty: SrcTy, neg: false);
1653 if (Usgn)
1654 Ops[0] = Builder.CreateLShr(LHS: Ops[0], RHS: Ops[1]);
1655 else
1656 Ops[0] = Builder.CreateAShr(LHS: Ops[0], RHS: Ops[1]);
1657 return Builder.CreateTrunc(V: Ops[0], DestTy: Ty, Name: "vshrn_n");
1658 }
1659 case NEON::BI__builtin_neon_vshr_n_v:
1660 case NEON::BI__builtin_neon_vshrq_n_v:
1661 return EmitNeonRShiftImm(Vec: Ops[0], Shift: Ops[1], Ty, usgn: Usgn, name: "vshr_n");
1662 case NEON::BI__builtin_neon_vst1_v:
1663 case NEON::BI__builtin_neon_vst1q_v:
1664 case NEON::BI__builtin_neon_vst2_v:
1665 case NEON::BI__builtin_neon_vst2q_v:
1666 case NEON::BI__builtin_neon_vst3_v:
1667 case NEON::BI__builtin_neon_vst3q_v:
1668 case NEON::BI__builtin_neon_vst4_v:
1669 case NEON::BI__builtin_neon_vst4q_v:
1670 case NEON::BI__builtin_neon_vst2_lane_v:
1671 case NEON::BI__builtin_neon_vst2q_lane_v:
1672 case NEON::BI__builtin_neon_vst3_lane_v:
1673 case NEON::BI__builtin_neon_vst3q_lane_v:
1674 case NEON::BI__builtin_neon_vst4_lane_v:
1675 case NEON::BI__builtin_neon_vst4q_lane_v: {
1676 llvm::Type *Tys[] = {Int8PtrTy, Ty};
1677 Ops.push_back(Elt: getAlignmentValue32(PtrOp0));
1678 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys), Ops, name: "");
1679 }
1680 case NEON::BI__builtin_neon_vsm3partw1q_u32:
1681 case NEON::BI__builtin_neon_vsm3partw2q_u32:
1682 case NEON::BI__builtin_neon_vsm3ss1q_u32:
1683 case NEON::BI__builtin_neon_vsm4ekeyq_u32:
1684 case NEON::BI__builtin_neon_vsm4eq_u32: {
1685 Function *F = CGM.getIntrinsic(IID: Int);
1686 return EmitNeonCall(F, Ops, name: "");
1687 }
1688 case NEON::BI__builtin_neon_vsm3tt1aq_u32:
1689 case NEON::BI__builtin_neon_vsm3tt1bq_u32:
1690 case NEON::BI__builtin_neon_vsm3tt2aq_u32:
1691 case NEON::BI__builtin_neon_vsm3tt2bq_u32: {
1692 Function *F = CGM.getIntrinsic(IID: Int);
1693 Ops[3] = Builder.CreateZExt(V: Ops[3], DestTy: Int64Ty);
1694 return EmitNeonCall(F, Ops, name: "");
1695 }
1696 case NEON::BI__builtin_neon_vst1_x2_v:
1697 case NEON::BI__builtin_neon_vst1q_x2_v:
1698 case NEON::BI__builtin_neon_vst1_x3_v:
1699 case NEON::BI__builtin_neon_vst1q_x3_v:
1700 case NEON::BI__builtin_neon_vst1_x4_v:
1701 case NEON::BI__builtin_neon_vst1q_x4_v: {
    // TODO: Currently in AArch32 mode the pointer operand comes first, whereas
    // in AArch64 it comes last. We may want to standardize on one or the other.
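    // Roughly, the AArch64 form is @llvm.aarch64.neon.st1xN(v1, ..., vN, ptr)
    // while the AArch32 form is @llvm.arm.neon.vst1xN(ptr, v1, ..., vN); the
    // std::rotate below moves the pointer from the front of Ops to the back
    // for AArch64.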
1704 if (Arch == llvm::Triple::aarch64 || Arch == llvm::Triple::aarch64_be ||
1705 Arch == llvm::Triple::aarch64_32) {
1706 llvm::Type *Tys[2] = {VTy, DefaultPtrTy};
1707 std::rotate(first: Ops.begin(), middle: Ops.begin() + 1, last: Ops.end());
1708 return EmitNeonCall(F: CGM.getIntrinsic(IID: LLVMIntrinsic, Tys), Ops, name: "");
1709 }
1710 llvm::Type *Tys[2] = {DefaultPtrTy, VTy};
1711 return EmitNeonCall(F: CGM.getIntrinsic(IID: LLVMIntrinsic, Tys), Ops, name: "");
1712 }
1713 case NEON::BI__builtin_neon_vsubhn_v: {
1714 llvm::FixedVectorType *SrcTy =
1715 llvm::FixedVectorType::getExtendedElementVectorType(VTy);
1716
    // %diff = sub <4 x i32> %lhs, %rhs
1718 Ops[0] = Builder.CreateBitCast(V: Ops[0], DestTy: SrcTy);
1719 Ops[1] = Builder.CreateBitCast(V: Ops[1], DestTy: SrcTy);
1720 Ops[0] = Builder.CreateSub(LHS: Ops[0], RHS: Ops[1], Name: "vsubhn");
1721
    // %high = lshr <4 x i32> %diff, <i32 16, i32 16, i32 16, i32 16>
1723 Constant *ShiftAmt =
1724 ConstantInt::get(Ty: SrcTy, V: SrcTy->getScalarSizeInBits() / 2);
1725 Ops[0] = Builder.CreateLShr(LHS: Ops[0], RHS: ShiftAmt, Name: "vsubhn");
1726
1727 // %res = trunc <4 x i32> %high to <4 x i16>
1728 return Builder.CreateTrunc(V: Ops[0], DestTy: VTy, Name: "vsubhn");
1729 }
1730 case NEON::BI__builtin_neon_vtrn_v:
1731 case NEON::BI__builtin_neon_vtrnq_v: {
1732 Ops[1] = Builder.CreateBitCast(V: Ops[1], DestTy: Ty);
1733 Ops[2] = Builder.CreateBitCast(V: Ops[2], DestTy: Ty);
1734 Value *SV = nullptr;
1735
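    // Illustration: with <4 x i32> operands, the vi == 0 iteration stores the
    // shuffle mask <0, 4, 2, 6> and vi == 1 stores <1, 5, 3, 7>, i.e. the two
    // interleaved halves of a transpose.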
1736 for (unsigned vi = 0; vi != 2; ++vi) {
1737 SmallVector<int, 16> Indices;
1738 for (unsigned i = 0, e = VTy->getNumElements(); i != e; i += 2) {
1739 Indices.push_back(Elt: i+vi);
1740 Indices.push_back(Elt: i+e+vi);
1741 }
1742 Value *Addr = Builder.CreateConstInBoundsGEP1_32(Ty, Ptr: Ops[0], Idx0: vi);
1743 SV = Builder.CreateShuffleVector(V1: Ops[1], V2: Ops[2], Mask: Indices, Name: "vtrn");
1744 SV = Builder.CreateDefaultAlignedStore(Val: SV, Addr);
1745 }
1746 return SV;
1747 }
1748 case NEON::BI__builtin_neon_vtst_v:
1749 case NEON::BI__builtin_neon_vtstq_v: {
1750 Ops[0] = Builder.CreateBitCast(V: Ops[0], DestTy: Ty);
1751 Ops[1] = Builder.CreateBitCast(V: Ops[1], DestTy: Ty);
1752 Ops[0] = Builder.CreateAnd(LHS: Ops[0], RHS: Ops[1]);
1753 Ops[0] = Builder.CreateICmp(P: ICmpInst::ICMP_NE, LHS: Ops[0],
1754 RHS: ConstantAggregateZero::get(Ty));
1755 return Builder.CreateSExt(V: Ops[0], DestTy: Ty, Name: "vtst");
1756 }
1757 case NEON::BI__builtin_neon_vuzp_v:
1758 case NEON::BI__builtin_neon_vuzpq_v: {
1759 Ops[1] = Builder.CreateBitCast(V: Ops[1], DestTy: Ty);
1760 Ops[2] = Builder.CreateBitCast(V: Ops[2], DestTy: Ty);
1761 Value *SV = nullptr;
1762
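    // Illustration: with <4 x i32> operands, vi == 0 stores the mask
    // <0, 2, 4, 6> (the even elements) and vi == 1 stores <1, 3, 5, 7>
    // (the odd elements).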
1763 for (unsigned vi = 0; vi != 2; ++vi) {
1764 SmallVector<int, 16> Indices;
1765 for (unsigned i = 0, e = VTy->getNumElements(); i != e; ++i)
1766 Indices.push_back(Elt: 2*i+vi);
1767
1768 Value *Addr = Builder.CreateConstInBoundsGEP1_32(Ty, Ptr: Ops[0], Idx0: vi);
1769 SV = Builder.CreateShuffleVector(V1: Ops[1], V2: Ops[2], Mask: Indices, Name: "vuzp");
1770 SV = Builder.CreateDefaultAlignedStore(Val: SV, Addr);
1771 }
1772 return SV;
1773 }
1774 case NEON::BI__builtin_neon_vxarq_u64: {
1775 Function *F = CGM.getIntrinsic(IID: Int);
1776 Ops[2] = Builder.CreateZExt(V: Ops[2], DestTy: Int64Ty);
1777 return EmitNeonCall(F, Ops, name: "");
1778 }
1779 case NEON::BI__builtin_neon_vzip_v:
1780 case NEON::BI__builtin_neon_vzipq_v: {
1781 Ops[1] = Builder.CreateBitCast(V: Ops[1], DestTy: Ty);
1782 Ops[2] = Builder.CreateBitCast(V: Ops[2], DestTy: Ty);
1783 Value *SV = nullptr;
1784
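    // Illustration: with <4 x i32> operands, vi == 0 stores the mask
    // <0, 4, 1, 5> and vi == 1 stores <2, 6, 3, 7>, interleaving the low and
    // high halves of the two inputs.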
1785 for (unsigned vi = 0; vi != 2; ++vi) {
1786 SmallVector<int, 16> Indices;
1787 for (unsigned i = 0, e = VTy->getNumElements(); i != e; i += 2) {
1788 Indices.push_back(Elt: (i + vi*e) >> 1);
1789 Indices.push_back(Elt: ((i + vi*e) >> 1)+e);
1790 }
1791 Value *Addr = Builder.CreateConstInBoundsGEP1_32(Ty, Ptr: Ops[0], Idx0: vi);
1792 SV = Builder.CreateShuffleVector(V1: Ops[1], V2: Ops[2], Mask: Indices, Name: "vzip");
1793 SV = Builder.CreateDefaultAlignedStore(Val: SV, Addr);
1794 }
1795 return SV;
1796 }
1797 case NEON::BI__builtin_neon_vdot_s32:
1798 case NEON::BI__builtin_neon_vdot_u32:
1799 case NEON::BI__builtin_neon_vdotq_s32:
1800 case NEON::BI__builtin_neon_vdotq_u32: {
1801 auto *InputTy =
1802 llvm::FixedVectorType::get(ElementType: Int8Ty, NumElts: Ty->getPrimitiveSizeInBits() / 8);
1803 llvm::Type *Tys[2] = { Ty, InputTy };
1804 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys), Ops, name: "vdot");
1805 }
1806 case NEON::BI__builtin_neon_vfmlal_low_f16:
1807 case NEON::BI__builtin_neon_vfmlalq_low_f16: {
1808 auto *InputTy =
1809 llvm::FixedVectorType::get(ElementType: HalfTy, NumElts: Ty->getPrimitiveSizeInBits() / 16);
1810 llvm::Type *Tys[2] = { Ty, InputTy };
1811 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys), Ops, name: "vfmlal_low");
1812 }
1813 case NEON::BI__builtin_neon_vfmlsl_low_f16:
1814 case NEON::BI__builtin_neon_vfmlslq_low_f16: {
1815 auto *InputTy =
1816 llvm::FixedVectorType::get(ElementType: HalfTy, NumElts: Ty->getPrimitiveSizeInBits() / 16);
1817 llvm::Type *Tys[2] = { Ty, InputTy };
1818 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys), Ops, name: "vfmlsl_low");
1819 }
1820 case NEON::BI__builtin_neon_vfmlal_high_f16:
1821 case NEON::BI__builtin_neon_vfmlalq_high_f16: {
1822 auto *InputTy =
1823 llvm::FixedVectorType::get(ElementType: HalfTy, NumElts: Ty->getPrimitiveSizeInBits() / 16);
1824 llvm::Type *Tys[2] = { Ty, InputTy };
1825 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys), Ops, name: "vfmlal_high");
1826 }
1827 case NEON::BI__builtin_neon_vfmlsl_high_f16:
1828 case NEON::BI__builtin_neon_vfmlslq_high_f16: {
1829 auto *InputTy =
1830 llvm::FixedVectorType::get(ElementType: HalfTy, NumElts: Ty->getPrimitiveSizeInBits() / 16);
1831 llvm::Type *Tys[2] = { Ty, InputTy };
1832 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys), Ops, name: "vfmlsl_high");
1833 }
1834 case NEON::BI__builtin_neon_vmmlaq_s32:
1835 case NEON::BI__builtin_neon_vmmlaq_u32: {
1836 auto *InputTy =
1837 llvm::FixedVectorType::get(ElementType: Int8Ty, NumElts: Ty->getPrimitiveSizeInBits() / 8);
1838 llvm::Type *Tys[2] = { Ty, InputTy };
1839 return EmitNeonCall(F: CGM.getIntrinsic(IID: LLVMIntrinsic, Tys), Ops, name: "vmmla");
1840 }
1841 case NEON::BI__builtin_neon_vusmmlaq_s32: {
1842 auto *InputTy =
1843 llvm::FixedVectorType::get(ElementType: Int8Ty, NumElts: Ty->getPrimitiveSizeInBits() / 8);
1844 llvm::Type *Tys[2] = { Ty, InputTy };
1845 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys), Ops, name: "vusmmla");
1846 }
1847 case NEON::BI__builtin_neon_vusdot_s32:
1848 case NEON::BI__builtin_neon_vusdotq_s32: {
1849 auto *InputTy =
1850 llvm::FixedVectorType::get(ElementType: Int8Ty, NumElts: Ty->getPrimitiveSizeInBits() / 8);
1851 llvm::Type *Tys[2] = { Ty, InputTy };
1852 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys), Ops, name: "vusdot");
1853 }
1854 case NEON::BI__builtin_neon_vbfdot_f32:
1855 case NEON::BI__builtin_neon_vbfdotq_f32: {
1856 llvm::Type *InputTy =
1857 llvm::FixedVectorType::get(ElementType: BFloatTy, NumElts: Ty->getPrimitiveSizeInBits() / 16);
1858 llvm::Type *Tys[2] = { Ty, InputTy };
1859 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys), Ops, name: "vbfdot");
1860 }
1861 case NEON::BI__builtin_neon___a32_vcvt_bf16_f32: {
1862 llvm::Type *Tys[1] = { Ty };
1863 Function *F = CGM.getIntrinsic(IID: Int, Tys);
1864 return EmitNeonCall(F, Ops, name: "vcvtfp2bf");
1865 }
1866
1867 }
1868
1869 assert(Int && "Expected valid intrinsic number");
1870
  // Determine the type(s) of this overloaded NEON intrinsic.
1872 Function *F = LookupNeonLLVMIntrinsic(IntrinsicID: Int, Modifier, ArgType: Ty, E);
1873
1874 Value *Result = EmitNeonCall(F, Ops, name: NameHint);
1875 llvm::Type *ResultType = ConvertType(T: E->getType());
  // Cast the one-element vector result of the AArch64 intrinsic back to the
  // scalar type expected by the builtin.
1878 return Builder.CreateBitCast(V: Result, DestTy: ResultType, Name: NameHint);
1879}
1880
1881Value *
1882CodeGenFunction::EmitAArch64CompareBuiltinExpr(Value *Op, llvm::Type *Ty,
1883 const CmpInst::Predicate Pred,
1884 const Twine &Name) {
1885
1886 if (isa<FixedVectorType>(Val: Ty)) {
1887 // Vector types are cast to i8 vectors. Recover original type.
1888 Op = Builder.CreateBitCast(V: Op, DestTy: Ty);
1889 }
1890
1891 Constant *zero = Constant::getNullValue(Ty: Op->getType());
1892
1893 if (CmpInst::isFPPredicate(P: Pred)) {
1894 if (Pred == CmpInst::FCMP_OEQ)
1895 Op = Builder.CreateFCmp(P: Pred, LHS: Op, RHS: zero);
1896 else
1897 Op = Builder.CreateFCmpS(P: Pred, LHS: Op, RHS: zero);
1898 } else {
1899 Op = Builder.CreateICmp(P: Pred, LHS: Op, RHS: zero);
1900 }
1901
1902 llvm::Type *ResTy = Ty;
1903 if (auto *VTy = dyn_cast<FixedVectorType>(Val: Ty))
1904 ResTy = FixedVectorType::get(
1905 ElementType: IntegerType::get(C&: getLLVMContext(), NumBits: VTy->getScalarSizeInBits()),
1906 NumElts: VTy->getNumElements());
1907
1908 return Builder.CreateSExt(V: Op, DestTy: ResTy, Name);
1909}
1910
1911static Value *packTBLDVectorList(CodeGenFunction &CGF, ArrayRef<Value *> Ops,
1912 Value *ExtOp, Value *IndexOp,
1913 llvm::Type *ResTy, unsigned IntID,
1914 const char *Name) {
1915 SmallVector<Value *, 2> TblOps;
1916 if (ExtOp)
1917 TblOps.push_back(Elt: ExtOp);
1918
  // Build a vector containing sequential numbers like (0, 1, 2, ..., 15).
1920 SmallVector<int, 16> Indices;
1921 auto *TblTy = cast<llvm::FixedVectorType>(Val: Ops[0]->getType());
1922 for (unsigned i = 0, e = TblTy->getNumElements(); i != e; ++i) {
1923 Indices.push_back(Elt: 2*i);
1924 Indices.push_back(Elt: 2*i+1);
1925 }
1926
1927 int PairPos = 0, End = Ops.size() - 1;
1928 while (PairPos < End) {
1929 TblOps.push_back(Elt: CGF.Builder.CreateShuffleVector(V1: Ops[PairPos],
1930 V2: Ops[PairPos+1], Mask: Indices,
1931 Name));
1932 PairPos += 2;
1933 }
1934
  // If there's an odd number of 64-bit lookup tables, fill the high 64 bits
  // of the final 128-bit lookup table with zero.
1937 if (PairPos == End) {
1938 Value *ZeroTbl = ConstantAggregateZero::get(Ty: TblTy);
1939 TblOps.push_back(Elt: CGF.Builder.CreateShuffleVector(V1: Ops[PairPos],
1940 V2: ZeroTbl, Mask: Indices, Name));
1941 }
1942
1943 Function *TblF;
1944 TblOps.push_back(Elt: IndexOp);
1945 TblF = CGF.CGM.getIntrinsic(IID: IntID, Tys: ResTy);
1946
1947 return CGF.EmitNeonCall(F: TblF, Ops&: TblOps, name: Name);
1948}
1949
1950Value *CodeGenFunction::GetValueForARMHint(unsigned BuiltinID) {
1951 unsigned Value;
1952 switch (BuiltinID) {
1953 default:
1954 return nullptr;
1955 case clang::ARM::BI__builtin_arm_nop:
1956 Value = 0;
1957 break;
1958 case clang::ARM::BI__builtin_arm_yield:
1959 case clang::ARM::BI__yield:
1960 Value = 1;
1961 break;
1962 case clang::ARM::BI__builtin_arm_wfe:
1963 case clang::ARM::BI__wfe:
1964 Value = 2;
1965 break;
1966 case clang::ARM::BI__builtin_arm_wfi:
1967 case clang::ARM::BI__wfi:
1968 Value = 3;
1969 break;
1970 case clang::ARM::BI__builtin_arm_sev:
1971 case clang::ARM::BI__sev:
1972 Value = 4;
1973 break;
1974 case clang::ARM::BI__builtin_arm_sevl:
1975 case clang::ARM::BI__sevl:
1976 Value = 5;
1977 break;
1978 }
1979
1980 return Builder.CreateCall(Callee: CGM.getIntrinsic(IID: Intrinsic::arm_hint),
1981 Args: llvm::ConstantInt::get(Ty: Int32Ty, V: Value));
1982}
1983
1984enum SpecialRegisterAccessKind {
1985 NormalRead,
1986 VolatileRead,
1987 Write,
1988};
1989
// Generates the IR for a read/write special register builtin. ValueType is
// the type of the value that is to be written or read, and RegisterType is
// the type of the register being written to or read from.
1993static Value *EmitSpecialRegisterBuiltin(CodeGenFunction &CGF,
1994 const CallExpr *E,
1995 llvm::Type *RegisterType,
1996 llvm::Type *ValueType,
1997 SpecialRegisterAccessKind AccessKind,
1998 StringRef SysReg = "") {
  // The read and write register intrinsics only support 32-, 64- and 128-bit
  // operations.
2000 assert((RegisterType->isIntegerTy(32) || RegisterType->isIntegerTy(64) ||
2001 RegisterType->isIntegerTy(128)) &&
2002 "Unsupported size for register.");
2003
2004 CodeGen::CGBuilderTy &Builder = CGF.Builder;
2005 CodeGen::CodeGenModule &CGM = CGF.CGM;
2006 LLVMContext &Context = CGM.getLLVMContext();
2007
2008 if (SysReg.empty()) {
2009 const Expr *SysRegStrExpr = E->getArg(Arg: 0)->IgnoreParenCasts();
2010 SysReg = cast<clang::StringLiteral>(Val: SysRegStrExpr)->getString();
2011 }
2012
2013 llvm::Metadata *Ops[] = { llvm::MDString::get(Context, Str: SysReg) };
2014 llvm::MDNode *RegName = llvm::MDNode::get(Context, MDs: Ops);
2015 llvm::Value *Metadata = llvm::MetadataAsValue::get(Context, MD: RegName);
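  // For example, __builtin_arm_rsr("cpsr") reaches this point with
  // SysReg == "cpsr" and lowers to roughly:
  //   %0 = call i32 @llvm.read_volatile_register.i32(metadata !0)
  // where !0 = !{!"cpsr"}.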
2016
2017 llvm::Type *Types[] = { RegisterType };
2018
2019 bool MixedTypes = RegisterType->isIntegerTy(Bitwidth: 64) && ValueType->isIntegerTy(Bitwidth: 32);
2020 assert(!(RegisterType->isIntegerTy(32) && ValueType->isIntegerTy(64))
2021 && "Can't fit 64-bit value in 32-bit register");
2022
2023 if (AccessKind != Write) {
2024 assert(AccessKind == NormalRead || AccessKind == VolatileRead);
2025 llvm::Function *F = CGM.getIntrinsic(
2026 IID: AccessKind == VolatileRead ? Intrinsic::read_volatile_register
2027 : Intrinsic::read_register,
2028 Tys: Types);
2029 llvm::Value *Call = Builder.CreateCall(Callee: F, Args: Metadata);
2030
2031 if (MixedTypes)
2032 // Read into 64 bit register and then truncate result to 32 bit.
2033 return Builder.CreateTrunc(V: Call, DestTy: ValueType);
2034
2035 if (ValueType->isPointerTy())
2036 // Have i32/i64 result (Call) but want to return a VoidPtrTy (i8*).
2037 return Builder.CreateIntToPtr(V: Call, DestTy: ValueType);
2038
2039 return Call;
2040 }
2041
2042 llvm::Function *F = CGM.getIntrinsic(IID: Intrinsic::write_register, Tys: Types);
2043 llvm::Value *ArgValue = CGF.EmitScalarExpr(E: E->getArg(Arg: 1));
2044 if (MixedTypes) {
2045 // Extend 32 bit write value to 64 bit to pass to write.
2046 ArgValue = Builder.CreateZExt(V: ArgValue, DestTy: RegisterType);
2047 return Builder.CreateCall(Callee: F, Args: { Metadata, ArgValue });
2048 }
2049
2050 if (ValueType->isPointerTy()) {
2051 // Have VoidPtrTy ArgValue but want to return an i32/i64.
2052 ArgValue = Builder.CreatePtrToInt(V: ArgValue, DestTy: RegisterType);
2053 return Builder.CreateCall(Callee: F, Args: { Metadata, ArgValue });
2054 }
2055
2056 return Builder.CreateCall(Callee: F, Args: { Metadata, ArgValue });
2057}
2058
2059static Value *EmitRangePrefetchBuiltin(CodeGenFunction &CGF, unsigned BuiltinID,
2060 const CallExpr *E) {
2061 CodeGen::CGBuilderTy &Builder = CGF.Builder;
2062 CodeGen::CodeGenModule &CGM = CGF.CGM;
2063 SmallVector<llvm::Value *, 4> Ops;
2064
2065 auto getIntArg = [&](unsigned ArgNo) {
2066 Expr::EvalResult Result;
2067 if (!E->getArg(Arg: ArgNo)->EvaluateAsInt(Result, Ctx: CGM.getContext()))
2068 llvm_unreachable("Expected constant argument to range prefetch.");
2069 return Result.Val.getInt().getExtValue();
2070 };
2071
2072 Ops.push_back(Elt: CGF.EmitScalarExpr(E: E->getArg(Arg: 0))); /*Addr*/
2073 Ops.push_back(Elt: CGF.EmitScalarExpr(E: E->getArg(Arg: 1))); /*Access Kind*/
2074 Ops.push_back(Elt: CGF.EmitScalarExpr(E: E->getArg(Arg: 2))); /*Policy*/
2075
2076 if (BuiltinID == clang::AArch64::BI__builtin_arm_range_prefetch_x) {
2077 auto Length = getIntArg(3);
2078 auto Count = getIntArg(4) - 1;
2079 auto Stride = getIntArg(5);
2080 auto Distance = getIntArg(6);
2081
2082 // Map ReuseDistance given in bytes to four bits representing decreasing
2083 // powers of two in the range 512MiB (0b0001) to 32KiB (0b1111). Values
2084 // are rounded up to the nearest power of 2, starting at 32KiB. Any value
2085 // over the maximum is represented by 0 (distance not known).
2086 if (Distance > 0) {
2087 Distance = llvm::Log2_32_Ceil(Value: Distance);
2088 if (Distance < 15)
2089 Distance = 15;
2090 else if (Distance > 29)
2091 Distance = 0;
2092 else
2093 Distance = 30 - Distance;
2094 }
2095
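    // Worked example (illustrative): a reuse distance of 1MiB is 2^20 bytes,
    // so Log2_32_Ceil gives 20 and the encoded value is 30 - 20 = 10.
    //
    // The 64-bit metadata operand built below packs the parameters as:
    //   [63:60] distance, [59:38] stride, [37:22] count, [21:0] length.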
2096 uint64_t Mask22 = (1ULL << 22) - 1;
2097 uint64_t Mask16 = (1ULL << 16) - 1;
2098 uint64_t Metadata = (Distance << 60) | ((Stride & Mask22) << 38) |
2099 ((Count & Mask16) << 22) | (Length & Mask22);
2100
2101 Ops.push_back(Elt: llvm::ConstantInt::get(Ty: Builder.getInt64Ty(), V: Metadata));
2102 } else
2103 Ops.push_back(Elt: CGF.EmitScalarExpr(E: E->getArg(Arg: 3)));
2104
2105 return Builder.CreateCall(Callee: CGM.getIntrinsic(IID: Intrinsic::aarch64_range_prefetch),
2106 Args: Ops);
2107}
2108
2109/// Return true if BuiltinID is an overloaded Neon intrinsic with an extra
2110/// argument that specifies the vector type. The additional argument is meant
2111/// for Sema checking (see `CheckNeonBuiltinFunctionCall`) and this function
2112/// should be kept consistent with the logic in Sema.
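/// For example, the overloaded __builtin_neon_vld1q_v carries a trailing
/// integer constant that encodes the element type and vector width.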
2113/// TODO: Make this return false for SISD builtins.
2114static bool HasExtraNeonArgument(unsigned BuiltinID) {
  // These are required by the headers included below, but are unused in this
  // particular function.
2117 [[maybe_unused]] int PtrArgNum = -1;
2118 [[maybe_unused]] bool HasConstPtr = false;
2119
  // The mask encodes the type. We don't care about the actual value; we just
  // check whether it has been set.
2122 uint64_t mask = 0;
2123 switch (BuiltinID) {
2124#define GET_NEON_OVERLOAD_CHECK
2125#include "clang/Basic/arm_fp16.inc"
2126#include "clang/Basic/arm_neon.inc"
2127#undef GET_NEON_OVERLOAD_CHECK
  // Non-NEON builtins for controlling VFP that take an extra argument
  // discriminating the type.
2130 case ARM::BI__builtin_arm_vcvtr_f:
2131 case ARM::BI__builtin_arm_vcvtr_d:
2132 mask = 1;
2133 }
2134
  return mask != 0;
2139}
2140
2141Value *CodeGenFunction::EmitARMBuiltinExpr(unsigned BuiltinID,
2142 const CallExpr *E,
2143 ReturnValueSlot ReturnValue,
2144 llvm::Triple::ArchType Arch) {
2145 if (auto Hint = GetValueForARMHint(BuiltinID))
2146 return Hint;
2147
2148 if (BuiltinID == clang::ARM::BI__emit) {
2149 bool IsThumb = getTarget().getTriple().getArch() == llvm::Triple::thumb;
2150 llvm::FunctionType *FTy =
2151 llvm::FunctionType::get(Result: VoidTy, /*Variadic=*/isVarArg: false);
2152
2153 Expr::EvalResult Result;
2154 if (!E->getArg(Arg: 0)->EvaluateAsInt(Result, Ctx: CGM.getContext()))
2155 llvm_unreachable("Sema will ensure that the parameter is constant");
2156
2157 llvm::APSInt Value = Result.Val.getInt();
2158 uint64_t ZExtValue = Value.zextOrTrunc(width: IsThumb ? 16 : 32).getZExtValue();
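    // For instance, __emit(0xbf00) in Thumb mode produces the inline asm
    // ".inst.n 0xbf00", the T1 NOP encoding.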
2159
2160 llvm::InlineAsm *Emit =
2161 IsThumb ? InlineAsm::get(Ty: FTy, AsmString: ".inst.n 0x" + utohexstr(X: ZExtValue), Constraints: "",
2162 /*hasSideEffects=*/true)
2163 : InlineAsm::get(Ty: FTy, AsmString: ".inst 0x" + utohexstr(X: ZExtValue), Constraints: "",
2164 /*hasSideEffects=*/true);
2165
2166 return Builder.CreateCall(Callee: Emit);
2167 }
2168
2169 if (BuiltinID == clang::ARM::BI__builtin_arm_dbg) {
2170 Value *Option = EmitScalarExpr(E: E->getArg(Arg: 0));
2171 return Builder.CreateCall(Callee: CGM.getIntrinsic(IID: Intrinsic::arm_dbg), Args: Option);
2172 }
2173
2174 if (BuiltinID == clang::ARM::BI__builtin_arm_prefetch) {
2175 Value *Address = EmitScalarExpr(E: E->getArg(Arg: 0));
2176 Value *RW = EmitScalarExpr(E: E->getArg(Arg: 1));
2177 Value *IsData = EmitScalarExpr(E: E->getArg(Arg: 2));
2178
    // Locality is not supported on the ARM target.
2180 Value *Locality = llvm::ConstantInt::get(Ty: Int32Ty, V: 3);
2181
2182 Function *F = CGM.getIntrinsic(IID: Intrinsic::prefetch, Tys: Address->getType());
2183 return Builder.CreateCall(Callee: F, Args: {Address, RW, Locality, IsData});
2184 }
2185
2186 if (BuiltinID == clang::ARM::BI__builtin_arm_rbit) {
2187 llvm::Value *Arg = EmitScalarExpr(E: E->getArg(Arg: 0));
2188 return Builder.CreateCall(
2189 Callee: CGM.getIntrinsic(IID: Intrinsic::bitreverse, Tys: Arg->getType()), Args: Arg, Name: "rbit");
2190 }
2191
2192 if (BuiltinID == clang::ARM::BI__builtin_arm_clz ||
2193 BuiltinID == clang::ARM::BI__builtin_arm_clz64) {
2194 llvm::Value *Arg = EmitScalarExpr(E: E->getArg(Arg: 0));
2195 Function *F = CGM.getIntrinsic(IID: Intrinsic::ctlz, Tys: Arg->getType());
2196 Value *Res = Builder.CreateCall(Callee: F, Args: {Arg, Builder.getInt1(V: false)});
2197 if (BuiltinID == clang::ARM::BI__builtin_arm_clz64)
2198 Res = Builder.CreateTrunc(V: Res, DestTy: Builder.getInt32Ty());
2199 return Res;
2200 }
2201
  if (BuiltinID == clang::ARM::BI__builtin_arm_cls) {
2204 llvm::Value *Arg = EmitScalarExpr(E: E->getArg(Arg: 0));
2205 return Builder.CreateCall(Callee: CGM.getIntrinsic(IID: Intrinsic::arm_cls), Args: Arg, Name: "cls");
2206 }
2207 if (BuiltinID == clang::ARM::BI__builtin_arm_cls64) {
2208 llvm::Value *Arg = EmitScalarExpr(E: E->getArg(Arg: 0));
2209 return Builder.CreateCall(Callee: CGM.getIntrinsic(IID: Intrinsic::arm_cls64), Args: Arg,
2210 Name: "cls");
2211 }
2212
2213 if (BuiltinID == clang::ARM::BI__clear_cache) {
2214 assert(E->getNumArgs() == 2 && "__clear_cache takes 2 arguments");
2215 const FunctionDecl *FD = E->getDirectCallee();
2216 Value *Ops[2];
2217 for (unsigned i = 0; i < 2; i++)
2218 Ops[i] = EmitScalarExpr(E: E->getArg(Arg: i));
2219 llvm::Type *Ty = CGM.getTypes().ConvertType(T: FD->getType());
2220 llvm::FunctionType *FTy = cast<llvm::FunctionType>(Val: Ty);
2221 StringRef Name = FD->getName();
2222 return EmitNounwindRuntimeCall(callee: CGM.CreateRuntimeFunction(Ty: FTy, Name), args: Ops);
2223 }
2224
2225 if (BuiltinID == clang::ARM::BI__builtin_arm_mcrr ||
2226 BuiltinID == clang::ARM::BI__builtin_arm_mcrr2) {
2227 Function *F;
2228
2229 switch (BuiltinID) {
2230 default: llvm_unreachable("unexpected builtin");
2231 case clang::ARM::BI__builtin_arm_mcrr:
2232 F = CGM.getIntrinsic(IID: Intrinsic::arm_mcrr);
2233 break;
2234 case clang::ARM::BI__builtin_arm_mcrr2:
2235 F = CGM.getIntrinsic(IID: Intrinsic::arm_mcrr2);
2236 break;
2237 }
2238
    // The MCRR{2} instruction has 5 operands, but the intrinsic has only 4
    // because Rt and Rt2 are represented as a single unsigned 64-bit integer
    // in the intrinsic definition, even though internally they are two 32-bit
    // integers.
2245
2246 Value *Coproc = EmitScalarExpr(E: E->getArg(Arg: 0));
2247 Value *Opc1 = EmitScalarExpr(E: E->getArg(Arg: 1));
2248 Value *RtAndRt2 = EmitScalarExpr(E: E->getArg(Arg: 2));
2249 Value *CRm = EmitScalarExpr(E: E->getArg(Arg: 3));
2250
2251 Value *C1 = llvm::ConstantInt::get(Ty: Int64Ty, V: 32);
2252 Value *Rt = Builder.CreateTruncOrBitCast(V: RtAndRt2, DestTy: Int32Ty);
2253 Value *Rt2 = Builder.CreateLShr(LHS: RtAndRt2, RHS: C1);
2254 Rt2 = Builder.CreateTruncOrBitCast(V: Rt2, DestTy: Int32Ty);
2255
2256 return Builder.CreateCall(Callee: F, Args: {Coproc, Opc1, Rt, Rt2, CRm});
2257 }
2258
2259 if (BuiltinID == clang::ARM::BI__builtin_arm_mrrc ||
2260 BuiltinID == clang::ARM::BI__builtin_arm_mrrc2) {
2261 Function *F;
2262
2263 switch (BuiltinID) {
2264 default: llvm_unreachable("unexpected builtin");
2265 case clang::ARM::BI__builtin_arm_mrrc:
2266 F = CGM.getIntrinsic(IID: Intrinsic::arm_mrrc);
2267 break;
2268 case clang::ARM::BI__builtin_arm_mrrc2:
2269 F = CGM.getIntrinsic(IID: Intrinsic::arm_mrrc2);
2270 break;
2271 }
2272
2273 Value *Coproc = EmitScalarExpr(E: E->getArg(Arg: 0));
2274 Value *Opc1 = EmitScalarExpr(E: E->getArg(Arg: 1));
2275 Value *CRm = EmitScalarExpr(E: E->getArg(Arg: 2));
2276 Value *RtAndRt2 = Builder.CreateCall(Callee: F, Args: {Coproc, Opc1, CRm});
2277
    // The intrinsic returns the unsigned 64-bit result as two 32-bit halves,
    // which are reassembled below.
2280
2281 Value *Rt = Builder.CreateExtractValue(Agg: RtAndRt2, Idxs: 1);
2282 Value *Rt1 = Builder.CreateExtractValue(Agg: RtAndRt2, Idxs: 0);
2283 Rt = Builder.CreateZExt(V: Rt, DestTy: Int64Ty);
2284 Rt1 = Builder.CreateZExt(V: Rt1, DestTy: Int64Ty);
2285
2286 Value *ShiftCast = llvm::ConstantInt::get(Ty: Int64Ty, V: 32);
2287 RtAndRt2 = Builder.CreateShl(LHS: Rt, RHS: ShiftCast, Name: "shl", HasNUW: true);
2288 RtAndRt2 = Builder.CreateOr(LHS: RtAndRt2, RHS: Rt1);
2289
2290 return Builder.CreateBitCast(V: RtAndRt2, DestTy: ConvertType(T: E->getType()));
2291 }
2292
2293 if (BuiltinID == clang::ARM::BI__builtin_arm_ldrexd ||
2294 ((BuiltinID == clang::ARM::BI__builtin_arm_ldrex ||
2295 BuiltinID == clang::ARM::BI__builtin_arm_ldaex) &&
2296 getContext().getTypeSize(T: E->getType()) == 64) ||
2297 BuiltinID == clang::ARM::BI__ldrexd) {
2298 Function *F;
2299
2300 switch (BuiltinID) {
2301 default: llvm_unreachable("unexpected builtin");
2302 case clang::ARM::BI__builtin_arm_ldaex:
2303 F = CGM.getIntrinsic(IID: Intrinsic::arm_ldaexd);
2304 break;
2305 case clang::ARM::BI__builtin_arm_ldrexd:
2306 case clang::ARM::BI__builtin_arm_ldrex:
2307 case clang::ARM::BI__ldrexd:
2308 F = CGM.getIntrinsic(IID: Intrinsic::arm_ldrexd);
2309 break;
2310 }
2311
2312 Value *LdPtr = EmitScalarExpr(E: E->getArg(Arg: 0));
2313 Value *Val = Builder.CreateCall(Callee: F, Args: LdPtr, Name: "ldrexd");
2314
2315 Value *Val0 = Builder.CreateExtractValue(Agg: Val, Idxs: 1);
2316 Value *Val1 = Builder.CreateExtractValue(Agg: Val, Idxs: 0);
2317 Val0 = Builder.CreateZExt(V: Val0, DestTy: Int64Ty);
2318 Val1 = Builder.CreateZExt(V: Val1, DestTy: Int64Ty);
2319
2320 Value *ShiftCst = llvm::ConstantInt::get(Ty: Int64Ty, V: 32);
2321 Val = Builder.CreateShl(LHS: Val0, RHS: ShiftCst, Name: "shl", HasNUW: true /* nuw */);
2322 Val = Builder.CreateOr(LHS: Val, RHS: Val1);
2323 return Builder.CreateBitCast(V: Val, DestTy: ConvertType(T: E->getType()));
2324 }
2325
2326 if (BuiltinID == clang::ARM::BI__builtin_arm_ldrex ||
2327 BuiltinID == clang::ARM::BI__builtin_arm_ldaex) {
2328 Value *LoadAddr = EmitScalarExpr(E: E->getArg(Arg: 0));
2329
2330 QualType Ty = E->getType();
2331 llvm::Type *RealResTy = ConvertType(T: Ty);
2332 llvm::Type *IntTy =
2333 llvm::IntegerType::get(C&: getLLVMContext(), NumBits: getContext().getTypeSize(T: Ty));
2334
2335 Function *F = CGM.getIntrinsic(
2336 IID: BuiltinID == clang::ARM::BI__builtin_arm_ldaex ? Intrinsic::arm_ldaex
2337 : Intrinsic::arm_ldrex,
2338 Tys: DefaultPtrTy);
2339 CallInst *Val = Builder.CreateCall(Callee: F, Args: LoadAddr, Name: "ldrex");
2340 Val->addParamAttr(
2341 ArgNo: 0, Attr: Attribute::get(Context&: getLLVMContext(), Kind: Attribute::ElementType, Ty: IntTy));
2342
2343 if (RealResTy->isPointerTy())
2344 return Builder.CreateIntToPtr(V: Val, DestTy: RealResTy);
2345 else {
2346 llvm::Type *IntResTy = llvm::IntegerType::get(
2347 C&: getLLVMContext(), NumBits: CGM.getDataLayout().getTypeSizeInBits(Ty: RealResTy));
2348 return Builder.CreateBitCast(V: Builder.CreateTruncOrBitCast(V: Val, DestTy: IntResTy),
2349 DestTy: RealResTy);
2350 }
2351 }
2352
2353 if (BuiltinID == clang::ARM::BI__builtin_arm_strexd ||
2354 ((BuiltinID == clang::ARM::BI__builtin_arm_stlex ||
2355 BuiltinID == clang::ARM::BI__builtin_arm_strex) &&
2356 getContext().getTypeSize(T: E->getArg(Arg: 0)->getType()) == 64)) {
2357 Function *F = CGM.getIntrinsic(
2358 IID: BuiltinID == clang::ARM::BI__builtin_arm_stlex ? Intrinsic::arm_stlexd
2359 : Intrinsic::arm_strexd);
2360 llvm::Type *STy = llvm::StructType::get(elt1: Int32Ty, elts: Int32Ty);
2361
2362 Address Tmp = CreateMemTemp(T: E->getArg(Arg: 0)->getType());
2363 Value *Val = EmitScalarExpr(E: E->getArg(Arg: 0));
2364 Builder.CreateStore(Val, Addr: Tmp);
2365
2366 Address LdPtr = Tmp.withElementType(ElemTy: STy);
2367 Val = Builder.CreateLoad(Addr: LdPtr);
2368
2369 Value *Arg0 = Builder.CreateExtractValue(Agg: Val, Idxs: 0);
2370 Value *Arg1 = Builder.CreateExtractValue(Agg: Val, Idxs: 1);
2371 Value *StPtr = EmitScalarExpr(E: E->getArg(Arg: 1));
2372 return Builder.CreateCall(Callee: F, Args: {Arg0, Arg1, StPtr}, Name: "strexd");
2373 }
2374
2375 if (BuiltinID == clang::ARM::BI__builtin_arm_strex ||
2376 BuiltinID == clang::ARM::BI__builtin_arm_stlex) {
2377 Value *StoreVal = EmitScalarExpr(E: E->getArg(Arg: 0));
2378 Value *StoreAddr = EmitScalarExpr(E: E->getArg(Arg: 1));
2379
2380 QualType Ty = E->getArg(Arg: 0)->getType();
2381 llvm::Type *StoreTy =
2382 llvm::IntegerType::get(C&: getLLVMContext(), NumBits: getContext().getTypeSize(T: Ty));
2383
2384 if (StoreVal->getType()->isPointerTy())
2385 StoreVal = Builder.CreatePtrToInt(V: StoreVal, DestTy: Int32Ty);
2386 else {
2387 llvm::Type *IntTy = llvm::IntegerType::get(
2388 C&: getLLVMContext(),
2389 NumBits: CGM.getDataLayout().getTypeSizeInBits(Ty: StoreVal->getType()));
2390 StoreVal = Builder.CreateBitCast(V: StoreVal, DestTy: IntTy);
2391 StoreVal = Builder.CreateZExtOrBitCast(V: StoreVal, DestTy: Int32Ty);
2392 }
2393
2394 Function *F = CGM.getIntrinsic(
2395 IID: BuiltinID == clang::ARM::BI__builtin_arm_stlex ? Intrinsic::arm_stlex
2396 : Intrinsic::arm_strex,
2397 Tys: StoreAddr->getType());
2398
2399 CallInst *CI = Builder.CreateCall(Callee: F, Args: {StoreVal, StoreAddr}, Name: "strex");
2400 CI->addParamAttr(
2401 ArgNo: 1, Attr: Attribute::get(Context&: getLLVMContext(), Kind: Attribute::ElementType, Ty: StoreTy));
2402 return CI;
2403 }
2404
2405 if (BuiltinID == clang::ARM::BI__builtin_arm_clrex) {
2406 Function *F = CGM.getIntrinsic(IID: Intrinsic::arm_clrex);
2407 return Builder.CreateCall(Callee: F);
2408 }
2409
2410 // CRC32
2411 Intrinsic::ID CRCIntrinsicID = Intrinsic::not_intrinsic;
2412 switch (BuiltinID) {
2413 case clang::ARM::BI__builtin_arm_crc32b:
2414 CRCIntrinsicID = Intrinsic::arm_crc32b; break;
2415 case clang::ARM::BI__builtin_arm_crc32cb:
2416 CRCIntrinsicID = Intrinsic::arm_crc32cb; break;
2417 case clang::ARM::BI__builtin_arm_crc32h:
2418 CRCIntrinsicID = Intrinsic::arm_crc32h; break;
2419 case clang::ARM::BI__builtin_arm_crc32ch:
2420 CRCIntrinsicID = Intrinsic::arm_crc32ch; break;
2421 case clang::ARM::BI__builtin_arm_crc32w:
2422 case clang::ARM::BI__builtin_arm_crc32d:
2423 CRCIntrinsicID = Intrinsic::arm_crc32w; break;
2424 case clang::ARM::BI__builtin_arm_crc32cw:
2425 case clang::ARM::BI__builtin_arm_crc32cd:
2426 CRCIntrinsicID = Intrinsic::arm_crc32cw; break;
2427 }
2428
2429 if (CRCIntrinsicID != Intrinsic::not_intrinsic) {
2430 Value *Arg0 = EmitScalarExpr(E: E->getArg(Arg: 0));
2431 Value *Arg1 = EmitScalarExpr(E: E->getArg(Arg: 1));
2432
2433 // crc32{c,}d intrinsics are implemented as two calls to crc32{c,}w
2434 // intrinsics, hence we need different codegen for these cases.
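    // For example, crc32d(init, x) is emitted as
    //   crc32w(crc32w(init, trunc(x)), trunc(x >> 32)).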
2435 if (BuiltinID == clang::ARM::BI__builtin_arm_crc32d ||
2436 BuiltinID == clang::ARM::BI__builtin_arm_crc32cd) {
2437 Value *C1 = llvm::ConstantInt::get(Ty: Int64Ty, V: 32);
2438 Value *Arg1a = Builder.CreateTruncOrBitCast(V: Arg1, DestTy: Int32Ty);
2439 Value *Arg1b = Builder.CreateLShr(LHS: Arg1, RHS: C1);
2440 Arg1b = Builder.CreateTruncOrBitCast(V: Arg1b, DestTy: Int32Ty);
2441
2442 Function *F = CGM.getIntrinsic(IID: CRCIntrinsicID);
2443 Value *Res = Builder.CreateCall(Callee: F, Args: {Arg0, Arg1a});
2444 return Builder.CreateCall(Callee: F, Args: {Res, Arg1b});
2445 } else {
2446 Arg1 = Builder.CreateZExtOrBitCast(V: Arg1, DestTy: Int32Ty);
2447
2448 Function *F = CGM.getIntrinsic(IID: CRCIntrinsicID);
2449 return Builder.CreateCall(Callee: F, Args: {Arg0, Arg1});
2450 }
2451 }
2452
2453 if (BuiltinID == clang::ARM::BI__builtin_arm_rsr ||
2454 BuiltinID == clang::ARM::BI__builtin_arm_rsr64 ||
2455 BuiltinID == clang::ARM::BI__builtin_arm_rsrp ||
2456 BuiltinID == clang::ARM::BI__builtin_arm_wsr ||
2457 BuiltinID == clang::ARM::BI__builtin_arm_wsr64 ||
2458 BuiltinID == clang::ARM::BI__builtin_arm_wsrp) {
2459
2460 SpecialRegisterAccessKind AccessKind = Write;
2461 if (BuiltinID == clang::ARM::BI__builtin_arm_rsr ||
2462 BuiltinID == clang::ARM::BI__builtin_arm_rsr64 ||
2463 BuiltinID == clang::ARM::BI__builtin_arm_rsrp)
2464 AccessKind = VolatileRead;
2465
2466 bool IsPointerBuiltin = BuiltinID == clang::ARM::BI__builtin_arm_rsrp ||
2467 BuiltinID == clang::ARM::BI__builtin_arm_wsrp;
2468
2469 bool Is64Bit = BuiltinID == clang::ARM::BI__builtin_arm_rsr64 ||
2470 BuiltinID == clang::ARM::BI__builtin_arm_wsr64;
2471
2472 llvm::Type *ValueType;
2473 llvm::Type *RegisterType;
2474 if (IsPointerBuiltin) {
2475 ValueType = VoidPtrTy;
2476 RegisterType = Int32Ty;
2477 } else if (Is64Bit) {
2478 ValueType = RegisterType = Int64Ty;
2479 } else {
2480 ValueType = RegisterType = Int32Ty;
2481 }
2482
2483 return EmitSpecialRegisterBuiltin(CGF&: *this, E, RegisterType, ValueType,
2484 AccessKind);
2485 }
2486
2487 if (BuiltinID == ARM::BI__builtin_sponentry) {
2488 llvm::Function *F = CGM.getIntrinsic(IID: Intrinsic::sponentry, Tys: AllocaInt8PtrTy);
2489 return Builder.CreateCall(Callee: F);
2490 }
2491
2492 // Handle MSVC intrinsics before argument evaluation to prevent double
2493 // evaluation.
2494 if (std::optional<MSVCIntrin> MsvcIntId = translateArmToMsvcIntrin(BuiltinID))
2495 return EmitMSVCBuiltinExpr(BuiltinID: *MsvcIntId, E);
2496
2497 // Deal with MVE builtins
2498 if (Value *Result = EmitARMMVEBuiltinExpr(BuiltinID, E, ReturnValue, Arch))
2499 return Result;
2500 // Handle CDE builtins
2501 if (Value *Result = EmitARMCDEBuiltinExpr(BuiltinID, E, ReturnValue, Arch))
2502 return Result;
2503
  // Some intrinsics are equivalent; if they are, use the base intrinsic ID.
  auto It = llvm::find_if(NEONEquivalentIntrinsicMap, [BuiltinID](auto &P) {
    return P.first == BuiltinID;
  });
  if (It != end(NEONEquivalentIntrinsicMap))
    BuiltinID = It->second;

  // Find out if any arguments are required to be integer constant
  // expressions.
  unsigned ICEArguments = 0;
  ASTContext::GetBuiltinTypeError Error;
  getContext().GetBuiltinType(BuiltinID, Error, &ICEArguments);
  assert(Error == ASTContext::GE_None && "Should not codegen an error");

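  // The NEON vld/vst intrinsics used below take the alignment of the pointer
  // argument as an explicit i32 operand; this helper materializes that
  // constant from an Address.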
  auto getAlignmentValue32 = [&](Address addr) -> Value * {
    return Builder.getInt32(addr.getAlignment().getQuantity());
  };

  Address PtrOp0 = Address::invalid();
  Address PtrOp1 = Address::invalid();
  SmallVector<Value *, 4> Ops;
  bool HasExtraArg = HasExtraNeonArgument(BuiltinID);
  unsigned NumArgs = E->getNumArgs() - (HasExtraArg ? 1 : 0);
  for (unsigned i = 0, e = NumArgs; i != e; i++) {
    if (i == 0) {
      switch (BuiltinID) {
      case NEON::BI__builtin_neon_vld1_v:
      case NEON::BI__builtin_neon_vld1q_v:
      case NEON::BI__builtin_neon_vld1q_lane_v:
      case NEON::BI__builtin_neon_vld1_lane_v:
      case NEON::BI__builtin_neon_vld1_dup_v:
      case NEON::BI__builtin_neon_vld1q_dup_v:
      case NEON::BI__builtin_neon_vst1_v:
      case NEON::BI__builtin_neon_vst1q_v:
      case NEON::BI__builtin_neon_vst1q_lane_v:
      case NEON::BI__builtin_neon_vst1_lane_v:
      case NEON::BI__builtin_neon_vst2_v:
      case NEON::BI__builtin_neon_vst2q_v:
      case NEON::BI__builtin_neon_vst2_lane_v:
      case NEON::BI__builtin_neon_vst2q_lane_v:
      case NEON::BI__builtin_neon_vst3_v:
      case NEON::BI__builtin_neon_vst3q_v:
      case NEON::BI__builtin_neon_vst3_lane_v:
      case NEON::BI__builtin_neon_vst3q_lane_v:
      case NEON::BI__builtin_neon_vst4_v:
      case NEON::BI__builtin_neon_vst4q_v:
      case NEON::BI__builtin_neon_vst4_lane_v:
      case NEON::BI__builtin_neon_vst4q_lane_v:
        // Get the alignment for the argument in addition to the value;
        // we'll use it later.
        PtrOp0 = EmitPointerWithAlignment(E->getArg(0));
        Ops.push_back(PtrOp0.emitRawPointer(*this));
        continue;
      }
    }
    if (i == 1) {
      switch (BuiltinID) {
      case NEON::BI__builtin_neon_vld2_v:
      case NEON::BI__builtin_neon_vld2q_v:
      case NEON::BI__builtin_neon_vld3_v:
      case NEON::BI__builtin_neon_vld3q_v:
      case NEON::BI__builtin_neon_vld4_v:
      case NEON::BI__builtin_neon_vld4q_v:
      case NEON::BI__builtin_neon_vld2_lane_v:
      case NEON::BI__builtin_neon_vld2q_lane_v:
      case NEON::BI__builtin_neon_vld3_lane_v:
      case NEON::BI__builtin_neon_vld3q_lane_v:
      case NEON::BI__builtin_neon_vld4_lane_v:
      case NEON::BI__builtin_neon_vld4q_lane_v:
      case NEON::BI__builtin_neon_vld2_dup_v:
      case NEON::BI__builtin_neon_vld2q_dup_v:
      case NEON::BI__builtin_neon_vld3_dup_v:
      case NEON::BI__builtin_neon_vld3q_dup_v:
      case NEON::BI__builtin_neon_vld4_dup_v:
      case NEON::BI__builtin_neon_vld4q_dup_v:
        // Get the alignment for the argument in addition to the value;
        // we'll use it later.
        PtrOp1 = EmitPointerWithAlignment(E->getArg(1));
        Ops.push_back(PtrOp1.emitRawPointer(*this));
        continue;
      }
    }

    Ops.push_back(EmitScalarOrConstFoldImmArg(ICEArguments, i, E));
  }

  switch (BuiltinID) {
  default: break;

  case NEON::BI__builtin_neon_vget_lane_i8:
  case NEON::BI__builtin_neon_vget_lane_i16:
  case NEON::BI__builtin_neon_vget_lane_i32:
  case NEON::BI__builtin_neon_vget_lane_i64:
  case NEON::BI__builtin_neon_vget_lane_bf16:
  case NEON::BI__builtin_neon_vget_lane_f32:
  case NEON::BI__builtin_neon_vgetq_lane_i8:
  case NEON::BI__builtin_neon_vgetq_lane_i16:
  case NEON::BI__builtin_neon_vgetq_lane_i32:
  case NEON::BI__builtin_neon_vgetq_lane_i64:
  case NEON::BI__builtin_neon_vgetq_lane_bf16:
  case NEON::BI__builtin_neon_vgetq_lane_f32:
  case NEON::BI__builtin_neon_vduph_lane_bf16:
  case NEON::BI__builtin_neon_vduph_laneq_bf16:
    return Builder.CreateExtractElement(Ops[0], Ops[1], "vget_lane");

  case NEON::BI__builtin_neon_vrndns_f32: {
    Value *Arg = EmitScalarExpr(E->getArg(0));
    llvm::Type *Tys[] = {Arg->getType()};
    Function *F = CGM.getIntrinsic(Intrinsic::roundeven, Tys);
    return Builder.CreateCall(F, {Arg}, "vrndn");
  }

  case NEON::BI__builtin_neon_vset_lane_i8:
  case NEON::BI__builtin_neon_vset_lane_i16:
  case NEON::BI__builtin_neon_vset_lane_i32:
  case NEON::BI__builtin_neon_vset_lane_i64:
  case NEON::BI__builtin_neon_vset_lane_bf16:
  case NEON::BI__builtin_neon_vset_lane_f32:
  case NEON::BI__builtin_neon_vsetq_lane_i8:
  case NEON::BI__builtin_neon_vsetq_lane_i16:
  case NEON::BI__builtin_neon_vsetq_lane_i32:
  case NEON::BI__builtin_neon_vsetq_lane_i64:
  case NEON::BI__builtin_neon_vsetq_lane_bf16:
  case NEON::BI__builtin_neon_vsetq_lane_f32:
    return Builder.CreateInsertElement(Ops[1], Ops[0], Ops[2], "vset_lane");

  case NEON::BI__builtin_neon_vsha1h_u32:
    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_sha1h), Ops,
                        "vsha1h");
  case NEON::BI__builtin_neon_vsha1cq_u32:
    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_sha1c), Ops,
                        "vsha1c");
  case NEON::BI__builtin_neon_vsha1pq_u32:
    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_sha1p), Ops,
                        "vsha1p");
  case NEON::BI__builtin_neon_vsha1mq_u32:
    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_sha1m), Ops,
                        "vsha1m");

  case NEON::BI__builtin_neon_vcvth_bf16_f32: {
    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vcvtbfp2bf), Ops,
                        "vcvtbfp2bf");
  }

  // The ARM _MoveToCoprocessor builtins put the input register value as
  // the first argument, but the LLVM intrinsic expects it as the third one.
  case clang::ARM::BI_MoveToCoprocessor:
  case clang::ARM::BI_MoveToCoprocessor2: {
    Function *F = CGM.getIntrinsic(BuiltinID == clang::ARM::BI_MoveToCoprocessor
                                       ? Intrinsic::arm_mcr
                                       : Intrinsic::arm_mcr2);
    return Builder.CreateCall(F,
                              {Ops[1], Ops[2], Ops[0], Ops[3], Ops[4], Ops[5]});
  }
  }

  // Get the last argument, which specifies the vector type.
  assert(HasExtraArg);
  const Expr *Arg = E->getArg(E->getNumArgs() - 1);
  std::optional<llvm::APSInt> Result =
      Arg->getIntegerConstantExpr(getContext());
  if (!Result)
    return nullptr;

  if (BuiltinID == clang::ARM::BI__builtin_arm_vcvtr_f ||
      BuiltinID == clang::ARM::BI__builtin_arm_vcvtr_d) {
    // Determine the overloaded type of this builtin.
    llvm::Type *Ty;
    if (BuiltinID == clang::ARM::BI__builtin_arm_vcvtr_f)
      Ty = FloatTy;
    else
      Ty = DoubleTy;

    // Determine whether this is an unsigned conversion or not.
    bool usgn = Result->getZExtValue() == 1;
    unsigned Int = usgn ? Intrinsic::arm_vcvtru : Intrinsic::arm_vcvtr;

    // Call the appropriate intrinsic.
    Function *F = CGM.getIntrinsic(Int, Ty);
    return Builder.CreateCall(F, Ops, "vcvtr");
  }

  // Determine the type of this overloaded NEON intrinsic.
  NeonTypeFlags Type = Result->getZExtValue();
  bool usgn = Type.isUnsigned();
  bool rightShift = false;

  llvm::FixedVectorType *VTy =
      GetNeonType(this, Type, getTarget().hasFastHalfType(), false,
                  getTarget().hasBFloat16Type());
  llvm::Type *Ty = VTy;
  if (!Ty)
    return nullptr;

  // Many NEON builtins have identical semantics and uses in ARM and
  // AArch64. Emit these in a single function.
  auto IntrinsicMap = ArrayRef(ARMSIMDIntrinsicMap);
  const ARMVectorIntrinsicInfo *Builtin = findARMVectorIntrinsicInMap(
      IntrinsicMap, BuiltinID, NEONSIMDIntrinsicsProvenSorted);
  if (Builtin)
    return EmitCommonNeonBuiltinExpr(
        Builtin->BuiltinID, Builtin->LLVMIntrinsic, Builtin->AltLLVMIntrinsic,
        Builtin->NameHint, Builtin->TypeModifier, E, Ops, PtrOp0, PtrOp1, Arch);

  unsigned Int;
  switch (BuiltinID) {
  default: return nullptr;
  case NEON::BI__builtin_neon_vld1q_lane_v:
    // Handle 64-bit integer elements as a special case. Use shuffles of
    // one-element vectors to avoid poor code for i64 in the backend.
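    // For example, for lane 1 the first shuffle keeps lane 0 of the original
    // vector, the new element is loaded as a <1 x i64>, and the final shuffle
    // recombines the two one-element vectors.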
    if (VTy->getElementType()->isIntegerTy(64)) {
      // Extract the other lane.
      Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
      int Lane = cast<ConstantInt>(Ops[2])->getZExtValue();
      Value *SV =
          llvm::ConstantVector::get(ConstantInt::get(Int32Ty, 1 - Lane));
      Ops[1] = Builder.CreateShuffleVector(Ops[1], Ops[1], SV);
      // Load the value as a one-element vector.
      Ty = llvm::FixedVectorType::get(VTy->getElementType(), 1);
      llvm::Type *Tys[] = {Ty, Int8PtrTy};
      Function *F = CGM.getIntrinsic(Intrinsic::arm_neon_vld1, Tys);
      Value *Align = getAlignmentValue32(PtrOp0);
      Value *Ld = Builder.CreateCall(F, {Ops[0], Align});
      // Combine them.
      int Indices[] = {1 - Lane, Lane};
      return Builder.CreateShuffleVector(Ops[1], Ld, Indices, "vld1q_lane");
    }
    [[fallthrough]];
  case NEON::BI__builtin_neon_vld1_lane_v: {
    Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
    PtrOp0 = PtrOp0.withElementType(VTy->getElementType());
    Value *Ld = Builder.CreateLoad(PtrOp0);
    return Builder.CreateInsertElement(Ops[1], Ld, Ops[2], "vld1_lane");
  }
  case NEON::BI__builtin_neon_vqrshrn_n_v:
    Int =
        usgn ? Intrinsic::arm_neon_vqrshiftnu : Intrinsic::arm_neon_vqrshiftns;
    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqrshrn_n", 1, true);
  case NEON::BI__builtin_neon_vqrshrun_n_v:
    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vqrshiftnsu, Ty),
                        Ops, "vqrshrun_n", 1, true);
  case NEON::BI__builtin_neon_vqshrn_n_v:
    Int = usgn ? Intrinsic::arm_neon_vqshiftnu : Intrinsic::arm_neon_vqshiftns;
    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqshrn_n", 1, true);
  case NEON::BI__builtin_neon_vqshrun_n_v:
    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vqshiftnsu, Ty),
                        Ops, "vqshrun_n", 1, true);
  case NEON::BI__builtin_neon_vrecpe_v:
  case NEON::BI__builtin_neon_vrecpeq_v:
    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vrecpe, Ty), Ops,
                        "vrecpe");
  case NEON::BI__builtin_neon_vrshrn_n_v:
    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vrshiftn, Ty),
                        Ops, "vrshrn_n", 1, true);
  case NEON::BI__builtin_neon_vrsra_n_v:
  case NEON::BI__builtin_neon_vrsraq_n_v:
    Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
    Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
    Ops[2] = EmitNeonShiftVector(Ops[2], Ty, true);
    Int = usgn ? Intrinsic::arm_neon_vrshiftu : Intrinsic::arm_neon_vrshifts;
    Ops[1] = Builder.CreateCall(CGM.getIntrinsic(Int, Ty), {Ops[1], Ops[2]});
    return Builder.CreateAdd(Ops[0], Ops[1], "vrsra_n");
  case NEON::BI__builtin_neon_vsri_n_v:
  case NEON::BI__builtin_neon_vsriq_n_v:
    rightShift = true;
    [[fallthrough]];
  case NEON::BI__builtin_neon_vsli_n_v:
  case NEON::BI__builtin_neon_vsliq_n_v:
    Ops[2] = EmitNeonShiftVector(Ops[2], Ty, rightShift);
    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vshiftins, Ty),
                        Ops, "vsli_n");
  case NEON::BI__builtin_neon_vsra_n_v:
  case NEON::BI__builtin_neon_vsraq_n_v:
    Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
    Ops[1] = EmitNeonRShiftImm(Ops[1], Ops[2], Ty, usgn, "vsra_n");
    return Builder.CreateAdd(Ops[0], Ops[1]);
  case NEON::BI__builtin_neon_vst1q_lane_v:
    // Handle 64-bit integer elements as a special case. Use a shuffle to get
    // a one-element vector and avoid poor code for i64 in the backend.
    if (VTy->getElementType()->isIntegerTy(64)) {
      Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
      Value *SV = llvm::ConstantVector::get(cast<llvm::Constant>(Ops[2]));
      Ops[1] = Builder.CreateShuffleVector(Ops[1], Ops[1], SV);
      Ops[2] = getAlignmentValue32(PtrOp0);
      llvm::Type *Tys[] = {Int8PtrTy, Ops[1]->getType()};
      return Builder.CreateCall(
          CGM.getIntrinsic(Intrinsic::arm_neon_vst1, Tys), Ops);
    }
    [[fallthrough]];
  case NEON::BI__builtin_neon_vst1_lane_v: {
    Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
    Ops[1] = Builder.CreateExtractElement(Ops[1], Ops[2]);
    return Builder.CreateStore(Ops[1],
                               PtrOp0.withElementType(Ops[1]->getType()));
  }
  case NEON::BI__builtin_neon_vtbl1_v:
    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vtbl1), Ops,
                        "vtbl1");
  case NEON::BI__builtin_neon_vtbl2_v:
    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vtbl2), Ops,
                        "vtbl2");
  case NEON::BI__builtin_neon_vtbl3_v:
    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vtbl3), Ops,
                        "vtbl3");
  case NEON::BI__builtin_neon_vtbl4_v:
    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vtbl4), Ops,
                        "vtbl4");
  case NEON::BI__builtin_neon_vtbx1_v:
    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vtbx1), Ops,
                        "vtbx1");
  case NEON::BI__builtin_neon_vtbx2_v:
    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vtbx2), Ops,
                        "vtbx2");
  case NEON::BI__builtin_neon_vtbx3_v:
    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vtbx3), Ops,
                        "vtbx3");
  case NEON::BI__builtin_neon_vtbx4_v:
    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vtbx4), Ops,
                        "vtbx4");
  }
}

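// Read a builtin argument that is known to be an integer constant expression;
// like the helpers below, this is called by the Tablegen-generated MVE
// builtin codegen.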
template <typename Integer>
static Integer GetIntegerConstantValue(const Expr *E, ASTContext &Context) {
  return E->getIntegerConstantExpr(Context)->getExtValue();
}

static llvm::Value *SignOrZeroExtend(CGBuilderTy &Builder, llvm::Value *V,
                                     llvm::Type *T, bool Unsigned) {
  // Helper function called by Tablegen-constructed ARM MVE builtin codegen,
  // which finds it convenient to specify signed/unsigned as a boolean flag.
  return Unsigned ? Builder.CreateZExt(V, T) : Builder.CreateSExt(V, T);
}

static llvm::Value *MVEImmediateShr(CGBuilderTy &Builder, llvm::Value *V,
                                    uint32_t Shift, bool Unsigned) {
  // MVE helper function for integer shift right. This must handle signed vs
  // unsigned, and also deal specially with the case where the shift count is
  // equal to the lane size. In LLVM IR, an LShr with that parameter would be
  // undefined behavior, but in MVE it's legal, so we must convert it to code
  // that is not undefined in IR.
  unsigned LaneBits = cast<llvm::VectorType>(V->getType())
                          ->getElementType()
                          ->getPrimitiveSizeInBits();
  if (Shift == LaneBits) {
    // An unsigned shift of the full lane size always generates zero, so we can
    // simply emit a zero vector. A signed shift of the full lane size does the
    // same thing as shifting by one bit fewer.
    if (Unsigned)
      return llvm::Constant::getNullValue(V->getType());
    else
      --Shift;
  }
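  // For example, a signed 8-bit shift by 8 is emitted as an AShr by 7, which
  // produces the same all-sign-bits result.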
  return Unsigned ? Builder.CreateLShr(V, Shift) : Builder.CreateAShr(V, Shift);
}

static llvm::Value *ARMMVEVectorSplat(CGBuilderTy &Builder, llvm::Value *V) {
  // MVE-specific helper function for a vector splat, which infers the element
  // count of the output vector by knowing that MVE vectors are all 128 bits
  // wide.
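  // For example, a 16-bit scalar is splatted to an <8 x i16> vector.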
  unsigned Elements = 128 / V->getType()->getPrimitiveSizeInBits();
  return Builder.CreateVectorSplat(Elements, V);
}

static llvm::Value *ARMMVEVectorReinterpret(CGBuilderTy &Builder,
                                            CodeGenFunction *CGF,
                                            llvm::Value *V,
                                            llvm::Type *DestType) {
  // Convert one MVE vector type into another by reinterpreting its in-register
  // format.
  //
  // On little-endian targets, this is identical to a bitcast (which
  // reinterprets the memory format). On big-endian targets, the two are not
  // necessarily the same, because the register and memory formats map to each
  // other differently depending on the lane size.
  //
  // We generate a bitcast whenever we can (if we're little-endian, or if the
  // lane sizes are the same anyway). Otherwise we fall back to an IR intrinsic
  // that performs the different kind of reinterpretation.
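  // For example, on big-endian, reinterpreting <4 x i32> as <16 x i8> needs
  // the vreinterpretq intrinsic, whereas <4 x i32> to <4 x float> (same lane
  // size) is a plain bitcast.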
  if (CGF->getTarget().isBigEndian() &&
      V->getType()->getScalarSizeInBits() != DestType->getScalarSizeInBits()) {
    return Builder.CreateCall(
        CGF->CGM.getIntrinsic(Intrinsic::arm_mve_vreinterpretq,
                              {DestType, V->getType()}),
        V);
  } else {
    return Builder.CreateBitCast(V, DestType);
  }
}

static llvm::Value *VectorUnzip(CGBuilderTy &Builder, llvm::Value *V,
                                bool Odd) {
  // Make a shufflevector that extracts every other element of a vector (evens
  // or odds, as desired).
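  // For example, an 8-element input with Odd == false yields the shuffle mask
  // {0, 2, 4, 6}.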
  SmallVector<int, 16> Indices;
  unsigned InputElements =
      cast<llvm::FixedVectorType>(V->getType())->getNumElements();
  for (unsigned i = 0; i < InputElements; i += 2)
    Indices.push_back(i + Odd);
  return Builder.CreateShuffleVector(V, Indices);
}

static llvm::Value *VectorZip(CGBuilderTy &Builder, llvm::Value *V0,
                              llvm::Value *V1) {
  // Make a shufflevector that interleaves two vectors element by element.
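  // For example, zipping <a0,a1,a2,a3> with <b0,b1,b2,b3> uses the mask
  // {0, 4, 1, 5, 2, 6, 3, 7} and produces <a0,b0,a1,b1,a2,b2,a3,b3>.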
  assert(V0->getType() == V1->getType() && "Can't zip different vector types");
  SmallVector<int, 16> Indices;
  unsigned InputElements =
      cast<llvm::FixedVectorType>(V0->getType())->getNumElements();
  for (unsigned i = 0; i < InputElements; i++) {
    Indices.push_back(i);
    Indices.push_back(i + InputElements);
  }
  return Builder.CreateShuffleVector(V0, V1, Indices);
}

template <unsigned HighBit, unsigned OtherBits>
static llvm::Value *ARMMVEConstantSplat(CGBuilderTy &Builder, llvm::Type *VT) {
  // MVE-specific helper function to make a vector splat of a constant such as
  // UINT_MAX or INT_MIN, in which all bits below the highest one are equal.
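  // For example, HighBit=1, OtherBits=0 splats INT_MIN for the lane type;
  // HighBit=0, OtherBits=1 splats INT_MAX; HighBit=1, OtherBits=1 splats
  // UINT_MAX (all ones).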
  llvm::Type *T = cast<llvm::VectorType>(VT)->getElementType();
  unsigned LaneBits = T->getPrimitiveSizeInBits();
  uint32_t Value = HighBit << (LaneBits - 1);
  if (OtherBits)
    Value |= (1UL << (LaneBits - 1)) - 1;
  llvm::Value *Lane = llvm::ConstantInt::get(T, Value);
  return ARMMVEVectorSplat(Builder, Lane);
}

static llvm::Value *ARMMVEVectorElementReverse(CGBuilderTy &Builder,
                                               llvm::Value *V,
                                               unsigned ReverseWidth) {
  // MVE-specific helper function which reverses the elements of a
  // vector within every (ReverseWidth)-bit collection of lanes.
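  // For example, with 8-bit lanes and ReverseWidth == 32, the shuffle mask is
  // {3, 2, 1, 0, 7, 6, 5, 4, ...}.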
  SmallVector<int, 16> Indices;
  unsigned LaneSize = V->getType()->getScalarSizeInBits();
  unsigned Elements = 128 / LaneSize;
  unsigned Mask = ReverseWidth / LaneSize - 1;
  for (unsigned i = 0; i < Elements; i++)
    Indices.push_back(i ^ Mask);
  return Builder.CreateShuffleVector(V, Indices);
}

static llvm::Value *ARMMVECreateSIToFP(CGBuilderTy &Builder,
                                       CodeGenFunction *CGF, llvm::Value *V,
                                       llvm::Type *Ty) {
  return Builder.CreateCall(
      CGF->CGM.getIntrinsic(Intrinsic::arm_mve_vcvt_fp_int,
                            {Ty, V->getType()}),
      {V, llvm::ConstantInt::get(Builder.getInt32Ty(), 0)});
}

static llvm::Value *ARMMVECreateUIToFP(CGBuilderTy &Builder,
                                       CodeGenFunction *CGF, llvm::Value *V,
                                       llvm::Type *Ty) {
  return Builder.CreateCall(
      CGF->CGM.getIntrinsic(Intrinsic::arm_mve_vcvt_fp_int,
                            {Ty, V->getType()}),
      {V, llvm::ConstantInt::get(Builder.getInt32Ty(), 1)});
}

static llvm::Value *ARMMVECreateFPToSI(CGBuilderTy &Builder,
                                       CodeGenFunction *CGF, llvm::Value *V,
                                       llvm::Type *Ty) {
  return Builder.CreateCall(
      CGF->CGM.getIntrinsic(Intrinsic::arm_mve_vcvt_int_fp,
                            {Ty, V->getType()}),
      {V, llvm::ConstantInt::get(Builder.getInt32Ty(), 0)});
}

static llvm::Value *ARMMVECreateFPToUI(CGBuilderTy &Builder,
                                       CodeGenFunction *CGF, llvm::Value *V,
                                       llvm::Type *Ty) {
  return Builder.CreateCall(
      CGF->CGM.getIntrinsic(Intrinsic::arm_mve_vcvt_int_fp,
                            {Ty, V->getType()}),
      {V, llvm::ConstantInt::get(Builder.getInt32Ty(), 1)});
}

Value *CodeGenFunction::EmitARMMVEBuiltinExpr(unsigned BuiltinID,
                                              const CallExpr *E,
                                              ReturnValueSlot ReturnValue,
                                              llvm::Triple::ArchType Arch) {
  enum class CustomCodeGen { VLD24, VST24 } CustomCodeGenType;
  Intrinsic::ID IRIntr;
  unsigned NumVectors;

  // Code autogenerated by Tablegen will handle all the simple builtins.
  switch (BuiltinID) {
#include "clang/Basic/arm_mve_builtin_cg.inc"

  // If we didn't match an MVE builtin id at all, go back to the
  // main EmitARMBuiltinExpr.
  default:
    return nullptr;
  }

  // Anything that breaks from that switch is an MVE builtin that
  // needs handwritten code to generate.

  switch (CustomCodeGenType) {

  case CustomCodeGen::VLD24: {
    llvm::SmallVector<Value *, 4> Ops;
    llvm::SmallVector<llvm::Type *, 4> Tys;

    auto MvecCType = E->getType();
    auto MvecLType = ConvertType(MvecCType);
    assert(MvecLType->isStructTy() &&
           "Return type for vld[24]q should be a struct");
    assert(MvecLType->getStructNumElements() == 1 &&
           "Return-type struct for vld[24]q should have one element");
    auto MvecLTypeInner = MvecLType->getStructElementType(0);
    assert(MvecLTypeInner->isArrayTy() &&
           "Return-type struct for vld[24]q should contain an array");
    assert(MvecLTypeInner->getArrayNumElements() == NumVectors &&
           "Array member of return-type struct vld[24]q has wrong length");
    auto VecLType = MvecLTypeInner->getArrayElementType();

    Tys.push_back(VecLType);

    auto Addr = E->getArg(0);
    Ops.push_back(EmitScalarExpr(Addr));
    Tys.push_back(ConvertType(Addr->getType()));

    Function *F = CGM.getIntrinsic(IRIntr, ArrayRef(Tys));
    Value *LoadResult = Builder.CreateCall(F, Ops);
    Value *MvecOut = PoisonValue::get(MvecLType);
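    // Copy each of the part vectors returned by the intrinsic into the array
    // member of the struct return value.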
    for (unsigned i = 0; i < NumVectors; ++i) {
      Value *Vec = Builder.CreateExtractValue(LoadResult, i);
      MvecOut = Builder.CreateInsertValue(MvecOut, Vec, {0, i});
    }

    if (ReturnValue.isNull())
      return MvecOut;
    else
      return Builder.CreateStore(MvecOut, ReturnValue.getAddress());
  }

  case CustomCodeGen::VST24: {
    llvm::SmallVector<Value *, 4> Ops;
    llvm::SmallVector<llvm::Type *, 4> Tys;

    auto Addr = E->getArg(0);
    Ops.push_back(EmitScalarExpr(Addr));
    Tys.push_back(ConvertType(Addr->getType()));

    auto MvecCType = E->getArg(1)->getType();
    auto MvecLType = ConvertType(MvecCType);
    assert(MvecLType->isStructTy() && "Data type for vst2q should be a struct");
    assert(MvecLType->getStructNumElements() == 1 &&
           "Data-type struct for vst2q should have one element");
    auto MvecLTypeInner = MvecLType->getStructElementType(0);
    assert(MvecLTypeInner->isArrayTy() &&
           "Data-type struct for vst2q should contain an array");
    assert(MvecLTypeInner->getArrayNumElements() == NumVectors &&
           "Array member of data-type struct for vst2q has wrong length");
    auto VecLType = MvecLTypeInner->getArrayElementType();

    Tys.push_back(VecLType);

    AggValueSlot MvecSlot = CreateAggTemp(MvecCType);
    EmitAggExpr(E->getArg(1), MvecSlot);
    auto Mvec = Builder.CreateLoad(MvecSlot.getAddress());
    for (unsigned i = 0; i < NumVectors; i++)
      Ops.push_back(Builder.CreateExtractValue(Mvec, {0, i}));

    Function *F = CGM.getIntrinsic(IRIntr, ArrayRef(Tys));
    Value *ToReturn = nullptr;
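    // The IR intrinsic stores one register at a time, taking the stage index
    // as its final operand, so emit one call per vector.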
    for (unsigned i = 0; i < NumVectors; i++) {
      Ops.push_back(llvm::ConstantInt::get(Int32Ty, i));
      ToReturn = Builder.CreateCall(F, Ops);
      Ops.pop_back();
    }
    return ToReturn;
  }
  }
  llvm_unreachable("unknown custom codegen type.");
}

Value *CodeGenFunction::EmitARMCDEBuiltinExpr(unsigned BuiltinID,
                                              const CallExpr *E,
                                              ReturnValueSlot ReturnValue,
                                              llvm::Triple::ArchType Arch) {
  switch (BuiltinID) {
  default:
    return nullptr;
#include "clang/Basic/arm_cde_builtin_cg.inc"
  }
}

static Value *EmitAArch64TblBuiltinExpr(CodeGenFunction &CGF,
                                        unsigned BuiltinID, const CallExpr *E,
                                        SmallVectorImpl<Value *> &Ops,
                                        llvm::Triple::ArchType Arch) {
  unsigned int Int = 0;
  const char *s = nullptr;

  switch (BuiltinID) {
  default:
    return nullptr;
  case NEON::BI__builtin_neon_vtbl1_v:
  case NEON::BI__builtin_neon_vqtbl1_v:
  case NEON::BI__builtin_neon_vqtbl1q_v:
  case NEON::BI__builtin_neon_vtbl2_v:
  case NEON::BI__builtin_neon_vqtbl2_v:
  case NEON::BI__builtin_neon_vqtbl2q_v:
  case NEON::BI__builtin_neon_vtbl3_v:
  case NEON::BI__builtin_neon_vqtbl3_v:
  case NEON::BI__builtin_neon_vqtbl3q_v:
  case NEON::BI__builtin_neon_vtbl4_v:
  case NEON::BI__builtin_neon_vqtbl4_v:
  case NEON::BI__builtin_neon_vqtbl4q_v:
    break;
  case NEON::BI__builtin_neon_vtbx1_v:
  case NEON::BI__builtin_neon_vqtbx1_v:
  case NEON::BI__builtin_neon_vqtbx1q_v:
  case NEON::BI__builtin_neon_vtbx2_v:
  case NEON::BI__builtin_neon_vqtbx2_v:
  case NEON::BI__builtin_neon_vqtbx2q_v:
  case NEON::BI__builtin_neon_vtbx3_v:
  case NEON::BI__builtin_neon_vqtbx3_v:
  case NEON::BI__builtin_neon_vqtbx3q_v:
  case NEON::BI__builtin_neon_vtbx4_v:
  case NEON::BI__builtin_neon_vqtbx4_v:
  case NEON::BI__builtin_neon_vqtbx4q_v:
    break;
  }

  assert(E->getNumArgs() >= 3);

  // Get the last argument, which specifies the vector type.
  const Expr *Arg = E->getArg(E->getNumArgs() - 1);
  std::optional<llvm::APSInt> Result =
      Arg->getIntegerConstantExpr(CGF.getContext());
  if (!Result)
    return nullptr;

  // Determine the type of this overloaded NEON intrinsic.
  NeonTypeFlags Type = Result->getZExtValue();
  llvm::FixedVectorType *Ty = GetNeonType(&CGF, Type);
  if (!Ty)
    return nullptr;

  CodeGen::CGBuilderTy &Builder = CGF.Builder;

  // AArch64 scalar builtins are not overloaded: they do not have an extra
  // argument that specifies the vector type, so each case needs to be handled
  // individually.
  switch (BuiltinID) {
  case NEON::BI__builtin_neon_vtbl1_v: {
    return packTBLDVectorList(CGF, ArrayRef(Ops).slice(0, 1), nullptr, Ops[1],
                              Ty, Intrinsic::aarch64_neon_tbl1, "vtbl1");
  }
  case NEON::BI__builtin_neon_vtbl2_v: {
    return packTBLDVectorList(CGF, ArrayRef(Ops).slice(0, 2), nullptr, Ops[2],
                              Ty, Intrinsic::aarch64_neon_tbl1, "vtbl1");
  }
  case NEON::BI__builtin_neon_vtbl3_v: {
    return packTBLDVectorList(CGF, ArrayRef(Ops).slice(0, 3), nullptr, Ops[3],
                              Ty, Intrinsic::aarch64_neon_tbl2, "vtbl2");
  }
  case NEON::BI__builtin_neon_vtbl4_v: {
    return packTBLDVectorList(CGF, ArrayRef(Ops).slice(0, 4), nullptr, Ops[4],
                              Ty, Intrinsic::aarch64_neon_tbl2, "vtbl2");
  }
  case NEON::BI__builtin_neon_vtbx1_v: {
    Value *TblRes =
        packTBLDVectorList(CGF, ArrayRef(Ops).slice(1, 1), nullptr, Ops[2], Ty,
                           Intrinsic::aarch64_neon_tbl1, "vtbl1");

    llvm::Constant *EightV = ConstantInt::get(Ty, 8);
    Value *CmpRes = Builder.CreateICmp(ICmpInst::ICMP_UGE, Ops[2], EightV);
    CmpRes = Builder.CreateSExt(CmpRes, Ty);

    Value *EltsFromInput = Builder.CreateAnd(CmpRes, Ops[0]);
    Value *EltsFromTbl = Builder.CreateAnd(Builder.CreateNot(CmpRes), TblRes);
    return Builder.CreateOr(EltsFromInput, EltsFromTbl, "vtbx");
  }
  case NEON::BI__builtin_neon_vtbx2_v: {
    return packTBLDVectorList(CGF, ArrayRef(Ops).slice(1, 2), Ops[0], Ops[3],
                              Ty, Intrinsic::aarch64_neon_tbx1, "vtbx1");
  }
  case NEON::BI__builtin_neon_vtbx3_v: {
    Value *TblRes =
        packTBLDVectorList(CGF, ArrayRef(Ops).slice(1, 3), nullptr, Ops[4], Ty,
                           Intrinsic::aarch64_neon_tbl2, "vtbl2");

    llvm::Constant *TwentyFourV = ConstantInt::get(Ty, 24);
    Value *CmpRes = Builder.CreateICmp(ICmpInst::ICMP_UGE, Ops[4],
                                       TwentyFourV);
    CmpRes = Builder.CreateSExt(CmpRes, Ty);

    Value *EltsFromInput = Builder.CreateAnd(CmpRes, Ops[0]);
    Value *EltsFromTbl = Builder.CreateAnd(Builder.CreateNot(CmpRes), TblRes);
    return Builder.CreateOr(EltsFromInput, EltsFromTbl, "vtbx");
  }
  case NEON::BI__builtin_neon_vtbx4_v: {
    return packTBLDVectorList(CGF, ArrayRef(Ops).slice(1, 4), Ops[0], Ops[5],
                              Ty, Intrinsic::aarch64_neon_tbx2, "vtbx2");
  }
  case NEON::BI__builtin_neon_vqtbl1_v:
  case NEON::BI__builtin_neon_vqtbl1q_v:
    Int = Intrinsic::aarch64_neon_tbl1; s = "vtbl1"; break;
  case NEON::BI__builtin_neon_vqtbl2_v:
  case NEON::BI__builtin_neon_vqtbl2q_v:
    Int = Intrinsic::aarch64_neon_tbl2; s = "vtbl2"; break;
  case NEON::BI__builtin_neon_vqtbl3_v:
  case NEON::BI__builtin_neon_vqtbl3q_v:
    Int = Intrinsic::aarch64_neon_tbl3; s = "vtbl3"; break;
  case NEON::BI__builtin_neon_vqtbl4_v:
  case NEON::BI__builtin_neon_vqtbl4q_v:
    Int = Intrinsic::aarch64_neon_tbl4; s = "vtbl4"; break;
  case NEON::BI__builtin_neon_vqtbx1_v:
  case NEON::BI__builtin_neon_vqtbx1q_v:
    Int = Intrinsic::aarch64_neon_tbx1; s = "vtbx1"; break;
  case NEON::BI__builtin_neon_vqtbx2_v:
  case NEON::BI__builtin_neon_vqtbx2q_v:
    Int = Intrinsic::aarch64_neon_tbx2; s = "vtbx2"; break;
  case NEON::BI__builtin_neon_vqtbx3_v:
  case NEON::BI__builtin_neon_vqtbx3q_v:
    Int = Intrinsic::aarch64_neon_tbx3; s = "vtbx3"; break;
  case NEON::BI__builtin_neon_vqtbx4_v:
  case NEON::BI__builtin_neon_vqtbx4q_v:
    Int = Intrinsic::aarch64_neon_tbx4; s = "vtbx4"; break;
  }

  if (!Int)
    return nullptr;

  Function *F = CGF.CGM.getIntrinsic(Int, Ty);
  return CGF.EmitNeonCall(F, Ops, s);
}

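// Wrap a scalar in lane 0 of a <4 x i16> vector (the other lanes are poison)
// so that scalar builtins can reuse the vector intrinsic definitions.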
Value *CodeGenFunction::vectorWrapScalar16(Value *Op) {
  auto *VTy = llvm::FixedVectorType::get(Int16Ty, 4);
  Op = Builder.CreateBitCast(Op, Int16Ty);
  Value *V = PoisonValue::get(VTy);
  llvm::Constant *CI = ConstantInt::get(SizeTy, 0);
  Op = Builder.CreateInsertElement(V, Op, CI);
  return Op;
}

/// SVEBuiltinMemEltTy - Returns the memory element type for this memory
/// access builtin. Only required if it can't be inferred from the base pointer
/// operand.
llvm::Type *CodeGenFunction::SVEBuiltinMemEltTy(const SVETypeFlags &TypeFlags) {
  switch (TypeFlags.getMemEltType()) {
  case SVETypeFlags::MemEltTyDefault:
    return getEltType(TypeFlags);
  case SVETypeFlags::MemEltTyInt8:
    return Builder.getInt8Ty();
  case SVETypeFlags::MemEltTyInt16:
    return Builder.getInt16Ty();
  case SVETypeFlags::MemEltTyInt32:
    return Builder.getInt32Ty();
  case SVETypeFlags::MemEltTyInt64:
    return Builder.getInt64Ty();
  }
  llvm_unreachable("Unknown MemEltType");
}

llvm::Type *CodeGenFunction::getEltType(const SVETypeFlags &TypeFlags) {
  switch (TypeFlags.getEltType()) {
  default:
    llvm_unreachable("Invalid SVETypeFlag!");

  case SVETypeFlags::EltTyMFloat8:
  case SVETypeFlags::EltTyInt8:
    return Builder.getInt8Ty();
  case SVETypeFlags::EltTyInt16:
    return Builder.getInt16Ty();
  case SVETypeFlags::EltTyInt32:
    return Builder.getInt32Ty();
  case SVETypeFlags::EltTyInt64:
    return Builder.getInt64Ty();
  case SVETypeFlags::EltTyInt128:
    return Builder.getInt128Ty();

  case SVETypeFlags::EltTyFloat16:
    return Builder.getHalfTy();
  case SVETypeFlags::EltTyFloat32:
    return Builder.getFloatTy();
  case SVETypeFlags::EltTyFloat64:
    return Builder.getDoubleTy();

  case SVETypeFlags::EltTyBFloat16:
    return Builder.getBFloatTy();

  case SVETypeFlags::EltTyBool8:
  case SVETypeFlags::EltTyBool16:
  case SVETypeFlags::EltTyBool32:
  case SVETypeFlags::EltTyBool64:
    return Builder.getInt1Ty();
  }
}

// Return the llvm predicate vector type corresponding to the specified element
// TypeFlags.
llvm::ScalableVectorType *
CodeGenFunction::getSVEPredType(const SVETypeFlags &TypeFlags) {
  switch (TypeFlags.getEltType()) {
  default: llvm_unreachable("Unhandled SVETypeFlag!");

  case SVETypeFlags::EltTyInt8:
    return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 16);
  case SVETypeFlags::EltTyInt16:
    return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 8);
  case SVETypeFlags::EltTyInt32:
    return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 4);
  case SVETypeFlags::EltTyInt64:
    return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 2);

  case SVETypeFlags::EltTyBFloat16:
    return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 8);
  case SVETypeFlags::EltTyFloat16:
    return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 8);
  case SVETypeFlags::EltTyFloat32:
    return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 4);
  case SVETypeFlags::EltTyFloat64:
    return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 2);

  case SVETypeFlags::EltTyBool8:
    return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 16);
  case SVETypeFlags::EltTyBool16:
    return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 8);
  case SVETypeFlags::EltTyBool32:
    return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 4);
  case SVETypeFlags::EltTyBool64:
    return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 2);
  }
}

// Return the llvm vector type corresponding to the specified element
// TypeFlags.
llvm::ScalableVectorType *
CodeGenFunction::getSVEType(const SVETypeFlags &TypeFlags) {
  switch (TypeFlags.getEltType()) {
  default:
    llvm_unreachable("Invalid SVETypeFlag!");

  case SVETypeFlags::EltTyInt8:
    return llvm::ScalableVectorType::get(Builder.getInt8Ty(), 16);
  case SVETypeFlags::EltTyInt16:
    return llvm::ScalableVectorType::get(Builder.getInt16Ty(), 8);
  case SVETypeFlags::EltTyInt32:
    return llvm::ScalableVectorType::get(Builder.getInt32Ty(), 4);
  case SVETypeFlags::EltTyInt64:
    return llvm::ScalableVectorType::get(Builder.getInt64Ty(), 2);

  case SVETypeFlags::EltTyMFloat8:
    return llvm::ScalableVectorType::get(Builder.getInt8Ty(), 16);
  case SVETypeFlags::EltTyFloat16:
    return llvm::ScalableVectorType::get(Builder.getHalfTy(), 8);
  case SVETypeFlags::EltTyBFloat16:
    return llvm::ScalableVectorType::get(Builder.getBFloatTy(), 8);
  case SVETypeFlags::EltTyFloat32:
    return llvm::ScalableVectorType::get(Builder.getFloatTy(), 4);
  case SVETypeFlags::EltTyFloat64:
    return llvm::ScalableVectorType::get(Builder.getDoubleTy(), 2);

  case SVETypeFlags::EltTyBool8:
    return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 16);
  case SVETypeFlags::EltTyBool16:
    return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 8);
  case SVETypeFlags::EltTyBool32:
    return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 4);
  case SVETypeFlags::EltTyBool64:
    return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 2);
  }
}

llvm::Value *
CodeGenFunction::EmitSVEAllTruePred(const SVETypeFlags &TypeFlags) {
  Function *Ptrue =
      CGM.getIntrinsic(Intrinsic::aarch64_sve_ptrue, getSVEPredType(TypeFlags));
  return Builder.CreateCall(Ptrue, {Builder.getInt32(/*SV_ALL*/ 31)});
}

constexpr unsigned SVEBitsPerBlock = 128;

static llvm::ScalableVectorType *getSVEVectorForElementType(llvm::Type *EltTy) {
  unsigned NumElts = SVEBitsPerBlock / EltTy->getScalarSizeInBits();
  return llvm::ScalableVectorType::get(EltTy, NumElts);
}

// Reinterpret the input predicate so that it can be used to correctly isolate
// the elements of the specified datatype.
Value *CodeGenFunction::EmitSVEPredicateCast(Value *Pred,
                                             llvm::ScalableVectorType *VTy) {
  if (isa<TargetExtType>(Pred->getType()) &&
      cast<TargetExtType>(Pred->getType())->getName() == "aarch64.svcount")
    return Pred;

  auto *RTy = llvm::VectorType::get(IntegerType::get(getLLVMContext(), 1), VTy);
  if (Pred->getType() == RTy)
    return Pred;

  unsigned IntID;
  llvm::Type *IntrinsicTy;
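  // An svbool_t predicate always has 16 x i1 lanes; narrowing to fewer lanes
  // uses convert_from_svbool, and widening back to 16 lanes uses
  // convert_to_svbool.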
  switch (VTy->getMinNumElements()) {
  default:
    llvm_unreachable("unsupported element count!");
  case 1:
  case 2:
  case 4:
  case 8:
    IntID = Intrinsic::aarch64_sve_convert_from_svbool;
    IntrinsicTy = RTy;
    break;
  case 16:
    IntID = Intrinsic::aarch64_sve_convert_to_svbool;
    IntrinsicTy = Pred->getType();
    break;
  }

  Function *F = CGM.getIntrinsic(IntID, IntrinsicTy);
  Value *C = Builder.CreateCall(F, Pred);
  assert(C->getType() == RTy && "Unexpected return type!");
  return C;
}

Value *CodeGenFunction::EmitSVEPredicateTupleCast(Value *PredTuple,
                                                  llvm::StructType *Ty) {
  if (PredTuple->getType() == Ty)
    return PredTuple;

  Value *Ret = llvm::PoisonValue::get(Ty);
  for (unsigned I = 0; I < Ty->getNumElements(); ++I) {
    Value *Pred = Builder.CreateExtractValue(PredTuple, I);
    Pred = EmitSVEPredicateCast(
        Pred, cast<llvm::ScalableVectorType>(Ty->getTypeAtIndex(I)));
    Ret = Builder.CreateInsertValue(Ret, Pred, I);
  }

  return Ret;
}

Value *CodeGenFunction::EmitSVEGatherLoad(const SVETypeFlags &TypeFlags,
                                          SmallVectorImpl<Value *> &Ops,
                                          unsigned IntID) {
  auto *ResultTy = getSVEType(TypeFlags);
  auto *OverloadedTy =
      llvm::ScalableVectorType::get(SVEBuiltinMemEltTy(TypeFlags), ResultTy);

  Function *F = nullptr;
  if (Ops[1]->getType()->isVectorTy())
    // This is the "vector base, scalar offset" case. In order to uniquely
    // map this built-in to an LLVM IR intrinsic, we need both the return type
    // and the type of the vector base.
    F = CGM.getIntrinsic(IntID, {OverloadedTy, Ops[1]->getType()});
  else
    // This is the "scalar base, vector offset" case. The type of the offset
    // is encoded in the name of the intrinsic. We only need to specify the
    // return type in order to uniquely map this built-in to an LLVM IR
    // intrinsic.
    F = CGM.getIntrinsic(IntID, OverloadedTy);

  // At the ACLE level there's only one predicate type, svbool_t, which is
  // mapped to <n x 16 x i1>. However, this might be incompatible with the
  // actual type being loaded. For example, when loading doubles (i64) the
  // predicate should be <n x 2 x i1> instead. At the IR level the type of
  // the predicate and the data being loaded must match. Cast to the type
  // expected by the intrinsic. The intrinsic itself should be defined in
  // a way that enforces relations between parameter types.
  Ops[0] = EmitSVEPredicateCast(
      Ops[0], cast<llvm::ScalableVectorType>(F->getArg(0)->getType()));

  // Pass 0 when the offset is missing. This can only be applied when using
  // the "vector base" addressing mode for which ACLE allows no offset. The
  // corresponding LLVM IR always requires an offset.
  if (Ops.size() == 2) {
    assert(Ops[1]->getType()->isVectorTy() && "Scalar base requires an offset");
    Ops.push_back(ConstantInt::get(Int64Ty, 0));
  }

  // For "vector base, scalar index" scale the index so that it becomes a
  // scalar offset.
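  // For example, for 32-bit memory elements BytesPerElt is 4, so the index is
  // shifted left by 2 to turn it into a byte offset.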
  if (!TypeFlags.isByteIndexed() && Ops[1]->getType()->isVectorTy()) {
    unsigned BytesPerElt =
        OverloadedTy->getElementType()->getScalarSizeInBits() / 8;
    Ops[2] = Builder.CreateShl(Ops[2], Log2_32(BytesPerElt));
  }

  Value *Call = Builder.CreateCall(F, Ops);

  // The following sext/zext is only needed when ResultTy != OverloadedTy. In
  // other cases it's folded into a nop.
  return TypeFlags.isZExtReturn() ? Builder.CreateZExt(Call, ResultTy)
                                  : Builder.CreateSExt(Call, ResultTy);
}

Value *CodeGenFunction::EmitSVEScatterStore(const SVETypeFlags &TypeFlags,
                                            SmallVectorImpl<Value *> &Ops,
                                            unsigned IntID) {
  auto *SrcDataTy = getSVEType(TypeFlags);
  auto *OverloadedTy =
      llvm::ScalableVectorType::get(SVEBuiltinMemEltTy(TypeFlags), SrcDataTy);

  // In ACLE the source data is passed in the last argument, whereas in LLVM IR
  // it's the first argument. Move it accordingly.
  Ops.insert(Ops.begin(), Ops.pop_back_val());

  Function *F = nullptr;
  if (Ops[2]->getType()->isVectorTy())
    // This is the "vector base, scalar offset" case. In order to uniquely
    // map this built-in to an LLVM IR intrinsic, we need both the return type
    // and the type of the vector base.
    F = CGM.getIntrinsic(IntID, {OverloadedTy, Ops[2]->getType()});
  else
    // This is the "scalar base, vector offset" case. The type of the offset
    // is encoded in the name of the intrinsic. We only need to specify the
    // return type in order to uniquely map this built-in to an LLVM IR
    // intrinsic.
    F = CGM.getIntrinsic(IntID, OverloadedTy);

  // Pass 0 when the offset is missing. This can only be applied when using
  // the "vector base" addressing mode for which ACLE allows no offset. The
  // corresponding LLVM IR always requires an offset.
  if (Ops.size() == 3) {
    assert(Ops[1]->getType()->isVectorTy() && "Scalar base requires an offset");
    Ops.push_back(ConstantInt::get(Int64Ty, 0));
  }

  // Truncation is needed when SrcDataTy != OverloadedTy. In other cases it's
  // folded into a nop.
  Ops[0] = Builder.CreateTrunc(Ops[0], OverloadedTy);

  // At the ACLE level there's only one predicate type, svbool_t, which is
  // mapped to <n x 16 x i1>. However, this might be incompatible with the
  // actual type being stored. For example, when storing doubles (i64) the
  // predicate should be <n x 2 x i1> instead. At the IR level the type of
  // the predicate and the data being stored must match. Cast to the type
  // expected by the intrinsic. The intrinsic itself should be defined in
  // a way that enforces relations between parameter types.
  Ops[1] = EmitSVEPredicateCast(
      Ops[1], cast<llvm::ScalableVectorType>(F->getArg(1)->getType()));

  // For "vector base, scalar index" scale the index so that it becomes a
  // scalar offset.
  if (!TypeFlags.isByteIndexed() && Ops[2]->getType()->isVectorTy()) {
    unsigned BytesPerElt =
        OverloadedTy->getElementType()->getScalarSizeInBits() / 8;
    Ops[3] = Builder.CreateShl(Ops[3], Log2_32(BytesPerElt));
  }

  return Builder.CreateCall(F, Ops);
}

Value *CodeGenFunction::EmitSVEGatherPrefetch(const SVETypeFlags &TypeFlags,
                                              SmallVectorImpl<Value *> &Ops,
                                              unsigned IntID) {
  // The gather prefetches are overloaded on the vector input - this can either
  // be the vector of base addresses or the vector of offsets.
  auto *OverloadedTy = dyn_cast<llvm::ScalableVectorType>(Ops[1]->getType());
  if (!OverloadedTy)
    OverloadedTy = cast<llvm::ScalableVectorType>(Ops[2]->getType());

  // Cast the predicate from svbool_t to the right number of elements.
  Ops[0] = EmitSVEPredicateCast(Ops[0], OverloadedTy);

  // vector + imm addressing modes
  if (Ops[1]->getType()->isVectorTy()) {
    if (Ops.size() == 3) {
      // Pass 0 for 'vector+imm' when the index is omitted.
      Ops.push_back(ConstantInt::get(Int64Ty, 0));

      // The sv_prfop is the last operand in the builtin and IR intrinsic.
      std::swap(Ops[2], Ops[3]);
    } else {
      // Index needs to be passed as scaled offset.
      llvm::Type *MemEltTy = SVEBuiltinMemEltTy(TypeFlags);
      unsigned BytesPerElt = MemEltTy->getPrimitiveSizeInBits() / 8;
      if (BytesPerElt > 1)
        Ops[2] = Builder.CreateShl(Ops[2], Log2_32(BytesPerElt));
    }
  }

  Function *F = CGM.getIntrinsic(IntID, OverloadedTy);
  return Builder.CreateCall(F, Ops);
}

Value *CodeGenFunction::EmitSVEStructLoad(const SVETypeFlags &TypeFlags,
                                          SmallVectorImpl<Value *> &Ops,
                                          unsigned IntID) {
  llvm::ScalableVectorType *VTy = getSVEType(TypeFlags);
  Value *Predicate = EmitSVEPredicateCast(Ops[0], VTy);
  Value *BasePtr = Ops[1];

  // Does the load have an offset?
  if (Ops.size() > 2)
    BasePtr = Builder.CreateGEP(VTy, BasePtr, Ops[2]);

  Function *F = CGM.getIntrinsic(IntID, {VTy});
  return Builder.CreateCall(F, {Predicate, BasePtr});
}

Value *CodeGenFunction::EmitSVEStructStore(const SVETypeFlags &TypeFlags,
                                           SmallVectorImpl<Value *> &Ops,
                                           unsigned IntID) {
  llvm::ScalableVectorType *VTy = getSVEType(TypeFlags);

  unsigned N;
  switch (IntID) {
  case Intrinsic::aarch64_sve_st2:
  case Intrinsic::aarch64_sve_st1_pn_x2:
  case Intrinsic::aarch64_sve_stnt1_pn_x2:
  case Intrinsic::aarch64_sve_st2q:
    N = 2;
    break;
  case Intrinsic::aarch64_sve_st3:
  case Intrinsic::aarch64_sve_st3q:
    N = 3;
    break;
  case Intrinsic::aarch64_sve_st4:
  case Intrinsic::aarch64_sve_st1_pn_x4:
  case Intrinsic::aarch64_sve_stnt1_pn_x4:
  case Intrinsic::aarch64_sve_st4q:
    N = 4;
    break;
  default:
    llvm_unreachable("unknown intrinsic!");
  }

  Value *Predicate = EmitSVEPredicateCast(Ops[0], VTy);
  Value *BasePtr = Ops[1];

  // Does the store have an offset?
  if (Ops.size() > (2 + N))
    BasePtr = Builder.CreateGEP(VTy, BasePtr, Ops[2]);

  // The llvm.aarch64.sve.st2/3/4 intrinsics take legal part vectors, so we
  // need to break up the tuple vector.
  SmallVector<llvm::Value *, 5> Operands;
  for (unsigned I = Ops.size() - N; I < Ops.size(); ++I)
    Operands.push_back(Ops[I]);
  Operands.append({Predicate, BasePtr});
  Function *F = CGM.getIntrinsic(IntID, {VTy});

  return Builder.CreateCall(F, Operands);
}

// SVE2's svpmullb and svpmullt builtins are similar to the svpmullb_pair and
// svpmullt_pair intrinsics, with the exception that their results are bitcast
// to a wider type.
Value *CodeGenFunction::EmitSVEPMull(const SVETypeFlags &TypeFlags,
                                     SmallVectorImpl<Value *> &Ops,
                                     unsigned BuiltinID) {
  // Splat scalar operand to vector (intrinsics with _n infix)
  if (TypeFlags.hasSplatOperand()) {
    unsigned OpNo = TypeFlags.getSplatOperand();
    Ops[OpNo] = EmitSVEDupX(Ops[OpNo]);
  }

  // The pair-wise function has a narrower overloaded type.
  Function *F = CGM.getIntrinsic(BuiltinID, Ops[0]->getType());
  Value *Call = Builder.CreateCall(F, {Ops[0], Ops[1]});

  // Now bitcast to the wider result type.
  llvm::ScalableVectorType *Ty = getSVEType(TypeFlags);
  return EmitSVEReinterpret(Call, Ty);
}

Value *CodeGenFunction::EmitSVEMovl(const SVETypeFlags &TypeFlags,
                                    ArrayRef<Value *> Ops, unsigned BuiltinID) {
  llvm::Type *OverloadedTy = getSVEType(TypeFlags);
  Function *F = CGM.getIntrinsic(BuiltinID, OverloadedTy);
  return Builder.CreateCall(F, {Ops[0], Builder.getInt32(0)});
}

Value *CodeGenFunction::EmitSVEPrefetchLoad(const SVETypeFlags &TypeFlags,
                                            SmallVectorImpl<Value *> &Ops,
                                            unsigned BuiltinID) {
  auto *MemEltTy = SVEBuiltinMemEltTy(TypeFlags);
  auto *VectorTy = getSVEVectorForElementType(MemEltTy);
  auto *MemoryTy = llvm::ScalableVectorType::get(MemEltTy, VectorTy);

  Value *Predicate = EmitSVEPredicateCast(Ops[0], MemoryTy);
  Value *BasePtr = Ops[1];

  // Apply the index operand, if present, to the base pointer.
  if (Ops.size() > 3)
    BasePtr = Builder.CreateGEP(MemoryTy, BasePtr, Ops[2]);

  Value *PrfOp = Ops.back();

  Function *F = CGM.getIntrinsic(BuiltinID, Predicate->getType());
  return Builder.CreateCall(F, {Predicate, BasePtr, PrfOp});
}

Value *CodeGenFunction::EmitSVEMaskedLoad(const CallExpr *E,
                                          llvm::Type *ReturnTy,
                                          SmallVectorImpl<Value *> &Ops,
                                          unsigned IntrinsicID,
                                          bool IsZExtReturn) {
  QualType LangPTy = E->getArg(1)->getType();
  llvm::Type *MemEltTy = CGM.getTypes().ConvertType(
      LangPTy->castAs<PointerType>()->getPointeeType());

  // The mfloat8 type is represented as a vector, so extra work is needed to
  // extract the scalar element type.
  if (MemEltTy->isVectorTy()) {
    assert(MemEltTy == FixedVectorType::get(Int8Ty, 1) &&
           "Only <1 x i8> expected");
    MemEltTy = cast<llvm::VectorType>(MemEltTy)->getElementType();
  }

  // The vector type that is returned may be different from the
  // eventual type loaded from memory.
  auto VectorTy = cast<llvm::ScalableVectorType>(ReturnTy);
  llvm::ScalableVectorType *MemoryTy = nullptr;
  llvm::ScalableVectorType *PredTy = nullptr;
  bool IsQuadLoad = false;
  switch (IntrinsicID) {
  case Intrinsic::aarch64_sve_ld1uwq:
  case Intrinsic::aarch64_sve_ld1udq:
    MemoryTy = llvm::ScalableVectorType::get(MemEltTy, 1);
    PredTy = llvm::ScalableVectorType::get(
        llvm::Type::getInt1Ty(getLLVMContext()), 1);
    IsQuadLoad = true;
    break;
  default:
    MemoryTy = llvm::ScalableVectorType::get(MemEltTy, VectorTy);
    PredTy = MemoryTy;
    break;
  }

  Value *Predicate = EmitSVEPredicateCast(Ops[0], PredTy);
  Value *BasePtr = Ops[1];

  // Does the load have an offset?
  if (Ops.size() > 2)
    BasePtr = Builder.CreateGEP(MemoryTy, BasePtr, Ops[2]);

  Function *F = CGM.getIntrinsic(IntrinsicID, IsQuadLoad ? VectorTy : MemoryTy);
  auto *Load =
      cast<llvm::Instruction>(Builder.CreateCall(F, {Predicate, BasePtr}));
  auto TBAAInfo = CGM.getTBAAAccessInfo(LangPTy->getPointeeType());
  CGM.DecorateInstructionWithTBAA(Load, TBAAInfo);

  if (IsQuadLoad)
    return Load;

  return IsZExtReturn ? Builder.CreateZExt(Load, VectorTy)
                      : Builder.CreateSExt(Load, VectorTy);
}

Value *CodeGenFunction::EmitSVEMaskedStore(const CallExpr *E,
                                           SmallVectorImpl<Value *> &Ops,
                                           unsigned IntrinsicID) {
  QualType LangPTy = E->getArg(1)->getType();
  llvm::Type *MemEltTy = CGM.getTypes().ConvertType(
      LangPTy->castAs<PointerType>()->getPointeeType());

  // The mfloat8 type is represented as a vector, so extra work is needed to
  // extract the scalar element type.
3755 if (MemEltTy->isVectorTy()) {
3756 assert(MemEltTy == FixedVectorType::get(Int8Ty, 1) &&
3757 "Only <1 x i8> expected");
3758 MemEltTy = cast<llvm::VectorType>(Val: MemEltTy)->getElementType();
3759 }
3760
3761 // The vector type that is stored may be different from the
3762 // eventual type stored to memory.
3763 auto VectorTy = cast<llvm::ScalableVectorType>(Val: Ops.back()->getType());
3764 auto MemoryTy = llvm::ScalableVectorType::get(ElementType: MemEltTy, SVTy: VectorTy);
3765
3766 auto PredTy = MemoryTy;
3767 auto AddrMemoryTy = MemoryTy;
3768 bool IsQuadStore = false;
3769
3770 switch (IntrinsicID) {
3771 case Intrinsic::aarch64_sve_st1wq:
3772 case Intrinsic::aarch64_sve_st1dq:
3773 AddrMemoryTy = llvm::ScalableVectorType::get(ElementType: MemEltTy, MinNumElts: 1);
3774 PredTy =
3775 llvm::ScalableVectorType::get(ElementType: IntegerType::get(C&: getLLVMContext(), NumBits: 1), MinNumElts: 1);
3776 IsQuadStore = true;
3777 break;
3778 default:
3779 break;
3780 }
3781 Value *Predicate = EmitSVEPredicateCast(Pred: Ops[0], VTy: PredTy);
3782 Value *BasePtr = Ops[1];
3783
3784 // Does the store have an offset?
3785 if (Ops.size() == 4)
3786 BasePtr = Builder.CreateGEP(Ty: AddrMemoryTy, Ptr: BasePtr, IdxList: Ops[2]);
3787
3788 // The last value is always the data.
3789 Value *Val =
3790 IsQuadStore ? Ops.back() : Builder.CreateTrunc(V: Ops.back(), DestTy: MemoryTy);
3791
3792 Function *F =
3793 CGM.getIntrinsic(IID: IntrinsicID, Tys: IsQuadStore ? VectorTy : MemoryTy);
3794 auto *Store =
3795 cast<llvm::Instruction>(Val: Builder.CreateCall(Callee: F, Args: {Val, Predicate, BasePtr}));
3796 auto TBAAInfo = CGM.getTBAAAccessInfo(AccessType: LangPTy->getPointeeType());
3797 CGM.DecorateInstructionWithTBAA(Inst: Store, TBAAInfo);
3798 return Store;
3799}
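// Illustrative sketch (not from the original source): for a truncating store
// such as svst1b_s32(pg, base, v), the value is first truncated from
// <vscale x 4 x i32> to the memory type <vscale x 4 x i8>, giving roughly:
//   %t = trunc <vscale x 4 x i32> %v to <vscale x 4 x i8>
//   call void @llvm.aarch64.sve.st1.nxv4i8(<vscale x 4 x i8> %t,
//                                          <vscale x 4 x i1> %pg, ptr %base)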
3800
3801Value *CodeGenFunction::EmitSMELd1St1(const SVETypeFlags &TypeFlags,
3802 SmallVectorImpl<Value *> &Ops,
3803 unsigned IntID) {
3804 Ops[2] = EmitSVEPredicateCast(
3805 Pred: Ops[2], VTy: getSVEVectorForElementType(EltTy: SVEBuiltinMemEltTy(TypeFlags)));
3806
3807 SmallVector<Value *> NewOps;
3808 NewOps.push_back(Elt: Ops[2]);
3809
3810 llvm::Value *BasePtr = Ops[3];
3811 llvm::Value *RealSlice = Ops[1];
3812 // If the intrinsic takes a vnum parameter, multiply it by the vector size
3813 // in bytes.
3814 if (Ops.size() == 5) {
3815 Function *StreamingVectorLength =
3816 CGM.getIntrinsic(IID: Intrinsic::aarch64_sme_cntsd);
3817 llvm::Value *StreamingVectorLengthCall =
3818 Builder.CreateMul(LHS: Builder.CreateCall(Callee: StreamingVectorLength),
3819 RHS: llvm::ConstantInt::get(Ty: Int64Ty, V: 8), Name: "svl",
3820 /* HasNUW */ true, /* HasNSW */ true);
3821 llvm::Value *Mulvl =
3822 Builder.CreateMul(LHS: StreamingVectorLengthCall, RHS: Ops[4], Name: "mulvl");
3823 // The type of the ptr parameter is void *, so use Int8Ty here.
3824 BasePtr = Builder.CreateGEP(Ty: Int8Ty, Ptr: Ops[3], IdxList: Mulvl);
3825 RealSlice = Builder.CreateZExt(V: RealSlice, DestTy: Int64Ty);
3826 RealSlice = Builder.CreateAdd(LHS: RealSlice, RHS: Ops[4]);
3827 RealSlice = Builder.CreateTrunc(V: RealSlice, DestTy: Int32Ty);
3828 }
3829 NewOps.push_back(Elt: BasePtr);
3830 NewOps.push_back(Elt: Ops[0]);
3831 NewOps.push_back(Elt: RealSlice);
3832 Function *F = CGM.getIntrinsic(IID: IntID);
3833 return Builder.CreateCall(Callee: F, Args: NewOps);
3834}
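// Worked example (a sketch assuming a 512-bit streaming vector length, not
// from the original source): cntsd then returns 8 doublewords, so "svl"
// above is 8 * 8 == 64 bytes. For vnum == 2 the base pointer advances by
// 64 * 2 == 128 bytes and the tile slice index becomes slice + 2, i.e. one
// whole ZA row per unit of vnum.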
3835
3836Value *CodeGenFunction::EmitSMEReadWrite(const SVETypeFlags &TypeFlags,
3837 SmallVectorImpl<Value *> &Ops,
3838 unsigned IntID) {
3839 auto *VecTy = getSVEType(TypeFlags);
3840 Function *F = CGM.getIntrinsic(IID: IntID, Tys: VecTy);
3841 if (TypeFlags.isReadZA())
3842 Ops[1] = EmitSVEPredicateCast(Pred: Ops[1], VTy: VecTy);
3843 else if (TypeFlags.isWriteZA())
3844 Ops[2] = EmitSVEPredicateCast(Pred: Ops[2], VTy: VecTy);
3845 return Builder.CreateCall(Callee: F, Args: Ops);
3846}
3847
3848Value *CodeGenFunction::EmitSMEZero(const SVETypeFlags &TypeFlags,
3849 SmallVectorImpl<Value *> &Ops,
3850 unsigned IntID) {
3851 // The svzero_za() intrinsic zeros the entire ZA array and has no parameters.
3852 if (Ops.size() == 0)
3853 Ops.push_back(Elt: llvm::ConstantInt::get(Ty: Int32Ty, V: 255));
3854 Function *F = CGM.getIntrinsic(IID: IntID, Tys: {});
3855 return Builder.CreateCall(Callee: F, Args: Ops);
3856}
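// Illustrative note (not from the original source): the mask operand of
// llvm.aarch64.sme.zero has one bit per 64-bit ZA tile, so the 255 (0xFF)
// pushed above for the parameterless svzero_za() selects all eight tiles,
// i.e. the whole ZA array:
//   call void @llvm.aarch64.sme.zero(i32 255)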
3857
3858Value *CodeGenFunction::EmitSMELdrStr(const SVETypeFlags &TypeFlags,
3859 SmallVectorImpl<Value *> &Ops,
3860 unsigned IntID) {
3861 if (Ops.size() == 2)
3862 Ops.push_back(Elt: Builder.getInt32(C: 0));
3863 else
3864 Ops[2] = Builder.CreateIntCast(V: Ops[2], DestTy: Int32Ty, isSigned: true);
3865 Function *F = CGM.getIntrinsic(IID: IntID, Tys: {});
3866 return Builder.CreateCall(Callee: F, Args: Ops);
3867}
3868
3869 // Splat a scalar operand across all lanes of a scalable vector of the
3870 // given type via IRBuilder::CreateVectorSplat.
3871Value *CodeGenFunction::EmitSVEDupX(Value *Scalar, llvm::Type *Ty) {
3872 return Builder.CreateVectorSplat(
3873 EC: cast<llvm::VectorType>(Val: Ty)->getElementCount(), V: Scalar);
3874}
3875
3876Value *CodeGenFunction::EmitSVEDupX(Value *Scalar) {
3877 if (auto *Ty = Scalar->getType(); Ty->isVectorTy()) {
3878#ifndef NDEBUG
3879 auto *VecTy = cast<llvm::VectorType>(Ty);
3880 ElementCount EC = VecTy->getElementCount();
3881 assert(EC.isScalar() && VecTy->getElementType() == Int8Ty &&
3882 "Only <1 x i8> expected");
3883#endif
3884 Scalar = Builder.CreateExtractElement(Vec: Scalar, Idx: uint64_t(0));
3885 }
3886 return EmitSVEDupX(Scalar, Ty: getSVEVectorForElementType(EltTy: Scalar->getType()));
3887}
3888
3889Value *CodeGenFunction::EmitSVEReinterpret(Value *Val, llvm::Type *Ty) {
3890 // FIXME: For big endian this needs an additional REV, or needs a separate
3891 // intrinsic that is code-generated as a no-op, because the LLVM bitcast
3892 // instruction is defined as 'bitwise' equivalent from memory point of
3893 // view (when storing/reloading), whereas the svreinterpret builtin
3894 // implements bitwise equivalent cast from register point of view.
3895 // LLVM CodeGen for a bitcast must add an explicit REV for big-endian.
3896
3897 if (auto *StructTy = dyn_cast<StructType>(Val: Ty)) {
3898 Value *Tuple = llvm::PoisonValue::get(T: Ty);
3899
3900 for (unsigned I = 0; I < StructTy->getNumElements(); ++I) {
3901 Value *In = Builder.CreateExtractValue(Agg: Val, Idxs: I);
3902 Value *Out = Builder.CreateBitCast(V: In, DestTy: StructTy->getTypeAtIndex(N: I));
3903 Tuple = Builder.CreateInsertValue(Agg: Tuple, Val: Out, Idxs: I);
3904 }
3905
3906 return Tuple;
3907 }
3908
3909 return Builder.CreateBitCast(V: Val, DestTy: Ty);
3910}
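// Illustrative sketch (not from the original source): svreinterpret_s32_f32
// becomes a single bitcast, e.g.
//   %r = bitcast <vscale x 4 x float> %v to <vscale x 4 x i32>
// while a tuple such as svfloat32x2_t -> svint32x2_t is rebuilt element-wise
// by the extractvalue/bitcast/insertvalue loop above.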
3911
3912static void InsertExplicitZeroOperand(CGBuilderTy &Builder, llvm::Type *Ty,
3913 SmallVectorImpl<Value *> &Ops) {
3914 auto *SplatZero = Constant::getNullValue(Ty);
3915 Ops.insert(I: Ops.begin(), Elt: SplatZero);
3916}
3917
3918static void InsertExplicitUndefOperand(CGBuilderTy &Builder, llvm::Type *Ty,
3919 SmallVectorImpl<Value *> &Ops) {
3920 auto *SplatUndef = UndefValue::get(T: Ty);
3921 Ops.insert(I: Ops.begin(), Elt: SplatUndef);
3922}
3923
3924SmallVector<llvm::Type *, 2>
3925CodeGenFunction::getSVEOverloadTypes(const SVETypeFlags &TypeFlags,
3926 llvm::Type *ResultType,
3927 ArrayRef<Value *> Ops) {
3928 if (TypeFlags.isOverloadNone())
3929 return {};
3930
3931 llvm::Type *DefaultType = getSVEType(TypeFlags);
3932
3933 if (TypeFlags.isOverloadWhileOrMultiVecCvt())
3934 return {DefaultType, Ops[1]->getType()};
3935
3936 if (TypeFlags.isOverloadWhileRW())
3937 return {getSVEPredType(TypeFlags), Ops[0]->getType()};
3938
3939 if (TypeFlags.isOverloadFirstandLast())
3940 return {Ops[0]->getType(), Ops.back()->getType()};
3941
3942 if (TypeFlags.isReductionQV() && !ResultType->isScalableTy() &&
3943 ResultType->isVectorTy())
3944 return {ResultType, Ops[1]->getType()};
3945
3946 assert(TypeFlags.isOverloadDefault() && "Unexpected value for overloads");
3947 return {DefaultType};
3948}
3949
3950Value *CodeGenFunction::EmitSVETupleSetOrGet(const SVETypeFlags &TypeFlags,
3951 ArrayRef<Value *> Ops) {
3952 assert((TypeFlags.isTupleSet() || TypeFlags.isTupleGet()) &&
3953 "Expects TypleFlags.isTupleSet() or TypeFlags.isTupleGet()");
3954 unsigned Idx = cast<ConstantInt>(Val: Ops[1])->getZExtValue();
3955
3956 if (TypeFlags.isTupleSet())
3957 return Builder.CreateInsertValue(Agg: Ops[0], Val: Ops[2], Idxs: Idx);
3958 return Builder.CreateExtractValue(Agg: Ops[0], Idxs: Idx);
3959}
3960
3961Value *CodeGenFunction::EmitSVETupleCreate(const SVETypeFlags &TypeFlags,
3962 llvm::Type *Ty,
3963 ArrayRef<Value *> Ops) {
3964 assert(TypeFlags.isTupleCreate() && "Expects TypeFlags.isTupleCreate()");
3965
3966 Value *Tuple = llvm::PoisonValue::get(T: Ty);
3967 for (unsigned Idx = 0; Idx < Ops.size(); Idx++)
3968 Tuple = Builder.CreateInsertValue(Agg: Tuple, Val: Ops[Idx], Idxs: Idx);
3969
3970 return Tuple;
3971}
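// Illustrative sketch (not from the original source): svcreate2_s32(a, b)
// builds the tuple type { <vscale x 4 x i32>, <vscale x 4 x i32> } as
// roughly:
//   %t0 = insertvalue { ... } poison, <vscale x 4 x i32> %a, 0
//   %t1 = insertvalue { ... } %t0, <vscale x 4 x i32> %b, 1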
3972
3973void CodeGenFunction::GetAArch64SVEProcessedOperands(
3974 unsigned BuiltinID, const CallExpr *E, SmallVectorImpl<Value *> &Ops,
3975 SVETypeFlags TypeFlags) {
3976 // Find out if any arguments are required to be integer constant expressions.
3977 unsigned ICEArguments = 0;
3978 ASTContext::GetBuiltinTypeError Error;
3979 getContext().GetBuiltinType(ID: BuiltinID, Error, IntegerConstantArgs: &ICEArguments);
3980 assert(Error == ASTContext::GE_None && "Should not codegen an error");
3981
3982 // Tuple set/get only requires one insert/extract vector, which is
3983 // created by EmitSVETupleSetOrGet.
3984 bool IsTupleGetOrSet = TypeFlags.isTupleSet() || TypeFlags.isTupleGet();
3985
3986 for (unsigned i = 0, e = E->getNumArgs(); i != e; i++) {
3987 bool IsICE = ICEArguments & (1 << i);
3988 Value *Arg = EmitScalarExpr(E: E->getArg(Arg: i));
3989
3990 if (IsICE) {
3991 // If this is required to be a constant, constant fold it so that we know
3992 // that the generated intrinsic gets a ConstantInt.
3993 std::optional<llvm::APSInt> Result =
3994 E->getArg(Arg: i)->getIntegerConstantExpr(Ctx: getContext());
3995 assert(Result && "Expected argument to be a constant");
3996
3997 // Immediates for SVE llvm intrinsics are always 32bit. We can safely
3998 // truncate because the immediate has been range checked and no valid
3999 // immediate requires more than a handful of bits.
4000 *Result = Result->extOrTrunc(width: 32);
4001 Ops.push_back(Elt: llvm::ConstantInt::get(Context&: getLLVMContext(), V: *Result));
4002 continue;
4003 }
4004
4005 if (isa<StructType>(Val: Arg->getType()) && !IsTupleGetOrSet) {
4006 for (unsigned I = 0; I < Arg->getType()->getStructNumElements(); ++I)
4007 Ops.push_back(Elt: Builder.CreateExtractValue(Agg: Arg, Idxs: I));
4008
4009 continue;
4010 }
4011
4012 Ops.push_back(Elt: Arg);
4013 }
4014}
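// Illustrative note (not from the original source): for a call such as
// svext_s32(a, b, 2) the third argument is an ICE, so the loop above folds
// it to the 32-bit ConstantInt 2 rather than emitting a scalar expression,
// while a tuple argument such as an svint32x2_t is flattened into its two
// <vscale x 4 x i32> parts.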
4015
4016Value *CodeGenFunction::EmitAArch64SVEBuiltinExpr(unsigned BuiltinID,
4017 const CallExpr *E) {
4018 llvm::Type *Ty = ConvertType(T: E->getType());
4019 if (BuiltinID >= SVE::BI__builtin_sve_reinterpret_s8_s8 &&
4020 BuiltinID <= SVE::BI__builtin_sve_reinterpret_f64_f64_x4) {
4021 Value *Val = EmitScalarExpr(E: E->getArg(Arg: 0));
4022 return EmitSVEReinterpret(Val, Ty);
4023 }
4024
4025 auto *Builtin = findARMVectorIntrinsicInMap(IntrinsicMap: AArch64SVEIntrinsicMap, BuiltinID,
4026 MapProvenSorted&: AArch64SVEIntrinsicsProvenSorted);
4027
4028 llvm::SmallVector<Value *, 4> Ops;
4029 SVETypeFlags TypeFlags(Builtin->TypeModifier);
4030 GetAArch64SVEProcessedOperands(BuiltinID, E, Ops, TypeFlags);
4031
4032 if (TypeFlags.isLoad())
4033 return EmitSVEMaskedLoad(E, ReturnTy: Ty, Ops, IntrinsicID: Builtin->LLVMIntrinsic,
4034 IsZExtReturn: TypeFlags.isZExtReturn());
4035 if (TypeFlags.isStore())
4036 return EmitSVEMaskedStore(E, Ops, IntrinsicID: Builtin->LLVMIntrinsic);
4037 if (TypeFlags.isGatherLoad())
4038 return EmitSVEGatherLoad(TypeFlags, Ops, IntID: Builtin->LLVMIntrinsic);
4039 if (TypeFlags.isScatterStore())
4040 return EmitSVEScatterStore(TypeFlags, Ops, IntID: Builtin->LLVMIntrinsic);
4041 if (TypeFlags.isPrefetch())
4042 return EmitSVEPrefetchLoad(TypeFlags, Ops, BuiltinID: Builtin->LLVMIntrinsic);
4043 if (TypeFlags.isGatherPrefetch())
4044 return EmitSVEGatherPrefetch(TypeFlags, Ops, IntID: Builtin->LLVMIntrinsic);
4045 if (TypeFlags.isStructLoad())
4046 return EmitSVEStructLoad(TypeFlags, Ops, IntID: Builtin->LLVMIntrinsic);
4047 if (TypeFlags.isStructStore())
4048 return EmitSVEStructStore(TypeFlags, Ops, IntID: Builtin->LLVMIntrinsic);
4049 if (TypeFlags.isTupleSet() || TypeFlags.isTupleGet())
4050 return EmitSVETupleSetOrGet(TypeFlags, Ops);
4051 if (TypeFlags.isTupleCreate())
4052 return EmitSVETupleCreate(TypeFlags, Ty, Ops);
4053 if (TypeFlags.isUndef())
4054 return UndefValue::get(T: Ty);
4055
4056 // Handle built-ins for which there is a corresponding LLVM Intrinsic.
4057 // -------------------------------------------------------------------
4058 if (Builtin->LLVMIntrinsic != 0) {
4059 // Emit a write to FPMR for intrinsics that require it.
4060 if (TypeFlags.setsFPMR())
4061 Builder.CreateCall(Callee: CGM.getIntrinsic(IID: Intrinsic::aarch64_set_fpmr),
4062 Args: Ops.pop_back_val());
4063 if (TypeFlags.getMergeType() == SVETypeFlags::MergeZeroExp)
4064 InsertExplicitZeroOperand(Builder, Ty, Ops);
4065
4066 if (TypeFlags.getMergeType() == SVETypeFlags::MergeAnyExp)
4067 InsertExplicitUndefOperand(Builder, Ty, Ops);
4068
4069 // Some ACLE builtins leave out the argument to specify the predicate
4070 // pattern, which is expected to be expanded to an SV_ALL pattern.
4071 if (TypeFlags.isAppendSVALL())
4072 Ops.push_back(Elt: Builder.getInt32(/*SV_ALL*/ C: 31));
4073 if (TypeFlags.isInsertOp1SVALL())
4074 Ops.insert(I: &Ops[1], Elt: Builder.getInt32(/*SV_ALL*/ C: 31));
4075
4076 // Predicates must match the main datatype.
4077 for (Value *&Op : Ops)
4078 if (auto PredTy = dyn_cast<llvm::VectorType>(Val: Op->getType()))
4079 if (PredTy->getElementType()->isIntegerTy(Bitwidth: 1))
4080 Op = EmitSVEPredicateCast(Pred: Op, VTy: getSVEType(TypeFlags));
4081
4082 // Splat scalar operand to vector (intrinsics with _n infix)
4083 if (TypeFlags.hasSplatOperand()) {
4084 unsigned OpNo = TypeFlags.getSplatOperand();
4085 Ops[OpNo] = EmitSVEDupX(Scalar: Ops[OpNo]);
4086 }
4087
4088 if (TypeFlags.isReverseCompare())
4089 std::swap(a&: Ops[1], b&: Ops[2]);
4090 else if (TypeFlags.isReverseUSDOT())
4091 std::swap(a&: Ops[1], b&: Ops[2]);
4092 else if (TypeFlags.isReverseMergeAnyBinOp() &&
4093 TypeFlags.getMergeType() == SVETypeFlags::MergeAny)
4094 std::swap(a&: Ops[1], b&: Ops[2]);
4095 else if (TypeFlags.isReverseMergeAnyAccOp() &&
4096 TypeFlags.getMergeType() == SVETypeFlags::MergeAny)
4097 std::swap(a&: Ops[1], b&: Ops[3]);
4098
4099 // Predicated intrinsics with _z suffix need a select w/ zeroinitializer.
4100 if (TypeFlags.getMergeType() == SVETypeFlags::MergeZero) {
4101 llvm::Type *OpndTy = Ops[1]->getType();
4102 auto *SplatZero = Constant::getNullValue(Ty: OpndTy);
4103 Ops[1] = Builder.CreateSelect(C: Ops[0], True: Ops[1], False: SplatZero);
4104 }
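// Illustrative note (not from the original source): for svadd_s32_z(pg, a, b)
// the select above zeroes the inactive lanes of the first data operand,
// roughly:
//   %a0 = select <vscale x 4 x i1> %pg, <vscale x 4 x i32> %a,
//                <vscale x 4 x i32> zeroinitializer
// before the predicated add intrinsic is called.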
4105
4106 Function *F = CGM.getIntrinsic(IID: Builtin->LLVMIntrinsic,
4107 Tys: getSVEOverloadTypes(TypeFlags, ResultType: Ty, Ops));
4108 Value *Call = Builder.CreateCall(Callee: F, Args: Ops);
4109
4110 if (Call->getType() == Ty)
4111 return Call;
4112
4113 // Predicate results must be converted to svbool_t.
4114 if (auto PredTy = dyn_cast<llvm::ScalableVectorType>(Val: Ty))
4115 return EmitSVEPredicateCast(Pred: Call, VTy: PredTy);
4116 if (auto PredTupleTy = dyn_cast<llvm::StructType>(Val: Ty))
4117 return EmitSVEPredicateTupleCast(PredTuple: Call, Ty: PredTupleTy);
4118
4119 llvm_unreachable("unsupported element count!");
4120 }
4121
4122 switch (BuiltinID) {
4123 default:
4124 return nullptr;
4125
4126 case SVE::BI__builtin_sve_svreinterpret_b: {
4127 auto SVCountTy =
4128 llvm::TargetExtType::get(Context&: getLLVMContext(), Name: "aarch64.svcount");
4129 Function *CastFromSVCountF =
4130 CGM.getIntrinsic(IID: Intrinsic::aarch64_sve_convert_to_svbool, Tys: SVCountTy);
4131 return Builder.CreateCall(Callee: CastFromSVCountF, Args: Ops[0]);
4132 }
4133 case SVE::BI__builtin_sve_svreinterpret_c: {
4134 auto SVCountTy =
4135 llvm::TargetExtType::get(Context&: getLLVMContext(), Name: "aarch64.svcount");
4136 Function *CastToSVCountF =
4137 CGM.getIntrinsic(IID: Intrinsic::aarch64_sve_convert_from_svbool, Tys: SVCountTy);
4138 return Builder.CreateCall(Callee: CastToSVCountF, Args: Ops[0]);
4139 }
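// Illustrative note (not from the original source): these reinterpret between
// svbool_t and svcount_t via the svbool conversion intrinsics, e.g. for
// svreinterpret_b roughly:
//   %b = call <vscale x 16 x i1>
//        @llvm.aarch64.sve.convert.to.svbool.taarch64.svcountt(
//            target("aarch64.svcount") %c)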
4140
4141 case SVE::BI__builtin_sve_svpsel_lane_b8:
4142 case SVE::BI__builtin_sve_svpsel_lane_b16:
4143 case SVE::BI__builtin_sve_svpsel_lane_b32:
4144 case SVE::BI__builtin_sve_svpsel_lane_b64:
4145 case SVE::BI__builtin_sve_svpsel_lane_c8:
4146 case SVE::BI__builtin_sve_svpsel_lane_c16:
4147 case SVE::BI__builtin_sve_svpsel_lane_c32:
4148 case SVE::BI__builtin_sve_svpsel_lane_c64: {
4149 bool IsSVCount = isa<TargetExtType>(Val: Ops[0]->getType());
4150 assert(((!IsSVCount || cast<TargetExtType>(Ops[0]->getType())->getName() ==
4151 "aarch64.svcount")) &&
4152 "Unexpected TargetExtType");
4153 auto SVCountTy =
4154 llvm::TargetExtType::get(Context&: getLLVMContext(), Name: "aarch64.svcount");
4155 Function *CastFromSVCountF =
4156 CGM.getIntrinsic(IID: Intrinsic::aarch64_sve_convert_to_svbool, Tys: SVCountTy);
4157 Function *CastToSVCountF =
4158 CGM.getIntrinsic(IID: Intrinsic::aarch64_sve_convert_from_svbool, Tys: SVCountTy);
4159
4160 auto OverloadedTy = getSVEType(TypeFlags: SVETypeFlags(Builtin->TypeModifier));
4161 Function *F = CGM.getIntrinsic(IID: Intrinsic::aarch64_sve_psel, Tys: OverloadedTy);
4162 llvm::Value *Ops0 =
4163 IsSVCount ? Builder.CreateCall(Callee: CastFromSVCountF, Args: Ops[0]) : Ops[0];
4164 llvm::Value *Ops1 = EmitSVEPredicateCast(Pred: Ops[1], VTy: OverloadedTy);
4165 llvm::Value *PSel = Builder.CreateCall(Callee: F, Args: {Ops0, Ops1, Ops[2]});
4166 return IsSVCount ? Builder.CreateCall(Callee: CastToSVCountF, Args: PSel) : PSel;
4167 }
4168 case SVE::BI__builtin_sve_svmov_b_z: {
4169 // svmov_b_z(pg, op) <=> svand_b_z(pg, op, op)
4170 SVETypeFlags TypeFlags(Builtin->TypeModifier);
4171 llvm::Type* OverloadedTy = getSVEType(TypeFlags);
4172 Function *F = CGM.getIntrinsic(IID: Intrinsic::aarch64_sve_and_z, Tys: OverloadedTy);
4173 return Builder.CreateCall(Callee: F, Args: {Ops[0], Ops[1], Ops[1]});
4174 }
4175
4176 case SVE::BI__builtin_sve_svnot_b_z: {
4177 // svnot_b_z(pg, op) <=> sveor_b_z(pg, op, pg)
4178 SVETypeFlags TypeFlags(Builtin->TypeModifier);
4179 llvm::Type* OverloadedTy = getSVEType(TypeFlags);
4180 Function *F = CGM.getIntrinsic(IID: Intrinsic::aarch64_sve_eor_z, Tys: OverloadedTy);
4181 return Builder.CreateCall(Callee: F, Args: {Ops[0], Ops[1], Ops[0]});
4182 }
4183
4184 case SVE::BI__builtin_sve_svmovlb_u16:
4185 case SVE::BI__builtin_sve_svmovlb_u32:
4186 case SVE::BI__builtin_sve_svmovlb_u64:
4187 return EmitSVEMovl(TypeFlags, Ops, BuiltinID: Intrinsic::aarch64_sve_ushllb);
4188
4189 case SVE::BI__builtin_sve_svmovlb_s16:
4190 case SVE::BI__builtin_sve_svmovlb_s32:
4191 case SVE::BI__builtin_sve_svmovlb_s64:
4192 return EmitSVEMovl(TypeFlags, Ops, BuiltinID: Intrinsic::aarch64_sve_sshllb);
4193
4194 case SVE::BI__builtin_sve_svmovlt_u16:
4195 case SVE::BI__builtin_sve_svmovlt_u32:
4196 case SVE::BI__builtin_sve_svmovlt_u64:
4197 return EmitSVEMovl(TypeFlags, Ops, BuiltinID: Intrinsic::aarch64_sve_ushllt);
4198
4199 case SVE::BI__builtin_sve_svmovlt_s16:
4200 case SVE::BI__builtin_sve_svmovlt_s32:
4201 case SVE::BI__builtin_sve_svmovlt_s64:
4202 return EmitSVEMovl(TypeFlags, Ops, BuiltinID: Intrinsic::aarch64_sve_sshllt);
4203
4204 case SVE::BI__builtin_sve_svpmullt_u16:
4205 case SVE::BI__builtin_sve_svpmullt_u64:
4206 case SVE::BI__builtin_sve_svpmullt_n_u16:
4207 case SVE::BI__builtin_sve_svpmullt_n_u64:
4208 return EmitSVEPMull(TypeFlags, Ops, BuiltinID: Intrinsic::aarch64_sve_pmullt_pair);
4209
4210 case SVE::BI__builtin_sve_svpmullb_u16:
4211 case SVE::BI__builtin_sve_svpmullb_u64:
4212 case SVE::BI__builtin_sve_svpmullb_n_u16:
4213 case SVE::BI__builtin_sve_svpmullb_n_u64:
4214 return EmitSVEPMull(TypeFlags, Ops, BuiltinID: Intrinsic::aarch64_sve_pmullb_pair);
4215
4216 case SVE::BI__builtin_sve_svdup_n_b8:
4217 case SVE::BI__builtin_sve_svdup_n_b16:
4218 case SVE::BI__builtin_sve_svdup_n_b32:
4219 case SVE::BI__builtin_sve_svdup_n_b64: {
4220 Value *CmpNE =
4221 Builder.CreateICmpNE(LHS: Ops[0], RHS: Constant::getNullValue(Ty: Ops[0]->getType()));
4222 llvm::ScalableVectorType *OverloadedTy = getSVEType(TypeFlags);
4223 Value *Dup = EmitSVEDupX(Scalar: CmpNE, Ty: OverloadedTy);
4224 return EmitSVEPredicateCast(Pred: Dup, VTy: cast<llvm::ScalableVectorType>(Val: Ty));
4225 }
4226
4227 case SVE::BI__builtin_sve_svdupq_n_b8:
4228 case SVE::BI__builtin_sve_svdupq_n_b16:
4229 case SVE::BI__builtin_sve_svdupq_n_b32:
4230 case SVE::BI__builtin_sve_svdupq_n_b64:
4231 case SVE::BI__builtin_sve_svdupq_n_u8:
4232 case SVE::BI__builtin_sve_svdupq_n_s8:
4233 case SVE::BI__builtin_sve_svdupq_n_u64:
4234 case SVE::BI__builtin_sve_svdupq_n_f64:
4235 case SVE::BI__builtin_sve_svdupq_n_s64:
4236 case SVE::BI__builtin_sve_svdupq_n_u16:
4237 case SVE::BI__builtin_sve_svdupq_n_f16:
4238 case SVE::BI__builtin_sve_svdupq_n_bf16:
4239 case SVE::BI__builtin_sve_svdupq_n_s16:
4240 case SVE::BI__builtin_sve_svdupq_n_u32:
4241 case SVE::BI__builtin_sve_svdupq_n_f32:
4242 case SVE::BI__builtin_sve_svdupq_n_s32: {
4243 // These builtins are implemented by building a fixed-length vector from the
4244 // scalar operands and using dupq_lane to materialize the scalable vector.
4245 unsigned NumOpnds = Ops.size();
4246
4247 bool IsBoolTy =
4248 cast<llvm::VectorType>(Val: Ty)->getElementType()->isIntegerTy(Bitwidth: 1);
4249
4250 // For svdupq_n_b* the element type is an integer of width 128/numelts,
4251 // so that the compare can use the width that is natural for the expected
4252 // number of predicate lanes.
4253 llvm::Type *EltTy = Ops[0]->getType();
4254 if (IsBoolTy)
4255 EltTy = IntegerType::get(C&: getLLVMContext(), NumBits: SVEBitsPerBlock / NumOpnds);
4256
4257 SmallVector<llvm::Value *, 16> VecOps;
4258 for (unsigned I = 0; I < NumOpnds; ++I)
4259 VecOps.push_back(Elt: Builder.CreateZExt(V: Ops[I], DestTy: EltTy));
4260 Value *Vec = BuildVector(Ops: VecOps);
4261
4262 llvm::Type *OverloadedTy = getSVEVectorForElementType(EltTy);
4263 Value *InsertSubVec = Builder.CreateInsertVector(
4264 DstType: OverloadedTy, SrcVec: PoisonValue::get(T: OverloadedTy), SubVec: Vec, Idx: uint64_t(0));
4265
4266 Function *F =
4267 CGM.getIntrinsic(IID: Intrinsic::aarch64_sve_dupq_lane, Tys: OverloadedTy);
4268 Value *DupQLane =
4269 Builder.CreateCall(Callee: F, Args: {InsertSubVec, Builder.getInt64(C: 0)});
4270
4271 if (!IsBoolTy)
4272 return DupQLane;
4273
4274 SVETypeFlags TypeFlags(Builtin->TypeModifier);
4275 Value *Pred = EmitSVEAllTruePred(TypeFlags);
4276
4277 // For svdupq_n_b* we need to add an additional 'cmpne' with '0'.
4278 F = CGM.getIntrinsic(IID: NumOpnds == 2 ? Intrinsic::aarch64_sve_cmpne
4279 : Intrinsic::aarch64_sve_cmpne_wide,
4280 Tys: OverloadedTy);
4281 Value *Call = Builder.CreateCall(
4282 Callee: F, Args: {Pred, DupQLane, EmitSVEDupX(Scalar: Builder.getInt64(C: 0))});
4283 return EmitSVEPredicateCast(Pred: Call, VTy: cast<llvm::ScalableVectorType>(Val: Ty));
4284 }
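// Illustrative sketch (not from the original source): svdupq_n_b32(a, b, c, d)
// zero-extends the four bools to i32, builds a <4 x i32>, inserts it at index
// 0 of a <vscale x 4 x i32>, duplicates the 128-bit lane, and compares
// against zero to recover a predicate, roughly:
//   %q = call <vscale x 4 x i32> @llvm.aarch64.sve.dupq.lane.nxv4i32(
//            <vscale x 4 x i32> %ins, i64 0)
//   %p = call <vscale x 4 x i1> @llvm.aarch64.sve.cmpne.wide.nxv4i32(
//            <vscale x 4 x i1> %pg, <vscale x 4 x i32> %q,
//            <vscale x 2 x i64> zeroinitializer)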
4285
4286 case SVE::BI__builtin_sve_svpfalse_b:
4287 return ConstantInt::getFalse(Ty);
4288
4289 case SVE::BI__builtin_sve_svpfalse_c: {
4290 auto SVBoolTy = ScalableVectorType::get(ElementType: Builder.getInt1Ty(), MinNumElts: 16);
4291 Function *CastToSVCountF =
4292 CGM.getIntrinsic(IID: Intrinsic::aarch64_sve_convert_from_svbool, Tys: Ty);
4293 return Builder.CreateCall(Callee: CastToSVCountF, Args: ConstantInt::getFalse(Ty: SVBoolTy));
4294 }
4295
4296 case SVE::BI__builtin_sve_svlen_bf16:
4297 case SVE::BI__builtin_sve_svlen_f16:
4298 case SVE::BI__builtin_sve_svlen_f32:
4299 case SVE::BI__builtin_sve_svlen_f64:
4300 case SVE::BI__builtin_sve_svlen_s8:
4301 case SVE::BI__builtin_sve_svlen_s16:
4302 case SVE::BI__builtin_sve_svlen_s32:
4303 case SVE::BI__builtin_sve_svlen_s64:
4304 case SVE::BI__builtin_sve_svlen_u8:
4305 case SVE::BI__builtin_sve_svlen_u16:
4306 case SVE::BI__builtin_sve_svlen_u32:
4307 case SVE::BI__builtin_sve_svlen_u64: {
4308 SVETypeFlags TF(Builtin->TypeModifier);
4309 return Builder.CreateElementCount(Ty, EC: getSVEType(TypeFlags: TF)->getElementCount());
4310 }
4311
4312 case SVE::BI__builtin_sve_svtbl2_u8:
4313 case SVE::BI__builtin_sve_svtbl2_s8:
4314 case SVE::BI__builtin_sve_svtbl2_u16:
4315 case SVE::BI__builtin_sve_svtbl2_s16:
4316 case SVE::BI__builtin_sve_svtbl2_u32:
4317 case SVE::BI__builtin_sve_svtbl2_s32:
4318 case SVE::BI__builtin_sve_svtbl2_u64:
4319 case SVE::BI__builtin_sve_svtbl2_s64:
4320 case SVE::BI__builtin_sve_svtbl2_f16:
4321 case SVE::BI__builtin_sve_svtbl2_bf16:
4322 case SVE::BI__builtin_sve_svtbl2_f32:
4323 case SVE::BI__builtin_sve_svtbl2_f64: {
4324 SVETypeFlags TF(Builtin->TypeModifier);
4325 Function *F = CGM.getIntrinsic(IID: Intrinsic::aarch64_sve_tbl2, Tys: getSVEType(TypeFlags: TF));
4326 return Builder.CreateCall(Callee: F, Args: Ops);
4327 }
4328
4329 case SVE::BI__builtin_sve_svset_neonq_s8:
4330 case SVE::BI__builtin_sve_svset_neonq_s16:
4331 case SVE::BI__builtin_sve_svset_neonq_s32:
4332 case SVE::BI__builtin_sve_svset_neonq_s64:
4333 case SVE::BI__builtin_sve_svset_neonq_u8:
4334 case SVE::BI__builtin_sve_svset_neonq_u16:
4335 case SVE::BI__builtin_sve_svset_neonq_u32:
4336 case SVE::BI__builtin_sve_svset_neonq_u64:
4337 case SVE::BI__builtin_sve_svset_neonq_f16:
4338 case SVE::BI__builtin_sve_svset_neonq_f32:
4339 case SVE::BI__builtin_sve_svset_neonq_f64:
4340 case SVE::BI__builtin_sve_svset_neonq_bf16: {
4341 return Builder.CreateInsertVector(DstType: Ty, SrcVec: Ops[0], SubVec: Ops[1], Idx: uint64_t(0));
4342 }
4343
4344 case SVE::BI__builtin_sve_svget_neonq_s8:
4345 case SVE::BI__builtin_sve_svget_neonq_s16:
4346 case SVE::BI__builtin_sve_svget_neonq_s32:
4347 case SVE::BI__builtin_sve_svget_neonq_s64:
4348 case SVE::BI__builtin_sve_svget_neonq_u8:
4349 case SVE::BI__builtin_sve_svget_neonq_u16:
4350 case SVE::BI__builtin_sve_svget_neonq_u32:
4351 case SVE::BI__builtin_sve_svget_neonq_u64:
4352 case SVE::BI__builtin_sve_svget_neonq_f16:
4353 case SVE::BI__builtin_sve_svget_neonq_f32:
4354 case SVE::BI__builtin_sve_svget_neonq_f64:
4355 case SVE::BI__builtin_sve_svget_neonq_bf16: {
4356 return Builder.CreateExtractVector(DstType: Ty, SrcVec: Ops[0], Idx: uint64_t(0));
4357 }
4358
4359 case SVE::BI__builtin_sve_svdup_neonq_s8:
4360 case SVE::BI__builtin_sve_svdup_neonq_s16:
4361 case SVE::BI__builtin_sve_svdup_neonq_s32:
4362 case SVE::BI__builtin_sve_svdup_neonq_s64:
4363 case SVE::BI__builtin_sve_svdup_neonq_u8:
4364 case SVE::BI__builtin_sve_svdup_neonq_u16:
4365 case SVE::BI__builtin_sve_svdup_neonq_u32:
4366 case SVE::BI__builtin_sve_svdup_neonq_u64:
4367 case SVE::BI__builtin_sve_svdup_neonq_f16:
4368 case SVE::BI__builtin_sve_svdup_neonq_f32:
4369 case SVE::BI__builtin_sve_svdup_neonq_f64:
4370 case SVE::BI__builtin_sve_svdup_neonq_bf16: {
4371 Value *Insert = Builder.CreateInsertVector(DstType: Ty, SrcVec: PoisonValue::get(T: Ty), SubVec: Ops[0],
4372 Idx: uint64_t(0));
4373 return Builder.CreateIntrinsic(ID: Intrinsic::aarch64_sve_dupq_lane, Types: {Ty},
4374 Args: {Insert, Builder.getInt64(C: 0)});
4375 }
4376 }
4377
4378 // Should not happen.
4379 return nullptr;
4380}
4381
4382static void swapCommutativeSMEOperands(unsigned BuiltinID,
4383 SmallVectorImpl<Value *> &Ops) {
4384 unsigned MultiVec;
4385 switch (BuiltinID) {
4386 default:
4387 return;
4388 case SME::BI__builtin_sme_svsumla_za32_s8_vg4x1:
4389 MultiVec = 1;
4390 break;
4391 case SME::BI__builtin_sme_svsumla_za32_s8_vg4x2:
4392 case SME::BI__builtin_sme_svsudot_za32_s8_vg1x2:
4393 MultiVec = 2;
4394 break;
4395 case SME::BI__builtin_sme_svsudot_za32_s8_vg1x4:
4396 case SME::BI__builtin_sme_svsumla_za32_s8_vg4x4:
4397 MultiVec = 4;
4398 break;
4399 }
4400
4401 if (MultiVec > 0)
4402 for (unsigned I = 0; I < MultiVec; ++I)
4403 std::swap(a&: Ops[I + 1], b&: Ops[I + 1 + MultiVec]);
4404}
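// Illustrative note (not from the original source): svsudot maps onto the
// usdot instruction with the operand roles exchanged, so for e.g.
// svsudot_za32_s8_vg1x2 (MultiVec == 2) the loop swaps Ops[1..2] with
// Ops[3..4], turning (zn0, zn1, zm0, zm1) into (zm0, zm1, zn0, zn1).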
4405
4406Value *CodeGenFunction::EmitAArch64SMEBuiltinExpr(unsigned BuiltinID,
4407 const CallExpr *E) {
4408 auto *Builtin = findARMVectorIntrinsicInMap(IntrinsicMap: AArch64SMEIntrinsicMap, BuiltinID,
4409 MapProvenSorted&: AArch64SMEIntrinsicsProvenSorted);
4410
4411 llvm::SmallVector<Value *, 4> Ops;
4412 SVETypeFlags TypeFlags(Builtin->TypeModifier);
4413 GetAArch64SVEProcessedOperands(BuiltinID, E, Ops, TypeFlags);
4414
4415 if (TypeFlags.isLoad() || TypeFlags.isStore())
4416 return EmitSMELd1St1(TypeFlags, Ops, IntID: Builtin->LLVMIntrinsic);
4417 if (TypeFlags.isReadZA() || TypeFlags.isWriteZA())
4418 return EmitSMEReadWrite(TypeFlags, Ops, IntID: Builtin->LLVMIntrinsic);
4419 if (BuiltinID == SME::BI__builtin_sme_svzero_mask_za ||
4420 BuiltinID == SME::BI__builtin_sme_svzero_za)
4421 return EmitSMEZero(TypeFlags, Ops, IntID: Builtin->LLVMIntrinsic);
4422 if (BuiltinID == SME::BI__builtin_sme_svldr_vnum_za ||
4423 BuiltinID == SME::BI__builtin_sme_svstr_vnum_za ||
4424 BuiltinID == SME::BI__builtin_sme_svldr_za ||
4425 BuiltinID == SME::BI__builtin_sme_svstr_za)
4426 return EmitSMELdrStr(TypeFlags, Ops, IntID: Builtin->LLVMIntrinsic);
4427
4428 // Emit a write to FPMR for intrinsics that require it.
4429 if (TypeFlags.setsFPMR())
4430 Builder.CreateCall(Callee: CGM.getIntrinsic(IID: Intrinsic::aarch64_set_fpmr),
4431 Args: Ops.pop_back_val());
4432 // Handle builtins that require their multi-vector operands to be swapped.
4433 swapCommutativeSMEOperands(BuiltinID, Ops);
4434
4435 auto isCntsBuiltin = [&]() {
4436 switch (BuiltinID) {
4437 default:
4438 return 0;
4439 case SME::BI__builtin_sme_svcntsb:
4440 return 8;
4441 case SME::BI__builtin_sme_svcntsh:
4442 return 4;
4443 case SME::BI__builtin_sme_svcntsw:
4444 return 2;
4445 }
4446 };
4447
4448 if (auto Mul = isCntsBuiltin()) {
4449 llvm::Value *Cntd =
4450 Builder.CreateCall(Callee: CGM.getIntrinsic(IID: Intrinsic::aarch64_sme_cntsd));
4451 return Builder.CreateMul(LHS: Cntd, RHS: llvm::ConstantInt::get(Ty: Int64Ty, V: Mul),
4452 Name: "mulsvl", /* HasNUW */ true, /* HasNSW */ true);
4453 }
4454
4455 // Should not happen!
4456 if (Builtin->LLVMIntrinsic == 0)
4457 return nullptr;
4458
4459 // Predicates must match the main datatype.
4460 for (Value *&Op : Ops)
4461 if (auto PredTy = dyn_cast<llvm::VectorType>(Val: Op->getType()))
4462 if (PredTy->getElementType()->isIntegerTy(Bitwidth: 1))
4463 Op = EmitSVEPredicateCast(Pred: Op, VTy: getSVEType(TypeFlags));
4464
4465 Function *F =
4466 TypeFlags.isOverloadNone()
4467 ? CGM.getIntrinsic(IID: Builtin->LLVMIntrinsic)
4468 : CGM.getIntrinsic(IID: Builtin->LLVMIntrinsic, Tys: {getSVEType(TypeFlags)});
4469
4470 return Builder.CreateCall(Callee: F, Args: Ops);
4471}
4472
4473/// Helper for the read/write/add/inc X18 builtins: read the X18 register and
4474/// return it as an i8 pointer.
4475static Value *readX18AsPtr(CodeGenFunction &CGF) {
4476 LLVMContext &Context = CGF.CGM.getLLVMContext();
4477 llvm::Metadata *Ops[] = {llvm::MDString::get(Context, Str: "x18")};
4478 llvm::MDNode *RegName = llvm::MDNode::get(Context, MDs: Ops);
4479 llvm::Value *Metadata = llvm::MetadataAsValue::get(Context, MD: RegName);
4480 llvm::Function *F =
4481 CGF.CGM.getIntrinsic(IID: Intrinsic::read_register, Tys: {CGF.Int64Ty});
4482 llvm::Value *X18 = CGF.Builder.CreateCall(Callee: F, Args: Metadata);
4483 return CGF.Builder.CreateIntToPtr(V: X18, DestTy: CGF.Int8PtrTy);
4484}
4485
4486Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
4487 const CallExpr *E,
4488 llvm::Triple::ArchType Arch) {
4489 if (BuiltinID >= clang::AArch64::FirstSVEBuiltin &&
4490 BuiltinID <= clang::AArch64::LastSVEBuiltin)
4491 return EmitAArch64SVEBuiltinExpr(BuiltinID, E);
4492
4493 if (BuiltinID >= clang::AArch64::FirstSMEBuiltin &&
4494 BuiltinID <= clang::AArch64::LastSMEBuiltin)
4495 return EmitAArch64SMEBuiltinExpr(BuiltinID, E);
4496
4497 if (BuiltinID == Builtin::BI__builtin_cpu_supports)
4498 return EmitAArch64CpuSupports(E);
4499
4500 unsigned HintID = static_cast<unsigned>(-1);
4501 switch (BuiltinID) {
4502 default: break;
4503 case clang::AArch64::BI__builtin_arm_nop:
4504 HintID = 0;
4505 break;
4506 case clang::AArch64::BI__builtin_arm_yield:
4507 case clang::AArch64::BI__yield:
4508 HintID = 1;
4509 break;
4510 case clang::AArch64::BI__builtin_arm_wfe:
4511 case clang::AArch64::BI__wfe:
4512 HintID = 2;
4513 break;
4514 case clang::AArch64::BI__builtin_arm_wfi:
4515 case clang::AArch64::BI__wfi:
4516 HintID = 3;
4517 break;
4518 case clang::AArch64::BI__builtin_arm_sev:
4519 case clang::AArch64::BI__sev:
4520 HintID = 4;
4521 break;
4522 case clang::AArch64::BI__builtin_arm_sevl:
4523 case clang::AArch64::BI__sevl:
4524 HintID = 5;
4525 break;
4526 }
4527
4528 if (HintID != static_cast<unsigned>(-1)) {
4529 Function *F = CGM.getIntrinsic(IID: Intrinsic::aarch64_hint);
4530 return Builder.CreateCall(Callee: F, Args: llvm::ConstantInt::get(Ty: Int32Ty, V: HintID));
4531 }
4532
4533 if (BuiltinID == clang::AArch64::BI__builtin_arm_trap) {
4534 Function *F = CGM.getIntrinsic(IID: Intrinsic::aarch64_break);
4535 llvm::Value *Arg = EmitScalarExpr(E: E->getArg(Arg: 0));
4536 return Builder.CreateCall(Callee: F, Args: Builder.CreateZExt(V: Arg, DestTy: CGM.Int32Ty));
4537 }
4538
4539 if (BuiltinID == clang::AArch64::BI__builtin_arm_get_sme_state) {
4540 // Create call to __arm_sme_state and store the results to the two pointers.
4541 CallInst *CI = EmitRuntimeCall(callee: CGM.CreateRuntimeFunction(
4542 Ty: llvm::FunctionType::get(Result: StructType::get(elt1: CGM.Int64Ty, elts: CGM.Int64Ty), Params: {},
4543 isVarArg: false),
4544 Name: "__arm_sme_state"));
4545 auto Attrs = AttributeList().addFnAttribute(C&: getLLVMContext(),
4546 Kind: "aarch64_pstate_sm_compatible");
4547 CI->setAttributes(Attrs);
4548 CI->setCallingConv(
4549 llvm::CallingConv::
4550 AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2);
4551 Builder.CreateStore(Val: Builder.CreateExtractValue(Agg: CI, Idxs: 0),
4552 Addr: EmitPointerWithAlignment(Addr: E->getArg(Arg: 0)));
4553 return Builder.CreateStore(Val: Builder.CreateExtractValue(Agg: CI, Idxs: 1),
4554 Addr: EmitPointerWithAlignment(Addr: E->getArg(Arg: 1)));
4555 }
4556
4557 if (BuiltinID == clang::AArch64::BI__builtin_arm_rbit) {
4558 assert((getContext().getTypeSize(E->getType()) == 32) &&
4559 "rbit of unusual size!");
4560 llvm::Value *Arg = EmitScalarExpr(E: E->getArg(Arg: 0));
4561 return Builder.CreateCall(
4562 Callee: CGM.getIntrinsic(IID: Intrinsic::bitreverse, Tys: Arg->getType()), Args: Arg, Name: "rbit");
4563 }
4564 if (BuiltinID == clang::AArch64::BI__builtin_arm_rbit64) {
4565 assert((getContext().getTypeSize(E->getType()) == 64) &&
4566 "rbit of unusual size!");
4567 llvm::Value *Arg = EmitScalarExpr(E: E->getArg(Arg: 0));
4568 return Builder.CreateCall(
4569 Callee: CGM.getIntrinsic(IID: Intrinsic::bitreverse, Tys: Arg->getType()), Args: Arg, Name: "rbit");
4570 }
4571
4572 if (BuiltinID == clang::AArch64::BI__builtin_arm_clz ||
4573 BuiltinID == clang::AArch64::BI__builtin_arm_clz64) {
4574 llvm::Value *Arg = EmitScalarExpr(E: E->getArg(Arg: 0));
4575 Function *F = CGM.getIntrinsic(IID: Intrinsic::ctlz, Tys: Arg->getType());
4576 Value *Res = Builder.CreateCall(Callee: F, Args: {Arg, Builder.getInt1(V: false)});
4577 if (BuiltinID == clang::AArch64::BI__builtin_arm_clz64)
4578 Res = Builder.CreateTrunc(V: Res, DestTy: Builder.getInt32Ty());
4579 return Res;
4580 }
4581
4582 if (BuiltinID == clang::AArch64::BI__builtin_arm_cls) {
4583 llvm::Value *Arg = EmitScalarExpr(E: E->getArg(Arg: 0));
4584 return Builder.CreateCall(Callee: CGM.getIntrinsic(IID: Intrinsic::aarch64_cls), Args: Arg,
4585 Name: "cls");
4586 }
4587 if (BuiltinID == clang::AArch64::BI__builtin_arm_cls64) {
4588 llvm::Value *Arg = EmitScalarExpr(E: E->getArg(Arg: 0));
4589 return Builder.CreateCall(Callee: CGM.getIntrinsic(IID: Intrinsic::aarch64_cls64), Args: Arg,
4590 Name: "cls");
4591 }
4592
4593 if (BuiltinID == clang::AArch64::BI__builtin_arm_rint32zf ||
4594 BuiltinID == clang::AArch64::BI__builtin_arm_rint32z) {
4595 llvm::Value *Arg = EmitScalarExpr(E: E->getArg(Arg: 0));
4596 llvm::Type *Ty = Arg->getType();
4597 return Builder.CreateCall(Callee: CGM.getIntrinsic(IID: Intrinsic::aarch64_frint32z, Tys: Ty),
4598 Args: Arg, Name: "frint32z");
4599 }
4600
4601 if (BuiltinID == clang::AArch64::BI__builtin_arm_rint64zf ||
4602 BuiltinID == clang::AArch64::BI__builtin_arm_rint64z) {
4603 llvm::Value *Arg = EmitScalarExpr(E: E->getArg(Arg: 0));
4604 llvm::Type *Ty = Arg->getType();
4605 return Builder.CreateCall(Callee: CGM.getIntrinsic(IID: Intrinsic::aarch64_frint64z, Tys: Ty),
4606 Args: Arg, Name: "frint64z");
4607 }
4608
4609 if (BuiltinID == clang::AArch64::BI__builtin_arm_rint32xf ||
4610 BuiltinID == clang::AArch64::BI__builtin_arm_rint32x) {
4611 llvm::Value *Arg = EmitScalarExpr(E: E->getArg(Arg: 0));
4612 llvm::Type *Ty = Arg->getType();
4613 return Builder.CreateCall(Callee: CGM.getIntrinsic(IID: Intrinsic::aarch64_frint32x, Tys: Ty),
4614 Args: Arg, Name: "frint32x");
4615 }
4616
4617 if (BuiltinID == clang::AArch64::BI__builtin_arm_rint64xf ||
4618 BuiltinID == clang::AArch64::BI__builtin_arm_rint64x) {
4619 llvm::Value *Arg = EmitScalarExpr(E: E->getArg(Arg: 0));
4620 llvm::Type *Ty = Arg->getType();
4621 return Builder.CreateCall(Callee: CGM.getIntrinsic(IID: Intrinsic::aarch64_frint64x, Tys: Ty),
4622 Args: Arg, Name: "frint64x");
4623 }
4624
4625 if (BuiltinID == clang::AArch64::BI__builtin_arm_jcvt) {
4626 assert((getContext().getTypeSize(E->getType()) == 32) &&
4627 "__jcvt of unusual size!");
4628 llvm::Value *Arg = EmitScalarExpr(E: E->getArg(Arg: 0));
4629 return Builder.CreateCall(
4630 Callee: CGM.getIntrinsic(IID: Intrinsic::aarch64_fjcvtzs), Args: Arg);
4631 }
4632
4633 if (BuiltinID == clang::AArch64::BI__builtin_arm_ld64b ||
4634 BuiltinID == clang::AArch64::BI__builtin_arm_st64b ||
4635 BuiltinID == clang::AArch64::BI__builtin_arm_st64bv ||
4636 BuiltinID == clang::AArch64::BI__builtin_arm_st64bv0) {
4637 llvm::Value *MemAddr = EmitScalarExpr(E: E->getArg(Arg: 0));
4638 llvm::Value *ValPtr = EmitScalarExpr(E: E->getArg(Arg: 1));
4639
4640 if (BuiltinID == clang::AArch64::BI__builtin_arm_ld64b) {
4641 // Load from the address via an LLVM intrinsic, receiving a
4642 // tuple of 8 i64 words, and store each one to ValPtr.
4643 Function *F = CGM.getIntrinsic(IID: Intrinsic::aarch64_ld64b);
4644 llvm::Value *Val = Builder.CreateCall(Callee: F, Args: MemAddr);
4645 llvm::Value *ToRet;
4646 for (size_t i = 0; i < 8; i++) {
4647 llvm::Value *ValOffsetPtr =
4648 Builder.CreateGEP(Ty: Int64Ty, Ptr: ValPtr, IdxList: Builder.getInt32(C: i));
4649 Address Addr =
4650 Address(ValOffsetPtr, Int64Ty, CharUnits::fromQuantity(Quantity: 8));
4651 ToRet = Builder.CreateStore(Val: Builder.CreateExtractValue(Agg: Val, Idxs: i), Addr);
4652 }
4653 return ToRet;
4654 }
4655
4656 // Load 8 i64 words from ValPtr, and store them to the address
4657 // via an LLVM intrinsic.
4658 SmallVector<llvm::Value *, 9> Args;
4659 Args.push_back(Elt: MemAddr);
4660 for (size_t i = 0; i < 8; i++) {
4661 llvm::Value *ValOffsetPtr =
4662 Builder.CreateGEP(Ty: Int64Ty, Ptr: ValPtr, IdxList: Builder.getInt32(C: i));
4663 Address Addr = Address(ValOffsetPtr, Int64Ty, CharUnits::fromQuantity(Quantity: 8));
4664 Args.push_back(Elt: Builder.CreateLoad(Addr));
4665 }
4666
4667 auto Intr = (BuiltinID == clang::AArch64::BI__builtin_arm_st64b
4668 ? Intrinsic::aarch64_st64b
4669 : BuiltinID == clang::AArch64::BI__builtin_arm_st64bv
4670 ? Intrinsic::aarch64_st64bv
4671 : Intrinsic::aarch64_st64bv0);
4672 Function *F = CGM.getIntrinsic(IID: Intr);
4673 return Builder.CreateCall(Callee: F, Args);
4674 }
4675
4676 if (BuiltinID == clang::AArch64::BI__builtin_arm_atomic_store_with_stshh) {
4677 Value *StoreAddr = EmitScalarExpr(E: E->getArg(Arg: 0));
4678 Value *StoreValue = EmitScalarExpr(E: E->getArg(Arg: 1));
4679
4680 auto *OrderC = cast<ConstantInt>(Val: EmitScalarExpr(E: E->getArg(Arg: 2)));
4681 auto *PolicyC = cast<ConstantInt>(Val: EmitScalarExpr(E: E->getArg(Arg: 3)));
4682
4683 // Compute the pointee bit-width from arg0 and materialize it as an i32
constant.
4684 QualType ValQT =
4685 E->getArg(Arg: 0)->getType()->castAs<PointerType>()->getPointeeType();
4686 unsigned SizeBits = getContext().getTypeSize(T: ValQT);
4687 auto *SizeC = llvm::ConstantInt::get(Ty: Int32Ty, V: SizeBits);
4688
4689 Value *StoreValue64 = Builder.CreateIntCast(V: StoreValue, DestTy: Int64Ty,
4690 isSigned: ValQT->isSignedIntegerType());
4691
4692 Function *F = CGM.getIntrinsic(IID: Intrinsic::aarch64_stshh_atomic_store,
4693 Tys: {StoreAddr->getType()});
4694
4695 // Emit a single intrinsic so the backend can expand it to STSHH followed by
4696 // an atomic store, guaranteeing the STSHH immediately precedes the STR.
4697 return Builder.CreateCall(
4698 Callee: F, Args: {StoreAddr, StoreValue64,
4699 ConstantInt::get(Ty: Int32Ty, V: OrderC->getZExtValue()),
4700 ConstantInt::get(Ty: Int32Ty, V: PolicyC->getZExtValue()), SizeC});
4701 }
4702
4703 if (BuiltinID == clang::AArch64::BI__builtin_arm_rndr ||
4704 BuiltinID == clang::AArch64::BI__builtin_arm_rndrrs) {
4705
4706 auto Intr = (BuiltinID == clang::AArch64::BI__builtin_arm_rndr
4707 ? Intrinsic::aarch64_rndr
4708 : Intrinsic::aarch64_rndrrs);
4709 Function *F = CGM.getIntrinsic(IID: Intr);
4710 llvm::Value *Val = Builder.CreateCall(Callee: F);
4711 Value *RandomValue = Builder.CreateExtractValue(Agg: Val, Idxs: 0);
4712 Value *Status = Builder.CreateExtractValue(Agg: Val, Idxs: 1);
4713
4714 Address MemAddress = EmitPointerWithAlignment(Addr: E->getArg(Arg: 0));
4715 Builder.CreateStore(Val: RandomValue, Addr: MemAddress);
4716 Status = Builder.CreateZExt(V: Status, DestTy: Int32Ty);
4717 return Status;
4718 }
4719
4720 if (BuiltinID == clang::AArch64::BI__clear_cache) {
4721 assert(E->getNumArgs() == 2 && "__clear_cache takes 2 arguments");
4722 const FunctionDecl *FD = E->getDirectCallee();
4723 Value *Ops[2];
4724 for (unsigned i = 0; i < 2; i++)
4725 Ops[i] = EmitScalarExpr(E: E->getArg(Arg: i));
4726 llvm::Type *Ty = CGM.getTypes().ConvertType(T: FD->getType());
4727 llvm::FunctionType *FTy = cast<llvm::FunctionType>(Val: Ty);
4728 StringRef Name = FD->getName();
4729 return EmitNounwindRuntimeCall(callee: CGM.CreateRuntimeFunction(Ty: FTy, Name), args: Ops);
4730 }
4731
4732 if ((BuiltinID == clang::AArch64::BI__builtin_arm_ldrex ||
4733 BuiltinID == clang::AArch64::BI__builtin_arm_ldaex) &&
4734 getContext().getTypeSize(T: E->getType()) == 128) {
4735 Function *F =
4736 CGM.getIntrinsic(IID: BuiltinID == clang::AArch64::BI__builtin_arm_ldaex
4737 ? Intrinsic::aarch64_ldaxp
4738 : Intrinsic::aarch64_ldxp);
4739
4740 Value *LdPtr = EmitScalarExpr(E: E->getArg(Arg: 0));
4741 Value *Val = Builder.CreateCall(Callee: F, Args: LdPtr, Name: "ldxp");
4742
4743 Value *Val0 = Builder.CreateExtractValue(Agg: Val, Idxs: 1);
4744 Value *Val1 = Builder.CreateExtractValue(Agg: Val, Idxs: 0);
4745 llvm::Type *Int128Ty = llvm::IntegerType::get(C&: getLLVMContext(), NumBits: 128);
4746 Val0 = Builder.CreateZExt(V: Val0, DestTy: Int128Ty);
4747 Val1 = Builder.CreateZExt(V: Val1, DestTy: Int128Ty);
4748
4749 Value *ShiftCst = llvm::ConstantInt::get(Ty: Int128Ty, V: 64);
4750 Val = Builder.CreateShl(LHS: Val0, RHS: ShiftCst, Name: "shl", HasNUW: true /* nuw */);
4751 Val = Builder.CreateOr(LHS: Val, RHS: Val1);
4752 return Builder.CreateBitCast(V: Val, DestTy: ConvertType(T: E->getType()));
4753 } else if (BuiltinID == clang::AArch64::BI__builtin_arm_ldrex ||
4754 BuiltinID == clang::AArch64::BI__builtin_arm_ldaex) {
4755 Value *LoadAddr = EmitScalarExpr(E: E->getArg(Arg: 0));
4756
4757 QualType Ty = E->getType();
4758 llvm::Type *RealResTy = ConvertType(T: Ty);
4759 llvm::Type *IntTy =
4760 llvm::IntegerType::get(C&: getLLVMContext(), NumBits: getContext().getTypeSize(T: Ty));
4761
4762 Function *F =
4763 CGM.getIntrinsic(IID: BuiltinID == clang::AArch64::BI__builtin_arm_ldaex
4764 ? Intrinsic::aarch64_ldaxr
4765 : Intrinsic::aarch64_ldxr,
4766 Tys: DefaultPtrTy);
4767 CallInst *Val = Builder.CreateCall(Callee: F, Args: LoadAddr, Name: "ldxr");
4768 Val->addParamAttr(
4769 ArgNo: 0, Attr: Attribute::get(Context&: getLLVMContext(), Kind: Attribute::ElementType, Ty: IntTy));
4770
4771 if (RealResTy->isPointerTy())
4772 return Builder.CreateIntToPtr(V: Val, DestTy: RealResTy);
4773
4774 llvm::Type *IntResTy = llvm::IntegerType::get(
4775 C&: getLLVMContext(), NumBits: CGM.getDataLayout().getTypeSizeInBits(Ty: RealResTy));
4776 return Builder.CreateBitCast(V: Builder.CreateTruncOrBitCast(V: Val, DestTy: IntResTy),
4777 DestTy: RealResTy);
4778 }
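// Illustrative sketch (not from the original source): the 128-bit path above
// reassembles the exclusive-load result pair into an i128, roughly:
//   %pair = call { i64, i64 } @llvm.aarch64.ldxp(ptr %p)
//   %hi   = zext i64 (extractvalue %pair, 1) to i128
//   %lo   = zext i64 (extractvalue %pair, 0) to i128
//   %val  = or i128 (shl nuw i128 %hi, 64), %lo
// with element 1 of the pair treated as the high half.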
4779
4780 if ((BuiltinID == clang::AArch64::BI__builtin_arm_strex ||
4781 BuiltinID == clang::AArch64::BI__builtin_arm_stlex) &&
4782 getContext().getTypeSize(T: E->getArg(Arg: 0)->getType()) == 128) {
4783 Function *F =
4784 CGM.getIntrinsic(IID: BuiltinID == clang::AArch64::BI__builtin_arm_stlex
4785 ? Intrinsic::aarch64_stlxp
4786 : Intrinsic::aarch64_stxp);
4787 llvm::Type *STy = llvm::StructType::get(elt1: Int64Ty, elts: Int64Ty);
4788
4789 Address Tmp = CreateMemTemp(T: E->getArg(Arg: 0)->getType());
4790 EmitAnyExprToMem(E: E->getArg(Arg: 0), Location: Tmp, Quals: Qualifiers(), /*init*/ IsInitializer: true);
4791
4792 Tmp = Tmp.withElementType(ElemTy: STy);
4793 llvm::Value *Val = Builder.CreateLoad(Addr: Tmp);
4794
4795 Value *Arg0 = Builder.CreateExtractValue(Agg: Val, Idxs: 0);
4796 Value *Arg1 = Builder.CreateExtractValue(Agg: Val, Idxs: 1);
4797 Value *StPtr = EmitScalarExpr(E: E->getArg(Arg: 1));
4798 return Builder.CreateCall(Callee: F, Args: {Arg0, Arg1, StPtr}, Name: "stxp");
4799 }
4800
4801 if (BuiltinID == clang::AArch64::BI__builtin_arm_strex ||
4802 BuiltinID == clang::AArch64::BI__builtin_arm_stlex) {
4803 Value *StoreVal = EmitScalarExpr(E: E->getArg(Arg: 0));
4804 Value *StoreAddr = EmitScalarExpr(E: E->getArg(Arg: 1));
4805
4806 QualType Ty = E->getArg(Arg: 0)->getType();
4807 llvm::Type *StoreTy =
4808 llvm::IntegerType::get(C&: getLLVMContext(), NumBits: getContext().getTypeSize(T: Ty));
4809
4810 if (StoreVal->getType()->isPointerTy())
4811 StoreVal = Builder.CreatePtrToInt(V: StoreVal, DestTy: Int64Ty);
4812 else {
4813 llvm::Type *IntTy = llvm::IntegerType::get(
4814 C&: getLLVMContext(),
4815 NumBits: CGM.getDataLayout().getTypeSizeInBits(Ty: StoreVal->getType()));
4816 StoreVal = Builder.CreateBitCast(V: StoreVal, DestTy: IntTy);
4817 StoreVal = Builder.CreateZExtOrBitCast(V: StoreVal, DestTy: Int64Ty);
4818 }
4819
4820 Function *F =
4821 CGM.getIntrinsic(IID: BuiltinID == clang::AArch64::BI__builtin_arm_stlex
4822 ? Intrinsic::aarch64_stlxr
4823 : Intrinsic::aarch64_stxr,
4824 Tys: StoreAddr->getType());
4825 CallInst *CI = Builder.CreateCall(Callee: F, Args: {StoreVal, StoreAddr}, Name: "stxr");
4826 CI->addParamAttr(
4827 ArgNo: 1, Attr: Attribute::get(Context&: getLLVMContext(), Kind: Attribute::ElementType, Ty: StoreTy));
4828 return CI;
4829 }
4830
4831 if (BuiltinID == clang::AArch64::BI__getReg) {
4832 Expr::EvalResult Result;
4833 if (!E->getArg(Arg: 0)->EvaluateAsInt(Result, Ctx: CGM.getContext()))
4834 llvm_unreachable("Sema will ensure that the parameter is constant");
4835
4836 llvm::APSInt Value = Result.Val.getInt();
4837 LLVMContext &Context = CGM.getLLVMContext();
4838 std::string Reg = Value == 31 ? "sp" : "x" + toString(I: Value, Radix: 10);
4839
4840 llvm::Metadata *Ops[] = {llvm::MDString::get(Context, Str: Reg)};
4841 llvm::MDNode *RegName = llvm::MDNode::get(Context, MDs: Ops);
4842 llvm::Value *Metadata = llvm::MetadataAsValue::get(Context, MD: RegName);
4843
4844 llvm::Function *F =
4845 CGM.getIntrinsic(IID: Intrinsic::read_register, Tys: {Int64Ty});
4846 return Builder.CreateCall(Callee: F, Args: Metadata);
4847 }
4848
4849 if (BuiltinID == clang::AArch64::BI__break) {
4850 Expr::EvalResult Result;
4851 if (!E->getArg(Arg: 0)->EvaluateAsInt(Result, Ctx: CGM.getContext()))
4852 llvm_unreachable("Sema will ensure that the parameter is constant");
4853
4854 llvm::Function *F = CGM.getIntrinsic(IID: Intrinsic::aarch64_break);
4855 return Builder.CreateCall(Callee: F, Args: {EmitScalarExpr(E: E->getArg(Arg: 0))});
4856 }
4857
4858 if (BuiltinID == clang::AArch64::BI__builtin_arm_clrex) {
4859 Function *F = CGM.getIntrinsic(IID: Intrinsic::aarch64_clrex);
4860 return Builder.CreateCall(Callee: F);
4861 }
4862
4863 if (BuiltinID == clang::AArch64::BI_ReadWriteBarrier)
4864 return Builder.CreateFence(Ordering: llvm::AtomicOrdering::SequentiallyConsistent,
4865 SSID: llvm::SyncScope::SingleThread);
4866
4867 // CRC32
4868 Intrinsic::ID CRCIntrinsicID = Intrinsic::not_intrinsic;
4869 switch (BuiltinID) {
4870 case clang::AArch64::BI__builtin_arm_crc32b:
4871 CRCIntrinsicID = Intrinsic::aarch64_crc32b; break;
4872 case clang::AArch64::BI__builtin_arm_crc32cb:
4873 CRCIntrinsicID = Intrinsic::aarch64_crc32cb; break;
4874 case clang::AArch64::BI__builtin_arm_crc32h:
4875 CRCIntrinsicID = Intrinsic::aarch64_crc32h; break;
4876 case clang::AArch64::BI__builtin_arm_crc32ch:
4877 CRCIntrinsicID = Intrinsic::aarch64_crc32ch; break;
4878 case clang::AArch64::BI__builtin_arm_crc32w:
4879 CRCIntrinsicID = Intrinsic::aarch64_crc32w; break;
4880 case clang::AArch64::BI__builtin_arm_crc32cw:
4881 CRCIntrinsicID = Intrinsic::aarch64_crc32cw; break;
4882 case clang::AArch64::BI__builtin_arm_crc32d:
4883 CRCIntrinsicID = Intrinsic::aarch64_crc32x; break;
4884 case clang::AArch64::BI__builtin_arm_crc32cd:
4885 CRCIntrinsicID = Intrinsic::aarch64_crc32cx; break;
4886 }
4887
4888 if (CRCIntrinsicID != Intrinsic::not_intrinsic) {
4889 Value *Arg0 = EmitScalarExpr(E: E->getArg(Arg: 0));
4890 Value *Arg1 = EmitScalarExpr(E: E->getArg(Arg: 1));
4891 Function *F = CGM.getIntrinsic(IID: CRCIntrinsicID);
4892
4893 llvm::Type *DataTy = F->getFunctionType()->getParamType(i: 1);
4894 Arg1 = Builder.CreateZExtOrBitCast(V: Arg1, DestTy: DataTy);
4895
4896 return Builder.CreateCall(Callee: F, Args: {Arg0, Arg1});
4897 }
4898
4899 // Memory Operations (MOPS)
4900 if (BuiltinID == AArch64::BI__builtin_arm_mops_memset_tag) {
4901 Value *Dst = EmitScalarExpr(E: E->getArg(Arg: 0));
4902 Value *Val = EmitScalarExpr(E: E->getArg(Arg: 1));
4903 Value *Size = EmitScalarExpr(E: E->getArg(Arg: 2));
4904 Val = Builder.CreateTrunc(V: Val, DestTy: Int8Ty);
4905 Size = Builder.CreateIntCast(V: Size, DestTy: Int64Ty, isSigned: false);
4906 return Builder.CreateCall(
4907 Callee: CGM.getIntrinsic(IID: Intrinsic::aarch64_mops_memset_tag), Args: {Dst, Val, Size});
4908 }
4909
4910 if (BuiltinID == AArch64::BI__builtin_arm_range_prefetch ||
4911 BuiltinID == AArch64::BI__builtin_arm_range_prefetch_x)
4912 return EmitRangePrefetchBuiltin(CGF&: *this, BuiltinID, E);
4913
4914 // Memory Tagging Extensions (MTE) Intrinsics
4915 Intrinsic::ID MTEIntrinsicID = Intrinsic::not_intrinsic;
4916 switch (BuiltinID) {
4917 case clang::AArch64::BI__builtin_arm_irg:
4918 MTEIntrinsicID = Intrinsic::aarch64_irg; break;
4919 case clang::AArch64::BI__builtin_arm_addg:
4920 MTEIntrinsicID = Intrinsic::aarch64_addg; break;
4921 case clang::AArch64::BI__builtin_arm_gmi:
4922 MTEIntrinsicID = Intrinsic::aarch64_gmi; break;
4923 case clang::AArch64::BI__builtin_arm_ldg:
4924 MTEIntrinsicID = Intrinsic::aarch64_ldg; break;
4925 case clang::AArch64::BI__builtin_arm_stg:
4926 MTEIntrinsicID = Intrinsic::aarch64_stg; break;
4927 case clang::AArch64::BI__builtin_arm_subp:
4928 MTEIntrinsicID = Intrinsic::aarch64_subp; break;
4929 }
4930
4931 if (MTEIntrinsicID != Intrinsic::not_intrinsic) {
4932 if (MTEIntrinsicID == Intrinsic::aarch64_irg) {
4933 Value *Pointer = EmitScalarExpr(E: E->getArg(Arg: 0));
4934 Value *Mask = EmitScalarExpr(E: E->getArg(Arg: 1));
4935
4936 Mask = Builder.CreateZExt(V: Mask, DestTy: Int64Ty);
4937 return Builder.CreateCall(Callee: CGM.getIntrinsic(IID: MTEIntrinsicID),
4938 Args: {Pointer, Mask});
4939 }
4940 if (MTEIntrinsicID == Intrinsic::aarch64_addg) {
4941 Value *Pointer = EmitScalarExpr(E: E->getArg(Arg: 0));
4942 Value *TagOffset = EmitScalarExpr(E: E->getArg(Arg: 1));
4943
4944 TagOffset = Builder.CreateZExt(V: TagOffset, DestTy: Int64Ty);
4945 return Builder.CreateCall(Callee: CGM.getIntrinsic(IID: MTEIntrinsicID),
4946 Args: {Pointer, TagOffset});
4947 }
4948 if (MTEIntrinsicID == Intrinsic::aarch64_gmi) {
4949 Value *Pointer = EmitScalarExpr(E: E->getArg(Arg: 0));
4950 Value *ExcludedMask = EmitScalarExpr(E: E->getArg(Arg: 1));
4951
4952 ExcludedMask = Builder.CreateZExt(V: ExcludedMask, DestTy: Int64Ty);
4953 return Builder.CreateCall(
4954 Callee: CGM.getIntrinsic(IID: MTEIntrinsicID), Args: {Pointer, ExcludedMask});
4955 }
4956 // Although it is possible to supply a different return
4957 // address (first arg) to this intrinsic, for now we set the
4958 // return address to be the same as the input address.
4959 if (MTEIntrinsicID == Intrinsic::aarch64_ldg) {
4960 Value *TagAddress = EmitScalarExpr(E: E->getArg(Arg: 0));
4961 return Builder.CreateCall(Callee: CGM.getIntrinsic(IID: MTEIntrinsicID),
4962 Args: {TagAddress, TagAddress});
4963 }
4964 // Although it is possible to supply a different tag (to set)
4965 // to this intrinsic (as first arg), for now we supply the tag
4966 // that is in the input address arg (the common use case).
4967 if (MTEIntrinsicID == Intrinsic::aarch64_stg) {
4968 Value *TagAddress = EmitScalarExpr(E: E->getArg(Arg: 0));
4969 return Builder.CreateCall(Callee: CGM.getIntrinsic(IID: MTEIntrinsicID),
4970 Args: {TagAddress, TagAddress});
4971 }
4972 if (MTEIntrinsicID == Intrinsic::aarch64_subp) {
4973 Value *PointerA = EmitScalarExpr(E: E->getArg(Arg: 0));
4974 Value *PointerB = EmitScalarExpr(E: E->getArg(Arg: 1));
4975 return Builder.CreateCall(
4976 Callee: CGM.getIntrinsic(IID: MTEIntrinsicID), Args: {PointerA, PointerB});
4977 }
4978 }
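// Illustrative note (not from the original source): __arm_mte_get_tag(p)
// therefore lowers to
//   call ptr @llvm.aarch64.ldg(ptr %p, ptr %p)
// with the pointer doubling as both the return-address and the tag-address
// operand, as described above.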
4979
4980 if (BuiltinID == clang::AArch64::BI__builtin_arm_rsr ||
4981 BuiltinID == clang::AArch64::BI__builtin_arm_rsr64 ||
4982 BuiltinID == clang::AArch64::BI__builtin_arm_rsr128 ||
4983 BuiltinID == clang::AArch64::BI__builtin_arm_rsrp ||
4984 BuiltinID == clang::AArch64::BI__builtin_arm_wsr ||
4985 BuiltinID == clang::AArch64::BI__builtin_arm_wsr64 ||
4986 BuiltinID == clang::AArch64::BI__builtin_arm_wsr128 ||
4987 BuiltinID == clang::AArch64::BI__builtin_arm_wsrp) {
4988
4989 SpecialRegisterAccessKind AccessKind = Write;
4990 if (BuiltinID == clang::AArch64::BI__builtin_arm_rsr ||
4991 BuiltinID == clang::AArch64::BI__builtin_arm_rsr64 ||
4992 BuiltinID == clang::AArch64::BI__builtin_arm_rsr128 ||
4993 BuiltinID == clang::AArch64::BI__builtin_arm_rsrp)
4994 AccessKind = VolatileRead;
4995
4996 bool IsPointerBuiltin = BuiltinID == clang::AArch64::BI__builtin_arm_rsrp ||
4997 BuiltinID == clang::AArch64::BI__builtin_arm_wsrp;
4998
4999 bool Is32Bit = BuiltinID == clang::AArch64::BI__builtin_arm_rsr ||
5000 BuiltinID == clang::AArch64::BI__builtin_arm_wsr;
5001
5002 bool Is128Bit = BuiltinID == clang::AArch64::BI__builtin_arm_rsr128 ||
5003 BuiltinID == clang::AArch64::BI__builtin_arm_wsr128;
5004
5005 llvm::Type *ValueType;
5006 llvm::Type *RegisterType = Int64Ty;
5007 if (Is32Bit) {
5008 ValueType = Int32Ty;
5009 } else if (Is128Bit) {
5010 llvm::Type *Int128Ty =
5011 llvm::IntegerType::getInt128Ty(C&: CGM.getLLVMContext());
5012 ValueType = Int128Ty;
5013 RegisterType = Int128Ty;
5014 } else if (IsPointerBuiltin) {
5015 ValueType = VoidPtrTy;
5016 } else {
5017 ValueType = Int64Ty;
5018 }
5019
5020 return EmitSpecialRegisterBuiltin(CGF&: *this, E, RegisterType, ValueType,
5021 AccessKind);
5022 }
5023
5024 if (BuiltinID == clang::AArch64::BI_ReadStatusReg ||
5025 BuiltinID == clang::AArch64::BI_WriteStatusReg) {
5026 LLVMContext &Context = CGM.getLLVMContext();
5027
5028 unsigned SysReg =
5029 E->getArg(Arg: 0)->EvaluateKnownConstInt(Ctx: getContext()).getZExtValue();
5030
5031 std::string SysRegStr;
5032 llvm::raw_string_ostream(SysRegStr)
5033 << (0b10 | SysReg >> 14) << ":" << ((SysReg >> 11) & 7) << ":"
5034 << ((SysReg >> 7) & 15) << ":" << ((SysReg >> 3) & 15) << ":"
5035 << (SysReg & 7);
5036
5037 llvm::Metadata *Ops[] = { llvm::MDString::get(Context, Str: SysRegStr) };
5038 llvm::MDNode *RegName = llvm::MDNode::get(Context, MDs: Ops);
5039 llvm::Value *Metadata = llvm::MetadataAsValue::get(Context, MD: RegName);
5040
5041 llvm::Type *RegisterType = Int64Ty;
5042 llvm::Type *Types[] = { RegisterType };
5043
5044 if (BuiltinID == clang::AArch64::BI_ReadStatusReg) {
5045 llvm::Function *F = CGM.getIntrinsic(IID: Intrinsic::read_register, Tys: Types);
5046
5047 return Builder.CreateCall(Callee: F, Args: Metadata);
5048 }
5049
5050 llvm::Function *F = CGM.getIntrinsic(IID: Intrinsic::write_register, Tys: Types);
5051 llvm::Value *ArgValue = EmitScalarExpr(E: E->getArg(Arg: 1));
5052 llvm::Value *Result = Builder.CreateCall(Callee: F, Args: {Metadata, ArgValue});
5053
5054 return Result;
5055 }
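// Worked example (an assumption chosen for illustration, not from the
// original source): with the MSVC encoding ARM64_SYSREG(3, 3, 13, 0, 2),
// i.e. TPIDR_EL0, the expression above produces the metadata string
// "3:3:13:0:2": bit 14 holds op0's low bit (0b10 | 1 == 3), bits 13:11 hold
// op1 == 3, bits 10:7 CRn == 13, bits 6:3 CRm == 0, and bits 2:0 op2 == 2.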
5056
5057 if (BuiltinID == clang::AArch64::BI__sys) {
5058 unsigned SysReg =
5059 E->getArg(Arg: 0)->EvaluateKnownConstInt(Ctx: getContext()).getZExtValue();
5060 const unsigned Op1 = SysReg >> 11;
5061 const unsigned CRn = (SysReg >> 7) & 0xf;
5062 const unsigned CRm = (SysReg >> 3) & 0xf;
5063 const unsigned Op2 = SysReg & 0x7;
5064
5065 Builder.CreateCall(Callee: CGM.getIntrinsic(IID: Intrinsic::aarch64_sys),
5066 Args: {Builder.getInt32(C: Op1), Builder.getInt32(C: CRn),
5067 Builder.getInt32(C: CRm), Builder.getInt32(C: Op2),
5068 EmitScalarExpr(E: E->getArg(Arg: 1))});
5069
5070 // Return 0 for convenience, even though MSVC returns some other undefined
5071 // value.
5072 return ConstantInt::get(Ty: Builder.getInt32Ty(), V: 0);
5073 }
5074
5075 if (BuiltinID == clang::AArch64::BI_AddressOfReturnAddress) {
5076 llvm::Function *F =
5077 CGM.getIntrinsic(IID: Intrinsic::addressofreturnaddress, Tys: AllocaInt8PtrTy);
5078 return Builder.CreateCall(Callee: F);
5079 }
5080
5081 if (BuiltinID == clang::AArch64::BI__builtin_sponentry) {
5082 llvm::Function *F = CGM.getIntrinsic(IID: Intrinsic::sponentry, Tys: AllocaInt8PtrTy);
5083 return Builder.CreateCall(Callee: F);
5084 }
5085
5086 if (BuiltinID == clang::AArch64::BI__mulh ||
5087 BuiltinID == clang::AArch64::BI__umulh) {
5088 llvm::Type *ResType = ConvertType(T: E->getType());
5089 llvm::Type *Int128Ty = llvm::IntegerType::get(C&: getLLVMContext(), NumBits: 128);
5090
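    // __mulh/__umulh return the high 64 bits of a 64x64-bit multiply: widen
    // both operands to i128, multiply, and take bits 127:64 of the product.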
5091 bool IsSigned = BuiltinID == clang::AArch64::BI__mulh;
5092 Value *LHS =
5093 Builder.CreateIntCast(V: EmitScalarExpr(E: E->getArg(Arg: 0)), DestTy: Int128Ty, isSigned: IsSigned);
5094 Value *RHS =
5095 Builder.CreateIntCast(V: EmitScalarExpr(E: E->getArg(Arg: 1)), DestTy: Int128Ty, isSigned: IsSigned);
5096
5097 Value *MulResult, *HigherBits;
5098 if (IsSigned) {
5099 MulResult = Builder.CreateNSWMul(LHS, RHS);
5100 HigherBits = Builder.CreateAShr(LHS: MulResult, RHS: 64);
5101 } else {
5102 MulResult = Builder.CreateNUWMul(LHS, RHS);
5103 HigherBits = Builder.CreateLShr(LHS: MulResult, RHS: 64);
5104 }
5105 HigherBits = Builder.CreateIntCast(V: HigherBits, DestTy: ResType, isSigned: IsSigned);
5106
5107 return HigherBits;
5108 }
5109
5110 if (BuiltinID == AArch64::BI__writex18byte ||
5111 BuiltinID == AArch64::BI__writex18word ||
5112 BuiltinID == AArch64::BI__writex18dword ||
5113 BuiltinID == AArch64::BI__writex18qword) {
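    // x18 is the reserved platform register (it holds the TEB pointer on
    // Windows); these intrinsics, like the __readx18*/__addx18*/__incx18*
    // families below, access memory at a byte offset from it.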
5114 // Process the args first
5115 Value *OffsetArg = EmitScalarExpr(E: E->getArg(Arg: 0));
5116 Value *DataArg = EmitScalarExpr(E: E->getArg(Arg: 1));
5117
5118 // Read x18 as i8*
5119 llvm::Value *X18 = readX18AsPtr(CGF&: *this);
5120
5121 // Store val at x18 + offset
5122 Value *Offset = Builder.CreateZExt(V: OffsetArg, DestTy: Int64Ty);
5123 Value *Ptr = Builder.CreateGEP(Ty: Int8Ty, Ptr: X18, IdxList: Offset);
5124 StoreInst *Store =
5125 Builder.CreateAlignedStore(Val: DataArg, Addr: Ptr, Align: CharUnits::One());
5126 return Store;
5127 }
5128
5129 if (BuiltinID == AArch64::BI__readx18byte ||
5130 BuiltinID == AArch64::BI__readx18word ||
5131 BuiltinID == AArch64::BI__readx18dword ||
5132 BuiltinID == AArch64::BI__readx18qword) {
5133 // Process the args first
5134 Value *OffsetArg = EmitScalarExpr(E: E->getArg(Arg: 0));
5135
5136 // Read x18 as i8*
5137 llvm::Value *X18 = readX18AsPtr(CGF&: *this);
5138
5139 // Load x18 + offset
5140 Value *Offset = Builder.CreateZExt(V: OffsetArg, DestTy: Int64Ty);
5141 Value *Ptr = Builder.CreateGEP(Ty: Int8Ty, Ptr: X18, IdxList: Offset);
5142 llvm::Type *IntTy = ConvertType(T: E->getType());
5143 LoadInst *Load = Builder.CreateAlignedLoad(Ty: IntTy, Addr: Ptr, Align: CharUnits::One());
5144 return Load;
5145 }
5146
5147 if (BuiltinID == AArch64::BI__addx18byte ||
5148 BuiltinID == AArch64::BI__addx18word ||
5149 BuiltinID == AArch64::BI__addx18dword ||
5150 BuiltinID == AArch64::BI__addx18qword ||
5151 BuiltinID == AArch64::BI__incx18byte ||
5152 BuiltinID == AArch64::BI__incx18word ||
5153 BuiltinID == AArch64::BI__incx18dword ||
5154 BuiltinID == AArch64::BI__incx18qword) {
5155 llvm::Type *IntTy;
5156 bool isIncrement;
5157 switch (BuiltinID) {
5158 case AArch64::BI__incx18byte:
5159 IntTy = Int8Ty;
5160 isIncrement = true;
5161 break;
5162 case AArch64::BI__incx18word:
5163 IntTy = Int16Ty;
5164 isIncrement = true;
5165 break;
5166 case AArch64::BI__incx18dword:
5167 IntTy = Int32Ty;
5168 isIncrement = true;
5169 break;
5170 case AArch64::BI__incx18qword:
5171 IntTy = Int64Ty;
5172 isIncrement = true;
5173 break;
5174 default:
5175 IntTy = ConvertType(T: E->getArg(Arg: 1)->getType());
5176 isIncrement = false;
5177 break;
5178 }
5179 // Process the args first
5180 Value *OffsetArg = EmitScalarExpr(E: E->getArg(Arg: 0));
5181 Value *ValToAdd =
5182 isIncrement ? ConstantInt::get(Ty: IntTy, V: 1) : EmitScalarExpr(E: E->getArg(Arg: 1));
5183
5184 // Read x18 as i8*
5185 llvm::Value *X18 = readX18AsPtr(CGF&: *this);
5186
5187 // Load x18 + offset
5188 Value *Offset = Builder.CreateZExt(V: OffsetArg, DestTy: Int64Ty);
5189 Value *Ptr = Builder.CreateGEP(Ty: Int8Ty, Ptr: X18, IdxList: Offset);
5190 LoadInst *Load = Builder.CreateAlignedLoad(Ty: IntTy, Addr: Ptr, Align: CharUnits::One());
5191
5192 // Add values
5193 Value *AddResult = Builder.CreateAdd(LHS: Load, RHS: ValToAdd);
5194
5195 // Store val at x18 + offset
5196 StoreInst *Store =
5197 Builder.CreateAlignedStore(Val: AddResult, Addr: Ptr, Align: CharUnits::One());
5198 return Store;
5199 }
5200
5201 if (BuiltinID == AArch64::BI_CopyDoubleFromInt64 ||
5202 BuiltinID == AArch64::BI_CopyFloatFromInt32 ||
5203 BuiltinID == AArch64::BI_CopyInt32FromFloat ||
5204 BuiltinID == AArch64::BI_CopyInt64FromDouble) {
5205 Value *Arg = EmitScalarExpr(E: E->getArg(Arg: 0));
5206 llvm::Type *RetTy = ConvertType(T: E->getType());
5207 return Builder.CreateBitCast(V: Arg, DestTy: RetTy);
5208 }
5209
5210 if (BuiltinID == AArch64::BI_CountLeadingOnes ||
5211 BuiltinID == AArch64::BI_CountLeadingOnes64 ||
5212 BuiltinID == AArch64::BI_CountLeadingZeros ||
5213 BuiltinID == AArch64::BI_CountLeadingZeros64) {
5214 Value *Arg = EmitScalarExpr(E: E->getArg(Arg: 0));
5215 llvm::Type *ArgType = Arg->getType();
5216
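    // _CountLeadingOnes(x) == ctlz(~x), so invert the bits and reuse ctlz.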
5217 if (BuiltinID == AArch64::BI_CountLeadingOnes ||
5218 BuiltinID == AArch64::BI_CountLeadingOnes64)
5219 Arg = Builder.CreateXor(LHS: Arg, RHS: Constant::getAllOnesValue(Ty: ArgType));
5220
5221 Function *F = CGM.getIntrinsic(IID: Intrinsic::ctlz, Tys: ArgType);
5222 Value *Result = Builder.CreateCall(Callee: F, Args: {Arg, Builder.getInt1(V: false)});
5223
5224 if (BuiltinID == AArch64::BI_CountLeadingOnes64 ||
5225 BuiltinID == AArch64::BI_CountLeadingZeros64)
5226 Result = Builder.CreateTrunc(V: Result, DestTy: Builder.getInt32Ty());
5227 return Result;
5228 }
5229
5230 if (BuiltinID == AArch64::BI_CountLeadingSigns ||
5231 BuiltinID == AArch64::BI_CountLeadingSigns64) {
5232 Value *Arg = EmitScalarExpr(E: E->getArg(Arg: 0));
5233
5234 Function *F = (BuiltinID == AArch64::BI_CountLeadingSigns)
5235 ? CGM.getIntrinsic(IID: Intrinsic::aarch64_cls)
5236 : CGM.getIntrinsic(IID: Intrinsic::aarch64_cls64);
5237
5238 Value *Result = Builder.CreateCall(Callee: F, Args: Arg, Name: "cls");
5239 if (BuiltinID == AArch64::BI_CountLeadingSigns64)
5240 Result = Builder.CreateTrunc(V: Result, DestTy: Builder.getInt32Ty());
5241 return Result;
5242 }
5243
5244 if (BuiltinID == AArch64::BI_CountOneBits ||
5245 BuiltinID == AArch64::BI_CountOneBits64) {
5246 Value *ArgValue = EmitScalarExpr(E: E->getArg(Arg: 0));
5247 llvm::Type *ArgType = ArgValue->getType();
5248 Function *F = CGM.getIntrinsic(IID: Intrinsic::ctpop, Tys: ArgType);
5249
5250 Value *Result = Builder.CreateCall(Callee: F, Args: ArgValue);
5251 if (BuiltinID == AArch64::BI_CountOneBits64)
5252 Result = Builder.CreateTrunc(V: Result, DestTy: Builder.getInt32Ty());
5253 return Result;
5254 }
5255
5256 if (BuiltinID == AArch64::BI__prefetch) {
5257 Value *Address = EmitScalarExpr(E: E->getArg(Arg: 0));
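    // llvm.prefetch(addr, rw, locality, cachetype): a read access (0) with
    // maximal temporal locality (3) into the data cache (1).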
5258 Value *RW = llvm::ConstantInt::get(Ty: Int32Ty, V: 0);
5259 Value *Locality = ConstantInt::get(Ty: Int32Ty, V: 3);
5260 Value *Data = llvm::ConstantInt::get(Ty: Int32Ty, V: 1);
5261 Function *F = CGM.getIntrinsic(IID: Intrinsic::prefetch, Tys: Address->getType());
5262 return Builder.CreateCall(Callee: F, Args: {Address, RW, Locality, Data});
5263 }
5264
5265 if (BuiltinID == AArch64::BI__hlt) {
5266 Function *F = CGM.getIntrinsic(IID: Intrinsic::aarch64_hlt);
5267 Builder.CreateCall(Callee: F, Args: {EmitScalarExpr(E: E->getArg(Arg: 0))});
5268
5269 // Return 0 for convenience, even though MSVC returns some other undefined
5270 // value.
5271 return ConstantInt::get(Ty: Builder.getInt32Ty(), V: 0);
5272 }
5273
5274 if (BuiltinID == NEON::BI__builtin_neon_vcvth_bf16_f32)
5275 return Builder.CreateFPTrunc(
5276 V: Builder.CreateBitCast(V: EmitScalarExpr(E: E->getArg(Arg: 0)),
5277 DestTy: Builder.getFloatTy()),
5278 DestTy: Builder.getBFloatTy());
5279
5280 // Handle MSVC intrinsics before argument evaluation to prevent double
5281 // evaluation.
5282 if (std::optional<MSVCIntrin> MsvcIntId =
5283 translateAarch64ToMsvcIntrin(BuiltinID))
5284 return EmitMSVCBuiltinExpr(BuiltinID: *MsvcIntId, E);
5285
  // Some intrinsics are equivalent; if so, use the base intrinsic ID.
5287 auto It = llvm::find_if(Range: NEONEquivalentIntrinsicMap, P: [BuiltinID](auto &P) {
5288 return P.first == BuiltinID;
5289 });
5290 if (It != end(arr: NEONEquivalentIntrinsicMap))
5291 BuiltinID = It->second;
5292
5293 // Check whether this is an SISD builtin.
5294 auto SISDMap = ArrayRef(AArch64SISDIntrinsicMap);
5295 const ARMVectorIntrinsicInfo *Builtin = findARMVectorIntrinsicInMap(
5296 IntrinsicMap: SISDMap, BuiltinID, MapProvenSorted&: AArch64SISDIntrinsicsProvenSorted);
5297 bool IsSISD = (Builtin != nullptr);
5298
5299 // Find out if any arguments are required to be integer constant
5300 // expressions.
5301 unsigned ICEArguments = 0;
5302 ASTContext::GetBuiltinTypeError Error;
5303 getContext().GetBuiltinType(ID: BuiltinID, Error, IntegerConstantArgs: &ICEArguments);
5304 assert(Error == ASTContext::GE_None && "Should not codegen an error");
5305
5306 llvm::SmallVector<Value*, 4> Ops;
5307 Address PtrOp0 = Address::invalid();
5308 // Note the assumption that SISD intrinsics do not contain extra arguments.
5309 // TODO: Fold this into a single function call instead of, effectively, two
5310 // separate checks.
5311 bool HasExtraArg = !IsSISD && HasExtraNeonArgument(BuiltinID);
5312 unsigned NumArgs = E->getNumArgs() - (HasExtraArg ? 1 : 0);
5313 for (unsigned i = 0, e = NumArgs; i != e; i++) {
5314 if (i == 0) {
5315 switch (BuiltinID) {
5316 case NEON::BI__builtin_neon_vld1_v:
5317 case NEON::BI__builtin_neon_vld1q_v:
5318 case NEON::BI__builtin_neon_vld1_dup_v:
5319 case NEON::BI__builtin_neon_vld1q_dup_v:
5320 case NEON::BI__builtin_neon_vld1_lane_v:
5321 case NEON::BI__builtin_neon_vld1q_lane_v:
5322 case NEON::BI__builtin_neon_vst1_v:
5323 case NEON::BI__builtin_neon_vst1q_v:
5324 case NEON::BI__builtin_neon_vst1_lane_v:
5325 case NEON::BI__builtin_neon_vst1q_lane_v:
5326 case NEON::BI__builtin_neon_vldap1_lane_s64:
5327 case NEON::BI__builtin_neon_vldap1q_lane_s64:
5328 case NEON::BI__builtin_neon_vstl1_lane_s64:
5329 case NEON::BI__builtin_neon_vstl1q_lane_s64:
5330 // Get the alignment for the argument in addition to the value;
5331 // we'll use it later.
5332 PtrOp0 = EmitPointerWithAlignment(Addr: E->getArg(Arg: 0));
5333 Ops.push_back(Elt: PtrOp0.emitRawPointer(CGF&: *this));
5334 continue;
5335 }
5336 }
5337 Ops.push_back(Elt: EmitScalarOrConstFoldImmArg(ICEArguments, Idx: i, E));
5338 }
5339
5340 if (Builtin) {
5341 Value *Result = EmitCommonNeonSISDBuiltinExpr(CGF&: *this, SISDInfo: *Builtin, Ops, E);
5342 assert(Result && "SISD intrinsic should have been handled");
5343 return Result;
5344 }
5345
5346 const Expr *Arg = E->getArg(Arg: E->getNumArgs()-1);
5347 NeonTypeFlags Type(0);
5348 if (std::optional<llvm::APSInt> Result =
5349 Arg->getIntegerConstantExpr(Ctx: getContext()))
5350 // Determine the type of this overloaded NEON intrinsic.
5351 Type = NeonTypeFlags(Result->getZExtValue());
5352
5353 bool usgn = Type.isUnsigned();
5354 bool quad = Type.isQuad();
5355 unsigned Int;
5356
5357 // Not all intrinsics handled by the common case work for AArch64 yet, so only
5358 // defer to common code if it's been added to our special map.
5359 Builtin = findARMVectorIntrinsicInMap(IntrinsicMap: AArch64SIMDIntrinsicMap, BuiltinID,
5360 MapProvenSorted&: AArch64SIMDIntrinsicsProvenSorted);
5361
5362 if (Builtin)
5363 return EmitCommonNeonBuiltinExpr(
5364 BuiltinID: Builtin->BuiltinID, LLVMIntrinsic: Builtin->LLVMIntrinsic, AltLLVMIntrinsic: Builtin->AltLLVMIntrinsic,
5365 NameHint: Builtin->NameHint, Modifier: Builtin->TypeModifier, E, Ops,
5366 /*never use addresses*/ PtrOp0: Address::invalid(), PtrOp1: Address::invalid(), Arch);
5367
5368 if (Value *V = EmitAArch64TblBuiltinExpr(CGF&: *this, BuiltinID, E, Ops, Arch))
5369 return V;
5370
5371 // Handle non-overloaded intrinsics first.
5372 switch (BuiltinID) {
5373 default: break;
5374 case NEON::BI__builtin_neon_vabsh_f16:
5375 return EmitNeonCall(F: CGM.getIntrinsic(IID: Intrinsic::fabs, Tys: HalfTy), Ops, name: "vabs");
5376 case NEON::BI__builtin_neon_vaddq_p128: {
5377 llvm::Type *Ty = GetNeonType(CGF: this, TypeFlags: NeonTypeFlags::Poly128);
5378 Ops[0] = Builder.CreateBitCast(V: Ops[0], DestTy: Ty);
5379 Ops[1] = Builder.CreateBitCast(V: Ops[1], DestTy: Ty);
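    // Polynomial addition over GF(2) is carry-less, i.e. a bitwise XOR.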
5380 Ops[0] = Builder.CreateXor(LHS: Ops[0], RHS: Ops[1]);
5381 llvm::Type *Int128Ty = llvm::Type::getIntNTy(C&: getLLVMContext(), N: 128);
5382 return Builder.CreateBitCast(V: Ops[0], DestTy: Int128Ty);
5383 }
5384 case NEON::BI__builtin_neon_vldrq_p128: {
5385 llvm::Type *Int128Ty = llvm::Type::getIntNTy(C&: getLLVMContext(), N: 128);
5386 return Builder.CreateAlignedLoad(Ty: Int128Ty, Addr: Ops[0],
5387 Align: CharUnits::fromQuantity(Quantity: 16));
5388 }
5389 case NEON::BI__builtin_neon_vstrq_p128: {
5390 return Builder.CreateDefaultAlignedStore(Val: Ops[1], Addr: Ops[0]);
5391 }
5392 case NEON::BI__builtin_neon_vcvts_f32_u32:
5393 case NEON::BI__builtin_neon_vcvtd_f64_u64:
5394 usgn = true;
5395 [[fallthrough]];
5396 case NEON::BI__builtin_neon_vcvts_f32_s32:
5397 case NEON::BI__builtin_neon_vcvtd_f64_s64: {
5398 bool Is64 = Ops[0]->getType()->getPrimitiveSizeInBits() == 64;
5399 llvm::Type *InTy = Is64 ? Int64Ty : Int32Ty;
5400 llvm::Type *FTy = Is64 ? DoubleTy : FloatTy;
5401 Ops[0] = Builder.CreateBitCast(V: Ops[0], DestTy: InTy);
5402 if (usgn)
5403 return Builder.CreateUIToFP(V: Ops[0], DestTy: FTy);
5404 return Builder.CreateSIToFP(V: Ops[0], DestTy: FTy);
5405 }
5406 case NEON::BI__builtin_neon_vcvth_f16_u16:
5407 case NEON::BI__builtin_neon_vcvth_f16_u32:
5408 case NEON::BI__builtin_neon_vcvth_f16_u64:
5409 usgn = true;
5410 [[fallthrough]];
5411 case NEON::BI__builtin_neon_vcvth_f16_s16:
5412 case NEON::BI__builtin_neon_vcvth_f16_s32:
5413 case NEON::BI__builtin_neon_vcvth_f16_s64: {
5414 llvm::Type *FTy = HalfTy;
5415 llvm::Type *InTy;
5416 if (Ops[0]->getType()->getPrimitiveSizeInBits() == 64)
5417 InTy = Int64Ty;
5418 else if (Ops[0]->getType()->getPrimitiveSizeInBits() == 32)
5419 InTy = Int32Ty;
5420 else
5421 InTy = Int16Ty;
5422 Ops[0] = Builder.CreateBitCast(V: Ops[0], DestTy: InTy);
5423 if (usgn)
5424 return Builder.CreateUIToFP(V: Ops[0], DestTy: FTy);
5425 return Builder.CreateSIToFP(V: Ops[0], DestTy: FTy);
5426 }
5427 case NEON::BI__builtin_neon_vcvtah_u16_f16:
5428 case NEON::BI__builtin_neon_vcvtmh_u16_f16:
5429 case NEON::BI__builtin_neon_vcvtnh_u16_f16:
5430 case NEON::BI__builtin_neon_vcvtph_u16_f16:
5431 case NEON::BI__builtin_neon_vcvth_u16_f16:
5432 case NEON::BI__builtin_neon_vcvtah_s16_f16:
5433 case NEON::BI__builtin_neon_vcvtmh_s16_f16:
5434 case NEON::BI__builtin_neon_vcvtnh_s16_f16:
5435 case NEON::BI__builtin_neon_vcvtph_s16_f16:
5436 case NEON::BI__builtin_neon_vcvth_s16_f16: {
5437 llvm::Type *InTy = Int16Ty;
5438 llvm::Type* FTy = HalfTy;
5439 llvm::Type *Tys[2] = {InTy, FTy};
5440 switch (BuiltinID) {
5441 default: llvm_unreachable("missing builtin ID in switch!");
5442 case NEON::BI__builtin_neon_vcvtah_u16_f16:
5443 Int = Intrinsic::aarch64_neon_fcvtau; break;
5444 case NEON::BI__builtin_neon_vcvtmh_u16_f16:
5445 Int = Intrinsic::aarch64_neon_fcvtmu; break;
5446 case NEON::BI__builtin_neon_vcvtnh_u16_f16:
5447 Int = Intrinsic::aarch64_neon_fcvtnu; break;
5448 case NEON::BI__builtin_neon_vcvtph_u16_f16:
5449 Int = Intrinsic::aarch64_neon_fcvtpu; break;
5450 case NEON::BI__builtin_neon_vcvth_u16_f16:
5451 Int = Intrinsic::aarch64_neon_fcvtzu; break;
5452 case NEON::BI__builtin_neon_vcvtah_s16_f16:
5453 Int = Intrinsic::aarch64_neon_fcvtas; break;
5454 case NEON::BI__builtin_neon_vcvtmh_s16_f16:
5455 Int = Intrinsic::aarch64_neon_fcvtms; break;
5456 case NEON::BI__builtin_neon_vcvtnh_s16_f16:
5457 Int = Intrinsic::aarch64_neon_fcvtns; break;
5458 case NEON::BI__builtin_neon_vcvtph_s16_f16:
5459 Int = Intrinsic::aarch64_neon_fcvtps; break;
5460 case NEON::BI__builtin_neon_vcvth_s16_f16:
5461 Int = Intrinsic::aarch64_neon_fcvtzs; break;
5462 }
5463 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys), Ops, name: "fcvt");
5464 }
5465 case NEON::BI__builtin_neon_vcaleh_f16:
5466 case NEON::BI__builtin_neon_vcalth_f16:
5467 case NEON::BI__builtin_neon_vcageh_f16:
5468 case NEON::BI__builtin_neon_vcagth_f16: {
5469 llvm::Type* InTy = Int32Ty;
5470 llvm::Type* FTy = HalfTy;
5471 llvm::Type *Tys[2] = {InTy, FTy};
5472 switch (BuiltinID) {
5473 default: llvm_unreachable("missing builtin ID in switch!");
5474 case NEON::BI__builtin_neon_vcageh_f16:
5475 Int = Intrinsic::aarch64_neon_facge; break;
5476 case NEON::BI__builtin_neon_vcagth_f16:
5477 Int = Intrinsic::aarch64_neon_facgt; break;
5478 case NEON::BI__builtin_neon_vcaleh_f16:
5479 Int = Intrinsic::aarch64_neon_facge; std::swap(a&: Ops[0], b&: Ops[1]); break;
5480 case NEON::BI__builtin_neon_vcalth_f16:
5481 Int = Intrinsic::aarch64_neon_facgt; std::swap(a&: Ops[0], b&: Ops[1]); break;
5482 }
5483 Ops[0] = EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys), Ops, name: "facg");
5484 return Builder.CreateTrunc(V: Ops[0], DestTy: Int16Ty);
5485 }
5486 case NEON::BI__builtin_neon_vcvth_n_s16_f16:
5487 case NEON::BI__builtin_neon_vcvth_n_u16_f16: {
5488 llvm::Type* InTy = Int32Ty;
5489 llvm::Type* FTy = HalfTy;
5490 llvm::Type *Tys[2] = {InTy, FTy};
5491 switch (BuiltinID) {
5492 default: llvm_unreachable("missing builtin ID in switch!");
5493 case NEON::BI__builtin_neon_vcvth_n_s16_f16:
5494 Int = Intrinsic::aarch64_neon_vcvtfp2fxs; break;
5495 case NEON::BI__builtin_neon_vcvth_n_u16_f16:
5496 Int = Intrinsic::aarch64_neon_vcvtfp2fxu; break;
5497 }
5498 Ops[0] = EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys), Ops, name: "fcvth_n");
5499 return Builder.CreateTrunc(V: Ops[0], DestTy: Int16Ty);
5500 }
5501 case NEON::BI__builtin_neon_vcvth_n_f16_s16:
5502 case NEON::BI__builtin_neon_vcvth_n_f16_u16: {
5503 llvm::Type* FTy = HalfTy;
5504 llvm::Type* InTy = Int32Ty;
5505 llvm::Type *Tys[2] = {FTy, InTy};
5506 switch (BuiltinID) {
5507 default: llvm_unreachable("missing builtin ID in switch!");
5508 case NEON::BI__builtin_neon_vcvth_n_f16_s16:
5509 Int = Intrinsic::aarch64_neon_vcvtfxs2fp;
5510 Ops[0] = Builder.CreateSExt(V: Ops[0], DestTy: InTy, Name: "sext");
5511 break;
5512 case NEON::BI__builtin_neon_vcvth_n_f16_u16:
5513 Int = Intrinsic::aarch64_neon_vcvtfxu2fp;
5514 Ops[0] = Builder.CreateZExt(V: Ops[0], DestTy: InTy);
5515 break;
5516 }
5517 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys), Ops, name: "fcvth_n");
5518 }
5519 case NEON::BI__builtin_neon_vpaddd_s64: {
    // TODO: Isn't this handled by EmitCommonNeonSISDBuiltinExpr?
5522 auto *Ty = llvm::FixedVectorType::get(ElementType: Int64Ty, NumElts: 2);
    // The vector is v2i64, so make sure it's bitcast to that.
5524 Ops[0] = Builder.CreateBitCast(V: Ops[0], DestTy: Ty, Name: "v2i64");
5525 llvm::Value *Idx0 = llvm::ConstantInt::get(Ty: SizeTy, V: 0);
5526 llvm::Value *Idx1 = llvm::ConstantInt::get(Ty: SizeTy, V: 1);
5527 Value *Op0 = Builder.CreateExtractElement(Vec: Ops[0], Idx: Idx0, Name: "lane0");
5528 Value *Op1 = Builder.CreateExtractElement(Vec: Ops[0], Idx: Idx1, Name: "lane1");
    // Pairwise addition of a v2i64 into a scalar i64.
5530 return Builder.CreateAdd(LHS: Op0, RHS: Op1, Name: "vpaddd");
5531 }
5532 case NEON::BI__builtin_neon_vpaddd_f64: {
5533 auto *Ty = llvm::FixedVectorType::get(ElementType: DoubleTy, NumElts: 2);
5534 // The vector is v2f64, so make sure it's bitcast to that.
5535 Ops[0] = Builder.CreateBitCast(V: Ops[0], DestTy: Ty, Name: "v2f64");
5536 llvm::Value *Idx0 = llvm::ConstantInt::get(Ty: SizeTy, V: 0);
5537 llvm::Value *Idx1 = llvm::ConstantInt::get(Ty: SizeTy, V: 1);
5538 Value *Op0 = Builder.CreateExtractElement(Vec: Ops[0], Idx: Idx0, Name: "lane0");
5539 Value *Op1 = Builder.CreateExtractElement(Vec: Ops[0], Idx: Idx1, Name: "lane1");
5540 // Pairwise addition of a v2f64 into a scalar f64.
5541 return Builder.CreateFAdd(L: Op0, R: Op1, Name: "vpaddd");
5542 }
5543 case NEON::BI__builtin_neon_vpadds_f32: {
5544 auto *Ty = llvm::FixedVectorType::get(ElementType: FloatTy, NumElts: 2);
5545 // The vector is v2f32, so make sure it's bitcast to that.
5546 Ops[0] = Builder.CreateBitCast(V: Ops[0], DestTy: Ty, Name: "v2f32");
5547 llvm::Value *Idx0 = llvm::ConstantInt::get(Ty: SizeTy, V: 0);
5548 llvm::Value *Idx1 = llvm::ConstantInt::get(Ty: SizeTy, V: 1);
5549 Value *Op0 = Builder.CreateExtractElement(Vec: Ops[0], Idx: Idx0, Name: "lane0");
5550 Value *Op1 = Builder.CreateExtractElement(Vec: Ops[0], Idx: Idx1, Name: "lane1");
5551 // Pairwise addition of a v2f32 into a scalar f32.
5552 return Builder.CreateFAdd(L: Op0, R: Op1, Name: "vpaddd");
5553 }
5554 case NEON::BI__builtin_neon_vceqzd_s64:
5555 return EmitAArch64CompareBuiltinExpr(
5556 Op: Ops[0], Ty: ConvertType(T: E->getCallReturnType(Ctx: getContext())),
5557 Pred: ICmpInst::ICMP_EQ, Name: "vceqz");
5558 case NEON::BI__builtin_neon_vceqzd_f64:
5559 case NEON::BI__builtin_neon_vceqzs_f32:
5560 case NEON::BI__builtin_neon_vceqzh_f16:
5561 return EmitAArch64CompareBuiltinExpr(
5562 Op: Ops[0], Ty: ConvertType(T: E->getCallReturnType(Ctx: getContext())),
5563 Pred: ICmpInst::FCMP_OEQ, Name: "vceqz");
5564 case NEON::BI__builtin_neon_vcgezd_s64:
5565 return EmitAArch64CompareBuiltinExpr(
5566 Op: Ops[0], Ty: ConvertType(T: E->getCallReturnType(Ctx: getContext())),
5567 Pred: ICmpInst::ICMP_SGE, Name: "vcgez");
5568 case NEON::BI__builtin_neon_vcgezd_f64:
5569 case NEON::BI__builtin_neon_vcgezs_f32:
5570 case NEON::BI__builtin_neon_vcgezh_f16:
5571 return EmitAArch64CompareBuiltinExpr(
5572 Op: Ops[0], Ty: ConvertType(T: E->getCallReturnType(Ctx: getContext())),
5573 Pred: ICmpInst::FCMP_OGE, Name: "vcgez");
5574 case NEON::BI__builtin_neon_vclezd_s64:
5575 return EmitAArch64CompareBuiltinExpr(
5576 Op: Ops[0], Ty: ConvertType(T: E->getCallReturnType(Ctx: getContext())),
5577 Pred: ICmpInst::ICMP_SLE, Name: "vclez");
5578 case NEON::BI__builtin_neon_vclezd_f64:
5579 case NEON::BI__builtin_neon_vclezs_f32:
5580 case NEON::BI__builtin_neon_vclezh_f16:
5581 return EmitAArch64CompareBuiltinExpr(
5582 Op: Ops[0], Ty: ConvertType(T: E->getCallReturnType(Ctx: getContext())),
5583 Pred: ICmpInst::FCMP_OLE, Name: "vclez");
5584 case NEON::BI__builtin_neon_vcgtzd_s64:
5585 return EmitAArch64CompareBuiltinExpr(
5586 Op: Ops[0], Ty: ConvertType(T: E->getCallReturnType(Ctx: getContext())),
5587 Pred: ICmpInst::ICMP_SGT, Name: "vcgtz");
5588 case NEON::BI__builtin_neon_vcgtzd_f64:
5589 case NEON::BI__builtin_neon_vcgtzs_f32:
5590 case NEON::BI__builtin_neon_vcgtzh_f16:
5591 return EmitAArch64CompareBuiltinExpr(
5592 Op: Ops[0], Ty: ConvertType(T: E->getCallReturnType(Ctx: getContext())),
5593 Pred: ICmpInst::FCMP_OGT, Name: "vcgtz");
5594 case NEON::BI__builtin_neon_vcltzd_s64:
5595 return EmitAArch64CompareBuiltinExpr(
5596 Op: Ops[0], Ty: ConvertType(T: E->getCallReturnType(Ctx: getContext())),
5597 Pred: ICmpInst::ICMP_SLT, Name: "vcltz");
5598
5599 case NEON::BI__builtin_neon_vcltzd_f64:
5600 case NEON::BI__builtin_neon_vcltzs_f32:
5601 case NEON::BI__builtin_neon_vcltzh_f16:
5602 return EmitAArch64CompareBuiltinExpr(
5603 Op: Ops[0], Ty: ConvertType(T: E->getCallReturnType(Ctx: getContext())),
5604 Pred: ICmpInst::FCMP_OLT, Name: "vcltz");
5605
5606 case NEON::BI__builtin_neon_vceqzd_u64: {
5607 return EmitAArch64CompareBuiltinExpr(
5608 Op: Ops[0], Ty: ConvertType(T: E->getCallReturnType(Ctx: getContext())),
5609 Pred: ICmpInst::ICMP_EQ, Name: "vceqzd");
5610 }
5611 case NEON::BI__builtin_neon_vceqd_f64:
5612 case NEON::BI__builtin_neon_vcled_f64:
5613 case NEON::BI__builtin_neon_vcltd_f64:
5614 case NEON::BI__builtin_neon_vcged_f64:
5615 case NEON::BI__builtin_neon_vcgtd_f64: {
5616 llvm::CmpInst::Predicate P;
5617 switch (BuiltinID) {
5618 default: llvm_unreachable("missing builtin ID in switch!");
5619 case NEON::BI__builtin_neon_vceqd_f64: P = llvm::FCmpInst::FCMP_OEQ; break;
5620 case NEON::BI__builtin_neon_vcled_f64: P = llvm::FCmpInst::FCMP_OLE; break;
5621 case NEON::BI__builtin_neon_vcltd_f64: P = llvm::FCmpInst::FCMP_OLT; break;
5622 case NEON::BI__builtin_neon_vcged_f64: P = llvm::FCmpInst::FCMP_OGE; break;
5623 case NEON::BI__builtin_neon_vcgtd_f64: P = llvm::FCmpInst::FCMP_OGT; break;
5624 }
5625 Ops[0] = Builder.CreateBitCast(V: Ops[0], DestTy: DoubleTy);
5626 Ops[1] = Builder.CreateBitCast(V: Ops[1], DestTy: DoubleTy);
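    // IEEE 754 equality is a quiet comparison, while the ordered relational
    // predicates signal on NaN, hence the signaling FCmpS variant for
    // everything but FCMP_OEQ. The f32 and f16 cases below do the same.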
5627 if (P == llvm::FCmpInst::FCMP_OEQ)
5628 Ops[0] = Builder.CreateFCmp(P, LHS: Ops[0], RHS: Ops[1]);
5629 else
5630 Ops[0] = Builder.CreateFCmpS(P, LHS: Ops[0], RHS: Ops[1]);
5631 return Builder.CreateSExt(V: Ops[0], DestTy: Int64Ty, Name: "vcmpd");
5632 }
5633 case NEON::BI__builtin_neon_vceqs_f32:
5634 case NEON::BI__builtin_neon_vcles_f32:
5635 case NEON::BI__builtin_neon_vclts_f32:
5636 case NEON::BI__builtin_neon_vcges_f32:
5637 case NEON::BI__builtin_neon_vcgts_f32: {
5638 llvm::CmpInst::Predicate P;
5639 switch (BuiltinID) {
5640 default: llvm_unreachable("missing builtin ID in switch!");
5641 case NEON::BI__builtin_neon_vceqs_f32: P = llvm::FCmpInst::FCMP_OEQ; break;
5642 case NEON::BI__builtin_neon_vcles_f32: P = llvm::FCmpInst::FCMP_OLE; break;
5643 case NEON::BI__builtin_neon_vclts_f32: P = llvm::FCmpInst::FCMP_OLT; break;
5644 case NEON::BI__builtin_neon_vcges_f32: P = llvm::FCmpInst::FCMP_OGE; break;
5645 case NEON::BI__builtin_neon_vcgts_f32: P = llvm::FCmpInst::FCMP_OGT; break;
5646 }
5647 Ops[0] = Builder.CreateBitCast(V: Ops[0], DestTy: FloatTy);
5648 Ops[1] = Builder.CreateBitCast(V: Ops[1], DestTy: FloatTy);
5649 if (P == llvm::FCmpInst::FCMP_OEQ)
5650 Ops[0] = Builder.CreateFCmp(P, LHS: Ops[0], RHS: Ops[1]);
5651 else
5652 Ops[0] = Builder.CreateFCmpS(P, LHS: Ops[0], RHS: Ops[1]);
5653 return Builder.CreateSExt(V: Ops[0], DestTy: Int32Ty, Name: "vcmpd");
5654 }
5655 case NEON::BI__builtin_neon_vceqh_f16:
5656 case NEON::BI__builtin_neon_vcleh_f16:
5657 case NEON::BI__builtin_neon_vclth_f16:
5658 case NEON::BI__builtin_neon_vcgeh_f16:
5659 case NEON::BI__builtin_neon_vcgth_f16: {
5660 llvm::CmpInst::Predicate P;
5661 switch (BuiltinID) {
5662 default: llvm_unreachable("missing builtin ID in switch!");
5663 case NEON::BI__builtin_neon_vceqh_f16: P = llvm::FCmpInst::FCMP_OEQ; break;
5664 case NEON::BI__builtin_neon_vcleh_f16: P = llvm::FCmpInst::FCMP_OLE; break;
5665 case NEON::BI__builtin_neon_vclth_f16: P = llvm::FCmpInst::FCMP_OLT; break;
5666 case NEON::BI__builtin_neon_vcgeh_f16: P = llvm::FCmpInst::FCMP_OGE; break;
5667 case NEON::BI__builtin_neon_vcgth_f16: P = llvm::FCmpInst::FCMP_OGT; break;
5668 }
5669 Ops[0] = Builder.CreateBitCast(V: Ops[0], DestTy: HalfTy);
5670 Ops[1] = Builder.CreateBitCast(V: Ops[1], DestTy: HalfTy);
5671 if (P == llvm::FCmpInst::FCMP_OEQ)
5672 Ops[0] = Builder.CreateFCmp(P, LHS: Ops[0], RHS: Ops[1]);
5673 else
5674 Ops[0] = Builder.CreateFCmpS(P, LHS: Ops[0], RHS: Ops[1]);
5675 return Builder.CreateSExt(V: Ops[0], DestTy: Int16Ty, Name: "vcmpd");
5676 }
5677 case NEON::BI__builtin_neon_vceqd_s64:
5678 case NEON::BI__builtin_neon_vceqd_u64:
5679 case NEON::BI__builtin_neon_vcgtd_s64:
5680 case NEON::BI__builtin_neon_vcgtd_u64:
5681 case NEON::BI__builtin_neon_vcltd_s64:
5682 case NEON::BI__builtin_neon_vcltd_u64:
5683 case NEON::BI__builtin_neon_vcged_u64:
5684 case NEON::BI__builtin_neon_vcged_s64:
5685 case NEON::BI__builtin_neon_vcled_u64:
5686 case NEON::BI__builtin_neon_vcled_s64: {
5687 llvm::CmpInst::Predicate P;
5688 switch (BuiltinID) {
5689 default: llvm_unreachable("missing builtin ID in switch!");
5690 case NEON::BI__builtin_neon_vceqd_s64:
5691 case NEON::BI__builtin_neon_vceqd_u64:P = llvm::ICmpInst::ICMP_EQ;break;
5692 case NEON::BI__builtin_neon_vcgtd_s64:P = llvm::ICmpInst::ICMP_SGT;break;
5693 case NEON::BI__builtin_neon_vcgtd_u64:P = llvm::ICmpInst::ICMP_UGT;break;
5694 case NEON::BI__builtin_neon_vcltd_s64:P = llvm::ICmpInst::ICMP_SLT;break;
5695 case NEON::BI__builtin_neon_vcltd_u64:P = llvm::ICmpInst::ICMP_ULT;break;
5696 case NEON::BI__builtin_neon_vcged_u64:P = llvm::ICmpInst::ICMP_UGE;break;
5697 case NEON::BI__builtin_neon_vcged_s64:P = llvm::ICmpInst::ICMP_SGE;break;
5698 case NEON::BI__builtin_neon_vcled_u64:P = llvm::ICmpInst::ICMP_ULE;break;
5699 case NEON::BI__builtin_neon_vcled_s64:P = llvm::ICmpInst::ICMP_SLE;break;
5700 }
5701 Ops[0] = Builder.CreateBitCast(V: Ops[0], DestTy: Int64Ty);
5702 Ops[1] = Builder.CreateBitCast(V: Ops[1], DestTy: Int64Ty);
5703 Ops[0] = Builder.CreateICmp(P, LHS: Ops[0], RHS: Ops[1]);
5704 return Builder.CreateSExt(V: Ops[0], DestTy: Int64Ty, Name: "vceqd");
5705 }
5706 case NEON::BI__builtin_neon_vnegd_s64:
5707 return Builder.CreateNeg(V: Ops[0], Name: "vnegd");
5708 case NEON::BI__builtin_neon_vnegh_f16:
5709 return Builder.CreateFNeg(V: Ops[0], Name: "vnegh");
5710 case NEON::BI__builtin_neon_vtstd_s64:
5711 case NEON::BI__builtin_neon_vtstd_u64: {
5712 Ops[0] = Builder.CreateBitCast(V: Ops[0], DestTy: Int64Ty);
5713 Ops[1] = Builder.CreateBitCast(V: Ops[1], DestTy: Int64Ty);
5714 Ops[0] = Builder.CreateAnd(LHS: Ops[0], RHS: Ops[1]);
5715 Ops[0] = Builder.CreateICmp(P: ICmpInst::ICMP_NE, LHS: Ops[0],
5716 RHS: llvm::Constant::getNullValue(Ty: Int64Ty));
5717 return Builder.CreateSExt(V: Ops[0], DestTy: Int64Ty, Name: "vtstd");
5718 }
5719 case NEON::BI__builtin_neon_vset_lane_i8:
5720 case NEON::BI__builtin_neon_vset_lane_i16:
5721 case NEON::BI__builtin_neon_vset_lane_i32:
5722 case NEON::BI__builtin_neon_vset_lane_i64:
5723 case NEON::BI__builtin_neon_vset_lane_bf16:
5724 case NEON::BI__builtin_neon_vset_lane_f32:
5725 case NEON::BI__builtin_neon_vsetq_lane_i8:
5726 case NEON::BI__builtin_neon_vsetq_lane_i16:
5727 case NEON::BI__builtin_neon_vsetq_lane_i32:
5728 case NEON::BI__builtin_neon_vsetq_lane_i64:
5729 case NEON::BI__builtin_neon_vsetq_lane_bf16:
5730 case NEON::BI__builtin_neon_vsetq_lane_f32:
5731 return Builder.CreateInsertElement(Vec: Ops[1], NewElt: Ops[0], Idx: Ops[2], Name: "vset_lane");
5732 case NEON::BI__builtin_neon_vset_lane_f64:
5733 // The vector type needs a cast for the v1f64 variant.
5734 Ops[1] =
5735 Builder.CreateBitCast(V: Ops[1], DestTy: llvm::FixedVectorType::get(ElementType: DoubleTy, NumElts: 1));
5736 return Builder.CreateInsertElement(Vec: Ops[1], NewElt: Ops[0], Idx: Ops[2], Name: "vset_lane");
5737 case NEON::BI__builtin_neon_vset_lane_mf8:
5738 case NEON::BI__builtin_neon_vsetq_lane_mf8:
    // The mfloat8 input is lowered as a vector in IR, so bitcast it to the
    // scalar i8 type before inserting it.
5740 Ops[0] =
5741 Builder.CreateBitCast(V: Ops[0], DestTy: llvm::Type::getInt8Ty(C&: getLLVMContext()));
5742 return Builder.CreateInsertElement(Vec: Ops[1], NewElt: Ops[0], Idx: Ops[2], Name: "vset_lane");
5743 case NEON::BI__builtin_neon_vsetq_lane_f64:
5744 // The vector type needs a cast for the v2f64 variant.
5745 Ops[1] =
5746 Builder.CreateBitCast(V: Ops[1], DestTy: llvm::FixedVectorType::get(ElementType: DoubleTy, NumElts: 2));
5747 return Builder.CreateInsertElement(Vec: Ops[1], NewElt: Ops[0], Idx: Ops[2], Name: "vset_lane");
5748
5749 case NEON::BI__builtin_neon_vget_lane_i8:
5750 case NEON::BI__builtin_neon_vdupb_lane_i8:
5751 Ops[0] =
5752 Builder.CreateBitCast(V: Ops[0], DestTy: llvm::FixedVectorType::get(ElementType: Int8Ty, NumElts: 8));
5753 return Builder.CreateExtractElement(Vec: Ops[0], Idx: Ops[1], Name: "vget_lane");
5754 case NEON::BI__builtin_neon_vgetq_lane_i8:
5755 case NEON::BI__builtin_neon_vdupb_laneq_i8:
5756 Ops[0] =
5757 Builder.CreateBitCast(V: Ops[0], DestTy: llvm::FixedVectorType::get(ElementType: Int8Ty, NumElts: 16));
5758 return Builder.CreateExtractElement(Vec: Ops[0], Idx: Ops[1], Name: "vgetq_lane");
5759 case NEON::BI__builtin_neon_vget_lane_mf8:
5760 case NEON::BI__builtin_neon_vdupb_lane_mf8:
5761 case NEON::BI__builtin_neon_vgetq_lane_mf8:
5762 case NEON::BI__builtin_neon_vdupb_laneq_mf8:
5763 return Builder.CreateExtractElement(Vec: Ops[0], Idx: Ops[1], Name: "vget_lane");
5764 case NEON::BI__builtin_neon_vget_lane_i16:
5765 case NEON::BI__builtin_neon_vduph_lane_i16:
5766 Ops[0] =
5767 Builder.CreateBitCast(V: Ops[0], DestTy: llvm::FixedVectorType::get(ElementType: Int16Ty, NumElts: 4));
5768 return Builder.CreateExtractElement(Vec: Ops[0], Idx: Ops[1], Name: "vget_lane");
5769 case NEON::BI__builtin_neon_vgetq_lane_i16:
5770 case NEON::BI__builtin_neon_vduph_laneq_i16:
5771 Ops[0] =
5772 Builder.CreateBitCast(V: Ops[0], DestTy: llvm::FixedVectorType::get(ElementType: Int16Ty, NumElts: 8));
5773 return Builder.CreateExtractElement(Vec: Ops[0], Idx: Ops[1], Name: "vgetq_lane");
5774 case NEON::BI__builtin_neon_vget_lane_i32:
5775 case NEON::BI__builtin_neon_vdups_lane_i32:
5776 Ops[0] =
5777 Builder.CreateBitCast(V: Ops[0], DestTy: llvm::FixedVectorType::get(ElementType: Int32Ty, NumElts: 2));
5778 return Builder.CreateExtractElement(Vec: Ops[0], Idx: Ops[1], Name: "vget_lane");
5779 case NEON::BI__builtin_neon_vdups_lane_f32:
5780 Ops[0] =
5781 Builder.CreateBitCast(V: Ops[0], DestTy: llvm::FixedVectorType::get(ElementType: FloatTy, NumElts: 2));
5782 return Builder.CreateExtractElement(Vec: Ops[0], Idx: Ops[1], Name: "vdups_lane");
5783 case NEON::BI__builtin_neon_vgetq_lane_i32:
5784 case NEON::BI__builtin_neon_vdups_laneq_i32:
5785 Ops[0] =
5786 Builder.CreateBitCast(V: Ops[0], DestTy: llvm::FixedVectorType::get(ElementType: Int32Ty, NumElts: 4));
5787 return Builder.CreateExtractElement(Vec: Ops[0], Idx: Ops[1], Name: "vgetq_lane");
5788 case NEON::BI__builtin_neon_vget_lane_i64:
5789 case NEON::BI__builtin_neon_vdupd_lane_i64:
5790 Ops[0] =
5791 Builder.CreateBitCast(V: Ops[0], DestTy: llvm::FixedVectorType::get(ElementType: Int64Ty, NumElts: 1));
5792 return Builder.CreateExtractElement(Vec: Ops[0], Idx: Ops[1], Name: "vget_lane");
5793 case NEON::BI__builtin_neon_vdupd_lane_f64:
5794 Ops[0] =
5795 Builder.CreateBitCast(V: Ops[0], DestTy: llvm::FixedVectorType::get(ElementType: DoubleTy, NumElts: 1));
5796 return Builder.CreateExtractElement(Vec: Ops[0], Idx: Ops[1], Name: "vdupd_lane");
5797 case NEON::BI__builtin_neon_vgetq_lane_i64:
5798 case NEON::BI__builtin_neon_vdupd_laneq_i64:
5799 Ops[0] =
5800 Builder.CreateBitCast(V: Ops[0], DestTy: llvm::FixedVectorType::get(ElementType: Int64Ty, NumElts: 2));
5801 return Builder.CreateExtractElement(Vec: Ops[0], Idx: Ops[1], Name: "vgetq_lane");
5802 case NEON::BI__builtin_neon_vget_lane_f32:
5803 Ops[0] =
5804 Builder.CreateBitCast(V: Ops[0], DestTy: llvm::FixedVectorType::get(ElementType: FloatTy, NumElts: 2));
5805 return Builder.CreateExtractElement(Vec: Ops[0], Idx: Ops[1], Name: "vget_lane");
5806 case NEON::BI__builtin_neon_vget_lane_f64:
5807 Ops[0] =
5808 Builder.CreateBitCast(V: Ops[0], DestTy: llvm::FixedVectorType::get(ElementType: DoubleTy, NumElts: 1));
5809 return Builder.CreateExtractElement(Vec: Ops[0], Idx: Ops[1], Name: "vget_lane");
5810 case NEON::BI__builtin_neon_vgetq_lane_f32:
5811 case NEON::BI__builtin_neon_vdups_laneq_f32:
5812 Ops[0] =
5813 Builder.CreateBitCast(V: Ops[0], DestTy: llvm::FixedVectorType::get(ElementType: FloatTy, NumElts: 4));
5814 return Builder.CreateExtractElement(Vec: Ops[0], Idx: Ops[1], Name: "vgetq_lane");
5815 case NEON::BI__builtin_neon_vgetq_lane_f64:
5816 case NEON::BI__builtin_neon_vdupd_laneq_f64:
5817 Ops[0] =
5818 Builder.CreateBitCast(V: Ops[0], DestTy: llvm::FixedVectorType::get(ElementType: DoubleTy, NumElts: 2));
5819 return Builder.CreateExtractElement(Vec: Ops[0], Idx: Ops[1], Name: "vgetq_lane");
5820 case NEON::BI__builtin_neon_vaddh_f16:
5821 return Builder.CreateFAdd(L: Ops[0], R: Ops[1], Name: "vaddh");
5822 case NEON::BI__builtin_neon_vsubh_f16:
5823 return Builder.CreateFSub(L: Ops[0], R: Ops[1], Name: "vsubh");
5824 case NEON::BI__builtin_neon_vmulh_f16:
5825 return Builder.CreateFMul(L: Ops[0], R: Ops[1], Name: "vmulh");
5826 case NEON::BI__builtin_neon_vdivh_f16:
5827 return Builder.CreateFDiv(L: Ops[0], R: Ops[1], Name: "vdivh");
5828 case NEON::BI__builtin_neon_vfmah_f16:
5829 // NEON intrinsic puts accumulator first, unlike the LLVM fma.
5830 return emitCallMaybeConstrainedFPBuiltin(
5831 CGF&: *this, IntrinsicID: Intrinsic::fma, ConstrainedIntrinsicID: Intrinsic::experimental_constrained_fma, Ty: HalfTy,
5832 Args: {Ops[1], Ops[2], Ops[0]});
5833 case NEON::BI__builtin_neon_vfmsh_f16: {
5834 Value *Neg = Builder.CreateFNeg(V: Ops[1], Name: "vsubh");
5835
5836 // NEON intrinsic puts accumulator first, unlike the LLVM fma.
5837 return emitCallMaybeConstrainedFPBuiltin(
5838 CGF&: *this, IntrinsicID: Intrinsic::fma, ConstrainedIntrinsicID: Intrinsic::experimental_constrained_fma, Ty: HalfTy,
5839 Args: {Neg, Ops[2], Ops[0]});
5840 }
5841 case NEON::BI__builtin_neon_vaddd_s64:
5842 case NEON::BI__builtin_neon_vaddd_u64:
5843 return Builder.CreateAdd(LHS: Ops[0], RHS: Ops[1], Name: "vaddd");
5844 case NEON::BI__builtin_neon_vsubd_s64:
5845 case NEON::BI__builtin_neon_vsubd_u64:
5846 return Builder.CreateSub(LHS: Ops[0], RHS: Ops[1], Name: "vsubd");
5847 case NEON::BI__builtin_neon_vqdmlalh_s16:
5848 case NEON::BI__builtin_neon_vqdmlslh_s16: {
5849 SmallVector<Value *, 2> ProductOps;
5850 ProductOps.push_back(Elt: vectorWrapScalar16(Op: Ops[1]));
5851 ProductOps.push_back(Elt: vectorWrapScalar16(Op: Ops[2]));
5852 auto *VTy = llvm::FixedVectorType::get(ElementType: Int32Ty, NumElts: 4);
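    // There is no scalar i16 saturating-doubling multiply intrinsic, so
    // widen the operands to v4i16, call the vector sqdmull, and extract
    // lane 0 of the i32 result.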
5853 Ops[1] = EmitNeonCall(F: CGM.getIntrinsic(IID: Intrinsic::aarch64_neon_sqdmull, Tys: VTy),
5854 Ops&: ProductOps, name: "vqdmlXl");
5855 Constant *CI = ConstantInt::get(Ty: SizeTy, V: 0);
5856 Ops[1] = Builder.CreateExtractElement(Vec: Ops[1], Idx: CI, Name: "lane0");
5857
5858 unsigned AccumInt = BuiltinID == NEON::BI__builtin_neon_vqdmlalh_s16
5859 ? Intrinsic::aarch64_neon_sqadd
5860 : Intrinsic::aarch64_neon_sqsub;
5861 // Drop the 2nd multiplication argument before the accumulation
5862 Ops.pop_back();
5863 return EmitNeonCall(F: CGM.getIntrinsic(IID: AccumInt, Tys: Int32Ty), Ops, name: "vqdmlXl");
5864 }
5865 case NEON::BI__builtin_neon_vqshlud_n_s64: {
5866 Ops[1] = Builder.CreateZExt(V: Ops[1], DestTy: Int64Ty);
5867 return EmitNeonCall(F: CGM.getIntrinsic(IID: Intrinsic::aarch64_neon_sqshlu, Tys: Int64Ty),
5868 Ops, name: "vqshlu_n");
5869 }
5870 case NEON::BI__builtin_neon_vqshld_n_u64:
5871 case NEON::BI__builtin_neon_vqshld_n_s64: {
5872 Int = BuiltinID == NEON::BI__builtin_neon_vqshld_n_u64
5873 ? Intrinsic::aarch64_neon_uqshl
5874 : Intrinsic::aarch64_neon_sqshl;
5875 Ops[1] = Builder.CreateZExt(V: Ops[1], DestTy: Int64Ty);
5876 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: Int64Ty), Ops, name: "vqshl_n");
5877 }
5878 case NEON::BI__builtin_neon_vrshrd_n_u64:
5879 case NEON::BI__builtin_neon_vrshrd_n_s64: {
5880 Int = BuiltinID == NEON::BI__builtin_neon_vrshrd_n_u64
5881 ? Intrinsic::aarch64_neon_urshl
5882 : Intrinsic::aarch64_neon_srshl;
5883 int SV = cast<ConstantInt>(Val: Ops[1])->getSExtValue();
5884 Ops[1] = ConstantInt::get(Ty: Int64Ty, V: -SV);
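    // [us]rshl shifts left by a signed amount, so a rounding right shift by
    // SV becomes a rounding left shift by -SV.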
5885 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: Int64Ty), Ops, name: "vrshr_n");
5886 }
5887 case NEON::BI__builtin_neon_vrsrad_n_u64:
5888 case NEON::BI__builtin_neon_vrsrad_n_s64: {
5889 Int = BuiltinID == NEON::BI__builtin_neon_vrsrad_n_u64
5890 ? Intrinsic::aarch64_neon_urshl
5891 : Intrinsic::aarch64_neon_srshl;
5892 Ops[1] = Builder.CreateBitCast(V: Ops[1], DestTy: Int64Ty);
5893 Ops[2] = Builder.CreateNeg(V: Ops[2]);
5894 Ops[1] = Builder.CreateCall(Callee: CGM.getIntrinsic(IID: Int, Tys: Int64Ty),
5895 Args: {Ops[1], Builder.CreateSExt(V: Ops[2], DestTy: Int64Ty)});
5896 return Builder.CreateAdd(LHS: Ops[0], RHS: Builder.CreateBitCast(V: Ops[1], DestTy: Int64Ty));
5897 }
5898 case NEON::BI__builtin_neon_vshld_n_s64:
5899 case NEON::BI__builtin_neon_vshld_n_u64: {
5900 llvm::ConstantInt *Amt = cast<ConstantInt>(Val: Ops[1]);
5901 return Builder.CreateShl(
5902 LHS: Ops[0], RHS: ConstantInt::get(Ty: Int64Ty, V: Amt->getZExtValue()), Name: "shld_n");
5903 }
5904 case NEON::BI__builtin_neon_vshrd_n_s64: {
5905 llvm::ConstantInt *Amt = cast<ConstantInt>(Val: Ops[1]);
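    // An ashr by 64 is poison in IR, but the NEON builtin allows it; clamp
    // to 63, which produces the same sign-filled result.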
5906 return Builder.CreateAShr(
5907 LHS: Ops[0], RHS: ConstantInt::get(Ty: Int64Ty, V: std::min(a: static_cast<uint64_t>(63),
5908 b: Amt->getZExtValue())),
5909 Name: "shrd_n");
5910 }
5911 case NEON::BI__builtin_neon_vshrd_n_u64: {
5912 llvm::ConstantInt *Amt = cast<ConstantInt>(Val: Ops[1]);
5913 uint64_t ShiftAmt = Amt->getZExtValue();
5914 // Right-shifting an unsigned value by its size yields 0.
5915 if (ShiftAmt == 64)
5916 return ConstantInt::get(Ty: Int64Ty, V: 0);
5917 return Builder.CreateLShr(LHS: Ops[0], RHS: ConstantInt::get(Ty: Int64Ty, V: ShiftAmt),
5918 Name: "shrd_n");
5919 }
5920 case NEON::BI__builtin_neon_vsrad_n_s64: {
5921 llvm::ConstantInt *Amt = cast<ConstantInt>(Val: Ops[2]);
5922 Ops[1] = Builder.CreateAShr(
5923 LHS: Ops[1], RHS: ConstantInt::get(Ty: Int64Ty, V: std::min(a: static_cast<uint64_t>(63),
5924 b: Amt->getZExtValue())),
5925 Name: "shrd_n");
5926 return Builder.CreateAdd(LHS: Ops[0], RHS: Ops[1]);
5927 }
5928 case NEON::BI__builtin_neon_vsrad_n_u64: {
5929 llvm::ConstantInt *Amt = cast<ConstantInt>(Val: Ops[2]);
5930 uint64_t ShiftAmt = Amt->getZExtValue();
5931 // Right-shifting an unsigned value by its size yields 0.
5932 // As Op + 0 = Op, return Ops[0] directly.
5933 if (ShiftAmt == 64)
5934 return Ops[0];
5935 Ops[1] = Builder.CreateLShr(LHS: Ops[1], RHS: ConstantInt::get(Ty: Int64Ty, V: ShiftAmt),
5936 Name: "shrd_n");
5937 return Builder.CreateAdd(LHS: Ops[0], RHS: Ops[1]);
5938 }
5939 case NEON::BI__builtin_neon_vqdmlalh_lane_s16:
5940 case NEON::BI__builtin_neon_vqdmlalh_laneq_s16:
5941 case NEON::BI__builtin_neon_vqdmlslh_lane_s16:
5942 case NEON::BI__builtin_neon_vqdmlslh_laneq_s16: {
5943 Ops[2] = Builder.CreateExtractElement(Vec: Ops[2], Idx: Ops[3], Name: "lane");
5944 SmallVector<Value *, 2> ProductOps;
5945 ProductOps.push_back(Elt: vectorWrapScalar16(Op: Ops[1]));
5946 ProductOps.push_back(Elt: vectorWrapScalar16(Op: Ops[2]));
5947 auto *VTy = llvm::FixedVectorType::get(ElementType: Int32Ty, NumElts: 4);
5948 Ops[1] = EmitNeonCall(F: CGM.getIntrinsic(IID: Intrinsic::aarch64_neon_sqdmull, Tys: VTy),
5949 Ops&: ProductOps, name: "vqdmlXl");
5950 Constant *CI = ConstantInt::get(Ty: SizeTy, V: 0);
5951 Ops[1] = Builder.CreateExtractElement(Vec: Ops[1], Idx: CI, Name: "lane0");
5952 // Drop lane-selection and the corresponding vector argument (these have
5953 // already been used)
5954 Ops.pop_back_n(NumItems: 2);
5955
5956 unsigned AccInt = (BuiltinID == NEON::BI__builtin_neon_vqdmlalh_lane_s16 ||
5957 BuiltinID == NEON::BI__builtin_neon_vqdmlalh_laneq_s16)
5958 ? Intrinsic::aarch64_neon_sqadd
5959 : Intrinsic::aarch64_neon_sqsub;
5960 return EmitNeonCall(F: CGM.getIntrinsic(IID: AccInt, Tys: Int32Ty), Ops, name: "vqdmlXl");
5961 }
5962 case NEON::BI__builtin_neon_vqdmlals_s32:
5963 case NEON::BI__builtin_neon_vqdmlsls_s32: {
5964 SmallVector<Value *, 2> ProductOps;
5965 ProductOps.push_back(Elt: Ops[1]);
5966 ProductOps.push_back(Elt: Ops[2]);
5967 Ops[1] =
5968 EmitNeonCall(F: CGM.getIntrinsic(IID: Intrinsic::aarch64_neon_sqdmulls_scalar),
5969 Ops&: ProductOps, name: "vqdmlXl");
5970
5971 unsigned AccumInt = BuiltinID == NEON::BI__builtin_neon_vqdmlals_s32
5972 ? Intrinsic::aarch64_neon_sqadd
5973 : Intrinsic::aarch64_neon_sqsub;
5974 // Drop the 2nd multiplication argument before the accumulation
5975 Ops.pop_back();
5976 return EmitNeonCall(F: CGM.getIntrinsic(IID: AccumInt, Tys: Int64Ty), Ops, name: "vqdmlXl");
5977 }
5978 case NEON::BI__builtin_neon_vqdmlals_lane_s32:
5979 case NEON::BI__builtin_neon_vqdmlals_laneq_s32:
5980 case NEON::BI__builtin_neon_vqdmlsls_lane_s32:
5981 case NEON::BI__builtin_neon_vqdmlsls_laneq_s32: {
5982 Ops[2] = Builder.CreateExtractElement(Vec: Ops[2], Idx: Ops[3], Name: "lane");
5983 SmallVector<Value *, 2> ProductOps;
5984 ProductOps.push_back(Elt: Ops[1]);
5985 ProductOps.push_back(Elt: Ops[2]);
5986 Ops[1] =
5987 EmitNeonCall(F: CGM.getIntrinsic(IID: Intrinsic::aarch64_neon_sqdmulls_scalar),
5988 Ops&: ProductOps, name: "vqdmlXl");
5989 // Drop lane-selection and the corresponding vector argument (these have
5990 // already been used)
5991 Ops.pop_back_n(NumItems: 2);
5992
5993 unsigned AccInt = (BuiltinID == NEON::BI__builtin_neon_vqdmlals_lane_s32 ||
5994 BuiltinID == NEON::BI__builtin_neon_vqdmlals_laneq_s32)
5995 ? Intrinsic::aarch64_neon_sqadd
5996 : Intrinsic::aarch64_neon_sqsub;
5997 return EmitNeonCall(F: CGM.getIntrinsic(IID: AccInt, Tys: Int64Ty), Ops, name: "vqdmlXl");
5998 }
5999 case NEON::BI__builtin_neon_vget_lane_bf16:
6000 case NEON::BI__builtin_neon_vduph_lane_bf16:
6001 case NEON::BI__builtin_neon_vduph_lane_f16: {
6002 return Builder.CreateExtractElement(Vec: Ops[0], Idx: Ops[1], Name: "vget_lane");
6003 }
6004 case NEON::BI__builtin_neon_vgetq_lane_bf16:
6005 case NEON::BI__builtin_neon_vduph_laneq_bf16:
6006 case NEON::BI__builtin_neon_vduph_laneq_f16: {
6007 return Builder.CreateExtractElement(Vec: Ops[0], Idx: Ops[1], Name: "vgetq_lane");
6008 }
6009 case NEON::BI__builtin_neon_vcvt_bf16_f32: {
6010 llvm::Type *V4F32 = FixedVectorType::get(ElementType: Builder.getFloatTy(), NumElts: 4);
6011 llvm::Type *V4BF16 = FixedVectorType::get(ElementType: Builder.getBFloatTy(), NumElts: 4);
6012 return Builder.CreateFPTrunc(V: Builder.CreateBitCast(V: Ops[0], DestTy: V4F32), DestTy: V4BF16);
6013 }
6014 case NEON::BI__builtin_neon_vcvtq_low_bf16_f32: {
6015 SmallVector<int, 16> ConcatMask(8);
6016 std::iota(first: ConcatMask.begin(), last: ConcatMask.end(), value: 0);
6017 llvm::Type *V4F32 = FixedVectorType::get(ElementType: Builder.getFloatTy(), NumElts: 4);
6018 llvm::Type *V4BF16 = FixedVectorType::get(ElementType: Builder.getBFloatTy(), NumElts: 4);
6019 llvm::Value *Trunc =
6020 Builder.CreateFPTrunc(V: Builder.CreateBitCast(V: Ops[0], DestTy: V4F32), DestTy: V4BF16);
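    // Concatenate the truncated low half with zeroes to fill the upper four
    // lanes of the v8bf16 result.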
6021 return Builder.CreateShuffleVector(
6022 V1: Trunc, V2: ConstantAggregateZero::get(Ty: V4BF16), Mask: ConcatMask);
6023 }
6024 case NEON::BI__builtin_neon_vcvtq_high_bf16_f32: {
6025 SmallVector<int, 16> ConcatMask(8);
6026 std::iota(first: ConcatMask.begin(), last: ConcatMask.end(), value: 0);
6027 SmallVector<int, 16> LoMask(4);
6028 std::iota(first: LoMask.begin(), last: LoMask.end(), value: 0);
6029 llvm::Type *V4F32 = FixedVectorType::get(ElementType: Builder.getFloatTy(), NumElts: 4);
6030 llvm::Type *V4BF16 = FixedVectorType::get(ElementType: Builder.getBFloatTy(), NumElts: 4);
6031 llvm::Type *V8BF16 = FixedVectorType::get(ElementType: Builder.getBFloatTy(), NumElts: 8);
6032 llvm::Value *Inactive = Builder.CreateShuffleVector(
6033 V: Builder.CreateBitCast(V: Ops[0], DestTy: V8BF16), Mask: LoMask);
6034 llvm::Value *Trunc =
6035 Builder.CreateFPTrunc(V: Builder.CreateBitCast(V: Ops[1], DestTy: V4F32), DestTy: V4BF16);
6036 return Builder.CreateShuffleVector(V1: Inactive, V2: Trunc, Mask: ConcatMask);
6037 }
6038
6039 case clang::AArch64::BI_InterlockedAdd:
6040 case clang::AArch64::BI_InterlockedAdd_acq:
6041 case clang::AArch64::BI_InterlockedAdd_rel:
6042 case clang::AArch64::BI_InterlockedAdd_nf:
6043 case clang::AArch64::BI_InterlockedAdd64:
6044 case clang::AArch64::BI_InterlockedAdd64_acq:
6045 case clang::AArch64::BI_InterlockedAdd64_rel:
6046 case clang::AArch64::BI_InterlockedAdd64_nf: {
6047 Address DestAddr = CheckAtomicAlignment(CGF&: *this, E);
6048 Value *Val = Ops[1];
6049 llvm::AtomicOrdering Ordering;
6050 switch (BuiltinID) {
6051 case clang::AArch64::BI_InterlockedAdd:
6052 case clang::AArch64::BI_InterlockedAdd64:
6053 Ordering = llvm::AtomicOrdering::SequentiallyConsistent;
6054 break;
6055 case clang::AArch64::BI_InterlockedAdd_acq:
6056 case clang::AArch64::BI_InterlockedAdd64_acq:
6057 Ordering = llvm::AtomicOrdering::Acquire;
6058 break;
6059 case clang::AArch64::BI_InterlockedAdd_rel:
6060 case clang::AArch64::BI_InterlockedAdd64_rel:
6061 Ordering = llvm::AtomicOrdering::Release;
6062 break;
6063 case clang::AArch64::BI_InterlockedAdd_nf:
6064 case clang::AArch64::BI_InterlockedAdd64_nf:
6065 Ordering = llvm::AtomicOrdering::Monotonic;
6066 break;
6067 default:
6068 llvm_unreachable("missing builtin ID in switch!");
6069 }
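    // atomicrmw yields the value held before the addition, but
    // _InterlockedAdd is defined to return the updated value, so add Val
    // back in.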
6070 AtomicRMWInst *RMWI =
6071 Builder.CreateAtomicRMW(Op: AtomicRMWInst::Add, Addr: DestAddr, Val, Ordering);
6072 return Builder.CreateAdd(LHS: RMWI, RHS: Val);
6073 }
6074 }
6075
6076 llvm::FixedVectorType *VTy = GetNeonType(CGF: this, TypeFlags: Type);
6077 llvm::Type *Ty = VTy;
6078 if (!Ty)
6079 return nullptr;
6080
6081 bool ExtractLow = false;
6082 bool ExtendLaneArg = false;
6083 switch (BuiltinID) {
6084 default: return nullptr;
6085 case NEON::BI__builtin_neon_vbsl_v:
6086 case NEON::BI__builtin_neon_vbslq_v: {
6087 llvm::Type *BitTy = llvm::VectorType::getInteger(VTy);
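    // Bitwise select: (Ops[0] & Ops[1]) | (~Ops[0] & Ops[2]), computed in
    // the integer domain whatever the element type.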
6088 Ops[0] = Builder.CreateBitCast(V: Ops[0], DestTy: BitTy, Name: "vbsl");
6089 Ops[1] = Builder.CreateBitCast(V: Ops[1], DestTy: BitTy, Name: "vbsl");
6090 Ops[2] = Builder.CreateBitCast(V: Ops[2], DestTy: BitTy, Name: "vbsl");
6091
6092 Ops[1] = Builder.CreateAnd(LHS: Ops[0], RHS: Ops[1], Name: "vbsl");
6093 Ops[2] = Builder.CreateAnd(LHS: Builder.CreateNot(V: Ops[0]), RHS: Ops[2], Name: "vbsl");
6094 Ops[0] = Builder.CreateOr(LHS: Ops[1], RHS: Ops[2], Name: "vbsl");
6095 return Builder.CreateBitCast(V: Ops[0], DestTy: Ty);
6096 }
6097 case NEON::BI__builtin_neon_vfma_lane_v:
6098 case NEON::BI__builtin_neon_vfmaq_lane_v: { // Only used for FP types
6099 // The ARM builtins (and instructions) have the addend as the first
6100 // operand, but the 'fma' intrinsics have it last. Swap it around here.
6101 Value *Addend = Ops[0];
6102 Value *Multiplicand = Ops[1];
6103 Value *LaneSource = Ops[2];
6104 Ops[0] = Multiplicand;
6105 Ops[1] = LaneSource;
6106 Ops[2] = Addend;
6107
6108 // Now adjust things to handle the lane access.
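    // For the "q" variant the lane source is a 64-bit vector with half the
    // elements of the result type.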
6109 auto *SourceTy = BuiltinID == NEON::BI__builtin_neon_vfmaq_lane_v
6110 ? llvm::FixedVectorType::get(ElementType: VTy->getElementType(),
6111 NumElts: VTy->getNumElements() / 2)
6112 : VTy;
6113 llvm::Constant *cst = cast<Constant>(Val: Ops[3]);
6114 Value *SV = llvm::ConstantVector::getSplat(EC: VTy->getElementCount(), Elt: cst);
6115 Ops[1] = Builder.CreateBitCast(V: Ops[1], DestTy: SourceTy);
6116 Ops[1] = Builder.CreateShuffleVector(V1: Ops[1], V2: Ops[1], Mask: SV, Name: "lane");
6117
6118 Ops.pop_back();
6119 Int = Builder.getIsFPConstrained() ? Intrinsic::experimental_constrained_fma
6120 : Intrinsic::fma;
6121 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: Ty), Ops, name: "fmla");
6122 }
6123 case NEON::BI__builtin_neon_vfma_laneq_v: {
6124 auto *VTy = cast<llvm::FixedVectorType>(Val: Ty);
6125 // v1f64 fma should be mapped to Neon scalar f64 fma
6126 if (VTy && VTy->getElementType() == DoubleTy) {
6127 Ops[0] = Builder.CreateBitCast(V: Ops[0], DestTy: DoubleTy);
6128 Ops[1] = Builder.CreateBitCast(V: Ops[1], DestTy: DoubleTy);
6129 llvm::FixedVectorType *VTy =
6130 GetNeonType(CGF: this, TypeFlags: NeonTypeFlags(NeonTypeFlags::Float64, false, true));
6131 Ops[2] = Builder.CreateBitCast(V: Ops[2], DestTy: VTy);
6132 Ops[2] = Builder.CreateExtractElement(Vec: Ops[2], Idx: Ops[3], Name: "extract");
6133 Value *Result;
6134 Result = emitCallMaybeConstrainedFPBuiltin(
6135 CGF&: *this, IntrinsicID: Intrinsic::fma, ConstrainedIntrinsicID: Intrinsic::experimental_constrained_fma,
6136 Ty: DoubleTy, Args: {Ops[1], Ops[2], Ops[0]});
6137 return Builder.CreateBitCast(V: Result, DestTy: Ty);
6138 }
6139 Ops[0] = Builder.CreateBitCast(V: Ops[0], DestTy: Ty);
6140 Ops[1] = Builder.CreateBitCast(V: Ops[1], DestTy: Ty);
6141
6142 auto *STy = llvm::FixedVectorType::get(ElementType: VTy->getElementType(),
6143 NumElts: VTy->getNumElements() * 2);
6144 Ops[2] = Builder.CreateBitCast(V: Ops[2], DestTy: STy);
6145 Value *SV = llvm::ConstantVector::getSplat(EC: VTy->getElementCount(),
6146 Elt: cast<ConstantInt>(Val: Ops[3]));
6147 Ops[2] = Builder.CreateShuffleVector(V1: Ops[2], V2: Ops[2], Mask: SV, Name: "lane");
6148
6149 return emitCallMaybeConstrainedFPBuiltin(
6150 CGF&: *this, IntrinsicID: Intrinsic::fma, ConstrainedIntrinsicID: Intrinsic::experimental_constrained_fma, Ty,
6151 Args: {Ops[2], Ops[1], Ops[0]});
6152 }
6153 case NEON::BI__builtin_neon_vfmaq_laneq_v: {
6154 Ops[0] = Builder.CreateBitCast(V: Ops[0], DestTy: Ty);
6155 Ops[1] = Builder.CreateBitCast(V: Ops[1], DestTy: Ty);
6156
6157 Ops[2] = Builder.CreateBitCast(V: Ops[2], DestTy: Ty);
6158 Ops[2] = EmitNeonSplat(V: Ops[2], C: cast<ConstantInt>(Val: Ops[3]));
6159 return emitCallMaybeConstrainedFPBuiltin(
6160 CGF&: *this, IntrinsicID: Intrinsic::fma, ConstrainedIntrinsicID: Intrinsic::experimental_constrained_fma, Ty,
6161 Args: {Ops[2], Ops[1], Ops[0]});
6162 }
6163 case NEON::BI__builtin_neon_vfmah_lane_f16:
6164 case NEON::BI__builtin_neon_vfmas_lane_f32:
6165 case NEON::BI__builtin_neon_vfmah_laneq_f16:
6166 case NEON::BI__builtin_neon_vfmas_laneq_f32:
6167 case NEON::BI__builtin_neon_vfmad_lane_f64:
6168 case NEON::BI__builtin_neon_vfmad_laneq_f64: {
6169 llvm::Type *Ty = ConvertType(T: E->getCallReturnType(Ctx: getContext()));
6170 Ops[2] = Builder.CreateExtractElement(Vec: Ops[2], Idx: Ops[3], Name: "extract");
6171 return emitCallMaybeConstrainedFPBuiltin(
6172 CGF&: *this, IntrinsicID: Intrinsic::fma, ConstrainedIntrinsicID: Intrinsic::experimental_constrained_fma, Ty,
6173 Args: {Ops[1], Ops[2], Ops[0]});
6174 }
6175 case NEON::BI__builtin_neon_vmull_v:
6176 // FIXME: improve sharing scheme to cope with 3 alternative LLVM intrinsics.
6177 Int = usgn ? Intrinsic::aarch64_neon_umull : Intrinsic::aarch64_neon_smull;
6178 if (Type.isPoly()) Int = Intrinsic::aarch64_neon_pmull;
6179 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: Ty), Ops, name: "vmull");
6180 case NEON::BI__builtin_neon_vmax_v:
6181 case NEON::BI__builtin_neon_vmaxq_v:
6182 // FIXME: improve sharing scheme to cope with 3 alternative LLVM intrinsics.
6183 Int = usgn ? Intrinsic::aarch64_neon_umax : Intrinsic::aarch64_neon_smax;
6184 if (Ty->isFPOrFPVectorTy()) Int = Intrinsic::aarch64_neon_fmax;
6185 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: Ty), Ops, name: "vmax");
6186 case NEON::BI__builtin_neon_vmaxh_f16: {
6187 Int = Intrinsic::aarch64_neon_fmax;
6188 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: HalfTy), Ops, name: "vmax");
6189 }
6190 case NEON::BI__builtin_neon_vmin_v:
6191 case NEON::BI__builtin_neon_vminq_v:
6192 // FIXME: improve sharing scheme to cope with 3 alternative LLVM intrinsics.
6193 Int = usgn ? Intrinsic::aarch64_neon_umin : Intrinsic::aarch64_neon_smin;
6194 if (Ty->isFPOrFPVectorTy()) Int = Intrinsic::aarch64_neon_fmin;
6195 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: Ty), Ops, name: "vmin");
6196 case NEON::BI__builtin_neon_vminh_f16: {
6197 Int = Intrinsic::aarch64_neon_fmin;
6198 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: HalfTy), Ops, name: "vmin");
6199 }
6200 case NEON::BI__builtin_neon_vabd_v:
6201 case NEON::BI__builtin_neon_vabdq_v:
6202 // FIXME: improve sharing scheme to cope with 3 alternative LLVM intrinsics.
6203 Int = usgn ? Intrinsic::aarch64_neon_uabd : Intrinsic::aarch64_neon_sabd;
6204 if (Ty->isFPOrFPVectorTy()) Int = Intrinsic::aarch64_neon_fabd;
6205 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: Ty), Ops, name: "vabd");
6206 case NEON::BI__builtin_neon_vpadal_v:
6207 case NEON::BI__builtin_neon_vpadalq_v: {
6208 unsigned ArgElts = VTy->getNumElements();
6209 llvm::IntegerType *EltTy = cast<IntegerType>(Val: VTy->getElementType());
6210 unsigned BitWidth = EltTy->getBitWidth();
6211 auto *ArgTy = llvm::FixedVectorType::get(
6212 ElementType: llvm::IntegerType::get(C&: getLLVMContext(), NumBits: BitWidth / 2), NumElts: 2 * ArgElts);
6213 llvm::Type* Tys[2] = { VTy, ArgTy };
6214 Int = usgn ? Intrinsic::aarch64_neon_uaddlp : Intrinsic::aarch64_neon_saddlp;
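    // [us]addlp performs the pairwise widening add; the accumulation into
    // Ops[0] is a plain vector add afterwards.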
6215 SmallVector<llvm::Value*, 1> TmpOps;
6216 TmpOps.push_back(Elt: Ops[1]);
6217 Function *F = CGM.getIntrinsic(IID: Int, Tys);
6218 llvm::Value *tmp = EmitNeonCall(F, Ops&: TmpOps, name: "vpadal");
6219 llvm::Value *addend = Builder.CreateBitCast(V: Ops[0], DestTy: tmp->getType());
6220 return Builder.CreateAdd(LHS: tmp, RHS: addend);
6221 }
6222 case NEON::BI__builtin_neon_vpmin_v:
6223 case NEON::BI__builtin_neon_vpminq_v:
6224 // FIXME: improve sharing scheme to cope with 3 alternative LLVM intrinsics.
6225 Int = usgn ? Intrinsic::aarch64_neon_uminp : Intrinsic::aarch64_neon_sminp;
6226 if (Ty->isFPOrFPVectorTy()) Int = Intrinsic::aarch64_neon_fminp;
6227 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: Ty), Ops, name: "vpmin");
6228 case NEON::BI__builtin_neon_vpmax_v:
6229 case NEON::BI__builtin_neon_vpmaxq_v:
6230 // FIXME: improve sharing scheme to cope with 3 alternative LLVM intrinsics.
6231 Int = usgn ? Intrinsic::aarch64_neon_umaxp : Intrinsic::aarch64_neon_smaxp;
6232 if (Ty->isFPOrFPVectorTy()) Int = Intrinsic::aarch64_neon_fmaxp;
6233 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: Ty), Ops, name: "vpmax");
6234 case NEON::BI__builtin_neon_vminnm_v:
6235 case NEON::BI__builtin_neon_vminnmq_v:
6236 Int = Intrinsic::aarch64_neon_fminnm;
6237 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: Ty), Ops, name: "vminnm");
6238 case NEON::BI__builtin_neon_vminnmh_f16:
6239 Int = Intrinsic::aarch64_neon_fminnm;
6240 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: HalfTy), Ops, name: "vminnm");
6241 case NEON::BI__builtin_neon_vmaxnm_v:
6242 case NEON::BI__builtin_neon_vmaxnmq_v:
6243 Int = Intrinsic::aarch64_neon_fmaxnm;
6244 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: Ty), Ops, name: "vmaxnm");
6245 case NEON::BI__builtin_neon_vmaxnmh_f16:
6246 Int = Intrinsic::aarch64_neon_fmaxnm;
6247 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: HalfTy), Ops, name: "vmaxnm");
6248 case NEON::BI__builtin_neon_vrecpss_f32: {
6249 return EmitNeonCall(F: CGM.getIntrinsic(IID: Intrinsic::aarch64_neon_frecps, Tys: FloatTy),
6250 Ops, name: "vrecps");
6251 }
6252 case NEON::BI__builtin_neon_vrecpsd_f64:
6253 return EmitNeonCall(F: CGM.getIntrinsic(IID: Intrinsic::aarch64_neon_frecps, Tys: DoubleTy),
6254 Ops, name: "vrecps");
6255 case NEON::BI__builtin_neon_vrecpsh_f16:
6256 return EmitNeonCall(F: CGM.getIntrinsic(IID: Intrinsic::aarch64_neon_frecps, Tys: HalfTy),
6257 Ops, name: "vrecps");
6258 case NEON::BI__builtin_neon_vqshrun_n_v:
6259 Int = Intrinsic::aarch64_neon_sqshrun;
6260 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: Ty), Ops, name: "vqshrun_n");
6261 case NEON::BI__builtin_neon_vqrshrun_n_v:
6262 Int = Intrinsic::aarch64_neon_sqrshrun;
6263 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: Ty), Ops, name: "vqrshrun_n");
6264 case NEON::BI__builtin_neon_vqshrn_n_v:
6265 Int = usgn ? Intrinsic::aarch64_neon_uqshrn : Intrinsic::aarch64_neon_sqshrn;
6266 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: Ty), Ops, name: "vqshrn_n");
6267 case NEON::BI__builtin_neon_vrshrn_n_v:
6268 Int = Intrinsic::aarch64_neon_rshrn;
6269 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: Ty), Ops, name: "vrshrn_n");
6270 case NEON::BI__builtin_neon_vqrshrn_n_v:
6271 Int = usgn ? Intrinsic::aarch64_neon_uqrshrn : Intrinsic::aarch64_neon_sqrshrn;
6272 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: Ty), Ops, name: "vqrshrn_n");
6273 case NEON::BI__builtin_neon_vrndah_f16: {
6274 Int = Builder.getIsFPConstrained()
6275 ? Intrinsic::experimental_constrained_round
6276 : Intrinsic::round;
6277 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: HalfTy), Ops, name: "vrnda");
6278 }
6279 case NEON::BI__builtin_neon_vrnda_v:
6280 case NEON::BI__builtin_neon_vrndaq_v: {
6281 Int = Builder.getIsFPConstrained()
6282 ? Intrinsic::experimental_constrained_round
6283 : Intrinsic::round;
6284 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: Ty), Ops, name: "vrnda");
6285 }
6286 case NEON::BI__builtin_neon_vrndih_f16: {
6287 Int = Builder.getIsFPConstrained()
6288 ? Intrinsic::experimental_constrained_nearbyint
6289 : Intrinsic::nearbyint;
6290 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: HalfTy), Ops, name: "vrndi");
6291 }
6292 case NEON::BI__builtin_neon_vrndmh_f16: {
6293 Int = Builder.getIsFPConstrained()
6294 ? Intrinsic::experimental_constrained_floor
6295 : Intrinsic::floor;
6296 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: HalfTy), Ops, name: "vrndm");
6297 }
6298 case NEON::BI__builtin_neon_vrndm_v:
6299 case NEON::BI__builtin_neon_vrndmq_v: {
6300 Int = Builder.getIsFPConstrained()
6301 ? Intrinsic::experimental_constrained_floor
6302 : Intrinsic::floor;
6303 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: Ty), Ops, name: "vrndm");
6304 }
6305 case NEON::BI__builtin_neon_vrndnh_f16: {
6306 Int = Builder.getIsFPConstrained()
6307 ? Intrinsic::experimental_constrained_roundeven
6308 : Intrinsic::roundeven;
6309 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: HalfTy), Ops, name: "vrndn");
6310 }
6311 case NEON::BI__builtin_neon_vrndn_v:
6312 case NEON::BI__builtin_neon_vrndnq_v: {
6313 Int = Builder.getIsFPConstrained()
6314 ? Intrinsic::experimental_constrained_roundeven
6315 : Intrinsic::roundeven;
6316 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: Ty), Ops, name: "vrndn");
6317 }
6318 case NEON::BI__builtin_neon_vrndns_f32: {
6319 Int = Builder.getIsFPConstrained()
6320 ? Intrinsic::experimental_constrained_roundeven
6321 : Intrinsic::roundeven;
6322 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: FloatTy), Ops, name: "vrndn");
6323 }
6324 case NEON::BI__builtin_neon_vrndph_f16: {
6325 Int = Builder.getIsFPConstrained()
6326 ? Intrinsic::experimental_constrained_ceil
6327 : Intrinsic::ceil;
6328 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: HalfTy), Ops, name: "vrndp");
6329 }
6330 case NEON::BI__builtin_neon_vrndp_v:
6331 case NEON::BI__builtin_neon_vrndpq_v: {
6332 Int = Builder.getIsFPConstrained()
6333 ? Intrinsic::experimental_constrained_ceil
6334 : Intrinsic::ceil;
6335 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: Ty), Ops, name: "vrndp");
6336 }
6337 case NEON::BI__builtin_neon_vrndxh_f16: {
6338 Int = Builder.getIsFPConstrained()
6339 ? Intrinsic::experimental_constrained_rint
6340 : Intrinsic::rint;
6341 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: HalfTy), Ops, name: "vrndx");
6342 }
6343 case NEON::BI__builtin_neon_vrndx_v:
6344 case NEON::BI__builtin_neon_vrndxq_v: {
6345 Int = Builder.getIsFPConstrained()
6346 ? Intrinsic::experimental_constrained_rint
6347 : Intrinsic::rint;
6348 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: Ty), Ops, name: "vrndx");
6349 }
6350 case NEON::BI__builtin_neon_vrndh_f16: {
6351 Int = Builder.getIsFPConstrained()
6352 ? Intrinsic::experimental_constrained_trunc
6353 : Intrinsic::trunc;
6354 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: HalfTy), Ops, name: "vrndz");
6355 }
6356 case NEON::BI__builtin_neon_vrnd32x_f32:
6357 case NEON::BI__builtin_neon_vrnd32xq_f32:
6358 case NEON::BI__builtin_neon_vrnd32x_f64:
6359 case NEON::BI__builtin_neon_vrnd32xq_f64: {
6360 Int = Intrinsic::aarch64_neon_frint32x;
6361 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: Ty), Ops, name: "vrnd32x");
6362 }
6363 case NEON::BI__builtin_neon_vrnd32z_f32:
6364 case NEON::BI__builtin_neon_vrnd32zq_f32:
6365 case NEON::BI__builtin_neon_vrnd32z_f64:
6366 case NEON::BI__builtin_neon_vrnd32zq_f64: {
6367 Int = Intrinsic::aarch64_neon_frint32z;
6368 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: Ty), Ops, name: "vrnd32z");
6369 }
6370 case NEON::BI__builtin_neon_vrnd64x_f32:
6371 case NEON::BI__builtin_neon_vrnd64xq_f32:
6372 case NEON::BI__builtin_neon_vrnd64x_f64:
6373 case NEON::BI__builtin_neon_vrnd64xq_f64: {
6374 Int = Intrinsic::aarch64_neon_frint64x;
6375 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: Ty), Ops, name: "vrnd64x");
6376 }
6377 case NEON::BI__builtin_neon_vrnd64z_f32:
6378 case NEON::BI__builtin_neon_vrnd64zq_f32:
6379 case NEON::BI__builtin_neon_vrnd64z_f64:
6380 case NEON::BI__builtin_neon_vrnd64zq_f64: {
6381 Int = Intrinsic::aarch64_neon_frint64z;
6382 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: Ty), Ops, name: "vrnd64z");
6383 }
6384 case NEON::BI__builtin_neon_vrnd_v:
6385 case NEON::BI__builtin_neon_vrndq_v: {
6386 Int = Builder.getIsFPConstrained()
6387 ? Intrinsic::experimental_constrained_trunc
6388 : Intrinsic::trunc;
6389 return EmitNeonCall(F: CGM.getIntrinsic(IID: Int, Tys: Ty), Ops, name: "vrndz");
6390 }
6391 case NEON::BI__builtin_neon_vcvt_f64_v:
6392 case NEON::BI__builtin_neon_vcvtq_f64_v:
6393 Ops[0] = Builder.CreateBitCast(V: Ops[0], DestTy: Ty);
6394 Ty = GetNeonType(CGF: this, TypeFlags: NeonTypeFlags(NeonTypeFlags::Float64, false, quad));
6395 return usgn ? Builder.CreateUIToFP(V: Ops[0], DestTy: Ty, Name: "vcvt")
6396 : Builder.CreateSIToFP(V: Ops[0], DestTy: Ty, Name: "vcvt");
6397 case NEON::BI__builtin_neon_vcvt_f64_f32: {
6398 assert(Type.getEltType() == NeonTypeFlags::Float64 && quad &&
6399 "unexpected vcvt_f64_f32 builtin");
6400 NeonTypeFlags SrcFlag = NeonTypeFlags(NeonTypeFlags::Float32, false, false);
6401 Ops[0] = Builder.CreateBitCast(V: Ops[0], DestTy: GetNeonType(CGF: this, TypeFlags: SrcFlag));
6402
6403 return Builder.CreateFPExt(V: Ops[0], DestTy: Ty, Name: "vcvt");
6404 }
6405 case NEON::BI__builtin_neon_vcvt_f32_f64: {
6406 assert(Type.getEltType() == NeonTypeFlags::Float32 &&
6407 "unexpected vcvt_f32_f64 builtin");
6408 NeonTypeFlags SrcFlag = NeonTypeFlags(NeonTypeFlags::Float64, false, true);
6409 Ops[0] = Builder.CreateBitCast(V: Ops[0], DestTy: GetNeonType(CGF: this, TypeFlags: SrcFlag));
6410
6411 return Builder.CreateFPTrunc(V: Ops[0], DestTy: Ty, Name: "vcvt");
6412 }
6413 case NEON::BI__builtin_neon_vcvt_s32_v:
6414 case NEON::BI__builtin_neon_vcvt_u32_v:
6415 case NEON::BI__builtin_neon_vcvt_s64_v:
6416 case NEON::BI__builtin_neon_vcvt_u64_v:
6417 case NEON::BI__builtin_neon_vcvt_s16_f16:
6418 case NEON::BI__builtin_neon_vcvt_u16_f16:
6419 case NEON::BI__builtin_neon_vcvtq_s32_v:
6420 case NEON::BI__builtin_neon_vcvtq_u32_v:
6421 case NEON::BI__builtin_neon_vcvtq_s64_v:
6422 case NEON::BI__builtin_neon_vcvtq_u64_v:
6423 case NEON::BI__builtin_neon_vcvtq_s16_f16:
6424 case NEON::BI__builtin_neon_vcvtq_u16_f16: {
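    // Round-toward-zero conversions; the intrinsic is overloaded on the
    // {integer result, FP source} type pair, e.g. llvm.aarch64.neon.fcvtzs
    // on v4i32/v4f32.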
    Int =
        usgn ? Intrinsic::aarch64_neon_fcvtzu : Intrinsic::aarch64_neon_fcvtzs;
    llvm::Type *Tys[2] = {Ty, GetFloatNeonType(this, Type)};
    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvtz");
  }
  case NEON::BI__builtin_neon_vcvta_s16_f16:
  case NEON::BI__builtin_neon_vcvta_u16_f16:
  case NEON::BI__builtin_neon_vcvta_s32_v:
  case NEON::BI__builtin_neon_vcvtaq_s16_f16:
  case NEON::BI__builtin_neon_vcvtaq_s32_v:
  case NEON::BI__builtin_neon_vcvta_u32_v:
  case NEON::BI__builtin_neon_vcvtaq_u16_f16:
  case NEON::BI__builtin_neon_vcvtaq_u32_v:
  case NEON::BI__builtin_neon_vcvta_s64_v:
  case NEON::BI__builtin_neon_vcvtaq_s64_v:
  case NEON::BI__builtin_neon_vcvta_u64_v:
  case NEON::BI__builtin_neon_vcvtaq_u64_v: {
    Int = usgn ? Intrinsic::aarch64_neon_fcvtau : Intrinsic::aarch64_neon_fcvtas;
    llvm::Type *Tys[2] = { Ty, GetFloatNeonType(this, Type) };
    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvta");
  }
  case NEON::BI__builtin_neon_vcvtm_s16_f16:
  case NEON::BI__builtin_neon_vcvtm_s32_v:
  case NEON::BI__builtin_neon_vcvtmq_s16_f16:
  case NEON::BI__builtin_neon_vcvtmq_s32_v:
  case NEON::BI__builtin_neon_vcvtm_u16_f16:
  case NEON::BI__builtin_neon_vcvtm_u32_v:
  case NEON::BI__builtin_neon_vcvtmq_u16_f16:
  case NEON::BI__builtin_neon_vcvtmq_u32_v:
  case NEON::BI__builtin_neon_vcvtm_s64_v:
  case NEON::BI__builtin_neon_vcvtmq_s64_v:
  case NEON::BI__builtin_neon_vcvtm_u64_v:
  case NEON::BI__builtin_neon_vcvtmq_u64_v: {
    Int = usgn ? Intrinsic::aarch64_neon_fcvtmu : Intrinsic::aarch64_neon_fcvtms;
    llvm::Type *Tys[2] = { Ty, GetFloatNeonType(this, Type) };
    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvtm");
  }
  case NEON::BI__builtin_neon_vcvtn_s16_f16:
  case NEON::BI__builtin_neon_vcvtn_s32_v:
  case NEON::BI__builtin_neon_vcvtnq_s16_f16:
  case NEON::BI__builtin_neon_vcvtnq_s32_v:
  case NEON::BI__builtin_neon_vcvtn_u16_f16:
  case NEON::BI__builtin_neon_vcvtn_u32_v:
  case NEON::BI__builtin_neon_vcvtnq_u16_f16:
  case NEON::BI__builtin_neon_vcvtnq_u32_v:
  case NEON::BI__builtin_neon_vcvtn_s64_v:
  case NEON::BI__builtin_neon_vcvtnq_s64_v:
  case NEON::BI__builtin_neon_vcvtn_u64_v:
  case NEON::BI__builtin_neon_vcvtnq_u64_v: {
    Int = usgn ? Intrinsic::aarch64_neon_fcvtnu : Intrinsic::aarch64_neon_fcvtns;
    llvm::Type *Tys[2] = { Ty, GetFloatNeonType(this, Type) };
    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvtn");
  }
  case NEON::BI__builtin_neon_vcvtp_s16_f16:
  case NEON::BI__builtin_neon_vcvtp_s32_v:
  case NEON::BI__builtin_neon_vcvtpq_s16_f16:
  case NEON::BI__builtin_neon_vcvtpq_s32_v:
  case NEON::BI__builtin_neon_vcvtp_u16_f16:
  case NEON::BI__builtin_neon_vcvtp_u32_v:
  case NEON::BI__builtin_neon_vcvtpq_u16_f16:
  case NEON::BI__builtin_neon_vcvtpq_u32_v:
  case NEON::BI__builtin_neon_vcvtp_s64_v:
  case NEON::BI__builtin_neon_vcvtpq_s64_v:
  case NEON::BI__builtin_neon_vcvtp_u64_v:
  case NEON::BI__builtin_neon_vcvtpq_u64_v: {
    Int = usgn ? Intrinsic::aarch64_neon_fcvtpu : Intrinsic::aarch64_neon_fcvtps;
    llvm::Type *Tys[2] = { Ty, GetFloatNeonType(this, Type) };
    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvtp");
  }
  case NEON::BI__builtin_neon_vmulx_v:
  case NEON::BI__builtin_neon_vmulxq_v: {
    Int = Intrinsic::aarch64_neon_fmulx;
    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmulx");
  }
  case NEON::BI__builtin_neon_vmulxh_lane_f16:
  case NEON::BI__builtin_neon_vmulxh_laneq_f16: {
    // vmulx_lane should be mapped to Neon scalar mulx after
    // extracting the scalar element.
    Ops[1] = Builder.CreateExtractElement(Ops[1], Ops[2], "extract");
    Ops.pop_back();
    Int = Intrinsic::aarch64_neon_fmulx;
    return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vmulx");
  }
  case NEON::BI__builtin_neon_vmul_lane_v:
  case NEON::BI__builtin_neon_vmul_laneq_v: {
    // v1f64 vmul_lane should be mapped to Neon scalar mul lane.
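    // E.g. (hypothetical operands) vmul_laneq_f64(a, b, 1) extracts lane 1
    // of b and emits a plain scalar fmul against a.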
    bool Quad = false;
    if (BuiltinID == NEON::BI__builtin_neon_vmul_laneq_v)
      Quad = true;
    Ops[0] = Builder.CreateBitCast(Ops[0], DoubleTy);
    llvm::FixedVectorType *VTy =
        GetNeonType(this, NeonTypeFlags(NeonTypeFlags::Float64, false, Quad));
    Ops[1] = Builder.CreateBitCast(Ops[1], VTy);
    Ops[1] = Builder.CreateExtractElement(Ops[1], Ops[2], "extract");
    Value *Result = Builder.CreateFMul(Ops[0], Ops[1]);
    return Builder.CreateBitCast(Result, Ty);
  }
  case NEON::BI__builtin_neon_vpmaxnm_v:
  case NEON::BI__builtin_neon_vpmaxnmq_v: {
    Int = Intrinsic::aarch64_neon_fmaxnmp;
    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vpmaxnm");
  }
  case NEON::BI__builtin_neon_vpminnm_v:
  case NEON::BI__builtin_neon_vpminnmq_v: {
    Int = Intrinsic::aarch64_neon_fminnmp;
    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vpminnm");
  }
  case NEON::BI__builtin_neon_vsqrth_f16: {
    Int = Builder.getIsFPConstrained()
              ? Intrinsic::experimental_constrained_sqrt
              : Intrinsic::sqrt;
    return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vsqrt");
  }
  case NEON::BI__builtin_neon_vsqrt_v:
  case NEON::BI__builtin_neon_vsqrtq_v: {
    Int = Builder.getIsFPConstrained()
              ? Intrinsic::experimental_constrained_sqrt
              : Intrinsic::sqrt;
    Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vsqrt");
  }
  case NEON::BI__builtin_neon_vrbit_v:
  case NEON::BI__builtin_neon_vrbitq_v: {
    Int = Intrinsic::bitreverse;
    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrbit");
  }
  case NEON::BI__builtin_neon_vmaxv_f16: {
    Int = Intrinsic::aarch64_neon_fmaxv;
    Ty = HalfTy;
    VTy = llvm::FixedVectorType::get(HalfTy, 4);
    llvm::Type *Tys[2] = {Ty, VTy};
    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
    return Builder.CreateTrunc(Ops[0], HalfTy);
  }
  case NEON::BI__builtin_neon_vmaxvq_f16: {
    Int = Intrinsic::aarch64_neon_fmaxv;
    Ty = HalfTy;
    VTy = llvm::FixedVectorType::get(HalfTy, 8);
    llvm::Type *Tys[2] = {Ty, VTy};
    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
    return Builder.CreateTrunc(Ops[0], HalfTy);
  }
  case NEON::BI__builtin_neon_vminv_f16: {
    Int = Intrinsic::aarch64_neon_fminv;
    Ty = HalfTy;
    VTy = llvm::FixedVectorType::get(HalfTy, 4);
    llvm::Type *Tys[2] = {Ty, VTy};
    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
    return Builder.CreateTrunc(Ops[0], HalfTy);
  }
  case NEON::BI__builtin_neon_vminvq_f16: {
    Int = Intrinsic::aarch64_neon_fminv;
    Ty = HalfTy;
    VTy = llvm::FixedVectorType::get(HalfTy, 8);
    llvm::Type *Tys[2] = {Ty, VTy};
    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
    return Builder.CreateTrunc(Ops[0], HalfTy);
  }
  case NEON::BI__builtin_neon_vmaxnmv_f16: {
    Int = Intrinsic::aarch64_neon_fmaxnmv;
    Ty = HalfTy;
    VTy = llvm::FixedVectorType::get(HalfTy, 4);
    llvm::Type *Tys[2] = {Ty, VTy};
    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxnmv");
    return Builder.CreateTrunc(Ops[0], HalfTy);
  }
  case NEON::BI__builtin_neon_vmaxnmvq_f16: {
    Int = Intrinsic::aarch64_neon_fmaxnmv;
    Ty = HalfTy;
    VTy = llvm::FixedVectorType::get(HalfTy, 8);
    llvm::Type *Tys[2] = {Ty, VTy};
    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxnmv");
    return Builder.CreateTrunc(Ops[0], HalfTy);
  }
  case NEON::BI__builtin_neon_vminnmv_f16: {
    Int = Intrinsic::aarch64_neon_fminnmv;
    Ty = HalfTy;
    VTy = llvm::FixedVectorType::get(HalfTy, 4);
    llvm::Type *Tys[2] = {Ty, VTy};
    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminnmv");
    return Builder.CreateTrunc(Ops[0], HalfTy);
  }
  case NEON::BI__builtin_neon_vminnmvq_f16: {
    Int = Intrinsic::aarch64_neon_fminnmv;
    Ty = HalfTy;
    VTy = llvm::FixedVectorType::get(HalfTy, 8);
    llvm::Type *Tys[2] = {Ty, VTy};
    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminnmv");
    return Builder.CreateTrunc(Ops[0], HalfTy);
  }
  case NEON::BI__builtin_neon_vmul_n_f64: {
    Ops[0] = Builder.CreateBitCast(Ops[0], DoubleTy);
    Value *RHS = Builder.CreateBitCast(Ops[1], DoubleTy);
    return Builder.CreateFMul(Ops[0], RHS);
  }
  case NEON::BI__builtin_neon_vaddlv_u8: {
    Int = Intrinsic::aarch64_neon_uaddlv;
    Ty = Int32Ty;
    VTy = llvm::FixedVectorType::get(Int8Ty, 8);
    llvm::Type *Tys[2] = {Ty, VTy};
    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
    return Builder.CreateTrunc(Ops[0], Int16Ty);
  }
  case NEON::BI__builtin_neon_vaddlv_u16: {
    Int = Intrinsic::aarch64_neon_uaddlv;
    Ty = Int32Ty;
    VTy = llvm::FixedVectorType::get(Int16Ty, 4);
    llvm::Type *Tys[2] = {Ty, VTy};
    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
  }
  case NEON::BI__builtin_neon_vaddlvq_u8: {
    Int = Intrinsic::aarch64_neon_uaddlv;
    Ty = Int32Ty;
    VTy = llvm::FixedVectorType::get(Int8Ty, 16);
    llvm::Type *Tys[2] = {Ty, VTy};
    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
    return Builder.CreateTrunc(Ops[0], Int16Ty);
  }
  case NEON::BI__builtin_neon_vaddlvq_u16: {
    Int = Intrinsic::aarch64_neon_uaddlv;
    Ty = Int32Ty;
    VTy = llvm::FixedVectorType::get(Int16Ty, 8);
    llvm::Type *Tys[2] = {Ty, VTy};
    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
  }
  case NEON::BI__builtin_neon_vaddlv_s8: {
    Int = Intrinsic::aarch64_neon_saddlv;
    Ty = Int32Ty;
    VTy = llvm::FixedVectorType::get(Int8Ty, 8);
    llvm::Type *Tys[2] = {Ty, VTy};
    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
    return Builder.CreateTrunc(Ops[0], Int16Ty);
  }
  case NEON::BI__builtin_neon_vaddlv_s16: {
    Int = Intrinsic::aarch64_neon_saddlv;
    Ty = Int32Ty;
    VTy = llvm::FixedVectorType::get(Int16Ty, 4);
    llvm::Type *Tys[2] = {Ty, VTy};
    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
  }
  case NEON::BI__builtin_neon_vaddlvq_s8: {
    Int = Intrinsic::aarch64_neon_saddlv;
    Ty = Int32Ty;
    VTy = llvm::FixedVectorType::get(Int8Ty, 16);
    llvm::Type *Tys[2] = {Ty, VTy};
    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
    return Builder.CreateTrunc(Ops[0], Int16Ty);
  }
  case NEON::BI__builtin_neon_vaddlvq_s16: {
    Int = Intrinsic::aarch64_neon_saddlv;
    Ty = Int32Ty;
    VTy = llvm::FixedVectorType::get(Int16Ty, 8);
    llvm::Type *Tys[2] = {Ty, VTy};
    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
  }
  case NEON::BI__builtin_neon_vsri_n_v:
  case NEON::BI__builtin_neon_vsriq_n_v: {
    Int = Intrinsic::aarch64_neon_vsri;
    llvm::Function *Intrin = CGM.getIntrinsic(Int, Ty);
    return EmitNeonCall(Intrin, Ops, "vsri_n");
  }
  case NEON::BI__builtin_neon_vsli_n_v:
  case NEON::BI__builtin_neon_vsliq_n_v: {
    Int = Intrinsic::aarch64_neon_vsli;
    llvm::Function *Intrin = CGM.getIntrinsic(Int, Ty);
    return EmitNeonCall(Intrin, Ops, "vsli_n");
  }
  case NEON::BI__builtin_neon_vsra_n_v:
  case NEON::BI__builtin_neon_vsraq_n_v:
    Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
    Ops[1] = EmitNeonRShiftImm(Ops[1], Ops[2], Ty, usgn, "vsra_n");
    return Builder.CreateAdd(Ops[0], Ops[1]);
  case NEON::BI__builtin_neon_vrsra_n_v:
  case NEON::BI__builtin_neon_vrsraq_n_v: {
    Int = usgn ? Intrinsic::aarch64_neon_urshl : Intrinsic::aarch64_neon_srshl;
    SmallVector<llvm::Value*,2> TmpOps;
    TmpOps.push_back(Ops[1]);
    TmpOps.push_back(Ops[2]);
    Function* F = CGM.getIntrinsic(Int, Ty);
    llvm::Value *tmp = EmitNeonCall(F, TmpOps, "vrshr_n", 1, true);
    Ops[0] = Builder.CreateBitCast(Ops[0], VTy);
    return Builder.CreateAdd(Ops[0], tmp);
  }
  case NEON::BI__builtin_neon_vld1_v:
  case NEON::BI__builtin_neon_vld1q_v: {
    return Builder.CreateAlignedLoad(VTy, Ops[0], PtrOp0.getAlignment());
  }
  case NEON::BI__builtin_neon_vst1_v:
  case NEON::BI__builtin_neon_vst1q_v:
    Ops[1] = Builder.CreateBitCast(Ops[1], VTy);
    return Builder.CreateAlignedStore(Ops[1], Ops[0], PtrOp0.getAlignment());
  case NEON::BI__builtin_neon_vld1_lane_v:
  case NEON::BI__builtin_neon_vld1q_lane_v: {
    Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
    Ops[0] = Builder.CreateAlignedLoad(VTy->getElementType(), Ops[0],
                                       PtrOp0.getAlignment());
    return Builder.CreateInsertElement(Ops[1], Ops[0], Ops[2], "vld1_lane");
  }
  case NEON::BI__builtin_neon_vldap1_lane_s64:
  case NEON::BI__builtin_neon_vldap1q_lane_s64: {
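    // vldap1_lane (the LDAP1 instruction): the scalar element load carries
    // acquire ordering, and the loaded value is then inserted into the
    // requested lane.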
    Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
    llvm::LoadInst *LI = Builder.CreateAlignedLoad(
        VTy->getElementType(), Ops[0], PtrOp0.getAlignment());
    LI->setAtomic(llvm::AtomicOrdering::Acquire);
    Ops[0] = LI;
    return Builder.CreateInsertElement(Ops[1], Ops[0], Ops[2], "vldap1_lane");
  }
  case NEON::BI__builtin_neon_vld1_dup_v:
  case NEON::BI__builtin_neon_vld1q_dup_v: {
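    // vld1_dup: load the scalar once, insert it into lane 0 of a poison
    // vector, and splat it to all lanes via EmitNeonSplat (a zero shuffle
    // mask).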
    Value *V = PoisonValue::get(Ty);
    Ops[0] = Builder.CreateAlignedLoad(VTy->getElementType(), Ops[0],
                                       PtrOp0.getAlignment());
    llvm::Constant *CI = ConstantInt::get(Int32Ty, 0);
    Ops[0] = Builder.CreateInsertElement(V, Ops[0], CI);
    return EmitNeonSplat(Ops[0], CI);
  }
  case NEON::BI__builtin_neon_vst1_lane_v:
  case NEON::BI__builtin_neon_vst1q_lane_v:
    Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
    Ops[1] = Builder.CreateExtractElement(Ops[1], Ops[2]);
    return Builder.CreateAlignedStore(Ops[1], Ops[0], PtrOp0.getAlignment());
  case NEON::BI__builtin_neon_vstl1_lane_s64:
  case NEON::BI__builtin_neon_vstl1q_lane_s64: {
    Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
    Ops[1] = Builder.CreateExtractElement(Ops[1], Ops[2]);
    llvm::StoreInst *SI =
        Builder.CreateAlignedStore(Ops[1], Ops[0], PtrOp0.getAlignment());
    SI->setAtomic(llvm::AtomicOrdering::Release);
    return SI;
  }
  case NEON::BI__builtin_neon_vld2_v:
  case NEON::BI__builtin_neon_vld2q_v: {
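    // The ldN intrinsics return a struct of N vectors; the whole aggregate
    // is stored through the sret-style pointer passed in Ops[0].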
    llvm::Type *Tys[2] = {VTy, DefaultPtrTy};
    Function *F = CGM.getIntrinsic(Intrinsic::aarch64_neon_ld2, Tys);
    Ops[1] = Builder.CreateCall(F, Ops[1], "vld2");
    return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
  }
  case NEON::BI__builtin_neon_vld3_v:
  case NEON::BI__builtin_neon_vld3q_v: {
    llvm::Type *Tys[2] = {VTy, DefaultPtrTy};
    Function *F = CGM.getIntrinsic(Intrinsic::aarch64_neon_ld3, Tys);
    Ops[1] = Builder.CreateCall(F, Ops[1], "vld3");
    return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
  }
  case NEON::BI__builtin_neon_vld4_v:
  case NEON::BI__builtin_neon_vld4q_v: {
    llvm::Type *Tys[2] = {VTy, DefaultPtrTy};
    Function *F = CGM.getIntrinsic(Intrinsic::aarch64_neon_ld4, Tys);
    Ops[1] = Builder.CreateCall(F, Ops[1], "vld4");
    return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
  }
  case NEON::BI__builtin_neon_vld2_dup_v:
  case NEON::BI__builtin_neon_vld2q_dup_v: {
    llvm::Type *Tys[2] = {VTy, DefaultPtrTy};
    Function *F = CGM.getIntrinsic(Intrinsic::aarch64_neon_ld2r, Tys);
    Ops[1] = Builder.CreateCall(F, Ops[1], "vld2");
    return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
  }
  case NEON::BI__builtin_neon_vld3_dup_v:
  case NEON::BI__builtin_neon_vld3q_dup_v: {
    llvm::Type *Tys[2] = {VTy, DefaultPtrTy};
    Function *F = CGM.getIntrinsic(Intrinsic::aarch64_neon_ld3r, Tys);
    Ops[1] = Builder.CreateCall(F, Ops[1], "vld3");
    return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
  }
  case NEON::BI__builtin_neon_vld4_dup_v:
  case NEON::BI__builtin_neon_vld4q_dup_v: {
    llvm::Type *Tys[2] = {VTy, DefaultPtrTy};
    Function *F = CGM.getIntrinsic(Intrinsic::aarch64_neon_ld4r, Tys);
    Ops[1] = Builder.CreateCall(F, Ops[1], "vld4");
    return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
  }
  case NEON::BI__builtin_neon_vld2_lane_v:
  case NEON::BI__builtin_neon_vld2q_lane_v: {
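    // Rotate the pointer operand past the vectors so the argument order
    // matches the intrinsic: e.g. for ld2lane, after the rotate and the
    // slice(1) below the call receives (vec1, vec2, lane, ptr).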
    llvm::Type *Tys[2] = { VTy, Ops[1]->getType() };
    Function *F = CGM.getIntrinsic(Intrinsic::aarch64_neon_ld2lane, Tys);
    std::rotate(Ops.begin() + 1, Ops.begin() + 2, Ops.end());
    Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
    Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
    Ops[3] = Builder.CreateZExt(Ops[3], Int64Ty);
    Ops[1] = Builder.CreateCall(F, ArrayRef(Ops).slice(1), "vld2_lane");
    return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
  }
  case NEON::BI__builtin_neon_vld3_lane_v:
  case NEON::BI__builtin_neon_vld3q_lane_v: {
    llvm::Type *Tys[2] = { VTy, Ops[1]->getType() };
    Function *F = CGM.getIntrinsic(Intrinsic::aarch64_neon_ld3lane, Tys);
    std::rotate(Ops.begin() + 1, Ops.begin() + 2, Ops.end());
    Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
    Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
    Ops[3] = Builder.CreateBitCast(Ops[3], Ty);
    Ops[4] = Builder.CreateZExt(Ops[4], Int64Ty);
    Ops[1] = Builder.CreateCall(F, ArrayRef(Ops).slice(1), "vld3_lane");
    return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
  }
  case NEON::BI__builtin_neon_vld4_lane_v:
  case NEON::BI__builtin_neon_vld4q_lane_v: {
    llvm::Type *Tys[2] = { VTy, Ops[1]->getType() };
    Function *F = CGM.getIntrinsic(Intrinsic::aarch64_neon_ld4lane, Tys);
    std::rotate(Ops.begin() + 1, Ops.begin() + 2, Ops.end());
    Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
    Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
    Ops[3] = Builder.CreateBitCast(Ops[3], Ty);
    Ops[4] = Builder.CreateBitCast(Ops[4], Ty);
    Ops[5] = Builder.CreateZExt(Ops[5], Int64Ty);
    Ops[1] = Builder.CreateCall(F, ArrayRef(Ops).slice(1), "vld4_lane");
    return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
  }
  case NEON::BI__builtin_neon_vst2_v:
  case NEON::BI__builtin_neon_vst2q_v: {
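    // The same operand shuffling applies to the stores: rotate the
    // destination pointer to the back so the stN intrinsic sees
    // (vec..., ptr).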
    std::rotate(Ops.begin(), Ops.begin() + 1, Ops.end());
    llvm::Type *Tys[2] = { VTy, Ops[2]->getType() };
    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_st2, Tys),
                        Ops, "");
  }
  case NEON::BI__builtin_neon_vst2_lane_v:
  case NEON::BI__builtin_neon_vst2q_lane_v: {
    std::rotate(Ops.begin(), Ops.begin() + 1, Ops.end());
    Ops[2] = Builder.CreateZExt(Ops[2], Int64Ty);
    llvm::Type *Tys[2] = { VTy, Ops[3]->getType() };
    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_st2lane, Tys),
                        Ops, "");
  }
  case NEON::BI__builtin_neon_vst3_v:
  case NEON::BI__builtin_neon_vst3q_v: {
    std::rotate(Ops.begin(), Ops.begin() + 1, Ops.end());
    llvm::Type *Tys[2] = { VTy, Ops[3]->getType() };
    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_st3, Tys),
                        Ops, "");
  }
  case NEON::BI__builtin_neon_vst3_lane_v:
  case NEON::BI__builtin_neon_vst3q_lane_v: {
    std::rotate(Ops.begin(), Ops.begin() + 1, Ops.end());
    Ops[3] = Builder.CreateZExt(Ops[3], Int64Ty);
    llvm::Type *Tys[2] = { VTy, Ops[4]->getType() };
    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_st3lane, Tys),
                        Ops, "");
  }
  case NEON::BI__builtin_neon_vst4_v:
  case NEON::BI__builtin_neon_vst4q_v: {
    std::rotate(Ops.begin(), Ops.begin() + 1, Ops.end());
    llvm::Type *Tys[2] = { VTy, Ops[4]->getType() };
    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_st4, Tys),
                        Ops, "");
  }
  case NEON::BI__builtin_neon_vst4_lane_v:
  case NEON::BI__builtin_neon_vst4q_lane_v: {
    std::rotate(Ops.begin(), Ops.begin() + 1, Ops.end());
    Ops[4] = Builder.CreateZExt(Ops[4], Int64Ty);
    llvm::Type *Tys[2] = { VTy, Ops[5]->getType() };
    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_st4lane, Tys),
                        Ops, "");
  }
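  // vtrn/vuzp/vzip return both result registers through the pointer in
  // Ops[0]; each half is a single shufflevector. For a 4-element input
  // (e == 4) the masks below work out to:
  //   vtrn: {0,4,2,6} and {1,5,3,7}
  //   vuzp: {0,2,4,6} and {1,3,5,7}
  //   vzip: {0,4,1,5} and {2,6,3,7}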
  case NEON::BI__builtin_neon_vtrn_v:
  case NEON::BI__builtin_neon_vtrnq_v: {
    Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
    Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
    Value *SV = nullptr;

    for (unsigned vi = 0; vi != 2; ++vi) {
      SmallVector<int, 16> Indices;
      for (unsigned i = 0, e = VTy->getNumElements(); i != e; i += 2) {
        Indices.push_back(i+vi);
        Indices.push_back(i+e+vi);
      }
      Value *Addr = Builder.CreateConstInBoundsGEP1_32(Ty, Ops[0], vi);
      SV = Builder.CreateShuffleVector(Ops[1], Ops[2], Indices, "vtrn");
      SV = Builder.CreateDefaultAlignedStore(SV, Addr);
    }
    return SV;
  }
  case NEON::BI__builtin_neon_vuzp_v:
  case NEON::BI__builtin_neon_vuzpq_v: {
    Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
    Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
    Value *SV = nullptr;

    for (unsigned vi = 0; vi != 2; ++vi) {
      SmallVector<int, 16> Indices;
      for (unsigned i = 0, e = VTy->getNumElements(); i != e; ++i)
        Indices.push_back(2*i+vi);

      Value *Addr = Builder.CreateConstInBoundsGEP1_32(Ty, Ops[0], vi);
      SV = Builder.CreateShuffleVector(Ops[1], Ops[2], Indices, "vuzp");
      SV = Builder.CreateDefaultAlignedStore(SV, Addr);
    }
    return SV;
  }
  case NEON::BI__builtin_neon_vzip_v:
  case NEON::BI__builtin_neon_vzipq_v: {
    Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
    Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
    Value *SV = nullptr;

    for (unsigned vi = 0; vi != 2; ++vi) {
      SmallVector<int, 16> Indices;
      for (unsigned i = 0, e = VTy->getNumElements(); i != e; i += 2) {
        Indices.push_back((i + vi*e) >> 1);
        Indices.push_back(((i + vi*e) >> 1)+e);
      }
      Value *Addr = Builder.CreateConstInBoundsGEP1_32(Ty, Ops[0], vi);
      SV = Builder.CreateShuffleVector(Ops[1], Ops[2], Indices, "vzip");
      SV = Builder.CreateDefaultAlignedStore(SV, Addr);
    }
    return SV;
  }
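  // Table lookups: the qtbl forms write zero to a destination byte whose
  // index is out of range, while the qtbx forms leave the corresponding
  // byte of the passed-in destination operand unchanged.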
  case NEON::BI__builtin_neon_vqtbl1q_v: {
    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_tbl1, Ty),
                        Ops, "vtbl1");
  }
  case NEON::BI__builtin_neon_vqtbl2q_v: {
    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_tbl2, Ty),
                        Ops, "vtbl2");
  }
  case NEON::BI__builtin_neon_vqtbl3q_v: {
    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_tbl3, Ty),
                        Ops, "vtbl3");
  }
  case NEON::BI__builtin_neon_vqtbl4q_v: {
    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_tbl4, Ty),
                        Ops, "vtbl4");
  }
  case NEON::BI__builtin_neon_vqtbx1q_v: {
    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_tbx1, Ty),
                        Ops, "vtbx1");
  }
  case NEON::BI__builtin_neon_vqtbx2q_v: {
    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_tbx2, Ty),
                        Ops, "vtbx2");
  }
  case NEON::BI__builtin_neon_vqtbx3q_v: {
    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_tbx3, Ty),
                        Ops, "vtbx3");
  }
  case NEON::BI__builtin_neon_vqtbx4q_v: {
    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_tbx4, Ty),
                        Ops, "vtbx4");
  }
  case NEON::BI__builtin_neon_vsqadd_v:
  case NEON::BI__builtin_neon_vsqaddq_v: {
    Int = Intrinsic::aarch64_neon_usqadd;
    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vsqadd");
  }
  case NEON::BI__builtin_neon_vuqadd_v:
  case NEON::BI__builtin_neon_vuqaddq_v: {
    Int = Intrinsic::aarch64_neon_suqadd;
    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vuqadd");
  }

  case NEON::BI__builtin_neon_vluti2_laneq_mf8:
  case NEON::BI__builtin_neon_vluti2_laneq_bf16:
  case NEON::BI__builtin_neon_vluti2_laneq_f16:
  case NEON::BI__builtin_neon_vluti2_laneq_p16:
  case NEON::BI__builtin_neon_vluti2_laneq_p8:
  case NEON::BI__builtin_neon_vluti2_laneq_s16:
  case NEON::BI__builtin_neon_vluti2_laneq_s8:
  case NEON::BI__builtin_neon_vluti2_laneq_u16:
  case NEON::BI__builtin_neon_vluti2_laneq_u8: {
    Int = Intrinsic::aarch64_neon_vluti2_laneq;
    llvm::Type *Tys[2];
    Tys[0] = Ty;
    Tys[1] = GetNeonType(this, NeonTypeFlags(Type.getEltType(), false,
                                             /*isQuad*/ false));
    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vluti2_laneq");
  }
  case NEON::BI__builtin_neon_vluti2q_laneq_mf8:
  case NEON::BI__builtin_neon_vluti2q_laneq_bf16:
  case NEON::BI__builtin_neon_vluti2q_laneq_f16:
  case NEON::BI__builtin_neon_vluti2q_laneq_p16:
  case NEON::BI__builtin_neon_vluti2q_laneq_p8:
  case NEON::BI__builtin_neon_vluti2q_laneq_s16:
  case NEON::BI__builtin_neon_vluti2q_laneq_s8:
  case NEON::BI__builtin_neon_vluti2q_laneq_u16:
  case NEON::BI__builtin_neon_vluti2q_laneq_u8: {
    Int = Intrinsic::aarch64_neon_vluti2_laneq;
    llvm::Type *Tys[2];
    Tys[0] = Ty;
    Tys[1] = GetNeonType(this, NeonTypeFlags(Type.getEltType(), false,
                                             /*isQuad*/ true));
    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vluti2_laneq");
  }
  case NEON::BI__builtin_neon_vluti2_lane_mf8:
  case NEON::BI__builtin_neon_vluti2_lane_bf16:
  case NEON::BI__builtin_neon_vluti2_lane_f16:
  case NEON::BI__builtin_neon_vluti2_lane_p16:
  case NEON::BI__builtin_neon_vluti2_lane_p8:
  case NEON::BI__builtin_neon_vluti2_lane_s16:
  case NEON::BI__builtin_neon_vluti2_lane_s8:
  case NEON::BI__builtin_neon_vluti2_lane_u16:
  case NEON::BI__builtin_neon_vluti2_lane_u8: {
    Int = Intrinsic::aarch64_neon_vluti2_lane;
    llvm::Type *Tys[2];
    Tys[0] = Ty;
    Tys[1] = GetNeonType(this, NeonTypeFlags(Type.getEltType(), false,
                                             /*isQuad*/ false));
    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vluti2_lane");
  }
  case NEON::BI__builtin_neon_vluti2q_lane_mf8:
  case NEON::BI__builtin_neon_vluti2q_lane_bf16:
  case NEON::BI__builtin_neon_vluti2q_lane_f16:
  case NEON::BI__builtin_neon_vluti2q_lane_p16:
  case NEON::BI__builtin_neon_vluti2q_lane_p8:
  case NEON::BI__builtin_neon_vluti2q_lane_s16:
  case NEON::BI__builtin_neon_vluti2q_lane_s8:
  case NEON::BI__builtin_neon_vluti2q_lane_u16:
  case NEON::BI__builtin_neon_vluti2q_lane_u8: {
    Int = Intrinsic::aarch64_neon_vluti2_lane;
    llvm::Type *Tys[2];
    Tys[0] = Ty;
    Tys[1] = GetNeonType(this, NeonTypeFlags(Type.getEltType(), false,
                                             /*isQuad*/ true));
    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vluti2_lane");
  }
  case NEON::BI__builtin_neon_vluti4q_lane_mf8:
  case NEON::BI__builtin_neon_vluti4q_lane_p8:
  case NEON::BI__builtin_neon_vluti4q_lane_s8:
  case NEON::BI__builtin_neon_vluti4q_lane_u8: {
    Int = Intrinsic::aarch64_neon_vluti4q_lane;
    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vluti4q_lane");
  }
  case NEON::BI__builtin_neon_vluti4q_laneq_mf8:
  case NEON::BI__builtin_neon_vluti4q_laneq_p8:
  case NEON::BI__builtin_neon_vluti4q_laneq_s8:
  case NEON::BI__builtin_neon_vluti4q_laneq_u8: {
    Int = Intrinsic::aarch64_neon_vluti4q_laneq;
    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vluti4q_laneq");
  }
  case NEON::BI__builtin_neon_vluti4q_lane_bf16_x2:
  case NEON::BI__builtin_neon_vluti4q_lane_f16_x2:
  case NEON::BI__builtin_neon_vluti4q_lane_p16_x2:
  case NEON::BI__builtin_neon_vluti4q_lane_s16_x2:
  case NEON::BI__builtin_neon_vluti4q_lane_u16_x2: {
    Int = Intrinsic::aarch64_neon_vluti4q_lane_x2;
    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vluti4q_lane_x2");
  }
  case NEON::BI__builtin_neon_vluti4q_laneq_bf16_x2:
  case NEON::BI__builtin_neon_vluti4q_laneq_f16_x2:
  case NEON::BI__builtin_neon_vluti4q_laneq_p16_x2:
  case NEON::BI__builtin_neon_vluti4q_laneq_s16_x2:
  case NEON::BI__builtin_neon_vluti4q_laneq_u16_x2: {
    Int = Intrinsic::aarch64_neon_vluti4q_laneq_x2;
    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vluti4q_laneq_x2");
  }
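  // The _fpm builtins below take an extra floating-point mode argument in
  // addition to the vectors; the EmitFP8Neon* helpers are responsible for
  // threading that mode operand through (it governs the FPMR register)
  // before the intrinsic call is emitted.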
  case NEON::BI__builtin_neon_vmmlaq_f16_mf8_fpm:
    return EmitFP8NeonCall(Intrinsic::aarch64_neon_fmmla,
                           {llvm::FixedVectorType::get(HalfTy, 8),
                            llvm::FixedVectorType::get(Int8Ty, 16)},
                           Ops, E, "fmmla");
  case NEON::BI__builtin_neon_vmmlaq_f32_mf8_fpm:
    return EmitFP8NeonCall(Intrinsic::aarch64_neon_fmmla,
                           {llvm::FixedVectorType::get(FloatTy, 4),
                            llvm::FixedVectorType::get(Int8Ty, 16)},
                           Ops, E, "fmmla");
  case NEON::BI__builtin_neon_vcvt1_low_bf16_mf8_fpm:
    ExtractLow = true;
    [[fallthrough]];
  case NEON::BI__builtin_neon_vcvt1_bf16_mf8_fpm:
  case NEON::BI__builtin_neon_vcvt1_high_bf16_mf8_fpm:
    return EmitFP8NeonCvtCall(Intrinsic::aarch64_neon_fp8_cvtl1,
                              llvm::FixedVectorType::get(BFloatTy, 8),
                              Ops[0]->getType(), ExtractLow, Ops, E, "vbfcvt1");
  case NEON::BI__builtin_neon_vcvt2_low_bf16_mf8_fpm:
    ExtractLow = true;
    [[fallthrough]];
  case NEON::BI__builtin_neon_vcvt2_bf16_mf8_fpm:
  case NEON::BI__builtin_neon_vcvt2_high_bf16_mf8_fpm:
    return EmitFP8NeonCvtCall(Intrinsic::aarch64_neon_fp8_cvtl2,
                              llvm::FixedVectorType::get(BFloatTy, 8),
                              Ops[0]->getType(), ExtractLow, Ops, E, "vbfcvt2");
  case NEON::BI__builtin_neon_vcvt1_low_f16_mf8_fpm:
    ExtractLow = true;
    [[fallthrough]];
  case NEON::BI__builtin_neon_vcvt1_f16_mf8_fpm:
  case NEON::BI__builtin_neon_vcvt1_high_f16_mf8_fpm:
    return EmitFP8NeonCvtCall(Intrinsic::aarch64_neon_fp8_cvtl1,
                              llvm::FixedVectorType::get(HalfTy, 8),
                              Ops[0]->getType(), ExtractLow, Ops, E, "vbfcvt1");
  case NEON::BI__builtin_neon_vcvt2_low_f16_mf8_fpm:
    ExtractLow = true;
    [[fallthrough]];
  case NEON::BI__builtin_neon_vcvt2_f16_mf8_fpm:
  case NEON::BI__builtin_neon_vcvt2_high_f16_mf8_fpm:
    return EmitFP8NeonCvtCall(Intrinsic::aarch64_neon_fp8_cvtl2,
                              llvm::FixedVectorType::get(HalfTy, 8),
                              Ops[0]->getType(), ExtractLow, Ops, E, "vbfcvt2");
  case NEON::BI__builtin_neon_vcvt_mf8_f32_fpm:
    return EmitFP8NeonCvtCall(Intrinsic::aarch64_neon_fp8_fcvtn,
                              llvm::FixedVectorType::get(Int8Ty, 8),
                              Ops[0]->getType(), false, Ops, E, "vfcvtn");
  case NEON::BI__builtin_neon_vcvt_mf8_f16_fpm:
    return EmitFP8NeonCvtCall(Intrinsic::aarch64_neon_fp8_fcvtn,
                              llvm::FixedVectorType::get(Int8Ty, 8),
                              llvm::FixedVectorType::get(HalfTy, 4), false, Ops,
                              E, "vfcvtn");
  case NEON::BI__builtin_neon_vcvtq_mf8_f16_fpm:
    return EmitFP8NeonCvtCall(Intrinsic::aarch64_neon_fp8_fcvtn,
                              llvm::FixedVectorType::get(Int8Ty, 16),
                              llvm::FixedVectorType::get(HalfTy, 8), false, Ops,
                              E, "vfcvtn");
  case NEON::BI__builtin_neon_vcvt_high_mf8_f32_fpm: {
    llvm::Type *Ty = llvm::FixedVectorType::get(Int8Ty, 16);
    Ops[0] = Builder.CreateInsertVector(Ty, PoisonValue::get(Ty), Ops[0],
                                        uint64_t(0));
    return EmitFP8NeonCvtCall(Intrinsic::aarch64_neon_fp8_fcvtn2, Ty,
                              Ops[1]->getType(), false, Ops, E, "vfcvtn2");
  }

  case NEON::BI__builtin_neon_vdot_f16_mf8_fpm:
  case NEON::BI__builtin_neon_vdotq_f16_mf8_fpm:
    return EmitFP8NeonFDOTCall(Intrinsic::aarch64_neon_fp8_fdot2, false, HalfTy,
                               Ops, E, "fdot2");
  case NEON::BI__builtin_neon_vdot_lane_f16_mf8_fpm:
  case NEON::BI__builtin_neon_vdotq_lane_f16_mf8_fpm:
    ExtendLaneArg = true;
    [[fallthrough]];
  case NEON::BI__builtin_neon_vdot_laneq_f16_mf8_fpm:
  case NEON::BI__builtin_neon_vdotq_laneq_f16_mf8_fpm:
    return EmitFP8NeonFDOTCall(Intrinsic::aarch64_neon_fp8_fdot2_lane,
                               ExtendLaneArg, HalfTy, Ops, E, "fdot2_lane");
  case NEON::BI__builtin_neon_vdot_f32_mf8_fpm:
  case NEON::BI__builtin_neon_vdotq_f32_mf8_fpm:
    return EmitFP8NeonFDOTCall(Intrinsic::aarch64_neon_fp8_fdot4, false,
                               FloatTy, Ops, E, "fdot4");
  case NEON::BI__builtin_neon_vdot_lane_f32_mf8_fpm:
  case NEON::BI__builtin_neon_vdotq_lane_f32_mf8_fpm:
    ExtendLaneArg = true;
    [[fallthrough]];
  case NEON::BI__builtin_neon_vdot_laneq_f32_mf8_fpm:
  case NEON::BI__builtin_neon_vdotq_laneq_f32_mf8_fpm:
    return EmitFP8NeonFDOTCall(Intrinsic::aarch64_neon_fp8_fdot4_lane,
                               ExtendLaneArg, FloatTy, Ops, E, "fdot4_lane");

  case NEON::BI__builtin_neon_vmlalbq_f16_mf8_fpm:
    return EmitFP8NeonCall(Intrinsic::aarch64_neon_fp8_fmlalb,
                           {llvm::FixedVectorType::get(HalfTy, 8)}, Ops, E,
                           "vmlal");
  case NEON::BI__builtin_neon_vmlaltq_f16_mf8_fpm:
    return EmitFP8NeonCall(Intrinsic::aarch64_neon_fp8_fmlalt,
                           {llvm::FixedVectorType::get(HalfTy, 8)}, Ops, E,
                           "vmlal");
  case NEON::BI__builtin_neon_vmlallbbq_f32_mf8_fpm:
    return EmitFP8NeonCall(Intrinsic::aarch64_neon_fp8_fmlallbb,
                           {llvm::FixedVectorType::get(FloatTy, 4)}, Ops, E,
                           "vmlall");
  case NEON::BI__builtin_neon_vmlallbtq_f32_mf8_fpm:
    return EmitFP8NeonCall(Intrinsic::aarch64_neon_fp8_fmlallbt,
                           {llvm::FixedVectorType::get(FloatTy, 4)}, Ops, E,
                           "vmlall");
  case NEON::BI__builtin_neon_vmlalltbq_f32_mf8_fpm:
    return EmitFP8NeonCall(Intrinsic::aarch64_neon_fp8_fmlalltb,
                           {llvm::FixedVectorType::get(FloatTy, 4)}, Ops, E,
                           "vmlall");
  case NEON::BI__builtin_neon_vmlallttq_f32_mf8_fpm:
    return EmitFP8NeonCall(Intrinsic::aarch64_neon_fp8_fmlalltt,
                           {llvm::FixedVectorType::get(FloatTy, 4)}, Ops, E,
                           "vmlall");
  case NEON::BI__builtin_neon_vmlalbq_lane_f16_mf8_fpm:
    ExtendLaneArg = true;
    [[fallthrough]];
  case NEON::BI__builtin_neon_vmlalbq_laneq_f16_mf8_fpm:
    return EmitFP8NeonFMLACall(Intrinsic::aarch64_neon_fp8_fmlalb_lane,
                               ExtendLaneArg, HalfTy, Ops, E, "vmlal_lane");
  case NEON::BI__builtin_neon_vmlaltq_lane_f16_mf8_fpm:
    ExtendLaneArg = true;
    [[fallthrough]];
  case NEON::BI__builtin_neon_vmlaltq_laneq_f16_mf8_fpm:
    return EmitFP8NeonFMLACall(Intrinsic::aarch64_neon_fp8_fmlalt_lane,
                               ExtendLaneArg, HalfTy, Ops, E, "vmlal_lane");
  case NEON::BI__builtin_neon_vmlallbbq_lane_f32_mf8_fpm:
    ExtendLaneArg = true;
    [[fallthrough]];
  case NEON::BI__builtin_neon_vmlallbbq_laneq_f32_mf8_fpm:
    return EmitFP8NeonFMLACall(Intrinsic::aarch64_neon_fp8_fmlallbb_lane,
                               ExtendLaneArg, FloatTy, Ops, E, "vmlall_lane");
  case NEON::BI__builtin_neon_vmlallbtq_lane_f32_mf8_fpm:
    ExtendLaneArg = true;
    [[fallthrough]];
  case NEON::BI__builtin_neon_vmlallbtq_laneq_f32_mf8_fpm:
    return EmitFP8NeonFMLACall(Intrinsic::aarch64_neon_fp8_fmlallbt_lane,
                               ExtendLaneArg, FloatTy, Ops, E, "vmlall_lane");
  case NEON::BI__builtin_neon_vmlalltbq_lane_f32_mf8_fpm:
    ExtendLaneArg = true;
    [[fallthrough]];
  case NEON::BI__builtin_neon_vmlalltbq_laneq_f32_mf8_fpm:
    return EmitFP8NeonFMLACall(Intrinsic::aarch64_neon_fp8_fmlalltb_lane,
                               ExtendLaneArg, FloatTy, Ops, E, "vmlall_lane");
  case NEON::BI__builtin_neon_vmlallttq_lane_f32_mf8_fpm:
    ExtendLaneArg = true;
    [[fallthrough]];
  case NEON::BI__builtin_neon_vmlallttq_laneq_f32_mf8_fpm:
    return EmitFP8NeonFMLACall(Intrinsic::aarch64_neon_fp8_fmlalltt_lane,
                               ExtendLaneArg, FloatTy, Ops, E, "vmlall_lane");
  case NEON::BI__builtin_neon_vamin_f16:
  case NEON::BI__builtin_neon_vaminq_f16:
  case NEON::BI__builtin_neon_vamin_f32:
  case NEON::BI__builtin_neon_vaminq_f32:
  case NEON::BI__builtin_neon_vaminq_f64: {
    Int = Intrinsic::aarch64_neon_famin;
    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "famin");
  }
  case NEON::BI__builtin_neon_vamax_f16:
  case NEON::BI__builtin_neon_vamaxq_f16:
  case NEON::BI__builtin_neon_vamax_f32:
  case NEON::BI__builtin_neon_vamaxq_f32:
  case NEON::BI__builtin_neon_vamaxq_f64: {
    Int = Intrinsic::aarch64_neon_famax;
    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "famax");
  }
  case NEON::BI__builtin_neon_vscale_f16:
  case NEON::BI__builtin_neon_vscaleq_f16:
  case NEON::BI__builtin_neon_vscale_f32:
  case NEON::BI__builtin_neon_vscaleq_f32:
  case NEON::BI__builtin_neon_vscaleq_f64: {
    Int = Intrinsic::aarch64_neon_fp8_fscale;
    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "fscale");
  }
  }
}

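// Emit the BPF CO-RE relocation builtins. A sketch of the source-level usage
// (struct, field, and enum names here are hypothetical):
//   __builtin_preserve_field_info(s->f, kind);
//   __builtin_btf_type_id(*(struct S *)0, flag);
//   __builtin_preserve_type_info(*(struct S *)0, flag);
//   __builtin_preserve_enum_value(*(enum E *)E_VAL, flag);
// Each lowers to a bpf_* intrinsic call that carries debuginfo metadata; the
// BPF backend later turns these into CO-RE relocation records.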
Value *CodeGenFunction::EmitBPFBuiltinExpr(unsigned BuiltinID,
                                           const CallExpr *E) {
  assert((BuiltinID == BPF::BI__builtin_preserve_field_info ||
          BuiltinID == BPF::BI__builtin_btf_type_id ||
          BuiltinID == BPF::BI__builtin_preserve_type_info ||
          BuiltinID == BPF::BI__builtin_preserve_enum_value) &&
         "unexpected BPF builtin");

  // A sequence number, injected into the IR builtin calls, to prevent CSE:
  // otherwise-identical calls may differ only in their debuginfo metadata.
  static uint32_t BuiltinSeqNum;

  switch (BuiltinID) {
  default:
    llvm_unreachable("Unexpected BPF builtin");
  case BPF::BI__builtin_preserve_field_info: {
    const Expr *Arg = E->getArg(0);
    bool IsBitField = Arg->IgnoreParens()->getObjectKind() == OK_BitField;

    if (!getDebugInfo()) {
      CGM.Error(E->getExprLoc(),
                "using __builtin_preserve_field_info() without -g");
      return IsBitField ? EmitLValue(Arg).getRawBitFieldPointer(*this)
                        : EmitLValue(Arg).emitRawPointer(*this);
    }

    // Enable underlying preserve_*_access_index() generation.
    bool OldIsInPreservedAIRegion = IsInPreservedAIRegion;
    IsInPreservedAIRegion = true;
    Value *FieldAddr = IsBitField ? EmitLValue(Arg).getRawBitFieldPointer(*this)
                                  : EmitLValue(Arg).emitRawPointer(*this);
    IsInPreservedAIRegion = OldIsInPreservedAIRegion;

    ConstantInt *C = cast<ConstantInt>(EmitScalarExpr(E->getArg(1)));
    Value *InfoKind = ConstantInt::get(Int64Ty, C->getSExtValue());

    // Build the IR for the preserve_field_info intrinsic.
    llvm::Function *FnGetFieldInfo = Intrinsic::getOrInsertDeclaration(
        &CGM.getModule(), Intrinsic::bpf_preserve_field_info,
        {FieldAddr->getType()});
    return Builder.CreateCall(FnGetFieldInfo, {FieldAddr, InfoKind});
  }
  case BPF::BI__builtin_btf_type_id:
  case BPF::BI__builtin_preserve_type_info: {
    if (!getDebugInfo()) {
      CGM.Error(E->getExprLoc(), "using builtin function without -g");
      return nullptr;
    }

    const Expr *Arg0 = E->getArg(0);
    llvm::DIType *DbgInfo = getDebugInfo()->getOrCreateStandaloneType(
        Arg0->getType(), Arg0->getExprLoc());

    ConstantInt *Flag = cast<ConstantInt>(EmitScalarExpr(E->getArg(1)));
    Value *FlagValue = ConstantInt::get(Int64Ty, Flag->getSExtValue());
    Value *SeqNumVal = ConstantInt::get(Int32Ty, BuiltinSeqNum++);

    llvm::Function *FnDecl;
    if (BuiltinID == BPF::BI__builtin_btf_type_id)
      FnDecl = Intrinsic::getOrInsertDeclaration(
          &CGM.getModule(), Intrinsic::bpf_btf_type_id, {});
    else
      FnDecl = Intrinsic::getOrInsertDeclaration(
          &CGM.getModule(), Intrinsic::bpf_preserve_type_info, {});
    CallInst *Fn = Builder.CreateCall(FnDecl, {SeqNumVal, FlagValue});
    Fn->setMetadata(LLVMContext::MD_preserve_access_index, DbgInfo);
    return Fn;
  }
  case BPF::BI__builtin_preserve_enum_value: {
    if (!getDebugInfo()) {
      CGM.Error(E->getExprLoc(), "using builtin function without -g");
      return nullptr;
    }

    const Expr *Arg0 = E->getArg(0);
    llvm::DIType *DbgInfo = getDebugInfo()->getOrCreateStandaloneType(
        Arg0->getType(), Arg0->getExprLoc());

    // Find the enumerator.
    const auto *UO = cast<UnaryOperator>(Arg0->IgnoreParens());
    const auto *CE = cast<CStyleCastExpr>(UO->getSubExpr());
    const auto *DR = cast<DeclRefExpr>(CE->getSubExpr());
    const auto *Enumerator = cast<EnumConstantDecl>(DR->getDecl());

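    // Encode the enumerator as "Name:InitValue" (e.g. "E_A:7", a hypothetical
    // enumerator) so the BPF backend can match it against BTF when resolving
    // the relocation.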
    auto InitVal = Enumerator->getInitVal();
    std::string InitValStr;
    if (InitVal.isNegative() || InitVal > uint64_t(INT64_MAX))
      InitValStr = std::to_string(InitVal.getSExtValue());
    else
      InitValStr = std::to_string(InitVal.getZExtValue());
    std::string EnumStr = Enumerator->getNameAsString() + ":" + InitValStr;
    Value *EnumStrVal = Builder.CreateGlobalString(EnumStr);

    ConstantInt *Flag = cast<ConstantInt>(EmitScalarExpr(E->getArg(1)));
    Value *FlagValue = ConstantInt::get(Int64Ty, Flag->getSExtValue());
    Value *SeqNumVal = ConstantInt::get(Int32Ty, BuiltinSeqNum++);

    llvm::Function *IntrinsicFn = Intrinsic::getOrInsertDeclaration(
        &CGM.getModule(), Intrinsic::bpf_preserve_enum_value, {});
    CallInst *Fn =
        Builder.CreateCall(IntrinsicFn, {SeqNumVal, EnumStrVal, FlagValue});
    Fn->setMetadata(LLVMContext::MD_preserve_access_index, DbgInfo);
    return Fn;
  }
  }
}

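// Build an LLVM vector value from the given scalar operands. If every
// operand is a Constant the result is a single ConstantVector (e.g. four
// i32 constants become <4 x i32> <i32 1, i32 2, i32 3, i32 4>); otherwise a
// chain of insertelement instructions into a poison vector is emitted. The
// operand count must be a power of two.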
llvm::Value *CodeGenFunction::
BuildVector(ArrayRef<llvm::Value*> Ops) {
  assert((Ops.size() & (Ops.size() - 1)) == 0 &&
         "Not a power-of-two sized vector!");
  bool AllConstants = true;
  for (unsigned i = 0, e = Ops.size(); i != e && AllConstants; ++i)
    AllConstants &= isa<Constant>(Ops[i]);

  // If this is a constant vector, create a ConstantVector.
  if (AllConstants) {
    SmallVector<llvm::Constant*, 16> CstOps;
    for (llvm::Value *Op : Ops)
      CstOps.push_back(cast<Constant>(Op));
    return llvm::ConstantVector::get(CstOps);
  }

  // Otherwise, insertelement the values to build the vector.
  Value *Result = llvm::PoisonValue::get(
      llvm::FixedVectorType::get(Ops[0]->getType(), Ops.size()));

  for (unsigned i = 0, e = Ops.size(); i != e; ++i)
    Result = Builder.CreateInsertElement(Result, Ops[i], Builder.getInt64(i));

  return Result;
}

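// Emit a call to __init_cpu_features_resolver, the runtime hook (provided by
// compiler-rt's CPU model support) that populates __aarch64_cpu_features
// before any feature bits are tested.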
Value *CodeGenFunction::EmitAArch64CpuInit() {
  llvm::FunctionType *FTy = llvm::FunctionType::get(VoidTy, false);
  llvm::FunctionCallee Func =
      CGM.CreateRuntimeFunction(FTy, "__init_cpu_features_resolver");
  cast<llvm::GlobalValue>(Func.getCallee())->setDSOLocal(true);
  cast<llvm::GlobalValue>(Func.getCallee())
      ->setDLLStorageClass(llvm::GlobalValue::DefaultStorageClass);
  return Builder.CreateCall(Func);
}

Value *CodeGenFunction::EmitAArch64CpuSupports(const CallExpr *E) {
  const Expr *ArgExpr = E->getArg(0)->IgnoreParenCasts();
  StringRef ArgStr = cast<StringLiteral>(ArgExpr)->getString();
  llvm::SmallVector<StringRef, 8> Features;
  ArgStr.split(Features, "+");
  for (auto &Feature : Features) {
    Feature = Feature.trim();
    if (!llvm::AArch64::parseFMVExtension(Feature))
      return Builder.getFalse();
    if (Feature != "default")
      Features.push_back(Feature);
  }
  return EmitAArch64CpuSupports(Features);
}

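// Test the requested features against the runtime feature word. Roughly, for
// a nonzero mask M this emits:
//   %features = load i64 from the first field of @__aarch64_cpu_features
//   %bits = and i64 %features, M
//   %ok = icmp eq i64 %bits, M
// i.e. the test succeeds only if every requested feature bit is set.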
llvm::Value *
CodeGenFunction::EmitAArch64CpuSupports(ArrayRef<StringRef> FeaturesStrs) {
  llvm::APInt FeaturesMask = llvm::AArch64::getCpuSupportsMask(FeaturesStrs);
  Value *Result = Builder.getTrue();
  if (FeaturesMask != 0) {
    // Get features from the structure in the runtime library:
    // struct {
    //   unsigned long long features;
    // } __aarch64_cpu_features;
    llvm::Type *STy = llvm::StructType::get(Int64Ty);
    llvm::Constant *AArch64CPUFeatures =
        CGM.CreateRuntimeVariable(STy, "__aarch64_cpu_features");
    cast<llvm::GlobalValue>(AArch64CPUFeatures)->setDSOLocal(true);
    llvm::Value *CpuFeatures = Builder.CreateGEP(
        STy, AArch64CPUFeatures,
        {ConstantInt::get(Int32Ty, 0), ConstantInt::get(Int32Ty, 0)});
    Value *Features = Builder.CreateAlignedLoad(Int64Ty, CpuFeatures,
                                                CharUnits::fromQuantity(8));
    Value *Mask = Builder.getInt(FeaturesMask.trunc(64));
    Value *Bitset = Builder.CreateAnd(Features, Mask);
    Value *Cmp = Builder.CreateICmpEQ(Bitset, Mask);
    Result = Builder.CreateAnd(Result, Cmp);
  }
  return Result;
}
